@@ -277,20 +277,26 @@ The high-level API provides a simple managed interface through the [`Llama`](htt
Below is a short example demonstrating how to use the high-level API for basic text completion:

```python
- >>> from llama_cpp import Llama
- >>> llm = Llama(
+ from llama_cpp import Llama
+
+ llm = Llama(
      model_path="./models/7B/llama-model.gguf",
      # n_gpu_layers=-1, # Uncomment to use GPU acceleration
      # seed=1337, # Uncomment to set a specific seed
      # n_ctx=2048, # Uncomment to increase the context window
)
- >>> output = llm(
+ output = llm(
      "Q: Name the planets in the solar system? A: ", # Prompt
      max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
      stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
      echo=True # Echo the prompt back in the output
) # Generate a completion, can also call create_completion
- >>> print(output)
+ print(output)
+ ```
+
+ By default `llama-cpp-python` generates completions in an OpenAI compatible format:
+
+ ```python
{
  "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
  "object": "text_completion",
@@ -345,12 +351,12 @@ The model will format the messages into a single prompt using the following
Set `verbose=True` to see the selected chat format.
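For instance, a minimal sketch (the model path is a placeholder) that leaves the chat format to be auto-detected and relies on `verbose=True` to log which format was selected:

```python
from llama_cpp import Llama

# With verbose=True (the default), llama-cpp-python logs its setup to stderr,
# including the chat format it selects for create_chat_completion().
llm = Llama(
    model_path="path/to/llama-model.gguf",  # placeholder path
    verbose=True,
)
```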
```python
- >>> from llama_cpp import Llama
- >>> llm = Llama(
+ from llama_cpp import Llama
+ llm = Llama(
      model_path="path/to/llama-2/llama-model.gguf",
      chat_format="llama-2"
)
- >>> llm.create_chat_completion(
+ llm.create_chat_completion(
      messages = [
          {"role": "system", "content": "You are an assistant who perfectly describes images."},
          {
@@ -375,9 +381,9 @@ To constrain chat responses to only valid JSON or a specific JSON Schema use the
The following example will constrain the response to valid JSON strings only.

```python
- >>> from llama_cpp import Llama
- >>> llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
- >>> llm.create_chat_completion(
+ from llama_cpp import Llama
+ llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
+ llm.create_chat_completion(
      messages = [
          {
              "role": "system",
@@ -397,9 +403,9 @@ The following example will constrain the response to valid JSON strings only.
To constrain the response further to a specific JSON Schema, add the schema to the `schema` property of the `response_format` argument.

```python
- >>> from llama_cpp import Llama
- >>> llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
- >>> llm.create_chat_completion(
+ from llama_cpp import Llama
+ llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
+ llm.create_chat_completion(
      messages = [
          {
              "role": "system",
@@ -424,9 +430,9 @@ To constrain the response further to a specific JSON Schema add the schema to th
The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format.

```python
- >>> from llama_cpp import Llama
- >>> llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
- >>> llm.create_chat_completion(
+ from llama_cpp import Llama
+ llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
+ llm.create_chat_completion(
      messages = [
          {
              "role": "system",
@@ -476,9 +482,9 @@ The various gguf-converted files for this set of models can be found [here](http
Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide an HF tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in the Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files.

```python
- >>> from llama_cpp import Llama
- >>> from llama_cpp.llama_tokenizer import LlamaHFTokenizer
- >>> llm = Llama.from_pretrained(
+ from llama_cpp import Llama
+ from llama_cpp.llama_tokenizer import LlamaHFTokenizer
+ llm = Llama.from_pretrained(
      repo_id="meetkai/functionary-small-v2.2-GGUF",
      filename="functionary-small-v2.2.q4_0.gguf",
      chat_format="functionary-v2",
@@ -504,15 +510,15 @@ You'll first need to download one of the available multi-modal models in GGUF fo
Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.

```python
- >>> from llama_cpp import Llama
- >>> from llama_cpp.llama_chat_format import Llava15ChatHandler
- >>> chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
- >>> llm = Llama(
+ from llama_cpp import Llama
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
+ chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
+ llm = Llama(
      model_path="./path/to/llava/llama-model.gguf",
      chat_handler=chat_handler,
      n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
)
- >>> llm.create_chat_completion(
+ llm.create_chat_completion(
      messages = [
          {"role": "system", "content": "You are an assistant who perfectly describes images."},
          {
@@ -709,18 +715,18 @@ The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github
Below is a short example demonstrating how to use the low-level API to tokenize a prompt:

```python
- >>> import llama_cpp
- >>> import ctypes
- >>> llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
- >>> params = llama_cpp.llama_context_default_params()
+ import llama_cpp
+ import ctypes
+ llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
+ params = llama_cpp.llama_context_default_params()
# use bytes for char * params
- >>> model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
- >>> ctx = llama_cpp.llama_new_context_with_model(model, params)
- >>> max_tokens = params.n_ctx
+ model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
+ ctx = llama_cpp.llama_new_context_with_model(model, params)
+ max_tokens = params.n_ctx
# use ctypes arrays for array params
- >>> tokens = (llama_cpp.llama_token * int(max_tokens))()
- >>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
- >>> llama_cpp.llama_free(ctx)
+ tokens = (llama_cpp.llama_token * int(max_tokens))()
+ n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
+ llama_cpp.llama_free(ctx)
```
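Continuing from the variables above, a minimal sketch (plain ctypes indexing, assuming tokenization succeeded and no additional llama.cpp calls) of how the filled token array can be read back into Python:

```python
# The ctypes array is owned by Python, so it can still be read after llama_free(ctx);
# only the first n_tokens entries were filled by llama_tokenize.
token_ids = [tokens[i] for i in range(n_tokens)]
print(n_tokens, token_ids)
```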
Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.