
Commit

WIP pass llamacpp_dict
pseudotensor committed Aug 9, 2023
1 parent 8acc7fe commit 1d43e42
Showing 9 changed files with 64 additions and 36 deletions.
2 changes: 1 addition & 1 deletion docs/README_CPU.md
@@ -27,7 +27,7 @@ For another llama.cpp model:
```
For `llama.cpp` based models on CPU, for computers with low system RAM or slow CPUs, we recommend running:
```bash
python generate.py --base_model=llama --model_path_llama=WizardLM-7B-uncensored.ggmlv3.q8_0.bin --use_mlock=False --max_seq_len=512 --n_batch=256 --score_model=None --langchain_mode='UserData' --user_path=user_path
python generate.py --base_model=llama --model_path_llama=WizardLM-7B-uncensored.ggmlv3.q8_0.bin --llamacpp_dict="{use_mlock:False,n_batch:256}" --max_seq_len=512 --score_model=None --langchain_mode='UserData' --user_path=user_path
```
### GPT4ALL
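A minimal sketch of how the new `--llamacpp_dict` string is consumed: it is parsed as a Python dict literal (see the `ast.literal_eval` call added to `src/gen.py` below), so a fully quoted form is assumed here because `ast.literal_eval` needs valid Python literals.

```python
import ast

# Hypothetical CLI value for --llamacpp_dict, written with quoted keys so
# ast.literal_eval (used in src/gen.py in this commit) can parse it.
cli_value = "{'use_mlock': False, 'n_batch': 256}"
llamacpp_dict = ast.literal_eval(cli_value)
assert llamacpp_dict == {'use_mlock': False, 'n_batch': 256}
```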
2 changes: 1 addition & 1 deletion docs/README_LINUX.md
@@ -119,7 +119,7 @@ These instructions are for Ubuntu x86_64 (other linux would be similar with diff
export FORCE_CMAKE=1
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.73 --no-cache-dir --verbose
```
* By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--n_gpu_layers` or setting in UI. For highest performance, offload *all* layers.
* By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--llamacpp_dict="{n_gpu_layers=20}"` for value 20, or setting in UI. For highest performance, offload *all* layers.
That is, one gets maximum performance if one sees in startup of h2oGPT all layers offloaded:
```text
llama_model_load_internal: offloaded 35/35 layers to GPU
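The same GPU-offload control written out as a dict, a small sketch only: the value 20 matches the example above and is not a recommendation, and the quoted-key form is an assumption so the string parses as a Python literal.

```python
# Offload 20 layers to GPU via llamacpp_dict; the docs above recommend a large
# value (offload all layers) for maximum performance.
llamacpp_dict = {'n_gpu_layers': 20}

# Equivalent CLI form (quoted keys assumed):
#   python generate.py --base_model=llama --llamacpp_dict="{'n_gpu_layers': 20}"
```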
2 changes: 1 addition & 1 deletion docs/README_WINDOWS.md
@@ -96,7 +96,7 @@ For newer builds of windows versions of 10/11.
set FORCE_CMAKE=1
pip install llama-cpp-python==0.1.68 --no-cache-dir --verbose
```
* By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--n_gpu_layers` or setting in UI. For highest performance, offload *all* layers.
* By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--llamacpp_dict="{n_gpu_layers=20}"` for value 20, or setting in UI. For highest performance, offload *all* layers.
That is, one gets maximum performance if one sees in startup of h2oGPT all layers offloaded:
```text
llama_model_load_internal: offloaded 35/35 layers to GPU
1 change: 1 addition & 0 deletions src/cli.py
@@ -17,6 +17,7 @@ def run_cli( # for local function:
use_gpu_id=None, tokenizer_base_model=None,
gpu_id=None, n_jobs=None, local_files_only=None, resume_download=None, use_auth_token=None,
trust_remote_code=None, offload_folder=None, rope_scaling=None, max_seq_len=None, compile_model=None,
llamacpp_dict=None,
# for some evaluate args
stream_output=None, async_output=None, num_async=None,
prompt_type=None, prompt_dict=None, system_prompt=None,
1 change: 1 addition & 0 deletions src/eval.py
@@ -24,6 +24,7 @@ def run_eval( # for local function:
use_gpu_id=None, tokenizer_base_model=None,
gpu_id=None, n_jobs=None, local_files_only=None, resume_download=None, use_auth_token=None,
trust_remote_code=None, offload_folder=None, rope_scaling=None, max_seq_len=None, compile_model=None,
llamacpp_dict=None,
# for evaluate args beyond what's already above, or things that are always dynamic and locally created
temperature=None,
top_p=None,
32 changes: 22 additions & 10 deletions src/gen.py
@@ -72,10 +72,7 @@ def main(
use_system_prompt: bool = False,

# llama and gpt4all settings
n_gpu_layers: int = 100,
use_mlock: bool = True,
n_batch: int = 1024,
n_gqa: int = 8,
llamacpp_dict: typing.Dict = dict(n_gpu_layers=100, use_mlock=True, n_batch=1024, n_gqa=8),
model_path_llama: str = 'llama-2-7b-chat.ggmlv3.q8_0.bin',
model_name_gptj: str = 'ggml-gpt4all-j-v1.3-groovy.bin',
model_name_gpt4all_llama: str = 'ggml-wizardLM-7B.q4_2.bin',
@@ -261,10 +258,12 @@ def main(
:param use_system_prompt: Whether to use system prompt (e.g. llama2 safe system prompt) present in prompt_type itself
Independent of system_prompt, which is used for OpenAI, Replicate.
:param n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all by using large value)
:param use_mlock: when using `llama.cpp` based CPU models, for computers with low system RAM or slow CPUs, recommended False
:param n_batch: Can make smaller to 128 for slower low-memory CPU systems
:param n_gqa: Required to be 8 for LLaMa 70B
:param llamacpp_dict:
n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all by using large value)
use_mlock: when using `llama.cpp` based CPU models, for computers with low system RAM or slow CPUs, recommended False
n_batch: Can make smaller to 128 for slower low-memory CPU systems
n_gqa: Required to be 8 for LLaMa 70B
... etc. anything that could be passed to llama.cpp or GPT4All models
:param model_path_llama: model path or URL (for auto-download)
:param model_name_gptj: model path or URL (for auto-download)
:param model_name_gpt4all_llama: model path or URL (for auto-download)
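For orientation, a sketch of a `llamacpp_dict` spelled out with only the keys documented in the docstring above; the values restate the defaults and notes given there, not new recommendations.

```python
# Illustrative only: keys and guidance mirror the docstring above.
llamacpp_dict = dict(
    n_gpu_layers=100,  # large value => llama.cpp offloads all layers to GPU
    use_mlock=True,    # recommended False on low-RAM or slow-CPU machines
    n_batch=1024,      # can be reduced to ~128 for slower low-memory CPU systems
    n_gqa=8,           # required to be 8 for LLaMa 70B
)
```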
@@ -503,6 +502,14 @@ def main(
model_lock = os.getenv('model_lock', str(model_lock))
model_lock = ast.literal_eval(model_lock)

if isinstance(llamacpp_dict, str):
llamacpp_dict = ast.literal_eval(llamacpp_dict)
# add others to single dict
llamacpp_dict['model_path_llama'] = model_path_llama
llamacpp_dict['model_name_gptj'] = model_name_gptj
llamacpp_dict['model_name_gpt4all_llama'] = model_name_gpt4all_llama
llamacpp_dict['model_name_exllama_if_no_config'] = model_name_exllama_if_no_config

if model_lock:
assert gradio, "model_lock only supported for gradio=True"
if len(model_lock) > 1:
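To make the new block above easier to follow, here is a self-contained sketch of the same logic as a standalone helper; the function wrapper is my own framing, while the names and assignments mirror the diff.

```python
import ast
from typing import Union

def build_llamacpp_dict(llamacpp_dict: Union[dict, str],
                        model_path_llama: str,
                        model_name_gptj: str,
                        model_name_gpt4all_llama: str,
                        model_name_exllama_if_no_config: str) -> dict:
    # Accept either a real dict or the string form coming from the CLI/env.
    if isinstance(llamacpp_dict, str):
        llamacpp_dict = ast.literal_eval(llamacpp_dict)
    # Fold the per-backend model path/name options into the single dict,
    # as main() now does.
    llamacpp_dict['model_path_llama'] = model_path_llama
    llamacpp_dict['model_name_gptj'] = model_name_gptj
    llamacpp_dict['model_name_gpt4all_llama'] = model_name_gpt4all_llama
    llamacpp_dict['model_name_exllama_if_no_config'] = model_name_exllama_if_no_config
    return llamacpp_dict
```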
@@ -1152,6 +1159,7 @@ def get_model(
rope_scaling: dict = None,
max_seq_len: int = None,
compile_model: bool = True,
llamacpp_dict=None,

verbose: bool = False,
):
@@ -1181,8 +1189,9 @@ def get_model(
:param offload_folder: offload folder
:param rope_scaling: scaling for rope-based models, e.g. "{'type':'dynamic', 'factor':4}"
:param max_seq_len: override for maximum sequence length for model
:param compile_model: whether to compile torch model
:param max_seq_len: if set, use as max_seq_len for model
:param compile_model: whether to compile torch model
:param llamacpp_dict: dict of llama.cpp and GPT4All model options
:param verbose:
:return:
"""
@@ -1281,7 +1290,8 @@ def get_model(
if base_model in non_hf_types:
from gpt4all_llm import get_model_tokenizer_gpt4all
model, tokenizer, device = get_model_tokenizer_gpt4all(base_model, n_jobs=n_jobs,
max_seq_len=max_seq_len)
max_seq_len=max_seq_len,
llamacpp_dict=llamacpp_dict)
return model, tokenizer, device
if load_exllama:
return model_loader, tokenizer, 'cuda'
@@ -1554,6 +1564,7 @@ def get_score_model(score_model: str = None,
offload_folder: str = None,
rope_scaling: dict = None,
compile_model: bool = True,
llamacpp_dict: typing.Dict = None,

verbose: bool = False,
):
@@ -1572,6 +1583,7 @@
llama_type = False
max_seq_len = None
compile_model = False
llamacpp_dict = {}
smodel, stokenizer, sdevice = get_model(reward_type=True,
**get_kwargs(get_model, exclude_names=['reward_type'], **locals()))
else:
29 changes: 16 additions & 13 deletions src/gpt4all_llm.py
@@ -10,7 +10,8 @@
from utils import FakeTokenizer, get_ngpus_vis


def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None):
def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None, llamacpp_dict=None):
assert llamacpp_dict is not None
# defaults (some of these are generation parameters, so need to be passed in at generation time)
model_name = base_model.lower()
model = get_llm_gpt4all(model_name, model=None,
Expand All @@ -28,6 +29,7 @@ def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None):
# iinput=iinput,
inner_class=True,
max_seq_len=max_seq_len,
llamacpp_dict=llamacpp_dict,
)
return model, FakeTokenizer(), 'cpu'

@@ -45,13 +47,13 @@ def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
pass


def get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=[]):
def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=[]):
# default from class
model_kwargs = {k: v.default for k, v in dict(inspect.signature(cls).parameters).items() if k not in exclude_list}
# from our defaults
model_kwargs.update(default_kwargs)
# from user defaults
model_kwargs.update(env_kwargs)
model_kwargs.update(llamacpp_dict)
# ensure only valid keys
func_names = list(inspect.signature(cls).parameters)
model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names}
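To see the precedence that `get_model_kwargs` establishes, here is a runnable sketch with a toy class standing in for the real llama.cpp wrapper; the class, its signature, and the example values are invented for illustration.

```python
import inspect

class FakeLlamaCpp:
    """Toy stand-in; not the real H2OLlamaCpp signature."""
    def __init__(self, model_path=None, n_batch=8, temperature=0.8, use_mlock=False):
        pass

def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=()):
    # 1) start from the constructor's own defaults
    model_kwargs = {k: v.default for k, v in inspect.signature(cls).parameters.items()
                    if k not in exclude_list}
    # 2) overlay h2oGPT defaults, then 3) the user's llamacpp_dict (highest precedence)
    model_kwargs.update(default_kwargs)
    model_kwargs.update(llamacpp_dict)
    # 4) keep only keys the constructor actually accepts
    valid = set(inspect.signature(cls).parameters)
    return {k: v for k, v in model_kwargs.items() if k in valid}

kwargs = get_model_kwargs({'n_batch': 256, 'model_path_llama': 'x.bin'},
                          {'temperature': 0.1, 'n_batch': 1024},
                          FakeLlamaCpp)
# user value for n_batch wins; the unknown key 'model_path_llama' is filtered out
assert kwargs == {'model_path': None, 'n_batch': 256, 'temperature': 0.1, 'use_mlock': False}
```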
@@ -78,11 +80,10 @@ def get_gpt4all_default_kwargs(max_new_tokens=256,
):
if n_jobs is None:
n_jobs = int(os.getenv('OMP_NUM_THREADS', str(os.cpu_count())))
max_tokens = env_kwargs.pop('max_tokens', max_seq_len - max_new_tokens)
n_gpus = get_ngpus_vis()
default_kwargs = dict(context_erase=0.5,
n_batch=1,
max_tokens=max_tokens,
max_tokens=max_seq_len - max_new_tokens,
n_predict=max_new_tokens,
repeat_last_n=64 if repetition_penalty != 1.0 else 0,
repeat_penalty=repetition_penalty,
Expand All @@ -96,7 +97,7 @@ def get_gpt4all_default_kwargs(max_new_tokens=256,
verbose=verbose)
if n_gpus != 0:
default_kwargs.update(dict(n_gpu_layers=100))
return default_kwargs, env_kwargs
return default_kwargs
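A tiny numeric illustration of the two computed defaults above; `max_seq_len=2048` is an assumed example value, while `max_new_tokens=256` is the function's default.

```python
# Assumed example: max_seq_len=2048 is illustrative; max_new_tokens=256 is the
# default of get_gpt4all_default_kwargs above.
max_seq_len, max_new_tokens = 2048, 256
max_tokens = max_seq_len - max_new_tokens          # 2048 - 256 == 1792

n_gpus = 1                                         # pretend one GPU is visible
default_kwargs = dict(n_batch=1, max_tokens=max_tokens)
if n_gpus != 0:
    default_kwargs.update(dict(n_gpu_layers=100))  # offload all layers by default
assert default_kwargs == {'n_batch': 1, 'max_tokens': 1792, 'n_gpu_layers': 100}
```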


def get_llm_gpt4all(model_name,
@@ -115,11 +116,13 @@ def get_llm_gpt4all(model_name,
verbose=False,
inner_class=False,
max_seq_len=None,
llamacpp_dict=None,
):
if not inner_class:
assert prompter is not None
assert llamacpp_dict is not None

default_kwargs, env_kwargs = \
default_kwargs = \
get_gpt4all_default_kwargs(max_new_tokens=max_new_tokens,
temperature=temperature,
repetition_penalty=repetition_penalty,
Expand All @@ -131,26 +134,26 @@ def get_llm_gpt4all(model_name,
)
if model_name == 'llama':
cls = H2OLlamaCpp
model_path = env_kwargs.pop('model_path_llama') if model is None else model
model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_path = llamacpp_dict.pop('model_path_llama') if model is None else model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(dict(model_path=model_path, callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput))
llm = cls(**model_kwargs)
llm.client.verbose = verbose
inner_model = llm.client
elif model_name == 'gpt4all_llama':
cls = H2OGPT4All
model_path = env_kwargs.pop('model_name_gpt4all_llama') if model is None else model
model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_path = llamacpp_dict.pop('model_name_gpt4all_llama') if model is None else model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(
dict(model=model_path, backend='llama', callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput))
llm = cls(**model_kwargs)
inner_model = llm.client
elif model_name == 'gptj':
cls = H2OGPT4All
model_path = env_kwargs.pop('model_name_gptj') if model is None else model
model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_path = llamacpp_dict.pop('model_name_gptj') if model is None else model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(
dict(model=model_path, backend='gptj', callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput))
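A minimal sketch (my own helper, not code from the repo) of how `get_llm_gpt4all` above now selects the model file from `llamacpp_dict` per backend, popping the key so only constructor-relevant options remain.

```python
def pick_model_path(model_name: str, llamacpp_dict: dict, model=None) -> str:
    # Mapping mirrors the branches in get_llm_gpt4all above.
    key_by_backend = {
        'llama': 'model_path_llama',
        'gpt4all_llama': 'model_name_gpt4all_llama',
        'gptj': 'model_name_gptj',
    }
    # An explicitly provided model wins; otherwise take (and remove) the dict entry.
    return model if model is not None else llamacpp_dict.pop(key_by_backend[model_name])

opts = {'model_path_llama': 'llama-2-7b-chat.ggmlv3.q8_0.bin', 'n_batch': 256}
assert pick_model_path('llama', opts) == 'llama-2-7b-chat.ggmlv3.q8_0.bin'
assert opts == {'n_batch': 256}  # the path key was popped out of the options dict
```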
4 changes: 4 additions & 0 deletions src/gpt_langchain.py
@@ -831,6 +831,7 @@ def get_llm(use_openai_model=False,
system_prompt='',
n_jobs=None,
cli=False,
llamacpp_dict=None,
verbose=False,
):
if n_jobs is None:
@@ -1031,6 +1032,7 @@ def get_llm(use_openai_model=False,
context=context,
iinput=iinput,
max_seq_len=max_max_tokens,
llamacpp_dict=llamacpp_dict,
)
elif hasattr(model, 'is_exlama') and model.is_exlama():
async_output = False # FIXME: not implemented yet
@@ -2371,6 +2373,7 @@ def _run_qa_db(query=None,
pre_prompt_summary=None,
prompt_summary=None,
n_jobs=-1,
llamacpp_dict=None,
verbose=False,
cli=False,
reverse_docs=True,
@@ -2452,6 +2455,7 @@ def _run_qa_db(query=None,
sanitize_bot_response=sanitize_bot_response,
system_prompt=system_prompt,
n_jobs=n_jobs,
llamacpp_dict=llamacpp_dict,
cli=cli,
verbose=verbose,
)
(Diff for the remaining changed file not loaded.)
