From 1d43e42990aaf68ad0d115cf129e22c621040839 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Wed, 9 Aug 2023 13:50:28 -0700 Subject: [PATCH] WIP pass llamacpp_dict --- docs/README_CPU.md | 2 +- docs/README_LINUX.md | 2 +- docs/README_WINDOWS.md | 2 +- src/cli.py | 1 + src/eval.py | 1 + src/gen.py | 32 ++++++++++++++++++++++---------- src/gpt4all_llm.py | 29 ++++++++++++++++------------- src/gpt_langchain.py | 4 ++++ tests/test_langchain_units.py | 27 +++++++++++++++++---------- 9 files changed, 64 insertions(+), 36 deletions(-) diff --git a/docs/README_CPU.md b/docs/README_CPU.md index 6d773fe8a..59c72ca30 100644 --- a/docs/README_CPU.md +++ b/docs/README_CPU.md @@ -27,7 +27,7 @@ For another llama.cpp model: ``` For `llama.cpp` based models on CPU, for computers with low system RAM or slow CPUs, we recommend running: ```bash - python generate.py --base_model=llama --model_path_llama=WizardLM-7B-uncensored.ggmlv3.q8_0.bin --use_mlock=False --max_seq_len=512 --n_batch=256 --score_model=None --langchain_mode='UserData' --user_path=user_path + python generate.py --base_model=llama --model_path_llama=WizardLM-7B-uncensored.ggmlv3.q8_0.bin --llamacpp_dict="{use_mlock:False,n_batch:256}" --max_seq_len=512 --score_model=None --langchain_mode='UserData' --user_path=user_path ``` ### GPT4ALL diff --git a/docs/README_LINUX.md b/docs/README_LINUX.md index 0069ea11d..bcfdc11f1 100644 --- a/docs/README_LINUX.md +++ b/docs/README_LINUX.md @@ -119,7 +119,7 @@ These instructions are for Ubuntu x86_64 (other linux would be similar with diff export FORCE_CMAKE=1 CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.73 --no-cache-dir --verbose ``` - * By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--n_gpu_layers` or setting in UI. For highest performance, offload *all* layers. + * By default, we set `n_gpu_layers` to a large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--llamacpp_dict="{n_gpu_layers:20}"` (here offloading 20 layers) or by setting it in the UI. For highest performance, offload *all* layers. That is, one gets maximum performance if one sees in startup of h2oGPT all layers offloaded: ```text llama_model_load_internal: offloaded 35/35 layers to GPU diff --git a/docs/README_WINDOWS.md b/docs/README_WINDOWS.md index 3c2b3e4bc..a9d91b94c 100644 --- a/docs/README_WINDOWS.md +++ b/docs/README_WINDOWS.md @@ -96,7 +96,7 @@ For newer builds of windows versions of 10/11. set FORCE_CMAKE=1 pip install llama-cpp-python==0.1.68 --no-cache-dir --verbose ``` - * By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--n_gpu_layers` or setting in UI. For highest performance, offload *all* layers. + * By default, we set `n_gpu_layers` to a large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--llamacpp_dict="{n_gpu_layers:20}"` (here offloading 20 layers) or by setting it in the UI. For highest performance, offload *all* layers.
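For example, an illustrative invocation that passes the option explicitly (the model file name here is simply the `gen.py` default and can be replaced with any local GGML file):
```bash
python generate.py --base_model=llama --model_path_llama=llama-2-7b-chat.ggmlv3.q8_0.bin --llamacpp_dict="{n_gpu_layers:100}"
```
With a value as large as 100, more layers are requested than the model has, so llama.cpp offloads all of them.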
That is, one gets maximum performance if one sees in startup of h2oGPT all layers offloaded: ```text llama_model_load_internal: offloaded 35/35 layers to GPU diff --git a/src/cli.py b/src/cli.py index 6f08f1d4f..90666149a 100644 --- a/src/cli.py +++ b/src/cli.py @@ -17,6 +17,7 @@ def run_cli( # for local function: use_gpu_id=None, tokenizer_base_model=None, gpu_id=None, n_jobs=None, local_files_only=None, resume_download=None, use_auth_token=None, trust_remote_code=None, offload_folder=None, rope_scaling=None, max_seq_len=None, compile_model=None, + llamacpp_dict=None, # for some evaluate args stream_output=None, async_output=None, num_async=None, prompt_type=None, prompt_dict=None, system_prompt=None, diff --git a/src/eval.py b/src/eval.py index c3f6db9d2..5e384e8aa 100644 --- a/src/eval.py +++ b/src/eval.py @@ -24,6 +24,7 @@ def run_eval( # for local function: use_gpu_id=None, tokenizer_base_model=None, gpu_id=None, n_jobs=None, local_files_only=None, resume_download=None, use_auth_token=None, trust_remote_code=None, offload_folder=None, rope_scaling=None, max_seq_len=None, compile_model=None, + llamacpp_dict=None, # for evaluate args beyond what's already above, or things that are always dynamic and locally created temperature=None, top_p=None, diff --git a/src/gen.py b/src/gen.py index ed911cd79..4a0de5aa3 100644 --- a/src/gen.py +++ b/src/gen.py @@ -72,10 +72,7 @@ def main( use_system_prompt: bool = False, # llama and gpt4all settings - n_gpu_layers: int = 100, - use_mlock: bool = True, - n_batch: int = 1024, - n_gqa: int = 8, + llamacpp_dict: typing.Dict = dict(n_gpu_layers=100, use_mlock=True, n_batch=1024, n_gqa=8), model_path_llama: str = 'llama-2-7b-chat.ggmlv3.q8_0.bin', model_name_gptj: str = 'ggml-gpt4all-j-v1.3-groovy.bin', model_name_gpt4all_llama: str = 'ggml-wizardLM-7B.q4_2.bin', @@ -261,10 +258,12 @@ def main( :param use_system_prompt: Whether to use system prompt (e.g. llama2 safe system prompt) present in prompt_type itself Independent of system_prompt, which is used for OpenAI, Replicate. - :param n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all by using large value) - :param use_mlock: when using `llama.cpp` based CPU models, for computers with low system RAM or slow CPUs, recommended False - :param n_batch: Can make smaller to 128 for slower low-memory CPU systems - :param n_gqa: Required to be 8 for LLaMa 70B + :param llamacpp_dict: + n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all, via a large value) + use_mlock: when using `llama.cpp` based CPU models, False is recommended for computers with low system RAM or slow CPUs + n_batch: can be reduced to 128 for slower, low-memory CPU systems + n_gqa: required to be 8 for LLaMa 70B + ... etc.:
anything that could be passed to llama.cpp or GPT4All models :param model_path_llama: model path or URL (for auto-download) :param model_name_gptj: model path or URL (for auto-download) :param model_name_gpt4all_llama: model path or URL (for auto-download) @@ -503,6 +502,14 @@ def main( model_lock = os.getenv('model_lock', str(model_lock)) model_lock = ast.literal_eval(model_lock) + if isinstance(llamacpp_dict, str): + llamacpp_dict = ast.literal_eval(llamacpp_dict) + # add others to single dict + llamacpp_dict['model_path_llama'] = model_path_llama + llamacpp_dict['model_name_gptj'] = model_name_gptj + llamacpp_dict['model_name_gpt4all_llama'] = model_name_gpt4all_llama + llamacpp_dict['model_name_exllama_if_no_config'] = model_name_exllama_if_no_config + if model_lock: assert gradio, "model_lock only supported for gradio=True" if len(model_lock) > 1: @@ -1152,6 +1159,7 @@ def get_model( rope_scaling: dict = None, max_seq_len: int = None, compile_model: bool = True, + llamacpp_dict=None, verbose: bool = False, ): @@ -1181,8 +1189,9 @@ def get_model( :param offload_folder: offload folder :param rope_scaling: scaling for rope-based models, e.g. "{'type':'dynamic', 'factor':4}" :param max_seq_len: override for maximum sequence length for model - :param compile_model: whether to compile torch model :param max_seq_len: if set, use as max_seq_len for model + :param compile_model: whether to compile torch model + :param llamacpp_dict: dict of llama.cpp and GPT4All model options :param verbose: :return: """ @@ -1281,7 +1290,8 @@ def get_model( if base_model in non_hf_types: from gpt4all_llm import get_model_tokenizer_gpt4all model, tokenizer, device = get_model_tokenizer_gpt4all(base_model, n_jobs=n_jobs, - max_seq_len=max_seq_len) + max_seq_len=max_seq_len, + llamacpp_dict=llamacpp_dict) return model, tokenizer, device if load_exllama: return model_loader, tokenizer, 'cuda' @@ -1554,6 +1564,7 @@ def get_score_model(score_model: str = None, offload_folder: str = None, rope_scaling: dict = None, compile_model: bool = True, + llamacpp_dict: typing.Dict = None, verbose: bool = False, ): @@ -1572,6 +1583,7 @@ def get_score_model(score_model: str = None, llama_type = False max_seq_len = None compile_model = False + llamacpp_dict = {} smodel, stokenizer, sdevice = get_model(reward_type=True, **get_kwargs(get_model, exclude_names=['reward_type'], **locals())) else: diff --git a/src/gpt4all_llm.py b/src/gpt4all_llm.py index 625c97413..96248db0e 100644 --- a/src/gpt4all_llm.py +++ b/src/gpt4all_llm.py @@ -10,7 +10,8 @@ from utils import FakeTokenizer, get_ngpus_vis -def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None): +def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None, llamacpp_dict=None): + assert llamacpp_dict is not None # defaults (some of these are generation parameters, so need to be passed in at generation time) model_name = base_model.lower() model = get_llm_gpt4all(model_name, model=None, @@ -28,6 +29,7 @@ def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None): # iinput=iinput, inner_class=True, max_seq_len=max_seq_len, + llamacpp_dict=llamacpp_dict, ) return model, FakeTokenizer(), 'cpu' @@ -45,13 +47,13 @@ def on_llm_new_token(self, token: str, **kwargs: Any) -> None: pass -def get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=[]): +def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=[]): # default from class model_kwargs = {k: v.default for k, v in dict(inspect.signature(cls).parameters).items() 
if k not in exclude_list} # from our defaults model_kwargs.update(default_kwargs) # from user defaults - model_kwargs.update(env_kwargs) + model_kwargs.update(llamacpp_dict) # ensure only valid keys func_names = list(inspect.signature(cls).parameters) model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names} @@ -78,11 +80,10 @@ def get_gpt4all_default_kwargs(max_new_tokens=256, ): if n_jobs is None: n_jobs = int(os.getenv('OMP_NUM_THREADS', str(os.cpu_count()))) - max_tokens = env_kwargs.pop('max_tokens', max_seq_len - max_new_tokens) n_gpus = get_ngpus_vis() default_kwargs = dict(context_erase=0.5, n_batch=1, - max_tokens=max_tokens, + max_tokens=max_seq_len - max_new_tokens, n_predict=max_new_tokens, repeat_last_n=64 if repetition_penalty != 1.0 else 0, repeat_penalty=repetition_penalty, @@ -96,7 +97,7 @@ def get_gpt4all_default_kwargs(max_new_tokens=256, verbose=verbose) if n_gpus != 0: default_kwargs.update(dict(n_gpu_layers=100)) - return default_kwargs, env_kwargs + return default_kwargs def get_llm_gpt4all(model_name, @@ -115,11 +116,13 @@ def get_llm_gpt4all(model_name, verbose=False, inner_class=False, max_seq_len=None, + llamacpp_dict=None, ): if not inner_class: assert prompter is not None + assert llamacpp_dict is not None - default_kwargs, env_kwargs = \ + default_kwargs = \ get_gpt4all_default_kwargs(max_new_tokens=max_new_tokens, temperature=temperature, repetition_penalty=repetition_penalty, @@ -131,8 +134,8 @@ def get_llm_gpt4all(model_name, ) if model_name == 'llama': cls = H2OLlamaCpp - model_path = env_kwargs.pop('model_path_llama') if model is None else model - model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs']) + model_path = llamacpp_dict.pop('model_path_llama') if model is None else model + model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs']) model_kwargs.update(dict(model_path=model_path, callbacks=callbacks, streaming=streaming, prompter=prompter, context=context, iinput=iinput)) llm = cls(**model_kwargs) @@ -140,8 +143,8 @@ def get_llm_gpt4all(model_name, inner_model = llm.client elif model_name == 'gpt4all_llama': cls = H2OGPT4All - model_path = env_kwargs.pop('model_name_gpt4all_llama') if model is None else model - model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs']) + model_path = llamacpp_dict.pop('model_name_gpt4all_llama') if model is None else model + model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs']) model_kwargs.update( dict(model=model_path, backend='llama', callbacks=callbacks, streaming=streaming, prompter=prompter, context=context, iinput=iinput)) @@ -149,8 +152,8 @@ def get_llm_gpt4all(model_name, inner_model = llm.client elif model_name == 'gptj': cls = H2OGPT4All - model_path = env_kwargs.pop('model_name_gptj') if model is None else model - model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs']) + model_path = llamacpp_dict.pop('model_name_gptj') if model is None else model + model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs']) model_kwargs.update( dict(model=model_path, backend='gptj', callbacks=callbacks, streaming=streaming, prompter=prompter, context=context, iinput=iinput)) diff --git a/src/gpt_langchain.py b/src/gpt_langchain.py index 468ef28ac..bce74f505 100644 --- a/src/gpt_langchain.py +++ b/src/gpt_langchain.py @@ -831,6 +831,7 @@ def get_llm(use_openai_model=False, 
system_prompt='', n_jobs=None, cli=False, + llamacpp_dict=None, verbose=False, ): if n_jobs is None: @@ -1031,6 +1032,7 @@ def get_llm(use_openai_model=False, context=context, iinput=iinput, max_seq_len=max_max_tokens, + llamacpp_dict=llamacpp_dict, ) elif hasattr(model, 'is_exlama') and model.is_exlama(): async_output = False # FIXME: not implemented yet @@ -2371,6 +2373,7 @@ def _run_qa_db(query=None, pre_prompt_summary=None, prompt_summary=None, n_jobs=-1, + llamacpp_dict=None, verbose=False, cli=False, reverse_docs=True, @@ -2452,6 +2455,7 @@ def _run_qa_db(query=None, sanitize_bot_response=sanitize_bot_response, system_prompt=system_prompt, n_jobs=n_jobs, + llamacpp_dict=llamacpp_dict, cli=cli, verbose=verbose, ) diff --git a/tests/test_langchain_units.py b/tests/test_langchain_units.py index 30c478312..736b8b2f4 100644 --- a/tests/test_langchain_units.py +++ b/tests/test_langchain_units.py @@ -57,7 +57,7 @@ def run_qa_wiki(use_openai_model=False, first_para=True, text_limit=None, chain_ sources = get_wiki_sources(first_para=first_para, text_limit=text_limit) llm, model_name, streamer, prompt_type_out, async_output = \ - get_llm(use_openai_model=use_openai_model, prompt_type=prompt_type) + get_llm(use_openai_model=use_openai_model, prompt_type=prompt_type, llamacpp_dict={}) chain = load_qa_with_sources_chain(llm, chain_type=chain_type) question = "What are the main differences between Linux and Windows?" @@ -88,7 +88,7 @@ def test_qa_wiki_db_openai(): hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2", db_type='faiss', langchain_mode='wiki', - langchain_action=LangChainAction.QUERY.value, langchain_agents=[]) + langchain_action=LangChainAction.QUERY.value, langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -105,7 +105,7 @@ def test_qa_wiki_db_hf(): db_type='faiss', langchain_mode='wiki', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -120,7 +120,7 @@ def test_qa_wiki_db_chunk_hf(): db_type='faiss', langchain_mode='wiki', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -136,7 +136,7 @@ def test_qa_wiki_db_chunk_openai(): db_type='faiss', langchain_mode='wiki', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -152,7 +152,7 @@ def test_qa_github_db_chunk_openai(): db_type='faiss', langchain_mode='github h2oGPT', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -168,7 +168,7 @@ def test_qa_daidocs_db_chunk_hf(): db_type='faiss', langchain_mode='DriverlessAI docs', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -183,6 +183,7 @@ def test_qa_daidocs_db_chunk_hf_faiss(): langchain_mode='DriverlessAI docs', langchain_action=LangChainAction.QUERY.value, langchain_agents=[], + llamacpp_dict={}, db_type='faiss', hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2", ) @@ -217,6 +218,7 @@ def test_qa_daidocs_db_chunk_hf_dbs(db_type, top_k_docs): db_type=db_type, top_k_docs=top_k_docs, model_name=model_name, + llamacpp_dict={}, ) check_ret(ret) @@ -253,6 +255,7 @@ def test_qa_daidocs_db_chunk_hf_dbs_switch_embedding(db_type): rope_scaling=None, max_seq_len=None, compile_model=True, + llamacpp_dict={}, verbose=False) model, tokenizer, device = get_model(reward_type=False, @@ -279,6 
+282,7 @@ def test_qa_daidocs_db_chunk_hf_dbs_switch_embedding(db_type): langchain_action=langchain_action, langchain_agents=langchain_agents, db_type=db_type, + llamacpp_dict={}, ) check_ret(ret) @@ -297,6 +301,7 @@ def test_qa_daidocs_db_chunk_hf_dbs_switch_embedding(db_type): langchain_action=langchain_action, langchain_agents=langchain_agents, db_type=db_type, + llamacpp_dict={}, ) check_ret(ret) @@ -307,7 +312,8 @@ def test_qa_wiki_db_chunk_hf_dbs_llama(db_type): kill_weaviate(db_type) from src.gpt4all_llm import get_model_tokenizer_gpt4all model_name = 'llama' - model, tokenizer, device = get_model_tokenizer_gpt4all(model_name) + model, tokenizer, device = get_model_tokenizer_gpt4all(model_name, + llamacpp_dict=dict(n_gpu_layers=100, use_mlock=True, n_batch=1024)) from src.gpt_langchain import _run_qa_db query = "What are the main differences between Linux and Windows?" @@ -322,6 +328,7 @@ def test_qa_wiki_db_chunk_hf_dbs_llama(db_type): prompt_type='llama2', langchain_only_model=True, model_name=model_name, model=model, tokenizer=tokenizer, + llamacpp_dict=dict(n_gpu_layers=100, use_mlock=True, n_batch=1024), ) check_ret(ret) @@ -337,7 +344,7 @@ def test_qa_daidocs_db_chunk_openai(): chunk_size=256, langchain_mode='DriverlessAI docs', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -352,7 +359,7 @@ def test_qa_daidocs_db_chunk_openaiembedding_hfmodel(): db_type='faiss', langchain_mode='DriverlessAI docs', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret)
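For reference, the `llamacpp_dict` handling introduced above (parse a string form with `ast.literal_eval`, overlay it on per-backend defaults, then keep only the keyword arguments the target class accepts) can be sketched standalone as follows. This is a simplified illustration rather than the h2oGPT code itself: `DummyLlamaCpp` is a hypothetical stand-in for a wrapper class such as `H2OLlamaCpp`, and quoted dict keys are used because plain `ast.literal_eval` requires them (the READMEs' unquoted form is handled by the CLI argument parser).
```python
import ast
import inspect


class DummyLlamaCpp:
    """Hypothetical stand-in for a llama.cpp wrapper class such as H2OLlamaCpp."""

    def __init__(self, model_path=None, n_gpu_layers=0, n_batch=8, use_mlock=False, max_tokens=256):
        # record the resolved options so the example can show what survived filtering
        self.resolved = dict(model_path=model_path, n_gpu_layers=n_gpu_layers,
                             n_batch=n_batch, use_mlock=use_mlock, max_tokens=max_tokens)


def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=()):
    # start from the target class's own signature defaults
    model_kwargs = {k: v.default for k, v in inspect.signature(cls).parameters.items()
                    if k not in exclude_list}
    # overlay library defaults, then user-supplied options (user wins)
    model_kwargs.update(default_kwargs)
    model_kwargs.update(llamacpp_dict)
    # drop anything the class does not accept, so one dict can carry options for several backends
    valid = set(inspect.signature(cls).parameters)
    return {k: v for k, v in model_kwargs.items() if k in valid}


if __name__ == '__main__':
    # a string form, e.g. as read from an environment variable or config file
    cli_value = "{'n_gpu_layers': 20, 'use_mlock': True, 'n_gqa': 8}"
    llamacpp_dict = ast.literal_eval(cli_value)  # gen.py parses string values the same way
    default_kwargs = dict(n_batch=128, max_tokens=1024)
    kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, DummyLlamaCpp,
                              exclude_list=['lc_kwargs'])
    model = DummyLlamaCpp(**kwargs)
    # n_gqa is silently dropped because DummyLlamaCpp does not accept it
    print(model.resolved)
```
The filtering step is what lets a single `llamacpp_dict` safely carry options for both the llama.cpp and GPT4All backends: per backend, unknown keys are ignored instead of raising `TypeError`.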