From 1d43e42990aaf68ad0d115cf129e22c621040839 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Wed, 9 Aug 2023 13:50:28 -0700 Subject: [PATCH] WIP pass llamacpp_dict --- docs/README_CPU.md | 2 +- docs/README_LINUX.md | 2 +- docs/README_WINDOWS.md | 2 +- src/cli.py | 1 + src/eval.py | 1 + src/gen.py | 32 ++++++++++++++++++++++---------- src/gpt4all_llm.py | 29 ++++++++++++++++------------- src/gpt_langchain.py | 4 ++++ tests/test_langchain_units.py | 27 +++++++++++++++++---------- 9 files changed, 64 insertions(+), 36 deletions(-) diff --git a/docs/README_CPU.md b/docs/README_CPU.md index 6d773fe8a..59c72ca30 100644 --- a/docs/README_CPU.md +++ b/docs/README_CPU.md @@ -27,7 +27,7 @@ For another llama.cpp model: ``` For `llama.cpp` based models on CPU, for computers with low system RAM or slow CPUs, we recommend running: ```bash - python generate.py --base_model=llama --model_path_llama=WizardLM-7B-uncensored.ggmlv3.q8_0.bin --use_mlock=False --max_seq_len=512 --n_batch=256 --score_model=None --langchain_mode='UserData' --user_path=user_path + python generate.py --base_model=llama --model_path_llama=WizardLM-7B-uncensored.ggmlv3.q8_0.bin --llamacpp_dict="{use_mlock:False,n_batch:256}" --max_seq_len=512 --score_model=None --langchain_mode='UserData' --user_path=user_path ``` ### GPT4ALL diff --git a/docs/README_LINUX.md b/docs/README_LINUX.md index 0069ea11d..bcfdc11f1 100644 --- a/docs/README_LINUX.md +++ b/docs/README_LINUX.md @@ -119,7 +119,7 @@ These instructions are for Ubuntu x86_64 (other linux would be similar with diff export FORCE_CMAKE=1 CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.73 --no-cache-dir --verbose ``` - * By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--n_gpu_layers` or setting in UI. For highest performance, offload *all* layers. + * By default, we set `n_gpu_layers` to a large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--llamacpp_dict="{n_gpu_layers:20}"` (here offloading 20 layers) or by setting it in the UI. For highest performance, offload *all* layers. That is, one gets maximum performance if one sees in startup of h2oGPT all layers offloaded: ```text llama_model_load_internal: offloaded 35/35 layers to GPU diff --git a/docs/README_WINDOWS.md b/docs/README_WINDOWS.md index 3c2b3e4bc..a9d91b94c 100644 --- a/docs/README_WINDOWS.md +++ b/docs/README_WINDOWS.md @@ -96,7 +96,7 @@ For newer builds of windows versions of 10/11. set FORCE_CMAKE=1 pip install llama-cpp-python==0.1.68 --no-cache-dir --verbose ``` - * By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--n_gpu_layers` or setting in UI. For highest performance, offload *all* layers. + * By default, we set `n_gpu_layers` to a large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--llamacpp_dict="{n_gpu_layers:20}"` (here offloading 20 layers) or by setting it in the UI. For highest performance, offload *all* layers.
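For example, an illustrative invocation that passes the option explicitly (the model file name here is simply the `gen.py` default and can be replaced with any local GGML file):
```bash
python generate.py --base_model=llama --model_path_llama=llama-2-7b-chat.ggmlv3.q8_0.bin --llamacpp_dict="{n_gpu_layers:100}"
```
With a value as large as 100, more layers are requested than the model has, so llama.cpp offloads all of them.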
That is, one gets maximum performance if one sees in startup of h2oGPT all layers offloaded: ```text llama_model_load_internal: offloaded 35/35 layers to GPU diff --git a/src/cli.py b/src/cli.py index 6f08f1d4f..90666149a 100644 --- a/src/cli.py +++ b/src/cli.py @@ -17,6 +17,7 @@ def run_cli( # for local function: use_gpu_id=None, tokenizer_base_model=None, gpu_id=None, n_jobs=None, local_files_only=None, resume_download=None, use_auth_token=None, trust_remote_code=None, offload_folder=None, rope_scaling=None, max_seq_len=None, compile_model=None, + llamacpp_dict=None, # for some evaluate args stream_output=None, async_output=None, num_async=None, prompt_type=None, prompt_dict=None, system_prompt=None, diff --git a/src/eval.py b/src/eval.py index c3f6db9d2..5e384e8aa 100644 --- a/src/eval.py +++ b/src/eval.py @@ -24,6 +24,7 @@ def run_eval( # for local function: use_gpu_id=None, tokenizer_base_model=None, gpu_id=None, n_jobs=None, local_files_only=None, resume_download=None, use_auth_token=None, trust_remote_code=None, offload_folder=None, rope_scaling=None, max_seq_len=None, compile_model=None, + llamacpp_dict=None, # for evaluate args beyond what's already above, or things that are always dynamic and locally created temperature=None, top_p=None, diff --git a/src/gen.py b/src/gen.py index ed911cd79..4a0de5aa3 100644 --- a/src/gen.py +++ b/src/gen.py @@ -72,10 +72,7 @@ def main( use_system_prompt: bool = False, # llama and gpt4all settings - n_gpu_layers: int = 100, - use_mlock: bool = True, - n_batch: int = 1024, - n_gqa: int = 8, + llamacpp_dict: typing.Dict = dict(n_gpu_layers=100, use_mlock=True, n_batch=1024, n_gqa=8), model_path_llama: str = 'llama-2-7b-chat.ggmlv3.q8_0.bin', model_name_gptj: str = 'ggml-gpt4all-j-v1.3-groovy.bin', model_name_gpt4all_llama: str = 'ggml-wizardLM-7B.q4_2.bin', @@ -261,10 +258,12 @@ def main( :param use_system_prompt: Whether to use system prompt (e.g. llama2 safe system prompt) present in prompt_type itself Independent of system_prompt, which is used for OpenAI, Replicate. - :param n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all by using large value) - :param use_mlock: when using `llama.cpp` based CPU models, for computers with low system RAM or slow CPUs, recommended False - :param n_batch: Can make smaller to 128 for slower low-memory CPU systems - :param n_gqa: Required to be 8 for LLaMa 70B + :param llamacpp_dict: + n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all, via a large value) + use_mlock: when using `llama.cpp` based CPU models, False is recommended for computers with low system RAM or slow CPUs + n_batch: can be reduced to 128 for slower, low-memory CPU systems + n_gqa: required to be 8 for LLaMa 70B + ... etc.:
anything that could be passed to llama.cpp or GPT4All models :param model_path_llama: model path or URL (for auto-download) :param model_name_gptj: model path or URL (for auto-download) :param model_name_gpt4all_llama: model path or URL (for auto-download) @@ -503,6 +502,14 @@ def main( model_lock = os.getenv('model_lock', str(model_lock)) model_lock = ast.literal_eval(model_lock) + if isinstance(llamacpp_dict, str): + llamacpp_dict = ast.literal_eval(llamacpp_dict) + # add others to single dict + llamacpp_dict['model_path_llama'] = model_path_llama + llamacpp_dict['model_name_gptj'] = model_name_gptj + llamacpp_dict['model_name_gpt4all_llama'] = model_name_gpt4all_llama + llamacpp_dict['model_name_exllama_if_no_config'] = model_name_exllama_if_no_config + if model_lock: assert gradio, "model_lock only supported for gradio=True" if len(model_lock) > 1: @@ -1152,6 +1159,7 @@ def get_model( rope_scaling: dict = None, max_seq_len: int = None, compile_model: bool = True, + llamacpp_dict=None, verbose: bool = False, ): @@ -1181,8 +1189,9 @@ def get_model( :param offload_folder: offload folder :param rope_scaling: scaling for rope-based models, e.g. "{'type':'dynamic', 'factor':4}" :param max_seq_len: override for maximum sequence length for model - :param compile_model: whether to compile torch model :param max_seq_len: if set, use as max_seq_len for model + :param compile_model: whether to compile torch model + :param llamacpp_dict: dict of llama.cpp and GPT4All model options :param verbose: :return: """ @@ -1281,7 +1290,8 @@ def get_model( if base_model in non_hf_types: from gpt4all_llm import get_model_tokenizer_gpt4all model, tokenizer, device = get_model_tokenizer_gpt4all(base_model, n_jobs=n_jobs, - max_seq_len=max_seq_len) + max_seq_len=max_seq_len, + llamacpp_dict=llamacpp_dict) return model, tokenizer, device if load_exllama: return model_loader, tokenizer, 'cuda' @@ -1554,6 +1564,7 @@ def get_score_model(score_model: str = None, offload_folder: str = None, rope_scaling: dict = None, compile_model: bool = True, + llamacpp_dict: typing.Dict = None, verbose: bool = False, ): @@ -1572,6 +1583,7 @@ def get_score_model(score_model: str = None, llama_type = False max_seq_len = None compile_model = False + llamacpp_dict = {} smodel, stokenizer, sdevice = get_model(reward_type=True, **get_kwargs(get_model, exclude_names=['reward_type'], **locals())) else: diff --git a/src/gpt4all_llm.py b/src/gpt4all_llm.py index 625c97413..96248db0e 100644 --- a/src/gpt4all_llm.py +++ b/src/gpt4all_llm.py @@ -10,7 +10,8 @@ from utils import FakeTokenizer, get_ngpus_vis -def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None): +def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None, llamacpp_dict=None): + assert llamacpp_dict is not None # defaults (some of these are generation parameters, so need to be passed in at generation time) model_name = base_model.lower() model = get_llm_gpt4all(model_name, model=None, @@ -28,6 +29,7 @@ def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None): # iinput=iinput, inner_class=True, max_seq_len=max_seq_len, + llamacpp_dict=llamacpp_dict, ) return model, FakeTokenizer(), 'cpu' @@ -45,13 +47,13 @@ def on_llm_new_token(self, token: str, **kwargs: Any) -> None: pass -def get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=[]): +def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=[]): # default from class model_kwargs = {k: v.default for k, v in dict(inspect.signature(cls).parameters).items() 
if k not in exclude_list} # from our defaults model_kwargs.update(default_kwargs) # from user defaults - model_kwargs.update(env_kwargs) + model_kwargs.update(llamacpp_dict) # ensure only valid keys func_names = list(inspect.signature(cls).parameters) model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names} @@ -78,11 +80,10 @@ def get_gpt4all_default_kwargs(max_new_tokens=256, ): if n_jobs is None: n_jobs = int(os.getenv('OMP_NUM_THREADS', str(os.cpu_count()))) - max_tokens = env_kwargs.pop('max_tokens', max_seq_len - max_new_tokens) n_gpus = get_ngpus_vis() default_kwargs = dict(context_erase=0.5, n_batch=1, - max_tokens=max_tokens, + max_tokens=max_seq_len - max_new_tokens, n_predict=max_new_tokens, repeat_last_n=64 if repetition_penalty != 1.0 else 0, repeat_penalty=repetition_penalty, @@ -96,7 +97,7 @@ def get_gpt4all_default_kwargs(max_new_tokens=256, verbose=verbose) if n_gpus != 0: default_kwargs.update(dict(n_gpu_layers=100)) - return default_kwargs, env_kwargs + return default_kwargs def get_llm_gpt4all(model_name, @@ -115,11 +116,13 @@ def get_llm_gpt4all(model_name, verbose=False, inner_class=False, max_seq_len=None, + llamacpp_dict=None, ): if not inner_class: assert prompter is not None + assert llamacpp_dict is not None - default_kwargs, env_kwargs = \ + default_kwargs = \ get_gpt4all_default_kwargs(max_new_tokens=max_new_tokens, temperature=temperature, repetition_penalty=repetition_penalty, @@ -131,8 +134,8 @@ def get_llm_gpt4all(model_name, ) if model_name == 'llama': cls = H2OLlamaCpp - model_path = env_kwargs.pop('model_path_llama') if model is None else model - model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs']) + model_path = llamacpp_dict.pop('model_path_llama') if model is None else model + model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs']) model_kwargs.update(dict(model_path=model_path, callbacks=callbacks, streaming=streaming, prompter=prompter, context=context, iinput=iinput)) llm = cls(**model_kwargs) @@ -140,8 +143,8 @@ def get_llm_gpt4all(model_name, inner_model = llm.client elif model_name == 'gpt4all_llama': cls = H2OGPT4All - model_path = env_kwargs.pop('model_name_gpt4all_llama') if model is None else model - model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs']) + model_path = llamacpp_dict.pop('model_name_gpt4all_llama') if model is None else model + model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs']) model_kwargs.update( dict(model=model_path, backend='llama', callbacks=callbacks, streaming=streaming, prompter=prompter, context=context, iinput=iinput)) @@ -149,8 +152,8 @@ def get_llm_gpt4all(model_name, inner_model = llm.client elif model_name == 'gptj': cls = H2OGPT4All - model_path = env_kwargs.pop('model_name_gptj') if model is None else model - model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs']) + model_path = llamacpp_dict.pop('model_name_gptj') if model is None else model + model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs']) model_kwargs.update( dict(model=model_path, backend='gptj', callbacks=callbacks, streaming=streaming, prompter=prompter, context=context, iinput=iinput)) diff --git a/src/gpt_langchain.py b/src/gpt_langchain.py index 468ef28ac..bce74f505 100644 --- a/src/gpt_langchain.py +++ b/src/gpt_langchain.py @@ -831,6 +831,7 @@ def get_llm(use_openai_model=False, 
system_prompt='', n_jobs=None, cli=False, + llamacpp_dict=None, verbose=False, ): if n_jobs is None: @@ -1031,6 +1032,7 @@ def get_llm(use_openai_model=False, context=context, iinput=iinput, max_seq_len=max_max_tokens, + llamacpp_dict=llamacpp_dict, ) elif hasattr(model, 'is_exlama') and model.is_exlama(): async_output = False # FIXME: not implemented yet @@ -2371,6 +2373,7 @@ def _run_qa_db(query=None, pre_prompt_summary=None, prompt_summary=None, n_jobs=-1, + llamacpp_dict=None, verbose=False, cli=False, reverse_docs=True, @@ -2452,6 +2455,7 @@ def _run_qa_db(query=None, sanitize_bot_response=sanitize_bot_response, system_prompt=system_prompt, n_jobs=n_jobs, + llamacpp_dict=llamacpp_dict, cli=cli, verbose=verbose, ) diff --git a/tests/test_langchain_units.py b/tests/test_langchain_units.py index 30c478312..736b8b2f4 100644 --- a/tests/test_langchain_units.py +++ b/tests/test_langchain_units.py @@ -57,7 +57,7 @@ def run_qa_wiki(use_openai_model=False, first_para=True, text_limit=None, chain_ sources = get_wiki_sources(first_para=first_para, text_limit=text_limit) llm, model_name, streamer, prompt_type_out, async_output = \ - get_llm(use_openai_model=use_openai_model, prompt_type=prompt_type) + get_llm(use_openai_model=use_openai_model, prompt_type=prompt_type, llamacpp_dict={}) chain = load_qa_with_sources_chain(llm, chain_type=chain_type) question = "What are the main differences between Linux and Windows?" @@ -88,7 +88,7 @@ def test_qa_wiki_db_openai(): hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2", db_type='faiss', langchain_mode='wiki', - langchain_action=LangChainAction.QUERY.value, langchain_agents=[]) + langchain_action=LangChainAction.QUERY.value, langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -105,7 +105,7 @@ def test_qa_wiki_db_hf(): db_type='faiss', langchain_mode='wiki', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -120,7 +120,7 @@ def test_qa_wiki_db_chunk_hf(): db_type='faiss', langchain_mode='wiki', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -136,7 +136,7 @@ def test_qa_wiki_db_chunk_openai(): db_type='faiss', langchain_mode='wiki', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -152,7 +152,7 @@ def test_qa_github_db_chunk_openai(): db_type='faiss', langchain_mode='github h2oGPT', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -168,7 +168,7 @@ def test_qa_daidocs_db_chunk_hf(): db_type='faiss', langchain_mode='DriverlessAI docs', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -183,6 +183,7 @@ def test_qa_daidocs_db_chunk_hf_faiss(): langchain_mode='DriverlessAI docs', langchain_action=LangChainAction.QUERY.value, langchain_agents=[], + llamacpp_dict={}, db_type='faiss', hf_embedding_model="sentence-transformers/all-MiniLM-L6-v2", ) @@ -217,6 +218,7 @@ def test_qa_daidocs_db_chunk_hf_dbs(db_type, top_k_docs): db_type=db_type, top_k_docs=top_k_docs, model_name=model_name, + llamacpp_dict={}, ) check_ret(ret) @@ -253,6 +255,7 @@ def test_qa_daidocs_db_chunk_hf_dbs_switch_embedding(db_type): rope_scaling=None, max_seq_len=None, compile_model=True, + llamacpp_dict={}, verbose=False) model, tokenizer, device = get_model(reward_type=False, @@ -279,6 
+282,7 @@ def test_qa_daidocs_db_chunk_hf_dbs_switch_embedding(db_type): langchain_action=langchain_action, langchain_agents=langchain_agents, db_type=db_type, + llamacpp_dict={}, ) check_ret(ret) @@ -297,6 +301,7 @@ def test_qa_daidocs_db_chunk_hf_dbs_switch_embedding(db_type): langchain_action=langchain_action, langchain_agents=langchain_agents, db_type=db_type, + llamacpp_dict={}, ) check_ret(ret) @@ -307,7 +312,8 @@ def test_qa_wiki_db_chunk_hf_dbs_llama(db_type): kill_weaviate(db_type) from src.gpt4all_llm import get_model_tokenizer_gpt4all model_name = 'llama' - model, tokenizer, device = get_model_tokenizer_gpt4all(model_name) + model, tokenizer, device = get_model_tokenizer_gpt4all(model_name, + llamacpp_dict=dict(n_gpu_layers=100, use_mlock=True, n_batch=1024)) from src.gpt_langchain import _run_qa_db query = "What are the main differences between Linux and Windows?" @@ -322,6 +328,7 @@ def test_qa_wiki_db_chunk_hf_dbs_llama(db_type): prompt_type='llama2', langchain_only_model=True, model_name=model_name, model=model, tokenizer=tokenizer, + llamacpp_dict=dict(n_gpu_layers=100, use_mlock=True, n_batch=1024), ) check_ret(ret) @@ -337,7 +344,7 @@ def test_qa_daidocs_db_chunk_openai(): chunk_size=256, langchain_mode='DriverlessAI docs', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret) @@ -352,7 +359,7 @@ def test_qa_daidocs_db_chunk_openaiembedding_hfmodel(): db_type='faiss', langchain_mode='DriverlessAI docs', langchain_action=LangChainAction.QUERY.value, - langchain_agents=[]) + langchain_agents=[], llamacpp_dict={}) check_ret(ret)
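For reference, the `llamacpp_dict` handling introduced above (parse a string form with `ast.literal_eval`, overlay it on per-backend defaults, then keep only the keyword arguments the target class accepts) can be sketched standalone as follows. This is a simplified illustration rather than the h2oGPT code itself: `DummyLlamaCpp` is a hypothetical stand-in for a wrapper class such as `H2OLlamaCpp`, and quoted dict keys are used because plain `ast.literal_eval` requires them (the READMEs' unquoted form is handled by the CLI argument parser).
```python
import ast
import inspect


class DummyLlamaCpp:
    """Hypothetical stand-in for a llama.cpp wrapper class such as H2OLlamaCpp."""

    def __init__(self, model_path=None, n_gpu_layers=0, n_batch=8, use_mlock=False, max_tokens=256):
        # record the resolved options so the example can show what survived filtering
        self.resolved = dict(model_path=model_path, n_gpu_layers=n_gpu_layers,
                             n_batch=n_batch, use_mlock=use_mlock, max_tokens=max_tokens)


def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=()):
    # start from the target class's own signature defaults
    model_kwargs = {k: v.default for k, v in inspect.signature(cls).parameters.items()
                    if k not in exclude_list}
    # overlay library defaults, then user-supplied options (user wins)
    model_kwargs.update(default_kwargs)
    model_kwargs.update(llamacpp_dict)
    # drop anything the class does not accept, so one dict can carry options for several backends
    valid = set(inspect.signature(cls).parameters)
    return {k: v for k, v in model_kwargs.items() if k in valid}


if __name__ == '__main__':
    # a string form, e.g. as read from an environment variable or config file
    cli_value = "{'n_gpu_layers': 20, 'use_mlock': True, 'n_gqa': 8}"
    llamacpp_dict = ast.literal_eval(cli_value)  # gen.py parses string values the same way
    default_kwargs = dict(n_batch=128, max_tokens=1024)
    kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, DummyLlamaCpp,
                              exclude_list=['lc_kwargs'])
    model = DummyLlamaCpp(**kwargs)
    # n_gqa is silently dropped because DummyLlamaCpp does not accept it
    print(model.resolved)
```
The filtering step is what lets a single `llamacpp_dict` safely carry options for both the llama.cpp and GPT4All backends: per backend, unknown keys are ignored instead of raising `TypeError`.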