
Commit

WIP pass llamacpp_dict
pseudotensor committed Aug 9, 2023
1 parent 8acc7fe commit 1d43e42
Showing 9 changed files with 64 additions and 36 deletions.
2 changes: 1 addition & 1 deletion docs/README_CPU.md
@@ -27,7 +27,7 @@ For another llama.cpp model:
```
For `llama.cpp` based models on CPU, for computers with low system RAM or slow CPUs, we recommend running:
```bash
python generate.py --base_model=llama --model_path_llama=WizardLM-7B-uncensored.ggmlv3.q8_0.bin --use_mlock=False --max_seq_len=512 --n_batch=256 --score_model=None --langchain_mode='UserData' --user_path=user_path
python generate.py --base_model=llama --model_path_llama=WizardLM-7B-uncensored.ggmlv3.q8_0.bin --llamacpp_dict="{use_mlock:False,n_batch:256}" --max_seq_len=512 --score_model=None --langchain_mode='UserData' --user_path=user_path
```
### GPT4ALL
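A minimal sketch of how the new `--llamacpp_dict` string is consumed: it is parsed as a Python dict literal (see the `ast.literal_eval` call added to `src/gen.py` below), so a fully quoted form is assumed here because `ast.literal_eval` needs valid Python literals.

```python
import ast

# Hypothetical CLI value for --llamacpp_dict, written with quoted keys so
# ast.literal_eval (used in src/gen.py in this commit) can parse it.
cli_value = "{'use_mlock': False, 'n_batch': 256}"
llamacpp_dict = ast.literal_eval(cli_value)
assert llamacpp_dict == {'use_mlock': False, 'n_batch': 256}
```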
2 changes: 1 addition & 1 deletion docs/README_LINUX.md
@@ -119,7 +119,7 @@ These instructions are for Ubuntu x86_64 (other linux would be similar with diff
export FORCE_CMAKE=1
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.73 --no-cache-dir --verbose
```
* By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--n_gpu_layers` or setting in UI. For highest performance, offload *all* layers.
* By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--llamacpp_dict="{n_gpu_layers=20}"` for value 20, or setting in UI. For highest performance, offload *all* layers.
That is, one gets maximum performance if one sees in startup of h2oGPT all layers offloaded:
```text
llama_model_load_internal: offloaded 35/35 layers to GPU
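The same GPU-offload control written out as a dict, a small sketch only: the value 20 matches the example above and is not a recommendation, and the quoted-key form is an assumption so the string parses as a Python literal.

```python
# Offload 20 layers to GPU via llamacpp_dict; the docs above recommend a large
# value (offload all layers) for maximum performance.
llamacpp_dict = {'n_gpu_layers': 20}

# Equivalent CLI form (quoted keys assumed):
#   python generate.py --base_model=llama --llamacpp_dict="{'n_gpu_layers': 20}"
```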
2 changes: 1 addition & 1 deletion docs/README_WINDOWS.md
@@ -96,7 +96,7 @@ For newer builds of windows versions of 10/11.
set FORCE_CMAKE=1
pip install llama-cpp-python==0.1.68 --no-cache-dir --verbose
```
* By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--n_gpu_layers` or setting in UI. For highest performance, offload *all* layers.
* By default, we set `n_gpu_layers` to large value, so llama.cpp offloads all layers for maximum GPU performance. You can control this by passing `--llamacpp_dict="{n_gpu_layers=20}"` for value 20, or setting in UI. For highest performance, offload *all* layers.
That is, one gets maximum performance if one sees in startup of h2oGPT all layers offloaded:
```text
llama_model_load_internal: offloaded 35/35 layers to GPU
1 change: 1 addition & 0 deletions src/cli.py
@@ -17,6 +17,7 @@ def run_cli( # for local function:
use_gpu_id=None, tokenizer_base_model=None,
gpu_id=None, n_jobs=None, local_files_only=None, resume_download=None, use_auth_token=None,
trust_remote_code=None, offload_folder=None, rope_scaling=None, max_seq_len=None, compile_model=None,
llamacpp_dict=None,
# for some evaluate args
stream_output=None, async_output=None, num_async=None,
prompt_type=None, prompt_dict=None, system_prompt=None,
1 change: 1 addition & 0 deletions src/eval.py
@@ -24,6 +24,7 @@ def run_eval( # for local function:
use_gpu_id=None, tokenizer_base_model=None,
gpu_id=None, n_jobs=None, local_files_only=None, resume_download=None, use_auth_token=None,
trust_remote_code=None, offload_folder=None, rope_scaling=None, max_seq_len=None, compile_model=None,
llamacpp_dict=None,
# for evaluate args beyond what's already above, or things that are always dynamic and locally created
temperature=None,
top_p=None,
32 changes: 22 additions & 10 deletions src/gen.py
@@ -72,10 +72,7 @@ def main(
use_system_prompt: bool = False,

# llama and gpt4all settings
n_gpu_layers: int = 100,
use_mlock: bool = True,
n_batch: int = 1024,
n_gqa: int = 8,
llamacpp_dict: typing.Dict = dict(n_gpu_layers=100, use_mlock=True, n_batch=1024, n_gqa=8),
model_path_llama: str = 'llama-2-7b-chat.ggmlv3.q8_0.bin',
model_name_gptj: str = 'ggml-gpt4all-j-v1.3-groovy.bin',
model_name_gpt4all_llama: str = 'ggml-wizardLM-7B.q4_2.bin',
@@ -261,10 +258,12 @@ def main(
:param use_system_prompt: Whether to use system prompt (e.g. llama2 safe system prompt) present in prompt_type itself
Independent of system_prompt, which is used for OpenAI, Replicate.
:param n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all by using large value)
:param use_mlock: when using `llama.cpp` based CPU models, for computers with low system RAM or slow CPUs, recommended False
:param n_batch: Can make smaller to 128 for slower low-memory CPU systems
:param n_gqa: Required to be 8 for LLaMa 70B
:param llamacpp_dict:
n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all by using large value)
use_mlock: when using `llama.cpp` based CPU models, for computers with low system RAM or slow CPUs, recommended False
n_batch: Can make smaller to 128 for slower low-memory CPU systems
n_gqa: Required to be 8 for LLaMa 70B
... etc. anything that could be passed to llama.cpp or GPT4All models
:param model_path_llama: model path or URL (for auto-download)
:param model_name_gptj: model path or URL (for auto-download)
:param model_name_gpt4all_llama: model path or URL (for auto-download)
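For orientation, a sketch of a `llamacpp_dict` spelled out with only the keys documented in the docstring above; the values restate the defaults and notes given there, not new recommendations.

```python
# Illustrative only: keys and guidance mirror the docstring above.
llamacpp_dict = dict(
    n_gpu_layers=100,  # large value => llama.cpp offloads all layers to GPU
    use_mlock=True,    # recommended False on low-RAM or slow-CPU machines
    n_batch=1024,      # can be reduced to ~128 for slower low-memory CPU systems
    n_gqa=8,           # required to be 8 for LLaMa 70B
)
```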
@@ -503,6 +502,14 @@ def main(
model_lock = os.getenv('model_lock', str(model_lock))
model_lock = ast.literal_eval(model_lock)

if isinstance(llamacpp_dict, str):
llamacpp_dict = ast.literal_eval(llamacpp_dict)
# add others to single dict
llamacpp_dict['model_path_llama'] = model_path_llama
llamacpp_dict['model_name_gptj'] = model_name_gptj
llamacpp_dict['model_name_gpt4all_llama'] = model_name_gpt4all_llama
llamacpp_dict['model_name_exllama_if_no_config'] = model_name_exllama_if_no_config

if model_lock:
assert gradio, "model_lock only supported for gradio=True"
if len(model_lock) > 1:
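To make the new block above easier to follow, here is a self-contained sketch of the same logic as a standalone helper; the function wrapper is my own framing, while the names and assignments mirror the diff.

```python
import ast
from typing import Union

def build_llamacpp_dict(llamacpp_dict: Union[dict, str],
                        model_path_llama: str,
                        model_name_gptj: str,
                        model_name_gpt4all_llama: str,
                        model_name_exllama_if_no_config: str) -> dict:
    # Accept either a real dict or the string form coming from the CLI/env.
    if isinstance(llamacpp_dict, str):
        llamacpp_dict = ast.literal_eval(llamacpp_dict)
    # Fold the per-backend model path/name options into the single dict,
    # as main() now does.
    llamacpp_dict['model_path_llama'] = model_path_llama
    llamacpp_dict['model_name_gptj'] = model_name_gptj
    llamacpp_dict['model_name_gpt4all_llama'] = model_name_gpt4all_llama
    llamacpp_dict['model_name_exllama_if_no_config'] = model_name_exllama_if_no_config
    return llamacpp_dict
```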
@@ -1152,6 +1159,7 @@ def get_model(
rope_scaling: dict = None,
max_seq_len: int = None,
compile_model: bool = True,
llamacpp_dict=None,

verbose: bool = False,
):
@@ -1181,8 +1189,9 @@ def get_model(
:param offload_folder: offload folder
:param rope_scaling: scaling for rope-based models, e.g. "{'type':'dynamic', 'factor':4}"
:param max_seq_len: override for maximum sequence length for model
:param compile_model: whether to compile torch model
:param max_seq_len: if set, use as max_seq_len for model
:param compile_model: whether to compile torch model
:param llamacpp_dict: dict of llama.cpp and GPT4All model options
:param verbose:
:return:
"""
@@ -1281,7 +1290,8 @@ def get_model(
if base_model in non_hf_types:
from gpt4all_llm import get_model_tokenizer_gpt4all
model, tokenizer, device = get_model_tokenizer_gpt4all(base_model, n_jobs=n_jobs,
max_seq_len=max_seq_len)
max_seq_len=max_seq_len,
llamacpp_dict=llamacpp_dict)
return model, tokenizer, device
if load_exllama:
return model_loader, tokenizer, 'cuda'
@@ -1554,6 +1564,7 @@ def get_score_model(score_model: str = None,
offload_folder: str = None,
rope_scaling: dict = None,
compile_model: bool = True,
llamacpp_dict: typing.Dict = None,

verbose: bool = False,
):
@@ -1572,6 +1583,7 @@
llama_type = False
max_seq_len = None
compile_model = False
llamacpp_dict = {}
smodel, stokenizer, sdevice = get_model(reward_type=True,
**get_kwargs(get_model, exclude_names=['reward_type'], **locals()))
else:
29 changes: 16 additions & 13 deletions src/gpt4all_llm.py
@@ -10,7 +10,8 @@
from utils import FakeTokenizer, get_ngpus_vis


def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None):
def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None, llamacpp_dict=None):
assert llamacpp_dict is not None
# defaults (some of these are generation parameters, so need to be passed in at generation time)
model_name = base_model.lower()
model = get_llm_gpt4all(model_name, model=None,
Expand All @@ -28,6 +29,7 @@ def get_model_tokenizer_gpt4all(base_model, n_jobs=None, max_seq_len=None):
# iinput=iinput,
inner_class=True,
max_seq_len=max_seq_len,
llamacpp_dict=llamacpp_dict,
)
return model, FakeTokenizer(), 'cpu'

@@ -45,13 +47,13 @@ def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
pass


def get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=[]):
def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=[]):
# default from class
model_kwargs = {k: v.default for k, v in dict(inspect.signature(cls).parameters).items() if k not in exclude_list}
# from our defaults
model_kwargs.update(default_kwargs)
# from user defaults
model_kwargs.update(env_kwargs)
model_kwargs.update(llamacpp_dict)
# ensure only valid keys
func_names = list(inspect.signature(cls).parameters)
model_kwargs = {k: v for k, v in model_kwargs.items() if k in func_names}
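To see the precedence that `get_model_kwargs` establishes, here is a runnable sketch with a toy class standing in for the real llama.cpp wrapper; the class, its signature, and the example values are invented for illustration.

```python
import inspect

class FakeLlamaCpp:
    """Toy stand-in; not the real H2OLlamaCpp signature."""
    def __init__(self, model_path=None, n_batch=8, temperature=0.8, use_mlock=False):
        pass

def get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=()):
    # 1) start from the constructor's own defaults
    model_kwargs = {k: v.default for k, v in inspect.signature(cls).parameters.items()
                    if k not in exclude_list}
    # 2) overlay h2oGPT defaults, then 3) the user's llamacpp_dict (highest precedence)
    model_kwargs.update(default_kwargs)
    model_kwargs.update(llamacpp_dict)
    # 4) keep only keys the constructor actually accepts
    valid = set(inspect.signature(cls).parameters)
    return {k: v for k, v in model_kwargs.items() if k in valid}

kwargs = get_model_kwargs({'n_batch': 256, 'model_path_llama': 'x.bin'},
                          {'temperature': 0.1, 'n_batch': 1024},
                          FakeLlamaCpp)
# user value for n_batch wins; the unknown key 'model_path_llama' is filtered out
assert kwargs == {'model_path': None, 'n_batch': 256, 'temperature': 0.1, 'use_mlock': False}
```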
@@ -78,11 +80,10 @@ def get_gpt4all_default_kwargs(max_new_tokens=256,
):
if n_jobs is None:
n_jobs = int(os.getenv('OMP_NUM_THREADS', str(os.cpu_count())))
max_tokens = env_kwargs.pop('max_tokens', max_seq_len - max_new_tokens)
n_gpus = get_ngpus_vis()
default_kwargs = dict(context_erase=0.5,
n_batch=1,
max_tokens=max_tokens,
max_tokens=max_seq_len - max_new_tokens,
n_predict=max_new_tokens,
repeat_last_n=64 if repetition_penalty != 1.0 else 0,
repeat_penalty=repetition_penalty,
Expand All @@ -96,7 +97,7 @@ def get_gpt4all_default_kwargs(max_new_tokens=256,
verbose=verbose)
if n_gpus != 0:
default_kwargs.update(dict(n_gpu_layers=100))
return default_kwargs, env_kwargs
return default_kwargs
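A tiny numeric illustration of the two computed defaults above; `max_seq_len=2048` is an assumed example value, while `max_new_tokens=256` is the function's default.

```python
# Assumed example: max_seq_len=2048 is illustrative; max_new_tokens=256 is the
# default of get_gpt4all_default_kwargs above.
max_seq_len, max_new_tokens = 2048, 256
max_tokens = max_seq_len - max_new_tokens          # 2048 - 256 == 1792

n_gpus = 1                                         # pretend one GPU is visible
default_kwargs = dict(n_batch=1, max_tokens=max_tokens)
if n_gpus != 0:
    default_kwargs.update(dict(n_gpu_layers=100))  # offload all layers by default
assert default_kwargs == {'n_batch': 1, 'max_tokens': 1792, 'n_gpu_layers': 100}
```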


def get_llm_gpt4all(model_name,
@@ -115,11 +116,13 @@ def get_llm_gpt4all(model_name,
verbose=False,
inner_class=False,
max_seq_len=None,
llamacpp_dict=None,
):
if not inner_class:
assert prompter is not None
assert llamacpp_dict is not None

default_kwargs, env_kwargs = \
default_kwargs = \
get_gpt4all_default_kwargs(max_new_tokens=max_new_tokens,
temperature=temperature,
repetition_penalty=repetition_penalty,
Expand All @@ -131,26 +134,26 @@ def get_llm_gpt4all(model_name,
)
if model_name == 'llama':
cls = H2OLlamaCpp
model_path = env_kwargs.pop('model_path_llama') if model is None else model
model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_path = llamacpp_dict.pop('model_path_llama') if model is None else model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(dict(model_path=model_path, callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput))
llm = cls(**model_kwargs)
llm.client.verbose = verbose
inner_model = llm.client
elif model_name == 'gpt4all_llama':
cls = H2OGPT4All
model_path = env_kwargs.pop('model_name_gpt4all_llama') if model is None else model
model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_path = llamacpp_dict.pop('model_name_gpt4all_llama') if model is None else model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(
dict(model=model_path, backend='llama', callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput))
llm = cls(**model_kwargs)
inner_model = llm.client
elif model_name == 'gptj':
cls = H2OGPT4All
model_path = env_kwargs.pop('model_name_gptj') if model is None else model
model_kwargs = get_model_kwargs(env_kwargs, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_path = llamacpp_dict.pop('model_name_gptj') if model is None else model
model_kwargs = get_model_kwargs(llamacpp_dict, default_kwargs, cls, exclude_list=['lc_kwargs'])
model_kwargs.update(
dict(model=model_path, backend='gptj', callbacks=callbacks, streaming=streaming,
prompter=prompter, context=context, iinput=iinput))
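A minimal sketch (my own helper, not code from the repo) of how `get_llm_gpt4all` above now selects the model file from `llamacpp_dict` per backend, popping the key so only constructor-relevant options remain.

```python
def pick_model_path(model_name: str, llamacpp_dict: dict, model=None) -> str:
    # Mapping mirrors the branches in get_llm_gpt4all above.
    key_by_backend = {
        'llama': 'model_path_llama',
        'gpt4all_llama': 'model_name_gpt4all_llama',
        'gptj': 'model_name_gptj',
    }
    # An explicitly provided model wins; otherwise take (and remove) the dict entry.
    return model if model is not None else llamacpp_dict.pop(key_by_backend[model_name])

opts = {'model_path_llama': 'llama-2-7b-chat.ggmlv3.q8_0.bin', 'n_batch': 256}
assert pick_model_path('llama', opts) == 'llama-2-7b-chat.ggmlv3.q8_0.bin'
assert opts == {'n_batch': 256}  # the path key was popped out of the options dict
```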
4 changes: 4 additions & 0 deletions src/gpt_langchain.py
@@ -831,6 +831,7 @@ def get_llm(use_openai_model=False,
system_prompt='',
n_jobs=None,
cli=False,
llamacpp_dict=None,
verbose=False,
):
if n_jobs is None:
@@ -1031,6 +1032,7 @@ def get_llm(use_openai_model=False,
context=context,
iinput=iinput,
max_seq_len=max_max_tokens,
llamacpp_dict=llamacpp_dict,
)
elif hasattr(model, 'is_exlama') and model.is_exlama():
async_output = False # FIXME: not implemented yet
@@ -2371,6 +2373,7 @@ def _run_qa_db(query=None,
pre_prompt_summary=None,
prompt_summary=None,
n_jobs=-1,
llamacpp_dict=None,
verbose=False,
cli=False,
reverse_docs=True,
@@ -2452,6 +2455,7 @@ def _run_qa_db(query=None,
sanitize_bot_response=sanitize_bot_response,
system_prompt=system_prompt,
n_jobs=n_jobs,
llamacpp_dict=llamacpp_dict,
cli=cli,
verbose=verbose,
)
(Diff for the remaining changed file not loaded.)
