# gen.py
import ast
import copy
import functools
import inspect
import queue
import sys
import os
import json
import time
import traceback
import typing
import uuid
import warnings
from datetime import datetime
from random import randint
import filelock
import requests
if os.path.dirname(os.path.abspath(__file__)) not in sys.path:
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
try:
    from importlib.metadata import distribution, PackageNotFoundError
    assert distribution('hf_transfer') is not None
    have_hf_transfer = True
except (PackageNotFoundError, AssertionError):
    have_hf_transfer = False
if have_hf_transfer and os.getenv('HF_HUB_ENABLE_HF_TRANSFER', 'None') != '0':
    os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
os.environ['SCARF_NO_ANALYTICS'] = 'true'
os.environ['DO_NOT_TRACK'] = 'true'
os.environ['OTEL_SDK_DISABLED'] = 'true'
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
os.environ['BITSANDBYTES_NOWELCOME'] = '1'
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
os.environ['FIFTYONE_SHOW_PROGRESS_BARS'] = 'false'
# more is not useful typically, don't let these go beyond limits and eat up resources
max_cores = max(1, os.cpu_count() // 2)
if os.getenv('NUMEXPR_MAX_THREADS') is None:
    os.environ['NUMEXPR_MAX_THREADS'] = str(min(8, max_cores))
if os.getenv('NUMEXPR_NUM_THREADS') is None:
    os.environ['NUMEXPR_NUM_THREADS'] = str(min(8, max_cores))
if os.getenv('OMP_NUM_THREADS') is None:
    os.environ['OMP_NUM_THREADS'] = str(min(8, max_cores))
if os.getenv('OPENBLAS_NUM_THREADS') is None:
    os.environ['OPENBLAS_NUM_THREADS'] = str(min(8, max_cores))
if os.getenv('DUCKDB_NUM_THREADS') is None:
    os.environ['DUCKDB_NUM_THREADS'] = str(min(4, max_cores))
if os.getenv('RAYON_RS_NUM_CPUS') is None:
    os.environ['RAYON_RS_NUM_CPUS'] = str(min(8, max_cores))
if os.getenv('RAYON_NUM_THREADS') is None:
    os.environ['RAYON_NUM_THREADS'] = str(min(8, max_cores))
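# Illustrative note (not from upstream source): each cap above only applies when the
# variable is unset, so it can be overridden at launch, e.g. OMP_NUM_THREADS=4 python generate.py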
from gradio_funcs import merge_chat_conversation_history
from db_utils import fetch_user
from model_utils import switch_a_roo_llama, get_score_model, get_model_retry, get_model, \
    get_client_from_inference_server, model_lock_to_state
from evaluate_params import eval_func_param_names, no_default_param_names, input_args_list
from enums import DocumentSubset, LangChainMode, no_lora_str, no_model_str, \
    LangChainAction, LangChainAgent, DocumentChoice, LangChainTypes, super_source_prefix, \
    super_source_postfix, t5_type, get_langchain_prompts, gr_to_lg, invalid_key_msg, docs_joiner_default, \
    docs_ordering_types_default, docs_token_handling_default, max_input_tokens_public, max_total_input_tokens_public, \
    max_top_k_docs_public, max_top_k_docs_default, max_total_input_tokens_public_api, max_top_k_docs_public_api, \
    max_input_tokens_public_api, anthropic_mapping, \
    base_langchain_actions, generic_prefix, \
    generic_postfix, langchain_modes_intrinsic, valid_imagechange_models, \
    valid_imagegen_models, valid_imagestyle_models, \
    langchain_modes0, langchain_mode_types0, langchain_mode_paths0, \
    llava_num_max, response_formats, noop_prompt_type, unknown_prompt_type, \
    json_object_prompt0, json_object_prompt_simpler0, json_code_prompt0, user_prompt_for_fake_system_prompt0, \
    json_schema_instruction0, json_code_prompt_if_no_schema0, my_db_state0, empty_prompt_type, is_gradio_vision_model, \
    is_json_model, is_vision_model, \
    model_state_none0, other_model_state_defaults0, image_batch_image_prompt0, image_batch_final_prompt0, \
    tokens_per_image, openai_supports_functiontools, openai_supports_parallel_functiontools
from utils import set_seed, clear_torch_cache, NullContext, wrapped_partial, EThread, get_githash, \
    import_matplotlib, get_device, makedirs, get_kwargs, start_faulthandler, get_hf_server, \
    have_langchain, set_openai, cuda_vis_check, H2O_Fire, lg_to_gr, str_to_list, str_to_dict, get_token_count, \
    have_wavio, have_soundfile, have_deepspeed, have_doctr, have_librosa, have_TTS, have_flash_attention_2, \
    have_diffusers, sanitize_filename, get_gradio_tmp, get_is_gradio_h2oai, get_json, \
    get_docs_tokens, deduplicate_names, have_autogen, get_model_name, is_empty
start_faulthandler()
import_matplotlib()
SEED = 1236
set_seed(SEED)
from typing import Union
import torch
from transformers import GenerationConfig, TextIteratorStreamer
from prompter import Prompter, non_hf_types, PromptType, get_prompt, generate_prompt, \
    openai_gpts, get_vllm_extra_dict, gradio_to_llm, history_for_llm, apply_chat_template, model_name_to_prompt_type
from stopping import get_stopping
from prompter_utils import get_use_chat_template, base64_decode_jinja_template
langchain_actions = [x.value for x in list(LangChainAction)]
langchain_agents_list = [x.value for x in list(LangChainAgent)]
def main(
load_8bit: bool = False,
load_4bit: bool = False,
low_bit_mode: int = 1,
load_half: bool = None,
use_flash_attention_2=False,
load_gptq: str = '',
use_autogptq: bool = False,
load_awq: str = '',
load_exllama: bool = False,
use_safetensors: bool = True,
revision: str = None,
use_gpu_id: bool = True,
base_model: str = '',
display_name: str = None,
tokenizer_base_model: str = '',
lora_weights: str = "",
gpu_id: int = 0,
compile_model: bool = None,
use_cache: bool = None,
inference_server: str = "",
regenerate_clients: bool = True,
regenerate_gradio_clients: bool = False,
validate_clients: bool = True,
fail_if_invalid_client: bool = False,
prompt_type: Union[int, str] = None,
prompt_dict: typing.Dict = None,
chat_template: str = '',
system_prompt: str = 'auto',
allow_chat_system_prompt: bool = True,
# llama and gpt4all settings
llamacpp_path: str = 'llamacpp_path',
llamacpp_dict: typing.Dict = dict(n_gpu_layers=100, use_mlock=True, n_batch=1024, n_gqa=0),
model_path_llama: str = '',
model_name_gptj: str = '',
model_name_gpt4all_llama: str = '',
model_name_exllama_if_no_config: str = '',
exllama_dict: typing.Dict = dict(),
gptq_dict: typing.Dict = dict(),
attention_sinks: bool = False,
sink_dict: typing.Dict = dict(),
truncation_generation: bool = False,
hf_model_dict: typing.Dict = dict(),
force_seq2seq_type: bool = False,
force_t5_type: bool = False,
model_lock: typing.List[typing.Dict[str, str]] = None,
model_lock_columns: int = None,
model_lock_layout_based_upon_initial_visible: bool = False,
fail_if_cannot_connect: bool = False,
# input to generation
temperature: float = None,
top_p: float = None,
top_k: int = None,
penalty_alpha: float = None,
num_beams: int = None,
repetition_penalty: float = None,
num_return_sequences: int = None,
do_sample: bool = None,
seed: int = None,
max_new_tokens: int = None,
min_new_tokens: int = None,
early_stopping: Union[bool, str] = None,
max_time: float = None,
memory_restriction_level: int = None,
debug: bool = False,
save_dir: str = None,
local_files_only: bool = False,
resume_download: bool = True,
use_auth_token: Union[str, bool] = False,
admin_pass: str = None,
trust_remote_code: Union[str, bool] = True,
rope_scaling: dict = None,
max_seq_len: int = None,
max_output_seq_len: int = None,
offload_folder: str = "offline_folder",
src_lang: str = "English",
tgt_lang: str = "Russian",
prepare_offline_level: int = 0,
cli: bool = False,
cli_loop: bool = True,
eval: bool = False,
gradio: bool = True,
function: bool = False,
force_streaming_on_to_handle_timeouts: bool = True,
openai_server: bool = True,
openai_port: int = 5001 if sys.platform == "darwin" else 5000,
openai_workers: int = 1,
function_server: bool = False,
function_server_port: int = 5003 if sys.platform == "darwin" else 5002,
function_server_workers: int = 1,
function_api_key: str = None,
agent_server: bool = False, # WIP
agent_port: int = 5004 if sys.platform == "darwin" else 5004,
agent_workers: int = 1,
multiple_workers_gunicorn: bool = False,
gradio_offline_level: int = 0,
server_name: str = "0.0.0.0",
share: bool = False,
open_browser: bool = False,
close_button: bool = True,
shutdown_via_api: bool = False,
root_path: str = "",
ssl_verify: bool = True,
ssl_keyfile: str | None = None,
ssl_certfile: str | None = None,
ssl_keyfile_password: str | None = None,
chat: bool = True,
chat_conversation: typing.List[typing.Tuple[str, str]] = None,
text_context_list: typing.List[str] = None,
stream_output: bool = True,
async_output: bool = True,
num_async: int = 3,
stream_map: bool = False,
show_examples: bool = None,
verbose: bool = False,
h2ocolors: bool = True,
dark: bool = False, # light tends to be best
height: int = 600,
render_markdown: bool = True,
show_lora: bool = True,
show_llama: bool = True,
show_gpt4all: bool = False,
login_mode_if_model0: bool = False,
block_gradio_exit: bool = True,
concurrency_count: int = None,
api_open: bool = False,
allow_api: bool = True,
system_api_open: bool = False,
input_lines: int = 1,
gradio_size: str = None,
show_copy_button: bool = True,
large_file_count_mode: bool = False,
gradio_ui_stream_chunk_size: int = None,
gradio_ui_stream_chunk_min_seconds: float = 0.2,
gradio_ui_stream_chunk_seconds: float = 2.0,
gradio_api_use_same_stream_limits: bool = True,
gradio_upload_to_chatbot: bool = False,
gradio_upload_to_chatbot_num_max: int = 2,
gradio_errors_to_chatbot: bool = True,
pre_load_embedding_model: bool = True,
embedding_gpu_id: Union[int, str] = 'auto',
auth: Union[typing.List[typing.Tuple[str, str]], str] = None,
auth_filename: str = None,
auth_access: str = 'open',
auth_freeze: bool = False,
auth_message: str = None,
google_auth: bool = False,
guest_name: str = None,
enforce_h2ogpt_api_key: bool = None,
enforce_h2ogpt_ui_key: bool = None,
h2ogpt_api_keys: Union[list, str] = [],
h2ogpt_key: str = None,
extra_allowed_paths: list = [],
blocked_paths: list = [],
max_max_time=None,
max_max_new_tokens=None,
visible_models: list = None,
max_visible_models: int = None,
visible_ask_anything_high: bool = True,
visible_visible_models: bool = True,
visible_submit_buttons: bool = True,
visible_side_bar: bool = True,
visible_document_subset: bool = True,
visible_max_quality: bool = True,
visible_add_doc_to_chat: bool = True,
visible_chat_history: bool = True,
visible_doc_track: bool = True,
visible_chat_tab: bool = True,
visible_doc_selection_tab: bool = True,
visible_doc_view_tab: bool = True,
visible_chat_history_tab: bool = True,
visible_expert_tab: bool = True,
visible_models_tab: bool = True,
visible_system_tab: bool = True,
visible_tos_tab: bool = False,
visible_login_tab: bool = True,
visible_hosts_tab: bool = False,
visible_langchain_action_radio: bool = True,
chat_tabless: bool = False,
visible_h2ogpt_links: bool = True,
visible_h2ogpt_qrcode: bool = True,
visible_h2ogpt_logo: bool = True,
visible_chatbot_label: bool = True,
visible_all_prompter_models: bool = False,
visible_curated_models: bool = True,
actions_in_sidebar: bool = False,
document_choice_in_sidebar: bool = True,
enable_add_models_to_list_ui: bool = False,
max_raw_chunks: int = None,
pdf_height: int = 800,
avatars: bool = True,
add_disk_models_to_ui: bool = True,
page_title: str = "h2oGPT",
model_label_prefix: str = "h2oGPT",
favicon_path: str = None,
visible_ratings: bool = False,
reviews_file: str = None,
sanitize_user_prompt: bool = False,
sanitize_bot_response: bool = False,
extra_model_options: typing.List[str] = [],
extra_lora_options: typing.List[str] = [],
extra_server_options: typing.List[str] = [],
score_model: str = 'auto',
verifier_model: str = None,
verifier_tokenizer_base_model: str = None,
verifier_inference_server: str = None,
eval_filename: str = None,
eval_prompts_only_num: int = 0,
eval_prompts_only_seed: int = 1234,
eval_as_output: bool = False,
langchain_mode: str = None,
user_path: str = None,
langchain_modes: list = langchain_modes0,
langchain_mode_paths: dict = langchain_mode_paths0,
langchain_mode_types: dict = langchain_mode_types0,
detect_user_path_changes_every_query: bool = False,
update_selection_state_from_cli: bool = True,
langchain_action: str = LangChainAction.QUERY.value,
langchain_agents: list = [],
force_langchain_evaluate: bool = False,
visible_langchain_actions: list = base_langchain_actions.copy(),
visible_langchain_agents: list = langchain_agents_list.copy(),
document_subset: str = DocumentSubset.Relevant.name,
document_choice: list = [DocumentChoice.ALL.value],
document_source_substrings: list = [],
document_source_substrings_op: str = 'and',
document_content_substrings: list = [],
document_content_substrings_op: str = 'and',
use_llm_if_no_docs: bool = True,
load_db_if_exists: bool = True,
keep_sources_in_context: bool = False,
db_type: str = 'chroma',
use_openai_embedding: bool = False,
use_openai_model: bool = False,
hf_embedding_model: str = None,
migrate_embedding_model: str = False,
cut_distance: float = 1.64,
answer_with_sources: bool = True,
append_sources_to_answer: bool = False,
append_sources_to_chat: bool = True,
sources_show_text_in_accordion: bool = True,
top_k_docs_max_show: int = 10,
show_link_in_sources: bool = True,
langchain_instruct_mode: bool = True,
pre_prompt_query: str = None,
prompt_query: str = None,
pre_prompt_summary: str = None,
prompt_summary: str = None,
hyde_llm_prompt: str = None,
all_docs_start_prompt: str = 'auto',
all_docs_finish_prompt: str = 'auto',
user_prompt_for_fake_system_prompt: str = None,
json_object_prompt: str = None,
json_object_prompt_simpler: str = None,
json_code_prompt: str = None,
json_code_prompt_if_no_schema: str = None,
json_schema_instruction: str = None,
json_preserve_system_prompt: bool = False,
add_chat_history_to_context: bool = True,
add_search_to_context: bool = False,
context: str = '',
iinput: str = '',
allow_upload_to_user_data: bool = True,
reload_langchain_state: bool = True,
allow_upload_to_my_data: bool = True,
enable_url_upload: bool = True,
enable_text_upload: bool = True,
enable_sources_list: bool = True,
chunk: bool = True,
chunk_size: int = 512,
top_k_docs: int = None,
docs_ordering_type: str = docs_ordering_types_default,
min_max_new_tokens=512,
max_input_tokens=None,
max_total_input_tokens=None,
docs_token_handling: str = docs_token_handling_default,
docs_joiner: str = docs_joiner_default,
hyde_level: int = 0,
hyde_template: str = None,
hyde_show_only_final: bool = False,
hyde_show_intermediate_in_accordion: bool = True,
map_reduce_show_intermediate_in_accordion: bool = True,
doc_json_mode: bool = False,
metadata_in_context: Union[str, list] = 'auto',
auto_reduce_chunks: bool = True,
max_chunks: int = 100,
headsize: int = 50,
n_jobs: int = -1,
n_gpus: int = None,
clear_torch_cache_level: int = 1,
# urls
use_unstructured: bool = True,
use_playwright: bool = False,
use_selenium: bool = False,
use_scrapeplaywright: bool = False,
use_scrapehttp: bool = False,
# pdfs
use_pymupdf: Union[bool, str] = 'auto',
use_unstructured_pdf: Union[bool, str] = 'auto',
use_pypdf: Union[bool, str] = 'auto',
enable_pdf_ocr: Union[bool, str] = 'auto',
enable_pdf_doctr: Union[bool, str] = 'auto',
try_pdf_as_html: Union[bool, str] = 'auto',
# images
enable_ocr: bool = False,
enable_doctr: bool = True,
enable_pix2struct: bool = False,
enable_captions: bool = True,
enable_llava: bool = True,
enable_transcriptions: bool = True,
pre_load_image_audio_models: bool = False,
caption_gpu: bool = True,
caption_gpu_id: Union[int, str] = 'auto',
captions_model: str = "microsoft/Florence-2-base",
doctr_gpu: bool = True,
doctr_gpu_id: Union[int, str] = 'auto',
llava_model: str = None,
llava_prompt: str = 'auto',
image_file: str = None,
image_control: str = None,
images_num_max: int = None,
image_resolution: tuple = None,
image_format: str = None,
rotate_align_resize_image: bool = None,
video_frame_period: int = None,
image_batch_image_prompt: str = None,
image_batch_final_prompt: str = None,
image_batch_stream: bool = False,
visible_vision_models: Union[str, int, list] = None,
video_file: str = None,
response_format: str = 'text',
guided_json: Union[str, dict] = '',
guided_regex: str = '',
guided_choice: typing.List[str] = None,
guided_grammar: str = '',
guided_whitespace_pattern: str = None,
asr_model: str = "openai/whisper-medium",
asr_gpu: bool = True,
asr_gpu_id: Union[int, str] = 'auto',
asr_use_better: bool = True,
asr_use_faster: bool = False,
enable_stt: Union[str, bool] = 'auto',
stt_model: str = "openai/whisper-base.en",
stt_gpu: bool = True,
stt_gpu_id: Union[int, str] = 'auto',
stt_continue_mode: int = 1,
enable_tts: Union[str, bool] = 'auto',
tts_gpu: bool = True,
tts_gpu_id: Union[int, str] = 'auto',
tts_model: str = 'microsoft/speecht5_tts',
tts_gan_model: str = 'microsoft/speecht5_hifigan',
tts_coquiai_deepspeed: bool = False,
tts_coquiai_roles: dict = None,
chatbot_role: str = "None", # "Female AI Assistant",
speaker: str = "None", # "SLT (female)",
tts_language: str = 'autodetect',
tts_speed: float = 1.0,
tts_action_phrases: typing.List[str] = [], # ['Nimbus'],
tts_stop_phrases: typing.List[str] = [], # ['Yonder'],
sst_floor: float = 100,
enable_image: bool = False,
visible_image_models: typing.List[str] = [],
image_gpu_ids: typing.List[Union[str, int]] = None,
enable_llava_chat: bool = False,
# json
jq_schema='.[]',
extract_frames: int = 10,
max_quality: bool = False,
enable_heap_analytics: bool = True,
heap_app_id: str = "1680123994",
cert_lookup_directory: str = "/etc/ssl/more-certs",
):
    """
:param load_8bit: load model in 8-bit using bitsandbytes
:param load_4bit: load model in 4-bit using bitsandbytes
:param low_bit_mode: 0: no quantization config, 1: change compute, 2: nf4, 3: double quant, 4: both 2 and 3
See: https://huggingface.co/docs/transformers/main_classes/quantization
If using older bitsandbytes or transformers, 0 is required
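For illustration only (hypothetical values, not from upstream docs), a possible invocation:
e.g. python generate.py --base_model=HuggingFaceH4/zephyr-7b-beta --load_4bit=True --low_bit_mode=4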
:param load_half: load model in float16 (None means auto, which means True unless t5 based model)
otherwise specify bool
:param use_flash_attention_2: Whether to try to use flash attention 2 if available when loading HF models
Warning: We have seen nans and type mismatches with flash-attn==2.3.4 installed and this enabled,
even for other models like embedding model that is unrelated to primary models.
:param load_gptq: to load model with GPTQ, put model_basename here, e.g. 'model' for TheBloke models
:param use_autogptq: whether to use AutoGPTQ (True) or HF Transformers (False)
Some models are only supported by one or the other
:param load_awq: load model with AWQ, e.g. 'model' for TheBloke models
:param load_exllama: whether to use exllama (only applicable to LLaMa1/2 models with 16-bit or GPTQ)
:param use_safetensors: to use safetensors version (assumes file/HF points to safe tensors version)
:param revision: Which HF revision to use
:param use_gpu_id: whether to control devices with gpu_id. If False, then spread across GPUs
:param base_model: model HF-type name. If use --base_model to preload model, cannot unload in gradio in models tab
:param display_name: display name for model (used in UI and API to access)
:param tokenizer_base_model: tokenizer HF-type name. Usually not required, inferred from base_model.
If model is private or doesn't exist as HF model, can use "tiktoken" and pass max_seq_len and (if different) max_output_seq_len
For inference servers like OpenAI etc. if have model name, we use tiktoken with known input/output sequence lengths.
:param lora_weights: LORA weights path/HF link
:param gpu_id: if use_gpu_id, then use gpu_id for cuda device ID, or auto mode if gpu_id != -1
:param compile_model: Whether to compile the model
:param use_cache: Whether to use caching in model (some models fail when multiple threads use)
:param inference_server: Consume base_model as type of model at this address
Address can be text-generation-server hosting that base_model
e.g. python generate.py --inference_server="http://192.168.1.46:6112" --base_model=HuggingFaceH4/zephyr-7b-beta
For a gradio server, use same as TGI server. We infer if it's TGI or Gradio.
e.g. python generate.py --inference_server="http://192.168.1.46:7860" --base_model=HuggingFaceH4/zephyr-7b-beta
For auth protected gradio, do:
e.g. python generate.py --inference_server="http://192.168.1.46:7860:user:password" --base_model=HuggingFaceH4/zephyr-7b-beta
If don't want to specify port, do:
e.g. python generate.py --inference_server="https://gpt.h2o.ai:None:user:password" --base_model=HuggingFaceH4/zephyr-7b-beta
Or Address can be "openai_chat" or "openai" for OpenAI API
Or Address can be "openai_azure_chat" or "openai_azure" for Azure OpenAI API
e.g. python generate.py --inference_server="openai_chat" --base_model=gpt-3.5-turbo
e.g. python generate.py --inference_server="openai" --base_model=text-davinci-003
e.g. python generate.py --inference_server="openai_azure_chat:<deployment_name>:<baseurl>:<api_version>:<access key>" --base_model=gpt-3.5-turbo
e.g. python generate.py --inference_server="openai_azure:<deployment_name>:<baseurl>:<api_version>:<access key>" --base_model=text-davinci-003
Optionals (Replace with None or just leave empty but keep :)
<deployment_name> of some deployment name
<baseurl>: e.g. "<endpoint>.openai.azure.com" for some <endpoint> without https://
<api_version> of some api, e.g. 2023-05-15
Or Address can be for vLLM:
Use: "vllm:IP:port" for OpenAI-compliant vLLM endpoint
Use: "vllm_chat:IP:port" for OpenAI-Chat-compliant vLLM endpoint
Use: "vllm:http://IP:port/v1" for OpenAI-compliant vLLM endpoint
Use: "vllm_chat:http://IP:port/v1" for OpenAI-Chat-compliant vLLM endpoint
Use: "vllm:https://IP/v1" for OpenAI-compliant vLLM endpoint
Use: "vllm_chat:https://IP/v1" for OpenAI-Chat-compliant vLLM endpoint
For example, for standard URL and API key for vllm, one would do:
vllm_chat:https://vllm.h2o.ai:None:/v1:1234ABCD
or for non-standard URL:
vllm_chat:https://vllm.h2o.ai:None:/1b1219f7-4bb4-43e9-881f-fa8fa9fe6e04/v1:1234ABCD
where vllm.h2o.ai is the DNS name of the IP, None means no extra port, so will be dropped from base_url when using API, /1b1219f7-4bb4-43e9-881f-fa8fa9fe6e04/v1 is the url of the "page" to access, and 1234ABCD is the api key
Or for example:
vllm_chat:https://vllm.h2o.ai:5001:/1b1219f7-4bb4-43e9-881f-fa8fa9fe6e04/v1:1234ABCD
where vllm.h2o.ai is the DNS name of the IP, 5001 is the port, /1b1219f7-4bb4-43e9-881f-fa8fa9fe6e04/v1 is the url of the "page" to access, and 1234ABCD is the api key
If you have any other OpenAI compatible chat completion endpoint, you should use vllm_chat way. E.g. llama.cpp http server: https://github.com/ggerganov/llama.cpp/tree/master/examples/server
For sglang, text models are supported via OpenAI API and can use vllm_chat or vllm as usual.
For sglang and vision models, need to specify sglang so we use http requests API via generate endpoint. Use "sglang" prefix and otherwise it is like vllm endpoint
Currently it's not clear how to make an API key work: https://github.com/sgl-project/sglang/issues/466, so one should rely upon firewalls
One should also pass the name of the python module used for conversation, e.g. for
python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --tokenizer-path lmms-lab/llama3-llava-next-8b-tokenizer --port=30000 --host="0.0.0.0" --tp-size=1 --random-seed=1234 --context-length=8192
One should use:
sglang:conv_llava_llama_3:http://IP:port
For together.ai that is OpenAI compliant, use:
vllm_chat:https://api.together.xyz:None:/v1:1234ABCD
Or for groq, can use OpenAI API like:
GROQ IS BROKEN FOR OPENAI API:
vllm:https://api.groq.com/openai:None:/v1:<api key>'
with: other model_lock or CLI options: {'inference_server': 'vllm:https://api.groq.com/openai:None:/v1:<api key>', 'base_model':'mixtral-8x7b-32768', 'visible_models':'mixtral-8x7b-32768', 'max_seq_len': 31744, 'prompt_type':'plain'}
i.e. ensure to use 'plain' prompt, not mixtral.
For groq:
use "groq" and ensure env GROQ_API_KEY is set
or groq:<api key>
with: other model_lock or CLI options: {'inference_server': 'groq:<api key>', 'base_model':'mixtral-8x7b-32768', 'visible_models':'mixtral-8x7b-32768', 'max_seq_len': 31744, 'prompt_type':'plain'}
Or Address can be replicate:
Use:
--inference_server=replicate:<model name string> will use a Replicate server, requiring a Replicate key.
e.g. <model name string> looks like "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5"
Or Address can be for AWS SageMaker:
Use: "sagemaker_chat:<endpoint name>" for chat models that AWS sets up as dialog
Use: "sagemaker:<endpoint name>" for foundation models that AWS only text as inputs
Or Address can be for Anthropic Claude. Ensure key is set in env ANTHROPIC_API_KEY
Use: "anthropic
E.g. --base_model=claude-2.1 --inference_server=anthropic
Or Address can be for Google Gemini. Ensure key is set in env GOOGLE_API_KEY
Use: "google"
E.g. --base_model=gemini-pro --inference_server=google
Or Address can be for MistralAI. Ensure key is set in env MISTRAL_API_KEY
Use: "mistralai"
E.g. --base_model=mistral-medium --inference_server=mistralai
:param regenerate_clients: Whether to regenerate client every LLM call or use start-up version
Benefit of regenerating per LLM call is that the timeout can be controlled via max_time in expert settings; otherwise the default of 600s is used.
Maybe risky, some lack of thread safety: https://github.com/encode/httpx/discussions/3043, so disabled
Because gradio clients take long time to start-up, we don't ever regenerate them each time (including llava models)
:param regenerate_gradio_clients: Whether to also regenerate gradio clients (slow)
:param validate_clients: Whether to validate clients, and if invalid, do not add them to list (e.g. if OpenAI API key is invalid, then just report in logs, do not hard fail, but do not add the model to model list)
Currently only done for OpenAI or vLLM endpoints
:param fail_if_invalid_client: Whether to fail hard if any client fails validation
:param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
:param prompt_dict: If prompt_type=custom, then expects (some) items returned by get_prompt(..., return_dict=True)
:param chat_template: jinja HF transformers chat_template to use. '' or None means no change to template
Sometimes hard to pass string with proper escapes etc. So string can be base64 encoded with base64_encode_jinja_template()
:param system_prompt: Universal system prompt to use if model supports, like LLaMa2, regardless of prompt_type definition.
Useful for langchain case to control behavior, or OpenAI and Replicate.
If None, 'None', or 'auto', then for LLaMa or other models that internally have system_prompt, will use default for each model
If '', then no system prompt (no empty template given to model either, just no system part added at all)
If some string not in ['None', 'auto'], then use that as system prompt
Default in the signature above is 'auto'; use '' for no system prompt, since a system prompt often hurts performance/accuracy
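Illustrative example (prompt text is hypothetical):
e.g. python generate.py --base_model=HuggingFaceH4/zephyr-7b-beta --system_prompt="You are a concise, helpful assistant."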
:param allow_chat_system_prompt:
Whether to use conversation_history to pre-append system prompt
:param llamacpp_path: Location to store downloaded gguf or load list of models from
Note HF models go into hf cache folder, and gpt4all models go into their own cache folder
Can override with ENV LLAMACPP_PATH
:param llamacpp_dict:
n_gpu_layers: for llama.cpp based models, number of GPU layers to offload (default is all by using large value)
use_mlock: when using `llama.cpp` based CPU models, for computers with low system RAM or slow CPUs, recommended False
n_batch: Can make smaller to 128 for slower low-memory CPU systems
n_gqa: Required to be 8 for LLaMa 70B
... etc. anything that could be passed to llama.cpp or GPT4All models
e.g. python generate.py --base_model='llama' --prompt_type=llama2 --score_model=None --langchain_mode='UserData' --user_path=user_path --llamacpp_dict="{'n_gpu_layers':25,'n_batch':128}"
:param model_path_llama: model path or URL (for auto-download)
:param model_name_gptj: model path or URL (for auto-download)
:param model_name_gpt4all_llama: model path or URL (for auto-download)
:param model_name_exllama_if_no_config: exllama model's full path for model, tokenizer, generator for use when no HuggingFace config
:param exllama_dict: for setting various things for Exllama class
E.g. compress_pos_emb,
set_auto_map,
gpu_peer_fix,
alpha_value,
matmul_recons_thd,
fused_mlp_thd
sdp_thd
fused_attn
matmul_fused_remap
rmsnorm_no_half2
rope_no_half2
matmul_no_half2
silu_no_half2
concurrent_streams
E.g. to set memory to be split across 2 GPUs, use --exllama_dict="{'set_auto_map':20,20}"
:param gptq_dict: Choices for AutoGPTQ, e.g. one can change defaults to these non-defaults:
inject_fused_attention=False
disable_exllama=True
use_triton=True
:param attention_sinks: Whether to enable attention sinks.
:param sink_dict: dict of options for attention sinks
E.g. {'window_length': 1024, 'num_sink_tokens': 4}
Default is window length same size as max_input_tokens (max_seq_len if max_input_tokens not set)
:param hf_model_dict: dict of options for HF models using transformers
:param truncation_generation: Whether (for torch) to terminate generation once reach context length of model.
For some models, perplexity becomes critically large beyond context
For other models like Mistral, one can generate beyond max_seq_len set to 4096 or 8192 without issue, since based upon 32k embeddings
codellama can also generate beyond its 16k context length
So default is off, but for simpler/older models True may be wise to avoid bad generations
:param model_lock: Lock models to specific combinations, for ease of use and extending to many models
Only used if gradio = True
List of dicts, each dict has base_model, tokenizer_base_model, lora_weights, inference_server, prompt_type, and prompt_dict
If all models have same prompt_type, and prompt_dict, can still specify that once in CLI outside model_lock as default for dict
Can specify model_lock instead of those items on CLI
As with CLI itself, base_model can infer prompt_type and prompt_dict if in prompter.py.
Also, tokenizer_base_model and lora_weights are optional.
Also, inference_server is optional if loading model from local system.
All models provided will automatically appear in compare model mode
Model loading-unloading and related choices will be disabled. Model/lora/server adding will be disabled
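Illustrative example (models and address are hypothetical):
e.g. python generate.py --model_lock="[{'base_model': 'HuggingFaceH4/zephyr-7b-beta', 'inference_server': 'http://192.168.1.46:6112'}, {'base_model': 'gpt-3.5-turbo', 'inference_server': 'openai_chat'}]"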
:param model_lock_columns: How many columns to show if locking models (and so showing all at once)
If None, then defaults to up to 3
if -1, then all goes into 1 row
Maximum value is 4 due to non-dynamic gradio rendering elements
:param model_lock_layout_based_upon_initial_visible: Whether to base any layout upon visible models (True)
or upon all possible models. gradio does not allow dynamic objects, so all layouts are preset,
and these are two reasonable options.
False is best when there are many models and user excludes middle ones as being visible.
:param fail_if_cannot_connect: if doing model locking (e.g. with many models), fail if True. Otherwise ignore.
Useful when many endpoints and want to just see what works, but still have to wait for timeout.
:param temperature: generation temperature
:param top_p: generation top_p
:param top_k: generation top_k
:param penalty_alpha: penalty_alpha>0 and top_k>1 enables contrastive search (not all models support)
:param num_beams: generation number of beams
:param repetition_penalty: generation repetition penalty
:param num_return_sequences: generation number of sequences (1 forced for chat)
:param do_sample: generation sample. Enable for sampling for given temperature, top_p, top_k, else greedy decoding and then temperature, top_p, top_k not used.
https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.do_sample
https://txt.cohere.com/llm-parameters-best-outputs-language-ai/
https://medium.com/@daniel.puenteviejo/the-science-of-control-how-temperature-top-p-and-top-k-shape-large-language-models-853cb0480dae
:param seed: seed (0 means random seed, >0 uses that seed for sampling so reproducible even for sampling). None becomes 0.
:param max_new_tokens: generation max new tokens
:param min_new_tokens: generation min tokens
:param early_stopping: generation early stopping
:param max_time: maximum time to allow for generation
:param memory_restriction_level: 0 = no restriction on tokens or model, 1 = some restrictions on tokens, 2 = HF-like restriction, 3 = very low memory case
:param debug: enable debug mode
:param save_dir: directory chat data is saved to
:param local_files_only: whether to only use local files instead of going to HF for models
:param resume_download: whether to resume downloads from HF for models
:param use_auth_token: whether to use HF auth token (requires having run huggingface-cli login beforehand)
:param admin_pass: Administrator password
:param trust_remote_code: whether to trust any remote code needed for HF model
:param rope_scaling:
For HF transformers model: scaling for rope-based models.
For long context models that have been tuned for a specific size, you have to only use that specific size by setting the `--rope_scaling` exactly correctly
e.g. --rope_scaling="{'type':'dynamic', 'factor':4}"
e.g. --rope_scaling="{'type':'linear', 'factor':4}"
e.g. python generate.py --rope_scaling="{'type':'linear','factor':4}" --base_model=lmsys/vicuna-13b-v1.5-16k --hf_embedding_model=sentence-transformers/all-MiniLM-L6-v2 --load_8bit=True --langchain_mode=UserData --user_path=user_path --prompt_type=vicuna11 --h2ocolors=False
For exllama model: --rope_scaling="{'alpha_value':4}" . This automatically scales max_seq_len for exllama
:param max_seq_len: Manually set maximum sequence length for the LLM
:param max_output_seq_len: Manually set maximum output length for the LLM
:param offload_folder: path for spilling model onto disk
:param src_lang: source languages to include if doing translation (None = all)
:param tgt_lang: target languages to include if doing translation (None = all)
:param prepare_offline_level:
Whether to just prepare for offline use, do not go into cli, eval, or gradio run modes
0 : no prep
1: prepare just h2oGPT with exact same setup as passed to CLI and ensure all artifacts for h2oGPT alone added to ~/.cache/
2: prepare h2oGPT + all inference servers so h2oGPT+inference servers can use the ~/.cache/
:param cli: whether to use CLI (non-gradio) interface.
:param eval: whether to run evals
:param cli_loop: whether to loop for CLI (False usually only for testing)
:param gradio: whether to enable gradio, or to enable benchmark mode
:param function: whether to run function mode to just return locals for function server
:param force_streaming_on_to_handle_timeouts: whether to force streaming internally even if UI/API doesn't do it, so can handle timeouts and avoid blocking calls.
:param openai_server: whether to launch OpenAI proxy server for local gradio server
Disabled if API is disabled
:param openai_port: port for OpenAI proxy server
:param openai_workers: number of workers for OpenAI (1 means 1 worker, 0 means all physical cores, else choose)
:param function_server: whether to launch Function server to handle document loading, offloading it to a separate thread or forks
:param function_server_port: port for Function server
:param function_server_workers: number of workers for Function Server (1 means 1 worker, 0 means all physical cores, else choose)
:param function_api_key: API key for function server, auto-set if not provided, uses first key like OpenAI proxy server does as well
:param agent_server: whether to launch Agent proxy server
Disabled if API is disabled
:param agent_port: port for Agent proxy server
:param agent_workers: number of workers for Agent Server (1 means 1 worker, 0 means all physical cores, else choose)
:param multiple_workers_gunicorn: whether to use gunicorn (True) or uvicorn (False) for multiple workers
:param gradio_offline_level: if > 0, then change fonts so fully offline
== 1 means backend won't need internet for fonts, but front-end UI might if font not cached
== 2 means backend and frontend don't need internet to download any fonts.
Note: Some things always disabled include HF telemetry, gradio telemetry, chromadb posthog that involve uploading.
This option further disables google fonts for downloading, which is less intrusive than uploading,
but still required in air-gapped case. The fonts don't look as nice as google fonts, but ensure full offline behavior.
Also set --share=False to avoid sharing a gradio live link.
:param server_name: IP to use. In linux 0.0.0.0 is good choice so exposed to outside host, else for only local use 127.0.0.1.
For windows/MAC 0.0.0.0 or 127.0.0.1 will work, but may need to specify actual LAN IP address for other LAN clients to see.
:param share: whether to share the gradio app with sharable URL
:param open_browser: whether to automatically open browser tab with gradio UI
:param close_button: Whether to show close button in system tab (if not public)
:param shutdown_via_api: Whether to allow shutdown via API
:param root_path: The root path (or "mount point") of the application,
if it's not served from the root ("/") of the domain. Often used when the application is behind a reverse proxy
that forwards requests to the application. For example, if the application is served at "https://example.com/myapp",
the `root_path` should be set to "/myapp".
:param ssl_verify: passed to gradio launch
:param ssl_keyfile: passed to gradio launch
:param ssl_certfile: passed to gradio launch
:param ssl_keyfile_password: passed to gradio launch
:param chat: whether to enable chat mode with chat history
:param chat_conversation: list of tuples of (human, bot) conversation pre-appended to existing chat when using instruct/chat models
Requires also add_chat_history_to_context = True
It does *not* require chat=True, so works with nochat_api etc.
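Illustrative example (conversation contents are hypothetical):
e.g. --chat_conversation="[('Hello', 'Hi, how can I help you?')]"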
:param text_context_list: List of strings to add to context for non-database version of document Q/A for faster handling via API etc.
Forces LangChain code path and uses as many entries in list as possible given max_seq_len, with first assumed to be most relevant and to go near prompt.
:param stream_output: whether to stream output
:param async_output: Whether to do asyncio handling
For summarization
Applicable to HF TGI server
Only if stream_output=False in CLI, UI, or API
:param num_async: Number of simultaneously allowed asyncio calls to make for async_output
Too many will overload inference server, too few will be too slow
:param stream_map: Whether to stream map_reduce fully even while doing async (if async, then only first map in any group map will be streamed)
Experimental, not working fully.
:param show_examples: whether to show clickable examples in gradio
:param verbose: whether to show verbose prints
:param h2ocolors: whether to use H2O.ai theme
:param dark: whether to use dark mode for UI by default (still controlled in UI)
:param height: height of chat window
:param render_markdown: Whether to render markdown in chatbot UI. In some cases this distorts the rendering.
https://github.com/gradio-app/gradio/issues/4344#issuecomment-1771963021
:param show_lora: whether to show LORA options in UI (expert so can be hard to understand)
:param show_llama: whether to show LLaMa.cpp/GPT4All options in UI (only likely useful if have weak GPUs)
:param show_gpt4all: whether to show GPT4All models in UI (not often useful, llama.cpp models best)
:param login_mode_if_model0: set to True to load --base_model after client logs in, to be able to free GPU memory when model is swapped
:param block_gradio_exit: whether to block gradio exit (used for testing)
:param concurrency_count: gradio concurrency count (1 is optimal for local LLMs to avoid sharing cache that messes up models, else 64 is used if hosting remote inference servers only)
:param api_open: If False, don't let API calls skip gradio queue
:param allow_api: whether to allow API calls at all to gradio server
:param input_lines: how many input lines to show for chat box (>1 forces shift-enter for submit, else enter is submit)
:param gradio_size: Overall size of text and spaces: "xsmall", "small", "medium", "large".
Small useful for many chatbots in model_lock mode
:param show_copy_button: Whether to show copy button for chatbots
:param large_file_count_mode: Whether to force manual update to UI of drop-downs, good idea if millions of chunks or documents
:param gradio_ui_stream_chunk_size: Number of characters to wait before pushing text to ui.
None is default, which is 0 when not doing model lock. Else 20 by default.
20 is reasonable value for fast models and fast systems when handling several models at once
Choose 0 to disable (this disables use of gradio_ui_stream_chunk_min_seconds and gradio_ui_stream_chunk_seconds too)
Work around for these bugs that lead to UI being overwhelmed under various cases
https://github.com/gradio-app/gradio/issues/5914
https://github.com/gradio-app/gradio/issues/6609
:param gradio_ui_stream_chunk_min_seconds: Number of seconds before allow yield to avoid spamming yields at rate user would not care about, regardless of chunk_size
:param gradio_ui_stream_chunk_seconds: Number of seconds to yield regardless of reaching gradio_ui_stream_chunk_size as long as something to yield
Helps case when streaming is slow and want to see progress at least every couple seconds
:param gradio_api_use_same_stream_limits: Whether to use same streaming limits as UI for API
:param gradio_upload_to_chatbot: Whether to show upload in chatbots
:param gradio_upload_to_chatbot_num_max: Max number of things to add to chatbot
:param gradio_errors_to_chatbot: Whether to show errors in Accordion in chatbot or just in exceptions in each tab
:param pre_load_embedding_model: Whether to preload embedding model for shared use across DBs and users (multi-thread safe only)
:param embedding_gpu_id: which GPU to place embedding model on.
Only used if preloading embedding model.
If 'auto', then use first device as is default
If 'cpu' or some other string like 'mps', then use that as device name.
:param auth: gradio auth for launcher in form [(user1, pass1), (user2, pass2), ...]
e.g. --auth=[('jon','password')] with no spaces
e.g. --auth="[('jon', 'password)())(')]" so any special characters can be used
e.g. --auth=auth.db to specify persisted state file with name auth.db (auth_filename then not required)
e.g. --auth='' will use default auth.db as file name for persisted state file (auth_filename good idea to control location)
e.g. --auth=None will use no auth, but still keep track of auth state, just not from logins
:param auth_filename:
Set auth filename, used only if --auth= was passed list of user/passwords
If use auth.db will use sqlite3 database for auth for faster access for large number of users
If you had .json and want to use faster .db, just pass filename with .db instead of .json and at startup it will be migrated automatically to .db and used.
:param auth_access:
'open': Allow new users to be added
'closed': Stick to existing users
:param auth_freeze: whether freeze authentication based upon current file, no longer update file
:param auth_message: Message to show if having users login, fixed if passed, else dynamic internally
:param google_auth: Whether to use google auth
:param guest_name: guest name if using auth and have open access.
If '', then no guest allowed even if open access, then all databases for each user always persisted
If None, then set to 'guest' for open access, or '' for closed access
For open or closed access, if guest_name is set, that forms prefix of actual internal userID apart from authentication and can serve as way to access UI or API freshly via auth with fixed password with no document persistence beyond that single session.
:param enforce_h2ogpt_api_key: Whether to enforce h2oGPT token usage for API
:param enforce_h2ogpt_ui_key: Whether to enforce h2oGPT token usage for UI (same keys as API assumed)
:param h2ogpt_api_keys: list of tokens allowed for API access or file accessed on demand for json of list of keys
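Illustrative example (keys and filename are hypothetical):
e.g. --h2ogpt_api_keys="['EXAMPLE_KEY_1', 'EXAMPLE_KEY_2']" or --h2ogpt_api_keys=api_keys.json with a JSON list of keys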
:param h2ogpt_key: E.g. can be set when accessing gradio h2oGPT server from local gradio h2oGPT server that acts as client to that inference server
Only applied for API at runtime when API accesses using gradio inference_server are made
:param extra_allowed_paths: List of strings for extra allowed paths users could access for file viewing/downloading. '.' can be used but be careful what that exposes.
Note by default all paths in langchain_mode_paths given at startup are allowed
:param blocked_paths: Any blocked paths to add for gradio access for file viewing/downloading.
:param max_max_time: Maximum max_time for gradio slider
:param max_max_new_tokens: Maximum max_new_tokens for gradio slider
:param min_max_new_tokens: Minimum of max_new_tokens, when auto-scaling down to handle more docs/prompt, but still let generation have some tokens
:param max_input_tokens: Max input tokens to place into model context for each LLM call
-1 means auto, fully fill context for query, and fill by original document chunk for summarization
>=0 means use that to limit context filling to that many tokens
:param max_total_input_tokens: like max_input_tokens but instead of per LLM call, applies across all LLM calls for single summarization/extraction action
:param docs_token_handling: 'chunk' means fill context with top_k_docs (limited by max_input_tokens or model_max_len) chunks for query
or with top_k_docs original document chunks for summarization
None or 'split_or_merge' means same as 'chunk' for query, while for summarization merges documents to fill up to max_input_tokens or model_max_len tokens
:param docs_joiner: string to join lists of text when doing split_or_merge. None means '\n\n'
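Illustrative example (token limits are hypothetical):
e.g. python generate.py --docs_token_handling=split_or_merge --max_input_tokens=4096 --max_total_input_tokens=16384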
:param hyde_level: HYDE level for HYDE approach (https://arxiv.org/abs/2212.10496)
0: No HYDE
1: Use non-document-based LLM response and original query for embedding query
2: Use document-based LLM response and original query for embedding query
3+: Continue iterations of embedding prior answer and getting new response
:param hyde_template:
None, 'None', or 'auto' uses the internal template and enables it
'{query}' is minimal template one can pass
:param hyde_show_only_final: Whether to show only last result of HYDE, not intermediate steps
:param hyde_show_intermediate_in_accordion: Whether to show intermediate HYDE, but inside HTML accordion
:param map_reduce_show_intermediate_in_accordion: Whether to show intermediate map_reduce, but inside HTML accordion
:param visible_models: Which models in model_lock list to show by default
Takes integers of position in model_lock (model_states) list or strings of base_model names
Ignored if model_lock not used
For nochat API, this is single item within a list for model by name or by index in model_lock
If None, then just use first model in model_lock list
If model_lock not set, use model selected by CLI --base_model etc.
Note that unlike h2ogpt_key, this visible_models only applies to this running h2oGPT server,
and the value is not used to access the inference server.
If need a visible_models for an inference server, then use --model_lock and group together.
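Illustrative example (model names are hypothetical):
e.g. --visible_models="['HuggingFaceH4/zephyr-7b-beta', 'gpt-3.5-turbo']" or by index --visible_models="[0, 2]"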
:param max_visible_models: maximum visible models to allow to select in UI
:param visible_ask_anything_high: Whether ask anything block goes near top or near bottom of UI Chat
:param visible_visible_models: Whether visible models drop-down is visible in UI
:param visible_submit_buttons: whether submit buttons are visible when UI first comes up
:param visible_side_bar: whether left side bar is visible when UI first comes up
:param visible_document_subset: whether document subset is visible when UI first comes up
:param visible_max_quality: whether max quality is visible when UI first comes up
:param visible_add_doc_to_chat: whether add document to chat is visible when UI first comes up
:param visible_chat_history: whether chat history being choosable is visible when UI first comes up
:param visible_doc_track: whether left side bar's document tracking is visible when UI first comes up
:param visible_chat_tab: whether chat tab is visible when UI first comes up
:param visible_doc_selection_tab: whether doc selection tab is visible when UI first comes up
:param visible_doc_view_tab: whether doc view tab is visible when UI first comes up
:param visible_chat_history_tab: whether chat history tab is visible when UI first comes up
:param visible_expert_tab: whether expert tab is visible when UI first comes up
:param visible_models_tab: whether models tab is visible when UI first comes up
:param visible_system_tab: whether system tab is visible when UI first comes up
:param visible_tos_tab: whether ToS tab is visible when UI first comes up
:param visible_login_tab: whether Login tab is visible when UI first comes up (needed for persistence or to enter key for UI access to models and ingestion)
:param visible_hosts_tab: whether hosts tab is visible when UI first comes up
:param chat_tabless: Just show Chat as block without tab (useful if want only chat view)
:param visible_h2ogpt_links: Whether github stars, URL are visible
:param visible_h2ogpt_qrcode: Whether QR code is visible
:param visible_h2ogpt_logo: Whether central logo is visible
:param visible_chatbot_label: Whether to show label in chatbot (e.g. if only one model for own purpose, then can set to False)
:param visible_all_prompter_models: Whether to show all prompt_type_to_model_name items or just curated ones
:param visible_curated_models: Whether to show curated models (useful to see few good options)
:param actions_in_sidebar: Whether to show sidebar with actions in old style
:param document_choice_in_sidebar: Whether to show document choices in sidebar
Useful if often changing picking specific document(s)
:param enable_add_models_to_list_ui: Whether to show add model, lora, server to dropdown list
Disabled by default since clutters Models tab in UI, and can just add custom item directly in dropdown
:param max_raw_chunks: Maximum number of chunks to show in UI when asking for raw DB text from documents/collection
:param pdf_height: Height of PDF viewer in UI
:param avatars: Whether to show avatars in chatbot
:param add_disk_models_to_ui: Whether to add HF cache models and llama.cpp models to UI
:param page_title: Title of the web page, default is h2oGPT
:param favicon_path: Path to favicon, default is h2oGPT favicon
:param visible_ratings: Whether full review is visible, else just likable chatbots
:param reviews_file: File to store reviews, set to `reviews.csv` if visible_ratings=True and this isn't set
:param sanitize_user_prompt: whether to remove profanity from user input (slows down input processing)
Requires optional packages:
pip install alt-profanity-check==1.2.2 better-profanity==0.7.0
:param sanitize_bot_response: whether to remove profanity and repeat lines from bot output (about 2x slower generation for long streaming cases due to better_profanity being slow)
:param extra_model_options: extra models to show in list in gradio