Commit bacc202
Merge remote-tracking branch 'upstream/concedo'
YellowRoseCx committed Aug 10, 2023
1 parent b7cb4cf commit bacc202
Showing 6 changed files with 79 additions and 52 deletions.
13 changes: 9 additions & 4 deletions CMakeLists.txt
@@ -3,9 +3,9 @@
# IT WILL NOT BE UPDATED OR MAINTAINED !!!

message(STATUS "============== ============== ==============")
message(STATUS "WARNING! Do NOT use this file. It is UNSUPPORTED for normal users. Use MAKE instead.")
message(STATUS "It is ONLY for CUBLAS build testing on windows visual studio. IT WILL NOT BE UPDATED OR MAINTAINED !!!")
message(STATUS "IF YOU ARE SEEING THIS, you MUST ONLY be building AN EXPERIMENAL WINDOWS CUBLAS BUILD! NOTHING ELSE WILL BE SUPPORTED !!!")
message(STATUS "WARNING! Recommend NOT to use this file. It is UNSUPPORTED for normal users. Use MAKE instead.")
message(STATUS "It is ONLY for CUBLAS builds on windows visual studio. IT WILL OVERWRITE YOUR EXISTING MAKEFILE !!!")
message(STATUS "IF YOU ARE SEEING THIS, you MUST ONLY be building CUBLAS BUILDS! NOTHING ELSE WILL BE SUPPORTED !!!")
message(STATUS "============== ============== ==============")

cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
@@ -110,7 +110,12 @@ if (LLAMA_CUBLAS)
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
message("CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
if(CUDAToolkit_VERSION VERSION_GREATER 12)
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
endif()
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
5 changes: 5 additions & 0 deletions Makefile
@@ -174,6 +174,11 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
+ifdef LLAMA_CUDA_MMQ_Y
+NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
+else
+NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
+endif # LLAMA_CUDA_MMQ_Y
#ifdef LLAMA_CUDA_CUBLAS
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS
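Note on the Makefile change above: the new LLAMA_CUDA_MMQ_Y option only forwards a -D define to nvcc. As a hedged sketch (not taken from this commit), a build-time constant like this is typically consumed on the C++/CUDA side with a guarded default; the constant name below is real, the rest is illustrative:

    // Hedged sketch: consuming a -DGGML_CUDA_MMQ_Y=<N> build define with a fallback.
    #ifndef GGML_CUDA_MMQ_Y
    #define GGML_CUDA_MMQ_Y 64   // same default the Makefile passes when the option is unset
    #endif

    // Illustrative use only: a kernel tile height fixed at compile time from the define.
    constexpr int kMmqTileY = GGML_CUDA_MMQ_Y;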
15 changes: 9 additions & 6 deletions klite.embd

Large diffs are not rendered by default.

35 changes: 18 additions & 17 deletions koboldcpp.py
@@ -304,7 +304,7 @@ def utfprint(str):
maxhordelen = 256
modelbusy = threading.Lock()
defaultport = 5001
KcppVersion = "1.39.1"
KcppVersion = "1.40.1"
showdebug = True
showsamplerwarning = True
showmaxctxwarning = True
@@ -496,7 +496,7 @@ def do_GET(self):
laste = handle.get_last_eval_time()
lastc = handle.get_last_token_count()
stopreason = handle.get_last_stop_reason()
-response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "stop_reason":stopreason}).encode())
+response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "stop_reason":stopreason, "idle":(0 if modelbusy.locked() else 1)}).encode())

if response_body is None:
self.send_response(404)
@@ -674,7 +674,7 @@ def show_new_gui():
root.destroy()
if not args.model_param:
print("\nNo ggml model file was selected. Exiting.")
-time.sleep(2)
+time.sleep(3)
sys.exit(2)
return

@@ -1306,7 +1306,7 @@ def display_help():

if nextstate==0:
print("Exiting by user request.")
-time.sleep(2)
+time.sleep(3)
sys.exit()
elif nextstate==2:
time.sleep(0.1)
@@ -1317,7 +1317,7 @@ def display_help():

if not args.model_param:
print("\nNo ggml model file was selected. Exiting.")
-time.sleep(2)
+time.sleep(3)
sys.exit(2)

def show_gui_warning(issue=None):
@@ -1329,7 +1329,7 @@ def show_gui_warning(issue=None):
messagebox.showerror(title="No Backends Available!", message="KoboldCPP couldn't locate any backends to use.\n\nTo use the program, please run the 'make' command from the directory.")
root.destroy()
print("No Backend Available (i.e Default, OpenBLAS, CLBlast, CuBLAS). To use the program, please run the 'make' command from the directory.")
-time.sleep(2)
+time.sleep(3)
sys.exit(2)
else:
messagebox.showerror(title="New GUI failed, using Old GUI", message="The new GUI failed to load.\n\nTo use new GUI, please install the customtkinter python module.")
@@ -1423,7 +1423,7 @@ def onDropdownChange(event):

if launchclicked==False:
print("Exiting by user request.")
-time.sleep(2)
+time.sleep(3)
sys.exit()

#load all the vars
@@ -1479,7 +1479,7 @@ def onDropdownChange(event):
root.destroy()
if not args.model_param:
print("\nNo ggml model file was selected. Exiting.")
-time.sleep(2)
+time.sleep(3)
sys.exit(2)

else:
@@ -1489,7 +1489,7 @@ def onDropdownChange(event):
root.destroy()
if not args.model_param:
print("\nNo ggml model file was selected. Exiting.")
-time.sleep(2)
+time.sleep(3)
sys.exit(2)

#A very simple and stripped down embedded horde worker with no dependencies
@@ -1534,7 +1534,7 @@ def make_url_request(url, data, method='POST'):
BRIDGE_AGENT = f"KoboldCppEmbedWorker:1:https://github.com/LostRuins/koboldcpp"
cluster = "https://horde.koboldai.net"
while exitcounter < 10:
-time.sleep(2)
+time.sleep(3)
readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
if readygo:
print("Embedded Horde Worker is started.")
@@ -1610,10 +1610,10 @@ def make_url_request(url, data, method='POST'):
time.sleep(1)
if exitcounter<100:
print("Horde Worker Shutdown - Too many errors.")
-time.sleep(2)
+time.sleep(3)
else:
print("Horde Worker Shutdown - Server Closing.")
-time.sleep(1)
+time.sleep(2)
sys.exit(2)

def main(args):
@@ -1637,7 +1637,7 @@ def main(args):
except Exception as ex2:
print("File selection GUI unsupported. Please check command line: script.py --help")
print("Reason for no GUI: " + str(ex2))
-time.sleep(2)
+time.sleep(3)
sys.exit(2)

if args.hordeconfig and args.hordeconfig[0]!="":
@@ -1681,20 +1681,20 @@ def main(args):
time.sleep(1)
if not os.path.exists(args.model_param):
print(f"Cannot find model file: {args.model_param}")
-time.sleep(2)
+time.sleep(3)
sys.exit(2)

if args.lora and args.lora[0]!="":
if not os.path.exists(args.lora[0]):
print(f"Cannot find lora file: {args.lora[0]}")
-time.sleep(2)
+time.sleep(3)
sys.exit(2)
else:
args.lora[0] = os.path.abspath(args.lora[0])
if len(args.lora) > 1:
if not os.path.exists(args.lora[1]):
print(f"Cannot find lora base: {args.lora[1]}")
-time.sleep(2)
+time.sleep(3)
sys.exit(2)
else:
args.lora[1] = os.path.abspath(args.lora[1])
@@ -1715,7 +1715,7 @@ def main(args):

if not loadok:
print("Could not load model: " + modelname)
-time.sleep(2)
+time.sleep(3)
sys.exit(3)
try:
basepath = os.path.abspath(os.path.dirname(__file__))
@@ -1743,6 +1743,7 @@ def main(args):

if args.hordeconfig and len(args.hordeconfig)>4:
horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordeconfig[3],args.hordeconfig[4]))
+horde_thread.daemon = True
horde_thread.start()

print(f"Please connect to custom endpoint at {epurl}")
19 changes: 12 additions & 7 deletions llama.cpp
@@ -158,8 +158,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{ MODEL_7B, 512ull * kB },
{ MODEL_13B, 640ull * kB },
{ MODEL_30B, 768ull * kB },
-{ MODEL_65B, 1280ull * kB },
-{ MODEL_70B, 1280ull * kB },
+{ MODEL_65B, 1360ull * kB },
+{ MODEL_70B, 1360ull * kB },
};
return k_sizes;
}
@@ -173,8 +173,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{ MODEL_7B, 128ull },
{ MODEL_13B, 160ull },
{ MODEL_30B, 208ull },
-{ MODEL_65B, 256ull },
-{ MODEL_70B, 256ull },
+{ MODEL_65B, 320ull },
+{ MODEL_70B, 320ull },
};
return k_sizes;
}
@@ -937,6 +937,11 @@ bool llama_mlock_supported() {
return llama_mlock::SUPPORTED;
}

+int get_blas_batch_mul(int batch)
+{
+return (batch>512?(batch>1024?4:2):1);
+}
+
void llama_backend_init(bool numa) {
ggml_time_init();

@@ -1042,7 +1047,7 @@ static void llama_model_load_internal(
void * progress_callback_user_data) {

model.t_start_us = ggml_time_us();
-size_t blasbatchmul = (n_batch>512?(n_batch>1024?4:2):1);
+size_t blasbatchmul = get_blas_batch_mul(n_batch);

std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));

@@ -1076,7 +1081,7 @@ static void llama_model_load_internal(
// LLaMAv2
// TODO: temporary until GGUF
//patch for llama2 gqa
-if (model.type == e_model::MODEL_65B && hparams.n_mult == 4096) {
+if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
n_gqa = 8;
}
@@ -3248,7 +3253,7 @@ struct llama_context * llama_new_context_with_model(
params.seed = time(NULL);
}

-size_t blasbatchmul = (params.n_batch>512?2:1);
+size_t blasbatchmul = get_blas_batch_mul(params.n_batch);

unsigned cur_percentage = 0;
if (params.progress_callback == NULL) {
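Note on the llama.cpp changes above: the batch-size scaling for scratch buffers, previously written inline (and capped at 2x in llama_new_context_with_model), is now centralized in get_blas_batch_mul(). A minimal standalone check of the thresholds that expression encodes (1x up to a batch of 512, 2x up to 1024, 4x beyond); only the function body is taken from the diff, the test harness is illustrative:

    #include <cassert>

    // Same expression as the new get_blas_batch_mul() in llama.cpp.
    static int get_blas_batch_mul(int batch) {
        return (batch > 512 ? (batch > 1024 ? 4 : 2) : 1);
    }

    int main() {
        assert(get_blas_batch_mul(256)  == 1);  // typical default batch size
        assert(get_blas_batch_mul(512)  == 1);  // boundary value is still 1x
        assert(get_blas_batch_mul(1024) == 2);  // 513..1024 -> 2x scratch
        assert(get_blas_batch_mul(2048) == 4);  // >1024 -> 4x scratch
        return 0;
    }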
44 changes: 26 additions & 18 deletions model_adapter.cpp
@@ -133,28 +133,36 @@ void print_tok_vec(std::vector<float> &embd)
else if(vocabsiz==50257 || (vocabsiz>=49152&&vocabsiz<=49157)) //49152-6 is starcoder
{
fileformat = FileFormat::GPT2_1;
-uint32_t temp;
-fin.read((char *)&temp, sizeof(temp)); //ctx
-fin.read((char *)&temp, sizeof(temp)); //n_embd
-fin.read((char *)&temp, sizeof(temp)); //n_head
+uint32_t temp, v1,v2,v3;
+fin.read((char *)&v1, sizeof(temp)); //ctx
+fin.read((char *)&v2, sizeof(temp)); //n_embd
+fin.read((char *)&v3, sizeof(temp)); //n_head
fin.read((char *)&temp, sizeof(temp)); //n_layer
-fin.read((char *)&temp, sizeof(temp)); //f16
-const int32_t qntvr = temp / 1000;
-temp %= 1000;
-if (qntvr != 0)
+if(vocabsiz==49152 && v1==4096 && v2==2560 && v3==32 && temp==32)
{
-if (qntvr == 1)
-{
-fileformat = FileFormat::GPT2_3;
-}
-else
-{
-fileformat = FileFormat::GPT2_4;
-}
+//special case, Stablecode Completion Alpha 3B
+fileformat = FileFormat::NEOX_6;
}
-else if (temp != 0 && temp != 1)
+else
{
-fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
+fin.read((char *)&temp, sizeof(temp)); //f16
+const int32_t qntvr = temp / 1000;
+temp %= 1000;
+if (qntvr != 0)
+{
+if (qntvr == 1)
+{
+fileformat = FileFormat::GPT2_3;
+}
+else
+{
+fileformat = FileFormat::GPT2_4;
+}
+}
+else if (temp != 0 && temp != 1)
+{
+fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
+}
}
}
else if(vocabsiz < 31998 || vocabsiz > 33000)
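Note on the model_adapter.cpp change above: the loader now reads the first header fields into named variables (v1, v2, v3) so one specific shape can be recognized as NEOX_6 before the usual GPT-2 quantization-version logic runs. A hedged sketch of that check (the struct and function names below are illustrative, not from the repo; the constants match the diff):

    #include <cstdint>

    // Field order mirrors the reads in model_adapter.cpp: v1/v2/v3/temp
    // correspond to n_ctx / n_embd / n_head / n_layer.
    struct Gpt2StyleHeader {
        uint32_t n_ctx;
        uint32_t n_embd;
        uint32_t n_head;
        uint32_t n_layer;
    };

    // True only for the StableCode Completion Alpha 3B shape that is now
    // detected as NEOX_6 (vocab 49152, ctx 4096, embd 2560, 32 heads, 32 layers).
    static bool is_stablecode_3b(uint32_t vocabsiz, const Gpt2StyleHeader & h) {
        return vocabsiz == 49152 && h.n_ctx == 4096 && h.n_embd == 2560 &&
               h.n_head == 32 && h.n_layer == 32;
    }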
