Skip to content

Commit

Permalink
integrated libopenblas for greatly accelerated prompt processing. Win…
Browse files Browse the repository at this point in the history
…dows binaries are included - feel free to build your own or to build for other platforms, but that is beyond the scope of this repo. Will fall back to non-blas if libopenblas is removed.
  • Loading branch information
LostRuins committed Mar 29, 2023
1 parent 49c4c22 commit 664b277
Show file tree
Hide file tree
Showing 12 changed files with 625 additions and 16 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ __pycache__
.swiftpm

dist/
llama_for_kobold.spec
*.spec
12 changes: 9 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif
ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -l:libopenblas.lib -L.
endif
ifdef LLAMA_GPROF
CFLAGS += -pg
Expand Down Expand Up @@ -221,7 +221,7 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )

default: main llamalib quantize
default: main llamalib quantize llamalib_blas

#
# Build library
Expand All @@ -230,6 +230,9 @@ default: main llamalib quantize
ggml.o: ggml.c ggml.h
$(CC) $(CFLAGS) -c ggml.c -o ggml.o

# BLAS-enabled variant of ggml.o: same sources, compiled with -DGGML_USE_OPENBLAS
# so the accelerated prompt-processing path is available. Built alongside the
# plain ggml.o so both the BLAS and non-BLAS DLLs can ship from one "make".
ggml_blas.o: ggml.c ggml.h
	$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_blas.o

llama.o: llama.cpp llama.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o

Expand All @@ -251,6 +254,9 @@ main: examples/main/main.cpp ggml.o llama.o common.o
llamalib: expose.cpp ggml.o common.o extra.o
$(CXX) $(CXXFLAGS) expose.cpp ggml.o common.o extra.o -shared -o llamacpp.dll $(LDFLAGS)

# OpenBLAS-accelerated shared library (llamacpp_blas.dll). Links the bundled
# libopenblas.lib import library directly; the loader falls back to the plain
# llamacpp.dll at runtime if this DLL (or libopenblas.dll) is unavailable.
llamalib_blas: expose.cpp ggml_blas.o common.o extra.o
	$(CXX) $(CXXFLAGS) expose.cpp ggml_blas.o common.o extra.o libopenblas.lib -shared -o llamacpp_blas.dll $(LDFLAGS)

quantize: examples/quantize/quantize.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)

Expand Down
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,22 @@ What does it mean? You get llama.cpp with a fancy UI, persistent stories, editin
- [Download the latest release here](https://github.com/LostRuins/llamacpp-for-kobold/releases/latest) or clone the repo.
- Windows binaries are provided in the form of **llamacpp-for-kobold.exe**, which is a pyinstaller wrapper for **llamacpp.dll** and **llama_for_kobold.py**. If you feel concerned, you may prefer to rebuild it yourself with the provided makefiles and scripts.
- Weights are not included, you can use the `quantize.exe` to generate them from your official weight files (or download them from other places).
- To run, execute **llamacpp-for-kobold.exe** or drag and drop your quantized ggml model.bin file onto the .exe, and then connect with Kobold or Kobold Lite.
- To run, execute **llamacpp-for-kobold.exe** or drag and drop your quantized `ggml_model.bin` file onto the .exe, and then connect with Kobold or Kobold Lite.
- By default, you can connect to http://localhost:5001
- You can also run it using the command line `llamacpp-for-kobold.exe [ggml_model.bin] [port]`. For info, please check `llamacpp-for-kobold.exe --help`
- If you are having crashes or issues with OpenBLAS, please try the `--noblas` flag.

## OSX and Linux
- You will have to compile your binaries from source. A makefile is provided, simply run `make`
- After all binaries are built, you can run the python script with the command `llama_for_kobold.py [ggml_model.bin] [port]`

## Considerations
- Don't want to use pybind11 due to dependencies on MSVC
- Make as few changes as possible (ideally ZERO) to main.cpp - do not move its function declarations elsewhere!
- Leave main.cpp UNTOUCHED, We want to be able to update the repo and pull any changes automatically.
- No dynamic memory allocation! Setup structs with FIXED (known) shapes and sizes for ALL output fields. Python will ALWAYS provide the memory, we just write to it.
- No external libraries or dependencies. That means no Flask, Pybind and whatever. All You Need Is Python.
- Since v1.0.6, requires libopenblas, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without BLAS.

## License
- The original GGML library and llama.cpp by ggerganov are licensed under the MIT License
Expand Down
411 changes: 411 additions & 0 deletions cblas.h

Large diffs are not rendered by default.

38 changes: 36 additions & 2 deletions expose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,22 @@
//No dynamic memory allocation! Setup structs with FIXED (known) shapes and sizes for ALL output fields
//Python will ALWAYS provide the memory, we just write to it.

#include <time.h>
#include "./examples/main/main.cpp"
#include "extra.h"
#include "ggml.h"

// Single-shot benchmark timer used to report prompt-processing and
// generation durations. Not reentrant: one global start point.
clock_t bench_timer = 0;

// Record the current clock() reading as the benchmark start point.
void timer_start()
{
    bench_timer = clock();
}

// Return the seconds elapsed since the last timer_start() call.
// NOTE(review): clock() is wall time on Windows but CPU time on POSIX, so
// multi-threaded runs may report inflated values on Linux/OSX — confirm intent.
double timer_check()
{
    const double elapsed_ticks = (double)(clock() - bench_timer);
    return elapsed_ticks / CLOCKS_PER_SEC;
}

void print_tok_vec(std::vector<llama_token> & embd)
{
Expand Down Expand Up @@ -67,6 +81,8 @@ extern "C" {

bool load_model(const load_model_inputs inputs)
{
printf("System Info: %s\n", llama_print_system_info());

ctx_params = llama_context_default_params();

n_threads = inputs.threads;
Expand Down Expand Up @@ -115,7 +131,7 @@ extern "C" {
params.n_ctx = inputs.max_context_length;
params.n_batch = n_batch;
params.n_threads = n_threads;

if(params.repeat_last_n<1)
{
params.repeat_last_n = 1;
Expand Down Expand Up @@ -183,6 +199,16 @@ extern "C" {

last_n_tokens.erase(last_n_tokens.begin(),last_n_tokens.begin()+n_past);
embd_inp.erase(embd_inp.begin(),embd_inp.begin()+n_past);

//if using BLAS and prompt is big enough, switch to single thread and use a huge batch
bool blasmode = (embd_inp.size() >= 32 && ggml_cpu_has_blas());
int original_batch = params.n_batch;
int original_threads = params.n_threads;
if(blasmode)
{
params.n_batch = 512;
params.n_threads = 1;
}

current_context_tokens.resize(n_past);

Expand All @@ -192,7 +218,9 @@ extern "C" {
std::string concat_output = "";

bool startedsampling = false;
printf("\nProcessing Prompt (%d tokens): ",embd_inp.size());
printf("\nProcessing Prompt (%d tokens%s): ",embd_inp.size(),(blasmode?", BLAS":""));
timer_start();
double time1=0,time2=0;

while (remaining_tokens > 0)
{
Expand Down Expand Up @@ -224,6 +252,10 @@ extern "C" {
if(!startedsampling)
{
startedsampling = true;
params.n_batch = original_batch;
params.n_threads = original_threads;
time1 = timer_check();
timer_start();
printf("\nGenerating (%d tokens): ",params.n_predict);
}

Expand Down Expand Up @@ -268,6 +300,8 @@ extern "C" {
}

}
time2 = timer_check();
printf("\nTime Taken - Processing:%.1fs, Generation:%.1fs, Total:%.1fs",time1,time2,(time1+time2));

output.status = 1;
snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
Expand Down
Binary file added libopenblas.dll
Binary file not shown.
Binary file added libopenblas.lib
Binary file not shown.
34 changes: 26 additions & 8 deletions llama_for_kobold.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,23 @@ class generation_outputs(ctypes.Structure):
_fields_ = [("status", ctypes.c_int),
("text", ctypes.c_char * 16384)]

dir_path = os.path.dirname(os.path.realpath(__file__))
handle = ctypes.CDLL(os.path.join(dir_path, "llamacpp.dll"))

handle.load_model.argtypes = [load_model_inputs]
handle.load_model.restype = ctypes.c_bool
handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p] #apparently needed for osx to work. i duno why they need to interpret it that way but whatever
handle.generate.restype = generation_outputs

handle = None
use_blas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.

def init_library():
    """Load the llamacpp shared library and declare its ctypes signatures.

    Prefers the OpenBLAS-accelerated build (llamacpp_blas.dll) when use_blas
    is set; OpenBLAS should provide about a 2x speedup on prompt ingestion if
    compatible. If the BLAS DLL cannot be loaded (missing or incompatible
    libopenblas.dll), falls back to the plain llamacpp.dll instead of
    crashing, and clears use_blas to reflect the actual mode.
    """
    global handle, use_blas
    dir_path = os.path.dirname(os.path.realpath(__file__))
    if use_blas:
        try:
            handle = ctypes.CDLL(os.path.join(dir_path, "llamacpp_blas.dll"))
        except OSError:
            print("Warning: failed to load llamacpp_blas.dll, falling back to the non-BLAS library.")
            use_blas = False
            handle = ctypes.CDLL(os.path.join(dir_path, "llamacpp.dll"))
    else:
        handle = ctypes.CDLL(os.path.join(dir_path, "llamacpp.dll"))

    handle.load_model.argtypes = [load_model_inputs]
    handle.load_model.restype = ctypes.c_bool
    # c_wchar_p signature apparently needed for OSX to marshal the argument correctly.
    handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p]
    handle.generate.restype = generation_outputs

def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwrite=-1,threads=6):
inputs = load_model_inputs()
inputs.model_filename = model_filename.encode("UTF-8")
Expand Down Expand Up @@ -276,6 +285,14 @@ def stop(self):
sys.exit(0)

def main(args):
global use_blas
if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")):
print("Warning: libopenblas.dll not found. OpenBLAS will be disabled.")
use_blas = False
elif not args.noblas:
print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas.dll will be required.")
use_blas = True
init_library() # Note: if blas does not exist and is enabled, program will crash.
ggml_selected_file = args.model_file
embedded_kailite = None
if not ggml_selected_file:
Expand Down Expand Up @@ -331,5 +348,6 @@ def main(args):
default_threads = (os.cpu_count() if os.cpu_count()<=6 else max(6,os.cpu_count()-2))
parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
parser.add_argument("--nostream", help="Disables pseudo streaming", action='store_true')
parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
args = parser.parse_args()
main(args)
Binary file modified llamacpp.dll
Binary file not shown.
Binary file added llamacpp_blas.dll
Binary file not shown.
2 changes: 1 addition & 1 deletion make_pyinstaller.bat
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyinstaller --noconfirm --onefile --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./llamacpp.dll;." "./llama_for_kobold.py"
pyinstaller --noconfirm --onefile --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./llamacpp.dll;." --add-data "./llamacpp_blas.dll;." --add-data "./libopenblas.dll;." "./llama_for_kobold.py" -n "llamacpp-for-kobold.exe"
133 changes: 133 additions & 0 deletions openblas_config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#ifndef OPENBLAS_CONFIG_H
#define OPENBLAS_CONFIG_H
#define OPENBLAS_OS_WINNT 1
#define OPENBLAS_ARCH_X86_64 1
#define OPENBLAS_C_GCC 1
#define OPENBLAS___64BIT__ 1
#define OPENBLAS_HAVE_C11 1
#define OPENBLAS_PTHREAD_CREATE_FUNC pthread_create
#define OPENBLAS_BUNDERSCORE _
#define OPENBLAS_NEEDBUNDERSCORE 1
#define OPENBLAS_GENERIC
#define OPENBLAS_L1_DATA_SIZE 32768
#define OPENBLAS_L1_DATA_LINESIZE 128
#define OPENBLAS_L2_SIZE 512488
#define OPENBLAS_L2_LINESIZE 128
#define OPENBLAS_DTB_DEFAULT_ENTRIES 128
#define OPENBLAS_DTB_SIZE 4096
#define OPENBLAS_L2_ASSOCIATIVE 8
#define OPENBLAS_CORE_generic
#define OPENBLAS_CHAR_CORENAME "generic"
#define OPENBLAS_SLOCAL_BUFFER_SIZE 4096
#define OPENBLAS_DLOCAL_BUFFER_SIZE 4096
#define OPENBLAS_CLOCAL_BUFFER_SIZE 8192
#define OPENBLAS_ZLOCAL_BUFFER_SIZE 8192
#define OPENBLAS_GEMM_MULTITHREAD_THRESHOLD 4
#define OPENBLAS_VERSION " OpenBLAS 0.3.22 "
/*This is only for "make install" target.*/

#if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX)
#define OPENBLAS_WINDOWS_ABI
#define OPENBLAS_OS_WINDOWS

#ifdef DOUBLE
#define DOUBLE_DEFINED DOUBLE
#undef DOUBLE
#endif
#endif

#ifdef OPENBLAS_NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_
#else
#define BLASFUNC(FUNC) FUNC
#endif

#ifdef OPENBLAS_QUAD_PRECISION
typedef struct {
unsigned long x[2];
} xdouble;
#elif defined OPENBLAS_EXPRECISION
#define xdouble long double
#else
#define xdouble double
#endif

#if defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__)
typedef long long BLASLONG;
typedef unsigned long long BLASULONG;
#else
typedef long BLASLONG;
typedef unsigned long BLASULONG;
#endif

#ifndef BFLOAT16
#include <stdint.h>
typedef uint16_t bfloat16;
#endif

#ifdef OPENBLAS_USE64BITINT
typedef BLASLONG blasint;
#else
typedef int blasint;
#endif

#if defined(XDOUBLE) || defined(DOUBLE)
#define FLOATRET FLOAT
#else
#ifdef NEED_F2CCONV
#define FLOATRET double
#else
#define FLOATRET float
#endif
#endif

/* Inclusion of a standard header file is needed for definition of __STDC_*
predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs
as a side effect of including either <features.h> or <stdc-predef.h>. */
#include <stdio.h>

/* C99 supports complex floating numbers natively, which GCC also offers as an
extension since version 3.0. If neither are available, use a compatible
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
(__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER)
#define OPENBLAS_COMPLEX_C99
#ifndef __cplusplus
#include <complex.h>
#endif
typedef float _Complex openblas_complex_float;
typedef double _Complex openblas_complex_double;
typedef xdouble _Complex openblas_complex_xdouble;
#define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I))
#define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I))
#define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I))
#define openblas_complex_float_real(z) (creal(z))
#define openblas_complex_float_imag(z) (cimag(z))
#define openblas_complex_double_real(z) (creal(z))
#define openblas_complex_double_imag(z) (cimag(z))
#define openblas_complex_xdouble_real(z) (creal(z))
#define openblas_complex_xdouble_imag(z) (cimag(z))
#else
#define OPENBLAS_COMPLEX_STRUCT
typedef struct { float real, imag; } openblas_complex_float;
typedef struct { double real, imag; } openblas_complex_double;
typedef struct { xdouble real, imag; } openblas_complex_xdouble;
#define openblas_make_complex_float(real, imag) {(real), (imag)}
#define openblas_make_complex_double(real, imag) {(real), (imag)}
#define openblas_make_complex_xdouble(real, imag) {(real), (imag)}
#define openblas_complex_float_real(z) ((z).real)
#define openblas_complex_float_imag(z) ((z).imag)
#define openblas_complex_double_real(z) ((z).real)
#define openblas_complex_double_imag(z) ((z).imag)
#define openblas_complex_xdouble_real(z) ((z).real)
#define openblas_complex_xdouble_imag(z) ((z).imag)
#endif

/* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */
#ifdef OPENBLAS_OS_LINUX
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>
#endif
#endif /* OPENBLAS_CONFIG_H */

0 comments on commit 664b277

Please sign in to comment.