Skip to content

Commit

Permalink
integrated libopenblas for greatly accelerated prompt processing. Win…
Browse files Browse the repository at this point in the history
…dows binaries are included - feel free to build your own or to build for other platforms, but that is beyond the scope of this repo. Will fall back to non-blas if libopenblas is removed.
  • Loading branch information
LostRuins committed Mar 29, 2023
1 parent 49c4c22 commit 664b277
Show file tree
Hide file tree
Showing 12 changed files with 625 additions and 16 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ __pycache__
.swiftpm

dist/
llama_for_kobold.spec
*.spec
12 changes: 9 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif
ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -l:libopenblas.lib -L.
endif
ifdef LLAMA_GPROF
CFLAGS += -pg
Expand Down Expand Up @@ -221,7 +221,7 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )

default: main llamalib quantize
default: main llamalib quantize llamalib_blas

#
# Build library
Expand All @@ -230,6 +230,9 @@ default: main llamalib quantize
ggml.o: ggml.c ggml.h
$(CC) $(CFLAGS) -c ggml.c -o ggml.o

# BLAS-enabled variant of ggml.o: same sources, compiled with -DGGML_USE_OPENBLAS
# so the accelerated prompt-processing path is available. Built alongside the
# plain ggml.o so both the BLAS and non-BLAS DLLs can ship from one "make".
ggml_blas.o: ggml.c ggml.h
	$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_blas.o

llama.o: llama.cpp llama.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o

Expand All @@ -251,6 +254,9 @@ main: examples/main/main.cpp ggml.o llama.o common.o
llamalib: expose.cpp ggml.o common.o extra.o
$(CXX) $(CXXFLAGS) expose.cpp ggml.o common.o extra.o -shared -o llamacpp.dll $(LDFLAGS)

# OpenBLAS-accelerated shared library (llamacpp_blas.dll). Links the bundled
# libopenblas.lib import library directly; the loader falls back to the plain
# llamacpp.dll at runtime if this DLL (or libopenblas.dll) is unavailable.
llamalib_blas: expose.cpp ggml_blas.o common.o extra.o
	$(CXX) $(CXXFLAGS) expose.cpp ggml_blas.o common.o extra.o libopenblas.lib -shared -o llamacpp_blas.dll $(LDFLAGS)

quantize: examples/quantize/quantize.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)

Expand Down
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,22 @@ What does it mean? You get llama.cpp with a fancy UI, persistent stories, editin
- [Download the latest release here](https://github.com/LostRuins/llamacpp-for-kobold/releases/latest) or clone the repo.
- Windows binaries are provided in the form of **llamacpp-for-kobold.exe**, which is a pyinstaller wrapper for **llamacpp.dll** and **llama_for_kobold.py**. If you feel concerned, you may prefer to rebuild it yourself with the provided makefiles and scripts.
- Weights are not included, you can use the `quantize.exe` to generate them from your official weight files (or download them from other places).
- To run, execute **llamacpp-for-kobold.exe** or drag and drop your quantized ggml model.bin file onto the .exe, and then connect with Kobold or Kobold Lite.
- To run, execute **llamacpp-for-kobold.exe** or drag and drop your quantized `ggml_model.bin` file onto the .exe, and then connect with Kobold or Kobold Lite.
- By default, you can connect to http://localhost:5001
- You can also run it using the command line `llamacpp-for-kobold.exe [ggml_model.bin] [port]`. For info, please check `llamacpp-for-kobold.exe --help`
- If you are having crashes or issues with OpenBLAS, please try the `--noblas` flag.

## OSX and Linux
- You will have to compile your binaries from source. A makefile is provided, simply run `make`
- After all binaries are built, you can run the python script with the command `llama_for_kobold.py [ggml_model.bin] [port]`

## Considerations
- Don't want to use pybind11 due to dependencies on MSVC
- Make as few changes as possible (ideally ZERO) to main.cpp - do not move its function declarations elsewhere!
- Leave main.cpp UNTOUCHED, We want to be able to update the repo and pull any changes automatically.
- No dynamic memory allocation! Setup structs with FIXED (known) shapes and sizes for ALL output fields. Python will ALWAYS provide the memory, we just write to it.
- No external libraries or dependencies. That means no Flask, Pybind and whatever. All You Need Is Python.
- Since v1.0.6, requires libopenblas, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without BLAS.

## License
- The original GGML library and llama.cpp by ggerganov are licensed under the MIT License
Expand Down
411 changes: 411 additions & 0 deletions cblas.h

Large diffs are not rendered by default.

38 changes: 36 additions & 2 deletions expose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,22 @@
//No dynamic memory allocation! Setup structs with FIXED (known) shapes and sizes for ALL output fields
//Python will ALWAYS provide the memory, we just write to it.

#include <time.h>
#include "./examples/main/main.cpp"
#include "extra.h"
#include "ggml.h"

// Single-shot benchmark timer used to report prompt-processing and
// generation durations. Not reentrant: one global start point.
clock_t bench_timer = 0;

// Record the current clock() reading as the benchmark start point.
void timer_start()
{
    bench_timer = clock();
}

// Return the seconds elapsed since the last timer_start() call.
// NOTE(review): clock() is wall time on Windows but CPU time on POSIX, so
// multi-threaded runs may report inflated values on Linux/OSX — confirm intent.
double timer_check()
{
    const double elapsed_ticks = (double)(clock() - bench_timer);
    return elapsed_ticks / CLOCKS_PER_SEC;
}

void print_tok_vec(std::vector<llama_token> & embd)
{
Expand Down Expand Up @@ -67,6 +81,8 @@ extern "C" {

bool load_model(const load_model_inputs inputs)
{
printf("System Info: %s\n", llama_print_system_info());

ctx_params = llama_context_default_params();

n_threads = inputs.threads;
Expand Down Expand Up @@ -115,7 +131,7 @@ extern "C" {
params.n_ctx = inputs.max_context_length;
params.n_batch = n_batch;
params.n_threads = n_threads;

if(params.repeat_last_n<1)
{
params.repeat_last_n = 1;
Expand Down Expand Up @@ -183,6 +199,16 @@ extern "C" {

last_n_tokens.erase(last_n_tokens.begin(),last_n_tokens.begin()+n_past);
embd_inp.erase(embd_inp.begin(),embd_inp.begin()+n_past);

//if using BLAS and prompt is big enough, switch to single thread and use a huge batch
bool blasmode = (embd_inp.size() >= 32 && ggml_cpu_has_blas());
int original_batch = params.n_batch;
int original_threads = params.n_threads;
if(blasmode)
{
params.n_batch = 512;
params.n_threads = 1;
}

current_context_tokens.resize(n_past);

Expand All @@ -192,7 +218,9 @@ extern "C" {
std::string concat_output = "";

bool startedsampling = false;
printf("\nProcessing Prompt (%d tokens): ",embd_inp.size());
printf("\nProcessing Prompt (%d tokens%s): ",embd_inp.size(),(blasmode?", BLAS":""));
timer_start();
double time1=0,time2=0;

while (remaining_tokens > 0)
{
Expand Down Expand Up @@ -224,6 +252,10 @@ extern "C" {
if(!startedsampling)
{
startedsampling = true;
params.n_batch = original_batch;
params.n_threads = original_threads;
time1 = timer_check();
timer_start();
printf("\nGenerating (%d tokens): ",params.n_predict);
}

Expand Down Expand Up @@ -268,6 +300,8 @@ extern "C" {
}

}
time2 = timer_check();
printf("\nTime Taken - Processing:%.1fs, Generation:%.1fs, Total:%.1fs",time1,time2,(time1+time2));

output.status = 1;
snprintf(output.text, sizeof(output.text), "%s", concat_output.c_str());
Expand Down
Binary file added libopenblas.dll
Binary file not shown.
Binary file added libopenblas.lib
Binary file not shown.
34 changes: 26 additions & 8 deletions llama_for_kobold.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,23 @@ class generation_outputs(ctypes.Structure):
_fields_ = [("status", ctypes.c_int),
("text", ctypes.c_char * 16384)]

dir_path = os.path.dirname(os.path.realpath(__file__))
handle = ctypes.CDLL(os.path.join(dir_path, "llamacpp.dll"))

handle.load_model.argtypes = [load_model_inputs]
handle.load_model.restype = ctypes.c_bool
handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p] #apparently needed for osx to work. i duno why they need to interpret it that way but whatever
handle.generate.restype = generation_outputs

handle = None
use_blas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.

def init_library():
    """Load the llamacpp shared library and declare its ctypes signatures.

    Prefers the OpenBLAS-accelerated build (llamacpp_blas.dll) when use_blas
    is set; OpenBLAS should provide about a 2x speedup on prompt ingestion if
    compatible. If the BLAS DLL cannot be loaded (missing or incompatible
    libopenblas.dll), falls back to the plain llamacpp.dll instead of
    crashing, and clears use_blas to reflect the actual mode.
    """
    global handle, use_blas
    dir_path = os.path.dirname(os.path.realpath(__file__))
    if use_blas:
        try:
            handle = ctypes.CDLL(os.path.join(dir_path, "llamacpp_blas.dll"))
        except OSError:
            print("Warning: failed to load llamacpp_blas.dll, falling back to the non-BLAS library.")
            use_blas = False
            handle = ctypes.CDLL(os.path.join(dir_path, "llamacpp.dll"))
    else:
        handle = ctypes.CDLL(os.path.join(dir_path, "llamacpp.dll"))

    handle.load_model.argtypes = [load_model_inputs]
    handle.load_model.restype = ctypes.c_bool
    # c_wchar_p signature apparently needed for OSX to marshal the argument correctly.
    handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p]
    handle.generate.restype = generation_outputs

def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwrite=-1,threads=6):
inputs = load_model_inputs()
inputs.model_filename = model_filename.encode("UTF-8")
Expand Down Expand Up @@ -276,6 +285,14 @@ def stop(self):
sys.exit(0)

def main(args):
global use_blas
if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")):
print("Warning: libopenblas.dll not found. OpenBLAS will be disabled.")
use_blas = False
elif not args.noblas:
print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas.dll will be required.")
use_blas = True
init_library() # Note: if blas does not exist and is enabled, program will crash.
ggml_selected_file = args.model_file
embedded_kailite = None
if not ggml_selected_file:
Expand Down Expand Up @@ -331,5 +348,6 @@ def main(args):
default_threads = (os.cpu_count() if os.cpu_count()<=6 else max(6,os.cpu_count()-2))
parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
parser.add_argument("--nostream", help="Disables pseudo streaming", action='store_true')
parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
args = parser.parse_args()
main(args)
Binary file modified llamacpp.dll
Binary file not shown.
Binary file added llamacpp_blas.dll
Binary file not shown.
2 changes: 1 addition & 1 deletion make_pyinstaller.bat
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyinstaller --noconfirm --onefile --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./llamacpp.dll;." "./llama_for_kobold.py"
pyinstaller --noconfirm --onefile --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./llamacpp.dll;." --add-data "./llamacpp_blas.dll;." --add-data "./libopenblas.dll;." "./llama_for_kobold.py" -n "llamacpp-for-kobold.exe"
133 changes: 133 additions & 0 deletions openblas_config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#ifndef OPENBLAS_CONFIG_H
#define OPENBLAS_CONFIG_H
#define OPENBLAS_OS_WINNT 1
#define OPENBLAS_ARCH_X86_64 1
#define OPENBLAS_C_GCC 1
#define OPENBLAS___64BIT__ 1
#define OPENBLAS_HAVE_C11 1
#define OPENBLAS_PTHREAD_CREATE_FUNC pthread_create
#define OPENBLAS_BUNDERSCORE _
#define OPENBLAS_NEEDBUNDERSCORE 1
#define OPENBLAS_GENERIC
#define OPENBLAS_L1_DATA_SIZE 32768
#define OPENBLAS_L1_DATA_LINESIZE 128
#define OPENBLAS_L2_SIZE 512488
#define OPENBLAS_L2_LINESIZE 128
#define OPENBLAS_DTB_DEFAULT_ENTRIES 128
#define OPENBLAS_DTB_SIZE 4096
#define OPENBLAS_L2_ASSOCIATIVE 8
#define OPENBLAS_CORE_generic
#define OPENBLAS_CHAR_CORENAME "generic"
#define OPENBLAS_SLOCAL_BUFFER_SIZE 4096
#define OPENBLAS_DLOCAL_BUFFER_SIZE 4096
#define OPENBLAS_CLOCAL_BUFFER_SIZE 8192
#define OPENBLAS_ZLOCAL_BUFFER_SIZE 8192
#define OPENBLAS_GEMM_MULTITHREAD_THRESHOLD 4
#define OPENBLAS_VERSION " OpenBLAS 0.3.22 "
/*This is only for "make install" target.*/

#if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX)
#define OPENBLAS_WINDOWS_ABI
#define OPENBLAS_OS_WINDOWS

#ifdef DOUBLE
#define DOUBLE_DEFINED DOUBLE
#undef DOUBLE
#endif
#endif

#ifdef OPENBLAS_NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_
#else
#define BLASFUNC(FUNC) FUNC
#endif

#ifdef OPENBLAS_QUAD_PRECISION
typedef struct {
unsigned long x[2];
} xdouble;
#elif defined OPENBLAS_EXPRECISION
#define xdouble long double
#else
#define xdouble double
#endif

#if defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__)
typedef long long BLASLONG;
typedef unsigned long long BLASULONG;
#else
typedef long BLASLONG;
typedef unsigned long BLASULONG;
#endif

#ifndef BFLOAT16
#include <stdint.h>
typedef uint16_t bfloat16;
#endif

#ifdef OPENBLAS_USE64BITINT
typedef BLASLONG blasint;
#else
typedef int blasint;
#endif

#if defined(XDOUBLE) || defined(DOUBLE)
#define FLOATRET FLOAT
#else
#ifdef NEED_F2CCONV
#define FLOATRET double
#else
#define FLOATRET float
#endif
#endif

/* Inclusion of a standard header file is needed for definition of __STDC_*
predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs
as a side effect of including either <features.h> or <stdc-predef.h>. */
#include <stdio.h>

/* C99 supports complex floating numbers natively, which GCC also offers as an
extension since version 3.0. If neither are available, use a compatible
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
(__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER)
#define OPENBLAS_COMPLEX_C99
#ifndef __cplusplus
#include <complex.h>
#endif
typedef float _Complex openblas_complex_float;
typedef double _Complex openblas_complex_double;
typedef xdouble _Complex openblas_complex_xdouble;
#define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I))
#define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I))
#define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I))
#define openblas_complex_float_real(z) (creal(z))
#define openblas_complex_float_imag(z) (cimag(z))
#define openblas_complex_double_real(z) (creal(z))
#define openblas_complex_double_imag(z) (cimag(z))
#define openblas_complex_xdouble_real(z) (creal(z))
#define openblas_complex_xdouble_imag(z) (cimag(z))
#else
#define OPENBLAS_COMPLEX_STRUCT
typedef struct { float real, imag; } openblas_complex_float;
typedef struct { double real, imag; } openblas_complex_double;
typedef struct { xdouble real, imag; } openblas_complex_xdouble;
#define openblas_make_complex_float(real, imag) {(real), (imag)}
#define openblas_make_complex_double(real, imag) {(real), (imag)}
#define openblas_make_complex_xdouble(real, imag) {(real), (imag)}
#define openblas_complex_float_real(z) ((z).real)
#define openblas_complex_float_imag(z) ((z).imag)
#define openblas_complex_double_real(z) ((z).real)
#define openblas_complex_double_imag(z) ((z).imag)
#define openblas_complex_xdouble_real(z) ((z).real)
#define openblas_complex_xdouble_imag(z) ((z).imag)
#endif

/* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */
#ifdef OPENBLAS_OS_LINUX
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>
#endif
#endif /* OPENBLAS_CONFIG_H */

0 comments on commit 664b277

Please sign in to comment.