Changes from all commits
Commits
68 commits
5daa5f5
Link to cublas dynamically on Windows even with LLAMA_STATIC (#4506)
bullno1 Dec 17, 2023
62bd52b
server : allow requests larger than 8K (#4500)
mzcu Dec 17, 2023
eb16dae
server : fix possible ambiguity in content type charset (#4501)
z80maniac Dec 17, 2023
8edd2b4
server : fix grammar being ignored (#4494)
AdithyanI Dec 17, 2023
0ffc92d
server : disable llm logs if SERVER_VERBOSE is off (#3792)
olexiyb Dec 17, 2023
4566863
finetune : keep allocs alive until all allocations are done (#4486)
slaren Dec 17, 2023
919c406
build : Check the ROCm installation location (#4485)
Su3h7aM Dec 17, 2023
f7f468a
gguf-py : fail fast on nonsensical special token IDs (#4489)
cebtenzzre Dec 17, 2023
800a489
llama.swiftui : add bench functionality (#4483)
ggerganov Dec 17, 2023
b1306c4
readme : update hot topics
ggerganov Dec 17, 2023
2994f0c
decode : fix logits_valid for legacy API (#4516)
cebtenzzre Dec 18, 2023
3c04bf6
llama : fix try_override for bool_value which always return true (#4519)
hankcs Dec 18, 2023
b9e74f9
llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490)
ebeyabraham Dec 18, 2023
6ff39b1
llama.swiftui : add more models
ggerganov Dec 18, 2023
0e18b2e
llama.swiftui : add tinyllama 1.1B F16
ggerganov Dec 18, 2023
a7aee47
ggml-cuda: Fix HIP build (#4528)
arlo-phoenix Dec 18, 2023
4c274dc
fix tools compilation
LostRuins Dec 19, 2023
6948da5
Fix for windows model unloading not releasing memory (#569)
ebolam Dec 19, 2023
1f77d2a
move multiprocessing import into function scope
LostRuins Dec 19, 2023
49a5dfc
Merge branch 'master' into concedo_experimental
LostRuins Dec 19, 2023
da2db03
Added support for ssl cert and key
LostRuins Dec 19, 2023
3f863ee
add presence penalty
LostRuins Dec 19, 2023
328b83d
ggml : fixed check for _MSC_VER (#4535)
es0m Dec 19, 2023
799fc22
CUDA: Faster Mixtral prompt processing (#4538)
JohannesGaessler Dec 20, 2023
a787ebe
Handle broken pipe error (#572)
mahou-shoujo Dec 21, 2023
1d7a191
Fix access violation in ggml_cuda_free_data if tensor->extra is NULL …
LoganDark Dec 21, 2023
e1f013b
testing workflow for windows cuda builds
LostRuins Dec 21, 2023
96c12cf
Merge branch 'master' into concedo_experimental
LostRuins Dec 21, 2023
ff4c2b1
testing workflow for windows cuda builds
LostRuins Dec 21, 2023
c05d195
Merge branch 'concedo' into concedo_experimental
LostRuins Dec 21, 2023
2378a29
better error handling, try to avoid segfault in sillytavern
LostRuins Dec 21, 2023
d3223af
llama : disable per-tensor info prints on model load (#4562)
JohannesGaessler Dec 21, 2023
1398823
cuda : replace asserts in wrong architecture checks with __trap (#4556)
slaren Dec 21, 2023
66f35a2
cuda : better error message for ggml_get_rows (#4561)
bobqianic Dec 21, 2023
880e352
py : open merges file as 'utf-8' (#4566)
howlger Dec 21, 2023
c083718
readme : update coding guidelines
ggerganov Dec 21, 2023
9154494
CUDA: mul_mat_id always on GPU for batches >= 32 (#4553)
JohannesGaessler Dec 21, 2023
8fe03ff
common : remove incorrect --model-draft default (#4568)
cebtenzzre Dec 21, 2023
562cf22
ggml-cuda: Fix HIP build by adding define for __trap (#4569)
arlo-phoenix Dec 21, 2023
0f630fb
cuda : ROCm AMD Unified Memory Architecture (UMA) handling (#4449)
ekg Dec 21, 2023
56fa508
metal : fix `ggml_metal_log` vargs (#4373)
finnvoor Dec 21, 2023
31f2775
llama : allow getting n_batch from llama_context in c api (#4540)
MarcusDunn Dec 21, 2023
d232aca
llama : initial ggml-backend integration (#4520)
slaren Dec 21, 2023
4a5f9d6
ci : add `jlumbroso/free-disk-space` to docker workflow (#4150)
samm81 Dec 21, 2023
32259b2
gguf : simplify example dependencies
ggerganov Dec 21, 2023
769a7bc
gguf-py : fix broken link
ggerganov Dec 21, 2023
afefa31
ggml : change ggml_scale to take a float instead of tensor (#4573)
ggerganov Dec 21, 2023
375003b
always show reported arch
LostRuins Dec 22, 2023
c7e9701
llama : add ability to cancel model loading (#4462)
crasm Dec 22, 2023
230a638
Merge branch 'master' into concedo_experimental
LostRuins Dec 22, 2023
0137ef8
ggml : extend `enum ggml_log_level` with `GGML_LOG_LEVEL_DEBUG` (#4579)
bobqianic Dec 22, 2023
2bb9827
readme : add zig bindings (#4581)
Deins Dec 22, 2023
f31b984
ci : tag docker image with build number (#4584)
Dec 22, 2023
77463e0
batch size improvements
LostRuins Dec 22, 2023
28cb35a
make : add LLAMA_HIP_UMA option (#4587)
Dec 22, 2023
48b24b1
ggml : add comment about backward GGML_OP_DIAG_MASK_INF (#4203)
GermanAizek Dec 22, 2023
48b7ff1
llama : fix platforms without mmap (#4578)
slaren Dec 22, 2023
852ca78
cherrypicked the Hipblas fixed from PR #571
LostRuins Dec 22, 2023
6724ef1
Fix CudaMemcpy direction (#4599)
Ttl Dec 22, 2023
3bca03d
Merge branch 'master' into concedo_experimental
LostRuins Dec 22, 2023
a558769
cuda : fix jetson compile error (#4560)
FantasyGmm Dec 22, 2023
ba66175
sync : ggml (fix im2col) (#4591)
ggerganov Dec 22, 2023
b814bb2
Merge branch 'master' into concedo_experimental
LostRuins Dec 22, 2023
7082d24
lookup : add prompt lookup decoding example (#4484)
LeonEricsson Dec 22, 2023
8823e8b
added presence penalty into lite ui
LostRuins Dec 23, 2023
4a8308b
Merge branch 'master' into concedo_experimental
LostRuins Dec 23, 2023
71a5afa
fixed incorrect localflag
LostRuins Dec 23, 2023
af0a669
Merge branch 'exp-dynatemp-minp-latest' into try-update-concedo
kalomaze Dec 23, 2023
3 changes: 3 additions & 0 deletions .editorconfig
@@ -23,3 +23,6 @@ insert_final_newline = unset

[examples/server/public/*]
indent_size = 2

[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab
36 changes: 36 additions & 0 deletions .github/workflows/kcpp-build-release-win-cuda.yaml
@@ -0,0 +1,36 @@
name: Koboldcpp Builder Windows CUDA

on: workflow_dispatch
env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

jobs:
  windows:
    runs-on: windows-latest
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
        with:
          ref: concedo_experimental

      - uses: Jimver/cuda-toolkit@v0.2.11
        id: cuda-toolkit
        with:
          cuda: '11.7.1'
          method: 'network'
          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake .. -DLLAMA_CUBLAS=ON
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Save artifact
        uses: actions/upload-artifact@v3
        with:
          name: kcpp_windows_cuda_binary
          path: build/bin/Release/
1 change: 1 addition & 0 deletions .gitignore
@@ -37,6 +37,7 @@ models-mnt
/llama-bench
/llava-cli
/lookahead
/lookup
/main
/metal
/perplexity
7 changes: 6 additions & 1 deletion CMakeLists.txt
@@ -96,7 +96,12 @@ if (LLAMA_CUBLAS)
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})

if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
if (WIN32)
# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
71 changes: 45 additions & 26 deletions class.py
@@ -9,7 +9,7 @@
import requests
import numpy as np
from typing import List, Optional, Union
import os
import os, time
from . import koboldcpp

import utils
@@ -20,11 +20,9 @@
InferenceModel,
)

model_backend_name = "koboldcpp" #specific instead of ggml
model_backend_name = "KoboldCPP" #specific instead of ggml
model_backend_type = "ggml" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)

kcpp_backend_loaded = False

class KoboldCppException(Exception):
"""To be used for errors on cpp side of KoboldCpp."""

@@ -35,6 +33,7 @@ def __init__(self, **kwargs):
class model_backend(InferenceModel):
def __init__(self) -> None:
super().__init__()
self.kcpp_backend_loaded = False

def is_valid(self, model_name, model_path, menu_path):

@@ -257,26 +256,31 @@ def set_input_parameters(self, parameters):

def unload(self):
print("Attemping to unload library")
koboldcpp.unload_libs()
global kcpp_backend_loaded
kcpp_backend_loaded = False
pass
self.process.terminate()


def _load(self, save_model: bool, initial_load: bool) -> None:
global kcpp_backend_loaded
self.tokenizer = self._get_tokenizer("gpt2")
if not kcpp_backend_loaded:
kcppargs = KcppArgsObject(model=self.kcpp_filename, model_param=self.kcpp_filename,
port=5001, port_param=5001, host='', launch=False, lora=None, threads=self.kcpp_threads, blasthreads=self.kcpp_threads,
highpriority=False, contextsize=self.kcpp_ctxsize, blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase],
smartcontext=self.kcpp_smartcontext, bantokens=None, forceversion=0, nommap=self.kcpp_nommap,
usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas,
useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None,
onready='', multiuser=False, foreground=False)
kcppargs = KcppArgsObject(model=self.kcpp_filename, model_param=self.kcpp_filename,
port=5001, port_param=5001, host='', launch=False, lora=None, threads=self.kcpp_threads, blasthreads=self.kcpp_threads,
psutil_set_threads=False, highpriority=False, contextsize=self.kcpp_ctxsize,
blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase], stream=False, smartcontext=self.kcpp_smartcontext,
unbantokens=False, bantokens=None, usemirostat=None, forceversion=0, nommap=self.kcpp_nommap,
usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas,
useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None,
onready='', multiuser=False, foreground=False, preloadstory=None, noshift=False, remotetunnel=False)


koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server
kcpp_backend_loaded = True
pass
#koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server
(self.output_queue, self.input_queue, self.process) = koboldcpp.start_in_seperate_process(kcppargs)
while True:
data = self.output_queue.get()
if data['command'] == 'load status':
utils.koboldai_vars.total_layers = data['data']['total']
utils.koboldai_vars.loaded_layers = data['data']['loaded']
elif data['command'] == 'complete':
break
time.sleep(0.02)

def _save_settings(self):
pass
@@ -297,16 +301,31 @@ def _raw_generate(
# Store context in memory to use it for comparison with generated content
utils.koboldai_vars.lastctx = decoded_prompt

genresult = koboldcpp.generate(decoded_prompt,max_new,utils.koboldai_vars.max_length,
gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p,
gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range,
sampler_order=gen_settings.sampler_order,use_default_badwordsids=utils.koboldai_vars.use_default_badwordsids)
self.input_queue.put({'command': 'generate', 'data': [(decoded_prompt,max_new,utils.koboldai_vars.max_length,
gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p,
gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range),
{"sampler_order": gen_settings.sampler_order, "use_default_badwordsids": utils.koboldai_vars.use_default_badwordsids}
]})

#genresult = koboldcpp.generate(decoded_prompt,max_new,utils.koboldai_vars.max_length,
#gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p,
#gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range,
#sampler_order=gen_settings.sampler_order,use_default_badwordsids=utils.koboldai_vars.use_default_badwordsids)

genresult = []
while True:
data = self.output_queue.get()
print(data)
if data['command'] == 'generated text':
genresult.append(data['data'])
if self.output_queue.empty():
break
time.sleep(0.02)

outputs = [genresult]
return GenerationResult(
model=self,
out_batches=np.array(
[self.tokenizer.encode(x) for x in outputs]
[self.tokenizer.encode(x) for x in genresult]
),
prompt=prompt_tokens,
is_whole_generation=True,
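Note on the class.py changes above: instead of calling koboldcpp in-process, the backend now launches the library in a child process via koboldcpp.start_in_seperate_process(kcppargs) and talks to it over an output queue and an input queue, exchanging small dict messages ('load status' and 'complete' during loading, 'generate' and 'generated text' during generation). The snippet below is a minimal, self-contained sketch of that message loop; the command names are taken from the diff, while the stub worker and its payloads are made up purely for illustration and are not part of koboldcpp.

# Minimal sketch of the queue protocol used by the new class.py backend.
# The command names ('load status', 'complete', 'generate', 'generated text')
# come from the diff above; the stub worker below only imitates the real
# koboldcpp child process and is not part of the actual code.
import multiprocessing
import time

def stub_worker(output_queue, input_queue):
    # Pretend to load a model layer by layer, then answer one generate request.
    for loaded in range(1, 4):
        output_queue.put({'command': 'load status', 'data': {'total': 3, 'loaded': loaded}})
    output_queue.put({'command': 'complete'})
    msg = input_queue.get()
    if msg['command'] == 'generate':
        output_queue.put({'command': 'generated text', 'data': 'Hello from the stub worker.'})

if __name__ == '__main__':
    output_queue = multiprocessing.Queue()
    input_queue = multiprocessing.Queue()
    process = multiprocessing.Process(target=stub_worker, args=(output_queue, input_queue))
    process.start()

    # Load phase: poll until the worker reports completion, mirroring _load().
    while True:
        data = output_queue.get()
        if data['command'] == 'load status':
            print("loaded layers:", data['data']['loaded'], "/", data['data']['total'])
        elif data['command'] == 'complete':
            break
        time.sleep(0.02)

    # Generate phase: send a request and drain responses, mirroring _raw_generate().
    input_queue.put({'command': 'generate', 'data': ["example prompt"]})
    genresult = []
    while True:
        data = output_queue.get()
        if data['command'] == 'generated text':
            genresult.append(data['data'])
        if output_queue.empty():
            break
        time.sleep(0.02)

    print("".join(genresult))
    process.terminate()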
2 changes: 1 addition & 1 deletion common/common.cpp
@@ -921,7 +921,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
printf(" draft model for speculative decoding\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
3 changes: 2 additions & 1 deletion common/common.h
@@ -45,7 +45,7 @@ struct gpt_params {
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_draft = 8; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
@@ -248,3 +248,4 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

22 changes: 22 additions & 0 deletions convert-hf-to-gguf.py
@@ -182,6 +182,8 @@ def from_model_architecture(model_architecture):
return QwenModel
if model_architecture == "MixtralForCausalLM":
return MixtralModel
if model_architecture == "PhiForCausalLM":
return Phi2Model
return Model

def _is_model_safetensors(self) -> bool:
@@ -221,6 +223,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
return gguf.MODEL_ARCH.QWEN
if arch == "MixtralForCausalLM":
return gguf.MODEL_ARCH.LLAMA
if arch == "PhiForCausalLM":
return gguf.MODEL_ARCH.PHI2

raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -980,6 +984,24 @@ def write_tensors(self):
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)


class Phi2Model(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

        self.gguf_writer.add_name("Phi2")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_add_bos_token(False)


###### CONVERSION LOGIC ######


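For orientation, the new Phi2Model.set_gguf_parameters() above simply copies a handful of fields from the Hugging Face config (hparams) into GGUF metadata. A rough sketch of that mapping follows; the dict keys are shorthand for the corresponding gguf_writer.add_* calls rather than literal GGUF key names, and the numeric values are typical Phi-2 settings given only as an example, not something this change defines.

# Rough sketch of the Phi-2 HF-config -> GGUF mapping performed by
# Phi2Model.set_gguf_parameters() above. Keys are shorthand for the
# corresponding gguf_writer.add_* calls, and the hparams values are
# typical Phi-2 settings used only as an example.
hparams = {"n_layer": 32, "n_positions": 2048, "n_embd": 2560,
           "n_head": 32, "layer_norm_epsilon": 1e-5, "rotary_dim": 32}

gguf_metadata = {
    "name":                 "Phi2",
    "context_length":       hparams["n_positions"],
    "embedding_length":     hparams["n_embd"],
    "feed_forward_length":  4 * hparams["n_embd"],    # MLP is 4x the hidden size
    "block_count":          hparams["n_layer"],
    "head_count":           hparams["n_head"],
    "head_count_kv":        hparams["n_head"],        # same as head_count (no GQA)
    "layer_norm_eps":       hparams["layer_norm_epsilon"],
    "rope_dimension_count": hparams["rotary_dim"],    # partial RoPE
    "add_bos_token":        False,
}

print(gguf_metadata)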
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -33,6 +33,7 @@ else()
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(train-text-from-scratch)
if (LLAMA_METAL)
add_subdirectory(metal)
15 changes: 3 additions & 12 deletions examples/baby-llama/baby-llama.cpp
@@ -575,10 +575,7 @@ static struct ggml_tensor * forward(

// KQ_scaled = KQ / sqrt(n_embd/n_head)
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));

// KQ_masked = mask_past(KQ_scaled)
// KQ_masked shape [n_past + N, N, n_head, 1]
@@ -844,10 +841,7 @@ static struct ggml_tensor * forward_batch(

// KQ_scaled = KQ / sqrt(n_embd/n_head)
// KQ_scaled shape [n_past + N, N, n_head, n_batch]
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);

// KQ_masked = mask_past(KQ_scaled)
@@ -1131,10 +1125,7 @@ static struct ggml_tensor * forward_lora(

// KQ_scaled = KQ / sqrt(n_embd/n_head)
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));

// KQ_masked = mask_past(KQ_scaled)
// KQ_masked shape [n_past + N, N, n_head, 1]
2 changes: 1 addition & 1 deletion examples/export-lora/export-lora.cpp
@@ -309,7 +309,7 @@ static struct ggml_cgraph * build_graph_lora(
) {
struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
if (scaling != 1.0f) {
ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
ab = ggml_scale(ctx, ab, scaling);
}
struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
