Commit b6b4f65

Merge pull request #73 from ggerganov/master

b1876

2 parents 76484fb + ddb008d
14 files changed (+1793, -159 lines)
CMakeLists.txt

Lines changed: 6 additions & 2 deletions

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.13) # for add_link_options
+cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -76,6 +76,10 @@ if (NOT MSVC)
     option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
 endif()

+if (WIN32)
+    option(LLAMA_WIN_VER "llama: Windows Version" 0x602)
+endif()
+
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS "llama: use BLAS" OFF)
@@ -686,7 +690,7 @@ endif()

 if (MINGW)
     # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=0x602)
+    add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
 endif()

 #
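
Note: the new LLAMA_WIN_VER cache variable feeds the _WIN32_WINNT definition used for MinGW builds. As a usage sketch (build directory and generator are left unspecified), the default Windows 8 target (0x602) can be overridden at configure time, for example to target Windows 10:

cmake -B build -DLLAMA_WIN_VER=0x0A00
cmake --build build --config Release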

examples/benchmark/benchmark-matmult.cpp

Lines changed: 2 additions & 2 deletions

@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
     struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);

     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -207,7 +207,7 @@ int main(int argc, char ** argv) {
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
     struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);

     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
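
Note: judging from these two call sites, ggml_quantize_chunk now takes a row count and a row length (plus an optional importance-matrix pointer) in place of the old flat element count. A minimal sketch of the new call shape, assuming an existing ggml_context * ctx and a populated F32 tensor src (both hypothetical here); the exact signature should be checked against ggml.h at this revision:

// sketch: quantize a 2-D F32 tensor `src` to Q4_0 with the updated call shape
std::vector<int64_t> hist(1 << 4, 0);                      // histogram buffer, as in the benchmark
struct ggml_tensor * dst = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, src->ne[0], src->ne[1]);
const int64_t n_per_row = src->ne[0];                      // elements per row
const int64_t nrows     = ggml_nelements(src) / n_per_row; // number of rows
ggml_quantize_chunk(GGML_TYPE_Q4_0, (const float *) src->data, dst->data,
                    0,                                     // starting offset, as in the benchmark
                    nrows, n_per_row,                      // rows and row length replace the flat count
                    hist.data(),
                    nullptr);                              // optional importance matrix (none here)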

examples/quantize/quantize.cpp

Lines changed: 131 additions & 2 deletions

@@ -5,6 +5,10 @@
 #include <cstring>
 #include <vector>
 #include <string>
+#include <unordered_map>
+#include <fstream>
+#include <cmath>
+#include <algorithm>

 struct quant_option {
     std::string name;
@@ -17,6 +21,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
     { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
     { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
+    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
     { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
@@ -72,22 +78,108 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
+    printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {
             printf(" %2d or ", it.ftype);
         } else {
             printf(" ");
         }
-        printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
+        printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }

+static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
+    if (!in) {
+        printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
+        return;
+    }
+    int n_entries;
+    in.read((char*)&n_entries, sizeof(n_entries));
+    if (in.fail() || n_entries < 1) {
+        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        return;
+    }
+    for (int i = 0; i < n_entries; ++i) {
+        int len; in.read((char *)&len, sizeof(len));
+        std::vector<char> name_as_vec(len+1);
+        in.read((char *)name_as_vec.data(), len);
+        if (in.fail()) {
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
+            return;
+        }
+        name_as_vec[len] = 0;
+        std::string name{name_as_vec.data()};
+        auto& e = imatrix_data[std::move(name)];
+        int ncall;
+        in.read((char*)&ncall, sizeof(ncall));
+        int nval;
+        in.read((char *)&nval, sizeof(nval));
+        if (in.fail() || nval < 1) {
+            printf("%s: failed reading number of values for entry %d\n",__func__,i);
+            imatrix_data = {};
+            return;
+        }
+        e.resize(nval);
+        in.read((char*)e.data(), nval*sizeof(float));
+        if (in.fail()) {
+            printf("%s: failed reading data for entry %d\n",__func__,i);
+            imatrix_data = {};
+            return;
+        }
+        if (ncall > 0) {
+            for (auto& v : e) v /= ncall;
+        }
+    }
+    printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
+}
+
+static void prepare_imatrix(const std::string& imatrix_file,
+        const std::vector<std::string>& included_weights,
+        const std::vector<std::string>& excluded_weights,
+        std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+    if (!imatrix_file.empty()) {
+        load_imatrix(imatrix_file, imatrix_data);
+    }
+    if (imatrix_data.empty()) {
+        return;
+    }
+    if (!excluded_weights.empty()) {
+        for (auto& name : excluded_weights) {
+            for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
+                auto pos = it->first.find(name);
+                if (pos != std::string::npos) it = imatrix_data.erase(it);
+                else ++it;
+            }
+        }
+    }
+    if (!included_weights.empty()) {
+        std::unordered_map<std::string, std::vector<float>> tmp;
+        for (auto& name : included_weights) {
+            for (auto& e : imatrix_data) {
+                auto pos = e.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp.emplace(std::move(e));
+                }
+            }
+        }
+        imatrix_data = std::move(tmp);
+    }
+    if (!imatrix_data.empty()) {
+        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
+    }
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -96,6 +188,8 @@ int main(int argc, char ** argv) {
     llama_model_quantize_params params = llama_model_quantize_default_params();

     int arg_idx = 1;
+    std::string imatrix_file;
+    std::vector<std::string> included_weights, excluded_weights;

     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -104,15 +198,43 @@ int main(int argc, char ** argv) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
             params.pure = true;
+        } else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
+            if (arg_idx < argc-1) {
+                imatrix_file = argv[++arg_idx];
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
+            if (arg_idx < argc-1) {
+                included_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
+            if (arg_idx < argc-1) {
+                excluded_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
         } else {
             usage(argv[0]);
         }
     }

     if (argc - arg_idx < 2) {
+        printf("%s: bad arguments\n", argv[0]);
+        usage(argv[0]);
+    }
+    if (!included_weights.empty() && !excluded_weights.empty()) {
         usage(argv[0]);
     }

+    std::unordered_map<std::string, std::vector<float>> imatrix_data;
+    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    if (!imatrix_data.empty()) {
+        params.imatrix = &imatrix_data;
+    }
+
     llama_backend_init(false);

     // parse command line arguments
@@ -163,6 +285,13 @@ int main(int argc, char ** argv) {
         }
     }

+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
+        fprintf(stderr, "\n===============================================================================================\n");
+        fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "===============================================================================================\n\n\n");
+        return 1;
+    }
+
     print_build_info();

     fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());

ggml-cuda.cu

Lines changed: 73 additions & 4 deletions

@@ -1105,6 +1105,61 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

+template<typename dst_t>
+static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+    const int i = blockIdx.x;
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int il = tid/8;
+    const int ir = tid%8;
+    const int ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_0 * x = (const block_q4_0 *)vx + ib;
+    const float d = __half2float(x->d);
+    const float dm = -8*d;
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d * (q[l] & 0xF) + dm;
+        y[l+16] = d * (q[l] >> 4) + dm;
+    }
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
+
+    const int i = blockIdx.x;
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int il = tid/8;
+    const int ir = tid%8;
+    const int ib = 8*i + ir;
+    if (ib >= nb32) {
+        return;
+    }
+
+    dst_t * y = yy + 256*i + 32*ir + 4*il;
+
+    const block_q4_1 * x = (const block_q4_1 *)vx + ib;
+    const float2 d = __half22float2(x->dm);
+
+    const uint8_t * q = x->qs + 4*il;
+
+    for (int l = 0; l < 4; ++l) {
+        y[l+ 0] = d.x * (q[l] & 0xF) + d.y;
+        y[l+16] = d.x * (q[l] >> 4) + d.y;
+    }
+}
+
 //================================== k-quants

 template<typename dst_t>
@@ -6253,6 +6308,20 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
 #endif
 }

+template<typename dst_t>
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb32 = k / 32;
+    const int nb = (k + 255) / 256;
+    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
+}
+
 template<typename dst_t>
 static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
@@ -6301,9 +6370,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
     int id;
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+            return dequantize_row_q4_0_cuda;
         case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+            return dequantize_row_q4_1_cuda;
         case GGML_TYPE_Q5_0:
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
@@ -6338,9 +6407,9 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+            return dequantize_row_q4_0_cuda;
         case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+            return dequantize_row_q4_1_cuda;
         case GGML_TYPE_Q5_0:
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
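
Note: in the new kernels each CUDA block covers eight 32-weight quant blocks (256 output values) with 32 threads; ir = tid%8 selects the quant block and il = tid/8 selects a 4-byte slice of its packed nibbles. A scalar CPU sketch of the same Q4_0 mapping, assuming the usual ggml block layout of one fp16 scale plus 16 packed bytes (the helper name is hypothetical):

#include <cstdint>

// sketch: dequantize one Q4_0 block of 32 weights, mirroring the kernel's arithmetic;
// `qs` holds the block's 16 packed bytes and `d` its scale already converted from fp16
static void dequantize_q4_0_block_ref(const uint8_t * qs, float d, float * y) {
    const float dm = -8.0f * d;                 // same offset the kernel precomputes
    for (int k = 0; k < 16; ++k) {
        y[k +  0] = d * (qs[k] & 0xF) + dm;     // low nibbles fill outputs 0..15
        y[k + 16] = d * (qs[k] >>  4) + dm;     // high nibbles fill outputs 16..31
    }
}

Q4_1 follows the same pattern, except the block stores a scale and a minimum (d.x and d.y in the kernel), so the additive offset is the stored minimum rather than -8*d.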

ggml-metal.m

Lines changed: 1 addition & 1 deletion

@@ -330,7 +330,6 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
         }
     }

-#if TARGET_OS_OSX
     // print MTL GPU family:
     GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);

@@ -370,6 +369,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
     GGML_METAL_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+#if TARGET_OS_OSX
     GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
     if (ctx->device.maxTransferRate != 0) {
         GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
