 #include <cstring>
 #include <vector>
 #include <string>
+#include <unordered_map>
+#include <fstream>
+#include <cmath>
+#include <algorithm>

 struct quant_option {
     std::string name;
@@ -17,6 +21,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
     { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
     { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization",            },
+    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            },
     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M", },
@@ -72,22 +78,108 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
+    printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("  --exclude-weights tensor_name: do not use importance matrix for this/these tensor(s)\n");
+    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
     printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
         if (it.name != "COPY") {
             printf("  %2d  or  ", it.ftype);
         } else {
             printf("          ");
         }
-        printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
+        printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }
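For illustration, a hypothetical invocation using the new options (file names are placeholders; imatrix.dat is assumed to have been generated beforehand):

    ./quantize --imatrix imatrix.dat ggml-model-f16.gguf ggml-model-q2_k.gguf Q2_K 8

Note that all options must come before the positional arguments: the parser in main() below stops at the first argument that does not begin with "--".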
+static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
+    if (!in) {
+        printf("%s: failed to open %s\n", __func__, imatrix_file.c_str());
+        return;
+    }
+    int n_entries;
+    in.read((char *)&n_entries, sizeof(n_entries));
+    if (in.fail() || n_entries < 1) {
+        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        return;
+    }
+    for (int i = 0; i < n_entries; ++i) {
+        int len; in.read((char *)&len, sizeof(len));
+        std::vector<char> name_as_vec(len+1);
+        in.read((char *)name_as_vec.data(), len);
+        if (in.fail()) {
+            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
+            return;
+        }
+        name_as_vec[len] = 0;
+        std::string name{name_as_vec.data()};
+        auto & e = imatrix_data[std::move(name)];
+        int ncall;
+        in.read((char *)&ncall, sizeof(ncall));
+        int nval;
+        in.read((char *)&nval, sizeof(nval));
+        if (in.fail() || nval < 1) {
+            printf("%s: failed reading number of values for entry %d\n", __func__, i);
+            imatrix_data = {};
+            return;
+        }
+        e.resize(nval);
+        in.read((char *)e.data(), nval*sizeof(float));
+        if (in.fail()) {
+            printf("%s: failed reading data for entry %d\n", __func__, i);
+            imatrix_data = {};
+            return;
+        }
+        // the stored values are sums over ncall evaluations; keep the average
+        if (ncall > 0) {
+            for (auto & v : e) v /= ncall;
+        }
+    }
+    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
+}
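load_imatrix() above implies a simple binary layout for the imatrix file: an int32 entry count, then per entry an int32 name length, the raw name bytes (no terminator), an int32 ncall (how many evaluations the values were summed over; the loader divides by it), an int32 nval, and nval floats. A minimal writer sketch for that layout, e.g. for test fixtures (the tensor name and values are invented for illustration):

#include <fstream>
#include <string>
#include <vector>

// writes one entry in the layout load_imatrix() expects (sketch only)
static void write_imatrix_stub(const char * fname) {
    std::ofstream out(fname, std::ios::binary);
    const int n_entries = 1;
    out.write((const char *)&n_entries, sizeof(n_entries));
    const std::string name = "blk.0.attn_q.weight"; // hypothetical tensor name
    const int len = (int)name.size();
    out.write((const char *)&len, sizeof(len));
    out.write(name.data(), len);                    // name bytes, no '\0'
    const int ncall = 2;                            // sums over 2 evaluations
    const int nval  = 4;
    const std::vector<float> vals = {2.f, 4.f, 6.f, 8.f};
    out.write((const char *)&ncall, sizeof(ncall));
    out.write((const char *)&nval, sizeof(nval));
    out.write((const char *)vals.data(), nval*sizeof(float));
    // load_imatrix() will average these to {1, 2, 3, 4}
}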
+
+static void prepare_imatrix(const std::string & imatrix_file,
+        const std::vector<std::string> & included_weights,
+        const std::vector<std::string> & excluded_weights,
+        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    if (!imatrix_file.empty()) {
+        load_imatrix(imatrix_file, imatrix_data);
+    }
+    if (imatrix_data.empty()) {
+        return;
+    }
+    // drop entries whose tensor name contains an excluded substring
+    if (!excluded_weights.empty()) {
+        for (auto & name : excluded_weights) {
+            for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
+                auto pos = it->first.find(name);
+                if (pos != std::string::npos) it = imatrix_data.erase(it);
+                else ++it;
+            }
+        }
+    }
+    // keep only entries whose tensor name contains an included substring
+    if (!included_weights.empty()) {
+        std::unordered_map<std::string, std::vector<float>> tmp;
+        for (auto & name : included_weights) {
+            for (auto & e : imatrix_data) {
+                auto pos = e.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp.emplace(std::move(e));
+                }
+            }
+        }
+        imatrix_data = std::move(tmp);
+    }
+    if (!imatrix_data.empty()) {
+        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
+    }
+}
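Both filters match by substring (std::string::find) rather than by exact tensor name, and each flag can be given multiple times to accumulate patterns. So, hypothetically,

    ./quantize --imatrix imatrix.dat --include-weights attn_k --include-weights attn_q ggml-model-f16.gguf Q2_K

would keep importance data only for tensors whose names contain "attn_k" or "attn_q". Combining --include-weights with --exclude-weights is rejected in main() below.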
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -96,6 +188,8 @@ int main(int argc, char ** argv) {
     llama_model_quantize_params params = llama_model_quantize_default_params();

     int arg_idx = 1;
+    std::string imatrix_file;
+    std::vector<std::string> included_weights, excluded_weights;

     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -104,15 +198,43 @@ int main(int argc, char ** argv) {
             params.allow_requantize = true;
         } else if (strcmp(argv[arg_idx], "--pure") == 0) {
             params.pure = true;
+        } else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
+            if (arg_idx < argc-1) {
+                imatrix_file = argv[++arg_idx];
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
+            if (arg_idx < argc-1) {
+                included_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
+            if (arg_idx < argc-1) {
+                excluded_weights.push_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
         } else {
             usage(argv[0]);
         }
     }

     if (argc - arg_idx < 2) {
+        printf("%s: bad arguments\n", argv[0]);
+        usage(argv[0]);
+    }
+    if (!included_weights.empty() && !excluded_weights.empty()) {
         usage(argv[0]);
     }

+    std::unordered_map<std::string, std::vector<float>> imatrix_data;
+    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    if (!imatrix_data.empty()) {
+        params.imatrix = &imatrix_data;
+    }
+
     llama_backend_init(false);

     // parse command line arguments
@@ -163,6 +285,13 @@ int main(int argc, char ** argv) {
         }
     }

+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
+        fprintf(stderr, "\n===============================================================================================\n");
+        fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "===============================================================================================\n\n\n");
+        return 1;
+    }
+
     print_build_info();

     fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());