
Commit 22daf80

Update gptj example with the newest GPTQ API. (#1277)
Signed-off-by: YIYANGCAI <yiyang.cai@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 550cee2 commit 22daf80

4 files changed: +81 −39 lines changed

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/README.md

Lines changed: 19 additions & 2 deletions
@@ -37,7 +37,7 @@ sh run_quant.sh --topology=gpt_j_wikitext_weight_only --input_model=EleutherAI/g
 >
 > `weight_only_bits`, `weight_only_group`, `weight_only_scheme`, and `weight_only_algorithm` can be modified by user. For details, please refer to [README](../../../../../../../docs/source/quantization_weight_only.md).
 
-### Run MLPerf on GPT-J-6B
+### Run MLPerf on GPT-J-6B using GPTQ
 Use the following link to get
 [**CNN Daily Mail** datasets](https://github.com/intel-innersource/frameworks.ai.benchmarking.mlperf.submission.inference-submission-v3-1/tree/master/closed/Intel/code/gpt-j/pytorch-cpu#download-and-prepare-dataset)
 and [gpt-j-6B mlperf model](https://github.com/mlcommons/inference/tree/master/language/gpt-j#download-gpt-j-model)
@@ -54,11 +54,28 @@ python -u examples/pytorch/nlp/huggingface_models/language-modeling/quantization
     --val-data-path /your/data/validation-data/cnn_dailymail_validation.json \
     --calib-iters 128 \
     --use_max_length \
-    --use_fp16 \
+    --pad_max_length 2048 \
     --use_gpu
 ```
 Notes: for per channel quantization, set group_size to **-1**, otherwise 32, 128, etc. More comprehensive details about user-defined arguments are available at our [weight_onlly quantization documentations](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md#quantization-capability)
 
+### Run general examples on a wide variety of LLMs using GPTQ
+We also support GPTQ algorithm on various language models (OPTs, Blooms, LLaMAs, MPTs, Falcons, ChatGLMs, etc.) in a generalized code. Please refer to script *run-gptq-llm.py* for more information.
+
+You can simply use following command to do quantization (please refer to *run-gptq-llm.sh*).
+```shell
+python examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/run-gptq-llm.py \
+    --model_name_or_path facebook/opt-125m \
+    --weight_only_algo GPTQ \
+    --dataset NeelNanda/pile-10k \
+    --wbits 4 \
+    --group_size 128 \
+    --pad_max_length 2048 \
+    --use_max_length \
+    --seed 0 \
+    --gpu
+```
+
 ## 2. Benchmark
 ```bash
 # int8
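
Editor's note: the "Notes" line above hinges on what `group_size` means for weight-only quantization. Below is a minimal, self-contained sketch (not part of this commit and not the Neural Compressor implementation) of how per-group scales relate to `group_size`, with `-1` collapsing to per-channel, i.e. one group spanning each output channel's full input dimension.

```python
import torch

def group_quant_scales(weight: torch.Tensor, group_size: int = 128) -> torch.Tensor:
    """Illustrative sketch: per-group max-abs scales for 4-bit symmetric
    weight-only quantization. group_size=-1 means per-channel quantization."""
    out_features, in_features = weight.shape
    if group_size == -1:
        group_size = in_features  # one group per output channel
    assert in_features % group_size == 0, "in_features must be divisible by group_size"
    # Split each row into consecutive groups of `group_size` columns.
    groups = weight.reshape(out_features, in_features // group_size, group_size)
    # Symmetric 4-bit integers span [-8, 7]; derive a scale from the per-group max-abs.
    return groups.abs().amax(dim=-1) / 7.0

w = torch.randn(16, 256)
print(group_quant_scales(w, group_size=128).shape)  # torch.Size([16, 2])
print(group_quant_scales(w, group_size=-1).shape)   # torch.Size([16, 1])
```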

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/run_gptj_mlperf_int4.py

Lines changed: 11 additions & 8 deletions
@@ -31,8 +31,8 @@ def skip(*args, **kwargs):
     torch.nn.init.uniform_ = skip
     torch.nn.init.normal_ = skip
     from transformers import GPTJForCausalLM, AutoModelForCausalLM
-    model = GPTJForCausalLM.from_pretrained(model) # load the model with fp32 precision
-    #model = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
+    # model = GPTJForCausalLM.from_pretrained(model) # load the model with fp32 precision
+    model = GPTJForCausalLM.from_pretrained(model, torch_dtype=torch.bfloat16)
     return model
 
 def postprocess_text(preds, targets):
@@ -90,7 +90,6 @@ def sync():
     predictions = []
     ground_truths = []
 
-    # import pdb;pdb.set_trace()
     with torch.no_grad(), torch.inference_mode():
         times = []
         #for i, (input_ids, labels) in enumerate(benchmark_dataset):# in range(input_ids.numel()):
@@ -266,7 +265,6 @@ def forward(self, *inp, **kwargs):
 args = parser.parse_args()
 # method 1: directly import AutoModelForCausalLM
 model = get_gptj(args.model_name_or_path)
-model.seqlen = args.pad_max_length
 model.eval()
 
 if args.use_gpu and torch.cuda.is_available():
@@ -288,15 +286,19 @@ def forward(self, *inp, **kwargs):
 
 # # do the quantization
 print('Starting ...')
+if args.sym:
+    sym_opt = 'sym'
+else:
+    sym_opt = 'asym'
 
 conf = PostTrainingQuantConfig(
     approach='weight_only',
     op_type_dict={
         '.*':{ # re.match
             "weight": {
-                'bits': 4, # 1-8 bits
-                'group_size': 128, # -1 (per-channel)
-                'scheme': 'sym',
+                'bits': args.wbits, # 1-8 bits
+                'group_size': args.group_size, # -1 (per-channel)
+                'scheme': sym_opt,
                 'algorithm': 'GPTQ',
             },
         },
@@ -314,7 +316,8 @@ def forward(self, *inp, **kwargs):
             'act_order':args.act_order,
             'block_size': args.block_size,
             'nsampeles': args.nsamples,
-            'use_max_length': args.use_max_length
+            'use_max_length': args.use_max_length,
+            'pad_max_length': args.pad_max_length
         },
     },
 )
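
Editor's note: the hunks above replace the hard-coded GPTQ settings with `args.wbits`, `args.group_size`, and the new `sym`/`asym` switch. A hedged sketch of how such a config is typically consumed end to end follows; `quantization.fit` is Neural Compressor's post-training entry point, while `model` and `dataloader` stand in for the GPT-J model and calibration dataloader the script builds elsewhere (those names are assumptions, not taken from the diff). The GPTQ-specific options from the last hunk (act_order, block_size, nsamples, use_max_length, pad_max_length) would be passed through the same config (via its recipes in current Neural Compressor releases) and are omitted here for brevity.

```python
# Hedged sketch, not part of the commit: wiring the CLI-driven values into a
# weight-only GPTQ config and running post-training quantization.
from neural_compressor import PostTrainingQuantConfig, quantization

wbits, group_size, sym = 4, 128, False  # normally args.wbits / args.group_size / args.sym
conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {  # applies to every matched op type
            "weight": {
                "bits": wbits,
                "group_size": group_size,
                "scheme": "sym" if sym else "asym",
                "algorithm": "GPTQ",
            },
        },
    },
)
# `model` is the bf16 GPT-J returned by get_gptj(); `dataloader` yields the
# calibration batches. Both are assumed to exist, as in the script above.
q_model = quantization.fit(model, conf, calib_dataloader=dataloader)
```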

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/ptq_weight_only/run_gptj_mlperf_int4.sh

Lines changed: 1 addition & 1 deletion
@@ -12,5 +12,5 @@ python -u examples/pytorch/nlp/huggingface_models/language-modeling/quantization
     --val-data-path ${VALIDATION_DATA} \
     --calib-iters 128 \
     --use_max_length \
-    --use_fp16 \
+    --pad_max_length 2048 \
     --use_gpu

neural_compressor/adaptor/torch_utils/gptq.py

Lines changed: 50 additions & 28 deletions
@@ -236,10 +236,11 @@ def prepare_dataloader(self):
             # general selection, no padding, not GPTQ original implementation.
             self.obtain_first_n_samples()
         try:
-            self.inp = [torch.zeros(1) for _ in range(len(self.dataloader))]
-            self.cache = {"i": 0} # a dict of list, keyword arguments ("attention_masks", "position_ids", etc.)
+            self.cache_key_arguments = {
+                "i": 0
+            } # a dict of list, keyword arguments ("attention_masks", "position_ids", etc.)
+            # Note that the first elements in cache_positional_arguments is main input: hidden_states
             self.cache_positional_arguments = [] # a list of list, positional arguments ("rotary_pos_emb" in chatglm)
-            self.out = [torch.zeros(1) for _ in range(len(self.dataloader))]
             self.is_ready = True
         except:
             logger.warning("GPTQ Quantizer initialization failed!")
@@ -259,9 +260,14 @@ def obtain_first_n_samples(self, seed=0):
                 if batch[0].shape[-1] > self.pad_max_length:
                     i = random.randint(0, batch[0].shape[-1] - self.pad_max_length - 1)
                     j = i + self.pad_max_length
-                    batch_final = batch[0][:, i:j]
+                    batch_final = []
+                    for item in batch:
+                        if isinstance(item, torch.Tensor) and item.shape.__len__() == 2:
+                            batch_final.append(item[:, i:j])
+                        else:
+                            batch_final.append(item)
                 else:
-                    batch_final = batch[0]
+                    batch_final = batch[:]
             # dict
             elif isinstance(batch, dict):
                 try:
@@ -302,18 +308,22 @@ def obtain_first_n_samples_fulllength(self, seed=0):
             if len(self.dataloader) == self.nsamples:
                 logger.info(f"Successfully collect {self.nsamples} calibration samples.")
                 break
-            # list & tuple
+            # list & tuple, gpt-j-6b mlperf, etc.
             if isinstance(batch, list) or isinstance(batch, tuple):
                 if batch[0].shape[-1] == unified_length:
-                    batch_final = batch[0]
+                    batch_final = batch[:]
                 elif batch[0].shape[-1] > unified_length:
                     i = random.randint(0, batch[0].shape[-1] - unified_length - 1)
                     j = i + unified_length
-                    batch_final = batch[0][:, i:j]
+                    batch_final = []
+                    for item in batch:
+                        if isinstance(item, torch.Tensor) and item.shape.__len__() == 2:
+                            batch_final.append(item[:, i:j])
+                        else:
+                            batch_final.append(item)
                 else:
                     # not match max length, not include in target dataset
                     continue
-                self.dataloader.append(batch_final)
             # dict
             elif isinstance(batch, dict):
                 try:
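
Editor's note before the remaining hunks: both `obtain_first_n_samples*` changes above switch from cropping only `batch[0]` to cropping every 2-D tensor in a list/tuple batch with the same random window. A small self-contained illustration of that cropping behaviour (a sketch, not the library code):

```python
import random
import torch

def crop_list_batch(batch, max_len=2048):
    """Every 2-D tensor in the batch (e.g. input_ids, attention_mask) is cut to
    the same random [i:j] window; all other items pass through unchanged."""
    if batch[0].shape[-1] <= max_len:
        return list(batch)
    i = random.randint(0, batch[0].shape[-1] - max_len - 1)
    j = i + max_len
    return [item[:, i:j] if isinstance(item, torch.Tensor) and item.dim() == 2 else item
            for item in batch]

ids = torch.randint(0, 50257, (1, 4096))
mask = torch.ones(1, 4096, dtype=torch.long)
cropped = crop_list_batch((ids, mask), max_len=2048)
print(cropped[0].shape, cropped[1].shape)  # torch.Size([1, 2048]) torch.Size([1, 2048])
```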
@@ -406,17 +416,16 @@ def pre_quantization(self):
         """Prepare input calibration data and other attributes which are critical for gptq execution."""
 
         # critical: hooker function which collects inputs
-        def forward(layer, hidden_states, *args, **kwargs):
+        def forward(layer, *args, **kwargs):
             # inputs[inputs_info['idx']] = input_ids # TODO solve the problem of batchsize!=1
-            self.inp[self.cache["i"]] = hidden_states
-            self.cache["i"] += 1
+            self.cache_key_arguments["i"] += 1
             for arg in kwargs:
                 # TODO: investigate include parameters
                 # each outputs can be different shape, hence also use list to store
                 if isinstance(kwargs[arg], torch.Tensor) or arg == "alibi":
-                    if self.cache.get(arg, None) is None:
-                        self.cache[arg] = []
-                    self.cache[arg].append(kwargs[arg])
+                    if self.cache_key_arguments.get(arg, None) is None:
+                        self.cache_key_arguments[arg] = []
+                    self.cache_key_arguments[arg].append(kwargs[arg])
                 continue
             # copy positional arguments, positional arguments are sensitive for their order, be cautious!
             # Most models in HF has avoid this, but some models still use positional arguments other than
@@ -454,8 +463,12 @@ def forward(layer, hidden_states, *args, **kwargs):
             pass
         # output inp data shape
         logger.info("All calibration data's shape =>")
-        for idx in range(len(self.dataloader)):
-            logger.info(self.inp[idx].shape)
+        # check all hidden_states shape
+        try:
+            for hidden_states in self.cache_positional_arguments[0]:
+                logger.info(hidden_states.shape)
+        except:
+            pass
         logger.info("Done.")
 
         # Step 4: restore original forward function, relocate layers back to cpu.
@@ -481,12 +494,20 @@ def gather_single_batch_from_list(self, data_list, idx):
             single_batch.append(data_item[idx])
         return single_batch
 
+    def update_blockwise_hidden_states(self, outs):
+        if "hidden_states" in self.cache_key_arguments:
+            self.cache_key_arguments["hidden_states"] = outs[:]
+        else:
+            self.cache_positional_arguments[0] = outs[:]
+
     @torch.no_grad()
     def execute_quantization(self, means=None, stds=None):
         """Run quantization."""
         # Step1: prepare quantization (calibration datasets)
+
         logger.info("Begin ====>")
         self.pre_quantization()
+
         # Step2: run gptq quantization in a transformer block-wise manner.
         gptq_config = {}
         tblock_length = len(self.gptq_related_blocks["transformers"])
@@ -533,13 +554,13 @@ def tmp(_, inp, out):
             handles = [] # register handles which add inputs and outputs to gptq object
             for layer_name in sub_layers:
                 handles.append(sub_layers[layer_name].register_forward_hook(add_batch(layer_name)))
-            idx = self.cache.pop("i")
+            idx = self.cache_key_arguments.pop("i")
+            # import pdb;pdb.set_trace()
             for j in range(len(self.dataloader)):
-                # self.inp[j] shape: [1, seq_len, hidden_size] (batchsize is 1 by default)
-                cache_batch = self.gather_single_batch_from_dict(self.cache, j)
+                cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j)
                 cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j)
-                self.out[j] = transformer_block(self.inp[j], *cache_positional_batch, **cache_batch)[0]
-            self.cache["i"] = idx
+                out = transformer_block(*cache_positional_batch, **cache_keyword_batch)[0]
+            self.cache_key_arguments["i"] = idx
             for h in handles:
                 h.remove()
             # Step 2.4: everything is prepared, so start quantization!
@@ -565,18 +586,19 @@ def tmp(_, inp, out):
                     gptq_for_this_block[layer_name].free()
 
             # Step 2.5: replace output data with quantized weights
-            idx = self.cache.pop("i")
+            outs = []
+            idx = self.cache_key_arguments.pop("i")
             for j in range(len(self.dataloader)):
-                # self.inp[j] shape: [1, seq_len, hidden_size] (batchsize is 1 by default)
-                cache_batch = self.gather_single_batch_from_dict(self.cache, j)
+                cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j)
                 cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j)
-                self.out[j] = transformer_block(self.inp[j], *cache_positional_batch, **cache_batch)[0]
-            self.cache["i"] = idx
+                out = transformer_block(*cache_positional_batch, **cache_keyword_batch)[0]
+                outs.append(out)
+            self.cache_key_arguments["i"] = idx
             self.gptq_related_blocks["transformers"][block_idx] = transformer_block.cpu()
             del gptq_for_this_block
             torch.cuda.empty_cache()
             # iteratively replace the input with output, thus layerwise quantization can continue.
-            self.inp, self.out = self.out, self.inp
+            self.update_blockwise_hidden_states(outs)
             logger.info("------------------------------")
 
         logger.info("Quantization done")
