
Commit 60700f4

Sample code of SignRound (#1313)
1 parent 7537830 commit 60700f4

6 files changed: +1201 -0 lines changed

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
This is sample code for SignRound ([arXiv](https://arxiv.org/abs/2309.05516)), which currently supports only LLaMA, OPT, and BLOOM models. We will provide a unified API in Intel Neural Compressor that supports a broader range of models.

![overview](./overview.png)
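
The sketch below is only a rough, illustrative outline of the core idea behind SignRound — learning a small, bounded rounding offset for each weight with signed gradient descent on a block-wise reconstruction loss. All names here (`ste_round`, `quantize`, `signround_step`, the scale `s`, the offset `V`, and the learning rate) are hypothetical and are not the API of `signround.py`; see the paper and the script for the actual implementation.

```python
# Illustrative sketch only -- not the implementation in signround.py.
import torch

def ste_round(x):
    # Straight-through estimator: round in the forward pass, identity gradient in backward.
    return (x.round() - x).detach() + x

def quantize(W, s, V, num_bits=4):
    # Fake quantization with a learnable rounding offset V added before rounding.
    qmax = 2 ** (num_bits - 1) - 1
    return torch.clamp(ste_round(W / s + V), -qmax - 1, qmax) * s

def signround_step(W, s, V, x, float_out, lr=2.5e-3):
    # One signed-gradient update of V on the block output reconstruction error.
    # Example setup (hypothetical): V = torch.zeros_like(W, requires_grad=True)
    loss = ((x @ quantize(W, s, V).T - float_out) ** 2).mean()
    (grad,) = torch.autograd.grad(loss, V)
    with torch.no_grad():
        V -= lr * grad.sign()     # update with the sign of the gradient only
        V.clamp_(-0.5, 0.5)       # keep the offset within half a quantization step
    return loss.item()
```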
# Prerequisite

Python 3.9 or higher

pip install -r requirements.txt

# Run

```bash
CUDA_VISIBLE_DEVICES=0 python3 signround.py --model_name facebook/opt-125m --amp --num_bits 4 --group_size -1 --seqlen 512
```

To reduce GPU memory usage, you can enable the `low_gpu_mem_usage` option. You can also lower the training batch size (`train_bs`) and increase `gradient_accumulate_steps` accordingly, so that the effective batch size (train_bs × gradient_accumulate_steps) stays the same.

```bash
CUDA_VISIBLE_DEVICES=0 python3 signround.py --model_name facebook/opt-125m --amp --num_bits 4 --group_size -1 --seqlen 512 --low_gpu_mem_usage --train_bs 1 --gradient_accumulate_steps 8
```

## Known issue

To work around the lambada evaluation bug in older versions of lm-eval, we use the lm-eval shipped with Intel Extension for Transformers (ITREX). This discrepancy may lead to slight variations in the reported results.

To reproduce the results in our paper, please install ITREX:

```bash
pip install intel-extension-for-transformers
```

## Reference

If you find SignRound useful or relevant to your research, please kindly cite our paper:

```
@article{cheng2023optimize,
  title={Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLMs},
  author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao},
  journal={arXiv preprint arXiv:2309.05516},
  year={2023}
}
```
Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
import os.path
import torch
import torch.nn as nn


def eval_model(model, model_name, tokenizer, tasks=["lambada_openai", "hellaswag", "winogrande", "piqa"], eval_bs=32):
    # Prefer the lm-eval fork shipped with Intel Extension for Transformers (ITREX);
    # fall back to the official lm-eval if ITREX is not installed.
    try:
        from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate as lm_evaluate
        print("evaluation with itrex lm-eval", flush=True)

        if str(model.device) == "cpu":
            model = model.to(torch.bfloat16)
            dtype = 'bfloat16'
        else:
            model = model.half()
            dtype = 'float16'
        model.eval()
        results = lm_evaluate(model="hf-causal",
                              model_args=f'pretrained="{model_name}",tokenizer="{model_name}",dtype={dtype}',
                              user_model=model,
                              tasks=tasks,
                              device=str(model.device),
                              batch_size=eval_bs)

    except:
        print("evaluation with official lm-eval", flush=True)
        from lm_eval.evaluator import simple_evaluate
        import json
        import shutil

        # Save the model to disk so the official lm-eval can load it by path.
        output_dir = "./tmp_signround"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        if output_dir is not None:
            model.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
        if str(model.device) == "cpu":
            dtype = 'bfloat16'
        else:
            dtype = 'float16'
        results = simple_evaluate(model="hf-causal",
                                  model_args=f'pretrained="{output_dir}",tokenizer="{output_dir}",dtype={dtype}',
                                  tasks=tasks,
                                  device=str(model.device),
                                  batch_size=eval_bs,
                                  no_cache=True)
        dumped = json.dumps(results, indent=2)
        print(dumped)

        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    @torch.no_grad()
    def eval_same_with_gptq(model, testenc, dev):
        # Perplexity evaluation following the GPTQ protocol: split the test set
        # into non-overlapping windows of model.seqlen tokens and accumulate the
        # negative log-likelihood of each window.
        print('Evaluating ...', flush=True)
        # model.eval()
        model.to(dev)

        testenc = testenc.input_ids
        nsamples = testenc.numel() // model.seqlen

        use_cache = model.config.use_cache
        model.config.use_cache = False

        testenc = testenc.to(dev)
        nlls = []
        for i in range(nsamples):
            batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
            lm_logits = model(batch).logits
            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = testenc[
                           :, (i * model.seqlen):((i + 1) * model.seqlen)
                           ][:, 1:]
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            neg_log_likelihood = loss.float() * model.seqlen
            nlls.append(neg_log_likelihood)
        # ppl = exp(total NLL / total number of predicted tokens)
        ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
        print(ppl.item())

        model.config.use_cache = use_cache
        return ppl.item()

    # Perplexity on the datasets used by GPTQ, loaded with the GPTQ data loaders.
    datasets = ['wikitext2', 'ptb-new', 'c4-new']

    from gptq_data_loader import get_loaders
    for dataset in datasets:
        dataloader, testloader = get_loaders(
            dataset, seed=0, model=model_name, seqlen=model.seqlen
        )
        print(dataset, flush=True)
        ppl = eval_same_with_gptq(model, testloader, str(model.device))
        results.update({dataset: ppl})
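
For reference, here is a minimal, hypothetical usage sketch for the evaluation helper above, assuming a Hugging Face causal LM loaded with `transformers`; the model name and the manual `model.seqlen` assignment are illustrative only, not part of the committed sample.

```python
# Hypothetical usage sketch; not part of the committed sample.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "facebook/opt-125m"  # illustrative model choice
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.seqlen = 2048  # the GPTQ-style perplexity loop reads model.seqlen

eval_model(model, model_name, tokenizer,
           tasks=["lambada_openai", "piqa"], eval_bs=32)
```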
