From 55698e3596a6d89164ac50c70f919f460cec4fca Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 9 Apr 2024 09:58:25 -0400 Subject: [PATCH] Remove usage of SparseAutoModel.log_model_load (#2232) --- ...ment-analysis-python-custom-teacher-rottentomatoes.ipynb | 4 +--- .../docs-sentiment-analysis-python-sst2.ipynb | 6 +----- ...ext-classification-python-custom-teacher-tweeteval.ipynb | 5 +---- .../docs-text-classification-python-qqp.ipynb | 4 +--- .../docs-text-classification-python-sick.ipynb | 1 - .../docs-token-classification-python-conll2003.ipynb | 2 +- ...cs-token-classification-python-custom-teacher-wnut.ipynb | 3 +-- 7 files changed, 6 insertions(+), 19 deletions(-) diff --git a/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-custom-teacher-rottentomatoes.ipynb b/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-custom-teacher-rottentomatoes.ipynb index 0eeb728df72..8fd73c5b3d1 100644 --- a/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-custom-teacher-rottentomatoes.ipynb +++ b/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-custom-teacher-rottentomatoes.ipynb @@ -364,13 +364,11 @@ "model_kwargs = {\"config\": model_config}\n", "model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_path,**model_kwargs,)\n", - "SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed) # prints metrics on sparsity profile\n", "\n", "# initialize teacher using familiar HF AutoModel\n", "teacher_kwargs = {\"config\": teacher_config}\n", "teacher_kwargs[\"state_dict\"], t_delayed = SparseAutoModel._loadable_state_dict(teacher_path)\n", - "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path,**teacher_kwargs,)\n", - "SparseAutoModel.log_model_load(teacher, teacher_path, \"teacher\", t_delayed)" + "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path,**teacher_kwargs,)" ] }, { diff --git a/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-sst2.ipynb b/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-sst2.ipynb index a734dc09fe6..0f34fa42328 100644 --- a/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-sst2.ipynb +++ b/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-sst2.ipynb @@ -440,11 +440,7 @@ "\n", "teacher_kwargs = {'config':teacher_config}\n", "teacher_kwargs[\"state_dict\"], t_delayed = SparseAutoModel._loadable_state_dict(teacher_path)\n", - "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path, **teacher_kwargs,)\n", - "\n", - "# optional - prints metrics about sparsity profiles of the models\n", - "SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed)\n", - "SparseAutoModel.log_model_load(teacher, teacher_path, \"teacher\", t_delayed)" + "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path, **teacher_kwargs,)" ] }, { diff --git a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-custom-teacher-tweeteval.ipynb 
b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-custom-teacher-tweeteval.ipynb index 510fd548551..271dcad9f01 100644 --- a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-custom-teacher-tweeteval.ipynb +++ b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-custom-teacher-tweeteval.ipynb @@ -515,10 +515,7 @@ "# initialize model using familiar HF AutoModel\n", "model_kwargs = {\"config\": config}\n", "model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n", - "model = AutoModelForSequenceClassification.from_pretrained(model_path, **model_kwargs,)\n", - "\n", - "# prints metrics on sparsity profile\n", - "SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed)" + "model = AutoModelForSequenceClassification.from_pretrained(model_path, **model_kwargs,)" ] }, { diff --git a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-qqp.ipynb b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-qqp.ipynb index 15831a3309a..ada42932630 100644 --- a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-qqp.ipynb +++ b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-qqp.ipynb @@ -375,13 +375,11 @@ "model_kwargs = {\"config\": model_config}\n", "model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_path, **model_kwargs,)\n", - "SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed) # prints metrics on sparsity profile\n", "\n", "# initialize teacher using familiar HF AutoModel\n", "teacher_kwargs = {\"config\": teacher_config}\n", "teacher_kwargs[\"state_dict\"], t_delayed = SparseAutoModel._loadable_state_dict(teacher_path)\n", - "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path, **teacher_kwargs,)\n", - "SparseAutoModel.log_model_load(teacher, teacher_path, \"teacher\", t_delayed) # prints metrics on sparsity profile" + "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path, **teacher_kwargs,)\n" ] }, { diff --git a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-sick.ipynb b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-sick.ipynb index 03ec4a54d16..f086165632a 100644 --- a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-sick.ipynb +++ b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-sick.ipynb @@ -361,7 +361,6 @@ "model_kwargs = {\"config\": config}\n", "model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_path,**model_kwargs,)\n", - "SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed) # prints metrics on sparsity profile\n", "\n", "# FYI: there is a factory function called SparseAutoModel that does the same as above\n", "# model, teacher = SparseAutoModel.text_classification_from_pretrained_distil(\n", diff --git 
a/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-conll2003.ipynb b/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-conll2003.ipynb index a32705402b3..676747fa06e 100644 --- a/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-conll2003.ipynb +++ b/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-conll2003.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","metadata":{"id":"kSNEB-3orJ9C"},"source":["# **Token Classification: Sparse Transfer Learning with the Python API**\n","\n","In this example, you will fine-tune a 90% pruned BERT model onto the Conll2003 NER dataset using SparseML's Hugging Face Integration.\n","\n","### **Sparse Transfer Learning Overview**\n","\n","Sparse Transfer Learning is very similiar to typical fine-tuning you are used to when training models. However, with Sparse Transfer Learning, we start the training process from a pre-sparsified checkpoint and maintain the sparsity structure while the fine tuning occurs. \n","\n","At the end, you will have a sparse model trained on your dataset, ready to be deployed with DeepSparse for GPU-class performance on CPUs!\n","\n","### **Pre-Sparsified BERT**\n","SparseZoo, Neural Magic's open source repository of pre-sparsified models, contains a 90% pruned version of BERT, which has been sparsified on the upstream Wikipedia and BookCorpus datasets with the\n","masked language modeling objective. [Check out the model card](https://sparsezoo.neuralmagic.com/models/nlp%2Fmasked_language_modeling%2Fobert-base%2Fpytorch%2Fhuggingface%2Fwikipedia_bookcorpus%2Fpruned90-none). We will use this model as the starting point for the transfer learning process.\n","\n","\n","**Let's dive in!**"]},{"cell_type":"markdown","metadata":{"id":"Y0WybTbssU0g"},"source":["## **Installation**\n","\n","Install SparseML via `pip`.\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"collapsed":true,"id":"AkR1u2_NnXqY"},"outputs":[],"source":["!pip install sparseml[transformers]"]},{"cell_type":"markdown","metadata":{"id":"_jY0SKdXFGO3"},"source":["If you are running on Google Colab, restart the runtime after this step."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"XXj0S5Jdq2M-"},"outputs":[],"source":["import sparseml\n","from sparsezoo import Model\n","from sparseml.transformers.utils import SparseAutoModel\n","from sparseml.transformers.sparsification import Trainer, TrainingArguments\n","import numpy as np\n","from transformers import (\n"," AutoModelForTokenClassification,\n"," AutoConfig, \n"," AutoTokenizer,\n"," EvalPrediction,\n"," DataCollatorForTokenClassification,\n"," PreTrainedTokenizerFast\n",")\n","from datasets import ClassLabel, load_dataset, load_metric"]},{"cell_type":"markdown","metadata":{"id":"A6GwDnLL2Zn_"},"source":["## **Step 1: Load a Dataset**\n","\n","SparseML is integrated with Hugging Face, so we can use the `datasets` class to load datasets from the Hugging Face hub or from local files. 
\n","\n","[Conll2003 Dataset Card](https://huggingface.co/datasets/conll2003)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"CkvbT1i9p87z"},"outputs":[],"source":["# load dataset from HF hub\n","dataset = load_dataset(\"conll2003\")\n","\n","# alternatively, load from JSONL file\n","data_files = {}\n","dataset[\"train\"].to_json(\"conll2003-train.json\")\n","dataset[\"validation\"].to_json(\"conll2003-validation.json\")\n","data_files[\"train\"] = \"conll2003-train.json\"\n","data_files[\"validation\"] = \"conll2003-validation.json\"\n","dataset_from_json = load_dataset('json', data_files=data_files)"]},{"cell_type":"markdown","metadata":{"id":"IiFcAKt82qSh"},"source":["We can see the input is `tokens` which is a list of words and the labels are `ner_tags` which are a list of integers corresponding to a tag type for each word."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":725,"status":"ok","timestamp":1677766765729,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"kc8DQY2HyUWy","outputId":"366d295f-ff33-4409-cba0-1b30963b674f"},"outputs":[{"name":"stdout","output_type":"stream","text":["{\"id\":\"0\",\"tokens\":[\"EU\",\"rejects\",\"German\",\"call\",\"to\",\"boycott\",\"British\",\"lamb\",\".\"],\"pos_tags\":[22,42,16,21,35,37,16,21,7],\"chunk_tags\":[11,21,11,12,21,22,11,12,0],\"ner_tags\":[3,0,7,0,0,0,7,0,0]}\n","{\"id\":\"1\",\"tokens\":[\"Peter\",\"Blackburn\"],\"pos_tags\":[22,22],\"chunk_tags\":[11,12],\"ner_tags\":[1,2]}\n","{\"id\":\"2\",\"tokens\":[\"BRUSSELS\",\"1996-08-22\"],\"pos_tags\":[22,11],\"chunk_tags\":[11,12],\"ner_tags\":[5,0]}\n","{\"id\":\"3\",\"tokens\":[\"The\",\"European\",\"Commission\",\"said\",\"on\",\"Thursday\",\"it\",\"disagreed\",\"with\",\"German\",\"advice\",\"to\",\"consumers\",\"to\",\"shun\",\"British\",\"lamb\",\"until\",\"scientists\",\"determine\",\"whether\",\"mad\",\"cow\",\"disease\",\"can\",\"be\",\"transmitted\",\"to\",\"sheep\",\".\"],\"pos_tags\":[12,22,22,38,15,22,28,38,15,16,21,35,24,35,37,16,21,15,24,41,15,16,21,21,20,37,40,35,21,7],\"chunk_tags\":[11,12,12,21,13,11,11,21,13,11,12,13,11,21,22,11,12,17,11,21,17,11,12,12,21,22,22,13,11,0],\"ner_tags\":[0,3,4,0,0,0,0,0,0,7,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}\n","{\"id\":\"4\",\"tokens\":[\"Germany\",\"'s\",\"representative\",\"to\",\"the\",\"European\",\"Union\",\"'s\",\"veterinary\",\"committee\",\"Werner\",\"Zwingmann\",\"said\",\"on\",\"Wednesday\",\"consumers\",\"should\",\"buy\",\"sheepmeat\",\"from\",\"countries\",\"other\",\"than\",\"Britain\",\"until\",\"the\",\"scientific\",\"advice\",\"was\",\"clearer\",\".\"],\"pos_tags\":[22,27,21,35,12,22,22,27,16,21,22,22,38,15,22,24,20,37,21,15,24,16,15,22,15,12,16,21,38,17,7],\"chunk_tags\":[11,11,12,13,11,12,12,11,12,12,12,12,21,13,11,12,21,22,11,13,11,1,13,11,17,11,12,12,21,1,0],\"ner_tags\":[5,0,0,0,0,3,4,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0]}\n"]}],"source":["!head conll2003-train.json --lines=5"]},{"cell_type":"markdown","metadata":{"id":"1urGaah73OUm"},"source":["## **Step 2: Setup Evaluation Metric**\n","\n","Token classification predicts a category for every word in the input sentence. We can use the [seqeval metric](https://huggingface.co/spaces/evaluate-metric/seqeval) to evaluate the tag-level precision and recall of the pipeline. 
\n","\n","The seqeval metric needs to be passed tags rather than tag indexes, so we need to create a mapping between the indexes and the tags so that we can pass the tags to the seqeval metric.\n","\n","The Conll2003 named-entity-recognition tags map to the following classes:\n","\n","```\n","{\n"," 'O': 0, \n"," 'B-PER': 1, \n"," 'I-PER': 2, \n"," 'B-ORG': 3, \n"," 'I-ORG': 4, \n"," 'B-LOC': 5, \n"," 'I-LOC': 6, \n"," 'B-MISC': 7, \n"," 'I-MISC': 8\n","}\n","```"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7ti52fgQqdSU"},"outputs":[],"source":["# label mapping\n","LABEL_MAP = {\n"," 0: 'O', \n"," 1: 'B-PER', \n"," 2: 'I-PER', \n"," 3: 'B-ORG', \n"," 4: 'I-ORG', \n"," 5: 'B-LOC', \n"," 6: 'I-LOC', \n"," 7: 'B-MISC', \n"," 8: 'I-MISC'\n","}\n","\n","# other configs\n","INPUT_COL = \"tokens\"\n","LABEL_COL = \"ner_tags\"\n","NUM_LABELS = len(LABEL_MAP)\n","SPECIAL_TOKEN_ID = -100"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ZZUOmaW1u7C1"},"outputs":[],"source":["# load evaluation metric\n","metric = load_metric(\"seqeval\")\n","\n","# setup metrics function\n","def compute_metrics(p: EvalPrediction):\n"," predictions, labels = p\n"," predictions = np.argmax(predictions, axis=2)\n"," \n"," # Remove ignored index (special tokens) and convert indexed tags to labels\n"," true_predictions = [\n"," [LABEL_MAP[pred] for (pred, lab) in zip(prediction, label) if lab != SPECIAL_TOKEN_ID]\n"," for prediction, label in zip(predictions, labels)\n"," ]\n"," true_labels = [\n"," [LABEL_MAP[lab] for (_, lab) in zip(prediction, label) if lab != SPECIAL_TOKEN_ID]\n"," for prediction, label in zip(predictions, labels)\n"," ]\n"," \n"," # example: results = metrics.compute(predictions=[\"0\", \"B-group\", \"0\"], true_labels=[\"0\", \"B-org\", \"I-org\"])\n"," # we used the LABEL to convert the tags (which are integers) into the corresponding LABEL\n"," # seqeval should be passed the actual labels\n"," results = metric.compute(predictions=true_predictions, references=true_labels)\n"," return {\n"," \"precision\": results[\"overall_precision\"],\n"," \"recall\": results[\"overall_recall\"],\n"," \"f1\": results[\"overall_f1\"],\n"," \"accuracy\": results[\"overall_accuracy\"],\n"," }"]},{"cell_type":"markdown","metadata":{"id":"1GEhYi53HoAH"},"source":["## **Step 3: Download Files for Sparse Transfer Learning**\n","\n","First, we need to select a sparse checkpoint to begin the training process. In this case, we will fine-tune a 90% pruned version of BERT onto the Conll2003 NER dataset. This model is available in SparseZoo, identified by the following stub:\n","```\n","zoo:nlp/masked_language_modeling/obert-base/pytorch/huggingface/wikipedia_bookcorpus/pruned90-none\n","```\n","\n","Next, we need to create a sparsification recipe for usage in the training process. Recipes are YAML files that encode the sparsity related algorithms and parameters to be applied by SparseML. For Sparse Transfer Learning, we need to use a recipe that instructs SparseML to maintain sparsity during the training process and to apply quantization over the final few epochs. \n","\n","In the case of Conll2003, there is a transfer learning recipe available in the SparseZoo, identified by the following stub:\n","```\n","zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/pruned90_quant-none\n","```\n","\n","Finally, SparseML has the optional ability to apply model distillation from a teacher model during the transfer learning process to boost accuracy. 
In this case, we will use a dense version of BERT trained on the Conll2003 dataset which is hosted in SparseZoo. This model is identified by the following stub:\n","\n","```\n","zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/base-none\n","```"]},{"cell_type":"markdown","metadata":{"id":"U_iyuuB4Wq7N"},"source":["Use the `sparsezoo` python client to download the models and recipe using their SparseZoo stubs."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Ykg8fEN2Q5o_"},"outputs":[],"source":["# downloads 90% pruned upstream BERT trained on MLM objective (pruned90)\n","model_stub = \"zoo:nlp/masked_language_modeling/obert-base/pytorch/huggingface/wikipedia_bookcorpus/pruned90-none\" \n","model_path = Model(model_stub, download_path=\"./model\").training.path\n","\n","# downloads dense BERT trained on CONLL2003 (base_none)\n","teacher_stub = \"zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/base-none\"\n","teacher_path = Model(teacher_stub, download_path=\"./teacher\").training.path\n","\n","# download pruned quantized transfer recipe for CONLL2003 (pruned90_quant)\n","transfer_stub = \"zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/pruned90_quant-none\"\n","recipe_path = Model(transfer_stub, download_path=\"./transfer_recipe\").recipes.default.path"]},{"cell_type":"markdown","metadata":{"id":"RLe8iEWxV_zz"},"source":["We can see that the upstream model (trained on Wikipedia BookCorpus) and configuration files have been downloaded to the local directory."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":25,"status":"ok","timestamp":1677766834654,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"0NTVj1kPRSCW","outputId":"cfdf5ff4-9b8a-4f4d-a1b0-d9a9fa1dec19"},"outputs":[{"name":"stdout","output_type":"stream","text":["all_results.json special_tokens_map.json trainer_state.json vocab.txt\n","config.json tokenizer_config.json training_args.bin\n","pytorch_model.bin tokenizer.json train_results.json\n"]}],"source":["%ls ./model/training"]},{"cell_type":"markdown","metadata":{"id":"orjvrvdCWEUi"},"source":["We can see that a transfer learning recipe has been downloaded. 
The `ConstantPruningModifier` instructs SparseML to maintain the sparsity structure of the network as the model trains and the `QuantizationModifier` instructs SparseML to run Quantization Aware Training at the end of training."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"eUYg-7eBRT5f"},"outputs":[],"source":["%cat ./transfer_recipe/recipe/recipe_original.md"]},{"attachments":{},"cell_type":"markdown","metadata":{"id":"0824QZuAqdVY"},"source":["#### **Inspecting the Recipe**\n","\n","Here is the transfer learning recipe:\n","\n","```yaml\n","version: 1.1.0\n","\n","# General Variables\n","num_epochs: 13\n","init_lr: 1.5e-4 \n","final_lr: 0\n","\n","qat_start_epoch: 8.0\n","observer_epoch: 12.0\n","quantize_embeddings: 1\n","\n","distill_hardness: 1.0\n","distill_temperature: 2.0\n","\n","# Modifiers:\n","\n","training_modifiers:\n"," - !EpochRangeModifier\n"," end_epoch: eval(num_epochs)\n"," start_epoch: 0.0\n","\n"," - !LearningRateFunctionModifier\n"," start_epoch: 0\n"," end_epoch: eval(num_epochs)\n"," lr_func: linear\n"," init_lr: eval(init_lr)\n"," final_lr: eval(final_lr)\n"," \n","quantization_modifiers:\n"," - !QuantizationModifier\n"," start_epoch: eval(qat_start_epoch)\n"," disable_quantization_observer_epoch: eval(observer_epoch)\n"," freeze_bn_stats_epoch: eval(observer_epoch)\n"," quantize_embeddings: eval(quantize_embeddings)\n"," quantize_linear_activations: 0\n"," exclude_module_types: ['LayerNorm']\n"," submodules:\n"," - bert.embeddings\n"," - bert.encoder\n"," - classifier\n","\n","distillation_modifiers:\n"," - !DistillationModifier\n"," hardness: eval(distill_hardness)\n"," temperature: eval(distill_temperature)\n"," distill_output_keys: [logits]\n","\n","constant_modifiers:\n"," - !ConstantPruningModifier\n"," start_epoch: 0.0\n"," params: __ALL_PRUNABLE__\n","```\n","\n","\n","The `Modifiers` in the transfer learning recipe are the important items that encode how SparseML should modify the training process for Sparse Transfer Learning:\n","- `ConstantPruningModifier` tells SparseML to pin weights at 0 over all epochs, maintaining the sparsity structure of the network\n","- `QuantizationModifier` tells SparseML to quanitze the weights with quantization aware training over the last 5 epochs\n","- `DistillationModifier` tells SparseML how to apply distillation during the trainign process, targeting the logits\n","\n","Below, SparseML's `Trainer` will parses the modifiers and updates the training process to implement the algorithms specified here."]},{"attachments":{},"cell_type":"markdown","metadata":{"id":"FStnDScEKoMX"},"source":["## **Step 4: Setup Hugging Face Model Objects**\n","\n","Next, we will set up the Hugging Face `tokenizer`, `config`, and `model`. \n","\n","These are all native Hugging Face objects, so check out the Hugging Face docs for more details on `AutoModel`, `AutoConfig`, and `AutoTokenizer` as needed. 
\n","\n","We instantiate these classes by passing the local path to the directory containing the `pytorch_model.bin`, `tokenizer.json`, and `config.json` files from the SparseZoo download."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"dhN1oGcTQ9RE"},"outputs":[],"source":["# shared tokenizer between teacher and student\n","tokenizer = AutoTokenizer.from_pretrained(model_path)\n","assert(isinstance(tokenizer, PreTrainedTokenizerFast))\n","\n","# setup configs\n","model_config = AutoConfig.from_pretrained(model_path, num_labels=NUM_LABELS)\n","teacher_config = AutoConfig.from_pretrained(teacher_path, num_labels=NUM_LABELS)\n","\n","# initialize model using familiar HF AutoModel\n","model_kwargs = {\"config\": model_config}\n","model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n","model = AutoModelForTokenClassification.from_pretrained(model_path, **model_kwargs,)\n","model.config.id2label = LABEL_MAP\n","\n","# initialize teacher using familiar HF AutoModel\n","teacher_kwargs = {\"config\": teacher_config}\n","teacher_kwargs[\"state_dict\"], t_delayed = SparseAutoModel._loadable_state_dict(teacher_path)\n","teacher = AutoModelForTokenClassification.from_pretrained(teacher_path, **teacher_kwargs,)\n","\n","# optional - prints metrics about sparsity profiles of the models\n","SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed) # prints metrics on sparsity profile\n","SparseAutoModel.log_model_load(teacher, teacher_path, \"teacher\", t_delayed) # prints metrics on sparsity profile"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":336,"status":"ok","timestamp":1677767281068,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"xogGex9gsZ-8","outputId":"cb5fb0b2-60fb-40cf-b886-ca439ad08252"},"outputs":[{"name":"stdout","output_type":"stream","text":["{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n","{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n","{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}\n","{'B-LOC': 5, 'B-MISC': 7, 'B-ORG': 3, 'B-PER': 1, 'I-LOC': 6, 'I-MISC': 8, 'I-ORG': 4, 'I-PER': 2, 'O': 0}\n"]}],"source":["model.config.id2label = LABEL_MAP\n","model.config.label2id = {LABEL_MAP[id]: id for id in LABEL_MAP.keys()}\n","\n","print(model.config.id2label)\n","print(teacher.config.id2label)\n","\n","print(model.config.label2id)\n","print(teacher.config.label2id)"]},{"cell_type":"markdown","metadata":{"id":"K1JSDkCdMghS"},"source":["## **Step 5: Tokenize Dataset**\n","\n","Run the tokenizer on the dataset. \n","\n","In this function, we handle the case where an individual word is tokenized into multiple tokens. In particular, we set the `label_id = SPECIAL_TOKEN_ID` for each token besides the first token in a word. 
\n","\n","When evaluating the accuracy with `compute_metrics` (defined above), we filter out tokens with `SPECIAL_TOKEN_ID`, such that each word counts only once in the precision and recall calculations."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2EUuFSTzRAvp"},"outputs":[],"source":["MAX_LEN = 128\n","\n","def preprocess_fn(examples):\n"," tokenized_inputs = tokenizer(\n"," examples[INPUT_COL], \n"," padding=\"max_length\", \n"," max_length=min(tokenizer.model_max_length, MAX_LEN), \n"," truncation=True,\n"," is_split_into_words=True # the texts in our dataset are lists of words (with a label for each word)\n"," )\n"," \n"," labels = []\n"," for i, label in enumerate(examples[LABEL_COL]):\n"," word_ids = tokenized_inputs.word_ids(batch_index=i)\n"," previous_word_idx = None\n"," label_ids = []\n"," for word_idx in word_ids:\n"," # Special tokens have a word id that is None. We set the label to SPECIAL_TOKEN_ID\n"," # so they are automatically ignored in the loss function.\n"," if word_idx is None:\n"," label_ids.append(SPECIAL_TOKEN_ID)\n","\n"," # We set the label for the first token of each word.\n"," elif word_idx != previous_word_idx:\n"," label_ids.append(label[word_idx])\n","\n"," # We will not label the other tokens of a word, so set to SPECIAL_TOKEN_ID\n"," else:\n"," label_ids.append(SPECIAL_TOKEN_ID)\n"," previous_word_idx = word_idx\n","\n"," labels.append(label_ids)\n","\n"," tokenized_inputs[\"labels\"] = labels\n"," return tokenized_inputs\n","\n","# tokenize the dataset\n","tokenized_dataset = dataset_from_json.map(\n"," preprocess_fn,\n"," batched=True,\n"," desc=\"Running tokenizer on dataset\"\n",")"]},{"cell_type":"markdown","metadata":{"id":"19mnPsKHN_y1"},"source":["## **Step 6: Run Training**\n","\n","SparseML has a custom `Trainer` class that inherits from the [Hugging Face `Trainer` Class](https://huggingface.co/docs/transformers/main_classes/trainer). As such, the SparseML `Trainer` has all of the existing functionality of the HF trainer. However, in addition, we can supply a `recipe` and (optionally) a `teacher`. \n","\n","\n","As we saw above, the `recipe` encodes the sparsity related algorithms and hyperparameters of the training process in a YAML file. 
The SparseML `Trainer` parses the `recipe` and adjusts the training workflow to apply the algorithms in the recipe.\n","\n","The `teacher` is an optional argument that instructs SparseML to apply model distillation to support the training process."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"34IXj1n6RCgQ"},"outputs":[],"source":["# run with subset of dataset so we can complete in 15 minutes\n","MAX_SAMPLES = 2000\n","if MAX_SAMPLES is not None:\n"," train_dataset = tokenized_dataset[\"train\"].select(range(MAX_SAMPLES))\n","else:\n"," train_dataset = tokenized_dataset[\"train\"]\n","eval_dataset = tokenized_dataset[\"validation\"]\n","\n","# setup trainer arguments\n","training_args = TrainingArguments(\n"," output_dir=\"./training_output\",\n"," do_train=True,\n"," do_eval=True,\n"," resume_from_checkpoint=False,\n"," evaluation_strategy=\"epoch\",\n"," save_strategy=\"epoch\",\n"," logging_strategy=\"epoch\",\n"," save_total_limit=1,\n"," per_device_train_batch_size=32,\n"," per_device_eval_batch_size=32,\n"," fp16=True)\n","\n","# initialize trainer\n","trainer = Trainer(\n"," model=model,\n"," model_state_path=model_path,\n"," recipe=recipe_path,\n"," teacher=teacher,\n"," metadata_args=[\"per_device_train_batch_size\",\"per_device_eval_batch_size\",\"fp16\"],\n"," args=training_args,\n"," train_dataset=train_dataset,\n"," eval_dataset=eval_dataset,\n"," tokenizer=tokenizer,\n"," data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None),\n"," compute_metrics=compute_metrics)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"LLBgAdqyRDro"},"outputs":[],"source":["%rm -rf training_output\n","train_result = trainer.train(resume_from_checkpoint=False)\n","trainer.save_model()\n","trainer.save_state()\n","trainer.save_optimizer_and_scheduler(training_args.output_dir)"]},{"cell_type":"markdown","metadata":{"id":"2vgxbUDKqdVZ"},"source":["## **Step 7: Export To ONNX**\n","\n","Run the following to export the model to ONNX. The script creates a `deployment` folder containing ONNX file and the necessary configuration files (e.g. 
`tokenizer.json`) for deployment with DeepSparse."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-rhWjiHBeR7M"},"outputs":[],"source":["!sparseml.transformers.export_onnx \\\n"," --model_path training_output \\\n"," --task token_classification"]},{"cell_type":"markdown","metadata":{"id":"a72xHJ5594C4"},"source":["## **Deploy with DeepSparse**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-XubpXohO_8A"},"outputs":[],"source":["%pip install deepsparse"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"m_USM8mCPETg"},"outputs":[],"source":["from deepsparse import Pipeline\n","\n","pipeline = Pipeline.create(\"token_classification\", model_path=\"./deployment\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":22,"status":"ok","timestamp":1677768836760,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"Bncg7Xx5ONqB","outputId":"b753f175-289c-4fd1-bc4f-468060349f0b"},"outputs":[{"name":"stdout","output_type":"stream","text":["[[TokenClassificationResult(entity='B-LOC', score=0.9966669082641602, index=1, word='japan', start=0, end=5, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.7956981062889099, index=8, word='world', start=23, end=28, is_grouped=False),\n"," TokenClassificationResult(entity='I-MISC', score=0.9346566796302795, index=9, word='cup', start=29, end=32, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.4572566747665405, index=19, word='fifa', start=73, end=77, is_grouped=False)]]\n"]}],"source":["from pprint import pprint\n","prediction = pipeline(\"Japan, co-hosts of the World Cup in 2002 and ranked 20th in the world by FIFA, are favourites to regain their title here.\")\n","pprint(prediction.predictions)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1677768836760,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"4zqbsVpoSZ-R","outputId":"0032570b-6637-47b8-ad19-b25d834a944b"},"outputs":[{"name":"stdout","output_type":"stream","text":["[[TokenClassificationResult(entity='B-LOC', score=0.9878184795379639, index=1, word='china', start=0, end=5, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.7045027613639832, index=18, word='u', start=93, end=94, is_grouped=False),\n"," TokenClassificationResult(entity='I-LOC', score=0.31070953607559204, index=19, word='##zbek', start=94, end=98, is_grouped=False),\n"," TokenClassificationResult(entity='B-PER', score=0.9934289455413818, index=21, word='igor', start=107, end=111, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9966109395027161, index=22, word='sh', start=112, end=114, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9972546696662903, index=23, word='##k', start=114, end=115, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9971543550491333, index=24, word='##vy', start=115, end=117, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9964032173156738, index=25, word='##rin', start=117, end=120, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.8585354089736938, index=44, word='chinese', start=205, end=212, is_grouped=False)]]\n"]}],"source":["prediction = pipeline(\"China controlled most of the match and saw several 
chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net.\")\n","pprint(prediction.predictions)"]}],"metadata":{"accelerator":"GPU","colab":{"provenance":[{"file_id":"1NzTgvXgE5e17JdD1BKXJu-ABvE2-5gKj","timestamp":1677770924764},{"file_id":"1nCs9zm2goooiw0gfU6S4ACRiBiPCxst9","timestamp":1677449350236},{"file_id":"1cXfeYQ_ZbnJRoQsaYOIDR2N7YP--mMiL","timestamp":1677358343826},{"file_id":"1Zawa0sifXr2wIl9tbF7ySJ7xYY0dtTzI","timestamp":1677345946788}]},"gpuClass":"standard","kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.6"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} +{"cells":[{"cell_type":"markdown","metadata":{"id":"kSNEB-3orJ9C"},"source":["# **Token Classification: Sparse Transfer Learning with the Python API**\n","\n","In this example, you will fine-tune a 90% pruned BERT model onto the Conll2003 NER dataset using SparseML's Hugging Face Integration.\n","\n","### **Sparse Transfer Learning Overview**\n","\n","Sparse Transfer Learning is very similiar to typical fine-tuning you are used to when training models. However, with Sparse Transfer Learning, we start the training process from a pre-sparsified checkpoint and maintain the sparsity structure while the fine tuning occurs. \n","\n","At the end, you will have a sparse model trained on your dataset, ready to be deployed with DeepSparse for GPU-class performance on CPUs!\n","\n","### **Pre-Sparsified BERT**\n","SparseZoo, Neural Magic's open source repository of pre-sparsified models, contains a 90% pruned version of BERT, which has been sparsified on the upstream Wikipedia and BookCorpus datasets with the\n","masked language modeling objective. [Check out the model card](https://sparsezoo.neuralmagic.com/models/nlp%2Fmasked_language_modeling%2Fobert-base%2Fpytorch%2Fhuggingface%2Fwikipedia_bookcorpus%2Fpruned90-none). 
We will use this model as the starting point for the transfer learning process.\n","\n","\n","**Let's dive in!**"]},{"cell_type":"markdown","metadata":{"id":"Y0WybTbssU0g"},"source":["## **Installation**\n","\n","Install SparseML via `pip`.\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"collapsed":true,"id":"AkR1u2_NnXqY"},"outputs":[],"source":["!pip install sparseml[transformers]"]},{"cell_type":"markdown","metadata":{"id":"_jY0SKdXFGO3"},"source":["If you are running on Google Colab, restart the runtime after this step."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"XXj0S5Jdq2M-"},"outputs":[],"source":["import sparseml\n","from sparsezoo import Model\n","from sparseml.transformers.utils import SparseAutoModel\n","from sparseml.transformers.sparsification import Trainer, TrainingArguments\n","import numpy as np\n","from transformers import (\n"," AutoModelForTokenClassification,\n"," AutoConfig, \n"," AutoTokenizer,\n"," EvalPrediction,\n"," DataCollatorForTokenClassification,\n"," PreTrainedTokenizerFast\n",")\n","from datasets import ClassLabel, load_dataset, load_metric"]},{"cell_type":"markdown","metadata":{"id":"A6GwDnLL2Zn_"},"source":["## **Step 1: Load a Dataset**\n","\n","SparseML is integrated with Hugging Face, so we can use the `datasets` class to load datasets from the Hugging Face hub or from local files. \n","\n","[Conll2003 Dataset Card](https://huggingface.co/datasets/conll2003)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"CkvbT1i9p87z"},"outputs":[],"source":["# load dataset from HF hub\n","dataset = load_dataset(\"conll2003\")\n","\n","# alternatively, load from JSONL file\n","data_files = {}\n","dataset[\"train\"].to_json(\"conll2003-train.json\")\n","dataset[\"validation\"].to_json(\"conll2003-validation.json\")\n","data_files[\"train\"] = \"conll2003-train.json\"\n","data_files[\"validation\"] = \"conll2003-validation.json\"\n","dataset_from_json = load_dataset('json', data_files=data_files)"]},{"cell_type":"markdown","metadata":{"id":"IiFcAKt82qSh"},"source":["We can see the input is `tokens` which is a list of words and the labels are `ner_tags` which are a list of integers corresponding to a tag type for each word."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":725,"status":"ok","timestamp":1677766765729,"user":{"displayName":"Robert 
Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"kc8DQY2HyUWy","outputId":"366d295f-ff33-4409-cba0-1b30963b674f"},"outputs":[{"name":"stdout","output_type":"stream","text":["{\"id\":\"0\",\"tokens\":[\"EU\",\"rejects\",\"German\",\"call\",\"to\",\"boycott\",\"British\",\"lamb\",\".\"],\"pos_tags\":[22,42,16,21,35,37,16,21,7],\"chunk_tags\":[11,21,11,12,21,22,11,12,0],\"ner_tags\":[3,0,7,0,0,0,7,0,0]}\n","{\"id\":\"1\",\"tokens\":[\"Peter\",\"Blackburn\"],\"pos_tags\":[22,22],\"chunk_tags\":[11,12],\"ner_tags\":[1,2]}\n","{\"id\":\"2\",\"tokens\":[\"BRUSSELS\",\"1996-08-22\"],\"pos_tags\":[22,11],\"chunk_tags\":[11,12],\"ner_tags\":[5,0]}\n","{\"id\":\"3\",\"tokens\":[\"The\",\"European\",\"Commission\",\"said\",\"on\",\"Thursday\",\"it\",\"disagreed\",\"with\",\"German\",\"advice\",\"to\",\"consumers\",\"to\",\"shun\",\"British\",\"lamb\",\"until\",\"scientists\",\"determine\",\"whether\",\"mad\",\"cow\",\"disease\",\"can\",\"be\",\"transmitted\",\"to\",\"sheep\",\".\"],\"pos_tags\":[12,22,22,38,15,22,28,38,15,16,21,35,24,35,37,16,21,15,24,41,15,16,21,21,20,37,40,35,21,7],\"chunk_tags\":[11,12,12,21,13,11,11,21,13,11,12,13,11,21,22,11,12,17,11,21,17,11,12,12,21,22,22,13,11,0],\"ner_tags\":[0,3,4,0,0,0,0,0,0,7,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}\n","{\"id\":\"4\",\"tokens\":[\"Germany\",\"'s\",\"representative\",\"to\",\"the\",\"European\",\"Union\",\"'s\",\"veterinary\",\"committee\",\"Werner\",\"Zwingmann\",\"said\",\"on\",\"Wednesday\",\"consumers\",\"should\",\"buy\",\"sheepmeat\",\"from\",\"countries\",\"other\",\"than\",\"Britain\",\"until\",\"the\",\"scientific\",\"advice\",\"was\",\"clearer\",\".\"],\"pos_tags\":[22,27,21,35,12,22,22,27,16,21,22,22,38,15,22,24,20,37,21,15,24,16,15,22,15,12,16,21,38,17,7],\"chunk_tags\":[11,11,12,13,11,12,12,11,12,12,12,12,21,13,11,12,21,22,11,13,11,1,13,11,17,11,12,12,21,1,0],\"ner_tags\":[5,0,0,0,0,3,4,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0]}\n"]}],"source":["!head conll2003-train.json --lines=5"]},{"cell_type":"markdown","metadata":{"id":"1urGaah73OUm"},"source":["## **Step 2: Setup Evaluation Metric**\n","\n","Token classification predicts a category for every word in the input sentence. We can use the [seqeval metric](https://huggingface.co/spaces/evaluate-metric/seqeval) to evaluate the tag-level precision and recall of the pipeline. 
\n","\n","The seqeval metric needs to be passed tags rather than tag indexes, so we need to create a mapping between the indexes and the tags so that we can pass the tags to the seqeval metric.\n","\n","The Conll2003 named-entity-recognition tags map to the following classes:\n","\n","```\n","{\n"," 'O': 0, \n"," 'B-PER': 1, \n"," 'I-PER': 2, \n"," 'B-ORG': 3, \n"," 'I-ORG': 4, \n"," 'B-LOC': 5, \n"," 'I-LOC': 6, \n"," 'B-MISC': 7, \n"," 'I-MISC': 8\n","}\n","```"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7ti52fgQqdSU"},"outputs":[],"source":["# label mapping\n","LABEL_MAP = {\n"," 0: 'O', \n"," 1: 'B-PER', \n"," 2: 'I-PER', \n"," 3: 'B-ORG', \n"," 4: 'I-ORG', \n"," 5: 'B-LOC', \n"," 6: 'I-LOC', \n"," 7: 'B-MISC', \n"," 8: 'I-MISC'\n","}\n","\n","# other configs\n","INPUT_COL = \"tokens\"\n","LABEL_COL = \"ner_tags\"\n","NUM_LABELS = len(LABEL_MAP)\n","SPECIAL_TOKEN_ID = -100"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ZZUOmaW1u7C1"},"outputs":[],"source":["# load evaluation metric\n","metric = load_metric(\"seqeval\")\n","\n","# setup metrics function\n","def compute_metrics(p: EvalPrediction):\n"," predictions, labels = p\n"," predictions = np.argmax(predictions, axis=2)\n"," \n"," # Remove ignored index (special tokens) and convert indexed tags to labels\n"," true_predictions = [\n"," [LABEL_MAP[pred] for (pred, lab) in zip(prediction, label) if lab != SPECIAL_TOKEN_ID]\n"," for prediction, label in zip(predictions, labels)\n"," ]\n"," true_labels = [\n"," [LABEL_MAP[lab] for (_, lab) in zip(prediction, label) if lab != SPECIAL_TOKEN_ID]\n"," for prediction, label in zip(predictions, labels)\n"," ]\n"," \n"," # example: results = metrics.compute(predictions=[\"0\", \"B-group\", \"0\"], true_labels=[\"0\", \"B-org\", \"I-org\"])\n"," # we used the LABEL to convert the tags (which are integers) into the corresponding LABEL\n"," # seqeval should be passed the actual labels\n"," results = metric.compute(predictions=true_predictions, references=true_labels)\n"," return {\n"," \"precision\": results[\"overall_precision\"],\n"," \"recall\": results[\"overall_recall\"],\n"," \"f1\": results[\"overall_f1\"],\n"," \"accuracy\": results[\"overall_accuracy\"],\n"," }"]},{"cell_type":"markdown","metadata":{"id":"1GEhYi53HoAH"},"source":["## **Step 3: Download Files for Sparse Transfer Learning**\n","\n","First, we need to select a sparse checkpoint to begin the training process. In this case, we will fine-tune a 90% pruned version of BERT onto the Conll2003 NER dataset. This model is available in SparseZoo, identified by the following stub:\n","```\n","zoo:nlp/masked_language_modeling/obert-base/pytorch/huggingface/wikipedia_bookcorpus/pruned90-none\n","```\n","\n","Next, we need to create a sparsification recipe for usage in the training process. Recipes are YAML files that encode the sparsity related algorithms and parameters to be applied by SparseML. For Sparse Transfer Learning, we need to use a recipe that instructs SparseML to maintain sparsity during the training process and to apply quantization over the final few epochs. \n","\n","In the case of Conll2003, there is a transfer learning recipe available in the SparseZoo, identified by the following stub:\n","```\n","zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/pruned90_quant-none\n","```\n","\n","Finally, SparseML has the optional ability to apply model distillation from a teacher model during the transfer learning process to boost accuracy. 
In this case, we will use a dense version of BERT trained on the Conll2003 dataset which is hosted in SparseZoo. This model is identified by the following stub:\n","\n","```\n","zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/base-none\n","```"]},{"cell_type":"markdown","metadata":{"id":"U_iyuuB4Wq7N"},"source":["Use the `sparsezoo` python client to download the models and recipe using their SparseZoo stubs."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Ykg8fEN2Q5o_"},"outputs":[],"source":["# downloads 90% pruned upstream BERT trained on MLM objective (pruned90)\n","model_stub = \"zoo:nlp/masked_language_modeling/obert-base/pytorch/huggingface/wikipedia_bookcorpus/pruned90-none\" \n","model_path = Model(model_stub, download_path=\"./model\").training.path\n","\n","# downloads dense BERT trained on CONLL2003 (base_none)\n","teacher_stub = \"zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/base-none\"\n","teacher_path = Model(teacher_stub, download_path=\"./teacher\").training.path\n","\n","# download pruned quantized transfer recipe for CONLL2003 (pruned90_quant)\n","transfer_stub = \"zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/pruned90_quant-none\"\n","recipe_path = Model(transfer_stub, download_path=\"./transfer_recipe\").recipes.default.path"]},{"cell_type":"markdown","metadata":{"id":"RLe8iEWxV_zz"},"source":["We can see that the upstream model (trained on Wikipedia BookCorpus) and configuration files have been downloaded to the local directory."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":25,"status":"ok","timestamp":1677766834654,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"0NTVj1kPRSCW","outputId":"cfdf5ff4-9b8a-4f4d-a1b0-d9a9fa1dec19"},"outputs":[{"name":"stdout","output_type":"stream","text":["all_results.json special_tokens_map.json trainer_state.json vocab.txt\n","config.json tokenizer_config.json training_args.bin\n","pytorch_model.bin tokenizer.json train_results.json\n"]}],"source":["%ls ./model/training"]},{"cell_type":"markdown","metadata":{"id":"orjvrvdCWEUi"},"source":["We can see that a transfer learning recipe has been downloaded. 
The `ConstantPruningModifier` instructs SparseML to maintain the sparsity structure of the network as the model trains and the `QuantizationModifier` instructs SparseML to run Quantization Aware Training at the end of training."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"eUYg-7eBRT5f"},"outputs":[],"source":["%cat ./transfer_recipe/recipe/recipe_original.md"]},{"attachments":{},"cell_type":"markdown","metadata":{"id":"0824QZuAqdVY"},"source":["#### **Inspecting the Recipe**\n","\n","Here is the transfer learning recipe:\n","\n","```yaml\n","version: 1.1.0\n","\n","# General Variables\n","num_epochs: 13\n","init_lr: 1.5e-4 \n","final_lr: 0\n","\n","qat_start_epoch: 8.0\n","observer_epoch: 12.0\n","quantize_embeddings: 1\n","\n","distill_hardness: 1.0\n","distill_temperature: 2.0\n","\n","# Modifiers:\n","\n","training_modifiers:\n"," - !EpochRangeModifier\n"," end_epoch: eval(num_epochs)\n"," start_epoch: 0.0\n","\n"," - !LearningRateFunctionModifier\n"," start_epoch: 0\n"," end_epoch: eval(num_epochs)\n"," lr_func: linear\n"," init_lr: eval(init_lr)\n"," final_lr: eval(final_lr)\n"," \n","quantization_modifiers:\n"," - !QuantizationModifier\n"," start_epoch: eval(qat_start_epoch)\n"," disable_quantization_observer_epoch: eval(observer_epoch)\n"," freeze_bn_stats_epoch: eval(observer_epoch)\n"," quantize_embeddings: eval(quantize_embeddings)\n"," quantize_linear_activations: 0\n"," exclude_module_types: ['LayerNorm']\n"," submodules:\n"," - bert.embeddings\n"," - bert.encoder\n"," - classifier\n","\n","distillation_modifiers:\n"," - !DistillationModifier\n"," hardness: eval(distill_hardness)\n"," temperature: eval(distill_temperature)\n"," distill_output_keys: [logits]\n","\n","constant_modifiers:\n"," - !ConstantPruningModifier\n"," start_epoch: 0.0\n"," params: __ALL_PRUNABLE__\n","```\n","\n","\n","The `Modifiers` in the transfer learning recipe are the important items that encode how SparseML should modify the training process for Sparse Transfer Learning:\n","- `ConstantPruningModifier` tells SparseML to pin weights at 0 over all epochs, maintaining the sparsity structure of the network\n","- `QuantizationModifier` tells SparseML to quanitze the weights with quantization aware training over the last 5 epochs\n","- `DistillationModifier` tells SparseML how to apply distillation during the trainign process, targeting the logits\n","\n","Below, SparseML's `Trainer` will parses the modifiers and updates the training process to implement the algorithms specified here."]},{"attachments":{},"cell_type":"markdown","metadata":{"id":"FStnDScEKoMX"},"source":["## **Step 4: Setup Hugging Face Model Objects**\n","\n","Next, we will set up the Hugging Face `tokenizer`, `config`, and `model`. \n","\n","These are all native Hugging Face objects, so check out the Hugging Face docs for more details on `AutoModel`, `AutoConfig`, and `AutoTokenizer` as needed. 
\n","\n","We instantiate these classes by passing the local path to the directory containing the `pytorch_model.bin`, `tokenizer.json`, and `config.json` files from the SparseZoo download."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"dhN1oGcTQ9RE"},"outputs":[],"source":["# shared tokenizer between teacher and student\n","tokenizer = AutoTokenizer.from_pretrained(model_path)\n","assert(isinstance(tokenizer, PreTrainedTokenizerFast))\n","\n","# setup configs\n","model_config = AutoConfig.from_pretrained(model_path, num_labels=NUM_LABELS)\n","teacher_config = AutoConfig.from_pretrained(teacher_path, num_labels=NUM_LABELS)\n","\n","# initialize model using familiar HF AutoModel\n","model_kwargs = {\"config\": model_config}\n","model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n","model = AutoModelForTokenClassification.from_pretrained(model_path, **model_kwargs,)\n","model.config.id2label = LABEL_MAP\n","\n","# initialize teacher using familiar HF AutoModel\n","teacher_kwargs = {\"config\": teacher_config}\n","teacher_kwargs[\"state_dict\"], t_delayed = SparseAutoModel._loadable_state_dict(teacher_path)\n","teacher = AutoModelForTokenClassification.from_pretrained(teacher_path, **teacher_kwargs,)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":336,"status":"ok","timestamp":1677767281068,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"xogGex9gsZ-8","outputId":"cb5fb0b2-60fb-40cf-b886-ca439ad08252"},"outputs":[{"name":"stdout","output_type":"stream","text":["{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n","{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n","{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}\n","{'B-LOC': 5, 'B-MISC': 7, 'B-ORG': 3, 'B-PER': 1, 'I-LOC': 6, 'I-MISC': 8, 'I-ORG': 4, 'I-PER': 2, 'O': 0}\n"]}],"source":["model.config.id2label = LABEL_MAP\n","model.config.label2id = {LABEL_MAP[id]: id for id in LABEL_MAP.keys()}\n","\n","print(model.config.id2label)\n","print(teacher.config.id2label)\n","\n","print(model.config.label2id)\n","print(teacher.config.label2id)"]},{"cell_type":"markdown","metadata":{"id":"K1JSDkCdMghS"},"source":["## **Step 5: Tokenize Dataset**\n","\n","Run the tokenizer on the dataset. \n","\n","In this function, we handle the case where an individual word is tokenized into multiple tokens. In particular, we set the `label_id = SPECIAL_TOKEN_ID` for each token besides the first token in a word. 
\n","\n","When evaluating the accuracy with `compute_metrics` (defined above), we filter out tokens with `SPECIAL_TOKEN_ID`, such that each word counts only once in the precision and recall calculations."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2EUuFSTzRAvp"},"outputs":[],"source":["MAX_LEN = 128\n","\n","def preprocess_fn(examples):\n"," tokenized_inputs = tokenizer(\n"," examples[INPUT_COL], \n"," padding=\"max_length\", \n"," max_length=min(tokenizer.model_max_length, MAX_LEN), \n"," truncation=True,\n"," is_split_into_words=True # the texts in our dataset are lists of words (with a label for each word)\n"," )\n"," \n"," labels = []\n"," for i, label in enumerate(examples[LABEL_COL]):\n"," word_ids = tokenized_inputs.word_ids(batch_index=i)\n"," previous_word_idx = None\n"," label_ids = []\n"," for word_idx in word_ids:\n"," # Special tokens have a word id that is None. We set the label to SPECIAL_TOKEN_ID\n"," # so they are automatically ignored in the loss function.\n"," if word_idx is None:\n"," label_ids.append(SPECIAL_TOKEN_ID)\n","\n"," # We set the label for the first token of each word.\n"," elif word_idx != previous_word_idx:\n"," label_ids.append(label[word_idx])\n","\n"," # We will not label the other tokens of a word, so set to SPECIAL_TOKEN_ID\n"," else:\n"," label_ids.append(SPECIAL_TOKEN_ID)\n"," previous_word_idx = word_idx\n","\n"," labels.append(label_ids)\n","\n"," tokenized_inputs[\"labels\"] = labels\n"," return tokenized_inputs\n","\n","# tokenize the dataset\n","tokenized_dataset = dataset_from_json.map(\n"," preprocess_fn,\n"," batched=True,\n"," desc=\"Running tokenizer on dataset\"\n",")"]},{"cell_type":"markdown","metadata":{"id":"19mnPsKHN_y1"},"source":["## **Step 6: Run Training**\n","\n","SparseML has a custom `Trainer` class that inherits from the [Hugging Face `Trainer` Class](https://huggingface.co/docs/transformers/main_classes/trainer). As such, the SparseML `Trainer` has all of the existing functionality of the HF trainer. However, in addition, we can supply a `recipe` and (optionally) a `teacher`. \n","\n","\n","As we saw above, the `recipe` encodes the sparsity related algorithms and hyperparameters of the training process in a YAML file. 
The SparseML `Trainer` parses the `recipe` and adjusts the training workflow to apply the algorithms in the recipe.\n","\n","The `teacher` is an optional argument that instructs SparseML to apply model distillation to support the training process."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"34IXj1n6RCgQ"},"outputs":[],"source":["# run with subset of dataset so we can complete in 15 minutes\n","MAX_SAMPLES = 2000\n","if MAX_SAMPLES is not None:\n"," train_dataset = tokenized_dataset[\"train\"].select(range(MAX_SAMPLES))\n","else:\n"," train_dataset = tokenized_dataset[\"train\"]\n","eval_dataset = tokenized_dataset[\"validation\"]\n","\n","# setup trainer arguments\n","training_args = TrainingArguments(\n"," output_dir=\"./training_output\",\n"," do_train=True,\n"," do_eval=True,\n"," resume_from_checkpoint=False,\n"," evaluation_strategy=\"epoch\",\n"," save_strategy=\"epoch\",\n"," logging_strategy=\"epoch\",\n"," save_total_limit=1,\n"," per_device_train_batch_size=32,\n"," per_device_eval_batch_size=32,\n"," fp16=True)\n","\n","# initialize trainer\n","trainer = Trainer(\n"," model=model,\n"," model_state_path=model_path,\n"," recipe=recipe_path,\n"," teacher=teacher,\n"," metadata_args=[\"per_device_train_batch_size\",\"per_device_eval_batch_size\",\"fp16\"],\n"," args=training_args,\n"," train_dataset=train_dataset,\n"," eval_dataset=eval_dataset,\n"," tokenizer=tokenizer,\n"," data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None),\n"," compute_metrics=compute_metrics)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"LLBgAdqyRDro"},"outputs":[],"source":["%rm -rf training_output\n","train_result = trainer.train(resume_from_checkpoint=False)\n","trainer.save_model()\n","trainer.save_state()\n","trainer.save_optimizer_and_scheduler(training_args.output_dir)"]},{"cell_type":"markdown","metadata":{"id":"2vgxbUDKqdVZ"},"source":["## **Step 7: Export To ONNX**\n","\n","Run the following to export the model to ONNX. The script creates a `deployment` folder containing ONNX file and the necessary configuration files (e.g. 
`tokenizer.json`) for deployment with DeepSparse."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-rhWjiHBeR7M"},"outputs":[],"source":["!sparseml.transformers.export_onnx \\\n"," --model_path training_output \\\n"," --task token_classification"]},{"cell_type":"markdown","metadata":{"id":"a72xHJ5594C4"},"source":["## **Deploy with DeepSparse**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-XubpXohO_8A"},"outputs":[],"source":["%pip install deepsparse"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"m_USM8mCPETg"},"outputs":[],"source":["from deepsparse import Pipeline\n","\n","pipeline = Pipeline.create(\"token_classification\", model_path=\"./deployment\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":22,"status":"ok","timestamp":1677768836760,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"Bncg7Xx5ONqB","outputId":"b753f175-289c-4fd1-bc4f-468060349f0b"},"outputs":[{"name":"stdout","output_type":"stream","text":["[[TokenClassificationResult(entity='B-LOC', score=0.9966669082641602, index=1, word='japan', start=0, end=5, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.7956981062889099, index=8, word='world', start=23, end=28, is_grouped=False),\n"," TokenClassificationResult(entity='I-MISC', score=0.9346566796302795, index=9, word='cup', start=29, end=32, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.4572566747665405, index=19, word='fifa', start=73, end=77, is_grouped=False)]]\n"]}],"source":["from pprint import pprint\n","prediction = pipeline(\"Japan, co-hosts of the World Cup in 2002 and ranked 20th in the world by FIFA, are favourites to regain their title here.\")\n","pprint(prediction.predictions)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1677768836760,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"4zqbsVpoSZ-R","outputId":"0032570b-6637-47b8-ad19-b25d834a944b"},"outputs":[{"name":"stdout","output_type":"stream","text":["[[TokenClassificationResult(entity='B-LOC', score=0.9878184795379639, index=1, word='china', start=0, end=5, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.7045027613639832, index=18, word='u', start=93, end=94, is_grouped=False),\n"," TokenClassificationResult(entity='I-LOC', score=0.31070953607559204, index=19, word='##zbek', start=94, end=98, is_grouped=False),\n"," TokenClassificationResult(entity='B-PER', score=0.9934289455413818, index=21, word='igor', start=107, end=111, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9966109395027161, index=22, word='sh', start=112, end=114, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9972546696662903, index=23, word='##k', start=114, end=115, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9971543550491333, index=24, word='##vy', start=115, end=117, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9964032173156738, index=25, word='##rin', start=117, end=120, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.8585354089736938, index=44, word='chinese', start=205, end=212, is_grouped=False)]]\n"]}],"source":["prediction = pipeline(\"China controlled most of the match and saw several 
chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net.\")\n","pprint(prediction.predictions)"]}],"metadata":{"accelerator":"GPU","colab":{"provenance":[{"file_id":"1NzTgvXgE5e17JdD1BKXJu-ABvE2-5gKj","timestamp":1677770924764},{"file_id":"1nCs9zm2goooiw0gfU6S4ACRiBiPCxst9","timestamp":1677449350236},{"file_id":"1cXfeYQ_ZbnJRoQsaYOIDR2N7YP--mMiL","timestamp":1677358343826},{"file_id":"1Zawa0sifXr2wIl9tbF7ySJ7xYY0dtTzI","timestamp":1677345946788}]},"gpuClass":"standard","kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.6"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} diff --git a/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-custom-teacher-wnut.ipynb b/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-custom-teacher-wnut.ipynb index 45370c6af56..b3152185af8 100644 --- a/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-custom-teacher-wnut.ipynb +++ b/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-custom-teacher-wnut.ipynb @@ -587,8 +587,7 @@ "# initialize model\n", "model_kwargs = {\"config\": config}\n", "model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n", - "model = AutoModelForTokenClassification.from_pretrained(model_path, **model_kwargs,)\n", - "SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed) # prints metrics on sparsity profile" + "model = AutoModelForTokenClassification.from_pretrained(model_path, **model_kwargs,)" ] }, {
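
Note on the removed calls: `SparseAutoModel.log_model_load(...)` only printed diagnostics about the sparsity profile of the loaded student and teacher checkpoints; deleting it does not change how the models are loaded or trained. Tutorial readers who still want that visibility can compute a similar summary themselves. The sketch below is an assumption-based example rather than SparseML API: it assumes the loaded `model`/`teacher` objects are ordinary `torch.nn.Module` instances (as returned by the Hugging Face `AutoModelFor*` classes used in these notebooks), and the helper name `print_sparsity_profile` is hypothetical.

```python
from torch import nn


def print_sparsity_profile(model: nn.Module, label: str) -> None:
    """Print the fraction of zero-valued weights per Linear layer and overall.

    Hypothetical helper, not part of SparseML; it only approximates the
    information that SparseAutoModel.log_model_load used to log.
    """
    total_params = 0
    total_zeros = 0
    for name, module in model.named_modules():
        # the prunable layers in BERT-style models are nn.Linear
        if isinstance(module, nn.Linear):
            weight = module.weight.detach()
            zeros = int((weight == 0).sum().item())
            numel = weight.numel()
            total_zeros += zeros
            total_params += numel
            print(f"{label} | {name}: {zeros / numel:.2%} sparse ({zeros}/{numel})")
    if total_params:
        print(f"{label} | overall Linear sparsity: {total_zeros / total_params:.2%}")


# usage after loading the checkpoints as in the notebooks:
# print_sparsity_profile(model, "student")
# print_sparsity_profile(teacher, "teacher")
```

On the pruned90 student checkpoint this should report roughly 90% zeros across the encoder's Linear layers, while the dense teacher should report close to 0%.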