\n", - "\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f30e8952c50e4f6ca374dc6972b95fca", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:nncf:12 ignored nodes were found by name in the NNCFGraph\n", - "INFO:nncf:16 ignored nodes were found by name in the NNCFGraph\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "520c7840eb5440859d2b9ba2123049a7", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5ddc5b2c750d4a4cbe67f0b8f7be4faf", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "69846e3229834f7992738d50a33b354b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Quantizing decoder with past\n" - ] - }, - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4c48f03790324ee99afdb4031429a09a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:nncf:24 ignored nodes were found by name in the NNCFGraph\n", - "INFO:nncf:24 ignored nodes were found by name in the NNCFGraph\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "03249855c07f4b83bfb4289608bca05b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fea73d3c378a442bacb43bf1ab11b4ec", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Compiling the encoder to AUTO ...\n", - "Compiling the decoder to AUTO ...\n", - "Compiling the decoder to AUTO ...\n" - ] - } - ], + "outputs": [], "source": [ "%%skip not $to_quantize.value\n", "\n", "import gc\n", "import shutil\n", "import nncf\n", - "from datasets import load_dataset\n", - "from tqdm.notebook import tqdm\n", - "\n", - "def extract_input_features(sample):\n", - " input_features = processor(\n", - " sample[\"audio\"][\"array\"],\n", - " sampling_rate=sample[\"audio\"][\"sampling_rate\"],\n", - " return_tensors=\"pt\",\n", - " ).input_features\n", - " return input_features\n", - "\n", + "import openvino as ov\n", "\n", "\n", - "CALIBRATION_DATASET_SIZE = 50\n", + "CALIBRATION_DATASET_SIZE = 30\n", "quantized_model_path = Path(f\"{model_dir}_quantized\")\n", "\n", "\n", "def quantize(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):\n", " if not quantized_model_path.exists():\n", - " encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(\n", - " ov_model, calibration_dataset_size\n", - " )\n", + " encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(ov_model, calibration_dataset_size)\n", " print(\"Quantizing encoder\")\n", " quantized_encoder = nncf.quantize(\n", " ov_model.encoder.model,\n", @@ -1146,7 +779,7 @@ " subset_size=len(encoder_calibration_data),\n", " model_type=nncf.ModelType.TRANSFORMER,\n", " # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n", - " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.50)\n", + " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.80),\n", " )\n", " ov.save_model(quantized_encoder, quantized_model_path / \"openvino_encoder_model.xml\")\n", " del quantized_encoder\n", @@ -1160,7 +793,7 @@ " subset_size=len(decoder_calibration_data),\n", " model_type=nncf.ModelType.TRANSFORMER,\n", " # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n", - " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.96)\n", + " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.96),\n", " )\n", " ov.save_model(quantized_decoder_with_past, quantized_model_path / \"openvino_decoder_with_past_model.xml\")\n", " del quantized_decoder_with_past\n", @@ -1173,14 +806,24 @@ " shutil.copy(model_path / \"generation_config.json\", quantized_model_path / \"generation_config.json\")\n", " shutil.copy(model_path / \"openvino_decoder_model.xml\", quantized_model_path / \"openvino_decoder_model.xml\")\n", " shutil.copy(model_path / \"openvino_decoder_model.bin\", quantized_model_path / \"openvino_decoder_model.bin\")\n", - "\n", - " quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_model_path, compile=False)\n", - " quantized_ov_model.to(device.value)\n", - " quantized_ov_model.compile()\n", - " return quantized_ov_model\n", - "\n", - "\n", - "ov_quantized_model = quantize(ov_model, CALIBRATION_DATASET_SIZE)" + " shutil.copy(model_path / \"openvino_tokenizer.xml\", quantized_model_path / \"openvino_tokenizer.xml\")\n", + " shutil.copy(model_path / \"openvino_tokenizer.bin\", quantized_model_path / \"openvino_tokenizer.bin\")\n", + " shutil.copy(model_path / 
\"openvino_detokenizer.xml\", quantized_model_path / \"openvino_detokenizer.xml\")\n", + " shutil.copy(model_path / \"openvino_detokenizer.bin\", quantized_model_path / \"openvino_detokenizer.bin\")\n", + " shutil.copy(model_path / \"tokenizer_config.json\", quantized_model_path / \"tokenizer_config.json\")\n", + " shutil.copy(model_path / \"tokenizer.json\", quantized_model_path / \"tokenizer.json\")\n", + " shutil.copy(model_path / \"vocab.json\", quantized_model_path / \"vocab.json\")\n", + " shutil.copy(model_path / \"preprocessor_config.json\", quantized_model_path / \"preprocessor_config.json\")\n", + " shutil.copy(model_path / \"special_tokens_map.json\", quantized_model_path / \"special_tokens_map.json\")\n", + " shutil.copy(model_path / \"normalizer.json\", quantized_model_path / \"normalizer.json\")\n", + " shutil.copy(model_path / \"merges.txt\", quantized_model_path / \"merges.txt\")\n", + " shutil.copy(model_path / \"added_tokens.json\", quantized_model_path / \"added_tokens.json\")\n", + "\n", + " quantized_ov_pipe = openvino_genai.WhisperPipeline(str(quantized_model_path), device=device.value)\n", + " return quantized_ov_pipe\n", + "\n", + "\n", + "quantized_ov_pipe = quantize(ov_model, CALIBRATION_DATASET_SIZE)" ] }, { @@ -1196,64 +839,13 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\n", - "00:00:00,000 --> 00:00:05,000\n", - " What's that?\n", - "\n", - "2\n", - "00:00:05,000 --> 00:00:07,000\n", - " Oh, wow.\n", - "\n", - "3\n", - "00:00:09,000 --> 00:00:11,000\n", - " Hello humans.\n", - "\n", - "4\n", - "00:00:14,000 --> 00:00:15,000\n", - " Focus on me.\n", - "\n", - "5\n", - "00:00:15,000 --> 00:00:16,000\n", - " Focus on the guard.\n", - "\n", - "6\n", - "00:00:18,000 --> 00:00:20,000\n", - " Don't tell anyone what you're seen in here.\n", - "\n", - "7\n", - "00:00:22,000 --> 00:00:24,000\n", - " Have you seen what's in there?\n", - "\n", - "8\n", - "00:00:24,000 --> 00:00:25,000\n", - " They have intel.\n", - "\n", - "9\n", - "00:00:25,000 --> 00:00:27,000\n", - " This is where it all changes.\n", - "\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "if ov_quantized_model is not None:\n", - " int8_pipe = pipeline(\n", - " \"automatic-speech-recognition\",\n", - " model=ov_quantized_model,\n", - " chunk_length_s=30,\n", - " tokenizer=processor.tokenizer,\n", - " feature_extractor=processor.feature_extractor,\n", - " )\n", " inputs, duration = get_audio(output_file)\n", - " transcription = int8_pipe(inputs, generate_kwargs={\"task\": task.value}, return_timestamps=True)[\"chunks\"]\n", + " transcription = quantized_ov_pipe.generate(inputs[\"raw\"], task=task.value, return_timestamps=True).chunks\n", " srt_lines = prepare_srt(transcription, filter_duration=duration)\n", " print(\"\".join(srt_lines))\n", " widgets.Video.from_file(output_file, loop=False, width=800, height=800)" @@ -1269,20 +861,18 @@ "\n", "Finally, we compare original and quantized Whisper models from accuracy and performance stand-points.\n", "\n", - "To measure accuracy, we use `1 - WER` as a metric, where WER stands for Word Error Rate.\n", - "\n", - "When measuring inference time, we do it separately for encoder and decoder-with-past model forwards, and for the whole model inference too." + "To measure accuracy, we use `1 - WER` as a metric, where WER stands for Word Error Rate." 
]
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 20,
   "metadata": {},
   "outputs": [
   [collapsed: two tqdm progress-widget outputs — "Measuring performance and accuracy:   0%|          | 0/50" — whose widget model_ids changed between runs]
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Encoder performance speedup: 1.352\n",
-      "Decoder with past performance speedup: 1.342\n",
-      "Whole pipeline performance speedup: 1.350\n",
-      "Whisper transcription word accuracy. Original model: 81.67%. Quantized model: 83.67%.\n",
-      "Accuracy drop: -1.99%.\n"
+      "Whole pipeline performance speedup: 1.452\n",
+      "Whisper transcription word accuracy. Original model: 81.77%. Quantized model: 82.97%.\n",
+      "Accuracy drop: -1.20%.\n"
     ]
    }
   ],
   "source": [
    "%%skip not $to_quantize.value\n",
    "\n",
    "import time\n",
-    "from contextlib import contextmanager\n",
    "from jiwer import wer, wer_standardize\n",
    "\n",
    "TEST_DATASET_SIZE = 50\n",
-    "MEASURE_TIME = False\n",
-    "\n",
-    "@contextmanager\n",
-    "def time_measurement():\n",
-    "    global MEASURE_TIME\n",
-    "    try:\n",
-    "        MEASURE_TIME = True\n",
-    "        yield\n",
-    "    finally:\n",
-    "        MEASURE_TIME = False\n",
-    "\n",
-    "def time_fn(obj, fn_name, time_list):\n",
-    "    original_fn = getattr(obj, fn_name)\n",
-    "\n",
-    "    def wrapper(*args, **kwargs):\n",
-    "        if not MEASURE_TIME:\n",
-    "            return original_fn(*args, **kwargs)\n",
-    "        start_time = time.perf_counter()\n",
-    "        result = original_fn(*args, **kwargs)\n",
-    "        end_time = time.perf_counter()\n",
-    "        time_list.append(end_time - start_time)\n",
-    "        return result\n",
-    "\n",
-    "    setattr(obj, fn_name, wrapper)\n",
    "\n",
    "def calculate_transcription_time_and_accuracy(ov_model, test_samples):\n",
-    "    encoder_infer_times = []\n",
-    "    decoder_with_past_infer_times = []\n",
    "    whole_infer_times = []\n",
-    "    time_fn(ov_model, \"generate\", whole_infer_times)\n",
-    "    time_fn(ov_model.encoder, \"forward\", encoder_infer_times)\n",
-    "    time_fn(ov_model.decoder_with_past, \"forward\", decoder_with_past_infer_times)\n",
    "\n",
    "    ground_truths = []\n",
    "    predictions = []\n",
    "    for data_item in tqdm(test_samples, desc=\"Measuring performance and accuracy\"):\n",
-    "        input_features = extract_input_features(data_item)\n",
-    "\n",
-    "        with time_measurement():\n",
-    "            predicted_ids = ov_model.generate(input_features)\n",
-    "        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
+    "        start_time = time.perf_counter()\n",
+    "        transcription = ov_model.generate(data_item[\"audio\"][\"array\"], return_timestamps=True)\n",
+    "        end_time = time.perf_counter()\n",
+    "        whole_infer_times.append(end_time - start_time)\n",
    "\n",
    "        ground_truths.append(data_item[\"text\"])\n",
-    "        predictions.append(transcription[0])\n",
+    "        predictions.append(transcription.texts[0])\n",
    "\n",
    "    word_accuracy = (1 - wer(ground_truths, predictions, reference_transform=wer_standardize,\n",
    "                             hypothesis_transform=wer_standardize)) * 100\n",
    "    mean_whole_infer_time = sum(whole_infer_times)\n",
-    "    mean_encoder_infer_time = sum(encoder_infer_times)\n",
-    "    mean_decoder_with_time_infer_time = sum(decoder_with_past_infer_times)\n",
-    "    return word_accuracy, (mean_whole_infer_time, mean_encoder_infer_time, mean_decoder_with_time_infer_time)\n",
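+    "    # Total wall-clock generate() time over the test set; the speedup printed below\n",
+    "    # is the ratio of these totals for the original and quantized pipelines.\n",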
+    "    return word_accuracy, mean_whole_infer_time\n",
    "\n",
    "test_dataset = load_dataset(\"openslr/librispeech_asr\", \"clean\", split=\"validation\", streaming=True, trust_remote_code=True)\n",
    "test_dataset = test_dataset.shuffle(seed=42).take(TEST_DATASET_SIZE)\n",
    "test_samples = [sample for sample in test_dataset]\n",
    "\n",
-    "accuracy_original, times_original = calculate_transcription_time_and_accuracy(ov_model, test_samples)\n",
-    "accuracy_quantized, times_quantized = calculate_transcription_time_and_accuracy(ov_quantized_model, test_samples)\n",
-    "print(f\"Encoder performance speedup: {times_original[1] / times_quantized[1]:.3f}\")\n",
-    "print(f\"Decoder with past performance speedup: {times_original[2] / times_quantized[2]:.3f}\")\n",
-    "print(f\"Whole pipeline performance speedup: {times_original[0] / times_quantized[0]:.3f}\")\n",
+    "accuracy_original, times_original = calculate_transcription_time_and_accuracy(ov_pipe, test_samples)\n",
+    "accuracy_quantized, times_quantized = calculate_transcription_time_and_accuracy(quantized_ov_pipe, test_samples)\n",
+    "print(f\"Whole pipeline performance speedup: {times_original / times_quantized:.3f}\")\n",
    "print(f\"Whisper transcription word accuracy. Original model: {accuracy_original:.2f}%. Quantized model: {accuracy_quantized:.2f}%.\")\n",
    "print(f\"Accuracy drop: {accuracy_original - accuracy_quantized:.2f}%.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
+    "scrolled": true,
    "test_replace": {
     "    demo.launch(debug=True)": "    demo.launch()",
     "    demo.launch(share=True, debug=True)": "    demo.launch(share=True)"
    }
   },
   "outputs": [],
   "source": [
-    "def transcribe(url, task, use_int8):\n",
-    "    output_file = Path(\"downloaded_video.mp4\")\n",
-    "    ydl_ops = {\"format\": \"best[ext=mp4]\", \"outtmpl\": output_file.as_posix()}\n",
-    "    with yt_dlp.YoutubeDL(ydl_ops) as ydl:\n",
-    "        ydl.download(link.value)\n",
-    "    inputs, duration = get_audio(output_file)\n",
-    "    m_pipe = int8_pipe if use_int8 else pipe\n",
-    "    transcription = m_pipe(inputs, generate_kwargs={\"task\": task.lower()}, return_timestamps=True)[\"chunks\"]\n",
+    "def_config = ov_pipe.get_generation_config()\n",
+    "\n",
+    "\n",
+    "def transcribe(video_path, task, use_int8):\n",
+    "    data_path = Path(video_path)\n",
+    "    inputs, duration = get_audio(data_path)\n",
+    "    m_pipe = quantized_ov_pipe if use_int8 else ov_pipe\n",
+    "\n",
+    "    # inputs[\"raw\"] holds 16 kHz mono samples, so len(...) / 16000 is the audio duration in seconds\n",
+    "    audio_seconds = len(inputs[\"raw\"]) / 16000\n",
+    "    if audio_seconds > 30:\n",
+    "        config = ov_pipe.get_generation_config()\n",
+    "        chunk_num = math.ceil(audio_seconds / 30)\n",
+    "        config.max_length = chunk_num * def_config.max_length\n",
+    "        m_pipe.set_generation_config(config)\n",
+    "\n",
+    "    transcription = m_pipe.generate(inputs[\"raw\"], task=task.lower(), return_timestamps=True).chunks\n",
    "    srt_lines = prepare_srt(transcription, duration)\n",
-    "    with output_file.with_suffix(\".srt\").open(\"w\") as f:\n",
+    "    with data_path.with_suffix(\".srt\").open(\"w\") as f:\n",
    "        f.writelines(srt_lines)\n",
-    "    return [str(output_file), str(output_file.with_suffix(\".srt\"))]\n",
+    "    return [str(data_path), str(data_path.with_suffix(\".srt\"))]\n",
    "\n",
    "\n",
    "if not Path(\"gradio_helper.py\").exists():\n",
    "    r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/whisper-subtitles-generation/gradio_helper.py\")\n",
    "    open(\"gradio_helper.py\", \"w\").write(r.text)\n",
    "\n",
    "from gradio_helper import make_demo\n",
    "\n",
-    "demo = make_demo(fn=transcribe, quantized=ov_quantized_model is not None)\n",
+    "demo = make_demo(fn=transcribe, quantized=ov_quantized_model is not None, sample_path=output_file)\n",
    "\n",
    "try:\n",
    "    demo.launch(debug=True)\n",
    "except Exception:\n",
    "    demo.launch(share=True, debug=True)\n",
   ]
  }
 ],
"metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1461,7 +1023,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" }, "openvino_notebooks": { "imageUrl": "https://user-images.githubusercontent.com/29454499/204548693-1304ef33-c790-490d-8a8b-d5766acb6254.png",