diff --git a/.ci/skipped_notebooks.yml b/.ci/skipped_notebooks.yml index a6c2db6b337..5804367cb9f 100644 --- a/.ci/skipped_notebooks.yml +++ b/.ci/skipped_notebooks.yml @@ -435,13 +435,6 @@ - ubuntu-20.04 - ubuntu-22.04 - windows-2019 -- notebook: notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb - skips: - - os: - - macos-12 - - ubuntu-20.04 - - ubuntu-22.04 - - windows-2019 - notebook: notebooks/hunyuan-dit-image-generation/hunyuan-dit-image-generation.ipynb skips: - os: diff --git a/.github/workflows/build_treon_reusable.yml b/.github/workflows/build_treon_reusable.yml index 06c6c49e3e9..0a8605b8df0 100644 --- a/.github/workflows/build_treon_reusable.yml +++ b/.github/workflows/build_treon_reusable.yml @@ -76,16 +76,20 @@ jobs: uses: xom9ikk/dotenv@ac290ca23a42155a0cba1031d23afa46240116a9 # v2.3.0 with: path: ./.github/workflows - + # Packages that notebooks need to run in plain os - name: Install required packages if: ${{ !inputs.container }} shell: bash run: | if [ "$RUNNER_OS" == "Linux" ]; then sudo apt-get update -y - sudo apt-get install libsndfile1 -y + sudo apt-get install libsndfile1 ffmpeg -y + elif [ "$RUNNER_OS" == "macOS" ]; then + brew install ffmpeg + elif [ "$RUNNER_OS" == "Windows" ]; then + choco install ffmpeg-full fi - + # Packages that notebooks need to run in docker container - name: Install required packages (container) if: ${{ inputs.container }} shell: bash @@ -95,7 +99,7 @@ jobs: run: | if [ "$RUNNER_OS" == "Linux" ]; then apt-get update -y - apt-get install git curl wget libsndfile1 libssl-dev unzip libsqlite3-dev libedit-dev libgl1 libgl1-mesa-glx libglib2.0-0 tk -y + apt-get install git curl wget libsndfile1 libssl-dev unzip libsqlite3-dev libedit-dev libgl1 libgl1-mesa-glx libglib2.0-0 ffmpeg tk -y wget https://raw.githubusercontent.com/openvinotoolkit/openvino/master/scripts/install_dependencies/install_openvino_dependencies.sh chmod +x ./install_openvino_dependencies.sh ./install_openvino_dependencies.sh -c=core -c=dev -c=gpu -y diff --git a/notebooks/whisper-asr-genai/README.md b/notebooks/whisper-asr-genai/README.md index 1ea38b3abb4..b00eefa228e 100644 --- a/notebooks/whisper-asr-genai/README.md +++ b/notebooks/whisper-asr-genai/README.md @@ -22,7 +22,7 @@ The tutorial consists of following steps: ## Installation Instructions -This is a self-contained example that relies solely on its code.
+This example requires `ffmpeg` to be installed. All other required dependencies will be installed by the notebook itself.
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start. For details, please refer to [Installation Guide](../../README.md). \ No newline at end of file diff --git a/notebooks/whisper-asr-genai/whisper-asr-genai.ipynb b/notebooks/whisper-asr-genai/whisper-asr-genai.ipynb index f83573f4bcc..2e354bae169 100644 --- a/notebooks/whisper-asr-genai/whisper-asr-genai.ipynb +++ b/notebooks/whisper-asr-genai/whisper-asr-genai.ipynb @@ -22,7 +22,6 @@ "\n", "\n", "\n", - "\n", "### Installation Instructions\n", "\n", "This is a self-contained example that relies solely on its own code.\n", @@ -33,8 +32,6 @@ "\n", "\n", "\n", - "\n", - "\n", "#### Table of contents:\n", "\n", "- [Prerequisites](#Prerequisites)\n", @@ -64,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "bb9fc7f3-cea0-4adf-9ee6-4a3d15931db7", "metadata": { "ExecuteTime": { @@ -128,7 +125,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c5035bf98145426fbc8f05edc8d7924a", + "model_id": "ab4d5dd6e7794d5184db6d9b49281717", "version_major": 2, "version_minor": 0 }, @@ -156,6 +153,10 @@ " \"openai/whisper-tiny\",\n", " ],\n", " \"English-only models\": [\n", + " \"distil-whisper/distil-large-v2\",\n", + " \"distil-whisper/distil-large-v3\",\n", + " \"distil-whisper/distil-medium.en\",\n", + " \"distil-whisper/distil-small.en\",\n", " \"openai/whisper-medium.en\",\n", " \"openai/whisper-small.en\",\n", " \"openai/whisper-base.en\",\n", @@ -175,22 +176,22 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "9f42d9b8", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3209303207294a53b9b41ac90241982c", + "model_id": "fb48f63ba96c43388fa987c6c78f6e2c", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model:', index=6, options=('openai/whisper-large-v3', 'openai/whisper-large-v2', 'openai…" + "Dropdown(description='Model:', index=7, options=('openai/whisper-large-v3-turbo', 'openai/whisper-large-v3', '…" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -208,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "e5382431-497e-4688-b4ec-8958a92163e7", "metadata": { "ExecuteTime": { @@ -248,7 +249,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "620020a6", "metadata": {}, "outputs": [ @@ -256,16 +257,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "'data/librispeech_asr_demo_validation_short.wav' already exists.\n" + "'data/courtroom.wav' already exists.\n" ] }, { "data": { "text/plain": [ - "PosixPath('/home/labuser/work/notebook/openvino_notebooks/notebooks/whisper-asr-genai/data/librispeech_asr_demo_validation_short.wav')" + "PosixPath('/home/labuser/work/notebook/openvino_notebooks/notebooks/whisper-asr-genai/data/courtroom.wav')" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -273,11 +274,11 @@ "source": [ "from notebook_utils import download_file\n", "\n", - "en_example_short = Path(\"data\", \"librispeech_asr_demo_validation_short.wav\")\n", + "en_example_short = Path(\"data\", \"courtroom.wav\")\n", "\n", "# a wav sample\n", "download_file(\n", - " \"https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/librispeech_asr_demo_validation_0.wav\",\n", + " 
\"https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/courtroom.wav\",\n", " en_example_short.name,\n", " directory=en_example_short.parent,\n", ")" @@ -285,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "61558185", "metadata": {}, "outputs": [], @@ -307,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "218836e9", "metadata": {}, "outputs": [ @@ -316,7 +317,7 @@ "text/html": [ "\n", " \n", " " @@ -340,7 +341,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Result: Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.\n" + "Result: Colonel Jessif, did you order the code rate? You don't have to answer that question. I'll answer the question. You want answers? I think I'm entitled. You want answers? I want the truth. You can't handle the truth.\n" ] } ], @@ -369,14 +370,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "9c55f94d", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "88e7bfa5930f4d4ab70361207e3011ef", + "model_id": "0faae0d4b763434f83d90306719f5767", "version_major": 2, "version_minor": 0 }, @@ -384,7 +385,7 @@ "Dropdown(description='Dataset language:', index=4, options=('japanese', 'dutch', 'french', 'spanish', 'italian…" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -408,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "515bec62", "metadata": {}, "outputs": [], @@ -424,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "id": "bb3b3e3f", "metadata": {}, "outputs": [ @@ -489,66 +490,10 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "36f756d5", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n" - ] - }, - { - "data": { - "text/markdown": [ - "**Export command:**" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "`optimum-cli export openvino --model openai/whisper-tiny --library transformers --task automatic-speech-recognition-with-past --framework pt whisper-tiny`" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. 
This warning will be raised to an exception in v4.41.\n", - "Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}\n", - "Using framework PyTorch: 2.3.1+cpu\n", - "Overriding 1 configuration item(s)\n", - "\t- use_cache -> False\n", - "/home/labuser/work/notebook/genai_whisper/lib/python3.10/site-packages/transformers/models/whisper/modeling_whisper.py:1070: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if input_features.shape[-1] != expected_seq_length:\n", - "/home/labuser/work/notebook/genai_whisper/lib/python3.10/site-packages/transformers/models/whisper/modeling_whisper.py:387: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):\n", - "Using framework PyTorch: 2.3.1+cpu\n", - "Overriding 1 configuration item(s)\n", - "\t- use_cache -> True\n", - "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.\n", - "/home/labuser/work/notebook/genai_whisper/lib/python3.10/site-packages/transformers/models/whisper/modeling_whisper.py:100: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", - " if sequence_length != 1:\n", - "Using framework PyTorch: 2.3.1+cpu\n", - "Overriding 1 configuration item(s)\n", - "\t- use_cache -> True\n" - ] - } - ], + "outputs": [], "source": [ "import logging\n", "import nncf\n", @@ -584,14 +529,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "id": "49665de3", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "85a51314434140b0b86368a2160bb2ff", + "model_id": "98cd4ef7fc3145719f7c88b902a286fd", "version_major": 2, "version_minor": 0 }, @@ -599,7 +544,7 @@ "Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -614,7 +559,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "e2896f4c", "metadata": {}, "outputs": [], @@ -635,7 +580,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "id": "a8ad6087", "metadata": {}, "outputs": [ @@ -644,7 +589,7 @@ "text/html": [ "\n", " \n", " " @@ -660,19 +605,72 @@ "name": "stdout", "output_type": "stream", "text": [ - "Result: Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.\n" + "Result: Colonel Jessif, did you order the code rate? You don't have to answer that question. I'll answer the question. You want answers? I think I'm entitled. You want answers? I want the truth. You can't handle the truth.\n" ] } ], "source": [ - "sample = copy.deepcopy(en_raw_speech)\n", - "\n", - "genai_result = ov_pipe.generate(sample)\n", + "genai_result = ov_pipe.generate(en_raw_speech)\n", "\n", - "display(ipd.Audio(sample, rate=samplerate))\n", + "display(ipd.Audio(en_raw_speech, rate=samplerate))\n", "print(f\"Result: {genai_result}\")" ] }, + { + "cell_type": "markdown", + "id": "413fbb0e", + "metadata": {}, + "source": [ + "Whisper could provide a phrase-level timestamps for audio. Let's try this scenario, we will specify `return_timestamps=True` for `generate` method.\n", + "\n", + "`generate` method with `return_timestamps` set to `True` will return `chunks`, which contain attributes: `text`, `start_ts` and `end_ts` in seconds." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "88a94ff4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.0sec. ---> 3.0sec.\n", + " Colonel Jessif, did you order the code rate?\n", + "\n", + "3.0sec. ---> 4.5sec.\n", + " You don't have to answer that question.\n", + "\n", + "4.5sec. ---> 6.5sec.\n", + " I'll answer the question.\n", + "\n", + "6.5sec. ---> 8.0sec.\n", + " You want answers?\n", + "\n", + "8.0sec. ---> 9.0sec.\n", + " I think I'm entitled.\n", + "\n", + "9.0sec. ---> 10.0sec.\n", + " You want answers?\n", + "\n", + "10.0sec. ---> 11.0sec.\n", + " I want the truth.\n", + "\n", + "11.0sec. ---> 13.0sec.\n", + " You can't handle the truth.\n", + "\n" + ] + } + ], + "source": [ + "genai_result_timestamps = ov_pipe.generate(en_raw_speech, return_timestamps=True)\n", + "\n", + "for segment in genai_result_timestamps.chunks:\n", + " print(f\"{segment.start_ts}sec. 
---> {segment.end_ts}sec.\")\n", + " print(f\"{segment.text}\\n\")" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -685,7 +683,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "id": "4d821d5e", "metadata": {}, "outputs": [ @@ -728,7 +726,7 @@ "}\n", "\n", "if model_type.value == \"Multilingual models\":\n", - " sample = copy.deepcopy(mls_example[\"audio\"])\n", + " sample = mls_example[\"audio\"]\n", "\n", " genai_result_ml = ov_pipe.generate(sample[\"array\"], max_new_tokens=100, task=\"translate\", language=languages_genai[SAMPLE_LANG.value])\n", "\n", @@ -749,7 +747,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "id": "6ddafe5c-3238-40d3-b8ed-9d50c73f0d8a", "metadata": { "ExecuteTime": { @@ -780,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "id": "94025726-5c09-42b8-9046-9fbbe73afc47", "metadata": { "ExecuteTime": { @@ -792,7 +790,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e3457f845611449794cd97a9c91c03fe", + "model_id": "ac60456bdcfd492fadf812ec87f3c67a", "version_major": 2, "version_minor": 0 }, @@ -806,7 +804,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4df786e07f43403a96622ac0048fd7a8", + "model_id": "8c857e2a5a8b4b96bf44fc7adf5e0c3f", "version_major": 2, "version_minor": 0 }, @@ -825,7 +823,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "31a08241-497e-4fd9-9ca9-d59c2602b8d4", "metadata": { "ExecuteTime": { @@ -838,9 +836,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Mean torch openai/whisper-tiny generation time: 0.291s\n", - "Mean openvino openai/whisper-tiny generation time: 0.159s\n", - "Performance openai/whisper-tiny openvino speedup: 1.832\n" + "Mean torch openai/whisper-tiny generation time: 0.624s\n", + "Mean openvino openai/whisper-tiny generation time: 0.344s\n", + "Performance openai/whisper-tiny openvino speedup: 1.814\n" ] } ], @@ -874,14 +872,14 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "id": "00597544", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "76d00c1fe75f4b4aa8ecbfa18e80b295", + "model_id": "16f7742648b34d4bbf69ecc45eeb7262", "version_major": 2, "version_minor": 0 }, @@ -889,7 +887,7 @@ "Checkbox(value=True, description='Quantization')" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -904,7 +902,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "id": "ead4ab0b", "metadata": {}, "outputs": [], @@ -942,26 +940,15 @@ "+model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)\n", "```\n", "\n", - "Like the original PyTorch model, the OpenVINO model is also compatible with HuggingFace [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline) interface for `automatic-speech-recognition`. \n", - "Pipeline can be used for long audio transcription. Distil-Whisper uses a chunked algorithm to transcribe long-form audio files. In practice, this chunked long-form algorithm is 9x faster than the sequential algorithm proposed by OpenAI in the Whisper paper. To enable chunking, pass the chunk_length_s parameter to the pipeline. For Distil-Whisper, a chunk length of 15 seconds is optimal. To activate batching, pass the argument batch_size." 
+ "Like the original PyTorch model, the OpenVINO model is also compatible with HuggingFace [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline) interface for `automatic-speech-recognition`. " ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "id": "c9c4ee71", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Compiling the encoder to CPU ...\n", - "Compiling the decoder to CPU ...\n", - "Compiling the decoder to CPU ...\n" - ] - } - ], + "outputs": [], "source": [ "from optimum.intel.openvino import OVModelForSpeechSeq2Seq\n", "\n", @@ -983,7 +970,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "id": "97e0015d", "metadata": {}, "outputs": [], @@ -1035,7 +1022,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "233c1436", "metadata": {}, "outputs": [], @@ -1049,14 +1036,6 @@ "from datasets import load_dataset\n", "from tqdm.notebook import tqdm\n", "\n", - "def extract_input_features(sample):\n", - " input_features = processor(\n", - " sample[\"audio\"][\"array\"],\n", - " sampling_rate=sample[\"audio\"][\"sampling_rate\"],\n", - " return_tensors=\"pt\",\n", - " ).input_features\n", - " return input_features\n", - "\n", "\n", "\n", "CALIBRATION_DATASET_SIZE = 30\n", @@ -1135,7 +1114,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "id": "fae6280a", "metadata": {}, "outputs": [ @@ -1144,7 +1123,7 @@ "text/html": [ "\n", " \n", " " @@ -1160,20 +1139,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Original : Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.\n", - "Quantized: Mr Quilter is the apostle of the middle classes and we are glad to welcome his gospel.\n" + "Original : Colonel Jessif, did you order the code rate? You don't have to answer that question. I'll answer the question. You want answers? I think I'm entitled. You want answers? I want the truth. You can't handle the truth.\n", + "Quantized: Don, I'll just, if you order the code right. You don have to answer that question. I'll answer the question. You want answers. I think I'm entitled you want answer. I want the truth. You can't handle the truth. 
You can't handle the truth.\n" ] } ], "source": [ "%%skip not $to_quantize.value\n", "\n", - "sample = copy.deepcopy(en_raw_speech)\n", + "genai_result = ov_pipe.generate(en_raw_speech)\n", + "quantized_genai_result = ov_quantized_pipe.generate(en_raw_speech)\n", "\n", - "genai_result = ov_pipe.generate(sample)\n", - "quantized_genai_result = ov_quantized_pipe.generate(sample)\n", - "\n", - "display(ipd.Audio(sample, rate=samplerate))\n", + "display(ipd.Audio(en_raw_speech, rate=samplerate))\n", "\n", "print(f\"Original : {genai_result}\")\n", "print(f\"Quantized: {quantized_genai_result}\")" @@ -1195,14 +1172,14 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "id": "e61446fa", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0662e84788804369927b0648a2be3b53", + "model_id": "8110c224fb8b46bba00a532d00ae134e", "version_major": 2, "version_minor": 0 }, @@ -1216,7 +1193,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5ae11e7aea904de89ee5bcd4bd96c65c", + "model_id": "ebce710a92a041329c33ee2c1a33738e", "version_major": 2, "version_minor": 0 }, @@ -1231,7 +1208,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Whole pipeline performance speedup: 1.339\n", + "Whole pipeline performance speedup: 1.499\n", "Whisper transcription word accuracy. Original model: 82.88%. Quantized model: 84.13%.\n", "Accuracy drop: -1.25%.\n" ] @@ -1305,7 +1282,7 @@ "import requests\n", "\n", "if not Path(\"gradio_helper.py\").exists():\n", - " r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/distil-whisper-asr/gradio_helper.py\")\n", + " r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/whisper-asr-genai/gradio_helper.py\")\n", " open(\"gradio_helper.py\", \"w\").write(r.text)\n", "\n", "from gradio_helper import make_demo, GradioPipeline\n", diff --git a/notebooks/whisper-subtitles-generation/README.md b/notebooks/whisper-subtitles-generation/README.md index 941b949c3b8..02a16de1834 100644 --- a/notebooks/whisper-subtitles-generation/README.md +++ b/notebooks/whisper-subtitles-generation/README.md @@ -1,20 +1,20 @@ -# Video Subtitle Generation with OpenAI Whisper +# Video Subtitle Generation with OpenAI Whisper and OpenVINO Generate API [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb) [Whisper](https://openai.com/index/whisper/) is a general-purpose speech recognition model from [OpenAI](https://openai.com). The model is able to almost flawlessly transcribe speech across dozens of languages and even handle poor audio quality or excessive background noise. -This notebook will run the model with OpenVINO to generate transcription of a video. +This notebook will run the model with OpenVINO Generate API to generate transcription of a video. ## Notebook Contents This notebook demonstrates how to generate video subtitles using the open-source Whisper model. Whisper is an automatic speech recognition (ASR) system trained on 680,000 hours of multilingual and multitask supervised data collected from the web. It is a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification. 
You can find more information about this model in the [research paper](https://cdn.openai.com/papers/whisper.pdf), [OpenAI blog](https://openai.com/index/whisper/), [model card](https://github.com/openai/whisper/blob/main/model-card.md) and GitHub [repository](https://github.com/openai/whisper). -This folder contains notebook that show how to convert and quantize model with OpenVINO. We will use [NNCF](https://github.com/openvinotoolkit/nncf) improving model performance by INT8 quantization. +This folder contains notebook that show how to convert and quantize model with OpenVINO and run pipeline with [Generate API](https://github.com/openvinotoolkit/openvino.genai). We will use [NNCF](https://github.com/openvinotoolkit/nncf) improving model performance by INT8 quantization. The notebook contains the following steps: 1. Download the model. 2. Instantiate original PyTorch model pipeline. 3. Convert model to OpenVINO IR, using model conversion API. -4. Run the Whisper pipeline with OpenVINO. +4. Run the Whisper pipeline with OpenVINO Generate API. 5. Quantize the OpenVINO model with NNCF. 6. Check quantized model result for the demo video. 7. Compare model size, performance and accuracy of FP32 and quantized INT8 models. @@ -37,8 +37,8 @@ The second notebook will guide you through steps of : ## Installation Instructions -This is a self-contained example that relies solely on its own code.
-We recommend running the notebook in a virtual environment. You only need a Jupyter server to start. +This example requires `ffmpeg` to be installed. All other required dependencies will be installed by the notebook itself.
+We recommend running the notebook in a virtual environment. You only need a Jupyter server to start. For details, please refer to [Installation Guide](../../README.md). diff --git a/notebooks/whisper-subtitles-generation/gradio_helper.py b/notebooks/whisper-subtitles-generation/gradio_helper.py index 9edac14e3e2..4a658202fd5 100644 --- a/notebooks/whisper-subtitles-generation/gradio_helper.py +++ b/notebooks/whisper-subtitles-generation/gradio_helper.py @@ -1,12 +1,29 @@ from typing import Callable +from pathlib import Path import gradio as gr -def make_demo(fn: Callable, quantized: bool): +def make_demo(fn: Callable, quantized: bool, sample_path: Path): demo = gr.Interface( + description=f""" +
+
+

+ OpenVINO Generate API Whisper demo {'with quantized model.' if quantized else ''} +

+
+ If you use a video longer than 30 seconds, please note that max_length will be increased. You can also update it using generation_config. +
+
+
+ """, fn=fn, inputs=[ - gr.Textbox(label="YouTube URL"), + gr.Video(label="Video"), gr.Radio(["Transcribe", "Translate"], value="Transcribe"), gr.Checkbox( value=quantized, @@ -15,7 +32,7 @@ def make_demo(fn: Callable, quantized: bool): ), ], outputs="video", - examples=[["https://youtu.be/kgL5LBM-hFI", "Transcribe"]], + examples=[[sample_path, "Transcribe"]], allow_flagging="never", ) diff --git a/notebooks/whisper-subtitles-generation/utils.py b/notebooks/whisper-subtitles-generation/utils.py deleted file mode 100644 index ff43b5b5eb0..00000000000 --- a/notebooks/whisper-subtitles-generation/utils.py +++ /dev/null @@ -1,323 +0,0 @@ -from collections import namedtuple -from functools import partial -import openvino as ov -from pathlib import Path -from typing import List, Optional, Union -from math import floor, ceil - -import io -from scipy.io import wavfile -from moviepy.editor import VideoFileClip - -import numpy as np -import torch - -from whisper.decoding import DecodingTask, Inference, DecodingOptions, DecodingResult - - -class OpenVINOAudioEncoder(torch.nn.Module): - """ - Helper for inference Whisper encoder model with OpenVINO - """ - - def __init__(self, core: ov.Core, model_path: Path, device="CPU"): - super().__init__() - self.model = core.read_model(model_path) - self.compiled_model = core.compile_model(self.model, device) - self.output_blob = self.compiled_model.output(0) - - def forward(self, mel: torch.Tensor): - """ - Inference OpenVINO whisper encoder model. - - Parameters: - mel: input audio fragment mel spectrogram. - Returns: - audio_features: torch tensor with encoded audio features. - """ - return torch.from_numpy(self.compiled_model(mel)[self.output_blob]) - - -class OpenVINOTextDecoder(torch.nn.Module): - """ - Helper for inference OpenVINO decoder model - """ - - def __init__(self, core: ov.Core, model_path: Path, device: str = "CPU"): - super().__init__() - self._core = core - self.model = core.read_model(model_path) - self._input_names = [inp.any_name for inp in self.model.inputs] - self.compiled_model = core.compile_model(self.model, device) - self.device = device - self.blocks = [] - - def init_past_inputs(self, feed_dict): - """ - Initialize cache input for first step. - - Parameters: - feed_dict: Dictonary with inputs for inference - Returns: - feed_dict: updated feed_dict - """ - beam_size = feed_dict["x"].shape[0] - audio_len = feed_dict["xa"].shape[2] - previous_seq_len = 0 - for name in self._input_names: - if name in ["x", "xa"]: - continue - feed_dict[name] = ov.Tensor(np.zeros((beam_size, previous_seq_len, audio_len), dtype=np.float32)) - return feed_dict - - def preprocess_kv_cache_inputs(self, feed_dict, kv_cache): - """ - Transform kv_cache to inputs - - Parameters: - feed_dict: dictionary with inputs for inference - kv_cache: dictionary with cached attention hidden states from previous step - Returns: - feed_dict: updated feed dictionary with additional inputs - """ - if not kv_cache: - return self.init_past_inputs(feed_dict) - for k, v in zip(self._input_names[2:], kv_cache): - feed_dict[k] = ov.Tensor(v) - return feed_dict - - def postprocess_outputs(self, outputs): - """ - Transform model output to format expected by the pipeline - - Parameters: - outputs: outputs: raw inference results. 
- Returns: - logits: decoder predicted token logits - kv_cache: cached attention hidden states - """ - logits = torch.from_numpy(outputs[0]) - kv_cache = list(outputs.values())[1:] - return logits, kv_cache - - def forward(self, x: torch.Tensor, xa: torch.Tensor, kv_cache: Optional[dict] = None): - """ - Inference decoder model. - - Parameters: - x: torch.LongTensor, shape = (batch_size, <= n_ctx) the text tokens - xa: torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx) - the encoded audio features to be attended on - kv_cache: Dict[str, torch.Tensor], attention modules hidden states cache from previous steps - Returns: - logits: decoder predicted logits - kv_cache: updated kv_cache with current step hidden states - """ - feed_dict = {"x": ov.Tensor(x.numpy()), "xa": ov.Tensor(xa.numpy())} - feed_dict = self.preprocess_kv_cache_inputs(feed_dict, kv_cache) - res = self.compiled_model(feed_dict) - return self.postprocess_outputs(res) - - -class OpenVINOInference(Inference): - """ - Wrapper for inference interface - """ - - def __init__(self, model: "Whisper", initial_token_length: int): - self.model: "Whisper" = model - self.initial_token_length = initial_token_length - self.kv_cache = {} - - def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor) -> torch.Tensor: - """ - getting logits for given tokens sequence and audio features and save kv_cache - - Parameters: - tokens: input tokens - audio_features: input audio features - Returns: - logits: predicted by decoder logits - """ - if tokens.shape[-1] > self.initial_token_length: - # only need to use the last token except in the first forward pass - tokens = tokens[:, -1:] - logits, self.kv_cache = self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache) - return logits - - def cleanup_caching(self): - """ - Reset kv_cache to initial state - """ - self.kv_cache = {} - - def rearrange_kv_cache(self, source_indices): - """ - Update hidden states cache for selected sequences - Parameters: - source_indicies: sequences indicies - Returns: - None - """ - for module, tensor in self.kv_cache.items(): - # update the key/value cache to contain the selected sequences - self.kv_cache[module] = tensor[source_indices].detach() - - -class OpenVINODecodingTask(DecodingTask): - """ - Class for decoding using OpenVINO - """ - - def __init__(self, model: "Whisper", options: DecodingOptions): - super().__init__(model, options) - self.inference = OpenVINOInference(model, len(self.initial_tokens)) - - -def patch_whisper_for_ov_inference(model): - @torch.no_grad() - def decode( - model: "Whisper", - mel: torch.Tensor, - options: DecodingOptions = DecodingOptions(), - ) -> Union[DecodingResult, List[DecodingResult]]: - """ - Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s). 
- - Parameters - ---------- - model: Whisper - the Whisper model instance - - mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000) - A tensor containing the Mel spectrogram(s) - - options: DecodingOptions - A dataclass that contains all necessary options for decoding 30-second segments - - Returns - ------- - result: Union[DecodingResult, List[DecodingResult]] - The result(s) of decoding contained in `DecodingResult` dataclass instance(s) - """ - single = mel.ndim == 2 - if single: - mel = mel.unsqueeze(0) - - result = OpenVINODecodingTask(model, options).run(mel) - - if single: - result = result[0] - - return result - - Parameter = namedtuple("Parameter", ["device"]) - - def parameters(): - return iter([Parameter(torch.device("cpu"))]) - - def logits(model, tokens: torch.Tensor, audio_features: torch.Tensor): - """ - Override for logits extraction method - Parameters: - tokens: input tokens - audio_features: input audio features - Returns: - logits: decoder predicted logits - """ - return model.decoder(tokens, audio_features, None)[0] - - model.parameters = parameters - model.decode = partial(decode, model) - model.logits = partial(logits, model) - - -def resample(audio, src_sample_rate, dst_sample_rate): - """ - Resample audio to specific sample rate - - Parameters: - audio: input audio signal - src_sample_rate: source audio sample rate - dst_sample_rate: destination audio sample rate - Returns: - resampled_audio: input audio signal resampled with dst_sample_rate - """ - if src_sample_rate == dst_sample_rate: - return audio - duration = audio.shape[0] / src_sample_rate - resampled_data = np.zeros(shape=(int(duration * dst_sample_rate)), dtype=np.float32) - x_old = np.linspace(0, duration, audio.shape[0], dtype=np.float32) - x_new = np.linspace(0, duration, resampled_data.shape[0], dtype=np.float32) - resampled_audio = np.interp(x_new, x_old, audio) - return resampled_audio.astype(np.float32) - - -def audio_to_float(audio): - """ - convert audio signal to floating point format - """ - return audio.astype(np.float32) / np.iinfo(audio.dtype).max - - -def get_audio(video_file): - """ - Extract audio signal from a given video file, then convert it to float, - then mono-channel format and resample it to the expected sample rate - - Parameters: - video_file: path to input video file - Returns: - resampled_audio: mono-channel float audio signal with 16000 Hz sample rate - extracted from video - duration: duration of video fragment in seconds - """ - input_video = VideoFileClip(str(video_file)) - duration = input_video.duration - input_video.audio.write_audiofile(video_file.stem + ".wav", verbose=False, logger=None) - input_audio_file = video_file.stem + ".wav" - sample_rate, audio = wavfile.read(io.BytesIO(open(input_audio_file, "rb").read())) - audio = audio_to_float(audio) - if audio.ndim == 2: - audio = audio.mean(axis=1) - - # The model expects mono-channel audio with a 16000 Hz sample rate, represented in floating point range. When the - # audio from the input video does not meet these requirements, we will need to apply preprocessing. 
- resampled_audio = resample(audio, sample_rate, 16000) - return resampled_audio, duration - - -def format_timestamp(seconds: float): - """ - format time in srt-file expected format - """ - assert seconds >= 0, "non-negative timestamp expected" - milliseconds = round(seconds * 1000.0) - - hours = milliseconds // 3_600_000 - milliseconds -= hours * 3_600_000 - - minutes = milliseconds // 60_000 - milliseconds -= minutes * 60_000 - - seconds = milliseconds // 1_000 - milliseconds -= seconds * 1_000 - - return (f"{hours}:" if hours > 0 else "00:") + f"{minutes:02d}:{seconds:02d},{milliseconds:03d}" - - -def prepare_srt(transcription, filter_duration=None): - """ - Format transcription into srt file format - """ - segment_lines = [] - for segment in transcription["segments"]: - if filter_duration is not None and (segment["start"] >= floor(filter_duration) or segment["end"] > ceil(filter_duration) + 1): - break - segment_lines.append(str(segment["id"] + 1) + "\n") - time_start = format_timestamp(segment["start"]) - time_end = format_timestamp(segment["end"]) - time_str = f"{time_start} --> {time_end}\n" - segment_lines.append(time_str) - segment_lines.append(segment["text"] + "\n\n") - return segment_lines diff --git a/notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb b/notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb index 0ed0c8ac67d..342e2bedce7 100644 --- a/notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb +++ b/notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb @@ -13,7 +13,7 @@ "\n", "You can find more information about this model in the [research paper](https://cdn.openai.com/papers/whisper.pdf), [OpenAI blog](https://openai.com/blog/whisper/), [model card](https://github.com/openai/whisper/blob/main/model-card.md) and GitHub [repository](https://github.com/openai/whisper).\n", "\n", - "In this notebook, we will use Whisper with OpenVINO to generate subtitles in a sample video. Additionally, we will use [NNCF](https://github.com/openvinotoolkit/nncf) improving model performance by INT8 quantization.\n", + "In this notebook, we will use Whisper model with [OpenVINO Generate API](https://github.com/openvinotoolkit/openvino.genai) for [Whisper automatic speech recognition scenarios](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/whisper_speech_recognition/README.md) to generate subtitles in a sample video. Additionally, we will use [NNCF](https://github.com/openvinotoolkit/nncf) improving model performance by INT8 quantization.\n", "Notebook contains the following steps:\n", "1. Download the model.\n", "2. 
Instantiate the PyTorch model pipeline.\n", @@ -33,10 +33,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", - "\n", "#### Table of contents:\n", "\n", "- [Prerequisites](#Prerequisites)\n", @@ -80,18 +76,35 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": { "tags": [] }, "outputs": [], "source": [ - "%pip install -q \"openvino>=2024.1.0\" \"nncf>=2.10.0\"\n", - "%pip install -q \"python-ffmpeg<=1.0.16\" moviepy \"onnx!=1.16.2\" \"git+https://github.com/huggingface/optimum-intel.git\" \"torch>=2.1\" --extra-index-url https://download.pytorch.org/whl/cpu\n", - "%pip install -q \"yt_dlp>=2024.8.6\" soundfile librosa jiwer\n", + "%pip install -q \"nncf>=2.13.0\"\n", + "%pip install -q --pre -U \"openvino\" \"openvino-tokenizers\" \"openvino-genai\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n", + "%pip install -q \"python-ffmpeg<=1.0.16\" \"ffmpeg\" \"moviepy\" \"onnx!=1.16.2\" \"git+https://github.com/huggingface/optimum-intel.git\" \"torch>=2.1\" --extra-index-url https://download.pytorch.org/whl/cpu\n", + "%pip install -q -U \"yt_dlp>=2024.8.6\" soundfile librosa jiwer\n", "%pip install -q \"gradio>=4.19\"" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from pathlib import Path\n", + "\n", + "if not Path(\"notebook_utils.py\").exists():\n", + " r = requests.get(\n", + " url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\",\n", + " )\n", + " open(\"notebook_utils.py\", \"w\").write(r.text)" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -123,12 +136,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6f1b0c699e444e7fa375cf5cc59d9a7c", + "model_id": "f12e1281744d4a769015824e4382c3c1", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model:', index=6, options=('openai/whisper-large-v3', 'openai/whisper-large-v2', 'openai…" + "Dropdown(description='Model:', index=7, options=('openai/whisper-large-v3-turbo', 'openai/whisper-large-v3', '…" ] }, "execution_count": 2, @@ -168,23 +181,16 @@ "### Convert model to OpenVINO Intermediate Representation (IR) format using Optimum-Intel.\n", "[back to top ⬆️](#Table-of-contents:)\n", "\n", - "The Hugging Face Optimum API is a high-level API that enables us to convert and quantize models from the Hugging Face Transformers library to the OpenVINO™ IR format. For more details, refer to the [Hugging Face Optimum documentation](https://huggingface.co/docs/optimum/intel/inference).\n", - "\n", - "Optimum Intel can be used to load optimized models from the [Hugging Face Hub](https://huggingface.co/docs/optimum/intel/hf.co/models) and create pipelines to run an inference with OpenVINO Runtime using Hugging Face APIs. The Optimum Inference models are API compatible with Hugging Face Transformers models. This means we just need to replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class.\n", + "Listed Whisper model are available for downloading via the [HuggingFace hub](https://huggingface.co/openai). 
We will use optimum-cli interface for exporting it into OpenVINO Intermediate Representation (IR) format.\n", "\n", - "Below is an example of the whisper-tiny model\n", + "Optimum CLI interface for converting models supports export to OpenVINO (supported starting optimum-intel 1.12 version).\n", + "General command format:\n", "\n", - "```diff\n", - "-from transformers import AutoModelForSpeechSeq2Seq\n", - "+from optimum.intel.openvino import OVModelForSpeechSeq2Seq\n", - "from transformers import AutoTokenizer, pipeline\n", - "\n", - "model_id = \"openai/whisper-tiny\"\n", - "-model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)\n", - "+model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)\n", + "```bash\n", + "optimum-cli export openvino --model --task \n", "```\n", "\n", - "Model class initialization starts with calling the `from_pretrained` method. When downloading and converting the Transformers model, the parameter `export=True` should be added. We can save the converted model for the next usage with the `save_pretrained` method. Alternatively, model conversion can be performed using Optimum-CLI interface. You can find more details about Optimum-Intel and Optimum CLI usage in this [tutorial](../hugging-face-hub/hugging-face-hub.ipynb). The command bellow illustrates how to convert whisper using optimum cli.\n" + "where `--model` argument is model id from HuggingFace Hub or local directory with model (saved using `.save_pretrained` method), `--task ` is one of [supported task](https://huggingface.co/docs/optimum/exporters/task_manager) that exported model should solve. For LLMs it will be `automatic-speech-recognition-with-past`. If model initialization requires to use remote code, `--trust-remote-code` flag additionally should be passed. Full list of supported arguments available via `--help` For more details and examples of usage, please check [optimum documentation](https://huggingface.co/docs/optimum/intel/inference#export).\n" ] }, { @@ -193,8 +199,6 @@ "metadata": {}, "outputs": [], "source": [ - "from pathlib import Path\n", - "\n", "model_dir = model_id.value.split(\"/\")[-1]\n", "\n", "if not Path(model_dir).exists():\n", @@ -214,10 +218,8 @@ "![whisper_pipeline.png](https://user-images.githubusercontent.com/29454499/204536733-1f4342f7-2328-476a-a431-cb596df69854.png)\n", "\n", "\n", - "Preprocessing and post-processing are important in this model use. `transformers.AutoProcessor` class used for initialization `WhisperProcessor` is responsible for preparing audio input data for the PyTorch model, converting it to Mel-spectrogram and decoding predicted output token_ids into string using tokenizer. Tokenizers and Processors are distributed with models also compatible with the OpenVINO model.\n", "\n", - "Like the original PyTorch model, the OpenVINO model is also compatible with HuggingFace [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline) interface for `automatic-speech-recognition`. \n", - "Pipeline can be used for long audio transcription. Distil-Whisper uses a chunked algorithm to transcribe long-form audio files. In practice, this chunked long-form algorithm is 9x faster than the sequential algorithm proposed by OpenAI in the Whisper paper. To enable chunking, pass the chunk_length_s parameter to the pipeline. For Distil-Whisper, a chunk length of 15 seconds is optimal. To activate batching, pass the argument batch_size." 
+ "To simplify user experience we will use [OpenVINO Generate API](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/whisper_speech_recognition/README.md). Firstly we will create pipeline with `WhisperPipeline`. You can construct it straight away from the folder with the converted model. It will automatically load the `model`, `tokenizer`, `detokenizer` and default `generation configuration`. " ] }, { @@ -234,36 +236,20 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "import openvino as ov\n", - "\n", - "core = ov.Core()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a37643c49958440285805210af50b2c2", + "model_id": "4edc715b0ede40168416a4514ddfeae0", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO')" + "Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -278,31 +264,20 @@ "\n", "from notebook_utils import device_widget\n", "\n", - "device = device_widget()\n", + "device = device_widget(default=\"CPU\", exclude=[\"NPU\"])\n", "\n", "device" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "from optimum.intel.openvino import OVModelForSpeechSeq2Seq\n", - "from transformers import AutoProcessor, pipeline\n", + "import openvino_genai\n", "\n", - "ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device=device.value)\n", - "\n", - "processor = AutoProcessor.from_pretrained(model_dir)\n", - "\n", - "pipe = pipeline(\n", - " \"automatic-speech-recognition\",\n", - " model=ov_model,\n", - " chunk_length_s=30,\n", - " tokenizer=processor.tokenizer,\n", - " feature_extractor=processor.feature_extractor,\n", - ")" + "ov_pipe = openvino_genai.WhisperPipeline(str(model_dir), device=device.value)" ] }, { @@ -313,70 +288,41 @@ "## Run video transcription pipeline\n", "[back to top ⬆️](#Table-of-contents:)\n", "\n", - "Now, we are ready to start transcription. We select a video from YouTube that we want to transcribe. Be patient, as downloading the video may take some time." + "Now, we are ready to start transcription. Let's load the video first." 
] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'downloaded_video.mp4' already exists.\n" + ] + }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f493dd18def948e5a50f5950b5e5c4ca", - "version_major": 2, - "version_minor": 0 - }, "text/plain": [ - "Text(value='https://youtu.be/kgL5LBM-hFI', description='Video:', placeholder='Type link for video')" + "PosixPath('/home/labuser/work/notebook/openvino_notebooks/notebooks/whisper-subtitles-generation/downloaded_video.mp4')" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import ipywidgets as widgets\n", - "\n", - "VIDEO_LINK = \"https://youtu.be/kgL5LBM-hFI\"\n", - "link = widgets.Text(\n", - " value=VIDEO_LINK,\n", - " placeholder=\"Type link for video\",\n", - " description=\"Video:\",\n", - " disabled=False,\n", - ")\n", - "\n", - "link" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading video https://youtu.be/kgL5LBM-hFI started\n", - "Video saved to downloaded_video.mp4\n" - ] - } - ], - "source": [ - "from pathlib import Path\n", - "import yt_dlp\n", - "\n", - "print(f\"Downloading video {link.value} started\")\n", + "from notebook_utils import download_file\n", "\n", "output_file = Path(\"downloaded_video.mp4\")\n", - "ydl_ops = {\"format\": \"best[ext=mp4]\", \"outtmpl\": output_file.as_posix()}\n", - "with yt_dlp.YoutubeDL(ydl_ops) as ydl:\n", - " ydl.download(link.value)\n", "\n", - "print(f\"Video saved to {output_file}\")" + "download_file(\n", + " \"https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Sheldon%20Cooper%20Jim%20Parsons%20at%20Intels%20Lab.mp4\",\n", + " filename=output_file.name,\n", + ")" ] }, { @@ -392,13 +338,13 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "692ad019edab4ffc81a9cf2808e50d36", + "model_id": "de3b9c7002264661a855fa84c3411de7", "version_major": 2, "version_minor": 0 }, @@ -406,7 +352,7 @@ "Select(description='Select task:', index=1, options=('transcribe', 'translate'), value='translate')" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -423,7 +369,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -449,27 +395,36 @@ " input_video.audio.write_audiofile(audio_file, verbose=False, logger=None)\n", " with open(audio_file, \"rb\") as f:\n", " inputs = f.read()\n", - " audio = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)\n", + " audio = ffmpeg_read(inputs, 16000)\n", " return {\n", " \"raw\": audio,\n", - " \"sampling_rate\": pipe.feature_extractor.sampling_rate,\n", + " \"sampling_rate\": 16000,\n", " }, duration" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's run generation method. We will put input data as `np array`. Also we will specify `task` and `return_timestamps=True` options. If task is `translate`, you can place `language` option, for example `<|fr|>` for French or it would be detect automatically. We can set up generation parameters in different ways. 
We can get default config with `get_generation_config()`, setup parameters and put config directly to `generate()`. It's also possible to specify the needed options just as inputs in the `generate()` method and we will use this way. Then we just run `generate` method and get the output in text format.\n", + "\n", + "`generate` method with `return_timestamps` set to `True` will return `chunks`, which contain attributes: `text`, `start_ts` and `end_ts`" + ] + }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inputs, duration = get_audio(output_file)\n", "\n", - "transcription = pipe(inputs, generate_kwargs={\"task\": task.value}, return_timestamps=True)[\"chunks\"]" + "transcription = ov_pipe.generate(inputs[\"raw\"], task=task.value, return_timestamps=True).chunks" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -501,18 +456,19 @@ " \"\"\"\n", " segment_lines = []\n", " for idx, segment in enumerate(transcription):\n", + " timestamp = (segment.start_ts, segment.end_ts)\n", " # for the case where the model could not predict an ending timestamp, which can happen if audio is cut off in the middle of a word.\n", - " if segment[\"timestamp\"][1] is None:\n", - " segment[\"timestamp\"] = (segment[\"timestamp\"][0], filter_duration)\n", + " if segment.end_ts == -1:\n", + " timestamp[1] = filter_duration\n", "\n", - " if filter_duration is not None and (segment[\"timestamp\"][0] >= math.floor(filter_duration) or segment[\"timestamp\"][1] > math.ceil(filter_duration) + 1):\n", + " if filter_duration is not None and (timestamp[0] >= math.floor(filter_duration) or timestamp[1] > math.ceil(filter_duration) + 1):\n", " break\n", " segment_lines.append(str(idx + 1) + \"\\n\")\n", - " time_start = format_timestamp(segment[\"timestamp\"][0])\n", - " time_end = format_timestamp(segment[\"timestamp\"][1])\n", + " time_start = format_timestamp(timestamp[0])\n", + " time_end = format_timestamp(timestamp[1])\n", " time_str = f\"{time_start} --> {time_end}\\n\"\n", " segment_lines.append(time_str)\n", - " segment_lines.append(segment[\"text\"] + \"\\n\\n\")\n", + " segment_lines.append(segment.text + \"\\n\\n\")\n", " return segment_lines" ] }, @@ -526,7 +482,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -546,21 +502,21 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8bcb6af81fc14c9a9e3e003b9a2a6e0f", + "model_id": "b27923d770014dde84ce789a0a6042ec", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Video(value=b\"\\x00\\x00\\x00\\x18ftypmp42\\x00\\x00\\x00\\x00isommp42\\x00\\x00:'moov\\x00\\x00\\x00lmvhd...\", height='800…" + "Video(value=b'\\x00\\x00\\x00\\x18ftypmp42\\x00\\x00\\x00\\x00isommp42\\x00\\x00Aimoov\\x00\\x00\\x00lmvhd...', height='800…" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -571,7 +527,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": { "tags": [] }, @@ -647,13 +603,13 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a21bfe2e9278413a9d1adaf94ac7388a", + "model_id": 
"14d51e7bb1104de1862ccb72adf305ce", "version_major": 2, "version_minor": 0 }, @@ -661,7 +617,7 @@ "Checkbox(value=True, description='Quantization')" ] }, - "execution_count": 16, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -678,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -695,6 +651,42 @@ "%load_ext skip_kernel_extension" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's load converted OpenVINO model format using Optimum-Intel to easily quantize it.\n", + "\n", + "Optimum Intel can be used to load optimized models from the [Hugging Face Hub](https://huggingface.co/docs/optimum/intel/hf.co/models) or local folder to create pipelines to run an inference with OpenVINO Runtime using Hugging Face APIs. The Optimum Inference models are API compatible with Hugging Face Transformers models. This means we just need to replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class.\n", + "\n", + "Below is an example of the whisper-tiny model\n", + "\n", + "```diff\n", + "-from transformers import AutoModelForSpeechSeq2Seq\n", + "+from optimum.intel.openvino import OVModelForSpeechSeq2Seq\n", + "from transformers import AutoTokenizer, pipeline\n", + "\n", + "model_id = \"openai/whisper-tiny\"\n", + "-model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)\n", + "+model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)\n", + "```\n", + "\n", + "Like the original PyTorch model, the OpenVINO model is also compatible with HuggingFace [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline) interface for `automatic-speech-recognition`." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoProcessor\n", + "from optimum.intel.openvino import OVModelForSpeechSeq2Seq\n", + "\n", + "ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device=device.value)\n", + "processor = AutoProcessor.from_pretrained(model_dir)" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -708,13 +700,16 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "%%skip not $to_quantize.value\n", "\n", "from itertools import islice\n", + "from tqdm.notebook import tqdm\n", + "from datasets import load_dataset\n", + "from transformers import pipeline\n", "from optimum.intel.openvino.quantization import InferRequestWrapper\n", "\n", "\n", @@ -758,387 +753,25 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "148b4131e11f4363bcebfc0c78ed13df", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Collecting calibration data: 0%| | 0/50 [00:00\n" - ], - "text/plain": [] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
-    [removed notebook outputs for this cell: nncf calibration and quantization progress widgets (`Output()` and `<pre>` HTML progress bars), "INFO:nncf: ... ignored nodes were found by name in the NNCFGraph" messages, and the "Quantizing decoder with past" log]
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Compiling the encoder to AUTO ...\n", - "Compiling the decoder to AUTO ...\n", - "Compiling the decoder to AUTO ...\n" - ] - } - ], + "outputs": [], "source": [ "%%skip not $to_quantize.value\n", "\n", "import gc\n", "import shutil\n", "import nncf\n", - "from datasets import load_dataset\n", - "from tqdm.notebook import tqdm\n", - "\n", - "def extract_input_features(sample):\n", - " input_features = processor(\n", - " sample[\"audio\"][\"array\"],\n", - " sampling_rate=sample[\"audio\"][\"sampling_rate\"],\n", - " return_tensors=\"pt\",\n", - " ).input_features\n", - " return input_features\n", - "\n", + "import openvino as ov\n", "\n", "\n", - "CALIBRATION_DATASET_SIZE = 50\n", + "CALIBRATION_DATASET_SIZE = 30\n", "quantized_model_path = Path(f\"{model_dir}_quantized\")\n", "\n", "\n", "def quantize(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):\n", " if not quantized_model_path.exists():\n", - " encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(\n", - " ov_model, calibration_dataset_size\n", - " )\n", + " encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(ov_model, calibration_dataset_size)\n", " print(\"Quantizing encoder\")\n", " quantized_encoder = nncf.quantize(\n", " ov_model.encoder.model,\n", @@ -1146,7 +779,7 @@ " subset_size=len(encoder_calibration_data),\n", " model_type=nncf.ModelType.TRANSFORMER,\n", " # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n", - " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.50)\n", + " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.80),\n", " )\n", " ov.save_model(quantized_encoder, quantized_model_path / \"openvino_encoder_model.xml\")\n", " del quantized_encoder\n", @@ -1160,7 +793,7 @@ " subset_size=len(decoder_calibration_data),\n", " model_type=nncf.ModelType.TRANSFORMER,\n", " # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n", - " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.96)\n", + " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.96),\n", " )\n", " ov.save_model(quantized_decoder_with_past, quantized_model_path / \"openvino_decoder_with_past_model.xml\")\n", " del quantized_decoder_with_past\n", @@ -1173,14 +806,24 @@ " shutil.copy(model_path / \"generation_config.json\", quantized_model_path / \"generation_config.json\")\n", " shutil.copy(model_path / \"openvino_decoder_model.xml\", quantized_model_path / \"openvino_decoder_model.xml\")\n", " shutil.copy(model_path / \"openvino_decoder_model.bin\", quantized_model_path / \"openvino_decoder_model.bin\")\n", - "\n", - " quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_model_path, compile=False)\n", - " quantized_ov_model.to(device.value)\n", - " quantized_ov_model.compile()\n", - " return quantized_ov_model\n", - "\n", - "\n", - "ov_quantized_model = quantize(ov_model, CALIBRATION_DATASET_SIZE)" + " shutil.copy(model_path / \"openvino_tokenizer.xml\", quantized_model_path / \"openvino_tokenizer.xml\")\n", + " shutil.copy(model_path / \"openvino_tokenizer.bin\", quantized_model_path / \"openvino_tokenizer.bin\")\n", + " shutil.copy(model_path / 
\"openvino_detokenizer.xml\", quantized_model_path / \"openvino_detokenizer.xml\")\n", + " shutil.copy(model_path / \"openvino_detokenizer.bin\", quantized_model_path / \"openvino_detokenizer.bin\")\n", + " shutil.copy(model_path / \"tokenizer_config.json\", quantized_model_path / \"tokenizer_config.json\")\n", + " shutil.copy(model_path / \"tokenizer.json\", quantized_model_path / \"tokenizer.json\")\n", + " shutil.copy(model_path / \"vocab.json\", quantized_model_path / \"vocab.json\")\n", + " shutil.copy(model_path / \"preprocessor_config.json\", quantized_model_path / \"preprocessor_config.json\")\n", + " shutil.copy(model_path / \"special_tokens_map.json\", quantized_model_path / \"special_tokens_map.json\")\n", + " shutil.copy(model_path / \"normalizer.json\", quantized_model_path / \"normalizer.json\")\n", + " shutil.copy(model_path / \"merges.txt\", quantized_model_path / \"merges.txt\")\n", + " shutil.copy(model_path / \"added_tokens.json\", quantized_model_path / \"added_tokens.json\")\n", + "\n", + " quantized_ov_pipe = openvino_genai.WhisperPipeline(str(quantized_model_path), device=device.value)\n", + " return quantized_ov_pipe\n", + "\n", + "\n", + "quantized_ov_pipe = quantize(ov_model, CALIBRATION_DATASET_SIZE)" ] }, { @@ -1196,64 +839,13 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\n", - "00:00:00,000 --> 00:00:05,000\n", - " What's that?\n", - "\n", - "2\n", - "00:00:05,000 --> 00:00:07,000\n", - " Oh, wow.\n", - "\n", - "3\n", - "00:00:09,000 --> 00:00:11,000\n", - " Hello humans.\n", - "\n", - "4\n", - "00:00:14,000 --> 00:00:15,000\n", - " Focus on me.\n", - "\n", - "5\n", - "00:00:15,000 --> 00:00:16,000\n", - " Focus on the guard.\n", - "\n", - "6\n", - "00:00:18,000 --> 00:00:20,000\n", - " Don't tell anyone what you're seen in here.\n", - "\n", - "7\n", - "00:00:22,000 --> 00:00:24,000\n", - " Have you seen what's in there?\n", - "\n", - "8\n", - "00:00:24,000 --> 00:00:25,000\n", - " They have intel.\n", - "\n", - "9\n", - "00:00:25,000 --> 00:00:27,000\n", - " This is where it all changes.\n", - "\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "if ov_quantized_model is not None:\n", - " int8_pipe = pipeline(\n", - " \"automatic-speech-recognition\",\n", - " model=ov_quantized_model,\n", - " chunk_length_s=30,\n", - " tokenizer=processor.tokenizer,\n", - " feature_extractor=processor.feature_extractor,\n", - " )\n", " inputs, duration = get_audio(output_file)\n", - " transcription = int8_pipe(inputs, generate_kwargs={\"task\": task.value}, return_timestamps=True)[\"chunks\"]\n", + " transcription = quantized_ov_pipe.generate(inputs[\"raw\"], task=task.value, return_timestamps=True).chunks\n", " srt_lines = prepare_srt(transcription, filter_duration=duration)\n", " print(\"\".join(srt_lines))\n", " widgets.Video.from_file(output_file, loop=False, width=800, height=800)" @@ -1269,20 +861,18 @@ "\n", "Finally, we compare original and quantized Whisper models from accuracy and performance stand-points.\n", "\n", - "To measure accuracy, we use `1 - WER` as a metric, where WER stands for Word Error Rate.\n", - "\n", - "When measuring inference time, we do it separately for encoder and decoder-with-past model forwards, and for the whole model inference too." + "To measure accuracy, we use `1 - WER` as a metric, where WER stands for Word Error Rate." 
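For reference, a minimal sketch of the `1 - WER` word-accuracy metric described in the markdown cell above, using the same `jiwer` helpers the evaluation cell imports; the reference/hypothesis strings below are made up for illustration only:

```python
from jiwer import wer, wer_standardize

# Hypothetical transcripts; the notebook fills these lists from LibriSpeech validation samples.
ground_truths = ["the quick brown fox jumps over the lazy dog"]
predictions = ["the quick brown fox jumped over a lazy dog"]

# wer() computes Word Error Rate; wer_standardize normalizes both sides
# (lower-casing, whitespace cleanup) so formatting differences are not counted as errors.
error_rate = wer(
    ground_truths,
    predictions,
    reference_transform=wer_standardize,
    hypothesis_transform=wer_standardize,
)
word_accuracy = (1 - error_rate) * 100
print(f"Word accuracy: {word_accuracy:.2f}%")
```

The notebook reports this percentage for both the original and the quantized pipeline and prints their difference as the accuracy drop.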
] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a2822b0dcd584fd2aa28e01c607926d0", + "model_id": "d748cc7a3f394026907c0ef3780d58a8", "version_major": 2, "version_minor": 0 }, @@ -1296,7 +886,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3dc0e232e81c4de0ad82737f98f69d2a", + "model_id": "c2377a7e2dd840328913bc77e6f51ca9", "version_major": 2, "version_minor": 0 }, @@ -1311,11 +901,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Encoder performance speedup: 1.352\n", - "Decoder with past performance speedup: 1.342\n", - "Whole pipeline performance speedup: 1.350\n", - "Whisper transcription word accuracy. Original model: 81.67%. Quantized model: 83.67%.\n", - "Accuracy drop: -1.99%.\n" + "Whole pipeline performance speedup: 1.452\n", + "Whisper transcription word accuracy. Original model: 81.77%. Quantized model: 82.97%.\n", + "Accuracy drop: -1.20%.\n" ] } ], @@ -1326,69 +914,34 @@ "from contextlib import contextmanager\n", "from jiwer import wer, wer_standardize\n", "\n", - "\n", "TEST_DATASET_SIZE = 50\n", - "MEASURE_TIME = False\n", - "\n", - "@contextmanager\n", - "def time_measurement():\n", - " global MEASURE_TIME\n", - " try:\n", - " MEASURE_TIME = True\n", - " yield\n", - " finally:\n", - " MEASURE_TIME = False\n", - "\n", - "def time_fn(obj, fn_name, time_list):\n", - " original_fn = getattr(obj, fn_name)\n", - "\n", - " def wrapper(*args, **kwargs):\n", - " if not MEASURE_TIME:\n", - " return original_fn(*args, **kwargs)\n", - " start_time = time.perf_counter()\n", - " result = original_fn(*args, **kwargs)\n", - " end_time = time.perf_counter()\n", - " time_list.append(end_time - start_time)\n", - " return result\n", - "\n", - " setattr(obj, fn_name, wrapper)\n", "\n", "def calculate_transcription_time_and_accuracy(ov_model, test_samples):\n", - " encoder_infer_times = []\n", - " decoder_with_past_infer_times = []\n", " whole_infer_times = []\n", - " time_fn(ov_model, \"generate\", whole_infer_times)\n", - " time_fn(ov_model.encoder, \"forward\", encoder_infer_times)\n", - " time_fn(ov_model.decoder_with_past, \"forward\", decoder_with_past_infer_times)\n", "\n", " ground_truths = []\n", " predictions = []\n", " for data_item in tqdm(test_samples, desc=\"Measuring performance and accuracy\"):\n", - " input_features = extract_input_features(data_item)\n", - "\n", - " with time_measurement():\n", - " predicted_ids = ov_model.generate(input_features)\n", - " transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n", + " start_time = time.perf_counter()\n", + " transcription = ov_model.generate(data_item[\"audio\"][\"array\"], return_timestamps=True)\n", + " end_time = time.perf_counter()\n", + " whole_infer_times.append(end_time - start_time)\n", "\n", " ground_truths.append(data_item[\"text\"])\n", - " predictions.append(transcription[0])\n", + " predictions.append(transcription.texts[0])\n", "\n", " word_accuracy = (1 - wer(ground_truths, predictions, reference_transform=wer_standardize,\n", " hypothesis_transform=wer_standardize)) * 100\n", " mean_whole_infer_time = sum(whole_infer_times)\n", - " mean_encoder_infer_time = sum(encoder_infer_times)\n", - " mean_decoder_with_time_infer_time = sum(decoder_with_past_infer_times)\n", - " return word_accuracy, (mean_whole_infer_time, mean_encoder_infer_time, mean_decoder_with_time_infer_time)\n", + " return word_accuracy, 
mean_whole_infer_time\n", "\n", "test_dataset = load_dataset(\"openslr/librispeech_asr\", \"clean\", split=\"validation\", streaming=True, trust_remote_code=True)\n", "test_dataset = test_dataset.shuffle(seed=42).take(TEST_DATASET_SIZE)\n", "test_samples = [sample for sample in test_dataset]\n", "\n", - "accuracy_original, times_original = calculate_transcription_time_and_accuracy(ov_model, test_samples)\n", - "accuracy_quantized, times_quantized = calculate_transcription_time_and_accuracy(ov_quantized_model, test_samples)\n", - "print(f\"Encoder performance speedup: {times_original[1] / times_quantized[1]:.3f}\")\n", - "print(f\"Decoder with past performance speedup: {times_original[2] / times_quantized[2]:.3f}\")\n", - "print(f\"Whole pipeline performance speedup: {times_original[0] / times_quantized[0]:.3f}\")\n", + "accuracy_original, times_original = calculate_transcription_time_and_accuracy(ov_pipe, test_samples)\n", + "accuracy_quantized, times_quantized = calculate_transcription_time_and_accuracy(quantized_ov_pipe, test_samples)\n", + "print(f\"Whole pipeline performance speedup: {times_original / times_quantized:.3f}\")\n", "print(f\"Whisper transcription word accuracy. Original model: {accuracy_original:.2f}%. Quantized model: {accuracy_quantized:.2f}%.\")\n", "print(f\"Accuracy drop: {accuracy_original - accuracy_quantized:.2f}%.\")" ] @@ -1406,6 +959,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "scrolled": true, "test_replace": { " demo.launch(debug=True)": " demo.launch()", " demo.launch(share=True, debug=True)": " demo.launch(share=True)" @@ -1413,18 +967,26 @@ }, "outputs": [], "source": [ - "def transcribe(url, task, use_int8):\n", - " output_file = Path(\"downloaded_video.mp4\")\n", - " ydl_ops = {\"format\": \"best[ext=mp4]\", \"outtmpl\": output_file.as_posix()}\n", - " with yt_dlp.YoutubeDL(ydl_ops) as ydl:\n", - " ydl.download(link.value)\n", - " inputs, duration = get_audio(output_file)\n", - " m_pipe = int8_pipe if use_int8 else pipe\n", - " transcription = m_pipe(inputs, generate_kwargs={\"task\": task.lower()}, return_timestamps=True)[\"chunks\"]\n", + "def_config = ov_pipe.get_generation_config()\n", + "\n", + "\n", + "def transcribe(video_path, task, use_int8):\n", + " data_path = Path(video_path)\n", + " inputs, duration = get_audio(data_path)\n", + " m_pipe = quantized_ov_pipe if use_int8 else ov_pipe\n", + "\n", + " frame_num = len(inputs[\"raw\"]) / 16000\n", + " if frame_num > 30:\n", + " config = ov_pipe.get_generation_config()\n", + " chink_num = math.ceil(frame_num / 30)\n", + " config.max_length = chink_num * def_config.max_length\n", + " m_pipe.set_generation_config(config)\n", + "\n", + " transcription = m_pipe.generate(inputs[\"raw\"], task=task.lower(), return_timestamps=True).chunks\n", " srt_lines = prepare_srt(transcription, duration)\n", - " with output_file.with_suffix(\".srt\").open(\"w\") as f:\n", + " with data_path.with_suffix(\".srt\").open(\"w\") as f:\n", " f.writelines(srt_lines)\n", - " return [str(output_file), str(output_file.with_suffix(\".srt\"))]\n", + " return [str(data_path), str(data_path.with_suffix(\".srt\"))]\n", "\n", "\n", "if not Path(\"gradio_helper.py\").exists():\n", @@ -1433,7 +995,7 @@ "\n", "from gradio_helper import make_demo\n", "\n", - "demo = make_demo(fn=transcribe, quantized=ov_quantized_model is not None)\n", + "demo = make_demo(fn=transcribe, quantized=ov_quantized_model is not None, sample_path=output_file)\n", "\n", "try:\n", " demo.launch(debug=True)\n", @@ -1447,7 +1009,7 @@ ], 
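As an aside, a minimal sketch of the generation-config scaling that the `transcribe` helper above applies to clips longer than 30 seconds. It assumes `pipe` is the `openvino_genai.WhisperPipeline` built earlier and `raw_audio` is 16 kHz mono PCM; `transcribe_long` is a hypothetical name used only for this illustration:

```python
import math

def transcribe_long(pipe, raw_audio, sampling_rate=16000, chunk_seconds=30):
    # Hypothetical helper mirroring the notebook's transcribe(): Whisper works on
    # 30-second windows, so for longer clips max_length is scaled by the number of
    # windows to let the decoder emit enough tokens for the whole recording.
    default_config = pipe.get_generation_config()
    duration_s = len(raw_audio) / sampling_rate
    if duration_s > chunk_seconds:
        config = pipe.get_generation_config()
        config.max_length = math.ceil(duration_s / chunk_seconds) * default_config.max_length
        pipe.set_generation_config(config)
    # Each returned chunk carries .text, .start_ts and .end_ts, which prepare_srt() turns into subtitles.
    return pipe.generate(raw_audio, return_timestamps=True).chunks
```

For example, a 95-second clip yields ceil(95 / 30) = 4 windows, so `max_length` is quadrupled relative to the default before generation.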
"metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1461,7 +1023,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" }, "openvino_notebooks": { "imageUrl": "https://user-images.githubusercontent.com/29454499/204548693-1304ef33-c790-490d-8a8b-d5766acb6254.png",