DataForScience
diff --git a/‎1. Generative AI.ipynb‎
Lines changed: 2978 additions & 0 deletions b/‎1. Generative AI.ipynb‎
Lines changed: 2978 additions & 0 deletions
diff --git a/‎2. Prompt Engineering.ipynb‎
Lines changed: 1097 additions & 0 deletions b/‎2. Prompt Engineering.ipynb‎
Lines changed: 1097 additions & 0 deletions
diff --git a/‎3. NLP with HuggingFace.ipynb‎
Lines changed: 1945 additions & 0 deletions b/‎3. NLP with HuggingFace.ipynb‎
Lines changed: 1945 additions & 0 deletions
diff --git a/‎4. Whisper.ipynb‎
Lines changed: 367 additions & 0 deletions b/‎4. Whisper.ipynb‎
Lines changed: 367 additions & 0 deletions
@@ -0,0 +1,367 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1fb4b605",
+   "metadata": {},
+   "source": [
+    "<div style=\"width: 100%; overflow: hidden;\">\n",
+    "    <div style=\"width: 150px; float: left;\"> <img src=\"data/D4Sci_logo_ball.png\" alt=\"Data For Science, Inc\" align=\"left\" border=\"0\" width=150px> </div>\n",
+    "    <div style=\"float: left; margin-left: 10px;\"> <h1>LLMs for Data Science</h1>\n",
+    "    <h1>Speech to Text and Text to Speech with OpenAI</h1>\n",
+    "        <p>Bruno Gonçalves<br/>\n",
+    "        <a href=\"http://www.data4sci.com/\">www.data4sci.com</a><br/>\n",
+    "            @bgoncalves, @data4sci</p></div>\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "6cf47f33",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The watermark extension is already loaded. To reload it, use:\n",
+      "  %reload_ext watermark\n"
+     ]
+    }
+   ],
+   "source": [
+    "from collections import Counter, defaultdict\n",
+    "import random\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "import matplotlib\n",
+    "import matplotlib.pyplot as plt \n",
+    "\n",
+    "import openai\n",
+    "from openai import OpenAI\n",
+    "\n",
+    "import tqdm as tq\n",
+    "from tqdm.notebook import tqdm\n",
+    "\n",
+    "import watermark\n",
+    "\n",
+    "%load_ext watermark\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d27ac09b",
+   "metadata": {},
+   "source": [
+    "We start by printing out the versions of the libraries we're using, for future reference."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "acecf310",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Python implementation: CPython\n",
+      "Python version       : 3.13.3\n",
+      "IPython version      : 9.7.0\n",
+      "\n",
+      "Compiler    : Clang 17.0.0 (clang-1700.0.13.3)\n",
+      "OS          : Darwin\n",
+      "Release     : 25.1.0\n",
+      "Machine     : arm64\n",
+      "Processor   : arm\n",
+      "CPU cores   : 16\n",
+      "Architecture: 64bit\n",
+      "\n",
+      "Git hash: 9c0f005b405008a296ca6d92ba14b5bde219449c\n",
+      "\n",
+      "watermark : 2.5.0\n",
+      "pandas    : 2.3.3\n",
+      "openai    : 2.8.0\n",
+      "tqdm      : 4.67.1\n",
+      "numpy     : 2.3.5\n",
+      "matplotlib: 3.10.7\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "%watermark -n -v -m -g -iv"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "94cf9da3",
+   "metadata": {},
+   "source": [
+    "Load default figure style"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "03cc2e1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.style.use('d4sci.mplstyle')\n",
+    "colors = plt.rcParams['axes.prop_cycle'].by_key()['color']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7309e242",
+   "metadata": {},
+   "source": [
+    "# Audio to Text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "ae0313d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = OpenAI()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "49713958",
+   "metadata": {},
+   "source": [
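+    "Let us transcribe a small local audio file. We first open it in the system's default player (the `open` command used below is macOS-specific) and then send it to the transcription model."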
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "81fc83ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!open data/gettysburg10.wav"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "c126e84b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 10.9 ms, sys: 6.11 ms, total: 17.1 ms\n",
+      "Wall time: 2.3 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# Open the audio file in a context manager so the handle is closed as soon as\n",
+    "# the request finishes, instead of leaking for the lifetime of the kernel.\n",
+    "with open(\"data/gettysburg10.wav\", \"rb\") as audio_file:\n",
+    "    # response_format=\"text\" asks for the transcript as plain text;\n",
+    "    # language=\"en\" tells the model the audio is in English.\n",
+    "    transcription = client.audio.transcriptions.create(\n",
+    "        model=\"gpt-4o-transcribe\",\n",
+    "        file=audio_file,\n",
+    "        response_format=\"text\",\n",
+    "        language=\"en\",\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d7016614",
+   "metadata": {},
+   "source": [
+    "And the transcript is simply:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "55d4c9ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Four score and seven years ago our fathers brought forth on this continent a new nation, conceived in liberty, and dedicated to the proposition that all men are created equal.\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(transcription)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b309b6c3-9601-4490-9b40-678fd7054edf",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "98b9aab5",
+   "metadata": {},
+   "source": [
+    "# Text to Speech"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fce51d44",
+   "metadata": {},
+   "source": [
+    "Now we take the opposite approach, going from written text to high-quality audio."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "6db6fd86",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "quote = \"\"\"\n",
+    "Scientists have calculated that the chances of something so patently absurd \n",
+    "actually existing are millions to one.\n",
+    "But magicians have calculated that million-to-one chances crop up nine times out of ten.\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15217f9b",
+   "metadata": {},
+   "source": [
+    "You can learn more about text to speech (and sample the various voices) in the [official documentation](https://platform.openai.com/docs/guides/text-to-speech/quickstart)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "6cbb88b3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 24.1 ms, sys: 17.7 ms, total: 41.9 ms\n",
+      "Wall time: 2.46 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "audio = client.audio.speech.create(\n",
+    "    input=quote, \n",
+    "    model=\"gpt-4o-mini-tts\", \n",
+    "    voice='fable',\n",
+    "    response_format='mp3')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e12f1247",
+   "metadata": {},
+   "source": [
+    "The resulting audio can be written directly to a file:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "a55a56b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "audio.write_to_file('data/pratchett.mp3')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "717a1f60",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!open data/pratchett.mp3"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "36364452",
+   "metadata": {},
+   "source": [
+    "<center>\n",
+    "     <img src=\"data/D4Sci_logo_full.png\" alt=\"Data For Science, Inc\" align=\"center\" border=\"0\" width=300px> \n",
+    "</center>"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.3"
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}