Skip to content

Commit 8ebbeb6

Browse files
authored
feat: Running notebook in Colab (#28)
1 parent 3c7e0a0 commit 8ebbeb6

File tree

1 file changed

+80
-17
lines changed

1 file changed

+80
-17
lines changed

notebooks/generate.ipynb

Lines changed: 80 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,42 @@
33
{
44
"cell_type": "code",
55
"execution_count": null,
6-
"metadata": {},
6+
"metadata": {
7+
"id": "3q1p1MKYxZei"
8+
},
9+
"outputs": [],
10+
"source": [
11+
"# Uncomment the following to work around an occasional bug in Colab:\n",
12+
"# \"A UTF-8 locale is required. Got ANSI_X3.4-1968\"\n",
13+
"# import locale\n",
14+
"# locale.getpreferredencoding = lambda: \"UTF-8\""
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": null,
20+
"metadata": {
21+
"id": "YE9-CnCLUueM"
22+
},
23+
"outputs": [],
24+
"source": [
25+
"# Uncomment to clone and install autodoc from GitHub\n",
26+
"# !pip uninstall -y autora-doc\n",
27+
"# !git clone https://github.com/AutoResearch/autodoc.git\n",
28+
"# !pip install -e \"./autodoc[cuda,train]\"\n",
29+
"\n",
30+
"# Login to Huggingface since access to the model repo is private\n",
31+
"# 1) Request access through: https://ai.meta.com/resources/models-and-libraries/llama-downloads/\n",
32+
"# 2) Get a Huggingface token from: https://huggingface.co/settings/token (use same email as above)\n",
33+
"# !huggingface-cli login --token <your HF token>"
34+
]
35+
},
36+
{
37+
"cell_type": "code",
38+
"execution_count": null,
39+
"metadata": {
40+
"id": "jeu8zXoFUtXM"
41+
},
742
"outputs": [],
843
"source": [
944
"%load_ext autoreload\n",
@@ -17,7 +52,9 @@
1752
{
1853
"cell_type": "code",
1954
"execution_count": null,
20-
"metadata": {},
55+
"metadata": {
56+
"id": "yOJoE_pnUtXN"
57+
},
2158
"outputs": [],
2259
"source": [
2360
"model = \"meta-llama/Llama-2-7b-chat-hf\""
@@ -26,23 +63,29 @@
2663
{
2764
"cell_type": "code",
2865
"execution_count": null,
29-
"metadata": {},
66+
"metadata": {
67+
"id": "l6zK76t5UtXN"
68+
},
3069
"outputs": [],
3170
"source": [
3271
"pred = Predictor(model)"
3372
]
3473
},
3574
{
3675
"cell_type": "markdown",
37-
"metadata": {},
76+
"metadata": {
77+
"id": "uAKbxADnUtXN"
78+
},
3879
"source": [
3980
"## Test generation for the variable declaration only"
4081
]
4182
},
4283
{
4384
"cell_type": "code",
4485
"execution_count": null,
45-
"metadata": {},
86+
"metadata": {
87+
"id": "EfOMJxaFUtXN"
88+
},
4689
"outputs": [],
4790
"source": [
4891
"TEST_VAR_CODE = \"\"\"\n",
@@ -56,7 +99,9 @@
5699
{
57100
"cell_type": "code",
58101
"execution_count": null,
59-
"metadata": {},
102+
"metadata": {
103+
"id": "pc7LAzGoUtXN"
104+
},
60105
"outputs": [],
61106
"source": [
62107
"def test(promptid, code, label):\n",
@@ -70,14 +115,16 @@
70115
" num_ret_seq=1,\n",
71116
" )\n",
72117
" bleu, meteor = evaluate_documentation(output, [label])\n",
73-
" for i, o in enumerate(output[0]):\n",
118+
" for i, o in enumerate(output):\n",
74119
" print(f\"{promptid}\\n******* Output {i} ********. bleu={bleu}, meteor={meteor}\\n{o}\\n*************\\n\")"
75120
]
76121
},
77122
{
78123
"cell_type": "code",
79124
"execution_count": null,
80-
"metadata": {},
125+
"metadata": {
126+
"id": "BJgptog3UtXO"
127+
},
81128
"outputs": [],
82129
"source": [
83130
"# Zero shot test\n",
@@ -87,7 +134,9 @@
87134
{
88135
"cell_type": "code",
89136
"execution_count": null,
90-
"metadata": {},
137+
"metadata": {
138+
"id": "8T5dVHUMUtXO"
139+
},
91140
"outputs": [],
92141
"source": [
93142
"# One shot test\n",
@@ -96,18 +145,24 @@
96145
},
97146
{
98147
"cell_type": "markdown",
99-
"metadata": {},
148+
"metadata": {
149+
"id": "fIeXbZXxUtXO"
150+
},
100151
"source": [
101152
"## One-shot generation for the complete code sample"
102153
]
103154
},
104155
{
105156
"cell_type": "code",
106157
"execution_count": null,
107-
"metadata": {},
158+
"metadata": {
159+
"id": "K0241jrdUtXO"
160+
},
108161
"outputs": [],
109162
"source": [
110163
"data_file = \"../data/autora/data.jsonl\"\n",
164+
"# Use this path if running in Colab and cloning the repo\n",
165+
"# data_file = \"./autodoc/data/autora/data.jsonl\"\n",
111166
"inputs, labels = load_data(data_file)\n",
112167
"# preprocessing removes comments, import statements and empty lines\n",
113168
"inputs = [preprocess_code(i) for i in inputs]\n",
@@ -119,25 +174,33 @@
119174
{
120175
"cell_type": "code",
121176
"execution_count": null,
122-
"metadata": {},
177+
"metadata": {
178+
"id": "ws7LUe7mUtXO"
179+
},
123180
"outputs": [],
124181
"source": [
125182
"out, bleu, meteor = eval_prompt(data_file, pred, prompt, {\"max_new_tokens\": 800.0})\n",
126-
"print(f\"bleu={bleu}, meteor={meteor}\\n{out[0][0]}\\n*************\\n\")"
183+
"print(f\"bleu={bleu}, meteor={meteor}\\n{out[0]}\\n*************\\n\")"
127184
]
128185
},
129186
{
130187
"cell_type": "code",
131188
"execution_count": null,
132-
"metadata": {},
189+
"metadata": {
190+
"id": "5L2RZveeUtXO"
191+
},
133192
"outputs": [],
134193
"source": []
135194
}
136195
],
137196
"metadata": {
197+
"accelerator": "GPU",
198+
"colab": {
199+
"gpuType": "T4",
200+
"provenance": []
201+
},
138202
"kernelspec": {
139-
"display_name": "autodoc",
140-
"language": "python",
203+
"display_name": "Python 3",
141204
"name": "python3"
142205
},
143206
"language_info": {
@@ -154,5 +217,5 @@
154217
}
155218
},
156219
"nbformat": 4,
157-
"nbformat_minor": 2
220+
"nbformat_minor": 0
158221
}

0 commit comments

Comments
 (0)