Update colab, expose additional args

* Exposed draft model args for speculative decoding
* Exposed 8-bit (FP8) cache mode, dummy models, and the no-flash-attention option
* Resolved CUDA 11.8 dependency issue
Vhallo · Dec 5, 2023 · 6750710 · 6750710
1 parent 37f8f3e
commit 6750710
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 26 deletions.
diff --git a/TabbyAPI_Colab_Example.ipynb b/TabbyAPI_Colab_Example.ipynb
@@ -16,11 +16,19 @@
     "accelerator": "GPU"
   },
   "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# **TabbyAPI Colab**"
+      ],
+      "metadata": {
+        "id": "NcgQp3r7BS-q"
+      }
+    },
     {
       "cell_type": "code",
       "source": [
-        "#CELL 1\n",
-        "#@title Keep this widget playing to prevent Colab from disconnecting you { display-mode: \"form\" }\n",
+        "#@title # Keep this widget playing to prevent Colab from disconnecting you { display-mode: \"form\" }\n",
         "#@markdown Press play on the audio player that will appear below:\n",
         "%%html\n",
         "<audio src=\"https://oobabooga.github.io/silence.m4a\" controls>"
@@ -39,12 +47,16 @@
       },
       "outputs": [],
       "source": [
-        "# @title # **Cell 1 - Installation w/ Model Downloading** { display-mode: \"form\" }\n",
+        "# @title # Install and download model { display-mode: \"form\" }\n",
         "# @markdown ---\n",
-        "# @markdown # Download Model\n",
+        "# @markdown Select model:\n",
         "# Select model and branch\n",
-        "repo_id = \"royallab/airoboros-mistral2.2-7b-exl2\" # @param {type:\"string\"}\n",
-        "revision = \"6bpw\" # @param {type:\"string\"}\n",
+        "repo_id = \"royallab/Noromaid-13b-v0.1.1-exl2\" # @param {type:\"string\"}\n",
+        "revision = \"4bpw\" # @param {type:\"string\"}\n",
+        "# @markdown ---\n",
+        "# @markdown Select draft model (optional, for speculative decoding):\n",
+        "draft_repo_id = \"\" # @param {type:\"string\"}\n",
+        "draft_revision = \"\" # @param {type:\"string\"}\n",
         "# @markdown ---\n",
         "\n",
         "# Install tabbyAPI\n",
@@ -53,8 +65,7 @@
         "!git clone https://github.com/theroyallab/tabbyAPI\n",
         "%cd tabbyAPI\n",
         "!pip install https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu118-cp310-cp310-linux_x86_64.whl -q\n",
-        "!pip install -r requirements.txt -q\n",
-        "!pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl -q\n",
+        "!pip install -r requirements-colab.txt -q\n",
         "!pip install huggingface-hub -q\n",
         "\n",
         "# Download cloudflared tunnel\n",
@@ -68,26 +79,31 @@
         "\n",
         "from huggingface_hub import snapshot_download\n",
         "snapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./models/{repo_id.replace('/', '_')}\")\n",
-        "\n",
-        "!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb\n",
-        "!dpkg -i cuda-keyring_1.0-1_all.deb\n",
-        "!apt-get -qq update\n",
-        "!apt-get -y -qq install cuda\n",
-        "\n",
-        "print(f\"Model dir: './models/{repo_id.replace('/', '_')}'\")"
+        "if len(draft_repo_id) > 0: snapshot_download(repo_id=draft_repo_id, revision=draft_revision or None, local_dir=f\"./models/{draft_repo_id.replace('/', '_')}\")  # blank draft_revision -> repo default branch"
       ]
     },
     {
       "cell_type": "code",
       "source": [
-        "# @title # **Cell 2 - Edit Config and Start Tabby** { display-mode: \"form\" }\n",
+        "# @title # Configure and launch API { display-mode: \"form\" }\n",
         "# @markdown ---\n",
-        "# @markdown # Edit Config\n",
+        "# @markdown Model parameters:\n",
         "\n",
         "model = repo_id.replace('/', '_')\n",
-        "ContextSize = 4096 # @param {type:\"raw\"}\n",
-        "RopeScale = 1.0 # @param {type:\"raw\"}\n",
-        "RopeAlpha = 1.0 # @param {type:\"raw\"}\n",
+        "draft_model = draft_repo_id.replace('/', '_')\n",
+        "ContextSize = 4096 # @param {type:\"integer\"}\n",
+        "RopeScale = 1.0 # @param {type:\"number\"}\n",
+        "RopeAlpha = 1.0 # @param {type:\"number\"}\n",
+        "# @markdown ---\n",
+        "# @markdown Draft model parameters (optional, for speculative decoding):\n",
+        "DraftRopeAlpha = 1.0 # @param {type:\"number\"}\n",
+        "# @markdown ---\n",
+        "# @markdown Misc options:\n",
+        "CacheMode = \"FP16\" # @param [\"FP8\", \"FP16\"] {type:\"string\"}\n",
+        "UseDummyModels = False # @param {type:\"boolean\"}\n",
+        "NoFlashAttention = False # @param {type:\"boolean\"}\n",
+        "# @markdown ---\n",
+        "# @markdown To connect, make note of the cloudflared URL and your auto-generated API key after launching and provide it to your preferred frontend.\n",
         "\n",
         "# Setup Config - edit parameters to fit your needs\n",
         "%cd /content/tabbyAPI/\n",
@@ -120,7 +136,7 @@
         "\n",
         "  # Sends dummy model names when the models endpoint is queried\n",
         "  # Enable this if the program is looking for a specific OAI model\n",
-        "  use_dummy_models: False\n",
+        "  use_dummy_models: {UseDummyModels}\n",
         "\n",
         "  # The below parameters apply only if model_name is set\n",
         "\n",
@@ -138,22 +154,22 @@
         "  rope_alpha: {RopeAlpha}\n",
         "\n",
         "  # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n",
-        "  no_flash_attention: False\n",
+        "  no_flash_attention: {NoFlashAttention}\n",
         "\n",
         "  # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values FP16, FP8. (default: FP16)\n",
-        "  cache_mode: FP16\n",
+        "  cache_mode: {CacheMode}\n",
         "\n",
         "  # Options for draft models (speculative decoding). This will use more VRAM!\n",
         "  draft:\n",
         "    # Overrides the directory to look for draft (default: models)\n",
-        "    # draft_model_dir: Your draft model directory path\n",
+        "    draft_model_dir: models\n",
         "\n",
         "    # An initial draft model to load. Make sure this model is located in the model directory!\n",
         "    # A draft model can be loaded later via the API.\n",
-        "    # draft_model_name: A model name\n",
+        "    draft_model_name: {draft_model}\n",
         "\n",
         "    # Rope parameters for draft models (default: 1.0)\n",
-        "    # draft_rope_alpha: 1.0\n",
+        "    draft_rope_alpha: {DraftRopeAlpha}\n",
         "'''\n",
         "with open(\"./config.yml\", \"w\") as file:\n",
         "    file.write(write)\n",

diff --git a/requirements-colab.txt b/requirements-colab.txt
@@ -0,0 +1,10 @@
+fastapi
+pydantic < 2,>= 1
+PyYAML
+progress
+uvicorn
+
+# Wheels
+
+# Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"