JohnSnowLabs
diff --git a/‎examples/python/annotation/text/english/DocumentTokenSplitter.ipynb‎
Lines changed: 372 additions & 0 deletions b/‎examples/python/annotation/text/english/DocumentTokenSplitter.ipynb‎
Lines changed: 372 additions & 0 deletions
diff --git a/‎examples/python/annotation/text/english/openai-completion‎
Lines changed: 1 addition & 0 deletions b/‎examples/python/annotation/text/english/openai-completion‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/python/training/english/classification/ClassifierDL_Training_using_INSTRUCTOR_Embeddings.ipynb‎
Lines changed: 1 addition & 0 deletions b/‎examples/python/training/english/classification/ClassifierDL_Training_using_INSTRUCTOR_Embeddings.ipynb‎
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1,372 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "97EiXueJA9cY"
+      },
+      "source": [
+        "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "zmxL_blSA9ce"
+      },
+      "source": [
+        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/DocumentTokenSplitter.ipynb)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "uI7yhCibA9cf"
+      },
+      "source": [
+        "## Colab + Data Setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 10,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "4WQLLrIUA9cg",
+        "outputId": "93e96731-45c2-4c82-97fe-f08472b649fe"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Installing PySpark 3.2.3 and Spark NLP 5.2.2\n",
+            "setup Colab for PySpark 3.2.3 and Spark NLP 5.2.2\n"
+          ]
+        }
+      ],
+      "source": [
+        "!wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 11,
+      "metadata": {
+        "id": "nVTDX8SdiSD9"
+      },
+      "outputs": [],
+      "source": [
+        "!wget https://github.com/JohnSnowLabs/spark-nlp/blob/587f79020de7bc09c2b2fceb37ec258bad57e425/src/test/resources/spell/sherlockholmes.txt > /dev/null 2>&1"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "_S-XJDfUA9ci"
+      },
+      "source": [
+        "# Download DocumentTokenSplitter Model and Create Spark NLP Pipeline"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 12,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "KzMHa0HdA9ch",
+        "outputId": "a1c6ff34-8b07-40e6-c207-b6f77894ad74"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Warning::Spark Session already created, some configs may not take.\n",
+            "Spark NLP version 5.2.2\n",
+            "Apache Spark version: 3.2.3\n"
+          ]
+        }
+      ],
+      "source": [
+        "import sparknlp\n",
+        "from sparknlp.base import *\n",
+        "from sparknlp.annotator import *\n",
+        "from pyspark.ml import Pipeline\n",
+        "\n",
+        "spark = sparknlp.start()\n",
+        "\n",
+        "print(f\"Spark NLP version {sparknlp.version()}\\nApache Spark version: {spark.version}\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 13,
+      "metadata": {
+        "id": "6qAa9p6ohtfi"
+      },
+      "outputs": [],
+      "source": [
+        "textDF = spark.read.text(\n",
+        "   \"sherlockholmes.txt\",\n",
+        "    wholetext=True\n",
+        ").toDF(\"text\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "DVHludGFMSCk",
+        "outputId": "bced22c6-794b-4fd8-ad78-2bc0a1880f5a"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "sparknlp.annotator.document_token_splitter.DocumentTokenSplitter"
+            ]
+          },
+          "execution_count": 14,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "DocumentTokenSplitter"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "O4uPbdrSA9ci"
+      },
+      "source": [
+        "Lets create a Spark NLP pipeline with the following stages:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 15,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ASQ5Ot2NA9ci",
+        "outputId": "3a8c06d6-f8ce-442f-b8c9-b107610d7b54"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "+--------------------------------------------------------------------------------+-----+-----+------+------+\n",
+            "|                                                                          result|begin|  end|length|tokens|\n",
+            "+--------------------------------------------------------------------------------+-----+-----+------+------+\n",
+            "|[{\"payload\":{\"allShortcutsEnabled\":false,\"fileTree\":{\"src/test/resources/spel...|    0|11335| 11335|   512|\n",
+            "|[the case of the Trepoff murder, of his clearing up\",\"of the singular tragedy...|11280|14436|  3156|   512|\n",
+            "|[order to remove crusted mud from it.\",\"Hence, you see, my double deduction t...|14379|17697|  3318|   512|\n",
+            "|[a \\\"P,\\\" and a\",\"large \\\"G\\\" with a small \\\"t\\\" woven into the texture of th...|17644|20993|  3349|   512|\n",
+            "|[which he had apparently adjusted that very moment,\",\"for his hand was still ...|20928|24275|  3347|   512|\n",
+            "|[his high white forehead, \\\"you\",\"can understand that I am not accustomed to ...|24214|27991|  3777|   512|\n",
+            "|[send it on the day when the\",\"betrothal was publicly proclaimed. That will b...|27927|31354|  3427|   512|\n",
+            "|[and helpless, in the\",\"chair.\",\"\",\"\\\"What is it?\\\"\",\"\",\"\\\"It's quite too fun...|31273|34428|  3155|   512|\n",
+            "+--------------------------------------------------------------------------------+-----+-----+------+------+\n",
+            "only showing top 8 rows\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "documentAssembler = DocumentAssembler() \\\n",
+        "    .setInputCol(\"text\") \\\n",
+        "    .setOutputCol(\"document\")\n",
+        "\n",
+        "textSplitter = DocumentTokenSplitter() \\\n",
+        "    .setInputCols([\"document\"]) \\\n",
+        "    .setOutputCol(\"splits\") \\\n",
+        "    .setNumTokens(512) \\\n",
+        "    .setTokenOverlap(10) \\\n",
+        "    .setExplodeSplits(True)\n",
+        "\n",
+        "pipeline = Pipeline().setStages([documentAssembler, textSplitter])\n",
+        "result = pipeline.fit(textDF).transform(textDF)\n",
+        "\n",
+        "result.selectExpr(\n",
+        "      \"splits.result as result\",\n",
+        "      \"splits[0].begin as begin\",\n",
+        "      \"splits[0].end as end\",\n",
+        "      \"splits[0].end - splits[0].begin as length\",\n",
+        "      \"splits[0].metadata.numTokens as tokens\") \\\n",
+        "    .show(8, truncate = 80)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "CALoU6tSofto"
+      },
+      "source": [
+        "# Now let's make another pipeline to see if this actually works!"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "H5DFx2DOosri"
+      },
+      "source": [
+        "let's get the data ready"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 16,
+      "metadata": {
+        "id": "ZqR7pcQ9pw7a"
+      },
+      "outputs": [],
+      "source": [
+        "df = spark.createDataFrame([\n",
+        "    [(\"All emotions, and that\\none particularly, were abhorrent to his cold, \"\n",
+        "      \"precise but\\nadmirably balanced mind.\\n\\nHe was, I take it, the most \"\n",
+        "      \"perfect\\nreasoning and observing machine that the world has seen.\")]\n",
+        "]).toDF(\"text\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ArsOgKafoft0"
+      },
+      "source": [
+        "Lets create a Spark NLP pipeline following the same stages as before:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 17,
+      "metadata": {
+        "id": "x5ZwHjKSoft2"
+      },
+      "outputs": [],
+      "source": [
+        "documentAssembler = DocumentAssembler() \\\n",
+        "    .setInputCol(\"text\") \\\n",
+        "    .setOutputCol(\"document\")\n",
+        "\n",
+        "document_token_splitter = DocumentTokenSplitter() \\\n",
+        "    .setInputCols(\"document\") \\\n",
+        "    .setOutputCol(\"splits\") \\\n",
+        "    .setNumTokens(3) \\\n",
+        "    .setTokenOverlap(1) \\\n",
+        "    .setExplodeSplits(True) \\\n",
+        "    .setTrimWhitespace(True) \\\n",
+        "\n",
+        "pipeline = Pipeline().setStages([documentAssembler, document_token_splitter])\n",
+        "pipeline_df = pipeline.fit(df).transform(df)\n",
+        "\n",
+        "results = pipeline_df.select(\"splits\").collect()\n",
+        "\n",
+        "splits = [\n",
+        "    row[\"splits\"][0].result.replace(\"\\n\\n\", \" \").replace(\"\\n\", \" \")\n",
+        "    for row in results\n",
+        "]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "mjUiY6sOp-jY"
+      },
+      "source": [
+        "**Evaluation**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 18,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "s5wMKcnVp94o",
+        "outputId": "9a4ef0f9-76af-403d-81e3-0117e538f887"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "True"
+            ]
+          },
+          "execution_count": 18,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "expected = [\n",
+        "    \"All emotions, and\",\n",
+        "    \"and that one\",\n",
+        "    \"one particularly, were\",\n",
+        "    \"were abhorrent to\",\n",
+        "    \"to his cold,\",\n",
+        "    \"cold, precise but\",\n",
+        "    \"but admirably balanced\",\n",
+        "    \"balanced mind. He\",\n",
+        "    \"He was, I\",\n",
+        "    \"I take it,\",\n",
+        "    \"it, the most\",\n",
+        "    \"most perfect reasoning\",\n",
+        "    \"reasoning and observing\",\n",
+        "    \"observing machine that\",\n",
+        "    \"that the world\",\n",
+        "    \"world has seen.\",\n",
+        "]\n",
+        "\n",
+        "splits == expected"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Wq4G03A2qB5U"
+      },
+      "source": [
+        "Great it works!"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python [conda env:tempspark]",
+      "language": "python",
+      "name": "conda-env-tempspark-py"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.8.16"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
@@ -0,0 +1 @@
+