From 5cbdda6887ee66e35bde84f66fa4d2fc9b29ba13 Mon Sep 17 00:00:00 2001
From: Vineet Anil Likhitkar
Date: Sat, 20 May 2023 23:57:05 +0530
Subject: [PATCH] Upload Notebook

---
 POS_Tagger.ipynb | 402 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 402 insertions(+)
 create mode 100644 POS_Tagger.ipynb

diff --git a/POS_Tagger.ipynb b/POS_Tagger.ipynb
new file mode 100644
index 0000000..1624766
--- /dev/null
+++ b/POS_Tagger.ipynb
@@ -0,0 +1,402 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "9CvPbvI1hh8T",
+        "outputId": "3f23b690-58d1-41c6-cd7d-b6502d89cb94"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+            "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)\n",
+            "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.3)\n",
+            "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.2.0)\n",
+            "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2022.10.31)\n",
+            "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.65.0)\n"
+          ]
+        }
+      ],
+      "source": [
+        "!pip install nltk"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import nltk\n",
+        "nltk.download('treebank')\n",
+        "nltk.download('punkt')\n",
+        "\n",
+        "\n",
+        "from nltk import word_tokenize, pos_tag"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "TL5KSHM0iGt8",
+        "outputId": "abcd01b9-d13d-4afa-d25a-80a17e6ef4b2"
+      },
+      "execution_count": 3,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "[nltk_data] Downloading package treebank to /root/nltk_data...\n",
+            "[nltk_data]   Unzipping corpora/treebank.zip.\n",
+            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
+            "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import nltk\n",
+        "\n",
+        "tagged_sentences = nltk.corpus.treebank.tagged_sents()\n",
+        " \n",
+        "print (tagged_sentences[0])\n",
+        "print (\"Tagged sentences: \", len(tagged_sentences))\n",
+        "print (\"Tagged words:\", len(nltk.corpus.treebank.tagged_words()))"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Nyc3wcnchwKr",
+        "outputId": "ed48c3ea-d0e2-4c4a-feb2-588e12301847"
+      },
+      "execution_count": 4,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]\n",
+            "Tagged sentences:  3914\n",
+            "Tagged words: 100676\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Feature Extraction"
+      ],
+      "metadata": {
+        "id": "uHPkhIIBkEWz"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def features(sentence, index):\n",
+        "    \"\"\" sentence: [w1, w2, ...], index: the index of the word \"\"\"\n",
+        "    return {\n",
+        "        'word': sentence[index],\n",
+        "        'is_first': index == 0,\n",
+        "        'is_last': index == len(sentence) - 1,\n",
+        "        'is_capitalized': sentence[index][0].upper() == sentence[index][0],\n",
+        "        'is_all_caps': sentence[index].upper() == sentence[index],\n",
+        "        'is_all_lower': sentence[index].lower() == sentence[index],\n",
+        "        'prefix-1': sentence[index][0],\n",
+        "        'prefix-2': sentence[index][:2],\n",
+        "        'prefix-3': sentence[index][:3],\n",
+        "        'suffix-1': sentence[index][-1],\n",
+        "        'suffix-2': sentence[index][-2:],\n",
+        "        'suffix-3': sentence[index][-3:],\n",
+        "        'prev_word': '' if index == 0 else sentence[index - 1],\n",
+        "        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],\n",
+        "        'has_hyphen': '-' in sentence[index],\n",
+        "        'is_numeric': sentence[index].isdigit(),\n",
+        "        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]\n",
+        "    }\n",
+        " "
+      ],
+      "metadata": {
+        "id": "tDzz5GWMh5Hb"
+      },
+      "execution_count": 5,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import pprint \n",
+        "pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))\n",
+        " \n",
+        "# {'capitals_inside': False,\n",
+        "# 'has_hyphen': False,\n",
+        "# 'is_all_caps': False,\n",
+        "# 'is_all_lower': True,\n",
+        "# 'is_capitalized': False,\n",
+        "# 'is_first': False,\n",
+        "# 'is_last': False,\n",
+        "# 'is_numeric': False,\n",
+        "# 'next_word': 'sentence',\n",
+        "# 'prefix-1': 'a',\n",
+        "# 'prefix-2': 'a',\n",
+        "# 'prefix-3': 'a',\n",
+        "# 'prev_word': 'is',\n",
+        "# 'suffix-1': 'a',\n",
+        "# 'suffix-2': 'a',\n",
+        "# 'suffix-3': 'a',\n",
+        "# 'word': 'a'}"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "BIeyCofPiUhT",
+        "outputId": "ba70ce02-c222-4f8c-ad49-28b2f5bf65da"
+      },
+      "execution_count": 6,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "{'capitals_inside': False,\n",
+            " 'has_hyphen': False,\n",
+            " 'is_all_caps': False,\n",
+            " 'is_all_lower': True,\n",
+            " 'is_capitalized': False,\n",
+            " 'is_first': False,\n",
+            " 'is_last': False,\n",
+            " 'is_numeric': False,\n",
+            " 'next_word': 'sentence',\n",
+            " 'prefix-1': 'a',\n",
+            " 'prefix-2': 'a',\n",
+            " 'prefix-3': 'a',\n",
+            " 'prev_word': 'is',\n",
+            " 'suffix-1': 'a',\n",
+            " 'suffix-2': 'a',\n",
+            " 'suffix-3': 'a',\n",
+            " 'word': 'a'}\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Small helper function to strip the tags from our tagged corpus and feed it to our classifier\n",
+        "def untag(tagged_sentence):\n",
+        "    return [w for w, t in tagged_sentence]\n",
+        " "
+      ],
+      "metadata": {
+        "id": "sPM_syfhiYwZ"
+      },
+      "execution_count": 7,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Building training set\n"
+      ],
+      "metadata": {
+        "id": "3jMaXt-5ixRM"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Split the dataset for training and testing\n",
+        "cutoff = int(.75 * len(tagged_sentences))\n",
+        "training_sentences = tagged_sentences[:cutoff]\n",
+        "test_sentences = tagged_sentences[cutoff:]\n",
+        " \n",
+        "print (len(training_sentences)) # 2935\n",
+        "print (len(test_sentences)) # 979\n",
+        " \n",
+        "def transform_to_dataset(tagged_sentences):\n",
+        "    X, y = [], []\n",
+        " \n",
+        "    for tagged in tagged_sentences:\n",
+        "        for index in range(len(tagged)):\n",
+        "            X.append(features(untag(tagged), index))\n",
+        "            y.append(tagged[index][1])\n",
+        " \n",
+        "    return X, y\n",
+        " \n",
+        "X, y = transform_to_dataset(training_sentences)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "_mXQzdSHijoO",
+        "outputId": "1dfdf59b-e96a-4e93-cd60-0509b74d175f"
+      },
+      "execution_count": 8,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "2935\n",
+            "979\n"
+          ]
+        }
+      ]
+    },
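+    {
+      "cell_type": "code",
+      "source": [
+        "# Optional illustration: inspect one transformed sample to see what the\n",
+        "# classifier is trained on. Uses only X and y from the cell above; each\n",
+        "# sample is a feature dict paired with the Penn Treebank tag of that word.\n",
+        "import pprint\n",
+        "\n",
+        "print(len(X), 'training samples')\n",
+        "pprint.pprint(X[0])\n",
+        "print('label:', y[0])"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },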
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## **Training**\n",
+        "\n",
+        "Training the classifier: a scikit-learn Pipeline that combines a DictVectorizer (turning each feature dict into a vector) with a DecisionTreeClassifier."
+      ],
+      "metadata": {
+        "id": "Km6wyU5hjIYb"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from sklearn.tree import DecisionTreeClassifier\n",
+        "from sklearn.feature_extraction import DictVectorizer\n",
+        "from sklearn.pipeline import Pipeline\n",
+        " \n",
+        "clf = Pipeline([\n",
+        "    ('vectorizer', DictVectorizer(sparse=False)),\n",
+        "    ('classifier', DecisionTreeClassifier(criterion='entropy'))\n",
+        "])\n",
+        " \n",
+        "clf.fit(X[:20000], y[:20000])\n",
+        " \n",
+        "print ('Training completed')\n",
+        " \n",
+        "X_test, y_test = transform_to_dataset(test_sentences)\n",
+        " \n",
+        "print (\"Accuracy:\", clf.score(X_test, y_test))\n",
+        " "
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "r1SF7o5pioRc",
+        "outputId": "76e1f97d-cf26-41ff-f0b4-19537c629d8f"
+      },
+      "execution_count": 9,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Training completed\n",
+            "Accuracy: 0.9153945042583963\n"
+          ]
+        }
+      ]
+    },
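+    {
+      "cell_type": "code",
+      "source": [
+        "# Optional illustration: overall accuracy hides per-tag behaviour, so a\n",
+        "# classification report on a slice of the test set shows which tags the\n",
+        "# tree handles well. The slice size of 5000 is an arbitrary choice that\n",
+        "# keeps the dense DictVectorizer output small.\n",
+        "from sklearn.metrics import classification_report\n",
+        "\n",
+        "y_pred = clf.predict(X_test[:5000])\n",
+        "print(classification_report(y_test[:5000], y_pred, zero_division=0))"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },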
+    {
+      "cell_type": "code",
+      "source": [
+        "def pos_ttag(sentence):\n",
+        "    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])\n",
+        "    l = []\n",
+        "    for i in range(len(sentence)):\n",
+        "        l.append((sentence[i], tags[i]))\n",
+        "    return l\n",
+        " \n",
+        "\n",
+        "sentence = \"Hi, My name is Vineet and I am a Developer\"\n",
+        "\n",
+        "print (pos_ttag(word_tokenize(sentence)))"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "7CwH1bnhi9F4",
+        "outputId": "34b5c9f1-8506-445a-be03-5fea499c76ad"
+      },
+      "execution_count": 10,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "[('Hi', 'RB'), (',', ','), ('My', 'NNP'), ('name', 'NN'), ('is', 'VBZ'), ('Vineet', 'NNP'), ('and', 'CC'), ('I', 'PRP'), ('am', 'NN'), ('a', 'DT'), ('Developer', 'NNP')]\n"
+          ]
+        }
+      ]
+    },
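+    {
+      "cell_type": "code",
+      "source": [
+        "# Optional illustration: the next cell lists the Penn Treebank tags for\n",
+        "# reference; the same descriptions can also be looked up programmatically\n",
+        "# through NLTK's built-in help (requires the 'tagsets' resource).\n",
+        "import nltk\n",
+        "\n",
+        "nltk.download('tagsets')\n",
+        "nltk.help.upenn_tagset('PRP$')\n",
+        "nltk.help.upenn_tagset('VB.*')"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },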
"33.\tWDT\tWh-determiner\n", + "34.\tWP\tWh-pronoun\n", + "35.\tWP\tPossessive wh-pronoun\n", + "36.\tWRB\tWh-adverb\n" + ], + "metadata": { + "id": "K3VNsrLpn8b2" + } + } + ] +} \ No newline at end of file