From 5cbdda6887ee66e35bde84f66fa4d2fc9b29ba13 Mon Sep 17 00:00:00 2001
From: Vineet Anil Likhitkar
Date: Sat, 20 May 2023 23:57:05 +0530
Subject: [PATCH] Upload Notebook

---
 POS_Tagger.ipynb | 402 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 402 insertions(+)
 create mode 100644 POS_Tagger.ipynb

diff --git a/POS_Tagger.ipynb b/POS_Tagger.ipynb
new file mode 100644
index 0000000..1624766
--- /dev/null
+++ b/POS_Tagger.ipynb
@@ -0,0 +1,402 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "9CvPbvI1hh8T",
+        "outputId": "3f23b690-58d1-41c6-cd7d-b6502d89cb94"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+            "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)\n",
+            "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.3)\n",
+            "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.2.0)\n",
+            "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2022.10.31)\n",
+            "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.65.0)\n"
+          ]
+        }
+      ],
+      "source": [
+        "!pip install nltk"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import nltk\n",
+        "nltk.download('treebank')\n",
+        "nltk.download('punkt')\n",
+        "\n",
+        "\n",
+        "from nltk import word_tokenize, pos_tag"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "TL5KSHM0iGt8",
+        "outputId": "abcd01b9-d13d-4afa-d25a-80a17e6ef4b2"
+      },
+      "execution_count": 3,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "[nltk_data] Downloading package treebank to /root/nltk_data...\n",
+            "[nltk_data]   Unzipping corpora/treebank.zip.\n",
+            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
+            "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import nltk\n",
+        "\n",
+        "tagged_sentences = nltk.corpus.treebank.tagged_sents()\n",
+        " \n",
+        "print (tagged_sentences[0])\n",
+        "print (\"Tagged sentences: \", len(tagged_sentences))\n",
+        "print (\"Tagged words:\", len(nltk.corpus.treebank.tagged_words()))"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Nyc3wcnchwKr",
+        "outputId": "ed48c3ea-d0e2-4c4a-feb2-588e12301847"
+      },
+      "execution_count": 4,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]\n",
+            "Tagged sentences:  3914\n",
+            "Tagged words: 100676\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Feature Extraction"
+      ],
+      "metadata": {
+        "id": "uHPkhIIBkEWz"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def features(sentence, index):\n",
+        "    \"\"\" sentence: [w1, w2, ...], index: the index of the word \"\"\"\n",
+        "    return {\n",
+        "        'word': sentence[index],\n",
+        "        'is_first': index == 0,\n",
+        "        'is_last': index == len(sentence) - 1,\n",
+        "        'is_capitalized': sentence[index][0].upper() == sentence[index][0],\n",
+        "        'is_all_caps': sentence[index].upper() == sentence[index],\n",
+        "        'is_all_lower': sentence[index].lower() == sentence[index],\n",
+        "        'prefix-1': sentence[index][0],\n",
+        "        'prefix-2': sentence[index][:2],\n",
+        "        'prefix-3': sentence[index][:3],\n",
+        "        'suffix-1': sentence[index][-1],\n",
+        "        'suffix-2': sentence[index][-2:],\n",
+        "        'suffix-3': sentence[index][-3:],\n",
+        "        'prev_word': '' if index == 0 else sentence[index - 1],\n",
+        "        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],\n",
+        "        'has_hyphen': '-' in sentence[index],\n",
+        "        'is_numeric': sentence[index].isdigit(),\n",
+        "        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]\n",
+        "    }\n",
+        " "
+      ],
+      "metadata": {
+        "id": "tDzz5GWMh5Hb"
+      },
+      "execution_count": 5,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import pprint \n",
+        "pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))\n",
+        " \n",
+        "# {'capitals_inside': False,\n",
+        "# 'has_hyphen': False,\n",
+        "# 'is_all_caps': False,\n",
+        "# 'is_all_lower': True,\n",
+        "# 'is_capitalized': False,\n",
+        "# 'is_first': False,\n",
+        "# 'is_last': False,\n",
+        "# 'is_numeric': False,\n",
+        "# 'next_word': 'sentence',\n",
+        "# 'prefix-1': 'a',\n",
+        "# 'prefix-2': 'a',\n",
+        "# 'prefix-3': 'a',\n",
+        "# 'prev_word': 'is',\n",
+        "# 'suffix-1': 'a',\n",
+        "# 'suffix-2': 'a',\n",
+        "# 'suffix-3': 'a',\n",
+        "# 'word': 'a'}"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "BIeyCofPiUhT",
+        "outputId": "ba70ce02-c222-4f8c-ad49-28b2f5bf65da"
+      },
+      "execution_count": 6,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "{'capitals_inside': False,\n",
+            " 'has_hyphen': False,\n",
+            " 'is_all_caps': False,\n",
+            " 'is_all_lower': True,\n",
+            " 'is_capitalized': False,\n",
+            " 'is_first': False,\n",
+            " 'is_last': False,\n",
+            " 'is_numeric': False,\n",
+            " 'next_word': 'sentence',\n",
+            " 'prefix-1': 'a',\n",
+            " 'prefix-2': 'a',\n",
+            " 'prefix-3': 'a',\n",
+            " 'prev_word': 'is',\n",
+            " 'suffix-1': 'a',\n",
+            " 'suffix-2': 'a',\n",
+            " 'suffix-3': 'a',\n",
+            " 'word': 'a'}\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Small helper function to strip the tags from our tagged corpus and feed it to our classifier\n",
+        "def untag(tagged_sentence):\n",
+        "    return [w for w, t in tagged_sentence]\n",
+        " "
+      ],
+      "metadata": {
+        "id": "sPM_syfhiYwZ"
+      },
+      "execution_count": 7,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Building training set\n"
+      ],
+      "metadata": {
+        "id": "3jMaXt-5ixRM"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Split the dataset for training and testing\n",
+        "cutoff = int(.75 * len(tagged_sentences))\n",
+        "training_sentences = tagged_sentences[:cutoff]\n",
+        "test_sentences = tagged_sentences[cutoff:]\n",
+        " \n",
+        "print (len(training_sentences)) # 2935\n",
+        "print (len(test_sentences)) # 979\n",
+        " \n",
+        "def transform_to_dataset(tagged_sentences):\n",
+        "    X, y = [], []\n",
+        " \n",
+        "    for tagged in tagged_sentences:\n",
+        "        for index in range(len(tagged)):\n",
+        "            X.append(features(untag(tagged), index))\n",
+        "            y.append(tagged[index][1])\n",
+        " \n",
+        "    return X, y\n",
+        " \n",
+        "X, y = transform_to_dataset(training_sentences)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "_mXQzdSHijoO",
+        "outputId": "1dfdf59b-e96a-4e93-cd60-0509b74d175f"
+      },
+      "execution_count": 8,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "2935\n",
+            "979\n"
+          ]
+        }
+      ]
+    },
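+    {
+      "cell_type": "code",
+      "source": [
+        "# Optional illustration: inspect one transformed sample to see what the\n",
+        "# classifier is trained on. Uses only X and y from the cell above; each\n",
+        "# sample is a feature dict paired with the Penn Treebank tag of that word.\n",
+        "import pprint\n",
+        "\n",
+        "print(len(X), 'training samples')\n",
+        "pprint.pprint(X[0])\n",
+        "print('label:', y[0])"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },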
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## **Training**\n",
+        "\n",
+        "Training the classifier: a scikit-learn Pipeline that combines a DictVectorizer (turning each feature dict into a vector) with a DecisionTreeClassifier."
+      ],
+      "metadata": {
+        "id": "Km6wyU5hjIYb"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from sklearn.tree import DecisionTreeClassifier\n",
+        "from sklearn.feature_extraction import DictVectorizer\n",
+        "from sklearn.pipeline import Pipeline\n",
+        " \n",
+        "clf = Pipeline([\n",
+        "    ('vectorizer', DictVectorizer(sparse=False)),\n",
+        "    ('classifier', DecisionTreeClassifier(criterion='entropy'))\n",
+        "])\n",
+        " \n",
+        "clf.fit(X[:20000], y[:20000])\n",
+        " \n",
+        "print ('Training completed')\n",
+        " \n",
+        "X_test, y_test = transform_to_dataset(test_sentences)\n",
+        " \n",
+        "print (\"Accuracy:\", clf.score(X_test, y_test))\n",
+        " "
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "r1SF7o5pioRc",
+        "outputId": "76e1f97d-cf26-41ff-f0b4-19537c629d8f"
+      },
+      "execution_count": 9,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Training completed\n",
+            "Accuracy: 0.9153945042583963\n"
+          ]
+        }
+      ]
+    },
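+    {
+      "cell_type": "code",
+      "source": [
+        "# Optional illustration: overall accuracy hides per-tag behaviour, so a\n",
+        "# classification report on a slice of the test set shows which tags the\n",
+        "# tree handles well. The slice size of 5000 is an arbitrary choice that\n",
+        "# keeps the dense DictVectorizer output small.\n",
+        "from sklearn.metrics import classification_report\n",
+        "\n",
+        "y_pred = clf.predict(X_test[:5000])\n",
+        "print(classification_report(y_test[:5000], y_pred, zero_division=0))"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },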
+    {
+      "cell_type": "code",
+      "source": [
+        "def pos_ttag(sentence):\n",
+        "    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])\n",
+        "    l = []\n",
+        "    for i in range(len(sentence)):\n",
+        "        l.append((sentence[i], tags[i]))\n",
+        "    return l\n",
+        " \n",
+        "\n",
+        "sentence = \"Hi, My name is Vineet and I am a Developer\"\n",
+        "\n",
+        "print (pos_ttag(word_tokenize(sentence)))"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "7CwH1bnhi9F4",
+        "outputId": "34b5c9f1-8506-445a-be03-5fea499c76ad"
+      },
+      "execution_count": 10,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "[('Hi', 'RB'), (',', ','), ('My', 'NNP'), ('name', 'NN'), ('is', 'VBZ'), ('Vineet', 'NNP'), ('and', 'CC'), ('I', 'PRP'), ('am', 'NN'), ('a', 'DT'), ('Developer', 'NNP')]\n"
+          ]
+        }
+      ]
+    },
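+    {
+      "cell_type": "code",
+      "source": [
+        "# Optional illustration: the next cell lists the Penn Treebank tags for\n",
+        "# reference; the same descriptions can also be looked up programmatically\n",
+        "# through NLTK's built-in help (requires the 'tagsets' resource).\n",
+        "import nltk\n",
+        "\n",
+        "nltk.download('tagsets')\n",
+        "nltk.help.upenn_tagset('PRP$')\n",
+        "nltk.help.upenn_tagset('VB.*')"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },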
"33.\tWDT\tWh-determiner\n", + "34.\tWP\tWh-pronoun\n", + "35.\tWP\tPossessive wh-pronoun\n", + "36.\tWRB\tWh-adverb\n" + ], + "metadata": { + "id": "K3VNsrLpn8b2" + } + } + ] +} \ No newline at end of file