diff --git a/nlp/.ipynb_checkpoints/Avshalom Manevich - NLP_HW1-checkpoint.ipynb b/nlp/.ipynb_checkpoints/Avshalom Manevich - NLP_HW1-checkpoint.ipynb new file mode 100644 index 0000000..6669719 --- /dev/null +++ b/nlp/.ipynb_checkpoints/Avshalom Manevich - NLP_HW1-checkpoint.ipynb @@ -0,0 +1,514 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sentence reformulation" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "from gensim.models import KeyedVectors" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download the FastText pretrained vectors for English: \n", + "[cc.en.300.vec.gz](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz)\n", + "\n", + "And download the Yelp! dataset composed of reviews: \n", + "[Yelp.train.text](https://drive.google.com/file/d/1TAcfL091lKb2LipaUELFteZqJjQu-gMa/view?usp=sharing)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the downloaded pretrained FastText vectors with the gensim library:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "filename = 'cc.en.300.vec'\n", + "vectors = KeyedVectors.load_word2vec_format(filename, binary=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compute the similarity of two words using gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('pooches', 0.4657370448112488), ('dogs', 0.4568704068660736), ('doggie', 0.44288891553878784), ('pooch', 0.433940052986145), ('non-dog', 0.4216308891773224)]\n", + "[('cats', 0.4473192095756531), ('litter-box', 0.4075545072555542), ('litterboxes', 0.40291157364845276), ('kitty', 0.39684420824050903), ('kitties', 0.39618998765945435)]\n", + "0.50369895\n", + "0.47312132\n" + ] + } + ], + "source": [ + "# We discussed relations between different words; look at the similarity of 'king' and 'queen', for example. Could you put it into context?\n", + "print(vectors.most_similar(positive=['dog'], negative=['wolf'], topn=5))\n", + "print(vectors.most_similar(positive=['cat'], negative=['tiger'], topn=5))\n", + "print(vectors.similarity('cat', 'tiger'))\n", + "print(vectors.similarity('dog', 'wolf'))\n", + "\n", + "#your code here" + ] + },
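 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To put the 'king'/'queen' discussion into context: word vectors support analogies through vector arithmetic. Below is a minimal sketch, assuming 'king', 'man' and 'woman' are all in the FastText vocabulary; with vectors of this kind, the top neighbour of king - man + woman is typically 'queen'. The cell is left unexecuted here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Classic analogy sketch: king - man + woman should land near 'queen'.\n", + "# most_similar sums the positive vectors and subtracts the negative ones.\n", + "vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=5)" + ] + },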
 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sentence tokenization. Split the Yelp! texts into separate tokens (words and punctuation marks) using NLTK's word_tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk import word_tokenize\n", + "yelp_file = 'Yelp.train.text'\n", + "with open(yelp_file, \"r\") as f:\n", + " lines = f.read().splitlines()\n", + "\n", + "tokens = list(map(word_tokenize, lines))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['i', 'was', 'sadly', 'mistaken', '.'],\n", + " ['so',\n", + " 'on',\n", + " 'to',\n", + " 'the',\n", + " 'hoagies',\n", + " ',',\n", + " 'the',\n", + " 'italian',\n", + " 'is',\n", + " 'general',\n", + " 'run',\n", + " 'of',\n", + " 'the',\n", + " 'mill',\n", + " '.'],\n", + " ['minimal', 'meat', 'and', 'a', 'ton', 'of', 'shredded', 'lettuce', '.']]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def tokenize_sentence(sentence):\n", + " return word_tokenize(sentence)\n", + "\n", + "tokens[:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Try part-of-speech tagging using the [NLTK POS-tagger](https://www.nltk.org/book/ch05.html).\n", + "The function returns a list of (word, POS_tag) tuples" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('are', 'VBP'), ('you', 'PRP'), ('kidding', 'VBG'), ('me', 'PRP'), ('?', '.')]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def POS_tagging(sentence):\n", + " return nltk.pos_tag(sentence)\n", + "\n", + "POS_tagging(tokens[7])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Can you find the words most similar to a given one? Can you write a method that returns a list of (word, similarity) tuples in order of decreasing similarity?" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('fairly', 0.46226567029953003),\n", + " ('pretttty', 0.394969642162323),\n", + " ('farily', 0.3877026438713074),\n", + " ('prettty', 0.37976735830307007),\n", + " ('amazingly', 0.37048882246017456),\n", + " ('prettttty', 0.36940139532089233),\n", + " ('quite', 0.3675154447555542),\n", + " ('ever-so', 0.36676985025405884),\n", + " ('remarkably', 0.3537715673446655),\n", + " ('failry', 0.34980258345603943),\n", + " ('somewhat', 0.3493301272392273),\n", + " ('impressively', 0.3471192717552185),\n", + " ('retty', 0.34354445338249207),\n", + " ('incredibly', 0.3428688049316406),\n", + " ('oretty', 0.34186863899230957)]" + ] + }, + "execution_count": 154, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def most_similar(word, topn=15, neg=[]):\n", + " return vectors.most_similar(positive=[word], negative=neg, topn=topn)\n", + "\n", + "most_similar('pretty', neg=['bad'])" + ] + },
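 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that many of the nearest neighbours above are just misspellings of the query ('pretttty', 'farily'). One way to filter them, mirroring the check used in the reformulation below, is Levenshtein edit distance. A minimal sketch, assuming the most_similar helper defined above; left unexecuted:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Keep only alphabetic neighbours that are not near-duplicate spellings of the query.\n", + "query = 'pretty'\n", + "[w for w, _ in most_similar(query, neg=['bad'])\n", + " if w.isalpha() and nltk.edit_distance(w.lower(), query) >= 4]" + ] + },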
 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's do the simplest reformulation task. We just want to reformulate some sentences, replacing an adjective with a similar one" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "outdoor seating in good weather .\n" + ] + }, + { + "data": { + "text/plain": [ + "'outdoor seating in excellent great weather .'" + ] + }, + "execution_count": 211, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def reformulate_sentence(sentence, bias=['bad'], adjectives_num=2):\n", + " # Sentence tokenization\n", + " tokenized_sentence = tokenize_sentence(sentence)\n", + "# print(tokenized_sentence)\n", + " # Part-of-speech tagging\n", + " POS_tagged_words = POS_tagging(tokenized_sentence)\n", + "# print(POS_tagged_words)\n", + "\n", + " reformulated_sentence_words = []\n", + " for word, pos_tag in POS_tagged_words:\n", + " # If the word is an adjective...\n", + " if pos_tag in ['JJR', 'JJS', 'JJ']:\n", + " try:\n", + " # ...look for the words most similar to it and replace it\n", + " most_similar_words = most_similar(word, neg=bias)\n", + " chosen_words = []\n", + " for similar_word, similarity in most_similar_words:\n", + " # Skip near-duplicate spellings of the original word\n", + " if nltk.edit_distance(similar_word.lower(), word.lower()) < 4:\n", + "# print('skipping', similar_word)\n", + " continue\n", + " if '.' in similar_word:\n", + " continue\n", + " if not similar_word.isalpha():\n", + " continue\n", + " chosen_words.append(similar_word)\n", + " if len(chosen_words) == adjectives_num:\n", + " break\n", + " if not chosen_words:\n", + " chosen_words = [word]\n", + " reformulated_sentence_words.extend(chosen_words)\n", + " except KeyError:\n", + " # gensim raises KeyError for out-of-vocabulary words\n", + " print('There is no {} word in FastText dictionary! ...'.format(word))\n", + " \n", + " else:\n", + " reformulated_sentence_words.append(word)\n", + " # Join the word list into a sentence\n", + " return ' '.join(reformulated_sentence_words)\n", + "print(lines[634])\n", + "reformulate_sentence(lines[634])" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'very enjoyable environment which i will begin to frequent .'" + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reformulate_sentence(lines[163])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sentiment analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [], + "source": [ + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package vader_lexicon to\n", + "[nltk_data] /Users/avshalommanevich/nltk_data...\n" + ] + } + ], + "source": [ + "nltk.download('vader_lexicon')\n", + "from nltk.sentiment.vader import SentimentIntensityAnalyzer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The VADER sentiment classifier from the NLTK library. Its compound score ranges from -1 to 1, where -1 is most negative, 0 is neutral, and 1 is most positive.\n", + "\n", + "Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014." + ] + },
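 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a quick sanity check of the score range, one can score a few hand-written sentences. This is only a sketch; the exact numbers depend on the installed VADER lexicon, so no output is recorded:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compound scores for a clearly positive, roughly neutral and clearly negative sentence.\n", + "demo_analyzer = SentimentIntensityAnalyzer()\n", + "for s in ['the staff was amazing !', 'it is a restaurant .', 'the worst service ever .']:\n", + " print(s, '->', demo_analyzer.polarity_scores(s)['compound'])" + ] + },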
 + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [], + "source": [ + "sentiment_analyzer = SentimentIntensityAnalyzer()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read the dataset text file line by line and put the lines into a list" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [], + "source": [ + "random.shuffle(lines)\n", + "num_of_sentences = 1000\n", + "sample = lines[:num_of_sentences]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read the Yelp dataset from the text file and get 1000 random sentences" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compute the average sentiment of the 1000-sentence set with the VADER sentiment classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.2775767000000003" + ] + }, + "execution_count": 220, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def sentiment(sentence):\n", + " return sentiment_analyzer.polarity_scores(sentence)['compound']\n", + "\n", + "def average_sentiment(sentences):\n", + " return sum(map(sentiment, sentences)) / len(sentences)\n", + "\n", + "\n", + "avg_sentiment = average_sentiment(sample)\n", + "avg_sentiment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Reformulate the sentences and compute the average sentiment again. Try to come up with ways to make sentences more positive on average. What about more negative? Can you come up with some interesting experiment on this data with POS-tagged reformulations?" + ] + },
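 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One possible direction, sketched here before the baseline recomputation below: reformulate_sentence passes its bias list to most_similar as negative words, so bias=['bad'] (the default used below) should push the replacement adjectives away from 'bad', while bias=['good'] should push them away from 'good'. A hypothetical experiment, with no recorded output:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compare average sentiment when biasing the replacements in opposite directions.\n", + "away_from_bad = average_sentiment([reformulate_sentence(s, bias=['bad']) for s in sample])\n", + "away_from_good = average_sentiment([reformulate_sentence(s, bias=['good']) for s in sample])\n", + "print('biased away from bad :', away_from_bad)\n", + "print('biased away from good:', away_from_good)" + ] + },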
 + { + "cell_type": "code", + "execution_count": 215, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n", + "There is no _num_ word in FastText dictionary! ...\n" + ] + } + ], + "source": [ + "avg_sentiment_reformed = average_sentiment(list(map(reformulate_sentence, sample)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 219, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.2993779999999999" + ] + }, + "execution_count": 219, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "avg_sentiment_reformed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/nlp/.ipynb_checkpoints/Maria Osipova - NLP_HW1-checkpoint.ipynb b/nlp/.ipynb_checkpoints/Maria Osipova - NLP_HW1-checkpoint.ipynb index 9ea4919..7526b37 100644 --- a/nlp/.ipynb_checkpoints/Maria Osipova - NLP_HW1-checkpoint.ipynb +++ b/nlp/.ipynb_checkpoints/Maria Osipova - NLP_HW1-checkpoint.ipynb @@ -9,12 +9,22 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /Users/mariao/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + } + ], "source": [ "import numpy as np\n", "import nltk\n", + "from nltk import word_tokenize\n", "import warnings\n", "from gensim import corpora, similarities\n", "from gensim.models import KeyedVectors\n", @@ -22,6 +32,7 @@ " strip_punctuation, stem_text\n", "\n", "warnings.filterwarnings('ignore')\n", + "nltk.download('punkt')\n", "%matplotlib inline" ] }, @@ -62,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -89,41 +100,49 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "with open('Yelp.train.text', 'r') as f:\n", - " yelp_set = np.array(f.readlines())\n", + "yelp_file = 'Yelp.train.text'\n", "\n", - "preprocessors = [\n", - " lambda word: word.lower(), # Lowercase the word.\n", - " strip_multiple_whitespaces, # Remove repeating whitespaces.\n", - "]\n", + "with open(yelp_file, 'r') as f:\n", + " lines = f.read().splitlines()\n", "\n", - "tokenize_sentence = lambda x: preprocess_string(x, preprocessors)\n", - "\n", - "tokens = np.array([tokenize_sentence(sentence) for sentence in yelp_set])" + "tokens = list(map(word_tokenize, lines))" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['i', 'wa', 'sadli', 'mistaken']" + "[['i',\n", + " 'ordered',\n", + " 'it',\n", + " 'without',\n", + " 'lettuce',\n", + " ',',\n", + " 'tomato',\n", + " ',',\n", + " 'onions',\n", + " ',',\n", + " 'or',\n", + " 'dressing',\n", + " '.'],\n", + " ['are', 'you', 'kidding', 'me', '?']]" ] }, - "execution_count": 24, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tokens[0]" + "tokens[9:11]" ] }, { @@ -166,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -196,7 +215,7 @@ }, { "cell_type": "code",
- "execution_count": 22, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -214,7 +233,7 @@ " ('WHOAAAAAA', 0.41194313764572144)]" ] }, - "execution_count": 22, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -234,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -242,15 +261,14 @@ "output_type": "stream", "text": [ "minimal meat and a ton of shredded lettuce .\n", - "\n", - "minim meat and a ton of shread lettuc\n" + "minmal meat and a ton of shreaded lettuce .\n" ] } ], "source": [ "def reformulate_sentence(sentence):\n", " # Sentence tokenization\n", - " tokenized_sentence = tokenize_sentence(sentence)\n", + " tokenized_sentence = word_tokenize(sentence)\n", "\n", " # Part of speech tagging\n", " POS_tagged_words = POS_tagging(tokenized_sentence)\n", @@ -270,8 +288,8 @@ " return ' '.join(reformulated_sentence_words)\n", "\n", "idx = 2\n", - "print(yelp_set[idx])\n", - "print(reformulate_sentence(yelp_set[idx]))" + "print(lines[idx])\n", + "print(reformulate_sentence(lines[idx]))" ] }, { @@ -283,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -292,7 +310,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -321,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -341,8 +359,8 @@ "metadata": {}, "outputs": [], "source": [ - "# TODO: Not clear why should we read it.\n", - "lexicon_list = sentiment_analyzer.lexicon_file.split('\\r\\n')" + "with open(yelp_file, 'r') as f:\n", + " lines = np.array([line for line in f])" ] }, { @@ -354,11 +372,16 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "sentences = yelp_set[np.random.randint(0, 1000, (1000, 1))]\n", + "preprocessors = [\n", + " lambda word: word.lower(), # Lowercase the word.\n", + " strip_multiple_whitespaces, # Remove repeating whitespaces.\n", + "]\n", + "\n", + "sentences = lines[np.random.randint(0, 1000, (1000, 1))]\n", "\n", "processed_sentences = np.array([' '.join(preprocess_string(sentence[0], preprocessors)) for sentence in sentences])" ] @@ -372,16 +395,16 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "-0.1151995" + "-0.1232498" ] }, - "execution_count": 55, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -401,23 +424,23 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['this beer is like champaign and it makes my whole stomach nice and cold .'\n", - " \"that 's where the praise ends .\"\n", - " 'ambiance , like i said , left much to be desired .'\n", + "['the coffee at this place was brewed to perfection which a appreciate .'\n", + " 'this place is huge and was moderately clean except the bathrooms .'\n", + " 'the music the guys played was all stuff from like _num_ .'\n", " 'i really really want this place to do better .'\n", - " 'while sitting there we noticed _num_ other parties walked out as well .'\n", - " \"but good lord , they 're just so loud !\"\n", - " 'thanks for giving me one more reason never to come back .'\n", - " 'too many other good choices nearby to give them a 
second chance - sorry .'\n", " \"but that 's not the worst .\"\n", - " \"but yes , you guessed it , that 's overpriced too .\"]\n" + " 'while sitting there we noticed _num_ other parties walked out as well .'\n", + " '- none of the food was super awesome .'\n", + " \"that 's where the praise ends .\"\n", + " '- none of the food was super awesome .'\n", + " \"however , if you 're looking for something specific , good luck .\"]\n" ] } ], @@ -430,16 +453,16 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array(['thanks'], dtype='