Commit

Add files via upload
robsalgado authored Mar 30, 2019
1 parent 2fe461a commit 8e9a803
Showing 1 changed file with 287 additions and 0 deletions.
287 changes: 287 additions & 0 deletions mulitclass_text_class/nlp_utils_news.ipynb
@@ -0,0 +1,287 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS\n",
"import itertools, string, operator, re, unicodedata, nltk\n",
"from operator import itemgetter\n",
"from nltk.corpus import wordnet\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.tokenize import TweetTokenizer, RegexpTokenizer\n",
"from bs4 import BeautifulSoup\n",
"import numpy as np\n",
"from itertools import combinations\n",
"from gensim.models import Phrases\n",
"from collections import Counter\n",
"\n",
"#Contraction map\n",
"c_dict = {\n",
" \"ain't\": \"am not\",\n",
" \"aren't\": \"are not\",\n",
" \"can't\": \"cannot\",\n",
" \"can't've\": \"cannot have\",\n",
" \"'cause\": \"because\",\n",
" \"could've\": \"could have\",\n",
" \"couldn't\": \"could not\",\n",
" \"couldn't've\": \"could not have\",\n",
" \"didn't\": \"did not\",\n",
" \"doesn't\": \"does not\",\n",
" \"don't\": \"do not\",\n",
" \"hadn't\": \"had not\",\n",
" \"hadn't've\": \"had not have\",\n",
" \"hasn't\": \"has not\",\n",
" \"haven't\": \"have not\",\n",
" \"he'd\": \"he would\",\n",
" \"he'd've\": \"he would have\",\n",
" \"he'll\": \"he will\",\n",
" \"he'll've\": \"he will have\",\n",
" \"he's\": \"he is\",\n",
" \"how'd\": \"how did\",\n",
" \"how'd'y\": \"how do you\",\n",
" \"how'll\": \"how will\",\n",
" \"how's\": \"how is\",\n",
" \"i'd\": \"I would\",\n",
" \"i'd've\": \"I would have\",\n",
" \"i'll\": \"I will\",\n",
" \"i'll've\": \"I will have\",\n",
" \"i'm\": \"I am\",\n",
" \"i've\": \"I have\",\n",
" \"isn't\": \"is not\",\n",
" \"it'd\": \"it had\",\n",
" \"it'd've\": \"it would have\",\n",
" \"it'll\": \"it will\",\n",
" \"it'll've\": \"it will have\",\n",
" \"it's\": \"it is\",\n",
" \"let's\": \"let us\",\n",
" \"ma'am\": \"madam\",\n",
" \"mayn't\": \"may not\",\n",
" \"might've\": \"might have\",\n",
" \"mightn't\": \"might not\",\n",
" \"mightn't've\": \"might not have\",\n",
" \"must've\": \"must have\",\n",
" \"mustn't\": \"must not\",\n",
" \"mustn't've\": \"must not have\",\n",
" \"needn't\": \"need not\",\n",
" \"needn't've\": \"need not have\",\n",
" \"o'clock\": \"of the clock\",\n",
" \"oughtn't\": \"ought not\",\n",
" \"oughtn't've\": \"ought not have\",\n",
" \"shan't\": \"shall not\",\n",
" \"sha'n't\": \"shall not\",\n",
" \"shan't've\": \"shall not have\",\n",
" \"she'd\": \"she would\",\n",
" \"she'd've\": \"she would have\",\n",
" \"she'll\": \"she will\",\n",
" \"she'll've\": \"she will have\",\n",
" \"she's\": \"she is\",\n",
" \"should've\": \"should have\",\n",
" \"shouldn't\": \"should not\",\n",
" \"shouldn't've\": \"should not have\",\n",
" \"so've\": \"so have\",\n",
" \"so's\": \"so is\",\n",
" \"that'd\": \"that would\",\n",
" \"that'd've\": \"that would have\",\n",
" \"that's\": \"that is\",\n",
" \"there'd\": \"there had\",\n",
" \"there'd've\": \"there would have\",\n",
" \"there's\": \"there is\",\n",
" \"they'd\": \"they would\",\n",
" \"they'd've\": \"they would have\",\n",
" \"they'll\": \"they will\",\n",
" \"they'll've\": \"they will have\",\n",
" \"they're\": \"they are\",\n",
" \"they've\": \"they have\",\n",
" \"to've\": \"to have\",\n",
" \"wasn't\": \"was not\",\n",
" \"we'd\": \"we had\",\n",
" \"we'd've\": \"we would have\",\n",
" \"we'll\": \"we will\",\n",
" \"we'll've\": \"we will have\",\n",
" \"we're\": \"we are\",\n",
" \"we've\": \"we have\",\n",
" \"weren't\": \"were not\",\n",
" \"what'll\": \"what will\",\n",
" \"what'll've\": \"what will have\",\n",
" \"what're\": \"what are\",\n",
" \"what's\": \"what is\",\n",
" \"what've\": \"what have\",\n",
" \"when's\": \"when is\",\n",
" \"when've\": \"when have\",\n",
" \"where'd\": \"where did\",\n",
" \"where's\": \"where is\",\n",
" \"where've\": \"where have\",\n",
" \"who'll\": \"who will\",\n",
" \"who'll've\": \"who will have\",\n",
" \"who's\": \"who is\",\n",
" \"who've\": \"who have\",\n",
" \"why's\": \"why is\",\n",
" \"why've\": \"why have\",\n",
" \"will've\": \"will have\",\n",
" \"won't\": \"will not\",\n",
" \"won't've\": \"will not have\",\n",
" \"would've\": \"would have\",\n",
" \"wouldn't\": \"would not\",\n",
" \"wouldn't've\": \"would not have\",\n",
" \"y'all\": \"you all\",\n",
" \"y'alls\": \"you alls\",\n",
" \"y'all'd\": \"you all would\",\n",
" \"y'all'd've\": \"you all would have\",\n",
" \"y'all're\": \"you all are\",\n",
" \"y'all've\": \"you all have\",\n",
" \"you'd\": \"you had\",\n",
" \"you'd've\": \"you would have\",\n",
" \"you'll\": \"you you will\",\n",
" \"you'll've\": \"you you will have\",\n",
" \"you're\": \"you are\",\n",
" \"you've\": \"you have\"\n",
"}\n",
"\n",
"c_re = re.compile('(%s)' % '|'.join(c_dict.keys()))\n",
"\n",
"add_stop = ['', ' ', 'say', 's', 'u', 'ap', 'afp', '...', 'n', '\\\\']\n",
"\n",
"stop_words = ENGLISH_STOP_WORDS.union(add_stop)\n",
"\n",
"tokenizer = TweetTokenizer()\n",
"pattern = r\"(?u)\\b\\w\\w+\\b\" \n",
"\n",
"lemmatizer = WordNetLemmatizer()\n",
"\n",
"punc = list(set(string.punctuation))\n",
"\n",
"def casual_tokenizer(text): #Splits words on white spaces (leaves contractions intact) and splits out trailing punctuation\n",
" tokens = tokenizer.tokenize(text)\n",
" return tokens\n",
"\n",
"#Function to replace the nltk pos tags with the corresponding wordnet pos tag to use the wordnet lemmatizer\n",
"def get_word_net_pos(treebank_tag):\n",
" if treebank_tag.startswith('J'):\n",
" return wordnet.ADJ\n",
" elif treebank_tag.startswith('V'):\n",
" return wordnet.VERB\n",
" elif treebank_tag.startswith('N'):\n",
" return wordnet.NOUN\n",
" elif treebank_tag.startswith('R'):\n",
" return wordnet.ADV\n",
" else:\n",
" return None\n",
" \n",
"def lemma_wordnet(tagged_text):\n",
" final = []\n",
" for word, tag in tagged_text:\n",
" wordnet_tag = get_word_net_pos(tag)\n",
" if wordnet_tag is None:\n",
" final.append(lemmatizer.lemmatize(word))\n",
" else:\n",
" final.append(lemmatizer.lemmatize(word, pos=wordnet_tag))\n",
" return final\n",
"\n",
"def expandContractions(text, c_re=c_re):\n",
" def replace(match):\n",
" return c_dict[match.group(0)]\n",
" return c_re.sub(replace, text)\n",
"\n",
"def remove_html(text):\n",
" soup = BeautifulSoup(text, \"html5lib\")\n",
" tags_del = soup.get_text()\n",
" uni = unicodedata.normalize(\"NFKD\", tags_del)\n",
" bracket_del = re.sub(r'\\[.*?\\]', ' ', uni)\n",
" apostrphe = re.sub('’', \"'\", bracket_del)\n",
" string = apostrphe.replace('\\r',' ')\n",
" string = string.replace('\\n',' ')\n",
" extra_space = re.sub(' +',' ', string)\n",
" return extra_space\n",
"\n",
"def process_text(text):\n",
" soup = BeautifulSoup(text, \"lxml\")\n",
" tags_del = soup.get_text()\n",
" no_html = re.sub('<[^>]*>', '', tags_del)\n",
" tokenized = casual_tokenizer(no_html)\n",
" lower = [item.lower() for item in tokenized]\n",
" decontract = [expandContractions(item, c_re=c_re) for item in lower]\n",
" tagged = nltk.pos_tag(decontract)\n",
" lemma = lemma_wordnet(tagged)\n",
" no_num = [re.sub('[0-9]+', '', each) for each in lemma]\n",
" no_punc = [w for w in no_num if w not in punc]\n",
" no_stop = [w for w in no_punc if w not in stop_words]\n",
" return no_stop\n",
"\n",
"def word_count(text):\n",
" return len(str(text).split(' '))\n",
"\n",
"def word_freq(clean_text_list, top_n):\n",
" \"\"\"\n",
" Word Frequency\n",
" \"\"\"\n",
" flat = [item for sublist in clean_text_list for item in sublist]\n",
" with_counts = Counter(flat)\n",
" top = with_counts.most_common(top_n)\n",
" word = [each[0] for each in top]\n",
" num = [each[1] for each in top]\n",
" return pd.DataFrame([word, num]).T\n",
"\n",
"def word_freq_bigrams(clean_text_list, top_n):\n",
" \"\"\"\n",
" Word Frequency With Bigrams\n",
" \"\"\"\n",
" bigram_model = Phrases(clean_text_list, min_count=2, threshold=1)\n",
" w_bigrams = list(bigram_model[clean_text_list])\n",
" flat_w_bigrams = [item for sublist in w_bigrams for item in sublist]\n",
" with_counts = Counter(flat_w_bigrams)\n",
" top = with_counts.most_common(top_n)\n",
" word = [each[0] for each in top]\n",
" num = [each[1] for each in top]\n",
" return pd.DataFrame([word, num]).T\n",
"\n",
"\n",
"def bigram_freq(clean_text_list, top_n):\n",
" bigram_model = Phrases(clean_text_list, min_count=2, threshold=1)\n",
" w_bigrams = list(bigram_model[clean_text_list])\n",
" flat_w_bigrams = [item for sublist in w_bigrams for item in sublist]\n",
" bigrams = []\n",
" for each in flat_w_bigrams:\n",
" if '_' in each:\n",
" bigrams.append(each)\n",
" counts = Counter(bigrams)\n",
" top = counts.most_common(top_n)\n",
" word = [each[0] for each in top]\n",
" num = [each[1] for each in top]\n",
" return pd.DataFrame([word, num]).T"
]
},
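{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal usage sketch (illustrative only, not part of the original upload): the sample headlines below are made-up, and the nltk `averaged_perceptron_tagger` and `wordnet` data are assumed to have been downloaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Hypothetical sample documents -- swap in a real news corpus\n",
"docs = [\n",
"    \"<p>Stocks didn't rally on Wall Street today.</p>\",\n",
"    \"<p>The central bank won't raise interest rates this year.</p>\",\n",
"    \"<p>Wall Street stocks rally as interest rates hold steady.</p>\"\n",
"]\n",
"\n",
"clean_docs = [process_text(d) for d in docs]   #One token list per document\n",
"print(word_freq(clean_docs, top_n=5))          #Top unigrams as a two-column DataFrame\n",
"print(bigram_freq(clean_docs, top_n=5))        #Top underscore-joined bigrams, if any pass the threshold\n",
"print(word_count(docs[0]))                     #Naive whitespace word count of the raw string"
]
},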
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
