Commit

Add files via upload
robsalgado authored Mar 30, 2019
1 parent 2fe461a commit 8e9a803
Showing 1 changed file with 287 additions and 0 deletions.
287 changes: 287 additions & 0 deletions mulitclass_text_class/nlp_utils_news.ipynb
@@ -0,0 +1,287 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS\n",
"import itertools, string, operator, re, unicodedata, nltk\n",
"from operator import itemgetter\n",
"from nltk.corpus import wordnet\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.tokenize import TweetTokenizer, RegexpTokenizer\n",
"from bs4 import BeautifulSoup\n",
"import numpy as np\n",
"from itertools import combinations\n",
"from gensim.models import Phrases\n",
"from collections import Counter\n",
"\n",
"#Contraction map\n",
"c_dict = {\n",
" \"ain't\": \"am not\",\n",
" \"aren't\": \"are not\",\n",
" \"can't\": \"cannot\",\n",
" \"can't've\": \"cannot have\",\n",
" \"'cause\": \"because\",\n",
" \"could've\": \"could have\",\n",
" \"couldn't\": \"could not\",\n",
" \"couldn't've\": \"could not have\",\n",
" \"didn't\": \"did not\",\n",
" \"doesn't\": \"does not\",\n",
" \"don't\": \"do not\",\n",
" \"hadn't\": \"had not\",\n",
" \"hadn't've\": \"had not have\",\n",
" \"hasn't\": \"has not\",\n",
" \"haven't\": \"have not\",\n",
" \"he'd\": \"he would\",\n",
" \"he'd've\": \"he would have\",\n",
" \"he'll\": \"he will\",\n",
" \"he'll've\": \"he will have\",\n",
" \"he's\": \"he is\",\n",
" \"how'd\": \"how did\",\n",
" \"how'd'y\": \"how do you\",\n",
" \"how'll\": \"how will\",\n",
" \"how's\": \"how is\",\n",
" \"i'd\": \"I would\",\n",
" \"i'd've\": \"I would have\",\n",
" \"i'll\": \"I will\",\n",
" \"i'll've\": \"I will have\",\n",
" \"i'm\": \"I am\",\n",
" \"i've\": \"I have\",\n",
" \"isn't\": \"is not\",\n",
" \"it'd\": \"it had\",\n",
" \"it'd've\": \"it would have\",\n",
" \"it'll\": \"it will\",\n",
" \"it'll've\": \"it will have\",\n",
" \"it's\": \"it is\",\n",
" \"let's\": \"let us\",\n",
" \"ma'am\": \"madam\",\n",
" \"mayn't\": \"may not\",\n",
" \"might've\": \"might have\",\n",
" \"mightn't\": \"might not\",\n",
" \"mightn't've\": \"might not have\",\n",
" \"must've\": \"must have\",\n",
" \"mustn't\": \"must not\",\n",
" \"mustn't've\": \"must not have\",\n",
" \"needn't\": \"need not\",\n",
" \"needn't've\": \"need not have\",\n",
" \"o'clock\": \"of the clock\",\n",
" \"oughtn't\": \"ought not\",\n",
" \"oughtn't've\": \"ought not have\",\n",
" \"shan't\": \"shall not\",\n",
" \"sha'n't\": \"shall not\",\n",
" \"shan't've\": \"shall not have\",\n",
" \"she'd\": \"she would\",\n",
" \"she'd've\": \"she would have\",\n",
" \"she'll\": \"she will\",\n",
" \"she'll've\": \"she will have\",\n",
" \"she's\": \"she is\",\n",
" \"should've\": \"should have\",\n",
" \"shouldn't\": \"should not\",\n",
" \"shouldn't've\": \"should not have\",\n",
" \"so've\": \"so have\",\n",
" \"so's\": \"so is\",\n",
" \"that'd\": \"that would\",\n",
" \"that'd've\": \"that would have\",\n",
" \"that's\": \"that is\",\n",
" \"there'd\": \"there had\",\n",
" \"there'd've\": \"there would have\",\n",
" \"there's\": \"there is\",\n",
" \"they'd\": \"they would\",\n",
" \"they'd've\": \"they would have\",\n",
" \"they'll\": \"they will\",\n",
" \"they'll've\": \"they will have\",\n",
" \"they're\": \"they are\",\n",
" \"they've\": \"they have\",\n",
" \"to've\": \"to have\",\n",
" \"wasn't\": \"was not\",\n",
" \"we'd\": \"we had\",\n",
" \"we'd've\": \"we would have\",\n",
" \"we'll\": \"we will\",\n",
" \"we'll've\": \"we will have\",\n",
" \"we're\": \"we are\",\n",
" \"we've\": \"we have\",\n",
" \"weren't\": \"were not\",\n",
" \"what'll\": \"what will\",\n",
" \"what'll've\": \"what will have\",\n",
" \"what're\": \"what are\",\n",
" \"what's\": \"what is\",\n",
" \"what've\": \"what have\",\n",
" \"when's\": \"when is\",\n",
" \"when've\": \"when have\",\n",
" \"where'd\": \"where did\",\n",
" \"where's\": \"where is\",\n",
" \"where've\": \"where have\",\n",
" \"who'll\": \"who will\",\n",
" \"who'll've\": \"who will have\",\n",
" \"who's\": \"who is\",\n",
" \"who've\": \"who have\",\n",
" \"why's\": \"why is\",\n",
" \"why've\": \"why have\",\n",
" \"will've\": \"will have\",\n",
" \"won't\": \"will not\",\n",
" \"won't've\": \"will not have\",\n",
" \"would've\": \"would have\",\n",
" \"wouldn't\": \"would not\",\n",
" \"wouldn't've\": \"would not have\",\n",
" \"y'all\": \"you all\",\n",
" \"y'alls\": \"you alls\",\n",
" \"y'all'd\": \"you all would\",\n",
" \"y'all'd've\": \"you all would have\",\n",
" \"y'all're\": \"you all are\",\n",
" \"y'all've\": \"you all have\",\n",
" \"you'd\": \"you had\",\n",
" \"you'd've\": \"you would have\",\n",
" \"you'll\": \"you you will\",\n",
" \"you'll've\": \"you you will have\",\n",
" \"you're\": \"you are\",\n",
" \"you've\": \"you have\"\n",
"}\n",
"\n",
"c_re = re.compile('(%s)' % '|'.join(c_dict.keys()))\n",
"\n",
"add_stop = ['', ' ', 'say', 's', 'u', 'ap', 'afp', '...', 'n', '\\\\']\n",
"\n",
"stop_words = ENGLISH_STOP_WORDS.union(add_stop)\n",
"\n",
"tokenizer = TweetTokenizer()\n",
"pattern = r\"(?u)\\b\\w\\w+\\b\" \n",
"\n",
"lemmatizer = WordNetLemmatizer()\n",
"\n",
"punc = list(set(string.punctuation))\n",
"\n",
"def casual_tokenizer(text): #Splits words on white spaces (leaves contractions intact) and splits out trailing punctuation\n",
" tokens = tokenizer.tokenize(text)\n",
" return tokens\n",
"\n",
"#Function to replace the nltk pos tags with the corresponding wordnet pos tag to use the wordnet lemmatizer\n",
"def get_word_net_pos(treebank_tag):\n",
" if treebank_tag.startswith('J'):\n",
" return wordnet.ADJ\n",
" elif treebank_tag.startswith('V'):\n",
" return wordnet.VERB\n",
" elif treebank_tag.startswith('N'):\n",
" return wordnet.NOUN\n",
" elif treebank_tag.startswith('R'):\n",
" return wordnet.ADV\n",
" else:\n",
" return None\n",
" \n",
"def lemma_wordnet(tagged_text):\n",
" final = []\n",
" for word, tag in tagged_text:\n",
" wordnet_tag = get_word_net_pos(tag)\n",
" if wordnet_tag is None:\n",
" final.append(lemmatizer.lemmatize(word))\n",
" else:\n",
" final.append(lemmatizer.lemmatize(word, pos=wordnet_tag))\n",
" return final\n",
"\n",
"def expandContractions(text, c_re=c_re):\n",
" def replace(match):\n",
" return c_dict[match.group(0)]\n",
" return c_re.sub(replace, text)\n",
"\n",
"def remove_html(text):\n",
" soup = BeautifulSoup(text, \"html5lib\")\n",
" tags_del = soup.get_text()\n",
" uni = unicodedata.normalize(\"NFKD\", tags_del)\n",
" bracket_del = re.sub(r'\\[.*?\\]', ' ', uni)\n",
" apostrphe = re.sub('’', \"'\", bracket_del)\n",
" string = apostrphe.replace('\\r',' ')\n",
" string = string.replace('\\n',' ')\n",
" extra_space = re.sub(' +',' ', string)\n",
" return extra_space\n",
"\n",
"def process_text(text):\n",
" soup = BeautifulSoup(text, \"lxml\")\n",
" tags_del = soup.get_text()\n",
" no_html = re.sub('<[^>]*>', '', tags_del)\n",
" tokenized = casual_tokenizer(no_html)\n",
" lower = [item.lower() for item in tokenized]\n",
" decontract = [expandContractions(item, c_re=c_re) for item in lower]\n",
" tagged = nltk.pos_tag(decontract)\n",
" lemma = lemma_wordnet(tagged)\n",
" no_num = [re.sub('[0-9]+', '', each) for each in lemma]\n",
" no_punc = [w for w in no_num if w not in punc]\n",
" no_stop = [w for w in no_punc if w not in stop_words]\n",
" return no_stop\n",
"\n",
"def word_count(text):\n",
" return len(str(text).split(' '))\n",
"\n",
"def word_freq(clean_text_list, top_n):\n",
" \"\"\"\n",
" Word Frequency\n",
" \"\"\"\n",
" flat = [item for sublist in clean_text_list for item in sublist]\n",
" with_counts = Counter(flat)\n",
" top = with_counts.most_common(top_n)\n",
" word = [each[0] for each in top]\n",
" num = [each[1] for each in top]\n",
" return pd.DataFrame([word, num]).T\n",
"\n",
"def word_freq_bigrams(clean_text_list, top_n):\n",
" \"\"\"\n",
" Word Frequency With Bigrams\n",
" \"\"\"\n",
" bigram_model = Phrases(clean_text_list, min_count=2, threshold=1)\n",
" w_bigrams = list(bigram_model[clean_text_list])\n",
" flat_w_bigrams = [item for sublist in w_bigrams for item in sublist]\n",
" with_counts = Counter(flat_w_bigrams)\n",
" top = with_counts.most_common(top_n)\n",
" word = [each[0] for each in top]\n",
" num = [each[1] for each in top]\n",
" return pd.DataFrame([word, num]).T\n",
"\n",
"\n",
"def bigram_freq(clean_text_list, top_n):\n",
" bigram_model = Phrases(clean_text_list, min_count=2, threshold=1)\n",
" w_bigrams = list(bigram_model[clean_text_list])\n",
" flat_w_bigrams = [item for sublist in w_bigrams for item in sublist]\n",
" bigrams = []\n",
" for each in flat_w_bigrams:\n",
" if '_' in each:\n",
" bigrams.append(each)\n",
" counts = Counter(bigrams)\n",
" top = counts.most_common(top_n)\n",
" word = [each[0] for each in top]\n",
" num = [each[1] for each in top]\n",
" return pd.DataFrame([word, num]).T"
]
},
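{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal usage sketch (illustrative only, not part of the original upload): the sample headlines below are made-up, and the nltk `averaged_perceptron_tagger` and `wordnet` data are assumed to have been downloaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Hypothetical sample documents -- swap in a real news corpus\n",
"docs = [\n",
"    \"<p>Stocks didn't rally on Wall Street today.</p>\",\n",
"    \"<p>The central bank won't raise interest rates this year.</p>\",\n",
"    \"<p>Wall Street stocks rally as interest rates hold steady.</p>\"\n",
"]\n",
"\n",
"clean_docs = [process_text(d) for d in docs]   #One token list per document\n",
"print(word_freq(clean_docs, top_n=5))          #Top unigrams as a two-column DataFrame\n",
"print(bigram_freq(clean_docs, top_n=5))        #Top underscore-joined bigrams, if any pass the threshold\n",
"print(word_count(docs[0]))                     #Naive whitespace word count of the raw string"
]
},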
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
