Commit aa8f425

Update 02 vectors
1 parent ade471f commit aa8f425

File tree

4 files changed: +354 −2 lines changed


NLP_02_Vector/read.me

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 Named Entity Recognition

 Named Entity is anything that can be referred to as a proper name
-proper name is a full phrase, 4 tags most commeon
+proper name is a full phrase, 4 tags most common
 PER (person) Marie Curie
 LOC (location) New York City
 GPE (geo-political entity) Boulder, Colorado
@@ -106,7 +106,7 @@ large values in the same dimensions
 - normalize by dividing by the length

 It is the cos of the angle btw
-from 1 (vectors pointng to same direction)
+from 1 (vectors pointing to same direction)
 to -1 (opposite),
 frequencies are not negative, so cos is from 0 to 1
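As context for the second hunk: the cosine the note describes is the angle-based similarity between two count vectors. A minimal sketch, reusing the cherry/information co-occurrence counts that appear in data_manager.py below:

    import numpy as np

    v = np.array([442, 8, 2])       # 'cherry' counts for (pie, data, computer)
    w = np.array([5, 3982, 3325])   # 'information' counts
    cos = np.dot(v, w) / (np.linalg.norm(v) * np.linalg.norm(w))
    # the counts are non-negative, so the result lands in [0, 1]
    print(round(cos, 4))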
NLP_02_Vector/src/data_manager.py

Lines changed: 317 additions & 0 deletions
@@ -0,0 +1,317 @@
import pandas as pd
import re
import numpy as np

from pathlib import Path

# from stanfordcorenlp import StanfordCoreNLP  # needed by the StanfordNLP class below
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import shakespeare, brown
from functools import reduce
from collections import defaultdict  # used by StanfordNLP.tokens_to_dict
import math
from gensim.models import Word2Vec
import multiprocessing

# nltk.download('shakespeare')
# nltk.download('brown')

# Uncomment if run for the first time
# nltk.download('words')
# nltk.download('maxent_ne_chunker_tab')
# nltk.download('averaged_perceptron_tagger_eng')

# Constants
TERMS = ['battle', 'good', 'fool', 'wit']
WORD_WINDOW = 2


class WordVector:
    def __init__(self):
        self.nltk_skipgram()

    def skipgram_model(self):
        """
        Train a classifier that is given a candidate (word, context) pair
        (apricot, jam) -> P(+|apricot, jam) high
        (apricot, aardvark) -> P(-|apricot, aardvark) high
        and assigns each pair a probability:
        c is a real context word for +
        P(+|w,c)
        P(-|w,c) = 1 - P(+|w,c)
        """
        target_word = 'apricot'
        train_sentence = 'lemon, a tablespoon of apricot jam, a pinch'
        print('-' * 20)
        print(f'Target word : {target_word}')
        print(f'Train sentence : {train_sentence}')
        print(f'Word window : +/- {WORD_WINDOW} context words')

        # Locate the target word and take WORD_WINDOW words on each side as positive examples
        train_sentence_lst = train_sentence.split()
        print(train_sentence_lst)
        target_ind = train_sentence_lst.index(target_word)
        context_words = (train_sentence_lst[target_ind - WORD_WINDOW:target_ind]
                         + train_sentence_lst[target_ind + 1:target_ind + WORD_WINDOW + 1])
        print(f'Positive examples : {context_words}')
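        # A minimal sketch of the score behind those probabilities, with made-up
        # toy values (w_emb and c_emb are hypothetical, not learned here):
        # skip-gram takes P(+|w,c) as the sigmoid of the embedding dot product.
        w_emb = np.array([0.5, -0.2, 0.1])
        c_emb = np.array([0.4, -0.1, 0.3])
        p_positive = 1 / (1 + np.exp(-np.dot(c_emb, w_emb)))  # P(+|w,c)
        print(f'Toy P(+|w,c) = {p_positive:.3f}, P(-|w,c) = {1 - p_positive:.3f}')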

    def nltk_skipgram(self):
        sentences = brown.sents()
        EMB_DIM = 300

        # Train the model. Reference signature (gensim 3.x; in gensim 4
        # 'size' became 'vector_size' and 'iter' became 'epochs'):
        # sentences=None, size=100, alpha=0.025, window=5,
        # min_count=5, max_vocab_size=None, sample=0.001,
        # seed=1, workers=3, min_alpha=0.0001, sg=0,
        # hs=0, negative=5, cbow_mean=1,
        # hashfxn=<built-in function hash>, iter=5, null_word=0,
        # trim_rule=None, sorted_vocab=1, batch_words=10000)

        w2v = Word2Vec(sentences, vector_size=EMB_DIM, window=5, min_count=5,
                       negative=15, workers=multiprocessing.cpu_count())

        word_vectors = w2v.wv
        result = word_vectors.similar_by_word('Saturday')
        print("Most similar to 'Saturday':\n", result[:3])

        result = word_vectors.similar_by_word('money')
        print("Most similar to 'money':\n", result[:3])

        result = word_vectors.similar_by_word('child')
        print("Most similar to 'child':\n", result[:3])

        result = word_vectors.most_similar(positive=['child'], negative=['person'])
        print("Most similar to 'child' but dissimilar to 'person':\n", result[:3])

        # 'king, woman' as a single string is one key, not present in the
        # vocabulary; the words must be separate list items
        # result = word_vectors.most_similar(positive=['king', 'woman'], negative=['man'])
        # print("Most similar to 'king' and 'woman' but dissimilar to 'man':\n", result[:3])

    def cosine_example(self):
        """
        Compute the similarity
        between cherry and information
        """

        columns = ['pie', 'data', 'computer']
        data = [(442, 8, 2), (5, 1683, 1670), (5, 3982, 3325)]
        index_ = ['cherry', 'digital', 'information']
        df = pd.DataFrame(data, columns=columns, index=index_)

        def calculate_cosine(a, b):
            up = np.dot(df.loc[a].values, df.loc[b].values)
            down = np.linalg.norm(df.loc[a].values) * np.linalg.norm(df.loc[b].values)

            print(f'Cosine similarity between "{a}" and "{b}" is {round(up / down, 4)}.')
            return

        calculate_cosine(a='cherry', b='information')
        calculate_cosine(a='digital', b='information')
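        # The counts follow the cherry/digital co-occurrence example from
        # Jurafsky & Martin: expect a value near 0 for the unrelated pair
        # (cherry, information) and near 1 for (digital, information).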

    def load_local_data(self):

        print('Which plays of Shakespeare contain the words?')
        df = pd.DataFrame(index=TERMS)

        # Load four plays
        as_you_like_it_txt = Path('./data/as_you_like_it.txt').read_text()
        twelve_night_txt = Path('./data/twelve_night.txt').read_text()
        julius_caesar_txt = Path('./data/julius_caesar.txt').read_text()
        henri_v_txt = Path('./data/henri_v.txt').read_text()

        for txt, title_ in zip([as_you_like_it_txt, twelve_night_txt, julius_caesar_txt, henri_v_txt],
                               ['as_you_like_it_txt', 'twelve_night_txt', 'julius_caesar_txt', 'henri_v_txt']):
            column_if_present = [1 if t.lower() in txt else 0 for t in TERMS]
            column_tf = [len(re.findall(f'\\b{t.lower()}\\b', txt.lower())) for t in TERMS]
            column_log_tf = [math.log10(t + 1) if t > 0 else 0 for t in column_tf]  # log10(count(t,d) + 1)
            df[f'{title_}_boolean'] = column_if_present
            df[f'{title_}_TF'] = column_tf
            df[f'{title_}_LOGTF'] = column_log_tf

        # Document frequency: in how many of the N = 4 plays each term appears
        N = 4
        boolean_columns = [c for c in df.columns if 'boolean' in c]
        df['df'] = df[boolean_columns].sum(axis=1, numeric_only=True)

        df['idf'] = np.log10(N / df['df'].replace(0, np.nan))  # guard against terms in no play
        TF_columns = [c for c in df.columns if '_LOGTF' in c]

        for c in TF_columns:
            title = re.search(r'^(.*?)_LOGTF', c).group(1)
            df[f'{title}_TFIDF'] = df[c] * df['idf']

        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(round(df, 2))
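        # Weighting scheme implemented above:
        #   tf(t, d) = log10(count(t, d) + 1)
        #   idf(t)   = log10(N / df(t)), df(t) = number of plays containing t
        #   w(t, d)  = tf(t, d) * idf(t)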

    def load_data(self):
        # Return a term-document matrix
        plays = shakespeare.fileids()
        print('Which plays of Shakespeare contain the words?')
        df = pd.DataFrame(index=TERMS)

        for p in plays:

            def list_to_string(list_):
                # Support function to preprocess a list of strings
                temp = ' '.join(list_)
                temp = re.sub(r'\n', '', temp)
                temp = temp.lower()
                return temp

            # Load a play
            play = shakespeare.xml(f'{p}')

            # Title is the only element, use [0][0] to extract it
            title_ = [list(p.itertext()) for p in play if p.tag == 'TITLE'][0][0]
            full_text = [list(p.itertext()) for p in play if p.tag == 'ACT']
            # Flatten the list
            text_ = reduce(lambda x, y: x + y, full_text)
            # Apply custom function to clean
            txt = list_to_string(text_)

            # Create a column for whether a term is present in the play
            column_if_present = [1 if t.lower() in txt else 0 for t in TERMS]

            column_tf = [len(re.findall(f'\\b{t.lower()}\\b', txt)) for t in TERMS]
            column_log_tf = [1 + math.log10(t) if t > 0 else 0 for t in column_tf]

            df[f'{title_}_boolean'] = column_if_present
            df[f'{title_}_TF_'] = column_tf
            df[f'{title_}_TF'] = column_log_tf

        # Calculate inverse document frequency / one value per collection
        # We have a different collection compared to the example in the lecture, so final numbers differ
        N = len(plays)
        boolean_columns = [c for c in df.columns if 'boolean' in c]
        df['df'] = df[boolean_columns].sum(axis=1, numeric_only=True)

        df['idf'] = np.log10(N / df['df'].replace(0, np.nan))  # guard against df = 0

        # Use only the log-scaled '_TF' columns; a plain "'_TF' in c" test would
        # also match the raw '_TF_' counts
        TF_columns = [c for c in df.columns if c.endswith('_TF')]

        for c in TF_columns:
            title = re.search(r'^(.*?)_TF', c).group(1)
            df[f'{title}_TFIDF'] = df[c] * df['idf']

        print('Document matrix ---------- ')
        columns_name_tf_idf = [c for c in df.columns if '_TFIDF' in c]
        columns_name_tf = [c for c in df.columns if '_TF_' in c]

        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):  # more options can be specified also
            print(df[columns_name_tf])

        return


class EntityData:
    def __init__(self):
        self.extract_entity()

    def extract_entity(self):

        sample_text = """Jane Villanueva of United, a unit of United Airlines Holding, said the fare applies to the Chicago route."""

        # Tokenization: split the sample_text into a list of words or tokens
        tokens = nltk.word_tokenize(sample_text)

        # Tagging
        tagged_tokens = nltk.pos_tag(tokens)

        # Extract entities
        entities = nltk.ne_chunk(tagged_tokens)

        print('-' * 30)
        print('BIO tagging')
        print(f'Sample text : {sample_text}')

        # Named entities come back as subtrees; join their leaves so a
        # multi-word name such as "Jane Villanueva" stays in one row
        words = [' '.join(w for w, _ in e.leaves()) if isinstance(e, nltk.Tree) else e[0] for e in entities]
        labels = [e.label() if isinstance(e, nltk.Tree) else e[1] for e in entities]

        df = pd.DataFrame({'words': words, 'labels': labels})
        print(df)
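        # Note: nltk.ne_chunk returns labelled subtrees (PERSON, ORGANIZATION,
        # GPE, ...) rather than explicit B-/I-/O- prefixes; tokens outside any
        # entity keep their POS tag, playing the role of the O class here.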

    def do_part_of_speech_tagging(self):
        txt = """There were 70 children there. Preliminary findings were reported in today's New England Journal of Medicine."""

        tokenized = sent_tokenize(txt)

        for i in tokenized:
            wordsList = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(wordsList)
            print(tagged)


class StanfordNLP:
    # Requires the stanfordcorenlp package (see the commented import at the top)
    # and a CoreNLP server listening on the given host and port
    def __init__(self, host='http://localhost', port=9000):
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG)
        self.props = {
            'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def word_tokenize(self, sentence):
        return self.nlp.word_tokenize(sentence)

    def pos(self, sentence):
        return self.nlp.pos_tag(sentence)

    def ner(self, sentence):
        return self.nlp.ner(sentence)

    def parse(self, sentence):
        return self.nlp.parse(sentence)

    def dependency_parse(self, sentence):
        return self.nlp.dependency_parse(sentence)

    def annotate(self, sentence):
        return json.loads(self.nlp.annotate(sentence, properties=self.props))

    @staticmethod
    def tokens_to_dict(_tokens):
        tokens = defaultdict(dict)
        for token in _tokens:
            tokens[int(token['index'])] = {
                'word': token['word'],
                'lemma': token['lemma'],
                'pos': token['pos'],
                'ner': token['ner']
            }
        return tokens


class NRC_VAD:
    def __init__(self):
        self.arousal = pd.read_csv('./data/arousal-NRC-VAD-Lexicon.txt',
                                   sep='\t', header=None, names=['word', 'score'])
        self.dominance = pd.read_csv('./data/dominance-NRC-VAD-Lexicon.txt',
                                     sep='\t', header=None, names=['word', 'score'])
        self.valence = pd.read_csv('./data/valence-NRC-VAD-Lexicon.txt',
                                   sep='\t', header=None, names=['word', 'score'])

    def search(self, word_):
        print(f'Scores for "{word_}"')
        print(f" - arousal {self.arousal.query('word==@word_')['score'].values[0]}")
        print(f" - dominance {self.dominance.query('word==@word_')['score'].values[0]}")
        print(f" - valence {self.valence.query('word==@word_')['score'].values[0]}")

# ----------------

NLP_02_Vector/src/main.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
from data_manager import WordVector

word_vector = WordVector()
# word_vector.cosine_example()

# For the sections below, also import EntityData, NRC_VAD and StanfordNLP from data_manager

# Initiate class and extract entity
# data = EntityData()

# Part of speech tagging
# data.do_part_of_speech_tagging()

# VAD
# vad = NRC_VAD()
# vad.search(word_='toxic')

# sNLP = StanfordNLP()
#
# print('-'*30)
# # text = 'A blog post using Stanford CoreNLP Server. Visit www.khalidalnajjar.com for more details.'
# text = """Jane Villanueva of United, a unit of United Airlines Holding, said the fare applies to the Chicago route."""
# text1 = """There were 70 children there. Preliminary findings were reported in today's New England Journal of Medicine."""
#
# # print("Annotate:", sNLP.annotate(text))
# # print("POS:", sNLP.pos(text))
# # print("Tokens:", sNLP.word_tokenize(text))
# for txt in [text, text1]:
#     print("NER:", sNLP.ner(txt))
#     print("POS:", sNLP.pos(txt))
# # print("Parse:", sNLP.parse(text))
# # print("Dep Parse:", sNLP.dependency_parse(text))

# ------------

README.md

Lines changed: 3 additions & 0 deletions
@@ -42,6 +42,9 @@ Logistic regression for simplified sentiment analysis.
 **NLP 02 TF IDF**<br>
 Information Retrieval: tf.idf, evaluate search engines.

+**NLP 02 Vector**<br>
+Words and vectors.
+
 ## Naming convention
 Code: NLP for Natural Language Processing<br>
 01: difficulty of the concept<br>