import pandas as pd
import re
import numpy as np

from pathlib import Path

# from stanfordcorenlp import StanfordCoreNLP
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import shakespeare, brown
from functools import reduce
from collections import defaultdict
import math
from gensim.models import Word2Vec
import multiprocessing

# nltk.download('shakespeare')
# nltk.download('brown')

# Uncomment if run for the first time
# nltk.download('words')
# nltk.download('maxent_ne_chunker_tab')
# nltk.download('averaged_perceptron_tagger_eng')

# Constants
TERMS = ['battle', 'good', 'fool', 'wit']
WORD_WINDOW = 2


class WordVector:
    def __init__(self):
        self.nltk_skipgram()

    def skipgram_model(self):
        """
        Train a classifier that is given a candidate (word, context) pair
        (apricot, jam) -> P(+|apricot, jam) high
        (apricot, aardvark) -> P(-|apricot, aardvark) high
        and assigns each pair a probability:
        c is a real context word for +
        P(+|w,c)
        P(-|w,c) = 1 - P(+|w,c)
        """
        target_word = 'apricot'
        train_sentence = 'lemon, a tablespoon of apricot jam, a pinch'
        print('-' * 20)
        print(f'Target word: {target_word}')
        print(f'Train sentence: {train_sentence}')
        print(f'Word window: +/- {WORD_WINDOW} context words')

        # Positive examples: the words within the window around the target word
        train_sentence_lst = train_sentence.split()
        print(train_sentence_lst)
        target_ind = train_sentence_lst.index(target_word)
        context_words = (train_sentence_lst[target_ind - WORD_WINDOW:target_ind]
                         + train_sentence_lst[target_ind + 1:target_ind + WORD_WINDOW + 1])
        print(f'Positive examples: {context_words}')

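    # Illustrative addition, not part of the original code: a minimal sketch of
    # how the probabilities described in the docstring above, P(+|w,c) = sigmoid(c . w),
    # could be scored with toy random embeddings. The method name, the toy vocabulary
    # and the embedding size are assumptions; real skip-gram training would learn
    # these vectors by gradient descent instead of sampling them randomly.
    def skipgram_probability_sketch(self):
        rng = np.random.default_rng(0)
        dim = 8  # toy embedding size, chosen arbitrarily
        vocab = ['apricot', 'jam', 'tablespoon', 'of', 'a', 'aardvark']
        # Separate target and context embeddings, as in skip-gram with negative sampling
        target_emb = {w: rng.normal(size=dim) for w in vocab}
        context_emb = {w: rng.normal(size=dim) for w in vocab}

        def p_positive(w, c):
            # P(+|w,c) = sigmoid(c . w)
            return 1.0 / (1.0 + np.exp(-np.dot(target_emb[w], context_emb[c])))

        for context in ['jam', 'aardvark']:
            p_pos = p_positive('apricot', context)
            print(f"P(+|apricot, {context}) = {p_pos:.3f}, "
                  f"P(-|apricot, {context}) = {1 - p_pos:.3f}")
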
    def nltk_skipgram(self):
        sentences = brown.sents()
        EMB_DIM = 300

        # Train the model. Default gensim Word2Vec parameters, for reference:
        # sentences=None, vector_size=100, alpha=0.025, window=5,
        # min_count=5, max_vocab_size=None, sample=0.001,
        # seed=1, workers=3, min_alpha=0.0001, sg=0,
        # hs=0, negative=5, cbow_mean=1,
        # hashfxn=<built-in function hash>, epochs=5, null_word=0,
        # trim_rule=None, sorted_vocab=1, batch_words=10000
        w2v = Word2Vec(sentences, vector_size=EMB_DIM, window=5, min_count=5,
                       negative=15, workers=multiprocessing.cpu_count())

        word_vectors = w2v.wv
        result = word_vectors.similar_by_word('Saturday')
        print("Most similar to 'Saturday':\n", result[:3])

        result = word_vectors.similar_by_word('money')
        print("Most similar to 'money':\n", result[:3])

        result = word_vectors.similar_by_word('child')
        print("Most similar to 'child':\n", result[:3])

        result = word_vectors.most_similar(positive=['child'], negative=['person'])
        print("Most similar to 'child' but dissimilar to 'person':\n", result[:3])

        # Not present in vocabulary, so the query would raise a KeyError
        # result = word_vectors.most_similar(positive=['king', 'woman'], negative=['man'])
        # print("Most similar to 'king' and 'woman' but dissimilar to 'man':\n", result[:3])

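    # Illustrative addition, not part of the original code: a guarded version of
    # the analogy query commented out above. It checks key_to_index before
    # querying so out-of-vocabulary words are skipped instead of raising a
    # KeyError. The method name and arguments are assumptions; it would be
    # called with the `w2v.wv` object built in nltk_skipgram().
    def safe_analogy_sketch(self, word_vectors, positive, negative):
        missing = [w for w in positive + negative if w not in word_vectors.key_to_index]
        if missing:
            print(f'Skipping analogy query, out-of-vocabulary words: {missing}')
            return None
        result = word_vectors.most_similar(positive=positive, negative=negative)
        print(f'Most similar to {positive} but dissimilar to {negative}:\n', result[:3])
        return result
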
    def cosine_example(self):
        """
        Compute the cosine similarity
        between cherry and information
        """
        columns = ['pie', 'data', 'computer']
        data = [(442, 8, 2), (5, 1683, 1670), (5, 3982, 3325)]
        index_ = ['cherry', 'digital', 'information']
        df = pd.DataFrame(data, columns=columns, index=index_)

        def calculate_cosine(a, b):
            up = np.dot(df.loc[a].values, df.loc[b].values)
            down = np.linalg.norm(df.loc[a].values) * np.linalg.norm(df.loc[b].values)
            print(f'Cosine similarity between "{a}" and "{b}" is {round(up / down, 4)}.')
            return

        calculate_cosine(a='cherry', b='information')
        calculate_cosine(a='digital', b='information')

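    # Illustrative addition, not part of the original code: the same cosine
    # computation for every pair of rows at once, using the toy co-occurrence
    # counts from cosine_example(). The method name is an assumption.
    def cosine_matrix_sketch(self):
        columns = ['pie', 'data', 'computer']
        data = [(442, 8, 2), (5, 1683, 1670), (5, 3982, 3325)]
        index_ = ['cherry', 'digital', 'information']
        df = pd.DataFrame(data, columns=columns, index=index_)

        # Normalise each row vector to unit length; the dot products of the
        # normalised rows are then exactly the pairwise cosine similarities.
        m = df.values / np.linalg.norm(df.values, axis=1, keepdims=True)
        sim = pd.DataFrame(m @ m.T, index=index_, columns=index_)
        print(round(sim, 4))
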
    def load_local_data(self):
        print('Which plays of Shakespeare contain the words?')
        df = pd.DataFrame(index=TERMS)

        # Load four plays
        as_you_like_it_txt = Path('./data/as_you_like_it.txt').read_text()
        twelve_night_txt = Path('./data/twelve_night.txt').read_text()
        julius_caesar_txt = Path('./data/julius_caesar.txt').read_text()
        henri_v_txt = Path('./data/henri_v.txt').read_text()

        for txt, title_ in zip([as_you_like_it_txt, twelve_night_txt, julius_caesar_txt, henri_v_txt],
                               ['as_you_like_it_txt', 'twelve_night_txt', 'julius_caesar_txt', 'henri_v_txt']):
            column_if_present = [1 if t.lower() in txt else 0 for t in TERMS]
            column_tf = [len(re.findall(rf'\b{t.lower()}\b', txt.lower())) if t.lower() in txt else 0 for t in TERMS]
            column_log_tf = [math.log10(t + 1) if t > 0 else 0 for t in column_tf]  # log10(count(t,d) + 1)
            df[f'{title_}_boolean'] = column_if_present
            df[f'{title_}_TF'] = column_tf
            df[f'{title_}_LOGTF'] = column_log_tf

        N = 4
        boolean_columns = [c for c in df.columns if 'boolean' in c]
        df['df'] = df[boolean_columns].sum(axis=1, numeric_only=True)

        df['idf'] = np.log10(N / df['df'])
        TF_columns = [c for c in df.columns if '_LOGTF' in c]

        for c in TF_columns:
            title = re.search(r'^(.*?)_LOGTF', c).group(1)
            df[f'{title}_TFIDF'] = df[c] * df['idf']

        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            print(round(df, 2))

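    # Illustrative addition, not part of the original code: once a term-document
    # matrix with TFIDF columns has been built (as in load_local_data above),
    # documents can be compared by the cosine of their TFIDF column vectors.
    # This sketch assumes it is handed that DataFrame; load_local_data would
    # have to `return df` for it to be used this way.
    def tfidf_document_similarity_sketch(self, df):
        tfidf_columns = [c for c in df.columns if c.endswith('_TFIDF')]
        for i, c1 in enumerate(tfidf_columns):
            for c2 in tfidf_columns[i + 1:]:
                v1, v2 = df[c1].values, df[c2].values
                denom = np.linalg.norm(v1) * np.linalg.norm(v2)
                cos = np.dot(v1, v2) / denom if denom > 0 else 0.0
                print(f'cosine({c1}, {c2}) = {round(cos, 4)}')
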
    def load_data(self):
        # Build a term-document matrix from the NLTK Shakespeare corpus
        plays = shakespeare.fileids()
        print('Which plays of Shakespeare contain the words?')
        df = pd.DataFrame(index=TERMS)

        for p in plays:

            def list_to_string(list_):
                # Support function to preprocess a list of strings
                temp = ' '.join(list_)
                temp = re.sub(r'\n', '', temp)
                temp = temp.lower()
                return temp

            # Load a play
            play = shakespeare.xml(f'{p}')

            # Title is the only element, use [0][0] to extract it
            title_ = [list(el.itertext()) for el in play if el.tag == 'TITLE'][0][0]
            full_text = [list(el.itertext()) for el in play if el.tag == 'ACT']
            # Flatten the list
            text_ = reduce(lambda x, y: x + y, full_text)
            # Apply custom function to clean
            txt = list_to_string(text_)

            # Create a column flagging whether a term is present in the play
            column_if_present = [1 if t.lower() in txt else 0 for t in TERMS]

            column_tf = [len(re.findall(rf'\b{t.lower()}\b', txt.lower())) if t.lower() in txt else 0 for t in TERMS]
            column_log_tf = [1 + math.log10(t) if t > 0 else 0 for t in column_tf]

            df[f'{title_}_boolean'] = column_if_present
            df[f'{title_}_TF_'] = column_tf
            df[f'{title_}_TF'] = column_log_tf

        # Calculate inverse document frequency / one value per collection
        # We have a different collection compared to the example in the lecture, so the final numbers differ
        N = len(plays)
        boolean_columns = [c for c in df.columns if 'boolean' in c]
        df['df'] = df[boolean_columns].sum(axis=1, numeric_only=True)

        df['idf'] = np.log10(N / df['df'])  # should have checked for zero

        # Use only the log-TF columns (ending in '_TF'), not the raw counts ('_TF_')
        TF_columns = [c for c in df.columns if c.endswith('_TF')]

        for c in TF_columns:
            title = re.search(r'^(.*?)_TF', c).group(1)
            df[f'{title}_TFIDF'] = df[c] * df['idf']

        print('Document matrix ---------- ')
        columns_name_tf_idf = [c for c in df.columns if '_TFIDF' in c]
        columns_name_tf = [c for c in df.columns if '_TF_' in c]

        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None):  # more options can be specified also
            print(df[columns_name_tf])

        return


class EntityData:
    def __init__(self):
        self.extract_entity()

    def extract_entity(self):
        sample_text = """Jane Villanueva of United, a unit of United Airlines Holding, said the fare applies to the Chicago route."""

        # Tokenization: split the sample_text into a list of words or tokens
        tokens = nltk.word_tokenize(sample_text)

        # Tagging
        tagged_tokens = nltk.pos_tag(tokens)

        # Extract entities
        entities = nltk.ne_chunk(tagged_tokens)

        print('-' * 30)
        print('BIO tagging')
        print(f'Sample text: {sample_text}')

        words = [e.leaves()[0][0] if isinstance(e, nltk.Tree) else e[0] for e in entities]
        labels = [e.label() if isinstance(e, nltk.Tree) else e[1] for e in entities]

        df = pd.DataFrame({'words': words, 'labels': labels})
        print(df)

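    # Illustrative addition, not part of the original code: extract_entity()
    # prints entity labels but not actual B-/I-/O prefixes. nltk.chunk.tree2conlltags
    # converts the ne_chunk tree into (word, POS, IOB-tag) triples, which is the
    # BIO scheme referred to in the print above. The method name is an assumption.
    def bio_tags_sketch(self):
        from nltk.chunk import tree2conlltags

        sample_text = """Jane Villanueva of United, a unit of United Airlines Holding, said the fare applies to the Chicago route."""
        tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sample_text)))
        iob_triples = tree2conlltags(tree)
        print(pd.DataFrame(iob_triples, columns=['word', 'pos', 'bio_tag']))
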
    def do_part_of_speech_tagging(self):
        txt = """There were 70 children there. Preliminary findings were reported in today's New England Journal of Medicine."""

        tokenized = sent_tokenize(txt)

        for i in tokenized:
            wordsList = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(wordsList)
            print(tagged)


class StanfordNLP:
    # Requires the `stanfordcorenlp` package (see the commented import at the top)
    # and a Stanford CoreNLP server running at the given host/port.
    def __init__(self, host='http://localhost', port=9000):
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG
        self.props = {
            'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def word_tokenize(self, sentence):
        return self.nlp.word_tokenize(sentence)

    def pos(self, sentence):
        return self.nlp.pos_tag(sentence)

    def ner(self, sentence):
        return self.nlp.ner(sentence)

    def parse(self, sentence):
        return self.nlp.parse(sentence)

    def dependency_parse(self, sentence):
        return self.nlp.dependency_parse(sentence)

    def annotate(self, sentence):
        return json.loads(self.nlp.annotate(sentence, properties=self.props))

    @staticmethod
    def tokens_to_dict(_tokens):
        tokens = defaultdict(dict)
        for token in _tokens:
            tokens[int(token['index'])] = {
                'word': token['word'],
                'lemma': token['lemma'],
                'pos': token['pos'],
                'ner': token['ner']
            }
        return tokens

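# Illustrative addition, not part of the original code: a sketch of how the
# StanfordNLP wrapper above might be used once the stanfordcorenlp import is
# uncommented and a CoreNLP server is listening on localhost:9000. The function
# name, the sample sentence, and the JSON field layout are assumptions.
def stanford_nlp_demo_sketch():
    sNLP = StanfordNLP()
    text = 'Jane Villanueva of United said the fare applies to the Chicago route.'
    print('Tokens:', sNLP.word_tokenize(text))
    print('POS:', sNLP.pos(text))
    print('NER:', sNLP.ner(text))
    annotation = sNLP.annotate(text)
    print('Token dict for first sentence:',
          StanfordNLP.tokens_to_dict(annotation['sentences'][0]['tokens']))
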
class NRC_VAD:
    def __init__(self):
        self.arousal = pd.read_csv('./data/arousal-NRC-VAD-Lexicon.txt',
                                   sep='\t', header=None, names=['word', 'score'])
        self.dominance = pd.read_csv('./data/dominance-NRC-VAD-Lexicon.txt',
                                     sep='\t', header=None, names=['word', 'score'])
        self.valence = pd.read_csv('./data/valence-NRC-VAD-Lexicon.txt',
                                   sep='\t', header=None, names=['word', 'score'])

    def search(self, word_):
        print(f'Scores for "{word_}"')
        print(f" - arousal {self.arousal.query('word==@word_')['score'].values[0]}")
        print(f" - dominance {self.dominance.query('word==@word_')['score'].values[0]}")
        print(f" - valence {self.valence.query('word==@word_')['score'].values[0]}")


# ----------------
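
# Illustrative addition, not part of the original code: a minimal entry-point
# sketch showing how the classes above might be exercised. Only calls that the
# file itself defines are used; which of them to run is an assumption.
if __name__ == '__main__':
    wv = WordVector()          # __init__ trains the Word2Vec model on the Brown corpus
    wv.skipgram_model()        # positive skip-gram examples for 'apricot'
    wv.cosine_example()        # cosine similarity on the toy co-occurrence counts
    wv.load_data()             # term-document matrix from the NLTK Shakespeare corpus

    EntityData()               # __init__ runs the NER / BIO-tagging example

    # NRC_VAD().search('happy')        # needs the NRC-VAD lexicon files in ./data
    # stanford_nlp_demo_sketch()       # needs a running CoreNLP server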