Commit: pep8
diogobernini authored Nov 30, 2017
1 parent 9c77a1a commit 9ffca02
Showing 1 changed file with 30 additions and 30 deletions.
60 changes: 30 additions & 30 deletions text_classification/classify_text.py
@@ -11,45 +11,46 @@

# a translation table mapping the ordinal of every Unicode punctuation character to None
tbl = dict.fromkeys(i for i in xrange(sys.maxunicode)
                    if unicodedata.category(unichr(i)).startswith('P'))


# method to remove punctuation from sentences.
def remove_punctuation(text):
    return text.translate(tbl)
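# illustrative check (assumes Python 2 unicode input, as the xrange/unichr
# calls above imply): remove_punctuation(u"what's up?") -> u"whats up"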

# initialize the stemmer
stemmer = LancasterStemmer()
# variable to hold the Json data read from the file
data = None

# read the json file and load the training data
with open('data.json') as json_data:
    data = json.load(json_data)
    print(data)

# get a list of all categories to train for
categories = list(data.keys())
words = []
# a list of tuples with words in the sentence and category name
docs = []

for each_category in data.keys():
    for each_sentence in data[each_category]:
        # remove any punctuation from the sentence
        each_sentence = remove_punctuation(each_sentence)
        print each_sentence
        # extract words from each sentence and append to the word list
        w = nltk.word_tokenize(each_sentence)
        print "tokenized words: ", w
        words.extend(w)
        docs.append((w, each_category))
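
# at this point docs holds (token_list, category) pairs; a hypothetical entry
# might look like (['what', 'time', 'is', 'it'], 'time'), assuming data.json
# defines a 'time' category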

# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words]
words = sorted(list(set(words)))
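
# e.g. inflected forms such as 'running' and 'runs' typically collapse to a
# single Lancaster stem, so the sorted vocabulary is smaller than the raw list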

print(words)
print(docs)

# create our training data
training = []
@@ -68,20 +69,21 @@ def remove_punctuation(text):
    # create our bag of words array
    for w in words:
        bow.append(1 if w in token_words else 0)

    output_row = list(output_empty)
    output_row[categories.index(doc[1])] = 1
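    # e.g. with a hypothetical vocabulary words = ['go', 'is', 'what'] and
    # token_words = ['what', 'is'], bow comes out as [0, 1, 1]; output_row is
    # all zeros except a 1 at this doc's category index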

    # our training set will contain the bag of words model and the output row
    # that tells which category that bow belongs to.
    training.append([bow, output_row])

# shuffle our features and turn into np.array as tensorflow takes in numpy array
random.shuffle(training)
training = np.array(training)

# train_x contains the bag of words and train_y contains the label/category
train_x = list(training[:, 0])
train_y = list(training[:, 1])
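# each train_x row has len(words) entries and each train_y row has
# len(categories) entries, one per output class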

# reset underlying graph data
tf.reset_default_graph()
@@ -100,36 +102,34 @@ def remove_punctuation(text):


# let's test the model for a few sentences:
# the first two sentences are used for training, and the last two sentences
# are not present in the training data.
sent_1 = "what time is it?"
sent_2 = "I gotta go now"
sent_3 = "do you know the time now?"
sent_4 = "you must be a couple of years older than her!"

# a method that takes in a sentence and list of all words
# and returns the data in a form that can be fed to tensorflow


def get_tf_record(sentence):
    global words
    # tokenize the pattern
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    # bag of words
    bow = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                bow[i] = 1

    return np.array(bow)
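
# illustrative usage: get_tf_record("what time is it?") yields a 0/1 vector of
# length len(words), with ones at the indices of the sentence's stemmed tokens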


# we can start to predict the results for each of the 4 sentences
print(categories[np.argmax(model.predict([get_tf_record(sent_1)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent_2)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent_3)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent_4)]))])
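# each line prints the single most likely category name for its sentence
# (argmax over the model's per-category scores)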
