Commit: pep8
diogobernini authored Nov 30, 2017
1 parent 9c77a1a commit 9ffca02
Showing 1 changed file with 30 additions and 30 deletions.
60 changes: 30 additions & 30 deletions text_classification/classify_text.py
@@ -11,45 +11,46 @@

# a translation table mapping the ordinal of every Unicode punctuation character to None
tbl = dict.fromkeys(i for i in xrange(sys.maxunicode)
                    if unicodedata.category(unichr(i)).startswith('P'))


# method to remove punctuation from sentences.
def remove_punctuation(text):
    return text.translate(tbl)
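# illustrative check (assumes Python 2 unicode input, as the xrange/unichr
# calls above imply): remove_punctuation(u"what's up?") -> u"whats up"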

# initialize the stemmer
stemmer = LancasterStemmer()
# variable to hold the Json data read from the file
data = None

# read the json file and load the training data
with open('data.json') as json_data:
    data = json.load(json_data)
    print(data)

# get a list of all categories to train for
categories = list(data.keys())
words = []
# a list of tuples with words in the sentence and category name
docs = []

for each_category in data.keys():
    for each_sentence in data[each_category]:
        # remove any punctuation from the sentence
        each_sentence = remove_punctuation(each_sentence)
        print each_sentence
        # extract words from each sentence and append to the word list
        w = nltk.word_tokenize(each_sentence)
        print "tokenized words: ", w
        words.extend(w)
        docs.append((w, each_category))
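
# at this point docs holds (token_list, category) pairs; a hypothetical entry
# might look like (['what', 'time', 'is', 'it'], 'time'), assuming data.json
# defines a 'time' category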

# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words]
words = sorted(list(set(words)))
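
# e.g. inflected forms such as 'running' and 'runs' typically collapse to a
# single Lancaster stem, so the sorted vocabulary is smaller than the raw list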

print(words)
print(docs)

# create our training data
training = []
@@ -68,20 +69,21 @@ def remove_punctuation(text):
    # create our bag of words array
    for w in words:
        bow.append(1 if w in token_words else 0)

    output_row = list(output_empty)
    output_row[categories.index(doc[1])] = 1
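    # e.g. with a hypothetical vocabulary words = ['go', 'is', 'what'] and
    # token_words = ['what', 'is'], bow comes out as [0, 1, 1]; output_row is
    # all zeros except a 1 at this doc's category index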

    # our training set will contain the bag of words model and the output row
    # that tells which category that bow belongs to.
    training.append([bow, output_row])

# shuffle our features and turn into np.array as tensorflow takes in numpy array
random.shuffle(training)
training = np.array(training)

# train_x contains the bag of words and train_y contains the label/category
train_x = list(training[:, 0])
train_y = list(training[:, 1])
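# each train_x row has len(words) entries and each train_y row has
# len(categories) entries, one per output class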

# reset underlying graph data
tf.reset_default_graph()
@@ -100,36 +102,34 @@ def remove_punctuation(text):


# let's test the model for a few sentences:
# the first two sentences are used for training, and the last two sentences
# are not present in the training data.
sent_1 = "what time is it?"
sent_2 = "I gotta go now"
sent_3 = "do you know the time now?"
sent_4 = "you must be a couple of years older than her!"

# a method that takes in a sentence and list of all words
# and returns the data in a form that can be fed to tensorflow


def get_tf_record(sentence):
    global words
    # tokenize the pattern
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    # bag of words
    bow = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                bow[i] = 1

    return np.array(bow)
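
# illustrative usage: get_tf_record("what time is it?") yields a 0/1 vector of
# length len(words), with ones at the indices of the sentence's stemmed tokens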


# we can start to predict the results for each of the 4 sentences
print(categories[np.argmax(model.predict([get_tf_record(sent_1)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent_2)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent_3)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent_4)]))])
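# each line prints the single most likely category name for its sentence
# (argmax over the model's per-category scores)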
