# generate_stack_over_flow_embeddings.py
# forked from LittleYUYU/StackOverflow-Question-Code-Dataset
import matplotlib
matplotlib.use('Agg')  # non-interactive backend: figures are saved to files, no display needed
import pickle
import sys
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec
import numpy as np
sys.path.append("data_processing/codenn/src")
from data_processing.code_processing import tokenize_code_corpus
from keras.preprocessing.text import text_to_word_sequence
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
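# This script builds a shared word2vec embedding space over annotated
# StackOverflow question titles and their candidate code snippets, saves the
# resulting embedding matrix, plots a t-SNE projection of the most frequent
# words, and writes several dataset-statistics histograms. The relative
# sys.path entry and pickle paths below assume the script is run from the
# root of the StackOverflow-Question-Code-Dataset repository this file was
# forked from. iid_labeled maps (question_id, code_snippet_index) keys to
# 0/1 agreement labels.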
iid_labeled = pickle.load(open('annotation_tool/crowd_sourcing/python_annotator/all_agreed_iid_to_label.pickle','rb'))
q_code_snippet = pickle.load(open('annotation_tool/data/code_solution_labeled_data/source/python_how_to_do_it_by_classifier_multiple_iid_to_code.pickle', 'rb'))
qid_to_title = pickle.load(open('annotation_tool/data/code_solution_labeled_data/source/python_how_to_do_it_by_classifier_multiple_qid_to_title.pickle','rb'))
qid_code_labeled = dict([(key, q_code_snippet[key]) for key in iid_labeled])
tokenized_code, bool_failed_var, bool_failed_token = tokenize_code_corpus(qid_code_labeled, "python")
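# tokenize_code_corpus comes from the data_processing/code_processing module
# imported above; judging by their names, the two boolean maps flag snippets
# whose variables or tokens failed to tokenize (not verified here).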
code_samples = [' '.join(tokenized_code[key]) for key in tokenized_code]
question_samples = [qid_to_title[qid] for qid, code_idx in iid_labeled]
samples = code_samples + question_samples
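# Code tokens and question titles are pooled into one corpus so that both
# vocabularies end up in a single shared embedding space.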
samples_preprocessed = [text_to_word_sequence(s) for s in samples]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(samples)
word_index = tokenizer.word_index
# train a skip-gram word2vec model (sg=1) on the pooled corpus
size = 100
model = Word2Vec(samples_preprocessed, size=size, min_count=1, window=5, sg=1, iter=15)
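# NOTE: the call above (and `syn0` / `vocab` below) uses the gensim < 4.0 API.
# Under gensim >= 4.0 the rough equivalent would be:
#   model = Word2Vec(samples_preprocessed, vector_size=size, min_count=1,
#                    window=5, sg=1, epochs=15)
#   weights = model.wv.vectors
#   d = model.wv.key_to_index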
weights = model.wv.syn0
d = dict([(k, v.index) for k, v in model.wv.vocab.items()])
emb = np.zeros(shape=(len(word_index)+1, size), dtype='float32')
for w, i in word_index.items():
    if w not in d:
        continue
    emb[i, :] = weights[d[w], :]
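# Row 0 of `emb` stays all-zero: Keras' Tokenizer assigns word indices
# starting at 1, so index 0 is conventionally reserved for padding.
# A typical downstream use (not part of this script) would be seeding a Keras
# Embedding layer with the pretrained matrix, e.g.
# (requires `from keras.layers import Embedding`):
#   embedding_layer = Embedding(input_dim=len(word_index) + 1, output_dim=size,
#                               weights=[emb], trainable=False)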
word_vectors = model.wv
sorted_by_word_count = sorted(tokenizer.word_counts.items(), key=lambda wc: wc[1], reverse=True)
# keep the 200 most frequent words for the t-SNE plot
wanted_words = [word for word, freq in sorted_by_word_count[:200]]
wanted_vocab = [w for w in wanted_words if w in word_vectors.vocab]
X = word_vectors[wanted_vocab]  # one `size`-dimensional vector per word, in the same order as wanted_vocab
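# Project the vectors of the most frequent words down to 2-D with t-SNE so
# that words close in embedding space plot close together.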
tsne_model = TSNE(perplexity=40, n_components=2, init="pca", n_iter=5000, random_state=23)
Y = tsne_model.fit_transform(X)
fig, ax = plt.subplots(figsize=(20,10))
ax.scatter(Y[:, 0], Y[:, 1])
words = list(wanted_vocab)
for i, word in enumerate(words):
    plt.annotate(word, xy=(Y[i, 0], Y[i, 1]))
ax.set_yticklabels([])  # hide tick labels
ax.set_xticklabels([])  # hide tick labels
plt.savefig('tsne-output.png')
plt.clf()
np.save(open('word2vec_%d_dim.embeddings' % size, 'wb'), emb)
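# Despite the `.embeddings` extension, this is a standard .npy file and can be
# reloaded with np.load('word2vec_100_dim.embeddings').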
# generate histograms of question/code-snippet lengths (in characters) and word counts
question_length = [len(qid_to_title[qid]) for qid, label in iid_labeled]  # length in characters
plt.hist(question_length, bins='auto')
plt.title("Question Length")
plt.savefig('question_length_hist.png')
plt.clf()
question_number_of_words = [len(text_to_word_sequence(qid_to_title[qid])) for qid, label in iid_labeled]
plt.hist(question_number_of_words, bins='auto')
plt.title("Number of Words of Question")
plt.savefig('question_number_of_words_hist.png')
plt.clf()
code_snippet_length = [len(q_code_snippet[key]) for key in iid_labeled]  # length in characters
plt.hist(code_snippet_length, bins='auto')
plt.title("Code Snippet Length")
plt.savefig('code_snippet_length_hist.png')
plt.clf()
code_snippet_number_of_words = [len(text_to_word_sequence(q_code_snippet[key])) for key in iid_labeled]
plt.hist(code_snippet_number_of_words, bins='auto')
plt.title("Number of Words of Code Snippet")
plt.savefig('code_snippet_number_of_words_hist.png')
plt.clf()
code_snippet_tokenized_length = [len(' '.join(tokenized_code[key])) for key in iid_labeled]  # length in characters
plt.hist(code_snippet_tokenized_length, bins='auto')
plt.title("Code Snippet Tokenized Length")
plt.savefig('code_snippet_tokenized_length_hist.png')
plt.clf()
code_snippet_tokenized_number_of_words = [len(text_to_word_sequence(' '.join(tokenized_code[key])))
                                          for key in iid_labeled]
plt.hist(code_snippet_tokenized_number_of_words, bins='auto')
plt.title("Number of Words of Code Snippet Tokenized")
plt.savefig('code_snippet_tokenized_number_of_words_hist.png')
plt.clf()
# generate histogram of correct answers per question
correct_count_per_qid = {}
for key, label in iid_labeled.items():
    qid, code_idx = key
    if label == 1:
        if qid in correct_count_per_qid:
            correct_count_per_qid[qid] += 1
        else:
            correct_count_per_qid[qid] = 1
correct_answers_per_question = list(correct_count_per_qid.values())
plt.hist(correct_answers_per_question, bins='auto')
plt.title("Number of Correct Answers per Question")
plt.savefig('correct_answers_per_question_hist.png')
plt.clf()