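"""Sanity-check utilities for the Ubuntu ranking dataset splits (ubuntu-ranking-dataset-creator).

Computes per-example utterance counts for the train/test/valid CSVs, compares the
test and valid distributions against the train distribution with a chi-square test,
plots histograms, prints summary statistics, and runs Welch's t-tests on the means.
"""
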
import unicodecsv
import matplotlib.pyplot as plt
import numpy
from collections import defaultdict
from scipy.stats import chisquare, ttest_ind


def n_utterances_counts(f_name, eou='__eou__'):
    """Return the number of utterances (occurrences of `eou`) in each example of a CSV split."""
    n_utterances = []
    reader = unicodecsv.reader(open(f_name, 'rb'))
    next(reader)  # skip header
    for line in reader:
        n_utterances.append(line[0].count(eou))
    return n_utterances
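
# Illustrative usage (hypothetical relative path; assumes the CSV has a header row
# and the dialogue context in its first column, as the parsing above expects):
#   lengths = n_utterances_counts("src/train.csv")
#   print(max(lengths))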


def train_stats(f_name, eou='__eou__', eot='__eot__'):
    """Collect utterance, turn and word counts for positive (label 1) and negative (label 0) examples."""
    pos_utterances = []
    pos_turns = []
    pos_words = []
    neg_utterances = []
    neg_turns = []
    neg_words = []
    reader = unicodecsv.reader(open(f_name, 'rb'))
    next(reader)  # skip header
    for line in reader:
        if int(float(line[2])) == 1:
            pos_utterances.append(line[0].count(eou))
            pos_turns.append(line[0].count(eot))
            pos_words.append(len(line[0].split()))
        elif int(float(line[2])) == 0:
            neg_utterances.append(line[0].count(eou))
            neg_turns.append(line[0].count(eot))
            neg_words.append(len(line[0].split()))
        else:
            print(line[2])  # unexpected label
    return pos_utterances, pos_turns, pos_words, neg_utterances, neg_turns, neg_words


def normalize(data):
    """Normalise a numpy array of counts so that it sums to one."""
    total = float(sum(data))
    return data / total


def distribution(data, max_utt):
    """Return raw and normalised histograms of `data` over the range [0, max_utt)."""
    counts = defaultdict(int)
    for d in data:
        counts[d] += 1
    distr = numpy.zeros(max_utt)
    for key, val in counts.items():
        distr[key] = val
    return distr, normalize(distr)
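
# For example (illustrative): distribution([1, 1, 3], max_utt=4) returns the raw
# counts [0., 2., 0., 1.] and the normalised distribution [0., 2/3, 0., 1/3].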


def plot_histogram(data, title, x_label, y_label, **kwargs):
    """Plot a 500-bin histogram of `data` with the given title and axis labels."""
    plt.hist(data, 500, facecolor='green', alpha=0.75, **kwargs)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid(True)
    plt.show()


if __name__ == "__main__":
    # load lists of numbers of utterances per example
    train_n_utterances = n_utterances_counts("/home/petrbel/ubuntu-ranking-dataset-creator/src/train.csv")
    test_n_utterances = n_utterances_counts("/home/petrbel/ubuntu-ranking-dataset-creator/src/test.csv")
    valid_n_utterances = n_utterances_counts("/home/petrbel/ubuntu-ranking-dataset-creator/src/valid.csv")

    max_utt = max(max(train_n_utterances), max(test_n_utterances), max(valid_n_utterances)) + 1

    # train distribution
    train_counts, train_distr = distribution(train_n_utterances, max_utt=max_utt)

    # test: chi-square test of observed counts against counts expected under the
    # train distribution (+1 smoothing avoids zero expected counts)
    expected_test_counts = train_distr * len(test_n_utterances)
    real_test_counts, test_distr = distribution(test_n_utterances, max_utt=max_utt)
    _, pvalue = chisquare(real_test_counts + 1, expected_test_counts + 1)
    print("TestDataset: ChiSq pvalue={}".format(pvalue))

    # valid
    expected_valid_counts = train_distr * len(valid_n_utterances)
    real_valid_counts, valid_distr = distribution(valid_n_utterances, max_utt=max_utt)
    _, pvalue = chisquare(real_valid_counts + 1, expected_valid_counts + 1)
    print("ValidDataset: ChiSq pvalue={}".format(pvalue))

    # histograms
    plot_histogram(train_n_utterances, "Train Utterances", "Number of utterances", "Count")
    plot_histogram(test_n_utterances, "Test Utterances", "Number of utterances", "Count")
    plot_histogram(valid_n_utterances, "Valid Utterances", "Number of utterances", "Count")

    # train stats
    print("Train Min: {}".format(min(train_n_utterances)))
    print("Train Max: {}".format(max(train_n_utterances)))
    print("Train Mean: {}".format(numpy.mean(train_n_utterances)))
    print("Train Std: {}".format(numpy.std(train_n_utterances)))

    # test stats
    print("Test Min: {}".format(min(test_n_utterances)))
    print("Test Max: {}".format(max(test_n_utterances)))
    print("Test Mean: {}".format(numpy.mean(test_n_utterances)))
    print("Test Std: {}".format(numpy.std(test_n_utterances)))

    # valid stats
    print("Valid Min: {}".format(min(valid_n_utterances)))
    print("Valid Max: {}".format(max(valid_n_utterances)))
    print("Valid Mean: {}".format(numpy.mean(valid_n_utterances)))
    print("Valid Std: {}".format(numpy.std(valid_n_utterances)))

    # Welch's t-test of mean utterance counts
    _, pvalue = ttest_ind(train_n_utterances, test_n_utterances, equal_var=False)
    print("ttest: train-test, pvalue={}".format(pvalue))
    _, pvalue = ttest_ind(train_n_utterances, valid_n_utterances, equal_var=False)
    print("ttest: train-valid, pvalue={}".format(pvalue))

    pos_utterances, pos_turns, pos_words, neg_utterances, neg_turns, neg_words = train_stats("/home/petrbel/ubuntu-ranking-dataset-creator/src/train.csv")
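
    # Note (illustrative addition, not in the original script): the per-class lists
    # returned by train_stats() are not used above; one way to summarise them:
    #   print("Mean words pos/neg: {} / {}".format(numpy.mean(pos_words), numpy.mean(neg_words)))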