Skip to content
This repository was archived by the owner on Jan 14, 2020. It is now read-only.

Commit e2654af

Browse files
*19.11.14 Update.
1 parent 0291448 commit e2654af

File tree

17 files changed

+2419
-49
lines changed

17 files changed

+2419
-49
lines changed
Lines changed: 89 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,89 @@
1+
import csv
2+
import seaborn as sns
3+
import matplotlib.pyplot as plt
4+
5+
# Per-user feature table; its first row is a header and is not plotted.
DATA_PATH = 'data/data_users_ready_to_analysis_2.csv'

# Title of the histogram built from CSV column 1.
AGE_TITLE = "Age"

# (CSV column index, histogram title) for every plain integer column, listed
# in the order the histograms are shown.  Titles are kept exactly as they
# were in the original script.
COUNT_COLUMNS = (
    (2, "Repo number"),
    (3, "follower Number"),
    (4, "commit_comment Number"),
    (5, "commit_nums"),
    (6, "issue_comment Number"),
    (7, "issue_event Number"),
    (8, "issue Number"),
    (9, "org Number"),
    (10, "pr_commen Number"),
    (11, "pr Number"),
    (12, "Collaborator Number"),
)


def load_feature_columns(path=DATA_PATH):
    """Read the user CSV and return its numeric columns keyed by plot title.

    Column 1 is divided by 365 and rounded to two decimals (presumably an
    age in days converted to years -- confirm against the data producer).
    Columns 2..12 are parsed as integers and stored unchanged.

    Args:
        path: Location of the CSV file; the first row is treated as a header.

    Returns:
        A dict mapping each histogram title to the list of values for that
        column, in plotting order ("Age" first, then COUNT_COLUMNS order).

    Raises:
        ValueError: If a cell in columns 1..12 is not an integer literal.
        IndexError: If a non-empty row has fewer than 13 columns.
    """
    columns = {AGE_TITLE: []}
    for _, title in COUNT_COLUMNS:
        columns[title] = []

    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Drop the header row; the default keeps an empty file from raising.
        next(reader, None)

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline); there is nothing to parse in them.
                continue
            columns[AGE_TITLE].append(round(int(row[1]) / 365, 2))
            for col, title in COUNT_COLUMNS:
                columns[title].append(int(row[col]))

    return columns


def plot_histograms(columns):
    """Show one 50-bin density histogram per entry of *columns*.

    Args:
        columns: Mapping of plot title to the values to histogram, as
            returned by load_feature_columns().  Each figure is shown with
            plt.show() before the next one is drawn.
    """
    for title, values in columns.items():
        plt.hist(values, bins=50, color='steelblue', density=True)
        plt.title(title)
        plt.show()


if __name__ == "__main__":
    plot_histograms(load_feature_columns())
88+
89+

python37/3.1. User_Classification.py

Lines changed: 66 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -38,53 +38,65 @@
3838

3939
# Age
4040
# row[1] = round(int(row[1]) / 365, 2)
41-
row[1] = 0
41+
row[1] = int(row[1]) / 365
42+
if int(row[1]) != 0:
43+
row[1] = math.log(int(row[1]), 2)
4244

4345
# repo_num
44-
# if int(row[2]) != 0:
45-
# row[2] = math.log(int(row[2]), 10)
46+
if int(row[2]) != 0:
47+
row[2] = math.log(int(row[2]), 2)
4648

4749
# follower_num
48-
if int(row[3]) != 0:
49-
row[3] = math.log(int(row[3]), 2)
50+
# if int(row[3]) != 0:
51+
# row[3] = math.log(int(row[3]), 2)
52+
row[3] = 0
5053

5154
# commit_comment_num
5255
if int(row[4]) != 0:
53-
row[4] = math.log(int(row[4]), 10)
56+
row[4] = math.log(int(row[4]), 2)
5457

5558
# commit_num
5659
if int(row[5]) != 0:
57-
row[5] = math.log(int(row[5]), 10)
60+
row[5] = math.log(int(row[5]), 2)
5861

5962
# issue_comment_num
6063
if int(row[6]) != 0:
61-
row[6] = math.log(int(row[6]), 10)
64+
row[6] = math.log(int(row[6]), 2)
6265

6366
# issue_event_num
6467
if int(row[7]) != 0:
65-
row[7] = math.log(int(row[7]), 10)
68+
row[7] = math.log(int(row[7]), 2)
6669

6770
# issue_number
6871
if int(row[8]) != 0:
69-
row[8] = math.log(int(row[8]), 10)
72+
row[8] = math.log(int(row[8]), 2)
7073

7174
# org_number
72-
# if int(row[9]) != 0:
73-
# row[9] = math.log(int(row[9]), 10)
75+
if int(row[9]) != 0:
76+
row[9] = math.log(int(row[9]), 2)
7477

7578
# pr_comment_num
7679
if int(row[10]) != 0:
77-
row[10] = math.log(int(row[10]), 10)
80+
row[10] = math.log(int(row[10]), 2)
7881

7982
# pr_num
8083
if int(row[11]) != 0:
8184
row[11] = math.log(int(row[11]), 2)
8285

8386
# collaborator_num
84-
# if int(row[12]) != 0:
85-
# row[12] = math.log(int(row[12]), 10)
87+
if int(row[12]) != 0:
88+
row[12] = math.log(int(row[12]), 2)
8689

8790
data.append(row[1:12])
91+
# data.append(row[2:12])
92+
# data.append([row[1], row[6], row[7], row[8]])
93+
# data.append([row[6], row[7], row[8]])
94+
# data.append([row[1], row[10], row[11]])
95+
# data.append([row[10], row[11]])
96+
# data.append([row[1], row[6], row[7], row[8], row[10], row[11]])
97+
# data.append([row[6], row[7], row[8], row[10], row[11]])
98+
99+
88100
data_original.append(row)
89101
index += 1
90102

@@ -96,20 +108,21 @@
96108
# ----------------KMeans-----------------
97109
# ---------------------------------------
98110
# ---------------------------------------
99-
STOP = False
100111

101-
while not STOP:
112+
best_acc = 0
113+
best_results = {}
102114

103-
kmeans = KMeans(n_clusters=CLUSTER_NUM, init='random', n_init=10, max_iter=50, tol=0.0001,
104-
precompute_distances='auto',
105-
verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm='elkan').fit(data_kmeans)
115+
for i in range(100):
116+
kmeans = KMeans(n_clusters=CLUSTER_NUM, init='random', n_init=100, max_iter=100, tol=0.0001,
117+
precompute_distances=True,
118+
verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm='elkan').fit(data_kmeans)
106119

107120
# print(kmeans.labels_)
108121
target = kmeans.labels_
109122

110123
# print(target)
111124

112-
print(kmeans.cluster_centers_)
125+
# print(kmeans.cluster_centers_)
113126

114127
# save the model to disk
115128
filename = 'models/user_kmeans_{}c.sav'.format(CLUSTER_NUM)
@@ -123,17 +136,39 @@
123136
# print(data_original[i])
124137
# print(target[i])
125138
if data_original[i][0] == 1:
126-
if data_original[i][0] == target[i]:
139+
if 1 == target[i]:
127140
results["TP"] += 1
128-
elif data_original[i][0] != target[i]:
129-
results["FP"] += 1
141+
elif 0 == target[i]:
142+
results["FN"] += 1
130143
elif data_original[i][0] == 0:
131-
if data_original[i][0] == target[i]:
144+
if 0 == target[i]:
132145
results["TN"] += 1
133-
elif data_original[i][0] != target[i]:
134-
results["FN"] += 1
135-
print(results)
146+
elif 1 == target[i]:
147+
results["FP"] += 1
148+
# print(results)
136149
accuracy = ((results["TP"] + results["TN"]) / (results["TP"] + results["TN"] + results["FP"] + results["FN"]))
137-
print(accuracy)
138-
if accuracy > 0.55:
139-
STOP = True
150+
# print(accuracy)
151+
if accuracy > best_acc:
152+
best_acc = accuracy
153+
best_results = results
154+
155+
print(best_acc)
156+
print(best_results)
157+
158+
# target = target.tolist()
159+
160+
# Open the output CSV file for writing
161+
with open('data/data_users_cluster_with_results.csv', 'w', newline='') as csvfile:
162+
# Create CSV writer
163+
writer = csv.writer(csvfile)
164+
# Write first row
165+
writer.writerow(
166+
['result', 'newcomer', 'age', 'repo_num', 'follower_num', 'commit_comment_num', 'commit_num', 'issue_comment_num', 'issue_event_num', 'issue_number', 'org_number', 'pr_comment_num', 'pr_num', 'collaborator_num'])
167+
168+
i = 0
169+
while i < len(target):
170+
writer.writerow(
171+
[target[i], data_original[i][0], data_original[i][1], data_original[i][2], data_original[i][3], data_original[i][4],
172+
data_original[i][5], data_original[i][6], data_original[i][7],
173+
data_original[i][8], data_original[i][9], data_original[i][10], data_original[i][11], data_original[i][12]])
174+
i += 1

python37/3.2. User_Classification_Silhouette_Analysis.py

Lines changed: 22 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -41,53 +41,56 @@
4141

4242
# Age
4343
# row[1] = round(int(row[1]) / 365, 2)
44-
row[1] = 0
44+
row[1] = int(row[1]) / 365
45+
if int(row[1]) != 0:
46+
row[1] = math.log(int(row[1]), 2)
4547

4648
# repo_num
47-
# if int(row[2]) != 0:
48-
# row[2] = math.log(int(row[2]), 10)
49+
if int(row[2]) != 0:
50+
row[2] = math.log(int(row[2]), 2)
4951

5052
# follower_num
51-
if int(row[3]) != 0:
52-
row[3] = math.log(int(row[3]), 2)
53+
# if int(row[3]) != 0:
54+
# row[3] = math.log(int(row[3]), 2)
55+
row[3] = 0
5356

5457
# commit_comment_num
5558
if int(row[4]) != 0:
56-
row[4] = math.log(int(row[4]), 10)
59+
row[4] = math.log(int(row[4]), 2)
5760

5861
# commit_num
5962
if int(row[5]) != 0:
60-
row[5] = math.log(int(row[5]), 10)
63+
row[5] = math.log(int(row[5]), 2)
6164

6265
# issue_comment_num
6366
if int(row[6]) != 0:
64-
row[6] = math.log(int(row[6]), 10)
67+
row[6] = math.log(int(row[6]), 2)
6568

6669
# issue_event_num
6770
if int(row[7]) != 0:
68-
row[7] = math.log(int(row[7]), 10)
71+
row[7] = math.log(int(row[7]), 2)
6972

7073
# issue_number
7174
if int(row[8]) != 0:
72-
row[8] = math.log(int(row[8]), 10)
75+
row[8] = math.log(int(row[8]), 2)
7376

7477
# org_number
75-
# if int(row[9]) != 0:
76-
# row[9] = math.log(int(row[9]), 10)
78+
if int(row[9]) != 0:
79+
row[9] = math.log(int(row[9]), 2)
7780

7881
# pr_comment_num
7982
if int(row[10]) != 0:
80-
row[10] = math.log(int(row[10]), 10)
83+
row[10] = math.log(int(row[10]), 2)
8184

8285
# pr_num
8386
if int(row[11]) != 0:
8487
row[11] = math.log(int(row[11]), 2)
8588

8689
# collaborator_num
87-
# if int(row[12]) != 0:
88-
# row[12] = math.log(int(row[12]), 10)
90+
if int(row[12]) != 0:
91+
row[12] = math.log(int(row[12]), 2)
8992

90-
data.append(row[1:12])
93+
data.append([row[6], row[7], row[8]])
9194
index += 1
9295

9396
data_kmeans = np.array(data).astype(np.float64)
@@ -119,7 +122,7 @@
119122

120123
# Initialize the clusterer with n_clusters value and a random generator
121124
# (random_state=None is passed, so no fixed seed is used and results vary between runs).
122-
clusterer = KMeans(n_clusters=n_clusters, init='random', n_init=10, max_iter=50, tol=0.0001,
125+
clusterer = KMeans(n_clusters=n_clusters, init='random', n_init=10, max_iter=10000, tol=0.0001,
123126
precompute_distances='auto',
124127
verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm='elkan')
125128
cluster_labels = clusterer.fit_predict(X)
@@ -189,5 +192,6 @@
189192
plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
190193
"with n_clusters = %d" % n_clusters),
191194
fontsize=14, fontweight='bold')
195+
plt.show()
196+
192197

193-
plt.show()

0 commit comments

Comments
 (0)