Skip to content

Commit 70a67dc

Browse files
committed
homework2 first version.
1 parent 81d6dac commit 70a67dc

12 files changed

+7414
-0
lines changed

homework2/.ipynb_checkpoints/main-checkpoint.ipynb

Lines changed: 178 additions & 0 deletions
Large diffs are not rendered by default.

homework2/Data Science.pptx

47.4 KB
Binary file not shown.

homework2/HW2_pokemon.csv

Lines changed: 369 additions & 0 deletions
Large diffs are not rendered by default.

homework2/main-2018_0531_1359.ipynb

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.

homework2/main-2018_0531_1409.ipynb

Lines changed: 1701 additions & 0 deletions
Large diffs are not rendered by default.

homework2/main-2018_0531_1449_first.ipynb

Lines changed: 1755 additions & 0 deletions
Large diffs are not rendered by default.

homework2/main-2018_0531_1549_ver1.1.ipynb

Lines changed: 178 additions & 0 deletions
Large diffs are not rendered by default.

homework2/main.ipynb

Lines changed: 178 additions & 0 deletions
Large diffs are not rendered by default.

homework2/main.py

Lines changed: 294 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
# coding: utf-8
"""Homework 2: cluster pokemon battle stats with K-means and, for every pair
listed in subject.csv, predict whether the two pokemon share the same type
(same cluster => 1, different cluster => 0).

Kaggle submission format (result.csv):
  - column 1: "pair",   values 0-999
  - column 2: "answer", 0 or 1
Public leaderboard uses 70% of the data, private 30%.
"""

import csv
import sys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Enable inline plotting only under IPython/Jupyter; as a plain script
# get_ipython() is undefined and the original call raised NameError.
try:
    get_ipython().magic('matplotlib inline')
except NameError:
    pass

sns.set(style='white', font_scale=0.9)
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
sns.color_palette(flatui)

# threshold must be a non-NaN integer: np.nan is rejected by modern NumPy,
# so use sys.maxsize to get the same "print everything" effect.
np.set_printoptions(threshold=sys.maxsize)
pd.set_option("display.max_columns", 100)

# Load the pokemon data set.
dataset = pd.read_csv('HW2_pokemon.csv')
print(dataset.shape)
dataset.head(10)

# Numeric battle-stat columns used for clustering.
STAT_COLS = ['Total', 'HP', 'Attack', 'Defense', 'Sp_Atk', 'Sp_Def', 'Speed']
dataset_num = dataset[STAT_COLS]

# Standardize the stats so no single column dominates the distance metric.
sc_X = StandardScaler()
dataset_num = sc_X.fit_transform(dataset_num)

dataset_scaled = dataset.copy()
dataset_scaled[STAT_COLS] = dataset_num

# Elbow method: within-cluster sum of squares for k = 1..40.
wcss = []
for i in range(1, 41):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(dataset_num)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 41), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.xticks(np.arange(1, 41, 1.0))
plt.grid(which='major', axis='x')
plt.show()

# NOTE(review): the elbow suggests n_clusters between 5 and 10 would be better,
# but 82 clusters is what the author settled on for the pair-matching task.
kmeans = KMeans(n_clusters=82, init='k-means++', random_state=84)
y_kmeans = kmeans.fit_predict(dataset_num)

print("y_kmeans : ", type(y_kmeans), y_kmeans.shape)
print(y_kmeans[:30])
test_dataset = pd.read_csv('subject.csv')
print(type(test_dataset), test_dataset.shape)

# Predict: a pair is "same type" (1) iff both pokemon fall in the same cluster.
# Entries presumably look like "pokemonXYZ": the numeric id starts at
# character 7 — TODO confirm against subject.csv.
# (An earlier variant that also matched clusters on either side of a MIDDLE
# index sharply reduced accuracy and was removed.)
res = []
for index, (td1, td2) in enumerate(zip(test_dataset["0"], test_dataset["1"])):
    td1_i = int(td1[7:])
    td2_i = int(td2[7:])
    res.append([index, 1 if y_kmeans[td1_i] == y_kmeans[td2_i] else 0])
print(res[:10])

# Write the predictions in the format Kaggle requires:
# first column "pair" (0-999), second column "answer".
res_csv_file_path = "result.csv"
with open(res_csv_file_path, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(('pair', 'answer'))
    for ids, val in enumerate(res):
        writer.writerow((str(ids), val[1]))
print("---------------execute finished.")

# Attach the cluster labels for the exploratory plots below.
dataset['y_kmeans'] = y_kmeans
dataset.head()

# Distribution of each battle stat per cluster.
sns.violinplot(x='y_kmeans', y='Total', data=dataset)
plt.show()

sns.violinplot(x='y_kmeans', y='Attack', data=dataset)
plt.show()

sns.violinplot(x='y_kmeans', y='Defense', data=dataset)
plt.show()

dataset.sort_values('Defense', axis=0, ascending=False).head(10)

sns.violinplot(x='y_kmeans', y='Speed', data=dataset)
plt.show()

dataset.sort_values('Speed', axis=0, ascending=False).head(15)

sns.violinplot(x='y_kmeans', y='Sp_Atk', data=dataset)
plt.show()

sns.violinplot(x='y_kmeans', y='Sp_Def', data=dataset)
plt.show()

dataset.sort_values('Sp_Def', axis=0, ascending=False).head(10)

# Clusters by Height_m / Weight_kg.
# Get counts by value and cluster for both columns, then merge them so a
# value appearing in either column is counted.
data_pct_1 = dataset.groupby(['Height_m', 'y_kmeans'])['Name'].count().to_frame().reset_index()
data_pct_1.columns = ['Type', 'y_kmeans', 'count_1']

data_pct_2 = dataset.groupby(['Weight_kg', 'y_kmeans'])['Name'].count().to_frame().reset_index()
data_pct_2.columns = ['Type', 'y_kmeans', 'count_2']

data_pct = data_pct_1.merge(data_pct_2, how='outer',
                            left_on=['Type', 'y_kmeans'],
                            right_on=['Type', 'y_kmeans'])

data_pct.fillna(0, inplace=True)
data_pct['count'] = data_pct['count_1'] + data_pct['count_2']

# Totals per value, then the share each cluster contributes.
data_pct_Total = data_pct.groupby(['Type']).sum()['count'].reset_index()
data_pct_Total.columns = ['Type', 'count_total']

data_pct = data_pct.merge(right=data_pct_Total,
                          how='inner',
                          left_on='Type',
                          right_on='Type')

data_pct['pct'] = data_pct['count'] / data_pct['count_total']

# Pseudo-stacked bars: draw the full bar first, then overlay shrinking subsets.
sns.barplot(x='Type', y='pct', data=data_pct, estimator=sum, ci=None, color='#34495e', label='4')
sns.barplot(x='Type', y='pct', data=data_pct[data_pct['y_kmeans'] <= 3],
            estimator=sum, ci=None, color='#e74c3c', label='3')
sns.barplot(x='Type', y='pct', data=data_pct[data_pct['y_kmeans'] <= 2],
            estimator=sum, ci=None, color='#95a5a6', label='2')
sns.barplot(x='Type', y='pct', data=data_pct[data_pct['y_kmeans'] <= 1],
            estimator=sum, ci=None, color='#3498db', label='1')
sns.barplot(x='Type', y='pct', data=data_pct[data_pct['y_kmeans'] == 0],
            estimator=sum, ci=None, color='#9b59b6', label='0')

plt.legend(loc='upper right', bbox_to_anchor=(1.1, 1))
plt.xticks(rotation=90)
plt.ylabel('Percentage')
plt.tight_layout()
plt.show()

# Clusters by Body_Style: same percentage breakdown, single grouping column.
data_pct = dataset.groupby(['Body_Style', 'y_kmeans'])['Name'].count().to_frame().reset_index()
data_pct.columns = ['Body_Style', 'y_kmeans', 'count']

data_pct_Total = data_pct.groupby(['Body_Style']).sum()['count'].reset_index()
data_pct_Total.columns = ['Body_Style', 'count_total']

data_pct = data_pct.merge(right=data_pct_Total,
                          how='inner',
                          left_on='Body_Style',
                          right_on='Body_Style')

data_pct['pct'] = data_pct['count'] / data_pct['count_total']

sns.barplot(x='Body_Style', y='pct', data=data_pct, estimator=sum, ci=None, color='#34495e', label='4')
sns.barplot(x='Body_Style', y='pct', data=data_pct[data_pct['y_kmeans'] <= 3],
            estimator=sum, ci=None, color='#e74c3c', label='3')
sns.barplot(x='Body_Style', y='pct', data=data_pct[data_pct['y_kmeans'] <= 2],
            estimator=sum, ci=None, color='#95a5a6', label='2')
sns.barplot(x='Body_Style', y='pct', data=data_pct[data_pct['y_kmeans'] <= 1],
            estimator=sum, ci=None, color='#3498db', label='1')
sns.barplot(x='Body_Style', y='pct', data=data_pct[data_pct['y_kmeans'] == 0],
            estimator=sum, ci=None, color='#9b59b6', label='0')

plt.legend(loc='upper right', bbox_to_anchor=(1.1, 1))
plt.xticks(rotation=90)
plt.ylabel('Percentage')
plt.tight_layout()
plt.show()

homework2/main_ver1.1.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
# coding: utf-8
"""Homework 2 (v1.1): K-means clustering of pokemon battle stats.

For each pair in subject.csv, predict 1 if both pokemon land in the same
cluster (treated as "same type"), else 0, and write result.csv in the
Kaggle submission format (columns: pair, answer; pair values 0-999).
"""

import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load the data and keep only the numeric battle stats for clustering.
dataset = pd.read_csv('HW2_pokemon.csv')
print(dataset.shape)
dataset_num = dataset[['Total', 'HP', 'Attack', 'Defense', 'Sp_Atk', 'Sp_Def', 'Speed']]

# Standardize the stats so each contributes equally to the distance metric.
ss = StandardScaler()
dataset_num = ss.fit_transform(dataset_num)
dataset_scaled = dataset.copy()
dataset_scaled[['Total', 'HP', 'Attack', 'Defense', 'Sp_Atk', 'Sp_Def', 'Speed']] = dataset_num

# Elbow method: plot the K-means inertia for k = 1..40.
loss = []
for i in range(1, 41):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(dataset_num)
    loss.append(kmeans.inertia_)
plt.plot(range(1, 41), loss)
plt.xticks(np.arange(1, 41, 1.0))
plt.grid(axis='x')
plt.show()

# NOTE(review): the elbow suggests n_clusters between 5 and 10 would be
# better; 82 is what the author settled on for the pair-matching task.
kmeans = KMeans(n_clusters=82, init='k-means++', random_state=84)
y_kmeans = kmeans.fit_predict(dataset_num)

# Visualize the K-means cluster assignments.
dataset['y_kmeans'] = y_kmeans
seaborn.violinplot(x='y_kmeans', y='Total', data=dataset)
plt.show()

print("y_kmeans : ", type(y_kmeans), y_kmeans.shape)
print(y_kmeans[:20])
test_dataset = pd.read_csv('subject.csv')
print(type(test_dataset), test_dataset.shape)

# A pair counts as "same type" (1) iff both fall in the same cluster.
# Entries presumably look like "pokemonXYZ": the numeric id starts at
# character 7 — TODO confirm against subject.csv.
# (A MIDDLE-threshold variant was tried and dropped: it sharply reduced
# the accuracy of the prediction.)
res = []
for index, (td1, td2) in enumerate(zip(test_dataset["0"], test_dataset["1"])):
    td1_i = int(td1[7:])
    td2_i = int(td2[7:])
    res.append([index, 1 if y_kmeans[td1_i] == y_kmeans[td2_i] else 0])
print(res[:10])

# Write the predictions in the format Kaggle requires:
# first column "pair" (0-999), second column "answer".
res_csv_file_path = "result.csv"
with open(res_csv_file_path, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(('pair', 'answer'))
    for ids, val in enumerate(res):
        writer.writerow((str(ids), val[1]))
print("---------------execute finished.")

0 commit comments

Comments
 (0)