Skip to content

Commit 4f45339

Browse files
committed
アメリカの赤ちゃんに名付けられた名前に関する分析を行った
- [ ] concatによるデータの連結
- [x] assertでのデータの検証
- [ ] plotの際には欠損値を置き換えてエラーを防ぐ
- [ ] NumPyの転置行列
- [ ] subplotによる複数の図の表示
1 parent e68eca2 commit 4f45339

File tree

1 file changed

+107
-0
lines changed

1 file changed

+107
-0
lines changed

ch02/naming_list.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import study
4+
import pandas as pd
5+
import matplotlib.pyplot as plt
6+
import numpy as np
7+
8+
def main():
    """Run the US baby-names analysis and display each chart in turn.

    Reads one ``yobYYYY.txt`` file per year (1880-2010) from
    ``<study.DATA_DIR>/ch02/names``, concatenates them into one frame and
    then plots, calling ``plt.show()`` after each figure:

    1. total births by sex and year;
    2. yearly births for a handful of sample names;
    3. the share of all births covered by the top-1000 names;
    4. the number of names that make up the top 50% of births;
    5. last-letter distributions by sex (bar charts, then a time series
       for the letters d / n / y among boys);
    6. the male/female split of names containing "lesl" over time.

    Returns:
        None.

    Raises:
        AssertionError: if the per-(year, sex) ``prop`` values do not sum
            to 1.
    """
    columns = ['name', 'sex', 'births']

    # Load every yearly file and tag its rows with the year it came from.
    pieces = []
    for year in range(1880, 2011):
        path = '{0}/ch02/names/yob{1}.txt'.format(study.DATA_DIR, year)
        frame = pd.read_csv(path, names=columns)
        frame['year'] = year
        pieces.append(frame)
    # Concatenate the rows and drop the per-file indexes.
    names = pd.concat(pieces, ignore_index=True)

    # pivot_table: `cols` / `rows` are deprecated; use `columns` / `index`.
    total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
    total_births.plot(title='Total births by sex and year')
    plt.show()

    # Add each name's share of births within its (year, sex) group.
    groups = ['year', 'sex']
    # group_keys=False keeps the flat row index; otherwise current pandas
    # prepends year/sex to the index and the groupby / pivot_table calls
    # below fail because those labels become ambiguous with the columns.
    names = names.groupby(groups, group_keys=False).apply(add_prop)
    # Verify that the shares sum to 1 inside every group.
    assert np.allclose(names.groupby(groups)['prop'].sum(), 1)

    # Equivalent to a for-loop with list.append followed by pd.concat.
    top1000 = names.groupby(groups, group_keys=False).apply(get_top1000)

    # Example trends of individual boys' and girls' names over the years.
    # => Hypothesis drawn from the charts: parents are moving away from
    #    the classic, common names.
    total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')
    subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
    subset.plot(subplots=True, figsize=(12, 10), grid=False, title='Number of births per year')
    plt.show()

    # Name diversity: what fraction of all births the top names cover.
    table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
    table.plot(title='Sum of table1000.prop by year and sex',
               yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
    plt.show()

    # How name diversity changes year by year.
    # Author's observation: names diversify after 1980, girls' names
    # in particular.
    diversity = top1000.groupby(groups).apply(get_quantile_count)
    diversity = diversity.unstack('sex')
    diversity.plot(title='Number of popular names in top 50%')
    plt.show()

    # Differences between the sexes in the last letter of the name.
    last_letters = names.name.map(lambda x: x[-1])
    last_letters.name = 'last_letter'
    table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc='sum')
    # Without fillna the NaN cells make plotting fail with
    # "CGContextClosePath: no current point".
    subtable = table.reindex(columns=[1910, 1960, 2010], level='year').fillna(0.0)
    letter_prop = subtable / subtable.sum().astype(float)
    _, axes = plt.subplots(2, 1, figsize=(10, 8))
    letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
    letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)
    plt.show()

    # The boys' last letters that stood out above, as a time series.
    # .T is the transpose, so years become the rows (x axis).
    letter_prop = table / table.sum().astype(float)
    # .loc replaces the removed .ix indexer; the selection is label-based.
    dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
    dny_ts.plot(style={'d':'-.', 'n': '-', 'y': ':'})
    plt.show()

    # Names that moved from being boys' names to girls' names (or back).
    all_names = top1000.name.unique()
    mask = np.array(['lesl' in x.lower() for x in all_names])
    lesley_like = all_names[mask]
    filtered = top1000[top1000.name.isin(lesley_like)]
    table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')
    # Sum across the sex columns (axis=1) and divide each year's row
    # (axis=0) by that total to get the per-year sex ratio.
    table = table.div(table.sum(axis=1), axis=0)
    table.plot(style={'M': 'k-', 'F': 'k--'})
    plt.show()
86+
87+
def add_prop(group):
    """Return a copy of *group* with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the total ``births`` of
    the whole group, so the column sums to 1 within the group.

    Args:
        group: DataFrame with a numeric ``births`` column.

    Returns:
        A new DataFrame with the extra ``prop`` column. The frame passed
        in is left untouched: pandas does not support mutating the object
        handed to a ``groupby(...).apply`` callback.
    """
    births = group['births'].astype(float)

    # Work on a copy so the caller's (groupby's) data is never modified.
    result = group.copy()
    result['prop'] = births / births.sum()

    return result
93+
94+
def get_top1000(group):
    """Return the (up to) 1000 rows of *group* with the most births.

    Args:
        group: DataFrame with a ``births`` column.

    Returns:
        DataFrame sorted by ``births`` in descending order and truncated
        to its first 1000 rows; shorter groups are returned whole.
    """
    # sort_values replaces sort_index(by=...), which pandas has removed.
    return group.sort_values(by='births', ascending=False).head(1000)
96+
97+
def get_quantile_count(group, q=0.5):
    """Count how many of the most popular rows are needed to reach *q*.

    The rows are ordered by ``prop`` from largest to smallest and their
    cumulative sum is taken; the result is the 1-based position of the
    first row at which that cumulative sum reaches ``q``. A small count
    means the top names hold a large share of the total.

    Args:
        group: DataFrame with a ``prop`` column.
        q: Cumulative share to reach (default 0.5, i.e. half).

    Returns:
        Integer count of rows needed for the cumulative ``prop`` to
        reach ``q``.
    """
    # sort_values replaces sort_index(by=...), which pandas has removed.
    ordered = group.sort_values(by='prop', ascending=False)

    cumulative = ordered['prop'].cumsum().values
    # searchsorted returns a zero-based position, hence the + 1.
    return cumulative.searchsorted(q) + 1
105+
106+
if __name__ == '__main__':
    # main() returns None and is run for its plots only, so call it
    # directly instead of printing its result (the old `print main()`
    # was Python-2-only syntax and just emitted "None").
    main()

0 commit comments

Comments
 (0)