|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +import study |
| 4 | +import pandas as pd |
| 5 | +import matplotlib.pyplot as plt |
| 6 | +import numpy as np |
| 7 | + |
def main():
    """Explore the US baby-names data set and plot several summaries.

    Reads one ``yob<year>.txt`` file per year (1880 through 2010) from
    ``<study.DATA_DIR>/ch02/names``, concatenates them into a single frame
    and then shows, one figure at a time:

    * total births by sex and year,
    * births per year for a few selected names,
    * the share of births covered by the top-1000 names,
    * the number of names needed to cover half of all births,
    * the distribution of the last letter of names by sex,
    * the trend of the last letters 'd', 'n' and 'y' for boys,
    * the boy/girl split of names containing "lesl".

    Returns nothing.
    """
    columns = ['name', 'sex', 'births']

    # Load one frame per year and tag every row with that year.
    pieces = []
    for year in range(1880, 2011):
        path = '{0}/ch02/names/yob{1}.txt'.format(study.DATA_DIR, year)
        frame = pd.read_csv(path, names=columns)
        frame['year'] = year
        pieces.append(frame)
    # Stack the rows and discard the per-file indexes.
    names = pd.concat(pieces, ignore_index=True)

    # pivot_table: `cols`/`rows` are deprecated, use `columns`/`index`.
    total_births = names.pivot_table(
        values='births', index='year', columns='sex', aggfunc='sum')
    total_births.plot(title='Total births by sex and year')
    plt.show()

    # Add each name's share ("prop") of the births in its (year, sex) group.
    # Iterating the groups explicitly hands add_prop the complete group,
    # including the 'year' and 'sex' columns, and keeps a flat row index so
    # those labels are never both a column and an index level.
    groups = ['year', 'sex']
    names = pd.concat(
        [add_prop(group) for _, group in names.groupby(groups)])
    # Sanity check: the shares inside every group must add up to 1.
    assert np.allclose(names.groupby(groups)['prop'].sum(), 1)

    # Keep the 1000 most frequent names of every (year, sex) group.
    top1000 = pd.concat(
        [get_top1000(group) for _, group in names.groupby(groups)])

    # Example: how a few boys' and girls' names trend over the years.
    # Hypothesis drawn from the plot: parents may be moving away from the
    # most common names.
    total_births = top1000.pivot_table(
        values='births', index='year', columns='name', aggfunc='sum')
    subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
    subset.plot(subplots=True, figsize=(12, 10), grid=False,
                title='Number of births per year')
    plt.show()

    # Check name diversity: what share of all births the top names cover.
    table = top1000.pivot_table(
        values='prop', index='year', columns='sex', aggfunc='sum')
    table.plot(title='Sum of table1000.prop by year and sex',
               yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
    plt.show()

    # Change in name diversity per year.
    # Result: names become more diverse after 1980, girls' names especially.
    diversity = top1000.groupby(groups).apply(get_quantile_count)
    diversity = diversity.unstack('sex')
    diversity.plot(title='Number of popular names in top 50%')
    plt.show()

    # Differences between the sexes in the last letter of names.
    last_letters = names['name'].map(lambda name: name[-1])
    last_letters.name = 'last_letter'
    table = names.pivot_table(
        values='births', index=last_letters, columns=['sex', 'year'],
        aggfunc='sum')
    # Without fillna the NaN cells make plotting fail with
    # "CGContextClosePath: no current point".
    subtable = table.reindex(
        columns=[1910, 1960, 2010], level='year').fillna(0.0)
    letter_prop = subtable / subtable.sum().astype(float)
    _, axes = plt.subplots(2, 1, figsize=(10, 8))
    letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
    letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
                          legend=False)
    plt.show()

    # Look closer at the last letters that stood out for boys above.
    # .T transposes, so the years become the rows (the x axis).
    letter_prop = table / table.sum().astype(float)
    # .loc replaces the removed .ix indexer; 'M' selects the male columns.
    dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
    dny_ts.plot(style={'d': '-.', 'n': '-', 'y': ':'})
    plt.show()

    # Names that moved from being boys' names to girls' names (and back).
    all_names = top1000['name'].unique()
    mask = np.array(['lesl' in x.lower() for x in all_names])
    lesley_like = all_names[mask]
    filtered = top1000[top1000['name'].isin(lesley_like)]
    table = filtered.pivot_table(
        values='births', index='year', columns='sex', aggfunc='sum')
    # Divide every year (row) by its total over both sexes.
    table = table.div(table.sum(axis=1), axis=0)
    table.plot(style={'M': 'k-', 'F': 'k--'})
    plt.show()
| 86 | + |
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole of ``group``, computed in floating point.

    Parameters:
        group: DataFrame with a numeric ``births`` column.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    # Work on a copy so the caller's frame (possibly a slice of a larger
    # frame) is never written to.
    result = group.copy()
    births = result['births'].astype(float)
    result['prop'] = births / births.sum()
    return result
| 93 | + |
def get_top1000(group):
    """Return the (up to) 1000 rows of ``group`` with the most births.

    Rows are ordered by the ``births`` column, largest first. Groups with
    fewer than 1000 rows are returned whole, sorted.
    """
    # sort_values replaces sort_index(by=...), which pandas no longer accepts.
    return group.sort_values(by='births', ascending=False).head(1000)
| 96 | + |
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    The ``prop`` values are sorted from largest to smallest and accumulated
    (cumulative sum); the result is the number of rows taken until that
    running total reaches ``q``. A small count means a few top names make up
    a large share of the group.

    Parameters:
        group: DataFrame with a numeric ``prop`` column.
        q: target cumulative share, 0.5 (one half) by default.

    Returns:
        The one-based count as an integer.
    """
    # sort_values replaces sort_index(by=...), which pandas no longer accepts.
    shares = group['prop'].sort_values(ascending=False)
    # searchsorted yields a zero-based position; +1 turns it into a count.
    return shares.cumsum().values.searchsorted(q) + 1
| 105 | + |
if __name__ == '__main__':
    # main() has no return value, so call it directly instead of printing
    # its result (which only ever printed "None").
    main()
0 commit comments