|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +import pandas as pd |
| 4 | +import study |
| 5 | + |
def main():
    """Run the MovieLens rating analyses and print each report.

    Loads the ``users``, ``ratings`` and ``movies`` tables from
    ``study.DATA_DIR + 'ch02/movielens/'``, joins them into one frame and
    prints three reports, all restricted to titles that have at least 250
    ratings:

    1. the 10 titles with the highest mean rating given by gender ``'F'``;
    2. the 15 titles with the smallest ``M - F`` mean-rating difference
       (sorted ascending, so the most negative differences come first);
    3. the 10 titles whose ratings have the largest standard deviation.

    Returns:
        None. All results are written to standard output.
    """
    # Column names and file name for each source table.
    metadata = {
        'users': {
            'names': ['user_id', 'gender', 'age', 'occupation', 'zip'],
            'dat': 'users.dat',
        },
        'ratings': {
            'names': ['user_id', 'movie_id', 'rating', 'timestamp'],
            'dat': 'ratings.dat',
        },
        'movies': {
            'names': ['movie_id', 'title', 'genre'],
            'dat': 'movies.dat',
        },
    }

    # Data loading.
    tables = {}
    for table_name, spec in metadata.items():
        tables[table_name] = read_table(
            study.DATA_DIR + 'ch02/movielens/' + spec['dat'], spec['names']
        )

    # pd.merge joins on the columns the two frames share, i.e.
    # ratings.user_id = users.user_id, then ratings.movie_id = movies.movie_id.
    rated_users = pd.merge(tables['ratings'], tables['users'])
    data = pd.merge(rated_users, tables['movies'])

    # Mean rating given by women to the most frequently rated titles.
    # `index=`/`columns=` replace the removed `rows=`/`cols=` keywords.
    mean_ratings = data.pivot_table(
        values='rating', index='title', columns='gender', aggfunc='mean'
    )
    ratings_by_title = data.groupby('title').size()
    active_titles = ratings_by_title.index[ratings_by_title >= 250]
    # `.loc` replaces the removed `.ix` label-based indexer.
    mean_ratings = mean_ratings.loc[active_titles]
    # `sort_values(by=...)` replaces the removed `sort_index(by=...)`.
    top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
    print(top_female_ratings['F'][:10])

    # Per-title gap between the male and female mean ratings.
    mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
    sorted_by_diff = mean_ratings.sort_values(by='diff')
    print(sorted_by_diff[:15])

    # Top 10 titles on which raters disagreed the most (largest std. dev.).
    ratings_std_by_title = data.groupby('title')['rating'].std()
    ratings_std_by_title = ratings_std_by_title.loc[active_titles]
    # `Series.sort_values` replaces the removed `Series.order`.
    print(ratings_std_by_title.sort_values(ascending=False)[:10])
| 48 | + |
def read_table(file_path, names, sep='::', header=None):
    """Read a delimited text file into a pandas DataFrame.

    Args:
        file_path: Path of the file to read.
        names: Column names to assign to the resulting frame.
        sep: Field separator; defaults to ``'::'``.
        header: Row number to use as the header, or ``None`` (the default)
            when the file has no header row.

    Returns:
        The DataFrame produced by ``pandas.read_table``.
    """
    options = {}
    # pandas treats a separator longer than one character (other than '\s+')
    # as a regular expression, which only the Python parser supports. Choosing
    # that engine explicitly avoids the C-engine fallback ParserWarning.
    if sep is not None and len(sep) > 1 and sep != r'\s+':
        options['engine'] = 'python'
    return pd.read_table(
        file_path, sep=sep, header=header, names=names, **options
    )
| 51 | + |
if __name__ == '__main__':
    # main() prints its own reports and returns None, so printing its return
    # value would only add a stray "None" line to the output.
    main()
0 commit comments