-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathauthor.py
115 lines (105 loc) · 3.29 KB
/
author.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Author data: statistics computed from the author-related fields of book records.
import csv
import json
import re
from collections import Counter

import jieba
import jieba.analyse
import pymongo
class author:
    """Author-related statistics over the ``book`` collection of ``bookdb``.

    Each statistic method (``get_author_name``, ``get_author_duty``,
    ``get_book_nation``) reloads the collection, prints one ``key count``
    line per distinct value, and returns the counts as a dict so they can be
    passed to ``sort_by_value``.

    NOTE(review): the ``|a`` / ``|4`` / ``|f`` markers matched below look like
    MARC-style subfield codes stored inline in the text fields -- confirm
    against the data source.
    """

    # MongoDB handles, created once when the class body is executed and
    # shared by every instance.
    dbClient = pymongo.MongoClient('mongodb://localhost:27017/')
    db = dbClient['bookdb']
    book_col = db['book']
    # Class-level defaults; get_book_list() rebinds book_list per instance.
    book_list = []
    author_set = set()

    # Compiled once instead of on every loop iteration.
    # Text after a "|a" marker, up to the next "$" or "|".
    _AUTHOR_NAME_RE = re.compile(r'\|a\s+([^\$\|]+)[\s\|$]?')
    # Non-blank token after a "|4" marker.
    _AUTHOR_DUTY_RE = re.compile(r'\|4\s([^\s\|\$]+)')
    # Parenthesised text directly after a "|f" marker.
    _NATION_RE = re.compile(r'\|f\s\(([^)]+)\)')

    @staticmethod
    def _text(book, key):
        """Return ``book[key]`` when it is a string, otherwise ``''``.

        Lets the statistic methods skip records whose field is missing or
        not textual instead of raising ``KeyError`` / ``TypeError``.
        """
        value = book.get(key)
        return value if isinstance(value, str) else ''

    def get_book_list(self):
        """Load every document of the book collection into ``self.book_list``.

        The list is rebuilt on each call. Appending to the shared class-level
        list (as before) duplicated every book whenever more than one
        statistic method was called, inflating all counts.

        Returns:
            The freshly loaded list of book documents.
        """
        self.book_list = list(self.book_col.find())
        return self.book_list

    def get_author_name(self):
        """Print and return, for each author name, the number of books naming it.

        Names are the captures of the ``|a`` pattern in each book's
        ``author`` field. A name that occurs several times in one record is
        counted once for that book. After the per-name lines, the number of
        distinct names is printed.

        Returns:
            dict mapping author name to the number of books containing it.
        """
        self.get_book_list()
        counts = Counter()
        for book in self.book_list:
            names = self._AUTHOR_NAME_RE.findall(self._text(book, 'author'))
            # set(): one vote per book, however often the name repeats in it.
            counts.update(set(names))
        for name, num in counts.items():
            print(name, num)
        print(len(counts))
        return dict(counts)

    def get_author_duty(self):
        """Print and return how often each responsibility (duty) term occurs.

        Duty terms are the captures of the ``|4`` pattern in each book's
        ``author`` field. A compound capture such as ``a/b`` is split on
        ``/`` and each part is counted separately; the original computed
        this split but then discarded it. Every occurrence counts, including
        repeats within one record. After the per-term lines, the number of
        distinct terms is printed.

        Returns:
            dict mapping duty term to its total number of occurrences.
        """
        self.get_book_list()
        counts = Counter()
        for book in self.book_list:
            for match in self._AUTHOR_DUTY_RE.findall(self._text(book, 'author')):
                # Drop empty pieces produced by a leading/trailing "/".
                counts.update(part for part in match.split('/') if part)
        for duty, num in counts.items():
            print(duty, num)
        print(len(counts))
        return dict(counts)

    def get_book_by_author(self, name):
        """Print and return the books whose ``author`` field matches ``name``.

        Args:
            name: Passed unchanged as a MongoDB ``$regex``, so regex
                metacharacters in it are interpreted as a pattern, not
                matched literally.

        Returns:
            list of the matching book documents, in cursor order.
        """
        matches = []
        for book in self.book_col.find({'author': {"$regex": name}}):
            print(book)
            matches.append(book)
        return matches

    def get_book_nation(self):
        """Print and return how many books carry each author-nationality tag.

        The tag is the first parenthesised text following a ``|f`` marker in
        each book's ``name`` field, stripped of surrounding whitespace.
        Books without such a tag, or with a blank one, are ignored.

        Returns:
            dict mapping nationality tag to the number of books carrying it.
        """
        self.get_book_list()
        counts = Counter()
        for book in self.book_list:
            found = self._NATION_RE.search(self._text(book, 'name'))
            if found is None:
                continue
            nation = found.group(1).strip()
            if nation:
                counts[nation] += 1
        for nation, num in counts.items():
            print(nation, num)
        return dict(counts)

    def sort_by_value(self, d):
        """Return the items of dict ``d`` as a list sorted by value, largest first.

        Items with equal values keep their original relative order.
        """
        return sorted(d.items(), key=lambda item: item[1], reverse=True)
if __name__ == "__main__":
    # Script entry point: print the author-nationality statistics.
    analyzer = author()
    # Example of looking up books by author instead:
    # analyzer.get_book_by_author('程焕文')
    analyzer.get_book_nation()