-
Notifications
You must be signed in to change notification settings - Fork 34
/
extract_role.py
38 lines (24 loc) · 994 Bytes
/
extract_role.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from collections import Counter
import jieba
from load_config import get_yaml_config
# Load the project configuration once at import time.
config = get_yaml_config()

# Settings from the "potential" section: the inclusive token-length bounds
# and the number of most frequent tokens to keep.
min_length, max_length, top_n = (
    config["potential"]["min_length"],
    config["potential"]["max_length"],
    config["potential"]["top_n"],
)
async def extract_potential_names(text):
    """Return the most frequent tokens of ``text`` as candidate names.

    The text is segmented with ``jieba.lcut``. Tokens whose length lies in
    the inclusive range ``[min_length, max_length]`` are counted, and the
    ``top_n`` most common ones are returned. No actual name recognition is
    performed; selection is purely by token length and frequency.

    The function is declared ``async`` but awaits nothing itself; callers
    still have to await it (or drive it with an event loop) to get the result.

    Args:
        text: The text to segment.

    Returns:
        A list of tokens ordered from most to least frequent, as produced by
        ``Counter.most_common(top_n)``.
    """
    # Segment the text into tokens with jieba.
    tokens = jieba.lcut(text)

    # Count only the tokens whose length falls within the configured bounds.
    frequencies = Counter(
        token for token in tokens if min_length <= len(token) <= max_length
    )

    # Keep the tokens themselves and drop their counts.
    ranked = frequencies.most_common(top_n)
    return [token for token, _ in ranked]
if __name__ == "__main__":
    # Local import: asyncio is only needed when this file is run as a script.
    import asyncio

    # Sample input: a Chinese novel read from the current working directory.
    with open("./表白.txt", "r", encoding="utf8") as f:
        # Remove every line break so the text is segmented as one string.
        # Stripping "\n" and "\r" individually already covers "\r\n".
        novel_text = f.read().replace("\n", "").replace("\r", "")

    # extract_potential_names is a coroutine function: calling it only
    # creates a coroutine object. Run it to completion on an event loop so
    # the actual list of candidate names is printed.
    names = asyncio.run(extract_potential_names(novel_text))
    print(names)