-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path101_key_words.py
58 lines (53 loc) · 1.75 KB
/
101_key_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
import glob, csv, MeCab
# NOTE(review): this only binds a module-level Python variable named
# PYTHONIOENCODING; nothing in the visible code reads it. Presumably the
# PYTHONIOENCODING *environment* variable was intended, which has to be set
# before the interpreter starts -- confirm, and drop this line if unused.
PYTHONIOENCODING='utf-8'
def make_tweet_list(file, column=5):
    """Collect one column's text from every row of a CSV stream.

    Args:
        file: An iterable of CSV lines (for example an open file object);
            it is handed straight to ``csv.reader``.
        column: Zero-based, non-negative index of the field to extract.
            Defaults to 5.

    Returns:
        A list holding the ``column`` field of each row, in input order.
        Rows with too few fields -- including the empty rows that
        ``csv.reader`` yields for blank lines -- are skipped rather than
        raising ``IndexError``.
    """
    return [row[column] for row in csv.reader(file) if len(row) > column]
def make_word_list(
        tweet_list,
        tagger_args='-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd'):
    """Tokenize tweets with MeCab and keep the tokens that pass the filters.

    Args:
        tweet_list: Sequence of tweet texts to analyse.
        tagger_args: Option string passed to ``MeCab.Tagger``. The default
            points at a dictionary under ``/usr/local``; pass a different
            string on machines where the dictionary lives elsewhere.

    Returns:
        A list of ``(key, feature)`` tuples, one per kept token, where
        ``feature`` is the node's comma-split feature list and ``key`` is
        its third-from-last field (presumably the base form in the IPADIC
        feature layout -- confirm against the dictionary in use).
    """
    # Nothing to parse: return before touching MeCab at all, so an empty
    # input never constructs a tagger.
    if not tweet_list:
        return []

    # Feature labels that disqualify a token (apparently part-of-speech tags
    # and sub-categories such as symbols, numbers, particles and affixes).
    ignored = frozenset([
        'BOS/EOS',
        '記号',
        '数',
        '助詞',
        '助動詞',
        '接頭',
        '接尾',
        '接頭詞',
        '接尾詞',
        '特殊',
        '非自立',
    ])

    # Build the tagger once; it does not depend on the tweet being parsed.
    tagger = MeCab.Tagger(tagger_args)

    word_list = []
    for tweet in tweet_list:
        # NOTE(review): as written this swaps a space for a space, i.e. it
        # is a no-op. Presumably the first literal was meant to be a
        # full-width space -- confirm against the original file.
        text = tweet.replace(' ', ' ')
        node = tagger.parseToNode(text)
        while node:
            feature = node.feature.split(',')
            if ('@' not in node.surface            # skip tokens containing '@'
                    and len(feature) >= 3          # guard the [-3] lookup
                    and feature[-3] != '*'         # '*' marks an empty field
                    and ignored.isdisjoint(feature)):
                word_list.append((feature[-3], feature))
            node = node.next
    return word_list
def _freeze(value):
    """Return a hashable stand-in for *value*; nested lists/tuples are tagged
    by kind so that a list never collides with an equal-looking tuple."""
    if isinstance(value, list):
        return ('list', tuple(_freeze(item) for item in value))
    if isinstance(value, tuple):
        return ('tuple', tuple(_freeze(item) for item in value))
    return value


def _tally_hashable(word_list):
    """Count words in one pass with a dict; raises TypeError if a word
    (after freezing) is still unhashable."""
    counts = {}
    first_seen = []  # (key, word) pairs in order of first appearance
    for word in word_list:
        key = _freeze(word)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
            first_seen.append((key, word))
    return [(word, counts[key]) for key, word in first_seen]


def _tally_by_equality(word_list):
    """Count words using only ``==`` comparisons (quadratic fallback)."""
    seen = []
    tallies = []
    for word in word_list:
        if word in seen:
            continue
        tallies.append((word, word_list.count(word)))
        seen.append(word)
    return tallies


def count_words(word_list):
    """Count how often each distinct word occurs.

    Args:
        word_list: List of words. Items may be plain hashable values or
            nested lists/tuples of them (such as ``(key, feature_list)``
            pairs); anything else is still accepted but is counted with a
            slower equality scan.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        Words with equal counts keep the order of their first appearance,
        and ``word`` is the first occurrence found in ``word_list``.
    """
    try:
        result = _tally_hashable(word_list)
    except TypeError:
        # Some item cannot be used as a dict key; compare by equality.
        result = _tally_by_equality(word_list)
    # list.sort is stable, so ties stay in first-seen order.
    result.sort(key=lambda pair: pair[1], reverse=True)
    return result
def main(pattern='data/*.csv', top_n=101):
    """Print the most frequent words found in the CSV files matching *pattern*.

    Args:
        pattern: Glob pattern selecting the CSV files to read.
        top_n: Maximum number of ranking lines to print.

    Each output line is a zero-padded rank starting at 000, followed by the
    word key and the first entry of its feature list. Fewer than ``top_n``
    lines are printed when there are fewer distinct words.
    """
    word_list = []
    # glob returns names in arbitrary order; sort them so that words with
    # equal counts are ranked the same way on every run.
    for name in sorted(glob.glob(pattern)):
        # The with-block closes each file as soon as it has been read.
        with open(name) as datafile:
            word_list.extend(make_word_list(make_tweet_list(datafile)))

    result = count_words(word_list)
    # Slicing instead of indexing up to a fixed bound avoids an IndexError
    # on small data sets.
    for rank, (word, _count) in enumerate(result[:top_n]):
        # Single-argument parenthesised print behaves the same on
        # Python 2 and Python 3.
        print('%03d %s %s' % (rank, word[0], word[1][0]))


if __name__ == '__main__':
    main()