-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
172 lines (159 loc) · 5.94 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python
# coding:utf-8
# author aibow
"""
遍历目录,查找所有word文档,检查文档是否包含指定关键词
结果日志(result.out):
path:命中次数:命中词列表
错误日志列表(result.err):
time:path:错误说明
工作流程:
- 读取命中词列表和处理目录
- 遍历目录,查找doc,docx文档
- 将文档转换为txt文本
- 检查命中词
- 记录结果
"""
import sys
# Python 2 idiom: make UTF-8 the interpreter-wide default encoding used for
# implicit str/unicode conversions. reload(sys) is needed first because
# sys.setdefaultencoding is removed from the module during interpreter startup.
reload(sys)
sys.setdefaultencoding('utf-8')
from optparse import OptionParser, OptParseError
import os
import os.path
from win32com.client import Dispatch
import shutil
import time
import hashlib
def utf2gbk(s):
    """Re-encode a UTF-8 byte string as GBK.

    Falsy input (None, empty string) yields ''. Characters that GBK cannot
    represent are dropped. If the conversion fails for any reason the input
    is handed back unchanged.
    """
    if s:
        try:
            text = s.decode('utf-8')
            return text.encode('gbk', 'ignore')
        except Exception:
            # Best effort: fall back to the untouched input.
            return s
    return ''
def gbk2utf(s):
    """Re-encode a GBK byte string as UTF-8.

    Falsy input is returned as-is. If the conversion fails for any reason
    the input is handed back unchanged.
    """
    if s:
        try:
            text = s.decode('gbk')
            return text.encode('utf-8', 'ignore')
        except Exception:
            # Best effort: fall through and return the untouched input.
            pass
    return s
def convert(path, tempPath):
    """Save the Word document at ``path`` as a text file at ``tempPath``.

    Drives a hidden Word instance through COM automation.

    Args:
        path: Path of the .doc/.docx file to convert.
        tempPath: Destination path for the converted text file.

    Raises:
        Exception: ``path`` is empty or does not name an existing file.
        Any error raised by the Word automation calls propagates to the
        caller after Word has been shut down.
    """
    # os.path.isfile() is already False for paths that do not exist.
    if not path or not os.path.isfile(path):
        raise Exception('Path Not Found Or Path Invalid')
    app = Dispatch('Word.Application')
    try:
        app.Visible = 0
        app.DisplayAlerts = 0
        # Work on the document returned by Open() rather than whatever
        # document happens to be active.
        doc = app.Documents.Open(FileName=path)
        # FileFormat=2 is Word's wdFormatText (plain text) save format.
        doc.SaveAs(FileName=tempPath, FileFormat=2)
    finally:
        # Always shut Word down; otherwise a failed Open/SaveAs leaves an
        # invisible Word process running for every broken document.
        app.Quit()
def log(msg):
    """Print ``msg`` to stdout prefixed with the local time as [YYYY-MM-DD HH:MM:SS]."""
    # strftime() formats the current local time when no time tuple is given.
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    print('[%s] %s' % (stamp, msg))
def _load_words(wordFile):
    """Return the keywords in ``wordFile``: its non-blank lines not starting with '#'."""
    words = []
    with open(wordFile, 'r') as f:
        for line in f:
            line = line.strip()
            if line and line[0] != '#':
                words.append(line)
    return words


def _find_documents(dir):
    """Return the paths of all .doc/.docx files found below ``dir``."""
    found = []
    # Bottom-up walk (topdown=False), as before, so processing order is unchanged.
    for root, _dirs, files in os.walk(dir, False):
        for name in files:
            candidate = os.path.join(root, name)
            if candidate.lower().endswith(('.doc', '.docx')):
                found.append(candidate)
    return found


def _cache_document(docFile, cacheFile, tempPath, errorPath):
    """Convert ``docFile`` to text and store it in ``cacheFile``; return True on success."""
    try:
        # The scratch file is shared by all documents; drop any stale copy first.
        if os.path.exists(tempPath):
            os.remove(tempPath)
        convert(docFile, tempPath)
        with open(tempPath, 'r') as f:
            body = f.read()
        with open(cacheFile, 'w') as f:
            # Cache layout: source path, blank line, document text, passed
            # through gbk2utf as one string.
            f.write(gbk2utf('%s\n\n%s' % (docFile, body)))
    except Exception as e:
        with open(errorPath, 'a+') as f:
            f.write('Convert Document Error\n%s\n%s\n' % (docFile, str(e)))
        log('Convert Document Error {%s} %s' % (docFile, str(e)))
        return False
    return True


def _check_cache(cachePath, wordList, resultPath, errorPath):
    """Search every cached text under ``cachePath`` for the keywords and record the hits."""
    for root, _dirs, files in os.walk(cachePath, False):
        for name in files:
            cacheFile = os.path.join(root, name)
            try:
                with open(cacheFile, 'r') as f:
                    # First line is the source document path, the rest is its text.
                    path = f.readline().strip()
                    body = f.read()
                log('Check Document {%s} ...' % utf2gbk(path))
                hits = [word for word in wordList if word in body]
                if hits:
                    summary = '%s:%s:%s' % (path, len(hits), ','.join(hits))
                    with open(resultPath, 'a+') as f:
                        f.write('%s\n' % summary)
                    log(utf2gbk(summary))
            except Exception as e:
                with open(errorPath, 'a+') as f:
                    f.write('Check Document Error\n%s\n%s\n' % (cacheFile, str(e)))
                # Report on the console too, like conversion errors are.
                log('Check Document Error {%s} %s' % (cacheFile, str(e)))


def main(dir, wordFile, flush):
    """Scan the Word documents under ``dir`` for the keywords listed in ``wordFile``.

    Working files are created in the current directory: ``temp/`` (cache of
    converted texts, one file per document named after the MD5 of its path),
    ``doc.tmp`` (conversion scratch file), ``result.txt`` (rewritten on each
    run, one ``path:hit count:keywords`` line per document with hits) and
    ``error.txt`` (appended to).

    Args:
        dir: Directory to search recursively for .doc/.docx files.
        wordFile: Text file with one keyword per line; blank lines and lines
            starting with '#' are ignored.
        flush: If truthy, delete the existing cache directory first.

    Raises:
        Exception: 'Directory Invalid', 'Word File Invalid',
            'Cache Path Invalid' or 'Word File Empty' when the respective
            input or the cache location is unusable.
    """
    log('Application Starting ...')
    currPath = os.path.abspath(os.path.curdir)
    cachePath = os.path.join(currPath, 'temp')
    errorPath = os.path.join(currPath, 'error.txt')
    resultPath = os.path.join(currPath, 'result.txt')
    tempPath = os.path.join(currPath, 'doc.tmp')

    # Validate the inputs before touching the cache, so a mistyped argument
    # cannot flush or create anything. isdir()/isfile() are False for
    # paths that do not exist.
    if not dir or not os.path.isdir(dir):
        raise Exception('Directory Invalid')
    if not wordFile or not os.path.isfile(wordFile):
        raise Exception('Word File Invalid')

    # Flush the cache on request, then make sure the cache directory exists.
    if flush and os.path.exists(cachePath):
        shutil.rmtree(cachePath)
    if not os.path.exists(cachePath):
        os.mkdir(cachePath)
    elif not os.path.isdir(cachePath):
        raise Exception('Cache Path Invalid')

    # Load the keyword list.
    log('Loading Word File ...')
    wordList = _load_words(wordFile)
    if not wordList:
        raise Exception('Word File Empty')
    log('Word File Load Success, Find %s Word' % len(wordList))

    # Collect the Word documents.
    log('Find Doc File ...')
    docList = _find_documents(dir)
    log('Find Success, Find %s Document' % len(docList))

    # Convert every document that is not cached yet.
    log('Preprocessor Document ...')
    for docFile in docList:
        log('Convert Document %s' % docFile)
        cacheFile = os.path.join(cachePath, '%s.tmp' % hashlib.md5(docFile).hexdigest())
        if os.path.exists(cacheFile):
            continue
        # Announce success only when the conversion worked, and name the
        # document rather than the shared scratch file.
        if _cache_document(docFile, cacheFile, tempPath, errorPath):
            log('Convert Document {%s} Success' % docFile)
    log('Preprocessor Document Success')

    # Start from a fresh result file.
    if os.path.exists(resultPath):
        os.remove(resultPath)

    # NOTE(review): this checks every file in the cache directory, so entries
    # cached by earlier runs over other directories are reported as well
    # unless the cache was flushed -- confirm this is intended.
    _check_cache(cachePath, wordList, resultPath, errorPath)
    log('Application Finished')
if __name__ == '__main__':
    # Command-line entry point: parse -d/-w/-f, run the scan, and exit with a
    # non-zero status when anything fails so callers can detect the failure.
    try:
        op = OptionParser()
        op.add_option('-d', '--dir', type='string', dest='dir', default='', help='Directory Path')
        op.add_option('-w', '--word', type='string', dest='word', default='', help='Word File Path')
        op.add_option('-f', '--flush', type='int', dest='flush', default=0, help='Clear Cache File')
        # parse_args() reads sys.argv[1:] by default; passing sys.argv itself
        # would feed the script name in as a positional argument.
        arg, _ = op.parse_args()
        main(arg.dir, arg.word, arg.flush)
    except OptParseError:
        log('Option Parser Error')
        sys.exit(1)
    except Exception as e:
        log(str(e))
        sys.exit(1)