-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSousouwa_fetch.py
64 lines (50 loc) · 1.53 KB
/
Sousouwa_fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
from BeautifulSoup import BeautifulSoup
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
def removeSpaceEntity(s):
    """Return *s* with HTML space entities removed.

    Removes the named character references ``&nbsp;``, ``&ensp;`` and
    ``&emsp;`` when they appear as raw entity text, and also the three
    literal whitespace characters the pattern has always listed.

    NOTE(review): the three literal alternatives at the end of the pattern
    are visually indistinguishable in the source; presumably they are the
    no-break / en / em space characters themselves -- confirm.
    """
    pattern = re.compile('(&nbsp;|&ensp;|&emsp;| | | )')
    return pattern.sub('', s)
def convertEntity(s):
    """Replace HTML numeric character references in *s* with the character.

    ``&#NNNN;`` is read as a decimal code point and ``&#xHHHH;`` (or
    ``&#XHHHH;``) as a hexadecimal one, with any number of digits.
    References whose code point cannot be turned into a character are left
    in the text unchanged. Named references such as ``&amp;`` are not
    touched.
    """
    # Works on both interpreters: Python 2 spells the function ``unichr``.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def decode(match):
        hex_digits, dec_digits = match.groups()
        if hex_digits is not None:
            code_point = int(hex_digits, 16)
        else:
            code_point = int(dec_digits)
        try:
            return to_char(code_point)
        except (ValueError, OverflowError):
            # Out-of-range code point: keep the original reference text.
            return match.group(0)

    pattern = re.compile(r'&#(?:[xX]([0-9a-fA-F]+)|([0-9]+));')
    return pattern.sub(decode, s)
def brRemove(s):
    """Return *s* with every ``<br...>`` tag deleted.

    Matching is case-sensitive and does not cross line breaks.
    """
    return re.sub(r'<br.*?>', '', s)
def divRemove(s):
    """Return *s* with opening ``<div ...>`` and closing ``</div>`` tags deleted.

    Only the tags are removed; the text between them is kept.
    """
    return re.sub(r'(<div.*?>|</div>)', '', s)
def tagRemove(s):
    """Return *s* with every ``<...>`` span deleted.

    The match is non-greedy and '.' does not match a newline, so a span
    that contains a line break is left in place.
    """
    return re.sub(r'(<.*?>|</.*?>)', '', s)
def spaceRemove(s):
    """Return *s* with the pattern's whitespace characters and newlines deleted.

    NOTE(review): the three whitespace alternatives in the pattern are
    visually indistinguishable in the source -- confirm which code points
    they are before editing the literal.
    """
    whitespace = r'( | | |\n)'
    return re.sub(whitespace, '', s)
def unescape(s):
    """Turn an HTML fragment into plain text.

    Steps, in order: drop space entities, strip markup tags, then decode
    numeric character references.

    Tags are stripped *before* references are decoded so that text which
    decodes to '<' ... '>' is kept as text instead of being mistaken for a
    tag and deleted.
    """
    s = removeSpaceEntity(s)
    s = tagRemove(s)
    s = convertEntity(s)
    return s
def main():
    """Download the page given on the command line and save its body as text.

    The URL is read from ``sys.argv[1]``. The response is decoded as
    Shift_JIS and parsed with BeautifulSoup; the ``<title>`` element gives
    the output file name and the ``<div class="contents ss">`` element
    gives the content. Both are passed through ``unescape`` and the result
    is written to ``<title>.txt`` in the current directory.

    Exits with a message (``SystemExit``) when no URL is given or the
    content element is missing from the page.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: Sousouwa_fetch.py URL')
    url = sys.argv[1]
    # Example URL:
    # 'http://coolier-new.sytes.net:8080/sosowa/ssw_l/?mode=read&key=1342594292&log=0'

    response = urllib2.urlopen(url)
    try:
        # Decode once at the I/O boundary; the bytes are treated as Shift_JIS.
        html = response.read().decode('sjis')
    finally:
        response.close()

    soup = BeautifulSoup(html)
    title_tag = soup.find('title')
    body_tag = soup.find("div", attrs={"class": "contents ss"})
    if body_tag is None:
        # Without this check str(None) would be written out as the text "None".
        sys.exit('no <div class="contents ss"> element found in %s' % url)

    title = spaceRemove(unescape(str(title_tag)))
    body = unescape(str(body_tag))

    # The title comes from the remote page: neutralise path separators so it
    # cannot name a file outside the current directory.
    filename = re.sub(r'[\\/]', '_', title) + '.txt'

    # Open in write mode; the with-block closes the file even if write fails.
    with open(filename, 'w') as f:
        f.write(body)
# Run the fetch only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()