-
Notifications
You must be signed in to change notification settings - Fork 0
/
spider.py
122 lines (95 loc) · 4.09 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# SFSpider
# - A spider that uses the urllib and BeautifulSoup modules to collect all the tags from SegmentFault
# Author: LuRenJia https://untitled.pw/
# Time: 2018-06-04
# Import some necessary packages
from urllib import request
from bs4 import BeautifulSoup as soup
import ssl
import time
import os
import csv
import translate as t
import random
# TLS context with certificate verification switched off, so requests still
# succeed when the server certificate cannot be validated.
# NOTE(review): this also removes protection against man-in-the-middle
# attacks - confirm that this trade-off is still wanted.
sslContext = ssl._create_unverified_context()

# Module-level state
sleepTime = 3  # presumably the default pause between queries, in seconds
counter = 0  # running count of tags written so far; scratchTag updates it
def scratchTag(page, fileName):
    """Scrape one page of SegmentFault tags, translate them and append them to a CSV file.

    The tag names found on the page are joined into a single string, sent
    through ``t.translate`` in one call, and the answer is split on "," to get
    one translation per tag. Each tag is then appended to ``fileName`` as a
    row with the columns ``id``, ``zh-cn`` and ``en-us`` (no header row).

    Args:
        page: Page number as a string; it is concatenated into the request URL.
        fileName: Name of the CSV file, opened relative to the current
            directory in append mode.

    Returns:
        The module-level ``counter`` after this page, i.e. the running number
        of tags written so far.

    Raises:
        ssl.SSLEOFError: If the translation request fails and the user does
            not answer "Y" or "y" when asked whether to retry.
    """
    global counter

    # Download the tag listing page; the with-block closes the connection
    url = "https://segmentfault.com/tags/all?page=" + page
    with request.urlopen(url, context=sslContext) as response:
        html = response.read().decode("utf-8")

    # Extract the tag names
    soupObj = soup(html, "html.parser")
    tagNames = [each.get_text() for each in soupObj.select("div.widget-tag h2 a")]

    # A little trick so that the translator handles the words separately:
    # every tag name is followed by the separator "、"
    translateBuffer = "".join(name + "、" for name in tagNames)

    blockedMessage = (
        "\nFucking Google has blocked your connection again!\n\n"
        "You can have a coffee and come back to your work again.\n\n"
        "When you want to restart your process, input Y, and press Enter\n\n"
    )

    # Translate, retrying this same page for as long as the user agrees to.
    # Retrying here (rather than restarting the whole run) keeps the rows
    # already written and does not skip the page that failed.
    while True:
        try:
            translateResult = t.translate("zh-CN", "en", translateBuffer)
            break
        except ssl.SSLEOFError:
            print(blockedMessage)
            print("We will continue the scratch work from page" + page)
            if input("Type Y/y to Continue").strip() not in ("Y", "y"):
                raise

    translations = [piece.strip() for piece in translateResult.split(",")]
    if len(translations) < len(tagNames):
        # Fewer pieces than tags: pad with empty strings so that every tag is
        # still written instead of failing halfway through the page
        print("Warning: only " + str(len(translations)) + " translations for "
              + str(len(tagNames)) + " tags on page " + page)
        translations += [""] * (len(tagNames) - len(translations))

    # Write to the CSV file; newline="" lets the csv module control line endings
    fieldnames = ["id", "zh-cn", "en-us"]
    with open("./" + fileName, "a+", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        for tagName, translated in zip(tagNames, translations):
            eachRow = {"id": str(counter + 1), "zh-cn": tagName, "en-us": translated}
            writer.writerow(eachRow)
            print("- Data No." + str(counter + 1) + ":")
            for key, value in eachRow.items():
                print("\t" + key + " - " + value)
            counter += 1
            time.sleep(random.uniform(0.1, 0.3))

    # In case of being banned by the translation service, sleep for a while
    print("\n\nCooling.......\n\n")
    time.sleep(random.randint(3, 5))
    return counter
def portal(startPage=0):
    """Interactive entry point: ask for the settings, then scrape page by page.

    Prints a banner, asks for the pause between queries, the number of pages
    and the output file name (an empty answer means ``tags.csv``), and then
    calls ``scratchTag`` once per page, sleeping before and after each call.

    Args:
        startPage: Zero-based index of the first page to fetch; the page
            numbers requested are ``startPage + 1`` up to the number of pages
            entered. Anything accepted by ``int()`` is allowed. With the
            default of 0 an existing output file is emptied first; with any
            other value the file is kept, so that a resumed run does not
            discard the rows already written.

    Returns:
        None.
    """
    print("\t\t\t\t\t\t\t\t****** SFSpider ******\n"
          + "\t- A spider using urllib and BeautifulSoup module to get all the tags from SegmentFault\n"
          + "\t\t\t\t\t\t- Author: LuRenJia https://untitled.pw/\n"
          + "\t\t\t\t\t\t\t\t- Time: 2018-06-04")
    time.sleep(1)

    sleepTime = _askNonNegativeInt("Please input the interval between each query (seconds): ")
    pages = _askNonNegativeInt("How many pages do you want to scratch: ")
    fileName = str(input("Please input the specified filename you want to save (default is tags.csv): "))
    if fileName == "":
        fileName = "tags.csv"

    firstIndex = int(startPage)

    # A fresh run starts from an empty file; a resumed run keeps what it has
    if firstIndex == 0 and os.path.isfile(fileName):
        with open(fileName, "w"):
            pass

    result = 0
    for i in range(firstIndex, pages):
        print("\n\nStart the No." + str(i + 1) + " of " + str(pages) + " scratchs......\n")
        time.sleep(sleepTime)
        result = scratchTag(str(i + 1), fileName)
        print("--------------------")
        print(str(result) + " tags get, sleep " + str(sleepTime) + " seconds\n")
        time.sleep(sleepTime)

    print("\n\n-----------Scratch Complete!-----------\n\n")
    print("Statistics: " + str(result) + " tags get, saved in ./" + fileName)


def _askNonNegativeInt(prompt):
    """Ask with ``prompt`` until the user types a whole number >= 0, then return it."""
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            print("Please type a whole number.")
            continue
        if value < 0:
            # time.sleep() rejects negative values, so they are refused here
            print("Please type a number that is not negative.")
            continue
        return value
# Start the interactive spider only when this file is run as a script
if __name__ == "__main__":
    portal()