scrape.py
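"""Scrape Stack Overflow questions for a user-chosen tag and sort filter,
then export each question's votes, title, link and date to a JSON file."""
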
from bs4 import BeautifulSoup
import requests
import json
fmt = "https://stackoverflow.com/questions/tagged/{tag}?tab={filter}&pagesize=15"
filters = [
"1. Newest",
"2. Active",
"3. Bounties",
"4. Unanswered",
"5. Frequent",
"6. Votes",
]
tag = input("enter any question tag (python, java)\n")
print("\n".join(filters))
filter = int(input("enter the filter number (1, 3, 5)\n"))
try:
filter = filters[filter].split(" ")[-1]
except:
filter = "Votes"
# generate dynamic URL with user preferences
URL = fmt.format(tag=tag, filter=filter)
print("generated URL ", URL)
content = requests.get(URL).content
soup = BeautifulSoup(content, "lxml")


# match only question summary elements (ids look like "question-summary-<id>")
def is_question(element):
    try:
        return element.get("id").startswith("question-summary-")
    except AttributeError:
        # elements without an id attribute return None from .get("id")
        return False


questions = soup.find_all(is_question)
question_data = []

if questions:
    # extract question data like votes, title, link and date
    for question in questions:
        question_dict = {}
        question_dict["votes"] = (
            question.find(class_="s-post-summary--stats-item-number").get_text().strip()
        )
        h3 = question.find(class_="s-post-summary--content-title")
        question_dict["title"] = h3.get_text().strip()
        question_dict["link"] = "https://stackoverflow.com" + h3.find("a").get("href")
        question_dict["date"] = (
            question.find(class_="s-user-card--time").span.get_text().strip()
        )
        question_data.append(question_dict)

    with open(f"questions-{tag}.json", "w") as f:
        json.dump(question_data, f)
    print("file exported")
else:
    print(URL)
    print("looks like there are no questions matching your tag ", tag)