scraper.py
import os

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
# Disable JavaScript: the pages load far faster and some ads never run.
op = Options()
op.set_preference('javascript.enabled', False)
# Point the browser at the local X display. Only relevant on Linux/X11;
# harmless on other platforms.
os.environ['DISPLAY'] = ":0"
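# (Untested sketch: running Firefox headless via op.add_argument('-headless')
# should avoid needing a display at all.)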
# Create the data directory for the CSV output if it doesn't exist yet
os.makedirs("data", exist_ok=True)
# Start the webdriver. Firefox requires geckodriver; make sure it is on your PATH.
driver = webdriver.Firefox(options=op)
# Category slugs used by mediabiasfactcheck. The hyphenation is inconsistent
# on the website itself, so these are not typos.
cats = {"left", "leftcenter", "center", "right-center",
        "right", "conspiracy", "fake-news", "pro-science", "satire"}
# Scrape each category page in turn
for category in cats:
    # Load the category listing page
    driver.get(f'https://mediabiasfactcheck.com/{category}/')
    # Accumulator lists for this category
    group = []
    href = []
    links = []
    # Pull the HTML of the rendered page
    content = driver.page_source
    # Parse it with BeautifulSoup
    soup = BeautifulSoup(content, features="html.parser")
    # Find the table of news sources, then its rows
    table = soup.find("table", id="mbfc-table")
    rows = table.find_all("tr")
    # Record each source's name and the href of its detail page
    for row in rows:
        group.append(row.text)
        # The link sits at a different nesting depth depending on the row's
        # markup, so probe the deeper layout first and fall back.
        try:
            href.append(row.contents[0].contents[0].contents[0].attrs["href"])
        except (AttributeError, IndexError, KeyError):
            try:
                href.append(row.contents[0].contents[0].attrs["href"])
            except (AttributeError, IndexError, KeyError):
                # No link at all: an advert row, marked here and dropped below
                href.append(" ")
    # Turn the relative hrefs into absolute links
    for url in href:
        links.append("https://mediabiasfactcheck.com" + str(url))
    # Build a DataFrame from the lists, drop the advert rows, and save
    df = pd.DataFrame({'Group': group, 'Link': links, 'Type': category})
    noadvert = df[~(df["Group"] == " ")]
    noadvert.to_csv(f'data/{category}.csv', index=False, encoding='utf-8')
driver.quit()
files = os.listdir("data")
# Exclude any previous combined file so it isn't concatenated into itself
try:
    files.remove("all.csv")
except ValueError:
    pass
# Consolidate the per-category CSV files into one DataFrame
allcsv = pd.concat([pd.read_csv("./data/" + file) for file in files])
# Export the combined data as a single CSV
allcsv.to_csv("./data/all.csv", index=False, encoding="utf-8")
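
# Usage sketch (assumes Firefox and a matching geckodriver on your PATH):
#   python scraper.py
# Output: data/<category>.csv for each category, plus a combined data/all.csv.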