# scraping_logic.py
import os
import random
import re
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from newspaper import Article, ArticleException

# List of user agents to rotate to avoid being blocked by Google:
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 OPR/77.0.4054.277",
]


def get_domain(url):
    """Return the domain (netloc) part of a URL."""
    parsed_url = urlparse(url)
    return parsed_url.netloc


def get_search_results(topic, num_results=5):
    """Scrape a Google results page and return up to num_results article URLs."""
    search_results = []
    headers = {'User-Agent': random.choice(user_agents)}
    base_url = "https://www.google.com/search"
    params = {'q': topic, 'num': num_results}
    response = requests.get(base_url, params=params, headers=headers, timeout=10)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # 'div.tF2Cxc' is the container Google uses for organic results.
        links = soup.select('div.tF2Cxc a')
        for link in links:
            url = link['href']
            if len(search_results) < num_results and check_article_existence(url):
                search_results.append(url)
    return search_results[:num_results]


def check_article_existence(url):
    """Return the URL if newspaper can download and parse it, else None."""
    try:
        article = Article(url)
        article.download()
        article.parse()
        return url if article.is_parsed else None
    except ArticleException:
        return None


def get_first_paragraph(url, min_words=20):
    """Return the first substantive <p> text from the page, or None."""
    # Snippets that indicate an error page or boilerplate rather than content:
    irrelevant_strings = [
        "dont have permission", "don't have permission", "JavaScript and cookies", "<<", ">>",
        "403 - Forbidden", "Access to this page is forbidden", "It looks like you",
        "could not be decoded", "site requires", "gateway", "terms & conditions"
    ]
    headers = {'User-Agent': random.choice(user_agents)}
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('p')
    for paragraph in paragraphs:
        text = paragraph.get_text().strip()
        if len(text.split()) >= min_words and is_xml_compatible(text):
            if not any(s.lower() in text.lower() for s in irrelevant_strings):
                return text
    return None


def is_xml_compatible(text):
    """Return True if the text contains only printable ASCII characters."""
    return all(32 <= ord(char) <= 126 for char in text)


def replace_spaces(input_string, c):
    """Replace every space in input_string with the character c."""
    return ''.join(c if char == ' ' else char for char in input_string)


def create_folder():
    """Create (if missing) and return a 'scraped_content' folder in the CWD."""
    folder_path = os.path.join(os.getcwd(), 'scraped_content')
    os.makedirs(folder_path, exist_ok=True)
    return folder_path


def create_pdf_file(topic, file_directory):
    """Build a PDF path for the topic, suffixing [n] to avoid clobbering files."""
    base_filename = replace_spaces(topic, '_')
    pdf_file_path = os.path.join(file_directory, f"{base_filename}.pdf")
    count = 0
    while os.path.exists(pdf_file_path):
        count += 1
        pdf_file_path = os.path.join(file_directory, f"{base_filename}[{count}].pdf")
    return pdf_file_path


def append_to_pdf(paras, pdf, results):
    """Write each URL and its first usable paragraph into the FPDF object.

    `paras` is a set of paragraphs already written, used for de-duplication.
    """
    pdf.set_font("Arial", size=12)
    for i, url in enumerate(results, start=1):
        if not check_article_existence(url):
            continue
        first_paragraph = get_first_paragraph(url)
        if first_paragraph and first_paragraph not in paras:
            # Strip bracketed citations like [1] and any leftover HTML tags.
            cleaned_paragraph = re.sub(r'\[.*?\]|<.*?>', '', first_paragraph)
            if (not contains_invalid_characters(cleaned_paragraph)
                    and not contains_high_digit_percentage(cleaned_paragraph)):
                pdf.set_font("Arial", "B", 12)
                pdf.multi_cell(0, 10, f"URL {i}:", align='L')
                pdf.set_text_color(0, 0, 255)  # blue, underlined URL
                pdf.set_font("Arial", "U", 12)
                pdf.multi_cell(0, 10, url, align='L')
                pdf.set_text_color(0, 0, 0)    # back to black body text
                pdf.set_font("Arial", size=12)
                pdf.multi_cell(0, 10, cleaned_paragraph, align='L')
                paras.add(cleaned_paragraph)
                pdf.ln(10)


def contains_invalid_characters(para):
    """Heuristic: flag paragraphs with markup-like punctuation left in them."""
    return (
        para.count('<') > 1 or para.count('>') > 1 or para.count('#') > 1 or
        para.count('/') > 2 or para.count('\\') > 2 or
        para.count('[') > 2 or para.count(']') > 2
    )


def contains_high_digit_percentage(para):
    """Return True if 30% or more of the characters in para are digits."""
    total_chars = len(para)
    digit_chars = sum(char.isdigit() for char in para)
    return (digit_chars / total_chars * 100) >= 30 if total_chars > 0 else False
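

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal, hypothetical driver showing how the helpers above are presumably
# composed. It assumes the `fpdf` package (PyFPDF / fpdf2) supplies the FPDF
# object that append_to_pdf expects; the topic string is just an example.
if __name__ == "__main__":
    from fpdf import FPDF

    topic = "renewable energy"  # example search query (assumption)
    results = get_search_results(topic, num_results=5)

    pdf = FPDF()
    pdf.add_page()
    append_to_pdf(set(), pdf, results)  # fresh set: nothing to de-duplicate yet

    output_path = create_pdf_file(topic, create_folder())
    pdf.output(output_path)
    print(f"Wrote up to {len(results)} snippets to {output_path}")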