
Commit 10923dd

Author: Domantas (committed)
Commit message: Code improvement

1 parent 40712c9 commit 10923dd

File tree

9 files changed (+472, -104 lines)

poetry.lock

Lines changed: 402 additions & 4 deletions (generated file; diff not rendered)

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ python = "~3.11"
 nltk = "~3.8.1"
 numpy = "~1.26.1"
 pandas = "~2.1.1"
+aiohttp = "^3.8.6"

 [tool.poetry.group.dev.dependencies]
 black = "~23.10.0"

url_predictions/01_construct_features.py

Lines changed: 0 additions & 62 deletions
This file was deleted.

url_predictions/construct_data.sh

Lines changed: 0 additions & 3 deletions
This file was deleted.

url_predictions/construct_features.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+from url_predictions.config import TOKENS_PATH, WORDS_FREQUENCY_PATH
+from url_predictions.features import FeaturesExtraction
+from url_predictions.functions import save_to_pickle
+
+
+def main() -> None:
+    features = FeaturesExtraction()
+    features.preprocess_main_dataset()
+
+    features.scrape_urls_async_mode()
+    features.analyze_responses_normal_mode()
+
+    features.df.to_csv(TOKENS_PATH, index=False)
+    features.generate_words_frequency()
+    save_to_pickle(features.words_frequency, WORDS_FREQUENCY_PATH, "wb")
+
+
+if __name__ == "__main__":
+    main()
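
Editor's note: this new script replaces the deleted 01_construct_features.py / construct_data.sh pair and persists the word-frequency structure with save_to_pickle(). A minimal sketch of a downstream consumer that loads it back, assuming the read_pickle helper added to url_predictions/functions.py in this commit; the inspect_words_frequency name and the category-to-tokens shape of the pickle are illustrative assumptions, not shown in the diff:

from url_predictions.config import WORDS_FREQUENCY_PATH
from url_predictions.functions import read_pickle


def inspect_words_frequency() -> None:
    # Load the structure written by construct_features.py via save_to_pickle().
    words_frequency = read_pickle(WORDS_FREQUENCY_PATH)
    # The exact shape is not shown in this diff; a category -> top-tokens mapping
    # is assumed here purely for illustration.
    for category, tokens in list(words_frequency.items())[:5]:
        print(category, tokens[:10])


if __name__ == "__main__":
    inspect_words_frequency()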

url_predictions/features.py

Lines changed: 8 additions & 11 deletions
@@ -1,4 +1,4 @@
-from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
 from typing import Any

 import nltk
@@ -8,10 +8,9 @@
     FREQUENCY_TOP_WORDS,
     MAIN_DATASET_PATH,
     MULTIPROCESSING_WORKERS,
-    THREADING_WORKERS,
     TOP_LEVEL_DOMAIN_WHITELIST,
 )
-from url_predictions.functions import parse_request, scrape
+from url_predictions.functions import fetch_html_content_sync, parse_request, scrape_url


 class FeaturesExtraction:
@@ -31,22 +30,20 @@ def preprocess_main_dataset(self) -> None:
         self.df["tokens"] = ""

     def scrape_urls_normal_mode(self) -> None:
-        self.url_responses = [(ind, scrape(url)) for ind, url in enumerate(self.df["url"].to_list())]
+        self.url_responses = [(ind, scrape_url(url)) for ind, url in enumerate(self.df["url"].to_list())]

     def analyze_responses_normal_mode(self) -> None:
-        self.html_content = [(ind, parse_request(response)) for ind, response in self.url_responses]
+        self.html_content = [parse_request([ind, response]) for ind, response in enumerate(self.url_responses)]

         for ind, tokens in self.html_content:
             self.df.at[ind, "tokens"] = tokens

-    def scrape_urls_multithread_mode(self) -> None:
-        with ThreadPoolExecutor(THREADING_WORKERS) as executor:
-            self.url_responses = executor.map(
-                scrape, [(i, elem) for i, elem in enumerate(self.df["url"])]
-            )  # pylint: disable=unnecessary-comprehension
+    def scrape_urls_async_mode(self) -> None:
+        urls = self.df["url"].to_list()
+        self.url_responses = fetch_html_content_sync(urls)

     def analyze_responses_multiprocessing_mode(self) -> None:
-        with ProcessPoolExecutor(MULTIPROCESSING_WORKERS) as ex:
+        with ThreadPoolExecutor(MULTIPROCESSING_WORKERS) as ex:
             self.html_content = ex.map(
                 parse_request, [(i, elem) for i, elem in enumerate(self.url_responses)]
             )  # pylint: disable=unnecessary-comprehension
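
Editor's note: the commit drops the thread-pool scraper (and THREADING_WORKERS) in favour of scrape_urls_async_mode, which delegates to fetch_html_content_sync in functions.py, and it swaps ProcessPoolExecutor for ThreadPoolExecutor in the response analyzer. A minimal sketch of how the updated analyze_responses_normal_mode pairs results back to DataFrame rows, using toy URLs and HTML stand-ins of my own and assuming the required NLTK data (tokenizer, lemmatizer, stopwords) is available:

import pandas as pd

from url_predictions.functions import parse_request

# Toy stand-ins for self.df and self.url_responses; the real values come from
# preprocess_main_dataset() and scrape_urls_async_mode().
df = pd.DataFrame({"url": ["https://example.com", "https://example.org"]})
df["tokens"] = ""
url_responses = ["<html><body>First example page</body></html>", None]  # None models a failed fetch

# parse_request() receives an (index, html) pair and returns the index alongside the
# tokens, so each result lands in the right DataFrame row even when a fetch failed.
html_content = [parse_request([ind, response]) for ind, response in enumerate(url_responses)]
for ind, tokens in html_content:
    df.at[ind, "tokens"] = tokens

print(df)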

url_predictions/functions.py

Lines changed: 42 additions & 12 deletions
@@ -1,12 +1,13 @@
+import asyncio
 import pickle
 import re
 from typing import Any

+import aiohttp
 import requests
 from bs4 import BeautifulSoup
 from nltk.stem import WordNetLemmatizer
 from nltk.tokenize import word_tokenize
-from requests import Response

 from url_predictions.config import FREQUENCY_TOP_WORDS, REQUEST_HEADERS, STOPWORDS, logger

@@ -43,28 +44,57 @@ def remove_stopwords(tokens: list[str]) -> list[str]:
     return list(filter(lambda x: len(x) > 1, tokens_list))


-def scrape(url: str) -> Response | str:
+def scrape_url(url: str) -> str | None:
     try:
-        return requests.get(url, headers=REQUEST_HEADERS, timeout=15)
+        return requests.get(url, headers=REQUEST_HEADERS, timeout=15).text
     except requests.exceptions.RequestException as e:
         logger.error(e)
-        return ""
+        return None


-def parse_request(res: Response) -> list[str]:
-    if res != "" and res.status_code == 200:
-        soup = BeautifulSoup(res.text, "html.parser")
+async def fetch_url(url: str, session: Any) -> str | None:
+    try:
+        async with session.get(url) as response:
+            return await response.text
+    except aiohttp.ClientError as e:
+        logger.error(e)
+        return None
+
+
+async def fetch_html_content_async(urls: list[str]) -> Any:
+    async with aiohttp.ClientSession() as session:
+        tasks = [fetch_url(url, session) for url in urls]
+        html_contents = await asyncio.gather(*tasks)
+        return html_contents
+
+
+def fetch_html_content_sync(urls: list[str]) -> str:
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    html_contents = loop.run_until_complete(fetch_html_content_async(urls))
+    return html_contents
+
+
+def parse_request(res: list[int | str]) -> tuple[int | str, list[str]]:
+    index = res[0]
+    html_content = res[1]
+    if res and html_content:
+        soup = BeautifulSoup(html_content, "html.parser")
         [tag.decompose() for tag in soup("script")]  # pylint: disable=expression-not-assigned
         [tag.decompose() for tag in soup("style")]  # pylint: disable=expression-not-assigned
         text = soup.get_text()
         cleaned_text = re.sub("[^a-zA-Z]+", " ", text).strip()
         tokens = word_tokenize(cleaned_text)
         tokens_lemmatize = remove_stopwords(tokens)
-        return tokens_lemmatize
-    return [""]
+        return index, tokens_lemmatize
+    return index, [""]


 def save_to_pickle(target: Any, output_path: str, write_mode: str) -> None:
-    pickle_out = open(output_path, write_mode)  # pylint: disable=unspecified-encoding, consider-using-with
-    pickle.dump(target, pickle_out)
-    pickle_out.close()
+    with open(output_path, write_mode) as pickle_out:  # pylint: disable=unspecified-encoding
+        pickle.dump(target, pickle_out)
+
+
+def read_pickle(input_path: str) -> Any:
+    with open(input_path, "rb") as pickle_in:
+        return pickle.load(pickle_in)
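
Editor's note: the new async path mirrors the old requests-based scrape but fans all requests through one shared aiohttp session. One likely issue: aiohttp's ClientResponse.text is a coroutine method, so `await response.text` (without parentheses) would raise a TypeError at runtime, and fetch_html_content_sync actually returns a list rather than the annotated str. The sketch below is my own corrected version of the same pattern, not part of the commit, using asyncio.run() in place of manual event-loop management:

import asyncio

import aiohttp


async def fetch_url(url: str, session: aiohttp.ClientSession) -> str | None:
    try:
        async with session.get(url) as response:
            # text() is a coroutine method on aiohttp responses, hence the parentheses.
            return await response.text()
    except aiohttp.ClientError as exc:
        print(f"fetch failed for {url}: {exc}")
        return None


async def fetch_html_content_async(urls: list[str]) -> list[str | None]:
    # One session is shared by all requests; gather() preserves the input order.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_url(url, session) for url in urls))


def fetch_html_content_sync(urls: list[str]) -> list[str | None]:
    # asyncio.run() creates and closes the event loop for us.
    return asyncio.run(fetch_html_content_async(urls))


if __name__ == "__main__":
    pages = fetch_html_content_sync(["https://example.com", "https://example.org"])
    print([len(page) if page else None for page in pages])

asyncio.run() is only a stylistic simplification here; the commit's new_event_loop()/set_event_loop() pair behaves the same when no loop is already running.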

url_predictions/main.py

Lines changed: 0 additions & 12 deletions
This file was deleted.
