
Commit 10923dd

Author: Domantas (committed)
Commit message: Code improvement

1 parent 40712c9 commit 10923dd

File tree

9 files changed (+472, -104 lines)

poetry.lock

Lines changed: 402 additions & 4 deletions (generated file; diff not rendered)

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ python = "~3.11"
 nltk = "~3.8.1"
 numpy = "~1.26.1"
 pandas = "~2.1.1"
+aiohttp = "^3.8.6"

 [tool.poetry.group.dev.dependencies]
 black = "~23.10.0"

url_predictions/01_construct_features.py

Lines changed: 0 additions & 62 deletions
This file was deleted.

url_predictions/construct_data.sh

Lines changed: 0 additions & 3 deletions
This file was deleted.

url_predictions/construct_features.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+from url_predictions.config import TOKENS_PATH, WORDS_FREQUENCY_PATH
+from url_predictions.features import FeaturesExtraction
+from url_predictions.functions import save_to_pickle
+
+
+def main() -> None:
+    features = FeaturesExtraction()
+    features.preprocess_main_dataset()
+
+    features.scrape_urls_async_mode()
+    features.analyze_responses_normal_mode()
+
+    features.df.to_csv(TOKENS_PATH, index=False)
+    features.generate_words_frequency()
+    save_to_pickle(features.words_frequency, WORDS_FREQUENCY_PATH, "wb")
+
+
+if __name__ == "__main__":
+    main()
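
Editor's note: this new script replaces the deleted 01_construct_features.py / construct_data.sh pair and persists the word-frequency structure with save_to_pickle(). A minimal sketch of a downstream consumer that loads it back, assuming the read_pickle helper added to url_predictions/functions.py in this commit; the inspect_words_frequency name and the category-to-tokens shape of the pickle are illustrative assumptions, not shown in the diff:

from url_predictions.config import WORDS_FREQUENCY_PATH
from url_predictions.functions import read_pickle


def inspect_words_frequency() -> None:
    # Load the structure written by construct_features.py via save_to_pickle().
    words_frequency = read_pickle(WORDS_FREQUENCY_PATH)
    # The exact shape is not shown in this diff; a category -> top-tokens mapping
    # is assumed here purely for illustration.
    for category, tokens in list(words_frequency.items())[:5]:
        print(category, tokens[:10])


if __name__ == "__main__":
    inspect_words_frequency()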

url_predictions/features.py

Lines changed: 8 additions & 11 deletions
@@ -1,4 +1,4 @@
-from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
 from typing import Any

 import nltk
@@ -8,10 +8,9 @@
     FREQUENCY_TOP_WORDS,
     MAIN_DATASET_PATH,
     MULTIPROCESSING_WORKERS,
-    THREADING_WORKERS,
     TOP_LEVEL_DOMAIN_WHITELIST,
 )
-from url_predictions.functions import parse_request, scrape
+from url_predictions.functions import fetch_html_content_sync, parse_request, scrape_url


 class FeaturesExtraction:
@@ -31,22 +30,20 @@ def preprocess_main_dataset(self) -> None:
         self.df["tokens"] = ""

     def scrape_urls_normal_mode(self) -> None:
-        self.url_responses = [(ind, scrape(url)) for ind, url in enumerate(self.df["url"].to_list())]
+        self.url_responses = [(ind, scrape_url(url)) for ind, url in enumerate(self.df["url"].to_list())]

     def analyze_responses_normal_mode(self) -> None:
-        self.html_content = [(ind, parse_request(response)) for ind, response in self.url_responses]
+        self.html_content = [parse_request([ind, response]) for ind, response in enumerate(self.url_responses)]

         for ind, tokens in self.html_content:
             self.df.at[ind, "tokens"] = tokens

-    def scrape_urls_multithread_mode(self) -> None:
-        with ThreadPoolExecutor(THREADING_WORKERS) as executor:
-            self.url_responses = executor.map(
-                scrape, [(i, elem) for i, elem in enumerate(self.df["url"])]
-            )  # pylint: disable=unnecessary-comprehension
+    def scrape_urls_async_mode(self) -> None:
+        urls = self.df["url"].to_list()
+        self.url_responses = fetch_html_content_sync(urls)

     def analyze_responses_multiprocessing_mode(self) -> None:
-        with ProcessPoolExecutor(MULTIPROCESSING_WORKERS) as ex:
+        with ThreadPoolExecutor(MULTIPROCESSING_WORKERS) as ex:
             self.html_content = ex.map(
                 parse_request, [(i, elem) for i, elem in enumerate(self.url_responses)]
             )  # pylint: disable=unnecessary-comprehension
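
Editor's note: the commit drops the thread-pool scraper (and THREADING_WORKERS) in favour of scrape_urls_async_mode, which delegates to fetch_html_content_sync in functions.py, and it swaps ProcessPoolExecutor for ThreadPoolExecutor in the response analyzer. A minimal sketch of how the updated analyze_responses_normal_mode pairs results back to DataFrame rows, using toy URLs and HTML stand-ins of my own and assuming the required NLTK data (tokenizer, lemmatizer, stopwords) is available:

import pandas as pd

from url_predictions.functions import parse_request

# Toy stand-ins for self.df and self.url_responses; the real values come from
# preprocess_main_dataset() and scrape_urls_async_mode().
df = pd.DataFrame({"url": ["https://example.com", "https://example.org"]})
df["tokens"] = ""
url_responses = ["<html><body>First example page</body></html>", None]  # None models a failed fetch

# parse_request() receives an (index, html) pair and returns the index alongside the
# tokens, so each result lands in the right DataFrame row even when a fetch failed.
html_content = [parse_request([ind, response]) for ind, response in enumerate(url_responses)]
for ind, tokens in html_content:
    df.at[ind, "tokens"] = tokens

print(df)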

url_predictions/functions.py

Lines changed: 42 additions & 12 deletions
@@ -1,12 +1,13 @@
+import asyncio
 import pickle
 import re
 from typing import Any

+import aiohttp
 import requests
 from bs4 import BeautifulSoup
 from nltk.stem import WordNetLemmatizer
 from nltk.tokenize import word_tokenize
-from requests import Response

 from url_predictions.config import FREQUENCY_TOP_WORDS, REQUEST_HEADERS, STOPWORDS, logger

@@ -43,28 +44,57 @@ def remove_stopwords(tokens: list[str]) -> list[str]:
     return list(filter(lambda x: len(x) > 1, tokens_list))


-def scrape(url: str) -> Response | str:
+def scrape_url(url: str) -> str | None:
     try:
-        return requests.get(url, headers=REQUEST_HEADERS, timeout=15)
+        return requests.get(url, headers=REQUEST_HEADERS, timeout=15).text
     except requests.exceptions.RequestException as e:
         logger.error(e)
-        return ""
+        return None


-def parse_request(res: Response) -> list[str]:
-    if res != "" and res.status_code == 200:
-        soup = BeautifulSoup(res.text, "html.parser")
+async def fetch_url(url: str, session: Any) -> str | None:
+    try:
+        async with session.get(url) as response:
+            return await response.text
+    except aiohttp.ClientError as e:
+        logger.error(e)
+        return None
+
+
+async def fetch_html_content_async(urls: list[str]) -> Any:
+    async with aiohttp.ClientSession() as session:
+        tasks = [fetch_url(url, session) for url in urls]
+        html_contents = await asyncio.gather(*tasks)
+        return html_contents
+
+
+def fetch_html_content_sync(urls: list[str]) -> str:
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    html_contents = loop.run_until_complete(fetch_html_content_async(urls))
+    return html_contents
+
+
+def parse_request(res: list[int | str]) -> tuple[int | str, list[str]]:
+    index = res[0]
+    html_content = res[1]
+    if res and html_content:
+        soup = BeautifulSoup(html_content, "html.parser")
         [tag.decompose() for tag in soup("script")]  # pylint: disable=expression-not-assigned
         [tag.decompose() for tag in soup("style")]  # pylint: disable=expression-not-assigned
         text = soup.get_text()
         cleaned_text = re.sub("[^a-zA-Z]+", " ", text).strip()
         tokens = word_tokenize(cleaned_text)
         tokens_lemmatize = remove_stopwords(tokens)
-        return tokens_lemmatize
-    return [""]
+        return index, tokens_lemmatize
+    return index, [""]


 def save_to_pickle(target: Any, output_path: str, write_mode: str) -> None:
-    pickle_out = open(output_path, write_mode)  # pylint: disable=unspecified-encoding, consider-using-with
-    pickle.dump(target, pickle_out)
-    pickle_out.close()
+    with open(output_path, write_mode) as pickle_out:  # pylint: disable=unspecified-encoding
+        pickle.dump(target, pickle_out)
+
+
+def read_pickle(input_path: str) -> Any:
+    with open(input_path, "rb") as pickle_in:
+        return pickle.load(pickle_in)
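
Editor's note: the new async path mirrors the old requests-based scrape but fans all requests through one shared aiohttp session. One likely issue: aiohttp's ClientResponse.text is a coroutine method, so `await response.text` (without parentheses) would raise a TypeError at runtime, and fetch_html_content_sync actually returns a list rather than the annotated str. The sketch below is my own corrected version of the same pattern, not part of the commit, using asyncio.run() in place of manual event-loop management:

import asyncio

import aiohttp


async def fetch_url(url: str, session: aiohttp.ClientSession) -> str | None:
    try:
        async with session.get(url) as response:
            # text() is a coroutine method on aiohttp responses, hence the parentheses.
            return await response.text()
    except aiohttp.ClientError as exc:
        print(f"fetch failed for {url}: {exc}")
        return None


async def fetch_html_content_async(urls: list[str]) -> list[str | None]:
    # One session is shared by all requests; gather() preserves the input order.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_url(url, session) for url in urls))


def fetch_html_content_sync(urls: list[str]) -> list[str | None]:
    # asyncio.run() creates and closes the event loop for us.
    return asyncio.run(fetch_html_content_async(urls))


if __name__ == "__main__":
    pages = fetch_html_content_sync(["https://example.com", "https://example.org"])
    print([len(page) if page else None for page in pages])

asyncio.run() is only a stylistic simplification here; the commit's new_event_loop()/set_event_loop() pair behaves the same when no loop is already running.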

url_predictions/main.py

Lines changed: 0 additions & 12 deletions
This file was deleted.
