Commit b3bf716
lyc0603 committed Nov 14, 2024 (1 parent: c85cebf)
Showing 8 changed files with 210 additions and 94 deletions.
governenv/utils.py (new file)
@@ -0,0 +1,23 @@
""" | ||
Utility functions | ||
""" | ||
|
||
from governenv.constants import EXKW | ||
|
||
|
||
def kw_filt(data: dict[str, str]) -> dict[str, str]: | ||
""" | ||
Function to filter discussions based on keywords | ||
""" | ||
|
||
return {k: v for k, v in data.items() if not any([i in v for i in EXKW])} | ||
|
||
|
||
def slash_filt(data: dict[str, str]) -> dict[str, str]: | ||
""" | ||
Function to filter discussions based on slashes | ||
""" | ||
|
||
# typically, a discussion has at least 4 levels of slashes | ||
# if the slash count is less than 4, remove the discussion | ||
return {k: v for k, v in data.items() if v.count("/") >= 4} |
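A quick check of the two filters on made-up links (the sample URLs, and the assumption that EXKW excludes, e.g., "twitter", are illustrative, not taken from the repository):

from governenv.utils import kw_filt, slash_filt

# illustrative input; assumes "twitter" is among the EXKW exclusion keywords
sample = {
    "prop-1": "https://forum.example.org/t/proposal-1/123",
    "prop-2": "https://twitter.com/status/1",
    "prop-3": "https://example.org/short",
}

filtered = slash_filt(kw_filt(sample))
# kw_filt drops "prop-2" (excluded keyword); slash_filt drops "prop-3"
# (only 3 slashes), leaving only "prop-1"
print(filtered)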
@@ -0,0 +1,55 @@
""" | ||
Fetch the http response of the discussion links | ||
""" | ||
|
||
import pickle | ||
import time | ||
from glob import glob | ||
|
||
import requests | ||
from tqdm import tqdm | ||
|
||
from governenv.constants import DATA_DIR, HEADERS | ||
from governenv.utils import kw_filt, slash_filt | ||
|
||
|
||
def fetch_http_response(url: str, timeout: int = 10) -> str: | ||
""" | ||
Fetches the HTTP response from a given URL. | ||
""" | ||
response = requests.get(url, headers=HEADERS, timeout=timeout) | ||
|
||
# if the status_code is not 200, raise an error | ||
if response.status_code != 200: | ||
raise Exception(f"Status code: {response.status_code}") | ||
|
||
return response.text | ||
|
||
|
||
if __name__ == "__main__": | ||
# unpickle data_unique | ||
with open(DATA_DIR / "discussion_links.pkl", "rb") as f: | ||
data_unique = pickle.load(f) | ||
print(f"Data length before filtering: {len(data_unique)}") | ||
|
||
# filter discussions | ||
data_unique = slash_filt(kw_filt(data_unique)) | ||
print(f"Data length after filtering: {len(data_unique)}") | ||
|
||
fetched_data = [ | ||
_.split("/")[-1].split(".")[0] for _ in glob(str(DATA_DIR / "html" / "*.html")) | ||
] | ||
|
||
# fetch http response | ||
for i, (k, v) in tqdm(enumerate(data_unique.items()), total=len(data_unique)): | ||
if str(i) in fetched_data: | ||
continue | ||
try: | ||
# save the html | ||
html = fetch_http_response(v) | ||
with open(DATA_DIR / "html_200" / f"{i}.html", "w", encoding="utf-8") as f: | ||
f.write(html) | ||
except Exception as e: | ||
print(f"Error fetching {v}: {e}") | ||
|
||
time.sleep(2) |
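fetch_http_response can also be called on its own; a minimal sketch (the URL below is a placeholder, and HEADERS is assumed to carry a browser-like User-Agent so forum servers return 200):

# illustrative usage; the URL is a placeholder, not taken from the dataset
try:
    body = fetch_http_response("https://forum.example.org/t/proposal-1/123")
    print(body[:200])  # first 200 characters of the page
except Exception as e:
    print(f"Fetch failed: {e}")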
This file was deleted in this commit; its diff is not shown.
@@ -0,0 +1,68 @@
""" | ||
Script to aggregate all the html files in the data folder into a single jsonl file | ||
""" | ||
|
||
import gzip | ||
import json | ||
import pickle | ||
from glob import glob | ||
|
||
from bs4 import BeautifulSoup | ||
from tqdm import tqdm | ||
|
||
from governenv.constants import DATA_DIR | ||
from governenv.utils import kw_filt, slash_filt | ||
|
||
|
||
def distill_html(html: str) -> str: | ||
""" | ||
Function to distill the html | ||
""" | ||
# Parse the HTML | ||
soup = BeautifulSoup(html, "html.parser") | ||
|
||
# Remove irrelevant tags (scripts, styles, footers, navs, etc.) | ||
for tag in soup( | ||
["script", "style", "header", "footer", "nav", "aside", "form", "link", "meta"] | ||
): | ||
tag.decompose() | ||
|
||
# Extract text content from discussion-relevant tags | ||
relevant_content = soup.find_all(["div", "p", "li", "article", "section"]) | ||
|
||
# Combine and clean the text | ||
cleaned_text = "\n\n".join( | ||
tag.get_text(strip=True) for tag in relevant_content if tag.get_text(strip=True) | ||
) | ||
|
||
return cleaned_text | ||
|
||
|
||
if __name__ == "__main__": | ||
|
||
# unpickle data_unique | ||
with open(DATA_DIR / "discussion_links.pkl", "rb") as f: | ||
data_unique = pickle.load(f) | ||
print(f"Data length before filtering: {len(data_unique)}") | ||
|
||
# filter discussions | ||
data_unique = slash_filt(kw_filt(data_unique)) | ||
print(f"Data length after filtering: {len(data_unique)}") | ||
|
||
fetched_data = [ | ||
_.split("/")[-1].split(".")[0] for _ in glob(str(DATA_DIR / "html" / "*.html")) | ||
] | ||
|
||
# save the html | ||
with gzip.open(DATA_DIR / "html.jsonl.gz", "wt") as gz_f: | ||
for i, (k, v) in tqdm(enumerate(data_unique.items())): | ||
if str(i) in fetched_data: | ||
# save the html | ||
with open(DATA_DIR / "html" / f"{i}.html", "r", encoding="utf-8") as f: | ||
html = f.read() | ||
|
||
# distill the html | ||
html_distilled = distill_html(html) | ||
|
||
json.dump({"url": v, "html": html_distilled}, gz_f) | ||
gz_f.write("\n") |
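Each line of html.jsonl.gz is a JSON object with "url" and "html" keys; a minimal sketch for reading the file back:

import gzip
import json

from governenv.constants import DATA_DIR

# iterate over the aggregated records one JSON object per line
with gzip.open(DATA_DIR / "html.jsonl.gz", "rt") as gz_f:
    for line in gz_f:
        record = json.loads(line)
        print(record["url"], len(record["html"]))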
@@ -0,0 +1,58 @@
""" | ||
Script to identify whether the html files meet the criteria | ||
""" | ||
|
||
import gzip | ||
import json | ||
import math | ||
|
||
|
||
import tiktoken | ||
from tqdm import tqdm | ||
|
||
from governenv.constants import DATA_DIR | ||
from governenv.llm import ChatGPT | ||
from governenv.prompts import IDF_INSTRUCT, IDF_PROMPT | ||
|
||
tokenizer = tiktoken.encoding_for_model("gpt-4o") | ||
|
||
|
||
idf_dict = {} | ||
|
||
llm = ChatGPT() | ||
|
||
if __name__ == "__main__": | ||
|
||
with gzip.open(DATA_DIR / "html.jsonl.gz", "rt") as gz_f: | ||
for idx, line in tqdm(enumerate(gz_f)): | ||
data = json.loads(line.strip()) | ||
url = data["url"] | ||
html = data["html"] | ||
|
||
try: | ||
# identify if the html meets the criteria | ||
idf_res = llm( | ||
instruction=IDF_INSTRUCT, | ||
message=IDF_PROMPT.format(http_response=html), | ||
logprobs=True, | ||
top_logprobs=2, | ||
) | ||
|
||
idf, prob = idf_res if isinstance(idf_res, tuple) else (idf_res, None) | ||
|
||
first_prob = prob[0] | ||
yes_prob = ( | ||
math.exp(first_prob.logprob) | ||
if "Yes" in first_prob.token | ||
else 1 - math.exp(first_prob.logprob) | ||
) | ||
|
||
idf_dict[url] = { | ||
"idf": idf, | ||
"yes_prob": yes_prob, | ||
} | ||
except Exception as e: | ||
print(f"Error processing {url}: {e}") | ||
|
||
with open(DATA_DIR / "idf.json", "w", encoding="utf-8") as f: | ||
json.dump(idf_dict, f, indent=2) |
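The yes_prob step converts the first token's log-probability into P("Yes"); a worked check with illustrative numbers (the logprob values are made up):

import math

# if the top first token is "Yes" with logprob -0.105, P(Yes) = exp(-0.105)
print(math.exp(-0.105))      # ≈ 0.90

# if the top first token is "No" with logprob -0.223, the script
# approximates P(Yes) as 1 - exp(-0.223)
print(1 - math.exp(-0.223))  # ≈ 0.20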