update fetching
lyc0603 committed Nov 14, 2024
1 parent c85cebf commit b3bf716
Showing 8 changed files with 210 additions and 94 deletions.
4 changes: 4 additions & 0 deletions governenv/llm.py
@@ -3,6 +3,7 @@
"""

from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_exponential

from governenv.settings import OPENAI_API_KEY

@@ -36,6 +37,9 @@ def _build_prompt(

        return prompt

    @retry(
        stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)
    )
    def __call__(
        self,
        message: str,
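For context, the new tenacity decorator retries a failed OpenAI call up to three times, with exponentially growing waits clamped between 4 and 10 seconds. A minimal, self-contained sketch of the same pattern (flaky_call is a hypothetical stand-in for the API request):

import random

from tenacity import retry, stop_after_attempt, wait_exponential


@retry(
    stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)
)
def flaky_call() -> str:
    """Stand-in for the OpenAI request; fails at random to trigger retries."""
    if random.random() < 0.5:
        raise RuntimeError("transient API error")
    return "ok"


print(flaky_call())

Once the three attempts are exhausted, tenacity raises a RetryError wrapping the last exception; passing reraise=True to the decorator would surface the original error instead.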
2 changes: 1 addition & 1 deletion governenv/prompts.py
@@ -2,7 +2,7 @@
Prompts and instructions
"""

-IDF_PROMPT = """Given the following website HTTP response, determine \
+IDF_PROMPT = """Given the following HTTP website response, determine \
whether it satisfies the following criteria. If it satisfies all three criteria, \
return "Yes". Otherwise, return "No".
23 changes: 23 additions & 0 deletions governenv/utils.py
@@ -0,0 +1,23 @@
"""
Utility functions
"""

from governenv.constants import EXKW


def kw_filt(data: dict[str, str]) -> dict[str, str]:
    """
    Drop discussions whose URL contains an excluded keyword (EXKW).
    """

    return {k: v for k, v in data.items() if not any(i in v for i in EXKW)}


def slash_filt(data: dict[str, str]) -> dict[str, str]:
    """
    Drop discussions whose URL has fewer than four slashes.
    """

    # typically, a discussion URL has at least 4 slashes;
    # if the slash count is less than 4, remove the discussion
    return {k: v for k, v in data.items() if v.count("/") >= 4}
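For illustration, a sketch of how the two filters compose; note that kw_filt matches keywords against the URL string itself, and the keyword assumed below is hypothetical:

from governenv.utils import kw_filt, slash_filt

# assume governenv.constants.EXKW contains, e.g., "twitter.com"
links = {
    "prop-1": "https://forum.example.org/t/proposal-1/123",  # kept
    "prop-2": "https://twitter.com/dao/status/1",  # dropped by kw_filt
    "prop-3": "https://forum.example.org",  # dropped by slash_filt (2 slashes)
}

print(slash_filt(kw_filt(links)))
# {'prop-1': 'https://forum.example.org/t/proposal-1/123'}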
55 changes: 55 additions & 0 deletions scripts/fetch_html.py
@@ -0,0 +1,55 @@
"""
Fetch the HTTP responses of the discussion links
"""

import pickle
import time
from glob import glob

import requests
from tqdm import tqdm

from governenv.constants import DATA_DIR, HEADERS
from governenv.utils import kw_filt, slash_filt


def fetch_http_response(url: str, timeout: int = 10) -> str:
    """
    Fetches the HTTP response from a given URL.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)

    # if the status_code is not 200, raise an error
    if response.status_code != 200:
        raise Exception(f"Status code: {response.status_code}")

    return response.text


if __name__ == "__main__":
    # unpickle data_unique
    with open(DATA_DIR / "discussion_links.pkl", "rb") as f:
        data_unique = pickle.load(f)
    print(f"Data length before filtering: {len(data_unique)}")

    # filter discussions
    data_unique = slash_filt(kw_filt(data_unique))
    print(f"Data length after filtering: {len(data_unique)}")

    # indices already saved under html_200, so finished links are skipped on resume
    fetched_data = [
        _.split("/")[-1].split(".")[0]
        for _ in glob(str(DATA_DIR / "html_200" / "*.html"))
    ]

    # fetch http responses
    for i, (k, v) in tqdm(enumerate(data_unique.items()), total=len(data_unique)):
        if str(i) in fetched_data:
            continue
        try:
            # fetch and save the html
            html = fetch_http_response(v)
            with open(DATA_DIR / "html_200" / f"{i}.html", "w", encoding="utf-8") as f:
                f.write(html)
        except Exception as e:
            print(f"Error fetching {v}: {e}")

        time.sleep(2)
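The fetcher leans on the HEADERS constant from governenv.constants to present a browser-like User-Agent; a typical shape for such a constant (an assumption — the actual value is not part of this diff):

# hypothetical shape of governenv.constants.HEADERS; the real value may differ
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
    )
}

Combined with the two-second sleep per iteration, this keeps the crawl polite toward forum hosts.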
92 changes: 0 additions & 92 deletions scripts/fetch_http.py

This file was deleted.

68 changes: 68 additions & 0 deletions scripts/process_html.py
@@ -0,0 +1,68 @@
"""
Script to aggregate all the HTML files in the data folder into a single JSONL file
"""

import gzip
import json
import pickle
from glob import glob

from bs4 import BeautifulSoup
from tqdm import tqdm

from governenv.constants import DATA_DIR
from governenv.utils import kw_filt, slash_filt


def distill_html(html: str) -> str:
    """
    Strip boilerplate tags from the HTML and return the remaining text.
    """
    # Parse the HTML
    soup = BeautifulSoup(html, "html.parser")

    # Remove irrelevant tags (scripts, styles, footers, navs, etc.)
    for tag in soup(
        ["script", "style", "header", "footer", "nav", "aside", "form", "link", "meta"]
    ):
        tag.decompose()

    # Extract text content from discussion-relevant tags
    relevant_content = soup.find_all(["div", "p", "li", "article", "section"])

    # Combine and clean the text
    cleaned_text = "\n\n".join(
        tag.get_text(strip=True)
        for tag in relevant_content
        if tag.get_text(strip=True)
    )

    return cleaned_text


if __name__ == "__main__":

    # unpickle data_unique
    with open(DATA_DIR / "discussion_links.pkl", "rb") as f:
        data_unique = pickle.load(f)
    print(f"Data length before filtering: {len(data_unique)}")

    # filter discussions
    data_unique = slash_filt(kw_filt(data_unique))
    print(f"Data length after filtering: {len(data_unique)}")

    # indices of links whose HTML has already been fetched
    fetched_data = [
        _.split("/")[-1].split(".")[0] for _ in glob(str(DATA_DIR / "html" / "*.html"))
    ]

    # aggregate the fetched HTML files into one gzipped JSONL file
    with gzip.open(DATA_DIR / "html.jsonl.gz", "wt") as gz_f:
        for i, (k, v) in tqdm(enumerate(data_unique.items()), total=len(data_unique)):
            if str(i) in fetched_data:
                # read the saved html
                with open(DATA_DIR / "html" / f"{i}.html", "r", encoding="utf-8") as f:
                    html = f.read()

                # distill the html
                html_distilled = distill_html(html)

                json.dump({"url": v, "html": html_distilled}, gz_f)
                gz_f.write("\n")
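A quick sanity check of distill_html on a toy document (assuming the function above is in scope): chrome tags are decomposed and only content text survives. Note that when matched tags nest (e.g. a p inside a matched div), get_text emits the same text once per matching ancestor, so duplicates are possible on real pages:

sample = (
    "<html><head><style>p{color:red}</style></head><body>"
    "<nav>Home | About</nav>"
    "<p>First comment on the proposal.</p>"
    "<footer>(c) 2024</footer>"
    "</body></html>"
)

print(distill_html(sample))  # -> First comment on the proposal.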
58 changes: 58 additions & 0 deletions scripts/process_identify_html.py
@@ -0,0 +1,58 @@
"""
Script to identify whether the HTML files meet the criteria
"""

import gzip
import json
import math

import tiktoken
from tqdm import tqdm

from governenv.constants import DATA_DIR
from governenv.llm import ChatGPT
from governenv.prompts import IDF_INSTRUCT, IDF_PROMPT

tokenizer = tiktoken.encoding_for_model("gpt-4o")


idf_dict = {}

llm = ChatGPT()

if __name__ == "__main__":

    with gzip.open(DATA_DIR / "html.jsonl.gz", "rt") as gz_f:
        for idx, line in tqdm(enumerate(gz_f)):
            data = json.loads(line.strip())
            url = data["url"]
            html = data["html"]

            try:
                # identify whether the html meets the criteria
                idf_res = llm(
                    instruction=IDF_INSTRUCT,
                    message=IDF_PROMPT.format(http_response=html),
                    logprobs=True,
                    top_logprobs=2,
                )

                idf, prob = idf_res if isinstance(idf_res, tuple) else (idf_res, None)

                # convert the top token's logprob into a Yes-probability
                first_prob = prob[0]
                yes_prob = (
                    math.exp(first_prob.logprob)
                    if "Yes" in first_prob.token
                    else 1 - math.exp(first_prob.logprob)
                )

                idf_dict[url] = {
                    "idf": idf,
                    "yes_prob": yes_prob,
                }
            except Exception as e:
                print(f"Error processing {url}: {e}")

    with open(DATA_DIR / "idf.json", "w", encoding="utf-8") as f:
        json.dump(idf_dict, f, indent=2)
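The Yes-probability falls directly out of the first token's log-probability; a worked example of the arithmetic with hypothetical values:

import math

# hypothetical first entry of the returned top-logprobs list
token, logprob = "Yes", -0.105

# p("Yes") = exp(logprob) when the top token is "Yes"; otherwise 1 - exp(logprob)
yes_prob = math.exp(logprob) if "Yes" in token else 1 - math.exp(logprob)
print(round(yes_prob, 3))  # 0.9, since exp(-0.105) ≈ 0.900

The else branch treats the answer distribution as binary, which matches the prompt's instruction to return only "Yes" or "No".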
2 changes: 1 addition & 1 deletion scripts/process_sentiment.py
@@ -8,7 +8,7 @@
from governenv.constants import DATA_DIR, HEADERS
from governenv.llm import ChatGPT
from governenv.prompts import EVAL_INSTRUCT, EVAL_PROMPT, IDF_INSTRUCT, IDF_PROMPT
-
+from governenv.utils import kw_filt, slash_filt

if __name__ == "__main__":

