Commit b3bf716
lyc0603 committed Nov 14, 2024 (1 parent: c85cebf)
Showing 8 changed files with 210 additions and 94 deletions.
governenv/utils.py (new file)
@@ -0,0 +1,23 @@
""" | ||
Utility functions | ||
""" | ||
|
||
from governenv.constants import EXKW | ||
|
||
|
||
def kw_filt(data: dict[str, str]) -> dict[str, str]: | ||
""" | ||
Function to filter discussions based on keywords | ||
""" | ||
|
||
return {k: v for k, v in data.items() if not any([i in v for i in EXKW])} | ||
|
||
|
||
def slash_filt(data: dict[str, str]) -> dict[str, str]: | ||
""" | ||
Function to filter discussions based on slashes | ||
""" | ||
|
||
# typically, a discussion has at least 4 levels of slashes | ||
# if the slash count is less than 4, remove the discussion | ||
return {k: v for k, v in data.items() if v.count("/") >= 4} |
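A quick check of the two filters on made-up links (the sample URLs, and the assumption that EXKW excludes, e.g., "twitter", are illustrative, not taken from the repository):

from governenv.utils import kw_filt, slash_filt

# illustrative input; assumes "twitter" is among the EXKW exclusion keywords
sample = {
    "prop-1": "https://forum.example.org/t/proposal-1/123",
    "prop-2": "https://twitter.com/status/1",
    "prop-3": "https://example.org/short",
}

filtered = slash_filt(kw_filt(sample))
# kw_filt drops "prop-2" (excluded keyword); slash_filt drops "prop-3"
# (only 3 slashes), leaving only "prop-1"
print(filtered)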
@@ -0,0 +1,55 @@
""" | ||
Fetch the http response of the discussion links | ||
""" | ||
|
||
import pickle | ||
import time | ||
from glob import glob | ||
|
||
import requests | ||
from tqdm import tqdm | ||
|
||
from governenv.constants import DATA_DIR, HEADERS | ||
from governenv.utils import kw_filt, slash_filt | ||
|
||
|
||
def fetch_http_response(url: str, timeout: int = 10) -> str: | ||
""" | ||
Fetches the HTTP response from a given URL. | ||
""" | ||
response = requests.get(url, headers=HEADERS, timeout=timeout) | ||
|
||
# if the status_code is not 200, raise an error | ||
if response.status_code != 200: | ||
raise Exception(f"Status code: {response.status_code}") | ||
|
||
return response.text | ||
|
||
|
||
if __name__ == "__main__": | ||
# unpickle data_unique | ||
with open(DATA_DIR / "discussion_links.pkl", "rb") as f: | ||
data_unique = pickle.load(f) | ||
print(f"Data length before filtering: {len(data_unique)}") | ||
|
||
# filter discussions | ||
data_unique = slash_filt(kw_filt(data_unique)) | ||
print(f"Data length after filtering: {len(data_unique)}") | ||
|
||
fetched_data = [ | ||
_.split("/")[-1].split(".")[0] for _ in glob(str(DATA_DIR / "html" / "*.html")) | ||
] | ||
|
||
# fetch http response | ||
for i, (k, v) in tqdm(enumerate(data_unique.items()), total=len(data_unique)): | ||
if str(i) in fetched_data: | ||
continue | ||
try: | ||
# save the html | ||
html = fetch_http_response(v) | ||
with open(DATA_DIR / "html_200" / f"{i}.html", "w", encoding="utf-8") as f: | ||
f.write(html) | ||
except Exception as e: | ||
print(f"Error fetching {v}: {e}") | ||
|
||
time.sleep(2) |
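fetch_http_response can also be called on its own; a minimal sketch (the URL below is a placeholder, and HEADERS is assumed to carry a browser-like User-Agent so forum servers return 200):

# illustrative usage; the URL is a placeholder, not taken from the dataset
try:
    body = fetch_http_response("https://forum.example.org/t/proposal-1/123")
    print(body[:200])  # first 200 characters of the page
except Exception as e:
    print(f"Fetch failed: {e}")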
This file was deleted in this commit; its diff is not shown.
@@ -0,0 +1,68 @@
""" | ||
Script to aggregate all the html files in the data folder into a single jsonl file | ||
""" | ||
|
||
import gzip | ||
import json | ||
import pickle | ||
from glob import glob | ||
|
||
from bs4 import BeautifulSoup | ||
from tqdm import tqdm | ||
|
||
from governenv.constants import DATA_DIR | ||
from governenv.utils import kw_filt, slash_filt | ||
|
||
|
||
def distill_html(html: str) -> str: | ||
""" | ||
Function to distill the html | ||
""" | ||
# Parse the HTML | ||
soup = BeautifulSoup(html, "html.parser") | ||
|
||
# Remove irrelevant tags (scripts, styles, footers, navs, etc.) | ||
for tag in soup( | ||
["script", "style", "header", "footer", "nav", "aside", "form", "link", "meta"] | ||
): | ||
tag.decompose() | ||
|
||
# Extract text content from discussion-relevant tags | ||
relevant_content = soup.find_all(["div", "p", "li", "article", "section"]) | ||
|
||
# Combine and clean the text | ||
cleaned_text = "\n\n".join( | ||
tag.get_text(strip=True) for tag in relevant_content if tag.get_text(strip=True) | ||
) | ||
|
||
return cleaned_text | ||
|
||
|
||
if __name__ == "__main__": | ||
|
||
# unpickle data_unique | ||
with open(DATA_DIR / "discussion_links.pkl", "rb") as f: | ||
data_unique = pickle.load(f) | ||
print(f"Data length before filtering: {len(data_unique)}") | ||
|
||
# filter discussions | ||
data_unique = slash_filt(kw_filt(data_unique)) | ||
print(f"Data length after filtering: {len(data_unique)}") | ||
|
||
fetched_data = [ | ||
_.split("/")[-1].split(".")[0] for _ in glob(str(DATA_DIR / "html" / "*.html")) | ||
] | ||
|
||
# save the html | ||
with gzip.open(DATA_DIR / "html.jsonl.gz", "wt") as gz_f: | ||
for i, (k, v) in tqdm(enumerate(data_unique.items())): | ||
if str(i) in fetched_data: | ||
# save the html | ||
with open(DATA_DIR / "html" / f"{i}.html", "r", encoding="utf-8") as f: | ||
html = f.read() | ||
|
||
# distill the html | ||
html_distilled = distill_html(html) | ||
|
||
json.dump({"url": v, "html": html_distilled}, gz_f) | ||
gz_f.write("\n") |
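Each line of html.jsonl.gz is a JSON object with "url" and "html" keys; a minimal sketch for reading the file back:

import gzip
import json

from governenv.constants import DATA_DIR

# iterate over the aggregated records one JSON object per line
with gzip.open(DATA_DIR / "html.jsonl.gz", "rt") as gz_f:
    for line in gz_f:
        record = json.loads(line)
        print(record["url"], len(record["html"]))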
@@ -0,0 +1,58 @@
""" | ||
Script to identify whether the html files meet the criteria | ||
""" | ||
|
||
import gzip | ||
import json | ||
import math | ||
|
||
|
||
import tiktoken | ||
from tqdm import tqdm | ||
|
||
from governenv.constants import DATA_DIR | ||
from governenv.llm import ChatGPT | ||
from governenv.prompts import IDF_INSTRUCT, IDF_PROMPT | ||
|
||
tokenizer = tiktoken.encoding_for_model("gpt-4o") | ||
|
||
|
||
idf_dict = {} | ||
|
||
llm = ChatGPT() | ||
|
||
if __name__ == "__main__": | ||
|
||
with gzip.open(DATA_DIR / "html.jsonl.gz", "rt") as gz_f: | ||
for idx, line in tqdm(enumerate(gz_f)): | ||
data = json.loads(line.strip()) | ||
url = data["url"] | ||
html = data["html"] | ||
|
||
try: | ||
# identify if the html meets the criteria | ||
idf_res = llm( | ||
instruction=IDF_INSTRUCT, | ||
message=IDF_PROMPT.format(http_response=html), | ||
logprobs=True, | ||
top_logprobs=2, | ||
) | ||
|
||
idf, prob = idf_res if isinstance(idf_res, tuple) else (idf_res, None) | ||
|
||
first_prob = prob[0] | ||
yes_prob = ( | ||
math.exp(first_prob.logprob) | ||
if "Yes" in first_prob.token | ||
else 1 - math.exp(first_prob.logprob) | ||
) | ||
|
||
idf_dict[url] = { | ||
"idf": idf, | ||
"yes_prob": yes_prob, | ||
} | ||
except Exception as e: | ||
print(f"Error processing {url}: {e}") | ||
|
||
with open(DATA_DIR / "idf.json", "w", encoding="utf-8") as f: | ||
json.dump(idf_dict, f, indent=2) |
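The yes_prob step converts the first token's log-probability into P("Yes"); a worked check with illustrative numbers (the logprob values are made up):

import math

# if the top first token is "Yes" with logprob -0.105, P(Yes) = exp(-0.105)
print(math.exp(-0.105))      # ≈ 0.90

# if the top first token is "No" with logprob -0.223, the script
# approximates P(Yes) as 1 - exp(-0.223)
print(1 - math.exp(-0.223))  # ≈ 0.20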