Commit 48b40ff (1 parent: b5628e4)
Showing 6 changed files with 138 additions and 0 deletions.
Binary file not shown.
main.py
@@ -0,0 +1,43 @@
import streamlit as st
from scrape import (
    scrape_website,
    extract_body_content,
    clean_body_content,
    split_dom_content,
)
from parse import parse_with_ollama

# Streamlit UI
st.title("AI Web Scraper")
url = st.text_input("Enter Website URL")

# Step 1: Scrape the Website
if st.button("Scrape Website"):
    if url:
        st.write("Scraping the website...")

        # Scrape the website
        dom_content = scrape_website(url)
        body_content = extract_body_content(dom_content)
        cleaned_content = clean_body_content(body_content)

        # Store the DOM content in Streamlit session state
        st.session_state.dom_content = cleaned_content

        # Display the DOM content in an expandable text box
        with st.expander("View DOM Content"):
            st.text_area("DOM Content", cleaned_content, height=300)


# Step 2: Ask Questions About the DOM Content
if "dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse")

    if st.button("Parse Content"):
        if parse_description:
            st.write("Parsing the content...")

            # Parse the content with Ollama
            dom_chunks = split_dom_content(st.session_state.dom_content)
            parsed_result = parse_with_ollama(dom_chunks, parse_description)
            st.write(parsed_result)
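To run the app end to end, an Ollama server needs to be running locally with the llama3 model pulled (ollama pull llama3), and the scraping-browser endpoint must be set in the .env file shown further below. Assuming the file above is saved as main.py (the filename was stripped from this diff), the UI starts with:

streamlit run main.py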
parse.py
@@ -0,0 +1,29 @@
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully: \n\n"
    "1. **Extract Information:** Only extract the information that directly matches the provided description: {parse_description}. "
    "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response. "
    "3. **Empty Response:** If no information matches the description, return an empty string (''). "
    "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text."
)

model = OllamaLLM(model="llama3")


def parse_with_ollama(dom_chunks, parse_description):
    # Build a prompt | model chain and run every DOM chunk through it
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    parsed_results = []

    for i, chunk in enumerate(dom_chunks, start=1):
        response = chain.invoke(
            {"dom_content": chunk, "parse_description": parse_description}
        )
        print(f"Parsed batch: {i} of {len(dom_chunks)}")
        parsed_results.append(response)

    # Join the per-chunk answers into a single result string
    return "\n".join(parsed_results)
requirements.txt
@@ -0,0 +1,8 @@
streamlit
langchain
langchain_ollama
selenium
beautifulsoup4
lxml
html5lib
python-dotenv
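With the file above saved as requirements.txt, the dependencies install with pip install -r requirements.txt, ideally inside a virtual environment.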
.env
@@ -0,0 +1 @@
SBR_WEBDRIVER=""
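scrape.py below reads SBR_WEBDRIVER via python-dotenv; the value between the quotes should be the full remote WebDriver endpoint URL, credentials included, of the scraping-browser service. The Captcha.waitForSolve CDP command in scrape.py suggests the code targets Bright Data's Scraping Browser, but any compatible remote Chromium endpoint would be configured the same way.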
scrape.py
@@ -0,0 +1,57 @@
from selenium.webdriver import Remote, ChromeOptions
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import os

load_dotenv()

SBR_WEBDRIVER = os.getenv("SBR_WEBDRIVER")


def scrape_website(website):
    print("Connecting to Scraping Browser...")
    sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, "goog", "chrome")
    with Remote(sbr_connection, options=ChromeOptions()) as driver:
        driver.get(website)
        # Ask the remote browser (via a CDP command) to detect and solve any captcha
        print("Waiting for the captcha to be solved...")
        solve_res = driver.execute(
            "executeCdpCommand",
            {
                "cmd": "Captcha.waitForSolve",
                "params": {"detectTimeout": 10000},
            },
        )
        print("Captcha solve status:", solve_res["value"]["status"])
        print("Navigated! Scraping page content...")
        html = driver.page_source
        return html


def extract_body_content(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    body_content = soup.body
    if body_content:
        return str(body_content)
    return ""


def clean_body_content(body_content):
    soup = BeautifulSoup(body_content, "html.parser")

    # Drop script and style tags so only visible text remains
    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()

    # Get text or further process the content
    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )

    return cleaned_content


def split_dom_content(dom_content, max_length=6000):
    # Slice the text into fixed-width, non-overlapping chunks
    return [
        dom_content[i : i + max_length] for i in range(0, len(dom_content), max_length)
    ]
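split_dom_content is plain fixed-width slicing with no overlap between chunks; a quick illustration (the input string is synthetic):

# Synthetic input just to show the resulting chunk sizes.
from scrape import split_dom_content

text = "x" * 13000
chunks = split_dom_content(text, max_length=6000)
print([len(c) for c in chunks])  # -> [6000, 6000, 1000]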