Add files via upload
techwithtim authored Aug 22, 2024
1 parent b5628e4 commit 48b40ff
Showing 6 changed files with 138 additions and 0 deletions.
Binary file added chromedriver
43 changes: 43 additions & 0 deletions main.py
@@ -0,0 +1,43 @@
import streamlit as st
from scrape import (
    scrape_website,
    extract_body_content,
    clean_body_content,
    split_dom_content,
)
from parse import parse_with_ollama

# Streamlit UI
st.title("AI Web Scraper")
url = st.text_input("Enter Website URL")

# Step 1: Scrape the Website
if st.button("Scrape Website"):
    if url:
        st.write("Scraping the website...")

        # Scrape the website
        dom_content = scrape_website(url)
        body_content = extract_body_content(dom_content)
        cleaned_content = clean_body_content(body_content)

        # Store the DOM content in Streamlit session state
        st.session_state.dom_content = cleaned_content

        # Display the DOM content in an expandable text box
        with st.expander("View DOM Content"):
            st.text_area("DOM Content", cleaned_content, height=300)


# Step 2: Ask Questions About the DOM Content
if "dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse")

    if st.button("Parse Content"):
        if parse_description:
            st.write("Parsing the content...")

            # Parse the content with Ollama
            dom_chunks = split_dom_content(st.session_state.dom_content)
            parsed_result = parse_with_ollama(dom_chunks, parse_description)
            st.write(parsed_result)
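To try the app locally, install the dependencies (see requirements.txt below) and start the Streamlit server; this assumes an Ollama instance and the scraping environment configured in scrape.py are available:

streamlit run main.py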
29 changes: 29 additions & 0 deletions parse.py
@@ -0,0 +1,29 @@
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully: \n\n"
    "1. **Extract Information:** Only extract the information that directly matches the provided description: {parse_description}. "
    "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response. "
    "3. **Empty Response:** If no information matches the description, return an empty string ('')."
    "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text."
)

model = OllamaLLM(model="llama3")


def parse_with_ollama(dom_chunks, parse_description):
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    parsed_results = []

    for i, chunk in enumerate(dom_chunks, start=1):
        response = chain.invoke(
            {"dom_content": chunk, "parse_description": parse_description}
        )
        print(f"Parsed batch: {i} of {len(dom_chunks)}")
        parsed_results.append(response)

    return "\n".join(parsed_results)
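A minimal smoke test for parse_with_ollama outside the Streamlit UI, assuming a local Ollama server with the llama3 model already pulled (ollama pull llama3); the sample chunks and description are hypothetical:

from parse import parse_with_ollama

chunks = [
    "Widget X - Price: $19.99. In stock.",
    "Shipping: 3-5 business days.",
]
print(parse_with_ollama(chunks, "the product price"))
# The prompt asks for an empty string when a chunk has no match, so the
# joined output should be roughly "$19.99" followed by a blank line.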
8 changes: 8 additions & 0 deletions requirements.txt
@@ -0,0 +1,8 @@
streamlit
langchain
langchain_ollama
selenium
beautifulsoup4
lxml
html5lib
python-dotenv
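These install in one step; a virtual environment is a reasonable precaution but not required:

pip install -r requirements.txt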
1 change: 1 addition & 0 deletions sample.env
@@ -0,0 +1 @@
SBR_WEBDRIVER=""
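scrape.py reads SBR_WEBDRIVER as the remote Scraping Browser WebDriver endpoint. The exact value comes from your provider's dashboard; a hypothetical Bright Data-style URL (placeholders, not real credentials) would look like:

SBR_WEBDRIVER="https://<USERNAME>:<PASSWORD>@brd.superproxy.io:9515"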
57 changes: 57 additions & 0 deletions scrape.py
@@ -0,0 +1,57 @@
from selenium.webdriver import Remote, ChromeOptions
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import os

load_dotenv()

SBR_WEBDRIVER = os.getenv("SBR_WEBDRIVER")


def scrape_website(website):
    print("Connecting to Scraping Browser...")
    sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, "goog", "chrome")
    with Remote(sbr_connection, options=ChromeOptions()) as driver:
        driver.get(website)
        print("Waiting for captcha to solve...")
        solve_res = driver.execute(
            "executeCdpCommand",
            {
                "cmd": "Captcha.waitForSolve",
                "params": {"detectTimeout": 10000},
            },
        )
        print("Captcha solve status:", solve_res["value"]["status"])
        print("Navigated! Scraping page content...")
        html = driver.page_source
        return html
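Captcha.waitForSolve is a custom CDP command exposed by the remote Scraping Browser service, not standard Chrome. For quick local testing without that endpoint, a sketch using a plain local Chrome (assumes Selenium 4, which resolves the driver automatically and supports the with statement; no captcha handling):

from selenium import webdriver

def scrape_website_local(website):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")  # run without opening a window
    with webdriver.Chrome(options=options) as driver:
        driver.get(website)
        return driver.page_source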


def extract_body_content(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    body_content = soup.body
    if body_content:
        return str(body_content)
    return ""


def clean_body_content(body_content):
    soup = BeautifulSoup(body_content, "html.parser")

    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()

    # Get text or further process the content
    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )

    return cleaned_content
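A quick illustration of the extract-then-clean pipeline on a hypothetical snippet; script tags are dropped and blank lines collapsed:

html = "<body><script>var x = 1;</script><p>  Hello </p><p>World</p></body>"
print(clean_body_content(extract_body_content(html)))
# Output:
# Hello
# World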


def split_dom_content(dom_content, max_length=6000):
    return [
        dom_content[i : i + max_length] for i in range(0, len(dom_content), max_length)
    ]
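split_dom_content slices the text at fixed offsets, ignoring word and tag boundaries, so each chunk stays within the model's context budget. A no-dependency check of the behavior:

from scrape import split_dom_content

chunks = split_dom_content("a" * 13000)
print([len(c) for c in chunks])  # [6000, 6000, 1000]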
