Commit 48b40ff (1 parent: b5628e4)
Showing 6 changed files with 138 additions and 0 deletions.
Binary file not shown.
main.py
@@ -0,0 +1,43 @@
import streamlit as st
from scrape import (
    scrape_website,
    extract_body_content,
    clean_body_content,
    split_dom_content,
)
from parse import parse_with_ollama

# Streamlit UI
st.title("AI Web Scraper")
url = st.text_input("Enter Website URL")

# Step 1: Scrape the Website
if st.button("Scrape Website"):
    if url:
        st.write("Scraping the website...")

        # Scrape the website
        dom_content = scrape_website(url)
        body_content = extract_body_content(dom_content)
        cleaned_content = clean_body_content(body_content)

        # Store the DOM content in Streamlit session state
        st.session_state.dom_content = cleaned_content

        # Display the DOM content in an expandable text box
        with st.expander("View DOM Content"):
            st.text_area("DOM Content", cleaned_content, height=300)


# Step 2: Ask Questions About the DOM Content
if "dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse")

    if st.button("Parse Content"):
        if parse_description:
            st.write("Parsing the content...")

            # Parse the content with Ollama
            dom_chunks = split_dom_content(st.session_state.dom_content)
            parsed_result = parse_with_ollama(dom_chunks, parse_description)
            st.write(parsed_result)
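To run the app end to end, an Ollama server needs to be running locally with the llama3 model pulled (ollama pull llama3), and the scraping-browser endpoint must be set in the .env file shown further below. Assuming the file above is saved as main.py (the filename was stripped from this diff), the UI starts with:

streamlit run main.py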
parse.py
@@ -0,0 +1,29 @@
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully: \n\n"
    "1. **Extract Information:** Only extract the information that directly matches the provided description: {parse_description}. "
    "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response. "
    "3. **Empty Response:** If no information matches the description, return an empty string (''). "
    "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text."
)

model = OllamaLLM(model="llama3")


def parse_with_ollama(dom_chunks, parse_description):
    # Build a prompt | model chain and run every DOM chunk through it
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    parsed_results = []

    for i, chunk in enumerate(dom_chunks, start=1):
        response = chain.invoke(
            {"dom_content": chunk, "parse_description": parse_description}
        )
        print(f"Parsed batch: {i} of {len(dom_chunks)}")
        parsed_results.append(response)

    # Join the per-chunk answers into a single result string
    return "\n".join(parsed_results)
requirements.txt
@@ -0,0 +1,8 @@
streamlit
langchain
langchain_ollama
selenium
beautifulsoup4
lxml
html5lib
python-dotenv
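With the file above saved as requirements.txt, the dependencies install with pip install -r requirements.txt, ideally inside a virtual environment.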
.env
@@ -0,0 +1 @@
SBR_WEBDRIVER=""
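scrape.py below reads SBR_WEBDRIVER via python-dotenv; the value between the quotes should be the full remote WebDriver endpoint URL, credentials included, of the scraping-browser service. The Captcha.waitForSolve CDP command in scrape.py suggests the code targets Bright Data's Scraping Browser, but any compatible remote Chromium endpoint would be configured the same way.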
scrape.py
@@ -0,0 +1,57 @@
from selenium.webdriver import Remote, ChromeOptions
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import os

load_dotenv()

SBR_WEBDRIVER = os.getenv("SBR_WEBDRIVER")


def scrape_website(website):
    print("Connecting to Scraping Browser...")
    sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, "goog", "chrome")
    with Remote(sbr_connection, options=ChromeOptions()) as driver:
        driver.get(website)
        # Ask the remote browser (via a CDP command) to detect and solve any captcha
        print("Waiting for the captcha to be solved...")
        solve_res = driver.execute(
            "executeCdpCommand",
            {
                "cmd": "Captcha.waitForSolve",
                "params": {"detectTimeout": 10000},
            },
        )
        print("Captcha solve status:", solve_res["value"]["status"])
        print("Navigated! Scraping page content...")
        html = driver.page_source
        return html


def extract_body_content(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    body_content = soup.body
    if body_content:
        return str(body_content)
    return ""


def clean_body_content(body_content):
    soup = BeautifulSoup(body_content, "html.parser")

    # Drop script and style tags so only visible text remains
    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()

    # Get text or further process the content
    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )

    return cleaned_content


def split_dom_content(dom_content, max_length=6000):
    # Slice the text into fixed-width, non-overlapping chunks
    return [
        dom_content[i : i + max_length] for i in range(0, len(dom_content), max_length)
    ]
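split_dom_content is plain fixed-width slicing with no overlap between chunks; a quick illustration (the input string is synthetic):

# Synthetic input just to show the resulting chunk sizes.
from scrape import split_dom_content

text = "x" * 13000
chunks = split_dom_content(text, max_length=6000)
print([len(c) for c in chunks])  # -> [6000, 6000, 1000]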