forked from Significant-Gravitas/AutoGPT
-
Notifications
You must be signed in to change notification settings - Fork 62
/
Copy pathweb.py
85 lines (61 loc) · 2.54 KB
/
web.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from selenium import webdriver
import autogpt.summary as summary
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import logging
from pathlib import Path
from autogpt.config import Config
# Directory that contains this module; used to locate the bundled js/overlay.js.
file_dir = Path(__file__).parent
# Module-level configuration object (not referenced by the functions in this section).
cfg = Config()
def browse_website(url, question):
    """Open a web page, summarize it for a question and list its first links.

    The page is loaded in a Selenium-driven browser, the overlay script is
    injected, the page text is summarized with respect to ``question`` and at
    most five hyperlinks from the page are collected.

    Args:
        url: Address of the page to open.
        question: Question handed to the summarizer together with the page text.

    Returns:
        A ``(message, driver)`` tuple. ``message`` is the formatted summary
        plus link list; ``driver`` is the WebDriver that was used, which has
        already been quit when this function returns.
    """
    driver, text = scrape_text_with_selenium(url)
    try:
        add_header(driver)
        summary_text = summary.summarize_text(driver, text, question)
        # Limit links to 5 (slicing is a no-op on shorter lists).
        links = scrape_links_with_selenium(driver)[:5]
    finally:
        # Always shut the browser down, even when summarizing or link
        # scraping raises; otherwise the browser process is left running.
        close_browser(driver)
    return f"从网站收集的答案: {summary_text} \n \n 链接: {links}", driver
def scrape_text_with_selenium(url):
    """Load ``url`` in Chrome and return the driver with the page's visible text.

    Args:
        url: Address of the page to load.

    Returns:
        A ``(driver, text)`` tuple. ``driver`` is the still-open Chrome
        WebDriver (the caller is responsible for quitting it); ``text`` is the
        body text with ``<script>``/``<style>`` content removed and empty
        chunks dropped, one chunk per line.

    Raises:
        Whatever Selenium or the parser raises while loading or reading the
        page. In that case the browser is quit before the exception
        propagates.
    """
    # Silence Selenium's own logging below CRITICAL.
    logging.getLogger("selenium").setLevel(logging.CRITICAL)

    options = Options()
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.49 Safari/537.36"
    )
    driver = webdriver.Chrome(
        executable_path=ChromeDriverManager().install(), options=options
    )

    try:
        driver.get(url)
        # Wait up to 10 seconds for the <body> element to exist.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Get the HTML content directly from the browser's DOM
        page_source = driver.execute_script("return document.body.outerHTML;")
        soup = BeautifulSoup(page_source, "html.parser")

        # Script and style elements carry no readable text; remove them.
        for element in soup(["script", "style"]):
            element.extract()

        raw_text = soup.get_text()
        lines = (line.strip() for line in raw_text.splitlines())
        # NOTE(review): splitting on a single space puts every word on its own
        # output line -- confirm this separator is intended.
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = "\n".join(chunk for chunk in chunks if chunk)
    except BaseException:
        # The browser is already running at this point; quit it so a failed
        # page load does not leave an orphaned process, then re-raise.
        driver.quit()
        raise

    return driver, text
def scrape_links_with_selenium(driver):
    """Return the hyperlinks of the driver's current page as formatted strings.

    Args:
        driver: WebDriver whose ``page_source`` is parsed.

    Returns:
        A list of ``"text (url)"`` strings, one per anchor that has an
        ``href`` attribute.
    """
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Remove script and style elements before collecting anchors.
    for element in soup(["script", "style"]):
        element.extract()

    return format_hyperlinks(extract_hyperlinks(soup))
def close_browser(driver):
    """Quit the given WebDriver by calling its ``quit()`` method.

    Args:
        driver: Object exposing a ``quit()`` method.
    """
    driver.quit()
def extract_hyperlinks(soup):
    """Collect ``(text, href)`` pairs for every anchor that has an ``href``.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        A list of ``(link_text, link_url)`` tuples in the order ``find_all``
        yields them.
    """
    pairs = []
    for anchor in soup.find_all("a", href=True):
        pairs.append((anchor.text, anchor["href"]))
    return pairs
def format_hyperlinks(hyperlinks):
    """Render ``(text, url)`` pairs as ``"text (url)"`` strings.

    Args:
        hyperlinks: Iterable of two-item ``(link_text, link_url)`` pairs.

    Returns:
        A list with one formatted string per pair, in input order.
    """
    formatted = []
    for text, url in hyperlinks:
        formatted.append(f"{text} ({url})")
    return formatted
def add_header(driver):
    """Inject the bundled overlay script into the driver's current page.

    Reads ``js/overlay.js`` from the directory of this module and runs it in
    the browser via ``driver.execute_script``.

    Args:
        driver: WebDriver on which the script is executed.
    """
    # read_text opens and closes the file itself, so no handle is leaked;
    # the encoding is explicit so the result does not depend on the locale.
    overlay_script = (file_dir / "js" / "overlay.js").read_text(encoding="utf-8")
    driver.execute_script(overlay_script)