
Introduce SeleniumBrowser #1733

Closed · wants to merge 39 commits

Commits (39):
23ee145
Update test_web_surfer.py
signalprime Feb 19, 2024
2daec15
Update browser_utils.py
signalprime Feb 19, 2024
9efb297
Update web_surfer.py
signalprime Feb 19, 2024
217ed91
ContentAgent: Custom LLM agent for collecting online content.
signalprime Feb 19, 2024
72a165a
Update content_agent.py
signalprime Feb 19, 2024
46b2424
Update browser_utils.py
signalprime Feb 20, 2024
d34ae1b
Update content_agent.py
signalprime Feb 20, 2024
1ba9e05
Update content_agent.py
signalprime Feb 20, 2024
84fa1b8
Unit Tests for the ContentAgent
signalprime Feb 20, 2024
67f95bf
Update browser_utils.py
signalprime Feb 20, 2024
08f8ff9
Update web_surfer.py
signalprime Feb 20, 2024
3954412
Update content_agent.py
signalprime Feb 20, 2024
749a556
Update content_agent.py
signalprime Feb 20, 2024
818a010
Update browser_utils.py
signalprime Feb 20, 2024
643bad0
Update content_agent.py
signalprime Feb 20, 2024
20cd2a6
Update browser_utils.py
signalprime Feb 20, 2024
0389387
Update test_web_surfer.py
signalprime Feb 20, 2024
be89b9b
Updates to include selenium in websurfer extras, webdrivers in the py…
signalprime Feb 20, 2024
0a40763
Added the websurfer with desktop browser demo notebook
signalprime Feb 22, 2024
25e15e0
Merge branch 'main' into main
signalprime Feb 22, 2024
5602958
Restored to original form in official main branch. Added for clari…
signalprime Feb 22, 2024
8954fef
Further cleaned the two test files and confirmed they passed using th…
signalprime Feb 22, 2024
0c2202c
Update after feedback from GitHub built error, with my apologies for …
signalprime Feb 22, 2024
13ba006
Update contrib-tests.yml for Selenium
signalprime Feb 22, 2024
e1e81f6
Update contrib-openai.yml
signalprime Feb 22, 2024
0b5e733
Update contrib-tests.yml
signalprime Feb 22, 2024
9099b57
Update contrib-openai.yml
signalprime Feb 22, 2024
7443458
Update setup.py
signalprime Feb 22, 2024
1b87acd
Update test_content_agent.py
signalprime Feb 22, 2024
11b00e5
pre-commit fix on setup.py for readability (websurfer extras)
signalprime Feb 22, 2024
66ac7bd
Final cleanup of unnecessary comments within the PR.
signalprime Feb 22, 2024
6fbe0b8
Restored the original copies of the two unrelated notebooks altered b…
signalprime Feb 22, 2024
451405b
Merge branch 'main' into main
sonichi Feb 25, 2024
c06f6fd
Provided a more descriptive name for the agent responsible for collec…
signalprime Feb 25, 2024
ef7586e
Update web_surfer.py
signalprime Mar 26, 2024
2be44bc
Update browser_utils.py
signalprime Mar 26, 2024
e64ae32
Update browser_utils.py
signalprime Mar 26, 2024
3e7cf18
Update contrib-openai.yml
signalprime Mar 26, 2024
841ed31
Merge branch 'main' into main
signalprime Mar 26, 2024
Update content_agent.py
Very minor updates prior to submitting a PR
signalprime authored Feb 19, 2024
commit 72a165aec33fb3ca17d63fa0a094ef82ce0d4639
32 changes: 18 additions & 14 deletions autogen/agentchat/contrib/content_agent.py
@@ -8,7 +8,15 @@
 from urllib.parse import urlparse, urlunparse
 from bs4 import BeautifulSoup
 
+# Import the arxiv library if it is available
+IS_ARXIV_CAPABLE = False
+try:
+    import arxiv
+    IS_ARXIV_CAPABLE = True
+except ModuleNotFoundError:
+    print("The 'arxiv' library was not found in this environment, but can be installed with 'pip install arxiv'.")
+    pass
 
 from ...browser_utils import (
     SeleniumBrowser, download_using_requests,
     get_domain, get_scheme, get_path, get_last_path, get_file_path_from_url, fix_missing_protocol,
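Aside: the guard introduced above is the standard optional-dependency pattern: attempt the import once at module load, record the outcome in a flag, and let callers branch on the flag instead of retrying the import. The trailing pass after the print is redundant but harmless. A minimal standalone sketch (the helper content_type_for is hypothetical, standing in for the fetch_content dispatch shown later in this diff):

# Standalone sketch of the optional-import guard used in this commit.
IS_ARXIV_CAPABLE = False
try:
    import arxiv  # noqa: F401  (imported only to probe availability)
    IS_ARXIV_CAPABLE = True
except ModuleNotFoundError:
    print("The 'arxiv' library was not found; install it with 'pip install arxiv'.")

def content_type_for(link: str) -> str:
    # Callers consult the flag; arxiv-specific handling degrades to
    # ordinary HTML handling when the library is absent.
    return "pdf" if ("arxiv" in link and IS_ARXIV_CAPABLE) else "html"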
@@ -41,7 +49,7 @@ class ContentAgent(ConversableAgent):
     - pillow
 
     """
-    def __init__(self, silent=True, storage_path='./content', max_depth=1, page_loading_time=5, *args, **kwargs): #request_kwargs,
+    def __init__(self, silent=True, storage_path='./content', max_depth=1, page_loading_time=5, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
         from collections import deque
@@ -51,15 +59,16 @@ def __init__(self, silent=True, storage_path='./content', max_depth=1, page_loading_time=5, *args, **kwargs):
         self.local_dir = storage_path
         self.page_load_time = page_loading_time
         self.silent = silent
-        self.browser_kwargs = kwargs.get('browser_kwargs', {"browser": "firefox"}) # {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.3; rv:122.0) Gecko/20100101 Firefox/122.0"} })
+        self.browser_kwargs = kwargs.get('browser_kwargs', {"browser": "firefox"})
+        self.request_kwargs = {'headers': { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15"} }
         self.small_llm_config = kwargs['llm_config']
 
         # Define the classifiers
         self.define_classifiers()
 
-    def classifier_to_collector_reply(self, recipient, messages, sender, config): # replacement for classify_content
+    def classifier_to_collector_reply(self, recipient, messages, sender, config):
         # Inner dialogue reply for boolean classification results
         last_message = messages[-1] if isinstance(messages, list) else messages
         # print(last_message)
         _, rep = recipient.generate_oai_reply([last_message], sender)
         if 'false' in rep.lower(): rep = 'False'
         elif 'true' in rep.lower(): rep = 'True'
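The closing lines of this hunk coerce a free-form LLM verdict onto the literal strings 'True' and 'False' by substring matching. A self-contained sketch of that normalization (the function name is hypothetical, not part of the PR); note that a reply containing both words maps to 'False' because that branch is tested first:

def normalize_bool_reply(rep: str) -> str:
    # Mirrors the diff: 'false' is checked before 'true'; replies
    # matching neither are returned unchanged.
    if "false" in rep.lower():
        return "False"
    if "true" in rep.lower():
        return "True"
    return rep

assert normalize_bool_reply("TRUE.") == "True"
assert normalize_bool_reply("That is False") == "False"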
@@ -128,7 +137,7 @@ def fetch_content(self, link):
         parsed_url = urlparse(link)
 
         # A special case for arxiv links
-        if 'arxiv' in link:
+        if 'arxiv' in link and IS_ARXIV_CAPABLE:
             return 'pdf', self.fetch_arxiv_content(parsed_url)
 
         elif parsed_url.path.endswith('.pdf'):
@@ -214,10 +223,7 @@ def fetch_html_content(self, link):
 
         # Close down the browser
         self.browser.quit()
-
-        # # Deallocate the variable contents
-        # self.browser = None
 
         return 'success'
 
     def fetch_pdf_content(self, link):
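Removing the commented-out self.browser = None is sound: Selenium's quit() already closes all windows and ends the WebDriver session, so clearing the Python reference afterwards frees nothing extra. A minimal lifecycle sketch, using try/finally rather than the PR's exact structure and assuming Firefox plus geckodriver are installed:

from selenium import webdriver

browser = webdriver.Firefox()  # assumes geckodriver is on PATH
try:
    browser.get("https://example.com")
finally:
    browser.quit()  # closes all windows and terminates the driver process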
@@ -227,8 +233,8 @@ def fetch_pdf_content(self, link):
         )
         os.makedirs(local_pdf_path, exist_ok=True)
 
-
-        response = requests.get(link, params={'headers': self.request_kwargs})
+        # This could be replaced with `download_using_requests`
+        response = requests.get(link, params={'headers': self.request_kwargs['headers']})
 
         if response.status_code == 200:
             with open(local_pdf_path, 'wb') as f:
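One caveat on the updated call: in requests, the params keyword is serialized into the URL query string, so both the old and new lines send ?headers=... in the URL rather than real HTTP headers. Request headers belong in the headers keyword. A corrected sketch under the same request_kwargs layout as in __init__ above (URL and filename are placeholders):

import requests

request_kwargs = {"headers": {"User-Agent": "Mozilla/5.0"}}
link = "https://example.com/paper.pdf"  # placeholder URL

response = requests.get(link, headers=request_kwargs["headers"], timeout=30)
if response.status_code == 200:
    with open("paper.pdf", "wb") as f:
        f.write(response.content)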
@@ -246,8 +252,6 @@ def fetch_pdf_content(self, link):
             return None
 
     def fetch_arxiv_content(self, link):
-        # Import the arxiv library
-        import arxiv # todo: add try/catch
 
         # Identify the paper identification
         arxiv_id = link.path.split('/')[-1]
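The body of fetch_arxiv_content is collapsed in this view. For orientation, a hedged sketch of resolving an arXiv ID with the arxiv package's documented client API (this mirrors the package's docs, not necessarily the code in this PR):

import arxiv

arxiv_id = "1605.08386"  # e.g. link.path.split('/')[-1] as above
client = arxiv.Client()
paper = next(client.results(arxiv.Search(id_list=[arxiv_id])))
paper.download_pdf(dirpath="./content", filename=f"{arxiv_id}.pdf")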