Google Search API #3049

Open
wants to merge 11 commits into base: 0.2
1 change: 1 addition & 0 deletions .gitignore
@@ -19,6 +19,7 @@ __pycache__/
*.so

# Distribution / packaging
.devcontainer/
.Python
build/
develop-eggs/
Empty file.
10 changes: 6 additions & 4 deletions autogen/agentchat/contrib/web_surfer.py
@@ -9,7 +9,7 @@
from typing_extensions import Annotated

from ... import Agent, AssistantAgent, ConversableAgent, GroupChat, GroupChatManager, OpenAIWrapper, UserProxyAgent
from ...browser_utils import SimpleTextBrowser
from ...browser_utils.browser_creator import TextBrowserEnum
from ...code_utils import content_str
from ...oai.openai_utils import filter_config
from ...token_count_utils import count_token, get_max_token_limit
@@ -40,6 +40,7 @@ def __init__(
llm_config: Optional[Union[Dict, Literal[False]]] = None,
summarizer_llm_config: Optional[Union[Dict, Literal[False]]] = None,
default_auto_reply: Optional[Union[str, Dict, None]] = "",
browser_name: Literal['bing', 'google'] = "bing",
browser_config: Optional[Union[Dict, None]] = None,
):
super().__init__(
@@ -58,7 +59,8 @@ def __init__(
self._create_summarizer_client(summarizer_llm_config, llm_config)

# Create the browser
self.browser = SimpleTextBrowser(**(browser_config if browser_config else {}))
self.browser_name = browser_name
self.browser = TextBrowserEnum.get_browser(browser_name)

inner_llm_config = copy.deepcopy(llm_config)

@@ -136,7 +138,7 @@ def _browser_state() -> Tuple[str, str]:
description="Perform an INFORMATIONAL web search query then return the search results.",
)
def _informational_search(query: Annotated[str, "The informational web search query to perform."]) -> str:
self.browser.visit_page(f"bing: {query}")
self.browser.visit_page(f"{self.browser_name}: {query}")
header, content = _browser_state()
return header.strip() + "\n=======================\n" + content

@@ -146,7 +148,7 @@ def _informational_search(query: Annotated[str, "The informational web search query to perform."]) -> str:
description="Perform a NAVIGATIONAL web search query then immediately navigate to the top result. Useful, for example, to navigate to a particular Wikipedia article or other known destination. Equivalent to Google's \"I'm Feeling Lucky\" button.",
)
def _navigational_search(query: Annotated[str, "The navigational web search query to perform."]) -> str:
self.browser.visit_page(f"bing: {query}")
self.browser.visit_page(f"{self.browser_name}: {query}")

            # Extract the first link
m = re.search(r"\[.*?\]\((http.*?)\)", self.browser.page_content)
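The new browser_creator.py module is part of this PR but is not shown in this excerpt, so the following is only a minimal sketch of a TextBrowserEnum.get_browser factory consistent with the call sites above; the subclass names and registry layout are assumptions, not the PR's actual code.

# Hypothetical sketch of a name-to-browser factory matching the call
# sites above; BingTextBrowser and GoogleTextBrowser are assumed names.
from typing import Dict, Type

from autogen.browser_utils.base_browser import TextBrowserBase


class BingTextBrowser(TextBrowserBase):  # assumed subclass, not shown here
    pass


class GoogleTextBrowser(TextBrowserBase):  # assumed subclass, not shown here
    pass


class TextBrowserEnum:
    """Resolve a browser name ('bing' or 'google') to a browser instance."""

    _registry: Dict[str, Type[TextBrowserBase]] = {
        "bing": BingTextBrowser,
        "google": GoogleTextBrowser,
    }

    @classmethod
    def get_browser(cls, browser_name: str) -> TextBrowserBase:
        try:
            return cls._registry[browser_name]()
        except KeyError:
            raise ValueError(f"Unsupported browser: {browser_name!r}")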
2 changes: 1 addition & 1 deletion autogen/browser_utils.py
@@ -28,7 +28,7 @@
pass


class SimpleTextBrowser:
class TextBrowserBase:
"""(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""

def __init__(
Empty file.
216 changes: 216 additions & 0 deletions autogen/browser_utils/base_browser.py
@@ -0,0 +1,216 @@
import io
import mimetypes
import os
import re
import uuid
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.parse import urljoin, urlparse

import markdownify
import requests
from bs4 import BeautifulSoup

# Optional PDF support
IS_PDF_CAPABLE = False
try:
import pdfminer
import pdfminer.high_level

IS_PDF_CAPABLE = True
except ModuleNotFoundError:
pass

# Other optional dependencies
try:
import pathvalidate
except ModuleNotFoundError:
pass


class TextBrowserBase:
"""(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""

def __init__(
self,
start_page: Optional[str] = None,
viewport_size: Optional[int] = 1024 * 8,
downloads_folder: Optional[Union[str, None]] = None,
        base_url: Optional[str] = None,
api_key: Optional[Union[str, None]] = None,
request_kwargs: Optional[Union[Dict[str, Any], None]] = None,
):
self.start_page: str = start_page if start_page else "about:blank"
self.viewport_size = viewport_size # Applies only to the standard uri types
self.downloads_folder = downloads_folder
self.history: List[str] = list()
self.page_title: Optional[str] = None
self.viewport_current_page = 0
self.viewport_pages: List[Tuple[int, int]] = list()
self.set_address(self.start_page)
self.base_url = base_url
self.api_key = api_key
self.request_kwargs = request_kwargs

self._page_content = ""

@property
def address(self) -> str:
"""Return the address of the current page."""
return self.history[-1]

    def set_address(self, uri_or_path: str) -> None:
        """Update the current address and append it to the browsing history."""
self.history.append(uri_or_path)

self.viewport_current_page = 0

@property
def viewport(self) -> str:
"""Return the content of the current viewport."""
bounds = self.viewport_pages[self.viewport_current_page]
return self.page_content[bounds[0] : bounds[1]]

@property
def page_content(self) -> str:
"""Return the full contents of the current page."""
return self._page_content

def _set_page_content(self, content: str) -> None:
"""Sets the text content of the current page."""
self._page_content = content
self._split_pages()
if self.viewport_current_page >= len(self.viewport_pages):
self.viewport_current_page = len(self.viewport_pages) - 1

def page_down(self) -> None:
self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)

def page_up(self) -> None:
self.viewport_current_page = max(self.viewport_current_page - 1, 0)

def visit_page(self, path_or_uri: str) -> str:
"""Update the address, visit the page, and return the content of the viewport."""
self.set_address(path_or_uri)
return self.viewport

def _split_pages(self) -> None:
# Split only regular pages
if not self.address.startswith("http:") and not self.address.startswith("https:"):
self.viewport_pages = [(0, len(self._page_content))]
return

# Handle empty pages
if len(self._page_content) == 0:
self.viewport_pages = [(0, 0)]
return

# Break the viewport into pages
self.viewport_pages = []
start_idx = 0
while start_idx < len(self._page_content):
end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator]
# Adjust to end on a space
while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
end_idx += 1
self.viewport_pages.append((start_idx, end_idx))
start_idx = end_idx

def _fetch_page(self, url: str) -> None:
try:
# Prepare the request parameters
request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
Collaborator

Have you considered the benefits of using Playwright as opposed to a GET request? A lot of pages need DOM composition and JS execution to fully compose a usable page. The LlamaIndex web loader is doing the same. That would make for a much more reliable user experience and will meet expectations for web scraping tasks.

Author

I have not thought about using a third-party library. As daunting as it might sound, my first approach was to keep it simple and as it was. However, we can go with either Playwright or the LlamaIndex web loader; both are fine. But then we will need to make sure that the whole library implements requests the same way. Let me know your thoughts. @colombod

Collaborator
colombod Aug 13, 2024

What I am proposing is to just use that and move from a web GET to actual web navigation. Playwright is also what LlamaIndex uses for navigation, and it will be useful as we see more multimodal approaches.

Author

@colombod I started working on adding Playwright to the PR. However, its async functionality seems a bit unstable, giving me timeout errors on one request and then working properly on another. I am not sure if you have any experience overcoming this.

Collaborator

That is not the experience I have with it. Can you show one of the issues you are facing? Maybe I can help, or we could get Playwright engineers to investigate.

Author
MohammedNagdy Sep 24, 2024

@colombod Well, I have been trying for the past couple of weeks, but I always get these failed tests:

FAILED test/test_browser_utils_google.py::test_simple_text_browser - assert 'Redmond' in 'Page.goto: net::ERR_ABORTED at https://en.wikipedia.org/wiki/Microsoft\nCall log:\nnavigating to "https://en.wikipedia.org/wiki/Microsoft", waiting until "networkidle"\n'
FAILED test/test_browser_utils_google.py::test_google_search - assert "A Google search for 'Microsoft' found" in 'Page.goto: net::ERR_ABORTED at google: Microsoft\nCall log:\nnavigating to "google: Microsoft", waiting until "networkidle"\n'

even though I have the timeout increased to 600 seconds:

response = await self._page.goto(url, wait_until='networkidle', timeout=600000)
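For reference, a minimal, hypothetical sketch of a workaround sometimes tried when waiting for "networkidle" aborts or times out is to wait only for "domcontentloaded"; this helper is not part of the PR:

import asyncio

from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright.async_api import async_playwright


async def fetch_rendered_html(url: str) -> str:
    # Hypothetical helper: wait only for "domcontentloaded", since
    # "networkidle" may never settle on pages that keep connections open.
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=30_000)
        except PlaywrightTimeoutError:
            pass  # fall through and read whatever has rendered so far
        html = await page.content()
        await browser.close()
        return html


# e.g. asyncio.run(fetch_rendered_html("https://en.wikipedia.org/wiki/Microsoft"))

Note also that the second failing test passes the pseudo-URL "google: Microsoft" straight to page.goto, which Playwright rejects; the search-prefix handling would need to resolve the query to a real URL before navigating.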

request_kwargs["stream"] = True

# Send a HTTP request to the URL
response = requests.get(url, **request_kwargs)
response.raise_for_status()

# If the HTTP request returns a status code 200, proceed
if response.status_code == 200:
content_type = response.headers.get("content-type", "")
for ct in ["text/html", "text/plain", "application/pdf"]:
if ct in content_type.lower():
content_type = ct
break

if content_type == "text/html":
# Get the content of the response
html = ""
for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
html += chunk

soup = BeautifulSoup(html, "html.parser")

# Remove javascript and style blocks
for script in soup(["script", "style"]):
script.extract()

# Convert to markdown -- Wikipedia gets special attention to get a clean version of the page
if url.startswith("https://en.wikipedia.org/"):
body_elm = soup.find("div", {"id": "mw-content-text"})
title_elm = soup.find("span", {"class": "mw-page-title-main"})

if body_elm:
# What's the title
main_title = soup.title.string
if title_elm and len(title_elm) > 0:
main_title = title_elm.string
webpage_text = (
"# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm)
)
else:
webpage_text = markdownify.MarkdownConverter().convert_soup(soup)
else:
webpage_text = markdownify.MarkdownConverter().convert_soup(soup)

# Convert newlines
webpage_text = re.sub(r"\r\n", "\n", webpage_text)

# Remove excessive blank lines
self.page_title = soup.title.string
self._set_page_content(re.sub(r"\n{2,}", "\n\n", webpage_text).strip())
elif content_type == "text/plain":
# Get the content of the response
plain_text = ""
for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
plain_text += chunk

self.page_title = None
self._set_page_content(plain_text)
elif IS_PDF_CAPABLE and content_type == "application/pdf":
pdf_data = io.BytesIO(response.raw.read())
self.page_title = None
self._set_page_content(pdfminer.high_level.extract_text(pdf_data))
elif self.downloads_folder is not None:
# Try producing a safe filename
fname = None
try:
fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
except NameError:
pass

# No suitable name, so make one
if fname is None:
extension = mimetypes.guess_extension(content_type)
if extension is None:
extension = ".download"
fname = str(uuid.uuid4()) + extension

# Open a file for writing
download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
with open(download_path, "wb") as fh:
for chunk in response.iter_content(chunk_size=512):
fh.write(chunk)

# Return a page describing what just happened
self.page_title = "Download complete."
self._set_page_content(f"Downloaded '{url}' to '{download_path}'.")
else:
self.page_title = f"Error - Unsupported Content-Type '{content_type}'"
self._set_page_content(self.page_title)
else:
self.page_title = "Error"
self._set_page_content("Failed to retrieve " + url)
except requests.exceptions.RequestException as e:
self.page_title = "Error"
self._set_page_content(str(e))
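Taken together, TextBrowserBase supports a simple fetch-then-page workflow, with _split_pages breaking fetched text into viewport-sized chunks. A minimal usage sketch follows, assuming a hypothetical subclass that wires set_address to _fetch_page for http(s) URLs (the base class shown above does not do this by itself):

from autogen.browser_utils.base_browser import TextBrowserBase


class HttpTextBrowser(TextBrowserBase):
    # Hypothetical subclass: fetch real pages when given an http(s) URL.
    def set_address(self, uri_or_path: str) -> None:
        self.history.append(uri_or_path)
        self.viewport_current_page = 0
        if uri_or_path.startswith(("http:", "https:")):
            self._fetch_page(uri_or_path)


browser = HttpTextBrowser(viewport_size=1024 * 8)
browser.visit_page("https://en.wikipedia.org/wiki/Microsoft")
print(browser.page_title)
print(browser.viewport)  # first viewport-sized chunk of the page text
browser.page_down()      # advance one chunk
print(browser.viewport)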