From ee9f15483f0305ecb9b56ddc329bc514da9c8019 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Tue, 7 Feb 2023 09:09:34 -0500 Subject: [PATCH] feat: `partition_html` directly from a url (#202) * added tests for html from url * bump version * added types-requests * and -> an --- CHANGELOG.md | 4 +- requirements/test.in | 1 + requirements/test.txt | 4 ++ .../partition/test_html_partition.py | 47 +++++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/partition/html.py | 29 ++++++++++-- 6 files changed, 80 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58cfc477ab..368abfaed9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,6 @@ -## 0.4.7-dev0 +## 0.4.7-dev1 + +* Added the ability to pull an HTML document from a url in `partition_html`. ## 0.4.6 diff --git a/requirements/test.in b/requirements/test.in index bdeba9a08e..5597898573 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -11,6 +11,7 @@ pytest-cov # issue to address # ref: https://github.com/Unstructured-IO/unstructured/issues/200 label_studio_sdk==0.0.17 +types-requests vcrpy # NOTE(robinson) - The following pins are to address diff --git a/requirements/test.txt b/requirements/test.txt index b9e095bd84..193fa5b4bd 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -78,6 +78,10 @@ tomli==2.0.1 # coverage # mypy # pytest +types-requests==2.28.11.8 + # via -r requirements/test.in +types-urllib3==1.26.25.4 + # via types-requests typing-extensions==4.4.0 # via # black diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 37fa325bea..f441994247 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -1,6 +1,9 @@ import os import pathlib import pytest +from unittest.mock import patch + +import requests from unstructured.partition.html import partition_html @@ -29,6 +32,50 @@ def test_partition_html_from_text(): assert len(elements) > 0 +class MockResponse: + def __init__(self, text, status_code, headers={}): + self.text = text + self.status_code = status_code + self.ok = status_code < 300 + self.headers = headers + + +def test_partition_html_from_url(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") + with open(filename, "r") as f: + text = f.read() + + response = MockResponse(text=text, status_code=200, headers={"Content-Type": "text/html"}) + with patch.object(requests, "get", return_value=response) as _: + elements = partition_html(url="https://fake.url") + + assert len(elements) > 0 + + +def test_partition_html_from_url_raises_with_bad_status_code(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") + with open(filename, "r") as f: + text = f.read() + + response = MockResponse(text=text, status_code=500, headers={"Content-Type": "text/html"}) + with patch.object(requests, "get", return_value=response) as _: + with pytest.raises(ValueError): + partition_html(url="https://fake.url") + + +def test_partition_html_from_url_raises_with_bad_content_type(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") + with open(filename, "r") as f: + text = f.read() + + response = MockResponse( + text=text, status_code=200, headers={"Content-Type": "application/json"} + ) + with patch.object(requests, "get", return_value=response) as _: + with pytest.raises(ValueError): + partition_html(url="https://fake.url") + + def test_partition_html_raises_with_none_specified(): with pytest.raises(ValueError): partition_html() diff --git a/unstructured/__version__.py b/unstructured/__version__.py index b3a836b02f..b6d6d3b1ca 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.4.7-dev0" # pragma: no cover +__version__ = "0.4.7-dev1" # pragma: no cover diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py index 12237f3ae2..faca8953d0 100644 --- a/unstructured/partition/html.py +++ b/unstructured/partition/html.py @@ -1,11 +1,16 @@ from typing import IO, List, Optional +import requests + from unstructured.documents.elements import Element from unstructured.documents.html import HTMLDocument def partition_html( - filename: Optional[str] = None, file: Optional[IO] = None, text: Optional[str] = None + filename: Optional[str] = None, + file: Optional[IO] = None, + text: Optional[str] = None, + url: Optional[str] = None, ) -> List[Element]: """Partitions an HTML document into its constituent elements. @@ -17,15 +22,17 @@ def partition_html( A file-like object using "r" mode --> open(filename, "r"). text The string representation of the HTML document. + url + The URL of a webpage to parse. Only for URLs that return an HTML document. """ - if not any([filename, file, text]): + if not any([filename, file, text, url]): raise ValueError("One of filename, file, or text must be specified.") - if filename is not None and not file and not text: + if filename is not None and not file and not text and not url: document = HTMLDocument.from_file(filename) elements = document.elements - elif file is not None and not filename and not text: + elif file is not None and not filename and not text and not url: file_content = file.read() if isinstance(file_content, bytes): file_text = file_content.decode("utf-8") @@ -35,11 +42,23 @@ def partition_html( document = HTMLDocument.from_string(file_text) elements = document.elements - elif text is not None and not filename and not file: + elif text is not None and not filename and not file and not url: _text: str = str(text) document = HTMLDocument.from_string(_text) elements = document.elements + elif url is not None and not filename and not file and not text: + response = requests.get(url) + if not response.ok: + raise ValueError(f"URL return an error: {response.status_code} {response.text}") + + content_type = response.headers.get("Content-Type", "") + if not content_type.startswith("text/html"): + raise ValueError(f"Expected content type text/html. Got {content_type}.") + + document = HTMLDocument.from_string(response.text) + elements = document.elements + else: raise ValueError("Only one of filename, file, or text can be specified.")