Skip to content

Commit

Permalink
feat: partition_html directly from a url (Unstructured-IO#202)
Browse files Browse the repository at this point in the history
* added tests for html from url

* bump version

* added types-requests

* and -> an
  • Loading branch information
MthwRobinson authored Feb 7, 2023
1 parent 2b88890 commit ee9f154
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 7 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
## 0.4.7-dev0
## 0.4.7-dev1

* Added the ability to pull an HTML document from a url in `partition_html`.

## 0.4.6

Expand Down
1 change: 1 addition & 0 deletions requirements/test.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pytest-cov
# issue to address
# ref: https://github.com/Unstructured-IO/unstructured/issues/200
label_studio_sdk==0.0.17
types-requests
vcrpy

# NOTE(robinson) - The following pins are to address
Expand Down
4 changes: 4 additions & 0 deletions requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ tomli==2.0.1
# coverage
# mypy
# pytest
types-requests==2.28.11.8
# via -r requirements/test.in
types-urllib3==1.26.25.4
# via types-requests
typing-extensions==4.4.0
# via
# black
Expand Down
47 changes: 47 additions & 0 deletions test_unstructured/partition/test_html_partition.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os
import pathlib
import pytest
from unittest.mock import patch

import requests

from unstructured.partition.html import partition_html

Expand Down Expand Up @@ -29,6 +32,50 @@ def test_partition_html_from_text():
assert len(elements) > 0


class MockResponse:
    """Minimal stand-in for ``requests.Response`` used when patching ``requests.get``.

    Only the attributes assigned in ``__init__`` are provided: ``text``,
    ``status_code``, ``ok`` and ``headers``.
    """

    def __init__(self, text, status_code, headers=None):
        """Store the canned response data.

        Parameters
        ----------
        text
            Body of the fake response.
        status_code
            HTTP status code the fake response reports.
        headers
            Optional mapping of response headers. A new empty dict is used
            when omitted.
        """
        self.text = text
        self.status_code = status_code
        # Mirror requests.Response.ok, which is True for any status below 400
        # (redirects are not errors).
        self.ok = status_code < 400
        # Build a fresh dict per instance; a ``{}`` default in the signature
        # would be shared (and mutated) across every MockResponse.
        self.headers = {} if headers is None else headers


def test_partition_html_from_url():
    """A successful ``text/html`` response fetched via ``url`` yields at least one element."""
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    # Explicit encoding so the fixture is decoded the same way on every platform
    # (assumes the fixture is UTF-8 -- TODO confirm).
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()

    response = MockResponse(text=text, status_code=200, headers={"Content-Type": "text/html"})
    # Patch requests.get so the canned response is returned instead of hitting the network.
    with patch.object(requests, "get", return_value=response):
        elements = partition_html(url="https://fake.url")

    assert len(elements) > 0


def test_partition_html_from_url_raises_with_bad_status_code():
    """A 500 response for the requested ``url`` makes ``partition_html`` raise ``ValueError``."""
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    # Explicit encoding so the fixture is decoded the same way on every platform
    # (assumes the fixture is UTF-8 -- TODO confirm).
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()

    response = MockResponse(text=text, status_code=500, headers={"Content-Type": "text/html"})
    # requests.get is patched to return the canned error response; the call must then fail.
    with patch.object(requests, "get", return_value=response), pytest.raises(ValueError):
        partition_html(url="https://fake.url")


def test_partition_html_from_url_raises_with_bad_content_type():
    """A non-HTML ``Content-Type`` on the response makes ``partition_html`` raise ``ValueError``."""
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    # Explicit encoding so the fixture is decoded the same way on every platform
    # (assumes the fixture is UTF-8 -- TODO confirm).
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()

    # Status is fine; only the declared content type is wrong.
    response = MockResponse(
        text=text, status_code=200, headers={"Content-Type": "application/json"}
    )
    with patch.object(requests, "get", return_value=response), pytest.raises(ValueError):
        partition_html(url="https://fake.url")


def test_partition_html_raises_with_none_specified():
    """``partition_html`` must reject a call that supplies no input source."""
    # Called with no arguments at all, so a ValueError is expected.
    with pytest.raises(ValueError):
        partition_html()
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.7-dev0" # pragma: no cover
__version__ = "0.4.7-dev1" # pragma: no cover
29 changes: 24 additions & 5 deletions unstructured/partition/html.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
from typing import IO, List, Optional

import requests

from unstructured.documents.elements import Element
from unstructured.documents.html import HTMLDocument


def partition_html(
filename: Optional[str] = None, file: Optional[IO] = None, text: Optional[str] = None
filename: Optional[str] = None,
file: Optional[IO] = None,
text: Optional[str] = None,
url: Optional[str] = None,
) -> List[Element]:
"""Partitions an HTML document into its constituent elements.
Expand All @@ -17,15 +22,17 @@ def partition_html(
A file-like object using "r" mode --> open(filename, "r").
text
The string representation of the HTML document.
url
The URL of a webpage to parse. Only for URLs that return an HTML document.
"""
if not any([filename, file, text]):
if not any([filename, file, text, url]):
raise ValueError("One of filename, file, or text must be specified.")

if filename is not None and not file and not text:
if filename is not None and not file and not text and not url:
document = HTMLDocument.from_file(filename)
elements = document.elements

elif file is not None and not filename and not text:
elif file is not None and not filename and not text and not url:
file_content = file.read()
if isinstance(file_content, bytes):
file_text = file_content.decode("utf-8")
Expand All @@ -35,11 +42,23 @@ def partition_html(
document = HTMLDocument.from_string(file_text)
elements = document.elements

elif text is not None and not filename and not file:
elif text is not None and not filename and not file and not url:
_text: str = str(text)
document = HTMLDocument.from_string(_text)
elements = document.elements

elif url is not None and not filename and not file and not text:
response = requests.get(url)
if not response.ok:
raise ValueError(f"URL return an error: {response.status_code} {response.text}")

content_type = response.headers.get("Content-Type", "")
if not content_type.startswith("text/html"):
raise ValueError(f"Expected content type text/html. Got {content_type}.")

document = HTMLDocument.from_string(response.text)
elements = document.elements

else:
raise ValueError("Only one of filename, file, or text can be specified.")

Expand Down

0 comments on commit ee9f154

Please sign in to comment.