diff --git a/CHANGELOG.md b/CHANGELOG.md index 61f8dd2fc2..9106006941 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,9 @@ -## 0.5.7-dev2 +## 0.5.7-dev3 ### Enhancements * Refactored codebase using `exactly_one` +* Adds ability to pass headers when passing a url in partition_html() ### Features diff --git a/docs/requirements.txt b/docs/requirements.txt index 3e5217d8c8..1522236949 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,7 +8,7 @@ alabaster==0.7.13 # via sphinx babel==2.12.1 # via sphinx -beautifulsoup4==4.11.2 +beautifulsoup4==4.12.0 # via furo certifi==2022.12.7 # via @@ -20,13 +20,13 @@ docutils==0.18.1 # via # sphinx # sphinx-rtd-theme -furo==2022.12.7 +furo==2023.3.23 # via -r requirements/build.in idna==3.4 # via requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via sphinx jinja2==3.1.2 # via sphinx @@ -52,6 +52,7 @@ sphinx==6.1.3 # furo # sphinx-basic-ng # sphinx-rtd-theme + # sphinxcontrib-jquery sphinx-basic-ng==1.0.0b1 # via furo sphinx-rtd-theme==1.2.0rc3 @@ -62,7 +63,7 @@ sphinxcontrib-devhelp==1.0.2 # via sphinx sphinxcontrib-htmlhelp==2.0.1 # via sphinx -sphinxcontrib-jquery==3.0.0 +sphinxcontrib-jquery==4.1 # via sphinx-rtd-theme sphinxcontrib-jsmath==1.0.1 # via sphinx @@ -70,10 +71,7 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -urllib3==1.26.14 +urllib3==1.26.15 # via requests zipp==3.15.0 # via importlib-metadata - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index bed7d41903..336c9836c1 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -210,10 +210,13 @@ Examples: The ``partition_html`` function partitions an HTML document and returns a list of document ``Element`` objects. ``partition_html`` can take a filename, file-like -object, or string as input. 
The three examples below all produce the same output. +object, string, or url as input. Examples: +These three invocations of partition_html() are essentially equivalent: + + .. code:: python from unstructured.partition.html import partition_html @@ -228,6 +231,22 @@ Examples: elements = partition_html(text=text) + +The following illustrates fetching a url and partitioning the response content. + +.. code:: python + + from unstructured.partition.html import partition_html + + elements = partition_html(url="https://python.org/") + + # you can also provide custom headers: + + elements = partition_html(url="https://python.org/", + headers={"User-Agent": "YourScriptName/1.0 ..."}) + + + ``partition_pdf`` --------------------- diff --git a/examples/sec-sentiment-analysis/fetch.py b/examples/sec-sentiment-analysis/fetch.py index 5972226aaa..a058dc8bc8 100644 --- a/examples/sec-sentiment-analysis/fetch.py +++ b/examples/sec-sentiment-analysis/fetch.py @@ -65,7 +65,7 @@ def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict: response.raise_for_status() content = json.loads(response.content) recent_forms = content["filings"]["recent"] - form_types = {k: v for k, v in zip(recent_forms["accessionNumber"], recent_forms["form"])} + form_types = dict(zip(recent_forms["accessionNumber"], recent_forms["form"])) return form_types diff --git a/requirements/base.txt b/requirements/base.txt index aaf27c4cc7..a8f264fc7e 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,12 +4,9 @@ # # pip-compile --output-file=requirements/base.txt # ---extra-index-url https://pypi.ngc.nvidia.com ---trusted-host pypi.ngc.nvidia.com - anyio==3.6.2 # via httpcore -argilla==1.4.0 +argilla==1.5.0 # via unstructured (setup.py) backoff==2.2.1 # via argilla @@ -40,7 +37,7 @@ idna==3.4 # anyio # requests # rfc3986 -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via markdown joblib==1.2.0 # via nltk @@ -49,7 +46,7 @@ lxml==4.9.2 # python-docx # python-pptx 
# unstructured (setup.py) -markdown==3.4.1 +markdown==3.4.3 # via unstructured (setup.py) monotonic==1.6 # via argilla @@ -59,7 +56,7 @@ numpy==1.23.5 # via # argilla # pandas -openpyxl==3.1.1 +openpyxl==3.1.2 # via unstructured (setup.py) packaging==23.0 # via argilla @@ -71,7 +68,7 @@ pillow==9.4.0 # via # python-pptx # unstructured (setup.py) -pydantic==1.10.6 +pydantic==1.10.7 # via argilla pygments==2.14.0 # via rich @@ -87,7 +84,7 @@ python-pptx==0.6.21 # via unstructured (setup.py) pytz==2022.7.1 # via pandas -regex==2022.10.31 +regex==2023.3.23 # via nltk requests==2.28.2 # via unstructured (setup.py) @@ -110,7 +107,7 @@ typing-extensions==4.5.0 # via # pydantic # rich -urllib3==1.26.14 +urllib3==1.26.15 # via requests wrapt==1.14.1 # via diff --git a/requirements/build.txt b/requirements/build.txt index 3e5217d8c8..1522236949 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -8,7 +8,7 @@ alabaster==0.7.13 # via sphinx babel==2.12.1 # via sphinx -beautifulsoup4==4.11.2 +beautifulsoup4==4.12.0 # via furo certifi==2022.12.7 # via @@ -20,13 +20,13 @@ docutils==0.18.1 # via # sphinx # sphinx-rtd-theme -furo==2022.12.7 +furo==2023.3.23 # via -r requirements/build.in idna==3.4 # via requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via sphinx jinja2==3.1.2 # via sphinx @@ -52,6 +52,7 @@ sphinx==6.1.3 # furo # sphinx-basic-ng # sphinx-rtd-theme + # sphinxcontrib-jquery sphinx-basic-ng==1.0.0b1 # via furo sphinx-rtd-theme==1.2.0rc3 @@ -62,7 +63,7 @@ sphinxcontrib-devhelp==1.0.2 # via sphinx sphinxcontrib-htmlhelp==2.0.1 # via sphinx -sphinxcontrib-jquery==3.0.0 +sphinxcontrib-jquery==4.1 # via sphinx-rtd-theme sphinxcontrib-jsmath==1.0.1 # via sphinx @@ -70,10 +71,7 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -urllib3==1.26.14 +urllib3==1.26.15 # via requests zipp==3.15.0 # via importlib-metadata - -# The following packages are considered to be unsafe in 
a requirements file: -# setuptools diff --git a/requirements/dev.txt b/requirements/dev.txt index 548be76bce..7dc7c77372 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -25,7 +25,7 @@ attrs==22.2.0 # via jsonschema backcall==0.2.0 # via ipython -beautifulsoup4==4.11.2 +beautifulsoup4==4.12.0 # via nbconvert bleach==6.0.0 # via nbconvert @@ -37,7 +37,7 @@ cfgv==3.3.1 # via pre-commit click==8.1.3 # via pip-tools -comm==0.1.2 +comm==0.1.3 # via ipykernel debugpy==1.6.6 # via ipykernel @@ -51,25 +51,24 @@ executing==1.2.0 # via stack-data fastjsonschema==2.16.3 # via nbformat -filelock==3.9.0 +filelock==3.10.3 # via virtualenv fqdn==1.5.1 # via jsonschema -identify==2.5.19 +identify==2.5.21 # via pre-commit idna==3.4 # via # anyio # jsonschema -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via # jupyter-client # nbconvert importlib-resources==5.12.0 # via jsonschema -ipykernel==6.21.3 +ipykernel==6.22.0 # via - # ipywidgets # jupyter # jupyter-console # nbclassic @@ -86,7 +85,7 @@ ipython-genutils==0.2.0 # nbclassic # notebook # qtconsole -ipywidgets==8.0.4 +ipywidgets==8.0.5 # via jupyter isoduration==20.11.0 # via jsonschema @@ -106,7 +105,7 @@ jsonschema[format-nongpl]==4.17.3 # nbformat jupyter==1.0.0 # via -r requirements/dev.in -jupyter-client==8.0.3 +jupyter-client==8.1.0 # via # ipykernel # jupyter-console @@ -117,7 +116,7 @@ jupyter-client==8.0.3 # qtconsole jupyter-console==6.6.3 # via jupyter -jupyter-core==5.2.0 +jupyter-core==5.3.0 # via # -r requirements/dev.in # ipykernel @@ -132,7 +131,7 @@ jupyter-core==5.2.0 # qtconsole jupyter-events==0.6.3 # via jupyter-server -jupyter-server==2.4.0 +jupyter-server==2.5.0 # via # nbclassic # notebook-shim @@ -140,7 +139,7 @@ jupyter-server-terminals==0.4.4 # via jupyter-server jupyterlab-pygments==0.2.2 # via nbconvert -jupyterlab-widgets==3.0.5 +jupyterlab-widgets==3.0.6 # via ipywidgets markupsafe==2.1.2 # via @@ -156,13 +155,13 @@ nbclassic==0.5.3 # via notebook nbclient==0.7.2 # via 
nbconvert -nbconvert==7.2.9 +nbconvert==7.2.10 # via # jupyter # jupyter-server # nbclassic # notebook -nbformat==5.7.3 +nbformat==5.8.0 # via # jupyter-server # nbclassic @@ -186,6 +185,7 @@ packaging==23.0 # ipykernel # jupyter-server # nbconvert + # qtconsole # qtpy pandocfilters==1.5.0 # via nbconvert @@ -203,7 +203,7 @@ platformdirs==3.1.1 # via # jupyter-core # virtualenv -pre-commit==3.1.1 +pre-commit==3.2.0 # via -r requirements/dev.in prometheus-client==0.16.0 # via @@ -244,7 +244,7 @@ pyyaml==6.0 # via # jupyter-events # pre-commit -pyzmq==25.0.0 +pyzmq==25.0.2 # via # ipykernel # jupyter-client @@ -253,7 +253,7 @@ pyzmq==25.0.0 # nbclassic # notebook # qtconsole -qtconsole==5.4.0 +qtconsole==5.4.1 # via jupyter qtpy==2.3.0 # via qtconsole @@ -322,7 +322,7 @@ traitlets==5.9.0 # qtconsole uri-template==1.2.0 # via jsonschema -virtualenv==20.20.0 +virtualenv==20.21.0 # via pre-commit wcwidth==0.2.6 # via prompt-toolkit @@ -334,11 +334,11 @@ webencodings==0.5.1 # tinycss2 websocket-client==1.5.1 # via jupyter-server -wheel==0.38.4 +wheel==0.40.0 # via # -r requirements/dev.in # pip-tools -widgetsnbextension==4.0.5 +widgetsnbextension==4.0.6 # via ipywidgets zipp==3.15.0 # via diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index cde736dbe6..500332d579 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -6,7 +6,7 @@ # anyio==3.6.2 # via httpcore -argilla==1.4.0 +argilla==1.5.0 # via unstructured (setup.py) backoff==2.2.1 # via argilla @@ -28,9 +28,10 @@ deprecated==1.2.13 # via argilla et-xmlfile==1.1.0 # via openpyxl -filelock==3.9.0 +filelock==3.10.3 # via # huggingface-hub + # torch # transformers h11==0.14.0 # via httpcore @@ -38,15 +39,17 @@ httpcore==0.16.3 # via httpx httpx==0.23.3 # via argilla -huggingface-hub==0.13.1 +huggingface-hub==0.13.3 # via transformers idna==3.4 # via # anyio # requests # rfc3986 -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via markdown +jinja2==3.1.2 + # via 
torch joblib==1.2.0 # via # nltk @@ -58,10 +61,16 @@ lxml==4.9.2 # python-docx # python-pptx # unstructured (setup.py) -markdown==3.4.1 +markdown==3.4.3 # via unstructured (setup.py) +markupsafe==2.1.2 + # via jinja2 monotonic==1.6 # via argilla +mpmath==1.3.0 + # via sympy +networkx==3.0 + # via torch nltk==3.8.1 # via unstructured (setup.py) numpy==1.23.5 @@ -69,7 +78,7 @@ numpy==1.23.5 # argilla # pandas # transformers -openpyxl==3.1.1 +openpyxl==3.1.2 # via unstructured (setup.py) packaging==23.0 # via @@ -84,10 +93,12 @@ pillow==9.4.0 # via # python-pptx # unstructured (setup.py) -pydantic==1.10.6 +pydantic==1.10.7 # via argilla pygments==2.14.0 # via rich +pypandoc==1.11 + # via unstructured (setup.py) python-dateutil==2.8.2 # via pandas python-docx==0.8.11 @@ -102,7 +113,7 @@ pyyaml==6.0 # via # huggingface-hub # transformers -regex==2022.10.31 +regex==2023.3.23 # via # nltk # sacremoses @@ -130,9 +141,11 @@ sniffio==1.3.0 # anyio # httpcore # httpx +sympy==1.11.1 + # via torch tokenizers==0.13.2 # via transformers -torch==1.13.1 +torch==2.0.0 # via unstructured (setup.py) tqdm==4.65.0 # via @@ -141,7 +154,7 @@ tqdm==4.65.0 # nltk # sacremoses # transformers -transformers==4.26.1 +transformers==4.27.3 # via unstructured (setup.py) typing-extensions==4.5.0 # via @@ -149,7 +162,7 @@ typing-extensions==4.5.0 # pydantic # rich # torch -urllib3==1.26.14 +urllib3==1.26.15 # via requests wrapt==1.14.1 # via diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index 3fc264d856..3ba852ec47 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -16,7 +16,7 @@ anyio==3.6.2 # via # -r requirements/base.txt # httpcore -argilla==1.3.1 +argilla==1.5.0 # via # -r requirements/base.txt # unstructured (setup.py) @@ -50,7 +50,7 @@ cffi==1.15.1 # via # azure-datalake-store # cryptography -charset-normalizer==3.0.1 +charset-normalizer==3.1.0 # via # -r requirements/base.txt # aiohttp @@ -59,7 +59,11 @@ click==8.1.3 # via # -r 
requirements/base.txt # nltk -cryptography==39.0.1 +commonmark==0.9.1 + # via + # -r requirements/base.txt + # rich +cryptography==39.0.2 # via # adal # azure-identity @@ -78,7 +82,7 @@ frozenlist==1.3.3 # via # aiohttp # aiosignal -fsspec==2023.1.0 +fsspec==2023.3.0 # via # adlfs # unstructured (setup.py) @@ -101,7 +105,7 @@ idna==3.4 # requests # rfc3986 # yarl -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via # -r requirements/base.txt # markdown @@ -117,7 +121,7 @@ lxml==4.9.2 # python-docx # python-pptx # unstructured (setup.py) -markdown==3.4.1 +markdown==3.4.3 # via # -r requirements/base.txt # unstructured (setup.py) @@ -144,7 +148,7 @@ numpy==1.23.5 # -r requirements/base.txt # argilla # pandas -openpyxl==3.1.1 +openpyxl==3.1.2 # via # -r requirements/base.txt # unstructured (setup.py) @@ -166,14 +170,22 @@ portalocker==2.7.0 # via msal-extensions pycparser==2.21 # via cffi -pydantic==1.10.5 +pydantic==1.10.7 # via # -r requirements/base.txt # argilla +pygments==2.14.0 + # via + # -r requirements/base.txt + # rich pyjwt[crypto]==2.6.0 # via # adal # msal +pypandoc==1.11 + # via + # -r requirements/base.txt + # unstructured (setup.py) python-dateutil==2.8.2 # via # -r requirements/base.txt @@ -195,7 +207,7 @@ pytz==2022.7.1 # via # -r requirements/base.txt # pandas -regex==2022.10.31 +regex==2023.3.23 # via # -r requirements/base.txt # nltk @@ -211,6 +223,10 @@ rfc3986[idna2008]==1.5.0 # via # -r requirements/base.txt # httpx +rich==13.0.1 + # via + # -r requirements/base.txt + # argilla six==1.16.0 # via # -r requirements/base.txt @@ -224,7 +240,7 @@ sniffio==1.3.0 # anyio # httpcore # httpx -tqdm==4.64.1 +tqdm==4.65.0 # via # -r requirements/base.txt # argilla @@ -235,7 +251,8 @@ typing-extensions==4.5.0 # azure-core # azure-storage-blob # pydantic -urllib3==1.26.14 + # rich +urllib3==1.26.15 # via # -r requirements/base.txt # requests @@ -244,7 +261,7 @@ wrapt==1.14.1 # -r requirements/base.txt # argilla # deprecated -xlsxwriter==3.0.8 
+xlsxwriter==3.0.9 # via # -r requirements/base.txt # python-pptx diff --git a/requirements/ingest-github.txt b/requirements/ingest-github.txt index a5761e3fb7..80c81114c4 100644 --- a/requirements/ingest-github.txt +++ b/requirements/ingest-github.txt @@ -8,7 +8,7 @@ anyio==3.6.2 # via # -r requirements/base.txt # httpcore -argilla==1.4.0 +argilla==1.5.0 # via # -r requirements/base.txt # unstructured (setup.py) @@ -64,7 +64,7 @@ idna==3.4 # anyio # requests # rfc3986 -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via # -r requirements/base.txt # markdown @@ -78,7 +78,7 @@ lxml==4.9.2 # python-docx # python-pptx # unstructured (setup.py) -markdown==3.4.1 +markdown==3.4.3 # via # -r requirements/base.txt # unstructured (setup.py) @@ -95,7 +95,7 @@ numpy==1.23.5 # -r requirements/base.txt # argilla # pandas -openpyxl==3.1.1 +openpyxl==3.1.2 # via # -r requirements/base.txt # unstructured (setup.py) @@ -115,7 +115,7 @@ pillow==9.4.0 # unstructured (setup.py) pycparser==2.21 # via cffi -pydantic==1.10.6 +pydantic==1.10.7 # via # -r requirements/base.txt # argilla @@ -129,6 +129,10 @@ pyjwt==2.6.0 # via pygithub pynacl==1.5.0 # via pygithub +pypandoc==1.11 + # via + # -r requirements/base.txt + # unstructured (setup.py) python-dateutil==2.8.2 # via # -r requirements/base.txt @@ -149,7 +153,7 @@ pytz==2022.7.1 # via # -r requirements/base.txt # pandas -regex==2022.10.31 +regex==2023.3.23 # via # -r requirements/base.txt # nltk @@ -186,7 +190,7 @@ typing-extensions==4.5.0 # -r requirements/base.txt # pydantic # rich -urllib3==1.26.14 +urllib3==1.26.15 # via # -r requirements/base.txt # requests diff --git a/requirements/ingest-gitlab.txt b/requirements/ingest-gitlab.txt index 89bb088679..4bfa0510b1 100644 --- a/requirements/ingest-gitlab.txt +++ b/requirements/ingest-gitlab.txt @@ -8,7 +8,7 @@ anyio==3.6.2 # via # -r requirements/base.txt # httpcore -argilla==1.4.0 +argilla==1.5.0 # via # -r requirements/base.txt # unstructured (setup.py) @@ -61,7 +61,7 @@ 
idna==3.4 # anyio # requests # rfc3986 -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via # -r requirements/base.txt # markdown @@ -75,7 +75,7 @@ lxml==4.9.2 # python-docx # python-pptx # unstructured (setup.py) -markdown==3.4.1 +markdown==3.4.3 # via # -r requirements/base.txt # unstructured (setup.py) @@ -92,7 +92,7 @@ numpy==1.23.5 # -r requirements/base.txt # argilla # pandas -openpyxl==3.1.1 +openpyxl==3.1.2 # via # -r requirements/base.txt # unstructured (setup.py) @@ -110,7 +110,7 @@ pillow==9.4.0 # -r requirements/base.txt # python-pptx # unstructured (setup.py) -pydantic==1.10.6 +pydantic==1.10.7 # via # -r requirements/base.txt # argilla @@ -118,6 +118,10 @@ pygments==2.14.0 # via # -r requirements/base.txt # rich +pypandoc==1.11 + # via + # -r requirements/base.txt + # unstructured (setup.py) python-dateutil==2.8.2 # via # -r requirements/base.txt @@ -140,7 +144,7 @@ pytz==2022.7.1 # via # -r requirements/base.txt # pandas -regex==2022.10.31 +regex==2023.3.23 # via # -r requirements/base.txt # nltk @@ -180,7 +184,7 @@ typing-extensions==4.5.0 # -r requirements/base.txt # pydantic # rich -urllib3==1.26.14 +urllib3==1.26.15 # via # -r requirements/base.txt # requests diff --git a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index 1099c632aa..1f98725f5e 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -8,7 +8,7 @@ anyio==3.6.2 # via # -r requirements/base.txt # httpcore -argilla==1.4.0 +argilla==1.5.0 # via # -r requirements/base.txt # unstructured (setup.py) @@ -47,7 +47,7 @@ et-xmlfile==1.1.0 # openpyxl google-api-core==2.11.0 # via google-api-python-client -google-api-python-client==2.80.0 +google-api-python-client==2.82.0 # via unstructured (setup.py) google-auth==2.16.2 # via @@ -56,7 +56,7 @@ google-auth==2.16.2 # google-auth-httplib2 google-auth-httplib2==0.1.0 # via google-api-python-client -googleapis-common-protos==1.58.0 +googleapis-common-protos==1.59.0 # via 
google-api-core h11==0.14.0 # via @@ -66,7 +66,7 @@ httpcore==0.16.3 # via # -r requirements/base.txt # httpx -httplib2==0.21.0 +httplib2==0.22.0 # via # google-api-python-client # google-auth-httplib2 @@ -80,7 +80,7 @@ idna==3.4 # anyio # requests # rfc3986 -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via # -r requirements/base.txt # markdown @@ -94,7 +94,7 @@ lxml==4.9.2 # python-docx # python-pptx # unstructured (setup.py) -markdown==3.4.1 +markdown==3.4.3 # via # -r requirements/base.txt # unstructured (setup.py) @@ -111,7 +111,7 @@ numpy==1.23.5 # -r requirements/base.txt # argilla # pandas -openpyxl==3.1.1 +openpyxl==3.1.2 # via # -r requirements/base.txt # unstructured (setup.py) @@ -139,7 +139,7 @@ pyasn1==0.4.8 # rsa pyasn1-modules==0.2.8 # via google-auth -pydantic==1.10.6 +pydantic==1.10.7 # via # -r requirements/base.txt # argilla @@ -147,6 +147,10 @@ pygments==2.14.0 # via # -r requirements/base.txt # rich +pypandoc==1.11 + # via + # -r requirements/base.txt + # unstructured (setup.py) pyparsing==3.0.9 # via httplib2 python-dateutil==2.8.2 @@ -169,7 +173,7 @@ pytz==2022.7.1 # via # -r requirements/base.txt # pandas -regex==2022.10.31 +regex==2023.3.23 # via # -r requirements/base.txt # nltk @@ -212,7 +216,7 @@ typing-extensions==4.5.0 # rich uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.14 +urllib3==1.26.15 # via # -r requirements/base.txt # requests diff --git a/requirements/ingest-reddit.txt b/requirements/ingest-reddit.txt index 0e6ef3abbe..f3da59745e 100644 --- a/requirements/ingest-reddit.txt +++ b/requirements/ingest-reddit.txt @@ -8,7 +8,7 @@ anyio==3.6.2 # via # -r requirements/base.txt # httpcore -argilla==1.4.0 +argilla==1.5.0 # via # -r requirements/base.txt # unstructured (setup.py) @@ -61,7 +61,7 @@ idna==3.4 # anyio # requests # rfc3986 -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via # -r requirements/base.txt # markdown @@ -75,7 +75,7 @@ lxml==4.9.2 # python-docx # python-pptx # unstructured 
(setup.py) -markdown==3.4.1 +markdown==3.4.3 # via # -r requirements/base.txt # unstructured (setup.py) @@ -92,7 +92,7 @@ numpy==1.23.5 # -r requirements/base.txt # argilla # pandas -openpyxl==3.1.1 +openpyxl==3.1.2 # via # -r requirements/base.txt # unstructured (setup.py) @@ -114,7 +114,7 @@ praw==7.7.0 # via unstructured (setup.py) prawcore==2.3.0 # via praw -pydantic==1.10.6 +pydantic==1.10.7 # via # -r requirements/base.txt # argilla @@ -122,6 +122,10 @@ pygments==2.14.0 # via # -r requirements/base.txt # rich +pypandoc==1.11 + # via + # -r requirements/base.txt + # unstructured (setup.py) python-dateutil==2.8.2 # via # -r requirements/base.txt @@ -142,7 +146,7 @@ pytz==2022.7.1 # via # -r requirements/base.txt # pandas -regex==2022.10.31 +regex==2023.3.23 # via # -r requirements/base.txt # nltk @@ -182,7 +186,7 @@ typing-extensions==4.5.0 # rich update-checker==0.18.0 # via praw -urllib3==1.26.14 +urllib3==1.26.15 # via # -r requirements/base.txt # requests diff --git a/requirements/ingest-s3.txt b/requirements/ingest-s3.txt index 10b755913b..6a653f6f89 100644 --- a/requirements/ingest-s3.txt +++ b/requirements/ingest-s3.txt @@ -18,7 +18,7 @@ anyio==3.6.2 # via # -r requirements/base.txt # httpcore -argilla==1.4.0 +argilla==1.5.0 # via # -r requirements/base.txt # unstructured (setup.py) @@ -87,7 +87,7 @@ idna==3.4 # requests # rfc3986 # yarl -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via # -r requirements/base.txt # markdown @@ -103,7 +103,7 @@ lxml==4.9.2 # python-docx # python-pptx # unstructured (setup.py) -markdown==3.4.1 +markdown==3.4.3 # via # -r requirements/base.txt # unstructured (setup.py) @@ -124,7 +124,7 @@ numpy==1.23.5 # -r requirements/base.txt # argilla # pandas -openpyxl==3.1.1 +openpyxl==3.1.2 # via # -r requirements/base.txt # unstructured (setup.py) @@ -142,7 +142,7 @@ pillow==9.4.0 # -r requirements/base.txt # python-pptx # unstructured (setup.py) -pydantic==1.10.6 +pydantic==1.10.7 # via # -r requirements/base.txt # 
argilla @@ -150,6 +150,10 @@ pygments==2.14.0 # via # -r requirements/base.txt # rich +pypandoc==1.11 + # via + # -r requirements/base.txt + # unstructured (setup.py) python-dateutil==2.8.2 # via # -r requirements/base.txt @@ -171,7 +175,7 @@ pytz==2022.7.1 # via # -r requirements/base.txt # pandas -regex==2022.10.31 +regex==2023.3.23 # via # -r requirements/base.txt # nltk @@ -210,7 +214,7 @@ typing-extensions==4.5.0 # aioitertools # pydantic # rich -urllib3==1.26.14 +urllib3==1.26.15 # via # -r requirements/base.txt # botocore diff --git a/requirements/ingest-wikipedia.txt b/requirements/ingest-wikipedia.txt index f3b1c0f061..1173152fda 100644 --- a/requirements/ingest-wikipedia.txt +++ b/requirements/ingest-wikipedia.txt @@ -8,7 +8,7 @@ anyio==3.6.2 # via # -r requirements/base.txt # httpcore -argilla==1.4.0 +argilla==1.5.0 # via # -r requirements/base.txt # unstructured (setup.py) @@ -16,7 +16,7 @@ backoff==2.2.1 # via # -r requirements/base.txt # argilla -beautifulsoup4==4.11.2 +beautifulsoup4==4.12.0 # via wikipedia certifi==2022.12.7 # via @@ -63,7 +63,7 @@ idna==3.4 # anyio # requests # rfc3986 -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via # -r requirements/base.txt # markdown @@ -77,7 +77,7 @@ lxml==4.9.2 # python-docx # python-pptx # unstructured (setup.py) -markdown==3.4.1 +markdown==3.4.3 # via # -r requirements/base.txt # unstructured (setup.py) @@ -94,7 +94,7 @@ numpy==1.23.5 # -r requirements/base.txt # argilla # pandas -openpyxl==3.1.1 +openpyxl==3.1.2 # via # -r requirements/base.txt # unstructured (setup.py) @@ -112,7 +112,7 @@ pillow==9.4.0 # -r requirements/base.txt # python-pptx # unstructured (setup.py) -pydantic==1.10.6 +pydantic==1.10.7 # via # -r requirements/base.txt # argilla @@ -120,6 +120,10 @@ pygments==2.14.0 # via # -r requirements/base.txt # rich +pypandoc==1.11 + # via + # -r requirements/base.txt + # unstructured (setup.py) python-dateutil==2.8.2 # via # -r requirements/base.txt @@ -140,7 +144,7 @@ pytz==2022.7.1 # 
via # -r requirements/base.txt # pandas -regex==2022.10.31 +regex==2023.3.23 # via # -r requirements/base.txt # nltk @@ -179,7 +183,7 @@ typing-extensions==4.5.0 # -r requirements/base.txt # pydantic # rich -urllib3==1.26.14 +urllib3==1.26.15 # via # -r requirements/base.txt # requests diff --git a/requirements/local-inference.txt b/requirements/local-inference.txt index 9def8ca53b..b08816dbce 100644 --- a/requirements/local-inference.txt +++ b/requirements/local-inference.txt @@ -10,7 +10,7 @@ anyio==3.6.2 # via # httpcore # starlette -argilla==1.4.0 +argilla==1.5.0 # via unstructured (setup.py) backoff==2.2.1 # via argilla @@ -46,15 +46,16 @@ effdet==0.3.0 # via layoutparser et-xmlfile==1.1.0 # via openpyxl -fastapi==0.94.0 +fastapi==0.95.0 # via unstructured-inference -filelock==3.9.0 +filelock==3.10.3 # via # huggingface-hub + # torch # transformers flatbuffers==23.3.3 # via onnxruntime -fonttools==4.39.0 +fonttools==4.39.2 # via matplotlib h11==0.14.0 # via @@ -64,7 +65,7 @@ httpcore==0.16.3 # via httpx httpx==0.23.3 # via argilla -huggingface-hub==0.13.1 +huggingface-hub==0.13.3 # via # timm # transformers @@ -76,12 +77,14 @@ idna==3.4 # anyio # requests # rfc3986 -importlib-metadata==6.0.0 +importlib-metadata==6.1.0 # via markdown importlib-resources==5.12.0 # via matplotlib iopath==0.1.10 # via layoutparser +jinja2==3.1.2 + # via torch joblib==1.2.0 # via nltk kiwisolver==1.4.4 @@ -93,14 +96,18 @@ lxml==4.9.2 # python-docx # python-pptx # unstructured (setup.py) -markdown==3.4.1 +markdown==3.4.3 # via unstructured (setup.py) +markupsafe==2.1.2 + # via jinja2 matplotlib==3.7.1 # via pycocotools monotonic==1.6 # via argilla mpmath==1.3.0 # via sympy +networkx==3.0 + # via torch nltk==3.8.1 # via unstructured (setup.py) numpy==1.23.5 @@ -124,7 +131,7 @@ opencv-python==4.6.0.66 # via # layoutparser # unstructured-inference -openpyxl==3.1.1 +openpyxl==3.1.2 # via unstructured (setup.py) packaging==23.0 # via @@ -163,12 +170,14 @@ pycocotools==2.0.6 # via effdet 
pycparser==2.21 # via cffi -pydantic==1.10.6 +pydantic==1.10.7 # via # argilla # fastapi pygments==2.14.0 # via rich +pypandoc==1.11 + # via unstructured (setup.py) pyparsing==3.0.9 # via matplotlib pytesseract==0.3.10 @@ -194,7 +203,7 @@ pyyaml==6.0 # omegaconf # timm # transformers -regex==2022.10.31 +regex==2023.3.23 # via # nltk # transformers @@ -217,21 +226,23 @@ sniffio==1.3.0 # anyio # httpcore # httpx -starlette==0.26.0.post1 +starlette==0.26.1 # via fastapi sympy==1.11.1 - # via onnxruntime + # via + # onnxruntime + # torch timm==0.6.12 # via effdet tokenizers==0.13.2 # via transformers -torch==1.13.1 +torch==2.0.0 # via # effdet # layoutparser # timm # torchvision -torchvision==0.14.1 +torchvision==0.15.1 # via # effdet # layoutparser @@ -243,7 +254,7 @@ tqdm==4.65.0 # iopath # nltk # transformers -transformers==4.26.1 +transformers==4.27.3 # via unstructured-inference typing-extensions==4.5.0 # via @@ -253,12 +264,11 @@ typing-extensions==4.5.0 # rich # starlette # torch - # torchvision unstructured-inference==0.2.11 # via unstructured (setup.py) -urllib3==1.26.14 +urllib3==1.26.15 # via requests -uvicorn==0.21.0 +uvicorn==0.21.1 # via unstructured-inference wand==0.6.11 # via pdfplumber diff --git a/requirements/test.in b/requirements/test.in index fc2e0a69b4..2f2c10bf51 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -8,6 +8,7 @@ flake8 mypy types-Markdown pytest-cov +pytest-mock label_studio_sdk types-requests vcrpy diff --git a/requirements/test.txt b/requirements/test.txt index b40c5f930e..b09dd5fa52 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -4,9 +4,6 @@ # # pip-compile requirements/test.in # ---extra-index-url https://pypi.ngc.nvidia.com ---trusted-host pypi.ngc.nvidia.com - appdirs==1.4.4 # via label-studio-tools attrs==22.2.0 @@ -23,7 +20,7 @@ click==8.1.3 # via # -r requirements/test.in # black -coverage[toml]==7.2.1 +coverage[toml]==7.2.2 # via # -r requirements/test.in # pytest-cov @@ -67,19 +64,23 @@ 
pluggy==1.0.0 # via pytest pycodestyle==2.10.0 # via flake8 -pydantic==1.10.6 +pydantic==1.10.7 # via label-studio-sdk pyflakes==3.0.1 # via flake8 pytest==7.2.2 - # via pytest-cov + # via + # pytest-cov + # pytest-mock pytest-cov==4.0.0 # via -r requirements/test.in +pytest-mock==3.10.0 + # via -r requirements/test.in pyyaml==6.0 # via vcrpy requests==2.28.2 # via label-studio-sdk -ruff==0.0.256 +ruff==0.0.259 # via -r requirements/test.in six==1.16.0 # via vcrpy @@ -91,7 +92,7 @@ tomli==2.0.1 # pytest types-markdown==3.4.2.5 # via -r requirements/test.in -types-requests==2.28.11.15 +types-requests==2.28.11.16 # via -r requirements/test.in types-urllib3==1.26.25.8 # via types-requests diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 0dd1df9b9b..1d3e8666d5 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -4,6 +4,7 @@ import pytest import requests +from requests.models import Response from unstructured.documents.elements import PageBreak from unstructured.partition.html import partition_html @@ -86,6 +87,25 @@ def test_partition_html_from_url_raises_with_bad_content_type(): partition_html(url="https://fake.url") +def test_partition_from_url_uses_headers(mocker): + test_url = "https://example.com" + test_headers = {"User-Agent": "test"} + + response = Response() + response.status_code = 200 + response._content = ( + b"

What do i know? Who needs to know it?

" + ) + response.headers = {"Content-Type": "text/html"} + + mock_get = mocker.patch("requests.get", return_value=response) + + partition_html(url=test_url, headers=test_headers) + + # Check if requests.get was called with the correct arguments + mock_get.assert_called_once_with(test_url, headers=test_headers) + + def test_partition_html_raises_with_none_specified(): with pytest.raises(ValueError): partition_html() diff --git a/unstructured/__version__.py b/unstructured/__version__.py index df7694dbf4..4db40458d7 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.5.7-dev2" # pragma: no cover +__version__ = "0.5.7-dev3" # pragma: no cover diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py index 29fb226d75..7783713ffe 100644 --- a/unstructured/partition/html.py +++ b/unstructured/partition/html.py @@ -1,4 +1,4 @@ -from typing import IO, List, Optional +from typing import IO, Dict, List, Optional import requests @@ -20,6 +20,7 @@ def partition_html( encoding: Optional[str] = None, include_page_breaks: bool = False, include_metadata: bool = True, + headers: Dict[str, str] = {}, parser: VALID_PARSERS = None, ) -> List[Element]: """Partitions an HTML document into its constituent elements. @@ -67,7 +68,7 @@ def partition_html( document = HTMLDocument.from_string(_text, parser=parser) elif url is not None: - response = requests.get(url) + response = requests.get(url, headers=headers) if not response.ok: raise ValueError(f"URL return an error: {response.status_code}")