Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update os dir naming for images #157

Merged
merged 4 commits into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,5 @@ dmypy.json

# VSCode
.vscode/

sample-docs/*_images
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.5.7

* Hotfix: extracted images are now stored in a `<filename>_images` directory, so the output directory no longer conflicts with the source PDF when the PDF has no file extension

## 0.5.6

* Update the `annotate` and `_get_image_array` methods of `PageLayout` to get the image from the `image_path` property if the `image` property is `None`.
Expand Down
33 changes: 33 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# syntax=docker/dockerfile:experimental
# Image for building and testing unstructured_inference inside a container.
# Stages: base (environment setup), deps (Python requirements), code (package source).
FROM quay.io/unstructured-io/base-images:rocky8.7-3 as base

# Build argument; consumed by the `pip install pip==${PIP_VERSION}` step in the deps stage.
ARG PIP_VERSION

# Set up environment
# HOME is also used as the working directory, so later relative COPY/RUN paths resolve under /home/.
ENV HOME /home/
WORKDIR ${HOME}
# Create the .ssh directory without group/other permissions and append github.com's RSA
# host key to known_hosts (the hard-coded /home/.ssh path is the same directory as ${HOME}/.ssh).
RUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \
&& ssh-keyscan -t rsa github.com >> /home/.ssh/known_hosts
# Append HOME to PYTHONPATH; the package source is copied into this directory in the code stage.
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
# Put /home/usr/.local/bin ahead of the existing PATH entries.
ENV PATH="/home/usr/.local/bin:${PATH}"

FROM base as deps
# Copy and install Unstructured
COPY requirements requirements

# Single RUN step: pin pip to PIP_VERSION, install the "Development Tools" dnf group,
# install the base/test/dev requirement sets plus unstructured.PaddleOCR, then remove
# the dnf group and clean the dnf cache (presumably to keep the resulting layer small).
RUN python3.8 -m pip install pip==${PIP_VERSION} && \
dnf -y groupinstall "Development Tools" && \
pip install --no-cache -r requirements/base.txt && \
pip install --no-cache -r requirements/test.txt && \
pip install --no-cache -r requirements/dev.txt && \
pip install "unstructured.PaddleOCR" && \
dnf -y groupremove "Development Tools" && \
dnf clean all

FROM deps as code
# PACKAGE_NAME is only referenced by the commented-out pytest CMD below.
ARG PACKAGE_NAME=unstructured_inference
COPY unstructured_inference unstructured_inference

# The active default command is an interactive bash shell; the commented-out CMD lines
# are alternative pytest entry points that are currently disabled.
#CMD ["pytest -m \"not slow\" test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing"]
CMD ["/bin/bash"]
#CMD ["bash -c pytest test_unstructured_inference"]
21 changes: 21 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
PACKAGE_NAME := unstructured_inference
PIP_VERSION := 23.1.2
CURRENT_DIR := $(shell pwd)


.PHONY: help
Expand Down Expand Up @@ -116,3 +117,23 @@ version-sync:
.PHONY: check-coverage
check-coverage:
coverage report --fail-under=95

##########
# Docker #
##########

# Docker targets are provided for convenience only and are not required in a standard development environment

# Image tag used by the docker targets. `?=` only assigns when the variable is not
# already set, so it can be overridden from the environment or the make command line.
DOCKER_IMAGE ?= unstructured-inference:dev

# Build the image by running scripts/docker-build.sh with PIP_VERSION and
# DOCKER_IMAGE_NAME set in its environment.
.PHONY: docker-build
docker-build:
PIP_VERSION=${PIP_VERSION} DOCKER_IMAGE_NAME=${DOCKER_IMAGE} ./scripts/docker-build.sh

# Build the image, then run pytest in a throwaway container (--rm) with the local
# test_unstructured_inference and sample-docs directories mounted under /home.
# When TEST_NAME is set it is forwarded to pytest as `-k $(TEST_NAME)`; otherwise the
# whole test_unstructured_inference directory is run.
.PHONY: docker-test
docker-test: docker-build
docker run --rm \
-v ${CURRENT_DIR}/test_unstructured_inference:/home/test_unstructured_inference \
-v ${CURRENT_DIR}/sample-docs:/home/sample-docs \
$(DOCKER_IMAGE) \
bash -c "pytest $(if $(TEST_NAME),-k $(TEST_NAME),) test_unstructured_inference"
9 changes: 7 additions & 2 deletions scripts/docker-build.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
#!/usr/bin/env bash

set -euo pipefail
# pip version forwarded to the image build as a --build-arg; defaults to 23.1.2 when unset.
PIP_VERSION="${PIP_VERSION:-23.1.2}"
# Tag for the built image. The Makefile invokes this script with DOCKER_IMAGE_NAME set,
# so honor it here (previously it was ignored and the tag was hard-coded); fall back to
# the former hard-coded tag when the variable is unset or empty. The `:-` form is safe
# under `set -u`.
DOCKER_IMAGE="${DOCKER_IMAGE_NAME:-unstructured-inference:dev}"

DOCKER_BUILDKIT=1 docker buildx build --load --platform=linux/amd64 -f Dockerfile \
DOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile \
--build-arg PIP_VERSION="$PIP_VERSION" \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--progress plain \
-t unstructured-inference-dev:latest .
-t "$DOCKER_IMAGE" .)

DOCKER_BUILDKIT=1 "${DOCKER_BUILD_CMD[@]}"
12 changes: 11 additions & 1 deletion test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,7 +836,17 @@ def test_create_image_output_dir():
with tempfile.TemporaryDirectory() as tmpdir:
tmp_f_path = os.path.join(tmpdir, "loremipsum.pdf")
output_dir = create_image_output_dir(tmp_f_path)
expected_output_dir = os.path.join(os.path.abspath(tmpdir), "loremipsum")
expected_output_dir = os.path.join(os.path.abspath(tmpdir), "loremipsum_images")
assert os.path.isdir(output_dir)
assert os.path.isabs(output_dir)
assert output_dir == expected_output_dir


def test_create_image_output_dir_no_ext():
    """A source file without an extension still gets a sibling `<name>_images` directory."""
    with tempfile.TemporaryDirectory() as scratch_dir:
        source_path = os.path.join(scratch_dir, "loremipsum_no_ext")
        expected_dir = os.path.join(
            os.path.abspath(scratch_dir),
            "loremipsum_no_ext_images",
        )

        created_dir = create_image_output_dir(source_path)

        # The directory must exist on disk, be absolute, and sit next to the source file.
        assert os.path.isdir(created_dir)
        assert os.path.isabs(created_dir)
        assert created_dir == expected_dir
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.5.6" # pragma: no cover
__version__ = "0.5.7" # pragma: no cover
4 changes: 3 additions & 1 deletion unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,8 @@ def create_image_output_dir(
directory path"""
parent_dir = os.path.abspath(os.path.dirname(filename))
f_name_without_extension = os.path.splitext(os.path.basename(filename))[0]
output_dir = os.path.join(parent_dir, f_name_without_extension)

# Add a suffix to avoid conflicts in case original file doesn't have an extension
output_dir = os.path.join(parent_dir, f"{f_name_without_extension}_images")
os.makedirs(output_dir, exist_ok=True)
return output_dir