Skip to content
Merged
1 change: 1 addition & 0 deletions autocorpus/IAO_dicts/IAO_FINAL_MAPPING.txt
Original file line number Diff line number Diff line change
Expand Up @@ -252,3 +252,4 @@ version changes version number section
visual abstract graphical abstract section
web resources references section
web site references references section
document title document title
1 change: 1 addition & 0 deletions autocorpus/IAO_dicts/IAO_term_to_ID.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,6 @@ requirements section IAO:0000641
statistical analysis section IAO:0000644
tables section IAO:0000645
descriptive data section IAO:0000701
document title IAO:0000305
disclosure section IAO:CUIless
highlights section IAO:CUIless
8 changes: 4 additions & 4 deletions autocorpus/file_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .autocorpus import Autocorpus
from .file_type import FileType, check_file_type
from .html import process_html_article
from .parse_xml import convert_xml_to_json


def process_file(
Expand Down Expand Up @@ -35,10 +36,9 @@
file_path, *process_html_article(config, file_path, linked_tables)
)
case FileType.XML:
raise NotImplementedError(
f"Could not process file {file_path}. Process XML files by running:\n\t"
f"python -m autocorpus.parse_xml {file_path}"
)
main_text = convert_xml_to_json(file_path)

Check warning on line 39 in autocorpus/file_processing.py

View check run for this annotation

Codecov / codecov/patch

autocorpus/file_processing.py#L39

Added line #L39 was not covered by tests

return Autocorpus(file_path, main_text, dict(), dict())

Check warning on line 41 in autocorpus/file_processing.py

View check run for this annotation

Codecov / codecov/patch

autocorpus/file_processing.py#L41

Added line #L41 was not covered by tests
case FileType.PDF:
try:
from .pdf import extract_pdf_content
Expand Down
3,150 changes: 1,937 additions & 1,213 deletions autocorpus/parse_xml.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ disallow_any_generics = true
warn_unreachable = true
warn_unused_ignores = true
# disallow_untyped_defs = true
exclude = [".venv/", "docs/", "autocorpus/parse_xml.py", "site/"]
exclude = [".venv/", "docs/", "site/"]

[[tool.mypy.overrides]]
module = "tests.*"
Expand Down
1,039 changes: 1,039 additions & 0 deletions tests/data/PMC/xml/PMC8885717.xml

Large diffs are not rendered by default.

791 changes: 791 additions & 0 deletions tests/data/PMC/xml/PMC8885717_bioc.json

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions tests/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,26 @@ def _run_html_regression_test(
assert not expected_tables


@pytest.mark.parametrize("input_file", ["PMC/xml/PMC8885717.xml"])
def test_xml(data_path: Path, input_file: str) -> None:
    """Check that XML conversion reproduces the stored reference BioC JSON.

    Args:
        data_path: Base directory that ``input_file`` is resolved against.
        input_file: Relative path of the XML input. The expected output is
            read from a sibling file named ``<stem>_bioc.json``.
    """
    from autocorpus.parse_xml import convert_xml_to_json

    pmc_example_path = data_path / input_file
    # Derive the reference file from the file name only: a plain string
    # replace of ".xml" would also rewrite matching text in a parent
    # directory name and point at a non-existent location.
    expected_name = f"{pmc_example_path.stem}_bioc.json"
    expected_path = pmc_example_path.with_name(expected_name)
    with expected_path.open(encoding="utf-8") as f:
        expected_bioc = json.load(f)

    bioc = convert_xml_to_json(pmc_example_path)
    # Only remove date because XML doesn't have an "inputfile" field.
    bioc.pop("date")
    expected_bioc.pop("date")

    assert bioc == expected_bioc


@pytest.mark.skip_ci_macos
@pytest.mark.parametrize(
"input_file, config",
Expand Down