Skip to content

Commit 06bcc9f

Browse files
committed
tests(html): add tests and fix bugs
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
1 parent 6b57b88 commit 06bcc9f

File tree

11 files changed

+640
-695
lines changed

11 files changed

+640
-695
lines changed

docling/backend/abstract_backend.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,20 @@
1212

1313

1414
class BaseBackendOptions(BaseModel):
15-
"""Common options for all declarative document backends.
15+
"""Common options for all declarative document backends."""
1616

17-
This is placeholder to define all common options among declarative backends.
18-
"""
17+
enable_remote_fetch: bool = Field(
18+
False, description="Enable remote resource fetching."
19+
)
20+
enable_local_fetch: bool = Field(
21+
False, description="Enable local resource fetching."
22+
)
1923

2024

2125
class DeclarativeBackendOptions(BaseBackendOptions):
2226
"""Default backend options for a declarative document backend."""
2327

24-
kind: Literal["declarative"] = "declarative"
28+
kind: Literal["declarative"] = Field("declarative", exclude=True, repr=False)
2529

2630

2731
class HTMLBackendOptions(BaseBackendOptions):
@@ -30,7 +34,7 @@ class HTMLBackendOptions(BaseBackendOptions):
3034
This class can be extended to include options specific to HTML processing.
3135
"""
3236

33-
kind: Literal["html"] = "html"
37+
kind: Literal["html"] = Field("html", exclude=True, repr=False)
3438
image_fetch: bool = Field(
3539
False,
3640
description=(
@@ -53,9 +57,6 @@ class HTMLBackendOptions(BaseBackendOptions):
5357

5458

5559
class AbstractDocumentBackend(ABC):
56-
enable_remote_fetch: bool = False
57-
enable_local_fetch: bool = False
58-
5960
@abstractmethod
6061
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
6162
self.file = in_doc.file

docling/backend/html_backend.py

Lines changed: 38 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import base64
22
import logging
3+
import os
34
import re
5+
import warnings
46
from contextlib import contextmanager
57
from copy import deepcopy
68
from io import BytesIO
@@ -313,15 +315,15 @@ def _resolve_relative_path(self, loc: str) -> str:
313315
abs_loc = loc
314316

315317
if self.base_path:
316-
if not loc.startswith(("http://", "https://", "data:", "file://", "/")):
318+
if loc.startswith("//"):
319+
# Protocol-relative URL - default to https
320+
abs_loc = "https:" + loc
321+
elif not loc.startswith(("http://", "https://", "data:", "file://")):
317322
if HTMLDocumentBackend._is_remote_url(self.base_path): # remote fetch
318323
abs_loc = urljoin(self.base_path, loc)
319324
elif self.base_path: # local fetch
320325
# For local files, resolve relative to the HTML file location
321326
abs_loc = str(Path(self.base_path).parent / loc)
322-
elif loc.startswith("//"):
323-
# Protocol-relative URL - default to https
324-
abs_loc = "https:" + loc
325327

326328
_log.debug(f"Resolved location {loc} to {abs_loc}")
327329
return abs_loc
@@ -1198,13 +1200,18 @@ def get_img_hyperlink(img_tag):
11981200
def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
11991201
try:
12001202
img_data = self._load_image_data(src_url)
1201-
12021203
if img_data:
12031204
img = Image.open(BytesIO(img_data))
12041205
return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
1205-
1206-
except (ValidationError, UnidentifiedImageError, Exception) as e:
1207-
_log.warning(f"Could not process image (src={src_url}): {e}")
1206+
except (
1207+
requests.HTTPError,
1208+
ValidationError,
1209+
UnidentifiedImageError,
1210+
OperationNotAllowed,
1211+
TypeError,
1212+
ValueError,
1213+
) as e:
1214+
warnings.warn(f"Could not process an image from {src_url}: {e}")
12081215

12091216
return None
12101217

@@ -1213,33 +1220,33 @@ def _load_image_data(self, src_loc: str) -> Optional[bytes]:
12131220
_log.debug(f"Skipping SVG file: {src_loc}")
12141221
return None
12151222

1216-
try:
1217-
if HTMLDocumentBackend._is_remote_url(src_loc):
1218-
if not self.enable_remote_fetch:
1219-
raise OperationNotAllowed(
1220-
"Fetching remote resources is only allowed when set explicitly. "
1221-
"options.enable_remote_fetch=True."
1222-
)
1223-
response = requests.get(src_loc, stream=True)
1224-
response.raise_for_status()
1225-
return response.content
1226-
elif src_loc.startswith("data:"):
1227-
data = re.sub(r"^data:image/.+;base64,", "", src_loc)
1228-
return base64.b64decode(data)
1229-
1230-
if src_loc.startswith("file://"):
1231-
src_loc = src_loc[7:]
1232-
1233-
if not self.enable_local_fetch:
1223+
if HTMLDocumentBackend._is_remote_url(src_loc):
1224+
if not self.options.enable_remote_fetch:
12341225
raise OperationNotAllowed(
1235-
"Fetching local resources is only allowed when set explicitly. "
1236-
"options.enable_local_fetch=True."
1226+
"Fetching remote resources is only allowed when set explicitly. "
1227+
"Set options.enable_remote_fetch=True."
12371228
)
1229+
response = requests.get(src_loc, stream=True)
1230+
response.raise_for_status()
1231+
return response.content
1232+
elif src_loc.startswith("data:"):
1233+
data = re.sub(r"^data:image/.+;base64,", "", src_loc)
1234+
return base64.b64decode(data)
1235+
1236+
if src_loc.startswith("file://"):
1237+
src_loc = src_loc[7:]
1238+
1239+
if not self.options.enable_local_fetch:
1240+
raise OperationNotAllowed(
1241+
"Fetching local resources is only allowed when set explicitly. "
1242+
"Set options.enable_local_fetch=True."
1243+
)
1244+
# Check that the file exists and is readable before opening it
1245+
if os.path.isfile(src_loc) and os.access(src_loc, os.R_OK):
12381246
with open(src_loc, "rb") as f:
12391247
return f.read()
1240-
except Exception as e:
1241-
_log.warning(f"Could not load image data: {e}")
1242-
return None
1248+
else:
1249+
raise ValueError("File does not exist or it is not readable.")
12431250

12441251
@staticmethod
12451252
def get_text(item: PageElement) -> str:

docling/datamodel/document.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -143,10 +143,18 @@ def __init__(
143143
format=InputFormat.PDF,
144144
backend_options=backend_options,
145145
) # initialize with dummy values
146-
147146
self.limits = limits or DocumentLimits()
148147
self.format = format
149148

149+
# check for backend incompatibilities
150+
if issubclass(backend, DeclarativeDocumentBackend) and backend_options:
151+
if not issubclass(
152+
type(backend_options), type(backend.get_default_options())
153+
):
154+
raise ValueError(
155+
"Incompatible types between backend and backend_options arguments."
156+
)
157+
150158
try:
151159
if isinstance(path_or_stream, Path):
152160
self.file = path_or_stream
@@ -289,8 +297,7 @@ def docs(
289297
else:
290298
options = format_options[format]
291299
backend = options.backend
292-
# patch against circular import
293-
if "backend_options" in options.__class__.__dict__:
300+
if "backend_options" in options.model_fields_set:
294301
backend_options = cast("FormatOption", options).backend_options
295302

296303
path_or_stream: Union[BytesIO, Path]

0 commit comments

Comments
 (0)