11import base64
22import logging
3+ import os
34import re
5+ import warnings
46from contextlib import contextmanager
57from copy import deepcopy
68from io import BytesIO
@@ -313,15 +315,15 @@ def _resolve_relative_path(self, loc: str) -> str:
313315 abs_loc = loc
314316
315317 if self .base_path :
316- if not loc .startswith (("http://" , "https://" , "data:" , "file://" , "/" )):
318+ if loc .startswith ("//" ):
319+ # Protocol-relative URL - default to https
320+ abs_loc = "https:" + loc
321+ elif not loc .startswith (("http://" , "https://" , "data:" , "file://" )):
317322 if HTMLDocumentBackend ._is_remote_url (self .base_path ): # remote fetch
318323 abs_loc = urljoin (self .base_path , loc )
319324 elif self .base_path : # local fetch
320325 # For local files, resolve relative to the HTML file location
321326 abs_loc = str (Path (self .base_path ).parent / loc )
322- elif loc .startswith ("//" ):
323- # Protocol-relative URL - default to https
324- abs_loc = "https:" + loc
325327
326328 _log .debug (f"Resolved location { loc } to { abs_loc } " )
327329 return abs_loc
@@ -1198,13 +1200,18 @@ def get_img_hyperlink(img_tag):
def _create_image_ref(self, src_url: str) -> Optional[ImageRef]:
    """Build an ``ImageRef`` for the image located at ``src_url``.

    The raw bytes are obtained through ``self._load_image_data`` and decoded
    with Pillow. The DPI is taken from the first entry of the image's
    ``dpi`` info field, defaulting to 72 when the field is absent.

    This is a best-effort operation: a failure to fetch or decode one image
    is reported through ``warnings.warn`` and results in ``None`` instead of
    propagating to the caller.

    Args:
        src_url: Location of the image, passed unchanged to
            ``self._load_image_data``.

    Returns:
        The image reference, or ``None`` when no image data was returned or
        when loading/decoding failed with one of the handled exceptions.
    """
    try:
        img_data = self._load_image_data(src_url)
        if img_data:
            img = Image.open(BytesIO(img_data))
            return ImageRef.from_pil(img, dpi=int(img.info.get("dpi", (72,))[0]))
    except (
        # RequestException is the base class of HTTPError and also covers
        # connection failures and timeouts raised by requests.get().
        requests.RequestException,
        ValidationError,
        UnidentifiedImageError,
        OperationNotAllowed,
        # Local reads and Pillow decoding of damaged files can raise OSError.
        OSError,
        TypeError,
        ValueError,
    ) as e:
        warnings.warn(f"Could not process an image from {src_url}: {e}")

    return None
12101217
@@ -1213,33 +1220,33 @@ def _load_image_data(self, src_loc: str) -> Optional[bytes]:
12131220 _log .debug (f"Skipping SVG file: { src_loc } " )
12141221 return None
12151222
1216- try :
1217- if HTMLDocumentBackend ._is_remote_url (src_loc ):
1218- if not self .enable_remote_fetch :
1219- raise OperationNotAllowed (
1220- "Fetching remote resources is only allowed when set explicitly. "
1221- "options.enable_remote_fetch=True."
1222- )
1223- response = requests .get (src_loc , stream = True )
1224- response .raise_for_status ()
1225- return response .content
1226- elif src_loc .startswith ("data:" ):
1227- data = re .sub (r"^data:image/.+;base64," , "" , src_loc )
1228- return base64 .b64decode (data )
1229-
1230- if src_loc .startswith ("file://" ):
1231- src_loc = src_loc [7 :]
1232-
1233- if not self .enable_local_fetch :
1223+ if HTMLDocumentBackend ._is_remote_url (src_loc ):
1224+ if not self .options .enable_remote_fetch :
12341225 raise OperationNotAllowed (
1235- "Fetching local resources is only allowed when set explicitly. "
1236- "options.enable_local_fetch =True."
1226+ "Fetching remote resources is only allowed when set explicitly. "
1227+ "Set options.enable_remote_fetch =True."
12371228 )
1229+ response = requests .get (src_loc , stream = True )
1230+ response .raise_for_status ()
1231+ return response .content
1232+ elif src_loc .startswith ("data:" ):
1233+ data = re .sub (r"^data:image/.+;base64," , "" , src_loc )
1234+ return base64 .b64decode (data )
1235+
1236+ if src_loc .startswith ("file://" ):
1237+ src_loc = src_loc [7 :]
1238+
1239+ if not self .options .enable_local_fetch :
1240+ raise OperationNotAllowed (
1241+ "Fetching local resources is only allowed when set explicitly. "
1242+ "Set options.enable_local_fetch=True."
1243+ )
1244+ # add check that file exists and can read
1245+ if os .path .isfile (src_loc ) and os .access (src_loc , os .R_OK ):
12381246 with open (src_loc , "rb" ) as f :
12391247 return f .read ()
1240- except Exception as e :
1241- _log .warning (f"Could not load image data: { e } " )
1242- return None
1248+ else :
1249+ raise ValueError ("File does not exist or it is not readable." )
12431250
12441251 @staticmethod
12451252 def get_text (item : PageElement ) -> str :
0 commit comments