11"""Utilities to help linting various targets."""
22
33import os
4+ import re
45from typing import (
56 Any ,
67 Dict ,
2122if TYPE_CHECKING :
2223 from planemo .cli import PlanemoCliContext
2324
# Timeout, in seconds, passed to ``requests.get`` when validating HTTP(S) URLs.
REQUEST_TIMEOUT = 5
26+
2427
2528def build_lint_args (ctx : "PlanemoCliContext" , ** kwds ) -> Dict [str , Any ]:
2629 """Handle common report, error, and skip linting arguments."""
@@ -125,6 +128,49 @@ def lint_xsd(lint_ctx, schema_path, path):
125128 lint_ctx .info ("File validates against XML schema." )
126129
127130
131+ def _validate_doi_url (url , lint_ctx ):
132+ """Validate DOI URL by checking CrossRef API."""
133+ match = re .match ("https?://doi.org/(.*)$" , url )
134+ if match is None :
135+ return False
136+
137+ doi = match .group (1 )
138+ xref_url = f"https://api.crossref.org/works/{ doi } "
139+ return _validate_http_url (xref_url , lint_ctx = lint_ctx )
140+
141+
def _validate_http_url(url, lint_ctx, user_agent=None):
    """Validate an HTTP/HTTPS URL by requesting it and reading the first chunk.

    :param url: the URL to fetch.
    :param lint_ctx: lint context; ``lint_ctx.error`` is called when the URL
        is considered invalid.
    :param user_agent: optional ``User-Agent`` value; when given it is sent
        together with ``Accept: */*``, otherwise no custom headers are sent.
    :returns: ``True`` if the request succeeded, or if it failed with a
        status that does not indicate a broken link (429, or a 403/503 whose
        body mentions "cloudflare"); ``False`` otherwise.
    """
    headers = {"User-Agent": user_agent, "Accept": "*/*"} if user_agent else None
    response = None
    try:
        response = requests.get(url, headers=headers, stream=True, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # Pull at most one chunk to prove the body is readable. The default
        # keeps an empty body from raising StopIteration, which the broad
        # handler below would otherwise report as an access error.
        next(response.iter_content(1000), None)
        return True
    except Exception as e:
        if response is not None:
            if response.status_code == 429:
                # too many requests
                return True
            if response.status_code in (403, 503) and "cloudflare" in response.text:
                # CloudFlare protection block
                return True
        lint_ctx.error(f"Error '{e}' accessing {url}")
        return False
    finally:
        # With stream=True the connection stays checked out until the body is
        # fully consumed or the response is closed, so close it explicitly.
        if response is not None:
            response.close()
161+
162+
def _validate_other_url(url, lint_ctx):
    """Validate a non-HTTP URL by opening it and reading a few bytes.

    :param url: the URL to open with ``urlopen``.
    :param lint_ctx: lint context; ``lint_ctx.error`` is called with the
        failure when the URL cannot be opened or read.
    :returns: ``True`` if up to 100 bytes could be read, ``False`` otherwise.
    """
    try:
        # Bound the wait the same way the HTTP check does, so an unresponsive
        # host cannot stall the lint run indefinitely.
        with urlopen(url, timeout=REQUEST_TIMEOUT) as handle:
            handle.read(100)
    except Exception as e:
        lint_ctx.error(f"Error '{e}' accessing {url}")
        return False
    return True
172+
173+
128174def lint_urls (root , lint_ctx ):
129175 """Find referenced URLs and verify they are valid."""
130176 urls , docs = find_urls_for_xml (root )
@@ -133,34 +179,14 @@ def lint_urls(root, lint_ctx):
133179 BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
134180
135181 def validate_url (url , lint_ctx , user_agent = None ):
136- is_valid = True
137- if url .startswith ("http://" ) or url .startswith ("https://" ):
138- if user_agent :
139- headers = {"User-Agent" : user_agent , "Accept" : "*/*" }
140- else :
141- headers = None
142- r = None
143- try :
144- r = requests .get (url , headers = headers , stream = True )
145- r .raise_for_status ()
146- next (r .iter_content (1000 ))
147- except Exception as e :
148- if r is not None and r .status_code == 429 :
149- # too many requests
150- pass
151- elif r is not None and r .status_code in [403 , 503 ] and "cloudflare" in r .text :
152- # CloudFlare protection block
153- pass
154- else :
155- is_valid = False
156- lint_ctx .error (f"Error '{ e } ' accessing { url } " )
182+ is_valid = False
183+ if re .match ("https?://doi.org/(.*)$" , url ):
184+ is_valid = _validate_doi_url (url , lint_ctx )
185+ elif url .startswith ("http://" ) or url .startswith ("https://" ):
186+ is_valid = _validate_http_url (url , lint_ctx , user_agent )
157187 else :
158- try :
159- with urlopen (url ) as handle :
160- handle .read (100 )
161- except Exception as e :
162- is_valid = False
163- lint_ctx .error (f"Error '{ e } ' accessing { url } " )
188+ is_valid = _validate_other_url (url , lint_ctx )
189+
164190 if is_valid :
165191 lint_ctx .info ("URL OK %s" % url )
166192
0 commit comments