@@ -96,7 +96,9 @@ class _NotHTTP(Exception):
96
96
pass
97
97
98
98
99
- def _ensure_api_response (url : str , session : PipSession ) -> None :
99
+ def _ensure_api_response (
100
+ url : str , session : PipSession , headers : Optional [Dict [str , str ]] = None
101
+ ) -> None :
100
102
"""
101
103
Send a HEAD request to the URL, and ensure the response contains a simple
102
104
API Response.
@@ -108,13 +110,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
108
110
if scheme not in {"http" , "https" }:
109
111
raise _NotHTTP ()
110
112
111
- resp = session .head (url , allow_redirects = True )
113
+ resp = session .head (url , allow_redirects = True , headers = headers )
112
114
raise_for_status (resp )
113
115
114
116
_ensure_api_header (resp )
115
117
116
118
117
- def _get_simple_response (url : str , session : PipSession ) -> Response :
119
+ def _get_simple_response (
120
+ url : str , session : PipSession , headers : Optional [Dict [str , str ]] = None
121
+ ) -> Response :
118
122
"""Access an Simple API response with GET, and return the response.
119
123
120
124
This consists of three parts:
@@ -128,10 +132,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
128
132
and raise `_NotAPIContent` otherwise.
129
133
"""
130
134
if is_archive_file (Link (url ).filename ):
131
- _ensure_api_response (url , session = session )
135
+ _ensure_api_response (url , session = session , headers = headers )
132
136
133
137
logger .debug ("Getting page %s" , redact_auth_from_url (url ))
134
138
139
+ logger .debug ("headers: %s" , str (headers ))
140
+ if headers is None :
141
+ headers = {}
135
142
resp = session .get (
136
143
url ,
137
144
headers = {
@@ -156,6 +163,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
156
163
# once per 10 minutes.
157
164
# For more information, please see pypa/pip#5670.
158
165
"Cache-Control" : "max-age=0" ,
166
+ ** headers ,
159
167
},
160
168
)
161
169
raise_for_status (resp )
@@ -235,7 +243,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
235
243
if content_type_l .startswith ("application/vnd.pypi.simple.v1+json" ):
236
244
data = json .loads (page .content )
237
245
for file in data .get ("files" , []):
238
- link = Link .from_json (file , page .url )
246
+ link = Link .from_json (file , page .url , page_content = page )
239
247
if link is None :
240
248
continue
241
249
yield link
@@ -248,7 +256,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
248
256
url = page .url
249
257
base_url = parser .base_url or url
250
258
for anchor in parser .anchors :
251
- link = Link .from_element (anchor , page_url = url , base_url = base_url )
259
+ link = Link .from_element (
260
+ anchor , page_url = url , base_url = base_url , page_content = page
261
+ )
252
262
if link is None :
253
263
continue
254
264
yield link
@@ -264,19 +274,25 @@ def __init__(
264
274
encoding : Optional [str ],
265
275
url : str ,
266
276
cache_link_parsing : bool = True ,
277
+ etag : Optional [str ] = None ,
278
+ date : Optional [str ] = None ,
267
279
) -> None :
268
280
"""
269
281
:param encoding: the encoding to decode the given content.
270
282
:param url: the URL from which the HTML was downloaded.
271
283
:param cache_link_parsing: whether links parsed from this page's url
272
284
should be cached. PyPI index urls should
273
285
have this set to False, for example.
286
+ :param etag: The ``ETag`` header from an HTTP request against ``url``.
287
+ :param date: The ``Date`` header from an HTTP request against ``url``.
274
288
"""
275
289
self .content = content
276
290
self .content_type = content_type
277
291
self .encoding = encoding
278
292
self .url = url
279
293
self .cache_link_parsing = cache_link_parsing
294
+ self .etag = etag
295
+ self .date = date
280
296
281
297
def __str__ (self ) -> str :
282
298
return redact_auth_from_url (self .url )
@@ -321,7 +337,8 @@ def _handle_get_simple_fail(
321
337
322
338
323
339
def _make_index_content (
324
- response : Response , cache_link_parsing : bool = True
340
+ response : Response ,
341
+ cache_link_parsing : bool = True ,
325
342
) -> IndexContent :
326
343
encoding = _get_encoding_from_headers (response .headers )
327
344
return IndexContent (
@@ -330,11 +347,15 @@ def _make_index_content(
330
347
encoding = encoding ,
331
348
url = response .url ,
332
349
cache_link_parsing = cache_link_parsing ,
350
+ etag = response .headers .get ("ETag" , None ),
351
+ date = response .headers .get ("Date" , None ),
333
352
)
334
353
335
354
336
- def _get_index_content (link : Link , * , session : PipSession ) -> Optional ["IndexContent" ]:
337
- url = link .url .split ("#" , 1 )[0 ]
355
+ def _get_index_content (
356
+ link : Link , * , session : PipSession , headers : Optional [Dict [str , str ]] = None
357
+ ) -> Optional ["IndexContent" ]:
358
+ url = link .url_without_fragment
338
359
339
360
# Check for VCS schemes that do not support lookup as web pages.
340
361
vcs_scheme = _match_vcs_scheme (url )
@@ -361,7 +382,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
361
382
logger .debug (" file: URL is directory, getting %s" , url )
362
383
363
384
try :
364
- resp = _get_simple_response (url , session = session )
385
+ resp = _get_simple_response (url , session = session , headers = headers )
365
386
except _NotHTTP :
366
387
logger .warning (
367
388
"Skipping page %s because it looks like an archive, and cannot "
@@ -377,9 +398,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
377
398
exc .request_desc ,
378
399
exc .content_type ,
379
400
)
380
- except NetworkConnectionError as exc :
381
- _handle_get_simple_fail (link , exc )
382
- except RetryError as exc :
401
+ except (NetworkConnectionError , RetryError ) as exc :
383
402
_handle_get_simple_fail (link , exc )
384
403
except SSLError as exc :
385
404
reason = "There was a problem confirming the ssl certificate: "
@@ -454,11 +473,14 @@ def create(
454
473
def find_links (self ) -> List [str ]:
455
474
return self .search_scope .find_links
456
475
457
- def fetch_response (self , location : Link ) -> Optional [IndexContent ]:
476
+ def fetch_response (
477
+ self , location : Link , headers : Optional [Dict [str , str ]] = None
478
+ ) -> Optional [IndexContent ]:
458
479
"""
459
480
Fetch an HTML page containing package links.
460
481
"""
461
- return _get_index_content (location , session = self .session )
482
+ logger .debug ("headers: %s" , str (headers ))
483
+ return _get_index_content (location , session = self .session , headers = headers )
462
484
463
485
def collect_sources (
464
486
self ,
0 commit comments