Commit 26b9cf7 (1 parent: 36b0fa8)

make FetchResolveCache

- pipe in headers arg
- provide full context in Link.comes_from
- pull in etag and date and cache the outputs
- handle --no-cache-dir
- add NEWS

File tree: 7 files changed, +257 -36 lines

news/12257.feature.rst

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Store HTTP caching headers in ``~/.cache/pip/fetch-resolve`` to reduce bandwidth usage when ``--use-feature=metadata-cache`` is enabled.
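The bandwidth savings come from standard HTTP validators. A minimal sketch (not from this commit, using the requests library directly; the sample ETag and Date values are made up) of how a stored ``ETag``/``Date`` pair turns into a conditional request the server can answer with ``304 Not Modified``:

# Sketch only: turning cached validators into a conditional request.
from typing import Dict, Optional

import requests


def conditional_headers(etag: Optional[str], date: Optional[str]) -> Dict[str, str]:
    """Build validator headers so the server can answer 304 Not Modified."""
    headers: Dict[str, str] = {}
    if etag is not None:
        headers["If-None-Match"] = etag
    if date is not None:
        headers["If-Modified-Since"] = date
    return headers


resp = requests.get(
    "https://pypi.org/simple/pip/",
    headers=conditional_headers('"some-cached-etag"', "Tue, 15 Aug 2023 00:00:00 GMT"),
)
print(resp.status_code)  # 304 would mean: reuse the cached body, nothing re-downloaded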

src/pip/_internal/cache.py

Lines changed: 22 additions & 10 deletions

@@ -92,7 +92,9 @@ def __init__(self, cache_dir: str) -> None:
         assert not cache_dir or os.path.isabs(cache_dir)
         self.cache_dir = cache_dir or None

-    def _get_cache_path_parts(self, link: Link) -> List[str]:
+    def _get_cache_path_parts(
+        self, link: Link, *, interpreter_dependent: bool
+    ) -> List[str]:
         """Get parts of part that must be os.path.joined with cache_dir"""

         # We want to generate an url to use as our cache key, we don't want to
@@ -104,13 +106,14 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:
         if link.subdirectory_fragment:
             key_parts["subdirectory"] = link.subdirectory_fragment

-        # Include interpreter name, major and minor version in cache key
-        # to cope with ill-behaved sdists that build a different wheel
-        # depending on the python version their setup.py is being run on,
-        # and don't encode the difference in compatibility tags.
-        # https://github.com/pypa/pip/issues/7296
-        key_parts["interpreter_name"] = interpreter_name()
-        key_parts["interpreter_version"] = interpreter_version()
+        if interpreter_dependent:
+            # Include interpreter name, major and minor version in cache key
+            # to cope with ill-behaved sdists that build a different wheel
+            # depending on the python version their setup.py is being run on,
+            # and don't encode the difference in compatibility tags.
+            # https://github.com/pypa/pip/issues/7296
+            key_parts["interpreter_name"] = interpreter_name()
+            key_parts["interpreter_version"] = interpreter_version()

         # Encode our key url with sha224, we'll use this because it has similar
         # security properties to sha256, but with a shorter total output (and
@@ -138,11 +141,20 @@ class LinkMetadataCache(Cache):
     """Persistently store the metadata of dists found at each link."""

     def get_path_for_link(self, link: Link) -> str:
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         return os.path.join(self.cache_dir, "link-metadata", *parts)


+class FetchResolveCache(Cache):
+    def get_path_for_link(self, link: Link) -> str:
+        # We are reading index links to extract other links from, not executing any
+        # python code, so these caches are interpreter-independent.
+        parts = self._get_cache_path_parts(link, interpreter_dependent=False)
+        assert self.cache_dir
+        return os.path.join(self.cache_dir, "fetch-resolve", *parts)
+
+
 class WheelCacheBase(Cache):
     """Specializations to the cache concept for wheels."""

@@ -197,7 +209,7 @@ def get_path_for_link(self, link: Link) -> str:

         :param link: The link of the sdist for which this will cache wheels.
         """
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         # Store wheels within the root cache_dir
         return os.path.join(self.cache_dir, "wheels", *parts)
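For context, a rough sketch of the sha224 key scheme the hunks above feed into (the exact key string pip builds from ``key_parts`` is simplified away here; the sharding into short directory segments is the point):

import hashlib
import os

# Hash the canonical key, then split the digest into nested directory parts
# so no single cache directory grows too large. The real key also encodes
# url, subdirectory, and (when interpreter_dependent) interpreter name/version.
hashed = hashlib.sha224(b"url=https://pypi.org/simple/pip/").hexdigest()
parts = [hashed[:2], hashed[2:4], hashed[4:6], hashed[6:]]
print(os.path.join(os.path.expanduser("~/.cache/pip"), "fetch-resolve", *parts))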

src/pip/_internal/cli/req_command.py

Lines changed: 6 additions & 1 deletion

@@ -12,7 +12,7 @@
 from optparse import Values
 from typing import TYPE_CHECKING, Any, List, Optional, Tuple

-from pip._internal.cache import LinkMetadataCache, WheelCache
+from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache
 from pip._internal.cli import cmdoptions
 from pip._internal.cli.base_command import Command
 from pip._internal.cli.command_context import CommandContextMixIn
@@ -509,8 +509,13 @@ def _build_package_finder(
             ignore_requires_python=ignore_requires_python,
         )

+        if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled):
+            fetch_resolve_cache = FetchResolveCache(options.cache_dir)
+        else:
+            fetch_resolve_cache = None
         return PackageFinder.create(
             link_collector=link_collector,
             selection_prefs=selection_prefs,
             target_python=target_python,
+            fetch_resolve_cache=fetch_resolve_cache,
         )
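This guard is how the new cache honors pip's cache flags: ``--no-cache-dir`` makes ``options.cache_dir`` falsy, and the cache is only built when the user also opted in with ``--use-feature=metadata-cache``. A small illustration with hypothetical ``options`` objects, using the same boolean logic as the hunk above:

from types import SimpleNamespace

# The cache is constructed only when a cache dir exists AND the feature is on.
for opts in (
    SimpleNamespace(cache_dir="/home/u/.cache/pip", features_enabled=["metadata-cache"]),
    SimpleNamespace(cache_dir=None, features_enabled=["metadata-cache"]),  # --no-cache-dir
    SimpleNamespace(cache_dir="/home/u/.cache/pip", features_enabled=[]),  # feature off
):
    enabled = bool(opts.cache_dir) and ("metadata-cache" in opts.features_enabled)
    print(opts.cache_dir, opts.features_enabled, "->", enabled)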

src/pip/_internal/index/collector.py

Lines changed: 37 additions & 15 deletions

@@ -96,7 +96,9 @@ class _NotHTTP(Exception):
     pass


-def _ensure_api_response(url: str, session: PipSession) -> None:
+def _ensure_api_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> None:
     """
     Send a HEAD request to the URL, and ensure the response contains a simple
     API Response.
@@ -108,13 +110,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
     if scheme not in {"http", "https"}:
         raise _NotHTTP()

-    resp = session.head(url, allow_redirects=True)
+    resp = session.head(url, allow_redirects=True, headers=headers)
     raise_for_status(resp)

     _ensure_api_header(resp)


-def _get_simple_response(url: str, session: PipSession) -> Response:
+def _get_simple_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Response:
     """Access an Simple API response with GET, and return the response.

     This consists of three parts:
@@ -128,10 +132,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
     and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_api_response(url, session=session)
+        _ensure_api_response(url, session=session, headers=headers)

     logger.debug("Getting page %s", redact_auth_from_url(url))

+    logger.debug("headers: %s", str(headers))
+    if headers is None:
+        headers = {}
     resp = session.get(
         url,
         headers={
@@ -156,6 +163,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
             # once per 10 minutes.
             # For more information, please see pypa/pip#5670.
             "Cache-Control": "max-age=0",
+            **headers,
         },
     )
     raise_for_status(resp)
@@ -235,7 +243,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
         data = json.loads(page.content)
         for file in data.get("files", []):
-            link = Link.from_json(file, page.url)
+            link = Link.from_json(file, page.url, page_content=page)
             if link is None:
                 continue
             yield link
@@ -248,7 +256,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = Link.from_element(anchor, page_url=url, base_url=base_url)
+        link = Link.from_element(
+            anchor, page_url=url, base_url=base_url, page_content=page
+        )
         if link is None:
             continue
         yield link
@@ -264,19 +274,25 @@ def __init__(
         encoding: Optional[str],
         url: str,
         cache_link_parsing: bool = True,
+        etag: Optional[str] = None,
+        date: Optional[str] = None,
     ) -> None:
         """
         :param encoding: the encoding to decode the given content.
         :param url: the URL from which the HTML was downloaded.
         :param cache_link_parsing: whether links parsed from this page's url
                                    should be cached. PyPI index urls should
                                    have this set to False, for example.
+        :param etag: The ``ETag`` header from an HTTP request against ``url``.
+        :param date: The ``Date`` header from an HTTP request against ``url``.
         """
         self.content = content
         self.content_type = content_type
         self.encoding = encoding
         self.url = url
         self.cache_link_parsing = cache_link_parsing
+        self.etag = etag
+        self.date = date

     def __str__(self) -> str:
         return redact_auth_from_url(self.url)
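One subtlety in the ``**headers`` splat above: caller-supplied headers are unpacked after the hardcoded defaults, so a caller can override ``Cache-Control: max-age=0``. A tiny standalone demonstration of that dict-merge behavior:

# Later keys win in a dict literal, so caller headers override the defaults.
defaults = {
    "Accept": "application/vnd.pypi.simple.v1+json",
    "Cache-Control": "max-age=0",
}
caller = {"If-None-Match": '"cached-etag"', "Cache-Control": "no-cache"}
merged = {**defaults, **caller}
print(merged["Cache-Control"])  # -> no-cache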
@@ -321,7 +337,8 @@ def _handle_get_simple_fail(


 def _make_index_content(
-    response: Response, cache_link_parsing: bool = True
+    response: Response,
+    cache_link_parsing: bool = True,
 ) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
     return IndexContent(
@@ -330,11 +347,15 @@ def _make_index_content(
         encoding=encoding,
         url=response.url,
         cache_link_parsing=cache_link_parsing,
+        etag=response.headers.get("ETag", None),
+        date=response.headers.get("Date", None),
     )


-def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
-    url = link.url.split("#", 1)[0]
+def _get_index_content(
+    link: Link, *, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Optional["IndexContent"]:
+    url = link.url_without_fragment

     # Check for VCS schemes that do not support lookup as web pages.
     vcs_scheme = _match_vcs_scheme(url)
@@ -361,7 +382,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
         logger.debug(" file: URL is directory, getting %s", url)

     try:
-        resp = _get_simple_response(url, session=session)
+        resp = _get_simple_response(url, session=session, headers=headers)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
@@ -377,9 +398,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
             exc.request_desc,
             exc.content_type,
         )
-    except NetworkConnectionError as exc:
-        _handle_get_simple_fail(link, exc)
-    except RetryError as exc:
+    except (NetworkConnectionError, RetryError) as exc:
         _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
@@ -454,11 +473,14 @@ def create(
     def find_links(self) -> List[str]:
         return self.search_scope.find_links

-    def fetch_response(self, location: Link) -> Optional[IndexContent]:
+    def fetch_response(
+        self, location: Link, headers: Optional[Dict[str, str]] = None
+    ) -> Optional[IndexContent]:
         """
         Fetch an HTML page containing package links.
         """
-        return _get_index_content(location, session=self.session)
+        logger.debug("headers: %s", str(headers))
+        return _get_index_content(location, session=self.session, headers=headers)

     def collect_sources(
         self,
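Taken together, the collector changes enable a fetch/revalidate round trip. A hedged sketch of the intended usage, assuming the functions as changed in this diff (the actual cache read/write lives in PackageFinder, outside this commit; ``PipSession()`` is constructed bare here purely for illustration):

from pip._internal.index.collector import _get_index_content
from pip._internal.models.link import Link
from pip._internal.network.session import PipSession

session = PipSession()
link = Link("https://pypi.org/simple/pip/")

# First fetch: no validators yet; remember what the server hands back.
page = _get_index_content(link, session=session)
stored_etag = page.etag if page is not None else None
stored_date = page.date if page is not None else None

# Later fetch: replay the validators so an unchanged index page can be
# answered with a cheap 304 instead of a full body.
headers = {}
if stored_etag is not None:
    headers["If-None-Match"] = stored_etag
if stored_date is not None:
    headers["If-Modified-Since"] = stored_date
page = _get_index_content(link, session=session, headers=headers)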
