Commit 26b9cf7 (1 parent: 36b0fa8)

make FetchResolveCache

- pipe in headers arg
- provide full context in Link.comes_from
- pull in etag and date and cache the outputs
- handle --no-cache-dir
- add NEWS

File tree: 7 files changed, +257 -36 lines

news/12257.feature.rst

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Store HTTP caching headers in ``~/.cache/pip/fetch-resolve`` to reduce bandwidth usage when ``--use-feature=metadata-cache`` is enabled.
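The bandwidth savings come from standard HTTP validators. A minimal sketch (not from this commit, using the requests library directly; the sample ETag and Date values are made up) of how a stored ``ETag``/``Date`` pair turns into a conditional request the server can answer with ``304 Not Modified``:

# Sketch only: turning cached validators into a conditional request.
from typing import Dict, Optional

import requests


def conditional_headers(etag: Optional[str], date: Optional[str]) -> Dict[str, str]:
    """Build validator headers so the server can answer 304 Not Modified."""
    headers: Dict[str, str] = {}
    if etag is not None:
        headers["If-None-Match"] = etag
    if date is not None:
        headers["If-Modified-Since"] = date
    return headers


resp = requests.get(
    "https://pypi.org/simple/pip/",
    headers=conditional_headers('"some-cached-etag"', "Tue, 15 Aug 2023 00:00:00 GMT"),
)
print(resp.status_code)  # 304 would mean: reuse the cached body, nothing re-downloaded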

src/pip/_internal/cache.py

Lines changed: 22 additions & 10 deletions

@@ -92,7 +92,9 @@ def __init__(self, cache_dir: str) -> None:
         assert not cache_dir or os.path.isabs(cache_dir)
         self.cache_dir = cache_dir or None

-    def _get_cache_path_parts(self, link: Link) -> List[str]:
+    def _get_cache_path_parts(
+        self, link: Link, *, interpreter_dependent: bool
+    ) -> List[str]:
         """Get parts of part that must be os.path.joined with cache_dir"""

         # We want to generate an url to use as our cache key, we don't want to
@@ -104,13 +106,14 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:
         if link.subdirectory_fragment:
             key_parts["subdirectory"] = link.subdirectory_fragment

-        # Include interpreter name, major and minor version in cache key
-        # to cope with ill-behaved sdists that build a different wheel
-        # depending on the python version their setup.py is being run on,
-        # and don't encode the difference in compatibility tags.
-        # https://github.com/pypa/pip/issues/7296
-        key_parts["interpreter_name"] = interpreter_name()
-        key_parts["interpreter_version"] = interpreter_version()
+        if interpreter_dependent:
+            # Include interpreter name, major and minor version in cache key
+            # to cope with ill-behaved sdists that build a different wheel
+            # depending on the python version their setup.py is being run on,
+            # and don't encode the difference in compatibility tags.
+            # https://github.com/pypa/pip/issues/7296
+            key_parts["interpreter_name"] = interpreter_name()
+            key_parts["interpreter_version"] = interpreter_version()

         # Encode our key url with sha224, we'll use this because it has similar
         # security properties to sha256, but with a shorter total output (and
@@ -138,11 +141,20 @@ class LinkMetadataCache(Cache):
     """Persistently store the metadata of dists found at each link."""

     def get_path_for_link(self, link: Link) -> str:
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         return os.path.join(self.cache_dir, "link-metadata", *parts)


+class FetchResolveCache(Cache):
+    def get_path_for_link(self, link: Link) -> str:
+        # We are reading index links to extract other links from, not executing any
+        # python code, so these caches are interpreter-independent.
+        parts = self._get_cache_path_parts(link, interpreter_dependent=False)
+        assert self.cache_dir
+        return os.path.join(self.cache_dir, "fetch-resolve", *parts)
+
+
 class WheelCacheBase(Cache):
     """Specializations to the cache concept for wheels."""

@@ -197,7 +209,7 @@ def get_path_for_link(self, link: Link) -> str:

         :param link: The link of the sdist for which this will cache wheels.
         """
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         # Store wheels within the root cache_dir
         return os.path.join(self.cache_dir, "wheels", *parts)
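For context, a rough sketch of the sha224 key scheme the hunks above feed into (the exact key string pip builds from ``key_parts`` is simplified away here; the sharding into short directory segments is the point):

import hashlib
import os

# Hash the canonical key, then split the digest into nested directory parts
# so no single cache directory grows too large. The real key also encodes
# url, subdirectory, and (when interpreter_dependent) interpreter name/version.
hashed = hashlib.sha224(b"url=https://pypi.org/simple/pip/").hexdigest()
parts = [hashed[:2], hashed[2:4], hashed[4:6], hashed[6:]]
print(os.path.join(os.path.expanduser("~/.cache/pip"), "fetch-resolve", *parts))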

src/pip/_internal/cli/req_command.py

Lines changed: 6 additions & 1 deletion

@@ -12,7 +12,7 @@
 from optparse import Values
 from typing import TYPE_CHECKING, Any, List, Optional, Tuple

-from pip._internal.cache import LinkMetadataCache, WheelCache
+from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache
 from pip._internal.cli import cmdoptions
 from pip._internal.cli.base_command import Command
 from pip._internal.cli.command_context import CommandContextMixIn
@@ -509,8 +509,13 @@ def _build_package_finder(
             ignore_requires_python=ignore_requires_python,
         )

+        if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled):
+            fetch_resolve_cache = FetchResolveCache(options.cache_dir)
+        else:
+            fetch_resolve_cache = None
         return PackageFinder.create(
             link_collector=link_collector,
             selection_prefs=selection_prefs,
             target_python=target_python,
+            fetch_resolve_cache=fetch_resolve_cache,
         )
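This guard is how the new cache honors pip's cache flags: ``--no-cache-dir`` makes ``options.cache_dir`` falsy, and the cache is only built when the user also opted in with ``--use-feature=metadata-cache``. A small illustration with hypothetical ``options`` objects, using the same boolean logic as the hunk above:

from types import SimpleNamespace

# The cache is constructed only when a cache dir exists AND the feature is on.
for opts in (
    SimpleNamespace(cache_dir="/home/u/.cache/pip", features_enabled=["metadata-cache"]),
    SimpleNamespace(cache_dir=None, features_enabled=["metadata-cache"]),  # --no-cache-dir
    SimpleNamespace(cache_dir="/home/u/.cache/pip", features_enabled=[]),  # feature off
):
    enabled = bool(opts.cache_dir) and ("metadata-cache" in opts.features_enabled)
    print(opts.cache_dir, opts.features_enabled, "->", enabled)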

src/pip/_internal/index/collector.py

Lines changed: 37 additions & 15 deletions

@@ -96,7 +96,9 @@ class _NotHTTP(Exception):
     pass


-def _ensure_api_response(url: str, session: PipSession) -> None:
+def _ensure_api_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> None:
     """
     Send a HEAD request to the URL, and ensure the response contains a simple
     API Response.
@@ -108,13 +110,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
     if scheme not in {"http", "https"}:
         raise _NotHTTP()

-    resp = session.head(url, allow_redirects=True)
+    resp = session.head(url, allow_redirects=True, headers=headers)
     raise_for_status(resp)

     _ensure_api_header(resp)


-def _get_simple_response(url: str, session: PipSession) -> Response:
+def _get_simple_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Response:
     """Access an Simple API response with GET, and return the response.

     This consists of three parts:
@@ -128,10 +132,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
     and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_api_response(url, session=session)
+        _ensure_api_response(url, session=session, headers=headers)

     logger.debug("Getting page %s", redact_auth_from_url(url))

+    logger.debug("headers: %s", str(headers))
+    if headers is None:
+        headers = {}
     resp = session.get(
         url,
         headers={
@@ -156,6 +163,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
             # once per 10 minutes.
             # For more information, please see pypa/pip#5670.
             "Cache-Control": "max-age=0",
+            **headers,
         },
     )
     raise_for_status(resp)
@@ -235,7 +243,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
         data = json.loads(page.content)
         for file in data.get("files", []):
-            link = Link.from_json(file, page.url)
+            link = Link.from_json(file, page.url, page_content=page)
             if link is None:
                 continue
             yield link
@@ -248,7 +256,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = Link.from_element(anchor, page_url=url, base_url=base_url)
+        link = Link.from_element(
+            anchor, page_url=url, base_url=base_url, page_content=page
+        )
         if link is None:
             continue
         yield link
@@ -264,19 +274,25 @@ def __init__(
         encoding: Optional[str],
         url: str,
         cache_link_parsing: bool = True,
+        etag: Optional[str] = None,
+        date: Optional[str] = None,
     ) -> None:
         """
         :param encoding: the encoding to decode the given content.
         :param url: the URL from which the HTML was downloaded.
         :param cache_link_parsing: whether links parsed from this page's url
                                    should be cached. PyPI index urls should
                                    have this set to False, for example.
+        :param etag: The ``ETag`` header from an HTTP request against ``url``.
+        :param date: The ``Date`` header from an HTTP request against ``url``.
         """
         self.content = content
         self.content_type = content_type
         self.encoding = encoding
         self.url = url
         self.cache_link_parsing = cache_link_parsing
+        self.etag = etag
+        self.date = date

     def __str__(self) -> str:
         return redact_auth_from_url(self.url)
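One subtlety in the ``**headers`` splat above: caller-supplied headers are unpacked after the hardcoded defaults, so a caller can override ``Cache-Control: max-age=0``. A tiny standalone demonstration of that dict-merge behavior:

# Later keys win in a dict literal, so caller headers override the defaults.
defaults = {
    "Accept": "application/vnd.pypi.simple.v1+json",
    "Cache-Control": "max-age=0",
}
caller = {"If-None-Match": '"cached-etag"', "Cache-Control": "no-cache"}
merged = {**defaults, **caller}
print(merged["Cache-Control"])  # -> no-cache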
@@ -321,7 +337,8 @@ def _handle_get_simple_fail(


 def _make_index_content(
-    response: Response, cache_link_parsing: bool = True
+    response: Response,
+    cache_link_parsing: bool = True,
 ) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
     return IndexContent(
@@ -330,11 +347,15 @@ def _make_index_content(
         encoding=encoding,
         url=response.url,
         cache_link_parsing=cache_link_parsing,
+        etag=response.headers.get("ETag", None),
+        date=response.headers.get("Date", None),
     )


-def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
-    url = link.url.split("#", 1)[0]
+def _get_index_content(
+    link: Link, *, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Optional["IndexContent"]:
+    url = link.url_without_fragment

     # Check for VCS schemes that do not support lookup as web pages.
     vcs_scheme = _match_vcs_scheme(url)
@@ -361,7 +382,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
         logger.debug(" file: URL is directory, getting %s", url)

     try:
-        resp = _get_simple_response(url, session=session)
+        resp = _get_simple_response(url, session=session, headers=headers)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
@@ -377,9 +398,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
             exc.request_desc,
             exc.content_type,
         )
-    except NetworkConnectionError as exc:
-        _handle_get_simple_fail(link, exc)
-    except RetryError as exc:
+    except (NetworkConnectionError, RetryError) as exc:
         _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
@@ -454,11 +473,14 @@ def create(
     def find_links(self) -> List[str]:
         return self.search_scope.find_links

-    def fetch_response(self, location: Link) -> Optional[IndexContent]:
+    def fetch_response(
+        self, location: Link, headers: Optional[Dict[str, str]] = None
+    ) -> Optional[IndexContent]:
         """
         Fetch an HTML page containing package links.
         """
-        return _get_index_content(location, session=self.session)
+        logger.debug("headers: %s", str(headers))
+        return _get_index_content(location, session=self.session, headers=headers)

     def collect_sources(
         self,
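Taken together, the collector changes enable a fetch/revalidate round trip. A hedged sketch of the intended usage, assuming the functions as changed in this diff (the actual cache read/write lives in PackageFinder, outside this commit; ``PipSession()`` is constructed bare here purely for illustration):

from pip._internal.index.collector import _get_index_content
from pip._internal.models.link import Link
from pip._internal.network.session import PipSession

session = PipSession()
link = Link("https://pypi.org/simple/pip/")

# First fetch: no validators yet; remember what the server hands back.
page = _get_index_content(link, session=session)
stored_etag = page.etag if page is not None else None
stored_date = page.date if page is not None else None

# Later fetch: replay the validators so an unchanged index page can be
# answered with a cheap 304 instead of a full body.
headers = {}
if stored_etag is not None:
    headers["If-None-Match"] = stored_etag
if stored_date is not None:
    headers["If-Modified-Since"] = stored_date
page = _get_index_content(link, session=session, headers=headers)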
