fix: don't URL-encode V2 continuation tokens under encoding-type=url (#91)

ServerSideHannes · web-flow · commit 82d4c6050301 · 2026-06-30T09:32:23.000+02:00
diff --git a/s3proxy/xml_responses.py b/s3proxy/xml_responses.py
@@ -70,14 +70,18 @@ def list_objects(
         <StorageClass>{obj.get("storage_class", "STANDARD")}</StorageClass>{owner_xml}
     </Contents>"""
 
+    # V2 continuation tokens are opaque cursors, not keys. Per the S3 spec only
+    # Key/Prefix/Delimiter/StartAfter are URL-encoded under encoding-type=url, and
+    # clients (botocore) never URL-decode the continuation token. URL-encoding it
+    # corrupts the round-trip (e.g. '/' -> '%2F'), the backend can't advance, and
+    # the same token repeats -> botocore aborts with "The same next token was
+    # received twice". XML-escape only, regardless of encoding_type.
     next_token_xml = (
-        f"<NextContinuationToken>{_encode_key(next_token, encoding_type)}</NextContinuationToken>"
-        if next_token
-        else ""
+        f"<NextContinuationToken>{escape(next_token)}</NextContinuationToken>" if next_token else ""
     )
 
     continuation_token_xml = (
-        f"<ContinuationToken>{_encode_key(continuation_token, encoding_type)}</ContinuationToken>"
+        f"<ContinuationToken>{escape(continuation_token)}</ContinuationToken>"
         if continuation_token is not None
         else ""
     )
diff --git a/tests/unit/test_list_objects_continuation_token.py b/tests/unit/test_list_objects_continuation_token.py
@@ -0,0 +1,68 @@
+"""Self-check: V2 continuation tokens must not be URL-encoded.
+
+barman-cloud / botocore send encoding-type=url on ListObjectsV2. Continuation
+tokens are opaque cursors, not keys: per the S3 spec only Key/Prefix/Delimiter/
+StartAfter are URL-encoded, and botocore never URL-decodes the token. When the
+proxy URL-encoded a key-shaped token ('a/b/c.tar' -> 'a%2Fb%2Fc.tar') the token
+could not round-trip, the backend kept returning the first page, and botocore
+aborted the paginator with "The same next token was received twice".
+
+This proves the V2 serializer emits the token verbatim (XML-escaped only) even
+under encoding-type=url, so it survives the round-trip and pagination advances.
+"""
+
+from xml.etree.ElementTree import fromstring
+
+from s3proxy.xml_responses import list_objects
+
+TOKEN = "production-v3/production/base/20260619T223000/data_0007.tar"  # key-shaped, has '/'
+_NS = "{http://s3.amazonaws.com/doc/2006-03-01/}"  # ElementTree-style "{uri}" tag prefix
+
+
+def _parse(xml: str) -> dict:  # direct children of the root only: tag (_NS stripped) -> text
+    root = fromstring(xml)
+    return {child.tag.replace(_NS, ""): child.text for child in root}  # duplicate tags: last wins
+
+
+def test_v2_continuation_token_not_url_encoded() -> None:
+    """Tokens stay verbatim under encoding-type=url; keys are still URL-encoded."""
+    xml = list_objects(
+        bucket="oceanio-dc2-postgresql-backups",
+        prefix="production-v3/production/base/",
+        max_keys=1000,
+        is_truncated=True,
+        next_token=TOKEN,
+        objects=[],
+        continuation_token=TOKEN,
+        encoding_type="url",  # what barman/botocore sends
+    )
+    fields = _parse(xml)
+
+    # Bug guard, checked per token element: <Prefix> also holds '/' and may be URL-encoded.
+    assert fields["NextContinuationToken"] == TOKEN
+    assert fields["ContinuationToken"] == TOKEN
+
+    # Keys are still URL-encoded under encoding-type=url (regression guard).
+    key_xml = list_objects(
+        bucket="b",
+        prefix="",
+        max_keys=1000,
+        is_truncated=False,
+        next_token=None,
+        objects=[
+            {
+                "key": "a/b/c.tar",
+                "last_modified": "2026-06-24T09:00:00",
+                "etag": "x",
+                "size": 1,
+                "storage_class": "STANDARD",
+            }
+        ],
+        encoding_type="url",
+    )
+    assert "a%2Fb%2Fc.tar" in key_xml
+
+
+if __name__ == "__main__":  # lets the check run as a plain script, without pytest
+    test_v2_continuation_token_not_url_encoded()
+    print("ok")  # reached only if no assertion above failed