From f56d6505c1d7ed40a23c54ad1fc051efbc0d8754 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 13 Nov 2024 23:13:40 -0800 Subject: [PATCH] fix indexing of cookie header: (#714) - add fields option for adding req.http:cookie and referrer entries to the cdxj - update to warcio 2.4.0 to support this functionality --- package.json | 2 +- src/util/warcwriter.ts | 7 +++++-- yarn.lock | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/package.json b/package.json index 1e1ca2e6..63dd203f 100644 --- a/package.json +++ b/package.json @@ -37,7 +37,7 @@ "tsc": "^2.0.4", "undici": "^6.18.2", "uuid": "8.3.2", - "warcio": "^2.3.1", + "warcio": "^2.4.0", "ws": "^7.4.4", "yargs": "^17.7.2" }, diff --git a/src/util/warcwriter.ts b/src/util/warcwriter.ts index 89aca978..3b59303c 100644 --- a/src/util/warcwriter.ts +++ b/src/util/warcwriter.ts @@ -2,7 +2,7 @@ import fs from "fs"; import { Writable } from "stream"; import path from "path"; -import { CDXIndexer, WARCRecord } from "warcio"; +import { CDXIndexer, WARCRecord, DEFAULT_CDX_FIELDS } from "warcio"; import { WARCSerializer } from "warcio/node"; import { logger, formatErr, LogDetails, LogContext } from "./logger.js"; import type { IndexerOffsetLength } from "warcio"; @@ -76,7 +76,10 @@ export class WARCWriter implements IndexerOffsetLength { this.recordLength = 0; if (this.warcCdxDir) { - this.indexer = new CDXIndexer({ format: "cdxj" }); + this.indexer = new CDXIndexer({ + format: "cdxj", + fields: [...DEFAULT_CDX_FIELDS, "req.http:cookie", "referrer"], + }); } return filename; diff --git a/yarn.lock b/yarn.lock index f848f0a8..2a398802 100644 --- a/yarn.lock +++ b/yarn.lock @@ -5295,6 +5295,20 @@ warcio@^2.3.1: uuid-random "^1.3.2" yargs "^17.6.2" +warcio@^2.4.0: + version "2.4.0" + resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.0.tgz#13bae2837f1bbf5cf7585f75857e6311d30557bd" + integrity sha512-EfxXCgsnZ35CGf2j99QBMyB6EI98KEQ6YmeER+8Lnv/4KFJ3thT76PiX37HfZVbPJS21JihA0Eddjk9QBQRlPg== + dependencies: + "@types/pako" "^1.0.7" + "@types/stream-buffers" "^3.0.7" + base32-encode "^2.0.0" + hash-wasm "^4.9.0" + pako "^1.0.11" + tempy "^3.1.0" + uuid-random "^1.3.2" + yargs "^17.7.2" + web-encoding@^1.1.5: version "1.1.5" resolved "https://registry.yarnpkg.com/web-encoding/-/web-encoding-1.1.5.tgz#fc810cf7667364a6335c939913f5051d3e0c4864"