4 changes: 4 additions & 0 deletions .github/workflows/ci.yaml
@@ -46,6 +46,10 @@ jobs:
- name: Install cdx_toolkit
run: pip install .[test]

- name: Lint code
run: |
make lint

- name: Run tests
run: |
make test_coverage
5 changes: 4 additions & 1 deletion Makefile
@@ -1,4 +1,4 @@
.PHONY: init test clean_coverage test_coverage distclean distcheck dist install
.PHONY: init test clean_coverage test_coverage lint distclean distcheck dist install

init36:
# packages are deprecating support, so this uses exact versions
@@ -25,6 +25,9 @@ test_coverage: clean_coverage
PYTHONPATH=. py.test -rA -s --doctest-modules --cov-report=xml --cov-append --cov cdx_toolkit tests -v -v
coverage report

lint:
flake8 cdx_toolkit

distclean:
rm -rf dist/

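
Note: the new lint target just runs flake8 over the package, matching the CI step added above. If make is not available, the same check can be reproduced directly; a minimal sketch, assuming flake8 is installed (e.g. via pip install .[test]):

    # equivalent of `make lint` without make; assumes flake8 is on PATH
    import subprocess
    subprocess.run(['flake8', 'cdx_toolkit'], check=True)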
39 changes: 28 additions & 11 deletions cdx_toolkit/__init__.py
@@ -2,7 +2,6 @@
import json
from pkg_resources import get_distribution, DistributionNotFound
from collections.abc import MutableMapping
import sys
import warnings

__version__ = 'installed-from-git'
@@ -62,8 +61,8 @@ def cdx_to_captures(resp, wb=None, warc_download_prefix=None):
if text.startswith('{'):
lines = resp.text.splitlines()
ret = []
for l in lines:
ret.append(CaptureObject(json.loads(l), wb=wb, warc_download_prefix=warc_download_prefix))
for line in lines:
ret.append(CaptureObject(json.loads(line), wb=wb, warc_download_prefix=warc_download_prefix))
return ret

# ia output='json' is a json list of lists
@@ -202,7 +201,14 @@ def __next__(self):


class CDXFetcher:
def __init__(self, source='cc', crawl=None, wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None):
def __init__(self,
source='cc',
crawl=None,
wb=None,
warc_download_prefix=None,
cc_mirror=None,
cc_sort='mixed',
loglevel=None):
self.source = source
self.crawl = crawl
self.cc_sort = cc_sort
@@ -231,13 +237,20 @@ def __init__(self, source='cc', crawl=None, wb=None, warc_download_prefix=None,
LOGGER.setLevel(level=loglevel)

def customize_index_list(self, params):
if self.source == 'cc' and (self.crawl or 'crawl' in params or 'from' in params or 'from_ts' in params or 'to' in params or 'closest' in params):
LOGGER.info('making a custom cc index list')
if self.crawl and 'crawl' not in params:
params['crawl'] = self.crawl
if self.source == "cc" and (
self.crawl
or "crawl" in params
or "from" in params
or "from_ts" in params
or "to" in params
or "closest" in params
):
LOGGER.info("making a custom cc index list")
if self.crawl and "crawl" not in params:
params["crawl"] = self.crawl
return filter_cc_endpoints(self.raw_index_list, self.cc_sort, params=params)
else:
return self.index_list

return self.index_list

def get(self, url, **kwargs):
# from_ts=None, to=None, matchType=None, limit=None, sort=None, closest=None,
@@ -264,7 +277,11 @@ def get(self, url, **kwargs):
ret = []
for endpoint in index_list:
resp = myrequests_get(endpoint, params=params, cdx=True)
objs = cdx_to_captures(resp, wb=self.wb, warc_download_prefix=self.warc_download_prefix) # turns 400 and 404 into []
objs = cdx_to_captures(
resp,
wb=self.wb,
warc_download_prefix=self.warc_download_prefix
) # turns 400 and 404 into []
ret.extend(objs)
if 'limit' in params:
params['limit'] -= len(objs)
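
Note: the reformatted CDXFetcher constructor keeps the same public signature, so existing callers are unaffected. A minimal usage sketch, based on the project's documented iterator API (the URL pattern and parameters below are illustrative):

    import cdx_toolkit

    # source='cc' queries the Common Crawl CDX; crawl/cc_mirror are optional
    cdx = cdx_toolkit.CDXFetcher(source='cc')
    for obj in cdx.iter('commoncrawl.org/*', from_ts='202001', limit=5):
        print(obj['url'], obj['status'], obj['timestamp'])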
92 changes: 77 additions & 15 deletions cdx_toolkit/cli.py
@@ -15,11 +15,30 @@ def main(args=None):
parser = ArgumentParser(description='cdx_toolkit iterator command line tool')

parser.add_argument('--version', '-V', action='version', version=get_version())
parser.add_argument('--verbose', '-v', action='count', help='set logging level to INFO (-v) or DEBUG (-vv)')

parser.add_argument('--cc', action='store_const', const='cc', help='direct the query to the Common Crawl CDX/WARCs')
parser.add_argument('--crawl', action='store', help='crawl names (comma separated) or an integer for the most recent N crawls. Implies --cc')
parser.add_argument('--ia', action='store_const', const='ia', help='direct the query to the Internet Archive CDX/wayback')
parser.add_argument(
'--verbose',
'-v',
action='count',
help='set logging level to INFO (-v) or DEBUG (-vv)'
)
parser.add_argument(
'--cc',
action='store_const',
const='cc',
help='direct the query to the Common Crawl CDX/WARCs'
)
parser.add_argument(
'--crawl',
action='store',
help=('crawl names (comma separated) or an integer for the most recent N crawls. '
'Implies --cc')
)
parser.add_argument(
'--ia',
action='store_const',
const='ia',
help='direct the query to the Internet Archive CDX/wayback'
)
parser.add_argument('--source', action='store', help='direct the query to this CDX server')
parser.add_argument('--wb', action='store', help='direct replays for content to this wayback')
parser.add_argument('--limit', type=int, action='store')
@@ -28,29 +47,72 @@
parser.add_argument('--from', action='store')
parser.add_argument('--to', action='store')
parser.add_argument('--filter', action='append', help='see CDX API documentation for usage')
parser.add_argument('--get', action='store_true', help='use a single get instead of a paged iteration. default limit=1000')
parser.add_argument('--closest', action='store', help='get the closest capture to this timestamp. use with --get')
parser.add_argument(
'--get',
action='store_true',
help='use a single get instead of a paged iteration. default limit=1000'
)
parser.add_argument(
'--closest',
action='store',
help='get the closest capture to this timestamp. use with --get'
)

subparsers = parser.add_subparsers(dest='cmd')
subparsers.required = True

iterate = subparsers.add_parser('iter', help='iterate printing captures')
iterate.add_argument('--all-fields', action='store_true')
iterate.add_argument('--fields', action='store', default='url,status,timestamp', help='try --all-fields if you need the list')
iterate.add_argument(
'--fields',
action='store',
default='url,status,timestamp',
help='try --all-fields if you need the list'
)
iterate.add_argument('--jsonl', action='store_true')
iterate.add_argument('--csv', action='store_true')
iterate.add_argument('url')
iterate.set_defaults(func=iterator)

warc = subparsers.add_parser('warc', help='iterate over capture content, creating a warc')
warc.add_argument('--prefix', default='TEST', help='prefix for the warc filename')
warc.add_argument('--subprefix', type=str, default=None, help='subprefix for the warc filename, default None')
warc.add_argument('--size', type=int, default=1000000000, help='target for the warc filesize in bytes')
warc.add_argument('--creator', action='store', help='creator of the warc: person, organization, service')
warc.add_argument('--operator', action='store', help='a person, if the creator is an organization')
warc.add_argument('--url-fgrep', action='store', help='this pattern must be present to warc an url')
warc.add_argument('--url-fgrepv', action='store', help='this pattern must not be present to warc an url, e.g. /robots.txt')
warc.add_argument('--warc-download-prefix', action='store', help='prefix for downloading content, automatically set for CC')
warc.add_argument(
'--subprefix',
type=str,
default=None,
help='subprefix for the warc filename, default None'
)
warc.add_argument(
'--size',
type=int,
default=1000000000,
help='target for the warc filesize in bytes'
)
warc.add_argument(
'--creator',
action='store',
help='creator of the warc: person, organization, service'
)
warc.add_argument(
'--operator',
action='store',
help='a person, if the creator is an organization'
)
warc.add_argument(
'--url-fgrep',
action='store',
help='this pattern must be present to warc an url'
)
warc.add_argument(
'--url-fgrepv',
action='store',
help='this pattern must not be present to warc an url, e.g. /robots.txt'
)
warc.add_argument(
'--warc-download-prefix',
action='store',
help='prefix for downloading content, automatically set for CC'
)
warc.add_argument('url')
warc.set_defaults(func=warcer)

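
Note: because main() takes an optional args list, the reformatted parser is easy to exercise from Python without a shell. A small sketch, assuming args falls through to parse_args when given (the cdxt command in the comment is the installed console entry point):

    from cdx_toolkit.cli import main

    # roughly equivalent to: cdxt --cc --limit 3 iter 'commoncrawl.org/*'
    main(args=['--cc', '--limit', '3', 'iter', 'commoncrawl.org/*'])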
19 changes: 14 additions & 5 deletions cdx_toolkit/commoncrawl.py
@@ -10,7 +10,13 @@
import logging

from .myrequests import myrequests_get
from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special
from .timeutils import (
time_to_timestamp,
timestamp_to_time,
pad_timestamp_up,
cc_index_to_time,
cc_index_to_time_special,
)

LOGGER = logging.getLogger(__name__)

@@ -70,7 +76,10 @@ def get_cc_endpoints(cc_mirror):
url = cc_mirror.rstrip('/') + '/collinfo.json'
r = myrequests_get(url)
if r.status_code != 200:
raise RuntimeError('error {} getting list of cc indices from {}'.format(r.status_code, collinfo)) # pragma: no cover
collinfo = None
raise RuntimeError(
'error {} getting list of cc indices from {}'.format(r.status_code, collinfo)
) # pragma: no cover
set_collinfo_cache(cc_mirror, r.text)
col = r.json()

@@ -108,9 +117,9 @@ def apply_cc_defaults(params, crawl_present=False, now=None):
year = 365*86400
if params.get('from_ts') is not None:
if params.get('to') is None:
#from_ts = pad_timestamp(params['from_ts'])
#params['to'] = time_to_timestamp(timestamp_to_time(from_ts) + year)
#LOGGER.info('no to, setting to=%s', params['to'])
# from_ts = pad_timestamp(params['from_ts'])
# params['to'] = time_to_timestamp(timestamp_to_time(from_ts) + year)
# LOGGER.info('no to, setting to=%s', params['to'])
LOGGER.info('from but no to, not doing anything')
elif params.get('to') is not None:
if params.get('from_ts') is None:
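
Note: get_cc_endpoints derives the index list from collinfo.json on the configured mirror, as the hunk above shows. A standalone sketch of that fetch, bypassing the myrequests_get wrapper and the on-disk cache the real code uses (the 'id' and 'cdx-api' keys are the standard collinfo fields):

    import requests

    cc_mirror = 'https://index.commoncrawl.org'
    resp = requests.get(cc_mirror.rstrip('/') + '/collinfo.json')
    resp.raise_for_status()
    for col in resp.json()[:3]:
        print(col['id'], col['cdx-api'])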
4 changes: 2 additions & 2 deletions cdx_toolkit/compat.py
@@ -26,10 +26,10 @@ def munge_filter(filter, source):

def munge_fields(fields, lines):
ret = []
for l in lines:
for line in lines:
obj = {}
for f in fields:
value = l.pop(0)
value = line.pop(0)
if f in fields_to_pywb:
obj[fields_to_pywb[f]] = value
else:
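
Note: munge_fields pairs a CDX field-name list with each row of values, renaming fields through the fields_to_pywb table. A simplified, self-contained sketch of that behavior (the 'original' -> 'url' entry is an assumed example, not the real table):

    fields_to_pywb = {'original': 'url'}  # assumed example mapping

    def munge_fields(fields, lines):
        ret = []
        for line in lines:
            obj = {}
            for f in fields:
                # consume values left-to-right, renaming known fields
                obj[fields_to_pywb.get(f, f)] = line.pop(0)
            ret.append(obj)
        return ret

    print(munge_fields(['original', 'timestamp'],
                       [['http://example.com/', '20240101000000']]))
    # [{'url': 'http://example.com/', 'timestamp': '20240101000000'}]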
7 changes: 6 additions & 1 deletion cdx_toolkit/warc.py
@@ -144,7 +144,12 @@ def fetch_warc_record(capture, warc_download_prefix):

warc_target_uri = record.rec_headers.get_header('WARC-Target-URI')
if url != warc_target_uri: # pragma: no cover
print('Surprised that WARC-Target-URI {} is not the capture url {}'.format(warc_target_uri, url), file=sys.stderr)
print(
"Surprised that WARC-Target-URI {} is not the capture url {}".format(
warc_target_uri, url
),
file=sys.stderr,
)

record.rec_headers.replace_header('WARC-Source-URI', warc_url)
record.rec_headers.replace_header('WARC-Source-Range', 'bytes={}-{}'.format(offset, offset+length-1))
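
Note: fetch_warc_record is normally reached through a capture object, which is where the WARC-Source-URI and WARC-Source-Range headers above get attached. A hedged sketch of the typical write loop, following the project's documented warc API (the prefix, subprefix, and warcinfo values are illustrative):

    import cdx_toolkit

    cdx = cdx_toolkit.CDXFetcher(source='cc')
    writer = cdx_toolkit.warc.get_writer('TEST', None, {'description': 'example'})
    for obj in cdx.iter('commoncrawl.org/*', limit=2):
        try:
            record = obj.fetch_warc_record()
        except RuntimeError:
            continue  # some captures (e.g. revisits) cannot be fetched
        writer.write_record(record)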
1 change: 1 addition & 0 deletions requirements.txt
@@ -9,6 +9,7 @@ pytest==6.2.4
pytest-cov==2.12.1
pytest-sugar==0.9.4
coveralls==3.1.0
flake8>=7.3.0

# packaging
twine==3.4.1
2 changes: 1 addition & 1 deletion setup.py
@@ -12,7 +12,7 @@
# remember: keep requires synchronized with requirements.txt
requires = ['requests', 'warcio']

test_requirements = ['pytest', 'pytest-cov']
test_requirements = ['pytest', 'pytest-cov', 'flake8']

package_requirements = ['twine', 'setuptools', 'setuptools-scm']
