4 changes: 4 additions & 0 deletions .github/workflows/ci.yaml
@@ -46,6 +46,10 @@ jobs:
- name: Install cdx_toolkit
run: pip install .[test]

- name: Lint code
run: |
make lint

- name: Run tests
run: |
make test_coverage
5 changes: 4 additions & 1 deletion Makefile
@@ -1,4 +1,4 @@
.PHONY: init test clean_coverage test_coverage distclean distcheck dist install
.PHONY: init test clean_coverage test_coverage lint distclean distcheck dist install

init36:
# packages are deprecating support, so this uses exact versions
@@ -25,6 +25,9 @@ test_coverage: clean_coverage
PYTHONPATH=. py.test -rA -s --doctest-modules --cov-report=xml --cov-append --cov cdx_toolkit tests -v -v
coverage report

lint:
flake8 cdx_toolkit

distclean:
rm -rf dist/

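
Note: the new lint target just runs flake8 over the package, matching the CI step added above. If make is not available, the same check can be reproduced directly; a minimal sketch, assuming flake8 is installed (e.g. via pip install .[test]):

    # equivalent of `make lint` without make; assumes flake8 is on PATH
    import subprocess
    subprocess.run(['flake8', 'cdx_toolkit'], check=True)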
39 changes: 28 additions & 11 deletions cdx_toolkit/__init__.py
@@ -2,7 +2,6 @@
import json
from pkg_resources import get_distribution, DistributionNotFound
from collections.abc import MutableMapping
import sys
import warnings

__version__ = 'installed-from-git'
@@ -62,8 +61,8 @@ def cdx_to_captures(resp, wb=None, warc_download_prefix=None):
if text.startswith('{'):
lines = resp.text.splitlines()
ret = []
for l in lines:
ret.append(CaptureObject(json.loads(l), wb=wb, warc_download_prefix=warc_download_prefix))
for line in lines:
ret.append(CaptureObject(json.loads(line), wb=wb, warc_download_prefix=warc_download_prefix))
return ret

# ia output='json' is a json list of lists
@@ -202,7 +201,14 @@ def __next__(self):


class CDXFetcher:
def __init__(self, source='cc', crawl=None, wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None):
def __init__(self,
source='cc',
crawl=None,
wb=None,
warc_download_prefix=None,
cc_mirror=None,
cc_sort='mixed',
loglevel=None):
self.source = source
self.crawl = crawl
self.cc_sort = cc_sort
@@ -231,13 +237,20 @@ def __init__(self, source='cc', crawl=None, wb=None, warc_download_prefix=None,
LOGGER.setLevel(level=loglevel)

def customize_index_list(self, params):
if self.source == 'cc' and (self.crawl or 'crawl' in params or 'from' in params or 'from_ts' in params or 'to' in params or 'closest' in params):
LOGGER.info('making a custom cc index list')
if self.crawl and 'crawl' not in params:
params['crawl'] = self.crawl
if self.source == "cc" and (
self.crawl
or "crawl" in params
or "from" in params
or "from_ts" in params
or "to" in params
or "closest" in params
):
LOGGER.info("making a custom cc index list")
if self.crawl and "crawl" not in params:
params["crawl"] = self.crawl
return filter_cc_endpoints(self.raw_index_list, self.cc_sort, params=params)
else:
return self.index_list

return self.index_list

def get(self, url, **kwargs):
# from_ts=None, to=None, matchType=None, limit=None, sort=None, closest=None,
@@ -264,7 +277,11 @@ def get(self, url, **kwargs):
ret = []
for endpoint in index_list:
resp = myrequests_get(endpoint, params=params, cdx=True)
objs = cdx_to_captures(resp, wb=self.wb, warc_download_prefix=self.warc_download_prefix) # turns 400 and 404 into []
objs = cdx_to_captures(
resp,
wb=self.wb,
warc_download_prefix=self.warc_download_prefix
) # turns 400 and 404 into []
ret.extend(objs)
if 'limit' in params:
params['limit'] -= len(objs)
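
Note: the reformatted CDXFetcher constructor keeps the same public signature, so existing callers are unaffected. A minimal usage sketch, based on the project's documented iterator API (the URL pattern and parameters below are illustrative):

    import cdx_toolkit

    # source='cc' queries the Common Crawl CDX; crawl/cc_mirror are optional
    cdx = cdx_toolkit.CDXFetcher(source='cc')
    for obj in cdx.iter('commoncrawl.org/*', from_ts='202001', limit=5):
        print(obj['url'], obj['status'], obj['timestamp'])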
92 changes: 77 additions & 15 deletions cdx_toolkit/cli.py
@@ -15,11 +15,30 @@ def main(args=None):
parser = ArgumentParser(description='cdx_toolkit iterator command line tool')

parser.add_argument('--version', '-V', action='version', version=get_version())
parser.add_argument('--verbose', '-v', action='count', help='set logging level to INFO (-v) or DEBUG (-vv)')

parser.add_argument('--cc', action='store_const', const='cc', help='direct the query to the Common Crawl CDX/WARCs')
parser.add_argument('--crawl', action='store', help='crawl names (comma separated) or an integer for the most recent N crawls. Implies --cc')
parser.add_argument('--ia', action='store_const', const='ia', help='direct the query to the Internet Archive CDX/wayback')
parser.add_argument(
'--verbose',
'-v',
action='count',
help='set logging level to INFO (-v) or DEBUG (-vv)'
)
parser.add_argument(
'--cc',
action='store_const',
const='cc',
help='direct the query to the Common Crawl CDX/WARCs'
)
parser.add_argument(
'--crawl',
action='store',
help=('crawl names (comma separated) or an integer for the most recent N crawls. '
'Implies --cc')
)
parser.add_argument(
'--ia',
action='store_const',
const='ia',
help='direct the query to the Internet Archive CDX/wayback'
)
parser.add_argument('--source', action='store', help='direct the query to this CDX server')
parser.add_argument('--wb', action='store', help='direct replays for content to this wayback')
parser.add_argument('--limit', type=int, action='store')
@@ -28,29 +47,72 @@
parser.add_argument('--from', action='store')
parser.add_argument('--to', action='store')
parser.add_argument('--filter', action='append', help='see CDX API documentation for usage')
parser.add_argument('--get', action='store_true', help='use a single get instead of a paged iteration. default limit=1000')
parser.add_argument('--closest', action='store', help='get the closest capture to this timestamp. use with --get')
parser.add_argument(
'--get',
action='store_true',
help='use a single get instead of a paged iteration. default limit=1000'
)
parser.add_argument(
'--closest',
action='store',
help='get the closest capture to this timestamp. use with --get'
)

subparsers = parser.add_subparsers(dest='cmd')
subparsers.required = True

iterate = subparsers.add_parser('iter', help='iterate printing captures')
iterate.add_argument('--all-fields', action='store_true')
iterate.add_argument('--fields', action='store', default='url,status,timestamp', help='try --all-fields if you need the list')
iterate.add_argument(
'--fields',
action='store',
default='url,status,timestamp',
help='try --all-fields if you need the list'
)
iterate.add_argument('--jsonl', action='store_true')
iterate.add_argument('--csv', action='store_true')
iterate.add_argument('url')
iterate.set_defaults(func=iterator)

warc = subparsers.add_parser('warc', help='iterate over capture content, creating a warc')
warc.add_argument('--prefix', default='TEST', help='prefix for the warc filename')
warc.add_argument('--subprefix', type=str, default=None, help='subprefix for the warc filename, default None')
warc.add_argument('--size', type=int, default=1000000000, help='target for the warc filesize in bytes')
warc.add_argument('--creator', action='store', help='creator of the warc: person, organization, service')
warc.add_argument('--operator', action='store', help='a person, if the creator is an organization')
warc.add_argument('--url-fgrep', action='store', help='this pattern must be present to warc an url')
warc.add_argument('--url-fgrepv', action='store', help='this pattern must not be present to warc an url, e.g. /robots.txt')
warc.add_argument('--warc-download-prefix', action='store', help='prefix for downloading content, automatically set for CC')
warc.add_argument(
'--subprefix',
type=str,
default=None,
help='subprefix for the warc filename, default None'
)
warc.add_argument(
'--size',
type=int,
default=1000000000,
help='target for the warc filesize in bytes'
)
warc.add_argument(
'--creator',
action='store',
help='creator of the warc: person, organization, service'
)
warc.add_argument(
'--operator',
action='store',
help='a person, if the creator is an organization'
)
warc.add_argument(
'--url-fgrep',
action='store',
help='this pattern must be present to warc an url'
)
warc.add_argument(
'--url-fgrepv',
action='store',
help='this pattern must not be present to warc an url, e.g. /robots.txt'
)
warc.add_argument(
'--warc-download-prefix',
action='store',
help='prefix for downloading content, automatically set for CC'
)
warc.add_argument('url')
warc.set_defaults(func=warcer)

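
Note: because main() takes an optional args list, the reformatted parser is easy to exercise from Python without a shell. A small sketch, assuming args falls through to parse_args when given (the cdxt command in the comment is the installed console entry point):

    from cdx_toolkit.cli import main

    # roughly equivalent to: cdxt --cc --limit 3 iter 'commoncrawl.org/*'
    main(args=['--cc', '--limit', '3', 'iter', 'commoncrawl.org/*'])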
19 changes: 14 additions & 5 deletions cdx_toolkit/commoncrawl.py
@@ -10,7 +10,13 @@
import logging

from .myrequests import myrequests_get
from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special
from .timeutils import (
time_to_timestamp,
timestamp_to_time,
pad_timestamp_up,
cc_index_to_time,
cc_index_to_time_special,
)

LOGGER = logging.getLogger(__name__)

@@ -70,7 +76,10 @@ def get_cc_endpoints(cc_mirror):
url = cc_mirror.rstrip('/') + '/collinfo.json'
r = myrequests_get(url)
if r.status_code != 200:
raise RuntimeError('error {} getting list of cc indices from {}'.format(r.status_code, collinfo)) # pragma: no cover
collinfo = None
raise RuntimeError(
'error {} getting list of cc indices from {}'.format(r.status_code, collinfo)
) # pragma: no cover
set_collinfo_cache(cc_mirror, r.text)
col = r.json()

@@ -108,9 +117,9 @@ def apply_cc_defaults(params, crawl_present=False, now=None):
year = 365*86400
if params.get('from_ts') is not None:
if params.get('to') is None:
#from_ts = pad_timestamp(params['from_ts'])
#params['to'] = time_to_timestamp(timestamp_to_time(from_ts) + year)
#LOGGER.info('no to, setting to=%s', params['to'])
# from_ts = pad_timestamp(params['from_ts'])
# params['to'] = time_to_timestamp(timestamp_to_time(from_ts) + year)
# LOGGER.info('no to, setting to=%s', params['to'])
LOGGER.info('from but no to, not doing anything')
elif params.get('to') is not None:
if params.get('from_ts') is None:
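
Note: get_cc_endpoints derives the index list from collinfo.json on the configured mirror, as the hunk above shows. A standalone sketch of that fetch, bypassing the myrequests_get wrapper and the on-disk cache the real code uses (the 'id' and 'cdx-api' keys are the standard collinfo fields):

    import requests

    cc_mirror = 'https://index.commoncrawl.org'
    resp = requests.get(cc_mirror.rstrip('/') + '/collinfo.json')
    resp.raise_for_status()
    for col in resp.json()[:3]:
        print(col['id'], col['cdx-api'])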
4 changes: 2 additions & 2 deletions cdx_toolkit/compat.py
@@ -26,10 +26,10 @@ def munge_filter(filter, source):

def munge_fields(fields, lines):
ret = []
for l in lines:
for line in lines:
obj = {}
for f in fields:
value = l.pop(0)
value = line.pop(0)
if f in fields_to_pywb:
obj[fields_to_pywb[f]] = value
else:
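
Note: munge_fields pairs a CDX field-name list with each row of values, renaming fields through the fields_to_pywb table. A simplified, self-contained sketch of that behavior (the 'original' -> 'url' entry is an assumed example, not the real table):

    fields_to_pywb = {'original': 'url'}  # assumed example mapping

    def munge_fields(fields, lines):
        ret = []
        for line in lines:
            obj = {}
            for f in fields:
                # consume values left-to-right, renaming known fields
                obj[fields_to_pywb.get(f, f)] = line.pop(0)
            ret.append(obj)
        return ret

    print(munge_fields(['original', 'timestamp'],
                       [['http://example.com/', '20240101000000']]))
    # [{'url': 'http://example.com/', 'timestamp': '20240101000000'}]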
7 changes: 6 additions & 1 deletion cdx_toolkit/warc.py
@@ -144,7 +144,12 @@ def fetch_warc_record(capture, warc_download_prefix):

warc_target_uri = record.rec_headers.get_header('WARC-Target-URI')
if url != warc_target_uri: # pragma: no cover
print('Surprised that WARC-Target-URI {} is not the capture url {}'.format(warc_target_uri, url), file=sys.stderr)
print(
"Surprised that WARC-Target-URI {} is not the capture url {}".format(
warc_target_uri, url
),
file=sys.stderr,
)

record.rec_headers.replace_header('WARC-Source-URI', warc_url)
record.rec_headers.replace_header('WARC-Source-Range', 'bytes={}-{}'.format(offset, offset+length-1))
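
Note: fetch_warc_record is normally reached through a capture object, which is where the WARC-Source-URI and WARC-Source-Range headers above get attached. A hedged sketch of the typical write loop, following the project's documented warc API (the prefix, subprefix, and warcinfo values are illustrative):

    import cdx_toolkit

    cdx = cdx_toolkit.CDXFetcher(source='cc')
    writer = cdx_toolkit.warc.get_writer('TEST', None, {'description': 'example'})
    for obj in cdx.iter('commoncrawl.org/*', limit=2):
        try:
            record = obj.fetch_warc_record()
        except RuntimeError:
            continue  # some captures (e.g. revisits) cannot be fetched
        writer.write_record(record)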
1 change: 1 addition & 0 deletions requirements.txt
@@ -9,6 +9,7 @@ pytest==6.2.4
pytest-cov==2.12.1
pytest-sugar==0.9.4
coveralls==3.1.0
flake8>=7.3.0

# packaging
twine==3.4.1
2 changes: 1 addition & 1 deletion setup.py
@@ -12,7 +12,7 @@
# remember: keep requires synchronized with requirements.txt
requires = ['requests', 'warcio']

test_requirements = ['pytest', 'pytest-cov']
test_requirements = ['pytest', 'pytest-cov', 'flake8']

package_requirements = ['twine', 'setuptools', 'setuptools-scm']
