Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NOT READY: warcio test #66

Open
wants to merge 71 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
71 commits
Select commit Hold shift + click to select a range
ff1f543
warcio test
wumpus Jan 26, 2019
ebb721f
documentation
wumpus Jan 26, 2019
7aa060d
tests
wumpus Jan 26, 2019
24f3000
tests
wumpus Jan 26, 2019
40f9fc6
coverage
wumpus Jan 26, 2019
c70e68e
python 2.7 test fix
wumpus Jan 26, 2019
1847633
python 2.7 fixes
wumpus Jan 26, 2019
2c676db
coverage
wumpus Jan 26, 2019
97ee457
py2 testing
wumpus Jan 27, 2019
df50151
py2 windows testing
wumpus Jan 27, 2019
858a752
coverage
wumpus Jan 28, 2019
5bfffea
branch coverage
wumpus Jan 28, 2019
bb31f14
py2 branch coverage
wumpus Jan 28, 2019
cc54259
py2 testing
wumpus Jan 28, 2019
2b8d596
add record ids to test
wumpus Jan 28, 2019
c704fe9
preserve capitalization in messages
wumpus Jan 28, 2019
3839fa1
capitals and colons
wumpus Jan 28, 2019
8b9032d
use valid record ids
wumpus Jan 28, 2019
2a10b23
warc-segment-number cleaner recommendation
wumpus Jan 28, 2019
81c9f0a
segment origin id
wumpus Jan 28, 2019
c78343a
timestamp checking
wumpus Jan 28, 2019
efe0fda
buglet
wumpus Jan 29, 2019
7a26644
global checks
wumpus Jan 30, 2019
1d6fd9d
check -v; capitalize most commentary
wumpus Jan 31, 2019
5b716b7
...
wumpus Feb 1, 2019
fb8e3fa
revisits and global detection with just one file
wumpus Feb 1, 2019
d243632
show errors for decompression and unchunking failures
wumpus Feb 1, 2019
29517c4
make this function reentrant
wumpus Feb 2, 2019
844807e
narrow exception; fix bug not reading to the end of a chunked buffer
wumpus Feb 2, 2019
a55afd3
...
wumpus Feb 2, 2019
a33a5eb
put tester output in external files
wumpus Feb 6, 2019
fec139a
wip
wumpus Apr 4, 2019
417eee1
merge
wumpus Apr 4, 2019
a471222
tweak to match new test files
wumpus Apr 5, 2019
a80a784
merge
wumpus Sep 9, 2019
30a86fe
tests pass
wumpus Sep 9, 2019
19dc8b3
warcio test
wumpus Jan 26, 2019
88dff09
documentation
wumpus Jan 26, 2019
c99bc2e
tests
wumpus Jan 26, 2019
0039335
tests
wumpus Jan 26, 2019
9b7c9ce
coverage
wumpus Jan 26, 2019
903ed1d
python 2.7 test fix
wumpus Jan 26, 2019
68938bd
python 2.7 fixes
wumpus Jan 26, 2019
234468a
coverage
wumpus Jan 26, 2019
e7f88e7
py2 testing
wumpus Jan 27, 2019
8662073
py2 windows testing
wumpus Jan 27, 2019
291460e
coverage
wumpus Jan 28, 2019
69080d5
branch coverage
wumpus Jan 28, 2019
2e1d820
py2 branch coverage
wumpus Jan 28, 2019
bbdb57b
py2 testing
wumpus Jan 28, 2019
fc2d7b4
add record ids to test
wumpus Jan 28, 2019
d1fe18e
preserve capitalization in messages
wumpus Jan 28, 2019
484da9c
capitals and colons
wumpus Jan 28, 2019
4687497
use valid record ids
wumpus Jan 28, 2019
bcfe672
warc-segment-number cleaner recommendation
wumpus Jan 28, 2019
7f715c0
segment origin id
wumpus Jan 28, 2019
2583f19
timestamp checking
wumpus Jan 28, 2019
8eb87e8
buglet
wumpus Jan 29, 2019
3a8747e
global checks
wumpus Jan 30, 2019
f7cd1db
check -v; capitalize most commentary
wumpus Jan 31, 2019
b570b6c
...
wumpus Feb 1, 2019
921e748
revisits and global detection with just one file
wumpus Feb 1, 2019
4265b62
show errors for decompression and unchunking failures
wumpus Feb 1, 2019
08e6bd9
make this function reentrant
wumpus Feb 2, 2019
d1f48ed
narrow exception; fix bug not reading to the end of a chunked buffer
wumpus Feb 2, 2019
6e44a44
...
wumpus Feb 2, 2019
59198eb
put tester output in external files
wumpus Feb 6, 2019
b61878e
wip
wumpus Apr 4, 2019
2d2b7d5
tests pass
wumpus Sep 9, 2019
f4bc076
merge
wumpus Nov 5, 2019
fc19c7d
comments
wumpus Feb 16, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,14 @@ of WARC records, if possible. An exit value of 1 indicates a failure.
``warcio check -v`` will print verbose output for each record in the
WARC file.

Test
~~~~

The ``warcio test`` command will check one or more WARC files against
the WARC standard, giving commentary about standards violations,
recommendations, and other issues.


Recompress
~~~~~~~~~~

Expand Down
17 changes: 11 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from setuptools import setup, find_packages
from setuptools.command.test import test as TestCommand
import glob
import sys

__version__ = '1.7.0.dev0'

Expand All @@ -21,6 +22,15 @@ def run_tests(self):
errcode = pytest.main(['--doctest-module', './warcio', '--cov', 'warcio', '-v', 'test/'])
sys.exit(errcode)

tests_require = [
'pytest',
'pytest-cov',
'httpbin==0.5.0',
'requests',
]
if sys.version_info < (3, 3):
tests_require.append('ipaddress')

setup(
name='warcio',
version=__version__,
Expand All @@ -44,12 +54,7 @@ def run_tests(self):
""",
cmdclass={'test': PyTest},
test_suite='',
tests_require=[
'pytest',
'pytest-cov',
'httpbin==0.5.0',
'requests',
],
tests_require=tests_require,
classifiers=[
'Development Status :: 4 - Beta',
'Environment :: Web Environment',
Expand Down
File renamed without changes.
5 changes: 5 additions & 0 deletions test/data/standard-torture-missing.warc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
WARC/1.0
WARC-Type: warcinfo
Content-Length: 0


53 changes: 53 additions & 0 deletions test/data/standard-torture-validate-field.warc
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
WARC/1.0
WARC-Target-URI: <http://example.com/>
WARC-Target-URI: example.com
WARC-Target-URI: ex ample.com
WARC-Target-URI: h<>ttp://example.com/
WARC-Type: does-not-exist
WARC-Type: CAPITALIZED
WARC-Concurrent-To: http://example.com/
WARC-Record-ID: <foo:bar>
WARC-Date: 2017-03-06T04:03:53Z
WARC-Date: 2017-03-06T04:03:53.Z
Content-Type: asdf
Content-Type: has space/asdf
Content-Type: asdf/has space
Content-Type: asdf/has space;asdf
WARC-Block-Digest: asdf
WARC-Block-Digest: has space:asdf
WARC-Block-Digest: sha1:&$*^&*^#*&^
WARC-IP-Address: 1.2.3.4.5
WARC-Truncated: invalid
WARC-Warcinfo-ID: asdf:asdf
WARC-Filename: not-yet-tested
WARC-Profile: asdf
WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
WARC-Identified-Payload-Type: asdf
WARC-Segment-Origin-ID: http://example.com
WARC-Segment-Number: not-an-integer
WARC-Segment-Number: 0
WARC-Segment-Number: 1
WARC-Segment-Number: 2
WARC-Segment-Total-Length: 0
WARC-Segment-Total-Length: not-an-integer
WARC-Refers-To-Target-URI: http://example.com
WARC-Refers-To-Date: not-a-date
WARC-Unknown-Field: asdf
Content-Length: 0


WARC/1.1
WARC-Date: 2017-03-06T04:03:53Z
WARC-Date: 2017-03-06T04:03:53.Z
WARC-Date: 2017-03-06T04:03:53.0Z
WARC-Type: invalid
Content-Length: 0


WARC/1.1
WARC-Type: request
WARC-Segment-Number: 1
Content-Length: 0


WARC/invalid
111 changes: 111 additions & 0 deletions test/data/standard-torture-validate-record.warc
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
WARC/1.0
WARC-Type: warcinfo
Content-Type: application/warc-fields
WARC-Refers-To: probhibited
Content-Length: 146

first line can't start with a space
test: invalid utf8 �(
test: lines should end with \r\n
foo:
bar

no colon
token cannot have a space:


WARC/1.0
WARC-Type: warcinfo
Content-Type: application/warc-fields
Content-Length: 0


WARC/1.0
WARC-Type: response
WARC-Target-URI: HtTp://example.com/
Content-Type: text/plain
Content-Length: 0


WARC/1.0
WARC-Type: resource
WARC-Target-URI: DnS:asdfasdf
Content-Type: text/plain
Content-Length: 0


WARC/1.0
WARC-Type: resource
WARC-Target-URI: DnS:asdfasdf
Content-Type: text/dns
Content-Length: 0


WARC/1.0
WARC-Type: resource
WARC-Target-URI: foo:bar
Content-Length: 0


WARC/1.0
WARC-Type: request
WARC-Target-URI: hTtP://example.com/
Content-Type: text/plain
Content-Length: 0


WARC/1.0
WARC-Type: request
WARC-Target-URI: hTtP://example.com/
WARC-IP-Address: 1.2.3.4
Content-Type: text/plain
Content-Length: 0


WARC/1.0
WARC-Type: metadata
Content-Type: application/warc-fields
Content-Length: 0


WARC/1.0
WARC-Type: metadata
Content-Type: not-application/warc-fields
Content-Length: 0


WARC/1.0
WARC-Type: revisit
WARC-Profile: none
Content-Length: 0


WARC/1.0
WARC-Type: revisit
WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
Content-Length: 0


WARC/1.0
WARC-Type: revisit
WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified
Content-Length: 0


WARC/1.0
WARC-Type: conversion
Content-Length: 0


WARC/1.0
WARC-Type: continuation
WARC-Segment-Number: 1
Content-Length: 0


WARC/1.0
WARC-Type: continuation
WARC-Segment-Number: 2
Content-Length: 0


10 changes: 6 additions & 4 deletions test/test_archiveiterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ def test_err_arc_iterator_on_warc(self):
def test_corrects_wget_bug(self):
with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response') as record:
assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/'
with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response', fixup_bugs=False) as record:
assert record.rec_headers.get('WARC-Target-URI') == '<http://example.com/>'

def _digests_mutilate_helper(self, contents, expected_t, expected_f, capsys, full_read=False):
with pytest.raises(ArchiveLoadFailed):
Expand Down Expand Up @@ -243,9 +245,9 @@ def test_digests_file(self):
expected_t = ['request', 'request', 'request']

# record 1: invalid payload digest
assert self._load_archive('example-digest.warc', check_digests=True) == expected_t
assert self._load_archive('example-digest.warc', check_digests=False) == expected_f
assert self._load_archive('example-digest-bad.warc', check_digests=True) == expected_t
assert self._load_archive('example-digest-bad.warc', check_digests=False) == expected_f

# record 2: b64 digest; record 3: b64 filename safe digest
assert self._load_archive('example-digest.warc', offset=922, check_digests=True) == expected_t
assert self._load_archive('example-digest.warc', offset=922, check_digests=False) == expected_t
assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=True) == expected_t
assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=False) == expected_t
2 changes: 1 addition & 1 deletion test/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def test_check_valid():


def test_check_invalid():
filenames = [get_test_file('example-digest.warc')]
filenames = [get_test_file('example-digest-bad.warc')]

args = ['check'] + filenames
value = check_helper(args, 1)
Expand Down
Loading