Skip to content

Commit

Permalink
warcio check cli, plus bugfixes to check_digests (#58)
Browse files Browse the repository at this point in the history
* warcio check cli, plus bugfixes to check_digests

* rename .status to .passed and cleanups
  • Loading branch information
wumpus authored and ikreymer committed Jan 22, 2019
1 parent 168e44a commit 704297b
Show file tree
Hide file tree
Showing 6 changed files with 143 additions and 31 deletions.
10 changes: 9 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ Please refer to `warcwriter.py <warcio/warcwriter.py>`__ and
WARCIO CLI: Indexing and Recompression
--------------------------------------
The library currently ships with two simple command line tools.
The library currently ships with a few simple command line tools.
Index
~~~~~
Expand Down Expand Up @@ -360,6 +360,14 @@ the warc file.
associated with web archives. To create these indexes, please see the
`cdxj-indexer <https://github.com/webrecorder/cdxj-indexer>`__ tool which extends warcio indexing to provide this functionality)
Check
~~~~~
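The ``warcio check`` command checks the payload and block digests
of WARC records, if possible. It exits with 0 when no digest check fails
and with 1 if any digest check fails; by default only failing records are printed.
``warcio check -v`` will print verbose output for each record in the
WARC file, including records whose digests passed or were not checked.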
Recompress
~~~~~~~~~~
Expand Down
12 changes: 6 additions & 6 deletions test/test_archiveiterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def _load_archive(self, filename, offset=0, cls=ArchiveIterator,
with open(get_test_file(filename), 'rb') as fh:
fh.seek(offset)
iter_ = cls(fh, **kwargs)
rec_types = [record.rec_type for record in iter_ if record.digest_checker.status is not False]
rec_types = [record.rec_type for record in iter_ if record.digest_checker.passed is not False]

assert iter_.err_count == errs_expected

Expand All @@ -32,9 +32,9 @@ def _load_archive_memory(self, stream, offset=0, cls=ArchiveIterator,
iter_ = cls(stream, **kwargs)
if full_read:
rec_types = [record.rec_type for record in iter_
if (record.content_stream().read() or True) and record.digest_checker.status is not False]
if (record.content_stream().read() or True) and record.digest_checker.passed is not False]
else:
rec_types = [record.rec_type for record in iter_ if record.digest_checker.status is not False]
rec_types = [record.rec_type for record in iter_ if record.digest_checker.passed is not False]

assert iter_.err_count == errs_expected

Expand Down Expand Up @@ -73,18 +73,18 @@ def test_iterator(self):
with closing(ArchiveIterator(fh)) as a:
for record in a:
assert record.rec_type == 'warcinfo'
assert record.digest_checker.status is None
assert record.digest_checker.passed is None
assert len(record.digest_checker.problems) == 0
break

record = next(a)
assert record.rec_type == 'response'
assert record.digest_checker.status is None
assert record.digest_checker.passed is None
assert len(record.digest_checker.problems) == 0

for record in a:
assert record.rec_type == 'request'
assert record.digest_checker.status is None
assert record.digest_checker.passed is None
assert len(record.digest_checker.problems) == 0
break

Expand Down
42 changes: 42 additions & 0 deletions test/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,48 @@ def test_index_2():
res = main(args=args)
assert buff.getvalue() == expected


def check_helper(args, expected_exit_value):
    """Run the warcio CLI with ``args`` and return what it wrote to stdout.

    ``main`` is invoked with stdout patched. A ``SystemExit`` raised by
    ``main`` is caught and its code is compared against
    ``expected_exit_value``; if ``main`` returns without exiting, the exit
    value is treated as ``None``.

    :param args: argument list handed to ``main(args=...)``
    :param expected_exit_value: exit code the command is expected to produce
    :return: the captured stdout contents
    """
    with patch_stdout() as buff:
        exit_value = None
        try:
            main(args=args)
        except SystemExit as e:
            exit_value = e.code

        # Deliberately checked after the try block instead of in a ``finally``
        # clause: an unexpected exception from main() now propagates as
        # itself rather than being masked by an AssertionError about the
        # exit value.
        assert exit_value == expected_exit_value

        return buff.getvalue()


def test_check_valid():
    """``check`` on the example WARCs exits 0; it is silent unless ``-v`` is given."""
    warc_paths = [get_test_file(name)
                  for name in ('example.warc', 'example.warc.gz')]

    # Default mode: nothing is written when no digest fails.
    quiet_output = check_helper(['check'] + warc_paths, 0)
    assert quiet_output == b''

    # Verbose mode: every record is listed along with its digest outcome.
    verbose_output = check_helper(['check', '-v'] + warc_paths, 0)
    assert verbose_output.count(b'digest pass') == 2
    assert verbose_output.count(b'WARC-Record-ID') == 12


def test_check_invalid():
    """``check`` on a WARC with a bad payload digest exits 1 and reports it."""
    bad_warc = [get_test_file('example-digest.warc')]

    # Default mode: only the failing record is reported.
    terse_output = check_helper(['check'] + bad_warc, 1)
    assert terse_output.count(b'payload digest failed') == 1
    assert terse_output.count(b'WARC-Record-ID') == 1

    # Verbose mode: the failure plus every other record is listed.
    verbose_output = check_helper(['check', '-v'] + bad_warc, 1)
    expected_counts = (
        (b'payload digest failed', 1),
        (b'digest pass', 3),
        (b'WARC-Record-ID', 4),
    )
    for needle, expected in expected_counts:
        assert verbose_output.count(needle) == expected


def test_recompress_non_chunked():
with named_temp() as temp:
test_file = get_test_file('example-bad-non-chunked.warc.gz')
Expand Down
44 changes: 44 additions & 0 deletions warcio/checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import print_function

from warcio.archiveiterator import ArchiveIterator


class Checker(object):
    """Check the digests of every record in one or more WARC files.

    Records whose digest check failed are always printed, together with the
    problems recorded by the record's digest checker, and cause the exit
    value to become 1. In verbose mode, records that passed and records
    that were not checked are printed as well.
    """

    # Number of bytes requested per read while draining a record, so a large
    # record is never held in memory all at once.
    _READ_SIZE = 1024 * 1024

    def __init__(self, cmd):
        """Store the settings of the parsed command line.

        :param cmd: object providing ``inputs`` (iterable of file names) and
            ``verbose`` (truthy to report every record)
        """
        self.inputs = cmd.inputs
        self.verbose = cmd.verbose
        self.exit_value = 0

    def process_all(self):
        """Check every input file.

        :return: 0 if no record failed its digest check, otherwise 1
        """
        for filename in self.inputs:
            self.process_one(filename)
        return self.exit_value

    def process_one(self, filename):
        """Check all records of a single WARC file and print the results.

        The file name is printed once, before the first reported record.

        :param filename: path of the WARC file to open and check
        """
        with open(filename, 'rb') as stream:
            file_printed = False
            for record in ArchiveIterator(stream, check_digests=True):
                # Consume the whole record so its digests get checked; read
                # in bounded pieces and discard them.
                content = record.content_stream()
                while content.read(self._READ_SIZE):
                    pass

                rec_id = record.rec_headers.get_header('WARC-Record-ID')
                rec_type = record.rec_headers.get_header('WARC-Type')

                passed = record.digest_checker.passed
                if passed is False:
                    # Failures are reported even without --verbose.
                    self.exit_value = 1
                    verdict = None
                elif self.verbose and passed is True:
                    verdict = ' digest pass'
                elif self.verbose and passed is None:
                    verdict = ' digest not checked'
                else:
                    continue

                file_printed = _fprint(filename, file_printed)
                print(' ', 'WARC-Record-ID', rec_id, rec_type)
                if verdict is None:
                    for p in record.digest_checker.problems:
                        print(' ', p)
                else:
                    print(verdict)


def _fprint(filename, file_printed):
if not file_printed:
print(filename)
return True
16 changes: 14 additions & 2 deletions warcio/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from warcio.bufferedreaders import DecompressingBufferedReader

from warcio.indexer import Indexer
from warcio.checker import Checker
from warcio.utils import BUFF_SIZE

import tempfile
Expand Down Expand Up @@ -48,6 +49,11 @@ def main(args=None):

extract.set_defaults(func=extract_record)

check = subparsers.add_parser('check', help='WARC digest checker')
check.add_argument('inputs', nargs='+')
check.add_argument('-v', '--verbose', action='store_true')
check.set_defaults(func=checker)

cmd = parser.parse_args(args=args)
cmd.func(cmd)

Expand Down Expand Up @@ -90,8 +96,14 @@ def get_version():

# ============================================================================
def indexer(cmd):
indexer = Indexer(cmd.fields, cmd.inputs, cmd.output)
indexer.process_all()
_indexer = Indexer(cmd.fields, cmd.inputs, cmd.output)
_indexer.process_all()


# ============================================================================
def checker(cmd):
    """Run the digest checker for ``cmd`` and exit with its result as the exit code."""
    exit_code = Checker(cmd).process_all()
    sys.exit(exit_code)


# ============================================================================
Expand Down
50 changes: 28 additions & 22 deletions warcio/digestverifyingreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,28 @@
class DigestChecker(object):
def __init__(self, kind=None):
self._problem = []
self._status = None
self._passed = None
self.kind = kind

@property
def status(self):
return self._status
def passed(self):
return self._passed

@status.setter
def status(self, value):
self._status = value
@passed.setter
def passed(self, value):
self._passed = value

@property
def problems(self):
return self._problem

def problem(self, value):
def problem(self, value, passed=False):
self._problem.append(value)
if self.kind == 'raise':
raise ArchiveLoadFailed(value)
if self.kind == 'log':
sys.stderr.write(value + '\n')
self.status = False
self._passed = passed


# ============================================================================
Expand All @@ -45,11 +45,11 @@ def __init__(self, stream, limit, digest_checker, record_type=None,

super(DigestVerifyingReader, self).__init__(stream, limit)

self.check_digest = digest_checker
self.digest_checker = digest_checker

if record_type == 'revisit':
block_digest = None # XXX my bug, or is example.warc wrong?
payload_digest = None # no payload, so can't check it
block_digest = None
payload_digest = None
if segment_number is not None: #pragma: no cover
payload_digest = None

Expand All @@ -65,22 +65,25 @@ def __init__(self, stream, limit, digest_checker, record_type=None,
algo, _ = _parse_digest(block_digest)
self.block_digester = Digester(algo)
except ValueError:
self.problem('unknown hash algorithm name in block digest')
self.digest_checker.problem('unknown hash algorithm name in block digest')
self.block_digester = None
if payload_digest:
try:
algo, _ = _parse_digest(self.payload_digest)
self.payload_digester_obj = Digester(algo)
except ValueError:
self.problem('unknown hash algorithm name in payload digest')
self.digest_checker.problem('unknown hash algorithm name in payload digest')
self.payload_digester_obj = None

def begin_payload(self):
self.payload_digester = self.payload_digester_obj
if self.limit == 0:
if _compare_digest_rfc_3548(self.payload_digester, self.payload_digest) is False:
self.problem('payload digest failed: {}'.format(self.payload_digest))
check = _compare_digest_rfc_3548(self.payload_digester, self.payload_digest)
if check is False:
self.digest_checker.problem('payload digest failed: {}'.format(self.payload_digest))
self.payload_digester = None # prevent double-fire
elif check is True and self.digest_checker.passed is not False:
self.digest_checker.passed = True

def _update(self, buff):
super(DigestVerifyingReader, self)._update(buff)
Expand All @@ -91,16 +94,19 @@ def _update(self, buff):
self.block_digester.update(buff)

if self.limit == 0:
if _compare_digest_rfc_3548(self.block_digester, self.block_digest) is False:
self.problem('block digest failed: {}'.format(self.block_digest))
if _compare_digest_rfc_3548(self.payload_digester, self.payload_digest) is False:
self.problem('payload digest failed {}'.format(self.payload_digest))
check = _compare_digest_rfc_3548(self.block_digester, self.block_digest)
if check is False:
self.digest_checker.problem('block digest failed: {}'.format(self.block_digest))
elif check is True and self.digest_checker.passed is not False:
self.digest_checker.passed = True
check = _compare_digest_rfc_3548(self.payload_digester, self.payload_digest)
if check is False:
self.digest_checker.problem('payload digest failed {}'.format(self.payload_digest))
elif check is True and self.digest_checker.passed is not False:
self.digest_checker.passed = True

return buff

def problem(self, reason):
self.check_digest.problem(reason)


def _compare_digest_rfc_3548(digester, digest):
'''
Expand Down

0 comments on commit 704297b

Please sign in to comment.