From 759ab0777fb1ec03ec7f47bba270ac3f61e07a81 Mon Sep 17 00:00:00 2001 From: "Chase H.D" Date: Sat, 26 Jan 2019 13:22:24 -0500 Subject: [PATCH] Fix ARCHeadersParser splitting on spaces, which causes errors with spaces in URIs (#62) * Use rsplit() to allow URLs with spaces in them to be counted as a single field -- this should take care of errors in ARCs with URLs that contain spaces tests: added example-space-in-url.arc for testing an ARC with spaces in the URL --- test/data/example-space-in-url.arc | 69 ++++++++++++++++++++++++++++++ test/test_archiveiterator.py | 4 ++ warcio/recordloader.py | 2 +- 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 test/data/example-space-in-url.arc diff --git a/test/data/example-space-in-url.arc b/test/data/example-space-in-url.arc new file mode 100644 index 00000000..da248a83 --- /dev/null +++ b/test/data/example-space-in-url.arc @@ -0,0 +1,69 @@ +filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75 +1 0 LiveWeb Capture +URL IP-address Archive-date Content-type Archive-length + +http://example.com/index.cfm?FuseAction=Email&EmailTitle=Examples From The Live Web&IsPopUp=False 93.184.216.119 20140216050221 text/html 1591 +HTTP/1.1 200 OK +Accept-Ranges: bytes +Cache-Control: max-age=604800 +Content-Type: text/html +Date: Sun, 16 Feb 2014 05:02:20 GMT +Etag: "359670651" +Expires: Sun, 23 Feb 2014 05:02:20 GMT +Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT +Server: ECS (sjc/4FCE) +X-Cache: HIT +x-ec-custom-error: 1 +Content-Length: 1270 + + + + + Example Domain + + + + + + + + +
+

Example Domain

+

This domain is established to be used for illustrative examples in documents. You may use this + domain in examples without prior coordination or asking for permission.

+

More information...

+
+ + + diff --git a/test/test_archiveiterator.py b/test/test_archiveiterator.py index 8cba7600..93be6544 100644 --- a/test/test_archiveiterator.py +++ b/test/test_archiveiterator.py @@ -112,6 +112,10 @@ def test_example_arc_gz(self): expected = ['arc_header', 'response'] assert self._load_archive('example.arc.gz') == expected + def test_example_space_in_url_arc(self): + expected = ['arc_header', 'response'] + assert self._load_archive('example-space-in-url.arc') == expected + def test_example_arc(self): expected = ['arc_header', 'response'] assert self._load_archive('example.arc') == expected diff --git a/warcio/recordloader.py b/warcio/recordloader.py index 2467bde3..148480cd 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -288,7 +288,7 @@ def parse(self, stream, headerline=None): total_read += len(version) total_read += len(spec) - parts = headerline.split(' ') + parts = headerline.rsplit(' ', len(headernames)-1) if len(parts) != len(headernames): msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'