From 2ea7f7de836ced707c78e9e1c8a9a516a4c7862c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 13 May 2024 11:51:29 -0400 Subject: [PATCH 1/4] Move test CI from Travis to GitHub Actions Also: - Remove older Python versions from CI - Switch to updated PSF httpbin --- .github/workflows/ci.yaml | 34 ++++++++++++++++++++++++++++++++++ .travis.yml | 27 --------------------------- setup.py | 2 +- 3 files changed, 35 insertions(+), 28 deletions(-) create mode 100644 .github/workflows/ci.yaml delete mode 100644 .travis.yml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 00000000..4a7c4c29 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,34 @@ +name: CI + +on: [push, pull_request] + +jobs: + unit-tests: + runs-on: ubuntu-latest + strategy: + max-parallel: 3 + matrix: + python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] + + steps: + - name: checkout + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install wheel brotlipy coverage codecov + + - name: Install warcio + run: python setup.py install + + - name: Run tests + run: python setup.py test + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 0f3fe21d..00000000 --- a/.travis.yml +++ /dev/null @@ -1,27 +0,0 @@ -language: python - -python: - - "3.5" - - "3.6" - - "3.7" - - "3.8" - -os: - - linux - -sudo: false - -install: - # add brotli for tests - - pip install brotlipy - - python setup.py install - - pip install coverage pytest-cov codecov - - pip install 'jinja2<3.0.0' - - pip install 'itsdangerous<2.0.0' - -script: - - python setup.py test - -after_success: - - codecov - diff --git a/setup.py b/setup.py index d07692e8..7795a4a5 100755 --- a/setup.py +++ b/setup.py @@ 
-47,7 +47,7 @@ def run_tests(self): tests_require=[ 'pytest', 'pytest-cov', - 'httpbin==0.5.0', + 'httpbin>=0.10.2', 'requests', 'wsgiprox', ], From b1209aaa22a728845ce740f0417b68d2c8236f05 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 15 May 2024 11:02:19 -0400 Subject: [PATCH 2/4] Modify cache to file test to use different endpoint This avoids wsgiref incorrectly throwing an exception saying that the return data must be bytes. --- test/test_capture_http.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/test/test_capture_http.py b/test/test_capture_http.py index 41274d87..6348cb5d 100644 --- a/test/test_capture_http.py +++ b/test/test_capture_http.py @@ -16,6 +16,9 @@ from warcio.utils import BUFF_SIZE from warcio.warcwriter import BufferWARCWriter, WARCWriter +# ================================================================== + + # ================================================================== class TestCaptureHttpBin(object): @@ -68,21 +71,28 @@ def test_get(self): assert request.rec_headers['WARC-Target-URI'] == url assert request.rec_headers['WARC-IP-Address'] == '127.0.0.1' - def test_get_cache_to_file(self): + def test_post_cache_to_file(self): warc_writer = BufferWARCWriter(gzip=False) - url = 'http://localhost:{0}/bytes/{1}'.format(self.port, BUFF_SIZE * 2) + random_bytes = os.urandom(BUFF_SIZE * 2) + request_data = {"data": str(random_bytes)} + + url = 'http://localhost:{0}/anything'.format(self.port) with capture_http(warc_writer): - res = requests.get(url, headers={'Host': 'httpbin.org'}) + res = requests.post( + url, + headers={'Host': 'httpbin.org'}, + json=request_data + ) - assert len(res.content) == BUFF_SIZE * 2 + assert res.json()["json"] == request_data ai = ArchiveIterator(warc_writer.get_stream()) response = next(ai) assert response.rec_type == 'response' assert response.rec_headers['WARC-Target-URI'] == url assert response.rec_headers['WARC-IP-Address'] == '127.0.0.1' - assert res.content 
== response.content_stream().read() + assert request_data == json.loads(response.content_stream().read().decode('utf-8'))["json"] request = next(ai) assert request.rec_type == 'request' From 92edc563758ac268a5405156d53c66f1e7ceb90c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 15 May 2024 13:22:16 -0400 Subject: [PATCH 3/4] Fix issues with proxies in recent urllib3 and requests - Use hookdns to route calls to proxy.com to localhost proxy now that requests no longer allows localhost:port proxies - Pin urllib3 to 1.25.11, as more recent versions throw an SSL exception or hang if using a http url as an https proxy --- .github/workflows/ci.yaml | 2 +- setup.py | 2 + test/test_capture_http_proxy.py | 239 +++++++++++++++++--------------- 3 files changed, 128 insertions(+), 115 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4a7c4c29..edd11ab0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -22,7 +22,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install wheel brotlipy coverage codecov + pip install urllib3==1.25.11 wheel brotlipy coverage codecov - name: Install warcio run: python setup.py install diff --git a/setup.py b/setup.py index 7795a4a5..922c025c 100755 --- a/setup.py +++ b/setup.py @@ -45,11 +45,13 @@ def run_tests(self): cmdclass={'test': PyTest}, test_suite='', tests_require=[ + 'urllib3==1.25.11', 'pytest', 'pytest-cov', 'httpbin>=0.10.2', 'requests', 'wsgiprox', + 'hookdns', ], classifiers=[ 'Development Status :: 5 - Production/Stable', diff --git a/test/test_capture_http_proxy.py b/test/test_capture_http_proxy.py index fba301d8..4c59f7ab 100644 --- a/test/test_capture_http_proxy.py +++ b/test/test_capture_http_proxy.py @@ -5,8 +5,10 @@ import time import requests +from hookdns import hosts from warcio.archiveiterator import ArchiveIterator + from pytest import raises @@ -29,9 +31,14 @@ def handle_error(self, request, client_address): server = 
make_server('localhost', 0, wsgiprox, server_class=NoLogServer) addr, cls.port = server.socket.getsockname() - cls.proxies = {'https': 'localhost:' + str(cls.port), - 'http': 'localhost:' + str(cls.port) - } + print(f"cls.port: {cls.port}", flush=True) + + cls.proxies = { + 'https': 'http://proxy.com:' + str(cls.port), + 'http': 'http://proxy.com:' + str(cls.port) + } + + print(f"cls.proxies: {cls.proxies}", flush=True) def run(): try: @@ -45,123 +52,127 @@ def run(): time.sleep(0.1) def test_capture_http_proxy(self): - with capture_http() as warc_writer: - res = requests.get("http://example.com/test", proxies=self.proxies, verify=False) + with hosts({"proxy.com": "127.0.0.1"}): + with capture_http() as warc_writer: + res = requests.get("http://example.com/test", proxies=self.proxies, verify=False) - ai = ArchiveIterator(warc_writer.get_stream()) - response = next(ai) - assert response.rec_type == 'response' - assert response.rec_headers['WARC-Target-URI'] == "http://example.com/test" - assert response.content_stream().read().decode('utf-8') == 'Proxied: /http://example.com/test' - assert response.rec_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port) + ai = ArchiveIterator(warc_writer.get_stream()) + response = next(ai) + assert response.rec_type == 'response' + assert response.rec_headers['WARC-Target-URI'] == "http://example.com/test" + assert response.content_stream().read().decode('utf-8') == 'Proxied: /http://example.com/test' + assert response.rec_headers['WARC-Proxy-Host'] == 'http://proxy.com:{0}'.format(self.port) - request = next(ai) - assert request.rec_type == 'request' - assert request.rec_headers['WARC-Target-URI'] == "http://example.com/test" - assert request.rec_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port) + request = next(ai) + assert request.rec_type == 'request' + assert request.rec_headers['WARC-Target-URI'] == "http://example.com/test" + assert request.rec_headers['WARC-Proxy-Host'] == 
'http://proxy.com:{0}'.format(self.port) - with raises(StopIteration): - assert next(ai) + with raises(StopIteration): + assert next(ai) def test_capture_https_proxy(self): - with capture_http() as warc_writer: - res = requests.get("https://example.com/test", proxies=self.proxies, verify=False) - res = requests.get("https://example.com/foo", proxies=self.proxies, verify=False) - - # not recording this request - res = requests.get("https://example.com/skip", proxies=self.proxies, verify=False) - - with capture_http(warc_writer): - res = requests.get("https://example.com/bar", proxies=self.proxies, verify=False) - - ai = ArchiveIterator(warc_writer.get_stream()) - response = next(ai) - assert response.rec_type == 'response' - assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test" - assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port) - assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test' - - request = next(ai) - assert request.rec_type == 'request' - assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test" - assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port) - - response = next(ai) - assert response.rec_type == 'response' - assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo" - assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port) - assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo' - - request = next(ai) - assert request.rec_type == 'request' - assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo" - assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port) - - response = next(ai) - assert response.rec_type == 'response' - assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar" - assert response.rec_headers['WARC-Proxy-Host'] == 
'https://localhost:{0}'.format(self.port) - assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar' - - request = next(ai) - assert request.rec_type == 'request' - - with raises(StopIteration): - assert next(ai) + with hosts({"proxy.com": "127.0.0.1"}): + with capture_http() as warc_writer: + res = requests.get("https://example.com/test", proxies=self.proxies, verify=False) + res = requests.get("https://example.com/foo", proxies=self.proxies, verify=False) + + # not recording this request + res = requests.get("https://example.com/skip", proxies=self.proxies, verify=False) + + with capture_http(warc_writer): + res = requests.get("https://example.com/bar", proxies=self.proxies, verify=False) + + ai = ArchiveIterator(warc_writer.get_stream()) + response = next(ai) + assert response.rec_type == 'response' + assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test" + assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port) + assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test' + + request = next(ai) + assert request.rec_type == 'request' + assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test" + assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port) + + response = next(ai) + assert response.rec_type == 'response' + assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo" + assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port) + assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo' + + request = next(ai) + assert request.rec_type == 'request' + assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo" + assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port) + + response = next(ai) + assert response.rec_type == 'response' + assert 
response.rec_headers['WARC-Target-URI'] == "https://example.com/bar" + assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port) + assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar' + + request = next(ai) + assert request.rec_type == 'request' + + with raises(StopIteration): + assert next(ai) def test_capture_https_proxy_same_session(self): sesh = requests.session() - with capture_http() as warc_writer: - res = sesh.get("https://example.com/test", proxies=self.proxies, verify=False) - res = sesh.get("https://example.com/foo", proxies=self.proxies, verify=False) - - # *will* be captured, as part of same session... (fix this?) - res = sesh.get("https://example.com/skip", proxies=self.proxies, verify=False) - - with capture_http(warc_writer): - res = sesh.get("https://example.com/bar", proxies=self.proxies, verify=False) - - ai = ArchiveIterator(warc_writer.get_stream()) - response = next(ai) - assert response.rec_type == 'response' - assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test" - assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port) - assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test' - - request = next(ai) - assert request.rec_type == 'request' - assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test" - assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port) - - response = next(ai) - assert response.rec_type == 'response' - assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo" - assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port) - assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo' - - request = next(ai) - assert request.rec_type == 'request' - assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo" 
- assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port) - - response = next(ai) - assert response.rec_type == 'response' - assert response.rec_headers['WARC-Target-URI'] == "https://example.com/skip" - assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port) - assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/skip' - - request = next(ai) - assert request.rec_type == 'request' - - response = next(ai) - assert response.rec_type == 'response' - assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar" - assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port) - assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar' - - request = next(ai) - assert request.rec_type == 'request' - - with raises(StopIteration): - assert next(ai) + + with hosts({"proxy.com": "127.0.0.1"}): + with capture_http() as warc_writer: + res = sesh.get("https://example.com/test", proxies=self.proxies, verify=False) + res = sesh.get("https://example.com/foo", proxies=self.proxies, verify=False) + + # *will* be captured, as part of same session... (fix this?) 
+ res = sesh.get("https://example.com/skip", proxies=self.proxies, verify=False) + + with capture_http(warc_writer): + res = sesh.get("https://example.com/bar", proxies=self.proxies, verify=False) + + ai = ArchiveIterator(warc_writer.get_stream()) + response = next(ai) + assert response.rec_type == 'response' + assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test" + assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port) + assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test' + + request = next(ai) + assert request.rec_type == 'request' + assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test" + assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port) + + response = next(ai) + assert response.rec_type == 'response' + assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo" + assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port) + assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo' + + request = next(ai) + assert request.rec_type == 'request' + assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo" + assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port) + + response = next(ai) + assert response.rec_type == 'response' + assert response.rec_headers['WARC-Target-URI'] == "https://example.com/skip" + assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port) + assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/skip' + + request = next(ai) + assert request.rec_type == 'request' + + response = next(ai) + assert response.rec_type == 'response' + assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar" + assert response.rec_headers['WARC-Proxy-Host'] == 
'https://proxy.com:{0}'.format(self.port) + assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar' + + request = next(ai) + assert request.rec_type == 'request' + + with raises(StopIteration): + assert next(ai) From f8603f05432fbd0ebc32ab40df263a0747a72603 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 24 May 2024 13:32:40 -0400 Subject: [PATCH 4/4] Change test class setup method to setup_class for pytest --- test/test_capture_http_proxy.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test/test_capture_http_proxy.py b/test/test_capture_http_proxy.py index 4c59f7ab..fb8e6f69 100644 --- a/test/test_capture_http_proxy.py +++ b/test/test_capture_http_proxy.py @@ -14,7 +14,7 @@ # ================================================================== class TestCaptureHttpProxy(): - def setup(cls): + def setup_class(cls): def app(env, start_response): result = ('Proxied: ' + env['PATH_INFO']).encode('utf-8') headers = [('Content-Length', str(len(result)))] @@ -31,15 +31,11 @@ def handle_error(self, request, client_address): server = make_server('localhost', 0, wsgiprox, server_class=NoLogServer) addr, cls.port = server.socket.getsockname() - print(f"cls.port: {cls.port}", flush=True) - cls.proxies = { 'https': 'http://proxy.com:' + str(cls.port), 'http': 'http://proxy.com:' + str(cls.port) } - print(f"cls.proxies: {cls.proxies}", flush=True) - def run(): try: server.serve_forever()