Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate to GitHub Actions CI and resolve dependency issues #164

Merged
merged 4 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Continuous integration: run the unit tests on every push and pull request
# across the Python versions in the matrix, then upload coverage to Codecov.
name: CI

on: [push, pull_request]

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    strategy:
      # Cap the number of matrix jobs running concurrently.
      max-parallel: 3
      matrix:
        # Versions are quoted so YAML does not parse 3.10 as the float 3.1.
        # NOTE(review): confirm every listed version is still provided for
        # the image that ubuntu-latest currently resolves to.
        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']

    steps:
      - name: checkout
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install urllib3==1.25.11 wheel brotlipy coverage codecov

      - name: Install warcio
        run: python setup.py install

      - name: Run tests
        run: python setup.py test

      - name: Upload coverage to Codecov
        # codecov-action v1 was sunset upstream; v4 expects an upload token.
        # NOTE(review): requires a CODECOV_TOKEN repository secret.
        uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
27 changes: 0 additions & 27 deletions .travis.yml

This file was deleted.

4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,13 @@ def run_tests(self):
cmdclass={'test': PyTest},
test_suite='',
tests_require=[
'urllib3==1.25.11',
'pytest',
'pytest-cov',
'httpbin==0.5.0',
'httpbin>=0.10.2',
'requests',
'wsgiprox',
'hookdns',
],
classifiers=[
'Development Status :: 5 - Production/Stable',
Expand Down
20 changes: 15 additions & 5 deletions test/test_capture_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
from warcio.utils import BUFF_SIZE
from warcio.warcwriter import BufferWARCWriter, WARCWriter

# ==================================================================



# ==================================================================
class TestCaptureHttpBin(object):
Expand Down Expand Up @@ -68,21 +71,28 @@ def test_get(self):
assert request.rec_headers['WARC-Target-URI'] == url
assert request.rec_headers['WARC-IP-Address'] == '127.0.0.1'

def test_get_cache_to_file(self):
def test_post_cache_to_file(self):
warc_writer = BufferWARCWriter(gzip=False)

url = 'http://localhost:{0}/bytes/{1}'.format(self.port, BUFF_SIZE * 2)
random_bytes = os.urandom(BUFF_SIZE * 2)
request_data = {"data": str(random_bytes)}

url = 'http://localhost:{0}/anything'.format(self.port)
with capture_http(warc_writer):
res = requests.get(url, headers={'Host': 'httpbin.org'})
res = requests.post(
url,
headers={'Host': 'httpbin.org'},
json=request_data
)

assert len(res.content) == BUFF_SIZE * 2
assert res.json()["json"] == request_data

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == url
assert response.rec_headers['WARC-IP-Address'] == '127.0.0.1'
assert res.content == response.content_stream().read()
assert request_data == json.loads(response.content_stream().read().decode('utf-8'))["json"]

request = next(ai)
assert request.rec_type == 'request'
Expand Down
237 changes: 122 additions & 115 deletions test/test_capture_http_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
import time

import requests
from hookdns import hosts
from warcio.archiveiterator import ArchiveIterator


from pytest import raises


# ==================================================================
class TestCaptureHttpProxy():
def setup(cls):
def setup_class(cls):
def app(env, start_response):
result = ('Proxied: ' + env['PATH_INFO']).encode('utf-8')
headers = [('Content-Length', str(len(result)))]
Expand All @@ -29,9 +31,10 @@ def handle_error(self, request, client_address):
server = make_server('localhost', 0, wsgiprox, server_class=NoLogServer)
addr, cls.port = server.socket.getsockname()

cls.proxies = {'https': 'localhost:' + str(cls.port),
'http': 'localhost:' + str(cls.port)
}
cls.proxies = {
'https': 'http://proxy.com:' + str(cls.port),
'http': 'http://proxy.com:' + str(cls.port)
}

def run():
try:
Expand All @@ -45,123 +48,127 @@ def run():
time.sleep(0.1)

def test_capture_http_proxy(self):
with capture_http() as warc_writer:
res = requests.get("http://example.com/test", proxies=self.proxies, verify=False)
with hosts({"proxy.com": "127.0.0.1"}):
with capture_http() as warc_writer:
res = requests.get("http://example.com/test", proxies=self.proxies, verify=False)

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "http://example.com/test"
assert response.content_stream().read().decode('utf-8') == 'Proxied: /http://example.com/test'
assert response.rec_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port)
ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "http://example.com/test"
assert response.content_stream().read().decode('utf-8') == 'Proxied: /http://example.com/test'
assert response.rec_headers['WARC-Proxy-Host'] == 'http://proxy.com:{0}'.format(self.port)

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "http://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port)
request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "http://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'http://proxy.com:{0}'.format(self.port)

with raises(StopIteration):
assert next(ai)
with raises(StopIteration):
assert next(ai)

def test_capture_https_proxy(self):
with capture_http() as warc_writer:
res = requests.get("https://example.com/test", proxies=self.proxies, verify=False)
res = requests.get("https://example.com/foo", proxies=self.proxies, verify=False)

# not recording this request
res = requests.get("https://example.com/skip", proxies=self.proxies, verify=False)

with capture_http(warc_writer):
res = requests.get("https://example.com/bar", proxies=self.proxies, verify=False)

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar'

request = next(ai)
assert request.rec_type == 'request'

with raises(StopIteration):
assert next(ai)
with hosts({"proxy.com": "127.0.0.1"}):
with capture_http() as warc_writer:
res = requests.get("https://example.com/test", proxies=self.proxies, verify=False)
res = requests.get("https://example.com/foo", proxies=self.proxies, verify=False)

# not recording this request
res = requests.get("https://example.com/skip", proxies=self.proxies, verify=False)

with capture_http(warc_writer):
res = requests.get("https://example.com/bar", proxies=self.proxies, verify=False)

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar'

request = next(ai)
assert request.rec_type == 'request'

with raises(StopIteration):
assert next(ai)

def test_capture_https_proxy_same_session(self):
sesh = requests.session()
with capture_http() as warc_writer:
res = sesh.get("https://example.com/test", proxies=self.proxies, verify=False)
res = sesh.get("https://example.com/foo", proxies=self.proxies, verify=False)

# *will* be captured, as part of same session... (fix this?)
res = sesh.get("https://example.com/skip", proxies=self.proxies, verify=False)

with capture_http(warc_writer):
res = sesh.get("https://example.com/bar", proxies=self.proxies, verify=False)

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/skip"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/skip'

request = next(ai)
assert request.rec_type == 'request'

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar'

request = next(ai)
assert request.rec_type == 'request'

with raises(StopIteration):
assert next(ai)

with hosts({"proxy.com": "127.0.0.1"}):
with capture_http() as warc_writer:
res = sesh.get("https://example.com/test", proxies=self.proxies, verify=False)
res = sesh.get("https://example.com/foo", proxies=self.proxies, verify=False)

# *will* be captured, as part of same session... (fix this?)
res = sesh.get("https://example.com/skip", proxies=self.proxies, verify=False)

with capture_http(warc_writer):
res = sesh.get("https://example.com/bar", proxies=self.proxies, verify=False)

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/skip"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/skip'

request = next(ai)
assert request.rec_type == 'request'

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar'

request = next(ai)
assert request.rec_type == 'request'

with raises(StopIteration):
assert next(ai)

Loading