Skip to content

Commit

Permalink
Migrate to GitHub Actions CI and resolve dependency issues
Browse files Browse the repository at this point in the history
- Migrate from Travis CI to GitHub Actions CI
- Test Python versions 3.7-3.11
- Update httpbin to latest PSF-maintained version
- Modify handling of proxies in the test_capture_http_proxy module to account
for breaking changes in requests and urllib3. Note that this currently means
pinning urllib3==1.25.11, as more recent versions no longer allow using
http:// scheme URLs with the https key in the proxies dictionary. Depending
on whether and how this is resolved, we may need to come back to this in the
future, but for now it gets CI working again.
  • Loading branch information
tw4l authored May 27, 2024
2 parents 8c305f5 + f8603f0 commit de769a3
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 148 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Continuous integration workflow: installs the package and runs the test
# suite once per Python version in the matrix, then uploads coverage.
name: CI

on: [push, pull_request]

jobs:
  unit-tests:
    # NOTE(review): confirm the current ubuntu-latest image still provides
    # every Python version listed in the matrix (3.7 in particular).
    runs-on: ubuntu-latest
    strategy:
      # At most three matrix jobs run concurrently.
      max-parallel: 3
      matrix:
        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']

    steps:
      - name: checkout
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        # urllib3 is pinned because newer releases no longer accept http://
        # scheme URLs under the "https" key of the proxies dictionary, which
        # the proxy capture tests depend on.
        run: |
          python -m pip install --upgrade pip
          pip install urllib3==1.25.11 wheel brotlipy coverage codecov

      - name: Install warcio
        run: python setup.py install

      - name: Run tests
        run: python setup.py test

      - name: Upload coverage to Codecov
        # codecov-action v1 has been sunset upstream and no longer functions;
        # v4 needs an upload token, supplied here from the repository secret
        # CODECOV_TOKEN (must be configured in the repository settings).
        uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
27 changes: 0 additions & 27 deletions .travis.yml

This file was deleted.

4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,13 @@ def run_tests(self):
cmdclass={'test': PyTest},
test_suite='',
tests_require=[
'urllib3==1.25.11',
'pytest',
'pytest-cov',
'httpbin==0.5.0',
'httpbin>=0.10.2',
'requests',
'wsgiprox',
'hookdns',
],
classifiers=[
'Development Status :: 5 - Production/Stable',
Expand Down
20 changes: 15 additions & 5 deletions test/test_capture_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
from warcio.utils import BUFF_SIZE
from warcio.warcwriter import BufferWARCWriter, WARCWriter

# ==================================================================



# ==================================================================
class TestCaptureHttpBin(object):
Expand Down Expand Up @@ -68,21 +71,28 @@ def test_get(self):
assert request.rec_headers['WARC-Target-URI'] == url
assert request.rec_headers['WARC-IP-Address'] == '127.0.0.1'

def test_get_cache_to_file(self):
def test_post_cache_to_file(self):
warc_writer = BufferWARCWriter(gzip=False)

url = 'http://localhost:{0}/bytes/{1}'.format(self.port, BUFF_SIZE * 2)
random_bytes = os.urandom(BUFF_SIZE * 2)
request_data = {"data": str(random_bytes)}

url = 'http://localhost:{0}/anything'.format(self.port)
with capture_http(warc_writer):
res = requests.get(url, headers={'Host': 'httpbin.org'})
res = requests.post(
url,
headers={'Host': 'httpbin.org'},
json=request_data
)

assert len(res.content) == BUFF_SIZE * 2
assert res.json()["json"] == request_data

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == url
assert response.rec_headers['WARC-IP-Address'] == '127.0.0.1'
assert res.content == response.content_stream().read()
assert request_data == json.loads(response.content_stream().read().decode('utf-8'))["json"]

request = next(ai)
assert request.rec_type == 'request'
Expand Down
237 changes: 122 additions & 115 deletions test/test_capture_http_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
import time

import requests
from hookdns import hosts
from warcio.archiveiterator import ArchiveIterator


from pytest import raises


# ==================================================================
class TestCaptureHttpProxy():
def setup(cls):
def setup_class(cls):
def app(env, start_response):
result = ('Proxied: ' + env['PATH_INFO']).encode('utf-8')
headers = [('Content-Length', str(len(result)))]
Expand All @@ -29,9 +31,10 @@ def handle_error(self, request, client_address):
server = make_server('localhost', 0, wsgiprox, server_class=NoLogServer)
addr, cls.port = server.socket.getsockname()

cls.proxies = {'https': 'localhost:' + str(cls.port),
'http': 'localhost:' + str(cls.port)
}
cls.proxies = {
'https': 'http://proxy.com:' + str(cls.port),
'http': 'http://proxy.com:' + str(cls.port)
}

def run():
try:
Expand All @@ -45,123 +48,127 @@ def run():
time.sleep(0.1)

def test_capture_http_proxy(self):
with capture_http() as warc_writer:
res = requests.get("http://example.com/test", proxies=self.proxies, verify=False)
with hosts({"proxy.com": "127.0.0.1"}):
with capture_http() as warc_writer:
res = requests.get("http://example.com/test", proxies=self.proxies, verify=False)

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "http://example.com/test"
assert response.content_stream().read().decode('utf-8') == 'Proxied: /http://example.com/test'
assert response.rec_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port)
ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "http://example.com/test"
assert response.content_stream().read().decode('utf-8') == 'Proxied: /http://example.com/test'
assert response.rec_headers['WARC-Proxy-Host'] == 'http://proxy.com:{0}'.format(self.port)

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "http://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port)
request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "http://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'http://proxy.com:{0}'.format(self.port)

with raises(StopIteration):
assert next(ai)
with raises(StopIteration):
assert next(ai)

def test_capture_https_proxy(self):
with capture_http() as warc_writer:
res = requests.get("https://example.com/test", proxies=self.proxies, verify=False)
res = requests.get("https://example.com/foo", proxies=self.proxies, verify=False)

# not recording this request
res = requests.get("https://example.com/skip", proxies=self.proxies, verify=False)

with capture_http(warc_writer):
res = requests.get("https://example.com/bar", proxies=self.proxies, verify=False)

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar'

request = next(ai)
assert request.rec_type == 'request'

with raises(StopIteration):
assert next(ai)
with hosts({"proxy.com": "127.0.0.1"}):
with capture_http() as warc_writer:
res = requests.get("https://example.com/test", proxies=self.proxies, verify=False)
res = requests.get("https://example.com/foo", proxies=self.proxies, verify=False)

# not recording this request
res = requests.get("https://example.com/skip", proxies=self.proxies, verify=False)

with capture_http(warc_writer):
res = requests.get("https://example.com/bar", proxies=self.proxies, verify=False)

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar'

request = next(ai)
assert request.rec_type == 'request'

with raises(StopIteration):
assert next(ai)

def test_capture_https_proxy_same_session(self):
sesh = requests.session()
with capture_http() as warc_writer:
res = sesh.get("https://example.com/test", proxies=self.proxies, verify=False)
res = sesh.get("https://example.com/foo", proxies=self.proxies, verify=False)

# *will* be captured, as part of same session... (fix this?)
res = sesh.get("https://example.com/skip", proxies=self.proxies, verify=False)

with capture_http(warc_writer):
res = sesh.get("https://example.com/bar", proxies=self.proxies, verify=False)

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/skip"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/skip'

request = next(ai)
assert request.rec_type == 'request'

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar'

request = next(ai)
assert request.rec_type == 'request'

with raises(StopIteration):
assert next(ai)

with hosts({"proxy.com": "127.0.0.1"}):
with capture_http() as warc_writer:
res = sesh.get("https://example.com/test", proxies=self.proxies, verify=False)
res = sesh.get("https://example.com/foo", proxies=self.proxies, verify=False)

# *will* be captured, as part of same session... (fix this?)
res = sesh.get("https://example.com/skip", proxies=self.proxies, verify=False)

with capture_http(warc_writer):
res = sesh.get("https://example.com/bar", proxies=self.proxies, verify=False)

ai = ArchiveIterator(warc_writer.get_stream())
response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo'

request = next(ai)
assert request.rec_type == 'request'
assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
assert request.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/skip"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/skip'

request = next(ai)
assert request.rec_type == 'request'

response = next(ai)
assert response.rec_type == 'response'
assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar"
assert response.rec_headers['WARC-Proxy-Host'] == 'https://proxy.com:{0}'.format(self.port)
assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar'

request = next(ai)
assert request.rec_type == 'request'

with raises(StopIteration):
assert next(ai)

0 comments on commit de769a3

Please sign in to comment.