downloads: add SOCKS Proxy support
gremid committed Aug 27, 2024
1 parent 14c79c0 commit a732852
Showing 5 changed files with 98 additions and 6 deletions.
22 changes: 21 additions & 1 deletion .github/workflows/tests.yml
@@ -19,7 +19,11 @@ jobs:
         os: [ubuntu-latest]
         # https://github.com/actions/python-versions/blob/main/versions-manifest.json
         python-version: ["3.9", "3.11"] # "3.13-dev"
-        env: [{ MINIMAL: "true" }, { MINIMAL: "false" }]
+        env:
+          - MINIMAL: "true"
+            PROXY_TEST: "false"
+          - MINIMAL: "false"
+            PROXY_TEST: "true"
         include:
           # custom python versions
           - os: ubuntu-20.04
@@ -36,6 +40,18 @@
            python-version: "3.10"
          - os: ubuntu-latest
            python-version: "3.12"
+    services:
+      socks_proxy:
+        image: ${{ matrix.os == 'ubuntu-latest' && 'serjs/go-socks5-proxy' || '' }}
+        ports:
+          - 1080:1080
+      socks_proxy_auth:
+        image: ${{ matrix.os == 'ubuntu-latest' && 'serjs/go-socks5-proxy' || '' }}
+        env:
+          PROXY_USER: user
+          PROXY_PASSWORD: pass
+        ports:
+          - 1081:1080
     steps:
       # Python and pip setup
       - name: Set up Python ${{ matrix.python-version }}
@@ -84,6 +100,10 @@ jobs:
         if: ${{ matrix.env.MINIMAL == 'false'}}
         run: python -m pip install -e ".[all]"

+      - name: Install SOCKS proxy dependencies
+        if: ${{ matrix.env.PROXY_TEST == 'true' }}
+        run: python -m pip install -e ".[socks]"
+
       # tests
       - name: Lint with flake8
         run: |
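As an aside, a minimal sketch (not part of the commit) of how the authenticated service container on port 1081 could be exercised with PySocks, which the new "socks" extra pulls in; host, credentials and target are taken from the workflow and tests above:

import socks  # PySocks, provided by the "socks" extra

s = socks.socksocket()
s.set_proxy(socks.SOCKS5, "localhost", 1081, username="user", password="pass")
s.settimeout(10)
s.connect(("httpbun.com", 443))  # the TCP connection is tunnelled through the proxy container
s.close()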
18 changes: 18 additions & 0 deletions compose.yml
@@ -0,0 +1,18 @@
+services:
+  socks_proxy:
+    image: serjs/go-socks5-proxy
+    ports:
+      - 1080:1080
+  socks_proxy_auth:
+    image: serjs/go-socks5-proxy
+    ports:
+      - 1081:1080
+    environment:
+      PROXY_USER: user
+      PROXY_PASSWORD: pass
+  tor_proxy:
+    image: dperson/torproxy
+    ports:
+      - 9050:9050
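For local testing, a rough sketch (not part of the commit) of how one of these containers could be used once the compose services are up and trafilatura is installed with the socks extra. The socks5h scheme (remote DNS resolution) is an assumption on my part, but it is listed in the new SOCKS_PROXY_SCHEMES set; setting the module attribute directly mirrors what the test fixture further down does with request.param:

import trafilatura.downloads
from trafilatura import fetch_url

# point the module-level proxy at the Tor container before the first request
trafilatura.downloads.PROXY_URL = "socks5h://localhost:9050"
html = fetch_url("https://example.org")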


3 changes: 3 additions & 0 deletions setup.py
@@ -36,6 +36,9 @@ def get_long_description():
     "gui": [
         "Gooey >= 1.0.1",
     ],
+    "socks": [
+        "pysocks >= 1.7.1"
+    ],
 }

 setup(
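For reference, the new optional dependency group mirrors the workflow step above; in a local checkout it would presumably be installed with:

python -m pip install -e ".[socks]"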
23 changes: 21 additions & 2 deletions tests/downloads_tests.py
@@ -44,6 +44,7 @@
 from trafilatura.utils import decode_file, decode_response, handle_compressed_file, load_html

 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+LOGGER = logging.getLogger(__name__)

 ZERO_CONFIG = DEFAULT_CONFIG
 ZERO_CONFIG['DEFAULT']['MIN_OUTPUT_SIZE'] = '0'
@@ -59,10 +60,24 @@ def _reset_downloads_global_objects():
     """
     Force global objects to be re-created
     """
+    trafilatura.downloads.PROXY_URL = None
     trafilatura.downloads.HTTP_POOL = None
     trafilatura.downloads.NO_CERT_POOL = None
     trafilatura.downloads.RETRY_STRATEGY = None

+PROXY_URLS = [None]
+if os.environ.get('PROXY_TEST', 'false') == 'true':
+    PROXY_URLS.extend([
+        'socks5://localhost:1080',
+        'socks5://user:pass@localhost:1081'
+    ])
+
+@pytest.fixture(params=PROXY_URLS)
+def proxy_url(request):
+    _reset_downloads_global_objects()
+    trafilatura.downloads.PROXY_URL = request.param
+    yield trafilatura.downloads.PROXY_URL
+    _reset_downloads_global_objects()

 def test_response_object():
     "Test if the Response class is functioning as expected."
@@ -90,8 +105,10 @@ def test_response_object():
     assert extract(response, url=response.url, config=ZERO_CONFIG) is None


-def test_is_live_page():
+def test_is_live_page(proxy_url):
     '''Test if pages are available on the network.'''
+    if proxy_url is not None:
+        LOGGER.debug('using proxy %s', proxy_url)
     # is_live general tests
     assert _urllib3_is_live_page('https://httpbun.com/status/301') is True
     assert _urllib3_is_live_page('https://httpbun.com/status/404') is False
@@ -101,8 +118,10 @@
     assert _pycurl_is_live_page('https://httpbun.com/status/301') is True


-def test_fetch():
+def test_fetch(proxy_url):
     '''Test URL fetching.'''
+    if proxy_url is not None:
+        LOGGER.debug('using proxy %s', proxy_url)
     # sanity check
     assert _send_urllib_request('', True, False, DEFAULT_CONFIG) is None
     with pytest.raises(ValueError):
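The parametrized fixture above runs each network test once without a proxy and, when PROXY_TEST is true, once per configured SOCKS endpoint. A hypothetical additional test (not in this commit) would opt in simply by taking the fixture as an argument:

def test_fetch_via_proxy(proxy_url):  # hypothetical example
    # the fixture has already set trafilatura.downloads.PROXY_URL to request.param
    assert _urllib3_is_live_page('https://httpbun.com/status/200') is True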
38 changes: 35 additions & 3 deletions trafilatura/downloads.py
@@ -4,6 +4,7 @@
 """

 import logging
+import os
 import random

 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -12,10 +13,26 @@
 from io import BytesIO
 from time import sleep
 from typing import Any, ByteString, Dict, Generator, List, Optional, Set, Tuple, Union
+from urllib.parse import urlparse

 import certifi
 import urllib3

+SOCKS_PROXY_SCHEMES = {"socks4", "socks4a", "socks5", "socks5h"}
+
+try:
+    import socks
+    import urllib3.contrib.socks
+
+    PROXY_URL = os.environ.get("http_proxy")
+    if PROXY_URL is not None:
+        parsed_proxy_url = urlparse(PROXY_URL)
+        if parsed_proxy_url.scheme not in SOCKS_PROXY_SCHEMES:
+            # TODO: maybe issue warning because of unsupported proxy scheme
+            PROXY_URL = None
+except ImportError:
+    PROXY_URL = None
+
 try:
     import pycurl

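In other words, the proxy is read from the conventional http_proxy environment variable at import time, and only SOCKS-style schemes are honoured; anything else is silently dropped. A rough usage sketch (assuming PySocks is installed and the variable is set before trafilatura is imported):

# e.g. launched with:  http_proxy="socks5://user:pass@localhost:1081" python script.py
import trafilatura

downloaded = trafilatura.fetch_url("https://example.org")  # request goes through the SOCKS proxy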
@@ -159,9 +176,16 @@ def _send_urllib_request(
             # unofficial: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes#Unofficial_codes
         )
     try:
+        if PROXY_URL:
+            pool_manager_class = urllib3.contrib.socks.SOCKSProxyManager
+            pool_manager_args = {"proxy_url": PROXY_URL}
+        else:
+            pool_manager_class = urllib3.PoolManager
+            pool_manager_args = {}
         if no_ssl is False:
             if not HTTP_POOL:
-                HTTP_POOL = urllib3.PoolManager(
+                HTTP_POOL = pool_manager_class(
+                    **pool_manager_args,
                     retries=RETRY_STRATEGY,
                     timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
                     ca_certs=certifi.where(),
@@ -170,7 +194,8 @@
             pool_manager = HTTP_POOL
         else:
             if not NO_CERT_POOL:
-                NO_CERT_POOL = urllib3.PoolManager(
+                NO_CERT_POOL = pool_manager_class(
+                    **pool_manager_args,
                     retries=RETRY_STRATEGY,
                     timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
                     cert_reqs="CERT_NONE",
@@ -288,13 +313,20 @@ def _pycurl_is_live_page(url: str) -> bool:
     curl.setopt(pycurl.SSL_VERIFYHOST, 0)
     # Set option to avoid getting the response body
     curl.setopt(curl.NOBODY, True)
+    if PROXY_URL:
+        curl.setopt(pycurl.PRE_PROXY, PROXY_URL)
     # Perform the request
     try:
         curl.perform()
         # Get the response code
         page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400
     except pycurl.error as err:
-        LOGGER.debug("pycurl HEAD error: %s %s", url, err)
+        if PROXY_URL is not None and err.args[0] == pycurl.E_COULDNT_CONNECT:
+            # connection errors could be related to SOCKS proxy
+            log_level = logging.WARN
+        else:
+            log_level = logging.DEBUG
+        LOGGER.log(log_level, "pycurl HEAD error: %s %s", url, err)
         page_exists = False
     # Clean up
     curl.close()
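And a corresponding sketch of the pycurl path (illustrative only; CURLOPT_PRE_PROXY needs a reasonably recent libcurl built with SOCKS support, and the proxy address is assumed):

from io import BytesIO

import pycurl

buffer = BytesIO()
curl = pycurl.Curl()
curl.setopt(pycurl.URL, "https://httpbun.com/ip")
# tunnel the connection through the SOCKS proxy before any regular HTTP proxy
curl.setopt(pycurl.PRE_PROXY, "socks5://localhost:1080")
curl.setopt(pycurl.WRITEDATA, buffer)
curl.perform()
print(curl.getinfo(pycurl.RESPONSE_CODE))
curl.close()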
