Skip to content

Commit

Permalink
downloads: add SOCKS Proxy support
Browse files Browse the repository at this point in the history
  • Loading branch information
gremid committed Aug 27, 2024
1 parent 14c79c0 commit 9433a17
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 4 deletions.
28 changes: 27 additions & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,17 @@ jobs:
os: [ubuntu-latest]
# https://github.com/actions/python-versions/blob/main/versions-manifest.json
python-version: ["3.9", "3.11"] # "3.13-dev"
env: [{ MINIMAL: "true" }, { MINIMAL: "false" }]
env:
- MINIMAL: "true"
- MINIMAL: "false"
- MINIMAL: "true"
http_proxy: "socks5://127.0.0.1:1080/"
- MINIMAL: "false"
http_proxy: "socks5://127.0.0.1:1080/"
- MINIMAL: "true"
http_proxy: "socks5://user:pass@127.0.0.1:1081/"
- MINIMAL: "false"
http_proxy: "socks5://user:pass@127.0.0.1:1081/"
include:
# custom python versions
- os: ubuntu-20.04
Expand All @@ -36,6 +46,18 @@ jobs:
python-version: "3.10"
- os: ubuntu-latest
python-version: "3.12"
services:
socks_proxy:
image: ${{ matrix.os == 'ubuntu-latest' && 'serjs/go-socks5-proxy' || '' }}
ports:
- 1080:1080
socks_proxy_auth:
image: ${{ matrix.os == 'ubuntu-latest' && 'serjs/go-socks5-proxy' || '' }}
env:
PROXY_USER: user
PROXY_PASSWORD: pass
ports:
- 1081:1080
steps:
# Python and pip setup
- name: Set up Python ${{ matrix.python-version }}
Expand Down Expand Up @@ -84,6 +106,10 @@ jobs:
if: ${{ matrix.env.MINIMAL == 'false'}}
run: python -m pip install -e ".[all]"

- name: Install SOCKS proxy dependencies
if: ${{ matrix.env.http_proxy }}
run: python -m pip install -e ".[socks]"

# tests
- name: Lint with flake8
run: |
Expand Down
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ def get_long_description():
"gui": [
"Gooey >= 1.0.1",
],
"socks": [
"pysocks >= 1.7.1"
],
}

setup(
Expand Down
41 changes: 38 additions & 3 deletions trafilatura/downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

import logging
import os
import random

from concurrent.futures import ThreadPoolExecutor, as_completed
Expand All @@ -12,10 +13,29 @@
from io import BytesIO
from time import sleep
from typing import Any, ByteString, Dict, Generator, List, Optional, Set, Tuple, Union
from urllib.parse import urlparse

import certifi
import urllib3

# URL schemes accepted for the proxy; any other scheme is ignored.
SOCKS_PROXY_SCHEMES = {"socks4", "socks4a", "socks5", "socks5h"}

# Defaults for the "no usable SOCKS proxy" case. Assigning them up front
# guarantees that both names exist after import whichever path is taken
# below (missing optional dependency, unset variable, unsupported scheme).
PROXY_URL = None
PROXY_AUTH = False

try:
    # Optional dependencies: if either import fails, proxying stays disabled.
    import socks
    import urllib3.contrib.socks
except ImportError:
    pass
else:
    # Only the lowercase ``http_proxy`` environment variable is consulted.
    PROXY_URL = os.environ.get("http_proxy")
    if PROXY_URL is not None:
        parsed_proxy_url = urlparse(PROXY_URL)
        if parsed_proxy_url.scheme not in SOCKS_PROXY_SCHEMES:
            # TODO: maybe issue warning because of unsupported proxy scheme
            PROXY_URL = None
        else:
            # True only when the URL carries both a user name and a password.
            PROXY_AUTH = (
                parsed_proxy_url.username is not None
                and parsed_proxy_url.password is not None
            )

try:
import pycurl

Expand Down Expand Up @@ -159,9 +179,16 @@ def _send_urllib_request(
# unofficial: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes#Unofficial_codes
)
try:
if PROXY_URL:
pool_manager_class = urllib3.contrib.socks.SOCKSProxyManager
pool_manager_args = {"proxy_url": PROXY_URL}
else:
pool_manager_class = urllib3.PoolManager
pool_manager_args = {}
if no_ssl is False:
if not HTTP_POOL:
HTTP_POOL = urllib3.PoolManager(
HTTP_POOL = pool_manager_class(
**pool_manager_args,
retries=RETRY_STRATEGY,
timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
ca_certs=certifi.where(),
Expand All @@ -170,7 +197,8 @@ def _send_urllib_request(
pool_manager = HTTP_POOL
else:
if not NO_CERT_POOL:
NO_CERT_POOL = urllib3.PoolManager(
NO_CERT_POOL = pool_manager_class(
**pool_manager_args,
retries=RETRY_STRATEGY,
timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"),
cert_reqs="CERT_NONE",
Expand Down Expand Up @@ -288,13 +316,20 @@ def _pycurl_is_live_page(url: str) -> bool:
curl.setopt(pycurl.SSL_VERIFYHOST, 0)
# Set option to avoid getting the response body
curl.setopt(curl.NOBODY, True)
if PROXY_URL:
curl.setopt(pycurl.PRE_PROXY, PROXY_URL)
# Perform the request
try:
curl.perform()
# Get the response code
page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400
except pycurl.error as err:
LOGGER.debug("pycurl HEAD error: %s %s", url, err)
if PROXY_URL is not None and err.args[0] == pycurl.E_COULDNT_CONNECT:
# connection errors could be related to SOCKS proxy
log_level = logging.WARN
else:
log_level = logging.DEBUG
LOGGER.log(log_level, "pycurl HEAD error: %s %s", url, err)
page_exists = False
# Clean up
curl.close()
Expand Down

0 comments on commit 9433a17

Please sign in to comment.