@@ -1,6 +1,8 @@
 import flask
 import requests
 from flask import request
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse, urljoin
 
 app = flask.Flask(__name__)
 googlebot_headers = {
@@ -185,14 +187,41 @@
 </html>
 """
 
+def add_base_tag(html_content, original_url):
+    soup = BeautifulSoup(html_content, 'html.parser')
+    parsed_url = urlparse(original_url)
+    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
+
+    # Handle paths that are not root, e.g., "https://x.com/some/path/w.html"
+    if parsed_url.path and not parsed_url.path.endswith('/'):
+        base_url = urljoin(base_url, parsed_url.path.rsplit('/', 1)[0] + '/')
+    base_tag = soup.find('base')
+
+    print(base_url)
+    if not base_tag:
+        new_base_tag = soup.new_tag('base', href=base_url)
+        if soup.head:
+            soup.head.insert(0, new_base_tag)
+        else:
+            head_tag = soup.new_tag('head')
+            head_tag.insert(0, new_base_tag)
+            soup.insert(0, head_tag)
+
+    return str(soup)
 
 def bypass_paywall(url):
     """
     Bypass paywall for a given url
     """
-    response = requests.get(url, headers=googlebot_headers)
-    response.encoding = response.apparent_encoding
-    return response.text
+    if url.startswith("http"):
+        response = requests.get(url, headers=googlebot_headers)
+        response.encoding = response.apparent_encoding
+        return add_base_tag(response.text, response.url)
+
+    try:
+        return bypass_paywall("https://" + url)
+    except requests.exceptions.RequestException as e:
+        return bypass_paywall("http://" + url)
 
 
 @app.route("/")
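The new `add_base_tag` step matters because the proxied HTML is served from this app's own origin: without it, the page's relative links and asset URLs would resolve against the proxy instead of the original site. Injecting a `<base href>` pointing at the directory of the fetched URL makes the browser resolve them against the source host. A minimal sketch of the effect, using a made-up page and URL for illustration:

```python
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

# Hypothetical inputs, purely to illustrate the <base> injection.
html = "<html><head></head><body><img src='pic.png'></body></html>"
url = "https://example.com/news/2024/story.html"

# Same derivation as add_base_tag: scheme + host, then the page's directory.
parsed = urlparse(url)
base = f"{parsed.scheme}://{parsed.netloc}/"
if parsed.path and not parsed.path.endswith('/'):
    base = urljoin(base, parsed.path.rsplit('/', 1)[0] + '/')

soup = BeautifulSoup(html, "html.parser")
soup.head.insert(0, soup.new_tag("base", href=base))

print(soup.find("base"))
# <base href="https://example.com/news/2024/"/>
# The browser now resolves 'pic.png' to https://example.com/news/2024/pic.png
```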
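The reworked `bypass_paywall` also accepts scheme-less input: a bare hostname gets `https://` prepended first, and only if that request raises `requests.exceptions.RequestException` does the recursion retry with `http://`. A usage sketch (the URLs are placeholders, not from the commit):

```python
# Placeholder URLs, only to show how the scheme fallback resolves input.
page = bypass_paywall("https://example.com/article")  # has a scheme: fetched directly
page = bypass_paywall("example.com/article")          # no scheme: tries
#   https://example.com/article first, and falls back to
#   http://example.com/article only if the HTTPS request raises
#   requests.exceptions.RequestException
```

Note that the fallback only triggers on connection-level failures: `requests.get` does not raise on an error status code, so an HTTPS response with a 404 or 500 is still returned as-is.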