Skip to content

Commit f68c29a

Browse files
committed
Update portable.py
1 parent 3e2dd24 commit f68c29a

File tree

1 file changed

+32
-3
lines changed

1 file changed

+32
-3
lines changed

app/portable.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import flask
22
import requests
33
from flask import request
4+
from bs4 import BeautifulSoup
5+
from urllib.parse import urlparse, urljoin
46

57
# Flask application instance; view functions are registered on it via @app.route.
app = flask.Flask(__name__)
68
googlebot_headers = {
@@ -185,14 +187,41 @@
185187
</html>
186188
"""
187189

190+
def add_base_tag(html_content, original_url):
    """Return ``html_content`` with a ``<base>`` tag pointing at the source site.

    The fetched page is served from this application rather than from its
    original host, so relative links (images, stylesheets, anchors) would
    otherwise resolve against the wrong origin. A ``<base href=...>`` whose
    value is the directory of ``original_url`` makes them resolve against the
    original site instead.

    Args:
        html_content: HTML markup of the fetched page.
        original_url: Absolute URL the markup was fetched from.

    Returns:
        The serialized HTML document, with a usable absolute ``<base>`` href.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    parsed_url = urlparse(original_url)

    # Directory part of the path: everything up to and including the last
    # slash. "/some/path/w.html" -> "/some/path/", "/some/path/" stays as is,
    # and an empty path or a top-level file such as "/article" -> "/".
    path = parsed_url.path
    directory = path[:path.rfind('/') + 1] or '/'
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{directory}"

    # Only a <base> carrying an href affects link resolution; one that only
    # sets ``target`` does not count as an existing base URL.
    base_tag = soup.find('base', href=True)
    if base_tag is not None:
        # A relative href would resolve against this proxy once the page is
        # re-served, so pin it to the original document's URL.
        base_tag['href'] = urljoin(original_url, base_tag['href'].strip())
        return str(soup)

    new_base_tag = soup.new_tag('base', href=base_url)
    if soup.head is not None:
        # Insert first so it precedes every URL-bearing element in <head>.
        soup.head.insert(0, new_base_tag)
        return str(soup)

    # No <head>: create one. Place it inside <html> when that element exists
    # so it does not end up ahead of the doctype / root element; otherwise
    # (bare fragment) fall back to the very start of the document.
    head_tag = soup.new_tag('head')
    head_tag.append(new_base_tag)
    container = soup.html if soup.html is not None else soup
    container.insert(0, head_tag)
    return str(soup)
188211

189212
def bypass_paywall(url, timeout=30):
    """Fetch ``url`` with the Googlebot headers and return its HTML.

    The returned markup has a ``<base>`` tag added (via ``add_base_tag``) so
    relative links keep pointing at the original site.

    Args:
        url: Page address. May omit the scheme, in which case ``https://`` is
            tried first and ``http://`` is used as a fallback.
        timeout: Seconds to wait for the remote server before giving up, so a
            stalled host cannot block the request forever.

    Returns:
        The page HTML as a string.

    Raises:
        requests.exceptions.RequestException: If the request fails (for a
            scheme-less ``url``, only after the ``http://`` fallback failed
            as well).
    """
    # Require a full scheme prefix: a bare "http" check would misclassify
    # scheme-less hosts such as "httpbin.org". Compare case-insensitively
    # because URL schemes are not case-sensitive.
    if url.lower().startswith(("http://", "https://")):
        response = requests.get(url, headers=googlebot_headers, timeout=timeout)
        # Guess the encoding from the body; declared charsets are often wrong.
        response.encoding = response.apparent_encoding
        # response.url is the final URL after redirects, which is what
        # relative links in the body are relative to.
        return add_base_tag(response.text, response.url)

    try:
        return bypass_paywall("https://" + url, timeout=timeout)
    except requests.exceptions.RequestException:
        # HTTPS unavailable or failing for this host: retry over plain HTTP.
        return bypass_paywall("http://" + url, timeout=timeout)
196225

197226

198227
@app.route("/")

0 commit comments

Comments
 (0)