
Commit 98a7bab

committed
for test
1 parent 811aeba commit 98a7bab

File tree

1 file changed (+41, -7 lines)


scrapegraphai/utils/cleanup_html.py

Lines changed: 41 additions & 7 deletions
@@ -3,12 +3,44 @@
 """
 
 import re
+import json
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup, Comment
 from minify_html import minify
 
 
+def extract_from_script_tags(soup):
+    script_content = []
+
+    for script in soup.find_all("script"):
+        content = script.string
+        if content:
+            try:
+                json_pattern = r'(?:const|let|var)?\s*\w+\s*=\s*({[\s\S]*?});?$'
+                json_matches = re.findall(json_pattern, content)
+
+                for potential_json in json_matches:
+                    try:
+                        parsed = json.loads(potential_json)
+                        if parsed:
+                            script_content.append(f"JSON data from script: {json.dumps(parsed, indent=2)}")
+                    except json.JSONDecodeError:
+                        pass
+
+                if "window." in content or "document." in content:
+                    data_pattern = r'(?:window|document)\.(\w+)\s*=\s*([^;]+);'
+                    data_matches = re.findall(data_pattern, content)
+
+                    for var_name, var_value in data_matches:
+                        script_content.append(f"Dynamic data - {var_name}: {var_value.strip()}")
+            except Exception:
+                if len(content) < 1000:
+                    script_content.append(f"Script content: {content.strip()}")
+
+    return "\n\n".join(script_content)
+
+
 def cleanup_html(html_content: str, base_url: str) -> str:
     """
     Processes HTML content by removing unnecessary tags,
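A minimal sketch of what the new extract_from_script_tags helper pulls out of a page, assuming the package is importable; the sample document is made up for illustration:

from bs4 import BeautifulSoup
from scrapegraphai.utils.cleanup_html import extract_from_script_tags

# Made-up sample page with an inline JSON assignment inside a <script> tag.
sample = '<html><body><script>var config = {"page": 1, "lang": "en"};</script></body></html>'
soup = BeautifulSoup(sample, "html.parser")
print(extract_from_script_tags(soup))
# Prints a "JSON data from script: ..." block with the parsed object pretty-printed.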
@@ -34,8 +66,10 @@ def cleanup_html(html_content: str, base_url: str) -> str:
 
     title_tag = soup.find("title")
     title = title_tag.get_text() if title_tag else ""
-
-    for tag in soup.find_all(["script", "style"]):
+
+    script_content = extract_from_script_tags(soup)
+
+    for tag in soup.find_all("style"):
         tag.extract()
 
     link_urls = [
@@ -54,7 +88,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
     body_content = soup.find("body")
     if body_content:
         minimized_body = minify(str(body_content))
-        return title, minimized_body, link_urls, image_urls
+        return title, minimized_body, link_urls, image_urls, script_content
 
     else:
         raise ValueError(
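Because cleanup_html now returns a five-element tuple, call sites have to unpack the extra script_content value. A sketch of a hypothetical updated caller; the sample inputs are placeholders, not taken from the repository:

from scrapegraphai.utils.cleanup_html import cleanup_html

# Placeholder page; a real caller would pass the fetched HTML and its base URL.
html_content = "<html><head><title>Example</title></head><body><p>hi</p></body></html>"
title, minimized_body, link_urls, image_urls, script_content = cleanup_html(
    html_content, "https://example.com"
)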
@@ -106,10 +140,10 @@ def reduce_html(html, reduction):
     for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
         comment.extract()
 
-    for tag in soup(["script", "style"]):
+    for tag in soup(["style"]):
         tag.string = ""
 
-    attrs_to_keep = ["class", "id", "href", "src"]
+    attrs_to_keep = ["class", "id", "href", "src", "type"]
     for tag in soup.find_all(True):
         for attr in list(tag.attrs):
             if attr not in attrs_to_keep:
@@ -118,15 +152,15 @@ def reduce_html(html, reduction):
     if reduction == 1:
         return minify_html(str(soup))
 
-    for tag in soup(["script", "style"]):
+    for tag in soup(["style"]):
         tag.decompose()
 
     body = soup.body
     if not body:
         return "No <body> tag found in the HTML"
 
     for tag in body.find_all(string=True):
-        if tag.parent.name not in ["script", "style"]:
+        if tag.parent.name not in ["script"]:
             tag.replace_with(re.sub(r"\s+", " ", tag.strip())[:20])
 
     reduced_html = str(body)
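In reduce_html, <script> tags are no longer blanked or decomposed, their text is exempt from truncation, and the type attribute now survives attribute pruning. A standalone sketch of what the widened attrs_to_keep whitelist preserves, using made-up markup (this is not the module's own pruning code):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<script type="application/ld+json" data-x="1">{}</script>', "html.parser")
attrs_to_keep = ["class", "id", "href", "src", "type"]
for tag in soup.find_all(True):
    for attr in list(tag.attrs):
        if attr not in attrs_to_keep:
            del tag.attrs[attr]  # drop everything except whitelisted attributes
print(soup)  # <script type="application/ld+json">{}</script>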
