Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/basic_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -957,7 +957,7 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
print("1. 从真实数据集加载预处理HTML数据...")

# 使用DataLoader加载真实的样本数据
- dataset_path = Path("/home/lulindong/Pycharm_projects/cc/WebMainBench_1848_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
+ dataset_path = Path("data/WebMainBench_dataset_merge_with_llm_webkit.jsonl")
print(f"📂 数据集文件: {dataset_path}")

if not dataset_path.exists():
Expand Down
4 changes: 2 additions & 2 deletions webmainbench/extractors/llm_webkit_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,12 +589,12 @@ def _extract_content_from_main_html(self, main_html: str, url: str = None) -> tu
"""使用llm-webkit的方法将main_html提取成content"""
import traceback
try:
- from llm_web_kit.simple import extract_html_to_md
+ from llm_web_kit.simple import extract_content_from_main_html

print(f"🔧 开始使用llm-webkit简单接口提取content...")

# 使用简单接口提取markdown,传入URL
- content = extract_html_to_md(url or "", main_html, clip_html=False)
+ content = extract_content_from_main_html(url or "", main_html)

print(f"✅ llm-webkit提取完成: {len(content)}字符")

Expand Down
Loading