test_web_crawler.py
"""Unit tests for crawl4ai's WebCrawler, covering chunking and extraction
strategies, caching behavior, and error handling for invalid inputs."""

import os
import unittest

from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import (
    RegexChunking,
    FixedLengthWordChunking,
    SlidingWindowChunking,
)
from crawl4ai.extraction_strategy import (
    CosineStrategy,
    LLMExtractionStrategy,
    TopicExtractionStrategy,
    NoExtractionStrategy,
)


class TestWebCrawler(unittest.TestCase):
    def setUp(self):
        self.crawler = WebCrawler()

    def test_warmup(self):
        self.crawler.warmup()
        self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")

    def test_run_default_strategies(self):
        result = self.crawler.run(
            url="https://www.nbcnews.com/business",
            word_count_threshold=5,
            chunking_strategy=RegexChunking(),
            extraction_strategy=CosineStrategy(),
            bypass_cache=True,
        )
        self.assertTrue(
            result.success, "Failed to crawl and extract using default strategies"
        )

    def test_run_different_strategies(self):
        url = "https://www.nbcnews.com/business"

        # Test with FixedLengthWordChunking and LLMExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")
            ),
            bypass_cache=True,
        )
        self.assertTrue(
            result.success,
            "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy",
        )

        # Test with SlidingWindowChunking and TopicExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=TopicExtractionStrategy(num_keywords=5),
            bypass_cache=True,
        )
        self.assertTrue(
            result.success,
            "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy",
        )

    def test_invalid_url(self):
        with self.assertRaises(Exception) as context:
            self.crawler.run(url="invalid_url", bypass_cache=True)
        self.assertIn("Invalid URL", str(context.exception))

    def test_unsupported_extraction_strategy(self):
        with self.assertRaises(Exception) as context:
            self.crawler.run(
                url="https://www.nbcnews.com/business",
                extraction_strategy="UnsupportedStrategy",
                bypass_cache=True,
            )
        self.assertIn("Unsupported extraction strategy", str(context.exception))

    def test_invalid_css_selector(self):
        with self.assertRaises(ValueError) as context:
            self.crawler.run(
                url="https://www.nbcnews.com/business",
                css_selector="invalid_selector",
                bypass_cache=True,
            )
        self.assertIn("Invalid CSS selector", str(context.exception))

    def test_crawl_with_cache_and_bypass_cache(self):
        url = "https://www.nbcnews.com/business"

        # First crawl populates the cache
        result = self.crawler.run(url=url, bypass_cache=False)
        self.assertTrue(result.success, "Failed to crawl and cache the result")

        # Second crawl bypasses the cache and fetches fresh data
        result = self.crawler.run(url=url, bypass_cache=True)
        self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data")

    def test_fetch_multiple_pages(self):
        urls = ["https://www.nbcnews.com/business", "https://www.bbc.com/news"]
        results = []
        for url in urls:
            result = self.crawler.run(
                url=url,
                word_count_threshold=5,
                chunking_strategy=RegexChunking(),
                extraction_strategy=CosineStrategy(),
                bypass_cache=True,
            )
            results.append(result)
        self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages")
        for result in results:
            self.assertTrue(
                result.success, "Failed to crawl and extract a page in the list"
            )

    def test_run_fixed_length_word_chunking_and_no_extraction(self):
        result = self.crawler.run(
            url="https://www.nbcnews.com/business",
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True,
        )
        self.assertTrue(
            result.success,
            "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy",
        )

    def test_run_sliding_window_and_no_extraction(self):
        result = self.crawler.run(
            url="https://www.nbcnews.com/business",
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True,
        )
        self.assertTrue(
            result.success,
            "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy",
        )


if __name__ == "__main__":
    unittest.main()
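
Because the module ends with unittest.main(), the whole suite can be run directly
(python test_web_crawler.py) or through the standard unittest CLI, which also
allows selecting a single test, for example:

    python -m unittest test_web_crawler.TestWebCrawler.test_warmup -v

Note that these tests hit live URLs, so they require network access, and
test_run_different_strategies additionally expects an OPENAI_API_KEY
environment variable to be set for the LLMExtractionStrategy call.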