forked from Aider-AI/aider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_scrape.py
175 lines (131 loc) · 6.28 KB
/
test_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import time
import unittest
from unittest.mock import MagicMock
from aider.commands import Commands
from aider.io import InputOutput
from aider.scrape import Scraper
class TestScrape(unittest.TestCase):
    """Tests for ``Scraper`` and for ``Commands.cmd_web``.

    Several tests fetch real URLs (https://example.com and
    https://self-signed.badssl.com); the remaining ones replace
    ``Scraper.scrape_with_playwright`` with a ``MagicMock``.
    """

    def setUp(self):
        """Build the ``InputOutput`` and ``Commands`` objects used by the cmd_web test."""
        self.io = InputOutput(yes=True)
        self.commands = Commands(self.io, None)

    def test_scrape_self_signed_ssl(self):
        """A self-signed host yields None with SSL verification and content without it."""

        def scrape_with_retries(scraper, url, max_retries=5, delay=0.5):
            """Return the first non-None scrape result, or None after ``max_retries`` tries."""
            for _ in range(max_retries):
                result = scraper.scrape(url)
                if result is not None:
                    return result
                time.sleep(delay)
            return None

        url = "https://self-signed.badssl.com"

        # Test with SSL verification.
        # NOTE: None is the expected outcome here, so the helper runs every attempt.
        scraper_verify = Scraper(
            print_error=MagicMock(), playwright_available=True, verify_ssl=True
        )
        result_verify = scrape_with_retries(scraper_verify, url)
        self.assertIsNone(result_verify)
        scraper_verify.print_error.assert_called()

        # Test without SSL verification
        scraper_no_verify = Scraper(
            print_error=MagicMock(), playwright_available=True, verify_ssl=False
        )
        result_no_verify = scrape_with_retries(scraper_no_verify, url)
        self.assertIsNotNone(result_no_verify)
        self.assertIn("self-signed", result_no_verify)
        scraper_no_verify.print_error.assert_not_called()

    def test_cmd_web_imports_playwright(self):
        """``cmd_web`` returns content, leaves playwright importable and reports no error."""
        # Swap the io error reporter for a mock so any call can be detected
        mock_print_error = MagicMock()
        self.commands.io.tool_error = mock_print_error

        # Run the cmd_web command
        result = self.commands.cmd_web("https://example.com", return_content=True)

        # Assert that the result contains some content
        self.assertIsNotNone(result)
        self.assertNotEqual(result, "")

        # Try to import playwright
        try:
            import playwright  # noqa: F401

            playwright_imported = True
        except ImportError:
            playwright_imported = False

        # Assert that playwright was successfully imported
        self.assertTrue(
            playwright_imported, "Playwright should be importable after running cmd_web"
        )

        # Assert that print_error was never called
        mock_print_error.assert_not_called()

    def test_scrape_actual_url_with_playwright(self):
        """Scraping https://example.com returns the page text without reporting an error."""
        mock_print_error = MagicMock()
        scraper = Scraper(print_error=mock_print_error, playwright_available=True)

        # Scrape a real URL
        result = scraper.scrape("https://example.com")

        # Assert that the result contains expected content
        self.assertIsNotNone(result)
        self.assertIn("Example Domain", result)

        # Assert that print_error was never called
        mock_print_error.assert_not_called()

    def test_scraper_print_error_not_called(self):
        """The httpx, pandoc and markdown helpers run without calling ``print_error``."""
        mock_print_error = MagicMock()
        scraper = Scraper(print_error=mock_print_error)

        # Exercise several Scraper methods; only the error callback is checked
        scraper.scrape_with_httpx("https://example.com")
        scraper.try_pandoc()
        scraper.html_to_markdown("<html><body><h1>Test</h1></body></html>")

        # Assert that print_error was never called
        mock_print_error.assert_not_called()

    def test_scrape_with_playwright_error_handling(self):
        """``scrape`` reports an error when the fetch yields nothing, and none when it succeeds."""
        mock_print_error = MagicMock()
        scraper = Scraper(print_error=mock_print_error, playwright_available=True)

        # Replace the playwright-backed fetch with a stub that returns no content.
        # Only this instance attribute is replaced; nothing on the playwright
        # package itself is modified, so no state leaks into other tests.
        scraper.scrape_with_playwright = MagicMock()
        scraper.scrape_with_playwright.return_value = (None, None)

        # Call the scrape method
        result = scraper.scrape("https://example.com")

        # Assert that the result is None
        self.assertIsNone(result)

        # Assert that print_error was called with the expected error message
        mock_print_error.assert_called_once_with(
            "Failed to retrieve content from https://example.com"
        )

        # Reset the mock
        mock_print_error.reset_mock()

        # Test with a different return value
        scraper.scrape_with_playwright.return_value = ("Some content", "text/html")
        result = scraper.scrape("https://example.com")

        # Assert that the result is not None
        self.assertIsNotNone(result)

        # Assert that print_error was not called
        mock_print_error.assert_not_called()

    def test_scrape_text_plain(self):
        """Content fetched as ``text/plain`` is returned by ``scrape`` unchanged."""
        scraper = Scraper(print_error=MagicMock(), playwright_available=True)

        # Mock the scrape_with_playwright method
        plain_text = "This is plain text content."
        scraper.scrape_with_playwright = MagicMock(return_value=(plain_text, "text/plain"))

        # Call the scrape method
        result = scraper.scrape("https://example.com")

        # Assert that the result is the same as the input plain text
        self.assertEqual(result, plain_text)

    def test_scrape_text_html(self):
        """Content fetched as ``text/html`` is passed through ``html_to_markdown``."""
        scraper = Scraper(print_error=MagicMock(), playwright_available=True)

        # Mock the scrape_with_playwright method
        html_content = "<html><body><h1>Test</h1><p>This is HTML content.</p></body></html>"
        scraper.scrape_with_playwright = MagicMock(return_value=(html_content, "text/html"))

        # Mock the html_to_markdown method
        expected_markdown = "# Test\n\nThis is HTML content."
        scraper.html_to_markdown = MagicMock(return_value=expected_markdown)

        # Call the scrape method
        result = scraper.scrape("https://example.com")

        # Assert that the result is the expected markdown
        self.assertEqual(result, expected_markdown)

        # Assert that html_to_markdown was called with the HTML content
        scraper.html_to_markdown.assert_called_once_with(html_content)
if __name__ == "__main__":
    # Run this module's tests when the file is executed directly.
    unittest.main()