Skip to content

Commit

Permalink
captcha fix
Browse files Browse the repository at this point in the history
  • Loading branch information
itsOwen committed Sep 3, 2024
1 parent 921afd0 commit a69a5da
Showing 1 changed file with 17 additions and 13 deletions.
30 changes: 17 additions & 13 deletions src/scrapers/playwright_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,16 +62,31 @@ async def fetch_content(self, url: str, proxy: Optional[str] = None, pages: Opti
await self.apply_stealth_settings(page)
await self.set_browser_features(page)

if handle_captcha:
await self.handle_captcha(page, url)

contents = await self.scrape_multiple_pages(page, url, pages, url_pattern)
except Exception as e:
self.logger.error(f"Error during scraping: {str(e)}")
contents = [f"Error: {str(e)}"]
finally:
await browser.close()
self.logger.info("Browser closed after scraping.")
if not self.config.use_current_browser:
await browser.close()
self.logger.info("Browser closed after scraping.")

return contents

async def handle_captcha(self, page: Page, url: str):
    """Navigate to *url* and pause until the user confirms the CAPTCHA is solved.

    Loads the page with the configured wait strategy, prompts the user on the
    console to solve the CAPTCHA in the (presumably headful — confirm against
    browser launch options) browser window, waits for Enter, then waits for
    the network to go idle so the post-CAPTCHA content has finished loading.

    Args:
        page: Playwright page used for navigation and the post-solve wait.
        url: Target URL that is expected to present a CAPTCHA challenge.

    Raises:
        playwright TimeoutError: if navigation exceeds ``self.config.timeout``.
    """
    import asyncio  # local import: keeps this fix self-contained within the method

    self.logger.info("Waiting for user to solve CAPTCHA...")
    await page.goto(url, wait_until=self.config.wait_for, timeout=self.config.timeout)

    print("Please solve the CAPTCHA in the browser window.")
    print("Once solved, press Enter in this console to continue...")
    # input() is a blocking call; invoking it directly inside a coroutine would
    # stall the whole asyncio event loop for the entire time the user takes to
    # solve the CAPTCHA. Run it in the default executor (a worker thread) so
    # the loop — and any concurrent tasks — keep running while we wait.
    await asyncio.get_running_loop().run_in_executor(None, input)

    await page.wait_for_load_state('networkidle')
    self.logger.info("CAPTCHA handling completed.")

async def launch_and_connect_to_chrome(self, playwright):
if self.chrome_process is None:
self.temp_user_data_dir = tempfile.mkdtemp(prefix="chrome_debug_profile_")
Expand Down Expand Up @@ -194,17 +209,6 @@ async def set_browser_features(self, page: Page):
'Upgrade-Insecure-Requests': '1'
})

async def handle_captcha(self, page: Page, url: str):
self.logger.info("Waiting for user to solve CAPTCHA...")
await page.goto(url, wait_until=self.config.wait_for, timeout=self.config.timeout)

print("Please solve the CAPTCHA in the browser window.")
print("Once solved, press Enter in this console to continue...")
input()

await page.wait_for_load_state('networkidle')
self.logger.info("CAPTCHA handling completed.")

async def scrape_multiple_pages(self, page: Page, base_url: str, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> List[str]:
contents = []

Expand Down

0 comments on commit a69a5da

Please sign in to comment.