forked from sarperavci/CloudflareBypassForScraping
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
e6292c7
commit e315200
Showing
3 changed files
with
98 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,42 @@ | ||
import time | ||
from DrissionPage import ChromiumPage | ||
from DrissionPage import ChromiumPage | ||
|
||
|
||
# This code is written for readability and simplicity. It is not optimized for performance or real-world usage. | ||
# You can optimize the code by removing the unnecessary sleeps and checks. | ||
|
||
class CloudflareBypasser:
    """Click through a Cloudflare verification page shown in a browser tab.

    The page is considered "bypassed" as soon as its title no longer
    contains the text "just a moment".
    """

    def __init__(self, driver: ChromiumPage, max_retries=-1, log=True):
        """Store the driver and the retry/logging settings.

        Args:
            driver: The page object used to inspect and click the page.
            max_retries: When zero or greater, ``bypass`` gives up after
                ``max_retries + 1`` click attempts. Any value of -1 or
                lower means "keep trying until the page is bypassed".
            log: When true, progress messages are printed to stdout.
        """
        self.max_retries = max_retries
        self.log = log
        self.driver = driver

    def clickCycle(self):
        """Click the verification widget once, if it is displayed."""
        # Reach the captcha button and click it.
        # If the element never shows up there is nothing to click; the
        # page is presumably already bypassed.
        if self.driver.wait.ele_displayed('.spacer', timeout=1.5):
            time.sleep(1.5)
            self.driver.ele(".spacer", timeout=2.5).click()
        # The location of the button may vary from time to time, so the
        # selector above may need to be updated occasionally.

    def isBypassed(self):
        """Return True when the page title no longer says "just a moment"."""
        title = self.driver.title.lower()
        # This is a deliberately simple check; more elaborate checks can be
        # implemented if needed.
        return "just a moment" not in title

    def bypass(self):
        """Click the verification widget until the page is bypassed.

        Returns:
            True if the page ended up bypassed, False if the retry limit
            was reached first.
        """
        count = 0
        while not self.isBypassed():
            # A limit only applies when max_retries >= 0; in that case the
            # loop stops once max_retries + 1 attempts have been made.
            if 0 < self.max_retries + 1 <= count:
                if self.log:
                    print("Exceeded maximum tries")
                return False
            time.sleep(2)
            # A click may be enough to bypass the captcha, if your IP is clean.
            # I haven't seen a captcha that requires more than 3 clicks.
            if self.log:
                print("Verification page detected. Trying to bypass...")
            time.sleep(2)
            self.clickCycle()
            count += 1
        return True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import json | ||
|
||
from CloudflareBypasser import CloudflareBypasser | ||
from DrissionPage import ChromiumPage, ChromiumOptions | ||
from fastapi import FastAPI, HTTPException, Response | ||
from pydantic import BaseModel | ||
|
||
# Chromium command-line switches.
# NOTE(review): this list is not referenced anywhere in the visible code, so
# these switches are presumably not applied to the launched browser — confirm.
arguments = [
    "-no-first-run",
    "-force-color-profile=srgb",
    "-metrics-recording-only",
    "-password-store=basic",
    "-use-mock-keychain",
    "-export-tagged-pdf",
    "-no-default-browser-check",
    "-disable-background-mode",
    "-enable-features=NetworkService,NetworkServiceInProcess,LoadCryptoTokenExtension,PermuteTLSExtensions",
    "-disable-features=FlashDeprecationWarning,EnablePasswordsAccountStorage",
    "-deny-permission-prompts",
    "-disable-gpu",
    "-accept-lang=en-US",
]
# Path of the browser executable handed to ChromiumOptions.set_paths().
browser_path = "/usr/bin/google-chrome"
# FastAPI application object the route decorators below register on.
app = FastAPI()
|
||
|
||
class CookieResponse(BaseModel):
    """Response body of the ``/cookies`` endpoint."""

    # Cookies read from the browser via ``driver.cookies(as_dict=True)``.
    cookies: dict
|
||
|
||
def bypass_cloudlflare(url, retries):
    """Open ``url`` in a new browser and run the Cloudflare bypass on it.

    Args:
        url: Absolute http(s) URL to load. It comes from a request query
            parameter, so it is validated before the browser is started.
        retries: Passed to ``CloudflareBypasser`` as ``max_retries``.

    Returns:
        The browser page object, still open. The caller is responsible for
        calling ``quit()`` on it.

    Raises:
        ValueError: If ``url`` is not an http(s) URL with a host.
        Exception: Anything raised while loading or bypassing the page; the
            browser is closed before the exception propagates.
    """
    from urllib.parse import urlparse

    # Refuse file://, chrome://, javascript: and similar targets: the page
    # content is returned to the requester, so loading them would expose
    # local resources.
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        raise ValueError("Only absolute http(s) URLs are allowed")

    # Set up Chromium options
    # NOTE(review): the module-level `arguments` list is not applied here.
    options = ChromiumOptions()
    options.set_paths(browser_path=browser_path).headless(False)

    # Initialize the browser
    driver = ChromiumPage(addr_or_opts=options)
    try:
        # Bypass
        driver.get(url)
        cf_bypasser = CloudflareBypasser(driver, retries, True)
        cf_bypasser.bypass()
        return driver
    except Exception:
        # Do not leave the browser running when the bypass fails.
        driver.quit()
        raise
|
||
|
||
@app.get("/cookies", response_model=CookieResponse)
async def get_cookies(url: str, retries: int = 5):
    """Return the browser cookies obtained after bypassing ``url``.

    Args:
        url: Page to open.
        retries: Retry limit forwarded to ``bypass_cloudlflare``.

    Raises:
        HTTPException: Status 500 carrying the error text when the bypass
            or the cookie read fails.
    """
    driver = None
    try:
        driver = bypass_cloudlflare(url, retries)
        cookies = driver.cookies(as_dict=True)
        return CookieResponse(cookies=cookies)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Close the browser on every path, including when reading the
        # cookies or building the response fails.
        if driver is not None:
            driver.quit()
|
||
|
||
# NOTE(review): this handler reuses the name `get_cookies`, rebinding the
# module-level name of the "/cookies" handler. Both routes stay registered
# through their decorators; the name is kept for compatibility, but renaming
# it (e.g. to `get_html`) should be considered.
@app.get("/html")
async def get_cookies(url: str, retries: int = 5):
    """Return the HTML of ``url`` after the bypass, with cookies in a header.

    The cookies are JSON-encoded into the ``cookies`` response header.

    Args:
        url: Page to open.
        retries: Retry limit forwarded to ``bypass_cloudlflare``.

    Raises:
        HTTPException: Status 500 carrying the error text when the bypass
            or reading the page fails.
    """
    driver = None
    try:
        driver = bypass_cloudlflare(url, retries)
        html = driver.html

        cookies_json = json.dumps(driver.cookies(as_dict=True))

        response = Response(content=html, media_type="text/html")
        response.headers['cookies'] = cookies_json
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Close the browser on every path, including when reading the page
        # or building the response fails.
        if driver is not None:
            driver.quit()
|
||
|
||
if __name__ == "__main__":
    # Only start the server when this file is executed directly.
    from uvicorn import run as serve

    # Listen on all interfaces, port 8000.
    serve(app, host="0.0.0.0", port=8000)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
fastapi | ||
pydantic | ||
uvicorn |