Skip to content

Commit 2a45ed7

Browse files
committed
fix crawler
1 parent 73f7074 commit 2a45ed7

File tree

1 file changed

+35
-15
lines changed

1 file changed

+35
-15
lines changed

contract_crawler.py

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,32 @@
2626
'Optimization Enabled': 'optimizations',
2727
'Other Settings:': 'settings'}
2828

29+
# Module-level slot for the shared HTTP session. It starts as an empty (falsy)
# dict so that load_session() can tell "not created yet" apart from a live
# session; load_session() rebinds it to the object returned by
# get_session_from_chromedriver(). Code that calls session.get(url, ...)
# therefore relies on load_session() having run first.
session = {}
30+
31+
def get_session_from_chromedriver(url):
    """Build a ``requests.Session`` that mimics a real Chrome visit to ``url``.

    A Chrome instance is started through ``uc.Chrome()``, ``url`` is loaded,
    and the browser's user agent string and cookies are copied into a new
    ``requests.Session``. Only each cookie's name and value are transferred.

    Args:
        url: Page to open in the browser in order to obtain cookies.

    Returns:
        A ``requests.Session`` carrying the browser's User-Agent header and
        cookies.

    Raises:
        Exception: If the browser holds no cookies after loading ``url``.
    """
    driver = uc.Chrome()
    try:
        driver.get(url)
        user_agent = driver.execute_script("return navigator.userAgent;")
        # Read the cookies once so the check and the copy see the same data.
        browser_cookies = driver.get_cookies()
    finally:
        # Always shut the browser down; otherwise every call leaves a Chrome
        # process running.
        driver.quit()

    # The session is useless without cookies, so fail loudly when none came
    # back (the previous check was inverted and raised when cookies existed).
    if not browser_cookies:
        raise Exception('Should have some cookies here')

    session = requests.Session()
    session.headers.update({'User-Agent': user_agent})
    for cookie in browser_cookies:
        session.cookies.set(cookie['name'], cookie['value'])

    print(f'Cookies loaded from {url} {session.cookies}')
    return session
47+
48+
def load_session(url):
    """Return the shared module-level session, creating it on first use.

    If the global ``session`` is already truthy it is returned unchanged.
    Otherwise it is rebound to the result of
    ``get_session_from_chromedriver(url)`` and that new value is returned.

    Args:
        url: Page passed to ``get_session_from_chromedriver`` when a session
            has to be created.

    Returns:
        The current value of the global ``session``.
    """
    global session
    # Guard clause: reuse whatever session has already been established.
    if session:
        return session
    session = get_session_from_chromedriver(url)
    return session
53+
54+
2955
def address_from_tr(td: Any) -> str:
3056
a = td.select_one('a.js-clipboard')
3157
return a.attrs.get('data-clipboard-text') if (a and a.attrs) else None
@@ -34,7 +60,7 @@ def address_from_tr(td: Any) -> str:
3460
def parse_page(page: Optional[int]=None, retry=3, retry_delay=5) -> Optional[List[Dict[str, str]]]:
3561
url = VERIFIED_CONTRACT_URL if page is None else f'{VERIFIED_CONTRACT_URL}/{page}'
3662
print(f'Crawling {url}')
37-
resp = requests.get(url, headers=REQ_HEADER, allow_redirects=False, proxies=proxies)
63+
resp = session.get(url, allow_redirects=False)
3864
if resp.status_code != 200:
3965
print(f'No results found on page: {page}, http status: {resp.status_code}')
4066
return None
@@ -131,7 +157,7 @@ def download_source(contract: Dict[str, str], retry=3, retry_delay=5, throw_if_f
131157
address = contract['Address']
132158
contract_name = contract['Contract Name']
133159
url = CONTRACT_SOURCE_URL.format(address)
134-
resp = requests.get(url, headers=REQ_HEADER, allow_redirects=False, proxies=proxies)
160+
resp = session.get(url, allow_redirects=False)
135161

136162
def maybe_retry(e=None):
137163
if retry > 0:
@@ -165,25 +191,16 @@ def fetch_all():
165191

166192
def download_url_poly(url, retry=3, retry_delay=5, throw_if_fail=False):
167193
address = url.split('/')[-1].split('#')[0]
168-
driver = uc.Chrome()
169-
driver.get(url)
170-
171-
# fullscreen_btn = driver.find_elements(By.XPATH, '//a[@class="btn btn-xss btn-secondary togglefullscreen"]')
172-
# for btn in fullscreen_btn:
173-
# btn.click()
174-
# time.sleep(0.05)
175194

176-
cookie = driver.get_cookies()[0]
177-
for key, value in cookie.items():
178-
cookie[key] = str(value)
195+
session = get_session_from_chromedriver(url)
179196

180-
resp = requests.get(url, headers=REQ_HEADER, allow_redirects=True, cookies=cookie, proxies=proxies)
197+
resp = session.get(url)
181198
soup = BeautifulSoup(resp.content, 'lxml')
182199
parse_source_soup(soup, address)
183200

184201
def download_url(url, retry=3, retry_delay=5, throw_if_fail=False):
185202
address = url.split('/')[-1].split('#')[0]
186-
resp = requests.get(url, headers=REQ_HEADER, allow_redirects=False, proxies=proxies)
203+
resp = session.get(url, allow_redirects=False)
187204

188205
if resp.status_code != 200:
189206
if retry > 0:
@@ -198,7 +215,6 @@ def download_url(url, retry=3, retry_delay=5, throw_if_fail=False):
198215
soup = BeautifulSoup(resp.content, 'lxml')
199216
parse_source_soup(soup, address)
200217

201-
202218
if __name__ == '__main__':
203219
ap = argparse.ArgumentParser()
204220
ap.add_argument("--web", default="etherscan",type=str, help="Choose website, etherscan(default) or bscscan")
@@ -210,6 +226,7 @@ def download_url(url, retry=3, retry_delay=5, throw_if_fail=False):
210226
ROOT_DIR = f'{OUTPUT_DIR}/contracts'
211227

212228
web = args.web
229+
213230
if web == 'etherscan':
214231
VERIFIED_CONTRACT_URL = 'https://etherscan.io/contractsVerified'
215232
CONTRACT_SOURCE_URL = 'https://etherscan.io/address/{}#code'
@@ -237,6 +254,9 @@ def download_url(url, retry=3, retry_delay=5, throw_if_fail=False):
237254
print(CONTRACT_SOURCE_URL)
238255
print(ROOT_DIR)
239256
url = args.url
257+
258+
load_session(VERIFIED_CONTRACT_URL)
259+
240260
if url:
241261
fn(url)
242262
else:

0 commit comments

Comments
 (0)