gcscan.py
#! /usr/bin/python
# -*- coding: utf-8 -*-
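"""gcscan.py: retrieve the Google Cache date of a list of URLs.

Reads URLs from a text file (one per line), opens the cached version of
each one on webcache.googleusercontent.com with Selenium, extracts the
cache date from the "google-cache-hdr" banner, and writes "url;date"
rows to a CSV file.
"""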
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import re
import requests
import time
import csv
class GScan(object):
    # Get data from the Google Cache,
    # such as the date of the last indexation.
    def __init__(self):
        self.source_file = ""    # Path of the file listing the URLs
        self.urls = []           # URLs read from the source file
        self.results_urls = []   # List of URLs crawled
        self.results_dates = []  # List of cache dates, one per URL
        self.urls_max_count = 0  # Number of URLs to check
    def extract_urls_from_file(self, source_file):
        # Read one URL per line and strip the trailing "\n",
        # otherwise it ends up inside the Google Cache URL later on.
        with open(source_file, "r") as file:
            self.urls = [line.strip() for line in file]
        return self.urls
    def get_urls_results(self, urls_extracted):
        # Store the URLs to crawl and remember how many there are
        self.results_urls.extend(urls_extracted)
        self.urls_max_count = len(urls_extracted)
        return self.results_urls
    def check_captcha(self, driver, current_url):
        # Google shows an element with id="captcha" when it blocks the crawl
        captcha = 1
        driver.get(current_url)
        try:
            driver.find_element_by_id("captcha")
        except NoSuchElementException:
            captcha = 0
        return captcha
    def unlock_captcha(self, current_url):
        print("[!] Google asks for a captcha: solve it, then press Enter to resume the scan")
        driver2 = webdriver.Firefox()
        driver2.get(current_url)
        input()
        driver2.close()  # Close the helper window once the captcha is solved
    def fill_csv(self, array_urls, array_dates, csv_file_path):
        # Merge the two arrays into (url, date) rows and write the CSV file
        with open(csv_file_path, "w") as file:
            csv_file = csv.writer(file, delimiter=';', lineterminator='\n')
            csv_file.writerows(zip(array_urls, array_dates))
    def get_cache_date(self, urls_extracted, driver):
        print("Browser loading...")
        if driver == "firefox":
            driver = webdriver.Firefox()
        elif driver == "ghostJS":
            driver = webdriver.PhantomJS()
        i = 0  # For progression check
        print("There are " + str(self.urls_max_count) + " URLs ready.")
        for url in urls_extracted:
            error_flag = 0  # The URL is considered accessible by default
            i += 1
            # Prepare the Google Cache URL
            google_cache_url = "http://webcache.googleusercontent.com/search?q=cache:" + url \
                               + "&espv=2&strip=0&vwsrc=1"
            # Show the progression
            print("Checking URL " + str(i) + "/" + str(self.urls_max_count))
            if i > 75:  # Past 75 URLs, slow down the crawl to reduce the captcha risk :)
                print("Please wait 27 sec (avoid captcha)")
                time.sleep(27)
            # Open the cached version of the URL
            driver.get(google_cache_url)
            try:
                # Get the cache banner, the element with id="google-cache-hdr"
                header = driver.find_element_by_id("google-cache-hdr").text
            except NoSuchElementException:
                # The cache doesn't exist: find out why
                error_flag = 1
                status_code = 200  # We consider the URL is 200 by default
                # We try to access the page directly by its URL
                try:
                    status_code = requests.get(url).status_code
                except requests.RequestException:
                    status_code = 0  # The page could not be reached at all
                if status_code == 200:  # The page exists but is not cached
                    is_captcha = self.check_captcha(driver, google_cache_url)
                    if is_captcha == 0:
                        self.results_dates.append("No cache")
                    else:
                        # A captcha blocks the scan: let the user solve it, then retry once
                        self.unlock_captcha(google_cache_url)
                        driver.get(google_cache_url)
                        try:
                            header = driver.find_element_by_id("google-cache-hdr").text
                            error_flag = 0  # The cache is readable after all
                        except NoSuchElementException:
                            self.results_dates.append("No cache")
                else:  # The page is not accessible (404, 500, etc.)
                    self.results_dates.append("Error " + str(status_code))
            if error_flag == 0:  # The page is cached, so we get the cache date
                # Based on the French version of Google (google.fr), sorry :(
                # Get the date between "était affichée le " and " GMT" in the banner
                header_date = re.findall('était affichée le (.*?) GMT', header)
                cache_date = header_date[0].replace("\n", "")  # Strip any stray line break
                self.results_dates.append(cache_date)
        driver.close()
        return self.results_dates
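

# A minimal usage sketch, not part of the original script: the input file
# "urls.txt" and the output path "results.csv" are assumed names for the
# example, and the scan is run with the Firefox driver.
if __name__ == "__main__":
    scanner = GScan()
    urls = scanner.extract_urls_from_file("urls.txt")  # One URL per line
    crawled = scanner.get_urls_results(urls)           # Also sets urls_max_count
    dates = scanner.get_cache_date(crawled, "firefox") # Opens a Firefox window
    scanner.fill_csv(crawled, dates, "results.csv")    # One "url;date" row per URL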