-from scrapeops_scrapy.core.api import SOPSRequest
-from scrapeops_scrapy.normalizer.domains import DomainNormalizer
-from scrapeops_scrapy.utils import utils
 import json
 import logging
 import re
+import time
+
+from scrapeops_scrapy.core.api import SOPSRequest
+from scrapeops_scrapy.normalizer.domains import DomainNormalizer
+from scrapeops_scrapy.utils import utils


-class ErrorLogger(object):
-
+class ErrorLogger(object):
     ERROR_LOGGER_ACTIVE = True

     def __init__(self, spider, crawler, spider_settings, server_hostname, server_ip, start_time, log_file):
         self.spider = spider
         self.crawler = crawler
-        self.bot_name = crawler.settings.get('BOT_NAME', 'None')
+        self.bot_name = crawler.settings.get("BOT_NAME", "None")
         self.spider_settings = spider_settings
         self.server_hostname = server_hostname
         self.server_ip = server_ip
@@ -28,72 +30,70 @@ def update_error_logger(self, job_name, job_id):

     def log_error(self, reason=None, error=None, data=None, request_type=None):
         if ErrorLogger.ERROR_LOGGER_ACTIVE:
-            self._error_history.append({
-                'time': utils.current_time(),
-                'reason': reason,
-                'error': str(error),
-                'data': data,
-                'request_type': request_type,
-            })
-
+            self._error_history.append(
+                {
+                    "time": utils.current_time(),
+                    "reason": reason,
+                    "error": str(error),
+                    "data": data,
+                    "request_type": request_type,
+                }
+            )

     def send_error_report(self, error_type=None, body=None, log_data=False):
         if ErrorLogger.ERROR_LOGGER_ACTIVE:
             try:
                 data, status = SOPSRequest().error_report_request(error_type=error_type, body=body)
                 if status.valid:
-                    if log_data and self.log_file is not None and data.get('sdk_error_id') is not None:
-                        with open(self.log_file, 'rb') as f:
+                    if log_data and self.log_file is not None and data.get("sdk_error_id") is not None:
+                        with open(self.log_file, "rb") as f:
                             post_body = {
-                                'sops_sdk': 'scrapy',
-                                'spider_name': self.spider.name,
-                                'job_group_id': self.job_group_id,
-                                'job_group_name': self.job_group_name,
-                                'sdk_error_id': data.get('sdk_error_id')
-                            }
-                            _, status = SOPSRequest().error_report_request(error_type=error_type, body=post_body, files={'file': f})
+                                "sops_sdk": "scrapy",
+                                "spider_name": self.spider.name,
+                                "job_group_id": self.job_group_id,
+                                "job_group_name": self.job_group_name,
+                                "sdk_error_id": data.get("sdk_error_id"),
+                            }
+                            _, status = SOPSRequest().error_report_request(
+                                error_type=error_type, body=post_body, files={"file": f}
+                            )
                             if status.valid is False:
-                                self.log_error(reason='send_error_logs_failed', error=status.error)
+                                self.log_error(reason="send_error_logs_failed", error=status.error)

                 if status.valid is False:
-                    self.log_error(reason='send_error_report_failed', error=status.error)
+                    self.log_error(reason="send_error_report_failed", error=status.error)
             except Exception:
                 pass

-
     def sdk_error_close(self, reason=None, error=None, request_type=None, data=None):
         if ErrorLogger.ERROR_LOGGER_ACTIVE:
             self.log_error(reason=reason, error=error, data=data, request_type=request_type)
             error_data = {
-                'final_reason': reason,
-                'sops_sdk': 'scrapy',
-                'spider_name': self.spider.name,
-                'bot_name': self.bot_name,
-                'server_ip': self.server_ip,
-                'server_hostname': self.server_hostname,
-                'job_group_id': self.job_group_id,
-                'job_group_name': self.job_group_name,
-                'job_args': utils.get_args(),
-                'job_start_time': self.start_time,
-                'sops_scrapeops_version': utils.get_scrapeops_version(),
-                'sops_scrapy_version': utils.get_scrapy_version(),
-                'sops_python_version': utils.get_python_version(),
-                'sops_system_version': utils.get_system_version(),
-                'sops_middleware_enabled': utils.scrapeops_middleware_installed(self.spider_settings),
-                'error_history': self._error_history,
+                "final_reason": reason,
+                "sops_sdk": "scrapy",
+                "spider_name": self.spider.name,
+                "bot_name": self.bot_name,
+                "server_ip": self.server_ip,
+                "server_hostname": self.server_hostname,
+                "job_group_id": self.job_group_id,
+                "job_group_name": self.job_group_name,
+                "job_args": utils.get_args(),
+                "job_start_time": self.start_time,
+                "sops_scrapeops_version": utils.get_scrapeops_version(),
+                "sops_scrapy_version": utils.get_scrapy_version(),
+                "sops_python_version": utils.get_python_version(),
+                "sops_system_version": utils.get_system_version(),
+                "sops_middleware_enabled": utils.scrapeops_middleware_installed(self.spider_settings),
+                "error_history": self._error_history,
             }
-
-            self.send_error_report(error_type='sdk_close', body=error_data, log_data=True)
+            self.send_error_report(error_type="sdk_close", body=error_data, log_data=True)


-class TailLogHandler(logging.Handler):
-
+class TailLogHandler(logging.Handler):
     retryErrors = [
         "Couldn't bind",
- "Hostname couldn't be looked up'"
96
- "No route to host" ,
96
+ "Hostname couldn't be looked up'" "No route to host" ,
97
         "Connection was refused by other side",
         "TCP connection timed out",
         "File used for UNIX socket is no good",
@@ -124,123 +124,117 @@ def __init__(self, log_dict, log_dict_cumulative):
         self.log_dict = log_dict
         self.log_dict_cumulative = log_dict_cumulative

-
     def flush(self):
         self.log_dict.clear()
-

     def emit(self, record):
-
         try:
-
-            if (record.levelname == "ERROR" or record.levelname == "WARNING" or record.levelname == "CRITICAL"):
-
-                if hasattr(record, 'message'):
+            if record.levelname == "ERROR" or record.levelname == "WARNING" or record.levelname == "CRITICAL":
+                if hasattr(record, "message"):
                     errorMessage = record.message
-                    fileAndLine = record.pathname + ', line: ' + str(record.lineno)
-                    dateTime = record.asctime
+                    fileAndLine = record.pathname + ", line: " + str(record.lineno)
+                    dateTime = self.format_time(record)
                     type = record.levelname
                     engine = record.name

-
-                    #covering warnings/probableCause/traceback missing
-                    traceback = 'No traceback available'
-                    probableCause = ''
+                    # covering warnings/probableCause/traceback missing
+                    traceback = "No traceback available"
+                    probableCause = ""

                     if record.exc_text is not None:
                         traceback = record.exc_text
-                        splitTraceback = traceback.split('\n')
+                        splitTraceback = traceback.split("\n")
                         probableCause = splitTraceback[len(splitTraceback) - 1]

-
-                    #covering retrys
-                    if ("Gave up retrying <" in record.message):
-
+                    # covering retries
+                    if "Gave up retrying <" in record.message:
                         for retryError in self.retryErrors:
-                            if (retryError in record.message):
-                                method = record.message.split('<')[1].split(' ')[0]
+                            if retryError in record.message:
+                                method = record.message.split("<")[1].split(" ")[0]
                                 errorMessage = "Error: Gave up retrying " + method + " request - " + retryError
-                                fileAndLine = ''
+                                fileAndLine = ""
                                 probableCause = retryError
                                 break
-
+
                     # Deprecation Warnings
                     if "ScrapyDeprecationWarning:" in record.message and record.message[0] == "/":
                         splitString = record.message.split("ScrapyDeprecationWarning:")
                         errorMessage = "ScrapyDeprecationWarning: " + splitString[1]
                         probableCause = splitString[0]

-
                     # "Some Other Error Occurred"
-                    if "Some other error occurred: " in record.message:
-                        splitError = record.message.split('/')
+                    if "Some other error occurred: " in record.message:
+                        splitError = record.message.split("/")
                         cleanError = splitError[0].split(">: ")[1]
                         errorMessage = "Some other error occurred: " + cleanError
                         probableCause = cleanError
                         traceback = record.message

-
                     # Convert Urls To Domains in Error Messages
-                    urls = re.findall(r'(https?://[^\s]+)', errorMessage)
+                    urls = re.findall(r"(https?://[^\s]+)", errorMessage)
                     for url in urls:
                         domain = DomainNormalizer.get_domain(url)
                         errorMessage = errorMessage.replace(url, domain)

-
                     if errorMessage in self.log_dict:
-                        self.log_dict[errorMessage]['count'] = self.log_dict[errorMessage]['count'] + 1
+                        self.log_dict[errorMessage]["count"] = self.log_dict[errorMessage]["count"] + 1
                     else:
                         self.log_dict[errorMessage] = {
-                            'type': type,
-                            'engine': engine,
-                            'name': errorMessage,
-                            'count': 1,
-                            'traceback': traceback,
-                            'message': probableCause,
-                            'filepath': fileAndLine,
-                            'dateTime': dateTime
-                        }
-
-                    if (SOPSRequest.HIGH_FREQ_ACC == True):
-
-                        if (errorMessage in self.log_dict_cumulative):
-                            self.log_dict_cumulative[errorMessage]['count'] = self.log_dict_cumulative[errorMessage]['count'] + 1
+                            "type": type,
+                            "engine": engine,
+                            "name": errorMessage,
+                            "count": 1,
+                            "traceback": traceback,
+                            "message": probableCause,
+                            "filepath": fileAndLine,
+                            "dateTime": dateTime,
+                        }
+
+                    if SOPSRequest.HIGH_FREQ_ACC == True:
+                        if errorMessage in self.log_dict_cumulative:
+                            self.log_dict_cumulative[errorMessage]["count"] = (
+                                self.log_dict_cumulative[errorMessage]["count"] + 1
+                            )
                         else:
-
-                            self.log_dict_cumulative[errorMessage] = {
-                                'type': type,
-                                'engine': engine,
-                                'name': errorMessage,
-                                'count': 1,
-                                'traceback': traceback,
-                                'message': probableCause,
-                                'filepath': fileAndLine,
-                                'dateTime': dateTime
+                            self.log_dict_cumulative[errorMessage] = {
+                                "type": type,
+                                "engine": engine,
+                                "name": errorMessage,
+                                "count": 1,
+                                "traceback": traceback,
+                                "message": probableCause,
+                                "filepath": fileAndLine,
+                                "dateTime": dateTime,
                             }
-
+
         except Exception as e:
-            logging.info('Error: Error in error logger')
+            logging.info("Error: Error in error logger")
             logging.info(e, exc_info=True)

-class TailLogger(object):
+    def format_time(self, record):
+        if self.formatter:
+            return self.formatter.formatTime(record)
+        else:
+            # Fallback to a basic time format if no formatter is set
+            return time.strftime("%Y-%m-%d %H:%M:%S")
+

+class TailLogger(object):
     def __init__(self):
         self._log_dict = {}
         self._log_dict_cumulative = {}
         self._log_handler = TailLogHandler(self._log_dict, self._log_dict_cumulative)

-    def contents(self, type="diff"):
-
-        if (type == "cumulative"):
-            jsonLogsCumulative = json.dumps(self._log_dict_cumulative, indent=2)
+    def contents(self, type="diff"):
+        if type == "cumulative":
+            jsonLogsCumulative = json.dumps(self._log_dict_cumulative, indent=2)
             return jsonLogsCumulative

         else:
-            jsonLogs = json.dumps(self._log_dict, indent=2)
+            jsonLogs = json.dumps(self._log_dict, indent=2)
             self._log_handler.flush()
             return jsonLogs

     @property
     def log_handler(self):
         return self._log_handler
-
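For context on how the classes in this diff are wired up, here is a minimal usage sketch. It is not part of the change itself: the import path `scrapeops_scrapy.core.error_logger` is assumed from the package layout, and the logger name and message are illustrative. A stream handler is configured first because `TailLogHandler.emit()` only inspects records that already carry a `message` attribute, which `logging.Formatter.format()` populates.

```python
import logging

# Assumed module path; TailLogger/TailLogHandler are the classes in the diff above.
from scrapeops_scrapy.core.error_logger import TailLogger

# basicConfig installs a StreamHandler whose formatter sets record.message
# before TailLogHandler.emit() sees the record via the root logger.
logging.basicConfig(level=logging.INFO)

tail_logger = TailLogger()
handler = tail_logger.log_handler
# With a formatter attached, format_time() delegates to formatter.formatTime();
# without one it falls back to time.strftime("%Y-%m-%d %H:%M:%S").
handler.setFormatter(logging.Formatter("%(asctime)s [%(name)s] %(levelname)s: %(message)s"))
logging.getLogger().addHandler(handler)

# Illustrative error record; URLs in the message are reduced to domains via
# DomainNormalizer so the aggregation key stays stable across scraped pages.
logging.getLogger("scrapy.core.scraper").error(
    "Some error while scraping https://example.com/some/page"
)

print(tail_logger.contents())              # per-interval errors as JSON; flushes afterwards
print(tail_logger.contents("cumulative"))  # cumulative errors (filled when SOPSRequest.HIGH_FREQ_ACC is on)
```

Since `contents()` with the default `"diff"` type flushes the underlying dict after serializing it, repeated periodic reads report only the errors raised since the previous read.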