@@ -16,6 +16,9 @@ def add_options(p):
16
16
p .add_argument ('--disable-evicts' , action = 'store_true' , help = 'Disable VDisk evicts' )
17
17
p .add_argument ('--disable-restarts' , action = 'store_true' , help = 'Disable node restarts' )
18
18
p .add_argument ('--enable-pdisk-encryption-keys-changes' , action = 'store_true' , help = 'Enable changes of PDisk encryption keys' )
19
+ p .add_argument ('--enable-kill-tablets' , action = 'store_true' , help = 'Enable tablet killer' )
20
+ p .add_argument ('--enable-kill-blob-depot' , action = 'store_true' , help = 'Enable BlobDepot killer' )
21
+ p .add_argument ('--kill-signal' , type = str , default = 'KILL' , help = 'Kill signal to send to restart node' )
19
22
20
23
21
24
def fetch_start_time_map (base_config ):
@@ -84,6 +87,8 @@ def do(args):
84
87
config_retries -= 1
85
88
continue
86
89
90
+ tablets = common .fetch_json_info ('tabletinfo' ) if args .enable_kill_tablets or args .enable_kill_blob_depot else {}
91
+
87
92
config_retries = None
88
93
89
94
for vslot in base_config .VSlot :
@@ -135,7 +140,7 @@ def do_restart(node_id):
135
140
host = node_fqdn_map [node_id ]
136
141
if args .enable_pdisk_encryption_keys_changes :
137
142
update_pdisk_key_config (node_fqdn_map , pdisk_keys , node_id )
138
- subprocess .call (['ssh' , host , 'sudo' , 'killall' , '-9' , 'kikimr' ])
143
+ subprocess .call (['ssh' , host , 'sudo' , 'killall' , '-%s' % args . kill_signal , 'kikimr' ])
139
144
if args .enable_pdisk_encryption_keys_changes :
140
145
remove_old_pdisk_keys (pdisk_keys , pdisk_key_versions , node_id )
141
146
@@ -185,6 +190,29 @@ def do_add_pdisk_key(node_id):
185
190
"version" : v ,
186
191
"file" : "keynumber" + str (v )})
187
192
193
def do_kill_tablet():
    """Pick one active leader tablet at random and request its restart.

    Candidates come from the enclosing-scope ``tablets`` mapping; only
    entries whose ``State`` is ``'Active'`` and whose ``Leader`` flag is
    truthy are considered. Does nothing when there is no such tablet.
    """
    candidates = [
        info
        for info in tablets.values()
        if info['State'] == 'Active' and info['Leader']
    ]
    # random.choice() raises IndexError on an empty sequence, so bail out
    # early, mirroring the guard in do_kill_blob_depot().
    if not candidates:
        return
    item = random.choice(candidates)
    tablet_id = int(item['TabletId'])
    print('Killing tablet %d of type %s' % (tablet_id, item['Type']))
    # cache=False is passed so the request is issued on every call
    # (presumably bypassing a response cache in common.fetch -- confirm).
    common.fetch('tablets', dict(RestartTabletID=tablet_id), fmt='raw', cache=False)
203
+
204
def do_kill_blob_depot():
    """Request a restart of one randomly chosen active leader BlobDepot tablet.

    Scans the enclosing-scope ``tablets`` mapping for entries that are
    ``'Active'``, have a truthy ``Leader`` flag and are of type
    ``'BlobDepot'``. Returns without doing anything when none match.
    """
    def is_live_depot(info):
        return info['State'] == 'Active' and info['Leader'] and info['Type'] == 'BlobDepot'

    depots = [info for info in tablets.values() if is_live_depot(info)]
    if not depots:
        return

    victim = random.choice(depots)
    tablet_id = int(victim['TabletId'])
    print('Killing tablet %d of type %s' % (tablet_id, victim['Type']))
    common.fetch('tablets', dict(RestartTabletID=tablet_id), fmt='raw', cache=False)
215
+
188
216
################################################################################################################
189
217
190
218
now = datetime .utcnow ()
@@ -193,19 +221,45 @@ def do_add_pdisk_key(node_id):
193
221
194
222
possible_actions = []
195
223
224
+ if args .enable_kill_tablets :
225
+ possible_actions .append (('kill tablet' , (do_kill_tablet ,)))
226
+ if args .enable_kill_blob_depot :
227
+ possible_actions .append (('kill blob depot' , (do_kill_blob_depot ,)))
228
+
229
+ evicts = []
230
+ wipes = []
231
+ readonlies = []
232
+ unreadonlies = []
233
+
196
234
# Sort every possible per-VSlot action into its category list (evicts, wipes,
# readonlies, unreadonlies). Each entry is (description, (callable, *args)).
for vslot in base_config.VSlot:
    if common.is_dynamic_group(vslot.GroupId):
        vslot_id = common.get_vslot_id(vslot.VSlotId)
        vdisk_id = '[%08x:%d:%d:%d]' % (vslot.GroupId, vslot.FailRealmIdx, vslot.FailDomainIdx, vslot.VDiskIdx)
        # A VSlot already in read-only mode can be switched back.
        if vslot_id in vslot_readonly and not args.disable_readonly:
            unreadonlies.append(('un-readonly vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_readonly, vslot, False)))
        if can_act_on_vslot(*vslot_id) and (recent_restarts or args.disable_restarts):
            if not args.disable_evicts:
                evicts.append(('evict vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_evict, vslot_id)))
            if not args.disable_wipes:
                wipes.append(('wipe vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_wipe, vslot)))
            if not args.disable_readonly:
                # Must target the list named `readonlies`; a misspelled name
                # here raises NameError the first time this branch runs.
                readonlies.append(('readonly vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_readonly, vslot, True)))
247
+
248
def pick(v):
    """Choose one entry of ``v`` at random, print its label and run it.

    ``v`` is a non-empty sequence of ``(label, call)`` pairs, where ``call``
    is a sequence whose first element is a callable and whose remaining
    elements are its positional arguments.
    """
    label, call = random.choice(v)
    print(label)
    func = call[0]
    params = call[1:]
    func(*params)
252
+
253
+ if evicts :
254
+ possible_actions .append (('evict' , (pick , evicts )))
255
+ if wipes :
256
+ possible_actions .append (('wipe' , (pick , wipes )))
257
+ if readonlies :
258
+ possible_actions .append (('readonly' , (pick , readonlies )))
259
+ if unreadonlies :
260
+ possible_actions .append (('un-readonly' , (pick , unreadonlies )))
261
+
262
+ restarts = []
209
263
210
264
if start_time_map and len (recent_restarts ) < 3 :
211
265
# sort so that the latest restarts come first
@@ -216,7 +270,10 @@ def do_add_pdisk_key(node_id):
216
270
if args .enable_pdisk_encryption_keys_changes :
217
271
possible_actions .append (('add new pdisk key to node with id: %d' % node_id , (do_add_pdisk_key , node_id )))
218
272
if not args .disable_restarts :
219
- possible_actions .append (('restart node with id: %d' % node_id , (do_restart , node_id )))
273
+ restarts .append (('restart node with id: %d' % node_id , (do_restart , node_id )))
274
+
275
+ if restarts :
276
+ possible_actions .append (('restart' , (pick , restarts )))
220
277
221
278
if not possible_actions :
222
279
common .print_if_not_quiet (args , 'Waiting for the next round...' , file = sys .stdout )
@@ -226,7 +283,7 @@ def do_add_pdisk_key(node_id):
226
283
################################################################################################################
227
284
228
285
action_name , action = random .choice (possible_actions )
229
- common . print_if_not_quiet ( args , '%s' % action_name , file = sys . stdout )
286
+ print ( '%s %s ' % ( action_name , datetime . utcnow (). strftime ( '%Y-%m-%dT%H:%M:%S' )) )
230
287
231
288
try :
232
289
action [0 ](* action [1 :])
0 commit comments