Skip to content

Commit a9651c0

Browse files
authored
Merge 643e5bb into e90285a
2 parents e90285a + 643e5bb commit a9651c0

File tree

6 files changed

+82
-15
lines changed

6 files changed

+82
-15
lines changed

ydb/apps/dstool/lib/common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ def apply_args(self, args, with_localhost=True):
111111

112112
if args.token_file:
113113
self.token = args.token_file.readline().rstrip('\r\n')
114+
args.token_file.close()
114115
if self.token is None:
115116
self.token = os.getenv('YDB_TOKEN')
116117
if self.token is not None:

ydb/apps/dstool/lib/dstool_cmd_cluster_workload_run.py

Lines changed: 64 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ def add_options(p):
1616
p.add_argument('--disable-evicts', action='store_true', help='Disable VDisk evicts')
1717
p.add_argument('--disable-restarts', action='store_true', help='Disable node restarts')
1818
p.add_argument('--enable-pdisk-encryption-keys-changes', action='store_true', help='Enable changes of PDisk encryption keys')
19+
p.add_argument('--enable-kill-tablets', action='store_true', help='Enable tablet killer')
20+
p.add_argument('--enable-kill-blob-depot', action='store_true', help='Enable BlobDepot killer')
21+
p.add_argument('--kill-signal', type=str, default='KILL', help='Kill signal to send to restart node')
1922

2023

2124
def fetch_start_time_map(base_config):
@@ -84,6 +87,8 @@ def do(args):
8487
config_retries -= 1
8588
continue
8689

90+
tablets = common.fetch_json_info('tabletinfo') if args.enable_kill_tablets or args.enable_kill_blob_depot else {}
91+
8792
config_retries = None
8893

8994
for vslot in base_config.VSlot:
@@ -135,7 +140,7 @@ def do_restart(node_id):
135140
host = node_fqdn_map[node_id]
136141
if args.enable_pdisk_encryption_keys_changes:
137142
update_pdisk_key_config(node_fqdn_map, pdisk_keys, node_id)
138-
subprocess.call(['ssh', host, 'sudo', 'killall', '-9', 'kikimr'])
143+
subprocess.call(['ssh', host, 'sudo', 'killall', '-%s' % args.kill_signal, 'kikimr'])
139144
if args.enable_pdisk_encryption_keys_changes:
140145
remove_old_pdisk_keys(pdisk_keys, pdisk_key_versions, node_id)
141146

@@ -185,6 +190,29 @@ def do_add_pdisk_key(node_id):
185190
"version" : v,
186191
"file" : "keynumber" + str(v)})
187192

193+
def do_kill_tablet():
    """Pick one random active leader tablet and request its restart.

    Candidates are the values of the enclosing scope's ``tablets`` mapping
    whose ``State`` is ``'Active'`` and whose ``Leader`` flag is truthy.
    The chosen tablet is reported on stdout and then a ``tablets`` request
    carrying ``RestartTabletID`` is issued through ``common.fetch``.

    Does nothing when there is no candidate: ``random.choice`` raises
    ``IndexError`` on an empty sequence, and ``do_kill_blob_depot`` already
    skips the action in that situation, so this function does the same.
    """
    tablet_list = [
        value
        for value in tablets.values()
        if value['State'] == 'Active' and value['Leader']
    ]
    if not tablet_list:
        # No active leader tablets known (e.g. tablet info was empty) -- nothing to kill.
        return
    item = random.choice(tablet_list)
    tablet_id = int(item['TabletId'])
    print('Killing tablet %d of type %s' % (tablet_id, item['Type']))
    # cache=False: the request must actually be sent every time, not served from a cached reply.
    common.fetch('tablets', dict(RestartTabletID=tablet_id), fmt='raw', cache=False)
203+
204+
def do_kill_blob_depot():
    """Request a restart of one randomly chosen active leader BlobDepot tablet.

    Scans the enclosing scope's ``tablets`` mapping for entries whose
    ``State`` is ``'Active'``, whose ``Leader`` flag is truthy and whose
    ``Type`` is ``'BlobDepot'``. When at least one matches, a random one is
    reported on stdout and a ``tablets`` request carrying ``RestartTabletID``
    is issued through ``common.fetch``. When none matches, nothing happens.
    """
    candidates = []
    for _, info in tablets.items():
        # Checks are kept in the same order as the fields were originally consulted.
        if info['State'] != 'Active':
            continue
        if not info['Leader']:
            continue
        if info['Type'] != 'BlobDepot':
            continue
        candidates.append(info)

    if not candidates:
        return

    victim = random.choice(candidates)
    victim_id = int(victim['TabletId'])
    print('Killing tablet %d of type %s' % (victim_id, victim['Type']))
    common.fetch('tablets', dict(RestartTabletID=victim_id), fmt='raw', cache=False)
215+
188216
################################################################################################################
189217

190218
now = datetime.utcnow()
@@ -193,19 +221,45 @@ def do_add_pdisk_key(node_id):
193221

194222
possible_actions = []
195223

224+
if args.enable_kill_tablets:
225+
possible_actions.append(('kill tablet', (do_kill_tablet,)))
226+
if args.enable_kill_blob_depot:
227+
possible_actions.append(('kill blob depot', (do_kill_blob_depot,)))
228+
229+
evicts = []
230+
wipes = []
231+
readonlies = []
232+
unreadonlies = []
233+
196234
for vslot in base_config.VSlot:
197235
if common.is_dynamic_group(vslot.GroupId):
198236
vslot_id = common.get_vslot_id(vslot.VSlotId)
199237
vdisk_id = '[%08x:%d:%d:%d]' % (vslot.GroupId, vslot.FailRealmIdx, vslot.FailDomainIdx, vslot.VDiskIdx)
200238
if vslot_id in vslot_readonly and not args.disable_readonly:
201-
possible_actions.append(('un-readonly vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_readonly, vslot, False)))
239+
unreadonlies.append(('un-readonly vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_readonly, vslot, False)))
202240
if can_act_on_vslot(*vslot_id) and (recent_restarts or args.disable_restarts):
203241
if not args.disable_evicts:
204-
possible_actions.append(('evict vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_evict, vslot_id)))
242+
evicts.append(('evict vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_evict, vslot_id)))
205243
if not args.disable_wipes:
206-
possible_actions.append(('wipe vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_wipe, vslot)))
244+
wipes.append(('wipe vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_wipe, vslot)))
207245
if not args.disable_readonly:
208-
possible_actions.append(('readonly vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_readonly, vslot, True)))
246+
readonlies.append(('readonly vslot id: %s, vdisk id: %s' % (vslot_id, vdisk_id), (do_readonly, vslot, True)))
247+
248+
def pick(v):
    """Execute one randomly selected action from ``v``.

    ``v`` is a sequence of ``(label, call)`` pairs, where ``call`` is a
    sequence whose first element is a callable and whose remaining elements
    are its positional arguments. The label of the selected pair is printed
    before the callable is invoked.
    """
    label, call = random.choice(v)
    print(label)
    handler = call[0]
    handler_args = call[1:]
    handler(*handler_args)
252+
253+
if evicts:
254+
possible_actions.append(('evict', (pick, evicts)))
255+
if wipes:
256+
possible_actions.append(('wipe', (pick, wipes)))
257+
if readonlies:
258+
possible_actions.append(('readonly', (pick, readonlies)))
259+
if unreadonlies:
260+
possible_actions.append(('un-readonly', (pick, unreadonlies)))
261+
262+
restarts = []
209263

210264
if start_time_map and len(recent_restarts) < 3:
211265
# sort so that the latest restarts come first
@@ -216,7 +270,10 @@ def do_add_pdisk_key(node_id):
216270
if args.enable_pdisk_encryption_keys_changes:
217271
possible_actions.append(('add new pdisk key to node with id: %d' % node_id, (do_add_pdisk_key, node_id)))
218272
if not args.disable_restarts:
219-
possible_actions.append(('restart node with id: %d' % node_id, (do_restart, node_id)))
273+
restarts.append(('restart node with id: %d' % node_id, (do_restart, node_id)))
274+
275+
if restarts:
276+
possible_actions.append(('restart', (pick, restarts)))
220277

221278
if not possible_actions:
222279
common.print_if_not_quiet(args, 'Waiting for the next round...', file=sys.stdout)
@@ -226,7 +283,7 @@ def do_add_pdisk_key(node_id):
226283
################################################################################################################
227284

228285
action_name, action = random.choice(possible_actions)
229-
common.print_if_not_quiet(args, '%s' % action_name, file=sys.stdout)
286+
print('%s %s' % (action_name, datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')))
230287

231288
try:
232289
action[0](*action[1:])

ydb/apps/dstool/lib/dstool_cmd_group_take_snapshot.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,11 @@ def get_endpoints():
4242
global output_file
4343
output_file = args.output
4444

45-
threads = []
46-
for p in get_endpoints():
47-
thread = Thread(target=fetch_blobs_from_vdisk, args=p, daemon=True)
48-
threads.append(thread)
49-
thread.start()
50-
for thread in threads:
51-
thread.join()
45+
with output_file:
46+
threads = []
47+
for p in get_endpoints():
48+
thread = Thread(target=fetch_blobs_from_vdisk, args=p, daemon=True)
49+
threads.append(thread)
50+
thread.start()
51+
for thread in threads:
52+
thread.join()

ydb/core/keyvalue/channel_balancer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ namespace NKikimr::NKeyValue {
114114
const size_t index = (LatencyQueue.size() - 1) * 99 / 100;
115115
const TDuration perc = latencies[index];
116116
weight = MeanExpectedLatency.GetValue() * weight / Max(perc, TDuration::MilliSeconds(1)).GetValue();
117-
Y_DEBUG_ABORT_UNLESS(weight);
117+
//Y_DEBUG_ABORT_UNLESS(weight);
118118
if (!weight) {
119119
weight = 1;
120120
}

ydb/core/keyvalue/keyvalue_state.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -634,6 +634,9 @@ void TKeyValueState::InitExecute(ui64 tabletId, TActorId keyValueActorId, ui32 e
634634
}
635635

636636
THelpers::DbEraseCollect(db, ctx);
637+
if (IsEmptyDbStart) {
638+
THelpers::DbUpdateState(StoredState, db, ctx);
639+
}
637640

638641
// corner case, if no CollectGarbage events were sent
639642
if (InitialCollectsSent == 0) {

ydb/core/keyvalue/keyvalue_storage_request.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,11 @@ class TKeyValueStorageRequest : public TActorBootstrapped<TKeyValueStorageReques
341341
IntermediateResults->Stat.GroupReadIops[std::make_pair(response.Id.Channel(), groupId)] += 1; // FIXME: count distinct blobs?
342342
read.Value.Write(readItem.ValueOffset, std::move(response.Buffer));
343343
} else {
344+
Y_VERIFY_DEBUG_S(response.Status != NKikimrProto::NODATA, "NODATA received for TEvGet"
345+
<< " TabletId# " << TabletInfo->TabletID
346+
<< " Id# " << response.Id
347+
<< " Key# " << read.Key);
348+
344349
TStringStream err;
345350
if (read.Message.size()) {
346351
err << read.Message << Endl;

0 commit comments

Comments
 (0)