Skip to content

Commit

Permalink
Display an error in case operation was longer than OMAP lock duration
Browse files Browse the repository at this point in the history
Fixes #599

Signed-off-by: Gil Bregman <gbregman@il.ibm.com>
  • Loading branch information
gbregman committed Aug 29, 2024
1 parent 8b1fe4e commit f733449
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 1 deletion.
8 changes: 8 additions & 0 deletions control/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ def __init__(self, omap_state, gateway_state, rpc_lock: threading.Lock) -> None:
self.omap_file_lock_retries = self.omap_state.config.getint_with_default("gateway", "omap_file_lock_retries", 30)
self.omap_file_lock_retry_sleep_interval = self.omap_state.config.getfloat_with_default("gateway",
"omap_file_lock_retry_sleep_interval", 1.0)
self.lock_start_time = 0.0
# This is used for testing purposes only, to allow us to test locking from two gateways at the same time
self.omap_file_disable_unlock = self.omap_state.config.getboolean_with_default("gateway", "omap_file_disable_unlock", False)
if self.omap_file_disable_unlock:
Expand All @@ -240,11 +241,18 @@ def __init__(self, omap_state, gateway_state, rpc_lock: threading.Lock) -> None:
def __enter__(self):
    """Enter the context: take the OMAP lock and record when it was taken.

    When omap_file_lock_duration is not positive nothing is locked and no
    start time is recorded.

    Returns this object, so it can be bound by ``with ... as``.
    """
    if not self.omap_file_lock_duration > 0:
        return self

    self.lock_omap()
    # Monotonic clock, so the elapsed time computed on exit is immune to
    # wall-clock adjustments.
    self.lock_start_time = time.monotonic()
    return self

def __exit__(self, typ, value, traceback):
    """Leave the context: release the OMAP lock and report an overlong operation.

    The time elapsed since the recorded lock start is measured, the start
    time is reset, and the lock is released. If the elapsed time is greater
    than omap_file_lock_duration an error is logged.

    Nothing is done when omap_file_lock_duration is not positive. The method
    returns None, so an exception raised in the ``with`` body propagates.
    """
    if not self.omap_file_lock_duration > 0:
        return

    started_at = self.lock_start_time
    # A falsy start time means no lock start was recorded; report no elapsed time.
    duration = time.monotonic() - started_at if started_at else 0.0
    self.lock_start_time = 0.0
    self.unlock_omap()

    if duration > self.omap_file_lock_duration:
        self.logger.error(f"Operation ran for {duration:.2f} seconds, but the OMAP lock expired after {self.omap_file_lock_duration} seconds")

def get_omap_lock_to_use(self, context):
if context:
Expand Down
43 changes: 42 additions & 1 deletion tests/test_omap_lock.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
import spdk.rpc.bdev as rpc_bdev

image = "mytestdevimage"
image2 = "mytestdevimage2"
pool = "rbd"
subsystem_prefix = "nqn.2016-06.io.spdk:cnode"
host_nqn_prefix = "nqn.2014-08.org.nvmexpress:uuid:22207d09-d8af-4ed2-84ec-a6d80b"
created_resource_count = 10

def setup_config(config, gw1_name, gw2_name, gw_group, update_notify ,update_interval_sec, disable_unlock, lock_duration,
def setup_config(config, gw1_name, gw2_name, gw_group, update_notify, update_interval_sec, disable_unlock, lock_duration,
sock1_name, sock2_name, port_inc):
"""Sets up the config objects for gateways A and B """

Expand Down Expand Up @@ -67,6 +68,26 @@ def stop_servers(gatewayA, gatewayB):
gatewayA.server.stop(grace=1)
gatewayB.server.stop(grace=1)

@pytest.fixture(scope="function")
def short_lock_duration(config, request):
    """Bring up gateways A and B configured with a lock duration of 1.

    Yields a tuple ``(stubA, stubB, gatewayA.gateway_rpc, gatewayB.gateway_rpc)``
    and stops both servers after the test that uses the fixture has finished.
    """
    group_name = "Group1"

    # Build one config per gateway; lock_duration=1 is the short value this
    # fixture exists for (presumably seconds -- confirm against setup_config).
    cfg_a, cfg_b = setup_config(config, "GatewayA", "GatewayB", group_name,
                                update_notify=False, update_interval_sec=300,
                                disable_unlock=False, lock_duration=1,
                                sock1_name="spdk_GatewayA.sock",
                                sock2_name="spdk_GatewayB.sock",
                                port_inc=0)
    gw_addr = cfg_a.get("gateway", "addr")
    port_a = cfg_a.getint("gateway", "port")
    port_b = cfg_b.getint("gateway", "port")
    ceph = CephUtils(config)

    # Both servers stay inside their context managers for the whole test
    with GatewayServer(cfg_a) as gw_a, GatewayServer(cfg_b) as gw_b:
        stub_a, stub_b = start_servers(gw_a, gw_b, group_name, gw_addr, port_a, port_b, ceph)
        yield stub_a, stub_b, gw_a.gateway_rpc, gw_b.gateway_rpc
        stop_servers(gw_a, gw_b)

@pytest.fixture(scope="function")
def conn_omap_reread(config, request):
"""Sets up and tears down Gateways A and B."""
Expand Down Expand Up @@ -181,6 +202,26 @@ def check_resource_by_index(i, subsys_list, hosts_info):
pass
assert found_host

def test_short_lock_duration(config, short_lock_duration, caplog):
    """Test an operation being longer than the OMAP lock duration.

    Creates a subsystem and then adds a namespace (with image creation) through
    gateway A, and verifies that both calls succeed and that the captured log
    contains the two messages expected when the lock duration was exceeded.
    """
    # Only gateway A's stub is used; the other fixture values are ignored.
    stubA, _stubB, _gatewayA, _gatewayB = short_lock_duration
    nqn = subsystem_prefix + "Y1"
    serial = "Ceph00000000000002"
    nsid = 20

    subsystem_req = pb2.create_subsystem_req(subsystem_nqn=nqn, serial_number=serial, max_namespaces=256, enable_ha=True)
    namespace_req = pb2.namespace_add_req(subsystem_nqn=nqn, nsid=nsid,
                                          rbd_pool_name=pool, rbd_image_name=image2, block_size=4096,
                                          create_image=True, size=16*1024*1024, force=True)
    # Drop anything logged during fixture setup so only this test's output is checked
    caplog.clear()
    ret_subsystem = stubA.create_subsystem(subsystem_req)
    assert ret_subsystem.status == 0
    ret_namespace = stubA.namespace_add(namespace_req)
    assert ret_namespace.status == 0
    # The original referenced the undefined name "capog" here, which raised
    # NameError instead of checking the log.
    assert "No such lock, the lock duration might have passed" in caplog.text
    assert "seconds, but the OMAP lock expired after" in caplog.text

def test_multi_gateway_omap_reread(config, conn_omap_reread, caplog):
"""Tests reading out of date OMAP file
"""
Expand Down

0 comments on commit f733449

Please sign in to comment.