Skip to content

Commit

Permalink
Add logging && metrics
Browse files Browse the repository at this point in the history
Signed-off-by: JaySon-Huang <jayson.hjs@gmail.com>
  • Loading branch information
JaySon-Huang committed Feb 23, 2023
1 parent a779594 commit 3165fc6
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 5 deletions.
1 change: 1 addition & 0 deletions dbms/src/Common/CurrentMetrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
M(IOLimiterPendingFgWriteReq) \
M(IOLimiterPendingBgReadReq) \
M(IOLimiterPendingFgReadReq) \
M(S3LockServiceNumLatches) \
M(StoragePoolV2Only) \
M(StoragePoolV3Only) \
M(StoragePoolMixMode) \
Expand Down
8 changes: 8 additions & 0 deletions dbms/src/Common/TiFlashMetrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,14 @@ namespace DB
F(type_bg_write_alloc_bytes, {"type", "bg_write_alloc_bytes"})) \
M(tiflash_storage_rough_set_filter_rate, "Bucketed histogram of rough set filter rate", Histogram, \
F(type_dtfile_pack, {{"type", "dtfile_pack"}}, EqualWidthBuckets{0, 6, 20})) \
M(tiflash_disaggregated_object_lock_request_count, "Total number of S3 object lock/delete request", Counter, \
F(type_lock, {"type", "lock"}), F(type_delete, {"type", "delete"}), \
F(type_owner_changed, {"type", "owner_changed"}), F(type_error, {"type", "error"}), \
F(type_lock_conflict, {"type", "lock_conflict"}), F(type_delete_conflict, {"type", "delete_conflict"}), \
F(type_delete_risk, {"type", "delete_risk"})) \
M(tiflash_disaggregated_object_lock_request_duration_seconds, "Bucketed histogram of S3 object lock/delete request duration", Histogram, \
F(type_lock, {{"type", "cop"}}, ExpBuckets{0.001, 2, 20}), \
F(type_delete, {{"type", "batch"}}, ExpBuckets{0.001, 2, 20})) \
M(tiflash_raft_command_duration_seconds, "Bucketed histogram of some raft command: apply snapshot", \
Histogram, /* these command usually cost servel seconds, increase the start bucket to 50ms */ \
F(type_ingest_sst, {{"type", "ingest_sst"}}, ExpBuckets{0.05, 2, 10}), \
Expand Down
61 changes: 56 additions & 5 deletions dbms/src/Flash/Disaggregated/S3LockService.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <Common/CurrentMetrics.h>
#include <Common/TiFlashMetrics.h>
#include <Flash/Disaggregated/S3LockService.h>
#include <Flash/ServiceUtils.h>
#include <Storages/S3/S3Common.h>
Expand All @@ -29,6 +31,11 @@
#include <magic_enum.hpp>


namespace CurrentMetrics
{
extern const Metric S3LockServiceNumLatches;
}

namespace DB::S3
{

Expand All @@ -51,32 +58,47 @@ grpc::Status S3LockService::tryAddLock(const disaggregated::TryAddLockRequest *
{
try
{
Stopwatch watch;
SCOPE_EXIT({ GET_METRIC(tiflash_disaggregated_object_lock_request_duration_seconds, type_lock).Observe(watch.elapsedSeconds()); });
tryAddLockImpl(request->data_file_key(), request->lock_store_id(), request->lock_seq(), response);
if (response->result().has_conflict())
{
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_lock_conflict).Increment();
}
else if (response->result().has_not_owner())
{
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_owner_changed).Increment();
}
return grpc::Status::OK;
}
catch (const TiFlashException & e)
{
LOG_ERROR(log, "TiFlash Exception: {}\n{}", e.displayText(), e.getStackTrace().toString());
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_error).Increment();
return grpc::Status(grpc::StatusCode::INTERNAL, e.standardText());
}
catch (const Exception & e)
{
LOG_ERROR(log, "DB Exception: {}\n{}", e.message(), e.getStackTrace().toString());
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_error).Increment();
return grpc::Status(tiflashErrorCodeToGrpcStatusCode(e.code()), e.message());
}
catch (const pingcap::Exception & e)
{
LOG_ERROR(log, "KV Client Exception: {}", e.message());
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_error).Increment();
return grpc::Status(grpc::StatusCode::INTERNAL, e.message());
}
catch (const std::exception & e)
{
LOG_ERROR(log, "std exception: {}", e.what());
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_error).Increment();
return grpc::Status(grpc::StatusCode::INTERNAL, e.what());
}
catch (...)
{
LOG_ERROR(log, "other exception");
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_error).Increment();
return grpc::Status(grpc::StatusCode::INTERNAL, "other exception");
}
}
Expand All @@ -85,32 +107,47 @@ grpc::Status S3LockService::tryMarkDelete(const disaggregated::TryMarkDeleteRequ
{
try
{
Stopwatch watch;
SCOPE_EXIT({ GET_METRIC(tiflash_disaggregated_object_lock_request_duration_seconds, type_delete).Observe(watch.elapsedSeconds()); });
tryMarkDeleteImpl(request->data_file_key(), response);
if (response->result().has_conflict())
{
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_delete_conflict).Increment();
}
else if (response->result().has_not_owner())
{
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_owner_changed).Increment();
}
return grpc::Status::OK;
}
catch (const TiFlashException & e)
{
LOG_ERROR(log, "TiFlash Exception: {}\n{}", e.displayText(), e.getStackTrace().toString());
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_error).Increment();
return grpc::Status(grpc::StatusCode::INTERNAL, e.standardText());
}
catch (const Exception & e)
{
LOG_ERROR(log, "DB Exception: {}\n{}", e.message(), e.getStackTrace().toString());
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_error).Increment();
return grpc::Status(tiflashErrorCodeToGrpcStatusCode(e.code()), e.message());
}
catch (const pingcap::Exception & e)
{
LOG_ERROR(log, "KV Client Exception: {}", e.message());
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_error).Increment();
return grpc::Status(grpc::StatusCode::INTERNAL, e.message());
}
catch (const std::exception & e)
{
LOG_ERROR(log, "std exception: {}", e.what());
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_error).Increment();
return grpc::Status(grpc::StatusCode::INTERNAL, e.what());
}
catch (...)
{
LOG_ERROR(log, "other exception");
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_error).Increment();
return grpc::Status(grpc::StatusCode::INTERNAL, "other exception");
}
}
Expand All @@ -123,18 +160,21 @@ S3LockService::getDataFileLatch(const String & data_file_key)
if (it == file_latch_map.end())
{
it = file_latch_map.emplace(data_file_key, std::make_shared<DataFileMutex>()).first;
CurrentMetrics::add(CurrentMetrics::S3LockServiceNumLatches);
}
auto file_latch = it->second;
// must add ref count under the protection of `file_latch_map_mutex`
file_latch->addRefCount();
return file_latch;
}

bool S3LockService::tryAddLockImpl(const String & data_file_key, UInt64 lock_store_id, UInt64 lock_seq, disaggregated::TryAddLockResponse * response)
bool S3LockService::tryAddLockImpl(
const String & data_file_key,
UInt64 lock_store_id,
UInt64 lock_seq,
disaggregated::TryAddLockResponse * response)
{
if (data_file_key.empty())
return false;

GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_lock).Increment();
const S3FilenameView key_view = S3FilenameView::fromKey(data_file_key);
RUNTIME_CHECK(key_view.isDataFile(), data_file_key);

Expand All @@ -155,6 +195,7 @@ bool S3LockService::tryAddLockImpl(const String & data_file_key, UInt64 lock_sto
if (file_latch->decreaseRefCount() == 0)
{
file_latch_map.erase(data_file_key);
CurrentMetrics::sub(CurrentMetrics::S3LockServiceNumLatches);
}
});

Expand All @@ -163,6 +204,7 @@ bool S3LockService::tryAddLockImpl(const String & data_file_key, UInt64 lock_sto
{
auto * e = response->mutable_result()->mutable_conflict();
e->set_reason(fmt::format("data file not exist, key={}", data_file_key));
LOG_INFO(log, "data file lock conflict: not exist, key={}", data_file_key);
return false;
}

Expand All @@ -172,6 +214,7 @@ bool S3LockService::tryAddLockImpl(const String & data_file_key, UInt64 lock_sto
{
auto * e = response->mutable_result()->mutable_conflict();
e->set_reason(fmt::format("data file is mark deleted, key={} delmark={}", data_file_key, delmark_key));
LOG_INFO(log, "data file lock conflict: mark deleted, key={} delmark={}", data_file_key, delmark_key);
return false;
}

Expand All @@ -180,6 +223,7 @@ bool S3LockService::tryAddLockImpl(const String & data_file_key, UInt64 lock_sto
{
// client should retry
response->mutable_result()->mutable_not_owner();
LOG_INFO(log, "data file lock conflict: owner changed, key={}", data_file_key);
return false;
}
const auto lock_key = key_view.getLockKey(lock_store_id, lock_seq);
Expand All @@ -191,9 +235,11 @@ bool S3LockService::tryAddLockImpl(const String & data_file_key, UInt64 lock_sto
// it is safe to return owner change and let the client retry.
// the obsolete lock file will finally get removed in S3 GC.
response->mutable_result()->mutable_not_owner();
LOG_INFO(log, "data file lock conflict: owner changed after lock added, key={} lock_key={}", data_file_key, lock_key);
return false;
}

LOG_INFO(log, "data file is locked, key={} lock_key={}", data_file_key, lock_key);
response->mutable_result()->mutable_success();
return true;
}
Expand Down Expand Up @@ -242,6 +288,7 @@ bool S3LockService::tryMarkDeleteImpl(const String & data_file_key, disaggregate
if (file_latch->decreaseRefCount() == 0)
{
file_latch_map.erase(data_file_key);
CurrentMetrics::sub(CurrentMetrics::S3LockServiceNumLatches);
}
});

Expand All @@ -252,6 +299,7 @@ bool S3LockService::tryMarkDeleteImpl(const String & data_file_key, disaggregate
{
auto * e = response->mutable_result()->mutable_conflict();
e->set_reason(fmt::format("data file is locked, key={} lock_by={}", data_file_key, lock_key.value()));
LOG_INFO(log, "data file mark delete conflict: file is locked, key={} lock_by={}", data_file_key, lock_key.value());
return false;
}

Expand All @@ -260,6 +308,7 @@ bool S3LockService::tryMarkDeleteImpl(const String & data_file_key, disaggregate
{
// client should retry
response->mutable_result()->mutable_not_owner();
LOG_INFO(log, "data file mark delete conflict: owner changed, key={}", data_file_key);
return false;
}
// upload delete mark
Expand All @@ -269,9 +318,11 @@ bool S3LockService::tryMarkDeleteImpl(const String & data_file_key, disaggregate
{
// owner changed happens when delmark is uploading, can not
// ensure whether this is safe or not.
LOG_ERROR(log, "owner changed when marking file deleted! key={} delmark={}", data_file_key, delmark_key);
LOG_ERROR(log, "data file mark delete conflict: owner changed when marking file deleted! key={} delmark={}", data_file_key, delmark_key);
GET_METRIC(tiflash_disaggregated_object_lock_request_count, type_delete_risk).Increment();
}

LOG_INFO(log, "data file is mark deleted, key={} delmark={}", data_file_key, delmark_key);
response->mutable_result()->mutable_success();
return true;
}
Expand Down

0 comments on commit 3165fc6

Please sign in to comment.