Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Load new model version should not reload loaded existing model version(s) #388

Merged
merged 7 commits into from
Aug 20, 2024
4 changes: 3 additions & 1 deletion src/model_config_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2432,13 +2432,15 @@ TritonToDataType(const TRITONSERVER_DataType dtype)
}

bool
EquivalentInNonInstanceGroupConfig(
EquivalentInNonInstanceGroupAndNonVersionPolicyConfig(
kthui marked this conversation as resolved.
Show resolved Hide resolved
const inference::ModelConfig& old_config,
const inference::ModelConfig& new_config)
{
::google::protobuf::util::MessageDifferencer pb_diff;
pb_diff.IgnoreField(
old_config.descriptor()->FindFieldByLowercaseName("instance_group"));
pb_diff.IgnoreField(
old_config.descriptor()->FindFieldByLowercaseName("version_policy"));
return pb_diff.Compare(old_config, new_config);
}

Expand Down
8 changes: 5 additions & 3 deletions src/model_config_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -288,12 +288,14 @@ TRITONSERVER_DataType DataTypeToTriton(const inference::DataType dtype);
/// \return The data type.
inference::DataType TritonToDataType(const TRITONSERVER_DataType dtype);

/// Check if non instance group settings on the model configs are equivalent.
/// Check if non instance group and non version policy settings on the model
/// configs are equivalent.
/// \param old_config The old model config.
/// \param new_config The new model config.
/// \return True if the model configs are equivalent in all non instance group
/// settings. False if they differ in non instance group settings.
bool EquivalentInNonInstanceGroupConfig(
/// and non version policy settings. False if they differ in non instance group
/// and non version policy settings.
bool EquivalentInNonInstanceGroupAndNonVersionPolicyConfig(
const inference::ModelConfig& old_config,
const inference::ModelConfig& new_config);

Expand Down
7 changes: 4 additions & 3 deletions src/model_repository_manager/model_lifecycle.cc
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ Status
ModelLifeCycle::AsyncLoad(
const ModelIdentifier& model_id, const std::string& model_path,
const inference::ModelConfig& model_config, const bool is_config_provided,
const bool is_model_file_updated,
const ModelTimestamp& prev_timestamp, const ModelTimestamp& curr_timestamp,
const std::shared_ptr<TritonRepoAgentModelList>& agent_model_list,
std::function<void(Status)>&& OnComplete)
{
Expand Down Expand Up @@ -488,8 +488,9 @@ ModelLifeCycle::AsyncLoad(
if (serving_model->state_ == ModelReadyState::READY) {
// The model is currently being served. Check if the model load could
// be completed with a simple config update.
if (!is_model_file_updated && !serving_model->is_ensemble_ &&
EquivalentInNonInstanceGroupConfig(
if (!serving_model->is_ensemble_ &&
!prev_timestamp.IsModelVersionModified(curr_timestamp, version) &&
EquivalentInNonInstanceGroupAndNonVersionPolicyConfig(
serving_model->model_config_, model_config)) {
// Update the model
model_info = serving_model.get();
Expand Down
6 changes: 4 additions & 2 deletions src/model_repository_manager/model_lifecycle.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -162,6 +162,7 @@ class TritonRepoAgentModelList {
};

class InferenceServer;
class ModelTimestamp;

class ModelLifeCycle {
public:
Expand All @@ -183,7 +184,8 @@ class ModelLifeCycle {
Status AsyncLoad(
const ModelIdentifier& model_id, const std::string& model_path,
const inference::ModelConfig& model_config, const bool is_config_provided,
const bool is_model_file_updated,
const ModelTimestamp& prev_timestamp,
const ModelTimestamp& curr_timestamp,
const std::shared_ptr<TritonRepoAgentModelList>& agent_model_list,
std::function<void(Status)>&& OnComplete);

Expand Down
174 changes: 120 additions & 54 deletions src/model_repository_manager/model_repository_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ CreateAgentModelListWithLoadAction(
// Return the latest modification time in ns for all files/folders starting
// from the path. The modification time will be 0 if there is an error.
int64_t
GetModifiedTime(const std::string& path)
GetPathModifiedTime(const std::string& path)
{
// If there is an error in any step the fall-back default
// modification time is 0. This means that in error cases 'path'
Expand Down Expand Up @@ -306,18 +306,15 @@ GetModifiedTime(const std::string& path)

for (const auto& child : contents) {
const auto full_path = JoinPath({path, child});
mtime = std::max(mtime, GetModifiedTime(full_path));
mtime = std::max(mtime, GetPathModifiedTime(full_path));
}

return mtime;
}
// Return the latest modification time in ns for '<config.pbtxt, model files>'
// in a model directory path. The time for "config.pbtxt" will be 0 if not
// found at "[model_dir_path]/config.pbtxt" or "[model_dir_path]/configs/
// <custom-config-name>.pbtxt" if "--model-config-name" is set. The time for
// "model files" includes the time for 'model_dir_path'.
std::pair<int64_t, int64_t>
GetDetailedModifiedTime(

} // namespace

ModelTimestamp::ModelTimestamp(
const std::string& model_dir_path, const std::string& model_config_path)
{
// Check if 'model_dir_path' is a directory.
Expand All @@ -326,64 +323,131 @@ GetDetailedModifiedTime(
if (!status.IsOk()) {
LOG_ERROR << "Failed to determine modification time for '" << model_dir_path
<< "': " << status.AsString();
return std::make_pair(0, 0);
return;
}
if (!is_dir) {
LOG_ERROR << "Failed to determine modification time for '" << model_dir_path
<< "': Model directory path is not a directory";
return std::make_pair(0, 0);
return;
}

std::pair<int64_t, int64_t> mtime(0, 0); // <config.pbtxt, model files>

// Populate 'model files' time to the model directory time
status = FileModificationTime(model_dir_path, &mtime.second);
// Populate time for 'model_dir_path'.
int64_t model_dir_time = 0;
status = FileModificationTime(model_dir_path, &model_dir_time);
if (!status.IsOk()) {
LOG_ERROR << "Failed to determine modification time for '" << model_dir_path
<< "': " << status.AsString();
return std::make_pair(0, 0);
return;
}
model_timestamps_.emplace("", model_dir_time);

// Get files/folders in model_dir_path.
std::set<std::string> contents;
status = GetDirectoryContents(model_dir_path, &contents);
// Populate time for all immediate files/folders in 'model_dir_path'.
std::set<std::string> dir_contents;
status = GetDirectoryContents(model_dir_path, &dir_contents);
if (!status.IsOk()) {
LOG_ERROR << "Failed to determine modification time for '" << model_dir_path
<< "': " << status.AsString();
return std::make_pair(0, 0);
}
// Get latest modification time for each files/folders, and place it at the
// correct category.
for (const auto& child : contents) {
const auto full_path = JoinPath({model_dir_path, child});
if (full_path == model_config_path) {
// config.pbtxt or customized config file in configs folder
mtime.first = GetModifiedTime(full_path);
} else {
// model files
mtime.second = std::max(mtime.second, GetModifiedTime(full_path));
model_timestamps_.clear();
return;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

raise exception and try catch?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did a quick test on L0_lifecycle, the test currently expects any error related to retrieving timestamp to remain hidden and simply set the failed timestamp to 0. To translate it into this ModelTimestamp object, setting timestamp to 0 is the same as initializing the object using the default constructor, which is leaving the model_timestamps_ object empty. Thus, if an exception is raised here, the caller catching the exception will simply re-initialize ModelTimestamp object using the default constructor (to remain aligned with the current logic of setting the timestamp to 0). In this case, raising an exception is redundant and can be handled by the constructor in one or two lines, or with a helper function specifically for keeping everything empty.

Another approach is we can change the test cases to accept the behavior change that failure to determine timestamp will fail the model load.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think to get this feature out quickly, we can retain the existing error handling on reading timestamps, and file another ticket for enhancing the error handling at a later time. @GuanLuo @rmccorm4 what do you think?

Copy link
Contributor

@rmccorm4 rmccorm4 Aug 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Retaining existing error behavior for the cherry-pick and enhancing in a follow-up sounds fine to me. If we really intend to do it - please put in a // DLIS-XXXX: blah blah so it doesn't get lost.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can be addressed as follow-up, but I do think an exception should be raised, and it is fine to have the caller catch the exception and then initialize a timestamp with default constructor (if not failing). This is from the stand point where if the user initializes a timestamp with a path, the user is expecting the returned timestamp to carry correct timestamp w.r.t. the given path, not 0. And thus an error should be raised programmatically and it is up to the user to decide how to handle the error.

}
for (const auto& content_name : dir_contents) {
const auto content_path = JoinPath({model_dir_path, content_name});
bool is_model_config = model_config_path.rfind(content_path, 0) == 0;
if (is_model_config) {
if (!model_config_content_name_.empty()) {
LOG_ERROR << "Failed to determine modification time for '"
<< model_dir_path << "': Duplicate model config is detected";
model_timestamps_.clear();
model_config_content_name_.clear();
return;
}
model_config_content_name_ = content_name;
}
model_timestamps_.emplace(content_name, GetPathModifiedTime(content_path));
kthui marked this conversation as resolved.
Show resolved Hide resolved
}
}

return mtime;
bool
ModelTimestamp::IsModified(const ModelTimestamp& new_timestamp) const
{
int64_t old_modified_time = GetModifiedTime();
int64_t new_modified_time = new_timestamp.GetModifiedTime();
return new_modified_time > old_modified_time;
}
// Return true if any file in the 'model_dir_path' has been modified more
// recently than 'last_ns', which represents last modified time for
// '<config.pbtxt, model files>'. Update the 'last_ns' to the most-recent
// modified time.

bool
IsModified(
const std::string& model_dir_path, const std::string& model_config_path,
std::pair<int64_t, int64_t>* last_ns)
ModelTimestamp::IsModelVersionModified(
const ModelTimestamp& new_timestamp, const int64_t version) const
{
auto new_ns = GetDetailedModifiedTime(model_dir_path, model_config_path);
bool modified = std::max(new_ns.first, new_ns.second) >
std::max(last_ns->first, last_ns->second);
last_ns->swap(new_ns);
return modified;
int64_t old_modified_time = std::max(
GetModelVersionModifiedTime(version),
GetNonModelConfigNorVersionNorDirModifiedTime());
int64_t new_modified_time = std::max(
new_timestamp.GetModelVersionModifiedTime(version),
new_timestamp.GetNonModelConfigNorVersionNorDirModifiedTime());
return new_modified_time > old_modified_time;
}

} // namespace
int64_t
ModelTimestamp::GetModifiedTime() const
{
int64_t modified_time = 0;
for (const auto& pair : model_timestamps_) {
const int64_t time = pair.second;
modified_time = std::max(modified_time, time);
}
return modified_time;
}

int64_t
ModelTimestamp::GetModelVersionModifiedTime(const int64_t version) const
{
int64_t modified_time = 0;
auto itr = model_timestamps_.find(std::to_string(version));
if (itr != model_timestamps_.end()) {
modified_time = itr->second;
}
return modified_time;
}

int64_t
ModelTimestamp::GetNonModelConfigNorVersionNorDirModifiedTime() const
{
// Get modified time excluding time from model config, model version
// directory(s) and model directory.
int64_t modified_time = 0;
for (const auto& pair : model_timestamps_) {
const std::string& content_name = pair.first;
bool found_non_digit_in_content_name = false;
for (const char& c : content_name) {
if (std::isdigit(c) == 0) {
found_non_digit_in_content_name = true;
break;
}
}
// All model version directory(s) will not 'found_non_digit_in_content_name'
// as they are all digit(s).
// Model directory name will not 'found_non_digit_in_content_name' as it is
// empty.
if (found_non_digit_in_content_name &&
content_name != model_config_content_name_) {
const int64_t time = pair.second;
modified_time = std::max(modified_time, time);
}
}
return modified_time;
}

void
ModelTimestamp::SetModelConfigModifiedTime(const int64_t time_ns)
{
if (model_config_content_name_.empty()) {
LOG_ERROR << "Failed to set config modification time: "
"model_config_content_name_ is empty";
return;
}
model_timestamps_[model_config_content_name_] = time_ns;
}

ModelRepositoryManager::ModelRepositoryManager(
const std::set<std::string>& repository_paths, const bool autofill,
Expand Down Expand Up @@ -624,7 +688,7 @@ ModelRepositoryManager::LoadModelByDependency(
auto status = model_life_cycle_->AsyncLoad(
valid_model->model_id_, itr->second->model_path_,
valid_model->model_config_, itr->second->is_config_provided_,
itr->second->mtime_nsec_.second > itr->second->prev_mtime_ns_.second,
itr->second->prev_mtime_ns_, itr->second->mtime_nsec_,
itr->second->agent_model_list_, [model_state](Status load_status) {
model_state->status_ = load_status;
model_state->ready_.set_value();
Expand Down Expand Up @@ -1406,7 +1470,7 @@ ModelRepositoryManager::InitializeModelInfo(
if (iitr != infos_.end()) {
linfo->prev_mtime_ns_ = iitr->second->mtime_nsec_;
} else {
linfo->prev_mtime_ns_ = std::make_pair(0, 0);
linfo->prev_mtime_ns_ = ModelTimestamp();
}

// Set 'mtime_nsec_' and override 'model_path_' if current path is empty
Expand Down Expand Up @@ -1435,7 +1499,7 @@ ModelRepositoryManager::InitializeModelInfo(
// For file override, set 'mtime_nsec_' to minimum value so that
// the next load without override will trigger re-load to undo
// the override while the local files may still be unchanged.
linfo->mtime_nsec_ = std::make_pair(0, 0);
linfo->mtime_nsec_ = ModelTimestamp();
linfo->model_path_ = location;
linfo->model_config_path_ = JoinPath({location, kModelConfigPbTxt});
linfo->agent_model_list_.reset(new TritonRepoAgentModelList());
Expand All @@ -1445,14 +1509,16 @@ ModelRepositoryManager::InitializeModelInfo(
GetModelConfigFullPath(linfo->model_path_, model_config_name_);
// Model is not loaded.
if (iitr == infos_.end()) {
linfo->mtime_nsec_ = GetDetailedModifiedTime(
linfo->model_path_, linfo->model_config_path_);
linfo->mtime_nsec_ =
ModelTimestamp(linfo->model_path_, linfo->model_config_path_);
} else {
// Check the current timestamps to determine if model actually has been
// modified
linfo->mtime_nsec_ = linfo->prev_mtime_ns_;
unmodified = !IsModified(
linfo->model_path_, linfo->model_config_path_, &linfo->mtime_nsec_);
ModelTimestamp new_mtime_ns =
ModelTimestamp(linfo->model_path_, linfo->model_config_path_);
unmodified = !linfo->mtime_nsec_.IsModified(new_mtime_ns);
linfo->mtime_nsec_ = new_mtime_ns;
}
}

Expand All @@ -1467,9 +1533,9 @@ ModelRepositoryManager::InitializeModelInfo(
// the override while the local files may still be unchanged.
auto time_since_epoch =
std::chrono::system_clock::now().time_since_epoch();
linfo->mtime_nsec_.first =
linfo->mtime_nsec_.SetModelConfigModifiedTime(
std::chrono::duration_cast<std::chrono::nanoseconds>(time_since_epoch)
.count();
.count());
unmodified = false;

const std::string& override_config = override_parameter->ValueString();
Expand Down
Loading
Loading