From 3057d4d8dd7744300b9ea8369728e5ac6308332b Mon Sep 17 00:00:00 2001 From: Zach Zhu Date: Thu, 14 Dec 2023 16:07:43 +0800 Subject: [PATCH 1/3] algo: make tsf use a reasonably small batch size as default Signed-off-by: Zach Zhu --- algorithm/kapacity/timeseries/forecasting/forecaster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithm/kapacity/timeseries/forecasting/forecaster.py b/algorithm/kapacity/timeseries/forecasting/forecaster.py index 0c7b6d2..d986ed9 100644 --- a/algorithm/kapacity/timeseries/forecasting/forecaster.py +++ b/algorithm/kapacity/timeseries/forecasting/forecaster.py @@ -630,7 +630,7 @@ def fit(freq: str, context_length: int, learning_rate: float = 1e-3, epochs: int = 100, - batch_size: int = 1024, + batch_size: int = 32, num_workers: int = 0, model_path: str = './', df: pd.DataFrame = None, From 7b84ffa2640d713627b407206d9aac9b88ca395d Mon Sep 17 00:00:00 2001 From: Zach Zhu Date: Thu, 14 Dec 2023 16:09:48 +0800 Subject: [PATCH 2/3] algo: support pods metric query as workload external query Signed-off-by: Zach Zhu --- algorithm/kapacity/metric/query.py | 34 +++++++++++-------- .../portrait/horizontal/predictive/main.py | 7 ++-- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/algorithm/kapacity/metric/query.py b/algorithm/kapacity/metric/query.py index 0758ad2..a36afa4 100644 --- a/algorithm/kapacity/metric/query.py +++ b/algorithm/kapacity/metric/query.py @@ -42,8 +42,12 @@ def fetch_metrics(addr, namespace, metric, scale_target, start, end): start=start, end=end) elif metric_type == 'Pods': - # TODO: support pods metric type - raise RuntimeError('UnsupportedMetricType') + return fetch_pod_metric_history(addr=addr, + namespace=namespace, + metric=metric, + scale_target=scale_target, + start=start, + end=end) elif metric_type == 'Object': return fetch_object_metric_history(addr=addr, namespace=namespace, @@ -71,19 +75,6 @@ def compute_history_range(history_len): return start, end -def 
fetch_replicas_metric_history(addr, namespace, metric, scale_target, start, end): - external = metric['external'] - metric_identifier = build_metric_identifier(external['metric']) - name, group_kind = get_obj_name_and_group_kind(scale_target) - workload_external = metric_pb.WorkloadExternalQuery(group_kind=group_kind, - namespace=namespace, - name=name, - metric=metric_identifier) - query = metric_pb.Query(type=metric_pb.WORKLOAD_EXTERNAL, - workload_external=workload_external) - return query_metrics(addr=addr, query=query, start=start, end=end) - - def fetch_resource_metric_history(addr, namespace, metric, scale_target, start, end): resource_name = metric['resource']['name'] name, group_kind = get_obj_name_and_group_kind(scale_target) @@ -113,6 +104,19 @@ def fetch_container_resource_metric_history(addr, namespace, metric, scale_targe return query_metrics(addr=addr, query=query, start=start, end=end) +def fetch_pod_metric_history(addr, namespace, metric, scale_target, start, end): + pods = metric['pods'] + metric_identifier = build_metric_identifier(pods['metric']) + name, group_kind = get_obj_name_and_group_kind(scale_target) + workload_external = metric_pb.WorkloadExternalQuery(group_kind=group_kind, + namespace=namespace, + name=name, + metric=metric_identifier) + query = metric_pb.Query(type=metric_pb.WORKLOAD_EXTERNAL, + workload_external=workload_external) + return query_metrics(addr=addr, query=query, start=start, end=end) + + def fetch_object_metric_history(addr, namespace, metric, start, end): obj = metric['object'] metric_identifier = build_metric_identifier(obj['metric']) diff --git a/algorithm/kapacity/portrait/horizontal/predictive/main.py b/algorithm/kapacity/portrait/horizontal/predictive/main.py index d63b79e..ad8804f 100644 --- a/algorithm/kapacity/portrait/horizontal/predictive/main.py +++ b/algorithm/kapacity/portrait/horizontal/predictive/main.py @@ -190,13 +190,12 @@ def fetch_metrics_history(args, env, hp_cr): metric_ctx.resource_target = 
compute_resource_target(env.namespace, resource, scale_target) metric_ctx.resource_history = resource_history.rename(columns={'value': resource['name']}) elif i == 1: - if metric_type != 'External': + if metric_type != 'Pods': raise RuntimeError('MetricTypeError') - replica_history = query.fetch_replicas_metric_history(env.metrics_server_addr, env.namespace, metric, - scale_target, start, end) + replica_history = query.fetch_metrics(env.metrics_server_addr, env.namespace, metric, scale_target, start, end) metric_ctx.replicas_history = replica_history.rename(columns={'value': 'replicas'}) else: - if metric_type != 'Object' and metric_type != 'External': + if metric_type != 'Pods' and metric_type != 'Object' and metric_type != 'External': raise RuntimeError('MetricTypeError') metric_name = metric['name'] traffic_history = query.fetch_metrics(env.metrics_server_addr, env.namespace, metric, scale_target, start, end) From ad417e9690dc7d0b658538a55a5cae46c05422b6 Mon Sep 17 00:00:00 2001 From: Zach Zhu Date: Thu, 14 Dec 2023 16:10:26 +0800 Subject: [PATCH 3/3] algo: tweaks Signed-off-by: Zach Zhu --- algorithm/kapacity/metric/query.py | 19 ++++++------------- .../portrait/horizontal/predictive/main.py | 16 ++++++---------- .../predictive/replicas_estimator.py | 18 +++++++++--------- 3 files changed, 21 insertions(+), 32 deletions(-) diff --git a/algorithm/kapacity/metric/query.py b/algorithm/kapacity/metric/query.py index a36afa4..43d2ddf 100644 --- a/algorithm/kapacity/metric/query.py +++ b/algorithm/kapacity/metric/query.py @@ -171,19 +171,12 @@ def query_metrics(addr, query, start, end): def convert_metric_series_to_dataframe(series): - dataframe = None - for item in series: - array = [] - for point in item.points: - array.append([point.timestamp, point.value]) - df = pd.DataFrame(array, columns=['timestamp', 'value'], dtype=float) - df['timestamp'] = df['timestamp'].map(lambda x: x / 1000).astype('int64') - if dataframe is not None: - # TODO: consider if it's 
possible to have multiple series - pd.merge(dataframe, df, how='left', on='timestamp') - else: - dataframe = df - return dataframe + df_list = [] + for point in series[0].points: + df_list.append([point.timestamp, point.value]) + df = pd.DataFrame(df_list, columns=['timestamp', 'value'], dtype=float) + df['timestamp'] = df['timestamp'].map(lambda x: x / 1000).astype('int64') + return df def time_period_to_minutes(time_period): diff --git a/algorithm/kapacity/portrait/horizontal/predictive/main.py b/algorithm/kapacity/portrait/horizontal/predictive/main.py index ad8804f..a3c68f3 100644 --- a/algorithm/kapacity/portrait/horizontal/predictive/main.py +++ b/algorithm/kapacity/portrait/horizontal/predictive/main.py @@ -33,7 +33,6 @@ class EnvInfo: class MetricsContext: workload_identifier = None - resource_name = None resource_target = 0 resource_history = None replicas_history = None @@ -130,7 +129,7 @@ def predict_replicas(args, metric_ctx, pred_traffics): pred = estimator.estimate(history, pred_traffics, 'timestamp', - metric_ctx.resource_name, + 'resource', 'replicas', traffic_col, metric_ctx.resource_target, @@ -155,12 +154,10 @@ def merge_history_dict(history_dict): def resample_by_freq(old_df, freq, agg_funcs): - df = old_df.copy() - df = df.sort_values(by='timestamp', ascending=True) + df = old_df.sort_values(by='timestamp', ascending=True) df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s') - df = df.resample(rule=freq, on='timestamp').agg(agg_funcs) - df = df.rename_axis('timestamp').reset_index() - df['timestamp'] = df['timestamp'].astype('int64') // 10 ** 9 + df = df.resample(rule=freq, on='timestamp').agg(agg_funcs).reset_index() + df['timestamp'] = df['timestamp'].astype('int64') // 1e9 return df @@ -185,10 +182,9 @@ def fetch_metrics_history(args, env, hp_cr): resource = metric['containerResource'] else: raise RuntimeError('MetricTypeError') - resource_history = query.fetch_metrics(env.metrics_server_addr, env.namespace, metric, scale_target, 
start, end) - metric_ctx.resource_name = resource['name'] metric_ctx.resource_target = compute_resource_target(env.namespace, resource, scale_target) - metric_ctx.resource_history = resource_history.rename(columns={'value': resource['name']}) + resource_history = query.fetch_metrics(env.metrics_server_addr, env.namespace, metric, scale_target, start, end) + metric_ctx.resource_history = resource_history.rename(columns={'value': 'resource'}) elif i == 1: if metric_type != 'Pods': raise RuntimeError('MetricTypeError') diff --git a/algorithm/kapacity/portrait/horizontal/predictive/replicas_estimator.py b/algorithm/kapacity/portrait/horizontal/predictive/replicas_estimator.py index 6170913..f4b0902 100644 --- a/algorithm/kapacity/portrait/horizontal/predictive/replicas_estimator.py +++ b/algorithm/kapacity/portrait/horizontal/predictive/replicas_estimator.py @@ -631,15 +631,15 @@ class EstimationException(Exception): pass -def estimate(data, - data_pred, - time_col, - resource_col, - replicas_col, - traffic_cols, - resource_target, - time_delta_hours, - test_dataset_size_in_seconds=86400): +def estimate(data: pd.DataFrame, + data_pred: pd.DataFrame, + time_col: str, + resource_col: str, + replicas_col: str, + traffic_cols: list[str], + resource_target: float, + time_delta_hours: int, + test_dataset_size_in_seconds: int = 86400) -> pd.DataFrame: logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s') logger = logging.getLogger()