From fa1adf2378a2adcd7f7a4d107d266c96b4fc72b9 Mon Sep 17 00:00:00 2001 From: Govind Kamat Date: Mon, 12 Aug 2024 10:50:45 -0700 Subject: [PATCH] Updates to the expand-data-corpus script (#612) Signed-off-by: Govind Kamat --- scripts/expand-data-corpus.py | 31 ++++++++++++++++++---------- tests/scripts_test.py | 38 +++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 11 deletions(-) create mode 100644 tests/scripts_test.py diff --git a/scripts/expand-data-corpus.py b/scripts/expand-data-corpus.py index d27134f54..67d9fe04e 100755 --- a/scripts/expand-data-corpus.py +++ b/scripts/expand-data-corpus.py @@ -19,12 +19,9 @@ help_msg = """ -NOTE: This is a beta feature. The user model, interface and options -are subject to change. - This tool is intended for the purpose of expanding the size of the -data corpus associated an OSB workload. Currently, this capability is -implemented only for the http_logs workload. +data corpus associated with an OSB workload. Currently, this capability +is implemented only for the http_logs workload. TLDR: to generate a 100 GB corpus and then run a test against it: @@ -106,6 +103,12 @@ def handler(signum, frame): sys.exit(1) + +def error_exit(script_name, message): + print(f'{script_name}: {message}', file=sys.stderr) + sys.exit(1) + + class DocGenerator: def __init__(self, @@ -150,7 +153,8 @@ def error(self, message): self.usage_msg() -def generate_docs(workload: str, +def generate_docs(script_name: str, + workload: str, repository: str, input_file: str, output_file_suffix: str, @@ -165,12 +169,17 @@ def generate_docs(workload: str, # config = configparser.ConfigParser() benchmark_home = os.environ.get('BENCHMARK_HOME') or os.environ['HOME'] - config.read(benchmark_home + '/.benchmark/benchmark.ini') + benchmark_ini = benchmark_home + '/.benchmark/benchmark.ini' + if not os.path.isfile(benchmark_ini): + error_exit(script_name, f"could not find OSB config file {benchmark_ini}, run a workload first to create it") + config.read(benchmark_ini) root_dir = config['node']['root.dir'] workload_dir= root_dir + '/workloads/' + repository + '/' + workload data_dir = config['benchmarks']['local.dataset.cache'] + '/' + workload + if not os.path.exists(data_dir): + error_exit(script_name, f"workload data directory {data_dir} does not exist, run the appropriate workload first to create it") output_file = data_dir + '/documents-' + output_file_suffix + '.json' if '/' not in input_file: input_file = data_dir + '/' + input_file @@ -274,8 +283,6 @@ def main(args: list) -> None: output_file_suffix = args.output_file_suffix n_docs = args.number_of_docs corpus_size = args.corpus_size - interval = args.interval if args.interval is not None else \ - corpus_size * -2 start_timestamp = args.start_timestamp batch_size = args.batch_size @@ -286,12 +293,14 @@ def main(args: list) -> None: elif not n_docs and not corpus_size: parser.usage_msg(script_name + ": must specify number of documents or corpus size") - + interval = args.interval if args.interval is not None else \ + corpus_size * -2 if workload != 'http_logs': parser.usage_msg(script_name + ': only the "http_logs" workload is currently supported') - generate_docs(workload, + generate_docs(script_name, + workload, repository, input_file, output_file_suffix, diff --git a/tests/scripts_test.py b/tests/scripts_test.py new file mode 100644 index 000000000..44bcf8a19 --- /dev/null +++ b/tests/scripts_test.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Modifications Copyright OpenSearch Contributors. See +# GitHub history for details. +# Licensed to Elasticsearch B.V. under one or more contributor +# license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright +# ownership. Elasticsearch B.V. licenses this file to you under +# the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import pathlib +import subprocess +from unittest import TestCase + +class ScriptsTests(TestCase): + + def test_scr(self): + os.environ["BENCHMARK_HOME"] = "/tmp" + script = pathlib.Path(__file__).parent.parent / "scripts" / "expand-data-corpus.py" + p = subprocess.Popen([str(script), "-c", "10"], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stderr = p.communicate()[1].decode('UTF-8') + self.assertTrue("could not find OSB config file" in stderr)