From fa1adf2378a2adcd7f7a4d107d266c96b4fc72b9 Mon Sep 17 00:00:00 2001
From: Govind Kamat <govkamat@amazon.com>
Date: Mon, 12 Aug 2024 10:50:45 -0700
Subject: [PATCH] Updates to the expand-data-corpus script (#612)

Signed-off-by: Govind Kamat <govkamat@amazon.com>
---
 scripts/expand-data-corpus.py | 31 ++++++++++++++++++----------
 tests/scripts_test.py         | 38 +++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 11 deletions(-)
 create mode 100644 tests/scripts_test.py

diff --git a/scripts/expand-data-corpus.py b/scripts/expand-data-corpus.py
index d27134f54..67d9fe04e 100755
--- a/scripts/expand-data-corpus.py
+++ b/scripts/expand-data-corpus.py
@@ -19,12 +19,9 @@
 
 help_msg = """
 
-NOTE: This is a beta feature.  The user model, interface and options
-are subject to change.
-
 This tool is intended for the purpose of expanding the size of the
-data corpus associated an OSB workload.  Currently, this capability is
-implemented only for the http_logs workload.
+data corpus associated with an OSB workload.  Currently, this capability
+is implemented only for the http_logs workload.
 
 TLDR: to generate a 100 GB corpus and then run a test against it:
 
@@ -106,6 +103,12 @@
 def handler(signum, frame):
     sys.exit(1)
 
+
+def error_exit(script_name, message):  # report a fatal error and terminate the script
+    print(f'{script_name}: {message}', file=sys.stderr)  # emits "<script_name>: <message>" on stderr
+    sys.exit(1)  # exit status 1, the same status handler() uses
+
+
 class DocGenerator:
 
     def __init__(self,
@@ -150,7 +153,8 @@ def error(self, message):
         self.usage_msg()
 
 
-def generate_docs(workload: str,
+def generate_docs(script_name: str,
+                  workload: str,
                   repository: str,
                   input_file: str,
                   output_file_suffix: str,
@@ -165,12 +169,17 @@ def generate_docs(workload: str,
     #
     config = configparser.ConfigParser()
     benchmark_home = os.environ.get('BENCHMARK_HOME') or os.environ['HOME']
-    config.read(benchmark_home + '/.benchmark/benchmark.ini')
+    benchmark_ini = benchmark_home + '/.benchmark/benchmark.ini'
+    if not os.path.isfile(benchmark_ini):
+        error_exit(script_name, f"could not find OSB config file {benchmark_ini}, run a workload first to create it")
+    config.read(benchmark_ini)
 
     root_dir = config['node']['root.dir']
     workload_dir= root_dir + '/workloads/' + repository + '/' + workload
     data_dir = config['benchmarks']['local.dataset.cache'] + '/' + workload
 
+    if not os.path.exists(data_dir):
+        error_exit(script_name, f"workload data directory {data_dir} does not exist, run the appropriate workload first to create it")
     output_file = data_dir + '/documents-' + output_file_suffix + '.json'
     if '/' not in input_file:
         input_file = data_dir + '/' + input_file
@@ -274,8 +283,6 @@ def main(args: list) -> None:
     output_file_suffix = args.output_file_suffix
     n_docs = args.number_of_docs
     corpus_size = args.corpus_size
-    interval = args.interval if args.interval is not None else \
-			corpus_size * -2
     start_timestamp = args.start_timestamp
     batch_size = args.batch_size
 
@@ -286,12 +293,14 @@ def main(args: list) -> None:
     elif not n_docs and not corpus_size:
         parser.usage_msg(script_name +
                      ": must specify number of documents or corpus size")
-
+    interval = args.interval if args.interval is not None else \
+			corpus_size * -2
     if workload != 'http_logs':
         parser.usage_msg(script_name +
                      ': only the "http_logs" workload is currently supported')
 
-    generate_docs(workload,
+    generate_docs(script_name,
+                  workload,
                   repository,
                   input_file,
                   output_file_suffix,
diff --git a/tests/scripts_test.py b/tests/scripts_test.py
new file mode 100644
index 000000000..44bcf8a19
--- /dev/null
+++ b/tests/scripts_test.py
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+# Modifications Copyright OpenSearch Contributors. See
+# GitHub history for details.
+# Licensed to Elasticsearch B.V. under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership. Elasticsearch B.V. licenses this file to you under
+# the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#	http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import pathlib
+import subprocess
+from unittest import TestCase
+
+class ScriptsTests(TestCase):
+    """Runs scripts/expand-data-corpus.py as a child process and checks its error reporting."""
+    def test_scr(self):  # assumes /tmp/.benchmark/benchmark.ini does not exist -- TODO confirm on CI hosts
+        env = {**os.environ, "BENCHMARK_HOME": "/tmp"}  # child-only copy; the test process environment stays untouched
+        script = pathlib.Path(__file__).parent.parent / "scripts" / "expand-data-corpus.py"
+        p = subprocess.run([str(script), "-c", "10"], env=env, check=False,
+                           stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stderr = p.stderr.decode('UTF-8')
+        self.assertIn("could not find OSB config file", stderr)  # assertIn prints the captured stderr on failure