From 8e36925bd36a503e39fcbbc488e9e46126f079ed Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez
Date: Wed, 17 Jan 2024 12:08:50 -0500
Subject: [PATCH] Submission checker version 4.0 (#1560)

* Submission checker version 4.0

* Fix Llama2-70b name

* Add 4.0 random seeds
---
 language/llama2-70b/README.md          |   4 +-
 language/llama2-70b/main.py            |   4 +-
 language/llama2-70b/mlperf.conf        |   6 +-
 loadgen/CMakeLists.txt                 |   4 +-
 loadgen/setup.py                       |   4 +-
 loadgen/version_generator.py           |   2 +-
 mlperf.conf                            |  12 +-
 text_to_image/tools/sample_ids.py      |   5 +-
 text_to_image/tools/sample_ids.txt     |  20 +-
 tools/submission/submission_checker.py | 280 ++++++++++++++++++++++++-
 10 files changed, 307 insertions(+), 34 deletions(-)

diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md
index 0fefec95f..6c4b1c36a 100644
--- a/language/llama2-70b/README.md
+++ b/language/llama2-70b/README.md
@@ -1,6 +1,6 @@
-# Reference Implementation for Llama-v2-70B
+# Reference Implementation for llama2-70b
 
-**Basic implementation for Llama-v2-70B. Few noteworthy items:**
+**Basic implementation for llama2-70b. A few noteworthy items:**
 
 + Processing of Validation dataset is not finalized yet. Decision on input token lengths is pending
 + Streamer for communicating with loadgen has quite some overhead. This is only meant to provide functional implementation
diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py
index d039d0463..bf1def806 100644
--- a/language/llama2-70b/main.py
+++ b/language/llama2-70b/main.py
@@ -47,8 +47,8 @@ def main():
     settings = lg.TestSettings()
     settings.scenario = scenario_map[args.scenario.lower()]
     # Need to update the conf
-    settings.FromConfig(args.mlperf_conf, "llama-v2-70b", args.scenario)
-    settings.FromConfig(args.user_conf, "llama-v2-70b", args.scenario)
+    settings.FromConfig(args.mlperf_conf, "llama2-70b", args.scenario)
+    settings.FromConfig(args.user_conf, "llama2-70b", args.scenario)
 
     if args.accuracy:
         settings.mode = lg.TestMode.AccuracyOnly
diff --git a/language/llama2-70b/mlperf.conf b/language/llama2-70b/mlperf.conf
index 3533fcabd..28c19bddf 100644
--- a/language/llama2-70b/mlperf.conf
+++ b/language/llama2-70b/mlperf.conf
@@ -56,9 +56,9 @@ rnnt.Server.target_latency = 1000
 gptj.Server.target_latency = 20000
 
 # Llama2-70b Server scenario requires two additional latency constraints
-llama-v2-70b.Server.target_latency = 2000
-llama-v2-70b.Server.ttft_latency = 2000
-llama-v2-70b.Server.tpot_latency = 200
+llama2-70b.Server.target_latency = 2000
+llama2-70b.Server.ttft_latency = 2000
+llama2-70b.Server.tpot_latency = 200
 
 *.Offline.target_latency_percentile = 90
 *.Offline.min_duration = 600000
diff --git a/loadgen/CMakeLists.txt b/loadgen/CMakeLists.txt
index cd6846e16..977068cd8 100644
--- a/loadgen/CMakeLists.txt
+++ b/loadgen/CMakeLists.txt
@@ -3,8 +3,8 @@ cmake_minimum_required(VERSION 3.1)
 project(mlperf_loadgen)
 
 # The mlperf_loadgen version.
-set(mlperf_loadgen_VERSION_MAJOR 3)
-set(mlperf_loadgen_VERSION_MINOR 1)
+set(mlperf_loadgen_VERSION_MAJOR 4)
+set(mlperf_loadgen_VERSION_MINOR 0)
 message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}")
 
 # Set build options. NB: CXX_STANDARD is supported since CMake 3.1.
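The rename only takes effect if every lookup of the config section uses the new model name. As a minimal sketch (not part of the patch), this is how the renamed section is consumed through the LoadGen Python bindings: FromConfig(path, model, scenario) matches entries of the form <model>.<scenario>.<key>, so any leftover "llama-v2-70b" keys would simply be ignored.

    import mlperf_loadgen as lg

    settings = lg.TestSettings()
    settings.scenario = lg.TestScenario.Server
    # Picks up llama2-70b.Server.target_latency / ttft_latency / tpot_latency
    settings.FromConfig("mlperf.conf", "llama2-70b", "Server")
    settings.FromConfig("user.conf", "llama2-70b", "Server")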
diff --git a/loadgen/setup.py b/loadgen/setup.py
index 676258b81..d7b8224de 100644
--- a/loadgen/setup.py
+++ b/loadgen/setup.py
@@ -76,13 +76,13 @@
 mlperf_loadgen_module = Pybind11Extension(
     "mlperf_loadgen",
-    define_macros=[("MAJOR_VERSION", "3"), ("MINOR_VERSION", "1")],
+    define_macros=[("MAJOR_VERSION", "4"), ("MINOR_VERSION", "0")],
     include_dirs=[".", get_include()],
     sources=mlperf_loadgen_sources,
     depends=mlperf_loadgen_headers)
 
 setup(name="mlperf_loadgen",
-      version="3.1",
+      version="4.0",
       description="MLPerf Inference LoadGen python bindings",
       url="https://mlcommons.org/",
       cmdclass={"build_ext": build_ext},
diff --git a/loadgen/version_generator.py b/loadgen/version_generator.py
index 331780f02..a34657a64 100644
--- a/loadgen/version_generator.py
+++ b/loadgen/version_generator.py
@@ -94,7 +94,7 @@ def generate_loadgen_version_definitions(cc_filename, loadgen_root):
     ofile.write("// DO NOT EDIT: Autogenerated by version_generator.py.\n\n")
     ofile.write("#include <string>\n\n")
     ofile.write("namespace mlperf {\n\n")
-    ofile.write(func_def("Version", "\"3.1\""))
+    ofile.write(func_def("Version", "\"4.0\""))
 
     date_time_now_local = datetime.datetime.now().isoformat()
     date_time_now_utc = datetime.datetime.utcnow().isoformat()
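The version is now bumped in three places: the CMake build, the Python package metadata, and the generated version.cc. A quick post-install sanity check — a sketch, assuming the bindings were rebuilt and reinstalled (e.g. pip install ./loadgen); the package version reported here comes from the setup(version=...) field above:

    # Confirm the reinstalled package reports the bumped version.
    from importlib.metadata import version

    assert version("mlperf_loadgen") == "4.0"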
diff --git a/mlperf.conf b/mlperf.conf
index 3bf8c0db7..a73ddd4a7 100644
--- a/mlperf.conf
+++ b/mlperf.conf
@@ -17,13 +17,13 @@ stable-diffusion-xl.*.performance_sample_count_override = 5000
 3d-unet.*.performance_sample_count_override = 0
 
 # Set seeds. The seeds will be distributed two weeks before the submission.
-*.*.qsl_rng_seed = 148687905518835231
-*.*.sample_index_rng_seed = 520418551913322573
-*.*.schedule_rng_seed = 811580660758947900
+*.*.qsl_rng_seed = 13281865557512327830
+*.*.sample_index_rng_seed = 198141574272810017
+*.*.schedule_rng_seed = 7575108116881280410
 
 # Set seeds for TEST_05. The seeds will be distributed two weeks before the submission.
-*.*.test05_qsl_rng_seed = 793197339507417767
-*.*.test05_sample_index_rng_seed = 255610748586851044
-*.*.test05_schedule_rng_seed = 352213341366340113
+*.*.test05_qsl_rng_seed = 2376919268182438552
+*.*.test05_sample_index_rng_seed = 11176391829184272374
+*.*.test05_schedule_rng_seed = 3911940905271271337
 
 *.SingleStream.target_latency_percentile = 90
diff --git a/text_to_image/tools/sample_ids.py b/text_to_image/tools/sample_ids.py
index 3b0548fd2..e1d6effb4 100644
--- a/text_to_image/tools/sample_ids.py
+++ b/text_to_image/tools/sample_ids.py
@@ -15,13 +15,16 @@ def get_args():
     parser.add_argument(
         "--n", type=int, default=10, help="Dataset download location"
     )
+    parser.add_argument(
+        "--seed", "-s", type=int, default=926019364, help="Random seed used to select the sample ids"
+    )
     args = parser.parse_args()
     return args
 
 
 if __name__ == "__main__":
     args = get_args()
-    np.random.seed(42)
+    np.random.seed(args.seed)
     df_annotations = pd.read_csv(f"{args.tsv_path}", sep="\t")
     sample_ids = list(np.random.choice(df_annotations.shape[0], args.n))
     with open(args.output_path, "w+") as f:
diff --git a/text_to_image/tools/sample_ids.txt b/text_to_image/tools/sample_ids.txt
index a8bd67045..65c9f5641 100644
--- a/text_to_image/tools/sample_ids.txt
+++ b/text_to_image/tools/sample_ids.txt
@@ -1,10 +1,10 @@
-860
-3772
-3092
-466
-4426
-3444
-3171
-2919
-130
-1685
\ No newline at end of file
+4459
+4015
+2705
+1682
+4048
+4683
+3757
+1578
+3319
+95
\ No newline at end of file
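For reference, the ids in sample_ids.txt are reproducible from the script above with its new default seed; a sketch, assuming the same captions TSV that was used for generation (the path here is hypothetical, and the TSV's row count fixes the range np.random.choice draws from):

    import numpy as np
    import pandas as pd

    np.random.seed(926019364)  # new default --seed
    df_annotations = pd.read_csv("captions.tsv", sep="\t")  # hypothetical path
    sample_ids = list(np.random.choice(df_annotations.shape[0], 10))
    print(sample_ids)  # expected to match sample_ids.txt: 4459, 4015, 2705, ...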
"3d-unet-99": ["SingleStream", "Offline"], + "3d-unet-99.9": ["SingleStream", "Offline"], + "gptj-99": ["SingleStream", "Offline", "Server"], + "gptj-99.9": ["SingleStream", "Offline", "Server"], + "llama2-70b-99": ["Server", "Offline"], + "llama2-70b-99.9": ["Server", "Offline"], + "stable-diffusion-xl": ["SingleStream", "Offline", "Server"], + }, + "optional-scenarios-datacenter-edge": {}, + "accuracy-target": { + "resnet": ("acc", 76.46 * 0.99), + "retinanet": ("mAP", 37.55 * 0.99), + "rnnt": ("WER", (100 - 7.452) * 0.99), + "bert-99": ("F1", 90.874 * 0.99), + "bert-99.9": ("F1", 90.874 * 0.999), + "dlrm-v2-99": ("AUC", 80.31 * 0.99), + "dlrm-v2-99.9": ("AUC", 80.31 * 0.999), + "3d-unet-99": ("DICE", 0.86170 * 0.99), + "3d-unet-99.9": ("DICE", 0.86170 * 0.999), + "gptj-99" : ("ROUGE1", 42.9865 * 0.99, "ROUGE2", 20.1235 * 0.99, "ROUGEL", 29.9881 * 0.99, "GEN_LEN", 4016878*0.9), + "gptj-99.9" : ("ROUGE1", 42.9865 * 0.999, "ROUGE2", 20.1235 * 0.999, "ROUGEL", 29.9881 * 0.999, "GEN_LEN", 4016878*0.9), + # TODO: Replace with metrics for llama2-70b + "llama2-70b-99" : ("ROUGE1", 43.88 * 0.99, "ROUGE2", 21.7108 * 0.99, "ROUGEL", 28.2502 * 0.99, "tokens_per_sample", 839.4*0.9), + "llama2-70b-99.9" : ("ROUGE1", 43.88 * 0.999, "ROUGE2", 21.7108 * 0.999, "ROUGEL", 28.2502 * 0.999, "tokens_per_sample", 839.4*0.9), + "stable-diffusion-xl": ("CLIP_SCORE", 31.68631873, "FID_SCORE", 23.01085758) + }, + "accuracy-upper-limit": { + "stable-diffusion-xl": ("CLIP_SCORE", 31.81331801, "FID_SCORE", 23.95007626) + }, + "performance-sample-count": { + "resnet": 1024, + "retinanet": 64, + "rnnt": 2513, + "bert-99": 10833, + "bert-99.9": 10833, + "dlrm-v2-99": 204800, + "dlrm-v2-99.9": 204800, + "3d-unet-99": 43, + "3d-unet-99.9": 43, + "gptj-99": 13368, + "gptj-99.9": 13368, + "llama2-70b-99": 24576, + "llama2-70b-99.9": 24576, + "stable-diffusion-xl": 5000 + }, + # TODO: Update this list. 
+        # TODO: Update this list.
+        "model_mapping": {
+            # map model names to the official mlperf model class
+            "ssd-mobilenet": "ssd-small",
+            "ssd-resnet34": "retinanet",
+            "mobilenet": "resnet",
+            "resnet50": "resnet",
+            "ssd_resnet101_v1_fpn_640x640": "ssd-small",
+            "ssd_resnet101_v1_fpn_1024x1024": "ssd-large",
+            "ssd_resnet152_v1_fpn_640x640": "ssd-small",
+            "ssd_resnet152_v1_fpn_1024x1024": "ssd-large",
+            "rcnn-resnet50-lowproposals-coco": "ssd-large",
+            "rcnn-inception-resnet-v2-lowproposals-coco": "ssd-large",
+            "rcnn-inception-v2-coco": "ssd-large",
+            "rcnn-nas-lowproposals-coco": "ssd-large",
+            "rcnn-resnet101-lowproposals-coco": "ssd-large",
+            "ssd_mobilenet_v1_coco": "ssd-small",
+            "ssd_mobilenet_v1_fpn_640x640": "ssd-small",
+            "ssd_mobilenet_v1_quantized_coco": "ssd-small",
+            "ssd_mobilenet_v2_320x320": "ssd-small",
+            "ssd_mobilenet_v2_fpnlite_320x320": "ssd-small",
+            "ssd_mobilenet_v2_fpnlite_640x640": "ssd-small",
+            "ssd_resnet50_v1_fpn_640x640": "ssd-small",
+            "ssd_resnet50_v1_fpn_1024x1024": "ssd-large",
+        },
+        "seeds": {
+            "qsl_rng_seed": 13281865557512327830,
+            "sample_index_rng_seed": 198141574272810017,
+            "schedule_rng_seed": 7575108116881280410,
+        },
+        "test05_seeds": {
+            "qsl_rng_seed": 2376919268182438552,
+            "sample_index_rng_seed": 11176391829184272374,
+            "schedule_rng_seed": 3911940905271271337,
+        },
+        "ignore_errors": [],
+        "latency-constraint": {
+            "resnet": {"Server": 15000000},
+            "retinanet": {"Server": 100000000},
+            "rnnt": {"Server": 1000000000},
+            "bert-99": {"Server": 130000000},
+            "bert-99.9": {"Server": 130000000},
+            "dlrm-v2-99": {"Server": 60000000},
+            "dlrm-v2-99.9": {"Server": 60000000},
+            "gptj-99": {"Server": 20000000000},
+            "gptj-99.9": {"Server": 20000000000},
+            "llama2-70b-99": {"Server": 20000000000},
+            "llama2-70b-99.9": {"Server": 20000000000},
+            "stable-diffusion-xl": {"Server": 20000000000},
+        },
+        "min-queries": {
+            "resnet": {
+                "SingleStream": 1024,
+                "MultiStream": 270336,
+                "Server": 270336,
+                "Offline": 1,
+            },
+            "retinanet": {
+                "SingleStream": 1024,
+                "MultiStream": 270336,
+                "Server": 270336,
+                "Offline": 1,
+            },
+            "rnnt": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "bert-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "bert-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "dlrm-v2-99": {"Server": 270336, "Offline": 1},
+            "dlrm-v2-99.9": {"Server": 270336, "Offline": 1},
+            "3d-unet-99": {"SingleStream": 1024, "Offline": 1},
+            "3d-unet-99.9": {"SingleStream": 1024, "Offline": 1},
+            "gptj-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "gptj-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "llama2-70b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "llama2-70b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "stable-diffusion-xl": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+        },
+    },
 }
 
 VALID_DIVISIONS = ["open", "closed", "network"]
@@ -1069,6 +1247,24 @@
     "accuracy.txt",
     "mlperf_log_accuracy.json",
 ]
+REQUIRED_ACC_BENCHMARK = {
+    "stable-diffusion-xl": {
+        "v4.0": {
+            "images": [
+                "4459",
+                "4015",
+                "2705",
+                "1682",
+                "4048",
+                "4683",
+                "3757",
+                "1578",
+                "3319",
+                "95",
+            ]
+        }
+    }
+}
 REQUIRED_MEASURE_FILES = ["mlperf.conf", "user.conf", "README.md"]
 REQUIRED_POWER_MEASURE_FILES = ["analyzer_table.*", "power_settings.*"]
 MS_TO_NS = 1000 * 1000
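REQUIRED_ACC_BENCHMARK is keyed by model, then version, then by required subdirectory of the accuracy folder; entries are file stems, so any extension satisfies them. A sketch of how the table feeds the check_extra_files helper added further below, assuming it runs inside submission_checker.py (the accuracy path is hypothetical):

    # Validate the extra stable-diffusion-xl accuracy artifacts.
    acc_path = "results/sut1/stable-diffusion-xl/Offline/accuracy"  # hypothetical
    target_files = REQUIRED_ACC_BENCHMARK["stable-diffusion-xl"]["v4.0"]
    check_pass, missing_files = check_extra_files(acc_path, target_files)
    if not check_pass:
        print("missing:", missing_files)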
"result_scheduled_samples_per_sec", + }, + "v4.0": { + "Offline": "result_samples_per_second", + "SingleStream": "early_stopping_latency_ss", + "MultiStreamLegacy": "effective_samples_per_query", + "MultiStream": "early_stopping_latency_ms", + "Server": "result_scheduled_samples_per_sec", + } +} + +RESULT_FIELD_BENCHMARK_OVERWRITE = { + "llama2-70b-99": { + "Offline": "result_tokens_per_second", + "Server": "result_scheduled_samples_per_sec", + }, + "llama2-70b-99.9": { + "Offline": "result_tokens_per_second", + "Server": "result_scheduled_samples_per_sec", } } @@ -1170,7 +1384,11 @@ "ROUGE1": r".*'rouge1':\s([\d.]+).*", "ROUGE2": r".*'rouge2':\s([\d.]+).*", "ROUGEL": r".*'rougeL':\s([\d.]+).*", + "ROUGELSUM": r".*'rougeLsum':\s([\d.]+).*", "GEN_LEN": r".*'gen_len':\s([\d.]+).*", + "TOKENS_PER_SAMPLE": r".*'tokens_per_sample':\s([\d.]+).*", + "CLIP_SCORE": r".*'CLIP_SCORE':\s([\d.]+).*", + "FID_SCORE": r".*'FID_SCORE':\s([\d.]+).*", } SYSTEM_DESC_REQUIRED_FIELDS = [ @@ -1309,6 +1527,7 @@ def __init__( self.seeds = self.base["seeds"] self.test05_seeds = self.base["test05_seeds"] self.accuracy_target = self.base["accuracy-target"] + self.accuracy_upper_limit = self.base["accuracy-upper-limit"] self.performance_sample_count = self.base["performance-sample-count"] self.latency_constraint = self.base.get("latency-constraint", {}) self.min_queries = self.base.get("min-queries", {}) @@ -1394,6 +1613,9 @@ def get_accuracy_target(self, model): if model not in self.accuracy_target: raise ValueError("model not known: " + model) return self.accuracy_target[model] + + def get_accuracy_upper_limit(self, model): + return self.accuracy_upper_limit.get(model, None) def get_performance_sample_count(self, model): model = self.get_mlperf_model(model) @@ -1442,7 +1664,7 @@ def get_args(): parser.add_argument("--input", required=True, help="submission directory") parser.add_argument( "--version", - default="v3.1", + default="v4.0", choices=list(MODEL_CONFIG.keys()), help="mlperf version", ) @@ -1522,6 +1744,23 @@ def list_files_recursively(*path): ] +def check_extra_files(path, target_files): + missing_files = [] + check_pass = True + folders = list_dir(path) + for dir in target_files.keys(): + if dir not in folders: + check_pass = False + missing_files.append(os.path.join(path, dir)) + else: + files = [f.split(".")[0] for f in list_files(os.path.join(path, dir))] + for target_file in target_files[dir]: + if target_file not in files: + check_pass = False + missing_files.append(f"{os.path.join(path, dir, target_file)}.png") + return check_pass, missing_files + + def split_path(m): return m.replace("\\", "/").split("/") @@ -1574,12 +1813,18 @@ def check_accuracy_dir(config, model, path, verbose): result_acc = None hash_val = None target = config.get_accuracy_target(model) + acc_upper_limit = config.get_accuracy_upper_limit(model) patterns = [] acc_targets = [] + if acc_upper_limit is not None: + acc_limits = [] + acc_limit_check = True for i in range(0, len(target), 2): acc_type, acc_target = target[i:i+2] patterns.append(ACC_PATTERN[acc_type]) acc_targets.append(acc_target) + if acc_upper_limit is not None: + acc_limits.append(acc_upper_limit[i+1]) acc_seen = [False for _ in acc_targets] with open(os.path.join(path, "accuracy.txt"), "r", encoding="utf-8") as f: for line in f: @@ -1596,12 +1841,17 @@ def check_accuracy_dir(config, model, path, verbose): elif acc is not None: all_accuracy_valid = False log.warning("%s accuracy not met: expected=%f, found=%s", path, acc_target, acc) + if acc is not None and 
@@ -1574,12 +1813,18 @@ def check_accuracy_dir(config, model, path, verbose):
     result_acc = None
     hash_val = None
     target = config.get_accuracy_target(model)
+    acc_upper_limit = config.get_accuracy_upper_limit(model)
     patterns = []
     acc_targets = []
+    if acc_upper_limit is not None:
+        acc_limits = []
+        acc_limit_check = True
     for i in range(0, len(target), 2):
         acc_type, acc_target = target[i:i+2]
         patterns.append(ACC_PATTERN[acc_type])
         acc_targets.append(acc_target)
+        if acc_upper_limit is not None:
+            acc_limits.append(acc_upper_limit[i+1])
     acc_seen = [False for _ in acc_targets]
     with open(os.path.join(path, "accuracy.txt"), "r", encoding="utf-8") as f:
         for line in f:
@@ -1596,12 +1841,17 @@
             elif acc is not None:
                 all_accuracy_valid = False
                 log.warning("%s accuracy not met: expected=%f, found=%s", path, acc_target, acc)
+            if acc is not None and acc_upper_limit is not None and float(acc) > acc_limits[i]:
+                acc_limit_check = False
+                log.warning("%s accuracy not met: upper limit=%f, found=%s", path, acc_limits[i], acc)
             if i == 0 and acc:
                 result_acc = acc
             acc = None
         if all(acc_seen) and hash_val:
             break
     is_valid = all_accuracy_valid & all(acc_seen)
+    if acc_upper_limit is not None:
+        is_valid &= acc_limit_check
 
     if not hash_val:
         log.error("%s no hash value for mlperf_log_accuracy.json", path)
@@ -1678,6 +1928,9 @@ def check_performance_dir(
             else scenario
         )
         res = float(mlperf_log[RESULT_FIELD_NEW[config.version][scenario_for_res]])
+        if model in RESULT_FIELD_BENCHMARK_OVERWRITE and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[model]:
+            res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[model][scenario]])
+
     latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
     latency_mean = mlperf_log["result_mean_latency_ns"]
     if scenario in ["MultiStream"]:
@@ -2638,6 +2891,14 @@ def log_result(
                     acc_path,
                     debug or is_closed_or_network,
                 )
+                if mlperf_model in REQUIRED_ACC_BENCHMARK:
+                    if config.version in REQUIRED_ACC_BENCHMARK[mlperf_model]:
+                        extra_files_pass, missing_files = check_extra_files(acc_path, REQUIRED_ACC_BENCHMARK[mlperf_model][config.version])
+                        if not extra_files_pass:
+                            log.error(
+                                "%s expected to have the following extra files (%s)", acc_path, missing_files
+                            )
+                            accuracy_is_valid = False
                 if not accuracy_is_valid and not is_closed_or_network:
                     if debug:
                         log.warning(
@@ -3216,20 +3477,29 @@ def check_compliance_dir(
         "retinanet",
         "gptj-99",
         "gptj-99.9",
-        "gpt3-99",
-        "gpt3-99.9",
+        "llama2-70b-99",
+        "llama2-70b-99.9",
+        "stable-diffusion-xl",
+
     ]:
         test_list.remove("TEST04")
 
     if model in [
         "gptj-99",
         "gptj-99.9",
-        "gpt3-99",
-        "gpt3-99.9",
+        "llama2-70b-99",
+        "llama2-70b-99.9",
+        "stable-diffusion-xl",
     ]:
         test_list.remove("TEST05")
         test_list.remove("TEST01")
 
+    if model in [
+        "llama2-70b-99",
+        "llama2-70b-99.9",
+    ]:
+        test_list.append("TEST06")
+
     # Check performance of all Tests
     for test in test_list:
         test_dir = os.path.join(compliance_dir, test)
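With these changes the checker runs the v4.0 ruleset by default (python3 submission_checker.py --input <submission_dir>), and for the Llama2-70b benchmark the compliance set above reduces to TEST06 alone. A sketch of the net effect of the list edits, assuming the usual starting set of compliance tests:

    # Net effect of the compliance-test edits for llama2-70b-99 / -99.9.
    test_list = ["TEST01", "TEST04", "TEST05"]
    test_list.remove("TEST04")
    test_list.remove("TEST05")
    test_list.remove("TEST01")
    test_list.append("TEST06")
    assert test_list == ["TEST06"]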