Commit e0feefd

Merge branch 'master' into xiyon/add-cb-for-targetqps

2 parents 8611853 + aaa8f88

11 files changed: +1301 / -966 lines

README.md
Lines changed: 4 additions & 4 deletions

@@ -26,7 +26,7 @@ See the individual Readme files in the reference app for details.
 | ssd-resnet34 1200x1200 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/master/vision/classification_and_detection) | tensorflow, pytorch, onnx | coco resized to 1200x1200|
 | bert | [language/bert](https://github.com/mlcommons/inference/tree/master/language/bert) | tensorflow, pytorch, onnx | squad-1.1 |
 | dlrm | [recommendation/dlrm](https://github.com/mlcommons/inference/tree/master/recommendation/dlrm/pytorch) | pytorch, tensorflow(?), onnx(?) | Criteo Terabyte |
-| 3d-unet | [vision/medical_imageing/3d-unet-kits19](https://github.com/mlcommons/inference/tree/master/vision/medical_imaging/3d-unet-kits19) | pytorch, tensorflow, onnx | KiTS19 |
+| 3d-unet | [vision/medical_imaging/3d-unet-kits19](https://github.com/mlcommons/inference/tree/master/vision/medical_imaging/3d-unet-kits19) | pytorch, tensorflow, onnx | KiTS19 |
 | rnnt | [speech_recognition/rnnt](https://github.com/mlcommons/inference/tree/master/speech_recognition/rnnt) | pytorch | OpenSLR LibriSpeech Corpus |
 
 
@@ -42,7 +42,7 @@ See the individual Readme files in the reference app for details.
 | ssd-resnet34 1200x1200 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/r1.1/vision/classification_and_detection) | tensorflow, pytorch, onnx | coco resized to 1200x1200|
 | bert | [language/bert](https://github.com/mlcommons/inference/tree/r1.1/language/bert) | tensorflow, pytorch, onnx | squad-1.1 |
 | dlrm | [recommendation/dlrm](https://github.com/mlcommons/inference/tree/r1.1/recommendation/dlrm/pytorch) | pytorch, tensorflow(?), onnx(?) | Criteo Terabyte |
-| 3d-unet | [vision/medical_imageing/3d-unet](https://github.com/mlcommons/inference/tree/r1.1/vision/medical_imaging/3d-unet) | pytorch, tensorflow(?), onnx(?) | BraTS 2019 |
+| 3d-unet | [vision/medical_imaging/3d-unet](https://github.com/mlcommons/inference/tree/r1.1/vision/medical_imaging/3d-unet) | pytorch, tensorflow(?), onnx(?) | BraTS 2019 |
 | rnnt | [speech_recognition/rnnt](https://github.com/mlcommons/inference/tree/r1.1/speech_recognition/rnnt) | pytorch | OpenSLR LibriSpeech Corpus |
 
 ## MLPerf Inference v1.0 (submission 03/19/2021)
@@ -57,7 +57,7 @@ See the individual Readme files in the reference app for details.
 | ssd-resnet34 1200x1200 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/r1.0/vision/classification_and_detection) | tensorflow, pytorch, onnx | coco resized to 1200x1200|
 | bert | [language/bert](https://github.com/mlcommons/inference/tree/r1.0/language/bert) | tensorflow, pytorch, onnx | squad-1.1 |
 | dlrm | [recommendation/dlrm](https://github.com/mlcommons/inference/tree/r1.0/recommendation/dlrm/pytorch) | pytorch, tensorflow(?), onnx(?) | Criteo Terabyte |
-| 3d-unet | [vision/medical_imageing/3d-unet](https://github.com/mlcommons/inference/tree/r1.0/vision/medical_imaging/3d-unet) | pytorch, tensorflow(?), onnx(?) | BraTS 2019 |
+| 3d-unet | [vision/medical_imaging/3d-unet](https://github.com/mlcommons/inference/tree/r1.0/vision/medical_imaging/3d-unet) | pytorch, tensorflow(?), onnx(?) | BraTS 2019 |
 | rnnt | [speech_recognition/rnnt](https://github.com/mlcommons/inference/tree/r1.0/speech_recognition/rnnt) | pytorch | OpenSLR LibriSpeech Corpus |
 
 
@@ -73,7 +73,7 @@ See the individual Readme files in the reference app for details.
 | ssd-resnet34 1200x1200 | [vision/classification_and_detection](https://github.com/mlcommons/inference/tree/r0.7/vision/classification_and_detection) | tensorflow, pytorch, onnx | coco resized to 1200x1200|
 | bert | [language/bert](https://github.com/mlcommons/inference/tree/r0.7/language/bert) | tensorflow, pytorch, onnx | squad-1.1 |
 | dlrm | [recommendation/dlrm](https://github.com/mlcommons/inference/tree/r0.7/recommendation/dlrm/pytorch) | pytorch, tensorflow(?), onnx(?) | Criteo Terabyte |
-| 3d-unet | [vision/medical_imageing/3d-unet](https://github.com/mlcommons/inference/tree/r0.7/vision/medical_imaging/3d-unet) | pytorch, tensorflow(?), onnx(?) | BraTS 2019 |
+| 3d-unet | [vision/medical_imaging/3d-unet](https://github.com/mlcommons/inference/tree/r0.7/vision/medical_imaging/3d-unet) | pytorch, tensorflow(?), onnx(?) | BraTS 2019 |
 | rnnt | [speech_recognition/rnnt](https://github.com/mlcommons/inference/tree/r0.7/speech_recognition/rnnt) | pytorch | OpenSLR LibriSpeech Corpus |
 
 ## MLPerf Inference v0.5

compliance/nvidia/TEST04-A/README.md
Lines changed: 8 additions & 6 deletions

@@ -24,18 +24,20 @@ This test requires measuring & comparing performance of SUT (PerformanceOnly, mo
 Test script works best with Python 3.3 or later.
 
 ## Exempt Benchmarks
-This test is not applicable for the following benchmarks whose performance is dependent on variably sized input samples:
-1. RNNT
-2. BERT
-3. DLRM
+This test is not applicable for the following benchmarks whose performance is dependent on variably sized input samples
+1. rnnt
+2. bert
+3. dlrm
+4. 3d-unet
 
 ## Scenarios
 
-- This test is applicable for scenarios Offline, Server and SingleStream always.
-- This test is not applicable for Multi-Stream scenario if samples_per_query >= Performance Sample Count
+- As of v2.0, this test is applicable for all valid scenarios of non-exempt benchmarks.
 
 ## Pass Criteria
 Performance of TEST04-B should be slower than performance of TEST04-A. To account for noise, TEST04-A can be upto 20% slower than TEST04-B for SingleStream scenario with very short latencies (<200us) & upto 10% slower otherwise.
+Significant run-to-run variation can result due to the small number of samples in this test.
+To compensate, the performance sample count may be increased to increase the number of samples in the test, up to the size of the dataset or the size that still fits in the SUT's memory, whichever is reached first.
 
 ## Instructions
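Editor's note: the new Pass Criteria text says the performance sample count may be raised to dampen run-to-run variation. In the reference apps that count is the second argument to LoadGen's ConstructQSL. A minimal sketch, with hypothetical counts and no-op load callbacks (none of these values appear in this commit):

# Sketch only: raising the performance sample count so TEST04 draws on more
# samples. total_count and perf_count are hypothetical; a real benchmark must
# respect the dataset size and the SUT's memory limit.
import mlperf_loadgen as lg

total_count = 24576   # hypothetical dataset size
perf_count = 1024     # raised from the benchmark's default

def load_samples_to_ram(sample_indices):
    pass  # a real SUT would stage these samples in memory here

def unload_samples_from_ram(sample_indices):
    pass

qsl = lg.ConstructQSL(total_count, perf_count,
                      load_samples_to_ram, unload_samples_from_ram)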

language/bert/pytorch_SUT.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import mlperf_loadgen as lg
2626
import numpy as np
2727
import torch
28+
import transformers
2829
from transformers import BertConfig, BertForQuestionAnswering
2930
from squad_QSL import get_squad_QSL
3031

@@ -48,11 +49,13 @@ def __init__(self, args):
4849
vocab_size=config_json["vocab_size"])
4950

5051
self.dev = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
52+
self.version = transformers.__version__
5153

5254
print("Loading PyTorch model...")
5355
self.model = BertForQuestionAnswering(config)
5456
self.model.to(self.dev)
55-
self.model.load_state_dict(torch.load("build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch"), strict=False)
57+
self.model.eval()
58+
self.model.load_state_dict(torch.load("build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch"), strict=True)
5659

5760
print("Constructing SUT...")
5861
self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.process_latencies)
@@ -67,8 +70,11 @@ def issue_queries(self, query_samples):
6770
model_output = self.model.forward(input_ids=torch.LongTensor(eval_features.input_ids).unsqueeze(0).to(self.dev),
6871
attention_mask=torch.LongTensor(eval_features.input_mask).unsqueeze(0).to(self.dev),
6972
token_type_ids=torch.LongTensor(eval_features.segment_ids).unsqueeze(0).to(self.dev))
70-
start_scores = model_output.start_logits
71-
end_scores = model_output.end_logits
73+
if self.version >= '4.0.0':
74+
start_scores = model_output.start_logits
75+
end_scores = model_output.end_logits
76+
else:
77+
start_scores, end_scores = model_output
7278
output = torch.stack([start_scores, end_scores], axis=-1).squeeze(0).cpu().numpy()
7379

7480
response_array = array.array("B", output.tobytes())
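Editor's note: one caveat in the hunk above is that `self.version >= '4.0.0'` compares version strings lexicographically, which misorders multi-digit components (as strings, '10.0.0' < '4.0.0'). A sketch of a more robust check, assuming the packaging library is available; split_logits is a hypothetical helper, not part of this commit:

# Sketch only: parse versions instead of comparing raw strings.
import transformers
from packaging import version

# transformers >= 4.0 returns a QuestionAnsweringModelOutput object with
# .start_logits / .end_logits by default; 3.x returns a plain tuple.
USE_OUTPUT_OBJECT = version.parse(transformers.__version__) >= version.parse("4.0.0")

def split_logits(model_output):
    if USE_OUTPUT_OBJECT:
        return model_output.start_logits, model_output.end_logits
    start_scores, end_scores = model_output
    return start_scores, end_scores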

loadgen/loadgen.cc
Lines changed: 19 additions & 19 deletions

@@ -183,22 +183,24 @@ auto SampleDistribution<TestMode::PerformanceOnly>(size_t sample_count,
 }
 
 /// \brief SampleDistribution for 3D-UNet SingleStream, for v2.0
-// FIXME: meant for 3D UNet SingleStream only at the moment but the logic should work for others
-// TODO: consolidate the distribution generator after v2.0
-auto SampleDistributionEqualIssue(size_t sample_count, size_t set_size, std::mt19937* rng) {
+// FIXME: meant for 3D UNet SingleStream only at the moment but the logic should
+// work for others
+// TODO: consolidate the distribution generator after v2.0
+auto SampleDistributionEqualIssue(size_t sample_count, size_t set_size,
+                                  std::mt19937* rng) {
   std::vector<size_t> indices;
   std::vector<size_t> shuffle_indices(set_size);
   std::iota(shuffle_indices.begin(), shuffle_indices.end(), 0);
   for (size_t j = 0; j < sample_count; j += set_size) {
     std::shuffle(shuffle_indices.begin(), shuffle_indices.end(), *rng);
-    indices.insert(indices.end(), shuffle_indices.begin(), shuffle_indices.end());
+    indices.insert(indices.end(), shuffle_indices.begin(),
+                   shuffle_indices.end());
   }
   return [indices = std::move(indices), i = size_t(0)](auto& /*gen*/) mutable {
-    return indices.at((i++)%indices.size());
+    return indices.at((i++) % indices.size());
   };
 }
 
-
 /// \brief Generates queries for the requested settings, templated by
 /// scenario and mode.
 /// \todo Make GenerateQueries faster.
@@ -262,10 +264,8 @@ std::vector<QueryMetadata> GenerateQueries(
 
   // FIXME: Only used for v2.0 3D-UNet KiTS19 SingleStream
   // TODO: Need to consolidate the code for any generic usage after v2.0
-  auto sample_distribution_equal_issue =
-      SampleDistributionEqualIssue(min_queries,
-                                   loaded_samples.size(),
-                                   &sample_rng);
+  auto sample_distribution_equal_issue = SampleDistributionEqualIssue(
+      min_queries, loaded_samples.size(), &sample_rng);
 
   auto schedule_distribution =
       ScheduleDistribution<scenario>(settings.target_qps);
@@ -340,12 +340,11 @@ std::vector<QueryMetadata> GenerateQueries(
         scenario == TestScenario::SingleStream;
     for (auto& s : samples) {
       s = loaded_samples[settings.performance_issue_unique
-                             ? sample_distribution_unique(sample_rng)
-                             : settings.performance_issue_same
-                                   ? same_sample
-                                   : equal_issue
-                                         ? sample_distribution_equal_issue(sample_rng)
-                                         : sample_distribution(sample_rng)];
+                             ? sample_distribution_unique(sample_rng)
+                         : settings.performance_issue_same ? same_sample
+                         : equal_issue
+                             ? sample_distribution_equal_issue(sample_rng)
+                             : sample_distribution(sample_rng)];
     }
   }
   queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
@@ -653,7 +652,6 @@ void PerformanceSummary::ProcessLatencies() {
   // Calculate per-query stats.
   size_t query_count = pr.queries_issued;
   assert(pr.query_latencies.size() == query_count);
-  assert(pr.query_intervals.size() == query_count);
   std::sort(pr.query_latencies.begin(), pr.query_latencies.end());
   QuerySampleLatency accumulated_query_latency = 0;
   for (auto latency : pr.query_latencies) {
@@ -1058,13 +1056,15 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) {
     }
     MLPERF_LOG(detail, "result_invalid_reason", recommendation);
   }
+  std::replace(early_stopping_recommendation.begin(),
+               early_stopping_recommendation.end(), '\n', ' ');
   MLPERF_LOG(detail, "early_stopping_result", early_stopping_recommendation);
 
   // Report number of queries
-  MLPERF_LOG(detail, "result_query_count", std::to_string(query_count));
+  MLPERF_LOG(detail, "result_query_count", query_count);
   if (settings.scenario == TestScenario::Server) {
     MLPERF_LOG(detail, "result_overlatency_query_count",
-               std::to_string(overlatency_query_count));
+               overlatency_query_count);
   }
 
   auto reportPerQueryLatencies = [&]() {
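Editor's note: SampleDistributionEqualIssue above builds its index list by concatenating whole shuffled permutations of the loaded set, so every sample is issued a near-equal number of times regardless of how long each sample takes. A Python sketch of the same idea (not part of this commit; equal_issue_indices is a hypothetical name):

# Sketch of the equal-issue distribution: emit whole shuffled permutations
# until at least sample_count indices exist, then hand them out round-robin.
import random

def equal_issue_indices(sample_count, set_size, seed=0):
    rng = random.Random(seed)
    base = list(range(set_size))
    indices = []
    while len(indices) < sample_count:
        rng.shuffle(base)
        indices.extend(base)  # one full permutation per pass
    return indices

# With 8 queries over a 3-sample set, each sample appears at least twice;
# a plain uniform draw could let one slow or fast sample dominate the run.
print(equal_issue_indices(8, 3))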

loadgen/test_settings_internal.cc
Lines changed: 4 additions & 3 deletions

@@ -120,9 +120,10 @@ TestSettingsInternal::TestSettingsInternal(
 
   // Sample by concatentating several permutations of the dataset
   // sample_concatenate_permutation
-  sample_concatenate_permutation = (requested.sample_concatenate_permutation == 0)
-      ? false
-      : requested.sample_concatenate_permutation;
+  sample_concatenate_permutation =
+      (requested.sample_concatenate_permutation == 0)
+          ? false
+          : requested.sample_concatenate_permutation;
 
   // Samples per query.
   if (requested.scenario == TestScenario::MultiStream) {

loadgen/version_generator.py
Lines changed: 1 addition & 1 deletion

@@ -93,7 +93,7 @@ def generate_loadgen_version_definitions(cc_filename, loadgen_root):
     ofile.write("// DO NOT EDIT: Autogenerated by version_generator.py.\n\n")
     ofile.write("#include <string>\n\n")
     ofile.write("namespace mlperf {\n\n")
-    ofile.write(func_def("Version", "\"1.1\""))
+    ofile.write(func_def("Version", "\"2.0\""))
 
     date_time_now_local = datetime.datetime.now().isoformat()
     date_time_now_utc = datetime.datetime.utcnow().isoformat()

speech_recognition/rnnt/accuracy_eval.py
Lines changed: 1 addition & 3 deletions

@@ -8,7 +8,6 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "pytorch"))
 
-from QSL import AudioQSL
 from helpers import process_evaluation_epoch, __gather_predictions
 from parts.manifest import Manifest
 
@@ -31,8 +30,7 @@ def get_args():
 def main():
     args = get_args()
     labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
-    qsl = AudioQSL(args.dataset_dir, args.manifest, labels)
-    manifest = qsl.manifest
+    manifest = Manifest(args.dataset_dir, [args.manifest], labels, len(labels), normalize=True, max_duration=15.0)
    with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh:
        results = json.load(fh)
    hypotheses = []
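Editor's note: the script now builds the Manifest directly instead of constructing an AudioQSL just to reach its manifest. For context, a sketch of how entries in mlperf_log_accuracy.json are commonly decoded in the reference accuracy scripts; the "qsl_idx"/"data" field names and the int64 ("q") packing are assumptions, not shown in this diff:

# Sketch only: decoding hex-encoded label ids from a LoadGen accuracy log.
import array
import json

labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
          "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]

with open("mlperf_log_accuracy.json") as fh:
    results = json.load(fh)

hypotheses = []
for result in results:
    # assumed layout: "qsl_idx" names the sample, "data" holds hex int64 ids
    ids = array.array("q", bytes.fromhex(result["data"])).tolist()
    hypotheses.append("".join(labels[i] for i in ids))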
