Commit 6a413ed

Perf improvement for TopK Accuracy and return all topK in Classification Evaluator (#5395)

* Fix for issue 744

* cleanup

* fixing report output

* fixedTestReferenceOutputs

* Fixed test reference outputs for NetCore31

* change top k acc output string format

* Ranking algorithm now uses first appearance in dataset rather than worstCase

* fixed benchmark

* various minor changes from code review

* limit TopK to OutputTopKAcc parameter

* top k output name changes

* make old TopK readOnly

* restored old baselineOutputs since respecting outputTopK param means no topK in most test output

* fix test fails, re-add names parameter

* Clean up commented code

* that'll teach me to edit from the github webpage

* use existing method, fix nits

* Slight comment change

* Comment change / Touch to kick off build pipeline

* fix whitespace

* Added new test

* Code formatting nits

* Code formatting nit

* Fixed undefined rankofCorrectLabel and trailing whitespace warning

* Removed _numUnknownClassInstances and added test for unknown labels

* Add weight to seenRanks

* Nits

* Removed FastTree import

Co-authored-by: Antonio Velazquez <anvelazq@microsoft.com>
Co-authored-by: Justin Ormont <justinormont@users.noreply.github.com>
3 people committed Dec 9, 2020
1 parent 0c6238e commit 6a413ed
Showing 9 changed files with 203 additions and 63 deletions.
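For orientation, here is a minimal consumption sketch of the surface this commit changes. The helper name ReportTopK and the scoredData argument are hypothetical stand-ins; only Evaluate's topKPredictionCount parameter and the TopKAccuracy / TopKAccuracyForAllK properties shown in the diff below come from this change.

using System;
using Microsoft.ML;
using Microsoft.ML.Data;

static void ReportTopK(MLContext mlContext, IDataView scoredData)
{
    // "scoredData" is assumed to hold predictions from a trained multiclass model.
    var metrics = mlContext.MulticlassClassification.Evaluate(scoredData, topKPredictionCount: 5);

    // TopKAccuracy is kept for compatibility; it now reads the last entry of TopKAccuracyForAllK.
    Console.WriteLine($"Top-5 accuracy: {metrics.TopKAccuracy:F4}");

    // New in this change: top-K accuracy for every K from 1 through TopKPredictionCount.
    for (int k = 1; k <= metrics.TopKPredictionCount; k++)
        Console.WriteLine($"Top-{k} accuracy: {metrics.TopKAccuracyForAllK[k - 1]:F4}");
}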
@@ -123,8 +123,7 @@ private static TMetrics GetAverageMetrics(IEnumerable<TMetrics> metrics, TMetric
logLoss: GetAverageOfNonNaNScores(newMetrics.Select(x => x.LogLoss)),
logLossReduction: GetAverageOfNonNaNScores(newMetrics.Select(x => x.LogLossReduction)),
topKPredictionCount: newMetrics.ElementAt(0).TopKPredictionCount,
topKAccuracy: GetAverageOfNonNaNScores(newMetrics.Select(x => x.TopKAccuracy)),
// Return PerClassLogLoss and ConfusionMatrix from the fold closest to average score
topKAccuracies: GetAverageOfNonNaNScoresInNestedEnumerable(newMetrics.Select(x => x.TopKAccuracyForAllK)),
perClassLogLoss: (metricsClosestToAvg as MulticlassClassificationMetrics).PerClassLogLoss.ToArray(),
confusionMatrix: (metricsClosestToAvg as MulticlassClassificationMetrics).ConfusionMatrix);
return result as TMetrics;
@@ -163,7 +162,6 @@ private static double[] GetAverageOfNonNaNScoresInNestedEnumerable(IEnumerable<I
double[] arr = new double[results.ElementAt(0).Count()];
for (int i = 0; i < arr.Length; i++)
{
Contracts.Assert(arr.Length == results.ElementAt(i).Count());
arr[i] = GetAverageOfNonNaNScores(results.Select(x => x.ElementAt(i)));
}
return arr;
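To make the fold averaging concrete, here is an illustrative standalone sketch (not the library's code) of what GetAverageOfNonNaNScoresInNestedEnumerable computes when given the per-fold TopKAccuracyForAllK vectors: each slot is averaged across folds, and a fold's NaN entry is skipped for that slot only.

using System.Linq;

internal static class FoldAveragingSketch
{
    // Element-wise average of per-fold metric vectors; NaN entries are skipped per slot.
    public static double[] AverageAcrossFolds(double[][] perFoldTopK)
    {
        var result = new double[perFoldTopK[0].Length];
        for (int i = 0; i < result.Length; i++)
        {
            var valid = perFoldTopK.Select(fold => fold[i]).Where(v => !double.IsNaN(v)).ToArray();
            result[i] = valid.Length > 0 ? valid.Average() : double.NaN;
        }
        return result;
    }
}

// Example: three folds, each reporting top-1..top-3 accuracy; the NaN top-1
// entry in the third fold is ignored for that slot.
// AverageAcrossFolds(new[]
// {
//     new[] { 0.70, 0.85, 0.95 },
//     new[] { 0.65, 0.80, 0.90 },
//     new[] { double.NaN, 0.90, 1.00 },
// })  ->  { 0.675, 0.85, 0.95 }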
8 changes: 7 additions & 1 deletion src/Microsoft.ML.Data/Evaluators/EvaluatorUtils.cs
@@ -1035,7 +1035,13 @@ private static List<string> GetMetricNames(IChannel ch, DataViewSchema schema, D
names = editor.Commit();
}
foreach (var name in names.Items(all: true))
metricNames.Add(string.Format("{0}{1}", metricName, name.Value));
{
var tryNaming = string.Format(metricName, name.Value);
if (tryNaming == metricName) // metricName wasn't a format string, so just append slotname
tryNaming = (string.Format("{0}{1}", metricName, name.Value));

metricNames.Add(tryNaming);
}
}
}
ch.Assert(metricNames.Count == metricCount);
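The new loop above lets a metric name double as a format string for its slot names. A short sketch of the rule follows; the metric-name strings are hypothetical examples, not the evaluator's actual constants.

// Mirrors the naming rule in GetMetricNames above (hypothetical metric names).
static string NameMetricSlot(string metricName, string slotName)
{
    var tryNaming = string.Format(metricName, slotName);
    if (tryNaming == metricName)       // metricName had no {0} placeholder...
        return metricName + slotName;  // ...so append: "LogLoss" + "class0" -> "LogLossclass0"
    return tryNaming;                  // "Top K accuracy for K={0}" + "3" -> "Top K accuracy for K=3"
}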
@@ -2,8 +2,10 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using Microsoft.ML.Runtime;

namespace Microsoft.ML.Data
@@ -71,16 +73,22 @@ public sealed class MulticlassClassificationMetrics
public double MicroAccuracy { get; }

/// <summary>
/// If <see cref="TopKPredictionCount"/> is positive, this is the relative number of examples where
/// the true label is one of the top-k predicted labels by the predictor.
/// Convenience method for "TopKAccuracyForAllK[TopKPredictionCount - 1]". If <see cref="TopKPredictionCount"/> is positive,
/// this is the relative number of examples where
/// the true label is one of the top K predicted labels by the predictor.
/// </summary>
public double TopKAccuracy { get; }
public double TopKAccuracy => TopKAccuracyForAllK?.LastOrDefault() ?? 0;

/// <summary>
/// If positive, this indicates the K in <see cref="TopKAccuracy"/>.
/// If positive, this indicates the K in <see cref="TopKAccuracy"/> and <see cref="TopKAccuracyForAllK"/>.
/// </summary>
public int TopKPredictionCount { get; }

/// <summary>
/// Returns the top K accuracy for all K from 1 to the value of TopKPredictionCount.
/// </summary>
public IReadOnlyList<double> TopKAccuracyForAllK { get; }

/// <summary>
/// Gets the log-loss of the classifier for each class. Log-loss measures the performance of a classifier
/// with respect to how much the predicted probabilities diverge from the true class label. Lower
@@ -115,29 +123,30 @@ internal MulticlassClassificationMetrics(IHost host, DataViewRow overallResult,
LogLoss = FetchDouble(MulticlassClassificationEvaluator.LogLoss);
LogLossReduction = FetchDouble(MulticlassClassificationEvaluator.LogLossReduction);
TopKPredictionCount = topKPredictionCount;

if (topKPredictionCount > 0)
TopKAccuracy = FetchDouble(MulticlassClassificationEvaluator.TopKAccuracy);
TopKAccuracyForAllK = RowCursorUtils.Fetch<VBuffer<double>>(host, overallResult, MulticlassClassificationEvaluator.AllTopKAccuracy).DenseValues().ToImmutableArray();

var perClassLogLoss = RowCursorUtils.Fetch<VBuffer<double>>(host, overallResult, MulticlassClassificationEvaluator.PerClassLogLoss);
PerClassLogLoss = perClassLogLoss.DenseValues().ToImmutableArray();
ConfusionMatrix = MetricWriter.GetConfusionMatrix(host, confusionMatrix, binary: false, perClassLogLoss.Length);
}

internal MulticlassClassificationMetrics(double accuracyMicro, double accuracyMacro, double logLoss, double logLossReduction,
int topKPredictionCount, double topKAccuracy, double[] perClassLogLoss)
int topKPredictionCount, double[] topKAccuracies, double[] perClassLogLoss)
{
MicroAccuracy = accuracyMicro;
MacroAccuracy = accuracyMacro;
LogLoss = logLoss;
LogLossReduction = logLossReduction;
TopKPredictionCount = topKPredictionCount;
TopKAccuracy = topKAccuracy;
TopKAccuracyForAllK = topKAccuracies;
PerClassLogLoss = perClassLogLoss.ToImmutableArray();
}

internal MulticlassClassificationMetrics(double accuracyMicro, double accuracyMacro, double logLoss, double logLossReduction,
int topKPredictionCount, double topKAccuracy, double[] perClassLogLoss, ConfusionMatrix confusionMatrix)
: this(accuracyMicro, accuracyMacro, logLoss, logLossReduction, topKPredictionCount, topKAccuracy, perClassLogLoss)
int topKPredictionCount, double[] topKAccuracies, double[] perClassLogLoss, ConfusionMatrix confusionMatrix)
: this(accuracyMicro, accuracyMacro, logLoss, logLossReduction, topKPredictionCount, topKAccuracies, perClassLogLoss)
{
ConfusionMatrix = confusionMatrix;
}
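The commit messages reference rankofCorrectLabel, seenRanks, and per-example weights. The following is a hedged reconstruction of that general bookkeeping, an assumption-laden sketch rather than the evaluator's actual code: rank each example's true label once, accumulate weighted rank counts, then derive every top-K accuracy in one cumulative pass, which is where a performance win over recomputing each K independently would come from.

internal static class TopKAccuracySketch
{
    // Rank of the true label among the scores, 0-based. Ties break by first
    // appearance: an equal score at a smaller index outranks the true label.
    public static int RankOfCorrectLabel(float[] scores, int trueLabel)
    {
        int rank = 0;
        for (int i = 0; i < scores.Length; i++)
        {
            if (scores[i] > scores[trueLabel] ||
                (scores[i] == scores[trueLabel] && i < trueLabel))
                rank++;
        }
        return rank; // rank < K means the true label is in the top K
    }

    // seenRanks[r] holds the total weight of examples whose true label landed
    // at rank r; one cumulative pass yields top-K accuracy for every K at once.
    public static double[] TopKAccuracyForAllK(double[] seenRanks, double totalWeight, int maxK)
    {
        var accuracies = new double[maxK];
        double cumulative = 0;
        for (int k = 0; k < maxK; k++)
        {
            if (k < seenRanks.Length)
                cumulative += seenRanks[k];
            accuracies[k] = cumulative / totalWeight;
        }
        return accuracies;
    }
}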