-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Ensured Sanitized Column Names are Unique in AutoML CLI #5177
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3669579
d476c55
433ebd3
d17e739
22ad34c
42dfa10
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ | |
| using System.Linq; | ||
| using System.Reflection; | ||
| using System.Text; | ||
| using System.Text.RegularExpressions; | ||
| using Microsoft.CodeAnalysis; | ||
| using Microsoft.CodeAnalysis.CSharp; | ||
| using Microsoft.CodeAnalysis.Formatting; | ||
|
|
@@ -246,6 +247,8 @@ internal static int CreateSolutionFile(string solutionFile, string outputPath) | |
| internal static IList<string> GenerateClassLabels(ColumnInferenceResults columnInferenceResults, IDictionary<string, CodeGeneratorSettings.ColumnMapping> columnMapping = default) | ||
| { | ||
| IList<string> result = new List<string>(); | ||
| List<string> normalizedColumnNames = new List<string>(); | ||
| bool duplicateColumnNamesExist = false; | ||
| foreach (var column in columnInferenceResults.TextLoaderOptions.Columns) | ||
| { | ||
| StringBuilder sb = new StringBuilder(); | ||
|
|
@@ -268,37 +271,9 @@ internal static IList<string> GenerateClassLabels(ColumnInferenceResults columnI | |
| dataKind = column.DataKind; | ||
| columnName = column.Name; | ||
| } | ||
| switch (dataKind) | ||
| { | ||
| case Microsoft.ML.Data.DataKind.String: | ||
| sb.Append(Symbols.StringSymbol); | ||
| break; | ||
| case Microsoft.ML.Data.DataKind.Boolean: | ||
| sb.Append(Symbols.BoolSymbol); | ||
| break; | ||
| case Microsoft.ML.Data.DataKind.Single: | ||
| sb.Append(Symbols.FloatSymbol); | ||
| break; | ||
| case Microsoft.ML.Data.DataKind.Double: | ||
| sb.Append(Symbols.DoubleSymbol); | ||
| break; | ||
| case Microsoft.ML.Data.DataKind.Int32: | ||
| sb.Append(Symbols.IntSymbol); | ||
| break; | ||
| case Microsoft.ML.Data.DataKind.UInt32: | ||
| sb.Append(Symbols.UIntSymbol); | ||
| break; | ||
| case Microsoft.ML.Data.DataKind.Int64: | ||
| sb.Append(Symbols.LongSymbol); | ||
| break; | ||
| case Microsoft.ML.Data.DataKind.UInt64: | ||
| sb.Append(Symbols.UlongSymbol); | ||
| break; | ||
| default: | ||
| throw new ArgumentException($"The data type '{column.DataKind}' is not handled currently."); | ||
|
|
||
| } | ||
| sb.Append(GetSymbolOfDataKind(dataKind)); | ||
|
|
||
| // Accommodate VectorType (array) columns | ||
| if (range > 0) | ||
| { | ||
| result.Add($"[ColumnName(\"{columnName}\"),LoadColumn({column.Source[0].Min}, {column.Source[0].Max}) VectorType({(range + 1)})]"); | ||
|
|
@@ -309,12 +284,51 @@ internal static IList<string> GenerateClassLabels(ColumnInferenceResults columnI | |
| result.Add($"[ColumnName(\"{columnName}\"), LoadColumn({column.Source[0].Min})]"); | ||
| } | ||
| sb.Append(" "); | ||
| sb.Append(Utils.Normalize(column.Name)); | ||
| sb.Append("{get; set;}"); | ||
| string normalizedColumnName = Utils.Normalize(column.Name); | ||
| // Put placeholder for normalized and unique version of column name | ||
| if (!duplicateColumnNamesExist && normalizedColumnNames.Contains(normalizedColumnName)) | ||
| duplicateColumnNamesExist = true; | ||
| normalizedColumnNames.Add(normalizedColumnName); | ||
| result.Add(sb.ToString()); | ||
| result.Add("\r\n"); | ||
| } | ||
| for (int i = 1; i < result.Count; i+=3) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you make sure the generated labels are identical in the other source files of the generated project? And here: (SampleData also calls into Utils.Normalize)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Currently, these labels are generated using |
||
| { | ||
| // Get normalized column name for correctly typed class property name | ||
| // If duplicate column names exist, the only way to ensure all generated column names are unique is to add | ||
| // a differentiator depending on the column load order from dataset. | ||
| if (duplicateColumnNamesExist) | ||
| result[i] += normalizedColumnNames[i/3] + $"_col_{i/3}"; | ||
| else | ||
| result[i] += normalizedColumnNames[i/3]; | ||
| result[i] += "{get; set;}"; | ||
| } | ||
| return result; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Returns the <see cref="Symbols"/> constant associated with the given <see cref="DataKind"/>. | ||
| /// </summary> | ||
| /// <param name="dataKind">The column data kind to translate.</param> | ||
| /// <returns> | ||
| /// The matching <c>Symbols</c> string. String, Boolean, Single, Double, Int32, UInt32, Int64 and UInt64 are handled. | ||
| /// NOTE(review): presumably this is the type name emitted into the generated class property - confirm against <c>Symbols</c>. | ||
| /// </returns> | ||
| /// <exception cref="ArgumentException">Thrown for any <see cref="DataKind"/> that has no case in the switch.</exception> | ||
| internal static string GetSymbolOfDataKind(DataKind dataKind) | ||
| { | ||
| switch (dataKind) | ||
| { | ||
| case DataKind.String: | ||
| return Symbols.StringSymbol; | ||
| case DataKind.Boolean: | ||
| return Symbols.BoolSymbol; | ||
| case DataKind.Single: | ||
| return Symbols.FloatSymbol; | ||
| case DataKind.Double: | ||
| return Symbols.DoubleSymbol; | ||
| case DataKind.Int32: | ||
| return Symbols.IntSymbol; | ||
| case DataKind.UInt32: | ||
| return Symbols.UIntSymbol; | ||
| case DataKind.Int64: | ||
| return Symbols.LongSymbol; | ||
| case DataKind.UInt64: | ||
| return Symbols.UlongSymbol; | ||
| default: | ||
| // Any DataKind without a case above is rejected rather than silently mapped. | ||
| throw new ArgumentException($"The data type '{dataKind}' is not handled currently."); | ||
| } | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| //***************************************************************************************** | ||
| //* * | ||
| //* This is an auto-generated file by Microsoft ML.NET CLI (Command-Line Interface) tool. * | ||
| //* * | ||
| //***************************************************************************************** | ||
|
|
||
| using Microsoft.ML.Data; | ||
|
|
||
| namespace test.Model | ||
| { | ||
| public class ModelInput | ||
| { | ||
| [ColumnName("input_0"), LoadColumn(0)] | ||
| public int Id_col_0 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("input_1"), LoadColumn(1)] | ||
| public int MsAssetNum_col_1 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("input_2"), LoadColumn(2)] | ||
| public string Make_col_2 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("input_3"), LoadColumn(3)] | ||
| public string Model_col_3 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("input_4"), LoadColumn(4)] | ||
| public double Model_col_4 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("input_5"), LoadColumn(5)] | ||
| public string Work_category_col_5 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("Work category"), LoadColumn(6)] | ||
| public int Work_category_col_6 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("input_7"), LoadColumn(7)] | ||
| public bool IsDetachable_col_7 { get; set; } | ||
|
|
||
|
|
||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| //***************************************************************************************** | ||
| //* * | ||
| //* This is an auto-generated file by Microsoft ML.NET CLI (Command-Line Interface) tool. * | ||
| //* * | ||
| //***************************************************************************************** | ||
|
|
||
| using Microsoft.ML.Data; | ||
|
|
||
| namespace test.Model | ||
| { | ||
| public class ModelInput | ||
| { | ||
| [ColumnName("id"), LoadColumn(0)] | ||
| public int Id_col_0 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("MsAssetNum"), LoadColumn(1)] | ||
| public int MsAssetNum_col_1 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("Make"), LoadColumn(2)] | ||
| public string Make_col_2 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("Model"), LoadColumn(3)] | ||
| public string Model_col_3 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("model"), LoadColumn(4)] | ||
| public double Model_col_4 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("work category"), LoadColumn(5)] | ||
| public string Work_category_col_5 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("Work category"), LoadColumn(6)] | ||
| public int Work_category_col_6 { get; set; } | ||
|
|
||
|
|
||
| [ColumnName("IsDetachable"), LoadColumn(7)] | ||
| public bool IsDetachable_col_7 { get; set; } | ||
|
|
||
|
|
||
| } | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've decided to keep this
`GenerateClassLabels` as it is possible that a `CodeGenerator` instance can call this function, as it does below: machinelearning/test/Microsoft.ML.CodeGenerator.Tests/CodeGenTests.cs
Line 113 in d476c55
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks!