-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Closed
Labels
AutoML.NET (Automating various steps of the machine learning process) · P1 (Priority of the issue for triage purpose: Needs to be fixed soon.) · bug (Something isn't working) · classification (Bugs related classification tasks) · command-line (Issues pertaining to the command-line interface) · image (Bugs related image datatype tasks)
Description
As @LittleLittleCloud noted in this comment in PR #5177 for fixing Issue #3902, columns generated from inline data are currently named in the following way:
machinelearning/src/Microsoft.ML.CodeGenerator/Utils.cs
Lines 49 to 68 in 33f5f32
| internal static IDictionary<string, string> GenerateSampleData(IDataView dataView, ColumnInferenceResults columnInference) | |
| { | |
| var featureColumns = dataView.Schema.AsEnumerable().Where(col => col.Name != columnInference.ColumnInformation.LabelColumnName && !columnInference.ColumnInformation.IgnoredColumnNames.Contains(col.Name)); | |
| var rowCursor = dataView.GetRowCursor(featureColumns); | |
| var sampleData = featureColumns.Select(column => new { key = Utils.Normalize(column.Name), val = "null" }).ToDictionary(x => x.key, x => x.val); | |
| if (rowCursor.MoveNext()) | |
| { | |
| var getGetGetterMethod = typeof(Utils).GetMethod(nameof(Utils.GetValueFromColumn), BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic); | |
| foreach (var column in featureColumns) | |
| { | |
| var getGeneraicGetGetterMethod = getGetGetterMethod.MakeGenericMethod(column.Type.RawType); | |
| string val = getGeneraicGetGetterMethod.Invoke(null, new object[] { rowCursor, column }) as string; | |
| sampleData[Utils.Normalize(column.Name)] = val; | |
| } | |
| } | |
| return sampleData; | |
| } |
Calling Utils.Normalize directly on each column name, as GenerateSampleData() does, differs from the approach of using GenerateClassLabels to obtain normalized and sanitized column names. GenerateClassLabels can accommodate conflicting/duplicate column names, whereas in GenerateSampleData() this situation results in exceptions (for example, the ToDictionary call fails when two column names normalize to the same key).
machinelearning/src/Microsoft.ML.CodeGenerator/Utils.cs
Lines 246 to 318 in 33f5f32
| internal static IList<string> GenerateClassLabels(ColumnInferenceResults columnInferenceResults, IDictionary<string, CodeGeneratorSettings.ColumnMapping> columnMapping = default) | |
| { | |
| IList<string> result = new List<string>(); | |
| foreach (var column in columnInferenceResults.TextLoaderOptions.Columns) | |
| { | |
| StringBuilder sb = new StringBuilder(); | |
| int range = (column.Source[0].Max - column.Source[0].Min).Value; | |
| bool isArray = range > 0; | |
| sb.Append(Symbols.PublicSymbol); | |
| sb.Append(Symbols.Space); | |
| // if column is in columnMapping, use the type and name in that | |
| DataKind dataKind; | |
| string columnName; | |
| if (columnMapping != null && columnMapping.ContainsKey(column.Name)) | |
| { | |
| dataKind = columnMapping[column.Name].ColumnType; | |
| columnName = columnMapping[column.Name].ColumnName; | |
| } | |
| else | |
| { | |
| dataKind = column.DataKind; | |
| columnName = column.Name; | |
| } | |
| switch (dataKind) | |
| { | |
| case Microsoft.ML.Data.DataKind.String: | |
| sb.Append(Symbols.StringSymbol); | |
| break; | |
| case Microsoft.ML.Data.DataKind.Boolean: | |
| sb.Append(Symbols.BoolSymbol); | |
| break; | |
| case Microsoft.ML.Data.DataKind.Single: | |
| sb.Append(Symbols.FloatSymbol); | |
| break; | |
| case Microsoft.ML.Data.DataKind.Double: | |
| sb.Append(Symbols.DoubleSymbol); | |
| break; | |
| case Microsoft.ML.Data.DataKind.Int32: | |
| sb.Append(Symbols.IntSymbol); | |
| break; | |
| case Microsoft.ML.Data.DataKind.UInt32: | |
| sb.Append(Symbols.UIntSymbol); | |
| break; | |
| case Microsoft.ML.Data.DataKind.Int64: | |
| sb.Append(Symbols.LongSymbol); | |
| break; | |
| case Microsoft.ML.Data.DataKind.UInt64: | |
| sb.Append(Symbols.UlongSymbol); | |
| break; | |
| default: | |
| throw new ArgumentException($"The data type '{column.DataKind}' is not handled currently."); | |
| } | |
| if (range > 0) | |
| { | |
| result.Add($"[ColumnName(\"{columnName}\"),LoadColumn({column.Source[0].Min}, {column.Source[0].Max}) VectorType({(range + 1)})]"); | |
| sb.Append("[]"); | |
| } | |
| else | |
| { | |
| result.Add($"[ColumnName(\"{columnName}\"), LoadColumn({column.Source[0].Min})]"); | |
| } | |
| sb.Append(" "); | |
| sb.Append(Utils.Normalize(column.Name)); | |
| sb.Append("{get; set;}"); | |
| result.Add(sb.ToString()); | |
| result.Add("\r\n"); | |
| } | |
| return result; | |
| } |
To-do:
- Ensure `GenerateSampleData()` can accommodate conflicting/duplicate column names by using `Utils.GenerateClassLabels()`.
Metadata
Metadata
Assignees
Labels
AutoML.NET (Automating various steps of the machine learning process) · P1 (Priority of the issue for triage purpose: Needs to be fixed soon.) · bug (Something isn't working) · classification (Bugs related classification tasks) · command-line (Issues pertaining to the command-line interface) · image (Bugs related image datatype tasks)