From 59699a5e5ed48bb1dbd9289c4676cebe3f9020d1 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Tue, 30 Jul 2019 13:55:28 -0700 Subject: [PATCH] Farewell to the Static API (#4009) --- Microsoft.ML.sln | 127 -- .../experimental/MlNetCookBookStaticApi.md | 995 ---------- .../Microsoft.ML.Samples.csproj | 8 +- .../AveragedPerceptronBinaryClassification.cs | 105 - .../Static/FastTreeBinaryClassification.cs | 108 -- .../Static/FastTreeRegression.cs | 65 - .../Static/FeatureSelectionTransform.cs | 122 -- .../Static/LightGBMBinaryClassification.cs | 107 -- .../LightGBMMulticlassWithInMemoryData.cs | 105 - .../Static/LightGBMRegression.cs | 71 - .../Static/SDCABinaryClassification.cs | 115 -- .../Static/SDCARegression.cs | 65 - .../Microsoft.ML.StaticPipe.nupkgproj | 15 - .../Microsoft.ML.StaticPipe.symbols.nupkgproj | 5 - .../Microsoft.ML.Analyzer.csproj | 14 - .../Properties/AssemblyInfo.cs | 7 - .../TypeIsSchemaShapeAnalyzer.cs | 421 ----- .../Properties/AssemblyInfo.cs | 8 - .../Properties/AssemblyInfo.cs | 2 - .../Properties/AssemblyInfo.cs | 2 - .../Properties/AssemblyInfo.cs | 1 - .../Properties/AssemblyInfo.cs | 1 - .../LightGbmStaticExtensions.cs | 401 ---- .../Microsoft.ML.LightGbm.StaticPipe.csproj | 13 - .../Properties/AssemblyInfo.cs | 1 - ...rosoft.ML.Mkl.Components.StaticPipe.csproj | 13 - .../VectorWhiteningStaticExtensions.cs | 77 - .../Properties/AssemblyInfo.cs | 2 - .../DnnImageFeaturizerStaticExtensions.cs | 65 - ...osoft.ML.OnnxTransformer.StaticPipe.csproj | 14 - .../OnnxStaticExtensions.cs | 60 - .../Properties/AssemblyInfo.cs | 1 - .../Properties/AssemblyInfo.cs | 1 - .../Properties/AssemblyInfo.cs | 1 - .../Properties/AssemblyInfo.cs | 1 - src/Microsoft.ML.StaticPipe/Attributes.cs | 27 - .../CategoricalHashStaticExtensions.cs | 171 -- .../CategoricalStaticExtensions.cs | 161 -- .../ConvertStaticExtensions.cs | 267 --- .../ConvertStaticExtensions.tt | 61 - .../DataLoadSaveOperationsExtensions.cs | 41 - src/Microsoft.ML.StaticPipe/DataLoader.cs 
| 54 - .../DataLoaderEstimator.cs | 41 - src/Microsoft.ML.StaticPipe/DataView.cs | 65 - src/Microsoft.ML.StaticPipe/Estimator.cs | 86 - .../EvaluatorStaticExtensions.cs | 315 --- .../FactorizationMachineStatic.cs | 137 -- .../ImageStaticPipe.cs | 173 -- .../ImageTransformsStatic.cs | 246 --- src/Microsoft.ML.StaticPipe/KMeansStatic.cs | 98 - src/Microsoft.ML.StaticPipe/LbfgsStatic.cs | 323 ---- .../LdaStaticExtensions.cs | 172 -- .../LocalPathReader.cs | 21 - .../LpNormalizerStaticExtensions.cs | 62 - .../MatrixFactorizationStatic.cs | 107 -- .../Microsoft.ML.StaticPipe.csproj | 48 - .../MulticlassNaiveBayesStatic.cs | 51 - .../NormalizerStaticExtensions.cs | 376 ---- .../OnlineLearnerStatic.cs | 259 --- src/Microsoft.ML.StaticPipe/PipelineColumn.cs | 158 -- src/Microsoft.ML.StaticPipe/Reconciler.cs | 77 - .../SchemaAssertionContext.cs | 215 --- src/Microsoft.ML.StaticPipe/SchemaBearing.cs | 63 - .../SdcaStaticExtensions.cs | 517 ----- src/Microsoft.ML.StaticPipe/SgdStatic.cs | 179 -- .../StaticPipeExtensions.cs | 94 - .../StaticPipeInternalUtils.cs | 672 ------- .../StaticPipeUtils.cs | 464 ----- .../StaticSchemaShape.cs | 350 ---- .../TermStaticExtensions.cs | 1174 ------------ .../TermStaticExtensions.tt | 99 - .../TextLoaderStatic.cs | 307 --- .../TextStaticExtensions.cs | 597 ------ .../TrainerEstimatorReconciler.cs | 529 ------ .../TrainingStaticExtensions.cs | 287 --- src/Microsoft.ML.StaticPipe/Transformer.cs | 45 - .../TransformsStatic.cs | 1683 ----------------- .../TreeTrainersStatic.cs | 316 ---- .../WordEmbeddingsStaticExtensions.cs | 89 - .../Microsoft.ML.TensorFlow.StaticPipe.csproj | 13 - .../TensorFlowStaticExtensions.cs | 94 - src/Microsoft.ML.TensorFlow/AssemblyInfo.cs | 9 - .../Properties/AssemblyInfo.cs | 5 +- .../Microsoft.ML.TimeSeries.StaticPipe.csproj | 13 - .../TimeSeriesStatic.cs | 311 --- .../Properties/AssemblyInfo.cs | 1 - .../Code/BestFriendTest.cs | 1 - .../Helpers/AdditionalMetadataReferences.cs | 2 - 
.../Helpers/CSharpCodeFixVerifier`2.cs | 1 - .../Microsoft.ML.CodeAnalyzer.Tests.csproj | 2 - .../TypeIsSchemaShapeClassResource.cs | 154 -- .../Resources/TypeIsSchemaShapeResource.cs | 46 - .../TypeIsSchemaShapeResourceChained.cs | 64 - .../TypeIsSchemaShapeTest.cs | 90 - .../Microsoft.ML.Functional.Tests.csproj | 4 +- .../DnnImageFeaturizerTest.cs | 27 +- .../Microsoft.ML.OnnxTransformerTest.csproj | 4 +- .../OnnxTransformTests.cs | 29 +- .../ImageAnalyticsTests.cs | 44 - .../Microsoft.ML.StaticPipelineTesting.csproj | 31 - .../StaticPipeFakes.cs | 210 -- .../StaticPipeTests.cs | 889 --------- .../Training.cs | 1364 ------------- .../TreeRepresentation.cs | 191 -- test/Microsoft.ML.Tests/CachingTests.cs | 21 - .../Microsoft.ML.Tests.csproj | 2 - .../Api/CookbookSamples/CookbookSamples.cs | 716 ------- .../Scenarios/GetColumnTests.cs | 90 +- .../TensorFlowEstimatorTests.cs | 48 +- .../TrainerEstimators/OnlineLinearTests.cs | 24 +- .../TrainerEstimators/SdcaTests.cs | 34 +- .../Transformers/CategoricalHashTests.cs | 45 +- .../Transformers/CategoricalTests.cs | 29 +- .../Transformers/FeatureSelectionTests.cs | 78 +- .../KeyToBinaryVectorEstimatorTest.cs | 29 +- .../Transformers/KeyToValueTests.cs | 34 +- .../Transformers/KeyToVectorEstimatorTests.cs | 33 +- .../Transformers/NAIndicatorTests.cs | 16 +- .../Transformers/NAReplaceTests.cs | 33 +- .../Transformers/NormalizerTests.cs | 110 +- .../Transformers/PcaTests.cs | 34 +- .../Transformers/RffTests.cs | 21 +- .../Transformers/TextFeaturizerTests.cs | 146 +- .../Transformers/TextNormalizer.cs | 10 +- .../Microsoft.ML.TimeSeries.Tests.csproj | 3 +- ...icTests.cs => TimeSeriesSimpleApiTests.cs} | 60 +- 126 files changed, 423 insertions(+), 19188 deletions(-) delete mode 100644 docs/code/experimental/MlNetCookBookStaticApi.md delete mode 100644 docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs 
delete mode 100644 docs/samples/Microsoft.ML.Samples/Static/FastTreeRegression.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/Static/FeatureSelectionTransform.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs delete mode 100644 pkg/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.nupkgproj delete mode 100644 pkg/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.symbols.nupkgproj delete mode 100644 src/Microsoft.ML.Analyzer/Microsoft.ML.Analyzer.csproj delete mode 100644 src/Microsoft.ML.Analyzer/Properties/AssemblyInfo.cs delete mode 100644 src/Microsoft.ML.Analyzer/TypeIsSchemaShapeAnalyzer.cs delete mode 100644 src/Microsoft.ML.LightGbm.StaticPipe/LightGbmStaticExtensions.cs delete mode 100644 src/Microsoft.ML.LightGbm.StaticPipe/Microsoft.ML.LightGbm.StaticPipe.csproj delete mode 100644 src/Microsoft.ML.Mkl.Components.StaticPipe/Microsoft.ML.Mkl.Components.StaticPipe.csproj delete mode 100644 src/Microsoft.ML.Mkl.Components.StaticPipe/VectorWhiteningStaticExtensions.cs delete mode 100644 src/Microsoft.ML.OnnxTransformer.StaticPipe/DnnImageFeaturizerStaticExtensions.cs delete mode 100644 src/Microsoft.ML.OnnxTransformer.StaticPipe/Microsoft.ML.OnnxTransformer.StaticPipe.csproj delete mode 100644 src/Microsoft.ML.OnnxTransformer.StaticPipe/OnnxStaticExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/Attributes.cs delete mode 100644 src/Microsoft.ML.StaticPipe/CategoricalHashStaticExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/CategoricalStaticExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.cs delete mode 100644 
src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.tt delete mode 100644 src/Microsoft.ML.StaticPipe/DataLoadSaveOperationsExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/DataLoader.cs delete mode 100644 src/Microsoft.ML.StaticPipe/DataLoaderEstimator.cs delete mode 100644 src/Microsoft.ML.StaticPipe/DataView.cs delete mode 100644 src/Microsoft.ML.StaticPipe/Estimator.cs delete mode 100644 src/Microsoft.ML.StaticPipe/EvaluatorStaticExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/FactorizationMachineStatic.cs delete mode 100644 src/Microsoft.ML.StaticPipe/ImageStaticPipe.cs delete mode 100644 src/Microsoft.ML.StaticPipe/ImageTransformsStatic.cs delete mode 100644 src/Microsoft.ML.StaticPipe/KMeansStatic.cs delete mode 100644 src/Microsoft.ML.StaticPipe/LbfgsStatic.cs delete mode 100644 src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/LocalPathReader.cs delete mode 100644 src/Microsoft.ML.StaticPipe/LpNormalizerStaticExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/MatrixFactorizationStatic.cs delete mode 100644 src/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.csproj delete mode 100644 src/Microsoft.ML.StaticPipe/MulticlassNaiveBayesStatic.cs delete mode 100644 src/Microsoft.ML.StaticPipe/NormalizerStaticExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/OnlineLearnerStatic.cs delete mode 100644 src/Microsoft.ML.StaticPipe/PipelineColumn.cs delete mode 100644 src/Microsoft.ML.StaticPipe/Reconciler.cs delete mode 100644 src/Microsoft.ML.StaticPipe/SchemaAssertionContext.cs delete mode 100644 src/Microsoft.ML.StaticPipe/SchemaBearing.cs delete mode 100644 src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/SgdStatic.cs delete mode 100644 src/Microsoft.ML.StaticPipe/StaticPipeExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/StaticPipeInternalUtils.cs delete mode 100644 
src/Microsoft.ML.StaticPipe/StaticPipeUtils.cs delete mode 100644 src/Microsoft.ML.StaticPipe/StaticSchemaShape.cs delete mode 100644 src/Microsoft.ML.StaticPipe/TermStaticExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/TermStaticExtensions.tt delete mode 100644 src/Microsoft.ML.StaticPipe/TextLoaderStatic.cs delete mode 100644 src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/TrainerEstimatorReconciler.cs delete mode 100644 src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs delete mode 100644 src/Microsoft.ML.StaticPipe/Transformer.cs delete mode 100644 src/Microsoft.ML.StaticPipe/TransformsStatic.cs delete mode 100644 src/Microsoft.ML.StaticPipe/TreeTrainersStatic.cs delete mode 100644 src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs delete mode 100644 src/Microsoft.ML.TensorFlow.StaticPipe/Microsoft.ML.TensorFlow.StaticPipe.csproj delete mode 100644 src/Microsoft.ML.TensorFlow.StaticPipe/TensorFlowStaticExtensions.cs delete mode 100644 src/Microsoft.ML.TensorFlow/AssemblyInfo.cs delete mode 100644 src/Microsoft.ML.TimeSeries.StaticPipe/Microsoft.ML.TimeSeries.StaticPipe.csproj delete mode 100644 src/Microsoft.ML.TimeSeries.StaticPipe/TimeSeriesStatic.cs delete mode 100644 test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeClassResource.cs delete mode 100644 test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeResource.cs delete mode 100644 test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeResourceChained.cs delete mode 100644 test/Microsoft.ML.CodeAnalyzer.Tests/TypeIsSchemaShapeTest.cs delete mode 100644 test/Microsoft.ML.StaticPipelineTesting/ImageAnalyticsTests.cs delete mode 100644 test/Microsoft.ML.StaticPipelineTesting/Microsoft.ML.StaticPipelineTesting.csproj delete mode 100644 test/Microsoft.ML.StaticPipelineTesting/StaticPipeFakes.cs delete mode 100644 test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs delete mode 100644 
test/Microsoft.ML.StaticPipelineTesting/Training.cs delete mode 100644 test/Microsoft.ML.StaticPipelineTesting/TreeRepresentation.cs delete mode 100644 test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs rename test/Microsoft.ML.TimeSeries.Tests/{TimeSeriesStaticTests.cs => TimeSeriesSimpleApiTests.cs} (76%) diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln index 5e95aea98a..18df20ad0b 100644 --- a/Microsoft.ML.sln +++ b/Microsoft.ML.sln @@ -104,10 +104,6 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Mkl.Components EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TensorFlow", "src\Microsoft.ML.TensorFlow\Microsoft.ML.TensorFlow.csproj", "{570A0B8A-5463-44D2-8521-54C0CA4CACA9}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Analyzer", "src\Microsoft.ML.Analyzer\Microsoft.ML.Analyzer.csproj", "{6DEF0F40-3853-47B3-8165-5F24BA5E14DF}" -EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.StaticPipelineTesting", "test\Microsoft.ML.StaticPipelineTesting\Microsoft.ML.StaticPipelineTesting.csproj", "{8B38BF24-35F4-4787-A9C5-22D35987106E}" -EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TimeSeries", "src\Microsoft.ML.TimeSeries\Microsoft.ML.TimeSeries.csproj", "{5A79C7F0-3D99-4123-B0DA-7C9FFCD13132}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.OnnxTransformer", "src\Microsoft.ML.OnnxTransformer\Microsoft.ML.OnnxTransformer.csproj", "{8C05642D-C3AA-4972-B02C-93681161A6BC}" @@ -138,18 +134,6 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.DnnImageFeatur EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.EntryPoints", "src\Microsoft.ML.EntryPoints\Microsoft.ML.EntryPoints.csproj", "{7504D46F-E4B3-43CB-9B1C-82F3131F1C99}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.StaticPipe", 
"src\Microsoft.ML.StaticPipe\Microsoft.ML.StaticPipe.csproj", "{6B1B93D0-142A-4111-A20E-62B55A3E36A3}" -EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TensorFlow.StaticPipe", "src\Microsoft.ML.TensorFlow.StaticPipe\Microsoft.ML.TensorFlow.StaticPipe.csproj", "{F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}" -EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Mkl.Components.StaticPipe", "src\Microsoft.ML.Mkl.Components.StaticPipe\Microsoft.ML.Mkl.Components.StaticPipe.csproj", "{2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}" -EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.OnnxTransformer.StaticPipe", "src\Microsoft.ML.OnnxTransformer.StaticPipe\Microsoft.ML.OnnxTransformer.StaticPipe.csproj", "{D1324668-9568-40F4-AA55-30A9A516C230}" -EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.LightGbm.StaticPipe", "src\Microsoft.ML.LightGbm.StaticPipe\Microsoft.ML.LightGbm.StaticPipe.csproj", "{22C51B08-ACAE-47B2-A312-462DC239A23B}" -EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.TimeSeries.StaticPipe", "src\Microsoft.ML.TimeSeries.StaticPipe\Microsoft.ML.TimeSeries.StaticPipe.csproj", "{06A147ED-15EA-4106-9105-9B745125B470}" -EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Microsoft.ML.Mkl.Components", "Microsoft.ML.Mkl.Components", "{63006A14-B924-48C5-83C9-CFE9DA22B01F}" ProjectSection(SolutionItems) = preProject pkg\Microsoft.ML.Mkl.Components\Microsoft.ML.Mkl.Components.nupkgproj = pkg\Microsoft.ML.Mkl.Components\Microsoft.ML.Mkl.Components.nupkgproj @@ -185,12 +169,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Microsoft.ML.OnnxTransforme pkg\Microsoft.ML.OnnxTransformer\Microsoft.ML.OnnxTransformer.symbols.nupkgproj = pkg\Microsoft.ML.OnnxTransformer\Microsoft.ML.OnnxTransformer.symbols.nupkgproj EndProjectSection EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Microsoft.ML.StaticPipe", 
"Microsoft.ML.StaticPipe", "{EC9844CE-070A-4313-B3A3-44701AC5F1E1}" - ProjectSection(SolutionItems) = preProject - pkg\Microsoft.ML.StaticPipe\Microsoft.ML.StaticPipe.nupkgproj = pkg\Microsoft.ML.StaticPipe\Microsoft.ML.StaticPipe.nupkgproj - pkg\Microsoft.ML.StaticPipe\Microsoft.ML.StaticPipe.symbols.nupkgproj = pkg\Microsoft.ML.StaticPipe\Microsoft.ML.StaticPipe.symbols.nupkgproj - EndProjectSection -EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Microsoft.ML.Recommender", "Microsoft.ML.Recommender", "{320AF46A-4809-486E-8F9E-A00C8AE47751}" ProjectSection(SolutionItems) = preProject pkg\Microsoft.ML.Recommender\Microsoft.ML.Recommender.nupkgproj = pkg\Microsoft.ML.Recommender\Microsoft.ML.Recommender.nupkgproj @@ -680,30 +658,6 @@ Global {570A0B8A-5463-44D2-8521-54C0CA4CACA9}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU {570A0B8A-5463-44D2-8521-54C0CA4CACA9}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU {570A0B8A-5463-44D2-8521-54C0CA4CACA9}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Debug|Any CPU.Build.0 = Debug|Any CPU - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Release|Any CPU.ActiveCfg = Release|Any CPU - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Release|Any CPU.Build.0 = Release|Any CPU - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU - 
{6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Debug|Any CPU.Build.0 = Debug|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Release|Any CPU.ActiveCfg = Release|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Release|Any CPU.Build.0 = Release|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU - {8B38BF24-35F4-4787-A9C5-22D35987106E}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU {5A79C7F0-3D99-4123-B0DA-7C9FFCD13132}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {5A79C7F0-3D99-4123-B0DA-7C9FFCD13132}.Debug|Any CPU.Build.0 = Debug|Any CPU {5A79C7F0-3D99-4123-B0DA-7C9FFCD13132}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU @@ -872,78 +826,6 @@ Global {7504D46F-E4B3-43CB-9B1C-82F3131F1C99}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU {7504D46F-E4B3-43CB-9B1C-82F3131F1C99}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU {7504D46F-E4B3-43CB-9B1C-82F3131F1C99}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU - {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - 
{6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Debug|Any CPU.Build.0 = Debug|Any CPU - {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU - {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU - {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU - {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU - {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Release|Any CPU.ActiveCfg = Release|Any CPU - {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Release|Any CPU.Build.0 = Release|Any CPU - {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU - {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU - {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU - {6B1B93D0-142A-4111-A20E-62B55A3E36A3}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU - {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Debug|Any CPU.Build.0 = Debug|Any CPU - {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU - {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU - {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU - {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU - {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Release|Any CPU.ActiveCfg = Release|Any CPU - {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Release|Any CPU.Build.0 = Release|Any CPU - {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU - {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU - 
{F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU - {F95F7AFB-03AF-4D20-BD75-1740B5FF71D3}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Debug|Any CPU.Build.0 = Debug|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Release|Any CPU.ActiveCfg = Release|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Release|Any CPU.Build.0 = Release|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Debug|Any CPU.Build.0 = Debug|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Release|Any CPU.ActiveCfg = Release|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Release|Any 
CPU.Build.0 = Release|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU - {D1324668-9568-40F4-AA55-30A9A516C230}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Debug|Any CPU.Build.0 = Debug|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Release|Any CPU.ActiveCfg = Release|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Release|Any CPU.Build.0 = Release|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU - {22C51B08-ACAE-47B2-A312-462DC239A23B}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Debug|Any CPU.Build.0 = Debug|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Debug-Intrinsics|Any CPU.Build.0 = Debug-Intrinsics|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Debug-netfx|Any CPU.ActiveCfg = 
Debug-netfx|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Release|Any CPU.ActiveCfg = Release|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Release|Any CPU.Build.0 = Release|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Release-Intrinsics|Any CPU.ActiveCfg = Release-Intrinsics|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Release-Intrinsics|Any CPU.Build.0 = Release-Intrinsics|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU - {06A147ED-15EA-4106-9105-9B745125B470}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU {85D0CAFD-2FE8-496A-88C7-585D35B94243}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {85D0CAFD-2FE8-496A-88C7-585D35B94243}.Debug|Any CPU.Build.0 = Debug|Any CPU {85D0CAFD-2FE8-496A-88C7-585D35B94243}.Debug-Intrinsics|Any CPU.ActiveCfg = Debug-Intrinsics|Any CPU @@ -1058,8 +940,6 @@ Global {00E38F77-1E61-4CDF-8F97-1417D4E85053} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {A7222F41-1CF0-47D9-B80C-B4D77B027A61} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {570A0B8A-5463-44D2-8521-54C0CA4CACA9} = {09EADF06-BE25-4228-AB53-95AE3E15B530} - {6DEF0F40-3853-47B3-8165-5F24BA5E14DF} = {09EADF06-BE25-4228-AB53-95AE3E15B530} - {8B38BF24-35F4-4787-A9C5-22D35987106E} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} {5A79C7F0-3D99-4123-B0DA-7C9FFCD13132} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {8C05642D-C3AA-4972-B02C-93681161A6BC} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {73DAAC82-D308-48CC-8FFE-3B037F8BBCCA} = {09EADF06-BE25-4228-AB53-95AE3E15B530} @@ -1074,19 +954,12 @@ Global {4805129D-78C8-46D4-9519-0AD9B0574D6D} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {DB7CEB5E-8BE6-48A7-87BE-B91D9AE96F71} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {7504D46F-E4B3-43CB-9B1C-82F3131F1C99} = {09EADF06-BE25-4228-AB53-95AE3E15B530} - {6B1B93D0-142A-4111-A20E-62B55A3E36A3} = {09EADF06-BE25-4228-AB53-95AE3E15B530} - 
{F95F7AFB-03AF-4D20-BD75-1740B5FF71D3} = {09EADF06-BE25-4228-AB53-95AE3E15B530} - {2F25EF6A-C754-45BE-AD9E-7DDF46A1B51A} = {09EADF06-BE25-4228-AB53-95AE3E15B530} - {D1324668-9568-40F4-AA55-30A9A516C230} = {09EADF06-BE25-4228-AB53-95AE3E15B530} - {22C51B08-ACAE-47B2-A312-462DC239A23B} = {09EADF06-BE25-4228-AB53-95AE3E15B530} - {06A147ED-15EA-4106-9105-9B745125B470} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {63006A14-B924-48C5-83C9-CFE9DA22B01F} = {D3D38B03-B557-484D-8348-8BADEE4DF592} {1229F799-37F0-4282-B9F0-74BFA97CC362} = {D3D38B03-B557-484D-8348-8BADEE4DF592} {DE95FE65-9FF7-4233-93DF-7A8F2805624A} = {D3D38B03-B557-484D-8348-8BADEE4DF592} {4CF8095E-B4A3-4326-A550-43098E447288} = {D3D38B03-B557-484D-8348-8BADEE4DF592} {19AC192B-75FE-45D5-B219-898E401D5904} = {D3D38B03-B557-484D-8348-8BADEE4DF592} {93FF16AA-635E-421D-96C1-008818C143A2} = {D3D38B03-B557-484D-8348-8BADEE4DF592} - {EC9844CE-070A-4313-B3A3-44701AC5F1E1} = {D3D38B03-B557-484D-8348-8BADEE4DF592} {320AF46A-4809-486E-8F9E-A00C8AE47751} = {D3D38B03-B557-484D-8348-8BADEE4DF592} {11894B4A-78B4-4523-A6DD-4495722E244F} = {D3D38B03-B557-484D-8348-8BADEE4DF592} {7F3D89CF-EAAD-4F21-AE83-F2EF9C97EC32} = {D3D38B03-B557-484D-8348-8BADEE4DF592} diff --git a/docs/code/experimental/MlNetCookBookStaticApi.md b/docs/code/experimental/MlNetCookBookStaticApi.md deleted file mode 100644 index 086e3b8e3b..0000000000 --- a/docs/code/experimental/MlNetCookBookStaticApi.md +++ /dev/null @@ -1,995 +0,0 @@ -# ML.NET Cookbook (Using Static API examples) - -This document is intended to provide essential samples for common usage patterns of ML.NET, using examples from the experimental "Static API". -The static api operates on the schema of the data, and strongly types the data columns. -If you are loading an existing model from a stream, there's no need to use static types (and it's also pretty hard to do). -You should consider using the supported dynamic API. 
The same examples and content found through this cookbook is also available for the dynamic API in the [ML.NET CookBook.md](../MlNetCookBook.md) -Also, if the data view's schema is only known at runtime, there is no way to use static types. -Using the static API gives you compiler support: it's more likely that if your code compiles, it will also work as intended because the checks on the type compatibility between the data and the estimators are taken care at compile time. - -To get started, it is helpful to be at least minimally familiar with [high-level concepts of ML.NET](../MlNetHighLevelConcepts.md), otherwise the terminology in this document may be foreign to you. -As the static API is experimental and under development, we welcome feedback and examples where it gives an improved experience over the supported dynamic API. - -## How to use this cookbook - -Developers often work by copying and pasting source code from somewhere and then adapting it to their needs. We do it all the time. - -So, we decided to embrace the pattern and provide an authoritative set of example usages of ML.NET, for many common scenarios that you may encounter. -These examples are multi-purpose: - -- They can kickstart your development, so that you don't start from nothing, -- They are annotated and verbose, so you have easier time adapting them to your needs. - -Each sample also contains a snippet of the data file used in the sample. We mostly use snippets from our test datasets for that. - -Please feel free to search this page and use any code that suits your needs. 
- -### List of recipes - -- [How do I load data from a text file?](#how-do-i-load-data-from-a-text-file) -- [How do I load data with many columns from a CSV?](#how-do-i-load-data-with-many-columns-from-a-csv) -- [How do I debug my experiment or preview my pipeline?](#how-do-i-debug-my-experiment-or-preview-my-pipeline) -- [How do I look at the intermediate data?](#how-do-i-look-at-the-intermediate-data) -- [How do I train a regression model?](#how-do-i-train-a-regression-model) -- [How do I verify the model quality?](#how-do-i-verify-the-model-quality) -- [How do I save and load the model?](#how-do-i-save-and-load-the-model) -- [How do I use the model to make one prediction?](#how-do-i-use-the-model-to-make-one-prediction) -- [What if my training data is not in a text file?](#what-if-my-training-data-is-not-in-a-text-file) -- [I want to look at my model's coefficients](#i-want-to-look-at-my-models-coefficients) -- [What is normalization and why do I need to care?](#what-is-normalization-and-why-do-i-need-to-care) -- [How do I train my model on categorical data?](#how-do-i-train-my-model-on-categorical-data) -- [How do I train my model on textual data?](#how-do-i-train-my-model-on-textual-data) -- [How do I train using cross-validation?](#how-do-i-train-using-cross-validation) -- [Can I mix and match static and dynamic pipelines?](#can-i-mix-and-match-static-and-dynamic-pipelines) - -### General questions about the samples - -As this document is reviewed, we found that certain general clarifications are in order about all the samples together. We try to address them in this section. - -- *My compiler fails to find some of the methods that are present in the samples!* -This is because we rely on extension methods a lot, and they only become available after you say `using TheRightNamespace`. -We are still re-organizing the namespaces, and trying to improve the story. 
In the meantime, the following namespaces prove useful for extension methods: -```csharp -using Microsoft.ML.Data; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.Trainers; -using Microsoft.ML.Transforms; -``` - -- *Why are there two ways of doing things? Which one is better, static or dynamic?* -The static and dynamic are just two different APIs to compose the same pipelines, evaluate the models and generate predictions. -They create different developer experiences: the static API is typed over the schema of the data, and the dynamic one is not, causing the errors to show up only at runtime. - -If you are loading an existing model from a stream, there's no need to use static types (and it's also pretty hard to do). -Also, if the data view's schema is only known at runtime, there is no way to use static types. -You might prefer the static types, since this way gives you compiler support: it's more likely that, if your code compiles, it will also work as intended. -The pipelines created, functionality and the model generated as the result of training are the same. - -- *What is the [MLContext](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.mlcontext?view=ml-dotnet)?* -The MLContext is a starting point for all ML.NET operations. -It is instantiated by the user, and provides mechanisms for logging, exception tracking, and setting the source of randomness. -It is the entry point for training, prediction, model operations and also serves as a catalog of available operations. -You will need one MLContext object for your pipelines or inference code. - -```csharp -// as a catalog of available operations and as the source of randomness. -var mlContext = new MLContext(); -``` - -- *Why do we call `loader.MakeNewEstimator` to create a pipeline?* -In the static pipeline, we need to know the two 'schema' types: the input and the output to the pipeline. 
-One of them is already known: typically, the output schema of `loader` (which is the same as the schema of `loader.Load()`) is also the input schema of the learning pipeline. - -The call to `x.MakeNewEstimator` is only using the `x`'s *schema* to create an empty pipeline, it doesn't use anything else from `x`. So, the following three lines would create the exactly same (empty) pipeline: -```csharp -var p1 = loader.MakeNewEstimator(); -var p2 = loader.Load(dataLocation).MakeNewEstimator(); -var p3 = p1.MakeNewEstimator(); -``` - -- *Can we use `loader` to load more than one file?* -Absolutely! This is why we separated `loader` from the data. This is completely legitimate (and recommended): -```csharp -var trainData = loader.Load(trainDataLocation); -var testData = loader.Load(testDataLocation); -``` - -## How do I load data from a text file? - -`TextLoader` is used to load data from text files. You will need to specify what are the data columns, what are their types, and where to find them in the text file. - -Note that it's perfectly acceptable to load only some columns of a file, or load the same column multiple times. - -[Example file](../../test/data/adult.tiny.with-schema.txt): -``` -Label Workclass education marital-status -0 Private 11th Never-married -0 Private HS-grad Married-civ-spouse -1 Local-gov Assoc-acdm Married-civ-spouse -1 Private Some-college Married-civ-spouse -``` - -This is how you can load this data: - -```csharp -// Create the loader: define the data columns and where to find them in the text file. -var loader = mlContext.Data.CreateTextLoader(ctx => ( - // A boolean column depicting the 'target label'. - IsOver50K: ctx.LoadBool(0), - // Three text columns. - Workclass: ctx.LoadText(1), - Education: ctx.LoadText(2), - MaritalStatus: ctx.LoadText(3)), - hasHeader: true); - -// Now load the file (remember though, loaders are lazy, so the actual loading will happen when the data is accessed). 
-var data = loader.Load(dataPath); -``` - -## How do I load data from multiple files? - -You can again use the `TextLoader`, and specify an array of files to its Load method. -The files need to have the same schema (same number and type of columns) - -[Example file1](../../test/data/adult.train): -[Example file2](../../test/data/adult.test): -``` -Label Workclass education marital-status -0 Private 11th Never-married -0 Private HS-grad Married-civ-spouse -1 Local-gov Assoc-acdm Married-civ-spouse -1 Private Some-college Married-civ-spouse -``` - -This is how you can load this data: -```csharp - -// Create the loader: define the data columns and where to find them in the text file. -var loader = mlContext.Data.CreateTextLoader(ctx => ( - // A boolean column depicting the 'target label'. - IsOver50K: ctx.LoadBool(14), - // Three text columns. - Workclass: ctx.LoadText(1), - Education: ctx.LoadText(3), - MaritalStatus: ctx.LoadText(5)), - hasHeader: true); - -// Now load the files (remember though, loaders are lazy, so the actual loading will happen when the data is accessed). -var data = loader.Load(exampleFile1, exampleFile2); -``` - -## How do I load data with many columns from a CSV? -`TextLoader` is used to load data from text files. You will need to specify what are the data columns, what are their types, and where to find them in the text file. - -When the input file contains many columns of the same type, always intended to be used together, we recommend loading them as a *vector column* from the very start: this way the schema of the data is cleaner, and we don't incur unnecessary performance costs. 
- -[Example file](../../test/data/generated_regression_dataset.csv): -``` --2.75,0.77,-0.61,0.14,1.39,0.38,-0.53,-0.50,-2.13,-0.39,0.46,140.66 --0.61,-0.37,-0.12,0.55,-1.00,0.84,-0.02,1.30,-0.24,-0.50,-2.12,148.12 --0.85,-0.91,1.81,0.02,-0.78,-1.41,-1.09,-0.65,0.90,-0.37,-0.22,402.20 -0.28,1.05,-0.24,0.30,-0.99,0.19,0.32,-0.95,-1.19,-0.63,0.75,443.51 -``` - -Loading this file using `TextLoader`: -```csharp -// Create the loader: define the data columns and where to find them in the text file. -var loader = mlContext.Data.CreateTextLoader(ctx => ( - // We load the first 11 values as a single float vector. - FeatureVector: ctx.LoadFloat(0, 10), - // Separately, load the target variable. - Target: ctx.LoadFloat(11) - ), - // Default separator is tab, but we need a comma. - separatorChar: ','); - - -// Now load the file (remember though, loaders are lazy, so the actual loading will happen when the data is accessed). -var data = loader.Load(dataPath); -``` - -## How do I debug my experiment or preview my pipeline? - -Most ML.NET operations are 'lazy': they are not actually processing data, they just validate that the operation is possible, and then defer execution until the output data is actually requested. -This provides good efficiency, but makes it hard to step through and debug the experiment. - -The `Preview()` extension method is added to data views, transformers, estimators and loaders: - -- `Preview` of a data view contains first 100 rows (configurable) of the data view, encoded as objects, in a single in-memory structure. -- `Preview` of a transformer takes data as input, and outputs the preview of the transformed data. -- `Preview` of an estimator also takes data as input, fits an 'approximated model' on the first 100 rows (configurable) of data, and then outputs the preview of the resulting transformer. 
- -We tried to make `Preview` debugger-friendly: our expectation is that, if you enter, say `data.Preview()` in your Watch window, you will be able to easily inspect the data there. - -Here is the code sample: -```csharp -var loader = mlContext.Data.CreateTextLoader(ctx => ( - // We load the first 11 values as a single float vector. - FeatureVector: ctx.LoadFloat(0, 10), - // Separately, load the target variable. - Target: ctx.LoadFloat(11) - ), - // Default separator is tab, but we need a comma. - separatorChar: ','); - - -// Now load the file (remember though, loaders are lazy, so the actual loading will happen when the data is accessed). -var data = loader.Load(dataPath); - -// Preview the data. -var dataPreview = data.Preview(); -``` - -Similarly, if we wanted to preview the data resulting from the transformation, we would compose a pipeline, then call Preview(): - -```csharp - var learningPipeline = loader.MakeNewEstimator() - // We add a step for caching data in memory so that the downstream iterative training - // algorithm can efficiently scan through the data multiple times. - .AppendCacheCheckpoint() - // Now we can add any 'training steps' to it. In our case we want to 'normalize' the data (rescale to be - // between -1 and 1 for all examples), and then train the model. - .Append(r => ( - // Retain the 'Target' column for evaluation purposes. - r.Target, - // We choose the SDCA regression trainer. Note that we normalize the 'FeatureVector' right here in - // the same call. - Prediction: mlContext.Regression.Trainers.Sdca(label: r.Target, features: r.FeatureVector.Normalize()))); - -// Train the pipeline. -var model = learningPipeline.Fit(trainData); - -// Preview -var dataPreview = model.Preview(); - -``` - - -## How do I look at the intermediate data? - -Oftentimes, when we construct the experiment, we want to make sure that the data processing 'up to a certain moment' produces the results that we want. 
-With ML.NET it is not very easy to do: since all ML.NET operations are lazy, the objects we construct are just 'promises' of data. - -We will need to create the cursor and scan the data to obtain the actual values. One way to do this is to use [schema comprehension](SchemaComprehension.md) and map the data to an `IEnumerable` of user-defined objects. - -Another mechanism that lets you inspect the intermediate data is the `GetColumn` extension method. It lets you look at the contents of one column of your data in the form of an `IEnumerable`. - -Here is all of this in action: - -[Example file](../../test/data/adult.tiny.with-schema.txt): -``` -Label Workclass education marital-status -0 Private 11th Never-married -0 Private HS-grad Married-civ-spouse -1 Local-gov Assoc-acdm Married-civ-spouse -1 Private Some-college Married-civ-spouse - -``` - -```csharp -// Create the loader: define the data columns and where to find them in the text file. -var loader = mlContext.Data.CreateTextLoader(ctx => ( - // A boolean column depicting the 'target label'. - IsOver50K: ctx.LoadBool(0), - // Three text columns. - Workclass: ctx.LoadText(1), - Education: ctx.LoadText(2), - MaritalStatus: ctx.LoadText(3)), - hasHeader: true); - -// Start creating our processing pipeline. For now, let's just concatenate all the text columns -// together into one. -var dataPipeline = loader.MakeNewEstimator() - .Append(row => ( - row.IsOver50K, - AllFeatures: row.Workclass.ConcatWith(row.Education, row.MaritalStatus) - )); - -// Let's verify that the data has been loaded correctly. -// First, we load the data file. -var data = loader.Load(dataPath); - -// Fit our data pipeline and transform data with it. -var transformedData = dataPipeline.Fit(data).Transform(data); - -// 'transformedData' is a 'promise' of data. Let's actually load it. -var someRows = mlContext - // Convert to an enumerable of user-defined type. 
- .CreateEnumerable(transformedData.AsDynamic, reuseRowObject: false) - // Take a couple values as an array. - .Take(4).ToArray(); - -// Extract the 'AllFeatures' column. -// This will give the entire dataset: make sure to only take several rows -// in case the dataset is huge. -var featureColumns = transformedData.GetColumn(r => r.AllFeatures) - .Take(20).ToArray(); - -``` - -## How do I train a regression model? - -Generally, in order to train any model in ML.NET, you will go through three steps: -1. Figure out how the training data gets into ML.NET in the form of an `IDataView` -2. Build the 'learning pipeline' as a sequence of elementary 'operators' (estimators). -3. Call `Fit` on the pipeline to obtain the trained model. - -[Example file](../../test/data/generated_regression_dataset.csv): -``` -feature_0;feature_1;feature_2;feature_3;feature_4;feature_5;feature_6;feature_7;feature_8;feature_9;feature_10;target --2.75;0.77;-0.61;0.14;1.39;0.38;-0.53;-0.50;-2.13;-0.39;0.46;140.66 --0.61;-0.37;-0.12;0.55;-1.00;0.84;-0.02;1.30;-0.24;-0.50;-2.12;148.12 --0.85;-0.91;1.81;0.02;-0.78;-1.41;-1.09;-0.65;0.90;-0.37;-0.22;402.20 -``` - -In the file above, the last column (12th) is the label that we predict, and all the preceding ones are features. - -```csharp -// Step one: load the data as an IDataView. -// First, we define the loader: specify the data columns and where to find them in the text file. -var loader = mlContext.Data.CreateTextLoader(ctx => ( - // We load the first 11 values as a single float vector. - FeatureVector: ctx.LoadFloat(0, 10), - // Separately, load the target variable. - Target: ctx.LoadFloat(11) - ), - // The data file has a header. - hasHeader: true, - // Default separator is tab, but we need a semicolon. - separatorChar: ';'); - - -// Now load the file (remember though, loaders are lazy, so the actual loading will happen when the data is accessed). 
-var trainData = loader.Load(trainDataPath); - -// Sometimes, caching data in-memory after its first access can save some loading time when the data is going to be used -// several times somewhere. The caching mechanism is also lazy; it only caches things after being used. -// Users can replace all the subsequent uses of "trainData" with "cachedTrainData". We still use "trainData" because -// a caching step, which provides the same caching function, will be inserted in the considered "learningPipeline." -var cachedTrainData = trainData.Cache(); - -// Step two: define the learning pipeline. - -// We 'start' the pipeline with the output of the loader. -var learningPipeline = loader.MakeNewEstimator() - // We add a step for caching data in memory so that the downstream iterative training - // algorithm can efficiently scan through the data multiple times. Otherwise, the following - // trainer will load data from disk multiple times. The caching mechanism uses an on-demand strategy. - // The data accessed in any downstream step will be cached since its first use. In general, you only - // need to add a caching step before a trainable step, because caching is not helpful if the data is - // only scanned once. This step can be removed if the user doesn't have enough memory to store the whole - // data set. - .AppendCacheCheckpoint() - // Now we can add any 'training steps' to it. In our case we want to 'normalize' the data (rescale to be - // between -1 and 1 for all examples) - .Append(r => ( - // Retain the 'Target' column for evaluation purposes. - r.Target, - // We choose the SDCA regression trainer. Note that we normalize the 'FeatureVector' right here in - // the same call. - Prediction: mlContext.Regression.Trainers.Sdca(label: r.Target, features: r.FeatureVector.Normalize()))); - -// Step three. Fit the pipeline to the training data. -var model = learningPipeline.Fit(trainData); -``` - -## How do I verify the model quality? 
- -This is the first question that arises after you train the model: how good it actually is? -For each of the machine learning tasks, there is a set of 'metrics' that can describe how good the model is: it could be log-loss or F1 score for classification, RMS or L1 loss for regression etc. - -You can use the corresponding 'context' of the task to evaluate the model. - -Assuming the example above was used to train the model, here's how you calculate the metrics. -```csharp -// Load the test dataset. -var testData = loader.Load(testDataPath); -// Calculate metrics of the model on the test data. -var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: r => r.Target, score: r => r.Prediction); -``` - -## How do I save and load the model? - -Assuming that the model metrics look good to you, it's time to 'operationalize' the model. -This is where ML.NET really shines: the `model` object you just built is ready for immediate consumption, it will apply all the same steps that it has 'learned' during training, and it can be persisted and reused in different environments. - -Here's what you do to save the model to a file, and reload it (potentially in a different context). - -```csharp -// Saving and loading happens to 'dynamic' models, so the static typing is lost in the process. -mlContext.Model.Save(model.AsDynamic, trainData.AsDynamic.Schema, modelPath); - -// Potentially, the lines below can be in a different process altogether. - -// When you load the model, it's a 'dynamic' transformer. -ITransformer loadedModel = mlContext.Model.Load(modelPath, out var schema); -``` - -## How do I use the model to make one prediction? - -Since any ML.NET model is a transformer, you can of course use `model.Transform` to apply the model to the 'data view' and obtain predictions this way. - -A more typical case, though, is when there is no 'dataset' that we want to predict on, but instead we receive one example at a time. 
-For instance, we run the model as part of the ASP.NET website, and we need to make a prediction for an incoming HTTP request. - -For this case, ML.NET offers a convenient `PredictionEngine` component, that essentially runs one example at a time through the prediction pipeline. - -Here is the full example. Let's imagine that we have built a model for the famous Iris prediction dataset: - -```csharp -// Step one: load the data as an IDataView. -// First, we define the loader: specify the data columns and where to find them in the text file. -var loader = mlContext.Data.CreateTextLoader(ctx => ( - // The four features of the Iris dataset. - SepalLength: ctx.LoadFloat(0), - SepalWidth: ctx.LoadFloat(1), - PetalLength: ctx.LoadFloat(2), - PetalWidth: ctx.LoadFloat(3), - // Label: kind of iris. - Label: ctx.LoadText(4) - ), - // Default separator is tab, but the dataset has comma. - separatorChar: ','); - -// Retrieve the training data. -var trainData = loader.Load(irisDataPath); - -// Build the training pipeline. -var learningPipeline = loader.MakeNewEstimator() - .Append(r => ( - r.Label, - // Concatenate all the features together into one column 'Features'. - Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth))) - // We add a step for caching data in memory so that the downstream iterative training - // algorithm can efficiently scan through the data multiple times. Otherwise, the following - // trainer will load data from disk multiple times. The caching mechanism uses an on-demand strategy. - // The data accessed in any downstream step will be cached since its first use. In general, you only - // need to add a caching step before trainable step, because caching is not helpful if the data is - // only scanned once. - .AppendCacheCheckpoint() - .Append(r => ( - r.Label, - // Train the multi-class SDCA model to predict the label using features. - // Note that the label is a text, so it needs to be converted to key using 'ToKey' estimator. 
- Predictions: mlContext.MulticlassClassification.Trainers.Sdca(r.Label.ToKey(), r.Features))) - // Apply the inverse conversion from 'predictedLabel' key back to string value. - // Note that the final output column is only one, and we didn't assign a name to it. - // In this case, ML.NET auto-assigns the name 'Data' to the produced column. - .Append(r => r.Predictions.predictedLabel.ToValue()); - -// Train the model. -var model = learningPipeline.Fit(trainData); -``` - -Now, in order to use [schema comprehension](SchemaComprehension.md) for prediction, we define a pair of classes like following: -```csharp -private class IrisInput -{ - // Unfortunately, we still need the dummy 'Label' column to be present. - [ColumnName("Label")] - public string IgnoredLabel { get; set; } - public float SepalLength { get; set; } - public float SepalWidth { get; set; } - public float PetalLength { get; set; } - public float PetalWidth { get; set; } -} - -private class IrisPrediction -{ - [ColumnName("Data")] - public string PredictedClass { get; set; } -} -``` - -The prediction code now looks as follows: -```csharp -// -var model = learningPipeline.Fit(trainData).AsDynamic; - -// Use the model for one-time prediction. -// Make the prediction function object. Note that, on average, this call takes around 200x longer -// than one prediction, so you might want to cache and reuse the prediction function, instead of -// creating one per prediction. -var predictionFunc = model.CreatePredictionEngine(mlContext); - -// Obtain the prediction. Remember that 'Predict' is not reentrant. If you want to use multiple threads -// for simultaneous prediction, make sure each thread is using its own PredictionEngine. -var prediction = predictionFunc.Predict(new IrisInput -{ - SepalLength = 4.1f, - SepalWidth = 0.1f, - PetalLength = 3.2f, - PetalWidth = 1.4f -}); -``` - -## What if my training data is not in a text file? 
- -The commonly demonstrated use case for ML.NET is when the training data resides somewhere on disk, and we use the `TextLoader` to load it. -However, in real-time training scenarios the training data can be elsewhere: in a bunch of SQL tables, extracted from log files, or even generated on the fly. - -Here is how we can use [schema comprehension](SchemaComprehension.md) to bring an existing C# `IEnumerable` into ML.NET as a data view. - -For the purpose of this example, we will assume that we build the customer churn prediction model, and we can extract the following features from our production system: -- Customer ID (ignored by the model) -- Whether the customer has churned (the target 'label') -- The 'demographic category' (one string, like 'young adult' etc.) -- The number of visits from the last 5 days. -```csharp -private class CustomerChurnInfo -{ - public string CustomerID { get; set; } - public bool HasChurned { get; set; } - public string DemographicCategory { get; set; } - // Visits during last 5 days, latest to newest. - [VectorType(5)] - public float[] LastVisits { get; set; } -} -``` - -Given this information, here's how we turn this data into the ML.NET data view and train on it: -```csharp -// Step one: load the data as an IDataView. -// Let's assume that 'GetChurnData()' fetches and returns the training data from somewhere. -IEnumerable churnData = GetChurnInfo(); - -// Turn the data into the ML.NET data view. -// We can use CreateDataView or CreateStreamingDataView, depending on whether 'churnData' is an IList, -// or merely an IEnumerable. -var trainData = mlContext.CreateStreamingDataView(churnData); - -// Build the learning pipeline. -// In our case, we will one-hot encode the demographic category, and concatenate that with the number of visits. -// We apply our FastTree binary classifier to predict the 'HasChurned' label. - -// First, transition to the statically-typed data view. 
-var staticData = trainData.AssertStatic(mlContext, c => ( - HasChurned: c.Bool.Scalar, - DemographicCategory: c.Text.Scalar, - LastVisits: c.R4.Vector)); - -// Build the pipeline, same as the one above. -var staticPipeline = staticData.MakeNewEstimator() - .Append(r => ( - r.HasChurned, - Features: r.DemographicCategory.OneHotEncoding().ConcatWith(r.LastVisits))) - .AppendCacheCheckpoint() // FastTree will benefit from caching data in memory. - .Append(r => mlContext.BinaryClassification.Trainers.FastTree(r.HasChurned, r.Features, numTrees: 20)); - -var staticModel = staticPipeline.Fit(staticData); - -// Note that dynamicModel should be the same as staticModel.AsDynamic (give or take random variance from -// the training procedure). -``` - -## I want to look at my model's coefficients - -Oftentimes, once a model is trained, we are also interested on 'what it has learned'. - -For example, if the linear model assigned zero weight to a feature that we consider important, it could indicate some problem with modeling. -The weights of the linear model can also be used as a poor man's estimation of 'feature importance'. - -In the static pipeline API, we provide a set of `onFit` delegates that allow introspection of the individual transformers as they are trained. - -This is how we can extract the learned parameters out of the model that we trained: -```csharp - -// Step one: load the data as an IDataView. -// First, we define the loader: specify the data columns and where to find them in the text file. -var loader = mlContext.Data.CreateTextLoader(ctx => ( - // The four features of the Iris dataset. - SepalLength: ctx.LoadFloat(0), - SepalWidth: ctx.LoadFloat(1), - PetalLength: ctx.LoadFloat(2), - PetalWidth: ctx.LoadFloat(3), - // Label: kind of iris. - Label: ctx.LoadText(4) - ), - // Default separator is tab, but the dataset has comma. - separatorChar: ','); - -// Retrieve the training data. 
-var trainData = loader.Load(dataPath); - -// This is the predictor ('weights collection') that we will train. -MulticlassLogisticRegressionPredictor predictor = null; -// And these are the normalizer scales that we will learn. -ImmutableArray normScales; -// Build the training pipeline. -var learningPipeline = loader.MakeNewEstimator() - .Append(r => ( - r.Label, - // Concatenate all the features together into one column 'Features'. - Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth))) - .Append(r => ( - r.Label, - // Normalize (rescale) the features to be between -1 and 1. - Features: r.Features.Normalize( - // When the normalizer is trained, the below delegate is going to be called. - // We use it to memorize the scales. - onFit: (scales, offsets) => normScales = scales))) - // Cache data used in memory because the subsequently trainer needs to access the data multiple times. - .AppendCacheCheckpoint() - .Append(r => ( - r.Label, - // Train the multi-class SDCA model to predict the label using features. - // Note that the label is a text, so it needs to be converted to key using 'ToKey' estimator. - Predictions: mlContext.MulticlassClassification.Trainers.Sdca(r.Label.ToKey(), r.Features, - // When the model is trained, the below delegate is going to be called. - // We use that to memorize the predictor object. - onFit: p => predictor = p))); - -// Train the model. During this call our 'onFit' delegate will be invoked, -// and our 'predictor' will be set. -var model = learningPipeline.Fit(trainData); - -// Now we can use 'predictor' to look at the weights. -// 'weights' will be an array of weight vectors, one vector per class. -// Our problem has 3 classes, so numClasses will be 3, and weights will contain -// 3 vectors (of 4 values each). -VBuffer[] weights = null; -predictor.GetWeights(ref weights, out int numClasses); - -// Similarly we can also inspect the biases for the 3 classes. 
-var biases = predictor.GetBiases(); - -// Inspect the normalizer scales. -Console.WriteLine(string.Join(" ", normScales)); -``` - -## What is normalization and why do I need to care? - -In ML.NET we expose a number of [parametric and non-parametric algorithms](https://machinelearningmastery.com/parametric-and-nonparametric-machine-learning-algorithms/). - -Typically, parametric learners hold certain assumptions about the training data, and if they are not met, the training is greatly hampered (or sometimes becomes completely impossible). - -Most commonly, the assumptions are that -- All the features have values roughly on the same scale; -- Feature values are not too large, and not too small. - -Violating the first assumption above can cause the learner to train a sub-optimal model (or even a completely useless one). -Violating the second assumption can cause arithmetic error accumulation, which typically breaks the training process altogether. - -As a general rule, *if you use a parametric learner, you need to make sure your training data is correctly scaled*. - -ML.NET offers several built-in scaling algorithms, or 'normalizers': -- MinMax normalizer: for each feature, we learn the minimum and maximum value of it, and then linearly rescale it so that the values fit between -1 and 1. -- MeanVar normalizer: for each feature, compute the mean and variance, and then linearly rescale it to zero-mean, unit-variance. -- CDF normalizer: for each feature, compute the mean and variance, and then replace each value `x` with `Cdf(x)`, where `Cdf` is the cumulative density function of normal distribution with these mean and variance. -- Binning normalizer: discretize the feature value into `N` 'buckets', and then replace each value with the index of the bucket, divided by `N-1`. - -These normalizers all have different properties and tradeoffs, but it's not *that* big of a deal if you use one over another. 
Just make sure you use a normalizer when training linear models or other parametric models. - -An important parameter of ML.NET normalizers is called `fixZero`. If `fixZero` is true, zero input is always mapped to zero output. This is very important when you handle sparse data: if we don't preserve zeroes, we will turn all sparse data into dense, which is usually a bad idea. - -It is a good practice to include the normalizer directly in the ML.NET learning pipeline: this way you are sure that the normalization -- is only trained on the training data, and not on your test data, -- is correctly applied to all the new incoming data, without the need for extra pre-processing at prediction time. - -Here's a snippet of code that demonstrates normalization in learning pipelines. It assumes the Iris dataset: -```csharp -// Define the loader: specify the data columns and where to find them in the text file. -var loader = mlContext.Data.CreateTextLoader(ctx => ( - // The four features of the Iris dataset will be grouped together as one Features column. - Features: ctx.LoadFloat(0, 3), - // Label: kind of iris. - Label: ctx.LoadText(4) - ), - // Default separator is tab, but the dataset has comma. - separatorChar: ','); - -// Load the training data. -var trainData = loader.Load(dataPath); - -// Apply all kinds of standard ML.NET normalization to the raw features. -var pipeline = loader.MakeNewEstimator() - .Append(r => ( - MinMaxNormalized: r.Features.Normalize(fixZero: true), - MeanVarNormalized: r.Features.NormalizeByMeanVar(fixZero: false), - CdfNormalized: r.Features.NormalizeByCumulativeDistribution(), - BinNormalized: r.Features.NormalizeByBinning(maxBins: 256) - )); - -// Let's train our pipeline of normalizers, and then apply it to the same data. -var normalizedData = pipeline.Fit(trainData).Transform(trainData); - -// Inspect one column of the resulting dataset. 
-var meanVarValues = normalizedData.GetColumn(r => r.MeanVarNormalized).ToArray(); -``` - -## How do I train my model on categorical data? - -Generally speaking, *all ML.NET learners expect the features as a float vector*. So, if some of your data is not natively a float, you will need to convert to floats. - -If our data contains 'categorical' features (think 'enum'), we need to 'featurize' them somehow. ML.NET offers several ways of converting categorical data to features: -- One-hot encoding -- Hash-based one-hot encoding -- Binary encoding (convert category index into a bit sequence and use bits as features) - -If some of the categories are very high-cardinality (there's lots of different values, but only several are commonly occurring), a one-hot encoding can be wasteful. -We can use count-based feature selection to trim down the number of slots that we encode. - -Same with normalization, it's a good practice to include categorical featurization directly in the ML.NET learning pipeline: this way you are sure that the categorical transformation -- is only 'trained' on the training data, and not on your test data, -- is correctly applied to all the new incoming data, without the need for extra pre-processing at prediction time. 
- -Below is an example of categorical handling for the [adult census dataset](../../test/data/adult.tiny.with-schema.txt): -``` -Label Workclass education marital-status occupation relationship ethnicity sex native-country-region age fnlwgt education-num capital-gain capital-loss hours-per-week -0 Private 11th Never-married Machine-op-inspct Own-child Black Male United-States 25 226802 7 0 0 40 -0 Private HS-grad Married-civ-spouse Farming-fishing Husband White Male United-States 38 89814 9 0 0 50 -1 Local-gov Assoc-acdm Married-civ-spouse Protective-serv Husband White Male United-States 28 336951 12 0 0 40 -1 Private Some-college Married-civ-spouse Machine-op-inspct Husband Black Male United-States 44 160323 10 7688 0 40 -``` - -```csharp - -// Define the loader: specify the data columns and where to find them in the text file. -var loader = mlContext.Data.CreateTextLoader(ctx => ( - Label: ctx.LoadBool(0), - // We will load all the categorical features into one vector column of size 8. - CategoricalFeatures: ctx.LoadText(1, 8), - // Similarly, load all numerical features into one vector of size 6. - NumericalFeatures: ctx.LoadFloat(9, 14), - // Let's also separately load the 'Workclass' column. - Workclass: ctx.LoadText(1) - ), hasHeader: true); - -// Load the data. -var data = loader.Load(dataPath); - -// Inspect the categorical columns to check that they are correctly loaded. -var catColumns = data.GetColumn(r => r.CategoricalFeatures).Take(10).ToArray(); - -// Build several alternative featurization pipelines. -var learningPipeline = loader.MakeNewEstimator() - // Cache data in memory in an on-demand manner. Columns used in any downstream step will be - // cached in memory at their first use. This step can be removed if the user's machine doesn't - // have enough memory. - .AppendCacheCheckpoint() - .Append(r => ( - r.Label, - r.NumericalFeatures, - // Convert each categorical feature into one-hot encoding independently. 
- CategoricalOneHot: r.CategoricalFeatures.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Ind), - // Convert all categorical features into indices, and build a 'word bag' of these. - CategoricalBag: r.CategoricalFeatures.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bag), - // One-hot encode the workclass column, then drop all the categories that have fewer than 10 instances in the train set. - WorkclassOneHotTrimmed: r.Workclass.OneHotEncoding().SelectFeaturesBasedOnCount(count: 10) - )); - -// Let's train our pipeline, and then apply it to the same data. -var transformedData = learningPipeline.Fit(data).Transform(data); - -// Inspect some columns of the resulting dataset. -var categoricalBags = transformedData.GetColumn(x => x.CategoricalBag).Take(10).ToArray(); -var workclasses = transformedData.GetColumn(x => x.WorkclassOneHotTrimmed).Take(10).ToArray(); - -// Of course, if we want to train the model, we will need to compose a single float vector of all the features. -// Here's how we could do this: - -var fullLearningPipeline = learningPipeline - .Append(r => ( - r.Label, - // Concatenate two of the 3 categorical pipelines, and the numeric features. - Features: r.NumericalFeatures.ConcatWith(r.CategoricalBag, r.WorkclassOneHotTrimmed))) - // Now we're ready to train. We chose our FastTree trainer for this classification task. - .Append(r => mlContext.BinaryClassification.Trainers.FastTree(r.Label, r.Features, numTrees: 50)); - -// Train the model. -var model = fullLearningPipeline.Fit(data); -``` - -## How do I train my model on textual data? - -Generally speaking, *all ML.NET learners expect the features as a float vector*. So, if some of your data is not natively a float, you will need to convert to floats. - -If we want to learn on textual data, we need to 'extract features' out of the texts. There is an entire research area of NLP (Natural Language Processing) that handles this. 
In ML.NET we offer some basic mechanisms of text feature extraction: -- Text normalization (removing punctuation, diacritics, switching to lowercase etc.) -- Separator-based tokenization. -- Stopword removal. -- Ngram and skip-gram extraction. -- TF-IDF rescaling. -- Bag of words conversion. - -ML.NET offers a "one-stop shop" operation called `TextFeaturizer`, that runs a combination of above steps as one big 'text featurization'. We have tested it extensively on text datasets, and we're confident that it performs reasonably well without the need to deep-dive into the operations. - -However, we also offer a selection of elementary operations that let you customize your NLP processing. Here's the example below where we use them. - -Wikipedia detox dataset: -``` -Sentiment SentimentText -1 Stop trolling, zapatancas, calling me a liar merely demonstartes that you arer Zapatancas. You may choose to chase every legitimate editor from this site and ignore me but I am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. The consensus is overwhelmingly against you and your trollin g lover Zapatancas, -1 ::::: Why are you threatening me? I'm not being disruptive, its you who is being disruptive. -0 " *::Your POV and propaganda pushing is dully noted. However listing interesting facts in a netral and unacusitory tone is not POV. You seem to be confusing Censorship with POV monitoring. I see nothing POV expressed in the listing of intersting facts. If you want to contribute more facts or edit wording of the cited fact to make them sound more netral then go ahead. No need to CENSOR interesting factual information. " -0 ::::::::This is a gross exaggeration. Nobody is setting a kangaroo court. There was a simple addition concerning the airline. It is the only one disputed here. -``` - -```csharp -// Define the loader: specify the data columns and where to find them in the text file. 
-var loader = mlContext.Data.CreateTextLoader(ctx => ( - IsToxic: ctx.LoadBool(0), - Message: ctx.LoadText(1) - ), hasHeader: true); - -// Load the data. -var data = loader.Load(dataPath); - -// Inspect the message texts that are load from the file. -var messageTexts = data.GetColumn(x => x.Message).Take(20).ToArray(); - -// Apply various kinds of text operations supported by ML.NET. -var learningPipeline = loader.MakeNewEstimator() - // Cache data in memory in an on-demand manner. Columns used in any downstream step will be - // cached in memory at their first uses. This step can be removed if user's machine doesn't - // have enough memory. - .AppendCacheCheckpoint() - .Append(r => ( - // One-stop shop to run the full text featurization. - TextFeatures: r.Message.FeaturizeText(), - - // NLP pipeline 1: bag of words. - BagOfWords: r.Message.NormalizeText().ToBagofWords(), - - // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices. - BagOfBigrams: r.Message.NormalizeText().ToBagofHashedWords(ngramLength: 2, allLengths: false), - - // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. - BagOfTrichar: r.Message.TokenizeIntoCharacters().ToNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), - - // NLP pipeline 4: word embeddings. - Embeddings: r.Message.NormalizeText().TokenizeText().WordEmbeddings(WordEmbeddingsExtractorTransformer.PretrainedModelKind.GloVeTwitter25D) - )); - -// Let's train our pipeline, and then apply it to the same data. -// Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train. -var transformedData = learningPipeline.Fit(data).Transform(data); - -// Inspect some columns of the resulting dataset. -var embeddings = transformedData.GetColumn(x => x.Embeddings).Take(10).ToArray(); -var unigrams = transformedData.GetColumn(x => x.BagOfWords).Take(10).ToArray(); -``` - -## How do I train using cross-validation? 
- -[Cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)) is a useful technique for ML applications. -It helps estimate the variance of the model quality from one run to another and also eliminates the need to extract a separate test set for evaluation. - -There are a couple pitfalls that await us when we implement our own cross-validation. -Essentially, if we are not careful, we may introduce label leakage in the process, so our metrics could become over-inflated. - -- It is tempting to apply the same pre-processing to the entire data, and then just cross-validate the final training of the model. -If we do this for data-dependent, 'trainable' pre-processing (like text featurization, categorical handling and normalization/rescaling), we cause these processing steps to 'train' on the union of train subset and test subset, thus causing label leakage. -The correct way is to apply pre-processing independently for each 'fold' of the cross-validation. -- In many cases there is a natural 'grouping' of the data that needs to be respected. -For example, if we are solving a click prediction problem, it's a good idea to group all examples pertaining to one URL to appear in one-fold of the data. -If they end up separated, we can introduce label leakage. - -ML.NET guards us against both these pitfalls: it will automatically apply the featurization correctly (as long as all of the preprocessing resides in one learning pipeline), and we can use the 'stratification column' concept to make sure that related examples don't get separated. - -Here's an example of training on Iris dataset using randomized 90/10 train-test split, as well as a 5-fold cross-validation: -```csharp -// Step one: load the data as an IDataView. -// First, we define the loader: specify the data columns and where to find them in the text file. -var loader = mlContext.Data.CreateTextLoader(ctx => ( - // The four features of the Iris dataset. 
- SepalLength: ctx.LoadFloat(0), - SepalWidth: ctx.LoadFloat(1), - PetalLength: ctx.LoadFloat(2), - PetalWidth: ctx.LoadFloat(3), - // Label: kind of iris. - Label: ctx.LoadText(4) - ), - // Default separator is tab, but the dataset has comma. - separatorChar: ','); - -// Load the data. -var data = loader.Load(dataPath); - -// Build the training pipeline. -var learningPipeline = loader.MakeNewEstimator() - .Append(r => ( - // Convert string label to a key. - Label: r.Label.ToKey(), - // Concatenate all the features together into one column 'Features'. - Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth))) - // Add a step for caching data in memory so that the downstream iterative training - // algorithm can efficiently scan through the data multiple times. - .AppendCacheCheckpoint() - .Append(r => ( - r.Label, - // Train the multi-class SDCA model to predict the label using features. - Predictions: mlContext.MulticlassClassification.Trainers.Sdca(r.Label, r.Features))); - -// Split the data 90:10 into train and test sets, train and evaluate. -var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); - -// Train the model. -var model = learningPipeline.Fit(trainData); -// Compute quality metrics on the test set. -var metrics = mlContext.MulticlassClassification.Evaluate(model.Transform(testData), r => r.Label, r => r.Predictions); -Console.WriteLine(metrics.AccuracyMicro); - -// Now run the 5-fold cross-validation experiment, using the same pipeline. -var cvResults = mlContext.MulticlassClassification.CrossValidate(data, learningPipeline, r => r.Label, numFolds: 5); - -// The results object is an array of 5 elements. For each of the 5 folds, we have metrics, model and scored test data. -// Let's compute the average micro-accuracy. -var microAccuracies = cvResults.Select(r => r.metrics.AccuracyMicro); -Console.WriteLine(microAccuracies.Average()); -``` - -## Can I mix and match static and dynamic pipelines? 
- -Yes, we can have both of them in our codebase. The static pipelines are just a statically-typed way to build dynamic pipelines. - -Namely, any statically typed component (`DataView`, `Transformer`, `Estimator`) has its dynamic counterpart as an `AsDynamic` property. - -Transitioning from dynamic to static types is more costly: we have to formally declare what is the 'schema shape'. Or, in case of estimators and transformers, what is the input and output schema shape. - -We can do this via `AssertStatic` extensions, as demonstrated in the following example, where we mix and match static and dynamic pipelines. -```c# -// Load the data as an IDataView. -// First, we define the loader: specify the data columns and where to find them in the text file. -var loader = mlContext.Data.CreateTextLoader(ctx => ( - // The four features of the Iris dataset. - SepalLength: ctx.LoadFloat(0), - SepalWidth: ctx.LoadFloat(1), - PetalLength: ctx.LoadFloat(2), - PetalWidth: ctx.LoadFloat(3), - // Label: kind of iris. - Label: ctx.LoadText(4) - ), - // Default separator is tab, but the dataset has comma. - separatorChar: ','); - -// Load the data. -var data = loader.Load(dataPath); - -// Build the pre-processing pipeline. -var learningPipeline = loader.MakeNewEstimator() - .Append(r => ( - // Convert string label to a key. - Label: r.Label.ToKey(), - // Concatenate all the features together into one column 'Features'. - Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth))); - -// Let's append the OVA learner to the dynamic pipeline. -IEstimator dynamicPipe = learningPipeline.AsDynamic; - -// Create a binary classification trainer. -var binaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron("Label", "Features"); - -// Append the OVA learner to the pipeline. -dynamicPipe = dynamicPipe.Append(new Ova(mlContext, binaryTrainer)); - -// At this point, we have a choice. 
We could continue working with the dynamically-typed pipeline, and -// ultimately call dynamicPipe.Fit(data.AsDynamic) to get the model, or we could go back into the static world. -// Here's how we go back to the static pipeline: -var staticFinalPipe = dynamicPipe.AssertStatic(mlContext, - // Declare the shape of the input. As you can see, it's identical to the shape of the loader: - // four float features and a string label. - c => ( - SepalLength: c.R4.Scalar, - SepalWidth: c.R4.Scalar, - PetalLength: c.R4.Scalar, - PetalWidth: c.R4.Scalar, - Label: c.Text.Scalar), - // Declare the shape of the output (or a relevant subset of it). - // In our case, we care only about the predicted label column (a key type), and scores (vector of floats). - c => ( - Score: c.R4.Vector, - // Predicted label is a key backed by uint, with text values (since original labels are text). - PredictedLabel: c.KeyU4.TextValues.Scalar)) - // Convert the predicted label from key back to the original string value. - .Append(r => r.PredictedLabel.ToValue()); - -// Train the model in a statically typed way. -var model = staticFinalPipe.Fit(data); - -// And here is how we could've stayed in the dynamic pipeline and train that way. -dynamicPipe = dynamicPipe.Append(new KeyToValueEstimator(mlContext, "PredictedLabel")); -var dynamicModel = dynamicPipe.Fit(data.AsDynamic); - -// Now 'dynamicModel', and 'model.AsDynamic' are equivalent. 
-``` \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index 2de093e8e0..be5f9753b0 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -10,16 +10,15 @@ - + - @@ -73,11 +72,6 @@ - - false - Analyzer - - diff --git a/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs deleted file mode 100644 index c5eb417fdd..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs +++ /dev/null @@ -1,105 +0,0 @@ -using System; -using Microsoft.ML; -using Microsoft.ML.StaticPipe; - -namespace Samples.Static -{ - public class AveragedPerceptronBinaryClassificationExample - { - public static void Example() - { - // Downloading a classification dataset from github.com/dotnet/machinelearning. - // It will be stored in the same path as the executable - string dataFilePath = Microsoft.ML.SamplesUtils.DatasetUtils.DownloadAdultDataset(); - - // Data Preview - // 1. Column [Label]: IsOver50K (boolean) - // 2. Column: workclass (text/categorical) - // 3. Column: education (text/categorical) - // 4. Column: marital-status (text/categorical) - // 5. Column: occupation (text/categorical) - // 6. Column: relationship (text/categorical) - // 7. Column: ethnicity (text/categorical) - // 8. Column: sex (text/categorical) - // 9. Column: native-country-region (text/categorical) - // 10. Column: age (numeric) - // 11. Column: fnlwgt (numeric) - // 12. Column: education-num (numeric) - // 13. Column: capital-gain (numeric) - // 14. Column: capital-loss (numeric) - // 15. 
Column: hours-per-week (numeric) - - // Creating the ML.Net IHostEnvironment object, needed for the pipeline - var mlContext = new MLContext(); - - // Creating Data Loader with the initial schema based on the format of the data - var loader = TextLoaderStatic.CreateLoader( - mlContext, - c => ( - Age: c.LoadFloat(0), - Workclass: c.LoadText(1), - Fnlwgt: c.LoadFloat(2), - Education: c.LoadText(3), - EducationNum: c.LoadFloat(4), - MaritalStatus: c.LoadText(5), - Occupation: c.LoadText(6), - Relationship: c.LoadText(7), - Ethnicity: c.LoadText(8), - Sex: c.LoadText(9), - CapitalGain: c.LoadFloat(10), - CapitalLoss: c.LoadFloat(11), - HoursPerWeek: c.LoadFloat(12), - NativeCountry: c.LoadText(13), - IsOver50K: c.LoadBool(14)), - separator: ',', - hasHeader: true); - - // Load the data, and leave 10% out, so we can use them for testing - var data = loader.Load(dataFilePath); - var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); - - // Create the Estimator - var learningPipeline = loader.MakeNewEstimator() - .Append(row => ( - Features: row.Age.ConcatWith( - row.EducationNum, - row.MaritalStatus.OneHotEncoding(), - row.Occupation.OneHotEncoding(), - row.Relationship.OneHotEncoding(), - row.Ethnicity.OneHotEncoding(), - row.Sex.OneHotEncoding(), - row.HoursPerWeek, - row.NativeCountry.OneHotEncoding().SelectFeaturesBasedOnCount(count: 10)), - Label: row.IsOver50K)) - .Append(row => ( - Features: row.Features.Normalize(), - Label: row.Label, - Score: mlContext.BinaryClassification.Trainers.AveragedPerceptron( - row.Label, - row.Features, - learningRate: 0.1f, - numIterations: 100))) - .Append(row => ( - Label: row.Label, - Score: row.Score, - PredictedLabel: row.Score.predictedLabel)); - - // Fit this Pipeline to the Training Data - var model = learningPipeline.Fit(trainData); - - // Evaluate how the model is doing on the test data - var dataWithPredictions = model.Transform(testData); - - var metrics = 
mlContext.BinaryClassification.Evaluate(dataWithPredictions, row => row.Label, row => row.Score); - - Console.WriteLine($"Accuracy: {metrics.Accuracy}"); // 0.83 - Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve}"); // 0.88 - Console.WriteLine($"F1 Score: {metrics.F1Score}"); // 0.63 - - Console.WriteLine($"Negative Precision: {metrics.NegativePrecision}"); // 0.89 - Console.WriteLine($"Negative Recall: {metrics.NegativeRecall}"); // 0.89 - Console.WriteLine($"Positive Precision: {metrics.PositivePrecision}"); // 0.64 - Console.WriteLine($"Positive Recall: {metrics.PositiveRecall}"); // 0.62 - } - } -} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs deleted file mode 100644 index cda4a54ef8..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs +++ /dev/null @@ -1,108 +0,0 @@ -using System; -using Microsoft.ML; -using Microsoft.ML.StaticPipe; - -namespace Samples.Static -{ - public class FastTreeBinaryClassificationExample - { - // This example requires installation of additional nuget package Microsoft.ML.FastTree. - public static void Example() - { - // Downloading a classification dataset from github.com/dotnet/machinelearning. - // It will be stored in the same path as the executable - string dataFilePath = Microsoft.ML.SamplesUtils.DatasetUtils.DownloadAdultDataset(); - - // Data Preview - // 1. Column [Label]: IsOver50K (boolean) - // 2. Column: workclass (text/categorical) - // 3. Column: education (text/categorical) - // 4. Column: marital-status (text/categorical) - // 5. Column: occupation (text/categorical) - // 6. Column: relationship (text/categorical) - // 7. Column: ethnicity (text/categorical) - // 8. Column: sex (text/categorical) - // 9. Column: native-country-region (text/categorical) - // 10. Column: age (numeric) - // 11. Column: fnlwgt (numeric) - // 12. 
Column: education-num (numeric) - // 13. Column: capital-gain (numeric) - // 14. Column: capital-loss (numeric) - // 15. Column: hours-per-week (numeric) - - // Creating the ML.Net IHostEnvironment object, needed for the pipeline - var mlContext = new MLContext(); - - // Creating Data Loader with the initial schema based on the format of the data - var loader = TextLoaderStatic.CreateLoader( - mlContext, - c => ( - Age: c.LoadFloat(0), - Workclass: c.LoadText(1), - Fnlwgt: c.LoadFloat(2), - Education: c.LoadText(3), - EducationNum: c.LoadFloat(4), - MaritalStatus: c.LoadText(5), - Occupation: c.LoadText(6), - Relationship: c.LoadText(7), - Ethnicity: c.LoadText(8), - Sex: c.LoadText(9), - CapitalGain: c.LoadFloat(10), - CapitalLoss: c.LoadFloat(11), - HoursPerWeek: c.LoadFloat(12), - NativeCountry: c.LoadText(13), - IsOver50K: c.LoadBool(14)), - separator: ',', - hasHeader: true); - - // Loader the data, and leave 10% out, so we can use them for testing - var data = loader.Load(dataFilePath); - var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); - - // Create the Estimator - var learningPipeline = loader.MakeNewEstimator() - .Append(row => ( - Features: row.Age.ConcatWith( - row.EducationNum, - row.MaritalStatus.OneHotEncoding(), - row.Occupation.OneHotEncoding(), - row.Relationship.OneHotEncoding(), - row.Ethnicity.OneHotEncoding(), - row.Sex.OneHotEncoding(), - row.HoursPerWeek, - row.NativeCountry.OneHotEncoding().SelectFeaturesBasedOnCount(count: 10)), - Label: row.IsOver50K)) - .Append(row => ( - Features: row.Features.Normalize(), - Label: row.Label, - Score: mlContext.BinaryClassification.Trainers.FastTree( - row.Label, - row.Features, - numberOfTrees: 100, // try: (int) 20-2000 - numberOfLeaves: 20, // try: (int) 2-128 - minimumExampleCountPerLeaf: 10, // try: (int) 1-100 - learningRate: 0.2))) // try: (float) 0.025-0.4 - .Append(row => ( - Label: row.Label, - Score: row.Score, - PredictedLabel: row.Score.predictedLabel)); - 
- // Fit this Pipeline to the Training Data - var model = learningPipeline.Fit(trainData); - - // Evaluate how the model is doing on the test data - var dataWithPredictions = model.Transform(testData); - - var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, row => row.Label, row => row.Score); - - Console.WriteLine($"Accuracy: {metrics.Accuracy}"); // 0.84 - Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve}"); // 0.89 - Console.WriteLine($"F1 Score: {metrics.F1Score}"); // 0.64 - - Console.WriteLine($"Negative Precision: {metrics.NegativePrecision}"); // 0.88 - Console.WriteLine($"Negative Recall: {metrics.NegativeRecall}"); // 0.91 - Console.WriteLine($"Positive Precision: {metrics.PositivePrecision}"); // 0.68 - Console.WriteLine($"Positive Recall: {metrics.PositiveRecall}"); // 0.60 - } - } -} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Static/FastTreeRegression.cs b/docs/samples/Microsoft.ML.Samples/Static/FastTreeRegression.cs deleted file mode 100644 index 66ca7176d4..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/FastTreeRegression.cs +++ /dev/null @@ -1,65 +0,0 @@ -using System; -using System.Linq; -using Microsoft.ML; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.Trainers.FastTree; - -namespace Samples.Static -{ - public class FastTreeRegressionExample - { - // This example requires installation of additional nuget package Microsoft.ML.FastTree. - public static void Example() - { - // Downloading a regression dataset from github.com/dotnet/machinelearning - // this will create a housing.txt file in the filsystem this code will run - // you can open the file to see the data. - string dataFile = Microsoft.ML.SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); - - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. 
- var mlContext = new MLContext(); - - // Creating a data loader, based on the format of the data - var loader = TextLoaderStatic.CreateLoader(mlContext, c => ( - label: c.LoadFloat(0), - features: c.LoadFloat(1, 6) - ), - separator: '\t', hasHeader: true); - - // Load the data, and leave 10% out, so we can use them for testing - var data = loader.Load(dataFile); - - // The predictor that gets produced out of training - FastTreeRegressionModelParameters pred = null; - - // Create the estimator - var learningPipeline = loader.MakeNewEstimator() - .Append(r => (r.label, score: mlContext.Regression.Trainers.FastTree( - r.label, - r.features, - numberOfTrees: 100, // try: (int) 20-2000 - numberOfLeaves: 20, // try: (int) 2-128 - minimumExampleCountPerLeaf: 10, // try: (int) 1-100 - learningRate: 0.2, // try: (float) 0.025-0.4 - onFit: p => pred = p) - ) - ); - - var cvResults = mlContext.Regression.CrossValidate(data, learningPipeline, r => r.label, numFolds: 5); - var averagedMetrics = ( - L1: cvResults.Select(r => r.metrics.MeanAbsoluteError).Average(), - L2: cvResults.Select(r => r.metrics.MeanSquaredError).Average(), - LossFn: cvResults.Select(r => r.metrics.LossFunction).Average(), - Rms: cvResults.Select(r => r.metrics.RootMeanSquaredError).Average(), - RSquared: cvResults.Select(r => r.metrics.RSquared).Average() - ); - Console.WriteLine($"L1 - {averagedMetrics.L1}"); // 3.091095 - Console.WriteLine($"L2 - {averagedMetrics.L2}"); // 20.351073 - Console.WriteLine($"LossFunction - {averagedMetrics.LossFn}"); // 20.351074 - Console.WriteLine($"RMS - {averagedMetrics.Rms}"); // 4.478358 - Console.WriteLine($"RSquared - {averagedMetrics.RSquared}"); // 0.754977 - } - - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Static/FeatureSelectionTransform.cs b/docs/samples/Microsoft.ML.Samples/Static/FeatureSelectionTransform.cs deleted file mode 100644 index aa52b3a468..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/FeatureSelectionTransform.cs +++ /dev/null 
@@ -1,122 +0,0 @@ -using System; -using System.Collections.Generic; -using Microsoft.ML; -using Microsoft.ML.Data; -using Microsoft.ML.StaticPipe; - -namespace Samples.Static -{ - public class FeatureSelectionTransformStaticExample - { - public static void Example() - { - // Downloading a classification dataset from github.com/dotnet/machinelearning. - // It will be stored in the same path as the executable - string dataFilePath = Microsoft.ML.SamplesUtils.DatasetUtils.DownloadBreastCancerDataset(); - - // Data Preview - // 1. Label 0=benign, 1=malignant - // 2. Clump Thickness 1 - 10 - // 3. Uniformity of Cell Size 1 - 10 - // 4. Uniformity of Cell Shape 1 - 10 - // 5. Marginal Adhesion 1 - 10 - // 6. Single Epithelial Cell Size 1 - 10 - // 7. Bare Nuclei 1 - 10 - // 8. Bland Chromatin 1 - 10 - // 9. Normal Nucleoli 1 - 10 - // 10. Mitoses 1 - 10 - - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. - var ml = new MLContext(); - - // First, we define the loader: specify the data columns and where to find them in the text file. Notice that we combine entries from - // all the feature columns into entries of a vector of a single column named "Features". - var loader = TextLoaderStatic.CreateLoader(ml, c => ( - Label: c.LoadBool(0), - Features: c.LoadFloat(1, 9) - ), - separator: '\t', hasHeader: true); - - // Then, we use the loader to load the data as an IDataView. - var data = loader.Load(dataFilePath); - - // Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data - // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data. - - // In this example we define a CountFeatureSelectingEstimator, that selects slots in a feature vector that have more non-default - // values than the specified count. 
This transformation can be used to remove slots with too many missing values. - // We also define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature - // vector based on highest mutual information between that slot and a specified label. Notice that it is possible to - // specify the parameter `numBins', which controls the number of bins used in the approximation of the mutual information - // between features and label. - var pipeline = loader.MakeNewEstimator() - .Append(r =>( - FeaturesCountSelect: r.Features.SelectFeaturesBasedOnCount(count: 695), - Label: r.Label - )) - .Append(r => ( - FeaturesCountSelect: r.FeaturesCountSelect, - FeaturesMISelect: r.FeaturesCountSelect.SelectFeaturesBasedOnMutualInformation(r.Label, slotsInOutput: 5), - Label: r.Label - )); - - - // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data. - var transformedData = pipeline.Fit(data).Transform(data); - - // Small helper to print the data inside a column, in the console. Only prints the first 10 rows. - Action>> printHelper = (columnName, column) => - { - Console.WriteLine($"{columnName} column obtained post-transformation."); - int count = 0; - foreach (var row in column) - { - foreach (var value in row.GetValues()) - Console.Write($"{value}\t"); - Console.WriteLine(""); - count++; - if (count >= 10) - break; - } - - Console.WriteLine("==================================================="); - }; - - // Print the data that results from the transformations. - var countSelectColumn = transformedData.AsDynamic.GetColumn>(transformedData.AsDynamic.Schema["FeaturesCountSelect"]); - var MISelectColumn = transformedData.AsDynamic.GetColumn>(transformedData.AsDynamic.Schema["FeaturesMISelect"]); - printHelper("FeaturesCountSelect", countSelectColumn); - printHelper("FeaturesMISelect", MISelectColumn); - - // Below is the output of the this code. 
We see that some slots habe been dropped by the first transformation. - // Among the remaining slots, the second transformation only preserves the top 5 slots based on mutualinformation - // with the label column. - - // FeaturesCountSelect column obtained post-transformation. - // 5 4 4 5 7 3 2 1 - // 3 1 1 1 2 3 1 1 - // 6 8 8 1 3 3 7 1 - // 4 1 1 3 2 3 1 1 - // 8 10 10 8 7 9 7 1 - // 1 1 1 1 2 3 1 1 - // 2 1 2 1 2 3 1 1 - // 2 1 1 1 2 1 1 5 - // 4 2 1 1 2 2 1 1 - // 1 1 1 1 1 3 1 1 - // =================================================== - // FeaturesMISelect column obtained post-transformation. - // 4 4 7 3 2 - // 1 1 2 3 1 - // 8 8 3 3 7 - // 1 1 2 3 1 - // 10 10 7 9 7 - // 1 1 2 3 1 - // 1 2 2 3 1 - // 1 1 2 1 1 - // 2 1 2 2 1 - // 1 1 1 3 1 - // =================================================== - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs deleted file mode 100644 index 4723d3e1f8..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs +++ /dev/null @@ -1,107 +0,0 @@ -using System; -using Microsoft.ML.Trainers.LightGbm.StaticPipe; -using Microsoft.ML.StaticPipe; -using Microsoft.ML; - -namespace Samples.Static -{ - public class LightGbmBinaryClassificationExample - { - public static void Example() - { - // Downloading a classification dataset from github.com/dotnet/machinelearning. - // It will be stored in the same path as the executable - string dataFilePath = Microsoft.ML.SamplesUtils.DatasetUtils.DownloadAdultDataset(); - - // Data Preview - // 1. Column [Label]: IsOver50K (boolean) - // 2. Column: workclass (text/categorical) - // 3. Column: education (text/categorical) - // 4. Column: marital-status (text/categorical) - // 5. Column: occupation (text/categorical) - // 6. Column: relationship (text/categorical) - // 7. Column: ethnicity (text/categorical) - // 8. 
Column: sex (text/categorical) - // 9. Column: native-country-region (text/categorical) - // 10. Column: age (numeric) - // 11. Column: fnlwgt (numeric) - // 12. Column: education-num (numeric) - // 13. Column: capital-gain (numeric) - // 14. Column: capital-loss (numeric) - // 15. Column: hours-per-week (numeric) - - // Creating the ML.Net IHostEnvironment object, needed for the pipeline - var mlContext = new MLContext(); - - // Creating Data Loader with the initial schema based on the format of the data - var loader = TextLoaderStatic.CreateLoader( - mlContext, - c => ( - Age: c.LoadFloat(0), - Workclass: c.LoadText(1), - Fnlwgt: c.LoadFloat(2), - Education: c.LoadText(3), - EducationNum: c.LoadFloat(4), - MaritalStatus: c.LoadText(5), - Occupation: c.LoadText(6), - Relationship: c.LoadText(7), - Ethnicity: c.LoadText(8), - Sex: c.LoadText(9), - CapitalGain: c.LoadFloat(10), - CapitalLoss: c.LoadFloat(11), - HoursPerWeek: c.LoadFloat(12), - NativeCountry: c.LoadText(13), - IsOver50K: c.LoadBool(14)), - separator: ',', - hasHeader: true); - - // Load the data, and leave 10% out, so we can use them for testing - var data = loader.Load(dataFilePath); - var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); - - // Create the Estimator - var learningPipeline = loader.MakeNewEstimator() - .Append(row => ( - Features: row.Age.ConcatWith( - row.EducationNum, - row.MaritalStatus.OneHotEncoding(), - row.Occupation.OneHotEncoding(), - row.Relationship.OneHotEncoding(), - row.Ethnicity.OneHotEncoding(), - row.Sex.OneHotEncoding(), - row.HoursPerWeek, - row.NativeCountry.OneHotEncoding().SelectFeaturesBasedOnCount(count: 10)), - Label: row.IsOver50K)) - .Append(row => ( - Features: row.Features.Normalize(), - Label: row.Label, - Score: mlContext.BinaryClassification.Trainers.LightGbm( - row.Label, - row.Features, - numberOfLeaves: 4, - minimumExampleCountPerLeaf: 6, - learningRate: 0.001))) - .Append(row => ( - Label: row.Label, - Score: 
row.Score, - PredictedLabel: row.Score.predictedLabel)); - - // Fit this Pipeline to the Training Data - var model = learningPipeline.Fit(trainData); - - // Evaluate how the model is doing on the test data - var dataWithPredictions = model.Transform(testData); - - var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, row => row.Label, row => row.Score); - - Console.WriteLine($"Accuracy: {metrics.Accuracy}"); // 0.84 - Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve}"); // 0.89 - Console.WriteLine($"F1 Score: {metrics.F1Score}"); // 0.64 - - Console.WriteLine($"Negative Precision: {metrics.NegativePrecision}"); // 0.88 - Console.WriteLine($"Negative Recall: {metrics.NegativeRecall}"); // 0.91 - Console.WriteLine($"Positive Precision: {metrics.PositivePrecision}"); // 0.68 - Console.WriteLine($"Positive Recall: {metrics.PositiveRecall}"); // 0.60 - } - } -} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs deleted file mode 100644 index 8f08f86889..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs +++ /dev/null @@ -1,105 +0,0 @@ -using System; -using System.Linq; -using Microsoft.ML.Data; -using Microsoft.ML.Trainers.LightGbm.StaticPipe; -using Microsoft.ML.SamplesUtils; -using Microsoft.ML.StaticPipe; -using Microsoft.ML; - -namespace Samples.Static -{ - class LightGBMMulticlassWithInMemoryData - { - public void Example() - { - // Create a general context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Create in-memory examples as C# native class. 
- var examples = DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000); - - // Convert native C# class to IDataView, a consumble format to ML.NET functions. - var dataView = mlContext.Data.LoadFromEnumerable(examples); - - // IDataView is the data format used in dynamic-typed pipeline. To use static-typed pipeline, we need to convert - // IDataView to DataView by calling AssertStatic(...). The basic idea is to specify the static type for each column - // in IDataView in a lambda function. - var staticDataView = dataView.AssertStatic(mlContext, c => ( - Features: c.R4.Vector, - Label: c.Text.Scalar)); - - // Create static pipeline. First, we make an estimator out of static DataView as the starting of a pipeline. - // Then, we append necessary transforms and a classifier to the starting estimator. - var pipe = staticDataView.MakeNewEstimator() - .Append(mapper: r => ( - r.Label, - // Train multi-class LightGBM. The trained model maps Features to Label and probability of each class. - // The call of ToKey() is needed to convert string labels to integer indexes. - Predictions: mlContext.MulticlassClassification.Trainers.LightGbm(r.Label.ToKey(), r.Features) - )) - .Append(r => ( - // Actual label. - r.Label, - // Labels are converted to keys when training LightGBM so we convert it here again for calling evaluation function. - LabelIndex: r.Label.ToKey(), - // Used to compute metrics such as accuracy. - r.Predictions, - // Assign a new name to predicted class index. - PredictedLabelIndex: r.Predictions.predictedLabel, - // Assign a new name to class probabilities. - Scores: r.Predictions.score - )); - - // Split the static-typed data into training and test sets. Only training set is used in fitting - // the created pipeline. Metrics are computed on the test. - var (trainingData, testingData) = mlContext.Data.TrainTestSplit(staticDataView, testFraction: 0.5); - - // Train the model. - var model = pipe.Fit(trainingData); - - // Do prediction on the test set. 
- var prediction = model.Transform(testingData); - - // Evaluate the trained model is the test set. - var metrics = mlContext.MulticlassClassification.Evaluate(prediction, r => r.LabelIndex, r => r.Predictions); - - // Check if metrics are resonable. - Console.WriteLine ("Macro accuracy: {0}, Micro accuracy: {1}.", 0.863482146891263, 0.86309523809523814); - - // Convert prediction in ML.NET format to native C# class. - var nativePredictions = mlContext.Data.CreateEnumerable(prediction.AsDynamic, false).ToList(); - - // Get schema object out of the prediction. It contains annotations such as the mapping from predicted label index - // (e.g., 1) to its actual label (e.g., "AA"). The call to "AsDynamic" converts our statically-typed pipeline into - // a dynamically-typed one only for extracting annotations. In the future, annotations in statically-typed pipeline should - // be accessible without dynamically-typed things. - var schema = prediction.AsDynamic.Schema; - - // Retrieve the mapping from labels to label indexes. - var labelBuffer = new VBuffer>(); - schema[nameof(DatasetUtils.MulticlassClassificationExample.PredictedLabelIndex)].Annotations.GetValue("KeyValues", ref labelBuffer); - // nativeLabels is { "AA" , "BB", "CC", "DD" } - var nativeLabels = labelBuffer.DenseValues().ToArray(); // nativeLabels[nativePrediction.PredictedLabelIndex - 1] is the original label indexed by nativePrediction.PredictedLabelIndex. - - - // Show prediction result for the 3rd example. - var nativePrediction = nativePredictions[2]; - // Console output: - // Our predicted label to this example is "AA" with probability 0.922597349. 
- Console.WriteLine("Our predicted label to this example is {0} with probability {1}", - nativeLabels[(int)nativePrediction.PredictedLabelIndex - 1], - nativePrediction.Scores[(int)nativePrediction.PredictedLabelIndex - 1]); - - var expectedProbabilities = new float[] { 0.922597349f, 0.07508608f, 0.00221699756f, 9.95488E-05f }; - // Scores and nativeLabels are two parallel attributes; that is, Scores[i] is the probability of being nativeLabels[i]. - // Console output: - // The probability of being class "AA" is 0.922597349. - // The probability of being class "BB" is 0.07508608. - // The probability of being class "CC" is 0.00221699756. - // The probability of being class "DD" is 9.95488E-05. - for (int i = 0; i < labelBuffer.Length; ++i) - Console.WriteLine("The probability of being class {0} is {1}.", nativeLabels[i], nativePrediction.Scores[i]); - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs deleted file mode 100644 index a7c1d7bdae..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs +++ /dev/null @@ -1,71 +0,0 @@ -using System; -using Microsoft.ML.Data; -using Microsoft.ML.Trainers.LightGbm; -using Microsoft.ML.Trainers.LightGbm.StaticPipe; -using Microsoft.ML.StaticPipe; -using Microsoft.ML; - -namespace Samples.Static -{ - public class LightGbmRegressionExample - { - public static void Example() - { - // Downloading a regression dataset from github.com/dotnet/machinelearning - // this will create a housing.txt file in the filsystem. - // You can open the file to see the data. - string dataFile = Microsoft.ML.SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); - - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. 
- var mlContext = new MLContext(); - - // Creating a data loader, based on the format of the data - var loader = TextLoaderStatic.CreateLoader(mlContext, c => ( - label: c.LoadFloat(0), - features: c.LoadFloat(1, 6) - ), - separator: '\t', hasHeader: true); - - // Load the data, and leave 10% out, so we can use them for testing - var data = loader.Load(new MultiFileSource(dataFile)); - var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); - - // The predictor that gets produced out of training - LightGbmRegressionModelParameters pred = null; - - // Create the estimator - var learningPipeline = loader.MakeNewEstimator() - .Append(r => (r.label, score: mlContext.Regression.Trainers.LightGbm( - r.label, - r.features, - numberOfLeaves: 4, - minimumExampleCountPerLeaf: 6, - learningRate: 0.001, - onFit: p => pred = p) - ) - ); - - // Fit this pipeline to the training data - var model = learningPipeline.Fit(trainData); - - // Check the weights that the model learned - VBuffer weights = default; - pred.GetFeatureWeights(ref weights); - - var weightsValues = weights.GetValues(); - Console.WriteLine($"weight 0 - {weightsValues[0]}"); - Console.WriteLine($"weight 1 - {weightsValues[1]}"); - - // Evaluate how the model is doing on the test data - var dataWithPredictions = model.Transform(testData); - var metrics = mlContext.Regression.Evaluate(dataWithPredictions, r => r.label, r => r.score); - - Console.WriteLine($"L1 - {metrics.MeanAbsoluteError}"); // 4.9669731 - Console.WriteLine($"L2 - {metrics.MeanSquaredError}"); // 51.37296 - Console.WriteLine($"LossFunction - {metrics.LossFunction}"); // 51.37296 - Console.WriteLine($"RMS - {metrics.RootMeanSquaredError}"); // 7.167493 - Console.WriteLine($"RSquared - {metrics.RSquared}"); // 0.079478 - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs deleted file mode 100644 index 
e43d1cfcb9..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs +++ /dev/null @@ -1,115 +0,0 @@ -using System; -using System.Collections.Generic; -using Microsoft.ML; -using Microsoft.ML.Data; -using Microsoft.ML.StaticPipe; - -namespace Samples.Static -{ - public class SdcaBinaryClassificationExample - { - public static void Example() - { - // Downloading a classification dataset from github.com/dotnet/machinelearning. - // It will be stored in the same path as the executable - string dataFilePath = Microsoft.ML.SamplesUtils.DatasetUtils.DownloadAdultDataset(); - - // Data Preview - // 1. Column [Label]: IsOver50K (boolean) - // 2. Column: workclass (text/categorical) - // 3. Column: education (text/categorical) - // 4. Column: marital-status (text/categorical) - // 5. Column: occupation (text/categorical) - // 6. Column: relationship (text/categorical) - // 7. Column: ethnicity (text/categorical) - // 8. Column: sex (text/categorical) - // 9. Column: native-country-region (text/categorical) - // 10. Column: age (numeric) - // 11. Column: fnlwgt (numeric) - // 12. Column: education-num (numeric) - // 13. Column: capital-gain (numeric) - // 14. Column: capital-loss (numeric) - // 15. 
Column: hours-per-week (numeric) - - // Creating the ML.Net IHostEnvironment object, needed for the pipeline - var mlContext = new MLContext(); - - // Creating Data Loader with the initial schema based on the format of the data - var loader = TextLoaderStatic.CreateLoader( - mlContext, - c => ( - Age: c.LoadFloat(0), - Workclass: c.LoadText(1), - Fnlwgt: c.LoadFloat(2), - Education: c.LoadText(3), - EducationNum: c.LoadFloat(4), - MaritalStatus: c.LoadText(5), - Occupation: c.LoadText(6), - Relationship: c.LoadText(7), - Ethnicity: c.LoadText(8), - Sex: c.LoadText(9), - CapitalGain: c.LoadFloat(10), - CapitalLoss: c.LoadFloat(11), - HoursPerWeek: c.LoadFloat(12), - NativeCountry: c.LoadText(13), - IsOver50K: c.LoadBool(14)), - separator: ',', - hasHeader: true); - - // Load the data, and leave 10% out, so we can use them for testing - var data = loader.Load(dataFilePath); - var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); - - // Create the Estimator - var learningPipeline = loader.MakeNewEstimator() - .Append(row => ( - Features: row.Age.ConcatWith( - row.EducationNum, - row.MaritalStatus.OneHotEncoding(), - row.Occupation.OneHotEncoding(), - row.Relationship.OneHotEncoding(), - row.Ethnicity.OneHotEncoding(), - row.Sex.OneHotEncoding(), - row.HoursPerWeek, - row.NativeCountry.OneHotEncoding().SelectFeaturesBasedOnCount(count: 10)), - Label: row.IsOver50K)) - .Append(row => ( - Features: row.Features.Normalize(), - Label: row.Label, - Score: mlContext.BinaryClassification.Trainers.Sdca( - row.Label, - row.Features, - l1Threshold: 0.25f, - numberOfIterations: 100))) - .Append(row => ( - Label: row.Label, - Score: row.Score, - PredictedLabel: row.Score.predictedLabel)); - - // Fit this Pipeline to the Training Data - var model = learningPipeline.Fit(trainData); - - // Evaluate how the model is doing on the test data - var dataWithPredictions = model.Transform(testData); - - var metrics = 
mlContext.BinaryClassification.EvaluateWithPRCurve(dataWithPredictions, row => row.Label, row => row.Score, out List prCurve); - - Console.WriteLine($"Accuracy: {metrics.Accuracy}"); // 0.83 - Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve}"); // 0.88 - Console.WriteLine($"F1 Score: {metrics.F1Score}"); // 0.59 - - Console.WriteLine($"Negative Precision: {metrics.NegativePrecision}"); // 0.87 - Console.WriteLine($"Negative Recall: {metrics.NegativeRecall}"); // 0.91 - Console.WriteLine($"Positive Precision: {metrics.PositivePrecision}"); // 0.65 - Console.WriteLine($"Positive Recall: {metrics.PositiveRecall}"); // 0.55 - - foreach(var prData in prCurve) - { - Console.Write($"Threshold: {prData.Threshold} "); - Console.Write($"Precision: {prData.Precision} "); - Console.Write($"Recall: {prData.Recall} "); - Console.WriteLine($"FPR: {prData.FalsePositiveRate}"); - } - } - } -} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs b/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs deleted file mode 100644 index df8cf17267..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs +++ /dev/null @@ -1,65 +0,0 @@ -using System; -using Microsoft.ML; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.Trainers; - -namespace Samples.Static -{ - public class SdcaRegressionExample - { - public static void Example() - { - // Downloading a regression dataset from github.com/dotnet/machinelearning - // this will create a housing.txt file in the filsystem this code will run - // you can open the file to see the data. 
- string dataFile = Microsoft.ML.SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset(); - - // Creating the ML.Net IHostEnvironment object, needed for the pipeline - var mlContext = new MLContext(); - - // Creating a data loader, based on the format of the data - var loader = TextLoaderStatic.CreateLoader(mlContext, c => ( - label: c.LoadFloat(0), - features: c.LoadFloat(1, 6) - ), - separator: '\t', hasHeader: true); - - // Load the data, and leave 10% out, so we can use them for testing - var data = loader.Load(dataFile); - var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); - - // The predictor that gets produced out of training - LinearRegressionModelParameters pred = null; - - // Create the estimator - var learningPipeline = loader.MakeNewEstimator() - .Append(r => (r.label, score: mlContext.Regression.Trainers.Sdca( - r.label, - r.features, - l1Threshold: 0f, - numberOfIterations: 100, - onFit: p => pred = p) - ) - ); - - // Fit this pipeline to the training data - var model = learningPipeline.Fit(trainData); - - // Check the weights that the model learned - var weights = pred.Weights; - - Console.WriteLine($"weight 0 - {weights[0]}"); - Console.WriteLine($"weight 1 - {weights[1]}"); - - // Evaluate how the model is doing on the test data - var dataWithPredictions = model.Transform(testData); - var metrics = mlContext.Regression.Evaluate(dataWithPredictions, r => r.label, r => r.score); - - Console.WriteLine($"L1 - {metrics.MeanAbsoluteError}"); // 3.7226085 - Console.WriteLine($"L2 - {metrics.MeanSquaredError}"); // 24.250636 - Console.WriteLine($"LossFunction - {metrics.LossFunction}"); // 24.25063 - Console.WriteLine($"RMS - {metrics.RootMeanSquaredError}"); // 4.924493 - Console.WriteLine($"RSquared - {metrics.RSquared}"); // 0.565467 - } - } -} diff --git a/pkg/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.nupkgproj b/pkg/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.nupkgproj deleted file mode 100644 index 
1ee836402f..0000000000 --- a/pkg/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.nupkgproj +++ /dev/null @@ -1,15 +0,0 @@ - - - - netstandard2.0 - ML.NET component for a statically typed API. - - - - - - - - - - diff --git a/pkg/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.symbols.nupkgproj b/pkg/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.symbols.nupkgproj deleted file mode 100644 index a4b942a712..0000000000 --- a/pkg/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.symbols.nupkgproj +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/src/Microsoft.ML.Analyzer/Microsoft.ML.Analyzer.csproj b/src/Microsoft.ML.Analyzer/Microsoft.ML.Analyzer.csproj deleted file mode 100644 index b14b62f354..0000000000 --- a/src/Microsoft.ML.Analyzer/Microsoft.ML.Analyzer.csproj +++ /dev/null @@ -1,14 +0,0 @@ - - - - netstandard1.3 - Microsoft.ML - - - - - - - - - diff --git a/src/Microsoft.ML.Analyzer/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Analyzer/Properties/AssemblyInfo.cs deleted file mode 100644 index 62b6d49c73..0000000000 --- a/src/Microsoft.ML.Analyzer/Properties/AssemblyInfo.cs +++ /dev/null @@ -1,7 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System.Runtime.CompilerServices; - -[assembly: InternalsVisibleTo("Microsoft.ML.CodeAnalyzer.Tests, PublicKey=002400000480000094000000060200000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a34928e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")] diff --git a/src/Microsoft.ML.Analyzer/TypeIsSchemaShapeAnalyzer.cs b/src/Microsoft.ML.Analyzer/TypeIsSchemaShapeAnalyzer.cs deleted file mode 100644 index 1f0231d405..0000000000 --- a/src/Microsoft.ML.Analyzer/TypeIsSchemaShapeAnalyzer.cs +++ /dev/null @@ -1,421 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Collections.Generic; -using System.Collections.Immutable; -using System.Linq; -using Microsoft.CodeAnalysis; -using Microsoft.CodeAnalysis.CSharp; -using Microsoft.CodeAnalysis.CSharp.Syntax; -using Microsoft.CodeAnalysis.Diagnostics; - -namespace Microsoft.ML.Analyzer -{ - [DiagnosticAnalyzer(LanguageNames.CSharp)] - public sealed class TypeIsSchemaShapeAnalyzer : DiagnosticAnalyzer - { - internal static class ShapeDiagnostic - { - private const string Category = "Type Check"; - public const string Id = "MSML_TypeShouldBeSchemaShape"; - private const string Title = "The type is not a schema shape"; - private const string Format = "Type{0} is neither a PipelineColumn nor a ValueTuple, nor a class of an allowed form."; - internal const string Description = - "Within statically typed pipeline elements of ML.NET, the shape of the schema is determined by a 'shape' type. 
" + - "A valid 'shape' type is either an instance of one of the PipelineColumn subclasses (for example, Scalar " + - "or something like that), or a ValueTuple containing only valid 'shape' types, or a class whose only publicly " + - "accessible members are methods, a single constructor, and properties that are valid 'shape' types themselves, " + - "and that have either get and set accessors and the single constructor takes no parameters, or that has get only " + - "property accessors and the constructor takes as many parameters as there are properties. (So, ValueTuples " + - "containing other value tuples are fine, so long as they terminate in a PipelineColumn subclass.)"; - - internal static DiagnosticDescriptor Rule = - new DiagnosticDescriptor(Id, Title, Format, Category, - DiagnosticSeverity.Error, isEnabledByDefault: true, description: Description); - } - - internal static class ShapeParameterDiagnostic - { - private const string Category = "Type Check"; - public const string Id = "MSML_TypeParameterShouldBeSchemaShape"; - private const string Title = "The type is not a schema shape"; - private const string Format = "Type parameter {0} is not marked with [IsShape] or appropriate type constraints."; - internal const string Description = ShapeDiagnostic.Description + " " + - "If using type parameters when interacting with the statically typed pipelines, the type parameter ought to be " + - "constrained in such a way that it, either by applying the [IsShape] attribute or by having type constraints to " + - "indicate that it is valid, for example, constraining the type to descend from PipelineColumn."; - - internal static DiagnosticDescriptor Rule = - new DiagnosticDescriptor(Id, Title, Format, Category, - DiagnosticSeverity.Error, isEnabledByDefault: true, description: Description); - } - - internal static class ShapeClassDiagnosticConstructor - { - private const string Category = "Type Check"; - public const string Id = 
"MSML_SchemaShapeClassShouldHaveOnePublicConstructor"; - private const string Title = "The class does not have exactly one public constructor."; - private const string Format = "Member's type {0} does not have exactly one public constructor."; - internal const string Description = ShapeDiagnostic.Description + " " + - "This type does not have exactly one public constructor."; - - internal static DiagnosticDescriptor Rule = - new DiagnosticDescriptor(Id, Title, Format, Category, - DiagnosticSeverity.Error, isEnabledByDefault: true, description: Description); - } - - internal static class ShapeClassDiagnosticField - { - private const string Category = "Type Check"; - public const string Id = "MSML_SchemaShapeClassShouldHaveNoPublicFields"; - private const string Title = "The class should not have publicly accessible fields."; - private const string Format = "Type {0} has publicly accessible field {1}."; - internal const string Description = ShapeDiagnostic.Description + " " + - "This type has publicly accessible fields."; - - internal static DiagnosticDescriptor Rule = - new DiagnosticDescriptor(Id, Title, Format, Category, - DiagnosticSeverity.Error, isEnabledByDefault: true, description: Description); - } - - internal static class ShapeClassDiagnosticGettable - { - private const string Category = "Type Check"; - public const string Id = "MSML_SchemaShapeClassGettableProperty"; - private const string Title = "All properties should be gettable."; - private const string Format = "Type {0} has property {1} without a public getter."; - internal const string Description = ShapeDiagnostic.Description + " " + - "This type has a property without a getter."; - - internal static DiagnosticDescriptor Rule = - new DiagnosticDescriptor(Id, Title, Format, Category, - DiagnosticSeverity.Error, isEnabledByDefault: true, description: Description); - } - - internal static class ShapeClassDiagnosticNoArgsSettable - { - private const string Category = "Type Check"; - public const 
string Id = "MSML_SchemaShapeClassWithParameterlessConstructorSettableProperties"; - private const string Title = "If the class has a constructor with no parameters, all properties should be settable."; - private const string Format = "Type {0} has property {1} that is not settable."; - internal const string Description = ShapeDiagnostic.Description + " " + - "This type has a parameterless constructor, but a field that is not settable."; - - internal static DiagnosticDescriptor Rule = - new DiagnosticDescriptor(Id, Title, Format, Category, - DiagnosticSeverity.Error, isEnabledByDefault: true, description: Description); - } - - internal static class ShapeClassDiagnosticArgsSettable - { - private const string Category = "Type Check"; - public const string Id = "MSML_SchemaShapeClassWithParameterfulConstructorSettableProperties"; - private const string Title = "If the class has a constructor with parameters, but some properties are settable."; - private const string Format = "Type {0} has property {1} that is settable."; - internal const string Description = ShapeDiagnostic.Description + " " + - "This type has a constructor with parameters, but some of the properties also have setters."; - - internal static DiagnosticDescriptor Rule = - new DiagnosticDescriptor(Id, Title, Format, Category, - DiagnosticSeverity.Error, isEnabledByDefault: true, description: Description); - } - - internal static class ShapeClassDiagnosticCorrespondence - { - private const string Category = "Type Check"; - public const string Id = "MSML_SchemaShapeClassConstructorAndPropertiesCorrespond"; - private const string Title = "If the class has a constructor with parameters, there ought to be a one to one correspondence between the parameters and the properties."; - private const string Format = "Type {0} appears to not have an exact correspondence among the number or type of constructor parameters and properties."; - internal const string Description = ShapeDiagnostic.Description + " " + - "This 
type has a constructor with parameters, but it appears that the number or types of the properties do not correspond to the parameters in the constructor."; - - internal static DiagnosticDescriptor Rule = - new DiagnosticDescriptor(Id, Title, Format, Category, - DiagnosticSeverity.Error, isEnabledByDefault: true, description: Description); - } - - private const string AttributeName = "Microsoft.ML.StaticPipe.IsShapeAttribute"; - private const string LeafTypeName = "Microsoft.ML.StaticPipe.PipelineColumn"; - - public override ImmutableArray SupportedDiagnostics => - ImmutableArray.Create(ShapeDiagnostic.Rule, ShapeParameterDiagnostic.Rule, ShapeClassDiagnosticConstructor.Rule, ShapeClassDiagnosticField.Rule, - ShapeClassDiagnosticGettable.Rule, ShapeClassDiagnosticNoArgsSettable.Rule, ShapeClassDiagnosticArgsSettable.Rule, ShapeClassDiagnosticCorrespondence.Rule); - - public override void Initialize(AnalysisContext context) - { - context.RegisterSemanticModelAction(Analyze); - } - - private enum SpecificError - { - None, - General, - TypeParam, - Constructor, - Field, - - PropGettable, - PropNoArgSettable, - PropArgSettable, - PropNoCorrespondence, - } - - private void Analyze(SemanticModelAnalysisContext context) - { - // We start with the model, then do the the method invocations. - // We could have phrased it as RegisterSyntaxNodeAction(Analyze, SyntaxKind.InvocationExpression), - // but this seemed more inefficient since getting the model and fetching the type symbols every - // single time seems to incur significant cost. The following invocation is somewhat more awkward - // since we must iterate over the invocation syntaxes ourselves, but this seems to be worthwhile. - var model = context.SemanticModel; - var comp = model.Compilation; - - // Get the symbols of the key types we are analyzing. If we can't find any of them there is - // no point in going further. 
- var attrType = comp.GetTypeByMetadataName(AttributeName); - if (attrType == null) - return; - var leafType = comp.GetTypeByMetadataName(LeafTypeName); - if (leafType == null) - return; - - // This internal helper method recursively determines whether an attributed type parameter - // has a valid type. It is called externally from the loop over invocations. - bool CheckType(ITypeSymbol type, out string path, out ITypeSymbol problematicType, out SpecificError specificError) - { - // Assume it's OK. - path = null; - problematicType = null; - specificError = SpecificError.None; - - if (type.Kind == SymbolKind.ErrorType) - return true; // We at least should not complain, so we don't get in the way of whatever the real problem is. - - if (type.TypeKind == TypeKind.TypeParameter) - { - var typeParam = (ITypeParameterSymbol)type; - // Does the type parameter have the attribute that triggers a check? - if (type.GetAttributes().Any(attr => attr.AttributeClass == attrType)) - return true; - // Are any of the declared constraint types OK? If they're OK, we're OK. - if (typeParam.ConstraintTypes.Any(ct => CheckType(ct, out string ctPath, out var ctProb, out var ctSpecificError))) - return true; - // Well, probably not good then. Let's call it a day. - specificError = SpecificError.TypeParam; - problematicType = typeParam; - return false; - } - else if (type.IsTupleType) - { - INamedTypeSymbol nameType = (INamedTypeSymbol)type; - var tupleElems = nameType.TupleElements; - - for (int i = 0; i < tupleElems.Length; ++i) - { - var e = tupleElems[i]; - if (!CheckType(e.Type, out string innerPath, out problematicType, out specificError)) - { - path = e.Name ?? $"Item{i + 1}"; - if (innerPath != null) - path += "." + innerPath; - return false; - } - } - return true; - } - else if (type.IsReferenceType) - { - // First check to see if it is a pipeline column. If it is we can stop. 
- for (var rt = type; rt != null; rt = rt.BaseType) - { - if (rt == leafType) - return true; - } - - // Next check if it's a reference type. - var members = type.GetMembers(); - - // First find the constructor. - IMethodSymbol constructor = null; - foreach (var method in members.OfType()) - { - if (method.DeclaredAccessibility != Accessibility.Public) - continue; - - if (method.MethodKind != MethodKind.Constructor) - continue; - if (constructor != null) - { - problematicType = type; - specificError = SpecificError.Constructor; - return false; - } - constructor = method; - } - - if (constructor == null) - { - problematicType = type; - specificError = SpecificError.Constructor; - return false; - } - - // Determine the parameters of the constructor, if any. - var t2c = new Dictionary(); - foreach (var prm in constructor.Parameters) - { - t2c.TryGetValue(prm.Type, out int cnt); - t2c[prm.Type] = cnt + 1; - } - bool needsSetters = constructor.Parameters.Length == 0; - - // Next iterate over the members. - foreach (var member in members) - { - // Only care about public members, and ignore methods. - if (member.DeclaredAccessibility != Accessibility.Public || member.Kind == SymbolKind.Method || member.IsStatic) - continue; - - if (member.Kind == SymbolKind.Field) - { - path = member.Name; - problematicType = type; - specificError = SpecificError.Field; - return false; - } - - if (member.Kind == SymbolKind.Property) - { - var propSymbol = (IPropertySymbol)member; - if (!CheckType(propSymbol.Type, out string innerPath, out problematicType, out specificError)) - { - path = propSymbol.Name; - if (innerPath != null) - path += "." + innerPath; - return false; - } - - // Make sure the property is gettable. 
- if (propSymbol.GetMethod?.DeclaredAccessibility != Accessibility.Public) - { - path = propSymbol.Name; - problematicType = type; - specificError = SpecificError.PropGettable; - return false; - } - if (constructor.Parameters.Length > 0) - { - if (t2c.TryGetValue(propSymbol.Type, out int count)) - { - t2c[propSymbol.Type] = count - 1; - if (count == 1) - t2c.Remove(propSymbol.Type); - } - else - { - // Couldn't find a corresponding parameter in the constructor. - path = propSymbol.Name; - problematicType = type; - specificError = SpecificError.PropNoCorrespondence; - return false; - } - if (propSymbol.SetMethod?.DeclaredAccessibility == Accessibility.Public) - { - path = propSymbol.Name; - problematicType = type; - specificError = SpecificError.PropArgSettable; - return false; - } - } - else if (propSymbol.SetMethod?.DeclaredAccessibility != Accessibility.Public) - { - path = propSymbol.Name; - problematicType = type; - specificError = SpecificError.PropNoArgSettable; - return false; - } - } - } - // Finally check that *every* parameter in the constructor has been covered. - if (t2c.Count > 0) - { - // Some parameters in the constructor were uncovered. - problematicType = type; - specificError = SpecificError.PropNoCorrespondence; - return false; - } - return true; - } - problematicType = type; - specificError = SpecificError.General; - return false; - } - - foreach (var invocation in model.SyntaxTree.GetRoot().DescendantNodes().OfType()) - { - var symbolInfo = model.GetSymbolInfo(invocation); - if (!(symbolInfo.Symbol is IMethodSymbol methodSymbol)) - { - // Should we perhaps skip when there is a method resolution failure? This is often but not always a sign of another problem. - if (symbolInfo.CandidateReason != CandidateReason.OverloadResolutionFailure || symbolInfo.CandidateSymbols.Length == 0) - continue; - methodSymbol = symbolInfo.CandidateSymbols[0] as IMethodSymbol; - if (methodSymbol == null) - continue; - } - // Analysis only applies to generic methods. 
- if (!methodSymbol.IsGenericMethod) - continue; - // Scan the type parameters for one that has our target attribute. - for (int i = 0; i < methodSymbol.TypeParameters.Length; ++i) - { - var par = methodSymbol.TypeParameters[i]; - var attr = par.GetAttributes(); - if (attr.Length == 0) - continue; - if (!attr.Any(a => a.AttributeClass == attrType)) - continue; - // We've found it. Check the type argument to ensure it is of the appropriate type. - var p = methodSymbol.TypeArguments[i]; - if (CheckType(p, out string path, out ITypeSymbol problematicType, out SpecificError error)) - continue; - - Diagnostic diagnostic; - switch (error) - { - case SpecificError.TypeParam: - diagnostic = Diagnostic.Create(ShapeParameterDiagnostic.Rule, invocation.GetLocation(), problematicType.Name); - break; - case SpecificError.Constructor: - diagnostic = Diagnostic.Create(ShapeClassDiagnosticConstructor.Rule, invocation.GetLocation(), problematicType.Name); - break; - case SpecificError.Field: - diagnostic = Diagnostic.Create(ShapeClassDiagnosticField.Rule, invocation.GetLocation(), problematicType.Name, path); - break; - - case SpecificError.PropGettable: - diagnostic = Diagnostic.Create(ShapeClassDiagnosticGettable.Rule, invocation.GetLocation(), problematicType.Name, path); - break; - case SpecificError.PropArgSettable: - diagnostic = Diagnostic.Create(ShapeClassDiagnosticArgsSettable.Rule, invocation.GetLocation(), problematicType.Name, path); - break; - case SpecificError.PropNoArgSettable: - diagnostic = Diagnostic.Create(ShapeClassDiagnosticNoArgsSettable.Rule, invocation.GetLocation(), problematicType.Name, path); - break; - case SpecificError.PropNoCorrespondence: - diagnostic = Diagnostic.Create(ShapeClassDiagnosticCorrespondence.Rule, invocation.GetLocation(), problematicType.Name); - break; - - case SpecificError.General: - default: // Whoops. Just pretend it's a general error. - path = path == null ? 
"" : " of item " + path; - diagnostic = Diagnostic.Create(ShapeDiagnostic.Rule, invocation.GetLocation(), path); - break; - } - context.ReportDiagnostic(diagnostic); - } - } - } - } -} diff --git a/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs index da0471dbfe..359d623c07 100644 --- a/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs @@ -11,7 +11,6 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Predictor.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Sweeper.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.InferenceTesting" + PublicKey.TestValue)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipelineTesting" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.OnnxTransformerTest" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.EntryPoints" + PublicKey.Value)] @@ -40,13 +39,6 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TimeSeries" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Transforms" + PublicKey.Value)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TensorFlow.StaticPipe" + PublicKey.Value)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Mkl.Components.StaticPipe" + PublicKey.Value)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.OnnxTransformer.StaticPipe" + PublicKey.Value)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.LightGbm.StaticPipe" + PublicKey.Value)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TimeSeries.StaticPipe" + PublicKey.Value)] - [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.AutoML" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: 
"Microsoft.ML.Internal.MetaLinearLearner" + InternalPublicKey.Value)] diff --git a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs index 2375a5a05a..c485575f7f 100644 --- a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs @@ -13,7 +13,6 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Predictor.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TimeSeries.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Benchmarks" + PublicKey.TestValue)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipelineTesting" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Benchmarks" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.EntryPoints" + PublicKey.Value)] @@ -44,7 +43,6 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet18" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet50" + PublicKey.Value)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Experimental" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Internal.MetaLinearLearner" + InternalPublicKey.Value)] diff --git a/src/Microsoft.ML.FastTree/Properties/AssemblyInfo.cs b/src/Microsoft.ML.FastTree/Properties/AssemblyInfo.cs index 2e27f549de..52ebe6ccb7 100644 --- a/src/Microsoft.ML.FastTree/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.FastTree/Properties/AssemblyInfo.cs @@ -11,8 +11,6 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.LightGbm" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Sweeper" + PublicKey.Value)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" 
+ PublicKey.Value)] - [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Internal.FastTree" + InternalPublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "RunTests" + InternalPublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] diff --git a/src/Microsoft.ML.ImageAnalytics/Properties/AssemblyInfo.cs b/src/Microsoft.ML.ImageAnalytics/Properties/AssemblyInfo.cs index 02cc8e9483..734f2f357d 100644 --- a/src/Microsoft.ML.ImageAnalytics/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.ImageAnalytics/Properties/AssemblyInfo.cs @@ -5,7 +5,6 @@ using System.Runtime.CompilerServices; using Microsoft.ML; -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] [assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.KMeansClustering/Properties/AssemblyInfo.cs b/src/Microsoft.ML.KMeansClustering/Properties/AssemblyInfo.cs index 4cfdbca7bb..734f2f357d 100644 --- a/src/Microsoft.ML.KMeansClustering/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.KMeansClustering/Properties/AssemblyInfo.cs @@ -6,6 +6,5 @@ using Microsoft.ML; [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] [assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.LightGbm.StaticPipe/LightGbmStaticExtensions.cs b/src/Microsoft.ML.LightGbm.StaticPipe/LightGbmStaticExtensions.cs deleted file mode 100644 index 0fb8bf312b..0000000000 --- a/src/Microsoft.ML.LightGbm.StaticPipe/LightGbmStaticExtensions.cs +++ /dev/null @@ -1,401 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System; -using Microsoft.ML.Calibrators; -using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; - -namespace Microsoft.ML.Trainers.LightGbm.StaticPipe -{ - /// - /// Regression trainer estimators. - /// - public static class LightGbmStaticExtensions - { - /// - /// Predict a target using a tree regression model trained with the . - /// - /// The . - /// The label column. - /// The features column. - /// The weights column. - /// The number of leaves to use. - /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. - /// The learning rate. - /// Number of iterations. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The Score output column indicating the predicted value. - /// - /// - /// - /// - public static Scalar LightGbm(this RegressionCatalog.RegressionTrainers catalog, - Scalar label, Vector features, Scalar weights = null, - int? numberOfLeaves = null, - int? minimumExampleCountPerLeaf = null, - double? 
learningRate = null, - int numberOfIterations = Defaults.NumberOfIterations, - Action onFit = null) - { - CheckUserValues(label, features, weights, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations, onFit); - - var rec = new TrainerEstimatorReconciler.Regression( - (env, labelName, featuresName, weightsName) => - { - var trainer = new LightGbmRegressionTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, - minimumExampleCountPerLeaf, learningRate, numberOfIterations); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Score; - } - - /// - /// Predict a target using a tree regression model trained with the . - /// - /// The . - /// The label column. - /// The features column. - /// The weights column. - /// Algorithm advanced settings. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The Score output column indicating the predicted value. 
- public static Scalar LightGbm(this RegressionCatalog.RegressionTrainers catalog, - Scalar label, Vector features, Scalar weights, - LightGbmRegressionTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(options, nameof(options)); - CheckUserValues(label, features, weights, onFit); - - var rec = new TrainerEstimatorReconciler.Regression( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new LightGbmRegressionTrainer(env, options); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Score; - } - - /// - /// Predict a target using a tree binary classification model trained with the . - /// - /// The . - /// The label column. - /// The features column. - /// The weights column. - /// The number of leaves to use. - /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. - /// The learning rate. - /// Number of iterations. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), the calibrated prediction (from 0 to 1), and the predicted label. - /// - /// - /// - /// - public static (Scalar score, Scalar probability, Scalar predictedLabel) LightGbm(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, - Vector features, - Scalar weights = null, - int? numberOfLeaves = null, - int? 
minimumExampleCountPerLeaf = null, - double? learningRate = null, - int numberOfIterations = Defaults.NumberOfIterations, - Action> onFit = null) - { - CheckUserValues(label, features, weights, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations, onFit); - - var rec = new TrainerEstimatorReconciler.BinaryClassifier( - (env, labelName, featuresName, weightsName) => - { - var trainer = new LightGbmBinaryTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, - minimumExampleCountPerLeaf, learningRate, numberOfIterations); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - else - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a tree binary classification model trained with the . - /// - /// The . - /// The label column. - /// The features column. - /// The weights column. - /// Algorithm advanced settings. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), the calibrated prediction (from 0 to 1), and the predicted label. 
- public static (Scalar score, Scalar probability, Scalar predictedLabel) LightGbm(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, Vector features, Scalar weights, - LightGbmBinaryTrainer.Options options, - Action> onFit = null) - { - Contracts.CheckValue(options, nameof(options)); - CheckUserValues(label, features, weights, onFit); - - var rec = new TrainerEstimatorReconciler.BinaryClassifier( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new LightGbmBinaryTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - else - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// Ranks a series of inputs based on their relevance, training a decision tree ranking model through the . - /// - /// The . - /// The label column. - /// The features column. - /// The groupId column. - /// The weights column. - /// The number of leaves to use. - /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. - /// The learning rate. - /// Number of iterations. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), the calibrated prediction (from 0 to 1), and the predicted label. - public static Scalar LightGbm(this RankingCatalog.RankingTrainers catalog, - Scalar label, - Vector features, - Key groupId, - Scalar weights = null, - int? 
numberOfLeaves = null, - int? minimumExampleCountPerLeaf = null, - double? learningRate = null, - int numberOfIterations = Defaults.NumberOfIterations, - Action onFit = null) - { - CheckUserValues(label, features, weights, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations, onFit); - Contracts.CheckValue(groupId, nameof(groupId)); - - var rec = new TrainerEstimatorReconciler.Ranker( - (env, labelName, featuresName, groupIdName, weightsName) => - { - var trainer = new LightGbmRankingTrainer(env, labelName, featuresName, groupIdName, weightsName, numberOfLeaves, - minimumExampleCountPerLeaf, learningRate, numberOfIterations); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, groupId, weights); - - return rec.Score; - } - - /// - /// Ranks a series of inputs based on their relevance, training a decision tree ranking model through the . - /// - /// The . - /// The label column. - /// The features column. - /// The groupId column. - /// The weights column. - /// Algorithm advanced settings. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), the calibrated prediction (from 0 to 1), and the predicted label. 
- public static Scalar LightGbm(this RankingCatalog.RankingTrainers catalog, - Scalar label, Vector features, Key groupId, Scalar weights, - LightGbmRankingTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(options, nameof(options)); - CheckUserValues(label, features, weights, onFit); - Contracts.CheckValue(groupId, nameof(groupId)); - - var rec = new TrainerEstimatorReconciler.Ranker( - (env, labelName, featuresName, groupIdName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.RowGroupColumnName = groupIdName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new LightGbmRankingTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, groupId, weights); - - return rec.Score; - } - - /// - /// Predict a target using a tree multiclass classification model trained with the . - /// - /// The multiclass classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The weights column. - /// The number of leaves to use. - /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. - /// The learning rate. - /// Number of iterations. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted per-class likelihoods (between 0 and 1, and summing up to 1), and the predicted label. 
- /// - /// - /// - /// - /// - public static (Vector score, Key predictedLabel) - LightGbm(this MulticlassClassificationCatalog.MulticlassClassificationTrainers catalog, - Key label, - Vector features, - Scalar weights = null, - int? numberOfLeaves = null, - int? minimumExampleCountPerLeaf = null, - double? learningRate = null, - int numberOfIterations = Defaults.NumberOfIterations, - Action onFit = null) - { - CheckUserValues(label, features, weights, numberOfLeaves, minimumExampleCountPerLeaf, learningRate, numberOfIterations, onFit); - - var rec = new TrainerEstimatorReconciler.MulticlassClassificationReconciler( - (env, labelName, featuresName, weightsName) => - { - var trainer = new LightGbmMulticlassTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, - minimumExampleCountPerLeaf, learningRate, numberOfIterations); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a tree multiclass classification model trained with the . - /// - /// The multiclass classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The weights column. - /// Advanced options to the algorithm. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted per-class likelihoods (between 0 and 1, and summing up to 1), and the predicted label. 
- public static (Vector score, Key predictedLabel) - LightGbm(this MulticlassClassificationCatalog.MulticlassClassificationTrainers catalog, - Key label, - Vector features, - Scalar weights, - LightGbmMulticlassTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(options, nameof(options)); - CheckUserValues(label, features, weights, onFit); - - var rec = new TrainerEstimatorReconciler.MulticlassClassificationReconciler( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new LightGbmMulticlassTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Output; - } - - private static void CheckUserValues(PipelineColumn label, Vector features, Scalar weights, - int? numberOfLeaves, - int? minimumExampleCountPerLeaf, - double? 
learningRate, - int numBoostRound, - Delegate onFit) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckParam(!(numberOfLeaves < 2), nameof(numberOfLeaves), "Must be at least 2."); - Contracts.CheckParam(!(minimumExampleCountPerLeaf <= 0), nameof(minimumExampleCountPerLeaf), "Must be positive"); - Contracts.CheckParam(!(learningRate <= 0), nameof(learningRate), "Must be positive"); - Contracts.CheckParam(numBoostRound > 0, nameof(numBoostRound), "Must be positive"); - Contracts.CheckValueOrNull(onFit); - } - - private static void CheckUserValues(PipelineColumn label, Vector features, Scalar weights, Delegate onFit) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckValueOrNull(onFit); - } - } -} diff --git a/src/Microsoft.ML.LightGbm.StaticPipe/Microsoft.ML.LightGbm.StaticPipe.csproj b/src/Microsoft.ML.LightGbm.StaticPipe/Microsoft.ML.LightGbm.StaticPipe.csproj deleted file mode 100644 index 9383a25281..0000000000 --- a/src/Microsoft.ML.LightGbm.StaticPipe/Microsoft.ML.LightGbm.StaticPipe.csproj +++ /dev/null @@ -1,13 +0,0 @@ - - - - netstandard2.0 - - - - - - - - - diff --git a/src/Microsoft.ML.LightGbm/Properties/AssemblyInfo.cs b/src/Microsoft.ML.LightGbm/Properties/AssemblyInfo.cs index ef60f8d4a6..0d0c69995d 100644 --- a/src/Microsoft.ML.LightGbm/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.LightGbm/Properties/AssemblyInfo.cs @@ -6,7 +6,6 @@ using Microsoft.ML; [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.LightGbm.StaticPipe" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Predictor.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "RunTests" + InternalPublicKey.Value)] diff --git 
a/src/Microsoft.ML.Mkl.Components.StaticPipe/Microsoft.ML.Mkl.Components.StaticPipe.csproj b/src/Microsoft.ML.Mkl.Components.StaticPipe/Microsoft.ML.Mkl.Components.StaticPipe.csproj deleted file mode 100644 index a5ae4ee177..0000000000 --- a/src/Microsoft.ML.Mkl.Components.StaticPipe/Microsoft.ML.Mkl.Components.StaticPipe.csproj +++ /dev/null @@ -1,13 +0,0 @@ - - - - netstandard2.0 - - - - - - - - - diff --git a/src/Microsoft.ML.Mkl.Components.StaticPipe/VectorWhiteningStaticExtensions.cs b/src/Microsoft.ML.Mkl.Components.StaticPipe/VectorWhiteningStaticExtensions.cs deleted file mode 100644 index 94b81b6377..0000000000 --- a/src/Microsoft.ML.Mkl.Components.StaticPipe/VectorWhiteningStaticExtensions.cs +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Collections.Generic; -using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.Transforms; - -namespace Microsoft.ML.Mkl.Components.StaticPipe -{ - /// - /// Extensions for statically typed Whitening estimator. 
- /// - public static class VectorWhiteningStaticExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly Vector Input; - - public OutPipelineColumn(Vector input, WhiteningKind kind, float eps, int maxRows, int pcaNum) - : base(new Reconciler(kind, eps, maxRows, pcaNum), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly WhiteningKind _kind; - private readonly float _eps; - private readonly int _maxRows; - private readonly int _pcaNum; - - public Reconciler(WhiteningKind kind, float eps, int maxRows, int pcaNum) - { - _kind = kind; - _eps = eps; - _maxRows = maxRows; - _pcaNum = pcaNum; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var infos = new VectorWhiteningEstimator.ColumnOptions[toOutput.Length]; - for (int i = 0; i < toOutput.Length; i++) - infos[i] = new VectorWhiteningEstimator.ColumnOptions(outputNames[toOutput[i]], inputNames[((OutPipelineColumn)toOutput[i]).Input], _kind, _eps, _maxRows, _pcaNum); - - return new VectorWhiteningEstimator(env, infos); - } - } - - /// The column to which the transform will be applied. - /// Whitening constant, prevents division by zero when scaling the data by inverse of eigenvalues. - /// Maximum number of rows used to train the transform. - /// In case of PCA whitening, indicates the number of components to retain. - public static Vector PcaWhitening(this Vector input, - float eps = VectorWhiteningEstimator.Defaults.Epsilon, - int maxRows = VectorWhiteningEstimator.Defaults.MaximumNumberOfRows, - int pcaNum = VectorWhiteningEstimator.Defaults.Rank) - => new OutPipelineColumn(input, WhiteningKind.PrincipalComponentAnalysis, eps, maxRows, pcaNum); - - /// The column to which the transform will be applied. 
- /// Whitening constant, prevents division by zero. - /// Maximum number of rows used to train the transform. - public static Vector ZcaWhitening(this Vector input, - float eps = VectorWhiteningEstimator.Defaults.Epsilon, - int maxRows = VectorWhiteningEstimator.Defaults.MaximumNumberOfRows) - => new OutPipelineColumn(input, WhiteningKind.ZeroPhaseComponentAnalysis, eps, maxRows, VectorWhiteningEstimator.Defaults.Rank); - } -} diff --git a/src/Microsoft.ML.Mkl.Components/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Mkl.Components/Properties/AssemblyInfo.cs index f3505fb02c..1365172209 100644 --- a/src/Microsoft.ML.Mkl.Components/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.Mkl.Components/Properties/AssemblyInfo.cs @@ -7,8 +7,6 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Mkl.Components.StaticPipe" + PublicKey.Value)] - [assembly: InternalsVisibleTo(assemblyName: "RunTests" + InternalPublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Internal.MetaLinearLearner" + InternalPublicKey.Value)] diff --git a/src/Microsoft.ML.OnnxTransformer.StaticPipe/DnnImageFeaturizerStaticExtensions.cs b/src/Microsoft.ML.OnnxTransformer.StaticPipe/DnnImageFeaturizerStaticExtensions.cs deleted file mode 100644 index 619d1ca95e..0000000000 --- a/src/Microsoft.ML.OnnxTransformer.StaticPipe/DnnImageFeaturizerStaticExtensions.cs +++ /dev/null @@ -1,65 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System; -using System.Collections.Generic; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.Transforms.Onnx; - -namespace Microsoft.ML.Transforms.StaticPipe -{ - public static class DnnImageFeaturizerStaticExtensions - { - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn(Vector input, Func> modelFactory) - : base(new Reconciler(modelFactory), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly Func> _modelFactory; - - public Reconciler(Func> modelFactory) - { - _modelFactory = modelFactory; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var outCol = (OutColumn)toOutput[0]; - return new DnnImageFeaturizerEstimator(env, outputNames[outCol], _modelFactory, inputNames[outCol.Input]); - } - } - - /// - /// Creates and applies a DnnImageFeaturizer transform to be used by the static API. - /// for more information about how the transformation works. - /// - /// Vector of image pixel weights. - /// An extension method on the that creates a chain of two - /// s (one for preprocessing and one with a pretrained image DNN) with specific models - /// included in a package together with that extension method. - /// For an example, see Microsoft.ML.DnnImageFeaturizer.ResNet18 - /// A vector of float feature weights based on the input image. 
- public static Vector DnnImageFeaturizer(this Vector input, Func> modelFactory) - { - Contracts.CheckValue(input, nameof(input)); - return new OutColumn(input, modelFactory); - } - } -} diff --git a/src/Microsoft.ML.OnnxTransformer.StaticPipe/Microsoft.ML.OnnxTransformer.StaticPipe.csproj b/src/Microsoft.ML.OnnxTransformer.StaticPipe/Microsoft.ML.OnnxTransformer.StaticPipe.csproj deleted file mode 100644 index 063c5f7d50..0000000000 --- a/src/Microsoft.ML.OnnxTransformer.StaticPipe/Microsoft.ML.OnnxTransformer.StaticPipe.csproj +++ /dev/null @@ -1,14 +0,0 @@ - - - - netstandard2.0 - Microsoft.ML.Transform.StaticPipe - - - - - - - - - diff --git a/src/Microsoft.ML.OnnxTransformer.StaticPipe/OnnxStaticExtensions.cs b/src/Microsoft.ML.OnnxTransformer.StaticPipe/OnnxStaticExtensions.cs deleted file mode 100644 index 66b44ff334..0000000000 --- a/src/Microsoft.ML.OnnxTransformer.StaticPipe/OnnxStaticExtensions.cs +++ /dev/null @@ -1,60 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System.Collections.Generic; -using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.Transforms.Onnx; - -namespace Microsoft.ML.Transforms.StaticPipe -{ - public static class OnnxStaticExtensions - { - - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn(Vector input, string modelFile) - : base(new Reconciler(modelFile), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly string _modelFile; - - public Reconciler(string modelFile) - { - Contracts.AssertNonEmpty(modelFile); - _modelFile = modelFile; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var outCol = (OutColumn)toOutput[0]; - return new OnnxScoringEstimator(env, new[] { outputNames[outCol] }, new[] { inputNames[outCol.Input] }, _modelFile); - } - } - - /// - /// Run a Onnx model on the input column and extract one output column. - /// The inputs and outputs are matched to Onnx graph nodes by name. 
- /// - public static Vector ApplyOnnxModel(this Vector input, string modelFile) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckNonEmpty(modelFile, nameof(modelFile)); - return new OutColumn(input, modelFile); - } - } -} diff --git a/src/Microsoft.ML.OnnxTransformer/Properties/AssemblyInfo.cs b/src/Microsoft.ML.OnnxTransformer/Properties/AssemblyInfo.cs index 0084e082ef..55d9ab82cb 100644 --- a/src/Microsoft.ML.OnnxTransformer/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.OnnxTransformer/Properties/AssemblyInfo.cs @@ -9,5 +9,4 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet101" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet18" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet50" + PublicKey.Value)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.OnnxTransformer.StaticPipe" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.OnnxTransformerTest" + PublicKey.TestValue)] diff --git a/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs b/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs index a4571c9c78..7cc466fb2f 100644 --- a/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs @@ -6,7 +6,6 @@ using Microsoft.ML; [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Core.Tests" + PublicKey.TestValue)] [assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.Recommender/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Recommender/Properties/AssemblyInfo.cs index 4cfdbca7bb..734f2f357d 100644 --- a/src/Microsoft.ML.Recommender/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.Recommender/Properties/AssemblyInfo.cs @@ -6,6 +6,5 @@ using 
Microsoft.ML; [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] [assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.StandardTrainers/Properties/AssemblyInfo.cs b/src/Microsoft.ML.StandardTrainers/Properties/AssemblyInfo.cs index ecb343b55b..65d503c1d9 100644 --- a/src/Microsoft.ML.StandardTrainers/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.StandardTrainers/Properties/AssemblyInfo.cs @@ -6,7 +6,6 @@ using Microsoft.ML; [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Ensemble" + PublicKey.Value)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Core.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Predictor.Tests" + PublicKey.TestValue)] diff --git a/src/Microsoft.ML.StaticPipe/Attributes.cs b/src/Microsoft.ML.StaticPipe/Attributes.cs deleted file mode 100644 index 78c2c9b521..0000000000 --- a/src/Microsoft.ML.StaticPipe/Attributes.cs +++ /dev/null @@ -1,27 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -namespace Microsoft.ML.StaticPipe -{ - /// - /// An indicator to the analyzer that this type parameter ought to be a valid schema-shape object (for example, a pipeline-column, or - /// value-tuples or some other containing type of such) as the return type. Note that this attribute is typically only used in - /// situations where a user might be essentially declaring that type, as opposed to using an already established shape type. 
- /// So: a method that merely takes an already existing typed instance would tend on the other hand to not use this type parameter. - /// To give an example: - /// - /// has the parameter on the new output tuple shape. - /// - /// The cost to not specifying this on such an entry point is that the compile time type-checks on the shape parameters will - /// no longer be enforced, which is suboptimal given that the purpose of the statically typed interfaces is to have compile-time - /// checks. However, it is not disastrous since the runtime checks will still be in effect. - /// - /// User code may use this attribute on their types if they have generic type parameters that interface with this library. - /// - [AttributeUsage(AttributeTargets.GenericParameter)] - public sealed class IsShapeAttribute : Attribute - { - } -} diff --git a/src/Microsoft.ML.StaticPipe/CategoricalHashStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/CategoricalHashStaticExtensions.cs deleted file mode 100644 index 9ce5e5d5cd..0000000000 --- a/src/Microsoft.ML.StaticPipe/CategoricalHashStaticExtensions.cs +++ /dev/null @@ -1,171 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System.Collections.Generic; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms; - -namespace Microsoft.ML.StaticPipe -{ - public static class CategoricalHashStaticExtensions - { - public enum OneHotHashVectorOutputKind : byte - { - /// - /// Output is a bag (multi-set) vector - /// - Bag = 1, - - /// - /// Output is an indicator vector - /// - Ind = 2, - - /// - /// Output is binary encoded - /// - Bin = 4, - } - - public enum OneHotHashScalarOutputKind : byte - { - /// - /// Output is an indicator vector - /// - Ind = 2, - - /// - /// Output is binary encoded - /// - Bin = 4, - } - - private const OneHotHashVectorOutputKind DefOut = (OneHotHashVectorOutputKind)OneHotHashEncodingEstimator.Defaults.OutputKind; - private const int DefNumberOfBits = OneHotHashEncodingEstimator.Defaults.NumberOfBits; - private const uint DefSeed = OneHotHashEncodingEstimator.Defaults.Seed; - private const bool DefOrdered = OneHotHashEncodingEstimator.Defaults.UseOrderedHashing; - private const int DefMaximumNumberOfInverts = OneHotHashEncodingEstimator.Defaults.MaximumNumberOfInverts; - - private readonly struct Config - { - public readonly int NumberOfBits; - public readonly uint Seed; - public readonly bool Ordered; - public readonly int MaximumNumberOfInverts; - public readonly OneHotHashVectorOutputKind OutputKind; - - public Config(OneHotHashVectorOutputKind outputKind, int numberOfBits, uint seed, bool ordered, int maximumNumberOfInverts) - { - OutputKind = outputKind; - NumberOfBits = numberOfBits; - Seed = seed; - Ordered = ordered; - MaximumNumberOfInverts = maximumNumberOfInverts; - } - } - - private interface ICategoricalCol - { - PipelineColumn Input { get; } - Config Config { get; } - } - - private sealed class ImplScalar : Vector, ICategoricalCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplScalar(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - 
private sealed class ImplVector : Vector, ICategoricalCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplVector(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class Rec : EstimatorReconciler - { - public static readonly Rec Inst = new Rec(); - - public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) - { - var infos = new OneHotHashEncodingEstimator.ColumnOptions[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var tcol = (ICategoricalCol)toOutput[i]; - infos[i] = new OneHotHashEncodingEstimator.ColumnOptions(outputNames[toOutput[i]], inputNames[tcol.Input], (OneHotEncodingEstimator.OutputKind)tcol.Config.OutputKind, - tcol.Config.NumberOfBits, tcol.Config.Seed, tcol.Config.Ordered, tcol.Config.MaximumNumberOfInverts); - } - return new OneHotHashEncodingEstimator(env, infos); - } - } - - /// - /// Converts the categorical value into an indicator array by hashing categories into certain value and using that value as the index in the array. - /// - /// Incoming data. - /// Specify the output type of indicator array: array or binary encoded data. - /// Amount of bits to use for hashing. - /// Seed value used for hashing. - /// Whether the position of each term should be included in the hash. - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- public static Vector OneHotHashEncoding(this Scalar input, OneHotHashScalarOutputKind outputKind = (OneHotHashScalarOutputKind)DefOut, - int numberOfBits = DefNumberOfBits, uint seed = DefSeed, bool ordered = DefOrdered, int maximumNumberOfInverts = DefMaximumNumberOfInverts) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplScalar(input, new Config((OneHotHashVectorOutputKind)outputKind, numberOfBits, seed, ordered, maximumNumberOfInverts)); - } - - /// - /// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array - /// - /// Incoming data. - /// Specify the output type of indicator array: array or binary encoded data. - /// Amount of bits to use for hashing. - /// Seed value used for hashing. - /// Whether the position of each term should be included in the hash. - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- public static Vector OneHotHashEncoding(this Vector input, OneHotHashVectorOutputKind outputKind = DefOut, - int numberOfBits = DefNumberOfBits, uint seed = DefSeed, bool ordered = DefOrdered, int maximumNumberOfInverts = DefMaximumNumberOfInverts) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplVector(input, new Config(outputKind, numberOfBits, seed, ordered, maximumNumberOfInverts)); - } - - /// - /// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array - /// - /// Incoming data. - /// Specify the output type of indicator array: array or binary encoded data. - /// Amount of bits to use for hashing. - /// Seed value used for hashing. - /// Whether the position of each term should be included in the hash. - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- public static Vector OneHotHashEncoding(this VarVector input, OneHotHashVectorOutputKind outputKind = DefOut, - int numberOfBits = DefNumberOfBits, uint seed = DefSeed, bool ordered = DefOrdered, int maximumNumberOfInverts = DefMaximumNumberOfInverts) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplVector(input, new Config(outputKind, numberOfBits, seed, ordered, maximumNumberOfInverts)); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/CategoricalStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/CategoricalStaticExtensions.cs deleted file mode 100644 index f8e2e356a5..0000000000 --- a/src/Microsoft.ML.StaticPipe/CategoricalStaticExtensions.cs +++ /dev/null @@ -1,161 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms; -using static Microsoft.ML.StaticPipe.TermStaticExtensions; - -namespace Microsoft.ML.StaticPipe -{ - public static class CategoricalStaticExtensions - { - public enum OneHotVectorOutputKind : byte - { - /// - /// Output is a bag (multi-set) vector - /// - Bag = 1, - - /// - /// Output is an indicator vector - /// - Ind = 2, - - /// - /// Output is binary encoded - /// - Bin = 4, - } - - public enum OneHotScalarOutputKind : byte - { - /// - /// Output is an indicator vector - /// - Ind = 2, - - /// - /// Output is binary encoded - /// - Bin = 4, - } - - private const KeyOrdinality DefSort = (KeyOrdinality)ValueToKeyMappingEstimator.Defaults.Ordinality; - private const int DefMax = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys; - private const OneHotVectorOutputKind DefOut = (OneHotVectorOutputKind)OneHotEncodingEstimator.Defaults.OutKind; - - private readonly struct Config - { - public readonly KeyOrdinality Order; - public readonly int Max; - 
public readonly OneHotVectorOutputKind OutputKind; - public readonly Action OnFit; - - public Config(OneHotVectorOutputKind outputKind, KeyOrdinality order, int max, Action onFit) - { - OutputKind = outputKind; - Order = order; - Max = max; - OnFit = onFit; - } - } - - private static Action Wrap(ToKeyFitResult.OnFit onFit) - { - if (onFit == null) - return null; - // The type T asociated with the delegate will be the actual value type once #863 goes in. - // However, until such time as #863 goes in, it would be too awkward to attempt to extract the metadata. - // For now construct the useless object then pass it into the delegate. - return map => onFit(new ToKeyFitResult(map)); - } - - private interface ICategoricalCol - { - PipelineColumn Input { get; } - Config Config { get; } - } - - private sealed class ImplScalar : Vector, ICategoricalCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplScalar(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class ImplVector : Vector, ICategoricalCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplVector(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class Rec : EstimatorReconciler - { - public static readonly Rec Inst = new Rec(); - - public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) - { - var infos = new OneHotEncodingEstimator.ColumnOptions[toOutput.Length]; - Action onFit = null; - for (int i = 0; i < toOutput.Length; ++i) - { - var tcol = (ICategoricalCol)toOutput[i]; - infos[i] = new OneHotEncodingEstimator.ColumnOptions(outputNames[toOutput[i]], inputNames[tcol.Input], (OneHotEncodingEstimator.OutputKind)tcol.Config.OutputKind, - tcol.Config.Max, 
(ValueToKeyMappingEstimator.KeyOrdinality)tcol.Config.Order); - if (tcol.Config.OnFit != null) - { - int ii = i; // Necessary because if we capture i that will change to toOutput.Length on call. - onFit += tt => tcol.Config.OnFit(tt.GetTermMap(ii)); - } - } - var est = new OneHotEncodingEstimator(env, infos); - if (onFit != null) - est.WrapTermWithDelegate(onFit); - return est; - } - } - - /// - /// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array. - /// - /// Incoming data. - /// Specify the output type of indicator array: array or binary encoded data. - /// How the Id for each value would be assigined: by occurrence or by value. - /// Maximum number of ids to keep during data scanning. - /// Called upon fitting with the learnt enumeration on the dataset. - public static Vector OneHotEncoding(this Scalar input, OneHotScalarOutputKind outputKind = (OneHotScalarOutputKind)DefOut, KeyOrdinality keyOrdinality = DefSort, - int maximumNumberOfItems = DefMax, ToKeyFitResult>.OnFit onFit = null) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplScalar(input, new Config((OneHotVectorOutputKind)outputKind, keyOrdinality, maximumNumberOfItems, Wrap(onFit))); - } - - /// - /// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array. - /// - /// Incoming data. - /// Specify the output type of indicator array: Multiarray, array or binary encoded data. - /// How the Id for each value would be assigined: by occurrence or by value. - /// Maximum number of ids to keep during data scanning. - /// Called upon fitting with the learnt enumeration on the dataset. 
- public static Vector OneHotEncoding(this Vector input, OneHotVectorOutputKind outputKind = DefOut, KeyOrdinality keyOrdinality = DefSort, int maximumNumberOfItems = DefMax, - ToKeyFitResult>.OnFit onFit = null) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplVector(input, new Config(outputKind, keyOrdinality, maximumNumberOfItems, Wrap(onFit))); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.cs deleted file mode 100644 index cf404e2eb3..0000000000 --- a/src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.cs +++ /dev/null @@ -1,267 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - public static partial class ConvertStaticExtensions - { - // Do not edit this file directly. Rather, it is generated out of ConvertStaticExtensions.tt. - #region For string inputs. - /// - /// Convert to float. - /// - /// The input column. - /// Float column. - public static Scalar ToFloat(this Scalar input) => new ImplScalar(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to array of floats. - /// - /// The input column. - /// Column with array of floats. - public static Vector ToFloat(this Vector input) => new ImplVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to variable array of floats. - /// - /// The input column. - /// Column with variable array of floats. - public static VarVector ToFloat(this VarVector input) => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - #endregion - #region For double inputs. - /// - /// Convert to float. - /// - /// The input column. - /// Float column. 
- public static Scalar ToFloat(this Scalar input) => new ImplScalar(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to array of floats. - /// - /// The input column. - /// Column with array of floats. - public static Vector ToFloat(this Vector input) => new ImplVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to variable array of floats. - /// - /// The input column. - /// Column with variable array of floats. - public static VarVector ToFloat(this VarVector input) => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - #endregion - #region For sbyte inputs. - /// - /// Convert to float. - /// - /// The input column. - /// Float column. - public static Scalar ToFloat(this Scalar input) => new ImplScalar(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to array of floats. - /// - /// The input column. - /// Column with array of floats. - public static Vector ToFloat(this Vector input) => new ImplVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to variable array of floats. - /// - /// The input column. - /// Column with variable array of floats. - public static VarVector ToFloat(this VarVector input) => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - #endregion - #region For short inputs. - /// - /// Convert to float. - /// - /// The input column. - /// Float column. - public static Scalar ToFloat(this Scalar input) => new ImplScalar(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to array of floats. - /// - /// The input column. - /// Column with array of floats. - public static Vector ToFloat(this Vector input) => new ImplVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to variable array of floats. - /// - /// The input column. 
- /// Column with variable array of floats. - public static VarVector ToFloat(this VarVector input) => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - #endregion - #region For int inputs. - /// - /// Convert to float. - /// - /// The input column. - /// Float column. - public static Scalar ToFloat(this Scalar input) => new ImplScalar(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to array of floats. - /// - /// The input column. - /// Column with array of floats. - public static Vector ToFloat(this Vector input) => new ImplVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to variable array of floats. - /// - /// The input column. - /// Column with variable array of floats. - public static VarVector ToFloat(this VarVector input) => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - #endregion - #region For long inputs. - /// - /// Convert to float. - /// - /// The input column. - /// Float column. - public static Scalar ToFloat(this Scalar input) => new ImplScalar(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to array of floats. - /// - /// The input column. - /// Column with array of floats. - public static Vector ToFloat(this Vector input) => new ImplVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to variable array of floats. - /// - /// The input column. - /// Column with variable array of floats. - public static VarVector ToFloat(this VarVector input) => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - #endregion - #region For byte inputs. - /// - /// Convert to float. - /// - /// The input column. - /// Float column. - public static Scalar ToFloat(this Scalar input) => new ImplScalar(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to array of floats. 
- /// - /// The input column. - /// Column with array of floats. - public static Vector ToFloat(this Vector input) => new ImplVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to variable array of floats. - /// - /// The input column. - /// Column with variable array of floats. - public static VarVector ToFloat(this VarVector input) => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - #endregion - #region For ushort inputs. - /// - /// Convert to float. - /// - /// The input column. - /// Float column. - public static Scalar ToFloat(this Scalar input) => new ImplScalar(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to array of floats. - /// - /// The input column. - /// Column with array of floats. - public static Vector ToFloat(this Vector input) => new ImplVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to variable array of floats. - /// - /// The input column. - /// Column with variable array of floats. - public static VarVector ToFloat(this VarVector input) => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - #endregion - #region For uint inputs. - /// - /// Convert to float. - /// - /// The input column. - /// Float column. - public static Scalar ToFloat(this Scalar input) => new ImplScalar(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to array of floats. - /// - /// The input column. - /// Column with array of floats. - public static Vector ToFloat(this Vector input) => new ImplVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to variable array of floats. - /// - /// The input column. - /// Column with variable array of floats. 
- public static VarVector ToFloat(this VarVector input) => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - #endregion - #region For ulong inputs. - /// - /// Convert to float. - /// - /// The input column. - /// Float column. - public static Scalar ToFloat(this Scalar input) => new ImplScalar(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to array of floats. - /// - /// The input column. - /// Column with array of floats. - public static Vector ToFloat(this Vector input) => new ImplVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to variable array of floats. - /// - /// The input column. - /// Column with variable array of floats. - public static VarVector ToFloat(this VarVector input) => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - #endregion - #region For bool inputs. - /// - /// Convert to float. - /// - /// The input column. - /// Float column. - public static Scalar ToFloat(this Scalar input) => new ImplScalar(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to array of floats. - /// - /// The input column. - /// Column with array of floats. - public static Vector ToFloat(this Vector input) => new ImplVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - /// - /// Convert to variable array of floats. - /// - /// The input column. - /// Column with variable array of floats. 
- public static VarVector ToFloat(this VarVector input) => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), InternalDataKind.R4); - - #endregion - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.tt b/src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.tt deleted file mode 100644 index 74620beab9..0000000000 --- a/src/Microsoft.ML.StaticPipe/ConvertStaticExtensions.tt +++ /dev/null @@ -1,61 +0,0 @@ -<#@ template debug="false" hostspecific="false" language="C#" #> -<#@ assembly name="System.Core" #> -<#@ import namespace="System.Linq" #> -<#@ import namespace="System.Text" #> -<#@ import namespace="System.Collections.Generic" #> -<#@ output extension=".cs" #> -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; - -namespace Microsoft.ML.StaticPipe -{ - public static partial class ConvertStaticExtensions - { - // Do not edit this file directly. Rather, it is generated out of ConvertStaticExtensions.tt. - <# - // Let's skip the time-based types for now. - foreach (string typeName in new string[] { "string", "double", "sbyte", "short", "int", "long", "byte", "ushort", "uint", "ulong", "bool" }) { - #> -#region For <#=typeName#> inputs. - <# - foreach (string columnKind in new string[] { "Scalar", "Vector", "VarVector" }) { - string returnType=""; - string convertTo=""; - switch(columnKind) - { - case "Scalar": - returnType = "Float column."; - convertTo = "float"; - break; - case "Vector": - returnType = "Column with array of floats."; - convertTo = "array of floats"; - break; - case "VarVector": - returnType = "Column with variable array of floats."; - convertTo = "variable array of floats"; - break; - } - #> -/// - /// Convert to <#=convertTo#>. 
- /// - /// The input column. - /// <#=returnType#> - public static <#=columnKind#> ToFloat(this <#=columnKind#><<#=typeName#>> input) => new Impl<#=columnKind#><<#=typeName#>>(Contracts.CheckRef(input, nameof(input)), DataKind.R4); - - <# - } - #> -#endregion - <# - } - #> - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.StaticPipe/DataLoadSaveOperationsExtensions.cs b/src/Microsoft.ML.StaticPipe/DataLoadSaveOperationsExtensions.cs deleted file mode 100644 index adc32764b6..0000000000 --- a/src/Microsoft.ML.StaticPipe/DataLoadSaveOperationsExtensions.cs +++ /dev/null @@ -1,41 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Data; -using static Microsoft.ML.StaticPipe.TextLoaderStatic; - -namespace Microsoft.ML.StaticPipe -{ - public static class DataLoadSaveOperationsExtensions - { - /// - /// Configures a loader for text files. - /// - /// The type shape parameter, which must be a valid-schema shape. As a practical - /// matter this is generally not explicitly defined from the user, but is instead inferred from the return - /// type of the where one takes an input and uses it to compose - /// a shape-type instance describing what the columns are and how to load them from the file. - /// The catalog. - /// The delegate that describes what fields to read from the text file, as well as - /// describing their input type. The way in which it works is that the delegate is fed a , - /// and the user composes a shape type with instances out of that . - /// The resulting data will have columns with the names corresponding to their names in the shape type. - /// Input files. - /// Data file has header with feature names. - /// Text field separator. 
- /// Whether the input -may include quoted values, which can contain separator - /// characters, colons, and distinguish empty values from missing values. When true, consecutive separators - /// denote a missing value and an empty value is denoted by "". When false, consecutive separators - /// denote an empty value. - /// Whether the input may include sparse representations. - /// Remove trailing whitespace from lines. - /// A configured statically-typed loader for text files. - public static DataLoader CreateTextLoader<[IsShape] TShape>( - this DataOperationsCatalog catalog, Func func, IMultiStreamSource files = null, - bool hasHeader = false, char separator = '\t', bool allowQuoting = true, bool allowSparse = true, - bool trimWhitspace = false) - => CreateLoader(catalog.GetEnvironment(), func, files, separator, hasHeader, allowQuoting, allowSparse, trimWhitspace); - } -} diff --git a/src/Microsoft.ML.StaticPipe/DataLoader.cs b/src/Microsoft.ML.StaticPipe/DataLoader.cs deleted file mode 100644 index db93b80405..0000000000 --- a/src/Microsoft.ML.StaticPipe/DataLoader.cs +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - public sealed class DataLoader : SchemaBearing - { - public IDataLoader AsDynamic { get; } - - internal DataLoader(IHostEnvironment env, IDataLoader loader, StaticSchemaShape shape) - : base(env, shape) - { - Env.AssertValue(loader); - - AsDynamic = loader; - Shape.Check(Env, AsDynamic.GetOutputSchema()); - } - - public DataLoaderEstimator> Append(Estimator estimator) - where TTrans : class, ITransformer - { - Contracts.Assert(nameof(Append) == nameof(CompositeLoaderEstimator.Append)); - - var loaderEst = AsDynamic.Append(estimator.AsDynamic); - return new DataLoaderEstimator>(Env, loaderEst, estimator.Shape); - } - - public DataLoader Append(Transformer transformer) - where TTransformer : class, ITransformer - { - Env.CheckValue(transformer, nameof(transformer)); - Env.Assert(nameof(Append) == nameof(CompositeLoaderEstimator.Append)); - - var loader = AsDynamic.Append(transformer.AsDynamic); - return new DataLoader(Env, loader, transformer.Shape); - } - - public DataView Load(TIn input) - { - // We cannot check the value of input since it may not be a reference type, and it is not clear - // that there is an absolute case for insisting that the input type be a reference type, and much - // less further that null inputs will never be correct. So we rely on the wrapping object to make - // that determination. - Env.Assert(nameof(Load) == nameof(IDataLoader.Load)); - - var data = AsDynamic.Load(input); - return new DataView(Env, data, Shape); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/DataLoaderEstimator.cs b/src/Microsoft.ML.StaticPipe/DataLoaderEstimator.cs deleted file mode 100644 index 1901eda49a..0000000000 --- a/src/Microsoft.ML.StaticPipe/DataLoaderEstimator.cs +++ /dev/null @@ -1,41 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
-// See the LICENSE file in the project root for more information. - -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - public sealed class DataLoaderEstimator : SchemaBearing - where TDataLoader : class, IDataLoader - { - public IDataLoaderEstimator AsDynamic { get; } - - internal DataLoaderEstimator(IHostEnvironment env, IDataLoaderEstimator estimator, StaticSchemaShape shape) - : base(env, shape) - { - Env.AssertValue(estimator); - - AsDynamic = estimator; - Shape.Check(Env, AsDynamic.GetOutputSchema()); - } - - public DataLoader Fit(TIn input) - { - Contracts.Assert(nameof(Fit) == nameof(IDataLoaderEstimator.Fit)); - - var loader = AsDynamic.Fit(input); - return new DataLoader(Env, loader, Shape); - } - - public DataLoaderEstimator> Append(Estimator est) - where TTrans : class, ITransformer - { - Contracts.Assert(nameof(Append) == nameof(CompositeLoaderEstimator.Append)); - - var loaderEst = AsDynamic.Append(est.AsDynamic); - return new DataLoaderEstimator>(Env, loaderEst, est.Shape); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/DataView.cs b/src/Microsoft.ML.StaticPipe/DataView.cs deleted file mode 100644 index c45ba3413a..0000000000 --- a/src/Microsoft.ML.StaticPipe/DataView.cs +++ /dev/null @@ -1,65 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System; -using System.Collections.Generic; -using System.Linq; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - public class DataView : SchemaBearing - { - public IDataView AsDynamic { get; } - - internal DataView(IHostEnvironment env, IDataView view, StaticSchemaShape shape) - : base(env, shape) - { - Env.AssertValue(view); - - AsDynamic = view; - Shape.Check(Env, AsDynamic.Schema); - } - - /// - /// This function return a whose columns are all cached in memory. - /// This returned is almost the same to the source . - /// The only difference are cache-related properties. - /// - public DataView Cache() - { - // Generate all column indexes in the source data. - var prefetched = Enumerable.Range(0, AsDynamic.Schema.Count).ToArray(); - // Create a cached version of the source data by caching all columns. - return new DataView(Env, new CacheDataView(Env, AsDynamic, prefetched), Shape); - } - } - - public static class DataViewExtensions - { - private static IEnumerable GetColumnCore(DataView data, Func column) - { - Contracts.CheckValue(data, nameof(data)); - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckValue(column, nameof(column)); - - var indexer = StaticPipeUtils.GetIndexer(data); - string columnName = indexer.Get(column(indexer.Indices)); - - var dynamicData = data.AsDynamic; - return dynamicData.GetColumn(dynamicData.Schema[columnName]); - } - - public static IEnumerable GetColumn(this DataView data, Func> column) - => GetColumnCore(data, column); - - public static IEnumerable GetColumn(this DataView data, Func> column) - => GetColumnCore(data, column); - - public static IEnumerable GetColumn(this DataView data, Func> column) - => GetColumnCore(data, column); - } -} diff --git a/src/Microsoft.ML.StaticPipe/Estimator.cs b/src/Microsoft.ML.StaticPipe/Estimator.cs deleted file mode 100644 index ed44944781..0000000000 --- a/src/Microsoft.ML.StaticPipe/Estimator.cs +++ 
/dev/null @@ -1,86 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - public sealed class Estimator : SchemaBearing - where TTransformer : class, ITransformer - { - public IEstimator AsDynamic { get; } - private readonly StaticSchemaShape _inShape; - - internal Estimator(IHostEnvironment env, IEstimator estimator, StaticSchemaShape inShape, StaticSchemaShape outShape) - : base(env, outShape) - { - Env.CheckValue(estimator, nameof(estimator)); - AsDynamic = estimator; - _inShape = inShape; - // Our ability to check estimators at constructor time is somewaht limited. During fit though we could. - // Fortunately, estimators are one of the least likely things that users will freqeuently declare the - // types of on their own. - } - - public Transformer Fit(DataView view) - { - Contracts.Assert(nameof(Fit) == nameof(IEstimator.Fit)); - _inShape.Check(Env, view.AsDynamic.Schema); - - var trans = AsDynamic.Fit(view.AsDynamic); - return new Transformer(Env, trans, _inShape, Shape); - } - - public Estimator Append(Estimator estimator) - { - Env.CheckValue(estimator, nameof(estimator)); - - var est = AsDynamic.Append(estimator.AsDynamic); - return new Estimator(Env, est, _inShape, estimator.Shape); - } - - public Estimator Append<[IsShape] TNewOutShape>(Func mapper) - { - Contracts.CheckValue(mapper, nameof(mapper)); - - using (var ch = Env.Start(nameof(Append))) - { - var method = mapper.Method; - - // Construct the dummy column structure, then apply the mapping. 
- var input = StaticPipeInternalUtils.MakeAnalysisInstance(out var fakeReconciler); - KeyValuePair[] inPairs = StaticPipeInternalUtils.GetNamesValues(input, method.GetParameters()[0]); - - // Initially we suppose we've only assigned names to the inputs. - var inputColToName = new Dictionary(); - foreach (var p in inPairs) - inputColToName[p.Value] = p.Key; - string NameMap(PipelineColumn col) - { - inputColToName.TryGetValue(col, out var val); - return val; - } - - var readerEst = StaticPipeUtils.GeneralFunctionAnalyzer(Env, ch, input, fakeReconciler, mapper, out var estTail, NameMap); - ch.Assert(readerEst == null); - ch.AssertValue(estTail); - - var est = AsDynamic.Append(estTail); - var newOut = StaticSchemaShape.Make(method.ReturnParameter); - return new Estimator(Env, est, _inShape, newOut); - } - } - - /// - /// Cache data produced in memory by this estimator. It may append an extra estimator to the this estimator - /// for caching. The newly added estimator would be returned. - /// - public Estimator AppendCacheCheckpoint() - { - return new Estimator(Env, AsDynamic.AppendCacheCheckpoint(Env), _inShape, Shape); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/EvaluatorStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/EvaluatorStaticExtensions.cs deleted file mode 100644 index 03b3ab7306..0000000000 --- a/src/Microsoft.ML.StaticPipe/EvaluatorStaticExtensions.cs +++ /dev/null @@ -1,315 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; -using Microsoft.ML.Trainers; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Extension methods for evaluation. - /// - public static class EvaluatorStaticExtensions - { - /// - /// Evaluates scored binary classification data. 
- /// - /// The shape type for the input data. - /// The binary classification catalog. - /// The data to evaluate. - /// The index delegate for the label column. - /// The index delegate for columns from calibrated prediction of a binary classifier. - /// Under typical scenarios, this will just be the same tuple of results returned from the trainer. - /// The evaluation results for these calibrated outputs. - public static CalibratedBinaryClassificationMetrics Evaluate( - this BinaryClassificationCatalog catalog, - DataView data, - Func> label, - Func score, Scalar probability, Scalar predictedLabel)> pred) - { - Contracts.CheckValue(data, nameof(data)); - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckValue(label, nameof(label)); - env.CheckValue(pred, nameof(pred)); - - var indexer = StaticPipeUtils.GetIndexer(data); - string labelName = indexer.Get(label(indexer.Indices)); - (var scoreCol, var probCol, var predCol) = pred(indexer.Indices); - env.CheckParam(scoreCol != null, nameof(pred), "Indexing delegate resulted in null score column."); - env.CheckParam(probCol != null, nameof(pred), "Indexing delegate resulted in null probability column."); - env.CheckParam(predCol != null, nameof(pred), "Indexing delegate resulted in null predicted label column."); - string scoreName = indexer.Get(scoreCol); - string probName = indexer.Get(probCol); - string predName = indexer.Get(predCol); - - var eval = new BinaryClassifierEvaluator(env, new BinaryClassifierEvaluator.Arguments() { }); - return eval.Evaluate(data.AsDynamic, labelName, scoreName, probName, predName); - } - - /// - /// Evaluates scored binary classification data and generates precision recall curve data. - /// - /// The shape type for the input data. - /// The binary classification catalog. - /// The data to evaluate. - /// The index delegate for the label column. - /// The index delegate for columns from calibrated prediction of a binary classifier. 
- /// Under typical scenarios, this will just be the same tuple of results returned from the trainer. - /// The generated precision recall curve data. Up to 100000 of samples are used for p/r curve generation. - /// The evaluation results for these calibrated outputs. - public static CalibratedBinaryClassificationMetrics EvaluateWithPRCurve( - this BinaryClassificationCatalog catalog, - DataView data, - Func> label, - Func score, Scalar probability, Scalar predictedLabel)> pred, - out List prCurve) - { - Contracts.CheckValue(data, nameof(data)); - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckValue(label, nameof(label)); - env.CheckValue(pred, nameof(pred)); - - var indexer = StaticPipeUtils.GetIndexer(data); - string labelName = indexer.Get(label(indexer.Indices)); - (var scoreCol, var probCol, var predCol) = pred(indexer.Indices); - env.CheckParam(scoreCol != null, nameof(pred), "Indexing delegate resulted in null score column."); - env.CheckParam(probCol != null, nameof(pred), "Indexing delegate resulted in null probability column."); - env.CheckParam(predCol != null, nameof(pred), "Indexing delegate resulted in null predicted label column."); - string scoreName = indexer.Get(scoreCol); - string probName = indexer.Get(probCol); - string predName = indexer.Get(predCol); - - var eval = new BinaryClassifierEvaluator(env, new BinaryClassifierEvaluator.Arguments() { NumRocExamples = 100000 }); - return eval.EvaluateWithPRCurve(data.AsDynamic, labelName, scoreName, probName, predName, out prCurve); - } - - /// - /// Evaluates scored binary classification data, if the predictions are not calibrated. - /// - /// The shape type for the input data. - /// The binary classification catalog. - /// The data to evaluate. - /// The index delegate for the label column. - /// The index delegate for columns from uncalibrated prediction of a binary classifier. 
- /// Under typical scenarios, this will just be the same tuple of results returned from the trainer. - /// The evaluation results for these uncalibrated outputs. - public static BinaryClassificationMetrics Evaluate( - this BinaryClassificationCatalog catalog, - DataView data, - Func> label, - Func score, Scalar predictedLabel)> pred) - { - Contracts.CheckValue(data, nameof(data)); - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckValue(label, nameof(label)); - env.CheckValue(pred, nameof(pred)); - - var indexer = StaticPipeUtils.GetIndexer(data); - string labelName = indexer.Get(label(indexer.Indices)); - (var scoreCol, var predCol) = pred(indexer.Indices); - Contracts.CheckParam(scoreCol != null, nameof(pred), "Indexing delegate resulted in null score column."); - Contracts.CheckParam(predCol != null, nameof(pred), "Indexing delegate resulted in null predicted label column."); - string scoreName = indexer.Get(scoreCol); - string predName = indexer.Get(predCol); - - var eval = new BinaryClassifierEvaluator(env, new BinaryClassifierEvaluator.Arguments() { }); - return eval.Evaluate(data.AsDynamic, labelName, scoreName, predName); - } - - /// - /// Evaluates scored binary classification data, if the predictions are not calibrated - /// and generates precision recall curve data. - /// - /// The shape type for the input data. - /// The binary classification catalog. - /// The data to evaluate. - /// The index delegate for the label column. - /// The index delegate for columns from uncalibrated prediction of a binary classifier. - /// Under typical scenarios, this will just be the same tuple of results returned from the trainer. - /// The generated precision recall curve data. Up to 100000 of samples are used for p/r curve generation. - /// The evaluation results for these uncalibrated outputs. 
- public static BinaryClassificationMetrics EvaluateWithPRCurve( - this BinaryClassificationCatalog catalog, - DataView data, - Func> label, - Func score, Scalar predictedLabel)> pred, - out List prCurve) - { - Contracts.CheckValue(data, nameof(data)); - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckValue(label, nameof(label)); - env.CheckValue(pred, nameof(pred)); - - var indexer = StaticPipeUtils.GetIndexer(data); - string labelName = indexer.Get(label(indexer.Indices)); - (var scoreCol, var predCol) = pred(indexer.Indices); - Contracts.CheckParam(scoreCol != null, nameof(pred), "Indexing delegate resulted in null score column."); - Contracts.CheckParam(predCol != null, nameof(pred), "Indexing delegate resulted in null predicted label column."); - string scoreName = indexer.Get(scoreCol); - string predName = indexer.Get(predCol); - - var eval = new BinaryClassifierEvaluator(env, new BinaryClassifierEvaluator.Arguments() { NumRocExamples = 100000 }); - return eval.EvaluateWithPRCurve(data.AsDynamic, labelName, scoreName, predName, out prCurve); - } - - /// - /// Evaluates scored clustering prediction data. - /// - /// The shape type for the input data. - /// The clustering catalog. - /// The data to evaluate. - /// The index delegate for the predicted score column. - /// The optional index delegate for the label column. - /// The optional index delegate for the features column. - /// The evaluation metrics. - public static ClusteringMetrics Evaluate( - this ClusteringCatalog catalog, - DataView data, - Func> score, - Func> label = null, - Func> features = null) - { - Contracts.CheckValue(data, nameof(data)); - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckValue(score, nameof(score)); - - var indexer = StaticPipeUtils.GetIndexer(data); - string scoreName = indexer.Get(score(indexer.Indices)); - - string labelName = (label != null)? 
indexer.Get(label(indexer.Indices)) : null; - string featuresName = (features!= null) ? indexer.Get(features(indexer.Indices)): null; - - var args = new ClusteringEvaluator.Arguments() { CalculateDbi = !string.IsNullOrEmpty(featuresName) }; - - return new ClusteringEvaluator(env, args).Evaluate(data.AsDynamic, scoreName, labelName, featuresName); - } - - /// - /// Evaluates scored multiclass classification data. - /// - /// The shape type for the input data. - /// The value type for the key label. - /// The multiclass classification catalog. - /// The data to evaluate. - /// The index delegate for the label column. - /// The index delegate for columns from the prediction of a multiclass classifier. - /// Under typical scenarios, this will just be the same tuple of results returned from the trainer. - /// If given a positive value, the will be filled with - /// the top-K accuracy, that is, the accuracy assuming we consider an example with the correct class within - /// the top-K values as being stored "correctly." - /// The evaluation metrics. 
- public static MulticlassClassificationMetrics Evaluate( - this MulticlassClassificationCatalog catalog, - DataView data, - Func> label, - Func score, Key predictedLabel)> pred, - int topKPredictionCount = 0) - { - Contracts.CheckValue(data, nameof(data)); - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckValue(label, nameof(label)); - env.CheckValue(pred, nameof(pred)); - env.CheckParam(topKPredictionCount >= 0, nameof(topKPredictionCount), "Must not be negative."); - - var indexer = StaticPipeUtils.GetIndexer(data); - string labelName = indexer.Get(label(indexer.Indices)); - (var scoreCol, var predCol) = pred(indexer.Indices); - Contracts.CheckParam(scoreCol != null, nameof(pred), "Indexing delegate resulted in null score column."); - Contracts.CheckParam(predCol != null, nameof(pred), "Indexing delegate resulted in null predicted label column."); - string scoreName = indexer.Get(scoreCol); - string predName = indexer.Get(predCol); - - var args = new MulticlassClassificationEvaluator.Arguments() { }; - if (topKPredictionCount > 0) - args.OutputTopKAcc = topKPredictionCount; - - var eval = new MulticlassClassificationEvaluator(env, args); - return eval.Evaluate(data.AsDynamic, labelName, scoreName, predName); - } - - private sealed class TrivialRegressionLossFactory : ISupportRegressionLossFactory - { - private readonly IRegressionLoss _loss; - public TrivialRegressionLossFactory(IRegressionLoss loss) => _loss = loss; - public IRegressionLoss CreateComponent(IHostEnvironment env) => _loss; - } - - /// - /// Evaluates scored regression data. - /// - /// The shape type for the input data. - /// The regression catalog. - /// The data to evaluate. - /// The index delegate for the label column. - /// The index delegate for predicted score column. - /// Potentially custom loss function. If left unspecified defaults to . - /// The evaluation metrics. 
- public static RegressionMetrics Evaluate( - this RegressionCatalog catalog, - DataView data, - Func> label, - Func> score, - IRegressionLoss lossFunction = null) - { - Contracts.CheckValue(data, nameof(data)); - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckValue(label, nameof(label)); - env.CheckValue(score, nameof(score)); - - var indexer = StaticPipeUtils.GetIndexer(data); - string labelName = indexer.Get(label(indexer.Indices)); - string scoreName = indexer.Get(score(indexer.Indices)); - - var args = new RegressionEvaluator.Arguments() { }; - if (lossFunction != null) - args.LossFunction = new TrivialRegressionLossFactory(lossFunction); - return new RegressionEvaluator(env, args).Evaluate(data.AsDynamic, labelName, scoreName); - } - - /// - /// Evaluates scored ranking data. - /// - /// The shape type for the input data. - /// The type of data, before being converted to a key. - /// The ranking catalog. - /// The data to evaluate. - /// The index delegate for the label column. - /// The index delegate for the groupId column. - /// The index delegate for predicted score column. - /// The evaluation metrics. 
- public static RankingMetrics Evaluate( - this RankingCatalog catalog, - DataView data, - Func> label, - Func> groupId, - Func> score) - { - Contracts.CheckValue(data, nameof(data)); - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckValue(label, nameof(label)); - env.CheckValue(groupId, nameof(groupId)); - env.CheckValue(score, nameof(score)); - - var indexer = StaticPipeUtils.GetIndexer(data); - string labelName = indexer.Get(label(indexer.Indices)); - string scoreName = indexer.Get(score(indexer.Indices)); - string groupIdName = indexer.Get(groupId(indexer.Indices)); - - var args = new RankingEvaluator.Arguments() { }; - - return new RankingEvaluator(env, args).Evaluate(data.AsDynamic, labelName, groupIdName, scoreName); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/FactorizationMachineStatic.cs b/src/Microsoft.ML.StaticPipe/FactorizationMachineStatic.cs deleted file mode 100644 index 69d726907f..0000000000 --- a/src/Microsoft.ML.StaticPipe/FactorizationMachineStatic.cs +++ /dev/null @@ -1,137 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.Linq; -using Microsoft.ML.Data; -using Microsoft.ML.Internal.Utilities; -using Microsoft.ML.Runtime; -using Microsoft.ML.Trainers; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Extension methods and utilities for instantiating FFM trainer estimators inside statically typed pipelines. - /// - public static class FactorizationMachineExtensions - { - /// - /// Predict a target using a field-aware factorization machine. - /// - /// The binary classifier catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. 
- /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. - /// This delegate will receive the model that was trained. The type of the model is . - /// Note that this action cannot change the result in any way; it is only a way for the caller to be informed about what was learnt. - /// The predicted output. - public static (Scalar score, Scalar predictedLabel) FieldAwareFactorizationMachine(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, Vector[] features, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckNonEmpty(features, nameof(features)); - - Contracts.CheckValueOrNull(onFit); - - var rec = new CustomReconciler((env, labelCol, featureCols) => - { - var trainer = new FieldAwareFactorizationMachineTrainer(env, featureCols, labelCol); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - else - return trainer; - }, label, features); - return rec.Output; - } - - /// - /// Predict a target using a field-aware factorization machine. - /// - /// The binary classifier catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// Advanced arguments to the algorithm. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. - /// This delegate will receive the model that was trained. The type of the model is . - /// Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. 
- public static (Scalar score, Scalar predictedLabel) FieldAwareFactorizationMachine(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, Vector[] features, - FieldAwareFactorizationMachineTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckNonEmpty(features, nameof(features)); - - Contracts.CheckValueOrNull(options); - Contracts.CheckValueOrNull(onFit); - - var rec = new CustomReconciler((env, labelCol, featureCols) => - { - var trainer = new FieldAwareFactorizationMachineTrainer(env, options); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - else - return trainer; - }, label, features); - return rec.Output; - } - - private sealed class CustomReconciler : TrainerEstimatorReconciler - { - private static readonly string[] _fixedOutputNames = new[] { DefaultColumnNames.Score, DefaultColumnNames.PredictedLabel }; - private readonly Func> _factory; - - /// - /// The general output for binary classifiers. - /// - public (Scalar score, Scalar predictedLabel) Output { get; } - - /// - /// The output columns. - /// - protected override IEnumerable Outputs { get; } - - public CustomReconciler(Func> factory, Scalar label, Vector[] features) - : base(MakeInputs(Contracts.CheckRef(label, nameof(label)), Contracts.CheckRef(features, nameof(features))), _fixedOutputNames) - { - Contracts.AssertValue(factory); - _factory = factory; - - Output = (new Impl(this), new ImplBool(this)); - Outputs = new PipelineColumn[] { Output.score, Output.predictedLabel }; - } - - private static PipelineColumn[] MakeInputs(Scalar label, Vector[] features) - => new PipelineColumn[] { label }.Concat(features).ToArray(); - - protected override IEstimator ReconcileCore(IHostEnvironment env, string[] inputNames) - { - Contracts.AssertValue(env); - env.Assert(Utils.Size(inputNames) == Inputs.Length); - - // First input is label, rest are features. 
- return _factory(env, inputNames[0], inputNames.Skip(1).ToArray()); - } - - private sealed class Impl : Scalar - { - public Impl(CustomReconciler rec) : base(rec, rec.Inputs) { } - } - - private sealed class ImplBool : Scalar - { - public ImplBool(CustomReconciler rec) : base(rec, rec.Inputs) { } - } - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/ImageStaticPipe.cs b/src/Microsoft.ML.StaticPipe/ImageStaticPipe.cs deleted file mode 100644 index d9fbf782e6..0000000000 --- a/src/Microsoft.ML.StaticPipe/ImageStaticPipe.cs +++ /dev/null @@ -1,173 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Drawing; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms.Image; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// A type used in the generic argument to . We must simultaneously distinguish - /// between a of fixed (with ) and unfixed (with this type), - /// in the static pipelines. - /// - public class UnknownSizeBitmap { private UnknownSizeBitmap() { } } - - /// - /// Extension methods for the static-pipeline over objects. - /// - public static class ImageStaticPipe - { - /// - /// Load an image from an input column that holds the paths to images. - /// - /// The scalar text column that holds paths to the images - /// If specified, paths are considered to be relative to this directory. - /// However, since the transform can be persisted across machines, it is generally considered more - /// safe for users to simply always make their input paths absolute. 
- /// The loaded images - /// - public static Custom LoadAsImage(this Scalar path, string relativeTo = null) - { - Contracts.CheckValue(path, nameof(path)); - Contracts.CheckValueOrNull(relativeTo); - return new ImageLoadingStaticExtensions.OutPipelineColumn(path, relativeTo); - } - - /// - /// Converts the image to grayscale. - /// - /// The image to convert - /// The grayscale images - /// - public static Custom AsGrayscale(this Custom input) - { - Contracts.CheckValue(input, nameof(input)); - return new ImageGreyScalingStaticExtensions.OutPipelineColumn(input); - } - - /// - /// Converts the image to grayscale. - /// - /// The image to convert - /// The grayscale images - /// - public static Custom AsGrayscale(this Custom input) - { - Contracts.CheckValue(input, nameof(input)); - return new ImageGreyScalingStaticExtensions.OutPipelineColumn(input); - } - - /// - /// Given a column of images of unfixed size, resize the images so they have uniform size. - /// - /// The input images - /// The width to resize to - /// The height to resize to - /// The type of resizing to do - /// If cropping is necessary, at what position will the image be fixed? 
- /// The now uniformly sized images - /// - public static Custom Resize(this Custom input, int width, int height, - ImageResizingEstimator.ResizingKind resizing = ImageResizingEstimator.ResizingKind.IsoCrop, - ImageResizingEstimator.Anchor cropAnchor = ImageResizingEstimator.Anchor.Center) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckParam(width > 0, nameof(width), "Must be positive"); - Contracts.CheckParam(height > 0, nameof(height), "Must be positive"); - Contracts.CheckParam(Enum.IsDefined(typeof(ImageResizingEstimator.ResizingKind), resizing), nameof(resizing), "Undefined value detected"); - Contracts.CheckParam(Enum.IsDefined(typeof(ImageResizingEstimator.Anchor), cropAnchor), nameof(cropAnchor), "Undefined value detected"); - - return new ImageResizingStaticExtensions.OutPipelineColumn(input, width, height, resizing, cropAnchor); - } - - /// - /// Given a column of images, resize them to a new fixed size. - /// - /// The input images - /// The width to resize to - /// The height to resize to - /// The type of resizing to do - /// If cropping is necessary, at what - /// The resized images - /// - public static Custom Resize(this Custom input, int width, int height, - ImageResizingEstimator.ResizingKind resizing = ImageResizingEstimator.ResizingKind.IsoCrop, - ImageResizingEstimator.Anchor cropAnchor = ImageResizingEstimator.Anchor.Center) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckParam(width > 0, nameof(width), "Must be positive"); - Contracts.CheckParam(height > 0, nameof(height), "Must be positive"); - Contracts.CheckParam(Enum.IsDefined(typeof(ImageResizingEstimator.ResizingKind), resizing), nameof(resizing), "Undefined value detected"); - Contracts.CheckParam(Enum.IsDefined(typeof(ImageResizingEstimator.Anchor), cropAnchor), nameof(cropAnchor), "Undefined value detected"); - - return new ImageResizingStaticExtensions.OutPipelineColumn(input, width, height, resizing, cropAnchor); - } - - /// - /// Vectorizes 
the image as the numeric values of its pixels converted and possibly transformed to floating point values. - /// The output vector is output in height then width major order, with the channels being the most minor (if - /// is true) or major (if is false) dimension. - /// - /// The input image to extract - /// Whether the alpha channel should be extracted - /// Whether the red channel should be extracted - /// Whether the green channel should be extracted - /// Whether the blue channel should be extracted - /// In which order extract channels. - /// Whether the pixel values should be interleaved, as opposed to being separated by channel - /// Scale the normally 0 through 255 pixel values by this amount - /// Add this amount to the pixel values, before scaling - /// The vectorized image - /// - public static Vector ExtractPixels(this Custom input, bool useAlpha = false, bool useRed = true, - bool useGreen = true, bool useBlue = true, ImagePixelExtractingEstimator.ColorsOrder order = ImagePixelExtractingEstimator.Defaults.Order, bool interleave = false, float scale = 1.0f, float offset = 0.0f) - { - var colParams = new ImagePixelExtractingTransformer.Column - { - UseAlpha = useAlpha, - UseRed = useRed, - UseGreen = useGreen, - UseBlue = useBlue, - Interleave = interleave, - Scale = scale, - Offset = offset, - Convert = true - }; - return new ImagePixelExtractingStaticExtensions.OutPipelineColumn(input, colParams); - } - - /// - /// Vectorizes the image as the numeric byte values of its pixels. - /// The output vector is output in height then width major order, with the channels being the most minor (if - /// is true) or major (if is false) dimension. - /// - /// The input image to extract - /// Whether the alpha channel should be extracted - /// Whether the red channel should be extracted - /// Whether the green channel should be extracted - /// Whether the blue channel should be extracted - /// In which order extract channels. 
- /// Whether the pixel values should be interleaved, as opposed to being separated by channel - /// The vectorized image - /// - public static Vector ExtractPixelsAsBytes(this Custom input, bool useAlpha = false, bool useRed = true, - bool useGreen = true, bool useBlue = true, ImagePixelExtractingEstimator.ColorsOrder order = ImagePixelExtractingEstimator.Defaults.Order, bool interleave = false) - { - var colParams = new ImagePixelExtractingTransformer.Column - { - UseAlpha = useAlpha, - UseRed = useRed, - UseGreen = useGreen, - UseBlue = useBlue, - Interleave = interleave, - Convert = false - }; - return new ImagePixelExtractingStaticExtensions.OutPipelineColumn(input, colParams); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/ImageTransformsStatic.cs b/src/Microsoft.ML.StaticPipe/ImageTransformsStatic.cs deleted file mode 100644 index cee882025a..0000000000 --- a/src/Microsoft.ML.StaticPipe/ImageTransformsStatic.cs +++ /dev/null @@ -1,246 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.Drawing; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms.Image; - -namespace Microsoft.ML.StaticPipe -{ - public static class ImageLoadingStaticExtensions - { - internal sealed class OutPipelineColumn : Custom - { - private readonly Scalar _input; - - public OutPipelineColumn(Scalar path, string relativeTo) - : base(new Reconciler(relativeTo), path) - { - Contracts.AssertValue(path); - _input = path; - } - - /// - /// Reconciler to an for the . - /// - /// - /// We must create a new reconciler per call, because the relative path of - /// is considered a transform-wide option, as it is not specified in . 
However, we still - /// implement so the analyzer can still equate two of these things if they happen to share the same - /// path, so we can be a bit more efficient with respect to our estimator declarations. - /// - /// - private sealed class Reconciler : EstimatorReconciler, IEquatable - { - private readonly string _relTo; - - public Reconciler(string relativeTo) - { - Contracts.AssertValueOrNull(relativeTo); - _relTo = relativeTo; - } - - public bool Equals(Reconciler other) - => other != null && other._relTo == _relTo; - - public override bool Equals(object obj) - => obj is Reconciler other && Equals(other); - - public override int GetHashCode() - => _relTo?.GetHashCode() ?? 0; - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var cols = new (string outputColumnName, string inputColumnName)[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var outCol = (OutPipelineColumn)toOutput[i]; - cols[i] = (outputNames[outCol], inputNames[outCol._input]); - } - return new ImageLoadingEstimator(env, _relTo, cols); - } - } - } - } - - public static class ImageGreyScalingStaticExtensions - { - private interface IColInput - { - PipelineColumn Input { get; } - } - - internal sealed class OutPipelineColumn : Custom, IColInput - { - public PipelineColumn Input { get; } - - public OutPipelineColumn(Custom input) - : base(Reconciler.Inst, input) - { - Contracts.AssertValue(input); - Contracts.Assert(typeof(T) == typeof(Bitmap) || typeof(T) == typeof(UnknownSizeBitmap)); - Input = input; - } - } - - /// - /// Reconciler to an for the . 
- /// - /// Because we want to use the same reconciler for - /// - /// - private sealed class Reconciler : EstimatorReconciler - { - public static Reconciler Inst = new Reconciler(); - - private Reconciler() { } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var cols = new (string outputColumnName, string inputColumnName)[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var outCol = (IColInput)toOutput[i]; - cols[i] = (outputNames[toOutput[i]], inputNames[outCol.Input]); - } - return new ImageGrayscalingEstimator(env, cols); - } - } - } - - public static class ImageResizingStaticExtensions - { - internal sealed class OutPipelineColumn : Custom - { - private readonly PipelineColumn _input; - private readonly int _width; - private readonly int _height; - private readonly ImageResizingEstimator.ResizingKind _resizing; - private readonly ImageResizingEstimator.Anchor _cropAnchor; - - public OutPipelineColumn(PipelineColumn input, int width, int height, - ImageResizingEstimator.ResizingKind resizing, ImageResizingEstimator.Anchor cropAnchor) - : base(Reconciler.Inst, input) - { - Contracts.AssertValue(input); - _input = input; - _width = width; - _height = height; - _resizing = resizing; - _cropAnchor = cropAnchor; - } - - private ImageResizingEstimator.ColumnOptions MakeColumnOptions(string outputColumnName, string inputColumnName) - => new ImageResizingEstimator.ColumnOptions(outputColumnName, _width, _height, inputColumnName, _resizing, _cropAnchor); - - /// - /// Reconciler to an for the . 
- /// - /// - /// - private sealed class Reconciler : EstimatorReconciler - { - public static Reconciler Inst = new Reconciler(); - - private Reconciler() - { - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var cols = new ImageResizingEstimator.ColumnOptions[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var outCol = (OutPipelineColumn)toOutput[i]; - cols[i] = outCol.MakeColumnOptions(outputNames[outCol], inputNames[outCol._input]); - } - return new ImageResizingEstimator(env, cols); - } - } - } - } - - public static class ImagePixelExtractingStaticExtensions - { - private interface IColInput - { - Custom Input { get; } - - ImagePixelExtractingEstimator.ColumnOptions MakeColumnOptions(string outputColumnName, string inputColumnName); - } - - internal sealed class OutPipelineColumn : Vector, IColInput - { - public Custom Input { get; } - private static readonly ImagePixelExtractingTransformer.Options _defaultArgs = new ImagePixelExtractingTransformer.Options(); - private readonly ImagePixelExtractingTransformer.Column _colParam; - - public OutPipelineColumn(Custom input, ImagePixelExtractingTransformer.Column col) - : base(Reconciler.Inst, input) - { - Contracts.AssertValue(input); - Contracts.Assert(typeof(T) == typeof(float) || typeof(T) == typeof(byte)); - Input = input; - _colParam = col; - } - - public ImagePixelExtractingEstimator.ColumnOptions MakeColumnOptions(string outputColumnName, string inputColumnName) - { - // In principle, the analyzer should only call the the reconciler once for these columns. 
- Contracts.Assert(_colParam.Source == null); - Contracts.Assert(_colParam.Name == null); - - _colParam.Name = outputColumnName; - _colParam.Source = inputColumnName; - return new ImagePixelExtractingEstimator.ColumnOptions(_colParam, _defaultArgs); - } - } - - /// - /// Reconciler to an for the . - /// - /// Because we want to use the same reconciler for - /// - /// - private sealed class Reconciler : EstimatorReconciler - { - /// - /// Because there are no global settings that cannot be overridden, we can always just use the same reconciler. - /// - public static Reconciler Inst = new Reconciler(); - - private Reconciler() { } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var cols = new ImagePixelExtractingEstimator.ColumnOptions[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var outCol = (IColInput)toOutput[i]; - cols[i] = outCol.MakeColumnOptions(outputNames[toOutput[i]], inputNames[outCol.Input]); - } - return new ImagePixelExtractingEstimator(env, cols); - } - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/KMeansStatic.cs b/src/Microsoft.ML.StaticPipe/KMeansStatic.cs deleted file mode 100644 index 8918102717..0000000000 --- a/src/Microsoft.ML.StaticPipe/KMeansStatic.cs +++ /dev/null @@ -1,98 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Runtime; -using Microsoft.ML.Trainers; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// The trainer context extensions for the . - /// - public static class KMeansClusteringExtensions - { - /// - /// KMeans extension method. - /// - /// The clustering catalog trainer object. - /// The features, or independent variables. 
- /// The optional example weights. - /// The number of clusters to use for KMeans. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. - public static (Vector score, Key predictedLabel) KMeans(this ClusteringCatalog.ClusteringTrainers catalog, - Vector features, Scalar weights = null, - int clustersCount = KMeansTrainer.Defaults.NumberOfClusters, - Action onFit = null) - { - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckParam(clustersCount > 1, nameof(clustersCount), "If provided, must be greater than 1."); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.Clustering( - (env, featuresName, weightsName) => - { - var options = new KMeansTrainer.Options - { - FeatureColumnName = featuresName, - NumberOfClusters = clustersCount, - ExampleWeightColumnName = weightsName - }; - - var trainer = new KMeansTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - else - return trainer; - }, features, weights); - - return rec.Output; - } - - /// - /// KMeans extension method. - /// - /// The regression catalog trainer object. - /// The features, or independent variables. - /// The optional example weights. - /// Algorithm advanced settings. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. 
- public static (Vector score, Key predictedLabel) KMeans(this ClusteringCatalog.ClusteringTrainers catalog, - Vector features, Scalar weights, - KMeansTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValueOrNull(onFit); - Contracts.CheckValue(options, nameof(options)); - - var rec = new TrainerEstimatorReconciler.Clustering( - (env, featuresName, weightsName) => - { - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new KMeansTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - else - return trainer; - }, features, weights); - - return rec.Output; - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/LbfgsStatic.cs b/src/Microsoft.ML.StaticPipe/LbfgsStatic.cs deleted file mode 100644 index 47086d0933..0000000000 --- a/src/Microsoft.ML.StaticPipe/LbfgsStatic.cs +++ /dev/null @@ -1,323 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Calibrators; -using Microsoft.ML.Runtime; -using Microsoft.ML.Trainers; - -namespace Microsoft.ML.StaticPipe -{ - using Options = LbfgsLogisticRegressionBinaryTrainer.Options; - - /// - /// Binary Classification trainer estimators. - /// - public static class LbfgsBinaryStaticExtensions - { - /// - /// Predict a target using a linear binary classification model trained with the trainer. - /// - /// The binary classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// Enforce non-negative weights. - /// Weight of L1 regularization term. - /// Weight of L2 regularization term. - /// Memory size for . Low=faster, less accurate. - /// Threshold for optimizer convergence. 
- /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. - public static (Scalar score, Scalar probability, Scalar predictedLabel) LbfgsLogisticRegression(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, - Vector features, - Scalar weights = null, - float l1Regularization = Options.Defaults.L1Regularization, - float l2Regularization = Options.Defaults.L2Regularization, - float optimizationTolerance = Options.Defaults.OptimizationTolerance, - int historySize = Options.Defaults.HistorySize, - bool enforceNonNegativity = Options.Defaults.EnforceNonNegativity, - Action> onFit = null) - { - LbfgsStaticUtils.ValidateParams(label, features, weights, l1Regularization, l2Regularization, optimizationTolerance, historySize, enforceNonNegativity, onFit); - - var rec = new TrainerEstimatorReconciler.BinaryClassifier( - (env, labelName, featuresName, weightsName) => - { - var trainer = new LbfgsLogisticRegressionBinaryTrainer(env, labelName, featuresName, weightsName, - l1Regularization, l2Regularization, optimizationTolerance, historySize, enforceNonNegativity); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a linear binary classification model trained with the trainer. - /// - /// The binary classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. 
This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// Advanced arguments to the algorithm. - /// The predicted output. - public static (Scalar score, Scalar probability, Scalar predictedLabel) LbfgsLogisticRegression(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, - Vector features, - Scalar weights, - Options options, - Action> onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValue(options, nameof(options)); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.BinaryClassifier( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new LbfgsLogisticRegressionBinaryTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - - }, label, features, weights); - - return rec.Output; - } - } - - /// - /// Regression trainer estimators. - /// - public static class LbfgsRegressionStaticExtensions - { - /// - /// Predict a target using a linear regression model trained with the trainer. - /// - /// The regression catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// Enforce non-negative weights. - /// Weight of L1 regularization term. - /// Weight of L2 regularization term. - /// Memory size for . Low=faster, less accurate. - /// Threshold for optimizer convergence. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. 
Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. - public static Scalar LbfgsPoissonRegression(this RegressionCatalog.RegressionTrainers catalog, - Scalar label, - Vector features, - Scalar weights = null, - float l1Regularization = Options.Defaults.L1Regularization, - float l2Regularization = Options.Defaults.L2Regularization, - float optimizationTolerance = Options.Defaults.OptimizationTolerance, - int historySize = Options.Defaults.HistorySize, - bool enforceNonNegativity = Options.Defaults.EnforceNonNegativity, - Action onFit = null) - { - LbfgsStaticUtils.ValidateParams(label, features, weights, l1Regularization, l2Regularization, optimizationTolerance, historySize, enforceNonNegativity, onFit); - - var rec = new TrainerEstimatorReconciler.Regression( - (env, labelName, featuresName, weightsName) => - { - var trainer = new LbfgsPoissonRegressionTrainer(env, labelName, featuresName, weightsName, - l1Regularization, l2Regularization, optimizationTolerance, historySize, enforceNonNegativity); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - - return trainer; - }, label, features, weights); - - return rec.Score; - } - - /// - /// Predict a target using a linear regression model trained with the trainer. - /// - /// The regression catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// Advanced arguments to the algorithm. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. 
- public static Scalar LbfgsPoissonRegression(this RegressionCatalog.RegressionTrainers catalog, - Scalar label, - Vector features, - Scalar weights, - LbfgsPoissonRegressionTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValue(options, nameof(options)); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.Regression( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new LbfgsPoissonRegressionTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - - return trainer; - }, label, features, weights); - - return rec.Score; - } - } - - /// - /// Multiclass Classification trainer estimators. - /// - public static class LbfgsMulticlassStaticExtensions - { - /// - /// Predict a target using a maximum entropy classification model trained with the L-BFGS method implemented in . - /// - /// The multiclass classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// Enforce non-negative weights. - /// Weight of L1 regularization term. - /// Weight of L2 regularization term. - /// Memory size for . Low=faster, less accurate. - /// Threshold for optimizer convergence. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. 
- /// The set of output columns including in order the predicted per-class likelihoods (between 0 and 1, and summing up to 1), and the predicted label. - public static (Vector score, Key predictedLabel) - LbfgsMaximumEntropy(this MulticlassClassificationCatalog.MulticlassClassificationTrainers catalog, - Key label, - Vector features, - Scalar weights = null, - float l1Regularization = Options.Defaults.L1Regularization, - float l2Regularization = Options.Defaults.L2Regularization, - float optimizationTolerance = Options.Defaults.OptimizationTolerance, - int historySize = Options.Defaults.HistorySize, - bool enforceNonNegativity = Options.Defaults.EnforceNonNegativity, - Action onFit = null) - { - LbfgsStaticUtils.ValidateParams(label, features, weights, l1Regularization, l2Regularization, optimizationTolerance, historySize, enforceNonNegativity, onFit); - - var rec = new TrainerEstimatorReconciler.MulticlassClassificationReconciler( - (env, labelName, featuresName, weightsName) => - { - var trainer = new LbfgsMaximumEntropyMulticlassTrainer(env, labelName, featuresName, weightsName, - l1Regularization, l2Regularization, optimizationTolerance, historySize, enforceNonNegativity); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a maximum entropy classification model trained with the L-BFGS method implemented in . - /// - /// The multiclass classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// Advanced arguments to the algorithm. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. 
Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted per-class likelihoods (between 0 and 1, and summing up to 1), and the predicted label. - public static (Vector score, Key predictedLabel) - LbfgsMaximumEntropy(this MulticlassClassificationCatalog.MulticlassClassificationTrainers catalog, - Key label, - Vector features, - Scalar weights, - LbfgsMaximumEntropyMulticlassTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValue(options, nameof(options)); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.MulticlassClassificationReconciler( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new LbfgsMaximumEntropyMulticlassTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Output; - } - } - - internal static class LbfgsStaticUtils - { - internal static void ValidateParams(PipelineColumn label, - Vector features, - Scalar weights = null, - float l1Regularization = Options.Defaults.L1Regularization, - float l2Regularization = Options.Defaults.L2Regularization, - float optimizationTolerance = Options.Defaults.OptimizationTolerance, - int historySize = Options.Defaults.HistorySize, - bool enforceNonNegativity = Options.Defaults.EnforceNonNegativity, - Delegate onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckParam(l2Regularization >= 0, nameof(l2Regularization), "Must be non-negative"); - Contracts.CheckParam(l1Regularization >= 
0, nameof(l1Regularization), "Must be non-negative"); - Contracts.CheckParam(optimizationTolerance > 0, nameof(optimizationTolerance), "Must be positive"); - Contracts.CheckParam(historySize > 0, nameof(historySize), "Must be positive"); - Contracts.CheckValueOrNull(onFit); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs deleted file mode 100644 index 3173519493..0000000000 --- a/src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs +++ /dev/null @@ -1,172 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms.Text; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Information on the result of fitting a LDA transform. - /// - public sealed class LatentDirichletAllocationFitResult - { - /// - /// For user defined delegates that accept instances of the containing type. 
- /// - /// - public delegate void OnFit(LatentDirichletAllocationFitResult result); - - public LatentDirichletAllocationTransformer.ModelParameters LdaTopicSummary; - public LatentDirichletAllocationFitResult(LatentDirichletAllocationTransformer.ModelParameters ldaTopicSummary) - { - LdaTopicSummary = ldaTopicSummary; - } - } - - public static class LatentDirichletAllocationStaticExtensions - { - private struct Config - { - public readonly int NumberOfTopics; - public readonly Single AlphaSum; - public readonly Single Beta; - public readonly int SamplingStepCount; - public readonly int MaximumNumberOfIterations; - public readonly int LikelihoodInterval; - public readonly int NumberOfThreads; - public readonly int MaximumTokenCountPerDocument; - public readonly int NumberOfSummaryTermsPerTopic; - public readonly int NumberOfBurninIterations; - public readonly bool ResetRandomGenerator; - - public readonly Action OnFit; - - public Config(int numberOfTopics, Single alphaSum, Single beta, int samplingStepCount, int maximumNumberOfIterations, int likelihoodInterval, - int numberOfThreads, int maximumTokenCountPerDocument, int numberOfSummaryTermsPerTopic, int numberOfBurninIterations, bool resetRandomGenerator, - Action onFit) - { - NumberOfTopics = numberOfTopics; - AlphaSum = alphaSum; - Beta = beta; - SamplingStepCount = samplingStepCount; - MaximumNumberOfIterations = maximumNumberOfIterations; - LikelihoodInterval = likelihoodInterval; - NumberOfThreads = numberOfThreads; - MaximumTokenCountPerDocument = maximumTokenCountPerDocument; - NumberOfSummaryTermsPerTopic = numberOfSummaryTermsPerTopic; - NumberOfBurninIterations = numberOfBurninIterations; - ResetRandomGenerator = resetRandomGenerator; - - OnFit = onFit; - } - } - - private static Action Wrap(LatentDirichletAllocationFitResult.OnFit onFit) - { - if (onFit == null) - return null; - - return ldaTopicSummary => onFit(new LatentDirichletAllocationFitResult(ldaTopicSummary)); - } - - private interface ILdaCol 
- { - PipelineColumn Input { get; } - Config Config { get; } - } - - private sealed class ImplVector : Vector, ILdaCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplVector(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class Rec : EstimatorReconciler - { - public static readonly Rec Inst = new Rec(); - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var infos = new LatentDirichletAllocationEstimator.ColumnOptions[toOutput.Length]; - Action onFit = null; - for (int i = 0; i < toOutput.Length; ++i) - { - var tcol = (ILdaCol)toOutput[i]; - - infos[i] = new LatentDirichletAllocationEstimator.ColumnOptions(outputNames[toOutput[i]], - inputNames[tcol.Input], - tcol.Config.NumberOfTopics, - tcol.Config.AlphaSum, - tcol.Config.Beta, - tcol.Config.SamplingStepCount, - tcol.Config.MaximumNumberOfIterations, - tcol.Config.LikelihoodInterval, - tcol.Config.NumberOfThreads, - tcol.Config.MaximumTokenCountPerDocument, - tcol.Config.NumberOfSummaryTermsPerTopic, - tcol.Config.NumberOfBurninIterations, - tcol.Config.ResetRandomGenerator); - - if (tcol.Config.OnFit != null) - { - int ii = i; // Necessary because if we capture i that will change to toOutput.Length on call. - onFit += tt => tcol.Config.OnFit(tt.GetLdaDetails(ii)); - } - } - - var est = new LatentDirichletAllocationEstimator(env, infos); - if (onFit == null) - return est; - - return est.WithOnFitDelegate(onFit); - } - } - - /// - /// A vector of floats representing the document. - /// The number of topics. - /// Dirichlet prior on document-topic vectors. - /// Dirichlet prior on vocab-topic vectors. - /// Number of Metropolis Hasting step. - /// Number of iterations. - /// Compute log likelihood over local dataset on this iteration interval. 
- /// The number of training threads. Default value depends on number of logical processors. - /// The threshold of maximum count of tokens per doc. - /// The number of words to summarize the topic. - /// The number of burn-in iterations. - /// Reset the random number generator for each document. - /// Called upon fitting with the learnt enumeration on the dataset. - public static Vector LatentDirichletAllocation(this Vector input, - int numberOfTopics = LatentDirichletAllocationEstimator.Defaults.NumberOfTopics, - Single alphaSum = LatentDirichletAllocationEstimator.Defaults.AlphaSum, - Single beta = LatentDirichletAllocationEstimator.Defaults.Beta, - int samplingStepCount = LatentDirichletAllocationEstimator.Defaults.SamplingStepCount, - int maximumNumberOfIterations = LatentDirichletAllocationEstimator.Defaults.MaximumNumberOfIterations, - int likelihoodInterval = LatentDirichletAllocationEstimator.Defaults.LikelihoodInterval, - int numberOfThreads = LatentDirichletAllocationEstimator.Defaults.NumberOfThreads, - int maximumTokenCountPerDocument = LatentDirichletAllocationEstimator.Defaults.MaximumTokenCountPerDocument, - int numberOfSummaryTermsPerTopic = LatentDirichletAllocationEstimator.Defaults.NumberOfSummaryTermsPerTopic, - int numberOfBurninIterations = LatentDirichletAllocationEstimator.Defaults.NumberOfBurninIterations, - bool resetRandomGenerator = LatentDirichletAllocationEstimator.Defaults.ResetRandomGenerator, - LatentDirichletAllocationFitResult.OnFit onFit = null) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplVector(input, - new Config(numberOfTopics, alphaSum, beta, samplingStepCount, maximumNumberOfIterations, likelihoodInterval, numberOfThreads, maximumTokenCountPerDocument, numberOfSummaryTermsPerTopic, - numberOfBurninIterations, resetRandomGenerator, Wrap(onFit))); - } - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.StaticPipe/LocalPathReader.cs b/src/Microsoft.ML.StaticPipe/LocalPathReader.cs deleted 
file mode 100644 index cc01516152..0000000000 --- a/src/Microsoft.ML.StaticPipe/LocalPathReader.cs +++ /dev/null @@ -1,21 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Data; -using Microsoft.ML.Data; - -namespace Microsoft.ML.StaticPipe -{ - public static class LocalPathReader - { - - /// - /// Reads data from one or more file into an . - /// - /// The loader to use. - /// One or more paths from which to load data. - public static DataView Load(this DataLoader loader, params string[] path) - => loader.Load(new MultiFileSource(path)); - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.StaticPipe/LpNormalizerStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/LpNormalizerStaticExtensions.cs deleted file mode 100644 index a3f9cddc9c..0000000000 --- a/src/Microsoft.ML.StaticPipe/LpNormalizerStaticExtensions.cs +++ /dev/null @@ -1,62 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Collections.Generic; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Extensions for statically typed . 
- /// - public static class LpNormNormalizerStaticExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly Vector Input; - - public OutPipelineColumn(Vector input, LpNormNormalizingEstimatorBase.NormFunction norm, bool ensureZeroMean) - : base(new Reconciler(norm, ensureZeroMean), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly LpNormNormalizingEstimatorBase.NormFunction _norm; - private readonly bool _ensureZeroMean; - - public Reconciler(LpNormNormalizingEstimatorBase.NormFunction norm, bool ensureZeroMean) - { - _norm = norm; - _ensureZeroMean = ensureZeroMean; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var pairs = new List<(string outputColumnName, string inputColumnName)>(); - foreach (var outCol in toOutput) - pairs.Add((outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input])); - - return new LpNormNormalizingEstimator(env, pairs.ToArray(), _norm, _ensureZeroMean); - } - } - - /// - /// The column containing the vectors to apply the normalization to. - /// Type of norm to use to normalize each sample. - /// Subtract mean from each value before normalizing. 
- public static Vector NormalizeLpNorm(this Vector input, - LpNormNormalizingEstimatorBase.NormFunction norm = LpNormNormalizingEstimatorBase.Defaults.Norm, - bool ensureZeroMean = LpNormNormalizingEstimatorBase.Defaults.LpEnsureZeroMean) => new OutPipelineColumn(input, norm, ensureZeroMean); - } -} diff --git a/src/Microsoft.ML.StaticPipe/MatrixFactorizationStatic.cs b/src/Microsoft.ML.StaticPipe/MatrixFactorizationStatic.cs deleted file mode 100644 index 0408ca85d9..0000000000 --- a/src/Microsoft.ML.StaticPipe/MatrixFactorizationStatic.cs +++ /dev/null @@ -1,107 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; -using Microsoft.ML.Trainers; -using Microsoft.ML.Trainers.Recommender; - -namespace Microsoft.ML.StaticPipe -{ - public static class MatrixFactorizationExtensions - { - /// - /// Predict matrix entry using matrix factorization - /// - /// The type of physical value of matrix's row and column index. It must be an integer type such as uint. - /// The regression catalog trainer object. - /// The label variable. - /// The column index of the considered matrix. - /// The row index of the considered matrix. - /// Advanced algorithm settings. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the model that was trained. Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. 
- public static Scalar MatrixFactorization(this RegressionCatalog.RegressionTrainers catalog, - Scalar label, Key matrixColumnIndex, Key matrixRowIndex, - MatrixFactorizationTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(matrixColumnIndex, nameof(matrixColumnIndex)); - Contracts.CheckValue(matrixRowIndex, nameof(matrixRowIndex)); - Contracts.CheckValue(options, nameof(options)); - Contracts.CheckValueOrNull(onFit); - - var rec = new MatrixFactorizationReconciler((env, labelColName, matrixColumnIndexColName, matrixRowIndexColName) => - { - options.MatrixColumnIndexColumnName = matrixColumnIndexColName; - options.MatrixRowIndexColumnName = matrixRowIndexColName; - options.LabelColumnName = labelColName; - - var trainer = new MatrixFactorizationTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - else - return trainer; - }, label, matrixColumnIndex, matrixRowIndex); - return rec.Output; - } - - private sealed class MatrixFactorizationReconciler : TrainerEstimatorReconciler - { - // Output column name of the trained estimator. - private static string FixedOutputName => DefaultColumnNames.Score; - - // A function used to create trainer of matrix factorization. It instantiates a trainer by indicating the - // expected inputs and output (IDataView's) column names. That trainer has a Fit(IDataView data) for learning - // a MatrixFactorizationPredictionTransformer from the data. - private readonly Func> _factory; - - /// - /// The only output produced by matrix factorization predictor - /// - public Scalar Output { get; } - - /// - /// The output columns. 
- /// - protected override IEnumerable Outputs { get; } - - public MatrixFactorizationReconciler(Func> factory, - Scalar label, Key matColumnIndex, Key matRowIndex) - : base(MakeInputs(Contracts.CheckRef(label, nameof(label)), Contracts.CheckRef(matColumnIndex, nameof(matColumnIndex)), Contracts.CheckRef(matRowIndex, nameof(matRowIndex))), - new string[] { FixedOutputName }) - { - Contracts.AssertValue(factory); - _factory = factory; - - Output = new Impl(this); - Outputs = new PipelineColumn[] { Output }; - } - - private static PipelineColumn[] MakeInputs(Scalar label, PipelineColumn matrixRowIndex, PipelineColumn matrixColumnIndex) - => new PipelineColumn[] { label, matrixRowIndex, matrixColumnIndex }; - - protected override IEstimator ReconcileCore(IHostEnvironment env, string[] inputNames) - { - Contracts.AssertValue(env); - - // The first, second, third names are label, matrix's column index, and matrix's row index, respectively. - return _factory(env, inputNames[0], inputNames[1], inputNames[2]); - } - - private sealed class Impl : Scalar - { - public Impl(MatrixFactorizationReconciler rec) : base(rec, rec.Inputs) { } - } - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.csproj b/src/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.csproj deleted file mode 100644 index 7669ba03ec..0000000000 --- a/src/Microsoft.ML.StaticPipe/Microsoft.ML.StaticPipe.csproj +++ /dev/null @@ -1,48 +0,0 @@ - - - - netstandard2.0 - Microsoft.ML.StaticPipe - - - - - - - - - - - - - - - ConvertStaticExtensions.tt - True - True - - - TermStaticExtensions.tt - TermStaticExtensions.cs - True - TextTemplatingFileGenerator - True - - - - - - ConvertStaticExtensions.cs - TextTemplatingFileGenerator - - - TermStaticExtensions.cs - TextTemplatingFileGenerator - - - - - - - - diff --git a/src/Microsoft.ML.StaticPipe/MulticlassNaiveBayesStatic.cs b/src/Microsoft.ML.StaticPipe/MulticlassNaiveBayesStatic.cs deleted file mode 100644 index 1c65d58fd6..0000000000 --- 
a/src/Microsoft.ML.StaticPipe/MulticlassNaiveBayesStatic.cs +++ /dev/null @@ -1,51 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Runtime; -using Microsoft.ML.Trainers; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Multiclass Classification trainer estimators. - /// - public static partial class MulticlassClassificationStaticExtensions - { - /// - /// Predict a target using a linear multiclass classification model trained with the trainer. - /// - /// The multiclass classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted per-class likelihoods (between 0 and 1, and summing up to 1), and the predicted label. 
- public static (Vector score, Key predictedLabel) - MulticlassNaiveBayesTrainer(this MulticlassClassificationCatalog.MulticlassClassificationTrainers catalog, - Key label, - Vector features, - Action onFit = null) - { - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.MulticlassClassificationReconciler( - (env, labelName, featuresName, weightsName) => - { - var trainer = new NaiveBayesMulticlassTrainer(env, labelName, featuresName); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, null); - - return rec.Output; - } - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.StaticPipe/NormalizerStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/NormalizerStaticExtensions.cs deleted file mode 100644 index f571dbc5ea..0000000000 --- a/src/Microsoft.ML.StaticPipe/NormalizerStaticExtensions.cs +++ /dev/null @@ -1,376 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.Collections.Immutable; -using Microsoft.ML.Data; -using Microsoft.ML.Internal.Utilities; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Extension methods for static pipelines for normalization of data. - /// - public static class NormalizerStaticExtensions - { - private const long MaxTrain = NormalizingEstimator.Defaults.MaximumExampleCount; - - /// - /// Learns an affine function based on the minimum and maximum, so that all values between the minimum and - /// maximum observed during fitting fall into the range of -1 to 1. - /// - /// The input column. 
- /// If set to false, then the observed minimum and maximum during fitting - /// will map to -1 and 1 respectively, exactly. If however set to true, then 0 will always map to 0. - /// This is valuable for the sake of sparsity preservation, if normalizing sparse vectors. - /// When gathering statistics only look at most this many examples. - /// A delegate that can be called whenever the function is fit, with the learned slopes - /// and, if is false, the offsets as well. - /// Note that the statistics gathering and normalization is done independently per slot of the - /// vector values. - /// Note that if values are later transformed that are lower than the minimum, or higher than the maximum, - /// observed during fitting, that the output values may be outside the range of -1 to 1. - /// The normalized column. - public static NormVector Normalize( - this Vector input, bool ensureZeroUntouched = NormalizingEstimator.Defaults.EnsureZeroUntouched, - long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount, - OnFitAffine> onFit = null) - { - return NormalizeByMinMaxCore(input, ensureZeroUntouched, maximumExampleCount, onFit); - } - - /// - /// Learns an affine function based on the minimum and maximum, so that all values between the minimum and - /// maximum observed during fitting fall into the range of -1 to 1. - /// - /// The column containing the vectors to apply the normalization to. - /// If set to false, then the observed minimum and maximum during fitting - /// will map to -1 and 1 respectively, exactly. If however set to true, then 0 will always map to 0. - /// This is valuable for the sake of sparsity preservation, if normalizing sparse vectors. - /// When gathering statistics only look at most this many examples. - /// A delegate called whenever the estimator is fit, with the learned slopes - /// and, if is false, the offsets as well. 
- /// Note that the statistics gathering and normalization is done independently per slot of the - /// vector values. - /// Note that if values are later transformed that are lower than the minimum, or higher than the maximum, - /// observed during fitting, that the output values may be outside the range of -1 to 1. - /// The normalized column. - public static NormVector Normalize( - this Vector input, bool ensureZeroUntouched = NormalizingEstimator.Defaults.EnsureZeroUntouched, - long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount, - OnFitAffine> onFit = null) - { - return NormalizeByMinMaxCore(input, ensureZeroUntouched, maximumExampleCount, onFit); - } - - private static NormVector NormalizeByMinMaxCore(Vector input, bool ensureZeroUntouched, long maximumExampleCount, - OnFitAffine> onFit) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckParam(maximumExampleCount > 1, nameof(maximumExampleCount), "Must be greater than 1"); - return new Impl(input, (name, src) => new NormalizingEstimator.MinMaxColumnOptions(name, src, maximumExampleCount, ensureZeroUntouched), AffineMapper(onFit)); - } - - // We have a slightly different breaking up of categories of normalizers versus the dynamic API. Both the mean-var and - // CDF normalizers are initialized in the same way because they gather exactly the same statistics, but from the point of - // view of the static API what is more important is the type of mapping that winds up being computed. - - /// - /// Learns an affine function based on the observed mean and standard deviation. This is less susceptible - /// to outliers as compared to . - /// - /// The column containing the vectors to apply the normalization to. - /// If set to true then the offset will always be considered zero. - /// If set to true then we transform over the logarithm of the values, rather - /// than just the raw values. If this is set to true then is ignored. 
- /// When gathering statistics only look at most this many examples. - /// A delegate called whenever the estimator is fit, with the learned slopes - /// and, if is false, the offsets as well. - /// Note that the statistics gathering and normalization is done independently per slot of the - /// vector values. - /// The normalized column. - public static NormVector NormalizeMeanVariance( - this Vector input, bool ensureZeroUntouched = NormalizingEstimator.Defaults.EnsureZeroUntouched, - bool useLog = false, long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount, - OnFitAffine> onFit = null) - { - return NormalizeByMVCdfCore(input, ensureZeroUntouched, useLog, false, maximumExampleCount, AffineMapper(onFit)); - } - - /// - /// Learns an affine function based on the observed mean and standard deviation. This is less susceptible - /// to outliers as compared to . - /// - /// The column containing the vectors to apply the normalization to. - /// If set to true then the offset will always be considered zero. - /// If set to true then we transform over the logarithm of the values, rather - /// than just the raw values. If this is set to true then is ignored. - /// When gathering statistics only look at most this many examples. - /// A delegate called whenever the estimator is fit, with the learned slopes - /// and, if is false, the offsets as well. - /// Note that the statistics gathering and normalization is done independently per slot of the - /// vector values. - /// The normalized column. 
- public static NormVector NormalizeMeanVariance( - this Vector input, bool ensureZeroUntouched = NormalizingEstimator.Defaults.EnsureZeroUntouched, - bool useLog = false, long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount, - OnFitAffine> onFit = null) - { - return NormalizeByMVCdfCore(input, ensureZeroUntouched, useLog, false, maximumExampleCount, AffineMapper(onFit)); - } - - /// - /// Learns a function based on the cumulative density function of a normal distribution parameterized by - /// a mean and variance as observed during fitting. - /// - /// The column containing the vectors to apply the normalization to. - /// If set to false, then the learned distributional parameters will be - /// adjusted in such a way as to ensure that the input 0 maps to the output 0. - /// This is valuable for the sake of sparsity preservation, if normalizing sparse vectors. - /// If set to true then we transform over the logarithm of the values, rather - /// than just the raw values. If this is set to true then is ignored. - /// When gathering statistics only look at most this many examples. - /// A delegate called whenever the estimator is fit, with the learned mean and standard - /// deviation for all slots. - /// Note that the statistics gathering and normalization is done independently per slot of the - /// vector values. - /// The normalized column. - public static NormVector NormalizeByCumulativeDistribution( - this Vector input, bool ensureZeroUntouched = NormalizingEstimator.Defaults.EnsureZeroUntouched, - bool useLog = false, long maximumNumberOfExamples = NormalizingEstimator.Defaults.MaximumExampleCount, - OnFitCumulativeDistribution> onFit = null) - { - return NormalizeByMVCdfCore(input, ensureZeroUntouched, useLog, true, maximumNumberOfExamples, CdfMapper(onFit)); - } - - /// - /// Learns a function based on the cumulative density function of a normal distribution parameterized by - /// a mean and variance as observed during fitting. 
- /// - /// The column containing the vectors to apply the normalization to. - /// If set to false, then the learned distributional parameters will be - /// adjusted in such a way as to ensure that the input 0 maps to the output 0. - /// This is valuable for the sake of sparsity preservation, if normalizing sparse vectors. - /// If set to true then we transform over the logarithm of the values, rather - /// than just the raw values. If this is set to true then is ignored. - /// When gathering statistics only look at most this many examples. - /// A delegate called whenever the estimator is fit, with the learned mean and standard - /// deviation for all slots. - /// Note that the statistics gathering and normalization is done independently per slot of the - /// vector values. - /// The normalized column. - public static NormVector NormalizeByCumulativeDistribution( - this Vector input, bool ensureZeroUntouched = NormalizingEstimator.Defaults.EnsureZeroUntouched, - bool useLog = false, long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount, - OnFitCumulativeDistribution> onFit = null) - { - return NormalizeByMVCdfCore(input, ensureZeroUntouched, useLog, true, maximumExampleCount, CdfMapper(onFit)); - } - - private static NormVector NormalizeByMVCdfCore(Vector input, bool ensureZeroUntouched, bool useLog, bool useCdf, long maximumExampleCount, Action onFit) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckParam(maximumExampleCount > 1, nameof(maximumExampleCount), "Must be greater than 1"); - return new Impl(input, (name, src) => - { - if (useLog) - return new NormalizingEstimator.LogMeanVarianceColumnOptions(name, src, maximumExampleCount, useCdf); - return new NormalizingEstimator.MeanVarianceColumnOptions(name, src, maximumExampleCount, ensureZeroUntouched, useCdf); - }, onFit); - } - - /// - /// Learns a function based on a discretization of the input values. 
The observed values for each slot are - /// analyzed, and the range of numbers is partitioned into monotonically increasing bins. An attempt is made - /// to make these bins equal in population, but under some circumstances this may be impossible (for example, a slot - /// with a very dominant mode). The way the mapping works is, if there are N bins in a slot, and a value - /// falls in the range of bin n (indexed from 0), the output value is n / (N - 1), and then possibly - /// subtracting off the binned value for what 0 would have been if is true. - /// - /// The column containing the vectors to apply the normalization to. - /// The maximum number of discretization points to learn per slot. - /// Normally the output is in the range of 0 to 1, but if set to true, then what - /// would have been the output for a zero input is subtracted off the value. - /// This is valuable for the sake of sparsity preservation, if normalizing sparse vectors. - /// When gathering statistics only look at most this many examples. - /// A delegate called whenever the estimator is fit, with the bin upper bounds for each slot. - /// Note that the statistics gathering and normalization is done independently per slot of the - /// vector values. - /// The normalized column. - public static NormVector NormalizeByBinning( - this Vector input, int maximumBinCount = NormalizingEstimator.Defaults.MaximumBinCount, - bool ensureZeroUntouched = NormalizingEstimator.Defaults.EnsureZeroUntouched, - long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount, - OnFitBinned> onFit = null) - { - return NormalizeByBinningCore(input, maximumBinCount, ensureZeroUntouched, maximumExampleCount, onFit); - } - - /// - /// Learns a function based on a discretization of the input values. The observed values for each slot are - /// analyzed, and the range of numbers is partitioned into monotonically increasing bins. 
An attempt is made - /// to make these bins equal in population, but under some circumstances this may be impossible (for example, a slot - /// with a very dominant mode). The way the mapping works is, if there are N bins in a slot, and a value - /// falls in the range of bin n (indexed from 0), the output value is n / (N - 1), and then possibly - /// subtracting off the binned value for what 0 would have been if is true. - /// - /// The column containing the vectors to apply the normalization to. - /// The maximum number of discretization points to learn per slot. - /// Normally the output is in the range of 0 to 1, but if set to true, then what - /// would have been the output for a zero input is subtracted off the value. - /// This is valuable for the sake of sparsity preservation, if normalizing sparse vectors. - /// When gathering statistics only look at most this many examples. - /// A delegate called whenever the estimator is fit, with the bin upper bounds for each slot. - /// Note that the statistics gathering and normalization is done independently per slot of the - /// vector values. - /// The normalized column. 
- public static NormVector NormalizeByBinning( - this Vector input, int maximumBinCount = NormalizingEstimator.Defaults.MaximumBinCount, - bool ensureZeroUntouched = NormalizingEstimator.Defaults.EnsureZeroUntouched, - long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount, - OnFitBinned> onFit = null) - { - return NormalizeByBinningCore(input, maximumBinCount, ensureZeroUntouched, maximumExampleCount, onFit); - } - - private static NormVector NormalizeByBinningCore(Vector input, int maximumBinCount, bool ensureZeroUntouched, long maximumExampleCount, - OnFitBinned> onFit) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckParam(maximumBinCount > 1, nameof(maximumExampleCount), "Must be greater than 1"); - Contracts.CheckParam(maximumExampleCount > 1, nameof(maximumExampleCount), "Must be greater than 1"); - return new Impl(input, (name, src) => new NormalizingEstimator.BinningColumnOptions(name, src, maximumExampleCount, ensureZeroUntouched, maximumBinCount), BinMapper(onFit)); - } - - /// - /// For user provided delegates to receive information when an affine normalizer is fitted. - /// The function of the normalizer transformer is (input - offset) * scale. - /// - /// The data type being received, either a numeric type, or a sequence of the numeric type - /// The scales. In the scalar case, this is a single value. In the vector case this is of length equal - /// to the number of slots. - /// The offsets. In the scalar case, this is a single value. In the vector case this is of length equal - /// to the number of slots, or of length zero if all the offsets are zero. - public delegate void OnFitAffine(TData scale, TData offset); - - /// - /// For user provided delegates to receive information when a cumulative distribution function normalizer is fitted. - /// - /// The data type being received, either a numeric type, or a sequence of the numeric type - /// The mean value. In the scalar case, this is a single value. 
In the vector case this is of length equal - /// to the number of slots. - /// The standard deviation. In the scalar case, this is a single value. In the vector case - /// this is of length equal to the number of slots. - public delegate void OnFitCumulativeDistribution(TData mean, TData standardDeviation); - - /// - /// For user provided delegates to receive information when a binning normalizer is fitted. - /// The function fo the normalizer transformer is, given a value, find its index in the upper bounds, then divide that value - /// by the number of upper bounds minus 1, so as to scale the index between 0 and 1. Then, if zero had been fixed, subtract - /// off the value that would have been computed by the above procedure for the value zero. - /// - /// The data type being received, either a numeric type, or a sequence of the numeric type - /// For a scalar column a single sequence of the bin upper bounds. For a vector, the same, but - /// for all slots. - public delegate void OnFitBinned(ImmutableArray upperBounds); - - #region Implementation support - private delegate NormalizingEstimator.ColumnOptionsBase CreateNormCol(string outputColumnName, string inputColumnName); - - private sealed class Rec : EstimatorReconciler - { - // All settings are self contained in the columns. 
- public static readonly Rec Inst = new Rec(); - - public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) - { - var cols = new NormalizingEstimator.ColumnOptionsBase[toOutput.Length]; - List<(int idx, Action onFit)> onFits = null; - - for (int i = 0; i < toOutput.Length; ++i) - { - var col = (INormColCreator)toOutput[i]; - cols[i] = col.CreateNormCol(outputNames[toOutput[i]], inputNames[col.Input]); - if (col.OnFit != null) - Utils.Add(ref onFits, (i, col.OnFit)); - } - var norm = new NormalizingEstimator(env, cols); - if (Utils.Size(onFits) == 0) - return norm; - return norm.WithOnFitDelegate(normTrans => - { - Contracts.Assert(normTrans.ColumnFunctions.Count == toOutput.Length); - foreach ((int idx, Action onFit) in onFits) - onFit(normTrans.ColumnFunctions[idx]); - }); - } - } - - private static Action AffineMapper(OnFitAffine onFit) - { - Contracts.AssertValueOrNull(onFit); - if (onFit == null) - return null; - return col => - { - var aCol = (NormalizingTransformer.AffineNormalizerModelParameters)col?.GetNormalizerModelParams(); - onFit(aCol.Scale, aCol.Offset); - }; - } - - private static Action CdfMapper(OnFitCumulativeDistribution onFit) - { - Contracts.AssertValueOrNull(onFit); - if (onFit == null) - return null; - return col => - { - var aCol = (NormalizingTransformer.CdfNormalizerModelParameters)col?.GetNormalizerModelParams(); - onFit(aCol.Mean, aCol.StandardDeviation); - }; - } - - private static Action BinMapper(OnFitBinned onFit) - { - Contracts.AssertValueOrNull(onFit); - if (onFit == null) - return null; - return col => - { - var aCol = (NormalizingTransformer.BinNormalizerModelParameters)col?.GetNormalizerModelParams(); - onFit(aCol.UpperBounds); - }; - } - - private interface INormColCreator - { - CreateNormCol CreateNormCol { get; } - PipelineColumn Input { get; } - Action OnFit { get; } - } - - private sealed class 
Impl : NormVector, INormColCreator - { - public PipelineColumn Input { get; } - public CreateNormCol CreateNormCol { get; } - public Action OnFit { get; } - - public Impl(Vector input, CreateNormCol del, Action onFitDel) - : base(Rec.Inst, input) - { - Contracts.AssertValue(input); - Contracts.AssertValue(del); - Contracts.AssertValueOrNull(onFitDel); - Input = input; - CreateNormCol = del; - OnFit = onFitDel; - } - } - #endregion - } -} diff --git a/src/Microsoft.ML.StaticPipe/OnlineLearnerStatic.cs b/src/Microsoft.ML.StaticPipe/OnlineLearnerStatic.cs deleted file mode 100644 index 5e3f478e67..0000000000 --- a/src/Microsoft.ML.StaticPipe/OnlineLearnerStatic.cs +++ /dev/null @@ -1,259 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Runtime; -using Microsoft.ML.Trainers; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Binary Classification trainer estimators. - /// - public static class AveragedPerceptronStaticExtensions - { - /// - /// Predict a target using a linear binary classification model trained with the AveragedPerceptron trainer, and a custom loss. - /// - /// The binary classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The custom loss. - /// The optional example weights. - /// The learning Rate. - /// Decrease learning rate as iterations progress. - /// L2 regularization weight. - /// Number of training iterations through the data. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained, as well as the calibrator on top of that model. 
Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), and the predicted label. - /// . - /// - /// - /// - /// - public static (Scalar score, Scalar predictedLabel) AveragedPerceptron( - this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, - Vector features, - Scalar weights = null, - IClassificationLoss lossFunction = null, - float learningRate = AveragedLinearOptions.AveragedDefault.LearningRate, - bool decreaseLearningRate = AveragedLinearOptions.AveragedDefault.DecreaseLearningRate, - float l2Regularization = AveragedLinearOptions.AveragedDefault.L2Regularization, - int numIterations = AveragedLinearOptions.AveragedDefault.NumberOfIterations, - Action onFit = null - ) - { - OnlineLinearStaticUtils.CheckUserParams(label, features, weights, learningRate, l2Regularization, numIterations, onFit); - - bool hasProbs = lossFunction is LogLoss; - - var rec = new TrainerEstimatorReconciler.BinaryClassifierNoCalibration( - (env, labelName, featuresName, weightsName) => - { - - var trainer = new AveragedPerceptronTrainer(env, labelName, featuresName, lossFunction, - learningRate, decreaseLearningRate, l2Regularization, numIterations); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - else - return trainer; - - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a linear binary classification model trained with the AveragedPerceptron trainer, and a custom loss. - /// - /// The binary classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The custom loss. - /// The optional example weights. - /// Advanced arguments to the algorithm. 
- /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained, as well as the calibrator on top of that model. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), and the predicted label. - /// . - /// - /// - /// - /// - public static (Scalar score, Scalar predictedLabel) AveragedPerceptron( - this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, - Vector features, - Scalar weights, - IClassificationLoss lossFunction, - AveragedPerceptronTrainer.Options options, - Action onFit = null - ) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckValueOrNull(options); - Contracts.CheckValueOrNull(onFit); - - bool hasProbs = lossFunction is LogLoss; - - var rec = new TrainerEstimatorReconciler.BinaryClassifierNoCalibration( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - - var trainer = new AveragedPerceptronTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - else - return trainer; - - }, label, features, weights); - - return rec.Output; - } - } - - /// - /// Regression trainer estimators. - /// - public static class OnlineGradientDescentExtensions - { - /// - /// Predict a target using a linear regression model trained with the trainer. - /// - /// The regression catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// The custom loss. 
Defaults to if not provided. - /// The learning Rate. - /// Decrease learning rate as iterations progress. - /// L2 regularization weight. - /// Number of training iterations through the data. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained, as well as the calibrator on top of that model. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), and the predicted label. - /// . - /// The predicted output. - public static Scalar OnlineGradientDescent(this RegressionCatalog.RegressionTrainers catalog, - Scalar label, - Vector features, - Scalar weights = null, - IRegressionLoss lossFunction = null, - float learningRate = OnlineGradientDescentTrainer.Options.OgdDefaultArgs.LearningRate, - bool decreaseLearningRate = OnlineGradientDescentTrainer.Options.OgdDefaultArgs.DecreaseLearningRate, - float l2Regularization = OnlineGradientDescentTrainer.Options.OgdDefaultArgs.L2Regularization, - int numIterations = OnlineLinearOptions.OnlineDefault.NumberOfIterations, - Action onFit = null) - { - OnlineLinearStaticUtils.CheckUserParams(label, features, weights, learningRate, l2Regularization, numIterations, onFit); - Contracts.CheckValueOrNull(lossFunction); - - var rec = new TrainerEstimatorReconciler.Regression( - (env, labelName, featuresName, weightsName) => - { - var trainer = new OnlineGradientDescentTrainer(env, labelName, featuresName, learningRate, - decreaseLearningRate, l2Regularization, numIterations, lossFunction); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - - return trainer; - }, label, features, weights); - - return rec.Score; - } - - /// - /// Predict a target 
using a linear regression model trained with the trainer. - /// - /// The regression catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// Advanced arguments to the algorithm. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained, as well as the calibrator on top of that model. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), and the predicted label. - /// . - /// The predicted output. - public static Scalar OnlineGradientDescent(this RegressionCatalog.RegressionTrainers catalog, - Scalar label, - Vector features, - Scalar weights, - OnlineGradientDescentTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckValue(options, nameof(options)); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.Regression( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - - var trainer = new OnlineGradientDescentTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - - return trainer; - }, label, features, weights); - - return rec.Score; - } - } - - internal static class OnlineLinearStaticUtils - { - - internal static void CheckUserParams(PipelineColumn label, - PipelineColumn features, - PipelineColumn weights, - float learningRate, - float l2RegularizerWeight, - int numIterations, - Delegate onFit) - { - 
Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckParam(learningRate > 0, nameof(learningRate), "Must be positive."); - Contracts.CheckParam(0 <= l2RegularizerWeight && l2RegularizerWeight < 0.5, nameof(l2RegularizerWeight), "must be in range [0, 0.5)"); - Contracts.CheckParam(numIterations > 0, nameof(numIterations), "Must be positive, if specified."); - Contracts.CheckValueOrNull(onFit); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/PipelineColumn.cs b/src/Microsoft.ML.StaticPipe/PipelineColumn.cs deleted file mode 100644 index 36b4364f86..0000000000 --- a/src/Microsoft.ML.StaticPipe/PipelineColumn.cs +++ /dev/null @@ -1,158 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// This class is used as a type marker for producing structures for use in the statically - /// typed columnate pipeline building helper API. Users will not create these structures directly. Rather components - /// will implement (hidden) subclasses of one of this classes subclasses (for example, , - /// ), which will contain information that the builder API can use to construct an actual - /// sequence of objects. - /// - public abstract class PipelineColumn - { - internal readonly Reconciler ReconcilerObj; - internal readonly PipelineColumn[] Dependencies; - - private protected PipelineColumn(Reconciler reconciler, PipelineColumn[] dependencies) - { - Contracts.CheckValue(reconciler, nameof(reconciler)); - Contracts.CheckValueOrNull(dependencies); - - ReconcilerObj = reconciler; - Dependencies = dependencies; - } - } - - /// - /// For representing a non-key, non-vector . - /// - /// The scalar item type. 
- public abstract class Scalar : PipelineColumn - { - protected Scalar(Reconciler reconciler, params PipelineColumn[] dependencies) - : base(reconciler, dependencies) - { - } - - public override string ToString() => $"{nameof(Scalar)}<{typeof(T).Name}>"; - } - - /// - /// For representing a of known length. - /// - /// The vector item type. - public abstract class Vector : PipelineColumn - { - protected Vector(Reconciler reconciler, params PipelineColumn[] dependencies) - : base(reconciler, dependencies) - { - } - - public override string ToString() => $"{nameof(Vector)}<{typeof(T).Name}>"; - } - - /// - /// For representing a that is normalized, that is, its - /// value is set with the value true. - /// - /// The vector item type. - public abstract class NormVector : Vector - { - protected NormVector(Reconciler reconciler, params PipelineColumn[] dependencies) - : base(reconciler, dependencies) - { - } - - public override string ToString() => $"{nameof(NormVector)}<{typeof(T).Name}>"; - } - - /// - /// For representing a of unknown length. - /// - /// The vector item type. - public abstract class VarVector : PipelineColumn - { - protected VarVector(Reconciler reconciler, params PipelineColumn[] dependencies) - : base(reconciler, dependencies) - { - } - - public override string ToString() => $"{nameof(VarVector)}<{typeof(T).Name}>"; - } - - /// - /// For representing a of known cardinality, where the type of key is not specified. - /// - /// The physical type representing the key, which should always be one of , - /// , , or - /// Note that a vector of keys type we would represent as with a - /// type parameter. Note also, if the type of the key is known then that should be represented - /// by . 
- public abstract class Key : PipelineColumn - { - protected Key(Reconciler reconciler, params PipelineColumn[] dependencies) - : base(reconciler, dependencies) - { - } - - public override string ToString() => $"{nameof(Key)}<{typeof(T).Name}>"; - } - - /// - /// For representing a key-type of known cardinality that has key values over a particular type. This is used to - /// represent a where it is known that it will have of a particular type . - /// - /// The physical type representing the key, which should always be one of , - /// , , or - /// The type of values the key-type is enumerating. Commonly this is but - /// this is not necessary - public abstract class Key : Key - { - protected Key(Reconciler reconciler, params PipelineColumn[] dependencies) - : base(reconciler, dependencies) - { - } - - public override string ToString() => $"{nameof(Key)}<{typeof(T).Name}, {typeof(TVal).Name}>"; - } - - /// - /// For representing a of unknown cardinality. - /// - /// The physical type representing the key, which should always be one of , - /// , , or - /// Note that unlike the and duality, there is no - /// type corresponding to this type but with key-values, since key-values are necessarily a vector of known - /// size so any enumeration into that set would itself be a key-value of unknown cardinality. - public abstract class VarKey : PipelineColumn - { - protected VarKey(Reconciler reconciler, params PipelineColumn[] dependencies) - : base(reconciler, dependencies) - { - } - - public override string ToString() => $"{nameof(VarKey)}<{typeof(T).Name}>"; - } - - /// - /// For representing a custom . - /// - /// The custom item type. 
- public abstract class Custom: PipelineColumn - { - protected Custom(Reconciler reconciler, params PipelineColumn[] dependencies) - : base(reconciler, dependencies) - { - } - - public override string ToString() => $"{nameof(Custom)}<{typeof(T).Name}>"; - } - -} diff --git a/src/Microsoft.ML.StaticPipe/Reconciler.cs b/src/Microsoft.ML.StaticPipe/Reconciler.cs deleted file mode 100644 index 4d0bc448cd..0000000000 --- a/src/Microsoft.ML.StaticPipe/Reconciler.cs +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// An object for instances to indicate to the analysis code for static pipelines that - /// they should be considered a single group of columns (through equality on the reconcilers), as well as how to - /// actually create the underlying dynamic structures, whether an - /// (for the ) or a - /// (for the ). - /// - public abstract class Reconciler - { - private protected Reconciler() { } - } - - /// - /// Reconciler for column groups intended to resolve to a new - /// or . - /// - /// The input type of the - /// object. - public abstract class LoaderReconciler : Reconciler - { - public LoaderReconciler() : base() { } - - /// - /// Returns a data-loader estimator. Note that there are no input names because the columns from a data-loader - /// estimator should have no dependencies. 
- /// - /// The host environment to use to create the data-loader estimator - /// The columns that the object created by the reconciler should output - /// A map containing - /// - public abstract IDataLoaderEstimator> Reconcile( - IHostEnvironment env, PipelineColumn[] toOutput, IReadOnlyDictionary outputNames); - } - - /// - /// Reconciler for column groups intended to resolve to an . This type of - /// reconciler will work with - /// or other methods that involve the creation of estimator chains. - /// - public abstract class EstimatorReconciler : Reconciler - { - public EstimatorReconciler() : base() { } - - /// - /// Returns an estimator. - /// - /// The host environment to use to create the estimator - /// The columns that the object created by the reconciler should output - /// The name mapping that maps dependencies of the output columns to their names - /// The name mapping that maps the output column to their names - /// While most estimators allow full control over the names of their outputs, a limited - /// subset of estimator transforms do not allow this: they produce columns whose names are unconfigurable. For - /// these, there is this collection which provides the names used by the analysis tool. If the estimator under - /// construction must use one of the names here, then they are responsible for "saving" the column they will - /// overwrite using applications of the . Note that if the estimator under - /// construction has complete control over what columns it produces, there is no need for it to pay this argument - /// any attention. - /// Returns an estimator. 
- public abstract IEstimator Reconcile( - IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames); - } -} diff --git a/src/Microsoft.ML.StaticPipe/SchemaAssertionContext.cs b/src/Microsoft.ML.StaticPipe/SchemaAssertionContext.cs deleted file mode 100644 index 8bb85ddce0..0000000000 --- a/src/Microsoft.ML.StaticPipe/SchemaAssertionContext.cs +++ /dev/null @@ -1,215 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML.Data; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// An object for declaring a schema-shape. This is mostly commonly used in situations where a user is - /// asserting that a dynamic object bears a certain specific static schema. For example: when phrasing - /// the dynamically typed as being a specific . - /// It is never created by the user directly, but instead an instance is typically fed in as an argument - /// to a delegate, and the user will call methods on this context to indicate a certain type is so. - /// - /// - /// All objects are, deliberately, imperitavely useless as they are - /// intended to be used only in a declarative fashion. The methods and properties of this class go one step - /// further and return null for everything with a return type of . - /// - /// Because 's type system is extensible, assemblies that declare their own types - /// should allow users to assert typedness in their types by defining extension methods over this class. - /// However, even failing the provision of such a helper, a user can still provide a workaround by just - /// declaring the type as something like default(Scalar<TheCustomType>, without using the - /// instance of this context. 
- /// - public sealed class SchemaAssertionContext - { - // Hiding all these behind empty-structures is a bit of a cheap trick, but probably works - // pretty well considering that the alternative is a bunch of tiny objects allocated on the - // stack. Plus, the default value winds up working for them. We can also exploit the `ref struct` - // property of these things to make sure people don't make the mistake of assigning them as the - // values. - - /// Assertions over a column of . - public PrimitiveTypeAssertions I1 => default; - - /// Assertions over a column of . - public PrimitiveTypeAssertions I2 => default; - - /// Assertions over a column of . - public PrimitiveTypeAssertions I4 => default; - - /// Assertions over a column of . - public PrimitiveTypeAssertions I8 => default; - - /// Assertions over a column of . - public PrimitiveTypeAssertions U1 => default; - - /// Assertions over a column of . - public PrimitiveTypeAssertions U2 => default; - - /// Assertions over a column of . - public PrimitiveTypeAssertions U4 => default; - - /// Assertions over a column of . - public PrimitiveTypeAssertions U8 => default; - - /// Assertions over a column of . - public NormalizableTypeAssertions R4 => default; - - /// Assertions over a column of . - public NormalizableTypeAssertions R8 => default; - - /// Assertions over a column of . - public PrimitiveTypeAssertions Text => default; - - /// Assertions over a column of . - public PrimitiveTypeAssertions Bool => default; - - /// Assertions over a column of with . - public KeyTypeSelectorAssertions KeyU1 => default; - /// Assertions over a column of with . - public KeyTypeSelectorAssertions KeyU2 => default; - /// Assertions over a column of with . - public KeyTypeSelectorAssertions KeyU4 => default; - /// Assertions over a column of with . 
- public KeyTypeSelectorAssertions KeyU8 => default; - - internal static SchemaAssertionContext Inst = new SchemaAssertionContext(); - - private SchemaAssertionContext() { } - - // Until we have some transforms that use them, we might not expect to see too much interest in asserting - // the time relevant datatypes. - - /// - /// Holds assertions relating to the basic primitive types. - /// - public ref struct PrimitiveTypeAssertions - { - private PrimitiveTypeAssertions(int i) { } - - /// - /// Asserts a type that is directly this . - /// - public Scalar Scalar => null; - - /// - /// Asserts a type corresponding to a of this , - /// where is true. - /// - public Vector Vector => null; - - /// - /// Asserts a type corresponding to a of this , - /// where is false. - /// - public VarVector VarVector => null; - } - - public ref struct NormalizableTypeAssertions - { - private NormalizableTypeAssertions(int i) { } - - /// - /// Asserts a type that is directly this . - /// - public Scalar Scalar => null; - - /// - /// Asserts a type corresponding to a of this , - /// where is true. - /// - public Vector Vector => null; - - /// - /// Asserts a type corresponding to a of this , - /// where is false. - /// - public VarVector VarVector => null; - /// - /// Asserts a type corresponding to a of this , - /// where is true, and the - /// metadata is defined with a Boolean true value. - /// - public NormVector NormVector => null; - } - - /// - /// Once a single general key type has been selected, we can select its vector-ness. - /// - /// The static type corresponding to a . - public ref struct KeyTypeVectorAssertions - where T : class - { - private KeyTypeVectorAssertions(int i) { } - - /// - /// Asserts a type that is directly this . - /// - public T Scalar => null; - - /// - /// Asserts a type corresponding to a of this , - /// where is true. - /// - public Vector Vector => null; - - /// - /// Asserts a type corresponding to a of this , - /// where is false. 
- /// - public VarVector VarVector => null; - } - - /// - /// Assertions for key types of various forms. Used to select a particular . - /// - /// - public ref struct KeyTypeSelectorAssertions - { - private KeyTypeSelectorAssertions(int i) { } - - /// - /// Asserts a type corresponding to a where is positive, that is, is of known cardinality, - /// but that we are not asserting has any particular type of metadata. - /// - public KeyTypeVectorAssertions> NoValue => default; - - /// - /// Asserts a type corresponding to a where is zero, that is, is of unknown cardinality. - /// - public KeyTypeVectorAssertions> UnknownCardinality => default; - - /// Asserts a of known cardinality with a vector of metadata. - public KeyTypeVectorAssertions> I1Values => default; - /// Asserts a of known cardinality with a vector of metadata. - public KeyTypeVectorAssertions> I2Values => default; - /// Asserts a of known cardinality with a vector of metadata. - public KeyTypeVectorAssertions> I4Values => default; - /// Asserts a of known cardinality with a vector of metadata. - public KeyTypeVectorAssertions> I8Values => default; - - /// Asserts a of known cardinality with a vector of metadata. - public KeyTypeVectorAssertions> U1Values => default; - /// Asserts a of known cardinality with a vector of metadata. - public KeyTypeVectorAssertions> U2Values => default; - /// Asserts a of known cardinality with a vector of metadata. - public KeyTypeVectorAssertions> U4Values => default; - /// Asserts a of known cardinality with a vector of metadata. - public KeyTypeVectorAssertions> U8Values => default; - - /// Asserts a of known cardinality with a vector of metadata. - public KeyTypeVectorAssertions> R4Values => default; - /// Asserts a of known cardinality with a vector of metadata. - public KeyTypeVectorAssertions> R8Values => default; - - /// Asserts a of known cardinality with a vector of metadata. 
- public KeyTypeVectorAssertions> TextValues => default; - /// Asserts a of known cardinality with a vector of metadata. - public KeyTypeVectorAssertions> BoolValues => default; - } - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.StaticPipe/SchemaBearing.cs b/src/Microsoft.ML.StaticPipe/SchemaBearing.cs deleted file mode 100644 index 683674f984..0000000000 --- a/src/Microsoft.ML.StaticPipe/SchemaBearing.cs +++ /dev/null @@ -1,63 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Threading; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// A base class for the statically-typed pipeline components, that are marked as producing - /// data whose schema has a certain shape. - /// - /// The shape type parameter. - public abstract class SchemaBearing - { - internal readonly IHostEnvironment Env; - internal readonly StaticSchemaShape Shape; - - private StaticPipeUtils.IndexHelper _indexer; - /// - /// The indexer for the object. Note component authors will not access this directly but should instead - /// work via the public method - /// - internal StaticPipeUtils.IndexHelper Indexer - { - get - { - return _indexer ?? - Interlocked.CompareExchange(ref _indexer, new StaticPipeUtils.IndexHelper(this), null) ?? - _indexer; - } - } - - /// - /// Constructor for a block maker. - /// - /// The host environment, stored with this object - /// The item holding the name and types as enumerated within - /// - private protected SchemaBearing(IHostEnvironment env, StaticSchemaShape shape) - { - Contracts.AssertValue(env); - env.AssertValue(shape); - - Env = env; - Shape = shape; - } - - /// - /// Starts a new pipeline, using the output schema of this object. 
Note that the returned - /// estimator does not contain this object, but it has its schema informed by . - /// The returned object is an empty estimator, on which a new segment of the pipeline can be created. - /// - /// An empty estimator with the same shape as the object on which it was created - public Estimator MakeNewEstimator() - { - var est = new EstimatorChain(); - return new Estimator(Env, est, Shape, Shape); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs deleted file mode 100644 index 595a8f3e5c..0000000000 --- a/src/Microsoft.ML.StaticPipe/SdcaStaticExtensions.cs +++ /dev/null @@ -1,517 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Calibrators; -using Microsoft.ML.Runtime; -using Microsoft.ML.Trainers; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Extension methods and utilities for instantiating SDCA trainer estimators inside statically typed pipelines. - /// - public static class SdcaStaticExtensions - { - /// - /// Predict a target using a linear regression model trained with the SDCA trainer. - /// - /// The regression catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// The L2 regularization hyperparameter. - /// The L1 regularization hyperparameter. Higher values will tend to lead to more sparse model. - /// The maximum number of passes to perform over the data. - /// The custom loss, if unspecified will be . - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. 
Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. - /// - /// - /// - /// - public static Scalar Sdca(this RegressionCatalog.RegressionTrainers catalog, - Scalar label, Vector features, Scalar weights = null, - float? l2Regularization = null, - float? l1Threshold = null, - int? numberOfIterations = null, - ISupportSdcaRegressionLoss lossFunction = null, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckParam(!(l2Regularization < 0), nameof(l2Regularization), "Must not be negative, if specified."); - Contracts.CheckParam(!(l1Threshold < 0), nameof(l1Threshold), "Must not be negative, if specified."); - Contracts.CheckParam(!(numberOfIterations < 1), nameof(numberOfIterations), "Must be positive if specified"); - Contracts.CheckValueOrNull(lossFunction); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.Regression( - (env, labelName, featuresName, weightsName) => - { - var trainer = new SdcaRegressionTrainer(env, labelName, featuresName, weightsName, lossFunction, l2Regularization, l1Threshold, numberOfIterations); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Score; - } - - /// - /// Predict a target using a linear regression model trained with the SDCA trainer. - /// - /// The regression catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// Advanced arguments to the algorithm. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. 
Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. - /// - /// - /// - /// - public static Scalar Sdca(this RegressionCatalog.RegressionTrainers catalog, - Scalar label, Vector features, Scalar weights, - SdcaRegressionTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckValueOrNull(options); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.Regression( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new SdcaRegressionTrainer(env, options); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Score; - } - - /// - /// Predict a target using a linear binary classification model trained with the SDCA trainer, and log-loss. - /// - /// The binary classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// The L2 regularization hyperparameter. - /// The L1 regularization hyperparameter. Higher values will tend to lead to more sparse model. - /// The maximum number of passes to perform over the data. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained, as well as the calibrator on top of that model. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. 
- /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), the calibrated prediction (from 0 to 1), and the predicted label. - /// - /// - /// - /// - public static (Scalar score, Scalar probability, Scalar predictedLabel) Sdca( - this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, Vector features, Scalar weights = null, - float? l2Regularization = null, - float? l1Threshold = null, - int? numberOfIterations = null, - Action> onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckParam(!(l2Regularization < 0), nameof(l2Regularization), "Must not be negative, if specified."); - Contracts.CheckParam(!(l1Threshold < 0), nameof(l1Threshold), "Must not be negative, if specified."); - Contracts.CheckParam(!(numberOfIterations < 1), nameof(numberOfIterations), "Must be positive if specified"); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.BinaryClassifier( - (env, labelName, featuresName, weightsName) => - { - var trainer = new SdcaLogisticRegressionBinaryTrainer(env, labelName, featuresName, weightsName, l2Regularization, l1Threshold, numberOfIterations); - if (onFit != null) - { - return trainer.WithOnFitDelegate(trans => - { - onFit(trans.Model); - }); - } - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a linear binary classification model trained with the SDCA trainer, and log-loss. - /// - /// The binary classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// Advanced arguments to the algorithm. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. 
This delegate will receive - /// the linear model that was trained, as well as the calibrator on top of that model. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), the calibrated prediction (from 0 to 1), and the predicted label. - /// - /// - /// - /// - public static (Scalar score, Scalar probability, Scalar predictedLabel) Sdca( - this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, Vector features, Scalar weights, - SdcaLogisticRegressionBinaryTrainer.Options options, - Action> onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckValueOrNull(options); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.BinaryClassifier( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new SdcaLogisticRegressionBinaryTrainer(env, options); - if (onFit != null) - { - return trainer.WithOnFitDelegate(trans => - { - onFit(trans.Model); - }); - } - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a linear binary classification model trained with the SDCA trainer, and a custom loss. - /// Note that because we cannot be sure that all loss functions will produce naturally calibrated outputs, setting - /// a custom loss function will not produce a calibrated probability column. - /// - /// The binary classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The custom loss. 
- /// The optional example weights. - /// The L2 regularization hyperparameter. - /// The L1 regularization hyperparameter. Higher values will tend to lead to more sparse model. - /// The maximum number of passes to perform over the data. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained, as well as the calibrator on top of that model. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), and the predicted label. - public static (Scalar score, Scalar predictedLabel) SdcaNonCalibrated( - this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, Vector features, - ISupportSdcaClassificationLoss lossFunction, - Scalar weights = null, - float? l2Regularization = null, - float? l1Threshold = null, - int? 
numberOfIterations = null, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValue(lossFunction, nameof(lossFunction)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckParam(!(l2Regularization < 0), nameof(l2Regularization), "Must not be negative, if specified."); - Contracts.CheckParam(!(l1Threshold < 0), nameof(l1Threshold), "Must not be negative, if specified."); - Contracts.CheckParam(!(numberOfIterations < 1), nameof(numberOfIterations), "Must be positive if specified"); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.BinaryClassifierNoCalibration( - (env, labelName, featuresName, weightsName) => - { - var trainer = new SdcaNonCalibratedBinaryTrainer(env, labelName, featuresName, weightsName, lossFunction, l2Regularization, l1Threshold, numberOfIterations); - if (onFit != null) - { - return trainer.WithOnFitDelegate(trans => - { - onFit(trans.Model); - }); - } - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a linear binary classification model trained with the SDCA trainer, and a custom loss. - /// Note that because we cannot be sure that all loss functions will produce naturally calibrated outputs, setting - /// a custom loss function will not produce a calibrated probability column. - /// - /// The binary classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The custom loss. - /// The optional example weights. - /// Advanced arguments to the algorithm. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained, as well as the calibrator on top of that model. 
Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), and the predicted label. - public static (Scalar score, Scalar predictedLabel) SdcaNonCalibrated( - this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, Vector features, Scalar weights, - ISupportSdcaClassificationLoss lossFunction, - SdcaNonCalibratedBinaryTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckValueOrNull(options); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.BinaryClassifierNoCalibration( - (env, labelName, featuresName, weightsName) => - { - options.FeatureColumnName = featuresName; - options.LabelColumnName = labelName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new SdcaNonCalibratedBinaryTrainer(env, options); - if (onFit != null) - { - return trainer.WithOnFitDelegate(trans => - { - onFit(trans.Model); - }); - } - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a maximum entropy classification model trained with the SDCA trainer. - /// - /// The multiclass classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// The L2 regularization hyperparameter. - /// The L1 regularization hyperparameter. Higher values will tend to lead to more sparse model. - /// The maximum number of passes to perform over the data. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. 
This delegate will receive - /// the linear model that was trained. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted per-class likelihoods (between 0 and 1, and summing up to 1), and the predicted label. - public static (Vector score, Key predictedLabel) Sdca( - this MulticlassClassificationCatalog.MulticlassClassificationTrainers catalog, - Key label, - Vector features, - Scalar weights = null, - float? l2Regularization = null, - float? l1Threshold = null, - int? numberOfIterations = null, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckParam(!(l2Regularization < 0), nameof(l2Regularization), "Must not be negative, if specified."); - Contracts.CheckParam(!(l1Threshold < 0), nameof(l1Threshold), "Must not be negative, if specified."); - Contracts.CheckParam(!(numberOfIterations < 1), nameof(numberOfIterations), "Must be positive if specified"); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.MulticlassClassificationReconciler( - (env, labelName, featuresName, weightsName) => - { - var trainer = new SdcaMaximumEntropyMulticlassTrainer(env, labelName, featuresName, weightsName, l2Regularization, l1Threshold, numberOfIterations); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a maximum entropy classification model trained with the SDCA trainer. - /// - /// The multiclass classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// Advanced arguments to the algorithm. 
- /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted per-class likelihoods (between 0 and 1, and summing up to 1), and the predicted label. - public static (Vector score, Key predictedLabel) Sdca( - this MulticlassClassificationCatalog.MulticlassClassificationTrainers catalog, - Key label, - Vector features, - Scalar weights, - SdcaMaximumEntropyMulticlassTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckValueOrNull(options); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.MulticlassClassificationReconciler( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new SdcaMaximumEntropyMulticlassTrainer(env, options); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a linear multiclass classification model trained with the SDCA trainer. - /// - /// The multiclass classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The custom loss, for example, . - /// The optional example weights. - /// The L2 regularization hyperparameter. - /// The L1 regularization hyperparameter. Higher values will tend to lead to more sparse model. - /// The maximum number of passes to perform over the data. 
- /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted per-class likelihoods (between 0 and 1, and summing up to 1), and the predicted label. - public static (Vector score, Key predictedLabel) SdcaNonCalibrated( - this MulticlassClassificationCatalog.MulticlassClassificationTrainers catalog, - Key label, - Vector features, - ISupportSdcaClassificationLoss lossFunction, - Scalar weights = null, - float? l2Regularization = null, - float? l1Threshold = null, - int? numberOfIterations = null, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(lossFunction); - Contracts.CheckValueOrNull(weights); - Contracts.CheckParam(!(l2Regularization < 0), nameof(l2Regularization), "Must not be negative, if specified."); - Contracts.CheckParam(!(l1Threshold < 0), nameof(l1Threshold), "Must not be negative, if specified."); - Contracts.CheckParam(!(numberOfIterations < 1), nameof(numberOfIterations), "Must be positive if specified"); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.MulticlassClassificationReconciler( - (env, labelName, featuresName, weightsName) => - { - var trainer = new SdcaNonCalibratedMulticlassTrainer(env, labelName, featuresName, weightsName, lossFunction, l2Regularization, l1Threshold, numberOfIterations); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a linear multiclass classification model trained with the SDCA trainer. 
- /// - /// The multiclass classification catalog trainer object. - /// The label, or dependent variable. - /// The features, or independent variables. - /// The optional example weights. - /// Advanced arguments to the algorithm. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the - /// result in any way; it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted per-class likelihoods (between 0 and 1, and summing up to 1), and the predicted label. - public static (Vector score, Key predictedLabel) SdcaNonCalibrated( - this MulticlassClassificationCatalog.MulticlassClassificationTrainers catalog, - Key label, - Vector features, - Scalar weights, - SdcaNonCalibratedMulticlassTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckValueOrNull(options); - Contracts.CheckValueOrNull(onFit); - - var rec = new TrainerEstimatorReconciler.MulticlassClassificationReconciler( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new SdcaNonCalibratedMulticlassTrainer(env, options); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Output; - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/SgdStatic.cs b/src/Microsoft.ML.StaticPipe/SgdStatic.cs deleted file mode 100644 index c0c02cb7e9..0000000000 --- a/src/Microsoft.ML.StaticPipe/SgdStatic.cs +++ /dev/null @@ -1,179 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. 
-// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Calibrators; -using Microsoft.ML.Trainers; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Binary Classification trainer estimators. - /// - public static class SgdStaticExtensions - { - /// - /// Predict a target using logistic regression trained with the trainer. - /// - /// The binary classification catalog trainer object. - /// The name of the label column. - /// The name of the feature column. - /// The name for the example weight column. - /// The maximum number of iterations; set to 1 to simulate online learning. - /// The initial learning rate used by SGD. - /// The L2 weight for regularization. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. 
- public static (Scalar score, Scalar probability, Scalar predictedLabel) StochasticGradientDescentClassificationTrainer( - this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, - Vector features, - Scalar weights = null, - int numberOfIterations = SgdCalibratedTrainer.Options.Defaults.NumberOfIterations, - double learningRate = SgdCalibratedTrainer.Options.Defaults.LearningRate, - float l2Regularization = SgdCalibratedTrainer.Options.Defaults.L2Regularization, - Action> onFit = null) - { - var rec = new TrainerEstimatorReconciler.BinaryClassifier( - (env, labelName, featuresName, weightsName) => - { - var trainer = new SgdCalibratedTrainer(env, labelName, featuresName, weightsName, numberOfIterations, learningRate, l2Regularization); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using logistic regression trained with the trainer. - /// - /// The binary classification catalog trainer object. - /// The name of the label column. - /// The name of the feature column. - /// The name for the example weight column. - /// Advanced arguments to the algorithm. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. 
- public static (Scalar score, Scalar probability, Scalar predictedLabel) StochasticGradientDescentClassificationTrainer( - this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, - Vector features, - Scalar weights, - SgdCalibratedTrainer.Options options, - Action> onFit = null) - { - var rec = new TrainerEstimatorReconciler.BinaryClassifier( - (env, labelName, featuresName, weightsName) => - { - options.FeatureColumnName = featuresName; - options.LabelColumnName = labelName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new SgdCalibratedTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a linear classification model trained with the trainer. - /// - /// The binary classification catalog trainer object. - /// The name of the label column. - /// The name of the feature column. - /// The name for the example weight column. - /// The maximum number of iterations; set to 1 to simulate online learning. - /// The initial learning rate used by SGD. - /// The L2 weight for regularization. - /// The loss function to use. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. 
- public static (Scalar score, Scalar predictedLabel) StochasticGradientDescentNonCalibratedClassificationTrainer( - this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, - Vector features, - Scalar weights = null, - int numberOfIterations = SgdNonCalibratedTrainer.Options.Defaults.NumberOfIterations, - double learningRate = SgdNonCalibratedTrainer.Options.Defaults.LearningRate, - float l2Regularization = SgdNonCalibratedTrainer.Options.Defaults.L2Regularization, - IClassificationLoss lossFunction = null, - Action onFit = null) - { - var rec = new TrainerEstimatorReconciler.BinaryClassifierNoCalibration( - (env, labelName, featuresName, weightsName) => - { - var trainer = new SgdNonCalibratedTrainer(env, labelName, featuresName, weightsName, - numberOfIterations, learningRate, l2Regularization, lossFunction); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - - }, label, features, weights); - - return rec.Output; - } - - /// - /// Predict a target using a linear classification model trained with the trainer. - /// - /// The binary classification catalog trainer object. - /// The name of the label column. - /// The name of the feature column. - /// The name for the example weight column. - /// Advanced arguments to the algorithm. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; it is only a way for the caller to - /// be informed about what was learnt. - /// The predicted output. 
- public static (Scalar score, Scalar predictedLabel) StochasticGradientDescentNonCalibratedClassificationTrainer( - this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, - Vector features, - Scalar weights, - SgdNonCalibratedTrainer.Options options, - Action onFit = null) - { - var rec = new TrainerEstimatorReconciler.BinaryClassifierNoCalibration( - (env, labelName, featuresName, weightsName) => - { - options.FeatureColumnName = featuresName; - options.LabelColumnName = labelName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new SgdNonCalibratedTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - - }, label, features, weights); - - return rec.Output; - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/StaticPipeExtensions.cs b/src/Microsoft.ML.StaticPipe/StaticPipeExtensions.cs deleted file mode 100644 index 076f463ee1..0000000000 --- a/src/Microsoft.ML.StaticPipe/StaticPipeExtensions.cs +++ /dev/null @@ -1,94 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Runtime; -namespace Microsoft.ML.StaticPipe -{ - public static class StaticPipeExtensions - { - /// - /// Asserts that a given data view has the indicated schema. If this method returns without - /// throwing then the view has been validated to have columns with the indicated names and types. 
- /// - /// The type representing the view's schema shape - /// The view to assert the static schema on - /// The host environment to keep in the statically typed variant - /// The delegate through which we declare the schema, which ought to - /// use the input to declare a - /// of the indices, properly named - /// A statically typed wrapping of the input view - public static DataView AssertStatic<[IsShape] T>(this IDataView view, IHostEnvironment env, - Func outputDecl) - { - Contracts.CheckValue(env, nameof(env)); - env.CheckValue(view, nameof(view)); - env.CheckValue(outputDecl, nameof(outputDecl)); - - // We don't actually need to call the method, it's just there to give the declaration. -#if DEBUG - outputDecl(SchemaAssertionContext.Inst); -#endif - - var schema = StaticSchemaShape.Make(outputDecl.Method.ReturnParameter); - return new DataView(env, view, schema); - } - - public static DataLoader AssertStatic(this IDataLoader loader, IHostEnvironment env, - Func outputDecl) - { - Contracts.CheckValue(env, nameof(env)); - env.CheckValue(loader, nameof(loader)); - env.CheckValue(outputDecl, nameof(outputDecl)); - - var schema = StaticSchemaShape.Make(outputDecl.Method.ReturnParameter); - return new DataLoader(env, loader, schema); - } - - public static DataLoaderEstimator AssertStatic( - this IDataLoaderEstimator loaderEstimator, IHostEnvironment env, - Func outputDecl) - where TLoader : class, IDataLoader - { - Contracts.CheckValue(env, nameof(env)); - env.CheckValue(loaderEstimator, nameof(loaderEstimator)); - env.CheckValue(outputDecl, nameof(outputDecl)); - - var schema = StaticSchemaShape.Make(outputDecl.Method.ReturnParameter); - return new DataLoaderEstimator(env, loaderEstimator, schema); - } - - public static Transformer AssertStatic<[IsShape] TIn, [IsShape] TOut, TTrans>( - this TTrans transformer, IHostEnvironment env, - Func inputDecl, - Func outputDecl) - where TTrans : class, ITransformer - { - Contracts.CheckValue(env, nameof(env)); - 
env.CheckValue(transformer, nameof(transformer)); - env.CheckValue(inputDecl, nameof(inputDecl)); - env.CheckValue(outputDecl, nameof(outputDecl)); - - var inSchema = StaticSchemaShape.Make(inputDecl.Method.ReturnParameter); - var outSchema = StaticSchemaShape.Make(outputDecl.Method.ReturnParameter); - return new Transformer(env, transformer, inSchema, outSchema); - } - - public static Estimator AssertStatic<[IsShape] TIn, [IsShape] TOut, TTrans>( - this IEstimator estimator, IHostEnvironment env, - Func inputDecl, - Func outputDecl) - where TTrans : class, ITransformer - { - Contracts.CheckValue(env, nameof(env)); - env.CheckValue(estimator, nameof(estimator)); - env.CheckValue(inputDecl, nameof(inputDecl)); - env.CheckValue(outputDecl, nameof(outputDecl)); - - var inSchema = StaticSchemaShape.Make(inputDecl.Method.ReturnParameter); - var outSchema = StaticSchemaShape.Make(outputDecl.Method.ReturnParameter); - return new Estimator(env, estimator, inSchema, outSchema); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/StaticPipeInternalUtils.cs b/src/Microsoft.ML.StaticPipe/StaticPipeInternalUtils.cs deleted file mode 100644 index e7acc2cbf3..0000000000 --- a/src/Microsoft.ML.StaticPipe/StaticPipeInternalUtils.cs +++ /dev/null @@ -1,672 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Reflection; -using System.Runtime.CompilerServices; -using Microsoft.ML.Internal.Utilities; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Utility functions useful for the internal implementations of the key pipeline utilities. 
- /// - internal static class StaticPipeInternalUtils - { - /// - /// Given a type which is a tree with leaves, return an instance of that - /// type which has appropriate instances of that use the returned reconciler. - /// - /// This is a data-reconciler that always reconciles to a null object - /// A type of either or one of the major subclasses - /// (for example, , , etc.) - /// An instance of where all fields have the provided reconciler - public static T MakeAnalysisInstance(out LoaderReconciler fakeReconciler) - { - var rec = new AnalyzeUtil.Rec(); - fakeReconciler = rec; - return (T)AnalyzeUtil.MakeAnalysisInstanceCore(rec, new HashSet()); - } - - private static class AnalyzeUtil - { - public sealed class Rec : LoaderReconciler - { - public Rec() : base() { } - - public override IDataLoaderEstimator> Reconcile( - IHostEnvironment env, PipelineColumn[] toOutput, IReadOnlyDictionary outputNames) - { - Contracts.AssertValue(env); - foreach (var col in toOutput) - env.Assert(col.ReconcilerObj == this); - return null; - } - } - - private static Reconciler _reconciler = new Rec(); - - private sealed class AScalar : Scalar { public AScalar(Rec rec) : base(rec, null) { } } - private sealed class AVector : Vector { public AVector(Rec rec) : base(rec, null) { } } - private sealed class ANormVector : NormVector { public ANormVector(Rec rec) : base(rec, null) { } } - private sealed class AVarVector : VarVector { public AVarVector(Rec rec) : base(rec, null) { } } - private sealed class AKey : Key { public AKey(Rec rec) : base(rec, null) { } } - private sealed class AKey : Key { public AKey(Rec rec) : base(rec, null) { } } - private sealed class AVarKey : VarKey { public AVarKey(Rec rec) : base(rec, null) { } } - private sealed class ACustom : Custom { public ACustom(Rec rec) : base(rec, null) { } } - - private static PipelineColumn MakeScalar(Rec rec) => new AScalar(rec); - private static PipelineColumn MakeVector(Rec rec) => new AVector(rec); - private static 
PipelineColumn MakeNormVector(Rec rec) => new ANormVector(rec); - private static PipelineColumn MakeVarVector(Rec rec) => new AVarVector(rec); - private static PipelineColumn MakeKey(Rec rec) => new AKey(rec); - private static Key MakeKey(Rec rec) => new AKey(rec); - private static PipelineColumn MakeVarKey(Rec rec) => new AVarKey(rec); - private static PipelineColumn MakeCustom(Rec rec) => new ACustom(rec); - - private static MethodInfo[] _valueTupleCreateMethod = InitValueTupleCreateMethods(); - - private static MethodInfo[] InitValueTupleCreateMethods() - { - const string methodName = nameof(ValueTuple.Create); - var methods = typeof(ValueTuple).GetMethods() - .Where(m => m.Name == methodName && m.ContainsGenericParameters) - .OrderBy(m => m.GetGenericArguments().Length).Take(7) - .ToArray().AppendElement(typeof(AnalyzeUtil).GetMethod(nameof(UnstructedCreate))); - return methods; - } - - /// - /// Note that we use this instead of - /// for the eight-item because that method will embed the last element into a one-element tuple, - /// which is embedded in the original. The actual physical representation, which is what is relevant here, - /// has no real conveniences around its creation. 
- /// - public static ValueTuple - UnstructedCreate( - T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, TRest restTuple) - where TRest : struct - { - return new ValueTuple(v1, v2, v3, v4, v5, v6, v7, restTuple); - } - - public static object MakeAnalysisInstanceCore(Rec rec, HashSet encountered) - { - var t = typeof(T); - if (typeof(PipelineColumn).IsAssignableFrom(t)) - { - if (t.IsGenericType) - { - var genP = t.GetGenericArguments(); - var genT = t.GetGenericTypeDefinition(); - - if (genT == typeof(Scalar<>)) - return Utils.MarshalInvoke(MakeScalar, genP[0], rec); - if (genT == typeof(Vector<>)) - return Utils.MarshalInvoke(MakeVector, genP[0], rec); - if (genT == typeof(NormVector<>)) - return Utils.MarshalInvoke(MakeNormVector, genP[0], rec); - if (genT == typeof(VarVector<>)) - return Utils.MarshalInvoke(MakeVarVector, genP[0], rec); - if (genT == typeof(Key<>)) - return Utils.MarshalInvoke(MakeKey, genP[0], rec); - if (genT == typeof(Key<,>)) - { - Func f = MakeKey; - return f.Method.GetGenericMethodDefinition().MakeGenericMethod(genP).Invoke(null, new object[] { rec }); - } - if (genT == typeof(VarKey<>)) - return Utils.MarshalInvoke(MakeVector, genP[0], rec); - if (genT == typeof(Custom<>)) - return Utils.MarshalInvoke(MakeCustom, genP[0], rec); - } - throw Contracts.Except($"Type {t} is a {nameof(PipelineColumn)} yet does not appear to be directly one of " + - $"the official types. This is commonly due to a mistake by the component author and can be addressed by " + - $"upcasting the instance in the tuple definition to one of the official types."); - } - // If it's not a pipeline column type, perhaps it is a value-tuple. - - if (ValueTupleUtils.IsValueTuple(t)) - { - var genT = t.GetGenericTypeDefinition(); - var genP = t.GetGenericArguments(); - Contracts.Assert(1 <= genP.Length && genP.Length <= 8); - // First recursively create the sub-analysis objects. 
- object[] subArgs = genP.Select(subType => Utils.MarshalInvoke(MakeAnalysisInstanceCore, subType, rec, encountered)).ToArray(); - // Next create the tuple. - return _valueTupleCreateMethod[subArgs.Length - 1].MakeGenericMethod(genP).Invoke(null, subArgs); - } - else - { - // If neither of these, perhaps it's a supported type of property-bearing class. Either way, this is the sort - // of class we have to be careful about since there could be some recursively defined types. - if (!encountered.Add(t)) - throw Contracts.Except($"Recursively defined type {t} encountered."); - var func = GetContainerMaker(t, out Type[] inputTypes); - object[] subArgs = inputTypes.Select(subType => Utils.MarshalInvoke(MakeAnalysisInstanceCore, subType, rec, encountered)).ToArray(); - encountered.Remove(t); - return func(subArgs); - } - - //throw Contracts.Except($"Type {t} is neither a {nameof(PipelineColumn)} subclass nor a value tuple. Other types are not permitted."); - } - } - - public static KeyValuePair[] GetNamesTypes(ParameterInfo pInfo) - => GetNamesTypes(pInfo); - - public static KeyValuePair[] GetNamesTypes(ParameterInfo pInfo) - { - Contracts.CheckValue(pInfo, nameof(pInfo)); - if (typeof(T) != pInfo.ParameterType) - throw Contracts.ExceptParam(nameof(pInfo), "Type mismatch with " + typeof(T).Name); - var result = NameUtil.GetNames(default, pInfo); - var retVal = new KeyValuePair[result.Length]; - for (int i = 0; i < result.Length; ++i) - { - retVal[i] = new KeyValuePair(result[i].name, result[i].type); - Contracts.Assert(result[i].value == default); - } - return retVal; - } - - /// - /// Given a schema shape defining instance, return the pairs of names and values, based on a recursive - /// traversal of the structure. If in that list the value a.b.c is paired with an item x, - /// then programmatically when accessed as .a.b.c - /// would be that item x. - /// - /// The schema shape defining type. 
- /// The instance of that schema shape defining type, whose items will - /// populate the fields of the returned items. - /// It is an implementation detail of the value-tuple type that the names - /// are not associated with the type at all, but there is instead an illusion propagated within - /// Visual Studio, that works via attributes. Programmatic access to this is limited, except that - /// a is attached to the type in appropriate places, for example, - /// in a delegate one of the parameters, or the return parameter, or somesuch. If present, the names - /// will be extracted from that structure, and if not the default names of Item1, Item2, - /// etc. will be used. Note that non-value-tuple tupes do not have this problem. - /// The set of names and corresponding values. - public static KeyValuePair[] GetNamesValues(T record, ParameterInfo pInfo) - => GetNamesValues(record, pInfo); - - private static KeyValuePair[] GetNamesValues(T record, ParameterInfo pInfo) - { - Contracts.CheckValue(pInfo, nameof(pInfo)); - Contracts.CheckParam(typeof(T) == pInfo.ParameterType, nameof(pInfo), "Type mismatch with " + nameof(record)); - var result = NameUtil.GetNames(record, pInfo); - var retVal = new KeyValuePair[result.Length]; - for (int i = 0; i < result.Length; ++i) - retVal[i] = new KeyValuePair(result[i].name, result[i].value); - return retVal; - } - - /// - /// A sort of extended version of that accounts - /// for the presence of the , and types. /> - /// - /// Can we assign to this type? - /// From that type? - /// - public static bool IsAssignableFromStaticPipeline(this Type to, Type from) - { - Contracts.AssertValue(to); - Contracts.AssertValue(from); - if (to.IsAssignableFrom(from)) - return true; - // The only exception to the above test are the vector types. These are generic types. 
- if (!to.IsGenericType || !from.IsGenericType) - return false; - var gto = to.GetGenericTypeDefinition(); - var gfrom = from.GetGenericTypeDefinition(); - - // If either of the types is not one of the vector types, we can just stop right here. - if ((gto != typeof(Vector<>) && gto != typeof(VarVector<>) && gto != typeof(NormVector<>)) || - (gfrom != typeof(Vector<>) && gfrom != typeof(VarVector<>) && gfrom != typeof(NormVector<>))) - { - return false; - } - - // First check the value types. If those don't match, no sense going any further. - var ato = to.GetGenericArguments(); - var afrom = from.GetGenericArguments(); - Contracts.Assert(Utils.Size(ato) == 1); - Contracts.Assert(Utils.Size(afrom) == 1); - - if (!ato[0].IsAssignableFrom(afrom[0])) - return false; - - // We have now confirmed at least the compatibility of the item types. Next we must confirm the same of the vector type. - // Variable sized vectors must match in their types, norm vector can be considered assignable to vector. - - // If either is a var vector, the other must be as well. - if (gto == typeof(VarVector<>)) - return gfrom == typeof(VarVector<>); - - // We can assign from NormVector<> to Vector<>, but not the other way around. So we only fail if we are trying to assign Vector<> to NormVector<>. - return gfrom != typeof(Vector<>) || gto != typeof(NormVector<>); - } - - /// - /// Utility for extracting names out of shape-shape structures. - /// - /// The base type in the base world. - private static class NameUtil - { - private readonly struct Info - { - public readonly Type Type; - public readonly object Item; - - public Info(Type type, object item) - { - Type = type; - Item = item; - } - } - - /// - /// A utility for exacting name/type/value triples out of a value-tuple based tree structure. 
- /// - /// For example: If were then the value-tuple - /// (a: 1, b: (c: 2, d: 3), e: 4) would result in the return array where the name/value - /// pairs were [("a", 1), ("b.c", 2), ("b.d", 3), "e", 4], in some order, and the type - /// is typeof(int). - /// - /// Note that the type returned in the triple is the type as declared in the tuple, which will - /// be a derived type of , and in turn the type of the value will be - /// of a type derived from that type. - /// - /// This method will throw if anything other than value-tuples or - /// instances are detected during its execution. - /// - /// The type to extract on. - /// The instance to extract values out of. - /// A type parameter associated with this, usually extracted out of some - /// delegate over this type. In the case of value-tuples specifically, note that names in value-tuples - /// are an illusion perpetrated by the C# compiler, and are not accessible though - /// by reflection, which is why it is necessary to engage in trickery like passing in a delegate over - /// those types, which does retain the information on the names. - /// The list of name/type/value triples extracted out of the tree like-structure - public static (string name, Type type, TLeaf value)[] GetNames(T record, ParameterInfo pInfo) - { - Contracts.AssertValue(pInfo); - Contracts.Assert(typeof(T) == pInfo.ParameterType); - // Record can only be null if it isn't the value tuple type. - - if (typeof(TLeaf).IsAssignableFrom(typeof(T))) - return new[] { ("Data", typeof(T), (TLeaf)(object)record) }; - - // The structure of names for value tuples is somewhat unusual. All names in a nested structure of value - // tuples is arranged in a roughly depth-first structure, unless we consider tuple cardinality greater - // than seven (which is physically stored in a tuple of cardinality eight, with the so-called `Rest` - // field iteratively holding "more" values. 
So what appears to be a ten-tuple is really an eight-tuple, - // with the first seven items holding the first seven items of the original tuple, and another value - // tuple in `Rest` holding the remaining three items. - - // Anyway: the names are given in depth-first fashion with all items in a tuple being assigned - // contiguously to the items (so for any n-tuple, there is an contiguous n-length segment in the names - // array corresponding to the names). This also applies to the "virtual" >7 tuples, which are for this - // purpose considered "one" tuple, which has some interesting implications on valid traversals of the - // structure. - - var tupleNames = pInfo.GetCustomAttribute()?.TransformNames; - var accumulated = new List<(string, Type, TLeaf)>(); - RecurseNames(record, tupleNames, 0, null, accumulated); - return accumulated.ToArray(); - } - - /// - /// Helper method for , that given a - /// will either append triples to (if the item is of type - /// ), or recurse on this function (if the item is a ), - /// or otherwise throw an error. - /// - /// The type we are recursing on, should be a of some sort - /// The we are extracting on. Note that this is - /// just for the sake of ease of using - /// . - /// The names list extracted from the attribute, or null - /// if no such attribute could be found. - /// The offset into where 's names begin. - /// null for the root level structure, or the appendation of . suffixed names - /// of the path of value-tuples down to this item. 
- /// The list into which the names are being added - /// The total number of items added to - private static int RecurseNames(object record, IList names, int namesOffset, string namePrefix, List<(string, Type, TLeaf)> accum) - { - Contracts.AssertValueOrNull(record); - Contracts.Assert(record == null || record is T); - Contracts.Assert(record == null || !typeof(TLeaf).IsAssignableFrom(record.GetType())); - Contracts.AssertValueOrNull(names); - Contracts.Assert(names == null || namesOffset <= names.Count); - Contracts.AssertValueOrNull(namePrefix); - Contracts.AssertValue(accum); - - var ttype = typeof(T); - if (ValueTupleUtils.IsValueTuple(ttype)) - { - record = record ?? Activator.CreateInstance(ttype); - var tupleItems = new List(); - - ValueTupleUtils.ApplyActionToTuple((T)record, (index, type, item) - => tupleItems.Add(new Info(type, item))); - int total = tupleItems.Count; - - for (int i = 0; i < tupleItems.Count; ++i) - { - string name = names?[namesOffset + i] ?? $"Item{i + 1}"; - if (!string.IsNullOrEmpty(namePrefix)) - name = namePrefix + name; - - if (typeof(TLeaf).IsAssignableFrom(tupleItems[i].Type)) - accum.Add((name, tupleItems[i].Type, (TLeaf)tupleItems[i].Item)); - else - { - total += Utils.MarshalInvoke(RecurseNames, tupleItems[i].Type, - tupleItems[i].Item, names, namesOffset + total, name + ".", accum); - } - } - return total; - } - - // Otherwise it may be a class. Let's first check. - string error = VerifyIsSupportedContainingType(ttype); - if (error != null) - { - if (string.IsNullOrEmpty(namePrefix)) - throw Contracts.Except(error); - throw Contracts.Except($"Problem with {namePrefix}: {error}"); - } - - var props = ttype.GetProperties(BindingFlags.Public | BindingFlags.Instance); - foreach (var prop in props) - { - var propValue = record == null ? 
null : prop.GetValue(record); - var name = namePrefix + prop.Name; - - if (typeof(TLeaf).IsAssignableFrom(prop.PropertyType)) - accum.Add((name, prop.PropertyType, (TLeaf)propValue)); - else - { - // It may be that the property itself points to a value-tuple. Get it, just in case. - var tupleNames = prop.GetCustomAttribute()?.TransformNames; - // Do not incremenet the total in this case. This was not a value-tuple, and any internal thing - // that was a value-tuple should not result in an increment on the count. Correspondingly, we also - // start the recursion again insofar as the offset is concerned. - Utils.MarshalInvoke(RecurseNames, prop.PropertyType, - propValue, tupleNames, 0, name + ".", accum); - } - } - return 0; - } - } - - /// - /// Verifies whether the given type is a supported containing type. This returns the error - /// message rather than throwing itself so that the caller can report the error in the way that - /// is most appropriate to their context. - /// - /// The type to check. - /// A non-null answer - private static string VerifyIsSupportedContainingType(Type type) - { - Contracts.AssertValue(type); - if (type.IsValueType) - return $"Type {type.Name} is a value-type, not a reference type."; - - // Somehow, BindingsFlags.Public does not find the public constructor. Who knows why. - var constructors = type.GetConstructors(BindingFlags.Public | BindingFlags.Instance); - if (constructors.Length != 1) - return $"Type {type.Name} for schema shape should have exactly one public constructor."; - var constructor = constructors[0]; - var parameters = constructor.GetParameters(); - - // Let's do a small minor smoke test on the type to see that there is a one to one correspondence between - // the types of the properties, and the types of the parameters in the constructor. - var counters = new Dictionary(); - - // We allow one of two ways to create the instance. 
We can either have an empty constructor with all settable properties, - // or a constructor with as many parameters as properties, where there is a correspondence between the types. - var props = type.GetProperties(BindingFlags.Public | BindingFlags.Instance); - foreach (var prop in props) - { - // Skip all non-public properties. - if (prop.GetAccessors().All(a => !a.IsPublic && !a.IsStatic)) - continue; - if (!prop.CanRead) - return $"Type {type.Name} for schema shape has non-readable property {prop.Name}."; - if (prop.CanWrite != (parameters.Length == 0)) - { - if (prop.CanWrite) - return $"Type {type.Name} for schema shape has writable property {prop.Name}, but also has a non-empty constructor."; - return $"Type {type.Name} for schema shape has non-writable property {prop.Name}, but also has an empty constructor."; - } - counters.TryGetValue(prop.PropertyType, out int currCount); - counters[prop.PropertyType] = currCount + 1; - } - // Next let's check the types of the constructor properties, if any, and make sure there is a correspondence. - if (parameters.Length > 0) - { - foreach (var p in parameters) - { - if (!counters.TryGetValue(p.ParameterType, out int c) || c == 0) - return $"Constructor parameter {p.Name} is of type {p.ParameterType.Name} which appeared more often than we found a corresponding property."; - counters[p.ParameterType]--; - } - } - return null; - } - - /// - /// Creates a unified means of creating an instance of the given containing schema-shape type, - /// whether it is the type that has a constructor that has all the inputs enumerated and getter - /// // methods, - /// - /// - /// The types we expect when constructing. 
Note that - /// - public static Func GetContainerMaker(Type type, out Type[] inputTypes) - { - Contracts.AssertValue(type); - var typeError = VerifyIsSupportedContainingType(type); - if (typeError != null) - throw Contracts.ExceptParam(nameof(type), typeError); - var constructor = type.GetConstructors().First(p => p.IsPublic); - var parameters = constructor.GetParameters(); - var props = type.GetProperties(BindingFlags.Public | BindingFlags.Instance | BindingFlags.DeclaredOnly); - - Func retval; - if (parameters.Length == 0) - { - // This kind of functions like a quasi-constructor that sets all teh objects. - Action allSetter = null; - - inputTypes = new Type[props.Length]; - // All properties must have setters. - for (int i = 0; i < props.Length; ++i) - { - inputTypes[i] = props[i].PropertyType; - int ii = i; - allSetter += - (obj, inputs) => - { - Contracts.Assert(inputs.Length == props.Length); - Contracts.Assert(props[ii].CanWrite); - Contracts.Assert(props[ii].PropertyType.IsAssignableFrom(inputs[ii].GetType())); - props[ii].SetValue(obj, inputs[ii]); - }; - } - retval = - inputs => - { - var obj = constructor.Invoke(new object[0]); - allSetter?.Invoke(obj, inputs); - return obj; - }; - } - else - { - // Otherwise it's the constructor variant. - inputTypes = Utils.BuildArray(parameters.Length, i => parameters[i].ParameterType); - retval = constructor.Invoke; - } - // In either case, we would like there to be a check after this to ensure that no funny-business - // went on with the initialization, and that every public property is in fact reference equatable. 
- - return inputs => - { - var inputSet = new HashSet(inputs); - var obj = retval(inputs); - foreach (var prop in props) - { - var propValue = prop.GetValue(obj); - if (!inputSet.Remove(propValue)) - throw Contracts.Except($"While making {type.Name} instance, unexpected value found in property {prop.Name}."); - } - return obj; - }; - } - - private static class ValueTupleUtils - { - public static bool IsValueTuple(Type t) - { - Type genT = t.IsGenericType ? t.GetGenericTypeDefinition() : t; - return genT == typeof(ValueTuple<>) || genT == typeof(ValueTuple<,>) || genT == typeof(ValueTuple<,,>) - || genT == typeof(ValueTuple<,,,>) || genT == typeof(ValueTuple<,,,,>) || genT == typeof(ValueTuple<,,,,,>) - || genT == typeof(ValueTuple<,,,,,,>) || genT == typeof(ValueTuple<,,,,,,,>); - } - - public delegate void TupleItemAction(int index, Type itemType, object item); - - public static void ApplyActionToTuple(T tuple, TupleItemAction action) - { - Contracts.CheckValue(action, nameof(action)); - ApplyActionToTuple(tuple, 0, action); - } - - internal static void ApplyActionToTuple(object tuple, int root, TupleItemAction action) - { - Contracts.AssertValue(action); - Contracts.Assert(root >= 0); - - var tType = typeof(T); - if (tType.IsGenericType) - tType = tType.GetGenericTypeDefinition(); - - if (typeof(ValueTuple<>) == tType) - MarshalInvoke>(Process, tuple, root, action); - else if (typeof(ValueTuple<,>) == tType) - MarshalInvoke>(Process, tuple, root, action); - else if (typeof(ValueTuple<,,>) == tType) - MarshalInvoke>(Process, tuple, root, action); - else if (typeof(ValueTuple<,,,>) == tType) - MarshalInvoke>(Process, tuple, root, action); - else if (typeof(ValueTuple<,,,,>) == tType) - MarshalInvoke>(Process, tuple, root, action); - else if (typeof(ValueTuple<,,,,,>) == tType) - MarshalInvoke>(Process, tuple, root, action); - else if (typeof(ValueTuple<,,,,,,>) == tType) - MarshalInvoke>(Process, tuple, root, action); - else if (typeof(ValueTuple<,,,,,,,>) == 
tType) - MarshalInvoke>>(Process, tuple, root, action); - else - { - // This will fall through here if this was either not a generic type or is a value tuple type. - throw Contracts.ExceptParam(nameof(tuple), $"Item should have been a {nameof(ValueTuple)} but was instead {tType}"); - } - } - - private delegate void Processor(T val, int root, TupleItemAction action); - - private static void Process(ValueTuple val, int root, TupleItemAction action) - { - action(root++, typeof(T1), val.Item1); - } - - private static void Process(ValueTuple val, int root, TupleItemAction action) - { - action(root++, typeof(T1), val.Item1); - action(root++, typeof(T2), val.Item2); - } - - private static void Process(ValueTuple val, int root, TupleItemAction action) - { - action(root++, typeof(T1), val.Item1); - action(root++, typeof(T2), val.Item2); - action(root++, typeof(T3), val.Item3); - } - - private static void Process(ValueTuple val, int root, TupleItemAction action) - { - action(root++, typeof(T1), val.Item1); - action(root++, typeof(T2), val.Item2); - action(root++, typeof(T3), val.Item3); - action(root++, typeof(T4), val.Item4); - } - - private static void Process(ValueTuple val, int root, TupleItemAction action) - { - action(root++, typeof(T1), val.Item1); - action(root++, typeof(T2), val.Item2); - action(root++, typeof(T3), val.Item3); - action(root++, typeof(T4), val.Item4); - action(root++, typeof(T5), val.Item5); - } - - private static void Process(ValueTuple val, int root, TupleItemAction action) - { - action(root++, typeof(T1), val.Item1); - action(root++, typeof(T2), val.Item2); - action(root++, typeof(T3), val.Item3); - action(root++, typeof(T4), val.Item4); - action(root++, typeof(T5), val.Item5); - action(root++, typeof(T6), val.Item6); - } - - private static void Process(ValueTuple val, int root, TupleItemAction action) - { - action(root++, typeof(T1), val.Item1); - action(root++, typeof(T2), val.Item2); - action(root++, typeof(T3), val.Item3); - action(root++, 
typeof(T4), val.Item4); - action(root++, typeof(T5), val.Item5); - action(root++, typeof(T6), val.Item6); - action(root++, typeof(T7), val.Item7); - } - - private static void Process(ValueTuple val, int root, TupleItemAction action) - where TRest : struct - { - action(root++, typeof(T1), val.Item1); - action(root++, typeof(T2), val.Item2); - action(root++, typeof(T3), val.Item3); - action(root++, typeof(T4), val.Item4); - action(root++, typeof(T5), val.Item5); - action(root++, typeof(T6), val.Item6); - action(root++, typeof(T7), val.Item7); - ApplyActionToTuple(val.Rest, root++, action); - } - - private static void MarshalInvoke(Processor del, object arg, int root, TupleItemAction action) - { - Contracts.AssertValue(del); - Contracts.Assert(del.Method.IsGenericMethod); - var argType = arg.GetType(); - Contracts.Assert(argType.IsGenericType); - var argGenTypes = argType.GetGenericArguments(); - // The argument generic types should be compatible with the delegate's generic types. - Contracts.Assert(del.Method.GetGenericArguments().Length == argGenTypes.Length); - // Reconstruct the delegate generic types so it adheres to the args generic types. - var newDel = del.Method.GetGenericMethodDefinition().MakeGenericMethod(argGenTypes); - - var result = newDel.Invoke(null, new object[] { arg, root, action }); - } - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/StaticPipeUtils.cs b/src/Microsoft.ML.StaticPipe/StaticPipeUtils.cs deleted file mode 100644 index 031c1bd838..0000000000 --- a/src/Microsoft.ML.StaticPipe/StaticPipeUtils.cs +++ /dev/null @@ -1,464 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System; -using System.Collections.Generic; -using System.Collections.Immutable; -using System.Linq; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Utility methods for components that want to expose themselves in the idioms of the statically-typed pipelines. - /// These utilities are meant to be called by and useful to component authors, not users of those components. The - /// purpose is not to keep them hidden per se, but rather in a place less conspicuous to users that are just trying - /// to use the library without writing additional components of their own. - /// - public static class StaticPipeUtils - { - /// - /// This is a utility method intended to be used by authors of components to provide a strongly typed . - /// This analysis tool provides a standard way for loaders to exploit statically typed pipelines with the - /// standard tuple-shape objects without having to write such code themselves. - /// - /// Estimators will be instantiated with this environment - /// /// Some minor debugging information will be passed along to this channel - /// The input that will be used when invoking , which is used - /// either to produce the input columns. - /// All columns that are yielded by should produce this - /// single reconciler. The analysis code in this method will ensure that this is the first object to be - /// reconciled, before all others. - /// The user provided delegate. - /// The type parameter for the input type to the data loader estimator. - /// The input type of the input delegate. This might be some object out of - /// which one can fetch or else retrieve - /// The schema shape type describing the output. - /// The constructed wrapping data loader estimator. 
- public static DataLoaderEstimator> - LoaderEstimatorAnalyzerHelper( - IHostEnvironment env, - IChannel ch, - TDelegateInput input, - LoaderReconciler baseReconciler, - Func mapper) - { - var loaderEstimator = GeneralFunctionAnalyzer(env, ch, input, baseReconciler, mapper, out var est, col => null); - var schema = StaticSchemaShape.Make(mapper.Method.ReturnParameter); - return new DataLoaderEstimator>(env, loaderEstimator, schema); - } - - internal static IDataLoaderEstimator> - GeneralFunctionAnalyzer( - IHostEnvironment env, - IChannel ch, - TDelegateInput input, - LoaderReconciler baseReconciler, - Func mapper, - out IEstimator estimator, - Func inputNameFunction) - { - Contracts.CheckValue(mapper, nameof(mapper)); - - var method = mapper.Method; - var output = mapper(input); - - KeyValuePair[] outPairs = StaticPipeInternalUtils.GetNamesValues(output, method.ReturnParameter); - - // Map where the key depends on the set of things in the value. The value contains the yet unresolved dependencies. - var keyDependsOn = new Dictionary>(); - // Map where the set of things in the value depend on the key. - var dependsOnKey = new Dictionary>(); - // The set of columns detected with zero dependencies. - var zeroDependencies = new List(); - - // First we build up the two structures above, using a queue and visiting from the outputs up. - var toVisit = new Queue(outPairs.Select(p => p.Value)); - while (toVisit.Count > 0) - { - var col = toVisit.Dequeue(); - ch.CheckParam(col != null, nameof(mapper), "The delegate seems to have null columns returned somewhere in the pipe."); - if (keyDependsOn.ContainsKey(col)) - continue; // Already visited. - - var dependsOn = new HashSet(); - foreach (var dep in col.Dependencies ?? 
Enumerable.Empty()) - { - dependsOn.Add(dep); - if (!dependsOnKey.TryGetValue(dep, out var dependsOnDep)) - { - dependsOnKey[dep] = dependsOnDep = new HashSet(); - toVisit.Enqueue(dep); - } - dependsOnDep.Add(col); - } - keyDependsOn[col] = dependsOn; - if (dependsOn.Count == 0) - zeroDependencies.Add(col); - } - - // Get the base input columns. - var baseInputs = keyDependsOn.Select(p => p.Key).Where(col => col.ReconcilerObj == baseReconciler).ToArray(); - - // The columns that utilize the base reconciler should have no dependencies. This could only happen if - // the caller of this function has introduced a situation whereby they are claiming they can reconcile - // to a data-loader object but still have input data dependencies, which does not make sense and - // indicates that there is a bug in that component code. Unfortunately we can only detect that condition, - // not determine exactly how it arose, but we can still do so to indicate to the user that there is a - // problem somewhere in the stack. - ch.CheckParam(baseInputs.All(col => keyDependsOn[col].Count == 0), - nameof(input), "Bug detected where column producing object was yielding columns with dependencies."); - - // This holds the mappings of columns to names and back. Note that while the same column could be used on - // the *output*, for example, you could hypothetically have `(a: r.Foo, b: r.Foo)`, we treat that as the last thing - // that is done. - var nameMap = new BidirectionalDictionary(); - - // Check to see if we have any set of initial names. This is important in the case where we are mapping - // in an input data view. 
- foreach (var col in baseInputs) - { - string inputName = inputNameFunction(col); - if (inputName != null) - { - ch.Assert(!nameMap.ContainsKey(col)); - ch.Assert(!nameMap.ContainsKey(inputName)); - nameMap[col] = inputName; - - ch.Trace($"Using input with name {inputName}."); - } - } - - estimator = null; - var toCopy = new List<(string dst, string src)>(); - - int tempNum = 0; - // For all outputs, get potential name collisions with used inputs. Resolve by assigning the input a temporary name. - foreach (var p in outPairs) - { - // If the name for the output is already used by one of the inputs, and this output column does not - // happen to have the same name, then we need to rename that input to keep it available. - if (nameMap.TryGetValue(p.Key, out var inputCol) && p.Value != inputCol) - { - ch.Assert(baseInputs.Contains(inputCol)); - string tempName = $"#Temp_{tempNum++}"; - ch.Trace($"Input/output name collision: Renaming '{p.Key}' to '{tempName}'."); - toCopy.Add((tempName, p.Key)); - nameMap[tempName] = nameMap[p.Key]; - ch.Assert(!nameMap.ContainsKey(p.Key)); - } - // If we already have a name for this output column, maybe it is used elsewhere. (This can happen when - // the only thing done with an input is we rename it, or output it twice, or something like this.) In - // this case it is most appropriate to delay renaming till after all other processing has been done in - // that case. But otherwise we may as well just take the name. - if (!nameMap.ContainsKey(p.Value)) - nameMap[p.Key] = p.Value; - } - - // If any renamings were necessary, create the CopyColumns estimator. - if (toCopy.Count > 0) - estimator = new ColumnCopyingEstimator(env, toCopy.ToArray()); - - // First clear the inputs from zero-dependencies yet to be resolved. - foreach (var col in baseInputs) - { - ch.Assert(zeroDependencies.Contains(col)); - ch.Assert(col.ReconcilerObj == baseReconciler); - - zeroDependencies.Remove(col); // Make more efficient... 
- if (!dependsOnKey.TryGetValue(col, out var depends)) - continue; - // If any of these base inputs do not have names because, for example, they do not directly appear - // in the outputs and otherwise do not have names, assign them a name. - if (!nameMap.ContainsKey(col)) - nameMap[col] = $"Temp_{tempNum++}"; - - foreach (var depender in depends) - { - var dependencies = keyDependsOn[depender]; - ch.Assert(dependencies.Contains(col)); - dependencies.Remove(col); - if (dependencies.Count == 0) - zeroDependencies.Add(depender); - } - dependsOnKey.Remove(col); - } - - // Call the reconciler to get the base loader estimator. - var loaderEstimator = baseReconciler.Reconcile(env, baseInputs, nameMap.AsOther(baseInputs)); - ch.AssertValueOrNull(loaderEstimator); - - // Next we iteratively find those columns with zero dependencies, "create" them, and if anything depends on - // these add them to the collection of zero dependencies, etc. etc. - while (zeroDependencies.Count > 0) - { - // All columns with the same reconciler can be transformed together. - - // Note that the following policy of just taking the first group is not optimal. So for example, we - // could have three columns, (a, b, c). If we had the output (a.X(), b.X() c.Y().X()), then maybe we'd - // reconcile a.X() and b.X() together, then reconcile c.Y(), then reconcile c.Y().X() alone. Whereas, we - // could have reconciled c.Y() first, then reconciled a.X(), b.X(), and c.Y().X() together. - var group = zeroDependencies.GroupBy(p => p.ReconcilerObj).First(); - // Beyond that first group that *might* be a data loader reconciler, all subsequent operations will - // be on where the data is already loaded and so accept data as an input, that is, they should produce - // an estimator. If this is not the case something seriously wonky is going on, most probably that the - // user tried to use a column from another source. If this is detected we can produce a sensible error - // message to tell them not to do this. 
- if (!(group.Key is EstimatorReconciler rec)) - { - throw ch.Except("Columns from multiple sources were detected. " + - "Did the caller use a " + nameof(PipelineColumn) + " from another delegate?"); - } - PipelineColumn[] cols = group.ToArray(); - // All dependencies should, by this time, have names. - ch.Assert(cols.SelectMany(c => c.Dependencies).All(dep => nameMap.ContainsKey(dep))); - foreach (var newCol in cols) - { - if (!nameMap.ContainsKey(newCol)) - nameMap[newCol] = $"#Temp_{tempNum++}"; - - } - - var localInputNames = nameMap.AsOther(cols.SelectMany(c => c.Dependencies ?? Enumerable.Empty())); - var localOutputNames = nameMap.AsOther(cols); - var usedNames = new HashSet(nameMap.Keys1.Except(localOutputNames.Values)); - - var localEstimator = rec.Reconcile(env, cols, localInputNames, localOutputNames, usedNames); - loaderEstimator = loaderEstimator?.Append(localEstimator); - estimator = estimator?.Append(localEstimator) ?? localEstimator; - - foreach (var newCol in cols) - { - zeroDependencies.Remove(newCol); // Make more efficient!! - - // Finally, we find all columns that depend on this one. If this happened to be the last pending - // dependency, then we add it to the list. - if (dependsOnKey.TryGetValue(newCol, out var depends)) - { - foreach (var depender in depends) - { - var dependencies = keyDependsOn[depender]; - Contracts.Assert(dependencies.Contains(newCol)); - dependencies.Remove(newCol); - if (dependencies.Count == 0) - zeroDependencies.Add(depender); - } - dependsOnKey.Remove(newCol); - } - } - } - - if (keyDependsOn.Any(p => p.Value.Count > 0)) - { - // This might happen if the user does something incredibly strange, like, say, take some prior - // lambda, assign a column to a local variable, then re-use it downstream in a different lambda. - // The user would have to go to some extraordinary effort to do that, but nonetheless we want to - // fail with a semi-sensible error message. 
- throw ch.Except("There were some leftover columns with unresolved dependencies. " + - "Did the caller use a " + nameof(PipelineColumn) + " from another delegate?"); - } - - // Now do the final renaming, if any is necessary. - toCopy.Clear(); - foreach (var p in outPairs) - { - // TODO: Right now we just write stuff out. Once the copy-columns estimator is in place - // we ought to do this for real. - Contracts.Assert(nameMap.ContainsKey(p.Value)); - string currentName = nameMap[p.Value]; - if (currentName != p.Key) - { - ch.Trace($"Will copy '{p.Key}' to '{currentName}'"); - toCopy.Add((p.Key, currentName)); - } - } - - // If any final renamings were necessary, insert the appropriate CopyColumns transform. - if (toCopy.Count > 0) - { - var copyEstimator = new ColumnCopyingEstimator(env, toCopy.ToArray()); - if (estimator == null) - estimator = copyEstimator; - else - estimator = estimator.Append(copyEstimator); - } - - ch.Trace($"Exiting {nameof(LoaderEstimatorAnalyzerHelper)}"); - - return loaderEstimator; - } - - private sealed class BidirectionalDictionary - { - private readonly Dictionary _d12; - private readonly Dictionary _d21; - - public BidirectionalDictionary() - { - _d12 = new Dictionary(); - _d21 = new Dictionary(); - } - - public bool ContainsKey(T1 k) => _d12.ContainsKey(k); - public bool ContainsKey(T2 k) => _d21.ContainsKey(k); - - public IEnumerable Keys1 => _d12.Keys; - public IEnumerable Keys2 => _d21.Keys; - - public bool TryGetValue(T1 k, out T2 v) => _d12.TryGetValue(k, out v); - public bool TryGetValue(T2 k, out T1 v) => _d21.TryGetValue(k, out v); - - public T1 this[T2 key] - { - get => _d21[key]; - set { - Contracts.CheckValue((object)key, nameof(key)); - Contracts.CheckValue((object)value, nameof(value)); - - bool removeOldKey = _d12.TryGetValue(value, out var oldKey); - if (_d21.TryGetValue(key, out var oldValue)) - _d12.Remove(oldValue); - if (removeOldKey) - _d21.Remove(oldKey); - - _d12[value] = key; - _d21[key] = value; - 
Contracts.Assert(_d12.Count == _d21.Count); - } - } - - public T2 this[T1 key] - { - get => _d12[key]; - set { - Contracts.CheckValue((object)key, nameof(key)); - Contracts.CheckValue((object)value, nameof(value)); - - bool removeOldKey = _d21.TryGetValue(value, out var oldKey); - if (_d12.TryGetValue(key, out var oldValue)) - _d21.Remove(oldValue); - if (removeOldKey) - _d12.Remove(oldKey); - - _d21[value] = key; - _d12[key] = value; - - Contracts.Assert(_d12.Count == _d21.Count); - } - } - - public IReadOnlyDictionary AsOther(IEnumerable keys) - { - Dictionary d = new Dictionary(); - foreach (var v in keys) - d[v] = _d12[v]; - return d; - } - - public IReadOnlyDictionary AsOther(IEnumerable keys) - { - Dictionary d = new Dictionary(); - foreach (var v in keys) - d[v] = _d21[v]; - return d; - } - } - - /// - /// Retrieves the internally stored environment in . - /// Intended usecases is component generating code that needs to have access to an - /// environment. - /// - /// The shape type. - /// The object for which we get the environment. - /// The internal of the object. - public static IHostEnvironment GetEnvironment(SchemaBearing schematized) - { - Contracts.CheckValue(schematized, nameof(schematized)); - return schematized.Env; - } - - /// - /// Retrieves the index helper object for . - /// - /// The shape type. - /// The object for which we get the indexer. - /// The index helper. - public static IndexHelper GetIndexer(SchemaBearing schematized) - { - Contracts.CheckValue(schematized, nameof(schematized)); - return schematized.Indexer; - } - - /// - /// An indexer that can be constructed over a static pipeline object, to enable us to determine - /// the names of the columns. This is used by component authors to allow users to "select" a column, - /// but the structure is itself not directly used by users of the API as a general rule. 
Rather, - /// one might imagine the component exposing some sort of delegate taking method that given the - /// instance , returns one of the instances stored - /// therein, which the component can use to do specific operations. - /// - /// The shape type. - public sealed class IndexHelper - { - /// - /// An instance of the shape type whose items can be used to index to find the names of column. - /// - public T Indices { get; } - - /// - /// Maps the items inside to the names of the associated data's column's name. - /// Components can use this structure, but it may be preferable to use - /// to ensure uniformity in the exception messages, if possible. - /// - private ImmutableDictionary Map { get; } - - /// - /// Performs a lookup on . If the key is not present this will throw an exception - /// more generally helpful in context than that of a direct failure of index on . - /// - /// The column to look up. - /// The optional exception context. - /// If successful the name of the column. - public string Get(PipelineColumn key, IExceptionContext ectx = null) - { - Contracts.CheckValueOrNull(ectx); - ectx.CheckValue(key, nameof(key)); - if (!Map.TryGetValue(key, out string name)) - { - // The most obvious reason this might happen is if the user did something like try to attempt to - // apply an estimator inside the index delegate, which obviously will not work. - throw ectx.ExceptParam(nameof(key), "Column does not appear to be valid for this structure. " + - "Please use columns in the provided indexing object without attempting modification."); - } - return name; - } - - /// - /// Constructor for the index helper. Note that any public or component code will instead use - /// , to fetch the lazily constructed instance of this - /// object, since its construction is somewhat expensive. - /// - /// Constructs the helper for this object. 
- internal IndexHelper(SchemaBearing schematized) - { - Indices = StaticPipeInternalUtils.MakeAnalysisInstance(out var rec); - // We define this delegate just to get its return parameter, so the name extractor has something - // to work over. Because this is defined without the names the names will be default, which is not - // useful, except we can get the "real" names from the schematized object's shape. - Func dummyFunc = () => default; - var pairs = StaticPipeInternalUtils.GetNamesValues(Indices, dummyFunc.Method.ReturnParameter); - Contracts.Assert(pairs.Length == schematized.Shape.Pairs.Length); - - var builder = ImmutableDictionary.CreateBuilder(); - for (int i = 0; i < pairs.Length; ++i) - // Each "index" come from the analysis of the indices object, but we get the names from the shape. - builder.Add(pairs[i].Value, schematized.Shape.Pairs[i].Key); - Map = builder.ToImmutable(); - } - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/StaticSchemaShape.cs b/src/Microsoft.ML.StaticPipe/StaticSchemaShape.cs deleted file mode 100644 index 07a1095cb3..0000000000 --- a/src/Microsoft.ML.StaticPipe/StaticSchemaShape.cs +++ /dev/null @@ -1,350 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.Reflection; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// A schema shape with names corresponding to a type parameter in one of the typed variants - /// of the data pipeline structures. Instances of this class tend to be bundled with the statically - /// typed variants of the dynamic structures (for example, and so forth), - /// and their primary purpose is to ensure that the schemas of the dynamic structures and the - /// statically declared structures are compatible. 
- /// - internal sealed class StaticSchemaShape - { - /// - /// The enumeration of name/type pairs. Do not modify. - /// - public readonly KeyValuePair[] Pairs; - - private StaticSchemaShape(KeyValuePair[] pairs) - { - Contracts.AssertValue(pairs); - Pairs = pairs; - } - - /// - /// Creates a new instance out of a parameter info, presumably fetched from a user specified delegate. - /// - /// The static shape type. - /// The parameter info on the method, whose type should be - /// . - /// A new instance with names and members types enumerated. - public static StaticSchemaShape Make(ParameterInfo info) - { - Contracts.AssertValue(info); - var pairs = StaticPipeInternalUtils.GetNamesTypes(info); - return new StaticSchemaShape(pairs); - } - - /// - /// Checks whether this object is consistent with an actual schema from a dynamic object, - /// throwing exceptions if not. - /// - /// The context on which to throw exceptions - /// The schema to check - public void Check(IExceptionContext ectx, DataViewSchema schema) - { - Contracts.AssertValue(ectx); - ectx.AssertValue(schema); - - foreach (var pair in Pairs) - { - if (!schema.TryGetColumnIndex(pair.Key, out int colIdx)) - throw ectx.ExceptParam(nameof(schema), $"Column named '{pair.Key}' was not found"); - var col = schema[colIdx]; - var type = GetTypeOrNull(col); - if ((type != null && !pair.Value.IsAssignableFromStaticPipeline(type)) || (type == null && IsStandard(ectx, pair.Value))) - { - // When not null, we can use IsAssignableFrom to indicate we could assign to this, so as to allow - // for example Key to be considered to be compatible with Key. - - // In the null case, while we cannot directly verify an unrecognized type, we can at least verify - // that the statically declared type should not have corresponded to a recognized type. 
- if (!pair.Value.IsAssignableFromStaticPipeline(type)) - { - throw ectx.ExceptParam(nameof(schema), - $"Column '{pair.Key}' of type '{col.Type}' cannot be expressed statically as type '{pair.Value}'."); - } - } - } - } - - /// - /// Checks whether this object is consistent with an actual schema shape from a dynamic object, - /// throwing exceptions if not. - /// - /// The context on which to throw exceptions - /// The schema shape to check - public void Check(IExceptionContext ectx, SchemaShape shape) - { - Contracts.AssertValue(ectx); - ectx.AssertValue(shape); - - foreach (var pair in Pairs) - { - if (!shape.TryFindColumn(pair.Key, out var col)) - throw ectx.ExceptParam(nameof(shape), $"Column named '{pair.Key}' was not found"); - var type = GetTypeOrNull(col); - if ((type != null && !pair.Value.IsAssignableFromStaticPipeline(type)) || (type == null && IsStandard(ectx, pair.Value))) - { - // When not null, we can use IsAssignableFrom to indicate we could assign to this, so as to allow - // for example Key to be considered to be compatible with Key. - - // In the null case, while we cannot directly verify an unrecognized type, we can at least verify - // that the statically declared type should not have corresponded to a recognized type. - if (!pair.Value.IsAssignableFromStaticPipeline(type)) - { - throw ectx.ExceptParam(nameof(shape), - $"Column '{pair.Key}' of type '{col.GetTypeString()}' cannot be expressed statically as type '{pair.Value}'."); - } - } - } - } - - private static Type GetTypeOrNull(SchemaShape.Column col) - { - Contracts.Assert(col.IsValid); - - Type vecType = null; - - switch (col.Kind) - { - case SchemaShape.Column.VectorKind.Scalar: - break; // Keep it null. - case SchemaShape.Column.VectorKind.Vector: - // Assume that if the normalized metadata is indicated by the schema shape, it is bool and true. - vecType = col.IsNormalized() ? 
typeof(NormVector<>) : typeof(Vector<>); - break; - case SchemaShape.Column.VectorKind.VariableVector: - vecType = typeof(VarVector<>); - break; - default: - // Not recognized. Not necessarily an error of the user, may just indicate this code ought to be updated. - Contracts.Assert(false); - return null; - } - - if (col.IsKey) - { - Type physType = GetPhysicalType(col.ItemType); - Contracts.Assert(physType == typeof(byte) || physType == typeof(ushort) - || physType == typeof(uint) || physType == typeof(ulong)); - var keyType = typeof(Key<>).MakeGenericType(physType); - if (col.Annotations.TryFindColumn(AnnotationUtils.Kinds.KeyValues, out var kvMeta)) - { - var subtype = GetTypeOrNull(kvMeta); - if (subtype != null && subtype.IsGenericType) - { - var sgtype = subtype.GetGenericTypeDefinition(); - if (sgtype == typeof(NormVector<>) || sgtype == typeof(Vector<>)) - { - var args = subtype.GetGenericArguments(); - Contracts.Assert(args.Length == 1); - keyType = typeof(Key<,>).MakeGenericType(physType, args[0]); - } - } - } - - return vecType?.MakeGenericType(keyType) ?? keyType; - } - - if (col.ItemType is PrimitiveDataViewType pt) - { - Type physType = GetPhysicalType(pt); - // Though I am unaware of any existing instances, it is theoretically possible for a - // primitive type to exist, have the same data kind as one of the existing types, and yet - // not be one of the built in types. (For example, an outside analogy to the key types.) For this - // reason, we must be certain that when we return here we are covering one fo the builtin types. 
- if (physType != null && ( - pt == NumberDataViewType.SByte || pt == NumberDataViewType.Int16 || pt == NumberDataViewType.Int32 || pt == NumberDataViewType.Int32 || - pt == NumberDataViewType.Byte || pt == NumberDataViewType.UInt16 || pt == NumberDataViewType.UInt32 || pt == NumberDataViewType.UInt32 || - pt == NumberDataViewType.Single || pt == NumberDataViewType.Double || pt == RowIdDataViewType.Instance || pt == BooleanDataViewType.Instance || - pt == DateTimeDataViewType.Instance || pt == DateTimeOffsetDataViewType.Instance || pt == TimeSpanDataViewType.Instance || - pt == TextDataViewType.Instance)) - { - return (vecType ?? typeof(Scalar<>)).MakeGenericType(physType); - } - } - - return null; - } - - /// - /// Returns true if the input type is something recognizable as being oen of the standard - /// builtin types. This method will also throw if something is detected as being definitely - /// wrong (for example, the input type does not descend from at all, - /// or a is declared with a type parameter or - /// something. - /// - private static bool IsStandard(IExceptionContext ectx, Type t) - { - Contracts.AssertValue(ectx); - ectx.AssertValue(t); - if (!typeof(PipelineColumn).IsAssignableFrom(t)) - { - throw ectx.ExceptParam(nameof(t), $"Type {t} was not even of {nameof(PipelineColumn)}"); - } - var gt = t.IsGenericType ? 
t.GetGenericTypeDefinition() : t; - if (gt != typeof(Scalar<>) && gt != typeof(Key<>) && gt != typeof(Key<,>) && gt != typeof(VarKey<>) && - gt != typeof(Vector<>) && gt != typeof(VarVector<>) && gt != typeof(NormVector<>) && gt != typeof(Custom<>)) - { - throw ectx.ExceptParam(nameof(t), - $"Type {t} was not one of the standard subclasses of {nameof(PipelineColumn)}"); - } - ectx.Assert(t.IsGenericType); - var ga = t.GetGenericArguments(); - ectx.AssertNonEmpty(ga); - - if (gt == typeof(Key<>) || gt == typeof(Key<,>) || gt == typeof(VarKey<>)) - { - ectx.Assert((gt == typeof(Key<,>) && ga.Length == 2) || ga.Length == 1); - var kt = ga[0]; - if (kt != typeof(byte) && kt != typeof(ushort) && kt != typeof(uint) && kt != typeof(ulong)) - throw ectx.ExceptParam(nameof(t), $"Type parameter {kt.Name} is not a valid type for key"); - return gt != typeof(Key<,>) || IsStandardCore(ga[1]); - } - - ectx.Assert(ga.Length == 1); - return IsStandardCore(ga[0]); - } - - private static bool IsStandardCore(Type t) - { - Contracts.AssertValue(t); - return t == typeof(float) || t == typeof(double) || t == typeof(string) || t == typeof(bool) || - t == typeof(sbyte) || t == typeof(short) || t == typeof(int) || t == typeof(long) || - t == typeof(byte) || t == typeof(ushort) || t == typeof(uint) || t == typeof(ulong) || - t == typeof(TimeSpan) || t == typeof(DateTime) || t == typeof(DateTimeOffset); - } - - /// - /// Returns a .NET type corresponding to the static pipelines that would tend to represent this column. - /// Generally this will return null if it simply does not recognize the type but might throw if - /// there is something seriously wrong with it. 
- /// - /// The column - /// The .NET type for the static pipelines that should be used to reflect this type, given - /// both the characteristics of the as well as one or two crucial pieces of metadata - private static Type GetTypeOrNull(DataViewSchema.Column col) - { - var t = col.Type; - - Type vecType = null; - if (t is VectorDataViewType vt) - { - vecType = vt.Size > 0 ? typeof(Vector<>) : typeof(VarVector<>); - // Check normalized subtype of vectors. - if (vt.Size > 0) - { - // Check to see if the column is normalized. - // Once we shift to metadata being a row globally we can also make this a bit more efficient: - var meta = col.Annotations; - var normalizedColumn = meta.Schema.GetColumnOrNull(AnnotationUtils.Kinds.IsNormalized); - if (normalizedColumn.HasValue) - { - if (normalizedColumn.Value.Type == BooleanDataViewType.Instance) - { - bool val = default; - meta.GetGetter(normalizedColumn.Value)(ref val); - if (val) - vecType = typeof(NormVector<>); - } - } - } - t = vt.ItemType; - // Fall through to the non-vector case to handle subtypes. - } - Contracts.Assert(!(t is VectorDataViewType)); - - if (t is KeyDataViewType kt) - { - Type physType = GetPhysicalType(kt); - Contracts.Assert(physType == typeof(byte) || physType == typeof(ushort) - || physType == typeof(uint) || physType == typeof(ulong)); - var keyType = kt.Count > 0 ? typeof(Key<>) : typeof(VarKey<>); - keyType = keyType.MakeGenericType(physType); - - if (kt.Count > 0) - { - // Check to see if we have key value metadata of the appropriate type, size, and whatnot. 
- var meta = col.Annotations; - if (meta.Schema.TryGetColumnIndex(AnnotationUtils.Kinds.KeyValues, out int kvcolIndex)) - { - var kvcol = meta.Schema[kvcolIndex]; - var kvType = kvcol.Type; - Contracts.Assert(kt.Count <= int.MaxValue); - if (kvType is VectorDataViewType kvVecType && kvVecType.Size == (int)kt.Count) - { - Contracts.Assert(kt.Count > 0); - var subtype = GetTypeOrNull(kvcol); - if (subtype != null && subtype.IsGenericType) - { - var sgtype = subtype.GetGenericTypeDefinition(); - if (sgtype == typeof(NormVector<>) || sgtype == typeof(Vector<>)) - { - var args = subtype.GetGenericArguments(); - Contracts.Assert(args.Length == 1); - keyType = typeof(Key<,>).MakeGenericType(physType, args[0]); - } - } - } - } - } - return vecType?.MakeGenericType(keyType) ?? keyType; - } - - if (t is PrimitiveDataViewType pt) - { - Type physType = GetPhysicalType(pt); - // Though I am unaware of any existing instances, it is theoretically possible for a - // primitive type to exist, have the same data kind as one of the existing types, and yet - // not be one of the built in types. (For example, an outside analogy to the key types.) For this - // reason, we must be certain that when we return here we are covering one fo the builtin types. - if (physType != null && ( - pt == NumberDataViewType.SByte || pt == NumberDataViewType.Int16 || pt == NumberDataViewType.Int32 || pt == NumberDataViewType.Int64 || - pt == NumberDataViewType.Byte || pt == NumberDataViewType.UInt16 || pt == NumberDataViewType.UInt32 || pt == NumberDataViewType.UInt64 || - pt == NumberDataViewType.Single || pt == NumberDataViewType.Double || pt == RowIdDataViewType.Instance || pt == BooleanDataViewType.Instance || - pt == DateTimeDataViewType.Instance || pt == DateTimeOffsetDataViewType.Instance || pt == TimeSpanDataViewType.Instance || - pt == TextDataViewType.Instance)) - { - return (vecType ?? 
typeof(Scalar<>)).MakeGenericType(physType); - } - } - - return null; - } - - /// - /// Note that this can return a different type than the actual physical representation type, for example, for - /// the return type is , even though we do not use that - /// type for communicating text. - /// - /// The basic type used to represent an item type in the static pipeline - private static Type GetPhysicalType(DataViewType columnType) - { - switch (columnType) - { - case NumberDataViewType numberType: - case KeyDataViewType keyType: - case TimeSpanDataViewType timeSpanType: - case DateTimeDataViewType dateTimeType: - case DateTimeOffsetDataViewType dateTimeOffsetType: - case BooleanDataViewType boolType: - return columnType.RawType; - case TextDataViewType textType: - return typeof(string); - - default: - return null; - } - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/TermStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TermStaticExtensions.cs deleted file mode 100644 index bfff7564c3..0000000000 --- a/src/Microsoft.ML.StaticPipe/TermStaticExtensions.cs +++ /dev/null @@ -1,1174 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms; - -namespace Microsoft.ML.StaticPipe -{ - public static partial class TermStaticExtensions - { - // Do not edit this file directly. Rather, it is generated out of TermStaticExtensions.tt. - /// - /// Information on the result of fitting a to-key transform. - /// - /// The type of the values. - public sealed class ToKeyFitResult - { - /// - /// For user defined delegates that accept instances of the containing type. - /// - /// - public delegate void OnFit(ToKeyFitResult result); - - // At the moment this is empty. Once PR #863 clears, we can change this class to hold the output - // key-values metadata. 
- - [BestFriend] - internal ToKeyFitResult(ValueToKeyMappingTransformer.TermMap map) - { - } - } - - #region For string inputs. - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because the empty string is never entered into the dictionary, it will always map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult>.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because the empty string is never entered into the dictionary, it will always map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult>.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. 
During transformation, any values unobserved during fitting will map to the missing key. - /// Because the empty string is never entered into the dictionary, it will always map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult>.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because the empty string is never entered into the dictionary, it will always map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult>.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. 
During transformation, any values unobserved during fitting will map to the missing key. - /// Because the empty string is never entered into the dictionary, it will always map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult>.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because the empty string is never entered into the dictionary, it will always map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. 
- public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult>.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - - #region For float inputs. - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. 
- public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. 
- /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - - #region For double inputs. - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. 
During transformation, any values unobserved during fitting will map to the missing key. - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. 
- public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. 
This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - - #region For sbyte inputs. - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. 
During transformation, any values unobserved during fitting will map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. 
The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - - #region For short inputs. 
- /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - - #region For int inputs. 
- /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - - #region For long inputs. 
- /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - - #region For byte inputs. 
- /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - - #region For ushort inputs. 
- /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - - #region For uint inputs. 
- /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - - #region For ulong inputs. 
- /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. 
- /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - - #region For bool inputs. 
- /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Scalar input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. 
- public static VarVector> ToKey(this VarVector input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Key ToKey(this Key input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplScalar(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. 
- /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static Vector> ToKey(this Vector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. - public static VarVector> ToKey(this VarVector> input, - KeyOrdinality keyOrdinality = DefSort, int maxItems = DefMax, ToKeyFitResult.OnFit onFit = null) - => new ImplVarVector(Contracts.CheckRef(input, nameof(input)), new Config(keyOrdinality, maxItems, Wrap(onFit))); - #endregion - } - - public enum KeyOrdinality : byte - { - /// - /// Terms will be assigned ID in the order in which they appear. - /// - Occurence = ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence, - - /// - /// Terms will be assigned ID according to their sort via an ordinal comparison for the type. 
- /// - Value = ValueToKeyMappingEstimator.KeyOrdinality.ByValue - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.StaticPipe/TermStaticExtensions.tt b/src/Microsoft.ML.StaticPipe/TermStaticExtensions.tt deleted file mode 100644 index 1f4e47b9d7..0000000000 --- a/src/Microsoft.ML.StaticPipe/TermStaticExtensions.tt +++ /dev/null @@ -1,99 +0,0 @@ -<#@ template debug="false" hostspecific="false" language="C#" #> -<#@ assembly name="System.Core" #> -<#@ import namespace="System.Linq" #> -<#@ import namespace="System.Text" #> -<#@ import namespace="System.Collections.Generic" #> -<#@ output extension=".cs" #> -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms; - -namespace Microsoft.ML.StaticPipe -{ - public static partial class TermStaticExtensions - { - // Do not edit this file directly. Rather, it is generated out of TermStaticExtensions.tt. - /// - /// Information on the result of fitting a to-key transform. - /// - /// The type of the values. - public sealed class ToKeyFitResult - { - /// - /// For user defined delegates that accept instances of the containing type. - /// - /// - public delegate void OnFit(ToKeyFitResult result); - - // At the moment this is empty. Once PR #863 clears, we can change this class to hold the output - // key-values metadata. - - [BestFriend] - internal ToKeyFitResult(ValueToKeyMappingTransformer.TermMap map) - { - } - } -<# -// Let's skip the time-based types for now. -foreach (string typeName in new string[] { "string", "float", "double", "sbyte", "short", "int", "long", "byte", "ushort", "uint", "ulong", "bool" }) { -bool startRegionBlock = true; -#> - - #region For <#=typeName#> inputs. 
-<# -foreach (bool inputIsKey in new bool[] { false, true }) { -foreach (string arityName in new string[] { "Scalar", "Vector", "VarVector" }) { -string onFitType = typeName == "string" ? "ReadOnlyMemory" : typeName; -bool omitInputArity = arityName == "Scalar" && inputIsKey; -bool isNumeric = typeName != "string" && typeName != "bool"; -bool isScalar = arityName == "Scalar"; - -if (!startRegionBlock) { // Put lines between the declarations to make them look pretty, but not after the region tag. -#> - -<#} -startRegionBlock = false; -#> - /// - /// Map values to a key-value representation, where the key type's values are those values observed in the input - /// during fitting. During transformation, any values unobserved during fitting will map to the missing key. -<# -if (typeName == "string") { #> - /// Because the empty string is never entered into the dictionary, it will always map to the missing key. -<# } -if (typeName == "float" || typeName == "double") { #> - /// Because NaN floating point values are never entered into the dictionary, and they will always map to the missing key. -<# } -if (!isScalar && isNumeric && !inputIsKey) { #> - /// Zero is considered a valid value and so will be entered into the dictionary if observed. The potential perf - /// implication in that case is that sparse input numeric vectors will map to dense output key vectors. -<# } -if (inputIsKey) { #> - /// We are inputting a key type with values, and in that case the dictionary is considered to be built over the - /// values of the keys, rather than the keys themselves. This also mean the key-values learned for the output - /// will be a subset of the key-values in the input. -<# } - -#> - /// - /// The input column. - /// The ordering policy for what order values will appear in the enumerated set. - /// The maximum number of items. - /// Called upon fitting with the learnt enumeration on the dataset. - /// The key-valued column. 
- public static <#=isScalar?"":arityName+"<"#>Key><#=isScalar?"":">"#> ToKey<#=inputIsKey?"":""#>(this <#=omitInputArity?"":arityName+"<"#><#=inputIsKey?"Key<#=typeName#>><#=inputIsKey&&!isScalar?">":""#> input, - KeyValueOrder order = DefSort, int maxItems = DefMax, ToKeyFitResult<<#=onFitType#>>.OnFit onFit = null) - => new Impl<#=arityName#><<#=typeName#>>(Contracts.CheckRef(input, nameof(input)), new Config(order, maxItems, Wrap(onFit))); -<# -} } -#> - #endregion -<# -} -#> - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.StaticPipe/TextLoaderStatic.cs b/src/Microsoft.ML.StaticPipe/TextLoaderStatic.cs deleted file mode 100644 index a36d51a42d..0000000000 --- a/src/Microsoft.ML.StaticPipe/TextLoaderStatic.cs +++ /dev/null @@ -1,307 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - public static class TextLoaderStatic - { - /// - /// Configures a loader for text files. - /// - /// The type shape parameter, which must be a valid-schema shape. As a practical - /// matter this is generally not explicitly defined from the user, but is instead inferred from the return - /// type of the where one takes an input and uses it to compose - /// a shape-type instance describing what the columns are and how to load them from the file. - /// The environment. - /// The delegate that describes what fields to read from the text file, as well as - /// describing their input type. The way in which it works is that the delegate is fed a , - /// and the user composes a shape type with instances out of that . - /// The resulting data will have columns with the names corresponding to their names in the shape type. - /// Input files. 
If null then no files are read, but this means that options or - /// configurations that require input data for initialization (for example, or - /// ) with a null second argument. - /// Text field separator. - /// Data file has header with feature names. - /// Whether the input -may include quoted values, which can contain separator - /// characters, colons, and distinguish empty values from missing values. When true, consecutive separators - /// denote a missing value and an empty value is denoted by "". When false, consecutive separators - /// denote an empty value. - /// Whether the input may include sparse representations. - /// Remove trailing whitespace from lines. - /// A configured statically-typed loader for text files. - public static DataLoader CreateLoader<[IsShape] TShape>( - IHostEnvironment env, Func func, IMultiStreamSource files = null, - char separator = '\t', bool hasHeader = false, bool allowQuoting = true, bool allowSparse = true, - bool trimWhitspace = false) - { - Contracts.CheckValue(env, nameof(env)); - env.CheckValue(func, nameof(func)); - env.CheckValueOrNull(files); - - // Populate all args except the columns. 
- var args = new TextLoader.Options(); - args.AllowQuoting = allowQuoting; - args.AllowSparse = allowSparse; - args.HasHeader = hasHeader; - args.Separators = new[] { separator }; - args.TrimWhitespace = trimWhitspace; - - var rec = new TextReconciler(args, files); - var ctx = new Context(rec); - - using (var ch = env.Start("Initializing " + nameof(TextLoader))) - { - var loaderEst = StaticPipeUtils.LoaderEstimatorAnalyzerHelper(env, ch, ctx, rec, func); - Contracts.AssertValue(loaderEst); - return loaderEst.Fit(files); - } - } - - private sealed class TextReconciler : LoaderReconciler - { - private readonly TextLoader.Options _args; - private readonly IMultiStreamSource _files; - - public TextReconciler(TextLoader.Options options, IMultiStreamSource files) - { - Contracts.AssertValue(options); - Contracts.AssertValueOrNull(files); - - _args = options; - _files = files; - } - - public override IDataLoaderEstimator> Reconcile( - IHostEnvironment env, PipelineColumn[] toOutput, IReadOnlyDictionary outputNames) - { - Contracts.AssertValue(env); - Contracts.AssertValue(toOutput); - Contracts.AssertValue(outputNames); - Contracts.Assert(_args.Columns == null); - - TextLoader.Column Create(PipelineColumn pipelineCol) - { - var pipelineArgCol = (IPipelineArgColumn)pipelineCol; - var argCol = pipelineArgCol.Create(); - argCol.Name = outputNames[pipelineCol]; - return argCol; - } - - var cols = _args.Columns = new TextLoader.Column[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - cols[i] = Create(toOutput[i]); - - var orig = new TextLoader(env, _args, _files); - return new TrivialLoaderEstimator(orig); - } - } - - private interface IPipelineArgColumn - { - /// - /// Creates a object corresponding to the , with everything - /// filled in except . - /// - TextLoader.Column Create(); - } - - /// - /// Context object by which a user can indicate what fields they want to read from a text file, and what data type they ought to have. 
- /// Instances of this class are never made but the user, but rather are fed into the delegate in - /// . - /// - public sealed class Context - { - private readonly Reconciler _rec; - - internal Context(Reconciler rec) - { - Contracts.AssertValue(rec); - _rec = rec; - } - - /// - /// Reads a scalar Boolean column from a single field in the text file. - /// - /// The zero-based index of the field to read from. - /// The column representation. - public Scalar LoadBool(int ordinal) => Load(InternalDataKind.BL, ordinal); - - /// - /// Reads a vector Boolean column from a range of fields in the text file. - /// - /// The zero-based inclusive lower index of the field to read from. - /// The zero-based inclusive upper index of the field to read from. - /// Note that if this is null, it will read to the end of the line. The file(s) - /// will be inspected to get the length of the type. - /// The column representation. - public Vector LoadBool(int minOrdinal, int? maxOrdinal) => Load(InternalDataKind.BL, minOrdinal, maxOrdinal); - - /// - /// Create a representation for a key loaded from TextLoader as an unsigned integer (32 bits). - /// - /// The zero-based index of the field to read from. - /// If specified, it's the count or cardinality of valid key values. - /// Using null initalizes to uint.MaxValue - /// The column representation. - public Key LoadKey(int ordinal, ulong? keyCount) => Load(InternalDataKind.U4, ordinal, keyCount); - - /// - /// Reads a scalar single-precision floating point column from a single field in the text file. - /// - /// The zero-based index of the field to read from. - /// The column representation. - public Scalar LoadFloat(int ordinal) => Load(InternalDataKind.R4, ordinal); - - /// - /// Reads a vector single-precision column from a range of fields in the text file. - /// - /// The zero-based inclusive lower index of the field to read from. - /// The zero-based inclusive upper index of the field to read from. 
- /// Note that if this is null, it will read to the end of the line. The file(s) - /// will be inspected to get the length of the type. - /// The column representation. - public Vector LoadFloat(int minOrdinal, int? maxOrdinal) => Load(InternalDataKind.R4, minOrdinal, maxOrdinal); - - /// - /// Reads a scalar double-precision floating point column from a single field in the text file. - /// - /// The zero-based index of the field to read from. - /// The column representation. - public Scalar LoadDouble(int ordinal) => Load(InternalDataKind.R8, ordinal); - - /// - /// Reads a vector double-precision column from a range of fields in the text file. - /// - /// The zero-based inclusive lower index of the field to read from. - /// The zero-based inclusive upper index of the field to read from. - /// Note that if this is null, it will read to the end of the line. The file(s) - /// will be inspected to get the length of the type. - /// The column representation. - public Vector LoadDouble(int minOrdinal, int? maxOrdinal) => Load(InternalDataKind.R8, minOrdinal, maxOrdinal); - - /// - /// Reads a scalar textual column from a single field in the text file. - /// - /// The zero-based index of the field to read from. - /// The column representation. - public Scalar LoadText(int ordinal) => Load(InternalDataKind.TX, ordinal); - - /// - /// Reads a vector textual column from a range of fields in the text file. - /// - /// The zero-based inclusive lower index of the field to read from. - /// The zero-based inclusive upper index of the field to read from. - /// Note that if this is null, it will read to the end of the line. The file(s) - /// will be inspected to get the length of the type. - /// The column representation. - public Vector LoadText(int minOrdinal, int? 
maxOrdinal) => Load(InternalDataKind.TX, minOrdinal, maxOrdinal); - - private Scalar Load(InternalDataKind kind, int ordinal) - { - Contracts.CheckParam(ordinal >= 0, nameof(ordinal), "Should be non-negative"); - return new MyScalar(_rec, kind, ordinal); - } - - private Vector Load(InternalDataKind kind, int minOrdinal, int? maxOrdinal) - { - Contracts.CheckParam(minOrdinal >= 0, nameof(minOrdinal), "Should be non-negative"); - var v = maxOrdinal >= minOrdinal; - Contracts.CheckParam(!(maxOrdinal < minOrdinal), nameof(maxOrdinal), "If specified, cannot be less than " + nameof(minOrdinal)); - return new MyVector(_rec, kind, minOrdinal, maxOrdinal); - } - - private Key Load(InternalDataKind kind, int ordinal, ulong? keyCount) - { - Contracts.CheckParam(ordinal >= 0, nameof(ordinal), "Should be non-negative"); - return new MyKey(_rec, kind, ordinal, keyCount); - } - - /// - /// A data type used to bridge and . It can be used as - /// in static-typed pipelines and provides for translating itself into . - /// - private class MyKey : Key, IPipelineArgColumn - { - // The storage type that the targeted content would be loaded as. - private readonly InternalDataKind _kind; - // The position where the key value gets read from. - private readonly int _oridinal; - // The count or cardinality of valid key values. Its value is null if unbounded. - private readonly ulong? _keyCount; - - // Contstuct a representation for a key-typed column loaded from a text file. Key values are assumed to be contiguous. - public MyKey(Reconciler rec, InternalDataKind kind, int oridinal, ulong? keyCount=null) - : base(rec, null) - { - _kind = kind; - _oridinal = oridinal; - _keyCount = keyCount; - } - - // Translate the internal variable representation to columns of TextLoader. - public TextLoader.Column Create() - { - return new TextLoader.Column() - { - Type = _kind, - Source = new[] { new TextLoader.Range(_oridinal) }, - KeyCount = _keyCount.HasValue ? 
new KeyCount(_keyCount.GetValueOrDefault()) : new KeyCount() - }; - } - } - - private class MyScalar : Scalar, IPipelineArgColumn - { - private readonly InternalDataKind _kind; - private readonly int _ordinal; - - public MyScalar(Reconciler rec, InternalDataKind kind, int ordinal) - : base(rec, null) - { - _kind = kind; - _ordinal = ordinal; - } - - public TextLoader.Column Create() - { - return new TextLoader.Column() - { - Type = _kind, - Source = new[] { new TextLoader.Range(_ordinal) }, - }; - } - } - - private class MyVector : Vector, IPipelineArgColumn - { - private readonly InternalDataKind _kind; - private readonly int _min; - private readonly int? _max; - - public MyVector(Reconciler rec, InternalDataKind kind, int min, int? max) - : base(rec, null) - { - _kind = kind; - _min = min; - _max = max; - } - - public TextLoader.Column Create() - { - return new TextLoader.Column() - { - Type = _kind, - Source = new[] { new TextLoader.Range(_min, _max) }, - }; - } - } - } - } -} \ No newline at end of file diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs deleted file mode 100644 index b3e39bd8a7..0000000000 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ /dev/null @@ -1,597 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms.Text; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Extensions for statically typed word tokenizer. 
- /// - public static class WordTokenizerStaticExtensions - { - private sealed class OutPipelineColumn : VarVector - { - public readonly Scalar Input; - - public OutPipelineColumn(Scalar input, char[] separators) - : base(new Reconciler(separators), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly char[] _separators; - - public Reconciler(char[] separators) - { - _separators = separators; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var pairs = new List<(string outputColumnName, string inputColumnName)>(); - foreach (var outCol in toOutput) - pairs.Add((outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input])); - - return new WordTokenizingEstimator(env, pairs.ToArray(), _separators); - } - } - - /// - /// Tokenize incoming text using and output the tokens. - /// - /// The column to apply to. - /// The separators to use (uses space character by default). - public static VarVector TokenizeIntoWords(this Scalar input, char[] separators = null) => new OutPipelineColumn(input, separators); - } - - /// - /// Extensions for statically typed character tokenizer. 
- /// - public static class CharacterTokenizerStaticExtensions - { - private sealed class OutPipelineColumn : VarVector> - { - public readonly Scalar Input; - - public OutPipelineColumn(Scalar input, bool useMarkerChars) - : base(new Reconciler(useMarkerChars), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler, IEquatable - { - private readonly bool _useMarker; - - public Reconciler(bool useMarkerChars) - { - _useMarker = useMarkerChars; - } - - public bool Equals(Reconciler other) - { - return _useMarker == other._useMarker; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var pairs = new List<(string outputColumnName, string inputColumnName)>(); - foreach (var outCol in toOutput) - pairs.Add((outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input])); - - return new TokenizingByCharactersEstimator(env, _useMarker, pairs.ToArray()); - } - } - - /// - /// Tokenize incoming text into a sequence of characters. - /// - /// The column to apply to. - /// Whether to use marker characters to separate words. - public static VarVector> TokenizeIntoCharactersAsKeys(this Scalar input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters); - } - - /// - /// Extensions for statically typed stop word remover. 
- /// - public static class StopwordRemoverStaticExtensions - { - private sealed class OutPipelineColumn : VarVector - { - public readonly VarVector Input; - - public OutPipelineColumn(VarVector input, StopWordsRemovingEstimator.Language language) - : base(new Reconciler(language), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler, IEquatable - { - private readonly StopWordsRemovingEstimator.Language _language; - - public Reconciler(StopWordsRemovingEstimator.Language language) - { - _language = language; - } - - public bool Equals(Reconciler other) - { - return _language == other._language; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var columns = new List(); - foreach (var outCol in toOutput) - columns.Add(new StopWordsRemovingEstimator.ColumnOptions(outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input], _language)); - - return new StopWordsRemovingEstimator(env, columns.ToArray()); - } - } - - /// - /// Remove stop words from incoming text. - /// - /// The column to apply to. - /// Langauge of the input text. It will be used to retrieve a built-in stopword list. - public static VarVector RemoveDefaultStopWords(this VarVector input, - StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English) => new OutPipelineColumn(input, language); - } - - /// - /// Extensions for statically typed text normalizer. 
- /// - public static class TextNormalizerStaticExtensions - { - private sealed class OutPipelineColumn : Scalar - { - public readonly Scalar Input; - - public OutPipelineColumn(Scalar input, TextNormalizingEstimator.CaseMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers) - : base(new Reconciler(textCase, keepDiacritics, keepPunctuations, keepNumbers), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler, IEquatable - { - private readonly TextNormalizingEstimator.CaseMode _textCase; - private readonly bool _keepDiacritics; - private readonly bool _keepPunctuations; - private readonly bool _keepNumbers; - - public Reconciler(TextNormalizingEstimator.CaseMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers) - { - _textCase = textCase; - _keepDiacritics = keepDiacritics; - _keepPunctuations = keepPunctuations; - _keepNumbers = keepNumbers; - - } - - public bool Equals(Reconciler other) - { - return _textCase == other._textCase && - _keepDiacritics == other._keepDiacritics && - _keepPunctuations == other._keepPunctuations && - _keepNumbers == other._keepNumbers; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var pairs = new List<(string outputColumnName, string inputColumnName)>(); - foreach (var outCol in toOutput) - pairs.Add((outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input])); - - return new TextNormalizingEstimator(env, _textCase, _keepDiacritics, _keepPunctuations, _keepNumbers, pairs.ToArray()); - } - } - - /// - /// Normalizes input text by changing case, removing diacritical marks, punctuation marks and/or numbers. - /// - /// The column to apply to. - /// Casing text using the rules of the invariant culture. 
- /// Whether to keep diacritical marks or remove them. - /// Whether to keep punctuation marks or remove them. - /// Whether to keep numbers or remove them. - public static Scalar NormalizeText(this Scalar input, - TextNormalizingEstimator.CaseMode caseMode = TextNormalizingEstimator.CaseMode.Lower, - bool keepDiacritics = false, - bool keepPunctuations = true, - bool keepNumbers = true) => new OutPipelineColumn(input, caseMode, keepDiacritics, keepPunctuations, keepNumbers); - } - - /// - /// Extensions for statically typed bag of word converter. - /// - public static class WordBagEstimatorStaticExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly Scalar Input; - - public OutPipelineColumn(Scalar input, - int ngramLength, - int skipLength, - bool allLengths, - int maxNumTerms, - NgramExtractingEstimator.WeightingCriteria weighting) - : base(new Reconciler(ngramLength, skipLength, allLengths, maxNumTerms, weighting), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler, IEquatable - { - private readonly int _ngramLength; - private readonly int _skipLength; - private readonly bool _useAllLengths; - private readonly int _maxNumTerms; - private readonly NgramExtractingEstimator.WeightingCriteria _weighting; - - public Reconciler(int ngramLength, int skipLength, bool allLengths, int maxNumTerms, NgramExtractingEstimator.WeightingCriteria weighting) - { - _ngramLength = ngramLength; - _skipLength = skipLength; - _useAllLengths = allLengths; - _maxNumTerms = maxNumTerms; - _weighting = weighting; - - } - - public bool Equals(Reconciler other) - { - return _ngramLength == other._ngramLength && - _skipLength == other._skipLength && - _useAllLengths == other._useAllLengths && - _maxNumTerms == other._maxNumTerms && - _weighting == other._weighting; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary 
outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var pairs = new List<(string names, string[] sources)>(); - foreach (var outCol in toOutput) - pairs.Add((outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] })); - - return new WordBagEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _useAllLengths, _maxNumTerms, _weighting); - } - } - - /// - /// Produces a bag of counts of n-grams (sequences of consecutive words) in a given text. - /// It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag. - /// - /// The column to apply to. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an n-gram. - /// Whether to include all n-gram lengths up to or only . - /// Maximum number of n-grams to store in the dictionary. - /// Statistical measure used to evaluate how important a word is to a document in a corpus. - public static Vector ProduceWordBags(this Scalar input, - int ngramLength = 1, - int skipLength = 0, - bool useAllLengths = true, - int maximumNgramsCount = 10000000, - NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - => new OutPipelineColumn(input, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); - } - - /// - /// Extensions for statically typed bag of wordhash converter. 
- /// - public static class WordHashBagEstimatorStaticExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly Scalar Input; - - public OutPipelineColumn(Scalar input, - int numberOfBits, - int ngramLength, - int skipLength, - bool useAllLengths, - uint seed, - bool useOrderedHashing, - int maximumNumberOfInverts) - : base(new Reconciler(numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler, IEquatable - { - private readonly int _numberOfBits; - private readonly int _ngramLength; - private readonly int _skipLength; - private readonly bool _useAllLengths; - private readonly uint _seed; - private readonly bool _useOrderedHashing; - private readonly int _maximumNumberOfInverts; - - public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool useAllLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts) - { - _numberOfBits = numberOfBits; - _ngramLength = ngramLength; - _skipLength = skipLength; - _useAllLengths = useAllLengths; - _seed = seed; - _useOrderedHashing = useOrderedHashing; - _maximumNumberOfInverts = maximumNumberOfInverts; - } - - public bool Equals(Reconciler other) - { - return _numberOfBits == other._numberOfBits && - _ngramLength == other._ngramLength && - _skipLength == other._skipLength && - _useAllLengths == other._useAllLengths && - _seed == other._seed && - _useOrderedHashing == other._useOrderedHashing && - _maximumNumberOfInverts == other._maximumNumberOfInverts; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var pairs = new List<(string name, string[] sources)>(); - foreach (var outCol in toOutput) - pairs.Add((outputNames[outCol], new[] { 
inputNames[((OutPipelineColumn)outCol).Input] })); - - return new WordHashBagEstimator(env, pairs.ToArray(), _numberOfBits, _ngramLength, _skipLength, _useAllLengths, _seed, _useOrderedHashing, _maximumNumberOfInverts); - } - } - - /// - /// Produces a bag of counts of n-grams (sequences of consecutive words of length 1-n) in a given text. - /// It does so by hashing each n-gram and using the hash value as the index in the bag. - /// - /// The column to apply to. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an n-gram. - /// Whether to include all n-gram lengths up to or only . - /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static Vector ProduceHashedWordBags(this Scalar input, - int numberOfBits = 16, - int ngramLength = 1, - int skipLength = 0, - bool useAllLengths = true, - uint seed = 314489979, - bool useOrderedHashing = true, - int maximumNumberOfInverts = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts); - } - - /// - /// Extensions for statically typed n-gram estimator. 
- /// - public static class NgramEstimatorStaticExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly PipelineColumn Input; - - public OutPipelineColumn(PipelineColumn input, - int ngramLength, - int skipLength, - bool useAllLengths, - int maxNumTerms, - NgramExtractingEstimator.WeightingCriteria weighting) - : base(new Reconciler(ngramLength, skipLength, useAllLengths, maxNumTerms, weighting), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler, IEquatable - { - private readonly int _ngramLength; - private readonly int _skipLength; - private readonly bool _useAllLengths; - private readonly int _maxNgramsCount; - private readonly NgramExtractingEstimator.WeightingCriteria _weighting; - - public Reconciler(int ngramLength, int skipLength, bool useAllLengths, int maxNumTerms, NgramExtractingEstimator.WeightingCriteria weighting) - { - _ngramLength = ngramLength; - _skipLength = skipLength; - _useAllLengths = useAllLengths; - _maxNgramsCount = maxNumTerms; - _weighting = weighting; - - } - - public bool Equals(Reconciler other) - { - return _ngramLength == other._ngramLength && - _skipLength == other._skipLength && - _useAllLengths == other._useAllLengths && - _maxNgramsCount == other._maxNgramsCount && - _weighting == other._weighting; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var pairs = new List<(string inputs, string output)>(); - foreach (var outCol in toOutput) - pairs.Add((outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input])); - - return new NgramExtractingEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _useAllLengths, _maxNgramsCount, _weighting); - } - } - - /// - /// Produces a bag of counts of n-grams (sequences of consecutive words ) in a given tokenized 
text. - /// It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag. - /// - /// /// is different from - /// in a way that takes tokenized text as input while tokenizes text internally. - /// - /// The column to apply to. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an n-gram. - /// Whether to include all n-gram lengths up to or only . - /// Maximum number of n-grams to store in the dictionary. - /// Statistical measure used to evaluate how important a word is to a document in a corpus. - public static Vector ProduceNgrams(this VarVector> input, - int ngramLength = 1, - int skipLength = 0, - bool useAllLengths = true, - int maximumNgramsCount = 10000000, - NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - => new OutPipelineColumn(input, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); - } - - /// - /// Extensions for statically typed n-gram hash estimator. 
- /// - public static class NgramHashEstimatorStaticExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly VarVector> Input; - - public OutPipelineColumn(VarVector> input, int numberOfBits, int ngramLength, int skipLength, bool useAllLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts) - : base(new Reconciler(numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler, IEquatable - { - private readonly int _numberOfBits; - private readonly int _ngramLength; - private readonly int _skipLength; - private readonly bool _useAllLengths; - private readonly uint _seed; - private readonly bool _useOrderedHashing; - private readonly int _maximumNumberOfInverts; - - public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool useAllLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts) - { - _numberOfBits = numberOfBits; - _ngramLength = ngramLength; - _skipLength = skipLength; - _useAllLengths = useAllLengths; - _seed = seed; - _useOrderedHashing = useOrderedHashing; - _maximumNumberOfInverts = maximumNumberOfInverts; - } - - public bool Equals(Reconciler other) - { - return _numberOfBits == other._numberOfBits && - _ngramLength == other._ngramLength && - _skipLength == other._skipLength && - _useAllLengths == other._useAllLengths && - _seed == other._seed && - _useOrderedHashing == other._useOrderedHashing && - _maximumNumberOfInverts == other._maximumNumberOfInverts; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - var columns = new List(); - foreach (var outCol in toOutput) - columns.Add(new NgramHashingEstimator.ColumnOptions(outputNames[outCol], new[] { 
inputNames[((OutPipelineColumn)outCol).Input] }, - _ngramLength, _skipLength, _useAllLengths, _numberOfBits, _seed, _useOrderedHashing, _maximumNumberOfInverts)); - - return new NgramHashingEstimator(env, columns.ToArray()); - } - } - - /// - /// Produces a bag of counts of n-grams (sequences of n consecutive words of length 1-n) in a given tokenized text. - /// It does so by hashing each n-gram and using the hash value as the index in the bag. - /// - /// is different from - /// in a way that takes tokenized text as input while tokenizes text internally. - /// - /// The column to apply to. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an n-gram. - /// Whether to include all n-gram lengths up to or only . - /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- public static Vector ProduceHashedNgrams(this VarVector> input, - int numberOfBits = 16, - int ngramLength = 2, - int skipLength = 0, - bool useAllLengths = true, - uint seed = 314489979, - bool useOrderedHashing = true, - int maximumNumberOfInverts = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts); - } -} diff --git a/src/Microsoft.ML.StaticPipe/TrainerEstimatorReconciler.cs b/src/Microsoft.ML.StaticPipe/TrainerEstimatorReconciler.cs deleted file mode 100644 index c2e55da928..0000000000 --- a/src/Microsoft.ML.StaticPipe/TrainerEstimatorReconciler.cs +++ /dev/null @@ -1,529 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Collections.Generic; -using System.Linq; -using Microsoft.ML.Data; -using Microsoft.ML.Internal.Utilities; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// General purpose reconciler for a typical case with trainers, where they accept some generally - /// fixed number of inputs, and produce some outputs where the names of the outputs are fixed. - /// Authors of components that want to produce columns can subclass this directly, or use one of the - /// common nested subclasses. - /// - public abstract class TrainerEstimatorReconciler : EstimatorReconciler - { - protected readonly PipelineColumn[] Inputs; - private readonly string[] _outputNames; - - /// - /// The output columns. Note that subclasses should return exactly the same items each time, - /// and the items should correspond to the output names passed into the constructor. - /// - protected abstract IEnumerable Outputs { get; } - - /// - /// Constructor for the base class. 
- /// - /// The set of inputs - /// The names of the outputs, which we assume cannot be changed - protected TrainerEstimatorReconciler(PipelineColumn[] inputs, string[] outputNames) - { - Contracts.CheckValue(inputs, nameof(inputs)); - Contracts.CheckValue(outputNames, nameof(outputNames)); - - Inputs = inputs; - _outputNames = outputNames; - } - - /// - /// Produce the training estimator. - /// - /// The host environment to use to create the estimator. - /// The names of the inputs, which corresponds exactly to the input columns - /// fed into the constructor. - /// An estimator, which should produce the additional columns indicated by the output names - /// in the constructor. - protected abstract IEstimator ReconcileCore(IHostEnvironment env, string[] inputNames); - - /// - /// Produces the estimator. Note that this is made out of 's - /// return value, plus whatever usages of are necessary to avoid collisions with - /// the output names fed to the constructor. This class provides the implementation, and subclasses should instead - /// override . - /// - public sealed override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.AssertValue(env); - env.AssertValue(toOutput); - env.AssertValue(inputNames); - env.AssertValue(outputNames); - env.AssertValue(usedNames); - - // The reconciler should have been called with all the input columns having names. - env.Assert(inputNames.Keys.All(Inputs.Contains) && Inputs.All(inputNames.Keys.Contains)); - // The output name map should contain only outputs as their keys. Yet, it is possible not all - // outputs will be required in which case these will both be subsets of those outputs indicated - // at construction. 
- env.Assert(outputNames.Keys.All(Outputs.Contains)); - env.Assert(toOutput.All(Outputs.Contains)); - env.Assert(Outputs.Count() == _outputNames.Length); - - IEstimator result = null; - - // In the case where we have names used that conflict with the fixed output names, we must have some - // renaming logic. - var collisions = new HashSet(_outputNames); - collisions.IntersectWith(usedNames); - var old2New = new Dictionary(); - - if (collisions.Count > 0) - { - // First get the old names to some temporary names. - int tempNum = 0; - foreach (var c in collisions) - old2New[c] = $"#TrainTemp{tempNum++}"; - // In the case where the input names have anything that is used, we must reconstitute the input mapping. - if (inputNames.Values.Any(old2New.ContainsKey)) - { - var newInputNames = new Dictionary(); - foreach (var p in inputNames) - newInputNames[p.Key] = old2New.ContainsKey(p.Value) ? old2New[p.Value] : p.Value; - inputNames = newInputNames; - } - result = new ColumnCopyingEstimator(env, old2New.Select(p => (p.Value, p.Key)).ToArray()); - } - - // Map the inputs to the names. - string[] mappedInputNames = Inputs.Select(c => inputNames[c]).ToArray(); - // Finally produce the trainer. - var trainerEst = ReconcileCore(env, mappedInputNames); - if (result == null) - result = trainerEst; - else - result = result.Append(trainerEst); - - // OK. Now handle the final renamings from the fixed names, to the desired names, in the case - // where the output was desired, and a renaming is even necessary. - var toRename = new List<(string outputColumnName, string inputColumnName)>(); - foreach ((PipelineColumn outCol, string fixedName) in Outputs.Zip(_outputNames, (c, n) => (c, n))) - { - if (outputNames.TryGetValue(outCol, out string desiredName)) - toRename.Add((desiredName, fixedName)); - else - env.Assert(!toOutput.Contains(outCol)); - } - // Finally if applicable handle the renaming back from the temp names to the original names. 
- foreach (var p in old2New) - toRename.Add((p.Key, p.Value)); - if (toRename.Count > 0) - result = result.Append(new ColumnCopyingEstimator(env, toRename.ToArray())); - - return result; - } - - /// - /// A reconciler for regression capable of handling the most common cases for regression. - /// - public sealed class Regression : TrainerEstimatorReconciler - { - /// - /// The delegate to create the regression trainer instance. - /// - /// The environment with which to create the estimator - /// The label column name - /// The features column name - /// The weights column name, or null if the reconciler was constructed with null weights - /// A estimator producing columns with the fixed name . - public delegate IEstimator EstimatorFactory(IHostEnvironment env, string label, string features, string weights); - - private readonly EstimatorFactory _estFact; - - /// - /// The output score column for the regression. This will have this instance as its reconciler. - /// - public Scalar Score { get; } - - protected override IEnumerable Outputs => Enumerable.Repeat(Score, 1); - - private static readonly string[] _fixedOutputNames = new[] { DefaultColumnNames.Score }; - - /// - /// Constructs a new general regression reconciler. - /// - /// The delegate to create the training estimator. It is assumed that this estimator - /// will produce a single new scalar column named . - /// The input label column. - /// The input features column. - /// The input weights column, or null if there are no weights. 
- public Regression(EstimatorFactory estimatorFactory, Scalar label, Vector features, Scalar weights) - : base(MakeInputs(Contracts.CheckRef(label, nameof(label)), Contracts.CheckRef(features, nameof(features)), weights), - _fixedOutputNames) - { - Contracts.CheckValue(estimatorFactory, nameof(estimatorFactory)); - _estFact = estimatorFactory; - Contracts.Assert(Inputs.Length == 2 || Inputs.Length == 3); - Score = new Impl(this); - } - - private static PipelineColumn[] MakeInputs(Scalar label, Vector features, Scalar weights) - => weights == null ? new PipelineColumn[] { label, features } : new PipelineColumn[] { label, features, weights }; - - protected override IEstimator ReconcileCore(IHostEnvironment env, string[] inputNames) - { - Contracts.AssertValue(env); - env.Assert(Utils.Size(inputNames) == Inputs.Length); - return _estFact(env, inputNames[0], inputNames[1], inputNames.Length > 2 ? inputNames[2] : null); - } - - private sealed class Impl : Scalar - { - public Impl(Regression rec) : base(rec, rec.Inputs) { } - } - } - - /// - /// A reconciler capable of handling the most common cases for binary classification with calibrated outputs. - /// - public sealed class BinaryClassifier : TrainerEstimatorReconciler - { - /// - /// The delegate to create the binary classifier trainer instance. - /// - /// The environment with which to create the estimator. - /// The label column name. - /// The features column name. - /// The weights column name, or null if the reconciler was constructed with null weights. - /// A binary classification trainer estimator. - public delegate IEstimator EstimatorFactory(IHostEnvironment env, string label, string features, string weights); - - private readonly EstimatorFactory _estFact; - private static readonly string[] _fixedOutputNames = new[] { DefaultColumnNames.Score, DefaultColumnNames.Probability, DefaultColumnNames.PredictedLabel }; - - /// - /// The general output for binary classifiers. 
- /// - public (Scalar score, Scalar probability, Scalar predictedLabel) Output { get; } - - protected override IEnumerable Outputs => new PipelineColumn[] { Output.score, Output.probability, Output.predictedLabel }; - - /// - /// Constructs a new general regression reconciler. - /// - /// The delegate to create the training estimator. It is assumed that this estimator - /// will produce a single new scalar column named . - /// The input label column. - /// The input features column. - /// The input weights column, or null if there are no weights. - public BinaryClassifier(EstimatorFactory estimatorFactory, Scalar label, Vector features, Scalar weights) - : base(MakeInputs(Contracts.CheckRef(label, nameof(label)), Contracts.CheckRef(features, nameof(features)), weights), - _fixedOutputNames) - { - Contracts.CheckValue(estimatorFactory, nameof(estimatorFactory)); - _estFact = estimatorFactory; - Contracts.Assert(Inputs.Length == 2 || Inputs.Length == 3); - - Output = (new Impl(this), new Impl(this), new ImplBool(this)); - } - - private static PipelineColumn[] MakeInputs(Scalar label, Vector features, Scalar weights) - => weights == null ? new PipelineColumn[] { label, features } : new PipelineColumn[] { label, features, weights }; - - protected override IEstimator ReconcileCore(IHostEnvironment env, string[] inputNames) - { - Contracts.AssertValue(env); - env.Assert(Utils.Size(inputNames) == Inputs.Length); - return _estFact(env, inputNames[0], inputNames[1], inputNames.Length > 2 ? inputNames[2] : null); - } - - private sealed class Impl : Scalar - { - public Impl(BinaryClassifier rec) : base(rec, rec.Inputs) { } - } - - private sealed class ImplBool : Scalar - { - public ImplBool(BinaryClassifier rec) : base(rec, rec.Inputs) { } - } - } - - /// - /// A reconciler capable of handling the most common cases for binary classification that does not - /// have calibrated outputs and therefore no probability output is produced. 
If probability output - /// is required, please add a calibrator after this trainer. - /// - public sealed class BinaryClassifierNoCalibration : TrainerEstimatorReconciler - { - /// - /// The delegate to create the binary classifier trainer instance. - /// - /// The environment with which to create the estimator - /// The label column name. - /// The features column name. - /// The weights column name, or null if the reconciler was constructed with null weights. - /// A binary classification trainer estimator. - public delegate IEstimator EstimatorFactory(IHostEnvironment env, string label, string features, string weights); - - private readonly EstimatorFactory _estFact; - private static readonly string[] _fixedOutputNames = new[] { DefaultColumnNames.Score, DefaultColumnNames.PredictedLabel }; - - /// - /// The general output for binary classifiers. - /// - public (Scalar score, Scalar predictedLabel) Output { get; } - - /// - /// The output columns, which will contain at least the columns produced by . - /// - protected override IEnumerable Outputs { get; } - - /// - /// Constructs a new general binary classifier reconciler. - /// - /// The delegate to create the training estimator. It is assumed that this estimator - /// will produce a single new scalar column named . - /// The input label column. - /// The input features column. - /// The input weights column, or null if there are no weights. 
- public BinaryClassifierNoCalibration(EstimatorFactory estimatorFactory, Scalar label, Vector features, Scalar weights) - : base(MakeInputs(Contracts.CheckRef(label, nameof(label)), Contracts.CheckRef(features, nameof(features)), weights), _fixedOutputNames) - { - Contracts.CheckValue(estimatorFactory, nameof(estimatorFactory)); - _estFact = estimatorFactory; - Contracts.Assert(Inputs.Length == 2 || Inputs.Length == 3); - - Output = (new Impl(this), new ImplBool(this)); - - Outputs = new PipelineColumn[] { Output.score, Output.predictedLabel }; - } - - private static PipelineColumn[] MakeInputs(Scalar label, Vector features, Scalar weights) - => weights == null ? new PipelineColumn[] { label, features } : new PipelineColumn[] { label, features, weights }; - - protected override IEstimator ReconcileCore(IHostEnvironment env, string[] inputNames) - { - Contracts.AssertValue(env); - env.Assert(Utils.Size(inputNames) == Inputs.Length); - return _estFact(env, inputNames[0], inputNames[1], inputNames.Length > 2 ? inputNames[2] : null); - } - - private sealed class Impl : Scalar - { - public Impl(BinaryClassifierNoCalibration rec) : base(rec, rec.Inputs) { } - } - - private sealed class ImplBool : Scalar - { - public ImplBool(BinaryClassifierNoCalibration rec) : base(rec, rec.Inputs) { } - } - } - - /// - /// A reconciler capable of handling clustering. - /// - public sealed class Clustering : TrainerEstimatorReconciler - { - /// - /// The delegate to create the clustering trainer instance. - /// - /// The environment with which to create the estimator. - /// The features column name. - /// The weights column name, or null if the reconciler was constructed with null weights. - /// A clustering trainer estimator. 
- public delegate IEstimator EstimatorFactory(IHostEnvironment env, string features, string weights); - - private readonly EstimatorFactory _estFact; - private static readonly string[] _fixedOutputNames = new[] { DefaultColumnNames.Score, DefaultColumnNames.PredictedLabel }; - - /// - /// The general output for clustering. - /// - public (Vector score, Key predictedLabel) Output { get; } - - /// - /// The output columns, which will contain the columns produced by . - /// - protected override IEnumerable Outputs => new PipelineColumn[] { Output.score, Output.predictedLabel }; - - /// - /// Constructs a new general clustering reconciler. - /// - /// The delegate to create the training estimator. It is assumed that this estimator - /// will produce a new scalar column named and a - /// named PredictedLabel. - /// The input features column. - /// The input weights column, or null if there are no weights. - public Clustering(EstimatorFactory estimatorFactory, Vector features, Scalar weights) - : base(MakeInputs(Contracts.CheckRef(features, nameof(features)), weights), - _fixedOutputNames) - { - Contracts.CheckValue(estimatorFactory, nameof(estimatorFactory)); - _estFact = estimatorFactory; - Contracts.Assert(Inputs.Length == 1 || Inputs.Length == 2); - - Output = (new ImplScore(this), new ImplLabel(this)); - } - - private static PipelineColumn[] MakeInputs(Vector features, Scalar weights) - => weights == null ? new PipelineColumn[] { features } : new PipelineColumn[] { features, weights }; - - protected override IEstimator ReconcileCore(IHostEnvironment env, string[] inputNames) - { - Contracts.AssertValue(env); - env.Assert(Utils.Size(inputNames) == Inputs.Length); - return _estFact(env, inputNames[0], inputNames.Length > 1 ? 
inputNames[1] : null); - } - - private sealed class ImplScore : Vector - { - public ImplScore(Clustering rec) : base(rec, rec.Inputs) { } - } - - private sealed class ImplLabel : Key - { - public ImplLabel(Clustering rec) : base(rec, rec.Inputs) { } - } - } - - /// - /// A reconciler for multiclass classification capable of handling the most common cases for multiclass classification. - /// - public sealed class MulticlassClassificationReconciler : TrainerEstimatorReconciler - { - /// - /// The delegate to create the multiclass classifier trainer instance. - /// - /// The environment with which to create the estimator - /// The label column name - /// The features column name - /// The weights column name, or null if the reconciler was constructed with null weights - /// A estimator producing columns with the fixed name and . - public delegate IEstimator EstimatorFactory(IHostEnvironment env, string label, string features, string weights); - - private readonly EstimatorFactory _estFact; - - /// - /// The general output for multiclass classifiers. - /// - public (Vector score, Key predictedLabel) Output { get; } - - protected override IEnumerable Outputs => new PipelineColumn[] { Output.score, Output.predictedLabel }; - - private static readonly string[] _fixedOutputNames = new[] { DefaultColumnNames.Score, DefaultColumnNames.PredictedLabel }; - - /// - /// Constructs a new general multiclass classifier reconciler. - /// - /// The delegate to create the training estimator. It is assumed that this estimator - /// will produce a vector column named and a scalar - /// key column named . - /// The input label column. - /// The input features column. - /// The input weights column, or null if there are no weights. 
- public MulticlassClassificationReconciler(EstimatorFactory estimatorFactory, Key label, Vector features, Scalar weights) - : base(MakeInputs(Contracts.CheckRef(label, nameof(label)), Contracts.CheckRef(features, nameof(features)), weights), - _fixedOutputNames) - { - Contracts.CheckValue(estimatorFactory, nameof(estimatorFactory)); - _estFact = estimatorFactory; - Contracts.Assert(Inputs.Length == 2 || Inputs.Length == 3); - Output = (new ImplScore(this), new ImplLabel(this)); - } - - private static PipelineColumn[] MakeInputs(Key label, Vector features, Scalar weights) - => weights == null ? new PipelineColumn[] { label, features } : new PipelineColumn[] { label, features, weights }; - - protected override IEstimator ReconcileCore(IHostEnvironment env, string[] inputNames) - { - Contracts.AssertValue(env); - env.Assert(Utils.Size(inputNames) == Inputs.Length); - return _estFact(env, inputNames[0], inputNames[1], inputNames.Length > 2 ? inputNames[2] : null); - } - - private sealed class ImplLabel : Key - { - public ImplLabel(MulticlassClassificationReconciler rec) : base(rec, rec.Inputs) { } - } - - private sealed class ImplScore : Vector - { - public ImplScore(MulticlassClassificationReconciler rec) : base(rec, rec.Inputs) { } - } - } - - /// - /// A reconciler for ranking capable of handling the most common cases for ranking. - /// - public sealed class Ranker : TrainerEstimatorReconciler - { - /// - /// The delegate to create the ranking trainer instance. - /// - /// The environment with which to create the estimator - /// The label column name - /// The features column name - /// The weights column name, or null if the reconciler was constructed with null weights - /// The groupId column name. - /// A estimator producing columns with the fixed name . 
- public delegate IEstimator EstimatorFactory(IHostEnvironment env, string label, string features, string weights, string groupId); - - private readonly EstimatorFactory _estFact; - - /// - /// The output score column for ranking. This will have this instance as its reconciler. - /// - public Scalar Score { get; } - - protected override IEnumerable Outputs => Enumerable.Repeat(Score, 1); - - private static readonly string[] _fixedOutputNames = new[] { DefaultColumnNames.Score }; - - /// - /// Constructs a new general ranker reconciler. - /// - /// The delegate to create the training estimator. It is assumed that this estimator - /// will produce a single new scalar column named . - /// The input label column. - /// The input features column. - /// The input weights column, or null if there are no weights. - /// The input groupId column. - public Ranker(EstimatorFactory estimatorFactory, Scalar label, Vector features, Key groupId, Scalar weights) - : base(MakeInputs(Contracts.CheckRef(label, nameof(label)), - Contracts.CheckRef(features, nameof(features)), - Contracts.CheckRef(groupId, nameof(groupId)), - weights), - _fixedOutputNames) - { - Contracts.CheckValue(estimatorFactory, nameof(estimatorFactory)); - _estFact = estimatorFactory; - Contracts.Assert(Inputs.Length == 3 || Inputs.Length == 4); - Score = new Impl(this); - } - - private static PipelineColumn[] MakeInputs(Scalar label, Vector features, Key groupId, Scalar weights) - => weights == null ? new PipelineColumn[] { label, features, groupId } : new PipelineColumn[] { label, features, groupId, weights }; - - protected override IEstimator ReconcileCore(IHostEnvironment env, string[] inputNames) - { - Contracts.AssertValue(env); - env.Assert(Utils.Size(inputNames) == Inputs.Length); - return _estFact(env, inputNames[0], inputNames[1], inputNames[2], inputNames.Length > 3 ? 
inputNames[3] : null); - } - - private sealed class Impl : Scalar - { - public Impl(Ranker rec) : base(rec, rec.Inputs) { } - } - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs deleted file mode 100644 index db6eb42d78..0000000000 --- a/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs +++ /dev/null @@ -1,287 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Linq; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Defines static extension methods that allow operations like train-test split, cross-validate, - /// sampling etc. with the . - /// - public static class TrainingStaticExtensions - { - /// - /// Split the dataset into the train set and test set according to the given fraction. - /// Respects the if provided. - /// - /// The tuple describing the data schema. - /// The training catalog. - /// The dataset to split. - /// The fraction of data to go into the test set. - /// Optional selector for the column to use as a stratification column. If two examples share the same value of the - /// (if provided), they are guaranteed to appear in the same subset (train or test). Use this to make sure there is no label leakage from train to the test set. - /// If this optional parameter is not provided, a stratification columns will be generated, and its values will be random numbers . - /// Optional parameter used in combination with the . - /// If the is not provided, the random numbers generated to create it, will use this seed as value. - /// And if it is not provided, the default value will be used. - /// A pair of datasets, for the train and test set. 
- public static (DataView trainSet, DataView testSet) TrainTestSplit(this DataOperationsCatalog catalog, - DataView data, double testFraction = 0.1, Func stratificationColumn = null, int? seed = null) - { - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive"); - env.CheckValueOrNull(stratificationColumn); - - string stratName = null; - - if (stratificationColumn != null) - { - var indexer = StaticPipeUtils.GetIndexer(data); - var column = stratificationColumn(indexer.Indices); - env.CheckParam(column != null, nameof(stratificationColumn), "Stratification column not found"); - stratName = indexer.Get(column); - } - - var split = catalog.TrainTestSplit(data.AsDynamic, testFraction, stratName, seed); - return (new DataView(env, split.TrainSet, data.Shape), new DataView(env, split.TestSet, data.Shape)); - } - - /// - /// Run cross-validation over folds of , by fitting , - /// and respecting if provided. - /// Then evaluate each sub-model against and return metrics. - /// - /// The input schema shape. - /// The output schema shape. - /// The type of the trained model. - /// The training catalog. - /// The data to run cross-validation on. - /// The estimator to fit. - /// Number of cross-validation folds. - /// The label column (for evaluation). - /// Optional selector for the column to use as a stratification column. If two examples share the same value of the - /// (if provided), they are guaranteed to appear in the same subset (train or test). Use this to make sure there is no label leakage from train to the test set. - /// If this optional parameter is not provided, a stratification columns will be generated, and its values will be random numbers . - /// Optional parameter used in combination with the . - /// If the is not provided, the random numbers generated to create it, will use this seed as value. 
- /// And if it is not provided, the default value will be used. - /// Per-fold results: metrics, models, scored datasets. - public static (RegressionMetrics metrics, Transformer model, DataView scoredTestData)[] CrossValidate( - this RegressionCatalog catalog, - DataView data, - Estimator estimator, - Func> label, - int numFolds = 5, - Func stratificationColumn = null, int? seed = null) - where TTransformer : class, ITransformer - { - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckParam(numFolds > 1, nameof(numFolds), "Must be more than 1"); - env.CheckValue(label, nameof(label)); - env.CheckValueOrNull(stratificationColumn); - - var outIndexer = StaticPipeUtils.GetIndexer(estimator); - var labelColumn = label(outIndexer.Indices); - env.CheckParam(labelColumn != null, nameof(stratificationColumn), "Stratification column not found"); - var labelName = outIndexer.Get(labelColumn); - - string stratName = null; - if (stratificationColumn != null) - { - var indexer = StaticPipeUtils.GetIndexer(data); - var column = stratificationColumn(indexer.Indices); - env.CheckParam(column != null, nameof(stratificationColumn), "Stratification column not found"); - stratName = indexer.Get(column); - } - - var results = catalog.CrossValidate(data.AsDynamic, estimator.AsDynamic, numFolds, labelName, stratName, seed); - - return results.Select(x => ( - x.Metrics, - new Transformer(env, (TTransformer)x.Model, data.Shape, estimator.Shape), - new DataView(env, x.ScoredHoldOutSet, estimator.Shape))) - .ToArray(); - } - - /// - /// Run cross-validation over folds of , by fitting , - /// and respecting if provided. - /// Then evaluate each sub-model against and return metrics. - /// - /// The input schema shape. - /// The output schema shape. - /// The type of the trained model. - /// The training catalog. - /// The data to run cross-validation on. - /// The estimator to fit. - /// Number of cross-validation folds. 
- /// The label column (for evaluation). - /// Optional selector for the column to use as a stratification column. If two examples share the same value of the - /// (if provided), they are guaranteed to appear in the same subset (train or test). Use this to make sure there is no label leakage from train to the test set. - /// If this optional parameter is not provided, a stratification columns will be generated, and its values will be random numbers . - /// Optional parameter used in combination with the . - /// If the is not provided, the random numbers generated to create it, will use this seed as value. - /// And if it is not provided, the default value will be used. - /// Per-fold results: metrics, models, scored datasets. - public static (MulticlassClassificationMetrics metrics, Transformer model, DataView scoredTestData)[] CrossValidate( - this MulticlassClassificationCatalog catalog, - DataView data, - Estimator estimator, - Func> label, - int numFolds = 5, - Func stratificationColumn = null, int? 
seed = null) - where TTransformer : class, ITransformer - { - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckParam(numFolds > 1, nameof(numFolds), "Must be more than 1"); - env.CheckValue(label, nameof(label)); - env.CheckValueOrNull(stratificationColumn); - - var outputIndexer = StaticPipeUtils.GetIndexer(estimator); - var labelColumn = label(outputIndexer.Indices); - env.CheckParam(labelColumn != null, nameof(stratificationColumn), "Stratification column not found"); - var labelName = outputIndexer.Get(labelColumn); - - string stratName = null; - if (stratificationColumn != null) - { - var indexer = StaticPipeUtils.GetIndexer(data); - var column = stratificationColumn(indexer.Indices); - env.CheckParam(column != null, nameof(stratificationColumn), "Stratification column not found"); - stratName = indexer.Get(column); - } - - var results = catalog.CrossValidate(data.AsDynamic, estimator.AsDynamic, numFolds, labelName, stratName, seed); - - return results.Select(x => ( - x.Metrics, - new Transformer(env, (TTransformer)x.Model, data.Shape, estimator.Shape), - new DataView(env, x.ScoredHoldOutSet, estimator.Shape))) - .ToArray(); - } - - /// - /// Run cross-validation over folds of , by fitting , - /// and respecting if provided. - /// Then evaluate each sub-model against and return metrics. - /// - /// The input schema shape. - /// The output schema shape. - /// The type of the trained model. - /// The training catalog. - /// The data to run cross-validation on. - /// The estimator to fit. - /// Number of cross-validation folds. - /// The label column (for evaluation). - /// Optional selector for the column to use as a stratification column. If two examples share the same value of the - /// (if provided), they are guaranteed to appear in the same subset (train or test). Use this to make sure there is no label leakage from train to the test set. 
- /// If this optional parameter is not provided, a stratification columns will be generated, and its values will be random numbers . - /// Optional parameter used in combination with the . - /// If the is not provided, the random numbers generated to create it, will use this seed as value. - /// And if it is not provided, the default value will be used. - /// Per-fold results: metrics, models, scored datasets. - public static (BinaryClassificationMetrics metrics, Transformer model, DataView scoredTestData)[] CrossValidateNonCalibrated( - this BinaryClassificationCatalog catalog, - DataView data, - Estimator estimator, - Func> label, - int numFolds = 5, - Func stratificationColumn = null, int? seed = null) - where TTransformer : class, ITransformer - { - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckParam(numFolds > 1, nameof(numFolds), "Must be more than 1"); - env.CheckValue(label, nameof(label)); - env.CheckValueOrNull(stratificationColumn); - - var outputIndexer = StaticPipeUtils.GetIndexer(estimator); - var labelColumn = label(outputIndexer.Indices); - env.CheckParam(labelColumn != null, nameof(stratificationColumn), "Stratification column not found"); - var labelName = outputIndexer.Get(labelColumn); - - string stratName = null; - if (stratificationColumn != null) - { - var indexer = StaticPipeUtils.GetIndexer(data); - var column = stratificationColumn(indexer.Indices); - env.CheckParam(column != null, nameof(stratificationColumn), "Stratification column not found"); - stratName = indexer.Get(column); - } - - var results = catalog.CrossValidateNonCalibrated(data.AsDynamic, estimator.AsDynamic, numFolds, labelName, stratName, seed); - - return results.Select(x => ( - x.Metrics, - new Transformer(env, (TTransformer)x.Model, data.Shape, estimator.Shape), - new DataView(env, x.ScoredHoldOutSet, estimator.Shape))) - .ToArray(); - } - - /// - /// Run cross-validation over folds of , by fitting , - /// and respecting if 
provided. - /// Then evaluate each sub-model against and return metrics. - /// - /// The input schema shape. - /// The output schema shape. - /// The type of the trained model. - /// The training catalog. - /// The data to run cross-validation on. - /// The estimator to fit. - /// Number of cross-validation folds. - /// The label column (for evaluation). - /// Optional selector for the column to use as a stratification column. If two examples share the same value of the - /// (if provided), they are guaranteed to appear in the same subset (train or test). Use this to make sure there is no label leakage from train to the test set. - /// If this optional parameter is not provided, a stratification columns will be generated, and its values will be random numbers . - /// Optional parameter used in combination with the . - /// If the is not provided, the random numbers generated to create it, will use this seed as value. - /// And if it is not provided, the default value will be used. - /// Per-fold results: metrics, models, scored datasets. - public static (CalibratedBinaryClassificationMetrics metrics, Transformer model, DataView scoredTestData)[] CrossValidate( - this BinaryClassificationCatalog catalog, - DataView data, - Estimator estimator, - Func> label, - int numFolds = 5, - Func stratificationColumn = null, int? 
seed = null) - where TTransformer : class, ITransformer - { - var env = StaticPipeUtils.GetEnvironment(data); - Contracts.AssertValue(env); - env.CheckParam(numFolds > 1, nameof(numFolds), "Must be more than 1"); - env.CheckValue(label, nameof(label)); - env.CheckValueOrNull(stratificationColumn); - - var outputIndexer = StaticPipeUtils.GetIndexer(estimator); - var labelColumn = label(outputIndexer.Indices); - env.CheckParam(labelColumn != null, nameof(stratificationColumn), "Stratification column not found"); - var labelName = outputIndexer.Get(labelColumn); - - string stratName = null; - if (stratificationColumn != null) - { - var indexer = StaticPipeUtils.GetIndexer(data); - var column = stratificationColumn(indexer.Indices); - env.CheckParam(column != null, nameof(stratificationColumn), "Stratification column not found"); - stratName = indexer.Get(column); - } - - var results = catalog.CrossValidate(data.AsDynamic, estimator.AsDynamic, numFolds, labelName, stratName, seed); - - return results.Select(x => ( - x.Metrics, - new Transformer(env, (TTransformer)x.Model, data.Shape, estimator.Shape), - new DataView(env, x.ScoredHoldOutSet, estimator.Shape))) - .ToArray(); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/Transformer.cs b/src/Microsoft.ML.StaticPipe/Transformer.cs deleted file mode 100644 index c8cd9cc9f8..0000000000 --- a/src/Microsoft.ML.StaticPipe/Transformer.cs +++ /dev/null @@ -1,45 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; - -namespace Microsoft.ML.StaticPipe -{ - public sealed class Transformer : SchemaBearing - where TTransformer : class, ITransformer - { - public TTransformer AsDynamic { get; } - private readonly StaticSchemaShape _inShape; - - internal Transformer(IHostEnvironment env, TTransformer transformer, StaticSchemaShape inShape, StaticSchemaShape outShape) - : base(env, outShape) - { - Env.AssertValue(transformer); - Env.AssertValue(inShape); - AsDynamic = transformer; - _inShape = inShape; - // The ability to check at runtime is limited. We could check during transformation time on the input data view. - } - - public Transformer> - Append(Transformer transformer) - where TNewTransformer : class, ITransformer - { - Env.Assert(nameof(Append) == nameof(LearningPipelineExtensions.Append)); - - var trans = AsDynamic.Append(transformer.AsDynamic); - return new Transformer>(Env, trans, _inShape, transformer.Shape); - } - - public DataView Transform(DataView input) - { - Env.Assert(nameof(Transform) == nameof(ITransformer.Transform)); - Env.CheckValue(input, nameof(input)); - - var view = AsDynamic.Transform(input.AsDynamic); - return new DataView(Env, view, Shape); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs deleted file mode 100644 index 747f2ce11c..0000000000 --- a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs +++ /dev/null @@ -1,1683 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System; -using System.Collections.Generic; -using System.Linq; -using Microsoft.ML.Data; -using Microsoft.ML.Internal.Utilities; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms; -using Microsoft.ML.Transforms.Text; -using static Microsoft.ML.Transforms.Text.TextFeaturizingEstimator; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// Extensions for statically typed . - /// - public static class GlobalContrastNormalizerStaticExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly Vector Input; - - public OutPipelineColumn(Vector input, bool ensureZeroMean, bool ensureUnitStandardDeviation, float scale) - : base(new Reconciler(ensureZeroMean, ensureUnitStandardDeviation, scale), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly bool _ensureZeroMean; - private readonly bool _ensureUnitStandardDeviation; - private readonly float _scale; - - public Reconciler(bool ensureZeroMean, bool ensureUnitStandardDeviation, float scale) - { - _ensureZeroMean = ensureZeroMean; - _ensureUnitStandardDeviation = ensureUnitStandardDeviation; - _scale = scale; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var pairs = new List<(string outputColumnName, string inputColumnName)>(); - foreach (var outCol in toOutput) - pairs.Add((outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input])); - - return new GlobalContrastNormalizingEstimator(env, pairs.ToArray(), _ensureZeroMean, _ensureUnitStandardDeviation, _scale); - } - } - - /// - /// The column containing the vectors to apply the normalization to. - /// If , subtract mean from each value before normalizing and use the raw input otherwise. - /// If , resulted vector's standard deviation would be one. 
Otherwise, resulted vector's L2-norm would be one. - /// Scale features by this value. - public static Vector NormalizeGlobalContrast(this Vector input, - bool ensureZeroMean = LpNormNormalizingEstimatorBase.Defaults.GcnEnsureZeroMean, - bool ensureUnitStandardDeviation = LpNormNormalizingEstimatorBase.Defaults.EnsureUnitStdDev, - float scale = LpNormNormalizingEstimatorBase.Defaults.Scale) => new OutPipelineColumn(input, ensureZeroMean, ensureUnitStandardDeviation, scale); - } - - /// - /// Extensions for statically typed . - /// - public static class MutualInformationFeatureSelectorStaticExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly Vector Input; - public readonly PipelineColumn LabelColumn; - - public OutPipelineColumn(Vector input, Scalar labelColumn, int slotsInOutput, int numBins) - : base(new Reconciler(labelColumn, slotsInOutput, numBins), input, labelColumn) - { - Input = input; - LabelColumn = labelColumn; - } - - public OutPipelineColumn(Vector input, Scalar labelColumn, int slotsInOutput, int numBins) - : base(new Reconciler(labelColumn, slotsInOutput, numBins), input, labelColumn) - { - Input = input; - LabelColumn = labelColumn; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly PipelineColumn _labelColumn; - private readonly int _slotsInOutput; - private readonly int _numBins; - - public Reconciler(PipelineColumn labelColumn, int slotsInOutput, int numBins) - { - _labelColumn = labelColumn; - _slotsInOutput = slotsInOutput; - _numBins = numBins; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var pairs = new List<(string outputColumnName, string inputColumnName)>(); - foreach (var outCol in toOutput) - pairs.Add((outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input])); - - return new 
MutualInformationFeatureSelectingEstimator(env, inputNames[_labelColumn], _slotsInOutput, _numBins, pairs.ToArray()); - } - } - - /// - /// Name of the input column. - /// Name of the column to use for labels. - /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. - /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. - /// - /// - /// - /// - /// - public static Vector SelectFeaturesBasedOnMutualInformation( - this Vector input, - Scalar labelColumn, - int slotsInOutput = MutualInformationFeatureSelectingEstimator.Defaults.SlotsInOutput, - int numBins = MutualInformationFeatureSelectingEstimator.Defaults.NumBins) => new OutPipelineColumn(input, labelColumn, slotsInOutput, numBins); - - /// - /// Name of the input column. - /// Name of the column to use for labels. - /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. - /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. - /// - /// - /// - /// - /// - public static Vector SelectFeaturesBasedOnMutualInformation( - this Vector input, - Scalar labelColumn, - int slotsInOutput = MutualInformationFeatureSelectingEstimator.Defaults.SlotsInOutput, - int numBins = MutualInformationFeatureSelectingEstimator.Defaults.NumBins) => new OutPipelineColumn(input, labelColumn, slotsInOutput, numBins); - - /// - /// Name of the input column. - /// Name of the column to use for labels. - /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. - /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. 
- /// - /// - /// - /// - /// - public static Vector SelectFeaturesBasedOnMutualInformation( - this Vector input, - Scalar labelColumn, - int slotsInOutput = MutualInformationFeatureSelectingEstimator.Defaults.SlotsInOutput, - int numBins = MutualInformationFeatureSelectingEstimator.Defaults.NumBins) => new OutPipelineColumn(input, labelColumn, slotsInOutput, numBins); - - /// - /// Name of the input column. - /// Name of the column to use for labels. - /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. - /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. - /// - /// - /// - /// - /// - public static Vector SelectFeaturesBasedOnMutualInformation( - this Vector input, - Scalar labelColumn, - int slotsInOutput = MutualInformationFeatureSelectingEstimator.Defaults.SlotsInOutput, - int numBins = MutualInformationFeatureSelectingEstimator.Defaults.NumBins) => new OutPipelineColumn(input, labelColumn, slotsInOutput, numBins); - - /// - /// Name of the input column. - /// Name of the column to use for labels. - /// The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns. - /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. - /// - /// - /// - /// - /// - public static Vector SelectFeaturesBasedOnMutualInformation( - this Vector input, - Scalar labelColumn, - int slotsInOutput = MutualInformationFeatureSelectingEstimator.Defaults.SlotsInOutput, - int numBins = MutualInformationFeatureSelectingEstimator.Defaults.NumBins) => new OutPipelineColumn(input, labelColumn, slotsInOutput, numBins); - - /// - /// Name of the input column. - /// Name of the column to use for labels. - /// The maximum number of slots to preserve in the output. 
The number of slots to preserve is taken across all input columns. - /// Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended. - /// - /// - /// - /// - /// - public static Vector SelectFeaturesBasedOnMutualInformation( - this Vector input, - Scalar labelColumn, - int slotsInOutput = MutualInformationFeatureSelectingEstimator.Defaults.SlotsInOutput, - int numBins = MutualInformationFeatureSelectingEstimator.Defaults.NumBins) => new OutPipelineColumn(input, labelColumn, slotsInOutput, numBins); - } - - /// - /// Extensions for statically typed . - /// - public static class CountFeatureSelectorStaticExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly Vector Input; - - public OutPipelineColumn(Vector input, long count) - : base(new Reconciler(count), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly long _count; - - public Reconciler(long count) - { - _count = count; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var infos = new CountFeatureSelectingEstimator.ColumnOptions[toOutput.Length]; - for (int i = 0; i < toOutput.Length; i++) - infos[i] = new CountFeatureSelectingEstimator.ColumnOptions(outputNames[toOutput[i]], inputNames[((OutPipelineColumn)toOutput[i]).Input], _count); - - return new CountFeatureSelectingEstimator(env, infos); - } - } - - /// - /// Name of the input column. - /// If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved. 
- /// - /// - /// - /// - /// - public static Vector SelectFeaturesBasedOnCount(this Vector input, - long count = CountFeatureSelectingEstimator.Defaults.Count) => new OutPipelineColumn(input, count); - - /// - /// Name of the input column. - /// If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved. - /// - /// - /// - /// - /// - public static Vector SelectFeaturesBasedOnCount(this Vector input, - long count = CountFeatureSelectingEstimator.Defaults.Count) => new OutPipelineColumn(input, count); - - /// - /// Name of the input column. - /// If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved. - /// - /// - /// - /// - /// - public static Vector SelectFeaturesBasedOnCount(this Vector input, - long count = CountFeatureSelectingEstimator.Defaults.Count) => new OutPipelineColumn(input, count); - } - - /// - /// Extension methods for the static-pipeline over objects. - /// - public static class KeyToBinaryVectorStaticExtensions - { - private interface IColInput - { - PipelineColumn Input { get; } - } - - private sealed class OutVectorColumn : Vector, IColInput - { - public PipelineColumn Input { get; } - - public OutVectorColumn(Vector> input) - : base(Reconciler.Inst, input) - { - Input = input; - } - - public OutVectorColumn(Key input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutVarVectorColumn : VarVector, IColInput - { - public PipelineColumn Input { get; } - public OutVarVectorColumn(VarVector> input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutVectorColumn : Vector, IColInput - { - public PipelineColumn Input { get; } - - public OutVectorColumn(Vector> input) - : base(Reconciler.Inst, input) - { - Input = input; - } - - public OutVectorColumn(Key input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class 
OutVarVectorColumn : VarVector, IColInput - { - public PipelineColumn Input { get; } - public OutVarVectorColumn(VarVector> input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - public static Reconciler Inst = new Reconciler(); - - private Reconciler() { } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var infos = new (string outputColumnName, string inputColumnName)[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var col = (IColInput)toOutput[i]; - infos[i] = (outputNames[toOutput[i]], inputNames[col.Input]); - } - return new KeyToBinaryVectorMappingEstimator(env, infos); - } - } - - /// - /// Takes a column of key type of known cardinality and produces a vector of bits representing the key in binary form. - /// The first value is encoded as all zeros and missing values are encoded as all ones. - /// In the case where a vector has multiple keys, the encoded values are concatenated. - /// Number of bits per key is determined as the number of bits needed to represent the cardinality of the keys plus one. - /// - public static Vector ToBinaryVector(this Key input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Takes a column of key type of known cardinality and produces a vector of bits representing the key in binary form. - /// The first value is encoded as all zeros and missing values are encoded as all ones. - /// In the case where a vector has multiple keys, the encoded values are concatenated. - /// Number of bits per key is determined as the number of bits needed to represent the cardinality of the keys plus one. 
- /// - public static Vector ToBinaryVector(this Vector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Takes a column of key type of known cardinality and produces a vector of bits representing the key in binary form. - /// The first value is encoded as all zeros and missing values are encoded as all ones. - /// In the case where a vector has multiple keys, the encoded values are concatenated. - /// Number of bits per key is determined as the number of bits needed to represent the cardinality of the keys plus one. - /// - public static VarVector ToBinaryVector(this VarVector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input); - } - - /// - /// Takes a column of key type of known cardinality and produces a vector of bits representing the key in binary form. - /// The first value is encoded as all zeros and missing values are encoded as all ones. - /// In the case where a vector has multiple keys, the encoded values are concatenated. - /// Number of bits per key is determined as the number of bits needed to represent the cardinality of the keys plus one. - /// - public static Vector ToBinaryVector(this Key input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Takes a column of key type of known cardinality and produces a vector of bits representing the key in binary form. - /// The first value is encoded as all zeros and missing values are encoded as all ones. - /// In the case where a vector has multiple keys, the encoded values are concatenated. - /// Number of bits per key is determined as the number of bits needed to represent the cardinality of the keys plus one. 
- /// - public static Vector ToBinaryVector(this Vector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Takes a column of key type of known cardinality and produces a vector of bits representing the key in binary form. - /// The first value is encoded as all zeros and missing values are encoded as all ones. - /// In the case where a vector has multiple keys, the encoded values are concatenated. - /// Number of bits per key is determined as the number of bits needed to represent the cardinality of the keys plus one. - /// - public static VarVector ToBinaryVector(this VarVector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input); - } - } - - /// - /// Extension methods for the static-pipeline over objects. - /// - public static class KeyToVectorStaticExtensions - { - private interface IColInput - { - PipelineColumn Input { get; } - bool Bag { get; } - } - - private sealed class OutVectorColumn : Vector, IColInput - { - public PipelineColumn Input { get; } - public bool Bag { get; } - - public OutVectorColumn(Key input) - : base(Reconciler.Inst, input) - { - Input = input; - Bag = false; - } - - public OutVectorColumn(Vector> input, bool bag) - : base(Reconciler.Inst, input) - { - Input = input; - Bag = bag; - } - - public OutVectorColumn(VarVector> input) - : base(Reconciler.Inst, input) - { - Input = input; - Bag = true; - } - } - - private sealed class OutVarVectorColumn : VarVector, IColInput - { - public PipelineColumn Input { get; } - public bool Bag { get; } - - public OutVarVectorColumn(VarVector> input) - : base(Reconciler.Inst, input) - { - Input = input; - Bag = false; - } - } - - private sealed class OutVectorColumn : Vector, IColInput - { - public PipelineColumn Input { get; } - public bool Bag { get; } - - public OutVectorColumn(Key input) - : base(Reconciler.Inst, input) - { - Input = input; - Bag = false; - } - - public OutVectorColumn(Vector> 
input, bool bag) - : base(Reconciler.Inst, input) - { - Input = input; - Bag = bag; - } - - public OutVectorColumn(VarVector> input) - : base(Reconciler.Inst, input) - { - Input = input; - Bag = true; - } - } - - private sealed class OutVarVectorColumn : VarVector, IColInput - { - public PipelineColumn Input { get; } - public bool Bag { get; } - - public OutVarVectorColumn(VarVector> input) - : base(Reconciler.Inst, input) - { - Input = input; - Bag = false; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - public static Reconciler Inst = new Reconciler(); - - private Reconciler() { } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var infos = new KeyToVectorMappingEstimator.ColumnOptions[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var col = (IColInput)toOutput[i]; - infos[i] = new KeyToVectorMappingEstimator.ColumnOptions(outputNames[toOutput[i]], inputNames[col.Input], col.Bag); - } - return new KeyToVectorMappingEstimator(env, infos); - } - } - - /// - /// Takes a column of key type of known cardinality and produces an indicator vector of floats. - /// Each key value of the input is used to create an indicator vector: the indicator vector is the length of the key cardinality, - /// where all values are 0, except for the entry corresponding to the value of the key, which is 1. - /// If the key value is missing, then all values are 0. Naturally this tends to generate very sparse vectors. - /// - public static Vector ToVector(this Key input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Takes a column of key type of known cardinality and produces an indicator vector of floats. 
- /// Each key value of the input is used to create an indicator vector: the indicator vector is the length of the key cardinality, - /// where all values are 0, except for the entry corresponding to the value of the key, which is 1. - /// If the key value is missing, then all values are 0. Naturally this tends to generate very sparse vectors. - /// - public static Vector ToVector(this Vector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input, false); - } - - /// - /// Takes a column of key type of known cardinality and produces an indicator vector of floats. - /// Each key value of the input is used to create an indicator vector: the indicator vector is the length of the key cardinality, - /// where all values are 0, except for the entry corresponding to the value of the key, which is 1. - /// If the key value is missing, then all values are 0. Naturally this tends to generate very sparse vectors. - /// In this case then the indicator vectors for all values in the column will be simply added together, - /// to produce the final vector with type equal to the key cardinality; so, in all cases, whether vector or scalar, - /// the output column will be a vector type of length equal to that cardinality. - /// - public static VarVector ToVector(this VarVector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input); - } - - /// - /// Takes a column of key type of known cardinality and produces an indicator vector of floats. - /// Each key value of the input is used to create an indicator vector: the indicator vector is the length of the key cardinality, - /// where all values are 0, except for the entry corresponding to the value of the key, which is 1. - /// If the key value is missing, then all values are 0. Naturally this tends to generate very sparse vectors. 
- /// In this case then the indicator vectors for all values in the column will be simply added together, - /// to produce the final vector with type equal to the key cardinality; so, in all cases, whether vector or scalar, - /// the output column will be a vector type of length equal to that cardinality. - /// - public static Vector ToBaggedVector(this Vector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input, true); - } - - /// - /// Takes a column of key type of known cardinality and produces an indicator vector of floats. - /// Each key value of the input is used to create an indicator vector: the indicator vector is the length of the key cardinality, - /// where all values are 0, except for the entry corresponding to the value of the key, which is 1. - /// If the key value is missing, then all values are 0. Naturally this tends to generate very sparse vectors. - /// In this case then the indicator vectors for all values in the column will be simply added together, - /// to produce the final vector with type equal to the key cardinality; so, in all cases, whether vector or scalar, - /// the output column will be a vector type of length equal to that cardinality. - /// - public static Vector ToBaggedVector(this VarVector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Takes a column of key type of known cardinality and produces an indicator vector of floats. - /// Each key value of the input is used to create an indicator vector: the indicator vector is the length of the key cardinality, - /// where all values are 0, except for the entry corresponding to the value of the key, which is 1. - /// If the key value is missing, then all values are 0. Naturally this tends to generate very sparse vectors. 
- /// - public static Vector ToVector(this Key input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Takes a column of key type of known cardinality and produces an indicator vector of floats. - /// Each key value of the input is used to create an indicator vector: the indicator vector is the length of the key cardinality, - /// where all values are 0, except for the entry corresponding to the value of the key, which is 1. - /// If the key value is missing, then all values are 0. Naturally this tends to generate very sparse vectors. - /// - public static Vector ToVector(this Vector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input, false); - } - - /// - /// Takes a column of key type of known cardinality and produces an indicator vector of floats. - /// Each key value of the input is used to create an indicator vector: the indicator vector is the length of the key cardinality, - /// where all values are 0, except for the entry corresponding to the value of the key, which is 1. - /// If the key value is missing, then all values are 0. Naturally this tends to generate very sparse vectors. - /// In this case then the indicator vectors for all values in the column will be simply added together, - /// to produce the final vector with type equal to the key cardinality; so, in all cases, whether vector or scalar, - /// the output column will be a vector type of length equal to that cardinality. - /// - public static VarVector ToVector(this VarVector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input); - } - - /// - /// Takes a column of key type of known cardinality and produces an indicator vector of floats. 
- /// Each key value of the input is used to create an indicator vector: the indicator vector is the length of the key cardinality, - /// where all values are 0, except for the entry corresponding to the value of the key, which is 1. - /// If the key value is missing, then all values are 0. Naturally this tends to generate very sparse vectors. - /// In this case then the indicator vectors for all values in the column will be simply added together, - /// to produce the final vector with type equal to the key cardinality; so, in all cases, whether vector or scalar, - /// the output column will be a vector type of length equal to that cardinality. - /// - public static Vector ToBaggedVector(this Vector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input, true); - } - - /// - /// Takes a column of key type of known cardinality and produces an indicator vector of floats. - /// Each key value of the input is used to create an indicator vector: the indicator vector is the length of the key cardinality, - /// where all values are 0, except for the entry corresponding to the value of the key, which is 1. - /// If the key value is missing, then all values are 0. Naturally this tends to generate very sparse vectors. - /// In this case then the indicator vectors for all values in the column will be simply added together, - /// to produce the final vector with type equal to the key cardinality; so, in all cases, whether vector or scalar, - /// the output column will be a vector type of length equal to that cardinality. - /// - public static Vector ToBaggedVector(this VarVector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - } - - /// - /// Extension methods for the static-pipeline over objects. 
- /// - public static class NAReplacerStaticExtensions - { - private readonly struct Config - { - public readonly bool ImputeBySlot; - public readonly MissingValueReplacingEstimator.ReplacementMode ReplacementMode; - - public Config(MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode, - bool imputeBySlot = MissingValueReplacingEstimator.Defaults.ImputeBySlot) - { - ImputeBySlot = imputeBySlot; - ReplacementMode = replacementMode; - } - } - - private interface IColInput - { - PipelineColumn Input { get; } - Config Config { get; } - } - - private sealed class OutScalar : Scalar, IColInput - { - public PipelineColumn Input { get; } - public Config Config { get; } - - public OutScalar(Scalar input, Config config) - : base(Reconciler.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class OutVectorColumn : Vector, IColInput - { - public PipelineColumn Input { get; } - public Config Config { get; } - - public OutVectorColumn(Vector input, Config config) - : base(Reconciler.Inst, input) - { - Input = input; - Config = config; - } - - } - - private sealed class OutVarVectorColumn : VarVector, IColInput - { - public PipelineColumn Input { get; } - public Config Config { get; } - - public OutVarVectorColumn(VarVector input, Config config) - : base(Reconciler.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - public static Reconciler Inst = new Reconciler(); - - private Reconciler() { } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var infos = new MissingValueReplacingEstimator.ColumnOptions[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var col = (IColInput)toOutput[i]; - infos[i] = new 
MissingValueReplacingEstimator.ColumnOptions(outputNames[toOutput[i]], inputNames[col.Input], col.Config.ReplacementMode, col.Config.ImputeBySlot); - } - return new MissingValueReplacingEstimator(env, infos); - } - } - - /// - /// Scan through all rows and replace NaN values according to replacement strategy. - /// - /// Incoming data. - /// How NaN should be replaced - public static Scalar ReplaceNaNValues(this Scalar input, MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode) - { - Contracts.CheckValue(input, nameof(input)); - return new OutScalar(input, new Config(replacementMode, false)); - } - - /// - /// Scan through all rows and replace NaN values according to replacement strategy. - /// - /// Incoming data. - /// How NaN should be replaced - public static Scalar ReplaceNaNValues(this Scalar input, MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode) - { - Contracts.CheckValue(input, nameof(input)); - return new OutScalar(input, new Config(replacementMode, false)); - } - /// - /// Scan through all rows and replace NaN values according to replacement strategy. - /// - /// Incoming data. - /// How NaN should be replaced - /// If true, per-slot imputation of replacement is performed. - /// Otherwise, replacement value is imputed for the entire vector column. This setting is ignored for scalars and variable vectors, - /// where imputation is always for the entire column. - public static Vector ReplaceNaNValues(this Vector input, MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode, bool imputeBySlot = MissingValueReplacingEstimator.Defaults.ImputeBySlot) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input, new Config(replacementMode, imputeBySlot)); - } - - /// - /// Scan through all rows and replace NaN values according to replacement strategy. 
- /// - /// Incoming data. - /// How NaN should be replaced - /// If true, per-slot imputation of replacement is performed. - /// Otherwise, replacement value is imputed for the entire vector column. This setting is ignored for scalars and variable vectors, - /// where imputation is always for the entire column. - public static Vector ReplaceNaNValues(this Vector input, MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode, bool imputeBySlot = MissingValueReplacingEstimator.Defaults.ImputeBySlot) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input, new Config(replacementMode, imputeBySlot)); - } - - /// - /// Scan through all rows and replace NaN values according to replacement strategy. - /// - /// Incoming data. - /// How NaN should be replaced - public static VarVector ReplaceNaNValues(this VarVector input, MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input, new Config(replacementMode, false)); - } - /// - /// Scan through all rows and replace NaN values according to replacement strategy. - /// - /// Incoming data. 
- /// How NaN should be replaced - public static VarVector ReplaceNaNValues(this VarVector input, MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input, new Config(replacementMode, false)); - } - } - - public static partial class ConvertStaticExtensions - { - - private interface IConvertCol - { - PipelineColumn Input { get; } - InternalDataKind Kind { get; } - } - - private sealed class ImplScalar : Scalar, IConvertCol - { - public PipelineColumn Input { get; } - public InternalDataKind Kind { get; } - public ImplScalar(PipelineColumn input, InternalDataKind kind) : base(Rec.Inst, input) - { - Input = input; - Kind = kind; - } - } - - private sealed class ImplVector : Vector, IConvertCol - { - public PipelineColumn Input { get; } - public InternalDataKind Kind { get; } - public ImplVector(PipelineColumn input, InternalDataKind kind) : base(Rec.Inst, input) - { - Input = input; - Kind = kind; - } - } - - private sealed class ImplVarVector : VarVector, IConvertCol - { - public PipelineColumn Input { get; } - public InternalDataKind Kind { get; } - public ImplVarVector(PipelineColumn input, InternalDataKind kind) : base(Rec.Inst, input) - { - Input = input; - Kind = kind; - } - } - - private sealed class Rec : EstimatorReconciler - { - public static readonly Rec Inst = new Rec(); - - public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) - { - var infos = new TypeConvertingEstimator.ColumnOptions[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var tcol = (IConvertCol)toOutput[i]; - infos[i] = new TypeConvertingEstimator.ColumnOptions(outputNames[toOutput[i]], tcol.Kind.ToDataKind(), inputNames[tcol.Input]); - } - return new TypeConvertingEstimator(env, infos); - } - } - } - - 
public static partial class TermStaticExtensions - { - // I am not certain I see a good way to cover the distinct types beyond complete enumeration. - // Raw generics would allow illegal possible inputs, for example, Scalar. So, this is a partial - // class, and all the public facing extension methods for each possible type are in a T4 generated result. - - private const KeyOrdinality DefSort = (KeyOrdinality)ValueToKeyMappingEstimator.Defaults.Ordinality; - private const int DefMax = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys; - - private readonly struct Config - { - public readonly KeyOrdinality Order; - public readonly int Max; - public readonly Action OnFit; - - public Config(KeyOrdinality order, int max, Action onFit) - { - Order = order; - Max = max; - OnFit = onFit; - } - } - - private static Action Wrap(ToKeyFitResult.OnFit onFit) - { - if (onFit == null) - return null; - // The type T asociated with the delegate will be the actual value type once #863 goes in. - // However, until such time as #863 goes in, it would be too awkward to attempt to extract the metadata. - // For now construct the useless object then pass it into the delegate. 
- return map => onFit(new ToKeyFitResult(map)); - } - - private interface ITermCol - { - PipelineColumn Input { get; } - Config Config { get; } - } - - private sealed class ImplScalar : Key, ITermCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplScalar(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class ImplVector : Vector>, ITermCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplVector(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class ImplVarVector : VarVector>, ITermCol - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplVarVector(PipelineColumn input, Config config) : base(Rec.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class Rec : EstimatorReconciler - { - public static readonly Rec Inst = new Rec(); - - public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var infos = new ValueToKeyMappingEstimator.ColumnOptions[toOutput.Length]; - Action onFit = null; - for (int i = 0; i < toOutput.Length; ++i) - { - var tcol = (ITermCol)toOutput[i]; - infos[i] = new ValueToKeyMappingEstimator.ColumnOptions(outputNames[toOutput[i]], inputNames[tcol.Input], - tcol.Config.Max, (ValueToKeyMappingEstimator.KeyOrdinality)tcol.Config.Order); - if (tcol.Config.OnFit != null) - { - int ii = i; // Necessary because if we capture i that will change to toOutput.Length on call. 
- onFit += tt => tcol.Config.OnFit(tt.GetTermMap(ii)); - } - } - var est = new ValueToKeyMappingEstimator(env, infos); - if (onFit == null) - return est; - return est.WithOnFitDelegate(onFit); - } - } - } - - /// - /// Extension methods for the static-pipeline over objects. - /// - public static class KeyToValueStaticExtensions - { - private interface IColInput - { - PipelineColumn Input { get; } - } - - private sealed class OutKeyColumn : Key, IColInput - { - public PipelineColumn Input { get; } - - public OutKeyColumn(Key> input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutScalarColumn : Scalar, IColInput - { - public PipelineColumn Input { get; } - - public OutScalarColumn(Key input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutVectorColumn : Vector, IColInput - { - public PipelineColumn Input { get; } - - public OutVectorColumn(Vector> input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutVarVectorColumn : VarVector, IColInput - { - public PipelineColumn Input { get; } - - public OutVarVectorColumn(VarVector> input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - public static Reconciler Inst = new Reconciler(); - - private Reconciler() { } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var cols = new (string outputColumnName, string inputColumnName)[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var outCol = (IColInput)toOutput[i]; - cols[i] = (outputNames[toOutput[i]], inputNames[outCol.Input]); - } - return new KeyToValueMappingEstimator(env, cols); - } - } - - /// - /// Convert a key column to a column containing the corresponding value. 
- /// - public static Key ToValue(this Key> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutKeyColumn(input); - } - - /// - /// Convert a key column to a column containing the corresponding value. - /// - public static Scalar ToValue(this Key input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutScalarColumn(input); - } - - /// - /// Convert a key column to a column containing the corresponding value. - /// - public static Vector ToValue(this Vector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Convert a key column to a column containing the corresponding value. - /// - public static VarVector ToValue(this VarVector> input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input); - } - } - - /// - /// The extension methods and implementation support for concatenating columns together. - /// - public static class ConcatStaticExtensions - { - /// - /// Given a scalar vector, produce a vector of length one. - /// - /// The value type. - /// The scalar column. - /// The vector column, whose single item has the same value as the input. - public static Vector AsVector(this Scalar me) - => new Impl(Join(me, (PipelineColumn[])null)); - - /// - /// Given a bunch of normalized vectors, concatenate them together into a normalized vector. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. - public static NormVector ConcatWith(this NormVector me, params NormVector[] others) - => new ImplNorm(Join(me, others)); - - /// - /// Given a set of columns, concatenate them together into a vector valued column of the same type. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. 
- public static Vector ConcatWith(this Scalar me, params ScalarOrVector[] others) - => new Impl(Join(me, others)); - - /// - /// Given a set of columns, concatenate them together into a vector valued column of the same type. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. - public static Vector ConcatWith(this Vector me, params ScalarOrVector[] others) - => new Impl(Join(me, others)); - - /// - /// Given a set of columns including at least one variable sized vector column, concatenate them - /// together into a vector valued column of the same type. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. - public static VarVector ConcatWith(this Scalar me, params ScalarOrVectorOrVarVector[] others) - => new ImplVar(Join(me, others)); - - /// - /// Given a set of columns including at least one variable sized vector column, concatenate them - /// together into a vector valued column of the same type. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. - public static VarVector ConcatWith(this Vector me, params ScalarOrVectorOrVarVector[] others) - => new ImplVar(Join(me, others)); - - /// - /// Given a set of columns including at least one variable sized vector column, concatenate them - /// together into a vector valued column of the same type. - /// - /// The value type. - /// The first input column. - /// Subsequent input columns. - /// The result of concatenating all input columns together. 
- public static VarVector ConcatWith(this VarVector me, params ScalarOrVectorOrVarVector[] others) - => new ImplVar(Join(me, others)); - - private interface IContainsColumn - { - PipelineColumn WrappedColumn { get; } - } - - /// - /// A wrapping object for the implicit conversions in - /// and other related methods. - /// - /// The value type. - public sealed class ScalarOrVector : ScalarOrVectorOrVarVector - { - private ScalarOrVector(PipelineColumn col) : base(col) { } - public static implicit operator ScalarOrVector(Scalar c) => new ScalarOrVector(c); - public static implicit operator ScalarOrVector(Vector c) => new ScalarOrVector(c); - public static implicit operator ScalarOrVector(NormVector c) => new ScalarOrVector(c); - } - - /// - /// A wrapping object for the implicit conversions in - /// and other related methods. - /// - /// The value type. - public class ScalarOrVectorOrVarVector : IContainsColumn - { - public PipelineColumn WrappedColumn { get; } - - private protected ScalarOrVectorOrVarVector(PipelineColumn col) - { - Contracts.CheckValue(col, nameof(col)); - WrappedColumn = col; - } - - public static implicit operator ScalarOrVectorOrVarVector(VarVector c) - => new ScalarOrVectorOrVarVector(c); - } - - #region Implementation support - private sealed class Rec : EstimatorReconciler - { - /// - /// For the moment the concat estimator can only do one at a time, so I want to apply these operations - /// one at a time, which means a separate reconciler. Otherwise there may be problems with name overwriting. - /// If that is ever adjusted, then we can make a slightly more efficient reconciler, though this is probably - /// not that important of a consideration from a runtime perspective. 
- /// - public static Rec Inst => new Rec(); - - private Rec() { } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - // For the moment, the concat estimator can only do one concatenation at a time. - // So we will chain the estimators. - Contracts.AssertNonEmpty(toOutput); - IEstimator est = null; - for (int i = 0; i < toOutput.Length; ++i) - { - var ccol = (IConcatCol)toOutput[i]; - string[] inputs = ccol.Sources.Select(s => inputNames[s]).ToArray(); - var localEst = new ColumnConcatenatingEstimator(env, outputNames[toOutput[i]], inputs); - if (i == 0) - est = localEst; - else - est = est.Append(localEst); - } - return est; - } - } - - private static PipelineColumn[] Join(PipelineColumn col, IContainsColumn[] cols) - { - if (Utils.Size(cols) == 0) - return new[] { col }; - var retVal = new PipelineColumn[cols.Length + 1]; - retVal[0] = col; - for (int i = 0; i < cols.Length; ++i) - retVal[i + 1] = cols[i].WrappedColumn; - return retVal; - } - - private static PipelineColumn[] Join(PipelineColumn col, PipelineColumn[] cols) - { - if (Utils.Size(cols) == 0) - return new[] { col }; - var retVal = new PipelineColumn[cols.Length + 1]; - retVal[0] = col; - Array.Copy(cols, 0, retVal, 1, cols.Length); - return retVal; - } - - private interface IConcatCol - { - PipelineColumn[] Sources { get; } - } - - private sealed class Impl : Vector, IConcatCol - { - public PipelineColumn[] Sources { get; } - public Impl(PipelineColumn[] cols) - : base(Rec.Inst, cols) - { - Sources = cols; - } - } - - private sealed class ImplVar : VarVector, IConcatCol - { - public PipelineColumn[] Sources { get; } - public ImplVar(PipelineColumn[] cols) - : base(Rec.Inst, cols) - { - Sources = cols; - } - } - - private sealed class ImplNorm : NormVector, IConcatCol - { - public PipelineColumn[] Sources { get; } - public ImplNorm(PipelineColumn[] cols) 
- : base(Rec.Inst, cols) - { - Sources = cols; - } - } - #endregion - } - - /// - /// Extension methods for the static-pipeline over objects. - /// - public static class NAIndicatorStaticExtensions - { - private interface IColInput - { - PipelineColumn Input { get; } - } - - private sealed class OutScalar : Scalar, IColInput - { - public PipelineColumn Input { get; } - - public OutScalar(Scalar input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutVectorColumn : Vector, IColInput - { - public PipelineColumn Input { get; } - - public OutVectorColumn(Vector input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class OutVarVectorColumn : VarVector, IColInput - { - public PipelineColumn Input { get; } - - public OutVarVectorColumn(VarVector input) - : base(Reconciler.Inst, input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - public static Reconciler Inst = new Reconciler(); - - private Reconciler() { } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - var columnPairs = new (string outputColumnName, string inputColumnName)[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var col = (IColInput)toOutput[i]; - columnPairs[i] = (outputNames[toOutput[i]], inputNames[col.Input]); - } - return new MissingValueIndicatorEstimator(env, columnPairs); - } - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. - /// A column indicating whether input column entries were missing. 
- public static Scalar IsMissingValue(this Scalar input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutScalar(input); - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. - /// A column indicating whether input column entries were missing. - public static Scalar IsMissingValue(this Scalar input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutScalar(input); - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. - /// A column indicating whether input column entries were missing. - public static Vector IsMissingValue(this Vector input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. - /// A column indicating whether input column entries were missing. - public static Vector IsMissingValue(this Vector input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVectorColumn(input); - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. - /// A column indicating whether input column entries were missing. - public static VarVector IsMissingValue(this VarVector input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input); - } - - /// - /// Produces a column of boolean entries indicating whether input column entries were missing. - /// - /// The input column. - /// A column indicating whether input column entries were missing. - public static VarVector IsMissingValue(this VarVector input) - { - Contracts.CheckValue(input, nameof(input)); - return new OutVarVectorColumn(input); - } - } - - /// - /// Extension methods for the static-pipeline over objects. 
- /// - public static class TextFeaturizerStaticExtensions - { - internal sealed class OutPipelineColumn : Vector - { - public readonly Scalar[] Inputs; - - public OutPipelineColumn(IEnumerable> inputs, Options options) - : base(new Reconciler(options), inputs.ToArray()) - { - Inputs = inputs.ToArray(); - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly Options _settings; - - public Reconciler(Options options) - { - _settings = options; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var outCol = (OutPipelineColumn)toOutput[0]; - var inputs = outCol.Inputs.Select(x => inputNames[x]); - return new TextFeaturizingEstimator(env, outputNames[outCol], inputs, _settings); - } - } - /// - /// Accept text data and converts it to array which represent combinations of n-gram/skip-gram token counts. - /// - /// Input data. - /// Additional data. - /// Advanced transform settings. - /// - public static Vector FeaturizeText(this Scalar input, Scalar[] otherInputs = null, TextFeaturizingEstimator.Options options = null) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckValueOrNull(otherInputs); - otherInputs = otherInputs ?? new Scalar[0]; - return new OutPipelineColumn(new[] { input }.Concat(otherInputs), options); - } - } - - public static class ApproximatedKernelMappingStaticExtenensions - { - private readonly struct Config - { - public readonly int Rank; - public readonly bool UseCosAndSinBases; - public readonly int? Seed; - public readonly KernelBase Generator; - - public Config(int rank, bool useCosAndSinBases, KernelBase generator, int? 
seed = null) - { - Rank = rank; - UseCosAndSinBases = useCosAndSinBases; - Generator = generator; - Seed = seed; - } - } - private interface IColInput - { - PipelineColumn Input { get; } - Config Config { get; } - } - - private sealed class ImplVector : Vector, IColInput - { - public PipelineColumn Input { get; } - public Config Config { get; } - public ImplVector(PipelineColumn input, Config config) : base(Reconciler.Inst, input) - { - Input = input; - Config = config; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - public static readonly Reconciler Inst = new Reconciler(); - - public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) - { - var infos = new ApproximatedKernelMappingEstimator.ColumnOptions[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var tcol = (IColInput)toOutput[i]; - infos[i] = new ApproximatedKernelMappingEstimator.ColumnOptions(outputNames[toOutput[i]], tcol.Config.Rank, tcol.Config.UseCosAndSinBases, inputNames[tcol.Input], tcol.Config.Generator, tcol.Config.Seed); - } - return new ApproximatedKernelMappingEstimator(env, infos); - } - } - - /// - /// It maps input to a random low-dimensional feature space. It is useful when data has non-linear features, since the transform - /// is designed so that the inner products of the transformed data are approximately equal to those in the feature space of a user - /// specified shift-invariant kernel. With this transform, we are able to use linear methods (which are scalable) to approximate more complex kernel SVM models. - /// - /// The column to apply Random Fourier transfomration. - /// The number of random Fourier features to create. - /// If , use both of cos and sin basis functions to create two features for every random Fourier frequency. - /// Otherwise, only cos bases would be used. - /// Which kernel to use. 
(if it is null, is used.) - /// The seed of the random number generator for generating the new features. If not specified global random would be used. - public static Vector ApproximatedKernelMap(this Vector input, - int rank = ApproximatedKernelMappingEstimator.Defaults.Rank, bool useCosAndSinBases = ApproximatedKernelMappingEstimator.Defaults.UseCosAndSinBases, - KernelBase generator = null, int? seed = null) - { - Contracts.CheckValue(input, nameof(input)); - return new ImplVector(input, new Config(rank, useCosAndSinBases, generator, seed)); - } - } - - public static class PcaStaticExtensions - { - private sealed class OutPipelineColumn : Vector - { - public readonly Vector Input; - - public OutPipelineColumn(Vector input, string weightColumn, int rank, - int overSampling, bool ensureZeroMean, int? seed = null) - : base(new Reconciler(weightColumn, rank, overSampling, ensureZeroMean, seed), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly PrincipalComponentAnalyzer.ColumnOptions _colInfo; - - public Reconciler(string weightColumn, int rank, int overSampling, bool ensureZeroMean, int? 
seed = null) - { - _colInfo = new PrincipalComponentAnalyzer.ColumnOptions( - null, null, weightColumn, rank, overSampling, ensureZeroMean, seed); - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - var outCol = (OutPipelineColumn)toOutput[0]; - var inputColName = inputNames[outCol.Input]; - var outputColName = outputNames[outCol]; - return new PrincipalComponentAnalyzer(env, outputColName, inputColName, - _colInfo.WeightColumn, _colInfo.Rank, _colInfo.Oversampling, - _colInfo.EnsureZeroMean, _colInfo.Seed); - } - } - - /// - /// Replaces the input vector with its projection to the principal component subspace, - /// which can significantly reduce size of vector. - /// - /// - /// The column to apply PCA to. - /// The name of the weight column. - /// The number of components in the PCA. - /// Oversampling parameter for randomized PCA training. - /// If enabled, data is centered to be zero mean. - /// The seed for random number generation - /// Vector containing the principal components. - public static Vector ProjectToPrincipalComponents(this Vector input, - string weightColumn = PrincipalComponentAnalyzer.Defaults.WeightColumn, - int rank = PrincipalComponentAnalyzer.Defaults.Rank, - int overSampling = PrincipalComponentAnalyzer.Defaults.Oversampling, - bool ensureZeroMean = PrincipalComponentAnalyzer.Defaults.EnsureZeroMean, - int? seed = null) => new OutPipelineColumn(input, weightColumn, rank, overSampling, ensureZeroMean, seed); - } -} diff --git a/src/Microsoft.ML.StaticPipe/TreeTrainersStatic.cs b/src/Microsoft.ML.StaticPipe/TreeTrainersStatic.cs deleted file mode 100644 index bb9ef3514f..0000000000 --- a/src/Microsoft.ML.StaticPipe/TreeTrainersStatic.cs +++ /dev/null @@ -1,316 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. 
-// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using Microsoft.ML.Calibrators; -using Microsoft.ML.Runtime; -using Microsoft.ML.Trainers.FastTree; - -namespace Microsoft.ML.StaticPipe -{ - /// - /// FastTree extension methods. - /// - public static class TreeRegressionExtensions - { - /// - /// FastTree extension method. - /// Predicts a target using a decision tree regression model trained with the . - /// - /// The . - /// The label column. - /// The features column. - /// The optional weights column. - /// Total number of decision trees to create in the ensemble. - /// The maximum number of leaves per decision tree. - /// The minimal number of data points allowed in a leaf of a regression tree, out of the subsampled data. - /// The learning rate. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The Score output column indicating the predicted value. 
- /// - /// - /// - /// - public static Scalar FastTree(this RegressionCatalog.RegressionTrainers catalog, - Scalar label, Vector features, Scalar weights = null, - int numberOfLeaves = Defaults.NumberOfLeaves, - int numberOfTrees = Defaults.NumberOfTrees, - int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, - double learningRate = Defaults.LearningRate, - Action onFit = null) - { - CheckUserValues(label, features, weights, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate, onFit); - - var rec = new TrainerEstimatorReconciler.Regression( - (env, labelName, featuresName, weightsName) => - { - var trainer = new FastTreeRegressionTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, - numberOfTrees, minimumExampleCountPerLeaf, learningRate); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Score; - } - - /// - /// FastTree extension method. - /// Predicts a target using a decision tree regression model trained with the . - /// - /// The . - /// The label column. - /// The features column. - /// The optional weights column. - /// Algorithm advanced settings. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The Score output column indicating the predicted value. 
- /// - /// - /// - /// - public static Scalar FastTree(this RegressionCatalog.RegressionTrainers catalog, - Scalar label, Vector features, Scalar weights, - FastTreeRegressionTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValueOrNull(options); - CheckUserValues(label, features, weights, onFit); - - var rec = new TrainerEstimatorReconciler.Regression( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new FastTreeRegressionTrainer(env, options); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, weights); - - return rec.Score; - } - - /// - /// FastTree extension method. - /// Predict a target using a decision tree binary classification model trained with the . - /// - /// The . - /// The label column. - /// The features column. - /// The optional weights column. - /// Total number of decision trees to create in the ensemble. - /// The maximum number of leaves per decision tree. - /// The minimal number of data points allowed in a leaf of the tree, out of the subsampled data. - /// The learning rate. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), the calibrated prediction (from 0 to 1), and the predicted label. 
- /// - /// - /// - /// - public static (Scalar score, Scalar probability, Scalar predictedLabel) FastTree(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, Vector features, Scalar weights = null, - int numberOfLeaves = Defaults.NumberOfLeaves, - int numberOfTrees = Defaults.NumberOfTrees, - int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, - double learningRate = Defaults.LearningRate, - Action> onFit = null) - { - CheckUserValues(label, features, weights, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate, onFit); - - var rec = new TrainerEstimatorReconciler.BinaryClassifier( - (env, labelName, featuresName, weightsName) => - { - var trainer = new FastTreeBinaryTrainer(env, labelName, featuresName, weightsName, numberOfLeaves, - numberOfTrees, minimumExampleCountPerLeaf, learningRate); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - else - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// FastTree extension method. - /// Predict a target using a decision tree binary classification model trained with the . - /// - /// The . - /// The label column. - /// The features column. - /// The optional weights column. - /// Algorithm advanced settings. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The set of output columns including in order the predicted binary classification score (which will range - /// from negative to positive infinity), the calibrated prediction (from 0 to 1), and the predicted label. 
- /// - /// - /// - /// - public static (Scalar score, Scalar probability, Scalar predictedLabel) FastTree(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, - Scalar label, Vector features, Scalar weights, - FastTreeBinaryTrainer.Options options, - Action> onFit = null) - { - Contracts.CheckValueOrNull(options); - CheckUserValues(label, features, weights, onFit); - - var rec = new TrainerEstimatorReconciler.BinaryClassifier( - (env, labelName, featuresName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new FastTreeBinaryTrainer(env, options); - - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - else - return trainer; - }, label, features, weights); - - return rec.Output; - } - - /// - /// FastTree . - /// Ranks a series of inputs based on their relevance, training a decision tree ranking model through the . - /// - /// The . - /// The label column. - /// The features column. - /// The groupId column. - /// The optional weights column. - /// Total number of decision trees to create in the ensemble. - /// The maximum number of leaves per decision tree. - /// The minimal number of data points allowed in a leaf of a regression tree, out of the subsampled data. - /// The learning rate. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The Score output column indicating the predicted value. 
- public static Scalar FastTree(this RankingCatalog.RankingTrainers catalog, - Scalar label, Vector features, Key groupId, Scalar weights = null, - int numberOfLeaves = Defaults.NumberOfLeaves, - int numberOfTrees = Defaults.NumberOfTrees, - int minimumExampleCountPerLeaf = Defaults.MinimumExampleCountPerLeaf, - double learningRate = Defaults.LearningRate, - Action onFit = null) - { - CheckUserValues(label, features, weights, numberOfLeaves, numberOfTrees, minimumExampleCountPerLeaf, learningRate, onFit); - - var rec = new TrainerEstimatorReconciler.Ranker( - (env, labelName, featuresName, groupIdName, weightsName) => - { - var trainer = new FastTreeRankingTrainer(env, labelName, featuresName, groupIdName, weightsName, numberOfLeaves, - numberOfTrees, minimumExampleCountPerLeaf, learningRate); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, groupId, weights); - - return rec.Score; - } - - /// - /// FastTree . - /// Ranks a series of inputs based on their relevance, training a decision tree ranking model through the . - /// - /// The . - /// The label column. - /// The features column. - /// The groupId column. - /// The optional weights column. - /// Algorithm advanced settings. - /// A delegate that is called every time the - /// method is called on the - /// instance created out of this. This delegate will receive - /// the linear model that was trained. Note that this action cannot change the result in any way; - /// it is only a way for the caller to be informed about what was learnt. - /// The Score output column indicating the predicted value. 
- public static Scalar FastTree(this RankingCatalog.RankingTrainers catalog, - Scalar label, Vector features, Key groupId, Scalar weights, - FastTreeRankingTrainer.Options options, - Action onFit = null) - { - Contracts.CheckValueOrNull(options); - CheckUserValues(label, features, weights, onFit); - - var rec = new TrainerEstimatorReconciler.Ranker( - (env, labelName, featuresName, groupIdName, weightsName) => - { - options.LabelColumnName = labelName; - options.FeatureColumnName = featuresName; - options.RowGroupColumnName = groupIdName; - options.ExampleWeightColumnName = weightsName; - - var trainer = new FastTreeRankingTrainer(env, options); - if (onFit != null) - return trainer.WithOnFitDelegate(trans => onFit(trans.Model)); - return trainer; - }, label, features, groupId, weights); - - return rec.Score; - } - - internal static void CheckUserValues(PipelineColumn label, Vector features, Scalar weights, - int numberOfLeaves, - int numberOfTrees, - int minimumExampleCountPerLeaf, - double learningRate, - Delegate onFit) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckParam(numberOfLeaves >= 2, nameof(numberOfLeaves), "Must be at least 2."); - Contracts.CheckParam(numberOfTrees > 0, nameof(numberOfTrees), "Must be positive"); - Contracts.CheckParam(minimumExampleCountPerLeaf > 0, nameof(minimumExampleCountPerLeaf), "Must be positive"); - Contracts.CheckParam(learningRate > 0, nameof(learningRate), "Must be positive"); - Contracts.CheckValueOrNull(onFit); - } - - internal static void CheckUserValues(PipelineColumn label, Vector features, Scalar weights, - Delegate onFit) - { - Contracts.CheckValue(label, nameof(label)); - Contracts.CheckValue(features, nameof(features)); - Contracts.CheckValueOrNull(weights); - Contracts.CheckValueOrNull(onFit); - } - } -} diff --git a/src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs 
b/src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs deleted file mode 100644 index 455699fe20..0000000000 --- a/src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs +++ /dev/null @@ -1,89 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Collections.Generic; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms.Text; - -namespace Microsoft.ML.StaticPipe -{ - public static class WordEmbeddingsStaticExtensions - { - /// - /// Vector of tokenized text. - /// The pretrained word embedding model. - /// - public static Vector WordEmbeddings(this VarVector input, WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) - { - Contracts.CheckValue(input, nameof(input)); - return new OutColumn(input, modelKind); - } - - /// - /// Vector of tokenized text. - /// The custom word embedding model file. - public static Vector WordEmbeddings(this VarVector input, string customModelFile) - { - Contracts.CheckValue(input, nameof(input)); - return new OutColumn(input, customModelFile); - } - - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn(VarVector input, WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) - : base(new Reconciler(modelKind), input) - { - Input = input; - } - - public OutColumn(VarVector input, string customModelFile = null) - : base(new Reconciler(customModelFile), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly WordEmbeddingEstimator.PretrainedModelKind? 
_modelKind; - private readonly string _customLookupTable; - - public Reconciler(WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) - { - _modelKind = modelKind; - _customLookupTable = null; - } - - public Reconciler(string customModelFile) - { - _modelKind = null; - _customLookupTable = customModelFile; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var cols = new WordEmbeddingEstimator.ColumnOptions[toOutput.Length]; - for (int i = 0; i < toOutput.Length; ++i) - { - var outCol = (OutColumn)toOutput[i]; - cols[i] = new WordEmbeddingEstimator.ColumnOptions(outputNames[outCol], inputNames[outCol.Input]); - } - - bool customLookup = !string.IsNullOrWhiteSpace(_customLookupTable); - if (customLookup) - return new WordEmbeddingEstimator(env, _customLookupTable, cols); - else - return new WordEmbeddingEstimator(env, _modelKind.Value, cols); - } - } - } -} diff --git a/src/Microsoft.ML.TensorFlow.StaticPipe/Microsoft.ML.TensorFlow.StaticPipe.csproj b/src/Microsoft.ML.TensorFlow.StaticPipe/Microsoft.ML.TensorFlow.StaticPipe.csproj deleted file mode 100644 index 76f95affa4..0000000000 --- a/src/Microsoft.ML.TensorFlow.StaticPipe/Microsoft.ML.TensorFlow.StaticPipe.csproj +++ /dev/null @@ -1,13 +0,0 @@ - - - - netstandard2.0 - - - - - - - - - diff --git a/src/Microsoft.ML.TensorFlow.StaticPipe/TensorFlowStaticExtensions.cs b/src/Microsoft.ML.TensorFlow.StaticPipe/TensorFlowStaticExtensions.cs deleted file mode 100644 index 213fe996a5..0000000000 --- a/src/Microsoft.ML.TensorFlow.StaticPipe/TensorFlowStaticExtensions.cs +++ /dev/null @@ -1,94 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. 
-// See the LICENSE file in the project root for more information. - -using System.Collections.Generic; -using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; - -namespace Microsoft.ML.Transforms.StaticPipe -{ - public static class TensorFlowStaticExtensions - { - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn(Vector input, string modelFile, bool addBatchDimensionInput) - : base(new Reconciler(modelFile, addBatchDimensionInput), input) - { - Input = input; - } - - public OutColumn(Vector input, TensorFlowModel tensorFlowModel, bool addBatchDimensionInput) - : base(new Reconciler(tensorFlowModel, addBatchDimensionInput), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly string _modelFile; - private readonly TensorFlowModel _tensorFlowModel; - private readonly bool _addBatchDimensionInput; - - public Reconciler(string modelFile, bool addBatchDimensionInput) - { - Contracts.AssertNonEmpty(modelFile); - _modelFile = modelFile; - _tensorFlowModel = null; - _addBatchDimensionInput = addBatchDimensionInput; - } - - public Reconciler(TensorFlowModel tensorFlowModel, bool addBatchDimensionInput) - { - Contracts.CheckValue(tensorFlowModel, nameof(tensorFlowModel)); - - _modelFile = null; - _tensorFlowModel = tensorFlowModel; - _addBatchDimensionInput = addBatchDimensionInput; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - - var outCol = (OutColumn)toOutput[0]; - if (_modelFile == null) - return new TensorFlowEstimator(env, new[] { outputNames[outCol] }, new[] { inputNames[outCol.Input] }, _tensorFlowModel, _addBatchDimensionInput); - else - return new TensorFlowEstimator(env, new[] { outputNames[outCol] }, new[] { inputNames[outCol.Input] }, 
_modelFile, _addBatchDimensionInput); - } - } - - // REVIEW: this method only covers one use case of using TensorFlow models: consuming one - // input and producing one output of floats. - // We could consider selectively adding some more extensions to enable common scenarios. - /// - /// Load the TensorFlow model from and run it on the input column and extract one output column. - /// The inputs and outputs are matched to TensorFlow graph nodes by name. - /// - public static Vector ApplyTensorFlowGraph(this Vector input, string modelFile, bool addBatchDimensionInput = false) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckNonEmpty(modelFile, nameof(modelFile)); - return new OutColumn(input, modelFile, addBatchDimensionInput); - } - - /// - /// Run a TensorFlow model provided through on the input column and extract one output column. - /// The inputs and outputs are matched to TensorFlow graph nodes by name. - /// - public static Vector ApplyTensorFlowGraph(this Vector input, TensorFlowModel tensorFlowModel, bool addBatchDimensionInput = false) - { - Contracts.CheckValue(input, nameof(input)); - Contracts.CheckValue(tensorFlowModel, nameof(tensorFlowModel)); - return new OutColumn(input, tensorFlowModel, addBatchDimensionInput); - } - } -} diff --git a/src/Microsoft.ML.TensorFlow/AssemblyInfo.cs b/src/Microsoft.ML.TensorFlow/AssemblyInfo.cs deleted file mode 100644 index ea6400c0d1..0000000000 --- a/src/Microsoft.ML.TensorFlow/AssemblyInfo.cs +++ /dev/null @@ -1,9 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System.Runtime.CompilerServices; -using Microsoft.ML; - -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TensorFlow.StaticPipe" + PublicKey.Value)] \ No newline at end of file diff --git a/src/Microsoft.ML.TensorFlow/Properties/AssemblyInfo.cs b/src/Microsoft.ML.TensorFlow/Properties/AssemblyInfo.cs index 9c3d11958d..db4c6427b6 100644 --- a/src/Microsoft.ML.TensorFlow/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.TensorFlow/Properties/AssemblyInfo.cs @@ -1,9 +1,8 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. using System.Runtime.CompilerServices; using Microsoft.ML; -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TensorFlow.StaticPipe" + PublicKey.Value)] - +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] diff --git a/src/Microsoft.ML.TimeSeries.StaticPipe/Microsoft.ML.TimeSeries.StaticPipe.csproj b/src/Microsoft.ML.TimeSeries.StaticPipe/Microsoft.ML.TimeSeries.StaticPipe.csproj deleted file mode 100644 index 08eaf4f029..0000000000 --- a/src/Microsoft.ML.TimeSeries.StaticPipe/Microsoft.ML.TimeSeries.StaticPipe.csproj +++ /dev/null @@ -1,13 +0,0 @@ - - - - netstandard2.0 - - - - - - - - - diff --git a/src/Microsoft.ML.TimeSeries.StaticPipe/TimeSeriesStatic.cs b/src/Microsoft.ML.TimeSeries.StaticPipe/TimeSeriesStatic.cs deleted file mode 100644 index 2575bf32b8..0000000000 --- a/src/Microsoft.ML.TimeSeries.StaticPipe/TimeSeriesStatic.cs +++ /dev/null @@ -1,311 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System.Collections.Generic; -using Microsoft.ML.Runtime; -using Microsoft.ML.Transforms.TimeSeries; - -namespace Microsoft.ML.StaticPipe -{ - - /// - /// Static API extension methods for . - /// - public static class IidChangePointStaticExtensions - { - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn( - Scalar input, - int confidence, - int changeHistoryLength, - MartingaleType martingale, - double eps) - : base(new Reconciler(confidence, changeHistoryLength, martingale, eps), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly int _confidence; - private readonly int _changeHistoryLength; - private readonly MartingaleType _martingale; - private readonly double _eps; - - public Reconciler( - int confidence, - int changeHistoryLength, - MartingaleType martingale, - double eps) - { - _confidence = confidence; - _changeHistoryLength = changeHistoryLength; - _martingale = martingale; - _eps = eps; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - var outCol = (OutColumn)toOutput[0]; - return new MLContext().Transforms.DetectIidChangePoint( - outputNames[outCol], - inputNames[outCol.Input], - _confidence, - _changeHistoryLength, - _martingale, - _eps); - } - } - - /// - /// Perform IID change point detection over a column of time series data. See . - /// - public static Vector DetectIidChangePoint( - this Scalar input, - int confidence, - int changeHistoryLength, - MartingaleType martingale = MartingaleType.Power, - double eps = 0.1) => new OutColumn(input, confidence, changeHistoryLength, martingale, eps); - } - - /// - /// Static API extension methods for . 
- /// - public static class IidSpikeDetectorStaticExtensions - { - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn(Scalar input, - int confidence, - int pvalueHistoryLength, - AnomalySide side) - : base(new Reconciler(confidence, pvalueHistoryLength, side), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly int _confidence; - private readonly int _pvalueHistoryLength; - private readonly AnomalySide _side; - - public Reconciler( - int confidence, - int pvalueHistoryLength, - AnomalySide side) - { - _confidence = confidence; - _pvalueHistoryLength = pvalueHistoryLength; - _side = side; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - var outCol = (OutColumn)toOutput[0]; - return new MLContext().Transforms.DetectIidSpike( - outputNames[outCol], - inputNames[outCol.Input], - _confidence, - _pvalueHistoryLength, - _side); - } - } - - /// - /// Perform IID spike detection over a column of time series data. See . - /// - public static Vector DetectIidSpike( - this Scalar input, - int confidence, - int pvalueHistoryLength, - AnomalySide side = AnomalySide.TwoSided - ) => new OutColumn(input, confidence, pvalueHistoryLength, side); - } - - /// - /// Static API extension methods for . 
- /// - public static class SsaChangePointStaticExtensions - { - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn(Scalar input, - int confidence, - int changeHistoryLength, - int trainingWindowSize, - int seasonalityWindowSize, - ErrorFunction errorFunction, - MartingaleType martingale, - double eps) - : base(new Reconciler(confidence, changeHistoryLength, trainingWindowSize, seasonalityWindowSize, errorFunction, martingale, eps), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly int _confidence; - private readonly int _changeHistoryLength; - private readonly int _trainingWindowSize; - private readonly int _seasonalityWindowSize; - private readonly ErrorFunction _errorFunction; - private readonly MartingaleType _martingale; - private readonly double _eps; - - public Reconciler( - int confidence, - int changeHistoryLength, - int trainingWindowSize, - int seasonalityWindowSize, - ErrorFunction errorFunction, - MartingaleType martingale, - double eps) - { - _confidence = confidence; - _changeHistoryLength = changeHistoryLength; - _trainingWindowSize = trainingWindowSize; - _seasonalityWindowSize = seasonalityWindowSize; - _errorFunction = errorFunction; - _martingale = martingale; - _eps = eps; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - var outCol = (OutColumn)toOutput[0]; - return new MLContext().Transforms.DetectChangePointBySsa( - outputNames[outCol], - inputNames[outCol.Input], - _confidence, - _changeHistoryLength, - _trainingWindowSize, - _seasonalityWindowSize, - _errorFunction, - _martingale, - _eps); - } - } - - /// - /// Perform SSA change point detection over a column of time series data. See . 
- /// - public static Vector DetectChangePointBySsa( - this Scalar input, - int confidence, - int changeHistoryLength, - int trainingWindowSize, - int seasonalityWindowSize, - ErrorFunction errorFunction = ErrorFunction.SignedDifference, - MartingaleType martingale = MartingaleType.Power, - double eps = 0.1) => new OutColumn(input, confidence, changeHistoryLength, trainingWindowSize, seasonalityWindowSize, errorFunction, martingale, eps); - } - - /// - /// Static API extension methods for . - /// - public static class SsaSpikeDetectorStaticExtensions - { - private sealed class OutColumn : Vector - { - public PipelineColumn Input { get; } - - public OutColumn(Scalar input, - int confidence, - int pvalueHistoryLength, - int trainingWindowSize, - int seasonalityWindowSize, - AnomalySide side, - ErrorFunction errorFunction) - : base(new Reconciler(confidence, pvalueHistoryLength, trainingWindowSize, seasonalityWindowSize, side, errorFunction), input) - { - Input = input; - } - } - - private sealed class Reconciler : EstimatorReconciler - { - private readonly int _confidence; - private readonly int _pvalueHistoryLength; - private readonly int _trainingWindowSize; - private readonly int _seasonalityWindowSize; - private readonly AnomalySide _side; - private readonly ErrorFunction _errorFunction; - - public Reconciler( - int confidence, - int pvalueHistoryLength, - int trainingWindowSize, - int seasonalityWindowSize, - AnomalySide side, - ErrorFunction errorFunction) - { - _confidence = confidence; - _pvalueHistoryLength = pvalueHistoryLength; - _trainingWindowSize = trainingWindowSize; - _seasonalityWindowSize = seasonalityWindowSize; - _side = side; - _errorFunction = errorFunction; - } - - public override IEstimator Reconcile(IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Contracts.Assert(toOutput.Length == 1); - var outCol = (OutColumn)toOutput[0]; - return 
new MLContext().Transforms.DetectSpikeBySsa( - outputNames[outCol], - inputNames[outCol.Input], - _confidence, - _pvalueHistoryLength, - _trainingWindowSize, - _seasonalityWindowSize, - _side, - _errorFunction); - } - } - - /// - /// Perform SSA spike detection over a column of time series data. See . - /// - public static Vector DetectSpikeBySsa( - this Scalar input, - int confidence, - int changeHistoryLength, - int trainingWindowSize, - int seasonalityWindowSize, - AnomalySide side = AnomalySide.TwoSided, - ErrorFunction errorFunction = ErrorFunction.SignedDifference - ) => new OutColumn(input, confidence, changeHistoryLength, trainingWindowSize, seasonalityWindowSize, side, errorFunction); - - } -} diff --git a/src/Microsoft.ML.Transforms/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Transforms/Properties/AssemblyInfo.cs index 7e8829aed4..2f38ef02fc 100644 --- a/src/Microsoft.ML.Transforms/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.Transforms/Properties/AssemblyInfo.cs @@ -5,7 +5,6 @@ using System.Runtime.CompilerServices; using Microsoft.ML; -[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Ensemble" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Core.Tests" + PublicKey.TestValue)] diff --git a/test/Microsoft.ML.CodeAnalyzer.Tests/Code/BestFriendTest.cs b/test/Microsoft.ML.CodeAnalyzer.Tests/Code/BestFriendTest.cs index ace3a88390..2d9b399133 100644 --- a/test/Microsoft.ML.CodeAnalyzer.Tests/Code/BestFriendTest.cs +++ b/test/Microsoft.ML.CodeAnalyzer.Tests/Code/BestFriendTest.cs @@ -85,7 +85,6 @@ public async Task BestFriend() // compilation will not be able to locate a single definition for use in the analyzer. 
test.TestState.AdditionalReferences.Remove(AdditionalMetadataReferences.MLNetCoreReference); test.TestState.AdditionalReferences.Remove(AdditionalMetadataReferences.MLNetDataReference); - test.TestState.AdditionalReferences.Remove(AdditionalMetadataReferences.MLNetStaticPipeReference); test.Exclusions &= ~AnalysisExclusions.GeneratedCode; test.ExpectedDiagnostics.AddRange(expected); diff --git a/test/Microsoft.ML.CodeAnalyzer.Tests/Helpers/AdditionalMetadataReferences.cs b/test/Microsoft.ML.CodeAnalyzer.Tests/Helpers/AdditionalMetadataReferences.cs index 128f86939d..f14bb93c9d 100644 --- a/test/Microsoft.ML.CodeAnalyzer.Tests/Helpers/AdditionalMetadataReferences.cs +++ b/test/Microsoft.ML.CodeAnalyzer.Tests/Helpers/AdditionalMetadataReferences.cs @@ -6,7 +6,6 @@ using Microsoft.CodeAnalysis; using Microsoft.CodeAnalysis.CSharp; using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; namespace Microsoft.ML.CodeAnalyzer.Tests.Helpers { @@ -18,7 +17,6 @@ internal static class AdditionalMetadataReferences internal static readonly MetadataReference MSDataDataViewReference = RefFromType(); internal static readonly MetadataReference MLNetCoreReference = RefFromType(); internal static readonly MetadataReference MLNetDataReference = RefFromType(); - internal static readonly MetadataReference MLNetStaticPipeReference = RefFromType(); internal static MetadataReference RefFromType() => MetadataReference.CreateFromFile(typeof(TType).Assembly.Location); diff --git a/test/Microsoft.ML.CodeAnalyzer.Tests/Helpers/CSharpCodeFixVerifier`2.cs b/test/Microsoft.ML.CodeAnalyzer.Tests/Helpers/CSharpCodeFixVerifier`2.cs index 46e738d6c0..77d40747a5 100644 --- a/test/Microsoft.ML.CodeAnalyzer.Tests/Helpers/CSharpCodeFixVerifier`2.cs +++ b/test/Microsoft.ML.CodeAnalyzer.Tests/Helpers/CSharpCodeFixVerifier`2.cs @@ -65,7 +65,6 @@ public Test() TestState.AdditionalReferences.Add(AdditionalMetadataReferences.MSDataDataViewReference); 
TestState.AdditionalReferences.Add(AdditionalMetadataReferences.MLNetCoreReference); TestState.AdditionalReferences.Add(AdditionalMetadataReferences.MLNetDataReference); - TestState.AdditionalReferences.Add(AdditionalMetadataReferences.MLNetStaticPipeReference); SolutionTransforms.Add((solution, projectId) => { diff --git a/test/Microsoft.ML.CodeAnalyzer.Tests/Microsoft.ML.CodeAnalyzer.Tests.csproj b/test/Microsoft.ML.CodeAnalyzer.Tests/Microsoft.ML.CodeAnalyzer.Tests.csproj index 56e3b9a36e..d7a83aace0 100644 --- a/test/Microsoft.ML.CodeAnalyzer.Tests/Microsoft.ML.CodeAnalyzer.Tests.csproj +++ b/test/Microsoft.ML.CodeAnalyzer.Tests/Microsoft.ML.CodeAnalyzer.Tests.csproj @@ -30,9 +30,7 @@ - - diff --git a/test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeClassResource.cs b/test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeClassResource.cs deleted file mode 100644 index b2729a7073..0000000000 --- a/test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeClassResource.cs +++ /dev/null @@ -1,154 +0,0 @@ -using System; -using Microsoft.ML; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; - -namespace Bubba -{ - class Foo - { - public static void Bar() - { - IHostEnvironment env = null; - var text = TextLoaderStatic.CreateLoader(env, ctx => new - { - Label = ctx.LoadBool(0), - Text = ctx.LoadText(1), - NumericFeatures = ctx.LoadFloat(2, 5) - }); - - var est = text.MakeNewEstimator(); - // This should work. 
- est.Append(r => new { r.Text }); - - IDataView view = null; - view.AssertStatic(env, c => new Class1(c.I4.Scalar, c.Bool.Vector)); - view.AssertStatic(env, c => new Class2 { F1 = c.I4.Scalar, F2 = c.Bool.Vector }); - view.AssertStatic(env, c => new Class3 - { - F1 = new Class1(c.I4.Scalar, c.Bool.Vector), - F2 = new Class2 { F1 = c.I4.Scalar, F2 = c.Bool.Vector } - }); - view.AssertStatic(env, c => new Class4 { F1 = c.I4.Scalar }); - view.AssertStatic(env, c => null); - view.AssertStatic(env, c => new Class6(c.I4.Scalar, c.Bool.Vector)); - view.AssertStatic(env, c => new Class7 { F2 = c.Bool.Vector }); - view.AssertStatic(env, c => new Class8(c.I4.Scalar, c.Bool.Vector)); - view.AssertStatic(env, c => new Class9 { F1 = c.I4.Scalar, F2 = c.Bool.Vector }); - view.AssertStatic(env, c => new Class10(c.I4.Scalar, c.Bool.Vector)); - view.AssertStatic(env, c => new Class11(c.I4.Scalar, c.Bool.Vector, c.Bool.Vector)); - - // This is wrong but should not fail with our diagnostic since there is a deeper problem that the class - // simply is not there. - var text2 = TextLoaderStatic.CreateLoader(env, ctx => new MissingClass(ctx.LoadText(0))); - } - } - - class Class1 // This is good. - { - public Class1(Scalar f1, Vector f2) - { - F1 = f1; - F2 = f2; - } - - public Scalar F1 { get; } - public Vector F2 { get; } - } - - class Class2 // This is good. - { - public Scalar F1 { get; set; } - public Vector F2 { get; set; } - } - - class Class3<[IsShape] T> // This is good. - { - public Class1 F1 { get; set; } - public T F2 { get; set; } - } - - class Class4 // This is bad, since it has fields, not properties. - { - public Scalar F1; - } - - class Class5 // This is bad since its single constructor is not accessible. - { - protected Class5(Scalar f1, Vector f2) - { - F1 = f1; - F2 = f2; - } - - public Scalar F1 { get; } - public Vector F2 { get; } - } - - class Class6 // This is bad since it has two public constructors. 
- { - public Class6(Scalar f1, Vector f2) - { - F1 = f1; - F2 = f2; - } - - public Class6(Vector f2, Scalar f1) - : this(f1, f2) - { - } - - public Scalar F1 { get; } - public Vector F2 { get; } - } - - class Class7 // This is bad since it has only an implicit constructor, but only F2 has a set accessor. - { - public Scalar F1 { get; } - public Vector F2 { get; set; } - } - - class Class8 // This is bad since it has a constructor with explicit parameters, but also a set accessor on F2. - { - public Class8(Scalar f1, Vector f2) - { - F1 = f1; - F2 = f2; - } - - public Scalar F1 { get; } - public Vector F2 { get; set; } - } - - class Class9 // This is bad since F2's get accessor is not publicly accessible. - { - public Scalar F1 { get; set; } - public Vector F2 { private get; set; } - } - - class Class10 // This is bad since there are two Vector properties but only one present in the constructor. - { - public Class10(Scalar f1, Vector f2) - { - F1 = f1; - F3 = F2 = f2; - } - - public Scalar F1 { get; } - public Vector F2 { get; } - public Vector F3 { get; } - } - - class Class11 // This is bad since there is one Vector property but two present in the constructor. 
- { - public Class11(Scalar f1, Vector f2, Vector f3) - { - F1 = f1; - F2 = f2; - } - - public Scalar F1 { get; } - public Vector F2 { get; } - } -} \ No newline at end of file diff --git a/test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeResource.cs b/test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeResource.cs deleted file mode 100644 index a53edf9543..0000000000 --- a/test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeResource.cs +++ /dev/null @@ -1,46 +0,0 @@ -using System; -using Microsoft.ML; -using Microsoft.ML.CommandLine; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; - -namespace Bubba -{ - class Foo - { - public static void Bar() - { - IHostEnvironment env = null; - var text = TextLoaderStatic.CreateLoader(env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1), - numericFeatures: ctx.LoadFloat(2, 5))); - - var est = text.MakeNewEstimator(); - // This should work. - est.Append(r => r.text); - // These should not. - est.Append(r => 5); - est.Append(r => new { r.text, bad = 2 }); - // This should work. - est.Append(r => Tuple.Create(r.text, r.numericFeatures)); - // This should work. - est.Append(r => (a: r.text, b: r.label, c: (d: r.text, r.label))); - // This should not, and it should indicate a path to the problematic item. - est.Append(r => (a: r.text, b: r.label, c: (d: r.text, 5.2f))); - - // Check a different entrance into static land now, with one of the asserts. - var view = text.Load(null).AsDynamic; - // Despite the fact that the names are all wrong, this should still work - // from the point of view of this analyzer. - view.AssertStatic(env, c => ( - stay: c.KeyU4.TextValues.Scalar, - awhile: c.KeyU1.I4Values.Vector)); - // However, this should not. 
- view.AssertStatic(env, c => ( - and: c.KeyU4.TextValues.Scalar, - listen: 1l)); - } - } -} diff --git a/test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeResourceChained.cs b/test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeResourceChained.cs deleted file mode 100644 index 735583db5d..0000000000 --- a/test/Microsoft.ML.CodeAnalyzer.Tests/Resources/TypeIsSchemaShapeResourceChained.cs +++ /dev/null @@ -1,64 +0,0 @@ -using System; -using System.Collections.Generic; -using Microsoft.ML; -using Microsoft.ML.CommandLine; -using Microsoft.ML.Data; -using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; - -namespace Bubba -{ - class Foo - { - public static void Bar() - { - DataLoader Foo1(Func m) - { - IHostEnvironment env = null; - // We ought to fail here. - return TextLoaderStatic.CreateLoader(env, m); - } - - DataLoader Foo2<[IsShape] T>(Func m) - { - IHostEnvironment env = null; - // We ought not to fail here due to that [IsShape], but calls to this method might fail. - return TextLoaderStatic.CreateLoader(env, m); - } - - DataLoader Foo3(Func m) - where T : PipelineColumn - { - IHostEnvironment env = null; - // This should work. - return TextLoaderStatic.CreateLoader(env, m); - } - - DataLoader Foo4(Func m) - where T : IEnumerable - { - IHostEnvironment env = null; - // This should not work. - return TextLoaderStatic.CreateLoader(env, m); - } - - void Scratch() - { - // Neither of these two should fail here, though the method they're calling ought to fail. - var f1 = Foo1(ctx => ( - label: ctx.LoadBool(0), text: ctx.LoadText(1))); - var f2 = Foo1(ctx => ( - label: ctx.LoadBool(0), text: 5)); - - // The first should succeed, the second should fail. - var f3 = Foo2(ctx => ( - label: ctx.LoadBool(0), text: ctx.LoadText(1))); - var f4 = Foo2(ctx => ( - label: ctx.LoadBool(0), text: 6)); - - // This should succeed. 
- var f5 = Foo3(ctx => ctx.LoadBool(0)); - } - } - } -} diff --git a/test/Microsoft.ML.CodeAnalyzer.Tests/TypeIsSchemaShapeTest.cs b/test/Microsoft.ML.CodeAnalyzer.Tests/TypeIsSchemaShapeTest.cs deleted file mode 100644 index feb00f95ed..0000000000 --- a/test/Microsoft.ML.CodeAnalyzer.Tests/TypeIsSchemaShapeTest.cs +++ /dev/null @@ -1,90 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Threading.Tasks; -using Microsoft.CodeAnalysis; -using Microsoft.CodeAnalysis.Testing; -using Microsoft.ML.CodeAnalyzer.Tests.Helpers; -using Xunit; -using VerifyCS = Microsoft.ML.CodeAnalyzer.Tests.Helpers.CSharpCodeFixVerifier< - Microsoft.ML.Analyzer.TypeIsSchemaShapeAnalyzer, - Microsoft.CodeAnalysis.Testing.EmptyCodeFixProvider>; - -namespace Microsoft.ML.Analyzer.Tests -{ - public sealed class TypeIsSchemaShapeTest - { - private static string _srcResource; - internal static string Source => TestUtils.EnsureSourceLoaded(ref _srcResource, "TypeIsSchemaShapeResource.cs"); - - [Fact] - public async Task ReturnTypeIsSchemaShape() - { - var expected = new DiagnosticResult[] { - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeDiagnostic.Rule).WithLocation(24, 13).WithArguments(""), - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeDiagnostic.Rule).WithLocation(25, 13).WithArguments(" of item bad"), - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeDiagnostic.Rule).WithLocation(31, 13).WithArguments(" of item c.Item2"), - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeDiagnostic.Rule).WithLocation(41, 13).WithArguments(" of item listen"), - }; - - var test = new VerifyCS.Test { TestCode = Source }; - test.ExpectedDiagnostics.AddRange(expected); - test.Exclusions &= ~AnalysisExclusions.GeneratedCode; - await test.RunAsync(); - } - - private static string _srcResourceChained; - internal static 
string SourceChained => TestUtils.EnsureSourceLoaded( - ref _srcResourceChained, "TypeIsSchemaShapeResourceChained.cs"); - - [Fact] - public async Task ReturnTypeIsSchemaShapeChained() - { - // This is a somewhat more complex example, where instead of direct usage the user of the API is devising their own - // function where the shape type is a generic type parameter. In this case, we would ideally like the analysis to get - // chained out of their function. - var expected = new DiagnosticResult[] { - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeParameterDiagnostic.Rule).WithLocation(19, 24).WithArguments("T"), - new DiagnosticResult("CS8205", DiagnosticSeverity.Error).WithLocation(22, 52).WithMessage("Attributes are not allowed on local function parameters or type parameters"), - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeParameterDiagnostic.Rule).WithLocation(42, 24).WithArguments("T"), - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeDiagnostic.Rule).WithLocation(56, 26).WithArguments(" of item text"), - }; - - var test = new VerifyCS.Test { TestCode = SourceChained }; - test.ExpectedDiagnostics.AddRange(expected); - test.Exclusions &= ~AnalysisExclusions.GeneratedCode; - await test.RunAsync(); - } - - private static string _srcResourceClass; - internal static string SourceClass => TestUtils.EnsureSourceLoaded( - ref _srcResourceClass, "TypeIsSchemaShapeClassResource.cs"); - - [Fact] - public async Task ReturnTypeIsSchemaShapeClass() - { - // This is a somewhat more complex example, where instead of direct usage the user of the API is devising their own - // function where the shape type is a generic type parameter. In this case, we would ideally like the analysis to get - // chained out of their function. 
- var expected = new DiagnosticResult[] { - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeClassDiagnosticField.Rule).WithLocation(33, 13).WithArguments("Class4", "F1"), - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeClassDiagnosticConstructor.Rule).WithLocation(34, 13).WithArguments("Class5"), - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeClassDiagnosticConstructor.Rule).WithLocation(35, 13).WithArguments("Class6"), - - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeClassDiagnosticNoArgsSettable.Rule).WithLocation(36, 13).WithArguments("Class7", "F1"), - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeClassDiagnosticArgsSettable.Rule).WithLocation(37, 13).WithArguments("Class8", "F2"), - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeClassDiagnosticGettable.Rule).WithLocation(38, 13).WithArguments("Class9", "F2"), - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeClassDiagnosticCorrespondence.Rule).WithLocation(39, 13).WithArguments("Class10"), - VerifyCS.Diagnostic(TypeIsSchemaShapeAnalyzer.ShapeClassDiagnosticCorrespondence.Rule).WithLocation(40, 13).WithArguments("Class11"), - - new DiagnosticResult("CS0246", DiagnosticSeverity.Error).WithLocation(44, 71).WithMessage("The type or namespace name 'MissingClass' could not be found (are you missing a using directive or an assembly reference?)"), - }; - - var test = new VerifyCS.Test { TestCode = SourceClass }; - test.ExpectedDiagnostics.AddRange(expected); - test.Exclusions &= ~AnalysisExclusions.GeneratedCode; - await test.RunAsync(); - } - } -} diff --git a/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj b/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj index ef3f8517fa..9bb7d13241 100644 --- a/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj +++ b/test/Microsoft.ML.Functional.Tests/Microsoft.ML.Functional.Tests.csproj @@ -23,8 +23,6 @@ - - @@ -39,7 +37,7 @@ - + diff --git 
a/test/Microsoft.ML.OnnxTransformerTest/DnnImageFeaturizerTest.cs b/test/Microsoft.ML.OnnxTransformerTest/DnnImageFeaturizerTest.cs index 5cc1b2b524..791756f0d0 100644 --- a/test/Microsoft.ML.OnnxTransformerTest/DnnImageFeaturizerTest.cs +++ b/test/Microsoft.ML.OnnxTransformerTest/DnnImageFeaturizerTest.cs @@ -5,14 +5,10 @@ using System; using System.Collections.Generic; using System.IO; -using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Model; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.TestFramework.Attributes; -using Microsoft.ML.Transforms.Onnx; -using Microsoft.ML.Transforms.StaticPipe; using Xunit; using Xunit.Abstractions; @@ -89,7 +85,7 @@ void TestDnnImageFeaturizer() } [OnnxFact] - public void OnnxStatic() + public void OnnxFeaturizerWorkout() { var env = new MLContext(null); var imageHeight = 224; @@ -97,20 +93,19 @@ public void OnnxStatic() var dataFile = GetDataPath("images/images.tsv"); var imageFolder = Path.GetDirectoryName(dataFile); - var data = TextLoaderStatic.CreateLoader(env, ctx => ( - imagePath: ctx.LoadText(0), - name: ctx.LoadText(1))) - .Load(dataFile); + var data = ML.Data.LoadFromTextFile(dataFile, new[] { + new TextLoader.Column("imagePath", DataKind.String, 0), + new TextLoader.Column("name", DataKind.String, 1) + }); - var pipe = data.MakeNewEstimator() - .Append(row => ( - row.name, - data_0: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleave: true))) - .Append(row => (row.name, output_1: row.data_0.DnnImageFeaturizer(m => m.ModelSelector.ResNet18(m.Environment, m.OutputColumn, m.InputColumn)))); + var pipe = ML.Transforms.LoadImages("data_0", imageFolder, "imagePath") + .Append(ML.Transforms.ResizeImages("data_0", imageHeight, imageWidth)) + .Append(ML.Transforms.ExtractPixels("data_0", interleavePixelColors: true)) + .Append(ML.Transforms.DnnFeaturizeImage("output_1", m => m.ModelSelector.ResNet18(m.Environment, m.OutputColumn, 
m.InputColumn), "data_0")); - TestEstimatorCore(pipe.AsDynamic, data.AsDynamic); + TestEstimatorCore(pipe, data); - var result = pipe.Fit(data).Transform(data).AsDynamic; + var result = pipe.Fit(data).Transform(data); using (var cursor = result.GetRowCursor(result.Schema["output_1"])) { var buffer = default(VBuffer); diff --git a/test/Microsoft.ML.OnnxTransformerTest/Microsoft.ML.OnnxTransformerTest.csproj b/test/Microsoft.ML.OnnxTransformerTest/Microsoft.ML.OnnxTransformerTest.csproj index 6d11c627ee..24d91f113a 100644 --- a/test/Microsoft.ML.OnnxTransformerTest/Microsoft.ML.OnnxTransformerTest.csproj +++ b/test/Microsoft.ML.OnnxTransformerTest/Microsoft.ML.OnnxTransformerTest.csproj @@ -4,9 +4,7 @@ - - @@ -30,7 +28,7 @@ - + diff --git a/test/Microsoft.ML.OnnxTransformerTest/OnnxTransformTests.cs b/test/Microsoft.ML.OnnxTransformerTest/OnnxTransformTests.cs index 2be146218c..ec1da8fe33 100644 --- a/test/Microsoft.ML.OnnxTransformerTest/OnnxTransformTests.cs +++ b/test/Microsoft.ML.OnnxTransformerTest/OnnxTransformTests.cs @@ -7,16 +7,13 @@ using System.Drawing; using System.IO; using System.Linq; -using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Model; using Microsoft.ML.RunTests; using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; using Microsoft.ML.TestFramework.Attributes; using Microsoft.ML.Tools; using Microsoft.ML.Transforms.Image; -using Microsoft.ML.Transforms.StaticPipe; using Xunit; using Xunit.Abstractions; using Microsoft.ML.Transforms.Onnx; @@ -180,7 +177,7 @@ void TestOldSavingAndLoading(int? 
gpuDeviceId, bool fallbackToCpu) } [OnnxFact] - public void OnnxStatic() + public void OnnxWorkout() { var modelFile = Path.Combine(Directory.GetCurrentDirectory(), "squeezenet", "00000001", "model.onnx"); @@ -190,21 +187,19 @@ public void OnnxStatic() var dataFile = GetDataPath("images/images.tsv"); var imageFolder = Path.GetDirectoryName(dataFile); - var data = TextLoaderStatic.CreateLoader(env, ctx => ( - imagePath: ctx.LoadText(0), - name: ctx.LoadText(1))) - .Load(dataFile); - + var data = ML.Data.LoadFromTextFile(dataFile, new[] { + new TextLoader.Column("imagePath", DataKind.String, 0), + new TextLoader.Column("name", DataKind.String, 1) + }); // Note that CamelCase column names are there to match the TF graph node names. - var pipe = data.MakeNewEstimator() - .Append(row => ( - row.name, - data_0: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleave: true))) - .Append(row => (row.name, softmaxout_1: row.data_0.ApplyOnnxModel(modelFile))); + var pipe = ML.Transforms.LoadImages("data_0", imageFolder, "imagePath") + .Append(ML.Transforms.ResizeImages("data_0", imageHeight, imageWidth)) + .Append(ML.Transforms.ExtractPixels("data_0", interleavePixelColors: true)) + .Append(ML.Transforms.ApplyOnnxModel("softmaxout_1", "data_0", modelFile)); - TestEstimatorCore(pipe.AsDynamic, data.AsDynamic); + TestEstimatorCore(pipe, data); - var result = pipe.Fit(data).Transform(data).AsDynamic; + var result = pipe.Fit(data).Transform(data); var softmaxOutCol = result.Schema["softmaxout_1"]; using (var cursor = result.GetRowCursor(softmaxOutCol)) @@ -572,7 +567,7 @@ private class OnnxMapOutput /// /// Use - /// to test if ML.NET can manipulate properly. ONNXRuntime's C# API doesn't support map yet. + /// to test if ML.NET can manipulate properly. ONNXRuntime's C# API doesn't support map yet. 
/// [OnnxFact] public void SmokeInMemoryOnnxMapTypeTest() diff --git a/test/Microsoft.ML.StaticPipelineTesting/ImageAnalyticsTests.cs b/test/Microsoft.ML.StaticPipelineTesting/ImageAnalyticsTests.cs deleted file mode 100644 index bb9f1ba39f..0000000000 --- a/test/Microsoft.ML.StaticPipelineTesting/ImageAnalyticsTests.cs +++ /dev/null @@ -1,44 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML.Data; -using Microsoft.ML.StaticPipe; -using Xunit; -using Xunit.Abstractions; - -namespace Microsoft.ML.StaticPipelineTesting -{ - public sealed class ImageAnalyticsTests : BaseTestClassWithConsole - { - public ImageAnalyticsTests(ITestOutputHelper output) - : base(output) - { - } - - [Fact] - public void SimpleImageSmokeTest() - { - var env = new MLContext(0); - - var reader = TextLoaderStatic.CreateLoader(env, - ctx => ctx.LoadText(0).LoadAsImage().AsGrayscale().Resize(10, 8).ExtractPixels()); - - var schema = reader.AsDynamic.GetOutputSchema(); - Assert.True(schema.TryGetColumnIndex("Data", out int col), "Could not find 'Data' column"); - var type = schema[col].Type; - var vecType = type as VectorDataViewType; - Assert.True(vecType?.Size > 0, $"Type was supposed to be known size vector but was instead '{type}'"); - Assert.Equal(NumberDataViewType.Single, vecType.ItemType); - Assert.Equal(3, vecType.Dimensions.Length); - Assert.Equal(3, vecType.Dimensions[0]); - Assert.Equal(8, vecType.Dimensions[1]); - Assert.Equal(10, vecType.Dimensions[2]); - - var readAsImage = TextLoaderStatic.CreateLoader(env, - ctx => ctx.LoadText(0).LoadAsImage()); - var est = readAsImage.MakeNewEstimator().Append(r => r.AsGrayscale().Resize(10, 8).ExtractPixels()); - var pipe= readAsImage.Append(est); - } - } -} diff --git a/test/Microsoft.ML.StaticPipelineTesting/Microsoft.ML.StaticPipelineTesting.csproj 
b/test/Microsoft.ML.StaticPipelineTesting/Microsoft.ML.StaticPipelineTesting.csproj deleted file mode 100644 index f78940c900..0000000000 --- a/test/Microsoft.ML.StaticPipelineTesting/Microsoft.ML.StaticPipelineTesting.csproj +++ /dev/null @@ -1,31 +0,0 @@ - - - CORECLR - - - - - - - - - - - - - - - - false - Analyzer - - - - - - - - - - - diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeFakes.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeFakes.cs deleted file mode 100644 index 4e8306d0ba..0000000000 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeFakes.cs +++ /dev/null @@ -1,210 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.Linq; -using Microsoft.ML; -using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; - -// Holds some classes that superficially represent classes, at least sufficiently to give the idea of the -// statically typed columnar estimator helper API. As more "real" examples of the static functions get -// added, this file will gradully disappear. - -namespace FakeStaticPipes -{ - /// - /// This is a reconciler that doesn't really do anything, just a fake for testing the infrastructure. - /// - internal sealed class FakeTransformReconciler : EstimatorReconciler - { - private readonly string _name; - - public FakeTransformReconciler(string name) - { - _name = name; - } - - public override IEstimator Reconcile( - IHostEnvironment env, - PipelineColumn[] toOutput, - IReadOnlyDictionary inputNames, - IReadOnlyDictionary outputNames, - IReadOnlyCollection usedNames) - { - Console.WriteLine($"Constructing {_name} estimator!"); - - foreach (var col in toOutput) - { - if ((((IDeps)col).Deps?.Length ?? 
0) == 0) - Console.WriteLine($" Will make '{outputNames[col]}' from nothing"); - else - { - Console.WriteLine($" Will make '{outputNames[col]}' out of " + - string.Join(", ", ((IDeps)col).Deps.Select(d => $"'{inputNames[d]}'"))); - } - } - - return new FakeEstimator(); - } - - private sealed class FakeEstimator : IEstimator - { - public ITransformer Fit(IDataView input) => throw new NotImplementedException(); - public SchemaShape GetOutputSchema(SchemaShape inputSchema) => throw new NotImplementedException(); - } - - private interface IDeps { PipelineColumn[] Deps { get; } } - - private sealed class AScalar : Scalar, IDeps { public AScalar(Reconciler rec, PipelineColumn[] dependencies) : base(rec, dependencies) { Deps = dependencies; } public PipelineColumn[] Deps { get; } } - private sealed class AVector : Vector, IDeps { public AVector(Reconciler rec, PipelineColumn[] dependencies) : base(rec, dependencies) { Deps = dependencies; } public PipelineColumn[] Deps { get; } } - private sealed class AVarVector : VarVector, IDeps { public AVarVector(Reconciler rec, PipelineColumn[] dependencies) : base(rec, dependencies) { Deps = dependencies; } public PipelineColumn[] Deps { get; } } - private sealed class AKey : Key, IDeps { public AKey(Reconciler rec, PipelineColumn[] dependencies) : base(rec, dependencies) { Deps = dependencies; } public PipelineColumn[] Deps { get; } } - private sealed class AKey : Key, IDeps { public AKey(Reconciler rec, PipelineColumn[] dependencies) : base(rec, dependencies) { Deps = dependencies; } public PipelineColumn[] Deps { get; } } - private sealed class AVarKey : VarKey, IDeps { public AVarKey(Reconciler rec, PipelineColumn[] dependencies) : base(rec, dependencies) { Deps = dependencies; } public PipelineColumn[] Deps { get; } } - - public Scalar Scalar(params PipelineColumn[] dependencies) => new AScalar(this, dependencies); - public Vector Vector(params PipelineColumn[] dependencies) => new AVector(this, dependencies); - public 
VarVector VarVector(params PipelineColumn[] dependencies) => new AVarVector(this, dependencies); - public Key Key(params PipelineColumn[] dependencies) => new AKey(this, dependencies); - public Key Key(params PipelineColumn[] dependencies) => new AKey(this, dependencies); - public VarKey VarKey(params PipelineColumn[] dependencies) => new AVarKey(this, dependencies); - } - - public static class ConcatTransformExtensions - { - private static FakeTransformReconciler _rec = new FakeTransformReconciler("Concat"); - - public sealed class ScalarOrVector : ScalarOrVectorOrVarVector - { - private ScalarOrVector(PipelineColumn col) : base(col) { } - public static implicit operator ScalarOrVector(Scalar c) => new ScalarOrVector(c); - public static implicit operator ScalarOrVector(Vector c) => new ScalarOrVector(c); - } - - private interface IContainsColumn - { - PipelineColumn WrappedColumn { get; } - } - - - public class ScalarOrVectorOrVarVector : IContainsColumn - { - private readonly PipelineColumn _wrappedColumn; - PipelineColumn IContainsColumn.WrappedColumn => _wrappedColumn; - - private protected ScalarOrVectorOrVarVector(PipelineColumn col) - { - _wrappedColumn = col; - } - - public static implicit operator ScalarOrVectorOrVarVector(VarVector c) - => new ScalarOrVectorOrVarVector(c); - } - - private static PipelineColumn[] Helper(PipelineColumn first, IList> list) - { - PipelineColumn[] retval = new PipelineColumn[list.Count + 1]; - retval[0] = first; - for (int i = 0; i < list.Count; ++i) - retval[i + 1] = ((IContainsColumn)list[i]).WrappedColumn; - return retval; - } - - public static Vector ConcatWith(this Scalar me, params ScalarOrVector[] i) - => _rec.Vector(Helper(me, i)); - public static Vector ConcatWith(this Vector me, params ScalarOrVector[] i) - => _rec.Vector(Helper(me, i)); - - public static VarVector ConcatWith(this Scalar me, params ScalarOrVectorOrVarVector[] i) - => _rec.VarVector(Helper(me, i)); - public static VarVector ConcatWith(this Vector me, 
params ScalarOrVectorOrVarVector[] i) - => _rec.VarVector(Helper(me, i)); - public static VarVector ConcatWith(this VarVector me, params ScalarOrVectorOrVarVector[] i) - => _rec.VarVector(Helper(me, i)); - } - - public static class NormalizeTransformExtensions - { - private static FakeTransformReconciler _rec = new FakeTransformReconciler("Normalize"); - - public static Vector Normalize(this Vector me) - => _rec.Vector(me); - - public static Vector Normalize(this Vector me) - => _rec.Vector(me); - } - - public static class WordTokenizeTransformExtensions - { - private static FakeTransformReconciler _rec = new FakeTransformReconciler("WordTokenize"); - - public static VarVector Tokenize(this Scalar me) - => _rec.VarVector(me); - public static VarVector Tokenize(this Vector me) - => _rec.VarVector(me); - public static VarVector Tokenize(this VarVector me) - => _rec.VarVector(me); - } - - public static class TermTransformExtensions - { - private static FakeTransformReconciler _rec = new FakeTransformReconciler("Term"); - - public static Key Dictionarize(this Scalar me) - => _rec.Key(me); - public static Vector> Dictionarize(this Vector me) - => _rec.Vector>(me); - public static VarVector> Dictionarize(this VarVector me) - => _rec.VarVector>(me); - } - - - public static class TextTransformExtensions - { - private static FakeTransformReconciler _rec = new FakeTransformReconciler("FeaturizeTextEstimator"); - - /// - /// Performs text featurization on the input text. This will tokenize, do n-gram featurization, - /// dictionary based term mapping, and finally produce a word-bag vector for the output. - /// - /// The text to featurize - /// - public static Vector TextFeaturizer(this Scalar me, bool keepDiacritics = true) - => _rec.Vector(me); - } - - public static class TrainerTransformExtensions - { - private static FakeTransformReconciler _rec = new FakeTransformReconciler("LinearBinaryClassification"); - - /// - /// Trains a linear predictor using logistic regression. 
- /// - /// The target label for this binary classification task - /// The features to train on. Should be normalized. - /// A tuple of columns representing the score, the calibrated score as a probability, and the boolean predicted label - public static (Scalar score, Scalar probability, Scalar predictedLabel) TrainLinearClassification(this Scalar label, Vector features) - => (_rec.Scalar(label, features), _rec.Scalar(label, features), _rec.Scalar(label, features)); - } - - public static class HashTransformExtensions - { - private static FakeTransformReconciler _rec = new FakeTransformReconciler("Hash"); - - public static Key Hash(this Scalar me) - => _rec.Key(me); - public static Key Hash(this Scalar me, int maximumNumberOfInvertsTokens) - => _rec.Key(me); - public static Vector> Hash(this Vector me) - => _rec.Vector>(me); - public static Vector> Hash(this Vector me, int maximumNumberOfInvertsTokens) - => _rec.Vector>(me); - public static VarVector> Hash(this VarVector me) - => _rec.VarVector>(me); - public static VarVector> Hash(this VarVector me, int maximumNumberOfInvertsTokens) - => _rec.VarVector>(me); - } -} diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs deleted file mode 100644 index d4fe311083..0000000000 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ /dev/null @@ -1,889 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System; -using System.Collections.Generic; -using System.Collections.Immutable; -using System.IO; -using System.Linq; -using System.Text; -using Microsoft.ML.Data; -using Microsoft.ML.Data.IO; -using Microsoft.ML.Mkl.Components.StaticPipe; -using Microsoft.ML.Internal.Utilities; -using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.TestFramework; -using Microsoft.ML.Transforms.Text; -using Xunit; -using Xunit.Abstractions; -using static Microsoft.ML.Transforms.Text.LatentDirichletAllocationTransformer; - -namespace Microsoft.ML.StaticPipelineTesting -{ - public abstract class BaseTestClassWithConsole : BaseTestClass, IDisposable - { - private readonly TextWriter _originalOut; - private readonly TextWriter _textWriter; - - public BaseTestClassWithConsole(ITestOutputHelper output) - : base(output) - { - _originalOut = Console.Out; - _textWriter = new StringWriter(); - Console.SetOut(_textWriter); - } - - public void Dispose() - { - Output.WriteLine(_textWriter.ToString()); - Console.SetOut(_originalOut); - } - } - - public sealed class StaticPipeTests : BaseTestClassWithConsole - { - public StaticPipeTests(ITestOutputHelper output) - : base(output) - { - } - - [Fact] - public void SimpleTextLoaderCopyColumnsTest() - { - var env = new MLContext(0); - - const string data = "0 hello 3.14159 -0 2\n" - + "1 1 2 4 15"; - var dataSource = new BytesStreamSource(data); - - var text = TextLoaderStatic.CreateLoader(env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1), - numericFeatures: ctx.LoadFloat(2, null)), // If fit correctly, this ought to be equivalent to max of 4, that is, length of 3. - dataSource, separator: ' '); - - // While we have a type-safe wrapper for `IDataView` it is utterly useless except as an input to the `Fit` functions - // of the other statically typed wrappers. We perhaps ought to make it useful in its own right, but perhaps not now. - // For now, just operate over the actual `IDataView`. 
- var textData = text.Load(dataSource).AsDynamic; - - Action CheckSchemaHasColumn = (dataSchema, name) => - { - Assert.True(dataSchema.GetColumnOrNull(name).HasValue, "Could not find column '" + name + "'"); - }; - - var schema = textData.Schema; - // First verify that the columns are there. There ought to be at least one column corresponding to the identifiers in the tuple. - CheckSchemaHasColumn(schema, "label"); - CheckSchemaHasColumn(schema, "text"); - CheckSchemaHasColumn(schema, "numericFeatures"); - // Next verify they have the expected types. - Assert.Equal(BooleanDataViewType.Instance, schema["label"].Type); - Assert.Equal(TextDataViewType.Instance, schema["text"].Type); - Assert.Equal(new VectorDataViewType(NumberDataViewType.Single, 3), schema["numericFeatures"].Type); - // Next actually inspect the data. - using (var cursor = textData.GetRowCursorForAllColumns()) - { - var textGetter = cursor.GetGetter>(schema["text"]); - var numericFeaturesGetter = cursor.GetGetter>(schema["numericFeatures"]); - ReadOnlyMemory textVal = default; - var labelGetter = cursor.GetGetter(schema["label"]); - bool labelVal = default; - VBuffer numVal = default; - - void CheckValuesSame(bool bl, string tx, float v0, float v1, float v2) - { - labelGetter(ref labelVal); - textGetter(ref textVal); - numericFeaturesGetter(ref numVal); - Assert.True(tx.AsSpan().SequenceEqual(textVal.Span)); - Assert.Equal((bool)bl, labelVal); - Assert.Equal(3, numVal.Length); - Assert.Equal(v0, numVal.GetItemOrDefault(0)); - Assert.Equal(v1, numVal.GetItemOrDefault(1)); - Assert.Equal(v2, numVal.GetItemOrDefault(2)); - } - - Assert.True(cursor.MoveNext(), "Could not move even to first row"); - CheckValuesSame(false, "hello", 3.14159f, -0f, 2f); - Assert.True(cursor.MoveNext(), "Could not move to second row"); - CheckValuesSame(true, "1", 2f, 4f, 15f); - Assert.False(cursor.MoveNext(), "Moved to third row, but there should have been only two"); - } - - // The next step where we shuffle the names 
around a little bit is one where we are - // testing out the implicit usage of copy columns. - - var est = text.MakeNewEstimator().Append(r => (text: r.label, label: r.numericFeatures)); - var newText = text.Append(est); - var newTextData = newText.Fit(dataSource).Load(dataSource); - - schema = newTextData.AsDynamic.Schema; - // First verify that the columns are there. There ought to be at least one column corresponding to the identifiers in the tuple. - CheckSchemaHasColumn(schema, "label"); - CheckSchemaHasColumn(schema, "text"); - // Next verify they have the expected types. - Assert.Equal(BooleanDataViewType.Instance, schema["text"].Type); - Assert.Equal(new VectorDataViewType(NumberDataViewType.Single, 3), schema["label"].Type); - } - - private sealed class Obnoxious1 - { - public Scalar Foo { get; } - public Vector Bar { get; } - - public Obnoxious1(Scalar f1, Vector f2) - { - Foo = f1; - Bar = f2; - } - } - - private sealed class Obnoxious2 - { - public Scalar Biz { get; set; } - public Vector Blam { get; set; } - } - - private sealed class Obnoxious3 - { - public (Scalar hi, Obnoxious1 my, T friend) Donut { get; set; } - } - - private static Obnoxious3 MakeObnoxious3(Scalar hi, Obnoxious1 my, T friend) - => new Obnoxious3() { Donut = (hi, my, friend) }; - - [Fact] - public void SimpleTextLoaderObnoxiousTypeTest() - { - var env = new MLContext(0); - - const string data = "0 hello 3.14159 -0 2\n" - + "1 1 2 4 15"; - var dataSource = new BytesStreamSource(data); - - // Ahhh. No one would ever, ever do this, of course, but just having fun with it. 
- - void Helper(DataViewSchema thisSchema, string name, DataViewType expected) - { - Assert.True(thisSchema.TryGetColumnIndex(name, out int thisCol), $"Could not find column '{name}'"); - Assert.Equal(expected, thisSchema[thisCol].Type); - } - - var text = TextLoaderStatic.CreateLoader(env, ctx => ( - yo: new Obnoxious1(ctx.LoadText(0), ctx.LoadFloat(1, 5)), - dawg: new Obnoxious2() { Biz = ctx.LoadText(2), Blam = ctx.LoadDouble(1, 2) }, - how: MakeObnoxious3(ctx.LoadBool(2), new Obnoxious1(ctx.LoadText(0), ctx.LoadFloat(1, 4)), - new Obnoxious2() { Biz = ctx.LoadText(5), Blam = ctx.LoadDouble(1, 10) }))); - - var schema = text.AsDynamic.GetOutputSchema(); - Helper(schema, "yo.Foo", TextDataViewType.Instance); - Helper(schema, "yo.Bar", new VectorDataViewType(NumberDataViewType.Single, 5)); - Helper(schema, "dawg.Biz", TextDataViewType.Instance); - Helper(schema, "dawg.Blam", new VectorDataViewType(NumberDataViewType.Double, 2)); - - Helper(schema, "how.Donut.hi", BooleanDataViewType.Instance); - Helper(schema, "how.Donut.my.Foo", TextDataViewType.Instance); - Helper(schema, "how.Donut.my.Bar", new VectorDataViewType(NumberDataViewType.Single, 4)); - Helper(schema, "how.Donut.friend.Biz", TextDataViewType.Instance); - Helper(schema, "how.Donut.friend.Blam", new VectorDataViewType(NumberDataViewType.Double, 10)); - - var textData = text.Load(null); - - var est = text.MakeNewEstimator().Append(r => r.how.Donut.friend.Blam.ConcatWith(r.dawg.Blam)); - var outData = est.Fit(textData).Transform(textData); - - var xfSchema = outData.AsDynamic.Schema; - Helper(xfSchema, "Data", new VectorDataViewType(NumberDataViewType.Double, 12)); - } - - private static KeyValuePair P(string name, DataViewType type) - => new KeyValuePair(name, type); - - [Fact] - public void AssertStaticSimple() - { - var env = new MLContext(0); - var schemaBuilder = new DataViewSchema.Builder(); - schemaBuilder.AddColumn("hello", TextDataViewType.Instance); - schemaBuilder.AddColumn("my", new 
VectorDataViewType(NumberDataViewType.Int64, 5)); - schemaBuilder.AddColumn("friend", new KeyDataViewType(typeof(uint), 3)); - var view = new EmptyDataView(env, schemaBuilder.ToSchema()); - - view.AssertStatic(env, c => new - { - my = c.I8.Vector, - friend = c.KeyU4.NoValue.Scalar, - hello = c.Text.Scalar - }); - - view.AssertStatic(env, c => ( - my: c.I8.Vector, - friend: c.KeyU4.NoValue.Scalar, - hello: c.Text.Scalar - )); - } - - [Fact] - public void AssertStaticSimpleFailure() - { - var env = new MLContext(0); - var schemaBuilder = new DataViewSchema.Builder(); - schemaBuilder.AddColumn("hello", TextDataViewType.Instance); - schemaBuilder.AddColumn("my", new VectorDataViewType(NumberDataViewType.Int64, 5)); - schemaBuilder.AddColumn("friend", new KeyDataViewType(typeof(uint), 3)); - - var view = new EmptyDataView(env, schemaBuilder.ToSchema()); - - Assert.ThrowsAny(() => - view.AssertStatic(env, c => new - { - my = c.I8.Scalar, // Shouldn't work, the type is wrong. - friend = c.KeyU4.NoValue.Scalar, - hello = c.Text.Scalar - })); - - Assert.ThrowsAny(() => - view.AssertStatic(env, c => ( - mie: c.I8.Vector, // Shouldn't work, the name is wrong. - friend: c.KeyU4.NoValue.Scalar, - hello: c.Text.Scalar))); - } - - [Fact] - public void AssertStaticKeys() - { - var env = new MLContext(0); - - // We'll test a few things here. First, the case where the key-value metadata is text. - var metaValues1 = new VBuffer>(3, new[] { "a".AsMemory(), "b".AsMemory(), "c".AsMemory() }); - var metaBuilder = new DataViewSchema.Annotations.Builder(); - metaBuilder.AddKeyValues>(3, TextDataViewType.Instance, metaValues1.CopyTo); - - var builder = new DataViewSchema.Annotations.Builder(); - builder.AddPrimitiveValue("stay", new KeyDataViewType(typeof(uint), 3), 2u, metaBuilder.ToAnnotations()); - - // Next the case where those values are ints. 
- var metaValues2 = new VBuffer(3, new int[] { 1, 2, 3, 4 }); - metaBuilder = new DataViewSchema.Annotations.Builder(); - metaBuilder.AddKeyValues(3, NumberDataViewType.Int32, metaValues2.CopyTo); - var value2 = new VBuffer(2, 0, null, null); - builder.Add>("awhile", new VectorDataViewType(new KeyDataViewType(typeof(byte), 3), 2), value2.CopyTo, metaBuilder.ToAnnotations()); - - // Then the case where a value of that kind exists, but is of not of the right kind, in which case it should not be identified as containing that metadata. - metaBuilder = new DataViewSchema.Annotations.Builder(); - metaBuilder.AddPrimitiveValue(AnnotationUtils.Kinds.KeyValues, NumberDataViewType.Single, 2f); - builder.AddPrimitiveValue("and", new KeyDataViewType(typeof(ushort), 2), (ushort)1, metaBuilder.ToAnnotations()); - - // Then a final case where metadata of that kind is actaully simply altogether absent. - var value4 = new VBuffer(5, 0, null, null); - builder.Add>("listen", new VectorDataViewType(new KeyDataViewType(typeof(uint), 2)), value4.CopyTo); - - // Finally compose a trivial data view out of all this. - var view = RowCursorUtils.RowAsDataView(env, AnnotationUtils.AnnotationsAsRow(builder.ToAnnotations())); - - // Whew! I'm glad that's over with. Let us start running the test in ernest. - // First let's do a direct match of the types to ensure that works. - view.AssertStatic(env, c => ( - stay: c.KeyU4.TextValues.Scalar, - awhile: c.KeyU1.I4Values.Vector, - and: c.KeyU2.NoValue.Scalar, - listen: c.KeyU4.NoValue.VarVector)); - - // Next let's match against the superclasses (where no value types are - // asserted), to ensure that the less specific case still passes. - view.AssertStatic(env, c => ( - stay: c.KeyU4.NoValue.Scalar, - awhile: c.KeyU1.NoValue.Vector, - and: c.KeyU2.NoValue.Scalar, - listen: c.KeyU4.NoValue.VarVector)); - - // Here we assert a subset. - view.AssertStatic(env, c => ( - stay: c.KeyU4.TextValues.Scalar, - awhile: c.KeyU1.I4Values.Vector)); - - // OK. 
Now we've confirmed the basic stuff works, let's check other scenarios. - // Due to the fact that we cannot yet assert only a *single* column, these always appear - // in at least pairs. - - // First try to get the right type of exception to test against. - Type e = null; - try - { - view.AssertStatic(env, c => ( - stay: c.KeyU4.TextValues.Scalar, - awhile: c.KeyU2.I4Values.Vector)); - } - catch (Exception eCaught) - { - e = eCaught.GetType(); - } - Assert.NotNull(e); - - // What if the key representation type is wrong? - Assert.Throws(e, () => - view.AssertStatic(env, c => ( - stay: c.KeyU4.TextValues.Scalar, - awhile: c.KeyU2.I4Values.Vector))); - - // What if the key value type is wrong? - Assert.Throws(e, () => - view.AssertStatic(env, c => ( - stay: c.KeyU4.TextValues.Scalar, - awhile: c.KeyU1.I2Values.Vector))); - - // Same two tests, but for scalar? - Assert.Throws(e, () => - view.AssertStatic(env, c => ( - stay: c.KeyU2.TextValues.Scalar, - awhile: c.KeyU1.I2Values.Vector))); - - Assert.Throws(e, () => - view.AssertStatic(env, c => ( - stay: c.KeyU4.BoolValues.Scalar, - awhile: c.KeyU1.I2Values.Vector))); - - // How about if we misidentify the vectorness? - Assert.Throws(e, () => - view.AssertStatic(env, c => ( - stay: c.KeyU4.TextValues.Vector, - awhile: c.KeyU1.I2Values.Vector))); - - // How about the names? 
- Assert.Throws(e, () => - view.AssertStatic(env, c => ( - stay: c.KeyU4.TextValues.Scalar, - alot: c.KeyU1.I4Values.Vector))); - } - - [Fact] - public void Normalizer() - { - var env = new MLContext(0); - var dataPath = GetDataPath("generated_regression_dataset.csv"); - var dataSource = new MultiFileSource(dataPath); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true); - var data = reader.Load(dataSource); - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, r.features, bin: r.features.NormalizeByBinning(), mm: r.features.Normalize())); - var tdata = est.Fit(data).Transform(data); - - var schema = tdata.AsDynamic.Schema; - Assert.True(schema.TryGetColumnIndex("features", out int featCol)); - Assert.True(schema.TryGetColumnIndex("bin", out int binCol)); - Assert.True(schema.TryGetColumnIndex("mm", out int mmCol)); - Assert.False(schema[featCol].IsNormalized()); - Assert.True(schema[binCol].IsNormalized()); - Assert.True(schema[mmCol].IsNormalized()); - } - - [Fact] - public void NormalizerWithOnFit() - { - var ml = new MLContext(0); - var dataPath = GetDataPath("generated_regression_dataset.csv"); - var dataSource = new MultiFileSource(dataPath); - - var reader = TextLoaderStatic.CreateLoader(ml, - c => c.LoadFloat(0, 2), - separator: ';', hasHeader: true); - var data = reader.Load(dataSource); - - // These will be populated once we call fit. 
- ImmutableArray mm; - ImmutableArray ss; - ImmutableArray> bb; - - var est = reader.MakeNewEstimator() - .Append(r => (r, - ncdf: r.NormalizeByCumulativeDistribution(onFit: (m, s) => mm = m), - n: r.NormalizeMeanVariance(onFit: (s, o) => { ss = s; Assert.Empty(o); }), - b: r.NormalizeByBinning(onFit: b => bb = b))); - var tdata = est.Fit(data).Transform(data); - - Assert.Equal(3, mm.Length); - Assert.Equal(3, ss.Length); - Assert.Equal(3, bb.Length); - - // Just for fun, let's also write out some of the lines of the data to the console. - using (var stream = new MemoryStream()) - { - IDataView v = ml.Transforms.SelectColumns("r", "ncdf", "n", "b").Fit(tdata.AsDynamic).Transform(tdata.AsDynamic); - v = ml.Data.TakeRows(v, 10); - var saver = new TextSaver(ml, new TextSaver.Arguments() - { - Dense = true, - Separator = ",", - OutputHeader = false - }); - saver.SaveData(stream, v, Utils.GetIdentityPermutation(v.Schema.Count)); - Console.WriteLine(Encoding.UTF8.GetString(stream.ToArray())); - } - } - - [Fact] - public void ToKey() - { - var env = new MLContext(0); - var dataPath = GetDataPath("iris.data"); - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadText(4), values: c.LoadFloat(0, 3)), - separator: ','); - var dataSource = new MultiFileSource(dataPath); - var data = reader.Load(dataSource); - - var est = data.MakeNewEstimator() - .Append(r => (labelKey: r.label.ToKey(), valuesKey: r.values.ToKey(onFit: m => { }))) - .Append(r => (r.labelKey, r.valuesKey, valuesKeyKey: r.valuesKey.ToKey())); - - var tdata = est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - Assert.True(schema.TryGetColumnIndex("labelKey", out int labelCol)); - Assert.True(schema.TryGetColumnIndex("valuesKey", out int valuesCol)); - Assert.True(schema.TryGetColumnIndex("valuesKeyKey", out int valuesKeyCol)); - - Assert.Equal((ulong)3, (schema[labelCol].Type as KeyDataViewType)?.Count); - Assert.True(schema[valuesCol].Type is VectorDataViewType valuesVecType 
&& valuesVecType.ItemType is KeyDataViewType); - Assert.True(schema[valuesKeyCol].Type is VectorDataViewType valuesKeyVecType && valuesKeyVecType.ItemType is KeyDataViewType); - - var labelKeyType = schema[labelCol].Annotations.Schema.GetColumnOrNull(AnnotationUtils.Kinds.KeyValues)?.Type; - var valuesKeyType = schema[valuesCol].Annotations.Schema.GetColumnOrNull(AnnotationUtils.Kinds.KeyValues)?.Type; - var valuesKeyKeyType = schema[valuesKeyCol].Annotations.Schema.GetColumnOrNull(AnnotationUtils.Kinds.KeyValues)?.Type; - Assert.NotNull(labelKeyType); - Assert.NotNull(valuesKeyType); - Assert.NotNull(valuesKeyKeyType); - Assert.True(labelKeyType is VectorDataViewType labelVecType && labelVecType.ItemType == TextDataViewType.Instance); - Assert.True(valuesKeyType is VectorDataViewType valuesVecType2 && valuesVecType2.ItemType == NumberDataViewType.Single); - Assert.True(valuesKeyKeyType is VectorDataViewType valuesKeyVecType2 && valuesKeyVecType2.ItemType == NumberDataViewType.Single); - // Because they're over exactly the same data, they ought to have the same cardinality and everything. 
- Assert.True(valuesKeyKeyType.Equals(valuesKeyType)); - } - - [Fact] - public void ConcatWith() - { - var env = new MLContext(0); - var dataPath = GetDataPath("iris.data"); - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadText(4), values: c.LoadFloat(0, 3), value: c.LoadFloat(2)), - separator: ','); - var dataSource = new MultiFileSource(dataPath); - var data = reader.Load(dataSource); - - var est = data.MakeNewEstimator() - .Append(r => ( - r.label, r.values, r.value, - c0: r.label.AsVector(), c1: r.label.ConcatWith(r.label), - c2: r.value.ConcatWith(r.values), c3: r.values.ConcatWith(r.value, r.values))); - - var tdata = est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - - int[] idx = new int[4]; - for (int i = 0; i < idx.Length; ++i) - Assert.True(schema.TryGetColumnIndex("c" + i, out idx[i]), $"Could not find col c{i}"); - var types = new VectorDataViewType[idx.Length]; - int[] expectedLen = new int[] { 1, 2, 5, 9 }; - for (int i = 0; i < idx.Length; ++i) - { - var type = schema[idx[i]].Type; - types[i] = type as VectorDataViewType; - Assert.True(types[i]?.Size > 0, $"Col c{i} had unexpected type {type}"); - Assert.Equal(expectedLen[i], types[i].Size); - } - Assert.Equal(TextDataViewType.Instance, types[0].ItemType); - Assert.Equal(TextDataViewType.Instance, types[1].ItemType); - Assert.Equal(NumberDataViewType.Single, types[2].ItemType); - Assert.Equal(NumberDataViewType.Single, types[3].ItemType); - } - - [Fact] - public void Tokenize() - { - var env = new MLContext(0); - var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var reader = TextLoaderStatic.CreateLoader(env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true); - var dataSource = new MultiFileSource(dataPath); - var data = reader.Load(dataSource); - - var est = data.MakeNewEstimator() - .Append(r => ( - r.label, - tokens: r.text.TokenizeIntoWords(), - chars: r.text.TokenizeIntoCharactersAsKeys())); - - var tdata = 
est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - - var type = schema["tokens"].Type; - Assert.True(type is VectorDataViewType vecType && vecType.Size == 0 && vecType.ItemType == TextDataViewType.Instance); - type = schema["chars"].Type; - Assert.True(type is VectorDataViewType vecType2 && vecType2.Size == 0 && vecType2.ItemType is KeyDataViewType - && vecType2.ItemType.RawType == typeof(ushort)); - } - - [Fact] - public void NormalizeTextAndRemoveStopWords() - { - var env = new MLContext(0); - var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var reader = TextLoaderStatic.CreateLoader(env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true); - var dataSource = new MultiFileSource(dataPath); - var data = reader.Load(dataSource); - - var est = data.MakeNewEstimator() - .Append(r => ( - r.label, - normalized_text: r.text.NormalizeText(), - words_without_stopwords: r.text.TokenizeIntoWords().RemoveDefaultStopWords())); - - var tdata = est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - - Assert.True(schema.TryGetColumnIndex("words_without_stopwords", out int stopwordsCol)); - var type = schema[stopwordsCol].Type; - Assert.True(type is VectorDataViewType vecType && vecType.Size == 0 && vecType.ItemType == TextDataViewType.Instance); - - Assert.True(schema.TryGetColumnIndex("normalized_text", out int normTextCol)); - type = schema[normTextCol].Type; - Assert.Equal(TextDataViewType.Instance, type); - } - - [Fact] - public void ConvertToWordBag() - { - var env = new MLContext(0); - var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var reader = TextLoaderStatic.CreateLoader(env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true); - var dataSource = new MultiFileSource(dataPath); - var data = reader.Load(dataSource); - - var est = data.MakeNewEstimator() - .Append(r => ( - r.label, - bagofword: r.text.ProduceWordBags(), - bagofhashedword: 
r.text.ProduceHashedWordBags())); - - var tdata = est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - - Assert.True(schema.TryGetColumnIndex("bagofword", out int bagofwordCol)); - var type = schema[bagofwordCol].Type; - Assert.True(type is VectorDataViewType vecType && vecType.Size > 0&& vecType.ItemType is NumberDataViewType); - - Assert.True(schema.TryGetColumnIndex("bagofhashedword", out int bagofhashedwordCol)); - type = schema[bagofhashedwordCol].Type; - Assert.True(type is VectorDataViewType vecType2 && vecType2.Size > 0 && vecType2.ItemType is NumberDataViewType); - } - - [Fact] - public void Ngrams() - { - var env = new MLContext(0); - var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var reader = TextLoaderStatic.CreateLoader(env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true); - var dataSource = new MultiFileSource(dataPath); - var data = reader.Load(dataSource); - - var est = data.MakeNewEstimator() - .Append(r => ( - r.label, - ngrams: r.text.TokenizeIntoWords().ToKey().ProduceNgrams(), - ngramshash: r.text.TokenizeIntoWords().ToKey().ProduceHashedNgrams())); - - var tdata = est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - - Assert.True(schema.TryGetColumnIndex("ngrams", out int ngramsCol)); - var type = schema[ngramsCol].Type; - Assert.True(type is VectorDataViewType vecType && vecType.Size > 0 && vecType.ItemType is NumberDataViewType); - - Assert.True(schema.TryGetColumnIndex("ngramshash", out int ngramshashCol)); - type = schema[ngramshashCol].Type; - Assert.True(type is VectorDataViewType vecType2 && vecType2.Size > 0 && vecType2.ItemType is NumberDataViewType); - } - - - [Fact] - public void LpGcNormAndWhitening() - { - var env = new MLContext(0); - var dataPath = GetDataPath("generated_regression_dataset.csv"); - var dataSource = new MultiFileSource(dataPath); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: 
c.LoadFloat(0, 10)), - separator: ';', hasHeader: true); - var data = reader.Load(dataSource); - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, - lpnorm: r.features.NormalizeLpNorm(), - gcnorm: r.features.NormalizeGlobalContrast(), - zcawhitened: r.features.ZcaWhitening(), - pcswhitened: r.features.PcaWhitening())); - var tdata = est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - - Assert.True(schema.TryGetColumnIndex("lpnorm", out int lpnormCol)); - var type = schema[lpnormCol].Type; - Assert.True(type is VectorDataViewType vecType && vecType.Size > 0 && vecType.ItemType is NumberDataViewType); - - Assert.True(schema.TryGetColumnIndex("gcnorm", out int gcnormCol)); - type = schema[gcnormCol].Type; - Assert.True(type is VectorDataViewType vecType2 && vecType2.Size > 0 && vecType2.ItemType is NumberDataViewType); - - Assert.True(schema.TryGetColumnIndex("zcawhitened", out int zcawhitenedCol)); - type = schema[zcawhitenedCol].Type; - Assert.True(type is VectorDataViewType vecType3 && vecType3.Size > 0 && vecType3.ItemType is NumberDataViewType); - - Assert.True(schema.TryGetColumnIndex("pcswhitened", out int pcswhitenedCol)); - type = schema[pcswhitenedCol].Type; - Assert.True(type is VectorDataViewType vecType4 && vecType4.Size > 0 && vecType4.ItemType is NumberDataViewType); - } - - [Fact] - public void LdaTopicModel() - { - var env = new MLContext(0); - var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var reader = TextLoaderStatic.CreateLoader(env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true); - var dataSource = new MultiFileSource(dataPath); - var data = reader.Load(dataSource); - - // This will be populated once we call fit. 
- ModelParameters ldaSummary; - - var est = data.MakeNewEstimator() - .Append(r => ( - r.label, - topics: r.text.ProduceWordBags().LatentDirichletAllocation(numberOfTopics: 3, numberOfSummaryTermsPerTopic:5, alphaSum: 10, onFit: m => ldaSummary = m.LdaTopicSummary))); - - var transformer = est.Fit(data); - var tdata = transformer.Transform(data); - - var schema = tdata.AsDynamic.Schema; - Assert.True(schema.TryGetColumnIndex("topics", out int topicsCol)); - var type = schema[topicsCol].Type; - Assert.True(type is VectorDataViewType vecType && vecType.Size > 0 && vecType.ItemType is NumberDataViewType); - } - - [Fact] - public void FeatureSelection() - { - var env = new MLContext(0); - var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var reader = TextLoaderStatic.CreateLoader(env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true); - var dataSource = new MultiFileSource(dataPath); - var data = reader.Load(dataSource); - - var est = data.MakeNewEstimator() - .Append(r => ( - r.label, - bag_of_words_count: r.text.ProduceWordBags().SelectFeaturesBasedOnCount(10), - bag_of_words_mi: r.text.ProduceWordBags().SelectFeaturesBasedOnMutualInformation(r.label))); - - var tdata = est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - - Assert.True(schema.TryGetColumnIndex("bag_of_words_count", out int bagofwordCountCol)); - var type = schema[bagofwordCountCol].Type; - Assert.True(type is VectorDataViewType vecType && vecType.Size > 0 && vecType.ItemType is NumberDataViewType); - - Assert.True(schema.TryGetColumnIndex("bag_of_words_mi", out int bagofwordMiCol)); - type = schema[bagofwordMiCol].Type; - Assert.True(type is VectorDataViewType vecType2 && vecType2.Size > 0 && vecType2.ItemType is NumberDataViewType); - } - - [Fact] - public void TrainTestSplit() - { - var env = new MLContext(0); - var dataPath = GetDataPath(TestDatasets.iris.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var reader 
= TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(0), features: c.LoadFloat(1, 4))); - var data = reader.Load(dataSource); - - var (train, test) = env.Data.TrainTestSplit(data, 0.5); - - // Just make sure that the train is about the same size as the test set. - var trainCount = train.GetColumn(r => r.label).Count(); - var testCount = test.GetColumn(r => r.label).Count(); - - Assert.InRange(trainCount * 1.0 / testCount, 0.8, 1.2); - - // Now stratify by label. Silly thing to do. - (train, test) = env.Data.TrainTestSplit(data, 0.5, stratificationColumn: r => r.label); - var trainLabels = train.GetColumn(r => r.label).Distinct(); - var testLabels = test.GetColumn(r => r.label).Distinct(); - Assert.True(trainLabels.Count() > 0); - Assert.True(testLabels.Count() > 0); - Assert.False(trainLabels.Intersect(testLabels).Any()); - } - - [Fact] - public void PrincipalComponentAnalysis() - { - var env = new MLContext(0); - var dataPath = GetDataPath("generated_regression_dataset.csv"); - var dataSource = new MultiFileSource(dataPath); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true); - var data = reader.Load(dataSource); - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, - pca: r.features.ProjectToPrincipalComponents(rank: 5))); - var tdata = est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - - Assert.True(schema.TryGetColumnIndex("pca", out int pcaCol)); - var type = schema[pcaCol].Type; - Assert.True(type is VectorDataViewType vecType && vecType.Size > 0 && vecType.ItemType is NumberDataViewType); - } - - [Fact] - public void NAIndicatorStatic() - { - var ml = new MLContext(0); - - string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(ml, ctx => ( - ScalarFloat: ctx.LoadFloat(1), - ScalarDouble: ctx.LoadDouble(1), - VectorFloat: ctx.LoadFloat(1, 4), - VectorDoulbe: ctx.LoadDouble(1, 4) 
- )); - - var data = reader.Load(new MultiFileSource(dataPath)); - - var est = data.MakeNewEstimator(). - Append(row => ( - A: row.ScalarFloat.IsMissingValue(), - B: row.ScalarDouble.IsMissingValue(), - C: row.VectorFloat.IsMissingValue(), - D: row.VectorDoulbe.IsMissingValue() - )); - - IDataView newData = ml.Data.TakeRows(est.Fit(data).Transform(data).AsDynamic, 4); - Assert.NotNull(newData); - bool[] ScalarFloat = newData.GetColumn(newData.Schema["A"]).ToArray(); - bool[] ScalarDouble = newData.GetColumn(newData.Schema["B"]).ToArray(); - bool[][] VectorFloat = newData.GetColumn(newData.Schema["C"]).ToArray(); - bool[][] VectorDoulbe = newData.GetColumn(newData.Schema["D"]).ToArray(); - - Assert.NotNull(ScalarFloat); - Assert.NotNull(ScalarDouble); - Assert.NotNull(VectorFloat); - Assert.NotNull(VectorDoulbe); - for (int i = 0; i < 4; i++) - { - Assert.True(!ScalarFloat[i] && !ScalarDouble[i]); - Assert.NotNull(VectorFloat[i]); - Assert.NotNull(VectorDoulbe[i]); - for (int j = 0; j < 4; j++) - Assert.True(!VectorFloat[i][j] && !VectorDoulbe[i][j]); - } - } - - [Fact] - public void TextNormalizeStatic() - { - var env = new MLContext(0); - var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var reader = TextLoaderStatic.CreateLoader(env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true); - var dataSource = new MultiFileSource(dataPath); - var data = reader.Load(dataSource); - - var est = data.MakeNewEstimator() - .Append(r => ( - r.label, - norm: r.text.NormalizeText(), - norm_Upper: r.text.NormalizeText(caseMode: TextNormalizingEstimator.CaseMode.Upper), - norm_KeepDiacritics: r.text.NormalizeText(keepDiacritics: true), - norm_NoPuctuations: r.text.NormalizeText(keepPunctuations: false), - norm_NoNumbers: r.text.NormalizeText(keepNumbers: false))); - var tdata = est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - - Assert.True(schema["norm"].Type is TextDataViewType); - 
Assert.True(schema["norm_Upper"].Type is TextDataViewType); - Assert.True(schema["norm_KeepDiacritics"].Type is TextDataViewType); - Assert.True(schema["norm_NoPuctuations"].Type is TextDataViewType); - Assert.True(schema["norm_NoNumbers"].Type is TextDataViewType); - } - - [Fact] - public void TestPcaStatic() - { - var env = new MLContext(0); - var dataSource = GetDataPath("generated_regression_dataset.csv"); - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true); - var data = reader.Load(dataSource); - var est = reader.MakeNewEstimator() - .Append(r => (r.label, pca: r.features.ProjectToPrincipalComponents(rank: 5))); - var tdata = est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - - Assert.True(schema.TryGetColumnIndex("pca", out int pca)); - var type = schema[pca].Type; - Assert.Equal(new VectorDataViewType(NumberDataViewType.Single, 5), type); - } - - [Fact] - public void TestConvertStatic() - { - MLContext ml = new MLContext(); - const string content = "0 hello 3.14159 -0 2\n" - + "1 1 2 4 15"; - var dataSource = new BytesStreamSource(content); - - var text = ml.Data.CreateTextLoader(ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1), - numericFeatures: ctx.LoadDouble(2, null)), // If fit correctly, this ought to be equivalent to max of 4, that is, length of 3. 
- dataSource, separator: ' '); - var data = text.Load(dataSource); - var est = text.MakeNewEstimator().Append(r => (floatLabel: r.label.ToFloat(), txtFloat: r.text.ToFloat(), num: r.numericFeatures.ToFloat())); - var tdata = est.Fit(data).Transform(data); - var schema = tdata.AsDynamic.Schema; - - Assert.True(schema.TryGetColumnIndex("floatLabel", out int floatLabel)); - var type = schema[floatLabel].Type; - Assert.Equal(NumberDataViewType.Single, type); - Assert.True(schema.TryGetColumnIndex("txtFloat", out int txtFloat)); - type = schema[txtFloat].Type; - Assert.Equal(NumberDataViewType.Single, type); - Assert.True(schema.TryGetColumnIndex("num", out int num)); - type = schema[num].Type; - Assert.Equal(new VectorDataViewType(NumberDataViewType.Single, 3), type); - } - } -} \ No newline at end of file diff --git a/test/Microsoft.ML.StaticPipelineTesting/Training.cs b/test/Microsoft.ML.StaticPipelineTesting/Training.cs deleted file mode 100644 index b98cacd2f1..0000000000 --- a/test/Microsoft.ML.StaticPipelineTesting/Training.cs +++ /dev/null @@ -1,1364 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System; -using System.Collections.Generic; -using System.Linq; -using Microsoft.ML; -using Microsoft.ML.Calibrators; -using Microsoft.ML.Data; -using Microsoft.ML.Trainers.LightGbm; -using Microsoft.ML.Trainers.LightGbm.StaticPipe; -using Microsoft.ML.Model; -using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.TestFramework.Attributes; -using Microsoft.ML.Trainers; -using Microsoft.ML.Trainers.FastTree; -using Microsoft.ML.Trainers.Recommender; -using Xunit; -using Xunit.Abstractions; - -namespace Microsoft.ML.StaticPipelineTesting -{ - public sealed class Training : BaseTestClassWithConsole - { - public Training(ITestOutputHelper output) : base(output) - { - } - - [Fact] - public void SdcaRegression() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new RegressionCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true); - - LinearRegressionModelParameters pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, score: catalog.Trainers.Sdca(r.label, r.features, null, - new SdcaRegressionTrainer.Options() { MaximumNumberOfIterations = 2, NumberOfThreads = 1 }, - onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 11 input features, so we ought to have 11 weights. - Assert.Equal(11, pred.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.score, new PoissonLoss()); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.MeanAbsoluteError, 0, double.PositiveInfinity); - Assert.InRange(metrics.MeanSquaredError, 0, double.PositiveInfinity); - Assert.InRange(metrics.RootMeanSquaredError, 0, double.PositiveInfinity); - Assert.Equal(metrics.RootMeanSquaredError * metrics.RootMeanSquaredError, metrics.MeanSquaredError, 5); - Assert.InRange(metrics.LossFunction, 0, double.PositiveInfinity); - - // Just output some data on the schema for fun. - var schema = data.AsDynamic.Schema; - for (int c = 0; c < schema.Count; ++c) - Console.WriteLine($"{schema[c].Name}, {schema[c].Type}"); - } - - [Fact] - public void SdcaRegressionNameCollision() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new RegressionCatalog(env); - - // Here we introduce another column called "Score" to collide with the name of the default output. Heh heh heh... - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10), Score: c.LoadText(2)), - separator: ';', hasHeader: true); - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, r.Score, score: catalog.Trainers.Sdca(r.label, r.features, null, - new SdcaRegressionTrainer.Options() { MaximumNumberOfIterations = 2, NumberOfThreads = 1 }))); - - var pipe = reader.Append(est); - - var model = pipe.Fit(dataSource); - var data = model.Load(dataSource); - - // Now, let's see if that column is still there, and still text! 
- var schema = data.AsDynamic.Schema; - Assert.True(schema.TryGetColumnIndex("Score", out int scoreCol), "Score column not present!"); - Assert.Equal(TextDataViewType.Instance, schema[scoreCol].Type); - - for (int c = 0; c < schema.Count; ++c) - Console.WriteLine($"{schema[c].Name}, {schema[c].Type}"); - } - - [Fact] - public void SdcaBinaryClassification() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - CalibratedModelParametersBase pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.Sdca(r.label, r.features, null, - new SdcaLogisticRegressionBinaryTrainer.Options { MaximumNumberOfIterations = 2, NumberOfThreads = 1 }, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 9 input features, so we ought to have 9 weights. - Assert.Equal(9, pred.SubModel.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. - Assert.InRange(metrics.Accuracy, 0, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0, 1); - Assert.InRange(metrics.LogLoss, 0, double.PositiveInfinity); - Assert.InRange(metrics.Entropy, 0, double.PositiveInfinity); - - // Just output some data on the schema for fun. 
- var schema = data.AsDynamic.Schema; - for (int c = 0; c < schema.Count; ++c) - Console.WriteLine($"{schema[c].Name}, {schema[c].Type}"); - } - - [Fact] - public void SdcaBinaryClassificationSimple() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - CalibratedModelParametersBase pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.Sdca(r.label, r.features, onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 9 input features, so we ought to have 9 weights. - Assert.Equal(9, pred.SubModel.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. - Assert.InRange(metrics.Accuracy, 0.9, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0.9, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0.9, 1); - Assert.InRange(metrics.LogLoss, 0, 0.2); - Assert.InRange(metrics.Entropy, 0.9, double.PositiveInfinity); - } - - [Fact] - public void SdcaBinaryClassificationNoCalibration() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - LinearBinaryModelParameters pred = null; - - var loss = new HingeLoss(1); - - // With a custom loss function we no longer get calibrated predictions. 
- var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.SdcaNonCalibrated(r.label, r.features, null, loss, - new SdcaNonCalibratedBinaryTrainer.Options { MaximumNumberOfIterations = 2, NumberOfThreads = 1 }, - onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 9 input features, so we ought to have 9 weights. - Assert.Equal(9, pred.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. - Assert.InRange(metrics.Accuracy, 0, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0, 1); - - // Just output some data on the schema for fun. - var schema = data.AsDynamic.Schema; - for (int c = 0; c < schema.Count; ++c) - Console.WriteLine($"{schema[c].Name}, {schema[c].Type}"); - } - - [Fact] - public void SdcaBinaryClassificationNoCalibrationSimple() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - LinearBinaryModelParameters pred = null; - - var loss = new HingeLoss(1); - - // With a custom loss function we no longer get calibrated predictions. - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.SdcaNonCalibrated(r.label, r.features, loss, onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 9 input features, so we ought to have 9 weights. 
- Assert.Equal(9, pred.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. - Assert.InRange(metrics.Accuracy, 0.95, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0.95, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0.95, 1); - } - - [Fact] - public void SdcaBinaryClassificationNoCalibrationSimpleWithPRCurve() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - LinearBinaryModelParameters pred = null; - - var loss = new HingeLoss(1); - - // With a custom loss function we no longer get calibrated predictions. - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.SdcaNonCalibrated(r.label, r.features, loss, onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 9 input features, so we ought to have 9 weights. - Assert.Equal(9, pred.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.EvaluateWithPRCurve(data, r => r.label, r => r.preds, out List prCurve); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.Accuracy, 0.95, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0.95, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0.95, 1); - - Assert.NotNull(prCurve); - Assert.InRange(prCurve.Count, 400, 500); - } - - [Fact] - public void AveragePerceptronNoCalibration() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - LinearBinaryModelParameters pred = null; - - var loss = new HingeLoss(1); - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.AveragedPerceptron(r.label, r.features, lossFunction: loss, - numIterations: 2, onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 9 input features, so we ought to have 9 weights. - Assert.Equal(9, pred.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.Accuracy, 0, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0, 1); - } - - [Fact] - public void AveragePerceptronCalibration() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - LinearBinaryModelParameters pred = null; - - var loss = new HingeLoss(1); - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.AveragedPerceptron(r.label, r.features, lossFunction: loss, - numIterations: 2, onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 9 input features, so we ought to have 9 weights. - Assert.Equal(9, pred.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. - Assert.InRange(metrics.Accuracy, 0, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0, 1); - } - - [Fact] - public void FfmBinaryClassification() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features1: c.LoadFloat(1, 4), features2: c.LoadFloat(5, 9))); - - FieldAwareFactorizationMachineModelParameters pred = null; - - // With a custom loss function we no longer get calibrated predictions. 
- var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.FieldAwareFactorizationMachine(r.label, new[] { r.features1, r.features2 }, onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. - Assert.InRange(metrics.Accuracy, 0, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0, 1); - } - - [Fact] - public void SdcaMulticlass() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.iris.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new MulticlassClassificationCatalog(env); - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadText(0), features: c.LoadFloat(1, 4))); - - MaximumEntropyModelParameters pred = null; - - var loss = new HingeLoss(1); - - // With a custom loss function we no longer get calibrated predictions. - var est = reader.MakeNewEstimator() - .Append(r => (label: r.label.ToKey(), r.features)) - .Append(r => (r.label, preds: catalog.Trainers.Sdca( - r.label, - r.features, - numberOfIterations: 2, - onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - VBuffer[] weights = default; - pred.GetWeights(ref weights, out int n); - Assert.True(n == 3 && n == weights.Length); - foreach (var w in weights) - Assert.True(w.Length == 4); - - var biases = pred.GetBiases(); - Assert.True(biases.Count() == 3); - - var data = model.Load(dataSource); - - // Just output some data on the schema for fun. 
- var schema = data.AsDynamic.Schema; - for (int c = 0; c < schema.Count; ++c) - Console.WriteLine($"{schema[c].Name}, {schema[c].Type}"); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds, 2); - Assert.True(metrics.LogLoss > 0); - Assert.True(metrics.TopKAccuracy > 0); - } - - [Fact] - public void SdcaMulticlassSvm() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.iris.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new MulticlassClassificationCatalog(env); - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadText(0), features: c.LoadFloat(1, 4))); - - LinearMulticlassModelParameters pred = null; - - var loss = new HingeLoss(1); - - // With a custom loss function we no longer get calibrated predictions. - var est = reader.MakeNewEstimator() - .Append(r => (label: r.label.ToKey(), r.features)) - .Append(r => (r.label, preds: catalog.Trainers.SdcaNonCalibrated( - r.label, - r.features, - lossFunction: new HingeLoss(), - numberOfIterations: 2, - onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - VBuffer[] weights = default; - pred.GetWeights(ref weights, out int n); - Assert.True(n == 3 && n == weights.Length); - foreach (var w in weights) - Assert.True(w.Length == 4); - - var biases = pred.GetBiases(); - Assert.True(biases.Count() == 3); - - var data = model.Load(dataSource); - - // Just output some data on the schema for fun. 
- var schema = data.AsDynamic.Schema; - for (int c = 0; c < schema.Count; ++c) - Console.WriteLine($"{schema[c].Name}, {schema[c].Type}"); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds, 2); - Assert.InRange(metrics.MacroAccuracy, 0.6, 1); - Assert.InRange(metrics.TopKAccuracy, 0.8, 1); - } - - [Fact] - public void CrossValidate() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.iris.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new MulticlassClassificationCatalog(env); - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadText(0), features: c.LoadFloat(1, 4))); - - var est = reader.MakeNewEstimator() - .Append(r => (label: r.label.ToKey(), r.features)) - .Append(r => (r.label, preds: catalog.Trainers.Sdca( - r.label, - r.features, - numberOfIterations: 2))); - - var results = catalog.CrossValidate(reader.Load(dataSource), est, r => r.label) - .Select(x => x.metrics).ToArray(); - Assert.Equal(5, results.Length); - Assert.True(results.All(x => x.LogLoss > 0)); - } - - [Fact] - public void FastTreeBinaryClassification() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - CalibratedModelParametersBase pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.FastTree(r.label, r.features, - numberOfTrees: 10, - numberOfLeaves: 5, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - // 9 input features, so we ought to have 9 weights. 
- VBuffer weights = new VBuffer(); - ((IPredictorWithFeatureWeights)pred).GetFeatureWeights(ref weights); - Assert.Equal(9, weights.Length); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. - Assert.InRange(metrics.Accuracy, 0, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0, 1); - } - - [Fact] - public void FastTreeRegression() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new RegressionCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true); - - FastTreeRegressionModelParameters pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, - numberOfTrees: 10, - numberOfLeaves: 5, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 11 input features, so we ought to have 11 weights. - VBuffer weights = new VBuffer(); - pred.GetFeatureWeights(ref weights); - Assert.Equal(11, weights.Length); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.score, new PoissonLoss()); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.MeanAbsoluteError, 0, double.PositiveInfinity); - Assert.InRange(metrics.MeanSquaredError, 0, double.PositiveInfinity); - Assert.InRange(metrics.RootMeanSquaredError, 0, double.PositiveInfinity); - Assert.Equal(metrics.RootMeanSquaredError * metrics.RootMeanSquaredError, metrics.MeanSquaredError, 5); - Assert.InRange(metrics.LossFunction, 0, double.PositiveInfinity); - } - - [LightGBMFact] - public void LightGbmBinaryClassification() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - CalibratedModelParametersBase pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.LightGbm(r.label, r.features, - numberOfIterations: 10, - numberOfLeaves: 5, - learningRate: 0.01, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - // 9 input features, so we ought to have 9 weights. - VBuffer weights = new VBuffer(); - ((IHaveFeatureWeights)pred).GetFeatureWeights(ref weights); - Assert.Equal(9, weights.Length); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.Accuracy, 0, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0, 1); - } - - [LightGBMFact] - public void LightGbmRegression() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new RegressionCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true); - - LightGbmRegressionModelParameters pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, score: catalog.Trainers.LightGbm(r.label, r.features, - numberOfIterations: 10, - numberOfLeaves: 5, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 11 input features, so we ought to have 11 weights. - VBuffer weights = new VBuffer(); - pred.GetFeatureWeights(ref weights); - Assert.Equal(11, weights.Length); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.score, new PoissonLoss()); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.MeanAbsoluteError, 0, double.PositiveInfinity); - Assert.InRange(metrics.MeanSquaredError, 0, double.PositiveInfinity); - Assert.InRange(metrics.RootMeanSquaredError, 0, double.PositiveInfinity); - Assert.Equal(metrics.RootMeanSquaredError * metrics.RootMeanSquaredError, metrics.MeanSquaredError, 5); - Assert.InRange(metrics.LossFunction, 0, double.PositiveInfinity); - } - - [Fact] - public void PoissonRegression() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new RegressionCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true); - - PoissonRegressionModelParameters pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, score: catalog.Trainers.LbfgsPoissonRegression(r.label, r.features, null, - new LbfgsPoissonRegressionTrainer.Options { L2Regularization = 2, EnforceNonNegativity = true, NumberOfThreads = 1 }, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 11 input features, so we ought to have 11 weights. - Assert.Equal(11, pred.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.score, new PoissonLoss()); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.MeanAbsoluteError, 0, double.PositiveInfinity); - Assert.InRange(metrics.MeanSquaredError, 0, double.PositiveInfinity); - Assert.InRange(metrics.RootMeanSquaredError, 0, double.PositiveInfinity); - Assert.Equal(metrics.RootMeanSquaredError * metrics.RootMeanSquaredError, metrics.MeanSquaredError, 5); - Assert.InRange(metrics.LossFunction, 0, double.PositiveInfinity); - } - - [Fact] - public void LogisticRegressionBinaryClassification() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - CalibratedModelParametersBase pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.LbfgsLogisticRegression(r.label, r.features, null, - new LbfgsLogisticRegressionBinaryTrainer.Options { L1Regularization = 10, NumberOfThreads = 1 }, onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - // 9 input features, so we ought to have 9 weights. - Assert.Equal(9, pred.SubModel.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.Accuracy, 0, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0, 1); - } - - [Fact] - public void MulticlassLogisticRegression() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.iris.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new MulticlassClassificationCatalog(env); - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadText(0), features: c.LoadFloat(1, 4))); - - MaximumEntropyModelParameters pred = null; - - // With a custom loss function we no longer get calibrated predictions. - var est = reader.MakeNewEstimator() - .Append(r => (label: r.label.ToKey(), r.features)) - .Append(r => (r.label, preds: catalog.Trainers.LbfgsMaximumEntropy( - r.label, - r.features, - null, - new LbfgsMaximumEntropyMulticlassTrainer.Options { NumberOfThreads = 1 }, - onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - VBuffer[] weights = default; - pred.GetWeights(ref weights, out int n); - Assert.True(n == 3 && n == weights.Length); - foreach (var w in weights) - Assert.True(w.Length == 4); - - var data = model.Load(dataSource); - - // Just output some data on the schema for fun. 
- var schema = data.AsDynamic.Schema; - for (int c = 0; c < schema.Count; ++c) - Console.WriteLine($"{schema[c].Name}, {schema[c].Type}"); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds, 2); - Assert.True(metrics.LogLoss > 0); - Assert.True(metrics.TopKAccuracy > 0); - } - - [Fact] - public void OnlineGradientDescent() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new RegressionCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true); - - LinearRegressionModelParameters pred = null; - - var loss = new SquaredLoss(); - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, score: catalog.Trainers.OnlineGradientDescent(r.label, r.features, - lossFunction: loss, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - // 11 input features, so we ought to have 11 weights. - Assert.Equal(11, pred.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.score, new PoissonLoss()); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.MeanAbsoluteError, 0, double.PositiveInfinity); - Assert.InRange(metrics.MeanSquaredError, 0, double.PositiveInfinity); - Assert.InRange(metrics.RootMeanSquaredError, 0, double.PositiveInfinity); - Assert.Equal(metrics.RootMeanSquaredError * metrics.RootMeanSquaredError, metrics.MeanSquaredError, 5); - Assert.InRange(metrics.LossFunction, 0, double.PositiveInfinity); - } - - [Fact] - public void KMeans() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.iris.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadText(0), features: c.LoadFloat(1, 4))); - - KMeansModelParameters pred = null; - - var est = reader.MakeNewEstimator() - .AppendCacheCheckpoint() - .Append(r => (label: r.label.ToKey(), r.features)) - .Append(r => ( - r.label, - r.features, - preds: env.Clustering.Trainers.KMeans - ( - r.features, - null, - options: new KMeansTrainer.Options - { - NumberOfClusters = 3, - NumberOfThreads = 1 - }, - onFit: p => pred = p - ))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - VBuffer[] centroids = default; - int k; - pred.GetClusterCentroids(ref centroids, out k); - - Assert.True(k == 3); - - var data = model.Load(dataSource); - - var metrics = env.Clustering.Evaluate(data, r => r.preds.score, r => r.label, r => r.features); - Assert.NotNull(metrics); - - Assert.InRange(metrics.AverageDistance, 0.5262, 0.5264); - Assert.InRange(metrics.NormalizedMutualInformation, 0.73, 0.77); - Assert.InRange(metrics.DaviesBouldinIndex, 0.662, 0.667); - - metrics = env.Clustering.Evaluate(data, r => r.preds.score, label: r => r.label); - Assert.NotNull(metrics); - - Assert.InRange(metrics.AverageDistance, 0.5262, 0.5264); - Assert.True(metrics.DaviesBouldinIndex == 0.0); - - metrics = env.Clustering.Evaluate(data, r => r.preds.score, features: r => r.features); - 
Assert.True(double.IsNaN(metrics.NormalizedMutualInformation)); - - metrics = env.Clustering.Evaluate(data, r => r.preds.score); - Assert.NotNull(metrics); - Assert.InRange(metrics.AverageDistance, 0.5262, 0.5264); - Assert.True(double.IsNaN(metrics.NormalizedMutualInformation)); - Assert.True(metrics.DaviesBouldinIndex == 0.0); - - } - - [Fact] - public void FastTreeRanking() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.adultRanking.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new RankingCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(0), features: c.LoadFloat(9, 14), groupId: c.LoadText(1)), - separator: '\t', hasHeader: true); - - FastTreeRankingModelParameters pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, r.features, groupId: r.groupId.ToKey())) - .Append(r => (r.label, r.groupId, score: catalog.Trainers.FastTree(r.label, r.features, r.groupId, onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.groupId, r => r.score); - Assert.NotNull(metrics); - - Assert.True(metrics.NormalizedDiscountedCumulativeGains.Count == metrics.DiscountedCumulativeGains.Count && metrics.DiscountedCumulativeGains.Count == 3); - - Assert.InRange(metrics.DiscountedCumulativeGains[0], 1.4, 1.6); - Assert.InRange(metrics.DiscountedCumulativeGains[1], 1.4, 1.8); - Assert.InRange(metrics.DiscountedCumulativeGains[2], 1.4, 1.8); - - Assert.InRange(metrics.NormalizedDiscountedCumulativeGains[0], 0.365, 0.37); - Assert.InRange(metrics.NormalizedDiscountedCumulativeGains[1], 0.365, 0.37); - Assert.InRange(metrics.NormalizedDiscountedCumulativeGains[2], 0.365, 0.37); - } - - [LightGBMFact] - public void LightGBMRanking() - { - var env = new 
MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.adultRanking.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new RankingCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(0), features: c.LoadFloat(9, 14), groupId: c.LoadText(1)), - separator: '\t', hasHeader: true); - - LightGbmRankingModelParameters pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, r.features, groupId: r.groupId.ToKey())) - .Append(r => (r.label, r.groupId, score: catalog.Trainers.LightGbm(r.label, r.features, r.groupId, onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.groupId, r => r.score); - Assert.NotNull(metrics); - - Assert.True(metrics.NormalizedDiscountedCumulativeGains.Count == metrics.DiscountedCumulativeGains.Count && metrics.DiscountedCumulativeGains.Count == 3); - - Assert.InRange(metrics.DiscountedCumulativeGains[0], 1.4, 1.6); - Assert.InRange(metrics.DiscountedCumulativeGains[1], 1.4, 1.8); - Assert.InRange(metrics.DiscountedCumulativeGains[2], 1.4, 1.8); - - Assert.InRange(metrics.NormalizedDiscountedCumulativeGains[0], 0.365, 0.37); - Assert.InRange(metrics.NormalizedDiscountedCumulativeGains[1], 0.365, 0.37); - Assert.InRange(metrics.NormalizedDiscountedCumulativeGains[2], 0.365, 0.37); - } - - [LightGBMFact] - public void MulticlassLightGBM() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.iris.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new MulticlassClassificationCatalog(env); - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadText(0), features: c.LoadFloat(1, 4))); - - OneVersusAllModelParameters pred = null; - - // With a custom loss function we no longer get 
calibrated predictions. - var est = reader.MakeNewEstimator() - .Append(r => (label: r.label.ToKey(), r.features)) - .Append(r => (r.label, preds: catalog.Trainers.LightGbm( - r.label, - r.features, onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - var data = model.Load(dataSource); - - // Just output some data on the schema for fun. - var schema = data.AsDynamic.Schema; - for (int c = 0; c < schema.Count; ++c) - Console.WriteLine($"{schema[c].Name}, {schema[c].Type}"); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds, 2); - Assert.True(metrics.LogLoss > 0); - Assert.True(metrics.TopKAccuracy > 0); - } - - [Fact] - public void MulticlassNaiveBayesTrainer() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.iris.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new MulticlassClassificationCatalog(env); - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadText(0), features: c.LoadFloat(1, 4))); - - NaiveBayesMulticlassModelParameters pred = null; - - // With a custom loss function we no longer get calibrated predictions. - var est = reader.MakeNewEstimator() - .Append(r => (label: r.label.ToKey(), r.features)) - .Append(r => (r.label, preds: catalog.Trainers.MulticlassNaiveBayesTrainer( - r.label, - r.features, onFit: p => pred = p))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - var labelHistogram = pred.GetLabelHistogram(); - var labelCount1 = labelHistogram.Count; - var featureHistogram = pred.GetFeatureHistogram(); - Assert.True(labelCount1 == 3 && labelCount1 == featureHistogram.Count); - for (int i = 0; i < labelCount1; i++) - Assert.True(featureHistogram[i].Count == 4); - - var data = model.Load(dataSource); - - // Just output some data on the schema for fun. 
- var schema = data.AsDynamic.Schema; - for (int c = 0; c < schema.Count; ++c) - Console.WriteLine($"{schema[c].Name}, {schema[c].Type}"); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds, 2); - Assert.True(metrics.LogLoss > 0); - Assert.True(metrics.TopKAccuracy > 0); - } - - [Fact] - public void HogwildSGDLogisticRegression() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - CalibratedModelParametersBase pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.StochasticGradientDescentClassificationTrainer(r.label, r.features, null, - new SgdCalibratedTrainer.Options { L2Regularization = 0, NumberOfThreads = 1 }, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - // 9 input features, so we ought to have 9 weights. - Assert.Equal(9, pred.SubModel.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.Accuracy, 0.9, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0.95, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0.95, 1); - Assert.InRange(metrics.LogLoss, 0, 0.2); - } - - [Fact] - public void HogwildSGDLogisticRegressionSimple() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - CalibratedModelParametersBase pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.StochasticGradientDescentClassificationTrainer(r.label, r.features, null, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - // 9 input features, so we ought to have 9 weights. - Assert.Equal(9, pred.SubModel.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.Accuracy, 0.9, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0.95, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0.95, 1); - Assert.InRange(metrics.LogLoss, 0, 0.2); - } - - [Fact] - public void HogwildSGDSupportVectorMachine() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - LinearBinaryModelParameters pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.StochasticGradientDescentNonCalibratedClassificationTrainer(r.label, r.features, null, - new SgdNonCalibratedTrainer.Options { L2Regularization = 0, NumberOfThreads = 1, LossFunction = new HingeLoss()}, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - // 9 input features, so we ought to have 9 weights. - Assert.Equal(9, pred.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. 
- Assert.InRange(metrics.Accuracy, 0.9, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0.95, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0.95, 1); - } - - [Fact] - public void HogwildSGDSupportVectorMachineSimple() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - var catalog = new BinaryClassificationCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - LinearBinaryModelParameters pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, preds: catalog.Trainers.StochasticGradientDescentNonCalibratedClassificationTrainer(r.label, r.features, lossFunction: new HingeLoss(), onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - // 9 input features, so we ought to have 9 weights. - Assert.Equal(9, pred.Weights.Count); - - var data = model.Load(dataSource); - - var metrics = catalog.Evaluate(data, r => r.label, r => r.preds); - // Run a sanity check against a few of the metrics. - Assert.InRange(metrics.Accuracy, 0.9, 1); - Assert.InRange(metrics.AreaUnderRocCurve, 0.95, 1); - Assert.InRange(metrics.AreaUnderPrecisionRecallCurve, 0.95, 1); - } - - [LessThanNetCore30OrNotNetCoreAndX64Fact("netcoreapp3.0 and x86 output differs from Baseline. Being tracked as part of https://github.com/dotnet/machinelearning/issues/1441")] - public void MatrixFactorization() - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. 
- var mlContext = new MLContext(seed: 1); - - // Specify where to find data file - var dataPath = GetDataPath(TestDatasets.trivialMatrixFactorization.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - // Read data file. The file contains 3 columns, label (float value), matrixColumnIndex (unsigned integer key), and matrixRowIndex (unsigned integer key). - // More specifically, LoadKey(1, 0, 19) means that the matrixColumnIndex column is read from the 2nd (indexed by 1) column in the data file and as - // a key type (stored as 32-bit unsigned integer) ranged from 0 to 19 (aka the training matrix has 20 columns). - var reader = mlContext.Data.CreateTextLoader(ctx => (label: ctx.LoadFloat(0), matrixColumnIndex: ctx.LoadKey(1, 20), matrixRowIndex: ctx.LoadKey(2, 40)), hasHeader: true); - - // The parameter that will be into the onFit method below. The obtained predictor will be assigned to this variable - // so that we will be able to touch it. - MatrixFactorizationModelParameters pred = null; - - // Create a statically-typed matrix factorization estimator. The MatrixFactorization's input and output defined in MatrixFactorizationStatic - // tell what (aks a Scalar) is expected. Notice that only one thread is used for deterministic outcome. - var matrixFactorizationEstimator = reader.MakeNewEstimator() - .Append(r => (r.label, score: mlContext.Regression.Trainers.MatrixFactorization( - r.label, r.matrixRowIndex, r.matrixColumnIndex, - new MatrixFactorizationTrainer.Options { NumberOfThreads = 1 }, - onFit: p => pred = p))); - - // Create a pipeline from the reader (the 1st step) and the matrix factorization estimator (the 2nd step). - var pipe = reader.Append(matrixFactorizationEstimator); - - // pred will be assigned by the onFit method once the training process is finished, so pred must be null before training. - Assert.Null(pred); - - // Train the pipeline on the given data file. 
Steps in the pipeline are sequentially fitted (by calling their Fit function). - var model = pipe.Fit(dataSource); - - // pred got assigned so that one can inspect the predictor trained in pipeline. - Assert.NotNull(pred); - - // Feed the data file into the trained pipeline. The data would be loaded by TextLoader (the 1st step) and then the output of the - // TextLoader would be fed into MatrixFactorizationEstimator. - var estimatedData = model.Load(dataSource); - - // After the training process, the metrics for regression problems can be computed. - var metrics = mlContext.Regression.Evaluate(estimatedData, r => r.label, r => r.score); - - // Naive test. Just make sure the pipeline runs. - Assert.InRange(metrics.MeanSquaredError, 0, 0.5); - } - - [LightGBMFact] - public void MulticlassLightGbmStaticPipelineWithInMemoryData() - { - // Create a general context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(seed: 1); - - // Create in-memory examples as C# native class. - var examples = SamplesUtils.DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000); - - // Convert native C# class to IDataView, a consumble format to ML.NET functions. - var dataView = mlContext.Data.LoadFromEnumerable(examples); - - // IDataView is the data format used in dynamic-typed pipeline. To use static-typed pipeline, we need to convert - // IDataView to DataView by calling AssertStatic(...). The basic idea is to specify the static type for each column - // in IDataView in a lambda function. - var staticDataView = dataView.AssertStatic(mlContext, c => ( - Features: c.R4.Vector, - Label: c.Text.Scalar)); - - // Create static pipeline. First, we make an estimator out of static DataView as the starting of a pipeline. - // Then, we append necessary transforms and a classifier to the starting estimator. 
- var pipe = staticDataView.MakeNewEstimator() - .Append(mapper: r => ( - r.Label, - // Train multi-class LightGBM. The trained model maps Features to Label and probability of each class. - // The call of ToKey() is needed to convert string labels to integer indexes. - Predictions: mlContext.MulticlassClassification.Trainers.LightGbm(r.Label.ToKey(), r.Features) - )) - .Append(r => ( - // Actual label. - r.Label, - // Labels are converted to keys when training LightGBM so we convert it here again for calling evaluation function. - LabelIndex: r.Label.ToKey(), - // Used to compute metrics such as accuracy. - r.Predictions, - // Assign a new name to predicted class index. - PredictedLabelIndex: r.Predictions.predictedLabel, - // Assign a new name to class probabilities. - Scores: r.Predictions.score - )); - - // Split the static-typed data into training and test sets. Only training set is used in fitting - // the created pipeline. Metrics are computed on the test. - var (trainingData, testingData) = mlContext.Data.TrainTestSplit(staticDataView, testFraction: 0.5); - - // Train the model. - var model = pipe.Fit(trainingData); - - // Do prediction on the test set. - var prediction = model.Transform(testingData); - - // Evaluate the trained model is the test set. - var metrics = mlContext.MulticlassClassification.Evaluate(prediction, r => r.LabelIndex, r => r.Predictions); - - // Check if metrics are resonable. - Assert.Equal(0.86545065082827088, metrics.MacroAccuracy, 6); - Assert.Equal(0.86507936507936511, metrics.MicroAccuracy, 6); - - // Convert prediction in ML.NET format to native C# class. - var nativePredictions = mlContext.Data.CreateEnumerable(prediction.AsDynamic, false).ToList(); - - // Get schema object of the prediction. It contains metadata such as the mapping from predicted label index - // (e.g., 1) to its actual label (e.g., "AA"). - var schema = prediction.AsDynamic.Schema; - - // Retrieve the mapping from labels to label indexes. 
- var labelBuffer = new VBuffer>(); - schema[nameof(SamplesUtils.DatasetUtils.MulticlassClassificationExample.PredictedLabelIndex)].Annotations.GetValue("KeyValues", ref labelBuffer); - var nativeLabels = labelBuffer.DenseValues().ToList(); // nativeLabels[nativePrediction.PredictedLabelIndex-1] is the original label indexed by nativePrediction.PredictedLabelIndex. - - // Show prediction result for the 3rd example. - var nativePrediction = nativePredictions[2]; - var expectedProbabilities = new float[] { 0.92574507f, 0.0739398f, 0.0002437812f, 7.13458649E-05f }; - // Scores and nativeLabels are two parallel attributes; that is, Scores[i] is the probability of being nativeLabels[i]. - for (int i = 0; i < labelBuffer.Length; ++i) - Assert.Equal(expectedProbabilities[i], nativePrediction.Scores[i], 6); - - // The predicted label below should be with probability 0.922597349. - Console.WriteLine("Our predicted label to this example is {0} with probability {1}", - nativeLabels[(int)nativePrediction.PredictedLabelIndex - 1], - nativePrediction.Scores[(int)nativePrediction.PredictedLabelIndex - 1]); - } - } -} \ No newline at end of file diff --git a/test/Microsoft.ML.StaticPipelineTesting/TreeRepresentation.cs b/test/Microsoft.ML.StaticPipelineTesting/TreeRepresentation.cs deleted file mode 100644 index ac790fecfe..0000000000 --- a/test/Microsoft.ML.StaticPipelineTesting/TreeRepresentation.cs +++ /dev/null @@ -1,191 +0,0 @@ -using Microsoft.ML.Data; -using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.Trainers.FastTree; -using Xunit; -using Xunit.Abstractions; - -namespace Microsoft.ML.StaticPipelineTesting -{ - public sealed class TreeRepresentation : BaseTestClassWithConsole - { - public TreeRepresentation(ITestOutputHelper output) : base(output) - { - } - - [Fact] - public void FastTreeRegressionRepresentation() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var 
dataSource = new MultiFileSource(dataPath); - - var catalog = new RegressionCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true); - - var opts = new FastTreeRegressionTrainer.Options() - { - NumberOfTrees = 10, - NumberOfLeaves = 5, - NumberOfThreads = 1 - }; - - FastTreeRegressionModelParameters pred = null; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, null, opts, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - var treeCollection = pred.TrainedTreeEnsemble; - Assert.Equal(0, treeCollection.Bias); - Assert.Equal(10, treeCollection.Trees.Count); - Assert.Equal(10, treeCollection.TreeWeights.Count); - - var trees = treeCollection.Trees; - Assert.Equal(4, trees[0].NumberOfNodes); - - // Numerical split. There is no categorical split so the follwoing vector contains 0-element. - var categoricalSplitFeatures = trees[0].GetCategoricalSplitFeaturesAt(0); - Assert.Equal(0, categoricalSplitFeatures.Count); - - // Numerical split. There is no categorical split so the follwoing vector contains 0-element. 
- var categoricalSplitFeatureRange = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(0); - Assert.Equal(0, categoricalSplitFeatureRange.Count); - - var expectedGtChild = new int[] { 3, 2, -4, -5 }; - Assert.Equal(4, trees[0].RightChild.Count); - Assert.Equal(expectedGtChild, trees[0].RightChild); - - var expectedLteChild = new int[] { 1, -1, -3, -2 }; - Assert.Equal(4, trees[0].LeftChild.Count); - Assert.Equal(expectedLteChild, trees[0].LeftChild); - - var expectedCategoricalSplitFlags = new bool[] { false, false, false, false }; - Assert.Equal(4, trees[0].CategoricalSplitFlags.Count); - Assert.Equal(expectedCategoricalSplitFlags, trees[0].CategoricalSplitFlags); - - var expectedNumericalSplitFeatureIndexes = new int[] { 0, 10, 2, 10 }; - Assert.Equal(4, trees[0].NumericalSplitFeatureIndexes.Count); - Assert.Equal(expectedNumericalSplitFeatureIndexes, trees[0].NumericalSplitFeatureIndexes); - - var expectedNumericalSplitThresholds = new float[] { 0.14f, -0.645f, -0.095f, 0.31f }; - Assert.Equal(4, trees[0].NumericalSplitThresholds.Count); - for (int i = 0; i < trees[0].NumericalSplitThresholds.Count; ++i) - Assert.Equal(expectedNumericalSplitThresholds[i], trees[0].NumericalSplitThresholds[i], 6); - - Assert.Equal(5, trees[0].NumberOfLeaves); - - var expectedLeafValues = new double[] { 40.159015006449692, 80.434805844435061, 57.072130551545513, 82.898710076162757, 104.17547955322266 }; - Assert.Equal(5, trees[0].LeafValues.Count); - for (int i = 0; i < trees[0].LeafValues.Count; ++i) - Assert.Equal(expectedLeafValues[i], trees[0].LeafValues[i], 6); - } - - [Fact] - public void FastTreeRegressionRepresentationWithCategoricalSplit() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var catalog = new RegressionCatalog(env); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)), 
- separator: ';', hasHeader: true); - - FastTreeRegressionModelParameters pred = null; - - var opts = new FastTreeRegressionTrainer.Options() - { - CategoricalSplit = true, - NumberOfTrees = 3, - NumberOfLeaves = 5, - NumberOfThreads = 1, - // This is the minimal samples to form a split (i.e., generating two extra nodes/leaves). For a small data set, - // we should set a small value. Otherwise, the trained trees could be empty. - MinimumExampleCountPerLeaf = 2 - }; - - var est = reader.MakeNewEstimator() - .Append(r => (r.label, features: r.features.OneHotEncoding())) - .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, null, opts, - onFit: (p) => { pred = p; }))); - - var pipe = reader.Append(est); - - Assert.Null(pred); - var model = pipe.Fit(dataSource); - Assert.NotNull(pred); - - var treeCollection = pred.TrainedTreeEnsemble; - Assert.Equal(0, treeCollection.Bias); - Assert.Equal(3, treeCollection.Trees.Count); - Assert.Equal(3, treeCollection.TreeWeights.Count); - - var trees = treeCollection.Trees; - Assert.Equal(4, trees[0].NumberOfNodes); - - var expectedGtChild = new int[] { 3, -3, -4, -5 }; - Assert.Equal(4, trees[0].RightChild.Count); - Assert.Equal(expectedGtChild, trees[0].RightChild); - - var expectedLteChild = new int[] { 1, 2, -1, -2 }; - Assert.Equal(4, trees[0].LeftChild.Count); - Assert.Equal(expectedLteChild, trees[0].LeftChild); - - var expectedCategoricalSplitFlags = new bool[] { true, true, true, true }; - Assert.Equal(4, trees[0].CategoricalSplitFlags.Count); - Assert.Equal(expectedCategoricalSplitFlags, trees[0].CategoricalSplitFlags); - - var expectedNumericalSplitFeatureIndexes = new int[] { 5312, 2, 2126, 533 }; - Assert.Equal(4, trees[0].NumericalSplitFeatureIndexes.Count); - Assert.Equal(expectedNumericalSplitFeatureIndexes, trees[0].NumericalSplitFeatureIndexes); - - var expectedNumericalSplitThresholds = new float[] { 0.5f, 0.5f, 0.5f, 0.5f }; - Assert.Equal(4, trees[0].NumericalSplitThresholds.Count); - 
for (int i = 0; i < trees[0].NumericalSplitThresholds.Count; ++i) - Assert.Equal(expectedNumericalSplitThresholds[i], trees[0].NumericalSplitThresholds[i], 6); - - var actualCategoricalRanges0 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(0); - Assert.Equal(actualCategoricalRanges0, new int[] { 5312, 5782 }); - - var actualCategoricalRanges1 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(1); - Assert.Equal(actualCategoricalRanges1, new int[] { 2, 417 }); - - var actualCategoricalRanges2 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(2); - Assert.Equal(actualCategoricalRanges2, new int[] { 2126, 2593 }); - - var actualCategoricalRanges3 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(3); - Assert.Equal(actualCategoricalRanges3, new int[] { 533, 983 }); - - int[] expectedCounts = { 62, 52, 54, 22 }; - int[] expectedStarts = { 5315, 10, 2141, 533 }; - int[] expectedEnds = { 5782, 401, 2558, 874 }; - for (int i = 0; i < trees[0].NumberOfNodes; ++i) - { - // Retrieve i-th node's split features. 
- var actualCategoricalSplitFeatures = trees[0].GetCategoricalSplitFeaturesAt(i); - Assert.Equal(expectedCounts[i], actualCategoricalSplitFeatures.Count); - Assert.Equal(expectedStarts[i], actualCategoricalSplitFeatures[0]); - Assert.Equal(expectedEnds[i], actualCategoricalSplitFeatures[expectedCounts[i] - 1]); - } - - Assert.Equal(5, trees[0].NumberOfLeaves); - - var expectedLeafValues = new double[] { 48.456055413607892, 86.584156799316418, 87.017326642027, 76.381184971185391, 117.68872643673058 }; - Assert.Equal(5, trees[0].LeafValues.Count); - for (int i = 0; i < trees[0].LeafValues.Count; ++i) - Assert.Equal(expectedLeafValues[i], trees[0].LeafValues[i], 6); - } - } -} diff --git a/test/Microsoft.ML.Tests/CachingTests.cs b/test/Microsoft.ML.Tests/CachingTests.cs index 46d1c7149e..9f608972de 100644 --- a/test/Microsoft.ML.Tests/CachingTests.cs +++ b/test/Microsoft.ML.Tests/CachingTests.cs @@ -6,7 +6,6 @@ using System.Threading; using Microsoft.ML.Data; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Xunit; using Xunit.Abstractions; @@ -77,25 +76,5 @@ public void CacheTest() data.GetColumn(data.Schema["Features"]).ToArray(); Assert.True(src.All(x => x.AccessCount == 1)); } - - [Fact] - public void StaticDataCacheTest() - { - var env = new MLContext(seed: 0); - var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename); - var dataSource = new MultiFileSource(dataPath); - - var reader = TextLoaderStatic.CreateLoader(env, - c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9))); - - var data = reader.Load(dataSource); - - var cachedData = data.Cache(); - - // Before caching, we are not able to shuffle the data. - Assert.True(data.AsDynamic.CanShuffle == false); - // After caching, we are able to shuffle the data! 
- Assert.True(cachedData.AsDynamic.CanShuffle == true); - } } } diff --git a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj index 7f257530c1..2dfede0987 100644 --- a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj +++ b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj @@ -22,8 +22,6 @@ - - diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs deleted file mode 100644 index d6c80e9482..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ /dev/null @@ -1,716 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.Collections.Immutable; -using System.Linq; -using Microsoft.ML; -using Microsoft.ML.Data; -using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; -using Microsoft.ML.TestFramework; -using Microsoft.ML.Trainers; -using Microsoft.ML.Transforms; -using Microsoft.ML.Transforms.Text; -using Xunit; -using Xunit.Abstractions; - -namespace Microsoft.ML.Tests.Scenarios.Api.CookbookSamples -{ - /// - /// Samples that are written as part of 'ML.NET Cookbook' are also added here as tests. - /// These tests don't actually test anything, other than the fact that the code compiles and - /// doesn't throw when it is executed. - /// - public sealed class CookbookSamples : BaseTestClass - { - public CookbookSamples(ITestOutputHelper output) : base(output) - { - } - - private void IntermediateData(string dataPath) - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. 
- var mlContext = new MLContext(); - - // Create the loader: define the data columns and where to find them in the text file. - var loader = mlContext.Data.CreateTextLoader(ctx => ( - // A boolean column depicting the 'target label'. - IsOver50K: ctx.LoadBool(0), - // Three text columns. - Workclass: ctx.LoadText(1), - Education: ctx.LoadText(2), - MaritalStatus: ctx.LoadText(3)), - hasHeader: true); - - // Start creating our processing pipeline. For now, let's just concatenate all the text columns - // together into one. - var pipeline = loader.MakeNewEstimator() - .Append(row => ( - row.IsOver50K, - AllFeatures: row.Workclass.ConcatWith(row.Education, row.MaritalStatus) - )); - - // Let's verify that the data has been read correctly. - // First, we read the data file. - var data = loader.Load(dataPath); - - // Fit our data pipeline and transform data with it. - var transformedData = pipeline.Fit(data).Transform(data); - - // 'transformedData' is a 'promise' of data. Let's actually read it. - var someRows = mlContext - // Convert to an enumerable of user-defined type. - .Data.CreateEnumerable(transformedData.AsDynamic, reuseRowObject: false) - // Take a couple values as an array. - .Take(4).ToArray(); - - // Extract the 'AllFeatures' column. - // This will give the entire dataset: make sure to only take several row - // in case the dataset is huge. 
- var featureColumns = transformedData.GetColumn(r => r.AllFeatures) - .Take(20).ToArray(); - - // The same extension method also applies to the dynamic-typed data, except you have to - // specify the column name and type: - var dynamicData = transformedData.AsDynamic; - var sameFeatureColumns = dynamicData.GetColumn(dynamicData.Schema["AllFeatures"]) - .Take(20).ToArray(); - } - - [Fact] - public void InspectIntermediateDataGetColumn() - => IntermediateData(GetDataPath("adult.tiny.with-schema.txt")); - - private void TrainRegression(string trainDataPath, string testDataPath, string modelPath) - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Step one: read the data as an IDataView. - // First, we define the loader: specify the data columns and where to find them in the text file. - var loader = mlContext.Data.CreateTextLoader(ctx => ( - // We read the first 11 values as a single float vector. - FeatureVector: ctx.LoadFloat(0, 10), - // Separately, read the target variable. - Target: ctx.LoadFloat(11) - ), - // The data file has header. - hasHeader: true, - // Default separator is tab, but we need a semicolon. - separator: ';'); - - - // Now read the file (remember though, loaders are lazy, so the actual reading will happen when the data is accessed). - var trainData = loader.Load(trainDataPath); - - // Sometime, caching data in-memory after its first access can save some loading time when the data is going to used - // several times somewhere. The caching mechanism is also lazy; it only caches things after being used. - // User can replace all the subsequently uses of "trainData" with "cachedTrainData". We still use "trainData" because - // a caching step, which provides the same caching function, will be inserted in the considered "pipeline." 
- var cachedTrainData = trainData.Cache(); - - // Step two: define the learning pipeline. - - // We 'start' the pipeline with the output of the loader. - var pipeline = loader.MakeNewEstimator() - // We add a step for caching data in memory so that the downstream iterative training - // algorithm can efficiently scan through the data multiple times. Otherwise, the following - // trainer will read data from disk multiple times. The caching mechanism uses an on-demand strategy. - // The data accessed in any downstream step will be cached since its first use. In general, you only - // need to add a caching step before trainable step, because caching is not helpful if the data is - // only scanned once. This step can be removed if user doesn't have enough memory to store the whole - // data set. - .AppendCacheCheckpoint() - // Now we can add any 'training steps' to it. In our case we want to 'normalize' the data (rescale to be - // between -1 and 1 for all examples), and then train the model. - .Append(r => ( - // Retain the 'Target' column for evaluation purposes. - r.Target, - // We choose the SDCA regression trainer. Note that we normalize the 'FeatureVector' right here in - // the the same call. - Prediction: mlContext.Regression.Trainers.Sdca(label: r.Target, features: r.FeatureVector.Normalize()))); - - var fx = trainData.GetColumn(x => x.FeatureVector); - - // Step three. Train the pipeline. - var model = pipeline.Fit(trainData); - - // Read the test dataset. - var testData = loader.Load(testDataPath); - // Calculate metrics of the model on the test data. - var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: r => r.Target, score: r => r.Prediction); - - // Saving and loading happens to 'dynamic' models, so the static typing is lost in the process. - mlContext.Model.Save(model.AsDynamic, trainData.AsDynamic.Schema, modelPath); - - // Potentially, the lines below can be in a different process altogether. 
- - // When you load the model, it's a 'dynamic' transformer. - ITransformer loadedModel = mlContext.Model.Load(modelPath, out var schema); - } - - [Fact] - public void TrainRegressionModel() - => TrainRegression(GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename), GetDataPath(TestDatasets.generatedRegressionDataset.testFilename), - DeleteOutputPath("cook_model_static.zip")); - - private ITransformer TrainOnIris(string irisDataPath) - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Step one: read the data as an IDataView. - // First, we define the loader: specify the data columns and where to find them in the text file. - var loader = mlContext.Data.CreateTextLoader(ctx => ( - // The four features of the Iris dataset. - SepalLength: ctx.LoadFloat(0), - SepalWidth: ctx.LoadFloat(1), - PetalLength: ctx.LoadFloat(2), - PetalWidth: ctx.LoadFloat(3), - // Label: kind of iris. - Label: ctx.LoadText(4) - ), - // Default separator is tab, but the dataset has comma. - separator: ','); - - // Retrieve the training data. - var trainData = loader.Load(irisDataPath); - - // Build the training pipeline. - var pipeline = loader.MakeNewEstimator() - .Append(r => ( - r.Label, - // Concatenate all the features together into one column 'Features'. - Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth))) - // We add a step for caching data in memory so that the downstream iterative training - // algorithm can efficiently scan through the data multiple times. Otherwise, the following - // trainer will read data from disk multiple times. The caching mechanism uses an on-demand strategy. - // The data accessed in any downstream step will be cached since its first use. 
In general, you only - // need to add a caching step before trainable step, because caching is not helpful if the data is - // only scanned once. - .AppendCacheCheckpoint() - .Append(r => ( - r.Label, - // Train the multi-class SDCA model to predict the label using features. - // Note that the label is a text, so it needs to be converted to key using 'ToKey' estimator. - Predictions: mlContext.MulticlassClassification.Trainers.Sdca(r.Label.ToKey(), r.Features))) - // Apply the inverse conversion from 'predictedLabel' key back to string value. - // Note that the final output column is only one, and we didn't assign a name to it. - // In this case, ML.NET auto-assigns the name 'Data' to the produced column. - .Append(r => r.Predictions.predictedLabel.ToValue()); - - // Train the model. - var model = pipeline.Fit(trainData).AsDynamic; - return model; - } - - private void PredictOnIris(ITransformer model) - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Use the model for one-time prediction. - // Make the prediction function object. Note that, on average, this call takes around 200x longer - // than one prediction, so you might want to cache and reuse the prediction function, instead of - // creating one per prediction. - var predictionFunc = mlContext.Model.CreatePredictionEngine(model); - - // Obtain the prediction. Remember that 'Predict' is not reentrant. If you want to use multiple threads - // for simultaneous prediction, make sure each thread is using its own PredictionFunction. 
- var prediction = predictionFunc.Predict(new IrisInput - { - SepalLength = 4.1f, - SepalWidth = 0.1f, - PetalLength = 3.2f, - PetalWidth = 1.4f - }); - } - - [Fact] - public void TrainAndPredictOnIris() - => PredictOnIris(TrainOnIris(GetDataPath("iris.data"))); - - private void TrainAndInspectWeights(string dataPath) - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Step one: read the data as an IDataView. - // First, we define the loader: specify the data columns and where to find them in the text file. - var loader = mlContext.Data.CreateTextLoader(ctx => ( - // The four features of the Iris dataset. - SepalLength: ctx.LoadFloat(0), - SepalWidth: ctx.LoadFloat(1), - PetalLength: ctx.LoadFloat(2), - PetalWidth: ctx.LoadFloat(3), - // Label: kind of iris. - Label: ctx.LoadText(4) - ), - // Default separator is tab, but the dataset has comma. - separator: ','); - - // Retrieve the training data. - var trainData = loader.Load(dataPath); - - // This is the predictor ('weights collection') that we will train. - MaximumEntropyModelParameters predictor = null; - // And these are the normalizer scales that we will learn. - ImmutableArray normScales; - // Build the training pipeline. - var pipeline = loader.MakeNewEstimator() - .Append(r => ( - r.Label, - // Concatenate all the features together into one column 'Features'. - Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth))) - .Append(r => ( - r.Label, - // Normalize (rescale) the features to be between -1 and 1. - Features: r.Features.Normalize( - // When the normalizer is trained, the below delegate is going to be called. - // We use it to memorize the scales. - onFit: (scales, offsets) => normScales = scales))) - // Cache data used in memory because the subsequently trainer needs to access the data multiple times. 
- .AppendCacheCheckpoint() - .Append(r => ( - r.Label, - // Train the multi-class SDCA model to predict the label using features. - // Note that the label is a text, so it needs to be converted to key using 'ToKey' estimator. - Predictions: mlContext.MulticlassClassification.Trainers.Sdca(r.Label.ToKey(), r.Features, - // When the model is trained, the below delegate is going to be called. - // We use that to memorize the predictor object. - onFit: p => predictor = p))); - - // Train the model. During this call our 'onFit' delegate will be invoked, - // and our 'predictor' will be set. - var model = pipeline.Fit(trainData); - - // Now we can use 'predictor' to look at the weights. - // 'weights' will be an array of weight vectors, one vector per class. - // Our problem has 3 classes, so numClasses will be 3, and weights will contain - // 3 vectors (of 4 values each). - VBuffer[] weights = null; - predictor.GetWeights(ref weights, out int numClasses); - - // Similarly we can also inspect the biases for the 3 classes. - var biases = predictor.GetBiases(); - - // Inspect the normalizer scales. - Console.WriteLine(string.Join(" ", normScales)); - } - - [Fact] - public void InspectModelWeights() - => TrainAndInspectWeights(GetDataPath("iris.data")); - - private void NormalizationWorkout(string dataPath) - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Define the loader: specify the data columns and where to find them in the text file. - var loader = mlContext.Data.CreateTextLoader(ctx => ( - // The four features of the Iris dataset will be grouped together as one Features column. - Features: ctx.LoadFloat(0, 3), - // Label: kind of iris. - Label: ctx.LoadText(4) - ), - // Default separator is tab, but the dataset has comma. - separator: ','); - - // Read the training data. 
- var trainData = loader.Load(dataPath); - - // Apply all kinds of standard ML.NET normalization to the raw features. - var pipeline = loader.MakeNewEstimator() - .Append(r => ( - MinMaxNormalized: r.Features.Normalize(ensureZeroUntouched: true), - MeanVarNormalized: r.Features.NormalizeMeanVariance(ensureZeroUntouched: false), - CdfNormalized: r.Features.NormalizeByCumulativeDistribution(), - BinNormalized: r.Features.NormalizeByBinning(maximumBinCount: 256) - )); - - // Let's train our pipeline of normalizers, and then apply it to the same data. - var normalizedData = pipeline.Fit(trainData).Transform(trainData); - - // Inspect one column of the resulting dataset. - var meanVarValues = normalizedData.GetColumn(r => r.MeanVarNormalized).ToArray(); - } - - [Fact] - public void Normalization() - => NormalizationWorkout(GetDataPath("iris.data")); - - private class IrisInput - { - // Unfortunately, we still need the dummy 'Label' column to be present. - [ColumnName("Label")] - public string IgnoredLabel { get; set; } - public float SepalLength { get; set; } - public float SepalWidth { get; set; } - public float PetalLength { get; set; } - public float PetalWidth { get; set; } - } - - private IEnumerable GetChurnInfo() - { - var r = new Random(454); - return Enumerable.Range(0, 500) - .Select(x => new CustomerChurnInfo - { - HasChurned = x % 2 == 0 || (r.NextDouble() < 0.05), - DemographicCategory = (x % 10).ToString(), - LastVisits = new float[] { x, x * 2, x * 3, x * 4, x * 5 } - }); - } - - [Fact] - public void TrainOnAutoGeneratedData() - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Step one: read the data as an IDataView. - // Let's assume that 'GetChurnData()' fetches and returns the training data from somewhere. 
- IEnumerable churnData = GetChurnInfo(); - - // Turn the data into the ML.NET data view. - // We can use CreateDataView or ReadFromEnumerable, depending on whether 'churnData' is an IList, - // or merely an IEnumerable. - var trainData = mlContext.Data.LoadFromEnumerable(churnData); - - // Now note that 'trainData' is just an IDataView, so we face a choice here: either declare the static type - // and proceed in the statically typed fashion, or keep dynamic types and build a dynamic pipeline. - // We demonstrate both below. - - // Build the learning pipeline. - // In our case, we will one-hot encode the demographic category, and concatenate that with the number of visits. - // We apply our FastTree binary classifier to predict the 'HasChurned' label. - - var dynamicpipeline = mlContext.Transforms.Categorical.OneHotEncoding("DemographicCategory") - .Append(new ColumnConcatenatingEstimator(mlContext, "Features", "DemographicCategory", "LastVisits")) - .AppendCacheCheckpoint(mlContext) // FastTree will benefit from caching data in memory. - .Append(mlContext.BinaryClassification.Trainers.FastTree("HasChurned", "Features", numberOfTrees: 20)); - - var dynamicModel = dynamicpipeline.Fit(trainData); - - // Build the same learning pipeline, but statically typed. - // First, transition to the statically-typed data view. - var staticData = trainData.AssertStatic(mlContext, c => ( - HasChurned: c.Bool.Scalar, - DemographicCategory: c.Text.Scalar, - LastVisits: c.R4.Vector)); - - // Build the pipeline, same as the one above. - var staticpipeline = staticData.MakeNewEstimator() - .Append(r => ( - r.HasChurned, - Features: r.DemographicCategory.OneHotEncoding().ConcatWith(r.LastVisits))) - .AppendCacheCheckpoint() // FastTree will benefit from caching data in memory. 
- .Append(r => mlContext.BinaryClassification.Trainers.FastTree(r.HasChurned, r.Features, numberOfTrees: 20)); - - var staticModel = staticpipeline.Fit(staticData); - - // Note that dynamicModel should be the same as staticModel.AsDynamic (give or take random variance from - // the training procedure). - - var qualityMetrics = mlContext.BinaryClassification.Evaluate(dynamicModel.Transform(trainData), "HasChurned"); - } - - private void TextFeaturizationOn(string dataPath) - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Define the loader: specify the data columns and where to find them in the text file. - var loader = mlContext.Data.CreateTextLoader(ctx => ( - IsToxic: ctx.LoadBool(0), - Message: ctx.LoadText(1) - ), hasHeader: true); - - // Read the data. - var data = loader.Load(dataPath); - - // Inspect the message texts that are read from the file. - var messageTexts = data.GetColumn(x => x.Message).Take(20).ToArray(); - - // Apply various kinds of text operations supported by ML.NET. - var pipeline = loader.MakeNewEstimator() - // Cache data in memory in an on-demand manner. Columns used in any downstream step will be - // cached in memory at their first uses. This step can be removed if user's machine doesn't - // have enough memory. - .AppendCacheCheckpoint() - .Append(r => ( - // One-stop shop to run the full text featurization. - TextFeatures: r.Message.FeaturizeText(), - - // NLP pipeline 1: bag of words. - BagOfWords: r.Message.NormalizeText().ProduceWordBags(), - - // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices. - BagOfBigrams: r.Message.NormalizeText().ProduceHashedWordBags(ngramLength: 2, useAllLengths: false), - - // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. 
- BagOfTrichar: r.Message.TokenizeIntoCharactersAsKeys().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), - - // NLP pipeline 4: word embeddings. - // PretrainedModelKind.Sswe is used here for performance of the test. In a real - // scenario, it is best to use a different model for more accuracy. - Embeddings: r.Message.NormalizeText().TokenizeIntoWords().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) - )); - - // Let's train our pipeline, and then apply it to the same data. - // Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train. - var transformedData = pipeline.Fit(data).Transform(data); - - // Inspect some columns of the resulting dataset. - var embeddings = transformedData.GetColumn(x => x.Embeddings).Take(10).ToArray(); - var unigrams = transformedData.GetColumn(x => x.BagOfWords).Take(10).ToArray(); - } - - [Fact] - public void TextFeaturization() - => TextFeaturizationOn(GetDataPath("wikipedia-detox-250-line-data.tsv")); - - [Fact] - public void CategoricalFeaturization() - => CategoricalFeaturizationOn(GetDataPath("adult.tiny.with-schema.txt")); - - [Fact] - public void ReadMultipleFiles() - => CategoricalFeaturizationOn(GetDataPath("adult.tiny.with-schema.txt"), GetDataPath("adult.tiny.with-schema.txt")); - - private void CategoricalFeaturizationOn(params string[] dataPath) - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Define the loader: specify the data columns and where to find them in the text file. - var loader = mlContext.Data.CreateTextLoader(ctx => ( - Label: ctx.LoadBool(0), - // We will load all the categorical features into one vector column of size 8. 
- CategoricalFeatures: ctx.LoadText(1, 8), - // Similarly, load all numerical features into one vector of size 6. - NumericalFeatures: ctx.LoadFloat(9, 14), - // Let's also separately load the 'Workclass' column. - Workclass: ctx.LoadText(1) - ), hasHeader: true); - - // Read the data. - var data = loader.Load(dataPath); - - // Inspect the categorical columns to check that they are correctly read. - var catColumns = data.GetColumn(r => r.CategoricalFeatures).Take(10).ToArray(); - - // Build several alternative featurization pipelines. - var pipeline = loader.MakeNewEstimator() - // Cache data in memory in an on-demand manner. Columns used in any downstream step will be - // cached in memory at their first uses. This step can be removed if user's machine doesn't - // have enough memory. - .AppendCacheCheckpoint() - .Append(r => ( - r.Label, - r.NumericalFeatures, - // Convert each categorical feature into one-hot encoding independently. - CategoricalOneHot: r.CategoricalFeatures.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Ind), - // Convert all categorical features into indices, and build a 'word bag' of these. - CategoricalBag: r.CategoricalFeatures.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bag), - // One-hot encode the workclass column, then drop all the categories that have fewer than 10 instances in the train set. - WorkclassOneHotTrimmed: r.Workclass.OneHotEncoding().SelectFeaturesBasedOnCount(count: 10) - )); - - // Let's train our pipeline, and then apply it to the same data. - var transformedData = pipeline.Fit(data).Transform(data); - - // Inspect some columns of the resulting dataset. - var categoricalBags = transformedData.GetColumn(x => x.CategoricalBag).Take(10).ToArray(); - var workclasses = transformedData.GetColumn(x => x.WorkclassOneHotTrimmed).Take(10).ToArray(); - - // Of course, if we want to train the model, we will need to compose a single float vector of all the features. 
- // Here's how we could do this: - - var fullpipeline = pipeline - .Append(r => ( - r.Label, - // Concatenate two of the 3 categorical pipelines, and the numeric features. - Features: r.NumericalFeatures.ConcatWith(r.CategoricalBag, r.WorkclassOneHotTrimmed))) - // Now we're ready to train. We chose our FastTree trainer for this classification task. - .Append(r => mlContext.BinaryClassification.Trainers.FastTree(r.Label, r.Features, numberOfTrees: 50)); - - // Train the model. - var model = fullpipeline.Fit(data); - } - - [Fact] - public void CrossValidationIris() - => CrossValidationOn(GetDataPath("iris.data")); - - private void CrossValidationOn(string dataPath) - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Step one: read the data as an IDataView. - // First, we define the loader: specify the data columns and where to find them in the text file. - var loader = mlContext.Data.CreateTextLoader(ctx => ( - // The four features of the Iris dataset. - SepalLength: ctx.LoadFloat(0), - SepalWidth: ctx.LoadFloat(1), - PetalLength: ctx.LoadFloat(2), - PetalWidth: ctx.LoadFloat(3), - // Label: kind of iris. - Label: ctx.LoadText(4) - ), - // Default separator is tab, but the dataset has comma. - separator: ','); - - // Read the data. - var data = loader.Load(dataPath); - - // Build the training pipeline. - var pipeline = loader.MakeNewEstimator() - .Append(r => ( - // Convert string label to a key. - Label: r.Label.ToKey(), - // Concatenate all the features together into one column 'Features'. - Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth))) - // Add a step for caching data in memory so that the downstream iterative training - // algorithm can efficiently scan through the data multiple times. 
- .AppendCacheCheckpoint() - .Append(r => ( - r.Label, - // Train the multi-class SDCA model to predict the label using features. - Predictions: mlContext.MulticlassClassification.Trainers.Sdca(r.Label, r.Features))); - - // Split the data 90:10 into train and test sets, train and evaluate. - var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); - - // Train the model. - var model = pipeline.Fit(trainData); - // Compute quality metrics on the test set. - var metrics = mlContext.MulticlassClassification.Evaluate(model.Transform(testData), r => r.Label, r => r.Predictions); - Console.WriteLine(metrics.MicroAccuracy); - - // Now run the 5-fold cross-validation experiment, using the same pipeline. - var cvResults = mlContext.MulticlassClassification.CrossValidate(data, pipeline, r => r.Label, numFolds: 5); - - // The results object is an array of 5 elements. For each of the 5 folds, we have metrics, model and scored test data. - // Let's compute the average micro-accuracy. - var microAccuracies = cvResults.Select(r => r.metrics.MicroAccuracy); - Console.WriteLine(microAccuracies.Average()); - } - - [Fact] - public void MixAndMatchStaticDynamicOnIris() - => MixMatch(GetDataPath("iris.data")); - - private void MixMatch(string dataPath) - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Read the data as an IDataView. - // First, we define the loader: specify the data columns and where to find them in the text file. - var loader = mlContext.Data.CreateTextLoader(ctx => ( - // The four features of the Iris dataset. - SepalLength: ctx.LoadFloat(0), - SepalWidth: ctx.LoadFloat(1), - PetalLength: ctx.LoadFloat(2), - PetalWidth: ctx.LoadFloat(3), - // Label: kind of iris. - Label: ctx.LoadText(4) - ), - // Default separator is tab, but the dataset has comma. 
- separator: ','); - - // Read the data. - var data = loader.Load(dataPath); - - // Build the pre-processing pipeline. - var pipeline = loader.MakeNewEstimator() - .Append(r => ( - // Convert string label to a key. - Label: r.Label.ToKey(), - // Concatenate all the features together into one column 'Features'. - Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth))); - - // Now, at the time of writing, there is no static pipeline for OVA (one-versus-all). So, let's - // append the OVA learner to the dynamic pipeline. - IEstimator dynamicPipe = pipeline.AsDynamic; - - // Create a binary classification trainer. - var binaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron("Label", "Features"); - - // Append the OVA learner to the pipeline. - dynamicPipe = dynamicPipe.Append(mlContext.MulticlassClassification.Trainers.OneVersusAll(binaryTrainer)); - - // At this point, we have a choice. We could continue working with the dynamically-typed pipeline, and - // ultimately call dynamicPipe.Fit(data.AsDynamic) to get the model, or we could go back into the static world. - // Here's how we go back to the static pipeline: - var staticFinalPipe = dynamicPipe.AssertStatic(mlContext, - // Declare the shape of the input. As you can see, it's identical to the shape of the loader: - // four float features and a string label. - c => ( - SepalLength: c.R4.Scalar, - SepalWidth: c.R4.Scalar, - PetalLength: c.R4.Scalar, - PetalWidth: c.R4.Scalar, - Label: c.Text.Scalar), - // Declare the shape of the output (or a relevant subset of it). - // In our case, we care only about the predicted label column (a key type), and scores (vector of floats). - c => ( - Score: c.R4.Vector, - // Predicted label is a key backed by uint, with text values (since original labels are text). - PredictedLabel: c.KeyU4.TextValues.Scalar)) - // Convert the predicted label from key back to the original string value. 
- .Append(r => r.PredictedLabel.ToValue()); - - // Train the model in a statically typed way. - var model = staticFinalPipe.Fit(data); - - // And here is how we could've stayed in the dynamic pipeline and train that way. - dynamicPipe = dynamicPipe.Append(new KeyToValueMappingEstimator(mlContext, "PredictedLabel")); - var dynamicModel = dynamicPipe.Fit(data.AsDynamic); - - // Now 'dynamicModel', and 'model.AsDynamic' are equivalent. - } - - private class CustomerChurnInfo - { - public string CustomerID { get; set; } - public bool HasChurned { get; set; } - public string DemographicCategory { get; set; } - // Visits during last 5 days, latest to newest. - [VectorType(5)] - public float[] LastVisits { get; set; } - } - - private class IrisPrediction - { - [ColumnName("Data")] - public string PredictedClass { get; set; } - } - - private class InspectedRow - { - public bool IsOver50K { get; set; } - public string Workclass { get; set; } - public string Education { get; set; } - public string MaritalStatus { get; set; } - public string[] AllFeatures { get; set; } - } - } -} diff --git a/test/Microsoft.ML.Tests/Scenarios/GetColumnTests.cs b/test/Microsoft.ML.Tests/Scenarios/GetColumnTests.cs index 0d0e191046..4c59f177bb 100644 --- a/test/Microsoft.ML.Tests/Scenarios/GetColumnTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/GetColumnTests.cs @@ -8,7 +8,6 @@ using Microsoft.ML.Data; using Microsoft.ML.RunTests; using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; using Microsoft.ML.TestFramework; using Xunit; using Xunit.Abstractions; @@ -25,74 +24,69 @@ public GetColumnTests(ITestOutputHelper output) : base(output) public void TestGetColumn() { var path = GetDataPath(TestDatasets.breastCancer.trainFilename); - var env = new MLContext(); - var data = TextLoaderStatic.CreateLoader(env, ctx => ( - floatScalar: ctx.LoadFloat(1), - floatVector: ctx.LoadFloat(2, 6), - stringScalar: ctx.LoadText(4), - stringVector: ctx.LoadText(5, 7) - )).Load(path); + var mlContext = new 
MLContext(); + var data = mlContext.Data.LoadFromTextFile(path, new[] { + new TextLoader.Column("floatScalar", DataKind.Single, 1), + new TextLoader.Column("floatVector", DataKind.Single, 2, 6), + new TextLoader.Column("stringScalar", DataKind.String, 4), + new TextLoader.Column("stringVector", DataKind.String, 5, 7) + }); - var enum1 = data.AsDynamic.GetColumn(data.AsDynamic.Schema["floatScalar"]).ToArray(); - var enum2 = data.AsDynamic.GetColumn(data.AsDynamic.Schema["floatVector"]).ToArray(); - var enum3 = data.AsDynamic.GetColumn>(data.AsDynamic.Schema["floatVector"]).ToArray(); + var enum1 = data.GetColumn(data.Schema["floatScalar"]).ToArray(); + var enum2 = data.GetColumn(data.Schema["floatVector"]).ToArray(); + var enum3 = data.GetColumn>(data.Schema["floatVector"]).ToArray(); - var enum4 = data.AsDynamic.GetColumn(data.AsDynamic.Schema["stringScalar"]).ToArray(); - var enum5 = data.AsDynamic.GetColumn(data.AsDynamic.Schema["stringVector"]).ToArray(); + var enum4 = data.GetColumn(data.Schema["stringScalar"]).ToArray(); + var enum5 = data.GetColumn(data.Schema["stringVector"]).ToArray(); var mustFail = GetMustFail(); - mustFail(() => data.AsDynamic.GetColumn(data.AsDynamic.Schema["floatScalar"])); - mustFail(() => data.AsDynamic.GetColumn(data.AsDynamic.Schema["floatVector"])); - mustFail(() => data.AsDynamic.GetColumn(data.AsDynamic.Schema["floatScalar"])); - mustFail(() => data.AsDynamic.GetColumn(data.AsDynamic.Schema["floatScalar"])); - mustFail(() => data.AsDynamic.GetColumn(data.AsDynamic.Schema["floatScalar"])); + mustFail(() => data.GetColumn(data.Schema["floatScalar"])); + mustFail(() => data.GetColumn(data.Schema["floatVector"])); + mustFail(() => data.GetColumn(data.Schema["floatScalar"])); + mustFail(() => data.GetColumn(data.Schema["floatScalar"])); + mustFail(() => data.GetColumn(data.Schema["floatScalar"])); - // Static types. 
- var enum8 = data.GetColumn(r => r.floatScalar); - var enum9 = data.GetColumn(r => r.floatVector); - var enum10 = data.GetColumn(r => r.stringScalar); - var enum11 = data.GetColumn(r => r.stringVector); - var data1 = TextLoaderStatic.CreateLoader(env, ctx => ( - floatScalar: ctx.LoadText(1), - anotherFloatVector: ctx.LoadFloat(2, 6), - stringVector: ctx.LoadText(5, 7) - )).Load(path); + var data1 = mlContext.Data.LoadFromTextFile(path, new[] { + new TextLoader.Column("floatScalar", DataKind.String, 1), + new TextLoader.Column("anotherFloatVector", DataKind.Single, 2, 6), + new TextLoader.Column("stringVector", DataKind.String, 5, 7) + }); // Type wrong. Load float as string. - mustFail(() => data.AsDynamic.GetColumn(data1.AsDynamic.Schema["floatScalar"])); + mustFail(() => data.GetColumn(data1.Schema["floatScalar"])); // Name wrong. Load anotherFloatVector from floatVector column. - mustFail(() => data.AsDynamic.GetColumn(data1.AsDynamic.Schema["anotherFloatVector"])); + mustFail(() => data.GetColumn(data1.Schema["anotherFloatVector"])); // Index wrong. stringVector is indexed by 3 in data but 2 in data1. 
- mustFail(() => data.AsDynamic.GetColumn(data1.AsDynamic.Schema["stringVector"]).ToArray()); + mustFail(() => data.GetColumn(data1.Schema["stringVector"]).ToArray()); } [Fact] public void TestGetColumnSelectedByString() { var path = GetDataPath(TestDatasets.breastCancer.trainFilename); - var env = new MLContext(); - var data = TextLoaderStatic.CreateLoader(env, ctx => ( - floatScalar: ctx.LoadFloat(1), - floatVector: ctx.LoadFloat(2, 6), - stringScalar: ctx.LoadText(4), - stringVector: ctx.LoadText(5, 7) - )).Load(path); + var mlContext = new MLContext(); + var data = mlContext.Data.LoadFromTextFile(path, new[] { + new TextLoader.Column("floatScalar", DataKind.Single, 1), + new TextLoader.Column("floatVector", DataKind.Single, 2, 6), + new TextLoader.Column("stringScalar", DataKind.String, 4), + new TextLoader.Column("stringVector", DataKind.String, 5, 7) + }); - var enum1 = data.AsDynamic.GetColumn("floatScalar").ToArray(); - var enum2 = data.AsDynamic.GetColumn("floatVector").ToArray(); - var enum3 = data.AsDynamic.GetColumn>("floatVector").ToArray(); + var enum1 = data.GetColumn("floatScalar").ToArray(); + var enum2 = data.GetColumn("floatVector").ToArray(); + var enum3 = data.GetColumn>("floatVector").ToArray(); - var enum4 = data.AsDynamic.GetColumn("stringScalar").ToArray(); - var enum5 = data.AsDynamic.GetColumn("stringVector").ToArray(); + var enum4 = data.GetColumn("stringScalar").ToArray(); + var enum5 = data.GetColumn("stringVector").ToArray(); var mustFail = GetMustFail(); - mustFail(() => data.AsDynamic.GetColumn("floatScalar")); - mustFail(() => data.AsDynamic.GetColumn("floatVector")); - mustFail(() => data.AsDynamic.GetColumn("floatScalar")); - mustFail(() => data.AsDynamic.GetColumn("floatScalar")); - mustFail(() => data.AsDynamic.GetColumn("floatScalar")); + mustFail(() => data.GetColumn("floatScalar")); + mustFail(() => data.GetColumn("floatVector")); + mustFail(() => data.GetColumn("floatScalar")); + mustFail(() => 
data.GetColumn("floatScalar")); + mustFail(() => data.GetColumn("floatScalar")); } private static Action GetMustFail() diff --git a/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs b/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs index 41227f4242..b3838a791e 100644 --- a/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs +++ b/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs @@ -8,11 +8,9 @@ using Microsoft.ML.Data; using Microsoft.ML.Model; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.TestFramework.Attributes; using Microsoft.ML.Tools; using Microsoft.ML.Transforms; -using Microsoft.ML.Transforms.StaticPipe; using Microsoft.ML.Transforms.TensorFlow; using Xunit; using Xunit.Abstractions; @@ -139,7 +137,7 @@ void TestCommandLine() } [TensorFlowFact] - public void TestTensorFlowStatic() + public void TestTensorFlow() { var modelLocation = "cifar_model/frozen_model.pb"; @@ -149,21 +147,20 @@ public void TestTensorFlowStatic() var dataFile = GetDataPath("images/images.tsv"); var imageFolder = Path.GetDirectoryName(dataFile); - var data = TextLoaderStatic.CreateLoader(mlContext, ctx => ( - imagePath: ctx.LoadText(0), - name: ctx.LoadText(1))) - .Load(dataFile); + var data = ML.Data.LoadFromTextFile(dataFile, new[] { + new TextLoader.Column("imagePath", DataKind.String, 0), + new TextLoader.Column("name", DataKind.String, 1) + }); // Note that CamelCase column names are there to match the TF graph node names. 
- var pipe = data.MakeNewEstimator() - .Append(row => ( - row.name, - Input: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleave: true))) - .Append(row => (row.name, Output: row.Input.ApplyTensorFlowGraph(modelLocation))); + var pipe = ML.Transforms.LoadImages("Input", imageFolder, "imagePath") + .Append(ML.Transforms.ResizeImages("Input", imageHeight, imageWidth)) + .Append(ML.Transforms.ExtractPixels("Input", interleavePixelColors: true)) + .Append(ML.Model.LoadTensorFlowModel(modelLocation).ScoreTensorFlowModel("Output", "Input")); - TestEstimatorCore(pipe.AsDynamic, data.AsDynamic); + TestEstimatorCore(pipe, data); - var result = pipe.Fit(data).Transform(data).AsDynamic; + var result = pipe.Fit(data).Transform(data); result.Schema.TryGetColumnIndex("Output", out int output); using (var cursor = result.GetRowCursor(result.Schema["Output"])) { @@ -181,7 +178,7 @@ public void TestTensorFlowStatic() } [TensorFlowFact] - public void TestTensorFlowStaticWithSchema() + public void TestTensorFlowWithSchema() { const string modelLocation = "cifar_model/frozen_model.pb"; @@ -196,21 +193,20 @@ public void TestTensorFlowStaticWithSchema() var dataFile = GetDataPath("images/images.tsv"); var imageFolder = Path.GetDirectoryName(dataFile); - var data = TextLoaderStatic.CreateLoader(mlContext, ctx => ( - imagePath: ctx.LoadText(0), - name: ctx.LoadText(1))) - .Load(dataFile); + var data = ML.Data.LoadFromTextFile(dataFile, new[] { + new TextLoader.Column("imagePath", DataKind.String, 0), + new TextLoader.Column("name", DataKind.String, 1) + }); // Note that CamelCase column names are there to match the TF graph node names. 
- var pipe = data.MakeNewEstimator() - .Append(row => ( - row.name, - Input: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleave: true))) - .Append(row => (row.name, Output: row.Input.ApplyTensorFlowGraph(tensorFlowModel))); + var pipe = ML.Transforms.LoadImages("Input", imageFolder, "imagePath") + .Append(ML.Transforms.ResizeImages("Input", imageHeight, imageWidth)) + .Append(ML.Transforms.ExtractPixels("Input", interleavePixelColors: true)) + .Append(tensorFlowModel.ScoreTensorFlowModel("Output", "Input")); - TestEstimatorCore(pipe.AsDynamic, data.AsDynamic); + TestEstimatorCore(pipe, data); - var result = pipe.Fit(data).Transform(data).AsDynamic; + var result = pipe.Fit(data).Transform(data); result.Schema.TryGetColumnIndex("Output", out int output); using (var cursor = result.GetRowCursor(result.Schema["Output"])) { diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/OnlineLinearTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/OnlineLinearTests.cs index 08e889c600..bad04f7c87 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/OnlineLinearTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/OnlineLinearTests.cs @@ -4,8 +4,8 @@ using Microsoft.ML; using Microsoft.ML.Trainers; -using Microsoft.ML.StaticPipe; using Xunit; +using Microsoft.ML.Data; namespace Microsoft.ML.Tests.TrainerEstimators { @@ -16,26 +16,28 @@ public void OnlineLinearWorkout() { var dataPath = GetDataPath("breast-cancer.txt"); - var regressionData = TextLoaderStatic.CreateLoader(ML, ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10))) - .Load(dataPath); + var regressionData = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("Label", DataKind.Single, 0), + new TextLoader.Column("Features", DataKind.Single, 1, 10) + }); - var regressionPipe = regressionData.MakeNewEstimator() - .Append(r => (r.Label, Features: r.Features.Normalize())); + var regressionPipe = ML.Transforms.NormalizeMinMax("Features"); - var 
regressionTrainData = regressionPipe.Fit(regressionData).Transform(regressionData).AsDynamic; + var regressionTrainData = regressionPipe.Fit(regressionData).Transform(regressionData); var ogdTrainer = ML.Regression.Trainers.OnlineGradientDescent(); TestEstimatorCore(ogdTrainer, regressionTrainData); var ogdModel = ogdTrainer.Fit(regressionTrainData); ogdTrainer.Fit(regressionTrainData, ogdModel.Model); - var binaryData = TextLoaderStatic.CreateLoader(ML, ctx => (Label: ctx.LoadBool(0), Features: ctx.LoadFloat(1, 10))) - .Load(dataPath); + var binaryData = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("Label", DataKind.Boolean, 0), + new TextLoader.Column("Features", DataKind.Single, 1, 10) + }); - var binaryPipe = binaryData.MakeNewEstimator() - .Append(r => (r.Label, Features: r.Features.Normalize())); + var binaryPipe = ML.Transforms.NormalizeMinMax("Features"); - var binaryTrainData = binaryPipe.Fit(binaryData).Transform(binaryData).AsDynamic; + var binaryTrainData = binaryPipe.Fit(binaryData).Transform(binaryData); var apTrainer = ML.BinaryClassification.Trainers.AveragedPerceptron( new AveragedPerceptronTrainer.Options{ LearningRate = 0.5f }); TestEstimatorCore(apTrainer, binaryTrainData); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs index a5cdfddae5..aef7e19874 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/SdcaTests.cs @@ -4,7 +4,6 @@ using System.Linq; using Microsoft.ML.Data; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Trainers; using Xunit; @@ -17,10 +16,15 @@ public void SdcaWorkout() { var dataPath = GetDataPath("breast-cancer.txt"); - var data = TextLoaderStatic.CreateLoader(Env, ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10))) - .Load(dataPath).Cache(); + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("Label", DataKind.Single, 0), 
+ new TextLoader.Column("Features", DataKind.Single, 1, 10) + }); + + data = ML.Data.Cache(data); + var binaryData = ML.Transforms.Conversion.ConvertType("Label", outputKind: DataKind.Boolean) - .Fit(data.AsDynamic).Transform(data.AsDynamic); + .Fit(data).Transform(data); var binaryTrainer = ML.BinaryClassification.Trainers.SdcaLogisticRegression( new SdcaLogisticRegressionBinaryTrainer.Options { ConvergenceTolerance = 1e-2f, MaximumNumberOfIterations = 10 }); @@ -33,9 +37,9 @@ public void SdcaWorkout() var regressionTrainer = ML.Regression.Trainers.Sdca( new SdcaRegressionTrainer.Options { ConvergenceTolerance = 1e-2f, MaximumNumberOfIterations = 10 }); - TestEstimatorCore(regressionTrainer, data.AsDynamic); - var mcData = ML.Transforms.Conversion.MapValueToKey("Label").Fit(data.AsDynamic).Transform(data.AsDynamic); - + TestEstimatorCore(regressionTrainer, data); + var mcData = ML.Transforms.Conversion.MapValueToKey("Label").Fit(data).Transform(data); + var mcTrainer = ML.MulticlassClassification.Trainers.SdcaMaximumEntropy( new SdcaMaximumEntropyMulticlassTrainer.Options { ConvergenceTolerance = 1e-2f, MaximumNumberOfIterations = 10 }); TestEstimatorCore(mcTrainer, mcData); @@ -53,7 +57,7 @@ public void SdcaLogisticRegression() // Generate C# objects as training examples. var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); @@ -85,7 +89,7 @@ public void SdcaLogisticRegression() var first = rawPrediction.First(); // This is a positive example. Assert.True(first.Label); - // Positive example should have non-negative score. + // Positive example should have non-negative score. 
Assert.True(first.Score > 0); // Positive example should have high probability of belonging the positive class. Assert.InRange(first.Probability, 0.8, 1); @@ -97,7 +101,7 @@ public void SdcaLogisticRegressionWithWeight() // Generate C# objects as training examples. var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(0); @@ -151,7 +155,7 @@ public void SdcaMaximumEntropyWithWeight() // Generate C# objects as training examples. var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(0); @@ -208,7 +212,7 @@ public void SdcaSupportVectorMachine() // Generate C# objects as training examples. var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100); - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); @@ -240,7 +244,7 @@ public void SdcaSupportVectorMachine() var first = rawPrediction.First(); // This is a positive example. Assert.True(first.Label); - // Positive example should have non-negative score. + // Positive example should have non-negative score. 
Assert.True(first.Score > 0); } @@ -250,7 +254,7 @@ public void SdcaMulticlassLogisticRegression() // Generate C# objects as training examples. var rawData = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(512); - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); @@ -285,7 +289,7 @@ public void SdcaMulticlassSupportVectorMachine() // Generate C# objects as training examples. var rawData = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(512); - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. 
var mlContext = new MLContext(); diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs index 48c344dfb4..4b2947c61c 100644 --- a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs @@ -8,7 +8,6 @@ using Microsoft.ML.Data; using Microsoft.ML.Model; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Tools; using Microsoft.ML.Transforms; using Xunit; @@ -75,39 +74,31 @@ public void CategoricalHashWorkout() } [Fact] - public void CategoricalHashStatic() + public void CategoricalHash() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(ML, ctx => ( - ScalarString: ctx.LoadText(1), - VectorString: ctx.LoadText(1, 4), - SingleVectorString: ctx.LoadText(1, 1))); - var data = reader.Load(dataPath); + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("ScalarString", DataKind.String, 1), + new TextLoader.Column("VectorString", DataKind.String, 1, 4), + new TextLoader.Column("SingleVectorString", DataKind.String, new[] { new TextLoader.Range(1, 1) }) + }); var wrongCollection = new[] { new TestClass() { A = "1", B = new[] { "2", "3" }, C = new[] { "2", "3", "4" } }, new TestClass() { A = "4", B = new[] { "4", "5" }, C = new[] { "3", "4", "5" } } }; var invalidData = ML.Data.LoadFromEnumerable(wrongCollection); - var est = data.MakeNewEstimator(). - Append(row => ( - row.ScalarString, - row.VectorString, - row.SingleVectorString, - // Create a VarVector column - VarVectorString: row.ScalarString.TokenizeIntoWords())). 
- Append(row => ( - A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind), - B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind), - C: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag), - D: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Bin), - E: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bin), - F: row.VarVectorString.OneHotHashEncoding(), - // The following column and SingleVectorString are meant to test the special case of a vector that happens to be of length 1. - G: row.SingleVectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag) - )); - - TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData); + var est = ML.Transforms.Text.TokenizeIntoWords("VarVectorString", "ScalarString") + .Append(ML.Transforms.Categorical.OneHotHashEncoding("A", "ScalarString", outputKind: OneHotEncodingEstimator.OutputKind.Indicator)) + .Append(ML.Transforms.Categorical.OneHotHashEncoding("B", "VectorString", outputKind: OneHotEncodingEstimator.OutputKind.Indicator)) + .Append(ML.Transforms.Categorical.OneHotHashEncoding("C", "VectorString", outputKind: OneHotEncodingEstimator.OutputKind.Bag)) + .Append(ML.Transforms.Categorical.OneHotHashEncoding("D", "ScalarString", outputKind: OneHotEncodingEstimator.OutputKind.Binary)) + .Append(ML.Transforms.Categorical.OneHotHashEncoding("E", "VectorString", outputKind: OneHotEncodingEstimator.OutputKind.Binary)) + .Append(ML.Transforms.Categorical.OneHotHashEncoding("F", "VarVectorString", outputKind: OneHotEncodingEstimator.OutputKind.Bag)) + // The following column and SingleVectorString are meant to test the special case of a vector that happens to be of length 1. 
+ .Append(ML.Transforms.Categorical.OneHotHashEncoding("G", "SingleVectorString", outputKind: OneHotEncodingEstimator.OutputKind.Bag)); + + TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("CategoricalHash", "featurized.tsv"); - var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data).AsDynamic, 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); var view = ML.Transforms.SelectColumns("A", "B", "C", "D", "E", "F").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) ML.Data.SaveAsText(view, fs, headerRow: true, keepHidden: true); diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs index e5efd9e646..85cf591f36 100644 --- a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs @@ -8,7 +8,6 @@ using Microsoft.ML.Data; using Microsoft.ML.Model; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Tools; using Microsoft.ML.Transforms; using Xunit; @@ -151,29 +150,27 @@ public void CategoricalOneHotEncodingFromSideData() } [Fact] - public void CategoricalStatic() + public void Categorical() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(ML, ctx => ( - ScalarString: ctx.LoadText(1), - VectorString: ctx.LoadText(1, 4))); - var data = reader.Load(dataPath); + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("ScalarString", DataKind.String, 1), + new TextLoader.Column("VectorString", DataKind.String, 1, 4) + }); var wrongCollection = new[] { new TestClass() { A = 1, B = new int[2] { 2, 3 } }, new TestClass() { A = 4, B = new int[2] { 2, 4 } } }; var invalidData = ML.Data.LoadFromEnumerable(wrongCollection); - var est = data.MakeNewEstimator(). 
- Append(row => ( - A: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Ind), - B: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Ind), - C: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bag), - D: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Bin), - E: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bin) - )); + var est = ML.Transforms.Text.TokenizeIntoWords("VarVectorString", "ScalarString") + .Append(ML.Transforms.Categorical.OneHotEncoding("A", "ScalarString", outputKind: OneHotEncodingEstimator.OutputKind.Indicator)) + .Append(ML.Transforms.Categorical.OneHotEncoding("B", "VectorString", outputKind: OneHotEncodingEstimator.OutputKind.Indicator)) + .Append(ML.Transforms.Categorical.OneHotEncoding("C", "VectorString", outputKind: OneHotEncodingEstimator.OutputKind.Bag)) + .Append(ML.Transforms.Categorical.OneHotEncoding("D", "ScalarString", outputKind: OneHotEncodingEstimator.OutputKind.Binary)) + .Append(ML.Transforms.Categorical.OneHotEncoding("E", "VectorString", outputKind: OneHotEncodingEstimator.OutputKind.Binary)); - TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData); + TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("Categorical", "featurized.tsv"); - var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data).AsDynamic, 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); var view = ML.Transforms.SelectColumns("A", "B", "C", "D", "E").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) ML.Data.SaveAsText(view, fs, headerRow: true, keepHidden: true); diff --git a/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs b/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs index 
a7e8c685cf..ed7f0cd8e0 100644 --- a/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs @@ -7,7 +7,6 @@ using Microsoft.ML.Data.IO; using Microsoft.ML.Model; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Tools; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Text; @@ -27,15 +26,15 @@ public FeatureSelectionTests(ITestOutputHelper helper) public void FeatureSelectionWorkout() { string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var data = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true) - .Load(sentimentDataPath); + var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.String, 1) }, + hasHeader: true, allowQuoting: true, allowSparse: true); - var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadFloat(1)), hasHeader: true) - .Load(sentimentDataPath); + var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.Single, 1) }, + hasHeader: true, allowQuoting: true, allowSparse: true); var est = new WordBagEstimator(ML, "bag_of_words", "text") .AppendCacheCheckpoint(ML) @@ -46,7 +45,7 @@ public void FeatureSelectionWorkout() using (var ch = Env.Start("save")) { var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true }); - var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("bag_of_words_count", "bag_of_words_mi").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) @@ -61,14 +60,12 @@ public void 
FeatureSelectionWorkout() public void DropSlotsTransform() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(ML, ctx => ( - ScalarFloat: ctx.LoadFloat(1), - ScalarDouble: ctx.LoadDouble(1), - VectorFloat: ctx.LoadFloat(1, 4), - VectorDouble: ctx.LoadDouble(4, 8) - )); - - var data = reader.Load(new MultiFileSource(dataPath)).AsDynamic; + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("ScalarFloat", DataKind.Single, 1), + new TextLoader.Column("ScalarDouble", DataKind.Double, 1), + new TextLoader.Column("VectorFloat", DataKind.Single, 1, 4), + new TextLoader.Column("VectorDouble", DataKind.Double, 4, 8), + }); var columns = new[] { @@ -104,13 +101,12 @@ public void TestDropSlotsSelectionCommandLine() public void CountFeatureSelectionWorkout() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(ML, ctx => ( - ScalarFloat: ctx.LoadFloat(6), - VectorFloat: ctx.LoadFloat(1, 4), - VectorDouble: ctx.LoadDouble(4, 8) - )); - var data = ML.Data.Cache(reader.Load(new MultiFileSource(dataPath)).AsDynamic); + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("ScalarFloat", DataKind.Single, 6), + new TextLoader.Column("VectorFloat", DataKind.Single, 1, 4), + new TextLoader.Column("VectorDouble", DataKind.Double, 4, 8), + }); var columns = new[] { new CountFeatureSelectingEstimator.ColumnOptions("FeatureSelectDouble", "VectorDouble", count: 1), @@ -147,12 +143,10 @@ public void TestCountFeatureSelectionCommandLine() public void TestCountSelectOldSavingAndLoading() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(ML, ctx => ( - Label: ctx.LoadKey(0, 3), - VectorFloat: ctx.LoadFloat(1, 4) - )); - - var dataView = reader.Load(new MultiFileSource(dataPath)).AsDynamic; + var dataView = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("Label", DataKind.UInt32, 
new[]{ new TextLoader.Range(0) }, new KeyCount(3)), + new TextLoader.Column("VectorFloat", DataKind.Single, 1, 4) + }); var pipe = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("FeatureSelect", "VectorFloat", count: 1); @@ -171,14 +165,12 @@ public void TestCountSelectOldSavingAndLoading() public void MutualInformationSelectionWorkout() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(ML, ctx => ( - Label: ctx.LoadKey(0, 3), - ScalarFloat: ctx.LoadFloat(6), - VectorFloat: ctx.LoadFloat(1, 4), - VectorDouble: ctx.LoadDouble(4, 8) - )); - - var data = reader.Load(new MultiFileSource(dataPath)).AsDynamic; + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("Label", DataKind.UInt32, new[] { new TextLoader.Range(0) }, new KeyCount(3)), + new TextLoader.Column("ScalarFloat", DataKind.Single, 6), + new TextLoader.Column("VectorFloat", DataKind.Single, 1, 4), + new TextLoader.Column("VectorDouble", DataKind.Double, 4, 8), + }); var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumnName: "Label") .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(labelColumnName: "Label", slotsInOutput: 2, numberOfBins: 100, @@ -211,12 +203,10 @@ public void TestMutualInformationFeatureSelectionCommandLine() public void TestMutualInformationOldSavingAndLoading() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(ML, ctx => ( - Label: ctx.LoadKey(0, 3), - VectorFloat: ctx.LoadFloat(1, 4) - )); - - var dataView = reader.Load(new MultiFileSource(dataPath)).AsDynamic; + var dataView = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("Label", DataKind.UInt32, new[]{ new TextLoader.Range(0) }, new KeyCount(3)), + new TextLoader.Column("VectorFloat", DataKind.Single, 1, 4) + }); var pipe = 
ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumnName: "Label"); diff --git a/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs b/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs index 4dfe700764..da4f78f412 100644 --- a/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs +++ b/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs @@ -8,7 +8,6 @@ using Microsoft.ML.Data; using Microsoft.ML.Model; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Tools; using Microsoft.ML.Transforms; using Xunit; @@ -57,32 +56,24 @@ public void KeyToBinaryVectorWorkout() } [Fact] - public void KeyToBinaryVectorStatic() + public void KeyToBinaryVector() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(Env, ctx => ( - ScalarString: ctx.LoadText(1), - VectorString: ctx.LoadText(1, 4) - )); - - var data = reader.Load(dataPath); + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("ScalarString", DataKind.String, 0), + new TextLoader.Column("VectorString", DataKind.String, 1, 4), + }); - // Non-pigsty Term. 
- var dynamicData = new ValueToKeyMappingEstimator(Env, new[] { + var transformedData = new ValueToKeyMappingEstimator(Env, new[] { new ValueToKeyMappingEstimator.ColumnOptions("A", "ScalarString"), new ValueToKeyMappingEstimator.ColumnOptions("B", "VectorString") }) - .Fit(data.AsDynamic).Transform(data.AsDynamic); + .Fit(data).Transform(data); - var data2 = dynamicData.AssertStatic(Env, ctx => ( - A: ctx.KeyU4.TextValues.Scalar, - B: ctx.KeyU4.TextValues.Vector)); - var est = data2.MakeNewEstimator() - .Append(row => ( - ScalarString: row.A.ToBinaryVector(), - VectorString: row.B.ToBinaryVector())); + var est = ML.Transforms.Conversion.MapKeyToBinaryVector("ScalarString", "A") + .Append(ML.Transforms.Conversion.MapKeyToBinaryVector("VectorString", "B")); - TestEstimatorCore(est.AsDynamic, data2.AsDynamic, invalidInput: data.AsDynamic); + TestEstimatorCore(est, transformedData, invalidInput: data); Done(); } diff --git a/test/Microsoft.ML.Tests/Transformers/KeyToValueTests.cs b/test/Microsoft.ML.Tests/Transformers/KeyToValueTests.cs index ee1438b3f5..95726789fd 100644 --- a/test/Microsoft.ML.Tests/Transformers/KeyToValueTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/KeyToValueTests.cs @@ -6,7 +6,6 @@ using Microsoft.ML.Data; using Microsoft.ML.Data.IO; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Transforms; using Xunit; using Xunit.Abstractions; @@ -62,36 +61,27 @@ public void KeyToValueWorkout() } [Fact] - public void KeyToValuePigsty() + public void KeyToValue() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(Env, ctx => ( - ScalarString: ctx.LoadText(1), - VectorString: ctx.LoadText(1, 4) - )); - - var data = reader.Load(dataPath); + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("ScalarString", DataKind.String, 0), + new TextLoader.Column("VectorString", DataKind.String, 1, 4), + }); - // Non-pigsty Term. 
- var dynamicData = new ValueToKeyMappingEstimator(Env, new[] { + var transformedData = new ValueToKeyMappingEstimator(Env, new[] { new ValueToKeyMappingEstimator.ColumnOptions("A", "ScalarString"), new ValueToKeyMappingEstimator.ColumnOptions("B", "VectorString") }) - .Fit(data.AsDynamic).Transform(data.AsDynamic); - - var data2 = dynamicData.AssertStatic(Env, ctx => ( - A: ctx.KeyU4.TextValues.Scalar, - B: ctx.KeyU4.TextValues.Vector)); + .Fit(data).Transform(data); - var est = data2.MakeNewEstimator() - .Append(row => ( - ScalarString: row.A.ToValue(), - VectorString: row.B.ToValue())); + var est = ML.Transforms.Conversion.MapKeyToValue("ScalarString", "A") + .Append(ML.Transforms.Conversion.MapKeyToValue("VectorString", "B")); - TestEstimatorCore(est.AsDynamic, data2.AsDynamic, invalidInput: data.AsDynamic); + TestEstimatorCore(est, transformedData, invalidInput: data); - var data2Transformed = est.Fit(data2).Transform(data2).AsDynamic; + var data2Transformed = est.Fit(transformedData).Transform(transformedData); // Check that term and ToValue are round-trippable. 
- var dataLeft = ML.Transforms.SelectColumns(new[] { "ScalarString", "VectorString" }).Fit(data.AsDynamic).Transform(data.AsDynamic); + var dataLeft = ML.Transforms.SelectColumns(new[] { "ScalarString", "VectorString" }).Fit(data).Transform(data); var dataRight = ML.Transforms.SelectColumns(new[] { "ScalarString", "VectorString" }).Fit(data2Transformed).Transform(data2Transformed); CheckSameSchemas(dataLeft.Schema, dataRight.Schema); diff --git a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs index 50bacedc27..e60f09074e 100644 --- a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs @@ -8,7 +8,6 @@ using Microsoft.ML.Data; using Microsoft.ML.Model; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Tools; using Microsoft.ML.Transforms; using Xunit; @@ -66,34 +65,24 @@ public void KeyToVectorWorkout() } [Fact] - public void KeyToVectorStatic() + public void KeyToVector() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(Env, ctx => ( - ScalarString: ctx.LoadText(1), - VectorString: ctx.LoadText(1, 4) - )); - - var data = reader.Load(dataPath); + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("ScalarString", DataKind.String, 0), + new TextLoader.Column("VectorString", DataKind.String, 1, 4), + }); - // Non-pigsty Term. 
- var dynamicData = new ValueToKeyMappingEstimator(Env, new[] { + var transformedData = new ValueToKeyMappingEstimator(Env, new[] { new ValueToKeyMappingEstimator.ColumnOptions("A", "ScalarString"), new ValueToKeyMappingEstimator.ColumnOptions("B", "VectorString") }) - .Fit(data.AsDynamic).Transform(data.AsDynamic); - - var data2 = dynamicData.AssertStatic(Env, ctx => ( - A: ctx.KeyU4.TextValues.Scalar, - B: ctx.KeyU4.TextValues.Vector)); + .Fit(data).Transform(data); - var est = data2.MakeNewEstimator() - .Append(row => ( - ScalarString: row.A.ToVector(), - VectorString: row.B.ToVector(), - VectorBaggedString: row.B.ToBaggedVector() - )); + var est = ML.Transforms.Conversion.MapKeyToVector("ScalarString", "A") + .Append(ML.Transforms.Conversion.MapKeyToVector("VectorString", "B")) + .Append(ML.Transforms.Conversion.MapKeyToVector("VectorBaggedString", "B", true)); - TestEstimatorCore(est.AsDynamic, data2.AsDynamic, invalidInput: data.AsDynamic); + TestEstimatorCore(est, transformedData, invalidInput: data); Done(); } diff --git a/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs b/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs index a5bb6f7c1c..c555993c3c 100644 --- a/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs @@ -9,7 +9,6 @@ using Microsoft.ML.Model; using Microsoft.ML.RunTests; using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Tools; using Xunit; using Xunit.Abstractions; @@ -92,17 +91,16 @@ public void TestOldSavingAndLoading() public void NAIndicatorFileOutput() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(ML, ctx => ( - ScalarFloat: ctx.LoadFloat(1), - ScalarDouble: ctx.LoadDouble(1), - VectorFloat: ctx.LoadFloat(1, 4), - VectorDoulbe: ctx.LoadDouble(1, 4) - )); + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("ScalarFloat", DataKind.Single, 1), + new 
TextLoader.Column("ScalarDouble", DataKind.Double, 1), + new TextLoader.Column("VectorFloat", DataKind.Single, 1, 4), + new TextLoader.Column("VectorDoulbe", DataKind.Double, 1, 4) + }); - var data = reader.Load(new MultiFileSource(dataPath)).AsDynamic; var wrongCollection = new[] { new TestClass() { A = 1, B = 3, C = new float[2] { 1, 2 }, D = new double[2] { 3, 4 } } }; var invalidData = ML.Data.LoadFromEnumerable(wrongCollection); - var est = ML.Transforms.IndicateMissingValues(new[] + var est = ML.Transforms.IndicateMissingValues(new[] { new InputOutputColumnPair("A", "ScalarFloat"), new InputOutputColumnPair("B", "ScalarDouble"), diff --git a/test/Microsoft.ML.Tests/Transformers/NAReplaceTests.cs b/test/Microsoft.ML.Tests/Transformers/NAReplaceTests.cs index fc55126dc6..edfdc3fedc 100644 --- a/test/Microsoft.ML.Tests/Transformers/NAReplaceTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/NAReplaceTests.cs @@ -6,7 +6,6 @@ using Microsoft.ML.Data; using Microsoft.ML.Model; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Tools; using Microsoft.ML.Transforms; using Xunit; @@ -52,32 +51,28 @@ public void NAReplaceWorkout() } [Fact] - public void NAReplaceStatic() + public void NAReplace() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(ML, ctx => ( - ScalarFloat: ctx.LoadFloat(1), - ScalarDouble: ctx.LoadDouble(1), - VectorFloat: ctx.LoadFloat(1, 4), - VectorDoulbe: ctx.LoadDouble(1, 4) - )); + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("ScalarFloat", DataKind.Single, 1), + new TextLoader.Column("ScalarDouble", DataKind.Double, 1), + new TextLoader.Column("VectorFloat", DataKind.Single, 1, 4), + new TextLoader.Column("VectorDoulbe", DataKind.Double, 1, 4) + }); - var data = reader.Load(dataPath); var wrongCollection = new[] { new TestClass() { A = 1, B = 3, C = new float[2] { 1, 2 }, D = new double[2] { 3, 4 } } }; var invalidData = 
ML.Data.LoadFromEnumerable(wrongCollection); - var est = data.MakeNewEstimator(). - Append(row => ( - A: row.ScalarFloat.ReplaceNaNValues(MissingValueReplacingEstimator.ReplacementMode.Maximum), - B: row.ScalarDouble.ReplaceNaNValues(MissingValueReplacingEstimator.ReplacementMode.Mean), - C: row.VectorFloat.ReplaceNaNValues(MissingValueReplacingEstimator.ReplacementMode.Mean), - D: row.VectorDoulbe.ReplaceNaNValues(MissingValueReplacingEstimator.ReplacementMode.Minimum) - )); + var est = ML.Transforms.ReplaceMissingValues("A", "ScalarFloat", replacementMode: MissingValueReplacingEstimator.ReplacementMode.Maximum) + .Append(ML.Transforms.ReplaceMissingValues("B", "ScalarDouble", replacementMode: MissingValueReplacingEstimator.ReplacementMode.Mean)) + .Append(ML.Transforms.ReplaceMissingValues("C", "VectorFloat", replacementMode: MissingValueReplacingEstimator.ReplacementMode.Mean)) + .Append(ML.Transforms.ReplaceMissingValues("D", "VectorDoulbe", replacementMode: MissingValueReplacingEstimator.ReplacementMode.Minimum)); - TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData); + TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("NAReplace", "featurized.tsv"); - var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data).AsDynamic, 4); - var view = ML.Transforms.SelectColumns("A", "B", "C", "D" ).Fit(savedData).Transform(savedData); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); + var view = ML.Transforms.SelectColumns("A", "B", "C", "D").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) ML.Data.SaveAsText(view, fs, headerRow: true, keepHidden: true); diff --git a/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs b/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs index b5c2ce13e6..6329f7b102 100644 --- a/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs @@ -6,13 +6,11 @@ using 
System.Collections.Generic; using System.Collections.Immutable; using System.IO; -using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Data.IO; using Microsoft.ML.Experimental; using Microsoft.ML.Model; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.TestFramework.Attributes; using Microsoft.ML.Tools; using Microsoft.ML.Transforms; @@ -562,26 +560,26 @@ public void NormalizerExperimentalExtensionGetColumnPairs() public void LpGcNormAndWhiteningWorkout() { string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var data = TextLoaderStatic.CreateLoader(ML, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true) - .Load(dataSource); + var data = ML.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.Single, 0, 10) + }, hasHeader: true, separatorChar: ';'); - var invalidData = TextLoaderStatic.CreateLoader(ML, - c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)), - separator: ';', hasHeader: true) - .Load(dataSource); + var invalidData = ML.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.String, 0, 10) + }, hasHeader: true, separatorChar: ';'); var est = ML.Transforms.NormalizeLpNorm("lpnorm", "features") .Append(ML.Transforms.NormalizeGlobalContrast("gcnorm", "features")) .Append(new VectorWhiteningEstimator(ML, "whitened", "features")); - TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv"); using (var ch = Env.Start("save")) { var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false }); - var savedData = 
ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("lpnorm", "gcnorm", "whitened").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) @@ -596,25 +594,26 @@ public void LpGcNormAndWhiteningWorkout() public void WhiteningWorkout() { string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var data = TextLoaderStatic.CreateLoader(ML, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true) - .Load(dataSource); + var data = ML.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.Single, 0, 10) + }, hasHeader: true, separatorChar: ';'); + + var invalidData = ML.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.String, 0, 10) + }, hasHeader: true, separatorChar: ';'); - var invalidData = TextLoaderStatic.CreateLoader(ML, - c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)), - separator: ';', hasHeader: true) - .Load(dataSource); var est = new VectorWhiteningEstimator(ML, "whitened1", "features") .Append(new VectorWhiteningEstimator(ML, "whitened2", "features", kind: WhiteningKind.PrincipalComponentAnalysis, rank: 5)); - TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("NormalizerEstimator", "whitened.tsv"); using (var ch = Env.Start("save")) { var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false }); - var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = 
ML.Transforms.SelectColumns("whitened1", "whitened2").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) @@ -637,10 +636,11 @@ public void TestWhiteningCommandLine() public void TestWhiteningOldSavingAndLoading() { string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var dataView = TextLoaderStatic.CreateLoader(ML, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true) - .Load(dataSource).AsDynamic; + var dataView = ML.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.Single, 0, 10) + }, hasHeader: true, separatorChar: ';'); + var pipe = new VectorWhiteningEstimator(ML, "whitened", "features"); var result = pipe.Fit(dataView).Transform(dataView); @@ -658,25 +658,25 @@ public void TestWhiteningOldSavingAndLoading() public void LpNormWorkout() { string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var data = TextLoaderStatic.CreateLoader(ML, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true) - .Load(dataSource); + var data = ML.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.Single, 0, 10) + }, hasHeader: true, separatorChar: ';'); - var invalidData = TextLoaderStatic.CreateLoader(ML, - c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)), - separator: ';', hasHeader: true) - .Load(dataSource); + var invalidData = ML.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.String, 0, 10) + }, hasHeader: true, separatorChar: ';'); var est = ML.Transforms.NormalizeLpNorm("lpNorm1", "features") .Append(ML.Transforms.NormalizeLpNorm("lpNorm2", "features", norm: LpNormNormalizingEstimatorBase.NormFunction.L1, 
ensureZeroMean: true)); - TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("NormalizerEstimator", "lpNorm.tsv"); using (var ch = Env.Start("save")) { var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false }); - var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("lpNorm1", "lpNorm2").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) @@ -697,10 +697,11 @@ public void TestLpNormCommandLine() public void TestLpNormOldSavingAndLoading() { string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var dataView = TextLoaderStatic.CreateLoader(ML, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true) - .Load(dataSource).AsDynamic; + var dataView = ML.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.Single, 0, 10) + }, hasHeader: true, separatorChar: ';'); + var pipe = ML.Transforms.NormalizeLpNorm("whitened", "features"); var result = pipe.Fit(dataView).Transform(dataView); @@ -717,25 +718,25 @@ public void TestLpNormOldSavingAndLoading() public void GcnWorkout() { string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var data = TextLoaderStatic.CreateLoader(ML, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true) - .Load(dataSource); + var data = ML.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.Single, 0, 10) + }, hasHeader: true, separatorChar: ';'); - var invalidData = TextLoaderStatic.CreateLoader(ML, - c 
=> (label: c.LoadFloat(11), features: c.LoadText(0, 10)), - separator: ';', hasHeader: true) - .Load(dataSource); + var invalidData = ML.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.String, 0, 10) + }, hasHeader: true, separatorChar: ';'); var est = ML.Transforms.NormalizeGlobalContrast("gcnNorm1", "features") .Append(ML.Transforms.NormalizeGlobalContrast("gcnNorm2", "features", ensureZeroMean: false, ensureUnitStandardDeviation: true, scale: 3)); - TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("NormalizerEstimator", "gcnNorm.tsv"); using (var ch = Env.Start("save")) { var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false }); - var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("gcnNorm1", "gcnNorm2").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) @@ -756,10 +757,11 @@ public void TestGcnNormCommandLine() public void TestGcnNormOldSavingAndLoading() { string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - var dataView = TextLoaderStatic.CreateLoader(ML, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true) - .Load(dataSource).AsDynamic; + var dataView = ML.Data.LoadFromTextFile(dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.Single, 0, 10) + }, hasHeader: true, separatorChar: ';'); + var pipe = ML.Transforms.NormalizeGlobalContrast("whitened", "features"); var result = pipe.Fit(dataView).Transform(dataView); diff --git a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs 
b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs index 2cee00a560..aae2441f2a 100644 --- a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs @@ -3,9 +3,9 @@ // See the LICENSE file in the project root for more information. using System.IO; +using Microsoft.ML.Data; using Microsoft.ML.Data.IO; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Xunit; using Xunit.Abstractions; @@ -26,21 +26,23 @@ public PcaTests(ITestOutputHelper helper) [Fact] public void PcaWorkout() { - var data = TextLoaderStatic.CreateLoader(_env, - c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadFloat(1, 10)), - separator: ';', hasHeader: true) - .Load(_dataSource); + var data = ML.Data.LoadFromTextFile(_dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("weight", DataKind.Single, 0), + new TextLoader.Column("features", DataKind.Single, 1, 10) + }, hasHeader: true, separatorChar: ';'); - var invalidData = TextLoaderStatic.CreateLoader(_env, - c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadText(1, 10)), - separator: ';', hasHeader: true) - .Load(_dataSource); + var invalidData = ML.Data.LoadFromTextFile(_dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("weight", DataKind.Single, 0), + new TextLoader.Column("features", DataKind.String, 1, 10) + }, hasHeader: true, separatorChar: ';'); var est = ML.Transforms.ProjectToPrincipalComponents("pca", "features", rank: 4, seed: 10); - TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + TestEstimatorCore(est, data, invalidInput: invalidData); var estNonDefaultArgs = ML.Transforms.ProjectToPrincipalComponents("pca", "features", rank: 3, exampleWeightColumnName: "weight", overSampling: 2, ensureZeroMean: false); - TestEstimatorCore(estNonDefaultArgs, data.AsDynamic, invalidInput: invalidData.AsDynamic); + 
TestEstimatorCore(estNonDefaultArgs, data, invalidInput: invalidData); Done(); } @@ -48,14 +50,14 @@ public void PcaWorkout() [Fact] public void TestPcaEstimator() { - var data = TextLoaderStatic.CreateLoader(ML, - c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), - separator: ';', hasHeader: true) - .Load(_dataSource); + var data = ML.Data.LoadFromTextFile(_dataSource, new[] { + new TextLoader.Column("label", DataKind.Single, 11), + new TextLoader.Column("features", DataKind.Single, 0, 10) + }, hasHeader: true, separatorChar: ';'); var est = ML.Transforms.ProjectToPrincipalComponents("pca", "features", rank: 5, seed: 1); var outputPath = GetOutputPath("PCA", "pca.tsv"); - var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("pca").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) diff --git a/test/Microsoft.ML.Tests/Transformers/RffTests.cs b/test/Microsoft.ML.Tests/Transformers/RffTests.cs index a594508575..88ed2ad98f 100644 --- a/test/Microsoft.ML.Tests/Transformers/RffTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/RffTests.cs @@ -8,7 +8,6 @@ using Microsoft.ML.Data; using Microsoft.ML.Model; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Tools; using Microsoft.ML.Transforms; using Xunit; @@ -61,24 +60,20 @@ public void RffWorkout() } [Fact] - public void RffStatic() + public void ApproximateKernelMap() { string dataPath = GetDataPath("breast-cancer.txt"); - var reader = TextLoaderStatic.CreateLoader(ML, ctx => ( - VectorFloat: ctx.LoadFloat(1, 8), - Label: ctx.LoadFloat(0) - )); + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("VectorFloat", DataKind.Single, 1, 8), + new TextLoader.Column("Label", DataKind.Single, 0) + }); - var data = reader.Load(dataPath); + var est = 
ML.Transforms.ApproximatedKernelMap("RffVectorFloat", "VectorFloat", 3, true); - var est = data.MakeNewEstimator() - .Append(row => ( - RffVectorFloat: row.VectorFloat.ApproximatedKernelMap(3, true), row.Label)); - - TestEstimatorCore(est.AsDynamic, data.AsDynamic); + TestEstimatorCore(est, data); var outputPath = GetOutputPath("Rff", "featurized.tsv"); - var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data).AsDynamic, 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); using (var fs = File.Create(outputPath)) ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true); CheckEquality("Rff", "featurized.tsv"); diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index 8bff1c556f..c648937e9c 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -6,13 +6,11 @@ using System.IO; using System.Linq; using System.Text.RegularExpressions; -using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Data.IO; using Microsoft.ML.Model; using Microsoft.ML.RunTests; using Microsoft.ML.Runtime; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Tools; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Text; @@ -333,27 +331,25 @@ public void TextFeaturizerWithKeepDiacriticsTest() public void TextFeaturizerWorkout() { string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var data = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true) - .Load(sentimentDataPath); + var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.String, 1) }, + hasHeader: true, allowQuoting: true); - var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: 
ctx.LoadFloat(1)), hasHeader: true) - .Load(sentimentDataPath) - .AsDynamic; + var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.Single, 1) }, + hasHeader: true, allowQuoting: true); - var feat = data.MakeNewEstimator() - .Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options { OutputTokensColumnName = "OutputTokens", })); + var feat = ML.Transforms.Text.FeaturizeText("Data", new TextFeaturizingEstimator.Options { OutputTokensColumnName = "OutputTokens" }, new[] { "text" }); - TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData); + TestEstimatorCore(feat, data, invalidInput: invalidData); var outputPath = GetOutputPath("Text", "featurized.tsv"); using (var ch = ((IHostEnvironment)ML).Start("save")) { var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true }); - var savedData = ML.Data.TakeRows(feat.Fit(data).Transform(data).AsDynamic, 4); + var savedData = ML.Data.TakeRows(feat.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("Data", "OutputTokens").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) @@ -368,23 +364,23 @@ public void TextFeaturizerWorkout() public void TextTokenizationWorkout() { string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var data = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true) - .Load(sentimentDataPath); + var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.String, 1) }, + hasHeader: true); - var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadFloat(1)), hasHeader: true) - .Load(sentimentDataPath); + var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, 
new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.Single, 1) }, + hasHeader: true); var est = new WordTokenizingEstimator(ML, "words", "text") .Append(new TokenizingByCharactersEstimator(ML, "chars", "text")) .Append(new KeyToValueMappingEstimator(ML, "chars")); - TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("Text", "tokenized.tsv"); - var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("text", "words", "chars").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) @@ -398,10 +394,10 @@ public void TextTokenizationWorkout() public void TokenizeWithSeparators() { string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var data = TextLoaderStatic.CreateLoader(Env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true) - .Load(dataPath).AsDynamic; + var data = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.String, 1) }, + hasHeader: true); var est = new WordTokenizingEstimator(Env, "words", "text", separators: new[] { ' ', '?', '!', '.', ',' }); var outdata = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); @@ -435,24 +431,25 @@ public void TokenizeWithSeparatorCommandLine() public void TextNormalizationAndStopwordRemoverWorkout() { string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var data = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true) - .Load(sentimentDataPath); - - var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadFloat(1)), hasHeader: true) - 
.Load(sentimentDataPath); + var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.String, 1) }, + hasHeader: true); + + var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.Single, 1) }, + hasHeader: true); + var est = ML.Transforms.Text.NormalizeText("text") .Append(ML.Transforms.Text.TokenizeIntoWords("words", "text")) .Append(ML.Transforms.Text.RemoveDefaultStopWords("NoDefaultStopwords", "words")) .Append(ML.Transforms.Text.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", "a", "the", "THAT", "bY")); - TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv"); - var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("text", "NoDefaultStopwords", "NoStopWords").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true); @@ -497,15 +494,15 @@ public void StopWordsRemoverFromFactory() public void WordBagWorkout() { string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var data = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true) - .Load(sentimentDataPath); + var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.String, 1) }, + hasHeader: true, allowQuoting: true); - var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadFloat(1)), 
hasHeader: true) - .Load(sentimentDataPath); + var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.Single, 1) }, + hasHeader: true, allowQuoting: true); var est = new WordBagEstimator(ML, "bag_of_words", "text"). Append(new WordHashBagEstimator(ML, "bag_of_wordshash", "text", maximumNumberOfInverts: -1)); @@ -515,7 +512,7 @@ public void WordBagWorkout() // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); var outputPath = GetOutputPath("Text", "bag_of_words.tsv"); - var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("text", "bag_of_words", "bag_of_wordshash").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) @@ -529,15 +526,15 @@ public void WordBagWorkout() public void NgramWorkout() { string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var data = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true) - .Load(sentimentDataPath); + var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.String, 1) }, + hasHeader: true, allowQuoting: true); - var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadFloat(1)), hasHeader: true) - .Load(sentimentDataPath); + var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.Single, 1) }, + hasHeader: true, allowQuoting: true); var est = new WordTokenizingEstimator(ML, "text", "text") .Append(new ValueToKeyMappingEstimator(ML, "terms", "text")) @@ -547,10 +544,10 @@ public void 
NgramWorkout() // the original non-inverted column to the actual baseline file. .Append(new NgramHashingEstimator(ML, "ngramshashinvert", "terms", maximumNumberOfInverts: 2)); - TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); + TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("Text", "ngrams.tsv"); - var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); + var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("text", "terms", "ngrams", "ngramshash").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) @@ -565,13 +562,14 @@ void TestNgramCompatColumns() { string dropModelPath = GetDataPath("backcompat/ngram.zip"); string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var data = TextLoaderStatic.CreateLoader(ML, ctx => ( - Sentiment: ctx.LoadBool(0), - SentimentText: ctx.LoadText(1)), hasHeader: true) - .Load(sentimentDataPath); + + var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("Sentiment", DataKind.Boolean, 0), + new TextLoader.Column("SentimentText", DataKind.String, 1) }, + hasHeader: true, allowQuoting: true); using (FileStream fs = File.OpenRead(dropModelPath)) { - var result = ModelFileUtils.LoadTransforms(Env, data.AsDynamic, fs); + var result = ModelFileUtils.LoadTransforms(Env, data, fs); var featureColumn = result.Schema.GetColumnOrNull("Features"); Assert.NotNull(featureColumn); } @@ -582,15 +580,15 @@ public void LdaWorkout() { IHostEnvironment env = new MLContext(seed: 42); string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var data = TextLoaderStatic.CreateLoader(env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true) - .Load(sentimentDataPath); + var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", 
DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.String, 1) }, + hasHeader: true); - var invalidData = TextLoaderStatic.CreateLoader(env, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadFloat(1)), hasHeader: true) - .Load(sentimentDataPath); + var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.Single, 1) }, + hasHeader: true); var est = new WordBagEstimator(env, "bag_of_words", "text"). Append(new LatentDirichletAllocationEstimator(env, "topics", "bag_of_words", 10, maximumNumberOfIterations: 10, @@ -605,8 +603,8 @@ public void LdaWorkout() using (var ch = env.Start("save")) { var saver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false, Dense = true }); - var transformer = est.Fit(data.AsDynamic); - var transformedData = transformer.Transform(data.AsDynamic); + var transformer = est.Fit(data); + var transformedData = transformer.Transform(data); var savedData = ML.Data.TakeRows(transformedData, 4); savedData = ML.Transforms.SelectColumns("topics").Fit(savedData).Transform(savedData); diff --git a/test/Microsoft.ML.Tests/Transformers/TextNormalizer.cs b/test/Microsoft.ML.Tests/Transformers/TextNormalizer.cs index bc685e5b58..93b7f0612e 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextNormalizer.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextNormalizer.cs @@ -6,7 +6,6 @@ using Microsoft.ML.Data; using Microsoft.ML.Model; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Microsoft.ML.Tools; using Microsoft.ML.Transforms.Text; using Xunit; @@ -49,11 +48,10 @@ public void TextNormalizerWorkout() TestEstimatorCore(pipe, dataView, invalidInput: invalidDataView); var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); - var reader = TextLoaderStatic.CreateLoader(ML, ctx => ( - label: ctx.LoadBool(0), - text: ctx.LoadText(1)), hasHeader: true); - var dataSource = new 
MultiFileSource(dataPath); - dataView = reader.Load(dataSource).AsDynamic; + dataView = ML.Data.LoadFromTextFile(dataPath, new[] { + new TextLoader.Column("label", DataKind.Boolean, 0), + new TextLoader.Column("text", DataKind.String, 1) + }, hasHeader: true); var pipeVariations = new TextNormalizingEstimator(ML, columns: new[] { ("NormText", "text") }).Append( new TextNormalizingEstimator(ML, caseMode: TextNormalizingEstimator.CaseMode.Upper, columns: new[] { ("UpperText", "text") })).Append( diff --git a/test/Microsoft.ML.TimeSeries.Tests/Microsoft.ML.TimeSeries.Tests.csproj b/test/Microsoft.ML.TimeSeries.Tests/Microsoft.ML.TimeSeries.Tests.csproj index 218978ae86..4ced31bcb9 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/Microsoft.ML.TimeSeries.Tests.csproj +++ b/test/Microsoft.ML.TimeSeries.Tests/Microsoft.ML.TimeSeries.Tests.csproj @@ -5,7 +5,6 @@ - @@ -13,6 +12,6 @@ - + diff --git a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesStaticTests.cs b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesSimpleApiTests.cs similarity index 76% rename from test/Microsoft.ML.TimeSeries.Tests/TimeSeriesStaticTests.cs rename to test/Microsoft.ML.TimeSeries.Tests/TimeSeriesSimpleApiTests.cs index da85ad5be4..2ca132745e 100644 --- a/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesStaticTests.cs +++ b/test/Microsoft.ML.TimeSeries.Tests/TimeSeriesSimpleApiTests.cs @@ -5,34 +5,27 @@ using System.Collections.Generic; using Microsoft.ML.Data; using Microsoft.ML.RunTests; -using Microsoft.ML.StaticPipe; using Xunit; using Xunit.Abstractions; namespace Microsoft.ML.Tests { - public sealed class TimeSeriesStaticTests : BaseTestBaseline + public sealed class TimeSeriesSimpleApiTests : BaseTestBaseline { - public TimeSeriesStaticTests(ITestOutputHelper output) : base(output) + public TimeSeriesSimpleApiTests(ITestOutputHelper output) : base(output) { } -#pragma warning disable CS0649 // Ignore unintialized field warning private sealed class ChangePointPrediction { - // Note that this field must 
be named "Data"; we ultimately convert - // to a dynamic IDataView in order to extract AsEnumerable - // predictions and that process uses "Data" as the default column - // name for an output column from a static pipeline. [VectorType(4)] - public double[] Data; + public double[] Data { get; set; } } private sealed class SpikePrediction { [VectorType(3)] - public double[] Data; + public double[] Data { get; set; } } -#pragma warning restore CS0649 private sealed class Data { @@ -54,18 +47,16 @@ public void ChangeDetection() for (int i = 0; i < Size / 2; i++) data.Add(new Data((float)(5 + i * 1.1))); - // Convert to statically-typed data view. - var staticData = dataView.AssertStatic(env, c => new { Value = c.R4.Scalar }); // Build the pipeline - var staticLearningPipeline = staticData.MakeNewEstimator() - .Append(r => r.Value.DetectIidChangePoint(80, Size)); + var learningPipeline = ML.Transforms.DetectIidChangePoint("Data", "Value", 80, Size); + // Train - var detector = staticLearningPipeline.Fit(staticData); + var detector = learningPipeline.Fit(dataView); // Transform - var output = detector.Transform(staticData); + var output = detector.Transform(dataView); // Get predictions - var enumerator = env.Data.CreateEnumerable(output.AsDynamic, true).GetEnumerator(); + var enumerator = env.Data.CreateEnumerable(output, true).GetEnumerator(); ChangePointPrediction row = null; List expectedValues = new List() { 0, 5, 0.5, 5.1200000000000114E-08, 0, 5, 0.4999999995, 5.1200000046080209E-08, 0, 5, 0.4999999995, 5.1200000092160303E-08, 0, 5, 0.4999999995, 5.12000001382404E-08}; @@ -100,18 +91,15 @@ public void ChangePointDetectionWithSeasonality() for (int i = 0; i < ChangeHistorySize; i++) data.Add(new Data(i * 100)); - // Convert to statically-typed data view. 
- var staticData = dataView.AssertStatic(env, c => new { Value = c.R4.Scalar }); // Build the pipeline - var staticLearningPipeline = staticData.MakeNewEstimator() - .Append(r => r.Value.DetectChangePointBySsa(95, ChangeHistorySize, MaxTrainingSize, SeasonalitySize)); + var learningPipeline = ML.Transforms.DetectChangePointBySsa("Data", "Value", 95, ChangeHistorySize, MaxTrainingSize, SeasonalitySize); // Train - var detector = staticLearningPipeline.Fit(staticData); + var detector = learningPipeline.Fit(dataView); // Transform - var output = detector.Transform(staticData); + var output = detector.Transform(dataView); // Get predictions - var enumerator = env.Data.CreateEnumerable(output.AsDynamic, true).GetEnumerator(); + var enumerator = env.Data.CreateEnumerable(output, true).GetEnumerator(); ChangePointPrediction row = null; List expectedValues = new List() { 0, -3.31410598754883, 0.5, 5.12000000000001E-08, 0, 1.5700820684432983, 5.2001145245395008E-07, 0.012414560443710681, 0, 1.2854313254356384, 0.28810801662678009, 0.02038940454467935, 0, -1.0950627326965332, 0.36663890634019225, 0.026956459625565483}; @@ -144,18 +132,15 @@ public void SpikeDetection() for (int i = 0; i < Size / 2 - 1; i++) data.Add(new Data(5)); - // Convert to statically-typed data view. 
- var staticData = dataView.AssertStatic(env, c => new { Value = c.R4.Scalar }); // Build the pipeline - var staticLearningPipeline = staticData.MakeNewEstimator() - .Append(r => r.Value.DetectIidSpike(80, PvalHistoryLength)); + var learningPipeline = ML.Transforms.DetectIidSpike("Data", "Value", 80, PvalHistoryLength); // Train - var detector = staticLearningPipeline.Fit(staticData); + var detector = learningPipeline.Fit(dataView); // Transform - var output = detector.Transform(staticData); + var output = detector.Transform(dataView); // Get predictions - var enumerator = env.Data.CreateEnumerable(output.AsDynamic, true).GetEnumerator(); + var enumerator = env.Data.CreateEnumerable(output, true).GetEnumerator(); var expectedValues = new List() { // Alert Score P-Value new double[] {0, 5, 0.5}, @@ -199,18 +184,15 @@ public void SsaSpikeDetection() for (int i = 0; i < Size / 2 - 1; i++) data.Add(new Data(5)); - // Convert to statically-typed data view. - var staticData = dataView.AssertStatic(env, c => new { Value = c.R4.Scalar }); // Build the pipeline - var staticLearningPipeline = staticData.MakeNewEstimator() - .Append(r => r.Value.DetectSpikeBySsa(80, ChangeHistoryLength, TrainingWindowSize, SeasonalityWindowSize)); + var learningPipeline = ML.Transforms.DetectSpikeBySsa("Data", "Value", 80, ChangeHistoryLength, TrainingWindowSize, SeasonalityWindowSize); // Train - var detector = staticLearningPipeline.Fit(staticData); + var detector = learningPipeline.Fit(dataView); // Transform - var output = detector.Transform(staticData); + var output = detector.Transform(dataView); // Get predictions - var enumerator = env.Data.CreateEnumerable(output.AsDynamic, true).GetEnumerator(); + var enumerator = env.Data.CreateEnumerable(output, true).GetEnumerator(); var expectedValues = new List() { // Alert Score P-Value new double[] {0, 0.0, 0.5},