microsoft · ganik · Sep 19, 2019 · Sep 17, 2019 · Sep 17, 2019 · Sep 17, 2019
diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs
@@ -7,10 +7,8 @@
 using System.Runtime.InteropServices;
 using System.Text;
 using System.Threading;
-using Microsoft.ML;
 using Microsoft.ML.Data;
 using Microsoft.ML.EntryPoints;
-using Microsoft.ML.Model.OnnxConverter;
 using Microsoft.ML.Runtime;
 using Microsoft.ML.Trainers;
 using Microsoft.ML.Trainers.Ensemble;
@@ -19,7 +17,7 @@
 using Microsoft.ML.Transforms;
 using Microsoft.ML.Transforms.TimeSeries;
 
-namespace Microsoft.MachineLearning.DotNetBridge
+namespace Microsoft.ML.DotNetBridge
 {
     /// <summary>
     /// The main entry point from native code. Note that GC / lifetime issues are critical to get correct.
@@ -302,6 +300,7 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd
             //env.ComponentCatalog.RegisterAssembly(typeof(TimeSeriesProcessingEntryPoints).Assembly);
             //env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly);
             env.ComponentCatalog.RegisterAssembly(typeof(SsaChangePointDetector).Assembly);
+            env.ComponentCatalog.RegisterAssembly(typeof(DotNetBridgeEntrypoints).Assembly);
 
             using (var ch = host.Start("Executing"))
             {

diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs
@@ -0,0 +1,68 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML;
+using Microsoft.ML.DotNetBridge;
+using Microsoft.ML.Data;
+using Microsoft.ML.EntryPoints;
+using Microsoft.ML.Runtime;
+using Microsoft.ML.Transforms;
+
+[assembly: LoadableClass(typeof(void), typeof(DotNetBridgeEntrypoints), null, typeof(SignatureEntryPointModule), "DotNetBridgeEntrypoints")]
+
+namespace Microsoft.ML.DotNetBridge
+{
+    /// <summary>
+    /// Entry points implemented by the bridge assembly itself. The class is declared as an
+    /// entry-point module by the assembly-level <c>LoadableClass</c> attribute in this file.
+    /// </summary>
+    internal static class DotNetBridgeEntrypoints
+    {
+        /// <summary>
+        /// Concatenates columns selected by name prefix. For every requested column,
+        /// <c>Source</c> is interpreted as a prefix: all columns of the input schema whose
+        /// names start with it are concatenated, in schema order, into the column <c>Name</c>.
+        /// </summary>
+        /// <param name="env">The host environment; must not be null.</param>
+        /// <param name="input">The input data plus the (output name, source prefix) pairs.</param>
+        /// <returns>The concatenating transform, both as a model and as the output data view.</returns>
+        /// <exception cref="ArgumentOutOfRangeException">A prefix matches no column of the input schema.</exception>
+        [TlcModule.EntryPoint(Name = "Transforms.PrefixColumnConcatenator", Desc = ColumnConcatenatingTransformer.Summary,
+            UserName = ColumnConcatenatingTransformer.UserName, ShortName = ColumnConcatenatingTransformer.LoadName)]
+        public static CommonOutputs.TransformOutput ConcatColumns(IHostEnvironment env, ColumnCopyingTransformer.Options input)
+        {
+            Contracts.CheckValue(env, nameof(env));
+            var host = env.Register("PrefixConcatColumns");
+            host.CheckValue(input, nameof(input));
+            EntryPointUtils.CheckInputArgs(host, input);
+
+            // Get all column names, preserving schema order.
+            var colNames = new List<string>(input.Data.Schema.Count);
+            for (int i = 0; i < input.Data.Schema.Count; i++)
+                colNames.Add(input.Data.Schema[i].Name);
+
+            // Iterate through the input options, expand each prefix into the matching
+            // source columns, and build the options for the real concatenating transform.
+            var inputOptions = new ColumnConcatenatingTransformer.Options() { Data = input.Data };
+            var columns = new List<ColumnConcatenatingTransformer.Column>(input.Columns.Length);
+            foreach (var col in input.Columns)
+            {
+                var prefix = col.Source;
+                // Column names are identifiers, not linguistic text, so compare ordinally.
+                var sources = colNames.Where(x => x.StartsWith(prefix, StringComparison.Ordinal)).ToArray();
+                if (sources.Length == 0)
+                {
+                    // The two-argument overload is required: the single-string constructor
+                    // treats its argument as the parameter name, not as the message.
+                    throw new ArgumentOutOfRangeException(nameof(input), "No matching columns found for prefix: " + prefix);
+                }
+
+                columns.Add(new ColumnConcatenatingTransformer.Column { Name = col.Name, Source = sources });
+            }
+            inputOptions.Columns = columns.ToArray();
+
+            var xf = ColumnConcatenatingTransformer.Create(env, inputOptions, inputOptions.Data);
+            return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, inputOptions.Data), OutputData = xf };
+        }
+    }
+}
diff --git a/src/DotNetBridge/MessageValidator.cs b/src/DotNetBridge/MessageValidator.cs
@@ -7,7 +7,7 @@
 using System.Globalization;
 using Microsoft.ML.Runtime;
 
-namespace Microsoft.MachineLearning.DotNetBridge
+namespace Microsoft.ML.DotNetBridge
 {
     /// <summary>
     /// This is a temporary solution to validate the messages from ML.NET to nimbusml.

diff --git a/src/DotNetBridge/NativeDataInterop.cs b/src/DotNetBridge/NativeDataInterop.cs
@@ -9,11 +9,10 @@
 using System.Globalization;
 using System.Runtime.InteropServices;
 using System.Text;
-using Microsoft.ML;
 using Microsoft.ML.Data;
 using Microsoft.ML.Runtime;
 
-namespace Microsoft.MachineLearning.DotNetBridge
+namespace Microsoft.ML.DotNetBridge
 {
     public unsafe static partial class Bridge
     {

diff --git a/src/DotNetBridge/NativeDataView.cs b/src/DotNetBridge/NativeDataView.cs
@@ -8,13 +8,12 @@
 using System.Collections.Concurrent;
 using System.Linq;
 using System.Threading;
-using Microsoft.ML;
 using Microsoft.ML.Data;
 using Microsoft.ML.Internal.Utilities;
 using System.Threading.Tasks;
 using Microsoft.ML.Runtime;
 
-namespace Microsoft.MachineLearning.DotNetBridge
+namespace Microsoft.ML.DotNetBridge
 {
     public unsafe static partial class Bridge
     {

diff --git a/src/DotNetBridge/RmlEnvironment.cs b/src/DotNetBridge/RmlEnvironment.cs
@@ -5,10 +5,9 @@
 
 using System;
 using System.Globalization;
-using Microsoft.ML;
 using Microsoft.ML.Runtime;
 
-namespace Microsoft.MachineLearning.DotNetBridge
+namespace Microsoft.ML.DotNetBridge
 {
     internal class RmlEnvironment : HostEnvironmentBase<RmlEnvironment>
     {

diff --git a/src/DotNetBridge/RunGraph.cs b/src/DotNetBridge/RunGraph.cs
@@ -9,8 +9,6 @@
 using System.IO;
 using System.Linq;
 using Microsoft.DataPrep.Common;
-using Microsoft.ML;
-using Microsoft.ML.CommandLine;
 using Microsoft.ML.Data;
 using Microsoft.ML.Data.IO;
 using Microsoft.ML.EntryPoints;
@@ -20,7 +18,7 @@
 using Newtonsoft.Json;
 using Newtonsoft.Json.Linq;
 
-namespace Microsoft.MachineLearning.DotNetBridge
+namespace Microsoft.ML.DotNetBridge
 {
     public unsafe static partial class Bridge
     {

diff --git a/src/NativeBridge/UnixInterface.h b/src/NativeBridge/UnixInterface.h
@@ -24,7 +24,7 @@
 #define CORECLR_SHUTDOWN "coreclr_shutdown"
 
 #define DOTNETBRIDGE "DotNetBridge"
-#define DOTNETBRIDGE_FQDN "Microsoft.MachineLearning.DotNetBridge.Bridge"
+#define DOTNETBRIDGE_FQDN "Microsoft.ML.DotNetBridge.Bridge"
 
 #define GET_FN "GetFn"
 

diff --git a/src/NativeBridge/WinInterface.h b/src/NativeBridge/WinInterface.h
@@ -302,7 +302,7 @@ class WinMlNetInterface
         HRESULT hr = host->CreateDelegate(
             _domainId,
             W("DotNetBridge"),
-            W("Microsoft.MachineLearning.DotNetBridge.Bridge"),
+            W("Microsoft.ML.DotNetBridge.Bridge"),
             W("GetFn"),
             &getter);
         if (FAILED(hr))

diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
@@ -91,6 +91,8 @@
     <Compile Include="nimbusml\examples\BootStrapSample.py" />
     <Compile Include="nimbusml\examples\CharTokenizer.py" />
     <Compile Include="nimbusml\examples\ColumnConcatenator.py" />
+    <Compile Include="nimbusml\examples\examples_from_dataframe\PrefixColumnConcatenator_df.py" />
+    <Compile Include="nimbusml\examples\PrefixColumnConcatenator.py" />
     <Compile Include="nimbusml\examples\ColumnDropper.py" />
     <Compile Include="nimbusml\examples\ColumnDuplicator.py" />
     <Compile Include="nimbusml\examples\ColumnSelector.py" />
@@ -299,6 +301,7 @@
     <Compile Include="nimbusml\internal\core\preprocessing\filter\skipfilter.py" />
     <Compile Include="nimbusml\internal\core\preprocessing\filter\takefilter.py" />
     <Compile Include="nimbusml\internal\core\preprocessing\normalization\lpscaler.py" />
+    <Compile Include="nimbusml\internal\core\preprocessing\schema\prefixcolumnconcatenator.py" />
     <Compile Include="nimbusml\internal\core\preprocessing\schema\columnduplicator.py" />
     <Compile Include="nimbusml\internal\core\preprocessing\schema\columndropper.py" />
     <Compile Include="nimbusml\internal\core\preprocessing\tensorflowscorer.py" />
@@ -386,6 +389,7 @@
     <Compile Include="nimbusml\internal\entrypoints\transforms_categoricalhashonehotvectorizer.py" />
     <Compile Include="nimbusml\internal\entrypoints\transforms_categoricalonehotvectorizer.py" />
     <Compile Include="nimbusml\internal\entrypoints\transforms_charactertokenizer.py" />
+    <Compile Include="nimbusml\internal\entrypoints\transforms_prefixcolumnconcatenator.py" />
     <Compile Include="nimbusml\internal\entrypoints\transforms_columnconcatenator.py" />
     <Compile Include="nimbusml\internal\entrypoints\transforms_columncopier.py" />
     <Compile Include="nimbusml\internal\entrypoints\transforms_columnselector.py" />
@@ -633,6 +637,7 @@
     <Compile Include="nimbusml\preprocessing\normalization\meanvariancescaler.py" />
     <Compile Include="nimbusml\preprocessing\normalization\minmaxscaler.py" />
     <Compile Include="nimbusml\preprocessing\normalization\__init__.py" />
+    <Compile Include="nimbusml\preprocessing\schema\prefixcolumnconcatenator.py" />
     <Compile Include="nimbusml\preprocessing\schema\columnconcatenator.py" />
     <Compile Include="nimbusml\preprocessing\schema\columndropper.py" />
     <Compile Include="nimbusml\preprocessing\schema\columnduplicator.py" />
@@ -669,10 +674,12 @@
     <Compile Include="nimbusml\tests\feature_extraction\text\test_sentiment.py" />
     <Compile Include="nimbusml\tests\idv\__init__.py" />
     <Compile Include="nimbusml\tests\linear_model\test_linearsvmbinaryclassifier.py" />
+    <Compile Include="nimbusml\tests\pipeline\test_pipeline_split_models.py" />
     <Compile Include="nimbusml\tests\pipeline\test_pipeline_combining.py" />
     <Compile Include="nimbusml\tests\pipeline\test_pipeline_subclassing.py" />
     <Compile Include="nimbusml\tests\preprocessing\normalization\test_lpscaler.py" />
     <Compile Include="nimbusml\tests\preprocessing\normalization\test_meanvariancescaler.py" />
+    <Compile Include="nimbusml\tests\preprocessing\schema\test_prefixcolumnconcatenator.py" />
     <Compile Include="nimbusml\tests\preprocessing\test_datasettransformer.py" />
     <Compile Include="nimbusml\tests\test_csr_matrix_output.py" />
     <Compile Include="nimbusml\tests\timeseries\test_iidchangepointdetector.py" />

diff --git a/src/python/nimbusml/examples/PrefixColumnConcatenator.py b/src/python/nimbusml/examples/PrefixColumnConcatenator.py
@@ -0,0 +1,29 @@
+###############################################################################
+# PrefixColumnConcatenator
+#
+# Concatenates every column whose name starts with a given prefix into one
+# vector-valued column.
+import numpy as np
+import pandas as pd
+from nimbusml.preprocessing.schema import PrefixColumnConcatenator
+
+# 'PrefixA' and 'PrefixB' share the prefix 'Prefix'; 'AnotherColumn' does not.
+data = pd.DataFrame(
+    data=dict(
+        PrefixA=[2.5, np.nan, 2.1, 1.0],
+        PrefixB=[.75, .9, .8, .76],
+        AnotherColumn=[np.nan, 2.5, 2.6, 2.4]))
+
+# transform usage: the dict maps the new column name to the source prefix
+xf = PrefixColumnConcatenator(columns={'combined': 'Prefix'})
+
+# fit and transform
+features = xf.fit_transform(data)
+
+# print features
+print(features.head())
+#   PrefixA  PrefixB  AnotherColumn  combined.PrefixA  combined.PrefixB
+#0      2.5     0.75            NaN               2.5              0.75
+#1      NaN     0.90            2.5               NaN              0.90
+#2      2.1     0.80            2.6               2.1              0.80
+#3      1.0     0.76            2.4               1.0              0.76
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/PrefixColumnConcatenator_df.py b/src/python/nimbusml/examples/examples_from_dataframe/PrefixColumnConcatenator_df.py
@@ -0,0 +1,37 @@
+###############################################################################
+# PrefixColumnConcatenator
+#
+# Builds two vector columns from the iris measurements by name prefix and
+# trains a classifier on them.
+import numpy as np
+import pandas as pd
+from nimbusml import Pipeline, Role
+from nimbusml.datasets import get_dataset
+from nimbusml.linear_model import LogisticRegressionClassifier
+from nimbusml.preprocessing.schema import PrefixColumnConcatenator
+from nimbusml.preprocessing.schema import ColumnDropper
+from sklearn.model_selection import train_test_split
+
+# use 'iris' data set to create test and train data
+#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
+# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
+# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
+df = get_dataset("iris").as_df()
+
+# hold out part of the rows for evaluation; 'Label' is the prediction target
+X_train, X_test, y_train, y_test = \
+    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])
+
+# 'Sepal_*' columns are combined into 'Sepal', 'Petal_*' columns into 'Petal'
+concat = PrefixColumnConcatenator() << {'Sepal': 'Sepal_'}
+concat1 = PrefixColumnConcatenator() << {'Petal': 'Petal_'}
+# drop the four source columns plus 'Setosa' and 'Species'
+dropcols = ColumnDropper() << ['Sepal_Length', 'Sepal_Width', 'Petal_Length',
+                              'Petal_Width', 'Setosa', 'Species']
+
+pipeline = Pipeline([concat, concat1, dropcols, LogisticRegressionClassifier()])
+pipeline.fit(X_train, y_train)
+
+# Evaluate the model
+metrics, scores = pipeline.test(X_test, y_test, output_scores=True)
+print(metrics)
diff --git a/src/python/nimbusml/internal/core/preprocessing/schema/prefixcolumnconcatenator.py b/src/python/nimbusml/internal/core/preprocessing/schema/prefixcolumnconcatenator.py
@@ -0,0 +1,99 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+"""
+PrefixColumnConcatenator
+"""
+
+__all__ = ["PrefixColumnConcatenator"]
+
+
+from ....entrypoints.transforms_prefixcolumnconcatenator import \
+    transforms_prefixcolumnconcatenator
+from ....utils.utils import trace
+from ...base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class PrefixColumnConcatenator(BasePipelineItem, DefaultSignature):
+    """
+
+    Combines several columns into a single vector-valued column by prefix
+
+    .. remarks::
+        ``PrefixColumnConcatenator`` creates a single vector-valued column
+        from multiple columns, selecting the source columns by name prefix.
+        It can be performed on data before training a model. The
+        concatenation can significantly speed up the processing of data
+        when the number of columns is as large as hundreds to thousands.
+
+    :param params: Additional arguments sent to compute engine.
+
+    .. seealso::
+        :py:class:`ColumnDropper
+        <nimbusml.preprocessing.schema.ColumnDropper>`,
+        :py:class:`ColumnSelector
+        <nimbusml.preprocessing.schema.ColumnSelector>`.
+
+    .. index:: transform, schema
+
+    Example:
+       .. literalinclude:: /../nimbusml/examples/PrefixColumnConcatenator.py
+              :language: python
+    """
+
+    @trace
+    def __init__(self, **params):
+        # Initializes the base pipeline item with type 'transform'.
+        BasePipelineItem.__init__(self, type='transform', **params)
+
+    @property
+    def _entrypoint(self):
+        # Entrypoint function used to build the graph node in _get_node.
+        return transforms_prefixcolumnconcatenator
+
+    @trace
+    def _get_node(self, **all_args):
+        """
+        Build the entrypoint node for this transform.
+
+        Column lists set on the instance (``self.input`` / ``self.output``)
+        win over the ``input`` / ``output`` entries of ``all_args``; those
+        entries are always removed before the rest is forwarded.
+        """
+        input_columns = self.input
+        caller_input = all_args.pop('input', None)
+        if input_columns is None:
+            input_columns = caller_input
+
+        output_columns = self.output
+        caller_output = all_args.pop('output', None)
+        if output_columns is None:
+            output_columns = caller_output
+
+        # Both sides must be present and must be lists; input is checked
+        # first so its error is the one reported when both are wrong.
+        if input_columns is None:
+            raise ValueError("'None' input passed when it cannot be none.")
+        elif not isinstance(input_columns, list):
+            raise ValueError(
+                "input has to be a list of strings, instead got %s" %
+                type(input_columns))
+
+        if output_columns is None:
+            raise ValueError("'None' output passed when it cannot be none.")
+        elif not isinstance(output_columns, list):
+            raise ValueError(
+                "output has to be a list of strings, instead got %s" %
+                type(output_columns))
+
+        # Pair each source prefix with its output name; an empty input list
+        # yields no column specification at all.
+        column_pairs = None
+        if input_columns:
+            column_pairs = [
+                {'Source': source, 'Name': name}
+                for source, name in zip(input_columns, output_columns)]
+
+        all_args['column'] = column_pairs
+        return self._entrypoint(**all_args)