Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Output predictor model file optionally #270

Merged
merged 16 commits into from
Sep 19, 2019
5 changes: 2 additions & 3 deletions src/DotNetBridge/Bridge.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,8 @@
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.EntryPoints;
using Microsoft.ML.Model.OnnxConverter;
using Microsoft.ML.Runtime;
using Microsoft.ML.Trainers;
using Microsoft.ML.Trainers.Ensemble;
Expand All @@ -19,7 +17,7 @@
using Microsoft.ML.Transforms;
using Microsoft.ML.Transforms.TimeSeries;

namespace Microsoft.MachineLearning.DotNetBridge
namespace Microsoft.ML.DotNetBridge
{
/// <summary>
/// The main entry point from native code. Note that GC / lifetime issues are critical to get correct.
Expand Down Expand Up @@ -302,6 +300,7 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd
//env.ComponentCatalog.RegisterAssembly(typeof(TimeSeriesProcessingEntryPoints).Assembly);
//env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly);
env.ComponentCatalog.RegisterAssembly(typeof(SsaChangePointDetector).Assembly);
env.ComponentCatalog.RegisterAssembly(typeof(DotNetBridgeEntrypoints).Assembly);

using (var ch = host.Start("Executing"))
{
Expand Down
51 changes: 51 additions & 0 deletions src/DotNetBridge/Entrypoints.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.DotNetBridge;
using Microsoft.ML.Data;
using Microsoft.ML.EntryPoints;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms;

[assembly: LoadableClass(typeof(void), typeof(DotNetBridgeEntrypoints), null, typeof(SignatureEntryPointModule), "DotNetBridgeEntrypoints")]

namespace Microsoft.ML.DotNetBridge
{
/// <summary>
/// Entry points defined by the DotNetBridge assembly itself.
/// </summary>
internal static class DotNetBridgeEntrypoints
{
    /// <summary>
    /// Builds a column-concatenating transform in which every output column is made of all
    /// input columns whose names start with a given prefix.
    /// </summary>
    /// <param name="env">Host environment; must not be null.</param>
    /// <param name="input">
    /// Column-copy style options. For each entry, <c>Name</c> is the output column name and
    /// <c>Source</c> is treated as a column-name prefix rather than as an exact column name.
    /// </param>
    /// <returns>
    /// A <see cref="CommonOutputs.TransformOutput"/> whose model and output data are both backed by
    /// the created <see cref="ColumnConcatenatingTransformer"/>.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// Thrown when a prefix does not match any column of the input schema.
    /// </exception>
    [TlcModule.EntryPoint(Name = "Transforms.PrefixColumnConcatenator", Desc = ColumnConcatenatingTransformer.Summary,
        UserName = ColumnConcatenatingTransformer.UserName, ShortName = ColumnConcatenatingTransformer.LoadName)]
    public static CommonOutputs.TransformOutput ConcatColumns(IHostEnvironment env, ColumnCopyingTransformer.Options input)
    {
        Contracts.CheckValue(env, nameof(env));
        var host = env.Register("PrefixConcatColumns");
        host.CheckValue(input, nameof(input));
        EntryPointUtils.CheckInputArgs(host, input);

        // Collect all column names, preserving schema order so that the concatenated
        // vector lists its slots in the same order as the input columns.
        var colNames = new List<string>(input.Data.Schema.Count);
        for (int i = 0; i < input.Data.Schema.Count; i++)
            colNames.Add(input.Data.Schema[i].Name);

        // Iterate through the input options, find the matching source columns for each
        // prefix, and build the equivalent options for the regular concatenating transform.
        var inputOptions = new ColumnConcatenatingTransformer.Options() { Data = input.Data };
        var columns = new List<ColumnConcatenatingTransformer.Column>(input.Columns.Length);
        foreach (var col in input.Columns)
        {
            var prefix = col.Source;
            var newCol = new ColumnConcatenatingTransformer.Column();
            newCol.Name = col.Name;

            // Column names are identifiers, not linguistic text: compare them ordinally so the
            // match does not depend on culture-specific collation rules.
            newCol.Source = colNames.Where(x => x.StartsWith(prefix, StringComparison.Ordinal)).ToArray();
            if (newCol.Source.Length == 0)
            {
                // The single-string constructor interprets its argument as the parameter name,
                // so the (paramName, message) overload is required for the text to be the message.
                throw new ArgumentOutOfRangeException(nameof(input), "No matching columns found for prefix: " + prefix);
            }

            columns.Add(newCol);
        }
        inputOptions.Columns = columns.ToArray();

        var xf = ColumnConcatenatingTransformer.Create(env, inputOptions, inputOptions.Data);
        return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, inputOptions.Data), OutputData = xf };
    }
}
}
2 changes: 1 addition & 1 deletion src/DotNetBridge/MessageValidator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
using System.Globalization;
using Microsoft.ML.Runtime;

namespace Microsoft.MachineLearning.DotNetBridge
namespace Microsoft.ML.DotNetBridge
{
/// <summary>
/// This is a temporary solution to validate the messages from ML.NET to nimbusml.
Expand Down
3 changes: 1 addition & 2 deletions src/DotNetBridge/NativeDataInterop.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,10 @@
using System.Globalization;
using System.Runtime.InteropServices;
using System.Text;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Runtime;

namespace Microsoft.MachineLearning.DotNetBridge
namespace Microsoft.ML.DotNetBridge
{
public unsafe static partial class Bridge
{
Expand Down
3 changes: 1 addition & 2 deletions src/DotNetBridge/NativeDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@
using System.Collections.Concurrent;
using System.Linq;
using System.Threading;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Internal.Utilities;
using System.Threading.Tasks;
using Microsoft.ML.Runtime;

namespace Microsoft.MachineLearning.DotNetBridge
namespace Microsoft.ML.DotNetBridge
{
public unsafe static partial class Bridge
{
Expand Down
3 changes: 1 addition & 2 deletions src/DotNetBridge/RmlEnvironment.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@

using System;
using System.Globalization;
using Microsoft.ML;
using Microsoft.ML.Runtime;

namespace Microsoft.MachineLearning.DotNetBridge
namespace Microsoft.ML.DotNetBridge
{
internal class RmlEnvironment : HostEnvironmentBase<RmlEnvironment>
{
Expand Down
4 changes: 1 addition & 3 deletions src/DotNetBridge/RunGraph.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
using System.IO;
using System.Linq;
using Microsoft.DataPrep.Common;
using Microsoft.ML;
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
using Microsoft.ML.Data.IO;
using Microsoft.ML.EntryPoints;
Expand All @@ -20,7 +18,7 @@
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;

namespace Microsoft.MachineLearning.DotNetBridge
namespace Microsoft.ML.DotNetBridge
{
public unsafe static partial class Bridge
{
Expand Down
2 changes: 1 addition & 1 deletion src/NativeBridge/UnixInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#define CORECLR_SHUTDOWN "coreclr_shutdown"

#define DOTNETBRIDGE "DotNetBridge"
#define DOTNETBRIDGE_FQDN "Microsoft.MachineLearning.DotNetBridge.Bridge"
#define DOTNETBRIDGE_FQDN "Microsoft.ML.DotNetBridge.Bridge"

#define GET_FN "GetFn"

Expand Down
2 changes: 1 addition & 1 deletion src/NativeBridge/WinInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ class WinMlNetInterface
HRESULT hr = host->CreateDelegate(
_domainId,
W("DotNetBridge"),
W("Microsoft.MachineLearning.DotNetBridge.Bridge"),
W("Microsoft.ML.DotNetBridge.Bridge"),
W("GetFn"),
&getter);
if (FAILED(hr))
Expand Down
7 changes: 7 additions & 0 deletions src/python/nimbusml.pyproj
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@
<Compile Include="nimbusml\examples\BootStrapSample.py" />
<Compile Include="nimbusml\examples\CharTokenizer.py" />
<Compile Include="nimbusml\examples\ColumnConcatenator.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\PrefixColumnConcatenator_df.py" />
<Compile Include="nimbusml\examples\PrefixColumnConcatenator.py" />
<Compile Include="nimbusml\examples\ColumnDropper.py" />
<Compile Include="nimbusml\examples\ColumnDuplicator.py" />
<Compile Include="nimbusml\examples\ColumnSelector.py" />
Expand Down Expand Up @@ -299,6 +301,7 @@
<Compile Include="nimbusml\internal\core\preprocessing\filter\skipfilter.py" />
<Compile Include="nimbusml\internal\core\preprocessing\filter\takefilter.py" />
<Compile Include="nimbusml\internal\core\preprocessing\normalization\lpscaler.py" />
<Compile Include="nimbusml\internal\core\preprocessing\schema\prefixcolumnconcatenator.py" />
<Compile Include="nimbusml\internal\core\preprocessing\schema\columnduplicator.py" />
<Compile Include="nimbusml\internal\core\preprocessing\schema\columndropper.py" />
<Compile Include="nimbusml\internal\core\preprocessing\tensorflowscorer.py" />
Expand Down Expand Up @@ -386,6 +389,7 @@
<Compile Include="nimbusml\internal\entrypoints\transforms_categoricalhashonehotvectorizer.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_categoricalonehotvectorizer.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_charactertokenizer.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_prefixcolumnconcatenator.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_columnconcatenator.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_columncopier.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_columnselector.py" />
Expand Down Expand Up @@ -633,6 +637,7 @@
<Compile Include="nimbusml\preprocessing\normalization\meanvariancescaler.py" />
<Compile Include="nimbusml\preprocessing\normalization\minmaxscaler.py" />
<Compile Include="nimbusml\preprocessing\normalization\__init__.py" />
<Compile Include="nimbusml\preprocessing\schema\prefixcolumnconcatenator.py" />
<Compile Include="nimbusml\preprocessing\schema\columnconcatenator.py" />
<Compile Include="nimbusml\preprocessing\schema\columndropper.py" />
<Compile Include="nimbusml\preprocessing\schema\columnduplicator.py" />
Expand Down Expand Up @@ -669,10 +674,12 @@
<Compile Include="nimbusml\tests\feature_extraction\text\test_sentiment.py" />
<Compile Include="nimbusml\tests\idv\__init__.py" />
<Compile Include="nimbusml\tests\linear_model\test_linearsvmbinaryclassifier.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_split_models.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_combining.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_subclassing.py" />
<Compile Include="nimbusml\tests\preprocessing\normalization\test_lpscaler.py" />
<Compile Include="nimbusml\tests\preprocessing\normalization\test_meanvariancescaler.py" />
<Compile Include="nimbusml\tests\preprocessing\schema\test_prefixcolumnconcatenator.py" />
<Compile Include="nimbusml\tests\preprocessing\test_datasettransformer.py" />
<Compile Include="nimbusml\tests\test_csr_matrix_output.py" />
<Compile Include="nimbusml\tests\timeseries\test_iidchangepointdetector.py" />
Expand Down
25 changes: 25 additions & 0 deletions src/python/nimbusml/examples/PrefixColumnConcatenator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
###############################################################################
# PrefixColumnConcatenator
import numpy as np
import pandas as pd
from nimbusml.preprocessing.schema import PrefixColumnConcatenator

# Build a small frame: two columns share the 'Prefix' name prefix, one does not.
raw_columns = dict(
    PrefixA=[2.5, np.nan, 2.1, 1.0],
    PrefixB=[.75, .9, .8, .76],
    AnotherColumn=[np.nan, 2.5, 2.6, 2.4])
data = pd.DataFrame(data=raw_columns)

# 'combined' is the output column name, 'Prefix' the source-column prefix.
concatenator = PrefixColumnConcatenator(columns={'combined': 'Prefix'})

# Fit the transform and apply it to the same data in one step.
combined = concatenator.fit_transform(data)

# Show the result.
print(combined.head())
#    PrefixA  PrefixB  AnotherColumn  combined.PrefixA  combined.PrefixB
# 0      2.5     0.75            NaN               2.5              0.75
# 1      NaN     0.90            2.5               NaN              0.90
# 2      2.1     0.80            2.6               2.1              0.80
# 3      1.0     0.76            2.4               1.0              0.76
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
###############################################################################
# PrefixColumnConcatenator
import numpy as np
import pandas as pd
from nimbusml import Pipeline, Role
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import LogisticRegressionClassifier
from nimbusml.preprocessing.schema import PrefixColumnConcatenator
from nimbusml.preprocessing.schema import ColumnDropper
from sklearn.model_selection import train_test_split

# Load the 'iris' data set; it is split into train and test parts below.
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Label  Species  Setosa
# 0           5.1          3.5           1.4          0.2      0   setosa     1.0
# 1           4.9          3.0           1.4          0.2      0   setosa     1.0
df = get_dataset("iris").as_df()

# Everything except 'Label' is a feature; 'Label' is the target.
feature_mask = df.columns != 'Label'
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, feature_mask], df['Label'])

# One concatenator per prefix: output column name -> source-column prefix.
sepal_concat = PrefixColumnConcatenator() << {'Sepal': 'Sepal_'}
petal_concat = PrefixColumnConcatenator() << {'Petal': 'Petal_'}

# Drop the original per-measurement columns and the remaining non-feature columns.
columns_to_drop = ['Sepal_Length', 'Sepal_Width', 'Petal_Length',
                   'Petal_Width', 'Setosa', 'Species']
drop_columns = ColumnDropper() << columns_to_drop

steps = [sepal_concat, petal_concat, drop_columns,
         LogisticRegressionClassifier()]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)

# Evaluate the model on the held-out part and report the metrics.
metrics, scores = pipeline.test(X_test, y_test, output_scores=True)
print(metrics)
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
"""
PrefixColumnConcatenator
"""

__all__ = ["PrefixColumnConcatenator"]


from ....entrypoints.transforms_prefixcolumnconcatenator import \
transforms_prefixcolumnconcatenator
from ....utils.utils import trace
from ...base_pipeline_item import BasePipelineItem, DefaultSignature


class PrefixColumnConcatenator(BasePipelineItem, DefaultSignature):
    """

    Combines several columns into a single vector-valued column by prefix

    .. remarks::
        ``PrefixColumnConcatenator`` creates a single vector-valued column
        from multiple columns. It can be performed on data before training
        a model. The concatenation can significantly speed up the
        processing of data when the number of columns is as large as
        hundreds to thousands.

    :param params: Additional arguments sent to compute engine.

    .. seealso::
        :py:class:`ColumnDropper
        <nimbusml.preprocessing.schema.ColumnDropper>`,
        :py:class:`ColumnSelector
        <nimbusml.preprocessing.schema.ColumnSelector>`.

    .. index:: transform, schema

    Example:
       .. literalinclude:: /../nimbusml/examples/PrefixColumnConcatenator.py
              :language: python
    """

    @trace
    def __init__(
            self,
            **params):
        # This item takes no options of its own; everything is forwarded
        # to the base pipeline item, tagged as a transform.
        BasePipelineItem.__init__(
            self, type='transform', **params)

    @property
    def _entrypoint(self):
        """Entrypoint callable used to build this item's node."""
        return transforms_prefixcolumnconcatenator

    @trace
    def _get_node(self, **all_args):
        """
        Build the entrypoint node for this transform.

        The input and output column lists are taken from ``self.input`` /
        ``self.output`` when set, otherwise from the ``input`` / ``output``
        entries of ``all_args``. Those two entries are always removed from
        ``all_args`` before the remaining arguments are forwarded.

        :param all_args: keyword arguments for the entrypoint; may carry
            ``input`` and ``output``.
        :return: whatever the entrypoint callable returns.
        :raises ValueError: if the resolved input or output is ``None`` or
            is not a list.
        """
        # Values set on the instance win over the ones passed in all_args.
        own_input = self.input
        passed_input = all_args.pop('input', None)
        input_columns = passed_input if own_input is None else own_input

        own_output = self.output
        passed_output = all_args.pop('output', None)
        output_columns = passed_output if own_output is None else own_output

        # validate input
        if input_columns is None:
            raise ValueError(
                "'None' input passed when it cannot be none.")
        if not isinstance(input_columns, list):
            raise ValueError(
                "input has to be a list of strings, instead got %s" %
                type(input_columns))

        # validate output
        if output_columns is None:
            raise ValueError(
                "'None' output passed when it cannot be none.")
        if not isinstance(output_columns, list):
            raise ValueError(
                "output has to be a list of strings, instead got %s" %
                type(output_columns))

        # Pair inputs and outputs by position. zip stops at the shorter
        # list, so surplus entries of the longer one are ignored. An empty
        # input list yields no column specification at all.
        column_spec = None
        if input_columns:
            column_spec = []
            for source, name in zip(input_columns, output_columns):
                column_spec.append(dict(Source=source, Name=name))

        all_args.update(dict(column=column_spec))
        return self._entrypoint(**all_args)
Loading