Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Add azureml-dataprep support for dataflow objects #181

Merged
merged 18 commits into from
Jul 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion build.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ if /i [%1] == [DbgWinPy2.7] (
:Build
:: Install dotnet SDK version, see https://docs.microsoft.com/en-us/dotnet/core/tools/dotnet-install-script
echo Installing dotnet SDK ...
powershell -NoProfile -ExecutionPolicy unrestricted -Command "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; &([scriptblock]::Create((Invoke-WebRequest -useb 'https://dot.net/v1/dotnet-install.ps1'))) -Version 2.1.200 -InstallDir ./cli"
powershell -NoProfile -ExecutionPolicy unrestricted -Command "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; &([scriptblock]::Create((Invoke-WebRequest -useb 'https://dot.net/v1/dotnet-install.ps1'))) -Version 2.1.701 -InstallDir ./cli"

set _dotnetRoot=%__currentScriptDir%cli

Expand Down Expand Up @@ -339,6 +339,8 @@ echo "Running tests ... "
echo "#################################"
call "%PythonExe%" -m pip install --upgrade nose pytest graphviz imageio pytest-cov "jupyter_client>=4.4.0" "nbconvert>=4.2.0"
if %PythonVersion% == 2.7 ( call "%PythonExe%" -m pip install --upgrade pyzmq )
:: Run azureml-dataprep tests only in Python 3.7, as it's an optional dependency
if %PythonVersion% == 3.7 ( call "%PythonExe%" -m pip install --upgrade azureml-dataprep )
call "%PythonExe%" -m pip install --upgrade "%__currentScriptDir%target\%WheelFile%"
call "%PythonExe%" -m pip install "scikit-learn==0.19.2"

Expand Down
7 changes: 5 additions & 2 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ if [ ${__buildDotNetBridge} = true ]
then
# Install dotnet SDK version, see https://docs.microsoft.com/en-us/dotnet/core/tools/dotnet-install-script
echo "Installing dotnet SDK ... "
curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Version 2.1.200 -InstallDir ./cli
curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Version 2.1.701 -InstallDir ./cli

# Build managed code
echo "Building managed code ... "
Expand Down Expand Up @@ -266,7 +266,10 @@ then
elif [ ${PythonVersion} = 3.6 ] && [ "$(uname -s)" = "Darwin" ]
then
"${PythonExe}" -m pip install --upgrade pytest-remotedata
fi
elif [ ${PythonVersion} = 3.7 ]
then
"${PythonExe}" -m pip install --upgrade azureml-dataprep
fi
"${PythonExe}" -m pip install --upgrade "${Wheel}"
"${PythonExe}" -m pip install "scikit-learn==0.19.2"

Expand Down
2 changes: 2 additions & 0 deletions build/libs_linux.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ lib_lightgbm.so
libtensorflow.so
libtensorflow_framework.so
System.Drawing.Common.dll
Microsoft.DataPrep.dll
Microsoft.DPrep.*
Microsoft.ML.*
2 changes: 2 additions & 0 deletions build/libs_mac.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ lib_lightgbm.dylib
libtensorflow.dylib
libtensorflow_framework.dylib
System.Drawing.Common.dll
Microsoft.DataPrep.dll
Microsoft.DPrep.*
Microsoft.ML.*
2 changes: 2 additions & 0 deletions build/libs_win.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@ MklImports.dll
SymSgdNative.dll
tensorflow.dll
System.Drawing.Common.dll
Microsoft.DataPrep.dll
Microsoft.DPrep.*
Microsoft.ML.*
3 changes: 2 additions & 1 deletion src/DotNetBridge/DotNetBridge.csproj
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
<TargetFramework>netcoreapp2.1</TargetFramework>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<Platforms>x64</Platforms>
<DefineConstants>CORECLR</DefineConstants>
Expand Down Expand Up @@ -42,5 +42,6 @@
<PackageReference Include="Microsoft.ML.TensorFlow" Version="1.2.0" />
<PackageReference Include="Microsoft.ML.Ensemble" Version="0.14.0" />
<PackageReference Include="Microsoft.ML.TimeSeries" Version="1.2.0" />
<PackageReference Include="Microsoft.DataPrep" Version="0.0.1.5-preview" />
</ItemGroup>
</Project>
4 changes: 3 additions & 1 deletion src/DotNetBridge/RunGraph.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.Globalization;
using System.IO;
using System.Linq;
using Microsoft.DataPrep.Common;
using Microsoft.ML;
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
Expand Down Expand Up @@ -146,7 +147,8 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s
var extension = Path.GetExtension(path);
if (extension == ".txt")
dv = TextLoader.LoadFile(host, new TextLoader.Options(), new MultiFileSource(path));

else if(extension == ".dprep")
dv = DataFlow.FromDPrepFile(path).ToDataView();
else
dv = new BinaryLoader(host, new BinaryLoader.Arguments(), path);
}
Expand Down
15 changes: 10 additions & 5 deletions src/NativeBridge/UnixInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,15 +144,20 @@ class UnixMlNetInterface
{
}

FNGETTER EnsureGetter(const char *nimbuslibspath, const char *coreclrpath)
FNGETTER EnsureGetter(const char *mlnetpath, const char *coreclrpath, const char *dpreppath)
{
if (_getter != nullptr)
return _getter;

std::string libsroot(nimbuslibspath);
std::string libsroot(mlnetpath);
std::string coreclrdir(coreclrpath);
if (strlen(dpreppath) == 0)
{
dpreppath = mlnetpath;
}
std::string dprepdir(dpreppath);

ICLRRuntimeHost2* host = EnsureClrHost(libsroot.c_str(), coreclrdir.c_str());
ICLRRuntimeHost2* host = EnsureClrHost(libsroot.c_str(), coreclrdir.c_str(), dprepdir.c_str());
if (host == nullptr)
return nullptr;

Expand Down Expand Up @@ -246,7 +251,7 @@ class UnixMlNetInterface
closedir(dir);
}

ICLRRuntimeHost2* EnsureClrHost(const char * libsRoot, const char * coreclrDirRoot)
ICLRRuntimeHost2* EnsureClrHost(const char * libsRoot, const char * coreclrDirRoot, const char * dprepDirRoot)
{
if (_host != nullptr)
return _host;
Expand Down Expand Up @@ -284,7 +289,7 @@ class UnixMlNetInterface
// TRUSTED_PLATFORM_ASSEMBLIES
tpaList.c_str(),
// APP_PATHS
libsRoot,
dprepDirRoot,
Copy link
Member

@eerhardt eerhardt Jul 11, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems wrong. Why is the DataPrep directory considered the APP_PATHS? Shouldn't it just instead be added to tpaList? #Resolved

Copy link
Member Author

@ganik ganik Jul 11, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The DataPrep folder contains numerous DLLs that the azureml-dataprep package needs in order to run. Not all of them are needed for the in-process integration that NimbusML is doing here. Also, many of the DLLs are duplicates of those in the dotnetcore2 package (.NET Core CLR runtime). Putting the DataPrep DLLs on the TPA list will:

  1. make the list huge.
  2. introduce a lot of duplicates, such as the System.* DLLs. I can of course filter them out, but this would be an additional logic step.
  3. give no guarantee as to which copy is used at runtime when there are duplicates.

I want to avoid this and use only the dotnetcore2, ML.NET, Microsoft.DataPrep.* and Microsoft.DPrep.* DLLs - these are put into the TPA list. In case I have missed any DLL that DPrep needs, I have also set the probing path to the DPrep folder.


In reply to: 302699171 [](ancestors = 302699171)

Copy link
Member

@eerhardt eerhardt Jul 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we treating the ML.NET and the DataPrep libraries differently? Maybe we should just put both locations on the APP_PATHS.

That way, when the next library comes along that we need to do this for, it is obvious to just add it to APP_PATHS. #Resolved

Copy link
Member Author

@ganik ganik Jul 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the next library will go into APP_PATHS the same way as DPrep. The reasons I treat ML.NET and DataPrep differently are:

  1. DPrep is an optional dependency; if it's not installed, NimbusML should still work in all scenarios except taking input from DPrep files. ML.NET is a mandatory core dependency here, and I am very familiar with all the DLLs it needs, so I felt the TPA list was the more appropriate place to list all of them.
  2. I want to be on the safe side. I am not familiar with how probing with APP_PATHS works. If there are duplicate DLLs in the ML.NET and DPrep folders, does the first one win? With the TPA list this does not seem to be guaranteed, so it could be the same with APP_PATHS. I want to ensure that NimbusML's core ML.NET usage is untouched by this, so I used the TPA list for ML.NET.
  3. There seem to be tons of DPrep DLLs, and it is difficult to figure out which ones are necessary for my case and which are not. So I packaged only the few (<3MB total size) that are needed for JIT compilation when running NimbusML, and set APP_PATHS to the DPrep folder for the rest.
    Another issue: there are mismatches between the versions of the *Dprep.dll files exposed in NuGet and those installed with the azureml-dataprep package. During testing I found that I need to keep the versions of *Dprep.dll that I built against, but for the rest of the supporting DLLs I can point to the DPrep folder. We will have to do further testing to figure out which versions of azureml-dataprep must be installed to work with the ones built into NimbusML. For now, it seems that the latest azureml-dataprep package works.

In reply to: 302984801 [](ancestors = 302984801)

Copy link
Member

@eerhardt eerhardt Jul 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After reading https://docs.microsoft.com/en-us/dotnet/core/tutorials/netcore-hosting#step-3---prepare-runtime-properties, and talking with a CoreCLR dev, the recommendation here is to use the TRUSTED_PLATFORM_ASSEMBLIES for all the assemblies. From the doc:

Because the host has more control over which assemblies are loaded using the TPA list, it is a best practice for hosts to determine which assemblies they expect to load and list them explicitly.

I don't want to block you going this route, since I haven't really worked in NimbusML, but I just figured I'd give you as much information as possible to make an informed decision. #Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thank you Eric!


In reply to: 303107371 [](ancestors = 303107371)

// AppDomainCompatSwitch
W("UseLatestBehaviorWhenTFMNotSpecified")
};
Expand Down
23 changes: 13 additions & 10 deletions src/NativeBridge/WinInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ class WinMlNetInterface
FindClose(findHandle);
}

ICLRRuntimeHost2* EnsureClrHost(const wchar_t * libsRoot, const wchar_t * coreclrDirRoot)
ICLRRuntimeHost2* EnsureClrHost(const wchar_t * libsRoot, const wchar_t * coreclrDirRoot, const wchar_t * dprepDirRoot)
{
if (_host != nullptr)
return _host;
Expand Down Expand Up @@ -228,7 +228,7 @@ class WinMlNetInterface
// TRUSTED_PLATFORM_ASSEMBLIES
tpaList.c_str(),
// APP_PATHS
libsRoot,
dprepDirRoot,
// AppDomainCompatSwitch
W("UseLatestBehaviorWhenTFMNotSpecified")
};
Expand Down Expand Up @@ -267,26 +267,29 @@ class WinMlNetInterface
}

public:
FNGETTER EnsureGetter(const char *nimbuslibspath, const char *coreclrpath)
FNGETTER EnsureGetter(const char *mlnetpath, const char *coreclrpath, const char *dpreppath)
{
if (_getter != nullptr)
return _getter;

std::wstring libsdir = Utf8ToUtf16le(nimbuslibspath);
std::wstring libsdir = Utf8ToUtf16le(mlnetpath);
ConvertToWinPath(libsdir);

std::wstring coreclrdir;
if (strlen(coreclrpath) != 0)
std::wstring coreclrdir = Utf8ToUtf16le(coreclrpath);
ConvertToWinPath(coreclrdir);

std::wstring dprepdir;
if (strlen(dpreppath) != 0)
{
coreclrdir = Utf8ToUtf16le(coreclrpath);
ConvertToWinPath(coreclrdir);
dprepdir = Utf8ToUtf16le(dpreppath);
ConvertToWinPath(dprepdir);
}
else
{
coreclrdir = libsdir;
dprepdir = libsdir;
}

ICLRRuntimeHost2* host = EnsureClrHost(libsdir.c_str(), coreclrdir.c_str());
ICLRRuntimeHost2* host = EnsureClrHost(libsdir.c_str(), coreclrdir.c_str(), dprepdir.c_str());
if (host == nullptr)
return nullptr;

Expand Down
22 changes: 13 additions & 9 deletions src/NativeBridge/dllmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
#define PARAM_SEED "seed"
#define PARAM_GRAPH "graph"
#define PARAM_VERBOSE "verbose"
#define PARAM_NIMBUSML_PATH "nimbusmlPath"
#define PARAM_MLNET_PATH "mlnetPath"
#define PARAM_DOTNETCLR_PATH "dotnetClrPath"
#define PARAM_DPREP_PATH "dprepPath"
#define PARAM_DATA "data"


Expand Down Expand Up @@ -44,14 +45,14 @@ static MlNetInterface *g_mlnetInterface = nullptr;
static GENERICEXEC g_exec = nullptr;

// Ensure that we have the DotNetBridge managed code entry point.
GENERICEXEC EnsureExec(const char *nimbuslibspath, const char *coreclrpath)
GENERICEXEC EnsureExec(const char *mlnetpath, const char *coreclrpath, const char *dpreppath)
{
if (g_mlnetInterface == nullptr)
g_mlnetInterface = new MlNetInterface();

if (g_exec == nullptr)
{
FNGETTER getter = g_mlnetInterface->EnsureGetter(nimbuslibspath, coreclrpath);
FNGETTER getter = g_mlnetInterface->EnsureGetter(mlnetpath, coreclrpath, dpreppath);
if (getter != nullptr)
g_exec = (GENERICEXEC)getter(FnIdGenericExec);
}
Expand All @@ -70,20 +71,23 @@ bp::dict pxCall(bp::dict& params)
try
{
bp::extract<std::string> graph(params[PARAM_GRAPH]);
bp::extract<std::string> nimbusmlPath(params[PARAM_NIMBUSML_PATH]);
bp::extract<std::string> mlnetPath(params[PARAM_MLNET_PATH]);
bp::extract<std::string> dotnetClrPath(params[PARAM_DOTNETCLR_PATH]);
bp::extract<std::string> dprepPath(params[PARAM_DPREP_PATH]);
bp::extract<std::int32_t> verbose(params[PARAM_VERBOSE]);
std::int32_t i_verbose = std::int32_t(verbose);
std::string s_nimbusmlPath = std::string(nimbusmlPath);
std::string s_mlnetPath = std::string(mlnetPath);
std::string s_dotnetClrPath = std::string(dotnetClrPath);
std::string s_dprepPath = std::string(dprepPath);
std::string s_graph = std::string(graph);
const char *nimbuslibspath = s_nimbusmlPath.c_str();
const char *mlnetpath = s_mlnetPath.c_str();
const char *coreclrpath = s_dotnetClrPath.c_str();
const char *dpreppath = s_dprepPath.c_str();

GENERICEXEC exec = EnsureExec(nimbuslibspath, coreclrpath);
GENERICEXEC exec = EnsureExec(mlnetpath, coreclrpath, dpreppath);
if (exec == nullptr)
throw std::invalid_argument("Failed to communicate with the managed library. Path searched: "
+ s_nimbusmlPath + " and " + s_dotnetClrPath);
throw std::invalid_argument("Failed to communicate with the managed library. Paths searched: "
+ s_mlnetPath + " and " + s_dotnetClrPath);

int seed = 42;
if (params.has_key(PARAM_SEED))
Expand Down
3 changes: 2 additions & 1 deletion src/Platforms/build.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<PropertyGroup>
<AssemblyName>dummy</AssemblyName>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp2.0</TargetFramework>
<TargetFramework>netcoreapp2.1</TargetFramework>
<Platforms>x64</Platforms>
<Configurations>DbgWinPy3.7;DbgWinPy3.6;DbgWinPy3.5;DbgWinPy2.7;RlsWinPy3.7;RlsWinPy3.6;RlsWinPy3.5;RlsWinPy2.7;DbgLinPy3.7;DbgLinPy3.6;DbgLinPy3.5;DbgLinPy2.7;RlsLinPy3.7;RlsLinPy3.6;RlsLinPy3.5;RlsLinPy2.7;RlsMacPy3.7;RlsMacPy3.6</Configurations>
<OutputPath>$(ProjectDir)..\..\x64\$(Configuration)\Platform\</OutputPath>
Expand All @@ -21,6 +21,7 @@
<PackageReference Include="Microsoft.ML.TensorFlow" Version="1.2.0" />
<PackageReference Include="Microsoft.ML.Ensemble" Version="0.14.0" />
<PackageReference Include="Microsoft.ML.TimeSeries" Version="1.2.0" />
<PackageReference Include="Microsoft.DataPrep" Version="0.0.1.5-preview" />
</ItemGroup>

</Project>
8 changes: 6 additions & 2 deletions src/python/nimbusml.pyproj
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
<ProjectTypeGuids>{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
<LaunchProvider>Standard Python launcher</LaunchProvider>
<Name>nimbusml</Name>
<InterpreterId>Global|VisualStudio|Py3.7</InterpreterId>
<InterpreterId>Global|VisualStudio|MinePy37</InterpreterId>
<InterpreterPath>..\..\dependencies\Python3.7\python.exe</InterpreterPath>
<EnableNativeCodeDebugging>False</EnableNativeCodeDebugging>
<StartupFile>nimbusml\tests\dprep\test_dprep.py</StartupFile>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)' == 'Debug'" />
<PropertyGroup Condition="'$(Configuration)' == 'Release'" />
Expand Down Expand Up @@ -581,6 +582,8 @@
<Compile Include="nimbusml\tests\decomposition\test_pcaanomalydetector.py" />
<Compile Include="nimbusml\tests\decomposition\test_pcatransformer.py" />
<Compile Include="nimbusml\tests\decomposition\__init__.py" />
<Compile Include="nimbusml\tests\dprep\test_dprep.py" />
<Compile Include="nimbusml\tests\dprep\__init__.py" />
<Compile Include="nimbusml\tests\ensemble\test_fasttreesbinaryclassifier.py" />
<Compile Include="nimbusml\tests\ensemble\test_fasttreestweedieregressor.py" />
<Compile Include="nimbusml\tests\ensemble\test_gambinaryclassifier.py" />
Expand Down Expand Up @@ -735,6 +738,7 @@
<Folder Include="docs\sphinx\modules\svm\kernel\" />
<Folder Include="docs\sphinx\_static\" />
<Folder Include="docs\sphinx\_static\images\" />
<Folder Include="nimbusml\tests\dprep\" />
<Folder Include="tests_extended\" />
<Folder Include="nimbusml\" />
<Folder Include="nimbusml\cluster\" />
Expand Down Expand Up @@ -1136,7 +1140,7 @@
<Content Include="tools\manifest_diff.json" />
</ItemGroup>
<ItemGroup>
<InterpreterReference Include="Global|VisualStudio|Py3.7" />
<InterpreterReference Include="Global|VisualStudio|MinePy37" />
</ItemGroup>
<Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
</Project>
1 change: 1 addition & 0 deletions src/python/nimbusml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from .internal.utils.data_roles import Role
from .internal.utils.data_schema import DataSchema
from .internal.utils.data_stream import BinaryDataStream
from .internal.utils.data_stream import DprepDataStream
from .internal.utils.data_stream import FileDataStream
from .internal.utils.utils import run_tests
from .pipeline import Pipeline
Expand Down
33 changes: 33 additions & 0 deletions src/python/nimbusml/internal/utils/data_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
"""
Owns nimbusml's containers.
"""
import os
import tempfile
from shutil import copyfile

from .data_roles import DataRoles
Expand Down Expand Up @@ -467,3 +469,34 @@ def clone(self):
"Method clone was not overwritten for class '{0}'".format(
type(self)))
return BinaryDataStream(self._filename)


class DprepDataStream(BinaryDataStream):
    """
    Defines a data view over a dprep file.

    The stream is backed by a ``.dprep`` file on disk. It is built
    either from an existing file (``filename``) or from a dataflow
    object (``dataflow``); in the latter case the result of
    ``dataflow.to_json()`` is written to a newly created temporary
    ``.dprep`` file, and that file becomes the backing file.

    :param dataflow: object exposing a ``to_json()`` method returning
        text. When given, it takes precedence over ``filename``.
    :param filename: path of an existing dprep file. Only used when
        ``dataflow`` is None.
    :raises ValueError: if both ``dataflow`` and ``filename`` are None.
    """

    def __init__(self, dataflow=None, filename=None):
        if dataflow is None and filename is None:
            raise ValueError('Both dataflow object and filename are None')
        # The base initializer receives an empty DataSchema; the backing
        # file name is assigned explicitly at the end of this method.
        super(DprepDataStream, self).__init__(DataSchema(""))
        if dataflow is not None:
            # mkstemp hands back an already-open OS-level descriptor.
            # Wrapping it in a context manager guarantees it is closed
            # even if serialization or the write fails.
            # NOTE: the temporary file is not removed by this class.
            (fd, filename) = tempfile.mkstemp(suffix='.dprep')
            with os.fdopen(fd, "wt") as fl:
                fl.write(dataflow.to_json())
        self._filename = filename

    def __repr__(self):
        # Backslashes in the path are doubled so that Windows paths are
        # displayed in escaped form.
        # NOTE(review): _schema and _roles are presumably set by the
        # base class initializer - confirm.
        return "DprepDataStream('{2}',\n '{0}',\n {1})".format(
            self._schema, self._roles, self._filename.replace('\\', '\\\\'))

    def clone(self):
        """
        Copy/clone the object.

        Returns a new DprepDataStream over the same backing file.
        """
        if not isinstance(self, DprepDataStream):
            raise NotImplementedError(
                "Method clone was not overwritten for class '{0}'".format(
                    type(self)))
        # Pass the path by keyword: the first positional parameter of
        # the constructor is ``dataflow``, so a positional call would
        # treat the path string as a dataflow and fail on ``to_json()``.
        return DprepDataStream(filename=self._filename)
Loading