Skip to content

Updating DatabaseLoader to support getting column info from a given .NET type. #4091

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Aug 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Microsoft.ML.Data/Data/SchemaDefinition.cs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ public sealed class ColumnNameAttribute : Attribute
/// <summary>
/// Column name.
/// </summary>
[BestFriend]
internal string Name { get; }

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ public LoadColumnAttribute(int[] columnIndexes)
Sources.Add(new TextLoader.Range(col));
}

[BestFriend]
internal List<TextLoader.Range> Sources;
}
}
149 changes: 149 additions & 0 deletions src/Microsoft.ML.Experimental/DataLoadSave/Database/DatabaseLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
using System;
using System.Collections.Generic;
using System.Data;
using System.Data.Common;
using System.Linq;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text;
using Microsoft.ML;
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
Expand Down Expand Up @@ -98,6 +102,71 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
/// <param name="source">The source from which to load data.</param>
public IDataView Load(DatabaseSource source)
{
    // The returned view is a BoundLoader constructed from this loader and the given source.
    var bound = new BoundLoader(this, source);
    return bound;
}

/// <summary>
/// Creates a <see cref="DatabaseLoader"/> whose columns are derived from the public instance fields
/// and the readable, non-indexer public instance properties of <typeparamref name="TInput"/>.
/// </summary>
/// <remarks>
/// For each member:
/// the column name comes from <see cref="ColumnNameAttribute"/> when present, otherwise from the member name;
/// the column source comes from <see cref="LoadColumnAttribute"/> when present (exactly one range is required,
/// and only its minimum index is used);
/// the column type is the <see cref="DbType"/> mapped from the member type (or from the element type, for arrays).
/// </remarks>
/// <typeparam name="TInput">The user type describing the schema.</typeparam>
/// <param name="host">The host environment used to create the loader and to raise errors.</param>
/// <returns>A loader configured with one column per discovered member.</returns>
internal static DatabaseLoader CreateDatabaseLoader<TInput>(IHostEnvironment host)
{
    const BindingFlags publicInstance = BindingFlags.Public | BindingFlags.Instance;

    var userType = typeof(TInput);

    IEnumerable<MemberInfo> fieldInfos = userType.GetFields(publicInstance);

    // Only properties with a public getter and no index parameters can back a column.
    var propertyInfos =
        userType
            .GetProperties(publicInstance)
            .Where(x => x.CanRead && x.GetGetMethod() != null && x.GetIndexParameters().Length == 0);

    var memberInfos = fieldInfos.Concat(propertyInfos).ToArray();

    if (memberInfos.Length == 0)
        throw host.ExceptParam(nameof(TInput), $"Should define at least one public, readable field or property in {nameof(TInput)}.");

    var columns = new List<Column>(memberInfos.Length);

    foreach (var memberInfo in memberInfos)
    {
        var column = new Column
        {
            Name = memberInfo.GetCustomAttribute<ColumnNameAttribute>()?.Name ?? memberInfo.Name
        };

        var mappingAttr = memberInfo.GetCustomAttribute<LoadColumnAttribute>();
        if (mappingAttr != null)
        {
            // Report a descriptive error instead of letting Enumerable.Single fail with
            // "Sequence contains no/more than one element" when the attribute does not
            // describe exactly one range.
            var sourceCount = mappingAttr.Sources?.Count ?? 0;
            if (sourceCount != 1)
            {
                throw host.Except(
                    $"Member {memberInfo.Name} must map to exactly one source range, but its {nameof(LoadColumnAttribute)} specifies {sourceCount}.");
            }

            // Only the minimum index of the range is used as the column source.
            column.Source = Range.FromTextLoaderRange(mappingAttr.Sources[0]).Min;
        }

        // Resolve the declared type once so fields and properties share the same type handling.
        Type memberType;
        string memberKind;
        switch (memberInfo)
        {
            case FieldInfo field:
                memberType = field.FieldType;
                memberKind = "Field";
                break;

            case PropertyInfo property:
                memberType = property.PropertyType;
                memberKind = "Property";
                break;

            default:
                Contracts.Assert(false);
                throw Contracts.ExceptNotSupp("Expected a FieldInfo or a PropertyInfo");
        }

        // Arrays are described by the kind of their element type.
        var itemType = memberType.IsArray ? memberType.GetElementType() : memberType;
        if (!InternalDataKindExtensions.TryGetDataKind(itemType, out InternalDataKind dk))
            throw host.Except($"{memberKind} {memberInfo.Name} is of unsupported type.");

        // ToDbType throws NotSupportedException for kinds that have no DbType mapping.
        column.Type = dk.ToDbType();

        columns.Add(column);
    }

    var options = new Options
    {
        Columns = columns.ToArray()
    };
    return new DatabaseLoader(host, options);
}

/// <summary>
/// Describes how an input column should be mapped to an <see cref="IDataView"/> column.
/// </summary>
Expand Down Expand Up @@ -128,6 +197,86 @@ public sealed class Column
public KeyCount KeyCount;
}

/// <summary>
/// Specifies the range of indices of input columns that should be mapped to an output column.
/// </summary>
public sealed class Range
{
public Range() { }

/// <summary>
/// A range representing a single value. Will result in a scalar column.
/// </summary>
/// <param name="index">The index of the field of the text file to read.</param>
public Range(int index)
{
    // Negative indices are rejected before any state is assigned.
    Contracts.CheckParam(0 <= index, nameof(index), "Must be non-negative");

    // Both bounds are set to the same index, so the range covers a single value.
    Max = Min = index;
}

/// <summary>
/// A range representing a set of values. Will result in a vector column.
/// </summary>
/// <param name="min">The minimum inclusive index of the column.</param>
/// <param name="max">The maximum inclusive index of the column. If <c>null</c>,
/// indicates that the <see cref="TextLoader"/> should auto-detect the length
/// of the lines, and read until the end.</param>
public Range(int min, int? max)
{
    Contracts.CheckParam(0 <= min, nameof(min), "Must be non-negative");

    // A missing upper bound is always accepted; a supplied one must not be below min.
    bool maxIsAcceptable = !max.HasValue || max.Value >= min;
    Contracts.CheckParam(maxIsAcceptable, nameof(max), "If specified, must be greater than or equal to " + nameof(min));

    Min = min;
    Max = max;
    AutoEnd = !max.HasValue;

    // Without this flag, a single range where Min == Max would yield a scalar column
    // rather than a vector-valued one.
    ForceVector = true;
}

/// <summary>
/// The minimum index of the column, inclusive.
/// </summary>
[Argument(ArgumentType.Required, HelpText = "First index in the range")]
public int Min;

/// <summary>
/// The maximum index of the column, inclusive. If <see langword="null"/>,
/// indicates that the <see cref="TextLoader"/> should auto-detect the length
/// of the lines, and read until the end.
/// If <see cref="Max"/> is specified, the field <see cref="AutoEnd"/> is ignored.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Last index in the range")]
public int? Max;

/// <summary>
/// Whether this range extends to the end of the line, but should be a fixed number of items.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does "to the end of the line" mean to a database loader?

I think we should remove these options until we decide we have data that says they are required. That way they don't "sneak" in to the product, and don't do anything.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case it would mean to the end of the table. I'm pretty sure it is used for vector support.

/// If <see cref="Max"/> is specified, the field <see cref="AutoEnd"/> is ignored.
/// </summary>
[Argument(ArgumentType.AtMostOnce,
HelpText = "This range extends to the end of the line, but should be a fixed number of items",
ShortName = "auto")]
public bool AutoEnd;

/// <summary>
/// Whether this range includes only other indices not specified.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "This range includes only other indices not specified", ShortName = "other")]
public bool AllOther;

/// <summary>
/// Force scalar columns to be treated as vectors of length one.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Force scalar columns to be treated as vectors of length one", ShortName = "vector")]
public bool ForceVector;

/// <summary>
/// Converts a <see cref="TextLoader.Range"/> into the equivalent database loader <see cref="Range"/>.
/// </summary>
/// <param name="range">The text loader range to convert. Must not be null.</param>
/// <returns>A new <see cref="Range"/> with the same bounds and the same vector/other flags.</returns>
internal static Range FromTextLoaderRange(TextLoader.Range range)
{
    Contracts.CheckValue(range, nameof(range));

    var result = new Range(range.Min, range.Max);

    // The two-argument constructor unconditionally sets ForceVector to true, which would turn a
    // scalar source range into a vector range. Carry the source flags over instead.
    result.ForceVector = range.ForceVector;
    result.AllOther = range.AllOther;
    return result;
}
}

/// <summary>
/// The settings for <see cref="DatabaseLoader"/>
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@ namespace Microsoft.ML
/// </summary>
public static class DatabaseLoaderCatalog
{
/// <summary>
/// Create a database loader <see cref="DatabaseLoader"/>.
/// </summary>
/// <summary>Create a database loader <see cref="DatabaseLoader"/>.</summary>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="columns">Array of columns <see cref="DatabaseLoader.Column"/> defining the schema.</param>
public static DatabaseLoader CreateDatabaseLoader(this DataOperationsCatalog catalog,
Expand All @@ -23,8 +21,22 @@ public static DatabaseLoader CreateDatabaseLoader(this DataOperationsCatalog cat
{
Columns = columns,
};

return new DatabaseLoader(CatalogUtils.GetEnvironment(catalog), options);
return catalog.CreateDatabaseLoader(options);
}

/// <summary>Create a database loader <see cref="DatabaseLoader"/>.</summary>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="options">Defines the settings of the load operation.</param>
public static DatabaseLoader CreateDatabaseLoader(this DataOperationsCatalog catalog,
    DatabaseLoader.Options options)
{
    // The loader is constructed with the environment obtained from the catalog.
    var environment = CatalogUtils.GetEnvironment(catalog);
    return new DatabaseLoader(environment, options);
}

/// <summary>Create a database loader <see cref="DatabaseLoader"/>.</summary>
/// <typeparam name="TInput">Defines the schema of the data to be loaded. Use public fields or properties
/// decorated with <see cref="LoadColumnAttribute"/> (and possibly other attributes) to specify the column
/// names and their data types in the schema of the loaded data.</typeparam>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
public static DatabaseLoader CreateDatabaseLoader<TInput>(this DataOperationsCatalog catalog)
{
    // Schema inference from TInput is delegated to DatabaseLoader, using the catalog's environment.
    var environment = CatalogUtils.GetEnvironment(catalog);
    return DatabaseLoader.CreateDatabaseLoader<TInput>(environment);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,82 @@ public static Type ToType(this DbType dbType)
return null;
}
}

/// <summary>Maps a <see cref="InternalDataKind"/> to the associated <see cref="DbType"/>.</summary>
public static DbType ToDbType(this InternalDataKind dataKind)
{
switch (dataKind)
{
case InternalDataKind.I1:
{
return DbType.SByte;
}

case InternalDataKind.U1:
{
return DbType.Byte;
}

case InternalDataKind.I2:
{
return DbType.Int16;
}

case InternalDataKind.U2:
{
return DbType.UInt16;
}

case InternalDataKind.I4:
{
return DbType.Int32;
}

case InternalDataKind.U4:
{
return DbType.UInt32;
}

case InternalDataKind.I8:
{
return DbType.Int64;
}

case InternalDataKind.U8:
{
return DbType.UInt64;
}

case InternalDataKind.R4:
{
return DbType.Single;
}

case InternalDataKind.R8:
{
return DbType.Double;
}

case InternalDataKind.TX:
{
return DbType.String;
}

case InternalDataKind.BL:
{
return DbType.Boolean;
}

case InternalDataKind.DT:
Copy link
Member

@eerhardt eerhardt Aug 8, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about DateTimeOffset (InternalDataKind.DZ) and TimeSpan (InternalDataKind.TS) ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no equivalent for DbKind

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even DbType.DateTimeOffset and DbType.Time?

https://docs.microsoft.com/en-us/dotnet/api/system.data.dbtype

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The latter is the SQL time type, not an equivalent to the System.TimeSpan type; I'm not sure what the appropriate conversion is. I missed DateTimeOffset.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The latter is the SQL time type, not an equivalent to the System.TimeSpan type;

According to https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/sql-server-data-type-mappings, it is the appropriate mapping:

SQL Server Database Engine type .NET Framework type SqlDbType enumeration SqlDataReader SqlTypes typed accessor DbType enumeration SqlDataReader DbType typed accessor
time(SQL Server 2008 and later) TimeSpan Time none Time GetDateTime

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that also the appropriate mapping for all database types, not just SQL?

Also, it looks like it isn't even the SQL time type, it is the SQL datetime type and there is a separate value in a separate enum that should be used for SQL time:

| Time | 17 | A type representing a SQL Server DateTime value. If you want to use a SQL Server time value, use Time. |

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, right.
These weren't done because DbDataReader doesn't expose Get methods for them

{
return DbType.DateTime;
}

default:
{
throw new NotSupportedException();
}
}
}
}
}
Loading