Skip to content

Commit 326727f

Browse files
authored
Simple IDataView implementation sample. (#3302)
1 parent 43d4c18 commit 326727f

File tree

3 files changed

+262
-0
lines changed

3 files changed

+262
-0
lines changed
Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using Microsoft.ML;
5+
using Microsoft.ML.Data;
6+
7+
namespace Samples.Dynamic
8+
{
9+
/// <summary>
10+
/// The <see cref="IDataView"/> interface is the central concept of "data" in ML.NET. While many conveniences exist
11+
/// to create pre-baked implementations, it is also useful to know how to create one completely from scratch. We also
12+
/// take this opportunity to illustrate and motivate the basic principles of how the IDataView system is architected,
13+
/// since people interested in implementing <see cref="IDataView"/> need at least some knowledge of those principles.
14+
/// </summary>
15+
public static class SimpleDataViewImplementation
16+
{
17+
/// <summary>
/// Runs the sample end to end: wraps a few <see cref="InputObject"/> rows in the hand-written
/// <see cref="InputObjectDataView"/>, tokenizes the text column, and prints each row to the console.
/// </summary>
public static void Example()
{
    // Build a small in-memory array of rows and expose it through our own IDataView implementation,
    // so that it can feed an ordinary ML.NET pipeline.
    var rows = new[]
    {
        new InputObject(false, "Hello my friend."),
        new InputObject(true, "Stay awhile and listen."),
        new InputObject(true, "Masterfully done hero!")
    };
    var dataView = new InputObjectDataView(rows);

    // A deliberately tiny pipeline: tokenize the Text column into words, and leave the Label column
    // completely untouched.
    var mlContext = new MLContext();
    var tokenizer = mlContext.Transforms.Text.TokenizeIntoWords("TokenizedText", "Text");
    var fittedTokenizer = tokenizer.Fit(dataView);
    var transformedDataView = fittedTokenizer.Transform(dataView);

    var textColumn = transformedDataView.Schema["Text"];
    var tokensColumn = transformedDataView.Schema["TokenizedText"];

    using (var cursor = transformedDataView.GetRowCursor(new[] { textColumn, tokensColumn }))
    {
        // Obtain the getters and declare the value holders *before* iterating. Doing so facilitates
        // buffer sharing (where applicable) and means the column types are validated once, rather
        // than once per row.
        var textGetter = cursor.GetGetter<ReadOnlyMemory<char>>(textColumn);
        var tokensGetter = cursor.GetGetter<VBuffer<ReadOnlyMemory<char>>>(tokensColumn);

        ReadOnlyMemory<char> textValue = default;
        VBuffer<ReadOnlyMemory<char>> tokensValue = default;

        while (cursor.MoveNext())
        {
            // The same two holders are refilled on every row.
            textGetter(ref textValue);
            tokensGetter(ref tokensValue);

            Console.WriteLine($"{textValue} => {string.Join(", ", tokensValue.DenseValues())}");
        }

        // The output to console is this:

        // Hello my friend. => Hello, my, friend.
        // Stay awhile and listen. => Stay, awhile, and, listen.
        // Masterfully done hero! => Masterfully, done, hero!

        // It can be instructive to set a breakpoint on the Console.WriteLine and inspect the cursor
        // and the buffers. In particular, on the third iteration `tokensValue` is logically a three
        // element array, yet the arrays internal to that structure hold (at least) four items:
        // `Masterfully`, `done`, `hero!`, `listen.`. The trailing leftover from the previous row is a
        // simple illustration of how buffer sharing from one iteration to the next actually works.
    }
}
70+
71+
/// <summary>
/// Immutable row type for the sample: a boolean label paired with a piece of text.
/// </summary>
private sealed class InputObject
{
    /// <summary>Creates a row with the given label and text.</summary>
    /// <param name="label">Value exposed through <see cref="Label"/>.</param>
    /// <param name="text">Value exposed through <see cref="Text"/>.</param>
    public InputObject(bool label, string text)
    {
        this.Label = label;
        this.Text = text;
    }

    /// <summary>The label supplied at construction.</summary>
    public bool Label { get; }

    /// <summary>The text supplied at construction.</summary>
    public string Text { get; }
}
82+
83+
/// <summary>
84+
/// This is an implementation of <see cref="IDataView"/> that wraps an <see cref="IEnumerable{T}"/>
85+
/// of the above <see cref="InputObject"/>. Note that normally under these circumstances, the first
86+
/// recommendation would be to use a convenience like
87+
/// <see cref="DataOperationsCatalog.LoadFromEnumerable{TRow}(IEnumerable{TRow}, SchemaDefinition)"/>
88+
/// or something like that, rather than implementing <see cref="IDataView"/> outright. However, sometimes when
89+
/// code generation is impossible, as on Unity or other similar platforms, implementing
90+
/// something even closely resembling this may become necessary.
91+
///
92+
/// This implementation of <see cref="IDataView"/>, being didactic, is much simpler than practically
93+
/// anything one would find in the ML.NET codebase. In this case we have a completely fixed schema (the two
94+
/// fields of <see cref="InputObject"/>), with fixed types.
95+
///
96+
/// For <see cref="Schema"/>, note that we keep a very simple schema based off the members of the object. You
97+
/// may in fact note that it is possible in this specific case, this implementation of
98+
/// <see cref="IDataView"/> could share the same <see cref="DataViewSchema"/> object across all instances of this
99+
/// object, but since this is almost never the case, I do not take advantage of that.
100+
///
101+
/// We have chosen to wrap an <see cref="IEnumerable{T}"/>, so in fact only a very simple implementation is
102+
/// possible. Specifically: we cannot meaningfully shuffle (so <see cref="CanShuffle"/> is
103+
/// <see langword="false"/>, and even if a <see cref="Random"/> parameter were passed to
104+
/// <see cref="GetRowCursor(IEnumerable{DataViewSchema.Column}, Random)"/>, we could not make use of it), we do
105+
/// not know the count of the items right away without counting (so, it is most correct for
106+
/// <see cref="GetRowCount"/> to return <see langword="null"/>, even after we might hypothetically know after
107+
/// the first pass, given the immutability principle of <see cref="IDataView"/>), and the
108+
/// <see cref="GetRowCursorSet(IEnumerable{DataViewSchema.Column}, int, Random)"/> method returns a single item.
109+
///
110+
/// The <see cref="DataViewRowCursor"/> derived class has more documentation specific to its behavior.
111+
///
112+
/// Note that this implementation, as well as the nested <see cref="DataViewRowCursor"/> derived class, does
113+
/// far less validation of parameters and guarding against misuse than we would like from, say, implementations of
114+
/// the same classes within the ML.NET codebase.
115+
/// </summary>
116+
private sealed class InputObjectDataView : IDataView
117+
{
118+
// The wrapped sequence of rows; every cursor obtains its own enumerator from it.
private readonly IEnumerable<InputObject> _data;

/// <summary>The fixed two-column schema built in the constructor.</summary>
public DataViewSchema Schema { get; }

/// <summary>Always <see langword="false"/>: this view does not support shuffling.</summary>
public bool CanShuffle
{
    get { return false; }
}

/// <summary>
/// Wraps <paramref name="data"/> and builds the schema: a boolean "Label" column followed by a text
/// "Text" column. The cursor's getter table relies on that order (index 0 and index 1).
/// </summary>
/// <param name="data">The rows to present; stored as given, not copied.</param>
public InputObjectDataView(IEnumerable<InputObject> data)
{
    var schemaBuilder = new DataViewSchema.Builder();
    schemaBuilder.AddColumn("Label", BooleanDataViewType.Instance);
    schemaBuilder.AddColumn("Text", TextDataViewType.Instance);

    Schema = schemaBuilder.ToSchema();
    _data = data;
}

/// <summary>
/// Always returns <see langword="null"/>, i.e. the row count is reported as unknown.
/// </summary>
public long? GetRowCount()
{
    return null;
}
133+
134+
/// <summary>
/// Creates a new cursor over the wrapped rows in which only the requested columns are active.
/// </summary>
/// <param name="columnsNeeded">
/// The columns the caller intends to read. Columns are matched by index only: index 0 activates
/// "Label" and index 1 activates "Text"; any other index is ignored.
/// </param>
/// <param name="rand">Unused by this implementation.</param>
/// <returns>A fresh cursor positioned before the first row.</returns>
/// <exception cref="ArgumentNullException"><paramref name="columnsNeeded"/> is null.</exception>
public DataViewRowCursor GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand = null)
{
    if (columnsNeeded == null)
        throw new ArgumentNullException(nameof(columnsNeeded));

    // Walk the requested columns exactly once. The caller may pass a lazily evaluated or single-use
    // sequence, so enumerating it separately for each column could yield inconsistent answers.
    bool wantsLabel = false;
    bool wantsText = false;
    foreach (var column in columnsNeeded)
    {
        if (column.Index == 0)
            wantsLabel = true;
        else if (column.Index == 1)
            wantsText = true;
    }

    return new Cursor(this, wantsLabel, wantsText);
}
136+
137+
/// <summary>
/// Returns a one-element cursor set: the single cursor produced by
/// <see cref="GetRowCursor(IEnumerable{DataViewSchema.Column}, Random)"/>.
/// </summary>
/// <param name="columnsNeeded">The columns the caller intends to read.</param>
/// <param name="n">Unused by this implementation; exactly one cursor is always returned.</param>
/// <param name="rand">Forwarded unchanged to the single-cursor method.</param>
public DataViewRowCursor[] GetRowCursorSet(IEnumerable<DataViewSchema.Column> columnsNeeded, int n, Random rand = null)
{
    var onlyCursor = GetRowCursor(columnsNeeded, rand);
    return new DataViewRowCursor[] { onlyCursor };
}
139+
140+
/// <summary>
141+
/// Having this be a private sealed nested class follows the typical pattern: in most
142+
/// <see cref="IDataView"/> implementations, the cursor instance is almost always that. The only "common"
143+
/// exceptions to this tendency are those implementations that are such thin wrappings of existing
144+
/// <see cref="IDataView"/> that they do not even bother to change the schema.
145+
///
146+
/// On the subject of schema, note that there is an expectation that the <see cref="Schema"/> object is
147+
/// reference equal to the <see cref="IDataView.Schema"/> object that created this cursor, as we see here.
148+
///
149+
/// Note that <see cref="Batch"/> returns <c>0</c>. As described in the documentation of that property, that
150+
/// is meant to facilitate the reconciliation of the partitioning of the data in the case where multiple
151+
/// cursors are returned from
152+
/// <see cref="GetRowCursorSet(IEnumerable{DataViewSchema.Column}, int, Random)"/>, but since only one is
153+
/// ever returned from the implementation, this behavior is appropriate.
154+
///
155+
/// Similarly, since it is impossible to have a shuffled cursor or a cursor set, it is sufficient for the
156+
/// <see cref="GetIdGetter"/> implementation to return a simple ID based on the position. If, however, this
157+
/// had been something built on, hypothetically, an <see cref="IList{T}"/> or some other such structure, and
158+
/// shuffling and partitioning was available, an ID based on the index of whatever item was being returned
159+
/// would be appropriate.
160+
///
161+
/// Note the usage of the <see langword="ref"/> parameters on the <see cref="ValueGetter{TValue}"/>
162+
/// implementations. This is most valuable in the case of buffer sharing for <see cref="VBuffer{T}"/>, but
163+
/// we still of course have to deal with it here.
164+
///
165+
/// Note also that we spend a considerable amount of effort to make the
166+
/// <see cref="GetGetter{TValue}(DataViewSchema.Column)"/> and
167+
/// <see cref="IsColumnActive(DataViewSchema.Column)"/> methods correctly reflect what was asked for from
168+
/// the <see cref="GetRowCursor(IEnumerable{DataViewSchema.Column}, Random)"/>
169+
/// method that was used to create this cursor. In this particular case, the point is somewhat moot: this
170+
/// mechanism exists to enable lazy evaluation, but since this cursor is implemented to wrap an
171+
/// <see cref="IEnumerator{T}"/> which has no concept of lazy evaluation, there is no real practical benefit
172+
/// to doing this. However, it is best of course to illustrate the general principle for the sake of the
173+
/// example.
174+
///
175+
/// Even in this simple form, we see the reason why <see cref="GetGetter{TValue}(DataViewSchema.Column)"/>
176+
/// is beneficial: the <see cref="ValueGetter{TValue}"/> implementations themselves are simple to the point
177+
/// where their operation is dwarfed by the simple acts of casting and validation checking one sees in
178+
/// <see cref="GetGetter{TValue}(DataViewSchema.Column)"/>. In this way we only pay the cost of validation
179+
/// and casting once, not every time we get a value.
180+
/// </summary>
181+
private sealed class Cursor : DataViewRowCursor
{
    // Enumerator over the parent's rows; the getters read from its Current item.
    private readonly IEnumerator<InputObject> _enumerator;

    // Getter table indexed by column index (slot 0: Label, slot 1: Text). A null slot marks a
    // column that was not requested when the cursor was created.
    private readonly Delegate[] _getters;

    // Zero-based index of the current row; -1 before the first MoveNext and after disposal.
    private long _position;
    private bool _disposed;

    public override DataViewSchema Schema { get; }

    public override long Position
    {
        get { return _position; }
    }

    public override long Batch
    {
        get { return 0; }
    }

    /// <summary>
    /// Creates a cursor over <paramref name="parent"/>'s rows, sharing its schema instance.
    /// </summary>
    /// <param name="parent">The data view that is creating this cursor.</param>
    /// <param name="wantsLabel">Whether the "Label" column should be active.</param>
    /// <param name="wantsText">Whether the "Text" column should be active.</param>
    public Cursor(InputObjectDataView parent, bool wantsLabel, bool wantsText)
    {
        ValueGetter<bool> labelGetter = null;
        if (wantsLabel)
            labelGetter = LabelGetterImplementation;

        ValueGetter<ReadOnlyMemory<char>> textGetter = null;
        if (wantsText)
            textGetter = TextGetterImplementation;

        Schema = parent.Schema;
        _enumerator = parent._data.GetEnumerator();
        _getters = new Delegate[] { labelGetter, textGetter };
        _position = -1;
    }

    /// <summary>
    /// Releases the enumerator and resets the position. Calls after the first are no-ops.
    /// </summary>
    protected override void Dispose(bool disposing)
    {
        if (!_disposed)
        {
            if (disposing)
            {
                _enumerator.Dispose();
                _position = -1;
            }

            _disposed = true;
            base.Dispose(disposing);
        }
    }

    // Copies the current row's label into the caller-supplied location.
    private void LabelGetterImplementation(ref bool value)
    {
        value = _enumerator.Current.Label;
    }

    // Exposes the current row's text as read-only memory over the string.
    private void TextGetterImplementation(ref ReadOnlyMemory<char> value)
    {
        value = _enumerator.Current.Text.AsMemory();
    }

    // The row ID is simply the current position in the low word, with a zero high word.
    private void IdGetterImplementation(ref DataViewRowId id)
    {
        id = new DataViewRowId((ulong)_position, 0);
    }

    /// <summary>
    /// Returns the getter for an active column. Validation and the cast happen here, once, rather
    /// than on every value fetch.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The column is not active in this cursor.</exception>
    public override ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column column)
    {
        if (IsColumnActive(column))
            return (ValueGetter<TValue>)_getters[column.Index];

        throw new ArgumentOutOfRangeException(nameof(column));
    }

    public override ValueGetter<DataViewRowId> GetIdGetter()
    {
        return IdGetterImplementation;
    }

    /// <summary>Reports whether a getter was registered for the column's index.</summary>
    public override bool IsColumnActive(DataViewSchema.Column column)
    {
        return _getters[column.Index] != null;
    }

    /// <summary>
    /// Advances to the next row. Once the underlying enumerator is exhausted the cursor disposes
    /// itself, and every later call returns <see langword="false"/>.
    /// </summary>
    public override bool MoveNext()
    {
        if (_disposed)
            return false;

        if (!_enumerator.MoveNext())
        {
            Dispose();
            return false;
        }

        _position++;
        return true;
    }
}
252+
}
253+
}
254+
}

docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
<OutputType>Exe</OutputType>
66
<SignAssembly>false</SignAssembly>
77
<PublicSign>false</PublicSign>
8+
<RootNamespace>Samples</RootNamespace>
89
</PropertyGroup>
910

1011
<ItemGroup>

src/Microsoft.ML.DataView/IDataView.cs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@ namespace Microsoft.ML
1313
/// The input and output of Query Operators (Transforms). This is the fundamental data pipeline
1414
/// type, comparable to <see cref="IEnumerable{T}"/> for LINQ.
1515
/// </summary>
16+
/// <example>
17+
/// <format type="text/markdown">
18+
/// <![CDATA[
19+
/// [!code-csharp[SimpleDataViewImplementation](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/SimpleDataViewImplementation.cs)]
20+
/// ]]>
21+
/// </format>
22+
/// </example>
1623
public interface IDataView
1724
{
1825
/// <summary>

0 commit comments

Comments
 (0)