|
| 1 | +using System; |
| 2 | +using System.Collections.Generic; |
| 3 | +using System.Linq; |
| 4 | +using Microsoft.ML; |
| 5 | +using Microsoft.ML.Data; |
| 6 | + |
| 7 | +namespace Samples.Dynamic |
| 8 | +{ |
| 9 | + /// <summary> |
| 10 | + /// The <see cref="IDataView"/> interface is the central concept of "data" in ML.NET. While many conveniences exist |
| 11 | + /// to create pre-baked implementations, it is also useful to know how to create one completely from scratch. We also |
| 12 | + /// take this opportunity to illustrate and motivate the basic principles of how the IDataView system is architected, |
| 13 | + /// since people interested in implementing <see cref="IDataView"/> need at least some knowledge of those principles. |
| 14 | + /// </summary> |
| 15 | + public static class SimpleDataViewImplementation |
| 16 | + { |
| 17 | + public static void Example() |
| 18 | + { |
| 19 | + // First we create an array of these objects, which we "present" as this IDataView implementation so that it |
| 20 | + // can be used in a simple ML.NET pipeline. |
| 21 | + var inputArray = new[] |
| 22 | + { |
| 23 | + new InputObject(false, "Hello my friend."), |
| 24 | + new InputObject(true, "Stay awhile and listen."), |
| 25 | + new InputObject(true, "Masterfully done hero!") |
| 26 | + }; |
| 27 | + var dataView = new InputObjectDataView(inputArray); |
| 28 | + |
| 29 | + // So, this is a very simple pipeline: a transformer that tokenizes Text, and does nothing with the Label column |
| 30 | + // at all. |
| 31 | + var mlContext = new MLContext(); |
| 32 | + var transformedDataView = mlContext.Transforms.Text.TokenizeIntoWords( |
| 33 | + "TokenizedText", "Text").Fit(dataView).Transform(dataView); |
| 34 | + |
| 35 | + var textColumn = transformedDataView.Schema["Text"]; |
| 36 | + var tokensColumn = transformedDataView.Schema["TokenizedText"]; |
| 37 | + |
| 38 | + using (var cursor = transformedDataView.GetRowCursor(new[] { textColumn, tokensColumn })) |
| 39 | + { |
| 40 | + // Note that it is best to get the getters and values *before* iteration, so as to facilitate buffer |
| 41 | + // sharing (if applicable), and column-type validation once, rather than many times. |
| 42 | + ReadOnlyMemory<char> textValue = default; |
| 43 | + VBuffer<ReadOnlyMemory<char>> tokensValue = default; |
| 44 | + |
| 45 | + var textGetter = cursor.GetGetter<ReadOnlyMemory<char>>(textColumn); |
| 46 | + var tokensGetter = cursor.GetGetter<VBuffer<ReadOnlyMemory<char>>>(tokensColumn); |
| 47 | + |
| 48 | + while (cursor.MoveNext()) |
| 49 | + { |
| 50 | + textGetter(ref textValue); |
| 51 | + tokensGetter(ref tokensValue); |
| 52 | + |
| 53 | + Console.WriteLine($"{textValue} => {string.Join(", ", tokensValue.DenseValues())}"); |
| 54 | + } |
| 55 | + |
| 56 | + // The output to console is this: |
| 57 | + |
| 58 | + // Hello my friend. => Hello, my, friend. |
| 59 | + // Stay awhile and listen. => Stay, awhile, and, listen. |
| 60 | + // Masterfully done hero! => Masterfully, done, hero! |
| 61 | + |
| 62 | + // Note that it may be interesting to set a breakpoint on the Console.WriteLine, and explore |
| 63 | + // what is going on with the cursor, and the buffers. In particular, on the third iteration, |
| 64 | + // while `tokensValue` is logically presented as a three element array, internally you will |
| 65 | + // see that the arrays internal to that structure have (at least) four items, specifically: |
| 66 | + // `Masterfully`, `done`, `hero!`, `listen.`. In this way we see a simple example of the details |
| 67 | + // of how buffer sharing from one iteration to the next actually works. |
| 68 | + } |
| 69 | + } |
| 70 | + |
| 71 | + private sealed class InputObject |
| 72 | + { |
| 73 | + public bool Label { get; } |
| 74 | + public string Text { get; } |
| 75 | + |
| 76 | + public InputObject(bool label, string text) |
| 77 | + { |
| 78 | + Label = label; |
| 79 | + Text = text; |
| 80 | + } |
| 81 | + } |
| 82 | + |
| 83 | + /// <summary> |
| 84 | + /// This is an implementation of <see cref="IDataView"/> that wraps an <see cref="IEnumerable{T}"/> |
| 85 | + /// of the above <see cref="InputObject"/>. Note that normally under these circumstances, the first |
| 86 | + /// recommendation would be to use a convenience like |
| 87 | + /// <see cref="DataOperationsCatalog.LoadFromEnumerable{TRow}(IEnumerable{TRow}, SchemaDefinition)"/> |
| 88 | + /// or something like that, rather than implementing <see cref="IDataView"/> outright. However, sometimes when |
| 89 | + /// code generation is impossible in some situations, like Unity or other similar platforms, implementing |
| 90 | + /// something closely resembling this may become necessary. |
| 91 | + /// |
| 92 | + /// This implementation of <see cref="IDataView"/>, being didactic, is much simpler than practically |
| 93 | + /// anything one would find in the ML.NET codebase. In this case we have a completely fixed schema (the two |
| 94 | + /// fields of <see cref="InputObject"/>), with fixed types. |
| 95 | + /// |
| 96 | + /// For <see cref="Schema"/>, note that we keep a very simple schema based off the members of the object. You |
| 97 | + /// may in fact note that it is possible in this specific case, this implementation of |
| 98 | + /// <see cref="IDataView"/> could share the same <see cref="DataViewSchema"/> object across all instances of this |
| 99 | + /// object, but since this is almost never the case, we do not take advantage of that. |
| 100 | + /// |
| 101 | + /// We have chosen to wrap an <see cref="IEnumerable{T}"/>, so in fact only a very simple implementation is |
| 102 | + /// possible. Specifically: we cannot meaningfully shuffle (so <see cref="CanShuffle"/> is |
| 103 | + /// <see langword="false"/>, and even if a <see cref="Random"/> parameter were passed to |
| 104 | + /// <see cref="GetRowCursor(IEnumerable{DataViewSchema.Column}, Random)"/>, we could not make use of it), we do |
| 105 | + /// not know the count of the items right away without counting (so, it is most correct for |
| 106 | + /// <see cref="GetRowCount"/> to return <see langword="null"/>, even after we might hypothetically know after |
| 107 | + /// the first pass, given the immutability principle of <see cref="IDataView"/>), and the |
| 108 | + /// <see cref="GetRowCursorSet(IEnumerable{DataViewSchema.Column}, int, Random)"/> method returns a single cursor. |
| 109 | + /// |
| 110 | + /// The <see cref="DataViewRowCursor"/> derived class has more documentation specific to its behavior. |
| 111 | + /// |
| 112 | + /// Note that this implementation, as well as the nested <see cref="DataViewRowCursor"/> derived class, does |
| 113 | + /// far less validation of parameters and guarding against misuse than we would like from, say, implementations of |
| 114 | + /// the same classes within the ML.NET codebase. |
| 115 | + /// </summary> |
| 116 | + private sealed class InputObjectDataView : IDataView |
| 117 | + { |
| 118 | + private readonly IEnumerable<InputObject> _data; |
| 119 | + public DataViewSchema Schema { get; } |
| 120 | + public bool CanShuffle => false; |
| 121 | + |
| 122 | + public InputObjectDataView(IEnumerable<InputObject> data) |
| 123 | + { |
| 124 | + _data = data; |
| 125 | + |
| 126 | + var builder = new DataViewSchema.Builder(); |
| 127 | + builder.AddColumn("Label", BooleanDataViewType.Instance); |
| 128 | + builder.AddColumn("Text", TextDataViewType.Instance); |
| 129 | + Schema = builder.ToSchema(); |
| 130 | + } |
| 131 | + |
| 132 | + public long? GetRowCount() => null; |
| 133 | + |
| 134 | + public DataViewRowCursor GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand = null) |
| 135 | + => new Cursor(this, columnsNeeded.Any(c => c.Index == 0), columnsNeeded.Any(c => c.Index == 1)); |
| 136 | + |
| 137 | + public DataViewRowCursor[] GetRowCursorSet(IEnumerable<DataViewSchema.Column> columnsNeeded, int n, Random rand = null) |
| 138 | + => new[] { GetRowCursor(columnsNeeded, rand) }; |
| 139 | + |
| 140 | + /// <summary> |
| 141 | + /// Having this be a private sealed nested class follows the typical pattern: in most |
| 142 | + /// <see cref="IDataView"/> implementations, the cursor instance is almost always that. The only "common" |
| 143 | + /// exceptions to this tendency are those implementations that are such thin wrappings of an existing |
| 144 | + /// <see cref="IDataView"/> that they do not even bother to change the schema. |
| 145 | + /// |
| 146 | + /// On the subject of schema, note that there is an expectation that the <see cref="Schema"/> object is |
| 147 | + /// reference equal to the <see cref="IDataView.Schema"/> object of the view that created this cursor, as we see here. |
| 148 | + /// |
| 149 | + /// Note that <see cref="Batch"/> returns <c>0</c>. As described in the documentation of that property, that |
| 150 | + /// is meant to facilitate the reconciliation of the partitioning of the data in the case where multiple |
| 151 | + /// cursors are returned from |
| 152 | + /// <see cref="GetRowCursorSet(IEnumerable{DataViewSchema.Column}, int, Random)"/>, but since only one is |
| 153 | + /// ever returned from the implementation, this behavior is appropriate. |
| 154 | + /// |
| 155 | + /// Similarly, since it is impossible to have a shuffled cursor or a cursor set, it is sufficient for the |
| 156 | + /// <see cref="GetIdGetter"/> implementation to return a simple ID based on the position. If, however, this |
| 157 | + /// had been something built on, hypothetically, an <see cref="IList{T}"/> or some other such structure, and |
| 158 | + /// shuffling and partitioning were available, an ID based on the index of whatever item was being returned |
| 159 | + /// would be appropriate. |
| 160 | + /// |
| 161 | + /// Note the usage of the <see langword="ref"/> parameters on the <see cref="ValueGetter{TValue}"/> |
| 162 | + /// implementations. This is most valuable in the case of buffer sharing for <see cref="VBuffer{T}"/>, but |
| 163 | + /// we still of course have to deal with it here. |
| 164 | + /// |
| 165 | + /// Note also that we spend a considerable amount of effort to make the |
| 166 | + /// <see cref="GetGetter{TValue}(DataViewSchema.Column)"/> and |
| 167 | + /// <see cref="IsColumnActive(DataViewSchema.Column)"/> methods correctly reflect what was asked for from |
| 168 | + /// the <see cref="GetRowCursor(IEnumerable{DataViewSchema.Column}, Random)"/> |
| 169 | + /// method that was used to create this cursor. In this particular case, the point is somewhat moot: this |
| 170 | + /// mechanism exists to enable lazy evaluation, but since this cursor is implemented to wrap an |
| 171 | + /// <see cref="IEnumerator{T}"/> which has no concept of lazy evaluation, there is no real practical benefit |
| 172 | + /// to doing this. However, it is best of course to illustrate the general principle for the sake of the |
| 173 | + /// example. |
| 174 | + /// |
| 175 | + /// Even in this simple form, we see the reason why <see cref="GetGetter{TValue}(DataViewSchema.Column)"/> |
| 176 | + /// is beneficial: the <see cref="ValueGetter{TValue}"/> implementations themselves are simple to the point |
| 177 | + /// where their operation is dwarfed by the simple acts of casting and validation checking one sees in |
| 178 | + /// <see cref="GetGetter{TValue}(DataViewSchema.Column)"/>. In this way we only pay the cost of validation |
| 179 | + /// and casting once, not every time we get a value. |
| 180 | + /// </summary> |
| 181 | + private sealed class Cursor : DataViewRowCursor |
| 182 | + { |
| 183 | + private bool _disposed; |
| 184 | + private long _position; |
| 185 | + private readonly IEnumerator<InputObject> _enumerator; |
| 186 | + private readonly Delegate[] _getters; |
| 187 | + |
| 188 | + public override long Position => _position; |
| 189 | + public override long Batch => 0; |
| 190 | + public override DataViewSchema Schema { get; } |
| 191 | + |
| 192 | + public Cursor(InputObjectDataView parent, bool wantsLabel, bool wantsText) |
| 193 | + { |
| 194 | + Schema = parent.Schema; |
| 195 | + _position = -1; |
| 196 | + _enumerator = parent._data.GetEnumerator(); |
| 197 | + _getters = new Delegate[] |
| 198 | + { |
| 199 | + wantsLabel ? (ValueGetter<bool>)LabelGetterImplementation : null, |
| 200 | + wantsText ? (ValueGetter<ReadOnlyMemory<char>>)TextGetterImplementation : null |
| 201 | + }; |
| 202 | + } |
| 203 | + |
| 204 | + protected override void Dispose(bool disposing) |
| 205 | + { |
| 206 | + if (_disposed) |
| 207 | + return; |
| 208 | + if (disposing) |
| 209 | + { |
| 210 | + _enumerator.Dispose(); |
| 211 | + _position = -1; |
| 212 | + } |
| 213 | + _disposed = true; |
| 214 | + base.Dispose(disposing); |
| 215 | + } |
| 216 | + |
| 217 | + private void LabelGetterImplementation(ref bool value) |
| 218 | + => value = _enumerator.Current.Label; |
| 219 | + |
| 220 | + private void TextGetterImplementation(ref ReadOnlyMemory<char> value) |
| 221 | + => value = _enumerator.Current.Text.AsMemory(); |
| 222 | + |
| 223 | + private void IdGetterImplementation(ref DataViewRowId id) |
| 224 | + => id = new DataViewRowId((ulong)_position, 0); |
| 225 | + |
| 226 | + public override ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column column) |
| 227 | + { |
| 228 | + if (!IsColumnActive(column)) |
| 229 | + throw new ArgumentOutOfRangeException(nameof(column)); |
| 230 | + return (ValueGetter<TValue>)_getters[column.Index]; |
| 231 | + } |
| 232 | + |
| 233 | + public override ValueGetter<DataViewRowId> GetIdGetter() |
| 234 | + => IdGetterImplementation; |
| 235 | + |
| 236 | + public override bool IsColumnActive(DataViewSchema.Column column) |
| 237 | + => _getters[column.Index] != null; |
| 238 | + |
| 239 | + public override bool MoveNext() |
| 240 | + { |
| 241 | + if (_disposed) |
| 242 | + return false; |
| 243 | + if (_enumerator.MoveNext()) |
| 244 | + { |
| 245 | + _position++; |
| 246 | + return true; |
| 247 | + } |
| 248 | + Dispose(); |
| 249 | + return false; |
| 250 | + } |
| 251 | + } |
| 252 | + } |
| 253 | + } |
| 254 | +} |
0 commit comments