Skip to content

Commit 7e6c828

Browse files
committed
Address code review comments and add a sample
1 parent 4c2d59c commit 7e6c828

File tree

4 files changed

+173
-93
lines changed

4 files changed

+173
-93
lines changed
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Text;
5+
using Microsoft.ML;
6+
using Microsoft.ML.Data;
7+
using Microsoft.ML.Transforms;
8+
using Microsoft.VisualBasic.CompilerServices;
9+
using Tensorflow;
10+
11+
namespace Samples.Dynamic.DataOperations
12+
{
13+
public static class LoadingSvmLight
14+
{
15+
// This example shows how to load data in SVM light format with the SvmLightLoader.
16+
/// <summary>
/// Writes a small random data set in SVM light format to disk and loads it twice:
/// first with a loader created from a data sample, then with a loader created from
/// an explicit feature count. The schema and the rows are printed after each load.
/// </summary>
public static void Example()
{
    // Create a random SVM light format file.
    var random = new Random(42);
    var dataDirectoryName = "DataDir";
    Directory.CreateDirectory(dataDirectoryName);
    var fileName = Path.Combine(dataDirectoryName, "SVM_Data.csv");
    using (var fs = File.CreateText(fileName))
    {
        // Write random lines in SVM light format:
        //   <label> cost:<weight> <index>:<value> <index>:<value> ...
        for (int line = 0; line < 10; line++)
        {
            var sb = new StringBuilder();
            if (random.NextDouble() > 0.5)
                sb.Append("1 ");
            else
                sb.Append("-1 ");

            // Every token needs its own trailing separator; without it the weight
            // token would run into the first feature token written below.
            if (line % 2 == 0)
                sb.Append("cost:1 ");
            else
                sb.Append("cost:2 ");

            for (int i = 1; i <= 10; i++)
            {
                // Skip roughly half of the features at random.
                if (random.NextDouble() > 0.5)
                    continue;

                // Format with the invariant culture so the decimal separator is
                // always '.', whatever the culture of the machine running the sample.
                sb.Append(string.Format(
                    System.Globalization.CultureInfo.InvariantCulture,
                    "{0}:{1} ", i, random.NextDouble()));
            }
            fs.WriteLine(sb.ToString());
        }
    }

    // Create an SvmLightLoader.
    var mlContext = new MLContext();
    var file = new MultiFileSource(fileName);
    var loader = mlContext.Data.CreateSvmLightLoader(dataSample: file);

    // Load a single file from path.
    var svmData = loader.Load(file);

    PrintSchema(svmData);

    // Expected Output:
    // Column Label type Single
    // Column Weight type Single
    // Column GroupId type Key<UInt64, 0 - 18446744073709551613>
    // Column Comment type String
    // Column Features type Vector<Single, 10>

    PrintData(svmData);

    // Expected Output (label, weight, then the feature values of each row):
    // NOTE(review): the rows below were captured before the weight token was written
    // with its own separator; re-run the sample and refresh them (the weight column
    // presumably alternates between 1 and 2 afterwards - TODO confirm).
    // 1 1 0 0 0.2625927 0 0 0.7612506 0.2573214 0 0.3809696 0.5174511
    // -1 1 0 0 0 0.7051522 0 0 0.7111546 0.9062127 0 0
    // -1 1 0 0 0 0.535722 0 0 0.1491191 0.05100901 0 0
    // -1 1 0 0.6481459 0.04449836 0 0 0.4203662 0 0 0.01325378 0.2674384
    // -1 1 0 0 0.7978093 0.5134962 0.008952909 0 0.003074009 0.6541431 0.9135142 0
    // -1 1 0 0.3727672 0.4369507 0 0 0.2973725 0 0 0 0.8816807
    // 1 1 0 0.1031429 0.3332489 0 0.1346936 0.5916625 0 0 0 0
    // 1 1 0 0 0 0.3454075 0 0.2197472 0.03848049 0.5923384 0.09373277 0
    // -1 1 0 0.7511514 0 0.0420841 0 0 0.9262196 0 0.545344 0
    // 1 1 0 0.02958358 0.9334617 0 0 0.8833956 0.2947684 0 0 0

    // If the loader is created without a data sample we need to specify the number of features expected in the file.
    loader = mlContext.Data.CreateSvmLightLoader(10);
    svmData = loader.Load(file);

    PrintSchema(svmData);
    PrintData(svmData);
}
85+
86+
/// <summary>
/// Prints one console line per column in the schema of <paramref name="svmData"/>,
/// showing the column's name and its type.
/// </summary>
private static void PrintSchema(IDataView svmData)
{
    var schema = svmData.Schema;
    foreach (var column in schema)
    {
        var description = $"Column {column.Name} type {column.Type}";
        Console.WriteLine(description);
    }
}
91+
92+
/// <summary>
/// Walks every row of <paramref name="svmData"/> with a row cursor and prints the
/// "Label" and "Weight" values followed by the values of the "Features" vector,
/// all separated by single spaces.
/// </summary>
private static void PrintData(IDataView svmData)
{
    var schema = svmData.Schema;
    using (var rows = svmData.GetRowCursor(schema))
    {
        // One getter per column that is printed.
        var readLabel = rows.GetGetter<float>(schema["Label"]);
        var readWeight = rows.GetGetter<float>(schema["Weight"]);
        var readFeatures = rows.GetGetter<VBuffer<float>>(schema["Features"]);

        // Declared outside the loop so the same buffer is handed to the getter for every row.
        VBuffer<float> featureBuffer = default;
        while (rows.MoveNext())
        {
            float label = default;
            float weight = default;

            readLabel(ref label);
            readWeight(ref weight);
            readFeatures(ref featureBuffer);

            var featureText = string.Join(' ', featureBuffer.DenseValues());
            Console.WriteLine($"{label} {weight} {featureText}");
        }
    }
}
115+
}
116+
}

docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
<ProjectReference Include="..\..\..\src\Microsoft.ML.TensorFlow\Microsoft.ML.TensorFlow.csproj" />
2424
<ProjectReference Include="..\..\..\src\Microsoft.ML.TimeSeries\Microsoft.ML.TimeSeries.csproj" />
2525
<ProjectReference Include="..\..\..\src\Microsoft.ML.DnnImageFeaturizer.ResNet18\Microsoft.ML.DnnImageFeaturizer.ResNet18.csproj" />
26+
<ProjectReference Include="..\..\..\src\Microsoft.ML.Transforms\Microsoft.ML.Transforms.csproj" />
2627

2728
<NativeAssemblyReference Include="CpuMathNative" />
2829
<NativeAssemblyReference Include="FastTreeNative" />

src/Microsoft.ML.Transforms/SvmLight/SvmLightLoader.cs

Lines changed: 55 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
1-
//------------------------------------------------------------------------------
2-
// <copyright company="Microsoft Corporation">
3-
// Copyright (c) Microsoft Corporation. All rights reserved.
4-
// </copyright>
5-
//------------------------------------------------------------------------------
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
64

75
using System;
86
using System.Collections.Generic;
@@ -302,53 +300,43 @@ private sealed class IntermediateInput
302300
/// </summary>
303301
private sealed class Indices
304302
{
305-
public VBuffer<int> FeatureKeys;
303+
[KeyType(uint.MaxValue - 1)]
304+
public VBuffer<uint> FeatureKeys;
305+
}
306306

307-
public static void ParseIndicesToOneBased(IntermediateInput input, Indices output)
307+
private sealed class IndexParser
308+
{
309+
private readonly uint _offset;
310+
private readonly uint _na;
311+
312+
public IndexParser(bool zeroBased, ulong featureCount)
308313
{
309-
var editor = VBufferEditor.Create(ref output.FeatureKeys, input.FeatureKeys.Length);
310-
var inputValues = input.FeatureKeys.GetValues();
311-
for (int i = 0; i < inputValues.Length; i++)
312-
{
313-
if (Conversions.Instance.TryParse(in inputValues[i], out int index) && index > 0)
314-
editor.Values[i] = index - 1;
315-
else
316-
editor.Values[i] = -1;
317-
}
318-
output.FeatureKeys = editor.Commit();
314+
_offset = zeroBased ? (uint)0 : 1;
315+
_na = (uint)featureCount + 1;
319316
}
320317

321-
public static void ParseIndicesToZeroBased(IntermediateInput input, Indices output)
318+
public void ParseIndices(IntermediateInput input, Indices output)
322319
{
323320
var editor = VBufferEditor.Create(ref output.FeatureKeys, input.FeatureKeys.Length);
324321
var inputValues = input.FeatureKeys.GetValues();
325322
for (int i = 0; i < inputValues.Length; i++)
326323
{
327-
if (Conversions.Instance.TryParse(in inputValues[i], out int index) && index >= 0)
328-
editor.Values[i] = index;
324+
if (Conversions.Instance.TryParse(in inputValues[i], out uint index) && index >= _offset)
325+
editor.Values[i] = index - _offset + 1;
329326
else
330-
editor.Values[i] = -1;
327+
editor.Values[i] = _na;
331328
}
332329
output.FeatureKeys = editor.Commit();
333330
}
334331
}
335332

336333
/// <summary>
337-
/// This class and the <see cref="IntermediateOut"/> class are used by the <see cref="CustomMappingTransformer{TSrc, TDst}"/>
338-
/// that maps a vector of indices and a vector of values into a single <see cref="VBuffer{T}"/> of values. When the indices
339-
/// originate from the <see cref="ValueToKeyMappingTransformer"/> (in case features are specified by name), <see cref="IntermediateOutKeys"/>
340-
/// is used, and when they originate from a <see cref="CustomMappingTransformer{TSrc, TDst}"/> that produces an <see cref="Indices"/>,
341-
/// <see cref="IntermediateOut"/> is used.
334+
/// This class is used by the <see cref="CustomMappingTransformer{TSrc, TDst}"/>
335+
/// that maps a vector of indices and a vector of values into a single <see cref="VBuffer{T}"/> of values.
342336
/// </summary>
343-
private sealed class IntermediateOutKeys
344-
{
345-
public VBuffer<uint> FeatureKeys;
346-
public VBuffer<float> FeatureValues;
347-
}
348-
349337
private sealed class IntermediateOut
350338
{
351-
public VBuffer<int> FeatureKeys;
339+
public VBuffer<uint> FeatureKeys;
352340
public VBuffer<float> FeatureValues;
353341
}
354342

@@ -359,81 +347,54 @@ private sealed class Output
359347
#pragma warning restore 0649
360348

361349
/// <summary>
362-
/// This class contains the mapper that maps an <see cref="IntermediateOut"/> or an <see cref="IntermediateOutKeys"/>
350+
/// This class contains the mapper that maps an <see cref="IntermediateOut"/>
363351
/// to an <see cref="Output"/>.
364352
/// </summary>
365353
private sealed class OutputMapper
366354
{
367355
private readonly uint _keyMax;
356+
private readonly BufferBuilder<float> _bldr;
357+
private readonly bool[] _indexUsed;
368358

369359
public OutputMapper(int keyCount)
370360
{
371361
Contracts.Assert(keyCount > 0);
372362
// Leave as uint, so that comparisons against uint key values do not
373363
// incur any sort of implicit value conversions.
374364
_keyMax = (uint)keyCount;
365+
_bldr = new BufferBuilder<float>(FloatAdder.Instance);
366+
_indexUsed = new bool[_keyMax];
375367
}
376368

377369
public void Map(IntermediateOut intermediate, Output output)
378370
{
379371
MapCore(ref intermediate.FeatureKeys, ref intermediate.FeatureValues, output);
380372
}
381373

382-
public void Map(IntermediateOutKeys intermediate, Output output)
383-
{
384-
MapCore(ref intermediate.FeatureKeys, ref intermediate.FeatureValues, output);
385-
}
386-
387-
private void MapCore(ref VBuffer<int> keys, ref VBuffer<float> values, Output output)
388-
{
389-
var editor = VBufferEditor.Create(ref output.Features, (int)_keyMax);
390-
editor.Values.Clear();
391-
392-
// I fully expect that these inputs will be of equal size. But I don't want to
393-
// throw in the event that they're not. Instead just have it be an empty vector.
394-
// REVIEW: Add warning and reporting for bad inputs for these.
395-
if (keys.Length == values.Length)
396-
{
397-
// Both of these inputs should be dense, but still work even if they're not.
398-
VBufferUtils.Densify(ref keys);
399-
VBufferUtils.Densify(ref values);
400-
var keysValues = keys.GetValues();
401-
var valuesValues = values.GetValues();
402-
for (int i = 0; i < keysValues.Length; ++i)
403-
{
404-
var key = keysValues[i];
405-
if (key < 0 || key >= _keyMax)
406-
continue;
407-
editor.Values[key] = valuesValues[i];
408-
}
409-
}
410-
output.Features = editor.Commit();
411-
}
412-
413374
private void MapCore(ref VBuffer<uint> keys, ref VBuffer<float> values, Output output)
414375
{
415-
var editor = VBufferEditor.Create(ref output.Features, (int)_keyMax);
416-
editor.Values.Clear();
417-
418-
// I fully expect that these inputs will be of equal size. But I don't want to
419-
// throw in the event that they're not. Instead just have it be an empty vector.
420-
// REVIEW: Add warning and reporting for bad inputs for these.
421-
if (keys.Length == values.Length)
376+
Contracts.Check(keys.Length == values.Length, "number of keys does not match number of values.");
377+
378+
// Both of these inputs should be dense, but still work even if they're not.
379+
VBufferUtils.Densify(ref keys);
380+
VBufferUtils.Densify(ref values);
381+
var keysValues = keys.GetValues();
382+
var valuesValues = values.GetValues();
383+
384+
// The output vector could be sparse, so we use BufferBuilder here.
385+
_bldr.Reset((int)_keyMax, false);
386+
Array.Clear(_indexUsed, 0, _indexUsed.Length);
387+
for (int i = 0; i < keys.Length; ++i)
422388
{
423-
// Both of these inputs should be dense, but still work even if they're not.
424-
VBufferUtils.Densify(ref keys);
425-
VBufferUtils.Densify(ref values);
426-
var keysValues = keys.GetValues();
427-
var valuesValues = values.GetValues();
428-
for (int i = 0; i < keys.Length; ++i)
429-
{
430-
var key = keysValues[i];
431-
if (key == 0 || key > _keyMax)
432-
continue;
433-
editor.Values[(int)key - 1] = valuesValues[i];
434-
}
389+
var key = keysValues[i];
390+
if (key == 0 || key > _keyMax)
391+
continue;
392+
if (_indexUsed[(int)key - 1])
393+
continue;
394+
_bldr.AddFeature((int)key - 1, valuesValues[i]);
395+
_indexUsed[(int)key - 1] = true;
435396
}
436-
output.Features = editor.Commit();
397+
_bldr.GetResult(ref output.Features);
437398
}
438399
}
439400

@@ -684,10 +645,11 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
684645
private DataViewSchema CreateOutputSchema()
685646
{
686647
var data = GetData(_host, null, null);
648+
var indexParser = new IndexParser(_indicesKind == FeatureIndices.ZeroBased, _featureCount);
649+
var schemaDef = SchemaDefinition.Create(typeof(Indices));
650+
schemaDef[nameof(Indices.FeatureKeys)].ColumnType = new KeyDataViewType(typeof(uint), _featureCount);
687651
var keyVectorsToIndexVectors = _keyVectorsToIndexVectors ??
688-
(_indicesKind == FeatureIndices.OneBased ?
689-
new CustomMappingTransformer<IntermediateInput, Indices>(_host, Indices.ParseIndicesToOneBased, null) :
690-
new CustomMappingTransformer<IntermediateInput, Indices>(_host, Indices.ParseIndicesToZeroBased, null));
652+
new CustomMappingTransformer<IntermediateInput, Indices>(_host, indexParser.ParseIndices, null);
691653
var schema = keyVectorsToIndexVectors.GetOutputSchema(data.Schema);
692654
return CreateOutputTransformer(_host, (int)_featureCount,
693655
_indicesKind == FeatureIndices.Names, schema).GetOutputSchema(schema);
@@ -783,13 +745,15 @@ private static ITransformer CreateOutputTransformer(IHostEnvironment env, int ke
783745
col.Annotations.GetValue(AnnotationUtils.Kinds.KeyValues, ref keyValues);
784746
schemaDef[0].AddAnnotation(AnnotationUtils.Kinds.SlotNames, keyValues, keyValuesCol.Value.Type);
785747
}
786-
outputTransformer = new CustomMappingTransformer<IntermediateOutKeys, Output>(env,
748+
outputTransformer = new CustomMappingTransformer<IntermediateOut, Output>(env,
787749
outputMapper.Map, null, outputSchemaDefinition: schemaDef);
788750
}
789751
else
790752
{
791753
outputTransformer = new CustomMappingTransformer<IntermediateOut, Output>(env,
792754
outputMapper.Map, null, outputSchemaDefinition: schemaDef);
755+
//outputTransformer = new CustomMappingTransformer<IntermediateOut, Output>(env,
756+
// outputMapper.Map, null, outputSchemaDefinition: schemaDef);
793757
}
794758

795759
string[] toKeep = { "Label", "Weight", "GroupId", "Comment", "Features" };
@@ -801,10 +765,9 @@ private static ITransformer CreateOutputTransformer(IHostEnvironment env, int ke
801765
public IDataView Load(IMultiStreamSource input)
802766
{
803767
var data = GetData(_host, null, input);
768+
var indexParser = new IndexParser(_indicesKind == FeatureIndices.ZeroBased, _featureCount);
804769
var keyVectorsToIndexVectors = _keyVectorsToIndexVectors ??
805-
(_indicesKind == FeatureIndices.OneBased ?
806-
new CustomMappingTransformer<IntermediateInput, Indices>(_host, Indices.ParseIndicesToOneBased, null) :
807-
new CustomMappingTransformer<IntermediateInput, Indices>(_host, Indices.ParseIndicesToZeroBased, null));
770+
new CustomMappingTransformer<IntermediateInput, Indices>(_host, indexParser.ParseIndices, null);
808771
data = keyVectorsToIndexVectors.Transform(data);
809772
return CreateOutputTransformer(_host, (int)_featureCount, _indicesKind == FeatureIndices.Names, data.Schema).Transform(data);
810773
}

test/BaselineOutput/Common/Command/CommandShowDataSvmLight-3-out.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@
99
#@ }
1010
Label Weight GroupId Comment aurora beachwood chagrin
1111
-1 1 5 2:1 4:2
12-
1 1 5 4:5
12+
1 1 5 4:3
1313
Wrote 2 rows of length 7

0 commit comments

Comments
 (0)