Skip to content

Commit 7358266

Browse files
committed
address code review comments and add more tests
1 parent 6a1c7fd commit 7358266

File tree

6 files changed

+440
-92
lines changed

6 files changed

+440
-92
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/LoadingSvmLight.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ public static void Example()
7676
// 1 1 0 0.02958358 0.9334617 0 0 0.8833956 0.2947684 0 0 0
7777

7878
// If the loader is created without a data sample we need to specify the number of features expected in the file.
79-
loader = mlContext.Data.CreateSvmLightLoader(10);
79+
loader = mlContext.Data.CreateSvmLightLoader(inputSize: 10);
8080
svmData = loader.Load(file);
8181

8282
PrintSchema(svmData);

src/Microsoft.ML.Transforms/SvmLight/SvmLightLoader.cs

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// See the LICENSE file in the project root for more information.
44

55
using System;
6+
using System.Collections;
67
using System.Collections.Generic;
78
using System.IO;
89
using System.Linq;
@@ -169,14 +170,9 @@ public void MapInput(Input input, IntermediateInput intermediate)
169170

170171
var ator = ReadOnlyMemoryUtils.Split(left, _seps).GetEnumerator();
171172

172-
if (!ator.MoveNext())
173-
{
174-
intermediate.Label = float.NaN;
175-
intermediate.Weight = float.NaN;
176-
VBufferUtils.Clear(ref intermediate.FeatureKeys);
177-
VBufferUtils.Clear(ref intermediate.FeatureValues);
178-
return;
179-
}
173+
// Empty lines are filtered in the Input.MapComment step.
174+
var notEmpty = ator.MoveNext();
175+
Contracts.Assert(notEmpty);
180176

181177
ReadOnlyMemory<char> token = ator.Current;
182178

@@ -239,6 +235,7 @@ public void MapInput(Input input, IntermediateInput intermediate)
239235
// it a feature, but right now we have no learners that pay
240236
// attention to so-called "slack IDs" so we'll ignore these for
241237
// right now.
238+
continue;
242239
}
243240
else
244241
{
@@ -321,10 +318,16 @@ public void ParseIndices(IntermediateInput input, Indices output)
321318
var inputValues = input.FeatureKeys.GetValues();
322319
for (int i = 0; i < inputValues.Length; i++)
323320
{
324-
if (Conversions.Instance.TryParse(in inputValues[i], out uint index) && index >= _offset)
321+
if (Conversions.Instance.TryParse(in inputValues[i], out uint index))
322+
{
323+
if (index < _offset)
324+
{
325+
throw Contracts.Except("Encountered 0 index while parsing a 1-based dataset");
326+
}
325327
editor.Values[i] = index - _offset + 1;
328+
}
326329
else
327-
editor.Values[i] = _na;
330+
throw Contracts.Except($"Encountered non-parsable index '{inputValues[i]}' while parsing dataset");
328331
}
329332
output.FeatureKeys = editor.Commit();
330333
}
@@ -354,7 +357,7 @@ private sealed class OutputMapper
354357
{
355358
private readonly uint _keyMax;
356359
private readonly BufferBuilder<float> _bldr;
357-
private readonly bool[] _indexUsed;
360+
private readonly BitArray _indexUsed;
358361

359362
public OutputMapper(int keyCount)
360363
{
@@ -363,7 +366,7 @@ public OutputMapper(int keyCount)
363366
// incur any sort of implicit value conversions.
364367
_keyMax = (uint)keyCount;
365368
_bldr = new BufferBuilder<float>(FloatAdder.Instance);
366-
_indexUsed = new bool[_keyMax];
369+
_indexUsed = new BitArray((int)_keyMax);
367370
}
368371

369372
public void Map(IntermediateOut intermediate, Output output)
@@ -383,14 +386,14 @@ private void MapCore(ref VBuffer<uint> keys, ref VBuffer<float> values, Output o
383386

384387
// The output vector could be sparse, so we use BufferBuilder here.
385388
_bldr.Reset((int)_keyMax, false);
386-
Array.Clear(_indexUsed, 0, _indexUsed.Length);
389+
_indexUsed.SetAll(false);
387390
for (int i = 0; i < keys.Length; ++i)
388391
{
389392
var key = keysValues[i];
390393
if (key == 0 || key > _keyMax)
391394
continue;
392395
if (_indexUsed[(int)key - 1])
393-
continue;
396+
throw Contracts.Except("Duplicate keys found in dataset");
394397
_bldr.AddFeature((int)key - 1, valuesValues[i]);
395398
_indexUsed[(int)key - 1] = true;
396399
}
@@ -411,9 +414,11 @@ private sealed class TextDataView : IDataView
411414
private readonly IHost _host;
412415
private readonly IMultiStreamSource _files;
413416

414-
public TextDataView(IHostEnvironment env, IMultiStreamSource files = null)
417+
public TextDataView(IHostEnvironment env, IMultiStreamSource files)
415418
{
416419
Contracts.CheckValue(env, nameof(env));
420+
env.CheckValue(files, nameof(files));
421+
417422
_host = env.Register("TextDataView");
418423
_files = files;
419424

@@ -424,7 +429,7 @@ public TextDataView(IHostEnvironment env, IMultiStreamSource files = null)
424429

425430
public long? GetRowCount()
426431
{
427-
if (_files == null || _files.Count == 0)
432+
if (_files.Count == 0)
428433
return 0;
429434
return null;
430435
}
@@ -461,7 +466,7 @@ public Cursor(TextDataView parent, bool isActive)
461466
{
462467
_parent = parent;
463468
_isActive = isActive;
464-
if (_parent._files == null || _parent._files.Count == 0)
469+
if (_parent._files.Count == 0)
465470
{
466471
// Rather than corrupt MoveNextCore with a bunch of custom logic for
467472
// the empty file case and make that less efficient, be slightly inefficient
@@ -513,9 +518,9 @@ public override bool IsColumnActive(DataViewSchema.Column column)
513518
protected override bool MoveNextCore()
514519
{
515520
Ch.AssertValue(_currReader);
516-
Ch.Assert(-1 <= _fileIdx && _fileIdx < (_parent._files == null ? 0 : _parent._files.Count));
521+
Ch.Assert(-1 <= _fileIdx && _fileIdx < _parent._files.Count);
517522

518-
var count = _parent._files == null ? 0 : _parent._files.Count;
523+
var count = _parent._files.Count;
519524
for (; ; )
520525
{
521526
var line = _currReader.ReadLine();
@@ -570,7 +575,7 @@ internal SvmLightLoader(IHostEnvironment env, Options options = null, IMultiStre
570575
_featureCount = (ulong)options.InputSize;
571576
else
572577
{
573-
if (dataSample == null)
578+
if (dataSample == null || dataSample.Count == 0)
574579
throw env.Except("If the number of features is not specified, a dataset must be provided to infer it.");
575580
var data = GetData(_host, options.NumberOfRows, dataSample);
576581
_featureCount = InferMax(_host, data) + (ulong)(_indicesKind == FeatureIndices.ZeroBased ? 1 : 0);
@@ -580,7 +585,7 @@ internal SvmLightLoader(IHostEnvironment env, Options options = null, IMultiStre
580585
else
581586
{
582587
// We need to train a ValueToKeyMappingTransformer.
583-
if (dataSample == null)
588+
if (dataSample == null || dataSample.Count == 0)
584589
throw env.Except("To use the text feature names option, a dataset must be provided");
585590

586591
var data = GetData(_host, options.NumberOfRows, dataSample);
@@ -644,7 +649,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
644649

645650
private DataViewSchema CreateOutputSchema()
646651
{
647-
var data = GetData(_host, null, null);
652+
var data = GetData(_host, null, new MultiFileSource(null));
648653
var indexParser = new IndexParser(_indicesKind == FeatureIndices.ZeroBased, _featureCount);
649654
var schemaDef = SchemaDefinition.Create(typeof(Indices));
650655
schemaDef[nameof(Indices.FeatureKeys)].ColumnType = new KeyDataViewType(typeof(uint), _featureCount);
@@ -752,8 +757,6 @@ private static ITransformer CreateOutputTransformer(IHostEnvironment env, int ke
752757
{
753758
outputTransformer = new CustomMappingTransformer<IntermediateOut, Output>(env,
754759
outputMapper.Map, null, outputSchemaDefinition: schemaDef);
755-
//outputTransformer = new CustomMappingTransformer<IntermediateOut, Output>(env,
756-
// outputMapper.Map, null, outputSchemaDefinition: schemaDef);
757760
}
758761

759762
string[] toKeep = { "Label", "Weight", "GroupId", "Comment", "Features" };
@@ -764,6 +767,8 @@ private static ITransformer CreateOutputTransformer(IHostEnvironment env, int ke
764767

765768
public IDataView Load(IMultiStreamSource input)
766769
{
770+
_host.CheckValue(input, nameof(input));
771+
767772
var data = GetData(_host, null, input);
768773
var indexParser = new IndexParser(_indicesKind == FeatureIndices.ZeroBased, _featureCount);
769774
var keyVectorsToIndexVectors = _keyVectorsToIndexVectors ??

src/Microsoft.ML.Transforms/SvmLight/SvmLightLoaderSaverCatalog.cs

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,9 @@
44

55
using System.IO;
66
using System.Linq;
7-
using Microsoft.ML.Data;
87
using Microsoft.ML.Runtime;
98

10-
namespace Microsoft.ML.Transforms
9+
namespace Microsoft.ML.Data
1110
{
1211
public static class SvmLightLoaderSaverCatalog
1312
{
@@ -22,8 +21,8 @@ public static class SvmLightLoaderSaverCatalog
2221
/// it should be set to false.</param>
2322
/// <param name="dataSample">A data sample to be used for determining the number of features in the Features column.</param>
2423
public static SvmLightLoader CreateSvmLightLoader(this DataOperationsCatalog catalog,
25-
int inputSize = 0,
2624
long? numberOfRows = null,
25+
int inputSize = 0,
2726
bool zeroBased = false,
2827
IMultiStreamSource dataSample = null)
2928
=> new SvmLightLoader(CatalogUtils.GetEnvironment(catalog), new SvmLightLoader.Options()
@@ -54,14 +53,14 @@ public static SvmLightLoader CreateSvmLightLoaderWithFeatureNames(this DataOpera
5453
/// <param name="numberOfRows">The number of rows from the sample to be used for determining the number of features.</param>
5554
public static IDataView LoadFromSvmLightFile(this DataOperationsCatalog catalog,
5655
string path,
56+
long? numberOfRows = null,
5757
int inputSize = 0,
58-
bool zeroBased = false,
59-
long? numberOfRows = null)
58+
bool zeroBased = false)
6059
{
6160
Contracts.CheckNonEmpty(path, nameof(path));
6261

6362
var file = new MultiFileSource(path);
64-
var loader = catalog.CreateSvmLightLoader(inputSize, numberOfRows, zeroBased, file);
63+
var loader = catalog.CreateSvmLightLoader(numberOfRows, inputSize, zeroBased, file);
6564
return loader.Load(file);
6665
}
6766

src/Microsoft.ML.Transforms/SvmLight/SvmLightSaver.cs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,11 @@
1010
using Microsoft.ML.Data;
1111
using Microsoft.ML.Data.IO;
1212
using Microsoft.ML.Runtime;
13-
using Microsoft.ML.Transforms;
1413

1514
[assembly: LoadableClass(SvmLightSaver.Summary, typeof(SvmLightSaver), typeof(SvmLightSaver.Arguments), typeof(SignatureDataSaver),
1615
"SVM-Light Saver", SvmLightSaver.LoadName, "SvmLight", "Svm")]
1716

18-
namespace Microsoft.ML.Transforms
17+
namespace Microsoft.ML.Data
1918
{
2019
/// <summary>
2120
/// The SVM-light saver is a saver class that is capable of saving the label,

test/Microsoft.ML.TestFramework/TestCommandBase.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2199,7 +2199,7 @@ public void CommandShowDataSvmLight()
21992199
path = CreateOutputPath("DataC.txt");
22002200
File.WriteAllLines(path.Path, new string[] {
22012201
"-1 aurora:1 chagrin:2",
2202-
"1 chagrin:3 euclid:4 chagrin:5"
2202+
"1 chagrin:3 euclid:4"
22032203
});
22042204
TestInCore("showdata", path.Path, modelPath, "");
22052205

0 commit comments

Comments
 (0)