33// See the LICENSE file in the project root for more information.
44
55using System ;
6+ using System . Collections ;
67using System . Collections . Generic ;
78using System . IO ;
89using System . Linq ;
@@ -169,14 +170,9 @@ public void MapInput(Input input, IntermediateInput intermediate)
169170
170171 var ator = ReadOnlyMemoryUtils . Split ( left , _seps ) . GetEnumerator ( ) ;
171172
172- if ( ! ator . MoveNext ( ) )
173- {
174- intermediate . Label = float . NaN ;
175- intermediate . Weight = float . NaN ;
176- VBufferUtils . Clear ( ref intermediate . FeatureKeys ) ;
177- VBufferUtils . Clear ( ref intermediate . FeatureValues ) ;
178- return ;
179- }
173+ // Empty lines are filtered in the Input.MapComment step.
174+ var notEmpty = ator . MoveNext ( ) ;
175+ Contracts . Assert ( notEmpty ) ;
180176
181177 ReadOnlyMemory < char > token = ator . Current ;
182178
@@ -239,6 +235,7 @@ public void MapInput(Input input, IntermediateInput intermediate)
239235 // it a feature, but right now we have no learners that pay
240236 // attention to so-called "slack IDs" so we'll ignore these for
241237 // right now.
238+ continue ;
242239 }
243240 else
244241 {
@@ -321,10 +318,16 @@ public void ParseIndices(IntermediateInput input, Indices output)
321318 var inputValues = input . FeatureKeys . GetValues ( ) ;
322319 for ( int i = 0 ; i < inputValues . Length ; i ++ )
323320 {
324- if ( Conversions . Instance . TryParse ( in inputValues [ i ] , out uint index ) && index >= _offset )
321+ if ( Conversions . Instance . TryParse ( in inputValues [ i ] , out uint index ) )
322+ {
323+ if ( index < _offset )
324+ {
325+ throw Contracts . Except ( "Encountered 0 index while parsing a 1-based dataset" ) ;
326+ }
325327 editor . Values [ i ] = index - _offset + 1 ;
328+ }
326329 else
327- editor . Values [ i ] = _na ;
330+ throw Contracts . Except ( $ "Encountered non-parsable index ' { inputValues [ i ] } ' while parsing dataset" ) ;
328331 }
329332 output . FeatureKeys = editor . Commit ( ) ;
330333 }
@@ -354,7 +357,7 @@ private sealed class OutputMapper
354357 {
355358 private readonly uint _keyMax ;
356359 private readonly BufferBuilder < float > _bldr ;
357- private readonly bool [ ] _indexUsed ;
360+ private readonly BitArray _indexUsed ;
358361
359362 public OutputMapper ( int keyCount )
360363 {
@@ -363,7 +366,7 @@ public OutputMapper(int keyCount)
363366 // incur any sort of implicit value conversions.
364367 _keyMax = ( uint ) keyCount ;
365368 _bldr = new BufferBuilder < float > ( FloatAdder . Instance ) ;
366- _indexUsed = new bool [ _keyMax ] ;
369+ _indexUsed = new BitArray ( ( int ) _keyMax ) ;
367370 }
368371
369372 public void Map ( IntermediateOut intermediate , Output output )
@@ -383,14 +386,14 @@ private void MapCore(ref VBuffer<uint> keys, ref VBuffer<float> values, Output o
383386
384387 // The output vector could be sparse, so we use BufferBuilder here.
385388 _bldr . Reset ( ( int ) _keyMax , false ) ;
386- Array . Clear ( _indexUsed , 0 , _indexUsed . Length ) ;
389+ _indexUsed . SetAll ( false ) ;
387390 for ( int i = 0 ; i < keys . Length ; ++ i )
388391 {
389392 var key = keysValues [ i ] ;
390393 if ( key == 0 || key > _keyMax )
391394 continue ;
392395 if ( _indexUsed [ ( int ) key - 1 ] )
393- continue ;
396+ throw Contracts . Except ( "Duplicate keys found in dataset" ) ;
394397 _bldr . AddFeature ( ( int ) key - 1 , valuesValues [ i ] ) ;
395398 _indexUsed [ ( int ) key - 1 ] = true ;
396399 }
@@ -411,9 +414,11 @@ private sealed class TextDataView : IDataView
411414 private readonly IHost _host ;
412415 private readonly IMultiStreamSource _files ;
413416
414- public TextDataView ( IHostEnvironment env , IMultiStreamSource files = null )
417+ public TextDataView ( IHostEnvironment env , IMultiStreamSource files )
415418 {
416419 Contracts . CheckValue ( env , nameof ( env ) ) ;
420+ env . CheckValue ( files , nameof ( files ) ) ;
421+
417422 _host = env . Register ( "TextDataView" ) ;
418423 _files = files ;
419424
@@ -424,7 +429,7 @@ public TextDataView(IHostEnvironment env, IMultiStreamSource files = null)
424429
425430 public long ? GetRowCount ( )
426431 {
427- if ( _files == null || _files . Count == 0 )
432+ if ( _files . Count == 0 )
428433 return 0 ;
429434 return null ;
430435 }
@@ -461,7 +466,7 @@ public Cursor(TextDataView parent, bool isActive)
461466 {
462467 _parent = parent ;
463468 _isActive = isActive ;
464- if ( _parent . _files == null || _parent . _files . Count == 0 )
469+ if ( _parent . _files . Count == 0 )
465470 {
466471 // Rather than corrupt MoveNextCore with a bunch of custom logic for
467472 // the empty file case and make that less efficient, be slightly inefficient
@@ -513,9 +518,9 @@ public override bool IsColumnActive(DataViewSchema.Column column)
513518 protected override bool MoveNextCore ( )
514519 {
515520 Ch . AssertValue ( _currReader ) ;
516- Ch . Assert ( - 1 <= _fileIdx && _fileIdx < ( _parent . _files == null ? 0 : _parent . _files . Count ) ) ;
521+ Ch . Assert ( - 1 <= _fileIdx && _fileIdx < _parent . _files . Count ) ;
517522
518- var count = _parent . _files == null ? 0 : _parent . _files . Count ;
523+ var count = _parent . _files . Count ;
519524 for ( ; ; )
520525 {
521526 var line = _currReader . ReadLine ( ) ;
@@ -570,7 +575,7 @@ internal SvmLightLoader(IHostEnvironment env, Options options = null, IMultiStre
570575 _featureCount = ( ulong ) options . InputSize ;
571576 else
572577 {
573- if ( dataSample == null )
578+ if ( dataSample == null || dataSample . Count == 0 )
574579 throw env . Except ( "If the number of features is not specified, a dataset must be provided to infer it." ) ;
575580 var data = GetData ( _host , options . NumberOfRows , dataSample ) ;
576581 _featureCount = InferMax ( _host , data ) + ( ulong ) ( _indicesKind == FeatureIndices . ZeroBased ? 1 : 0 ) ;
@@ -580,7 +585,7 @@ internal SvmLightLoader(IHostEnvironment env, Options options = null, IMultiStre
580585 else
581586 {
582587 // We need to train a ValueToKeyMappingTransformer.
583- if ( dataSample == null )
588+ if ( dataSample == null || dataSample . Count == 0 )
584589 throw env . Except ( "To use the text feature names option, a dataset must be provided" ) ;
585590
586591 var data = GetData ( _host , options . NumberOfRows , dataSample ) ;
@@ -644,7 +649,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
644649
645650 private DataViewSchema CreateOutputSchema ( )
646651 {
647- var data = GetData ( _host , null , null ) ;
652+ var data = GetData ( _host , null , new MultiFileSource ( null ) ) ;
648653 var indexParser = new IndexParser ( _indicesKind == FeatureIndices . ZeroBased , _featureCount ) ;
649654 var schemaDef = SchemaDefinition . Create ( typeof ( Indices ) ) ;
650655 schemaDef [ nameof ( Indices . FeatureKeys ) ] . ColumnType = new KeyDataViewType ( typeof ( uint ) , _featureCount ) ;
@@ -752,8 +757,6 @@ private static ITransformer CreateOutputTransformer(IHostEnvironment env, int ke
752757 {
753758 outputTransformer = new CustomMappingTransformer < IntermediateOut , Output > ( env ,
754759 outputMapper . Map , null , outputSchemaDefinition : schemaDef ) ;
755- //outputTransformer = new CustomMappingTransformer<IntermediateOut, Output>(env,
756- // outputMapper.Map, null, outputSchemaDefinition: schemaDef);
757760 }
758761
759762 string [ ] toKeep = { "Label" , "Weight" , "GroupId" , "Comment" , "Features" } ;
@@ -764,6 +767,8 @@ private static ITransformer CreateOutputTransformer(IHostEnvironment env, int ke
764767
765768 public IDataView Load ( IMultiStreamSource input )
766769 {
770+ _host . CheckValue ( input , nameof ( input ) ) ;
771+
767772 var data = GetData ( _host , null , input ) ;
768773 var indexParser = new IndexParser ( _indicesKind == FeatureIndices . ZeroBased , _featureCount ) ;
769774 var keyVectorsToIndexVectors = _keyVectorsToIndexVectors ??
0 commit comments