1
- //------------------------------------------------------------------------------
2
- // <copyright company="Microsoft Corporation">
3
- // Copyright (c) Microsoft Corporation. All rights reserved.
4
- // </copyright>
5
- //------------------------------------------------------------------------------
1
+ // Licensed to the .NET Foundation under one or more agreements.
2
+ // The .NET Foundation licenses this file to you under the MIT license.
3
+ // See the LICENSE file in the project root for more information.
6
4
7
5
using System ;
8
6
using System . Collections . Generic ;
@@ -302,53 +300,43 @@ private sealed class IntermediateInput
302
300
/// </summary>
303
301
private sealed class Indices
304
302
{
305
- public VBuffer < int > FeatureKeys ;
303
+ [ KeyType ( uint . MaxValue - 1 ) ]
304
+ public VBuffer < uint > FeatureKeys ;
305
+ }
306
306
307
- public static void ParseIndicesToOneBased ( IntermediateInput input , Indices output )
307
+ private sealed class IndexParser
308
+ {
309
+ private readonly uint _offset ;
310
+ private readonly uint _na ;
311
+
312
+ public IndexParser ( bool zeroBased , ulong featureCount )
308
313
{
309
- var editor = VBufferEditor . Create ( ref output . FeatureKeys , input . FeatureKeys . Length ) ;
310
- var inputValues = input . FeatureKeys . GetValues ( ) ;
311
- for ( int i = 0 ; i < inputValues . Length ; i ++ )
312
- {
313
- if ( Conversions . Instance . TryParse ( in inputValues [ i ] , out int index ) && index > 0 )
314
- editor . Values [ i ] = index - 1 ;
315
- else
316
- editor . Values [ i ] = - 1 ;
317
- }
318
- output . FeatureKeys = editor . Commit ( ) ;
314
+ _offset = zeroBased ? ( uint ) 0 : 1 ;
315
+ _na = ( uint ) featureCount + 1 ;
319
316
}
320
317
321
- public static void ParseIndicesToZeroBased ( IntermediateInput input , Indices output )
318
+ public void ParseIndices ( IntermediateInput input , Indices output )
322
319
{
323
320
var editor = VBufferEditor . Create ( ref output . FeatureKeys , input . FeatureKeys . Length ) ;
324
321
var inputValues = input . FeatureKeys . GetValues ( ) ;
325
322
for ( int i = 0 ; i < inputValues . Length ; i ++ )
326
323
{
327
- if ( Conversions . Instance . TryParse ( in inputValues [ i ] , out int index ) && index >= 0 )
328
- editor . Values [ i ] = index ;
324
+ if ( Conversions . Instance . TryParse ( in inputValues [ i ] , out uint index ) && index >= _offset )
325
+ editor . Values [ i ] = index - _offset + 1 ;
329
326
else
330
- editor . Values [ i ] = - 1 ;
327
+ editor . Values [ i ] = _na ;
331
328
}
332
329
output . FeatureKeys = editor . Commit ( ) ;
333
330
}
334
331
}
335
332
336
333
/// <summary>
337
- /// This class and the <see cref="IntermediateOut"/> class are used by the <see cref="CustomMappingTransformer{TSrc, TDst}"/>
338
- /// that maps a vector of indices and a vector of values into a single <see cref="VBuffer{T}"/> of values. When the indices
339
- /// originate from the <see cref="ValueToKeyMappingTransformer"/> (in case features are specified by name), <see cref="IntermediateOutKeys"/>
340
- /// is used, and when they originate from a <see cref="CustomMappingTransformer{TSrc, TDst}"/> that produces an <see cref="Indices"/>,
341
- /// <see cref="IntermediateOut"/> is used.
334
+ /// This class is used by the <see cref="CustomMappingTransformer{TSrc, TDst}"/>
335
+ /// that maps a vector of indices and a vector of values into a single <see cref="VBuffer{T}"/> of values.
342
336
/// </summary>
343
- private sealed class IntermediateOutKeys
344
- {
345
- public VBuffer < uint > FeatureKeys ;
346
- public VBuffer < float > FeatureValues ;
347
- }
348
-
349
337
private sealed class IntermediateOut
350
338
{
351
- public VBuffer < int > FeatureKeys ;
339
+ public VBuffer < uint > FeatureKeys ;
352
340
public VBuffer < float > FeatureValues ;
353
341
}
354
342
@@ -359,81 +347,54 @@ private sealed class Output
359
347
#pragma warning restore 0649
360
348
361
349
/// <summary>
362
- /// This class contains the mapper that maps an <see cref="IntermediateOut"/> or an <see cref="IntermediateOutKeys "/>
350
+ /// This class contains the mapper that maps an <see cref="IntermediateOut"/>
363
351
/// to an <see cref="Output"/>.
364
352
/// </summary>
365
353
private sealed class OutputMapper
366
354
{
367
355
private readonly uint _keyMax ;
356
+ private readonly BufferBuilder < float > _bldr ;
357
+ private readonly bool [ ] _indexUsed ;
368
358
369
359
public OutputMapper ( int keyCount )
370
360
{
371
361
Contracts . Assert ( keyCount > 0 ) ;
372
362
// Leave as uint, so that comparisons against uint key values do not
373
363
// incur any sort of implicit value conversions.
374
364
_keyMax = ( uint ) keyCount ;
365
+ _bldr = new BufferBuilder < float > ( FloatAdder . Instance ) ;
366
+ _indexUsed = new bool [ _keyMax ] ;
375
367
}
376
368
377
369
public void Map ( IntermediateOut intermediate , Output output )
378
370
{
379
371
MapCore ( ref intermediate . FeatureKeys , ref intermediate . FeatureValues , output ) ;
380
372
}
381
373
382
- public void Map ( IntermediateOutKeys intermediate , Output output )
383
- {
384
- MapCore ( ref intermediate . FeatureKeys , ref intermediate . FeatureValues , output ) ;
385
- }
386
-
387
- private void MapCore ( ref VBuffer < int > keys , ref VBuffer < float > values , Output output )
388
- {
389
- var editor = VBufferEditor . Create ( ref output . Features , ( int ) _keyMax ) ;
390
- editor . Values . Clear ( ) ;
391
-
392
- // I fully expect that these inputs will be of equal size. But I don't want to
393
- // throw in the event that they're not. Instead just have it be an empty vector.
394
- // REVIEW: Add warning and reporting for bad inputs for these.
395
- if ( keys . Length == values . Length )
396
- {
397
- // Both of these inputs should be dense, but still work even if they're not.
398
- VBufferUtils . Densify ( ref keys ) ;
399
- VBufferUtils . Densify ( ref values ) ;
400
- var keysValues = keys . GetValues ( ) ;
401
- var valuesValues = values . GetValues ( ) ;
402
- for ( int i = 0 ; i < keysValues . Length ; ++ i )
403
- {
404
- var key = keysValues [ i ] ;
405
- if ( key < 0 || key >= _keyMax )
406
- continue ;
407
- editor . Values [ key ] = valuesValues [ i ] ;
408
- }
409
- }
410
- output . Features = editor . Commit ( ) ;
411
- }
412
-
413
374
private void MapCore ( ref VBuffer < uint > keys , ref VBuffer < float > values , Output output )
414
375
{
415
- var editor = VBufferEditor . Create ( ref output . Features , ( int ) _keyMax ) ;
416
- editor . Values . Clear ( ) ;
417
-
418
- // I fully expect that these inputs will be of equal size. But I don't want to
419
- // throw in the event that they're not. Instead just have it be an empty vector.
420
- // REVIEW: Add warning and reporting for bad inputs for these.
421
- if ( keys . Length == values . Length )
376
+ Contracts . Check ( keys . Length == values . Length , "number of keys does not match number of values." ) ;
377
+
378
+ // Both of these inputs should be dense, but still work even if they're not.
379
+ VBufferUtils . Densify ( ref keys ) ;
380
+ VBufferUtils . Densify ( ref values ) ;
381
+ var keysValues = keys . GetValues ( ) ;
382
+ var valuesValues = values . GetValues ( ) ;
383
+
384
+ // The output vector could be sparse, so we use BufferBuilder here.
385
+ _bldr . Reset ( ( int ) _keyMax , false ) ;
386
+ Array . Clear ( _indexUsed , 0 , _indexUsed . Length ) ;
387
+ for ( int i = 0 ; i < keys . Length ; ++ i )
422
388
{
423
- // Both of these inputs should be dense, but still work even if they're not.
424
- VBufferUtils . Densify ( ref keys ) ;
425
- VBufferUtils . Densify ( ref values ) ;
426
- var keysValues = keys . GetValues ( ) ;
427
- var valuesValues = values . GetValues ( ) ;
428
- for ( int i = 0 ; i < keys . Length ; ++ i )
429
- {
430
- var key = keysValues [ i ] ;
431
- if ( key == 0 || key > _keyMax )
432
- continue ;
433
- editor . Values [ ( int ) key - 1 ] = valuesValues [ i ] ;
434
- }
389
+ var key = keysValues [ i ] ;
390
+ if ( key == 0 || key > _keyMax )
391
+ continue ;
392
+ if ( _indexUsed [ ( int ) key - 1 ] )
393
+ continue ;
394
+ _bldr . AddFeature ( ( int ) key - 1 , valuesValues [ i ] ) ;
395
+ _indexUsed [ ( int ) key - 1 ] = true ;
435
396
}
436
- output . Features = editor . Commit ( ) ;
397
+ _bldr . GetResult ( ref output . Features ) ;
437
398
}
438
399
}
439
400
@@ -684,10 +645,11 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
684
645
private DataViewSchema CreateOutputSchema ( )
685
646
{
686
647
var data = GetData ( _host , null , null ) ;
648
+ var indexParser = new IndexParser ( _indicesKind == FeatureIndices . ZeroBased , _featureCount ) ;
649
+ var schemaDef = SchemaDefinition . Create ( typeof ( Indices ) ) ;
650
+ schemaDef [ nameof ( Indices . FeatureKeys ) ] . ColumnType = new KeyDataViewType ( typeof ( uint ) , _featureCount ) ;
687
651
var keyVectorsToIndexVectors = _keyVectorsToIndexVectors ??
688
- ( _indicesKind == FeatureIndices . OneBased ?
689
- new CustomMappingTransformer < IntermediateInput , Indices > ( _host , Indices . ParseIndicesToOneBased , null ) :
690
- new CustomMappingTransformer < IntermediateInput , Indices > ( _host , Indices . ParseIndicesToZeroBased , null ) ) ;
652
+ new CustomMappingTransformer < IntermediateInput , Indices > ( _host , indexParser . ParseIndices , null ) ;
691
653
var schema = keyVectorsToIndexVectors . GetOutputSchema ( data . Schema ) ;
692
654
return CreateOutputTransformer ( _host , ( int ) _featureCount ,
693
655
_indicesKind == FeatureIndices . Names , schema ) . GetOutputSchema ( schema ) ;
@@ -783,13 +745,15 @@ private static ITransformer CreateOutputTransformer(IHostEnvironment env, int ke
783
745
col . Annotations . GetValue ( AnnotationUtils . Kinds . KeyValues , ref keyValues ) ;
784
746
schemaDef [ 0 ] . AddAnnotation ( AnnotationUtils . Kinds . SlotNames , keyValues , keyValuesCol . Value . Type ) ;
785
747
}
786
- outputTransformer = new CustomMappingTransformer < IntermediateOutKeys , Output > ( env ,
748
+ outputTransformer = new CustomMappingTransformer < IntermediateOut , Output > ( env ,
787
749
outputMapper . Map , null , outputSchemaDefinition : schemaDef ) ;
788
750
}
789
751
else
790
752
{
791
753
outputTransformer = new CustomMappingTransformer < IntermediateOut , Output > ( env ,
792
754
outputMapper . Map , null , outputSchemaDefinition : schemaDef ) ;
755
+ //outputTransformer = new CustomMappingTransformer<IntermediateOut, Output>(env,
756
+ // outputMapper.Map, null, outputSchemaDefinition: schemaDef);
793
757
}
794
758
795
759
string [ ] toKeep = { "Label" , "Weight" , "GroupId" , "Comment" , "Features" } ;
@@ -801,10 +765,9 @@ private static ITransformer CreateOutputTransformer(IHostEnvironment env, int ke
801
765
public IDataView Load ( IMultiStreamSource input )
802
766
{
803
767
var data = GetData ( _host , null , input ) ;
768
+ var indexParser = new IndexParser ( _indicesKind == FeatureIndices . ZeroBased , _featureCount ) ;
804
769
var keyVectorsToIndexVectors = _keyVectorsToIndexVectors ??
805
- ( _indicesKind == FeatureIndices . OneBased ?
806
- new CustomMappingTransformer < IntermediateInput , Indices > ( _host , Indices . ParseIndicesToOneBased , null ) :
807
- new CustomMappingTransformer < IntermediateInput , Indices > ( _host , Indices . ParseIndicesToZeroBased , null ) ) ;
770
+ new CustomMappingTransformer < IntermediateInput , Indices > ( _host , indexParser . ParseIndices , null ) ;
808
771
data = keyVectorsToIndexVectors . Transform ( data ) ;
809
772
return CreateOutputTransformer ( _host , ( int ) _featureCount , _indicesKind == FeatureIndices . Names , data . Schema ) . Transform ( data ) ;
810
773
}
0 commit comments