@@ -840,6 +840,168 @@ public void TestTextLoaderBackCompat_VerWritt_0x0001000C()
840
840
Assert . Equal ( "Iris-setosa" , previewIris . RowView [ 0 ] . Values [ index ] . Value . ToString ( ) ) ;
841
841
}
842
842
843
/// <summary>
/// Checks that <see cref="TextLoader"/> reads the same values from a dataset that uses ',' as its
/// decimal marker as it reads from iris.txt, which uses '.'.
/// </summary>
/// <remarks>
/// When <paramref name="useCsvVersion"/> is false, iris.txt is compared against
/// iris-decimal-marker-as-comma.txt; the two datasets are identical except for the decimal marker.
/// When <paramref name="useCsvVersion"/> is true, the comparison is made against a CSV file whose
/// separator is ',', whose decimals are enclosed in quotes, and whose decimal marker is ','.
/// </remarks>
/// <param name="useCsvVersion">Selects the .csv (true) or .txt (false) comma-marker dataset.</param>
[Theory]
[InlineData(true)]
[InlineData(false)]
public void TestCommaAsDecimalMarker(bool useCsvVersion)
{
    // Features are read once as float and once as double so that decimal marker
    // recognition is exercised for both floating point types.
    TestCommaAsDecimalMarkerHelper<float>(useCsvVersion);
    TestCommaAsDecimalMarkerHelper<double>(useCsvVersion);
}
861
+
862
/// <summary>
/// Loads iris.txt with '.' as the decimal marker and a comma-marker variant of the same data with ','
/// as the decimal marker, then asserts that both loads produce the same labels and features row by row.
/// </summary>
/// <remarks>
/// The comma-marker dataset is either iris-decimal-marker-as-comma.txt (same layout as iris.txt, only the
/// decimal marker differs) or iris-decimal-marker-as-comma.csv (',' is both separator and decimal marker,
/// and feature values are enclosed in quotes).
/// </remarks>
/// <typeparam name="T">
/// Element type of the "Features" column. <c>double</c> selects <see cref="DataKind.Double"/>; every other
/// type selects <see cref="DataKind.Single"/>, so callers are expected to pass <c>float</c> or <c>double</c>.
/// </typeparam>
/// <param name="useCsvVersion">True to compare against the .csv variant, false for the .txt variant.</param>
private void TestCommaAsDecimalMarkerHelper<T>(bool useCsvVersion)
{
    var mlContext = new MLContext(seed: 1);

    // Both loaders must read the features with the same kind, so decide it once.
    DataKind featureKind = typeof(T) == typeof(double) ? DataKind.Double : DataKind.Single;

    // Read dataset with period as decimal marker.
    string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt");
    var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options()
    {
        Columns = new[]
        {
            new TextLoader.Column("Label", DataKind.UInt32, 0),
            new TextLoader.Column("Features", featureKind, new[] { new TextLoader.Range(1, 4) }),
        },
        DecimalMarker = '.'
    });
    // The path was already resolved by GetDataPath above, so it is passed to Load as is.
    var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(dataPathDecimalMarkerPeriod);

    // Read dataset with comma as decimal marker.
    // Dataset is either the .csv version or the .txt version.
    var options = new TextLoader.Options()
    {
        Columns = new[]
        {
            new TextLoader.Column("Label", DataKind.UInt32, 0),
            new TextLoader.Column("Features", featureKind, new[] { new TextLoader.Range(1, 4) })
        },
        DecimalMarker = ','
    };
    string dataPathDecimalMarkerComma;
    if (useCsvVersion)
    {
        // The .csv variant also needs comma separation, quoting and header handling.
        dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.csv");
        options.Separator = ",";
        options.AllowQuoting = true;
        options.HasHeader = true;
    }
    else
    {
        dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
    }
    var readerDecimalMarkerComma = new TextLoader(mlContext, options);
    var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(dataPathDecimalMarkerComma);

    // Open a cursor and the column getters for each dataset.
    DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema;
    using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod);
    ValueGetter<uint> labelDelegatePeriod = cursorPeriod.GetGetter<uint>(columnsPeriod[0]);
    ValueGetter<VBuffer<T>> featuresDelegatePeriod = cursorPeriod.GetGetter<VBuffer<T>>(columnsPeriod[1]);

    DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema;
    using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma);
    ValueGetter<uint> labelDelegateComma = cursorComma.GetGetter<uint>(columnsComma[0]);
    ValueGetter<VBuffer<T>> featuresDelegateComma = cursorComma.GetGetter<VBuffer<T>>(columnsComma[1]);

    uint labelPeriod = default;
    uint labelComma = default;
    VBuffer<T> featuresPeriod = default;
    VBuffer<T> featuresComma = default;

    // Walk both datasets in lockstep. This avoids buffering into fixed-size arrays and lets a
    // difference in row count fail with a clear assertion instead of passing silently or throwing
    // an IndexOutOfRangeException.
    int rowCount = 0;
    while (cursorPeriod.MoveNext())
    {
        Assert.True(cursorComma.MoveNext(),
            $"Dataset with ',' as decimal marker ended after {rowCount} rows, but iris.txt has more rows.");

        labelDelegatePeriod(ref labelPeriod);
        featuresDelegatePeriod(ref featuresPeriod);
        labelDelegateComma(ref labelComma);
        featuresDelegateComma(ref featuresComma);

        Assert.Equal(labelPeriod, labelComma);
        Assert.Equal(featuresPeriod.GetValues().ToArray(), featuresComma.GetValues().ToArray());
        rowCount++;
    }

    Assert.False(cursorComma.MoveNext(),
        $"Dataset with ',' as decimal marker has more than the {rowCount} rows found in iris.txt.");
    // Guard against a vacuous pass when neither dataset produced any rows.
    Assert.True(rowCount > 0, "No rows were read from iris.txt.");
}
955
+
956
/// <summary>
/// Checks that feature values are loaded as NaN when <c>TextLoader.Options.DecimalMarker</c> does not
/// match the decimal marker actually used in the dataset.
/// </summary>
/// <param name="useCommaAsDecimalMarker">
/// True to read iris.txt (which uses '.') with DecimalMarker = ','; false to read
/// iris-decimal-marker-as-comma.txt (which uses ',') with DecimalMarker = '.'.
/// </param>
[Theory]
[InlineData(true)]
[InlineData(false)]
public void TestWrongDecimalMarkerInputs(bool useCommaAsDecimalMarker)
{
    var mlContext = new MLContext(seed: 1);

    var options = new TextLoader.Options()
    {
        Columns = new[]
        {
            new TextLoader.Column("Label", DataKind.UInt32, 0),
            new TextLoader.Column("Features", DataKind.Single, new[] { new TextLoader.Range(1, 4) })
        },
    };

    // Pair each dataset with the decimal marker it does NOT use.
    string dataPath;
    if (useCommaAsDecimalMarker)
    {
        dataPath = GetDataPath("iris.txt"); // Has '.' as decimal marker inside dataset
        options.DecimalMarker = ','; // Choose wrong decimal marker on purpose
    }
    else
    {
        dataPath = GetDataPath("iris-decimal-marker-as-comma.txt"); // Has ',' as decimal marker inside dataset
        options.DecimalMarker = '.'; // Choose wrong decimal marker on purpose
    }

    var reader = new TextLoader(mlContext, options);
    // The path was already resolved by GetDataPath above, so it is passed to Load as is.
    var textData = reader.Load(dataPath);

    DataViewSchema columns = textData.Schema;
    using DataViewRowCursor cursor = textData.GetRowCursor(columns);
    VBuffer<float> features = default;
    ValueGetter<VBuffer<float>> featuresGetter = cursor.GetGetter<VBuffer<float>>(columns[1]);

    // Iterate over each row and check that every feature value is NaN.
    int checkedValues = 0;
    while (cursor.MoveNext())
    {
        featuresGetter(ref features);
        foreach (float feature in features.GetValues())
        {
            Assert.True(float.IsNaN(feature), $"Expected NaN but found {feature}.");
            checkedValues++;
        }
    }

    // Guard against a vacuous pass when the loader produced no rows or no feature values.
    Assert.True(checkedValues > 0, "No feature values were read from the dataset.");
}
1004
+
843
1005
// Empty class: declares no members.
private class IrisNoFields
844
1006
{
845
1007
}
0 commit comments