microsoft · ganik · Oct 24, 2018 · Oct 23, 2018 · Oct 23, 2018 · Oct 23, 2018
diff --git a/.gitignore b/.gitignore
@@ -347,3 +347,4 @@ _doc_report.txt
 data.csv
 data.txt
 
+/build/TestCoverageReport
diff --git a/build.cmd b/build.cmd
@@ -299,9 +299,13 @@ set TestsPath1=%PackagePath%\tests
 set TestsPath2=%__currentScriptDir%src\python\tests
 set ReportPath=%__currentScriptDir%build\TestCoverageReport
 call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath1%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
+if errorlevel 1 (
+    goto :Exit_Error
+)
 call "%PythonExe%" -m pytest --verbose --maxfail=1000 --capture=sys "%TestsPath2%" --cov="%PackagePath%" --cov-report term-missing --cov-report html:"%ReportPath%"
-goto :Exit_Success
-
+if errorlevel 1 (
+    goto :Exit_Error
+)
 
 :Exit_Success
 endlocal

diff --git a/src/python/docs/sphinx/installationguide.rst b/src/python/docs/sphinx/installationguide.rst
@@ -8,7 +8,7 @@ Installation Guide
 Supported Platforms 
 -------------------
 
-Release 0.6.0:
+Release 0.6:
    * Windows 10, Ubuntu 14.04, Ubuntu 16.04, CentOS 7, RHEL 7, Mac OS 10.11, 10.12, 10.13
 
 

diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py
@@ -2,7 +2,7 @@
 Microsoft Machine Learning for Python
 """
 
-__version__ = '0.6.0'
+__version__ = '0.6.1'
 
 # CoreCLR version of MicrosoftML is built on Windows.
 # But file permissions are not preserved when it's copied to Linux.

diff --git a/src/python/nimbusml/feature_extraction/text/ngramfeaturizer.py b/src/python/nimbusml/feature_extraction/text/ngramfeaturizer.py
@@ -227,9 +227,12 @@ def __init__(
             dictionary=None,
             word_feature_extractor=Ngram(
                 max_num_terms=[10000000]),
-            char_feature_extractor=None,
-            vector_normalizer='L2',
-            columns=None,
+            char_feature_extractor=Ngram(
+                ngram_length=3,
+                all_lengths=False,
+                max_num_terms=[10000000]),
+            vector_normalizer='L2',
+            columns=None,
             **params):
 
         if columns:

diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/ngramfeaturizer.py b/src/python/nimbusml/internal/core/feature_extraction/text/ngramfeaturizer.py
@@ -206,8 +206,11 @@ def __init__(
             dictionary=None,
             word_feature_extractor=n_gram(
                 max_num_terms=[10000000]),
-            char_feature_extractor=None,
-            vector_normalizer='L2',
+            char_feature_extractor=n_gram(
+                ngram_length=3,
+                all_lengths=False,
+                max_num_terms=[10000000]),
+            vector_normalizer='L2',
             **params):
         BasePipelineItem.__init__(
             self, type='transform', **params)

diff --git a/src/python/nimbusml/tests/data_type/test_text.py b/src/python/nimbusml/tests/data_type/test_text.py
@@ -9,9 +9,9 @@
 from nimbusml import Pipeline
 from nimbusml.ensemble import LightGbmClassifier
 from nimbusml.feature_extraction.text import NGramFeaturizer
-from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_equal
+from sklearn.utils.testing import assert_array_almost_equal
 
 
 def transform_data(data=None, datatype=None):
@@ -34,7 +34,7 @@ def train_data_type_single(
         "Talk about second",
         "Thrid one",
         "Final example."]
-    model = NGramFeaturizer(word_feature_extractor=n_gram())
+    model = NGramFeaturizer()
     data_with_new_type = transform_data(data, fit_X_type)
     model.fit(data_with_new_type)
     test_data_with_new_type = transform_data(data, predict_X_type)
@@ -49,7 +49,7 @@ def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None):
         "Final example."]
     label = [1, 0, 1, 1]
     model = Pipeline([
-        NGramFeaturizer(word_feature_extractor=n_gram()),
+        NGramFeaturizer(),
         LightGbmClassifier(min_data_per_leaf=1, n_thread=1)
     ])
     data_with_new_type = transform_data(data, fit_X_type)
@@ -66,127 +66,127 @@ class TestTextDataType(unittest.TestCase):
     def test_check_text_datatype_single_list_list_series(self):
         result = train_data_type_single("list", "list", "series")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_series_list_series(self):
         result = train_data_type_single("series", "list", "series")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_series_list_list(self):
         result = train_data_type_single("series", "list", "list")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_array_list_series(self):
         result = train_data_type_single("array", "list", "series")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_series_array_dataframe(self):
         result = train_data_type_single("series", "array", "dataframe")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_array_series_series(self):
         result = train_data_type_single("array", "series", "series")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_dataframe_list_series(self):
         result = train_data_type_single("dataframe", "list", "series")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_series_series_dataframe(self):
         result = train_data_type_single("series", "series", "dataframe")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_single_dataframe_series_list(self):
         result = train_data_type_single("dataframe", "series", "list")
         assert len(result) == 4
-        assert len(result.columns) == 11
+        assert len(result.columns) == 66
         assert all([col.startswith('F0') for col in result.columns])
 
     def test_check_text_datatype_ppl_series_list_array(self):
         result, scores, metrics = train_data_type_ppl(
             "series", "list", "array")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_list_series_dataframe(self):
         result, scores, metrics = train_data_type_ppl(
             "list", "series", "dataframe")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_list_list_series(self):
         result, scores, metrics = train_data_type_ppl("list", "list", "series")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_array_series_array(self):
         result, scores, metrics = train_data_type_ppl(
             "array", "series", "array")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_series_array_dataframe(self):
         result, scores, metrics = train_data_type_ppl(
             "series", "array", "dataframe")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_array_series_list(self):
         result, scores, metrics = train_data_type_ppl(
             "array", "series", "list")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_dataframe_list_series(self):
         result, scores, metrics = train_data_type_ppl(
             "dataframe", "list", "series")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_series_series_dataframe(self):
         result, scores, metrics = train_data_type_ppl(
             "series", "series", "dataframe")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
     def test_check_text_datatype_ppl_dataframe_series_series(self):
         result, scores, metrics = train_data_type_ppl(
             "dataframe", "series", "series")
         assert len(result) == 4
-        assert_almost_equal(metrics['Log-loss'].item(), 0.69314718)
-        assert_array_equal(scores['Score.0'].values, scores['Score.1'].values)
-        assert_array_equal(scores['Score.0'].values, [0.5, 0.5, 0.5, 0.5])
+        assert_almost_equal(metrics['Log-loss'].item(), 0.4402459)
+        assert_array_equal(scores['Score.0'].values, result['Score.0'].values)
+        assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186])
 
 
 if __name__ == '__main__':

diff --git a/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py b/src/python/nimbusml/tests/feature_extraction/text/test_ngramfeaturizer.py
@@ -38,7 +38,7 @@ def test_ngramfeaturizer(self):
         X_train = texttransform.fit_transform(X_train[:100])
         sum = X_train.iloc[:].sum().sum()
         print(sum)
-        assert_equal(sum, 4594, "sum of all features is incorrect!")
+        assert_equal(sum, 30513, "sum of all features is incorrect!")
 
 
 if __name__ == '__main__':

diff --git a/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py b/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py
@@ -91,7 +91,7 @@ def test_word_embedding_example(self):
         ])
 
         features = pipeline.fit_transform(data)
-        assert features.shape == (248, 409)
+        assert features.shape == (248, 802)
 
     # TODO: fix ssl issue on test centos7 & ubuntu14 boxes.
     # Test works on ubuntu16.
@@ -127,7 +127,7 @@ def test_word_embedding_example2(self):
         ])
 
         features = pipeline.fit_transform(data)
-        assert features.shape == (248, 409)
+        assert features.shape == (248, 802)
         assert 'features_TransformedText.94' in list(features.columns)
 
     # TODO: fix ssl issue on test centos7 & ubuntu14 boxes.
@@ -166,7 +166,7 @@ def test_word_embedding_example_dict_same_name(self):
         ])
 
         features = pipeline.fit_transform(data)
-        assert features.shape == (248, 409)
+        assert features.shape == (248, 802)
 
     @unittest.skip('System.ArgumentOutOfRangeException')
     def test_word_embedding_example_dict_newname(self):

diff --git a/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py b/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py
@@ -91,7 +91,7 @@ def test_ngramfeaturizer(self):
         textt = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
         X = textt.fit_transform(X)
 
-        assert X.shape == (25, 21)
+        assert X.shape == (25, 116)
 
         mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
         X_test = textt.transform(test_reviews)
@@ -180,7 +180,7 @@ def test_ngramfeaturizer_syntax_dict(self):
             'outg': ['review']}
         X = textt.fit_transform(X)
 
-        assert X.shape == (25, 22)
+        assert X.shape == (25, 117)
         # columns ordering changed between 0.22 and 0.23
         assert 'review' in (X.columns[0], X.columns[-1])
         X = X.drop('review', axis=1)
@@ -204,7 +204,7 @@ def test_ngramfeaturizer_single(self):
                              columns={'features': ['id', 'education']})
 
         features = xf.fit_transform(data)
-        assert features.shape == (248, 259)
+        assert features.shape == (248, 652)
 
     def test_ngramfeaturizer_multi(self):
 

diff --git a/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py b/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py
@@ -146,4 +146,4 @@ def test_syntax9_multiple_inputs(self):
         ng4 = NGramFeaturizer(word_feature_extractor=n_gram()) << {
             'out1': ['education1', 'education2']}
         output4 = ng4.fit_transform(X)
-        assert output4.shape == (5, 7)
+        assert output4.shape == (5, 13)
diff --git a/src/python/setup.py b/src/python/setup.py
@@ -40,7 +40,7 @@
     # Versions should comply with PEP440.  For a discussion on
     # single-sourcing the version across setup.py and the project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='0.6.0',
+    version='0.6.1',
 
     description='NimbusML',
     long_description=long_description,

diff --git a/src/python/tools/code_fixer.py b/src/python/tools/code_fixer.py
@@ -87,7 +87,9 @@
                       ('name=name,', 'output=output,')],
     'NGramFeaturizer': [(NG_1, NG_1_correct),
                         ('word_feature_extractor = n_gram',
-                         'word_feature_extractor = Ngram')],
+                         'word_feature_extractor = Ngram'),
+                        ('char_feature_extractor = n_gram',
+                         'char_feature_extractor = Ngram')],
     'CountSelector': ('count = 0,', 'count = 1.0,'),
     'OneClassSvmAnomalyDetector': (
         'label_column=label_column,', 'label_column=None,'),

diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json
@@ -682,13 +682,7 @@
       "Name": "Transforms.TextFeaturizer",
       "NewName": "NGramFeaturizer",
       "Module": "feature_extraction.text",
-      "Type": "Transform",
-      "Inputs": [
-        {
-          "Name": "CharFeatureExtractor",
-          "Default": null
-        }
-      ]
+      "Type": "Transform"
     },
     {
       "Name": "Transforms.WordEmbeddings",

diff --git a/version.txt b/version.txt
@@ -1 +1 @@
-0.6.0
+0.6.1
Original file line number	Diff line number	Diff line change
Expand Up		@@ -347,3 +347,4 @@ _doc_report.txt
		data.csv
		data.txt

		/build/TestCoverageReport