This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Initial implementation of the WordTokenizer transform. #296

Merged 10 commits on Oct 3, 2019
6 changes: 6 additions & 0 deletions src/python/nimbusml.pyproj
@@ -92,6 +92,7 @@
<Compile Include="nimbusml\examples\CharTokenizer.py" />
<Compile Include="nimbusml\examples\ColumnConcatenator.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\PrefixColumnConcatenator_df.py" />
<Compile Include="nimbusml\examples\WordTokenizer.py" />
<Compile Include="nimbusml\examples\PrefixColumnConcatenator.py" />
<Compile Include="nimbusml\examples\ColumnDropper.py" />
<Compile Include="nimbusml\examples\ColumnDuplicator.py" />
@@ -102,6 +103,7 @@
<Compile Include="nimbusml\examples\examples_from_dataframe\AveragedPerceptronBinaryClassifier_infert_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\Binner_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\BootStrapSample_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\WordTokenizer_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\CharTokenizer_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\ColumnConcatenator_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\ColumnDuplicator_df.py" />
@@ -306,6 +308,7 @@
<Compile Include="nimbusml\internal\core\preprocessing\schema\columnduplicator.py" />
<Compile Include="nimbusml\internal\core\preprocessing\schema\columndropper.py" />
<Compile Include="nimbusml\internal\core\preprocessing\tensorflowscorer.py" />
<Compile Include="nimbusml\internal\core\preprocessing\text\wordtokenizer.py" />
<Compile Include="nimbusml\internal\core\timeseries\iidchangepointdetector.py" />
<Compile Include="nimbusml\internal\core\timeseries\iidspikedetector.py" />
<Compile Include="nimbusml\internal\core\timeseries\ssachangepointdetector.py" />
@@ -451,6 +454,7 @@
<Compile Include="nimbusml\internal\entrypoints\transforms_twoheterogeneousmodelcombiner.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_vectortoimage.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_wordembeddings.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_wordtokenizer.py" />
<Compile Include="nimbusml\internal\entrypoints\_boosterparameterfunction_dart.py" />
<Compile Include="nimbusml\internal\entrypoints\_boosterparameterfunction_gbdt.py" />
<Compile Include="nimbusml\internal\entrypoints\_boosterparameterfunction_goss.py" />
@@ -649,6 +653,7 @@
<Compile Include="nimbusml\preprocessing\schema\__init__.py" />
<Compile Include="nimbusml\preprocessing\tensorflowscorer.py" />
<Compile Include="nimbusml\preprocessing\text\chartokenizer.py" />
<Compile Include="nimbusml\preprocessing\text\wordtokenizer.py" />
<Compile Include="nimbusml\preprocessing\text\__init__.py" />
<Compile Include="nimbusml\preprocessing\tokey.py" />
<Compile Include="nimbusml\preprocessing\__init__.py" />
@@ -685,6 +690,7 @@
<Compile Include="nimbusml\tests\preprocessing\normalization\test_meanvariancescaler.py" />
<Compile Include="nimbusml\tests\preprocessing\schema\test_prefixcolumnconcatenator.py" />
<Compile Include="nimbusml\tests\preprocessing\test_datasettransformer.py" />
<Compile Include="nimbusml\tests\preprocessing\text\test_wordtokenizer.py" />
<Compile Include="nimbusml\tests\test_csr_matrix_output.py" />
<Compile Include="nimbusml\tests\test_variable_column.py" />
<Compile Include="nimbusml\tests\timeseries\test_iidchangepointdetector.py" />
32 changes: 32 additions & 0 deletions src/python/nimbusml/examples/WordTokenizer.py
@@ -0,0 +1,32 @@
###############################################################################
# WordTokenizer

from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.text import WordTokenizer

# data input (as a FileDataStream)
path = get_dataset("wiki_detox_train").as_filepath()

data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
# Sentiment SentimentText
# 0 1 ==RUDE== Dude, you are rude upload that carl p...
# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
# 2 1 Stop trolling, zapatancas, calling me a liar m...
# 3 1 ==You're cool== You seem like a really cool g...
# 4 1 ::::: Why are you threatening me? I'm not bein...

tokenize = WordTokenizer(char_array_term_separators=[" "]) << {'wt': 'SentimentText'}
pipeline = Pipeline([tokenize])

tokenize.fit(data)
y = tokenize.transform(data)

print(y.drop(labels='SentimentText', axis=1).head())
# Sentiment wt.000 wt.001 wt.002 wt.003 wt.004 wt.005 ... wt.366 wt.367 wt.368 wt.369 wt.370 wt.371 wt.372
# 0 1 ==RUDE== Dude, you are rude upload ... None None None None None None None
# 1 1 == OK! == IM GOING TO ... None None None None None None None
# 2 1 Stop trolling, zapatancas, calling me a ... None None None None None None None
# 3 1 ==You're cool== You seem like a ... None None None None None None None
# 4 1 ::::: Why are you threatening me? ... None None None None None None None
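The transform emits one output column per token position (wt.000 through wt.372 here), padding shorter rows with None. A minimal pandas sketch, assuming the DataFrame y produced above, that collapses this wide layout back into one token list per row:

# Hedged sketch: gather the wt.* position columns back into a Python
# list of tokens per row, dropping the None padding.
token_cols = [c for c in y.columns if c.startswith('wt.')]
tokens_per_row = y[token_cols].apply(
    lambda row: [t for t in row if t is not None], axis=1)
print(tokens_per_row.head())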
33 changes: 33 additions & 0 deletions src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
@@ -0,0 +1,33 @@
###############################################################################
# WordTokenizer

import pandas
from nimbusml import Pipeline
from nimbusml.preprocessing.text import WordTokenizer

# create the data
customer_reviews = pandas.DataFrame(data=dict(review=[
"I really did not like the taste of it",
"It was surprisingly quite good!",
"I will never ever ever go to that place again!!",
"The best ever!! It was amazingly good and super fast",
"I wish I had gone earlier, it was that great",
"somewhat dissapointing. I'd probably wont try again",
"Never visit again... rascals!"]))

tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'

pipeline = Pipeline([tokenize])

tokenize.fit(customer_reviews)
y = tokenize.transform(customer_reviews)

print(y)
# review.00 review.01 review.02 review.03 review.04 review.05 review.06 review.07 review.08 review.09 review.10 review.11
# 0 I really did ot like the taste of it None None None
# 1 It was surprisi gly quite good! None None None None None None
# 2 I will ever ever ever go to that place agai !! None
# 3 The best ever!! It was amazi gly good a d super fast
# 4 I wish I had go e earlier, it was that great None
# 5 somewhat dissapoi ti g. I'd probably wo t try agai None None
# 6 Never visit agai ... rascals! None None None None None None None
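Because "n" is listed as a separator, every "n" both splits the text and disappears from it, which is why "not" becomes "ot" and "never" becomes "ever" above; empty tokens produced between adjacent separators are dropped. A plain-Python sketch of these separator semantics, for intuition only and not nimbusml's implementation:

import re

# Each character in char_array_term_separators acts as a split point
# and is discarded; empty tokens are filtered out.
separators = [" ", "n"]
pattern = "[" + re.escape("".join(separators)) + "]"
text = "I really did not like the taste of it"
print([t for t in re.split(pattern, text) if t])
# ['I', 'really', 'did', 'ot', 'like', 'the', 'taste', 'of', 'it']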
89 changes: 89 additions & 0 deletions src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,89 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
WordTokenizer
"""

__all__ = ["WordTokenizer"]


from ....entrypoints.transforms_wordtokenizer import transforms_wordtokenizer
from ....utils.utils import trace
from ...base_pipeline_item import BasePipelineItem, DefaultSignature


class WordTokenizer(BasePipelineItem, DefaultSignature):
"""
**Description**
The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The default separator is the space character, but any other character (or multiple characters) can be specified.

:param char_array_term_separators: Array of single-character term
separators; the space character is used by default.

:param params: Additional arguments sent to compute engine.

"""

@trace
def __init__(
self,
char_array_term_separators=None,
**params):
BasePipelineItem.__init__(
self, type='transform', **params)

self.char_array_term_separators = char_array_term_separators

@property
def _entrypoint(self):
return transforms_wordtokenizer

@trace
def _get_node(self, **all_args):

input_columns = self.input
if input_columns is None and 'input' in all_args:
input_columns = all_args['input']
if 'input' in all_args:
all_args.pop('input')

output_columns = self.output
if output_columns is None and 'output' in all_args:
output_columns = all_args['output']
if 'output' in all_args:
all_args.pop('output')

# validate input
if input_columns is None:
raise ValueError(
"'None' input passed when it cannot be none.")

if not isinstance(input_columns, list):
raise ValueError(
"input has to be a list of strings, instead got %s" %
type(input_columns))

# validate output
if output_columns is None:
output_columns = input_columns

if not isinstance(output_columns, list):
raise ValueError(
"output has to be a list of strings, instead got %s" %
type(output_columns))

algo_args = dict(
    column=[
        dict(Source=i, Name=o)
        for i, o in zip(input_columns, output_columns)
    ] if input_columns else None,
    char_array_term_separators=self.char_array_term_separators)

all_args.update(algo_args)
return self._entrypoint(**all_args)
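_get_node pairs input and output column names positionally and hands them to the entrypoint as Source/Name mappings. A small sketch of the column argument it assembles, using hypothetical column names:

# Hypothetical names, for illustration of the structure built above.
input_columns = ['SentimentText']
output_columns = ['wt']
column = [dict(Source=i, Name=o)
          for i, o in zip(input_columns, output_columns)]
# column == [{'Source': 'SentimentText', 'Name': 'wt'}]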
76 changes: 76 additions & 0 deletions src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py
@@ -0,0 +1,76 @@
# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
Transforms.WordTokenizer
"""


from ..utils.entrypoints import EntryPoint
from ..utils.utils import try_set, unlist


def transforms_wordtokenizer(
data,
output_data=None,
model=None,
column=None,
char_array_term_separators=None,
**params):
"""
**Description**
The input to this transform is text, and the output is a vector of
text containing the words (tokens) in the original text. The default
separator is the space character, but any other character (or
multiple characters) can be specified.

:param column: New column definition(s) (inputs).
:param data: Input dataset (inputs).
:param char_array_term_separators: Array of single-character term
separators; the space character is used by default.
(inputs).
:param output_data: Transformed dataset (outputs).
:param model: Transform model (outputs).
"""

entrypoint_name = 'Transforms.WordTokenizer'
inputs = {}
outputs = {}

if column is not None:
inputs['Column'] = try_set(
obj=column,
none_acceptable=True,
is_of_type=list,
is_column=True)
if data is not None:
inputs['Data'] = try_set(
obj=data,
none_acceptable=False,
is_of_type=str)
if char_array_term_separators is not None:
inputs['CharArrayTermSeparators'] = try_set(
obj=char_array_term_separators,
none_acceptable=True,
is_of_type=list)
if output_data is not None:
outputs['OutputData'] = try_set(
obj=output_data,
none_acceptable=False,
is_of_type=str)
if model is not None:
outputs['Model'] = try_set(
obj=model,
none_acceptable=False,
is_of_type=str)

input_variables = {
x for x in unlist(inputs.values())
if isinstance(x, str) and x.startswith("$")}
output_variables = {
x for x in unlist(outputs.values())
if isinstance(x, str) and x.startswith("$")}

entrypoint = EntryPoint(
name=entrypoint_name, inputs=inputs, outputs=outputs,
input_variables=input_variables,
output_variables=output_variables)
return entrypoint
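The two set comprehensions above collect every "$"-prefixed string among the input and output values as a graph variable for the pipeline runner to wire up. A simplified sketch of that extraction; unlist here is a stand-in for nimbusml's internal helper, and the "$" names are made up:

def unlist(values):
    # Stand-in: flatten one level of list nesting.
    for v in values:
        if isinstance(v, list):
            yield from v
        else:
            yield v

inputs = {'Data': '$input_data', 'CharArrayTermSeparators': [' ']}
outputs = {'OutputData': '$output_data', 'Model': '$transform_model'}
input_variables = {x for x in unlist(inputs.values())
                   if isinstance(x, str) and x.startswith('$')}
# input_variables == {'$input_data'}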
4 changes: 3 additions & 1 deletion src/python/nimbusml/preprocessing/text/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
 from .chartokenizer import CharTokenizer
+from .wordtokenizer import WordTokenizer
 
 __all__ = [
-    'CharTokenizer'
+    'CharTokenizer',
+    'WordTokenizer'
 ]
55 changes: 55 additions & 0 deletions src/python/nimbusml/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,55 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
WordTokenizer
"""

__all__ = ["WordTokenizer"]


from sklearn.base import TransformerMixin

from ...base_transform import BaseTransform
from ...internal.core.preprocessing.text.wordtokenizer import \
WordTokenizer as core
from ...internal.utils.utils import trace


class WordTokenizer(core, BaseTransform, TransformerMixin):
"""
**Description**
The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The default separator is the space character, but any other character (or multiple characters) can be specified.

:param columns: see `Columns </nimbusml/concepts/columns>`_.

:param char_array_term_separators: Array of single-character term
separators; the space character is used by default.

:param params: Additional arguments sent to compute engine.

"""

@trace
def __init__(
self,
char_array_term_separators=None,
columns=None,
**params):

if columns:
params['columns'] = columns
BaseTransform.__init__(self, **params)
core.__init__(
self,
char_array_term_separators=char_array_term_separators,
**params)
self._columns = columns

def get_params(self, deep=False):
"""
Get the parameters for this operator.
"""
return core.get_params(self)
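Besides the << operator used in the examples, the column mapping can be supplied up front through the columns parameter; a minimal sketch, assuming the usual nimbusml {output: input} dict convention:

from nimbusml.preprocessing.text import WordTokenizer

# Equivalent, under that assumption, to:
#   WordTokenizer(...) << {'wt': 'SentimentText'}
tokenize = WordTokenizer(
    char_array_term_separators=[" "],
    columns={'wt': 'SentimentText'})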
33 changes: 33 additions & 0 deletions src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py
@@ -0,0 +1,33 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
import unittest

import pandas
from nimbusml import Pipeline
from nimbusml.preprocessing.text import WordTokenizer


class TestWordTokenizer(unittest.TestCase):

def test_wordtokenizer(self):
customer_reviews = pandas.DataFrame(data=dict(review=[
"I really did not like the taste of it",
"It was surprisingly quite good!"]))

tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'
pipeline = Pipeline([tokenize])

tokenize.fit(customer_reviews)
y = tokenize.transform(customer_reviews)

self.assertEqual(y.shape, (2, 9))

self.assertEqual(y.loc[0, 'review.3'], 'ot')
self.assertEqual(y.loc[1, 'review.3'], 'gly')
self.assertEqual(y.loc[1, 'review.6'], None)


if __name__ == '__main__':
unittest.main()
2 changes: 1 addition & 1 deletion src/python/tests/test_estimator_checks.py
@@ -160,7 +160,7 @@
'PixelExtractor, Loader, Resizer, \
GlobalContrastRowScaler, PcaTransformer, '
'ColumnConcatenator, Sentiment, CharTokenizer, LightLda, '
-'NGramFeaturizer, WordEmbedding, LpScaler',
+'NGramFeaturizer, WordEmbedding, LpScaler, WordTokenizer',
'check_transformer_data_not_an_array, check_pipeline_consistency, '
'check_fit2d_1feature, check_estimators_fit_returns_self,\
check_fit2d_1sample, '
2 changes: 1 addition & 1 deletion src/python/tools/entrypoint_compiler.py
@@ -1457,7 +1457,7 @@ def parse_arg(argument, inout):
assert not is_column
arg_obj = NumericArrayArg(argument, inout)
elif itemType in ["String", "DataView", "PredictorModel",
"TransformModel", "Node"]:
"TransformModel", "Node", "Char"]:
arg_obj = StringArrayArg(argument, inout,
is_column=is_column)
elif isinstance(itemType, dict):
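This one-word change makes the compiler treat entry-point arguments whose item type is Char, such as CharArrayTermSeparators, as string arrays. A schematic sketch of the dispatch, with string stand-ins for the compiler's argument classes:

# Schematic only; the real compiler constructs NumericArrayArg or
# StringArrayArg instances rather than returning names.
def pick_array_arg_class(item_type):
    if item_type in ["String", "DataView", "PredictorModel",
                     "TransformModel", "Node", "Char"]:
        return "StringArrayArg"
    return "NumericArrayArg"

assert pick_array_arg_class("Char") == "StringArrayArg"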