microsoft · pieths · Oct 10, 2019 · Oct 9, 2019 · Oct 9, 2019 · ganik
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
@@ -91,7 +91,9 @@
     <Compile Include="nimbusml\examples\BootStrapSample.py" />
     <Compile Include="nimbusml\examples\CharTokenizer.py" />
     <Compile Include="nimbusml\examples\ColumnConcatenator.py" />
+    <Compile Include="nimbusml\examples\examples_from_dataframe\NGramExtractor_df.py" />
     <Compile Include="nimbusml\examples\examples_from_dataframe\PrefixColumnConcatenator_df.py" />
+    <Compile Include="nimbusml\examples\NGramExtractor.py" />
     <Compile Include="nimbusml\examples\WordTokenizer.py" />
     <Compile Include="nimbusml\examples\PrefixColumnConcatenator.py" />
     <Compile Include="nimbusml\examples\ColumnDropper.py" />
@@ -240,6 +242,7 @@
     <Compile Include="nimbusml\feature_extraction\text\extractor\ngramhash.py" />
     <Compile Include="nimbusml\feature_extraction\text\extractor\__init__.py" />
     <Compile Include="nimbusml\feature_extraction\text\lightlda.py" />
+    <Compile Include="nimbusml\feature_extraction\text\ngramextractor.py" />
     <Compile Include="nimbusml\feature_extraction\text\stopwords\customstopwordsremover.py" />
     <Compile Include="nimbusml\feature_extraction\text\stopwords\predefinedstopwordsremover.py" />
     <Compile Include="nimbusml\feature_extraction\text\stopwords\__init__.py" />
@@ -287,6 +290,7 @@
     <Compile Include="nimbusml\internal\core\feature_extraction\text\extractor\ngramhash.py" />
     <Compile Include="nimbusml\internal\core\feature_extraction\text\extractor\__init__.py" />
     <Compile Include="nimbusml\internal\core\feature_extraction\text\lightlda.py" />
+    <Compile Include="nimbusml\internal\core\feature_extraction\text\ngramextractor.py" />
     <Compile Include="nimbusml\internal\core\feature_extraction\text\stopwords\customstopwordsremover.py" />
     <Compile Include="nimbusml\internal\core\feature_extraction\text\stopwords\predefinedstopwordsremover.py" />
     <Compile Include="nimbusml\internal\core\feature_extraction\text\stopwords\__init__.py" />
@@ -683,6 +687,7 @@
     <Compile Include="nimbusml\tests\ensemble\test_lightgbmranker.py" />
     <Compile Include="nimbusml\tests\ensemble\test_lightgbmregressor.py" />
     <Compile Include="nimbusml\tests\ensemble\__init__.py" />
+    <Compile Include="nimbusml\tests\feature_extraction\text\test_ngramextractor.py" />
     <Compile Include="nimbusml\tests\feature_extraction\text\test_sentiment.py" />
     <Compile Include="nimbusml\tests\idv\__init__.py" />
     <Compile Include="nimbusml\tests\linear_model\test_linearsvmbinaryclassifier.py" />

diff --git a/src/python/nimbusml/examples/NGramExtractor.py b/src/python/nimbusml/examples/NGramExtractor.py
@@ -0,0 +1,37 @@
+###############################################################################
+# NGramExtractor
+from nimbusml import FileDataStream, Pipeline
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing.schema import ColumnDropper
+from nimbusml.preprocessing.text import CharTokenizer
+from nimbusml.feature_extraction.text import NGramExtractor
+
+# data input (as a FileDataStream)
+path = get_dataset("wiki_detox_train").as_filepath()
+
+data = FileDataStream.read_csv(path, sep='\t')
+print(data.head())
+#   Sentiment                                      SentimentText
+# 0          1  ==RUDE== Dude, you are rude upload that carl p...
+# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
+# 2          1  Stop trolling, zapatancas, calling me a liar m...
+# 3          1  ==You're cool==  You seem like a really cool g...
+# 4          1  ::::: Why are you threatening me? I'm not bein...
+
+# transform usage: CharTokenizer -> NGramExtractor (1-grams only) -> drop the source columns
+pipe = Pipeline([
+        CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),  # {output: input}
+        NGramExtractor(ngram_length=1, all_lengths=False, columns={'Ngrams': 'SentimentText_Transform'}),
+        ColumnDropper(columns=['SentimentText_Transform', 'SentimentText', 'Sentiment'])
+        ])
+
+# fit the pipeline on the data stream and get the transformed features back
+features = pipe.fit_transform(data)
+
+print(features.head())
+#    Ngrams.<␂>  Ngrams.=  Ngrams.R  Ngrams.U  Ngrams.D  Ngrams.E  ...
+# 0         1.0       4.0       1.0       1.0       2.0       1.0  ...
+# 1         1.0       4.0       0.0       0.0       2.0       3.0  ...
+# 2         1.0       0.0       0.0       0.0       0.0       0.0  ...
+# 3         1.0       4.0       0.0       0.0       0.0       0.0  ...
+# 4         1.0       0.0       0.0       0.0       0.0       0.0  ...
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/NGramExtractor_df.py b/src/python/nimbusml/examples/examples_from_dataframe/NGramExtractor_df.py
@@ -0,0 +1,112 @@
+###############################################################################
+# Example with NGramExtractor and LogisticRegressionBinaryClassifier
+import pandas
+from nimbusml import Pipeline
+from nimbusml.feature_extraction.text import NGramExtractor
+from nimbusml.linear_model import LogisticRegressionBinaryClassifier
+from nimbusml.preprocessing.schema import ColumnConcatenator, ColumnDropper
+from nimbusml.preprocessing.text import CharTokenizer
+
+train_reviews = pandas.DataFrame(
+    data=dict(
+        review=[
+            "This is great",
+            "I hate it",
+            "Love it",
+            "Do not like it",
+            "Really like it",
+            "I hate it",
+            "I like it a lot",
+            "I kind of hate it",
+            "I do like it",
+            "I really hate it",
+            "It is very good",
+            "I hate it a bunch",
+            "I love it a bunch",
+            "I hate it",
+            "I like it very much",
+            "I hate it very much.",
+            "I really do love it",
+            "I really do hate it",
+            "Love it!",
+            "Hate it!",
+            "I love it",
+            "I hate it",
+            "I love it",
+            "I hate it",
+            "I love it"],
+        like=[
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True,
+            False,
+            True]))
+
+test_reviews = pandas.DataFrame(
+    data=dict(
+        review=[
+            "This is great",
+            "I hate it",
+            "Love it",
+            "Really like it",
+            "I hate it",
+            "I like it a lot",
+            "I love it",
+            "I do like it",
+            "I really hate it",
+            "I love it"]))
+
+y = train_reviews['like']  # label column
+X = train_reviews.loc[:, train_reviews.columns != 'like']  # every column except the label
+
+pipeline = Pipeline([
+    CharTokenizer(columns={'review_transform': 'review'}),  # {output: input}
+    NGramExtractor(ngram_length=3, all_lengths=False, columns={'ngrams': 'review_transform'}),
+    ColumnDropper(columns=['review_transform', 'review'])  # drop the tokenized and the raw text columns
+])
+X = pipeline.fit_transform(X)  # fits the featurization pipeline and replaces X with its output
+
+print(X.head())
+#    ngrams.<␂>|T|h  ngrams.T|h|i  ngrams.h|i|s  ngrams.i|s|<␠>  ...  ngrams.i|t|!  ngrams.t|!|<␃>  ngrams.<␂>|H|a  ngrams.H|a|t
+# 0             1.0           1.0           1.0             2.0  ...           0.0             0.0             0.0           0.0
+# 1             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
+# 2             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
+# 3             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
+# 4             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
+
+model = LogisticRegressionBinaryClassifier().fit(X, y)  # train on the featurized reviews
+
+X_test = pipeline.transform(test_reviews)  # transform only: reuses the pipeline fitted on the training reviews
+result = model.predict(X_test)
+
+print(result)
+# 0     True
+# 1    False
+# 2     True
+# 3     True
+# 4    False
+# 5     True
+# 6     True
+# 7     True
+# 8    False
+# 9     True
diff --git a/src/python/nimbusml/feature_extraction/text/__init__.py b/src/python/nimbusml/feature_extraction/text/__init__.py
@@ -1,10 +1,12 @@
 from .lightlda import LightLda
+from .ngramextractor import NGramExtractor
 from .ngramfeaturizer import NGramFeaturizer
 from .sentiment import Sentiment
 from .wordembedding import WordEmbedding
 
 __all__ = [
     'LightLda',
+    'NGramExtractor',
     'NGramFeaturizer',
     'Sentiment',
     'WordEmbedding'

diff --git a/src/python/nimbusml/feature_extraction/text/ngramextractor.py b/src/python/nimbusml/feature_extraction/text/ngramextractor.py
@@ -0,0 +1,72 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+NGramExtractor
+"""
+
+__all__ = ["NGramExtractor"]
+
+
+from sklearn.base import TransformerMixin
+
+from ...base_transform import BaseTransform
+from ...internal.core.feature_extraction.text.ngramextractor import \
+    NGramExtractor as core
+from ...internal.utils.utils import trace
+
+
+class NGramExtractor(core, BaseTransform, TransformerMixin):
+    """
+    **Description**
+        Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag.
+
+    :param columns: see `Columns </nimbusml/concepts/columns>`_.
+
+    :param ngram_length: Maximum n-gram length. Defaults to 2.
+
+    :param all_lengths: Whether to store all n-gram lengths up to
+        ``ngram_length`` (the default), or only ``ngram_length``.
+
+    :param skip_length: Maximum number of tokens to skip when constructing an
+        n-gram. Defaults to 0.
+
+    :param max_num_terms: Maximum number of n-grams to store in the dictionary. The default is the one-element list ``[10000000]``.
+
+    :param weighting: The weighting criteria. Defaults to ``'Tf'``.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            ngram_length=2,
+            all_lengths=True,
+            skip_length=0,
+            max_num_terms=[10000000],  # NOTE(review): list default is one shared object across calls; do not mutate it in place
+            weighting='Tf',
+            columns=None,
+            **params):
+
+        if columns:  # forward the column mapping only when a non-empty one was given
+            params['columns'] = columns
+        BaseTransform.__init__(self, **params)
+        core.__init__(
+            self,
+            ngram_length=ngram_length,
+            all_lengths=all_lengths,
+            skip_length=skip_length,
+            max_num_terms=max_num_terms,
+            weighting=weighting,
+            **params)
+        self._columns = columns  # kept exactly as passed (may be None)
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator (delegates to ``core.get_params``; ``deep`` is accepted but not used here).
+        """
+        return core.get_params(self)
diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/ngramextractor.py b/src/python/nimbusml/internal/core/feature_extraction/text/ngramextractor.py
@@ -0,0 +1,111 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+NGramExtractor
+"""
+
+__all__ = ["NGramExtractor"]
+
+
+from ....entrypoints.transforms_ngramtranslator import \
+    transforms_ngramtranslator
+from ....utils.utils import trace
+from ...base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class NGramExtractor(BasePipelineItem, DefaultSignature):
+    """
+    **Description**
+        Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag.
+
+    :param ngram_length: Maximum n-gram length. Defaults to 2.
+
+    :param all_lengths: Whether to store all n-gram lengths up to
+        ``ngram_length`` (the default), or only ``ngram_length``.
+
+    :param skip_length: Maximum number of tokens to skip when constructing an
+        n-gram. Defaults to 0.
+
+    :param max_num_terms: Maximum number of n-grams to store in the dictionary. The default is the one-element list ``[10000000]``.
+
+    :param weighting: The weighting criteria. Defaults to ``'Tf'``.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            ngram_length=2,
+            all_lengths=True,
+            skip_length=0,
+            max_num_terms=[10000000],  # NOTE(review): list default is one shared object across calls
+            weighting='Tf',
+            **params):
+        BasePipelineItem.__init__(
+            self, type='transform', **params)
+
+        self.ngram_length = ngram_length
+        self.all_lengths = all_lengths
+        self.skip_length = skip_length
+        self.max_num_terms = max_num_terms  # NOTE(review): aliases the shared default list when the argument is omitted
+        self.weighting = weighting
+
+    @property
+    def _entrypoint(self):
+        return transforms_ngramtranslator  # the entrypoint callable that _get_node invokes
+
+    @trace
+    def _get_node(self, **all_args):
+        """Resolve and validate the input/output columns, then call the entrypoint; raises ValueError on bad columns."""
+        input_columns = self.input  # the instance setting wins over an 'input' keyword
+        if input_columns is None and 'input' in all_args:
+            input_columns = all_args['input']
+        if 'input' in all_args:
+            all_args.pop('input')  # always removed so it is not forwarded to the entrypoint
+
+        output_columns = self.output  # the instance setting wins over an 'output' keyword
+        if output_columns is None and 'output' in all_args:
+            output_columns = all_args['output']
+        if 'output' in all_args:
+            all_args.pop('output')  # always removed so it is not forwarded to the entrypoint
+
+        # validate input: must be present and must be a list
+        if input_columns is None:
+            raise ValueError(
+                "'None' input passed when it cannot be none.")
+
+        if not isinstance(input_columns, list):
+            raise ValueError(
+                "input has to be a list of strings, instead got %s" %
+                type(input_columns))
+
+        # validate output: defaults to the input column names, must be a list
+        if output_columns is None:
+            output_columns = input_columns
+
+        if not isinstance(output_columns, list):
+            raise ValueError(
+                "output has to be a list of strings, instead got %s" %
+                type(output_columns))
+
+        algo_args = dict(
+            column=[  # one Source/Name pair per input/output column; zip stops at the shorter list
+                dict(
+                    Source=i,
+                    Name=o) for i,
+                o in zip(
+                    input_columns,
+                    output_columns)] if input_columns else None,  # an empty input list yields column=None
+            ngram_length=self.ngram_length,
+            all_lengths=self.all_lengths,
+            skip_length=self.skip_length,
+            max_num_terms=self.max_num_terms,
+            weighting=self.weighting)
+
+        all_args.update(algo_args)  # algo_args override same-named keys already in all_args
+        return self._entrypoint(**all_args)