Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Initial implementation of NGramExtractor. #320

Merged
merged 2 commits into from
Oct 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/python/nimbusml.pyproj
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@
<Compile Include="nimbusml\examples\BootStrapSample.py" />
<Compile Include="nimbusml\examples\CharTokenizer.py" />
<Compile Include="nimbusml\examples\ColumnConcatenator.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\NGramExtractor_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\PrefixColumnConcatenator_df.py" />
<Compile Include="nimbusml\examples\NGramExtractor.py" />
<Compile Include="nimbusml\examples\WordTokenizer.py" />
<Compile Include="nimbusml\examples\PrefixColumnConcatenator.py" />
<Compile Include="nimbusml\examples\ColumnDropper.py" />
Expand Down Expand Up @@ -240,6 +242,7 @@
<Compile Include="nimbusml\feature_extraction\text\extractor\ngramhash.py" />
<Compile Include="nimbusml\feature_extraction\text\extractor\__init__.py" />
<Compile Include="nimbusml\feature_extraction\text\lightlda.py" />
<Compile Include="nimbusml\feature_extraction\text\ngramextractor.py" />
<Compile Include="nimbusml\feature_extraction\text\stopwords\customstopwordsremover.py" />
<Compile Include="nimbusml\feature_extraction\text\stopwords\predefinedstopwordsremover.py" />
<Compile Include="nimbusml\feature_extraction\text\stopwords\__init__.py" />
Expand Down Expand Up @@ -287,6 +290,7 @@
<Compile Include="nimbusml\internal\core\feature_extraction\text\extractor\ngramhash.py" />
<Compile Include="nimbusml\internal\core\feature_extraction\text\extractor\__init__.py" />
<Compile Include="nimbusml\internal\core\feature_extraction\text\lightlda.py" />
<Compile Include="nimbusml\internal\core\feature_extraction\text\ngramextractor.py" />
<Compile Include="nimbusml\internal\core\feature_extraction\text\stopwords\customstopwordsremover.py" />
<Compile Include="nimbusml\internal\core\feature_extraction\text\stopwords\predefinedstopwordsremover.py" />
<Compile Include="nimbusml\internal\core\feature_extraction\text\stopwords\__init__.py" />
Expand Down Expand Up @@ -683,6 +687,7 @@
<Compile Include="nimbusml\tests\ensemble\test_lightgbmranker.py" />
<Compile Include="nimbusml\tests\ensemble\test_lightgbmregressor.py" />
<Compile Include="nimbusml\tests\ensemble\__init__.py" />
<Compile Include="nimbusml\tests\feature_extraction\text\test_ngramextractor.py" />
<Compile Include="nimbusml\tests\feature_extraction\text\test_sentiment.py" />
<Compile Include="nimbusml\tests\idv\__init__.py" />
<Compile Include="nimbusml\tests\linear_model\test_linearsvmbinaryclassifier.py" />
Expand Down
37 changes: 37 additions & 0 deletions src/python/nimbusml/examples/NGramExtractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
###############################################################################
# NGramExtractor
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnDropper
from nimbusml.preprocessing.text import CharTokenizer
from nimbusml.feature_extraction.text import NGramExtractor

# data input (as a FileDataStream)
# Resolve the on-disk location of the bundled "wiki_detox_train" dataset.
path = get_dataset("wiki_detox_train").as_filepath()

# The file is tab-separated, hence sep='\t'.
data = FileDataStream.read_csv(path, sep='\t')
# Preview the first rows; the sample output is shown below.
print(data.head())
# Sentiment SentimentText
# 0 1 ==RUDE== Dude, you are rude upload that carl p...
# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
# 2 1 Stop trolling, zapatancas, calling me a liar m...
# 3 1 ==You're cool== You seem like a really cool g...
# 4 1 ::::: Why are you threatening me? I'm not bein...

# transform usage
pipe = Pipeline([
CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
NGramExtractor(ngram_length=1, all_lengths=False, columns={'Ngrams': 'SentimentText_Transform'}),
ColumnDropper(columns=['SentimentText_Transform', 'SentimentText', 'Sentiment'])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ColumnDropper(columns=['SentimentText_Transform', 'SentimentText', 'Sentiment']) [](start = 8, length = 80)

What happens if you remove ColumnDropper? Will the example still work?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it will still work. The only difference is that those columns will still be there. Those columns were removed so that the output at the end of the example would only show the columns output from the NGramExtractor.


In reply to: 333696240 [](ancestors = 333696240)

])

# fit and transform
# Fit the pipeline on the data stream and get the transformed features back
# in a single call.
features = pipe.fit_transform(data)

# Preview the first rows; the sample output is shown below. The pipeline
# above tokenizes into characters and extracts n-grams of length 1 only, so
# each Ngrams.* column corresponds to a single character.
# NOTE(review): <␂> is presumably a start-of-text marker emitted by
# CharTokenizer - confirm.
print(features.head())
# Ngrams.<␂> Ngrams.= Ngrams.R Ngrams.U Ngrams.D Ngrams.E ...
# 0 1.0 4.0 1.0 1.0 2.0 1.0 ...
# 1 1.0 4.0 0.0 0.0 2.0 3.0 ...
# 2 1.0 0.0 0.0 0.0 0.0 0.0 ...
# 3 1.0 4.0 0.0 0.0 0.0 0.0 ...
# 4 1.0 0.0 0.0 0.0 0.0 0.0 ...
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
###############################################################################
# Example with NGramExtractor and LogisticRegressionBinaryClassifier
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramExtractor
from nimbusml.linear_model import LogisticRegressionBinaryClassifier
from nimbusml.preprocessing.schema import ColumnConcatenator, ColumnDropper
from nimbusml.preprocessing.text import CharTokenizer

# Training data: review texts and, in the same order, whether the reviewer
# liked the product.
train_texts = [
    "This is great",
    "I hate it",
    "Love it",
    "Do not like it",
    "Really like it",
    "I hate it",
    "I like it a lot",
    "I kind of hate it",
    "I do like it",
    "I really hate it",
    "It is very good",
    "I hate it a bunch",
    "I love it a bunch",
    "I hate it",
    "I like it very much",
    "I hate it very much.",
    "I really do love it",
    "I really do hate it",
    "Love it!",
    "Hate it!",
    "I love it",
    "I hate it",
    "I love it",
    "I hate it",
    "I love it",
]
train_labels = [
    True, False, True, False, True,
    False, True, False, True, False,
    True, False, True, False, True,
    False, True, False, True, False,
    True, False, True, False, True,
]
train_reviews = pandas.DataFrame(
    data={'review': train_texts, 'like': train_labels})

# Unlabelled reviews used for prediction at the end of the example.
test_texts = [
    "This is great",
    "I hate it",
    "Love it",
    "Really like it",
    "I hate it",
    "I like it a lot",
    "I love it",
    "I do like it",
    "I really hate it",
    "I love it",
]
test_reviews = pandas.DataFrame(data={'review': test_texts})

# Split the training frame into the label and everything else.
y = train_reviews['like']
X = train_reviews.drop('like', axis=1)

# Character tokens -> 3-grams only -> drop the intermediate and raw text
# columns so that only the n-gram features remain.
tokenizer = CharTokenizer(columns={'review_transform': 'review'})
extractor = NGramExtractor(
    ngram_length=3,
    all_lengths=False,
    columns={'ngrams': 'review_transform'})
dropper = ColumnDropper(columns=['review_transform', 'review'])
pipeline = Pipeline([tokenizer, extractor, dropper])
X = pipeline.fit_transform(X)

print(X.head())
# ngrams.<␂>|T|h ngrams.T|h|i ngrams.h|i|s ngrams.i|s|<␠> ... ngrams.i|t|! ngrams.t|!|<␃> ngrams.<␂>|H|a ngrams.H|a|t
# 0 1.0 1.0 1.0 2.0 ... 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
# 3 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
# 4 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0

# Train the classifier on the extracted n-gram features.
classifier = LogisticRegressionBinaryClassifier()
model = classifier.fit(X, y)

# Featurize the test reviews with the already-fitted pipeline, then predict.
X_test = pipeline.transform(test_reviews)
result = model.predict(X_test)

print(result)
# 0 True
# 1 False
# 2 True
# 3 True
# 4 False
# 5 True
# 6 True
# 7 True
# 8 False
# 9 True
2 changes: 2 additions & 0 deletions src/python/nimbusml/feature_extraction/text/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from .lightlda import LightLda
from .ngramextractor import NGramExtractor
from .ngramfeaturizer import NGramFeaturizer
from .sentiment import Sentiment
from .wordembedding import WordEmbedding

__all__ = [
'LightLda',
'NGramExtractor',
'NGramFeaturizer',
'Sentiment',
'WordEmbedding'
Expand Down
72 changes: 72 additions & 0 deletions src/python/nimbusml/feature_extraction/text/ngramextractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
NGramExtractor
"""

__all__ = ["NGramExtractor"]


from sklearn.base import TransformerMixin

from ...base_transform import BaseTransform
from ...internal.core.feature_extraction.text.ngramextractor import \
NGramExtractor as core
from ...internal.utils.utils import trace


class NGramExtractor(core, BaseTransform, TransformerMixin):
    """
    **Description**
        Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag.

    :param columns: see `Columns </nimbusml/concepts/columns>`_.

    :param ngram_length: Maximum n-gram length.

    :param all_lengths: Whether to store all n-gram lengths up to ngramLength,
        or only ngramLength.

    :param skip_length: Maximum number of tokens to skip when constructing an
        n-gram.

    :param max_num_terms: Maximum number of n-grams to store in the dictionary.
        ``None`` (the default) is replaced by a new ``[10000000]`` list.

    :param weighting: The weighting criteria.

    :param params: Additional arguments sent to compute engine.

    """

    @trace
    def __init__(
            self,
            ngram_length=2,
            all_lengths=True,
            skip_length=0,
            max_num_terms=None,
            weighting='Tf',
            columns=None,
            **params):

        # A list literal in the signature is created once and would be shared
        # by every instance built without an explicit value, so the default
        # is materialised per call instead.
        # NOTE: this module is marked as generated by
        # tools/entrypoint_compiler.py; the generator needs the same change.
        if max_num_terms is None:
            max_num_terms = [10000000]

        # Only forward 'columns' when it is non-empty.
        if columns:
            params['columns'] = columns
        BaseTransform.__init__(self, **params)
        core.__init__(
            self,
            ngram_length=ngram_length,
            all_lengths=all_lengths,
            skip_length=skip_length,
            max_num_terms=max_num_terms,
            weighting=weighting,
            **params)
        self._columns = columns

    def get_params(self, deep=False):
        """
        Get the parameters for this operator.

        :param deep: accepted but not forwarded to the core implementation
            (presumably kept for scikit-learn API compatibility).
        """
        return core.get_params(self)
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
NGramExtractor
"""

__all__ = ["NGramExtractor"]


from ....entrypoints.transforms_ngramtranslator import \
transforms_ngramtranslator
from ....utils.utils import trace
from ...base_pipeline_item import BasePipelineItem, DefaultSignature


class NGramExtractor(BasePipelineItem, DefaultSignature):
    """
    **Description**
        Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag.

    :param ngram_length: Maximum n-gram length.

    :param all_lengths: Whether to store all n-gram lengths up to ngramLength,
        or only ngramLength.

    :param skip_length: Maximum number of tokens to skip when constructing an
        n-gram.

    :param max_num_terms: Maximum number of n-grams to store in the dictionary.
        ``None`` (the default) is replaced by a new ``[10000000]`` list.

    :param weighting: The weighting criteria.

    :param params: Additional arguments sent to compute engine.

    """

    @trace
    def __init__(
            self,
            ngram_length=2,
            all_lengths=True,
            skip_length=0,
            max_num_terms=None,
            weighting='Tf',
            **params):
        BasePipelineItem.__init__(
            self, type='transform', **params)

        # The value is stored on the instance as-is, so a list literal in the
        # signature would be one object shared by every default-constructed
        # instance. Build a fresh list per instance instead.
        # NOTE: this module is marked as generated by
        # tools/entrypoint_compiler.py; the generator needs the same change.
        if max_num_terms is None:
            max_num_terms = [10000000]

        self.ngram_length = ngram_length
        self.all_lengths = all_lengths
        self.skip_length = skip_length
        self.max_num_terms = max_num_terms
        self.weighting = weighting

    @property
    def _entrypoint(self):
        """Entrypoint callable used to build this transform's node."""
        return transforms_ngramtranslator

    @trace
    def _get_node(self, **all_args):
        """
        Build the entrypoint node for this transform.

        Column names set on the instance (``self.input`` / ``self.output``)
        take precedence over the ``input`` / ``output`` entries of
        ``all_args``. Those two keys are always removed from ``all_args``
        before the remaining arguments are forwarded to the entrypoint.

        :param all_args: keyword arguments for the entrypoint; may carry
            ``input`` and ``output`` column lists.

        :return: the result of calling the entrypoint.

        :raises ValueError: if no input columns are available, or if the
            input or output columns are not given as a list.
        """
        input_columns = self.input
        passed_input = all_args.pop('input', None)
        if input_columns is None:
            input_columns = passed_input

        output_columns = self.output
        passed_output = all_args.pop('output', None)
        if output_columns is None:
            output_columns = passed_output

        # validate input
        if input_columns is None:
            raise ValueError(
                "'None' input passed when it cannot be none.")

        if not isinstance(input_columns, list):
            raise ValueError(
                "input has to be a list of strings, instead got %s" %
                type(input_columns))

        # validate output
        # Without explicit output names the columns keep their input names.
        if output_columns is None:
            output_columns = input_columns

        if not isinstance(output_columns, list):
            raise ValueError(
                "output has to be a list of strings, instead got %s" %
                type(output_columns))

        # One Source/Name mapping per column pair. zip stops at the shorter
        # list, so surplus names on either side are ignored. An empty input
        # list yields None rather than an empty mapping list.
        if input_columns:
            column = [
                dict(Source=source, Name=name)
                for source, name in zip(input_columns, output_columns)]
        else:
            column = None

        algo_args = dict(
            column=column,
            ngram_length=self.ngram_length,
            all_lengths=self.all_lengths,
            skip_length=self.skip_length,
            max_num_terms=self.max_num_terms,
            weighting=self.weighting)

        # Algorithm arguments override any same-named entries in all_args.
        all_args.update(algo_args)
        return self._entrypoint(**all_args)
Loading