This repository was archived by the owner on Nov 16, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 62
Initial implementation of NGramExtractor. #320
Merged
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
############################################################################### | ||
# NGramExtractor | ||
from nimbusml import FileDataStream, Pipeline | ||
from nimbusml.datasets import get_dataset | ||
from nimbusml.preprocessing.schema import ColumnDropper | ||
from nimbusml.preprocessing.text import CharTokenizer | ||
from nimbusml.feature_extraction.text import NGramExtractor | ||
|
||
# Locate the bundled "wiki_detox_train" dataset and open it as a
# tab-separated FileDataStream.
train_file = get_dataset("wiki_detox_train").as_filepath()
data = FileDataStream.read_csv(train_file, sep='\t')

print(data.head())
# Sentiment SentimentText
# 0 1 ==RUDE== Dude, you are rude upload that carl p...
# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
# 2 1 Stop trolling, zapatancas, calling me a liar m...
# 3 1 ==You're cool== You seem like a really cool g...
# 4 1 ::::: Why are you threatening me? I'm not bein...

# Pipeline steps, in execution order. Each `columns` mapping is written as
# {output column: input column}.
tokenizer = CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'})
extractor = NGramExtractor(
    ngram_length=1,
    all_lengths=False,
    columns={'Ngrams': 'SentimentText_Transform'})
# Drop the intermediate and original columns so that only the columns
# produced by NGramExtractor remain in the printed result.
dropper = ColumnDropper(
    columns=['SentimentText_Transform', 'SentimentText', 'Sentiment'])

pipe = Pipeline([tokenizer, extractor, dropper])

# Fit the pipeline on the data stream and transform it in one call.
features = pipe.fit_transform(data)

print(features.head())
# Ngrams.<␂> Ngrams.= Ngrams.R Ngrams.U Ngrams.D Ngrams.E ...
# 0 1.0 4.0 1.0 1.0 2.0 1.0 ...
# 1 1.0 4.0 0.0 0.0 2.0 3.0 ...
# 2 1.0 4.0 0.0 0.0 0.0 0.0 ...
# 3 1.0 4.0 0.0 0.0 0.0 0.0 ...
# 4 1.0 0.0 0.0 0.0 0.0 0.0 ...
112 changes: 112 additions & 0 deletions
112
src/python/nimbusml/examples/examples_from_dataframe/NGramExtractor_df.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
############################################################################### | ||
# Example with NGramExtractor and LogisticRegressionBinaryClassifier | ||
import pandas | ||
from nimbusml import Pipeline | ||
from nimbusml.feature_extraction.text import NGramExtractor | ||
from nimbusml.linear_model import LogisticRegressionBinaryClassifier | ||
from nimbusml.preprocessing.schema import ColumnConcatenator, ColumnDropper | ||
from nimbusml.preprocessing.text import CharTokenizer | ||
|
||
# Training examples as (review text, liked?) pairs; keeping each label next
# to its text makes the pairing easy to verify by eye.
labelled_reviews = [
    ("This is great", True),
    ("I hate it", False),
    ("Love it", True),
    ("Do not like it", False),
    ("Really like it", True),
    ("I hate it", False),
    ("I like it a lot", True),
    ("I kind of hate it", False),
    ("I do like it", True),
    ("I really hate it", False),
    ("It is very good", True),
    ("I hate it a bunch", False),
    ("I love it a bunch", True),
    ("I hate it", False),
    ("I like it very much", True),
    ("I hate it very much.", False),
    ("I really do love it", True),
    ("I really do hate it", False),
    ("Love it!", True),
    ("Hate it!", False),
    ("I love it", True),
    ("I hate it", False),
    ("I love it", True),
    ("I hate it", False),
    ("I love it", True),
]

# Split the pairs back into two parallel columns for the DataFrame.
train_texts, train_labels = zip(*labelled_reviews)
train_reviews = pandas.DataFrame(
    data=dict(
        review=list(train_texts),
        like=list(train_labels)))

# Unlabelled reviews used for prediction after the model is fitted.
test_texts = [
    "This is great",
    "I hate it",
    "Love it",
    "Really like it",
    "I hate it",
    "I like it a lot",
    "I love it",
    "I do like it",
    "I really hate it",
    "I love it",
]
test_reviews = pandas.DataFrame(data=dict(review=test_texts))
|
||
# Separate the label column from the remaining (feature) columns.
y = train_reviews['like']
is_feature_column = train_reviews.columns != 'like'
X = train_reviews.loc[:, is_feature_column]

# Featurization steps, in execution order. Each `columns` mapping is written
# as {output column: input column}.
tokenize_step = CharTokenizer(columns={'review_transform': 'review'})
ngram_step = NGramExtractor(
    ngram_length=3,
    all_lengths=False,
    columns={'ngrams': 'review_transform'})
# Remove the raw and intermediate columns so only the n-gram columns remain.
drop_step = ColumnDropper(columns=['review_transform', 'review'])

pipeline = Pipeline([tokenize_step, ngram_step, drop_step])
X = pipeline.fit_transform(X)

print(X.head())
# ngrams.<␂>|T|h ngrams.T|h|i ngrams.h|i|s ngrams.i|s|<␠> ... ngrams.i|t|! ngrams.t|!|<␃> ngrams.<␂>|H|a ngrams.H|a|t
# 0 1.0 1.0 1.0 2.0 ... 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
# 3 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
# 4 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0

# Train the classifier on the extracted n-gram features.
classifier = LogisticRegressionBinaryClassifier()
model = classifier.fit(X, y)

# Reuse the already-fitted featurization pipeline on the test reviews.
X_test = pipeline.transform(test_reviews)
result = model.predict(X_test)

print(result)
# 0 True
# 1 False
# 2 True
# 3 True
# 4 False
# 5 True
# 6 True
# 7 True
# 8 False
# 9 True
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
72 changes: 72 additions & 0 deletions
72
src/python/nimbusml/feature_extraction/text/ngramextractor.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# -------------------------------------------------------------------------------------------- | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# Licensed under the MIT License. | ||
# -------------------------------------------------------------------------------------------- | ||
# - Generated by tools/entrypoint_compiler.py: do not edit by hand | ||
""" | ||
NGramExtractor | ||
""" | ||
|
||
__all__ = ["NGramExtractor"] | ||
|
||
|
||
from sklearn.base import TransformerMixin | ||
|
||
from ...base_transform import BaseTransform | ||
from ...internal.core.feature_extraction.text.ngramextractor import \ | ||
NGramExtractor as core | ||
from ...internal.utils.utils import trace | ||
|
||
|
||
class NGramExtractor(core, BaseTransform, TransformerMixin):
    """
    **Description**
        Produces a bag of counts of n-grams (sequences of consecutive
        values of length 1-n) in a given vector of keys. It does so by
        building a dictionary of n-grams and using the id in the dictionary
        as the index in the bag.

    :param columns: see `Columns </nimbusml/concepts/columns>`_.

    :param ngram_length: Maximum n-gram length.

    :param all_lengths: Whether to store all n-gram lengths up to
        ngramLength, or only ngramLength.

    :param skip_length: Maximum number of tokens to skip when constructing
        an n-gram.

    :param max_num_terms: Maximum number of n-grams to store in the
        dictionary. ``None`` (the default) is replaced by ``[10000000]``.

    :param weighting: The weighting criteria.

    :param params: Additional arguments sent to compute engine.

    """

    @trace
    def __init__(
            self,
            ngram_length=2,
            all_lengths=True,
            skip_length=0,
            max_num_terms=None,
            weighting='Tf',
            columns=None,
            **params):

        # A list literal in the signature is created once and shared by
        # every instance that relies on the default, so mutating it through
        # one instance would leak into all others. Build a fresh list per
        # call instead.
        # NOTE(review): this file is generated by tools/entrypoint_compiler.py;
        # the generator should emit the same pattern.
        if max_num_terms is None:
            max_num_terms = [10000000]

        # Only forward `columns` when the caller supplied a non-empty value.
        if columns:
            params['columns'] = columns
        BaseTransform.__init__(self, **params)
        core.__init__(
            self,
            ngram_length=ngram_length,
            all_lengths=all_lengths,
            skip_length=skip_length,
            max_num_terms=max_num_terms,
            weighting=weighting,
            **params)
        self._columns = columns

    def get_params(self, deep=False):
        """
        Get the parameters for this operator.

        :param deep: accepted for signature compatibility; it is not
            forwarded to the core implementation.

        :return: whatever ``core.get_params(self)`` returns.
        """
        return core.get_params(self)
111 changes: 111 additions & 0 deletions
111
src/python/nimbusml/internal/core/feature_extraction/text/ngramextractor.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
# -------------------------------------------------------------------------------------------- | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# Licensed under the MIT License. | ||
# -------------------------------------------------------------------------------------------- | ||
# - Generated by tools/entrypoint_compiler.py: do not edit by hand | ||
""" | ||
NGramExtractor | ||
""" | ||
|
||
__all__ = ["NGramExtractor"] | ||
|
||
|
||
from ....entrypoints.transforms_ngramtranslator import \ | ||
transforms_ngramtranslator | ||
from ....utils.utils import trace | ||
from ...base_pipeline_item import BasePipelineItem, DefaultSignature | ||
|
||
|
||
class NGramExtractor(BasePipelineItem, DefaultSignature):
    """
    **Description**
        Produces a bag of counts of n-grams (sequences of consecutive
        values of length 1-n) in a given vector of keys. It does so by
        building a dictionary of n-grams and using the id in the dictionary
        as the index in the bag.

    :param ngram_length: Maximum n-gram length.

    :param all_lengths: Whether to store all n-gram lengths up to
        ngramLength, or only ngramLength.

    :param skip_length: Maximum number of tokens to skip when constructing
        an n-gram.

    :param max_num_terms: Maximum number of n-grams to store in the
        dictionary. ``None`` (the default) is replaced by ``[10000000]``.

    :param weighting: The weighting criteria.

    :param params: Additional arguments sent to compute engine.

    """

    @trace
    def __init__(
            self,
            ngram_length=2,
            all_lengths=True,
            skip_length=0,
            max_num_terms=None,
            weighting='Tf',
            **params):
        BasePipelineItem.__init__(
            self, type='transform', **params)

        # A list literal in the signature is created once and shared by
        # every instance that relies on the default; create a fresh list
        # per instance so mutating one instance's attribute cannot affect
        # the others.
        # NOTE(review): this file is generated by tools/entrypoint_compiler.py;
        # the generator should emit the same pattern.
        if max_num_terms is None:
            max_num_terms = [10000000]

        self.ngram_length = ngram_length
        self.all_lengths = all_lengths
        self.skip_length = skip_length
        self.max_num_terms = max_num_terms
        self.weighting = weighting

    @property
    def _entrypoint(self):
        """Entrypoint callable used by ``_get_node`` to build the node."""
        return transforms_ngramtranslator

    @trace
    def _get_node(self, **all_args):
        """
        Build the entrypoint node for this transform.

        Input/output columns come from ``self.input`` / ``self.output``
        when set, otherwise from the ``input`` / ``output`` entries of
        ``all_args``. Those two entries are always removed from
        ``all_args`` before the remaining arguments are forwarded.

        :param all_args: keyword arguments forwarded to the entrypoint,
            optionally carrying ``input`` and ``output`` column lists.

        :return: the result of calling ``self._entrypoint``.

        :raises ValueError: if no input columns are available, or if the
            input or output columns are not a list.
        """
        # Instance-level columns win; fall back to the call arguments.
        input_columns = self.input
        if input_columns is None and 'input' in all_args:
            input_columns = all_args['input']
        all_args.pop('input', None)

        output_columns = self.output
        if output_columns is None and 'output' in all_args:
            output_columns = all_args['output']
        all_args.pop('output', None)

        # validate input
        if input_columns is None:
            raise ValueError(
                "'None' input passed when it cannot be none.")

        if not isinstance(input_columns, list):
            raise ValueError(
                "input has to be a list of strings, instead got %s" %
                type(input_columns))

        # validate output: without explicit outputs, reuse the input names
        if output_columns is None:
            output_columns = input_columns

        if not isinstance(output_columns, list):
            raise ValueError(
                "output has to be a list of strings, instead got %s" %
                type(output_columns))

        # Pair each input column with its output column; an empty input
        # list yields None rather than an empty list.
        if input_columns:
            column_pairs = [
                dict(Source=source, Name=name)
                for source, name in zip(input_columns, output_columns)]
        else:
            column_pairs = None

        algo_args = dict(
            column=column_pairs,
            ngram_length=self.ngram_length,
            all_lengths=self.all_lengths,
            skip_length=self.skip_length,
            max_num_terms=self.max_num_terms,
            weighting=self.weighting)

        all_args.update(algo_args)
        return self._entrypoint(**all_args)
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What happens if you remove ColumnDropper? Will the example still work?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it will still work. The only difference is that those columns will still be there. Those columns were removed so that the output at the end of the example would only show the columns output from the NGramExtractor.
In reply to: 333696240 [](ancestors = 333696240)