This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Initial implementation of the WordTokenizer transform. #296

Merged 10 commits on Oct 3, 2019
6 changes: 6 additions & 0 deletions src/python/nimbusml.pyproj
@@ -92,6 +92,7 @@
<Compile Include="nimbusml\examples\CharTokenizer.py" />
<Compile Include="nimbusml\examples\ColumnConcatenator.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\PrefixColumnConcatenator_df.py" />
<Compile Include="nimbusml\examples\WordTokenizer.py" />
<Compile Include="nimbusml\examples\PrefixColumnConcatenator.py" />
<Compile Include="nimbusml\examples\ColumnDropper.py" />
<Compile Include="nimbusml\examples\ColumnDuplicator.py" />
@@ -102,6 +103,7 @@
<Compile Include="nimbusml\examples\examples_from_dataframe\AveragedPerceptronBinaryClassifier_infert_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\Binner_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\BootStrapSample_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\WordTokenizer_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\CharTokenizer_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\ColumnConcatenator_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\ColumnDuplicator_df.py" />
@@ -306,6 +308,7 @@
<Compile Include="nimbusml\internal\core\preprocessing\schema\columnduplicator.py" />
<Compile Include="nimbusml\internal\core\preprocessing\schema\columndropper.py" />
<Compile Include="nimbusml\internal\core\preprocessing\tensorflowscorer.py" />
<Compile Include="nimbusml\internal\core\preprocessing\text\wordtokenizer.py" />
<Compile Include="nimbusml\internal\core\timeseries\iidchangepointdetector.py" />
<Compile Include="nimbusml\internal\core\timeseries\iidspikedetector.py" />
<Compile Include="nimbusml\internal\core\timeseries\ssachangepointdetector.py" />
@@ -451,6 +454,7 @@
<Compile Include="nimbusml\internal\entrypoints\transforms_twoheterogeneousmodelcombiner.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_vectortoimage.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_wordembeddings.py" />
<Compile Include="nimbusml\internal\entrypoints\transforms_wordtokenizer.py" />
<Compile Include="nimbusml\internal\entrypoints\_boosterparameterfunction_dart.py" />
<Compile Include="nimbusml\internal\entrypoints\_boosterparameterfunction_gbdt.py" />
<Compile Include="nimbusml\internal\entrypoints\_boosterparameterfunction_goss.py" />
@@ -649,6 +653,7 @@
<Compile Include="nimbusml\preprocessing\schema\__init__.py" />
<Compile Include="nimbusml\preprocessing\tensorflowscorer.py" />
<Compile Include="nimbusml\preprocessing\text\chartokenizer.py" />
<Compile Include="nimbusml\preprocessing\text\wordtokenizer.py" />
<Compile Include="nimbusml\preprocessing\text\__init__.py" />
<Compile Include="nimbusml\preprocessing\tokey.py" />
<Compile Include="nimbusml\preprocessing\__init__.py" />
@@ -685,6 +690,7 @@
<Compile Include="nimbusml\tests\preprocessing\normalization\test_meanvariancescaler.py" />
<Compile Include="nimbusml\tests\preprocessing\schema\test_prefixcolumnconcatenator.py" />
<Compile Include="nimbusml\tests\preprocessing\test_datasettransformer.py" />
<Compile Include="nimbusml\tests\preprocessing\text\test_wordtokenizer.py" />
<Compile Include="nimbusml\tests\test_csr_matrix_output.py" />
<Compile Include="nimbusml\tests\test_variable_column.py" />
<Compile Include="nimbusml\tests\timeseries\test_iidchangepointdetector.py" />
32 changes: 32 additions & 0 deletions src/python/nimbusml/examples/WordTokenizer.py
@@ -0,0 +1,32 @@
###############################################################################
# WordTokenizer

from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.text import WordTokenizer

# data input (as a FileDataStream)
path = get_dataset("wiki_detox_train").as_filepath()

data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
# Sentiment SentimentText
# 0 1 ==RUDE== Dude, you are rude upload that carl p...
# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
# 2 1 Stop trolling, zapatancas, calling me a liar m...
# 3 1 ==You're cool== You seem like a really cool g...
# 4 1 ::::: Why are you threatening me? I'm not bein...

tokenize = WordTokenizer(char_array_term_separators=[" "]) << {'wt': 'SentimentText'}
pipeline = Pipeline([tokenize])

tokenize.fit(data)
y = tokenize.transform(data)

print(y.drop(labels='SentimentText', axis=1).head())
# Sentiment wt.000 wt.001 wt.002 wt.003 wt.004 wt.005 ... wt.366 wt.367 wt.368 wt.369 wt.370 wt.371 wt.372
# 0 1 ==RUDE== Dude, you are rude upload ... None None None None None None None
# 1 1 == OK! == IM GOING TO ... None None None None None None None
# 2 1 Stop trolling, zapatancas, calling me a ... None None None None None None None
# 3 1 ==You're cool== You seem like a ... None None None None None None None
# 4 1 ::::: Why are you threatening me? ... None None None None None None None
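The transform emits one output column per token position (wt.000 through wt.372 here), padding shorter rows with None. A minimal pandas sketch, assuming the DataFrame y produced above, that collapses this wide layout back into one token list per row:

# Hedged sketch: gather the wt.* position columns back into a Python
# list of tokens per row, dropping the None padding.
token_cols = [c for c in y.columns if c.startswith('wt.')]
tokens_per_row = y[token_cols].apply(
    lambda row: [t for t in row if t is not None], axis=1)
print(tokens_per_row.head())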
33 changes: 33 additions & 0 deletions src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
@@ -0,0 +1,33 @@
###############################################################################
# WordTokenizer

import pandas
from nimbusml import Pipeline
from nimbusml.preprocessing.text import WordTokenizer

# create the data
customer_reviews = pandas.DataFrame(data=dict(review=[
"I really did not like the taste of it",
"It was surprisingly quite good!",
"I will never ever ever go to that place again!!",
"The best ever!! It was amazingly good and super fast",
"I wish I had gone earlier, it was that great",
"somewhat dissapointing. I'd probably wont try again",
"Never visit again... rascals!"]))

tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'

pipeline = Pipeline([tokenize])

tokenize.fit(customer_reviews)
y = tokenize.transform(customer_reviews)

print(y)
# review.00 review.01 review.02 review.03 review.04 review.05 review.06 review.07 review.08 review.09 review.10 review.11
# 0 I really did ot like the taste of it None None None
# 1 It was surprisi gly quite good! None None None None None None
# 2 I will ever ever ever go to that place agai !! None
# 3 The best ever!! It was amazi gly good a d super fast
# 4 I wish I had go e earlier, it was that great None
# 5 somewhat dissapoi ti g. I'd probably wo t try agai None None
# 6 Never visit agai ... rascals! None None None None None None None
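Because "n" is listed as a separator, every "n" both splits the text and disappears from it, which is why "not" becomes "ot" and "never" becomes "ever" above; empty tokens produced between adjacent separators are dropped. A plain-Python sketch of these separator semantics, for intuition only and not nimbusml's implementation:

import re

# Each character in char_array_term_separators acts as a split point
# and is discarded; empty tokens are filtered out.
separators = [" ", "n"]
pattern = "[" + re.escape("".join(separators)) + "]"
text = "I really did not like the taste of it"
print([t for t in re.split(pattern, text) if t])
# ['I', 'really', 'did', 'ot', 'like', 'the', 'taste', 'of', 'it']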
89 changes: 89 additions & 0 deletions src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,89 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
WordTokenizer
"""

__all__ = ["WordTokenizer"]


from ....entrypoints.transforms_wordtokenizer import transforms_wordtokenizer
from ....utils.utils import trace
from ...base_pipeline_item import BasePipelineItem, DefaultSignature


class WordTokenizer(BasePipelineItem, DefaultSignature):
"""
**Description**
The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The default separator is the space character, but any other character (or multiple characters) can be specified.

:param char_array_term_separators: Array of single-character term
separators; the space character is used by default.

:param params: Additional arguments sent to compute engine.

"""

@trace
def __init__(
self,
char_array_term_separators=None,
**params):
BasePipelineItem.__init__(
self, type='transform', **params)

self.char_array_term_separators = char_array_term_separators

@property
def _entrypoint(self):
return transforms_wordtokenizer

@trace
def _get_node(self, **all_args):

input_columns = self.input
if input_columns is None and 'input' in all_args:
input_columns = all_args['input']
if 'input' in all_args:
all_args.pop('input')

output_columns = self.output
if output_columns is None and 'output' in all_args:
output_columns = all_args['output']
if 'output' in all_args:
all_args.pop('output')

# validate input
if input_columns is None:
raise ValueError(
"'None' input passed when it cannot be none.")

if not isinstance(input_columns, list):
raise ValueError(
"input has to be a list of strings, instead got %s" %
type(input_columns))

# validate output
if output_columns is None:
output_columns = input_columns

if not isinstance(output_columns, list):
raise ValueError(
"output has to be a list of strings, instead got %s" %
type(output_columns))

algo_args = dict(
    column=[
        dict(Source=i, Name=o)
        for i, o in zip(input_columns, output_columns)
    ] if input_columns else None,
    char_array_term_separators=self.char_array_term_separators)

all_args.update(algo_args)
return self._entrypoint(**all_args)
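_get_node pairs input and output column names positionally and hands them to the entrypoint as Source/Name mappings. A small sketch of the column argument it assembles, using hypothetical column names:

# Hypothetical names, for illustration of the structure built above.
input_columns = ['SentimentText']
output_columns = ['wt']
column = [dict(Source=i, Name=o)
          for i, o in zip(input_columns, output_columns)]
# column == [{'Source': 'SentimentText', 'Name': 'wt'}]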
76 changes: 76 additions & 0 deletions src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py
@@ -0,0 +1,76 @@
# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
Transforms.WordTokenizer
"""


from ..utils.entrypoints import EntryPoint
from ..utils.utils import try_set, unlist


def transforms_wordtokenizer(
data,
output_data=None,
model=None,
column=None,
char_array_term_separators=None,
**params):
"""
**Description**
The input to this transform is text, and the output is a vector of
text containing the words (tokens) in the original text. The default
separator is the space character, but any other character (or
multiple characters) can be specified.

:param column: New column definition(s) (inputs).
:param data: Input dataset (inputs).
:param char_array_term_separators: Array of single-character term
separators; the space character is used by default.
(inputs).
:param output_data: Transformed dataset (outputs).
:param model: Transform model (outputs).
"""

entrypoint_name = 'Transforms.WordTokenizer'
inputs = {}
outputs = {}

if column is not None:
inputs['Column'] = try_set(
obj=column,
none_acceptable=True,
is_of_type=list,
is_column=True)
if data is not None:
inputs['Data'] = try_set(
obj=data,
none_acceptable=False,
is_of_type=str)
if char_array_term_separators is not None:
inputs['CharArrayTermSeparators'] = try_set(
obj=char_array_term_separators,
none_acceptable=True,
is_of_type=list)
if output_data is not None:
outputs['OutputData'] = try_set(
obj=output_data,
none_acceptable=False,
is_of_type=str)
if model is not None:
outputs['Model'] = try_set(
obj=model,
none_acceptable=False,
is_of_type=str)

input_variables = {
x for x in unlist(inputs.values())
if isinstance(x, str) and x.startswith("$")}
output_variables = {
x for x in unlist(outputs.values())
if isinstance(x, str) and x.startswith("$")}

entrypoint = EntryPoint(
name=entrypoint_name, inputs=inputs, outputs=outputs,
input_variables=input_variables,
output_variables=output_variables)
return entrypoint
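The two set comprehensions above collect every "$"-prefixed string among the input and output values as a graph variable for the pipeline runner to wire up. A simplified sketch of that extraction; unlist here is a stand-in for nimbusml's internal helper, and the "$" names are made up:

def unlist(values):
    # Stand-in: flatten one level of list nesting.
    for v in values:
        if isinstance(v, list):
            yield from v
        else:
            yield v

inputs = {'Data': '$input_data', 'CharArrayTermSeparators': [' ']}
outputs = {'OutputData': '$output_data', 'Model': '$transform_model'}
input_variables = {x for x in unlist(inputs.values())
                   if isinstance(x, str) and x.startswith('$')}
# input_variables == {'$input_data'}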
4 changes: 3 additions & 1 deletion src/python/nimbusml/preprocessing/text/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
 from .chartokenizer import CharTokenizer
+from .wordtokenizer import WordTokenizer
 
 __all__ = [
-    'CharTokenizer'
+    'CharTokenizer',
+    'WordTokenizer'
 ]
55 changes: 55 additions & 0 deletions src/python/nimbusml/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,55 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
WordTokenizer
"""

__all__ = ["WordTokenizer"]


from sklearn.base import TransformerMixin

from ...base_transform import BaseTransform
from ...internal.core.preprocessing.text.wordtokenizer import \
WordTokenizer as core
from ...internal.utils.utils import trace


class WordTokenizer(core, BaseTransform, TransformerMixin):
"""
**Description**
The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The default separator is the space character, but any other character (or multiple characters) can be specified.

:param columns: see `Columns </nimbusml/concepts/columns>`_.

:param char_array_term_separators: Array of single-character term
separators; the space character is used by default.

:param params: Additional arguments sent to compute engine.

"""

@trace
def __init__(
self,
char_array_term_separators=None,
columns=None,
**params):

if columns:
params['columns'] = columns
BaseTransform.__init__(self, **params)
core.__init__(
self,
char_array_term_separators=char_array_term_separators,
**params)
self._columns = columns

def get_params(self, deep=False):
"""
Get the parameters for this operator.
"""
return core.get_params(self)
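Besides the << operator used in the examples, the column mapping can be supplied up front through the columns parameter; a minimal sketch, assuming the usual nimbusml {output: input} dict convention:

from nimbusml.preprocessing.text import WordTokenizer

# Equivalent, under that assumption, to:
#   WordTokenizer(...) << {'wt': 'SentimentText'}
tokenize = WordTokenizer(
    char_array_term_separators=[" "],
    columns={'wt': 'SentimentText'})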
33 changes: 33 additions & 0 deletions src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py
@@ -0,0 +1,33 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
import unittest

import pandas
from nimbusml import Pipeline
from nimbusml.preprocessing.text import WordTokenizer


class TestWordTokenizer(unittest.TestCase):

def test_wordtokenizer(self):
customer_reviews = pandas.DataFrame(data=dict(review=[
"I really did not like the taste of it",
"It was surprisingly quite good!"]))

tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'
pipeline = Pipeline([tokenize])

tokenize.fit(customer_reviews)
y = tokenize.transform(customer_reviews)

self.assertEqual(y.shape, (2, 9))

self.assertEqual(y.loc[0, 'review.3'], 'ot')
self.assertEqual(y.loc[1, 'review.3'], 'gly')
self.assertEqual(y.loc[1, 'review.6'], None)


if __name__ == '__main__':
unittest.main()
2 changes: 1 addition & 1 deletion src/python/tests/test_estimator_checks.py
@@ -160,7 +160,7 @@
'PixelExtractor, Loader, Resizer, \
GlobalContrastRowScaler, PcaTransformer, '
'ColumnConcatenator, Sentiment, CharTokenizer, LightLda, '
-'NGramFeaturizer, WordEmbedding, LpScaler',
+'NGramFeaturizer, WordEmbedding, LpScaler, WordTokenizer',
'check_transformer_data_not_an_array, check_pipeline_consistency, '
'check_fit2d_1feature, check_estimators_fit_returns_self,\
check_fit2d_1sample, '
2 changes: 1 addition & 1 deletion src/python/tools/entrypoint_compiler.py
@@ -1457,7 +1457,7 @@ def parse_arg(argument, inout):
assert not is_column
arg_obj = NumericArrayArg(argument, inout)
elif itemType in ["String", "DataView", "PredictorModel",
"TransformModel", "Node"]:
"TransformModel", "Node", "Char"]:
arg_obj = StringArrayArg(argument, inout,
is_column=is_column)
elif isinstance(itemType, dict):
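This one-word change makes the compiler treat entry-point arguments whose item type is Char, such as CharArrayTermSeparators, as string arrays. A schematic sketch of the dispatch, with string stand-ins for the compiler's argument classes:

# Schematic only; the real compiler constructs NumericArrayArg or
# StringArrayArg instances rather than returning names.
def pick_array_arg_class(item_type):
    if item_type in ["String", "DataView", "PredictorModel",
                     "TransformModel", "Node", "Char"]:
        return "StringArrayArg"
    return "NumericArrayArg"

assert pick_array_arg_class("Char") == "StringArrayArg"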