Skip to content

Commit

Permalink
Rename WordTensorizer to TokenTensorizer (facebookresearch#427)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookresearch#427

Make the name more consistent with the behavior, since the tensorizer is used
for more than just words. Suggested by Barlas.

Reviewed By: m3rlin45

Differential Revision: D14633297

fbshipit-source-id: 434639c98cc39d686f6c07190597dca0bf5fac6d
  • Loading branch information
Titousensei authored and facebook-github-bot committed Mar 28, 2019
1 parent 6ec230d commit f0f2543
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 16 deletions.
8 changes: 6 additions & 2 deletions pytext/data/tensorizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def initialize(self):
yield


class WordTensorizer(Tensorizer):
class TokenTensorizer(Tensorizer):
"""Convert text to a list of tokens. Do this based on a tokenizer configuration,
and build a vocabulary for numberization. Finally, pad the batch to create
a square tensor of the correct size.
Expand Down Expand Up @@ -208,7 +208,7 @@ def sort_key(self, row):
return row[1]


class WordCharacterTensorizer(WordTensorizer):
class CharacterTokenTensorizer(TokenTensorizer):
"""Turn words into 2-dimensional tensors of ints based on their ascii values.
Words are padded to the maximum word length. Sequence lengths are the lengths
of each token, 0 for pad token.
Expand All @@ -226,6 +226,10 @@ def tensorize(self, batch):
return (pad_and_tensorize(characters), pad_and_tensorize(lengths))


class FloatVectorTensorizer(Tensorizer):
"""TODO: support for dense features."""


class LabelTensorizer(Tensorizer):
"""Numberize labels."""

Expand Down
10 changes: 5 additions & 5 deletions pytext/data/test/data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pytext.data import Batcher, Data, PoolingBatcher
from pytext.data.sources.data_source import SafeFileWrapper
from pytext.data.sources.tsv import TSVDataSource
from pytext.data.tensorizers import LabelTensorizer, WordTensorizer
from pytext.data.tensorizers import LabelTensorizer, TokenTensorizer
from pytext.utils.test import import_tests_module


Expand All @@ -25,7 +25,7 @@ def setUp(self):
)

self.tensorizers = {
"tokens": WordTensorizer(text_column="text"),
"tokens": TokenTensorizer(text_column="text"),
"labels": LabelTensorizer(label_column="label", allow_unknown=True),
}

Expand All @@ -50,7 +50,7 @@ def test_create_batches(self):
self.assertEqual(10, len(tokens))

def test_create_batches_different_tensorizers(self):
tensorizers = {"tokens": WordTensorizer(text_column="text")}
tensorizers = {"tokens": TokenTensorizer(text_column="text")}
data = Data(self.data_source, tensorizers, Batcher(train_batch_size=16))
batches = list(data.batches(Stage.TRAIN))
self.assertEqual(1, len(batches))
Expand All @@ -62,10 +62,10 @@ def test_create_batches_different_tensorizers(self):

def test_data_initializes_tensorsizers(self):
tensorizers = {
"tokens": WordTensorizer(text_column="text"),
"tokens": TokenTensorizer(text_column="text"),
"labels": LabelTensorizer(label_column="label"),
}
# verify WordTensorizer isn't in an initialized state yet
# verify TokenTensorizer isn't in an initialized state yet
assert tensorizers["tokens"].vocab is None
Data(self.data_source, tensorizers)
# Tensorizers should have been initialized
Expand Down
12 changes: 6 additions & 6 deletions pytext/data/test/tensorizers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
from pytext.data.sources.tsv import TSVDataSource
from pytext.data.tensorizers import (
ByteTensorizer,
CharacterTokenTensorizer,
LabelTensorizer,
WordCharacterTensorizer,
WordTensorizer,
TokenTensorizer,
initialize_tensorizers,
)
from pytext.utils.test import import_tests_module
Expand All @@ -31,7 +31,7 @@ def setUp(self):

def test_initialize_tensorizers(self):
tensorizers = {
"tokens": WordTensorizer(text_column="text"),
"tokens": TokenTensorizer(text_column="text"),
"labels": LabelTensorizer(label_column="label"),
"chars": ByteTensorizer(text_column="text"),
}
Expand All @@ -40,7 +40,7 @@ def test_initialize_tensorizers(self):
self.assertEqual(7, len(tensorizers["labels"].labels))

def test_initialize_word_tensorizer(self):
tensorizer = WordTensorizer(text_column="text")
tensorizer = TokenTensorizer(text_column="text")
init = tensorizer.initialize()
init.send(None) # kick
for row in self.data.train:
Expand All @@ -49,7 +49,7 @@ def test_initialize_word_tensorizer(self):
self.assertEqual(49, len(tensorizer.vocab))

def test_create_word_tensors(self):
tensorizer = WordTensorizer(text_column="text")
tensorizer = TokenTensorizer(text_column="text")
init = tensorizer.initialize()
init.send(None) # kick
for row in self.data.train:
Expand Down Expand Up @@ -80,7 +80,7 @@ def test_create_byte_tensors(self):
self.assertEqual([(bytes, len(bytes)) for bytes in expected], tensors)

def test_create_word_character_tensors(self):
tensorizer = WordCharacterTensorizer(text_column="text")
tensorizer = CharacterTokenTensorizer(text_column="text")
# not initializing because initializing is a no-op for ByteTensorizer

s1 = "I want some coffee"
Expand Down
6 changes: 3 additions & 3 deletions pytext/models/doc_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
MetaInput,
NumericLabelTensorizer,
Tensorizer,
WordTensorizer,
TokenTensorizer,
)
from pytext.data.utils import UNK
from pytext.exporters.exporter import ModelExporter
Expand Down Expand Up @@ -56,7 +56,7 @@ class NewDocModel(DocModel):

class Config(DocModel.Config):
class ModelInput(Model.Config.ModelInput):
tokens: WordTensorizer.Config = WordTensorizer.Config()
tokens: TokenTensorizer.Config = TokenTensorizer.Config()
labels: LabelTensorizer.Config = LabelTensorizer.Config(allow_unknown=True)
# for metric reporter
raw_text: MetaInput.Config = MetaInput.Config(column="text")
Expand Down Expand Up @@ -125,7 +125,7 @@ class NewDocRegressionModel(NewDocModel):

class Config(NewDocModel.Config):
class RegressionModelInput(Model.Config.ModelInput):
tokens: WordTensorizer.Config = WordTensorizer.Config()
tokens: TokenTensorizer.Config = TokenTensorizer.Config()
labels: NumericLabelTensorizer.Config = NumericLabelTensorizer.Config()

inputs: RegressionModelInput = RegressionModelInput()
Expand Down

0 comments on commit f0f2543

Please sign in to comment.