From f0f25437c90b365e1f47687523c74c6ace04bdc6 Mon Sep 17 00:00:00 2001 From: Eric Gaudet Date: Thu, 28 Mar 2019 11:10:10 -0700 Subject: [PATCH] Rename WordTensorizer to TokenTensorizer (#427) Summary: Pull Request resolved: https://github.com/facebookresearch/pytext/pull/427 name more consistent with the behavior, since it's used for more than just words. Suggested by Barlas Reviewed By: m3rlin45 Differential Revision: D14633297 fbshipit-source-id: 434639c98cc39d686f6c07190597dca0bf5fac6d --- pytext/data/tensorizers.py | 8 ++++++-- pytext/data/test/data_test.py | 10 +++++----- pytext/data/test/tensorizers_test.py | 12 ++++++------ pytext/models/doc_model.py | 6 +++--- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/pytext/data/tensorizers.py b/pytext/data/tensorizers.py index 07b4eae07..ff6fff89c 100644 --- a/pytext/data/tensorizers.py +++ b/pytext/data/tensorizers.py @@ -68,7 +68,7 @@ def initialize(self): yield -class WordTensorizer(Tensorizer): +class TokenTensorizer(Tensorizer): """Convert text to a list of tokens. Do this based on a tokenizer configuration, and build a vocabulary for numberization. Finally, pad the batch to create a square tensor of the correct size. @@ -208,7 +208,7 @@ def sort_key(self, row): return row[1] -class WordCharacterTensorizer(WordTensorizer): +class CharacterTokenTensorizer(TokenTensorizer): """Turn words into 2-dimensional tensors of ints based on their ascii values. Words are padded to the maximum word length. Sequence lengths are the lengths of each token, 0 for pad token. @@ -226,6 +226,10 @@ def tensorize(self, batch): return (pad_and_tensorize(characters), pad_and_tensorize(lengths)) +class FloatVectorTensorizer(Tensorizer): + """TODO: support for dense features.""" + + class LabelTensorizer(Tensorizer): """Numberize labels.""" diff --git a/pytext/data/test/data_test.py b/pytext/data/test/data_test.py index 89c606c13..d2ff00e75 100644 --- a/pytext/data/test/data_test.py +++ b/pytext/data/test/data_test.py @@ -7,7 +7,7 @@ from pytext.data import Batcher, Data, PoolingBatcher from pytext.data.sources.data_source import SafeFileWrapper from pytext.data.sources.tsv import TSVDataSource -from pytext.data.tensorizers import LabelTensorizer, WordTensorizer +from pytext.data.tensorizers import LabelTensorizer, TokenTensorizer from pytext.utils.test import import_tests_module @@ -25,7 +25,7 @@ def setUp(self): ) self.tensorizers = { - "tokens": WordTensorizer(text_column="text"), + "tokens": TokenTensorizer(text_column="text"), "labels": LabelTensorizer(label_column="label", allow_unknown=True), } @@ -50,7 +50,7 @@ def test_create_batches(self): self.assertEqual(10, len(tokens)) def test_create_batches_different_tensorizers(self): - tensorizers = {"tokens": WordTensorizer(text_column="text")} + tensorizers = {"tokens": TokenTensorizer(text_column="text")} data = Data(self.data_source, tensorizers, Batcher(train_batch_size=16)) batches = list(data.batches(Stage.TRAIN)) self.assertEqual(1, len(batches)) @@ -62,10 +62,10 @@ def test_create_batches_different_tensorizers(self): def test_data_initializes_tensorsizers(self): tensorizers = { - "tokens": WordTensorizer(text_column="text"), + "tokens": TokenTensorizer(text_column="text"), "labels": LabelTensorizer(label_column="label"), } - # verify WordTensorizer isn't in an initialized state yet + # verify TokenTensorizer isn't in an initialized state yet assert tensorizers["tokens"].vocab is None Data(self.data_source, tensorizers) # Tensorizers should have been initialized diff --git a/pytext/data/test/tensorizers_test.py b/pytext/data/test/tensorizers_test.py index 50b9726a9..c738e65b7 100644 --- a/pytext/data/test/tensorizers_test.py +++ b/pytext/data/test/tensorizers_test.py @@ -8,9 +8,9 @@ from pytext.data.sources.tsv import TSVDataSource from pytext.data.tensorizers import ( ByteTensorizer, + CharacterTokenTensorizer, LabelTensorizer, - WordCharacterTensorizer, - WordTensorizer, + TokenTensorizer, initialize_tensorizers, ) from pytext.utils.test import import_tests_module @@ -31,7 +31,7 @@ def setUp(self): def test_initialize_tensorizers(self): tensorizers = { - "tokens": WordTensorizer(text_column="text"), + "tokens": TokenTensorizer(text_column="text"), "labels": LabelTensorizer(label_column="label"), "chars": ByteTensorizer(text_column="text"), } @@ -40,7 +40,7 @@ def test_initialize_tensorizers(self): self.assertEqual(7, len(tensorizers["labels"].labels)) def test_initialize_word_tensorizer(self): - tensorizer = WordTensorizer(text_column="text") + tensorizer = TokenTensorizer(text_column="text") init = tensorizer.initialize() init.send(None) # kick for row in self.data.train: @@ -49,7 +49,7 @@ def test_initialize_word_tensorizer(self): self.assertEqual(49, len(tensorizer.vocab)) def test_create_word_tensors(self): - tensorizer = WordTensorizer(text_column="text") + tensorizer = TokenTensorizer(text_column="text") init = tensorizer.initialize() init.send(None) # kick for row in self.data.train: @@ -80,7 +80,7 @@ def test_create_byte_tensors(self): self.assertEqual([(bytes, len(bytes)) for bytes in expected], tensors) def test_create_word_character_tensors(self): - tensorizer = WordCharacterTensorizer(text_column="text") + tensorizer = CharacterTokenTensorizer(text_column="text") # not initializing because initializing is a no-op for ByteTensorizer s1 = "I want some coffee" diff --git a/pytext/models/doc_model.py b/pytext/models/doc_model.py index 247bb3c6e..38f20d606 100644 --- a/pytext/models/doc_model.py +++ b/pytext/models/doc_model.py @@ -11,7 +11,7 @@ MetaInput, NumericLabelTensorizer, Tensorizer, - WordTensorizer, + TokenTensorizer, ) from pytext.data.utils import UNK from pytext.exporters.exporter import ModelExporter @@ -56,7 +56,7 @@ class NewDocModel(DocModel): class Config(DocModel.Config): class ModelInput(Model.Config.ModelInput): - tokens: WordTensorizer.Config = WordTensorizer.Config() + tokens: TokenTensorizer.Config = TokenTensorizer.Config() labels: LabelTensorizer.Config = LabelTensorizer.Config(allow_unknown=True) # for metric reporter raw_text: MetaInput.Config = MetaInput.Config(column="text") @@ -125,7 +125,7 @@ class NewDocRegressionModel(NewDocModel): class Config(NewDocModel.Config): class RegressionModelInput(Model.Config.ModelInput): - tokens: WordTensorizer.Config = WordTensorizer.Config() + tokens: TokenTensorizer.Config = TokenTensorizer.Config() labels: NumericLabelTensorizer.Config = NumericLabelTensorizer.Config() inputs: RegressionModelInput = RegressionModelInput()