Skip to content

Commit

Permalink
Adding a SparseFeature to the tf.Example parser API.
Browse files Browse the repository at this point in the history
Allows declaring two Example keys (an index key and a value key) for a single
Feature; the returned value is a SparseTensor whose indices are given by the
index key.
Change: 140671682
  • Loading branch information
tensorflower-gardener committed Dec 1, 2016
1 parent d5c9d24 commit 0b40c9a
Show file tree
Hide file tree
Showing 3 changed files with 294 additions and 26 deletions.
143 changes: 139 additions & 4 deletions tensorflow/python/kernel_tests/parsing_ops_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,82 @@ def testSerializedContainingSparse(self):
}
}, expected_output)

def testSerializedContainingSparseFeature(self):
  """Checks that a SparseFeature merges "idx" and "val" into one sparse output.

  The batch covers a populated example, an example with empty lists, an
  example whose value feature is empty and whose index feature is absent,
  and an example whose indices are not sorted.
  """
  protos = [
      # Row 0: two values, at positions 5 and 10.
      example(features=features({
          "val": float_feature([3, 4]),
          "idx": int64_feature([5, 10])
      })),
      # Row 1: both lists are present but empty.
      example(features=features({
          "val": float_feature([]),
          "idx": int64_feature([])
      })),
      # Row 2: "val" is a feature with nothing in it; "idx" is missing.
      example(features=features({
          "val": feature(),
      })),
      # Row 3: indices are deliberately unsorted.
      example(features=features({
          "val": float_feature([1, 2, -1]),
          "idx": int64_feature([0, 9, 3])
      }))
  ]
  serialized = [proto.SerializeToString() for proto in protos]

  # The expectation lists row 3's entries ordered by index, not input order.
  sp_indices = np.array(
      [[0, 5], [0, 10], [3, 0], [3, 3], [3, 9]], dtype=np.int64)
  sp_values = np.array([3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32)
  sp_shape = np.array([4, 13], dtype=np.int64)  # batch == 4, max_elems = 13

  parse_kwargs = {
      "serialized": tf.convert_to_tensor(serialized),
      "features": {
          "sp": tf.SparseFeature("idx", "val", tf.float32, 13)
      }
  }
  self._test(parse_kwargs, {"sp": (sp_indices, sp_values, sp_shape)})

def testSerializedContainingSparseFeatureReuse(self):
  """Checks that two SparseFeatures can share the same index key.

  "sp1" and "sp2" both take their indices from "idx" but read different
  value keys ("val1" / "val2") and declare different sizes (13 / 7).
  """
  original = [
      example(features=features({
          "val1": float_feature([3, 4]),
          "val2": float_feature([5, 6]),
          "idx": int64_feature([5, 10])
      })),
      example(features=features({
          "val1": float_feature([]),  # empty float list
          "idx": int64_feature([])
          # "val2" is absent from this example
      })),
  ]

  serialized = [m.SerializeToString() for m in original]

  expected_sp1 = (  # indices, values, shape
      np.array([[0, 5], [0, 10]], dtype=np.int64),
      np.array([3.0, 4.0], dtype=np.float32),
      np.array([2, 13], dtype=np.int64))  # batch == 2, max_elems = 13

  # NOTE(review): index 10 lies outside the declared size of 7; the test
  # expects it to be passed through unchanged -- confirm this is intended.
  expected_sp2 = (  # indices, values, shape
      np.array([[0, 5], [0, 10]], dtype=np.int64),
      np.array([5.0, 6.0], dtype=np.float32),
      np.array([2, 7], dtype=np.int64))  # batch == 2, max_elems = 7

  expected_output = {
      "sp1": expected_sp1,
      "sp2": expected_sp2,
  }

  self._test({
      "serialized": tf.convert_to_tensor(serialized),
      "features": {
          "sp1": tf.SparseFeature("idx", "val1", tf.float32, 13),
          "sp2": tf.SparseFeature("idx", "val2", tf.float32, 7)
      }
  }, expected_output)

def testSerializedContainingDense(self):
aname = "a"
bname = "b*has+a:tricky_name"
Expand Down Expand Up @@ -400,20 +476,28 @@ def testSerializedContainingDenseWithDefaults(self):
},
expected_output)

def testSerializedContainingSparseAndDenseWithNoDefault(self):
def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self):
expected_st_a = ( # indices, values, shape
np.empty(
(0, 2), dtype=np.int64), # indices
np.empty(
(0,), dtype=np.int64), # sp_a is DT_INT64
np.array(
[2, 0], dtype=np.int64)) # batch == 2, max_elems = 0
expected_sp = ( # indices, values, shape
np.array([[0, 0], [0, 3], [1, 7]], dtype=np.int64),
np.array(["a", "b", "c"], dtype="|S"),
np.array([2, 13], dtype=np.int64)) # batch == 4, max_elems = 13

original = [
example(features=features({
"c": float_feature([3, 4])
"c": float_feature([3, 4]),
"val": bytes_feature([b"a", b"b"]),
"idx": int64_feature([0, 3])
})), example(features=features({
"c": float_feature([1, 2])
"c": float_feature([1, 2]),
"val": bytes_feature([b"c"]),
"idx": int64_feature([7])
}))
]

Expand All @@ -424,6 +508,7 @@ def testSerializedContainingSparseAndDenseWithNoDefault(self):
b_default = np.random.rand(3, 3).astype(bytes)
expected_output = {
"st_a": expected_st_a,
"sp": expected_sp,
"a": np.array(2 * [[a_default]]),
"b": np.array(2 * [b_default]),
"c": np.array(
Expand All @@ -436,6 +521,7 @@ def testSerializedContainingSparseAndDenseWithNoDefault(self):
"serialized": tf.convert_to_tensor(serialized),
"features": {
"st_a": tf.VarLenFeature(tf.int64),
"sp": tf.SparseFeature("idx", "val", tf.string, 13),
"a": tf.FixedLenFeature(
(1, 3), tf.int64, default_value=a_default),
"b": tf.FixedLenFeature(
Expand All @@ -446,6 +532,46 @@ def testSerializedContainingSparseAndDenseWithNoDefault(self):
},
expected_output)

def testSerializedContainingSparseAndSparseFeatureWithReuse(self):
  """Checks that one key can feed a VarLenFeature and a SparseFeature at once.

  "idx" is parsed on its own as a VarLenFeature and is also used as the
  index key of the SparseFeature "sp".
  """
  # As a plain VarLenFeature, "idx" keeps the order stored in each example.
  expected_idx = (  # indices, values, shape
      np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64),
      # Explicit dtype: the feature is declared tf.int64, and NumPy's default
      # integer type is platform dependent.
      np.array([0, 3, 7, 1], dtype=np.int64),
      np.array([2, 2], dtype=np.int64))  # batch == 2, max_elems = 2

  # As a SparseFeature, the second example's entries are ordered by index.
  expected_sp = (  # indices, values, shape
      np.array([[0, 0], [0, 3], [1, 1], [1, 7]], dtype=np.int64),
      np.array(["a", "b", "d", "c"], dtype="|S"),
      np.array([2, 13], dtype=np.int64))  # batch == 2, max_elems = 13

  original = [
      example(features=features({
          "val": bytes_feature([b"a", b"b"]),
          "idx": int64_feature([0, 3])
      })), example(features=features({
          "val": bytes_feature([b"c", b"d"]),
          "idx": int64_feature([7, 1])
      }))
  ]

  names = ["in1", "in2"]
  serialized = [m.SerializeToString() for m in original]

  expected_output = {
      "idx": expected_idx,
      "sp": expected_sp,
  }

  self._test(
      {
          "example_names": names,
          "serialized": tf.convert_to_tensor(serialized),
          "features": {
              "idx": tf.VarLenFeature(tf.int64),
              "sp": tf.SparseFeature("idx", "val", tf.string, 13),
          }
      },
      expected_output)


class ParseSingleExampleTest(tf.test.TestCase):

Expand Down Expand Up @@ -473,8 +599,10 @@ def _test(self, kwargs, expected_values=None, expected_err=None):
self.assertEqual(tuple(out[k].values.get_shape().as_list()), (None,))
self.assertEqual(tuple(out[k].shape.get_shape().as_list()), (1,))

def testSingleExampleWithSparseAndDense(self):
def testSingleExampleWithSparseAndSparseFeatureAndDense(self):
original = example(features=features({"c": float_feature([3, 4]),
"val": bytes_feature([b"a", b"b"]),
"idx": int64_feature([0, 3]),
"st_a": float_feature([3.0, 4.0])}))

serialized = original.SerializeToString()
Expand All @@ -486,10 +614,16 @@ def testSingleExampleWithSparseAndDense(self):
np.array(
[2], dtype=np.int64)) # shape: max_values = 2

expected_sp = ( # indices, values, shape
np.array([[0], [3]], dtype=np.int64),
np.array(["a", "b"], dtype="|S"),
np.array([13], dtype=np.int64)) # max_values = 13

a_default = [1, 2, 3]
b_default = np.random.rand(3, 3).astype(bytes)
expected_output = {
"st_a": expected_st_a,
"sp": expected_sp,
"a": [a_default],
"b": b_default,
"c": np.array(
Expand All @@ -502,6 +636,7 @@ def testSingleExampleWithSparseAndDense(self):
"serialized": tf.convert_to_tensor(serialized),
"features": {
"st_a": tf.VarLenFeature(tf.float32),
"sp": tf.SparseFeature("idx", "val", tf.string, 13),
"a": tf.FixedLenFeature(
(1, 3), tf.int64, default_value=a_default),
"b": tf.FixedLenFeature(
Expand Down
1 change: 1 addition & 0 deletions tensorflow/python/ops/io_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
@@VarLenFeature
@@FixedLenFeature
@@FixedLenSequenceFeature
@@SparseFeature
@@parse_example
@@parse_single_example
@@parse_tensor
Expand Down
Loading

0 comments on commit 0b40c9a

Please sign in to comment.