From e139cbf91ab416822ce01f5515e9dc230e7294e6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 5 Mar 2018 14:14:01 -0800
Subject: [PATCH] Add sequence_indicator_column

PiperOrigin-RevId: 187920673
---
 .../feature_column/sequence_feature_column.py      |  67 +++++++++--
 .../feature_column/sequence_feature_column_test.py | 126 +++++++++++++++++++++
 2 files changed, 181 insertions(+), 12 deletions(-)

diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index e446043..ba17b56 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -184,7 +184,7 @@ def _sequence_embedding_column(
   ```python
   watches = sequence_categorical_column_with_identity(
       'watches', num_buckets=1000)
-  watches_embedding = embedding_column(watches, dimension=10)
+  watches_embedding = _sequence_embedding_column(watches, dimension=10)
   columns = [watches]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
@@ -209,7 +209,7 @@ def _sequence_embedding_column(
     trainable: Whether or not the embedding is trainable. Default is True.
 
   Returns:
-    A `_SequenceEmbeddingColumn`.
+    A `_SequenceCategoricalToDenseColumn`.
 
   Raises:
     ValueError: If `categorical_column` is not the right type.
@@ -219,7 +219,7 @@ def _sequence_embedding_column(
         'categorical_column must be of type _SequenceCategoricalColumn. '
         'Given (type {}): {}'.format(
             type(categorical_column), categorical_column))
-  return _SequenceEmbeddingColumn(
+  return _SequenceCategoricalToDenseColumn(
       fc.embedding_column(
           categorical_column,
           dimension=dimension,
@@ -230,6 +230,48 @@ def _sequence_embedding_column(
           trainable=trainable))
 
 
+# TODO(b/73160931): Merge with indicator_column
+def _sequence_indicator_column(categorical_column):
+  """Returns a feature column that represents sequences of multi-hot tensors.
+
+  Use this to convert sequence categorical data into a dense representation
+  for input to a sequence NN, such as an RNN.
+
+  Example:
+
+  ```python
+  colors = sequence_categorical_column_with_vocabulary_list(
+      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'))
+  colors_indicator = _sequence_indicator_column(colors)
+  columns = [colors]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    categorical_column: A `_SequenceCategoricalColumn` created with a
+      `sequence_categorical_column_with_*` function.
+
+  Returns:
+    A `_SequenceCategoricalToDenseColumn`.
+
+  Raises:
+    ValueError: If `categorical_column` is not the right type.
+  """
+  if not isinstance(categorical_column, _SequenceCategoricalColumn):
+    raise ValueError(
+        'categorical_column must be of type _SequenceCategoricalColumn. '
+        'Given (type {}): {}'.format(
+            type(categorical_column), categorical_column))
+  return _SequenceCategoricalToDenseColumn(
+      fc.indicator_column(categorical_column))
+
+
 def sequence_numeric_column(
     key,
     shape=(1,),
@@ -358,33 +400,34 @@ class _SequenceCategoricalColumn(
     return _sequence_length_from_sparse_tensor(sparse_tensors.id_tensor)
 
 
-class _SequenceEmbeddingColumn(
+class _SequenceCategoricalToDenseColumn(
     _SequenceDenseColumn,
-    collections.namedtuple('_SequenceEmbeddingColumn', ['embedding_column'])):
-  """Represents sequences of embeddings."""
+    collections.namedtuple(
+        '_SequenceCategoricalToDenseColumn', ['dense_column'])):
+  """Densifies a _SequenceCategoricalColumn using the specified column."""
 
   @property
   def name(self):
-    return self.embedding_column.name
+    return self.dense_column.name
 
   @property
   def _parse_example_spec(self):
-    return self.embedding_column._parse_example_spec
+    return self.dense_column._parse_example_spec
 
   def _transform_feature(self, inputs):
-    return self.embedding_column._transform_feature(inputs)
+    return self.dense_column._transform_feature(inputs)
 
   @property
   def _variable_shape(self):
-    return self.embedding_column._variable_shape
+    return self.dense_column._variable_shape
 
   def _get_sequence_dense_tensor(
       self, inputs, weight_collections=None, trainable=None):
-    dense_tensor = self.embedding_column._get_dense_tensor(
+    dense_tensor = self.dense_column._get_dense_tensor(
         inputs=inputs,
         weight_collections=weight_collections,
         trainable=trainable)
-    sequence_length = self.embedding_column.categorical_column._sequence_length(
+    sequence_length = self.dense_column.categorical_column._sequence_length(
         inputs)
     return _SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
index 1052136..39caa60 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
@@ -106,6 +106,49 @@ class SequenceInputLayerTest(test.TestCase):
     self.assertAllEqual(
         expected_sequence_length, sequence_length.eval(session=sess))
 
+  def test_indicator_column(self):
+    vocabulary_size_a = 3
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    vocabulary_size_b = 2
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [1]
+        # example 1, ids [1, 0]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, 1, 0),
+        dense_shape=(2, 2))
+
+    expected_input_layer = [
+        # example 0, ids_a [2], ids_b [1]
+        [[0., 0., 1., 0., 1.], [0., 0., 0., 0., 0.]],
+        # example 1, ids_a [0, 1], ids_b [1, 0]
+        [[1., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]],
+    ]
+    expected_sequence_length = [1, 2]
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size_a)
+    indicator_column_a = sfc._sequence_indicator_column(categorical_column_a)
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size_b)
+    indicator_column_b = sfc._sequence_indicator_column(categorical_column_b)
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        # Test that columns are reordered alphabetically.
+        feature_columns=[indicator_column_b, indicator_column_a])
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
   def test_numeric_column(self):
     sparse_input = sparse_tensor.SparseTensorValue(
         # example 0, values [[0.], [1]]
@@ -344,6 +387,89 @@ class SequenceEmbeddingColumnTest(test.TestCase):
         expected_sequence_length, sequence_length.eval(session=sess))
 
 
+class SequenceIndicatorColumnTest(test.TestCase):
+
+  def test_get_sequence_dense_tensor(self):
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 2))
+
+    expected_lookups = [
+        # example 0, ids [2]
+        [[0., 0., 1.], [0., 0., 0.]],
+        # example 1, ids [0, 1]
+        [[1., 0., 0.], [0., 1., 0.]],
+        # example 2, ids []
+        [[0., 0., 0.], [0., 0., 0.]],
+        # example 3, ids [1]
+        [[0., 1., 0.], [0., 0., 0.]],
+    ]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = sfc._sequence_indicator_column(categorical_column)
+
+    indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_lookups, indicator_tensor.eval(session=sess))
+
+  def test_sequence_length(self):
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    expected_sequence_length = [1, 2]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = sfc._sequence_indicator_column(categorical_column)
+
+    _, sequence_length = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length = sess.run(sequence_length)
+      self.assertAllEqual(expected_sequence_length, sequence_length)
+      self.assertEqual(np.int64, sequence_length.dtype)
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids []
+        # example 1, ids [2]
+        # example 2, ids [0, 1]
+        # example 3, ids []
+        # example 4, ids [1]
+        # example 5, ids []
+        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(6, 2))
+    expected_sequence_length = [0, 1, 2, 0, 1, 0]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = sfc._sequence_indicator_column(categorical_column)
+
+    _, sequence_length = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+
 class SequenceNumericColumnTest(test.TestCase):
 
   def test_get_sequence_dense_tensor(self):
--
2.7.4
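Note (not part of the patch): the sketch below is a plain-NumPy illustration of how `test_indicator_column` arrives at its `expected_input_layer` and `expected_sequence_length` values, assuming the `[batch, max_steps, num_buckets]` multi-hot layout the test asserts. Each id becomes a one-hot row, steps with no id stay all zeros (padding), and `sequence_input_layer` concatenates the columns by sorted key, so 'aaa' (width 3) precedes 'bbb' (width 2). The helper name `densify` is hypothetical and used only for this sketch.

```python
import numpy as np


def densify(indices, values, dense_shape, num_buckets):
  """Illustrative multi-hot densification (not the TensorFlow implementation).

  Returns a [batch, max_steps, num_buckets] float tensor where each
  (example, step) pair carrying an id gets a one-hot row, plus the
  per-example sequence length implied by the populated steps.
  """
  batch, max_steps = dense_shape
  dense = np.zeros((batch, max_steps, num_buckets), dtype=np.float32)
  seq_len = np.zeros(batch, dtype=np.int64)
  for (example, step), value in zip(indices, values):
    dense[example, step, value] = 1.0
    seq_len[example] = max(seq_len[example], step + 1)
  return dense, seq_len


# Sparse inputs copied from test_indicator_column.
dense_a, len_a = densify(
    indices=((0, 0), (1, 0), (1, 1)), values=(2, 0, 1),
    dense_shape=(2, 2), num_buckets=3)
dense_b, len_b = densify(
    indices=((0, 0), (1, 0), (1, 1)), values=(1, 1, 0),
    dense_shape=(2, 2), num_buckets=2)

# Columns are concatenated in sorted-key order ('aaa' before 'bbb'),
# which reproduces the 5-wide expected_input_layer from the test.
input_layer = np.concatenate([dense_a, dense_b], axis=-1)
print(input_layer)  # matches expected_input_layer
print(len_a)        # [1 2], matches expected_sequence_length (len_b agrees)
```

In particular, example 0 has only one id in each column, so its second step is all zeros in the concatenated layer and its sequence length is 1.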