From ed21fd502316634fe9d139a3ba4d4d1002137e0b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 12 Mar 2018 21:53:54 -0700 Subject: [PATCH] Merges (embedding|indicator)_column with their sequence counterparts, and exposes sequence methods under tf.contrib.feature_column. PiperOrigin-RevId: 188826187 --- tensorflow/contrib/feature_column/BUILD | 2 +- tensorflow/contrib/feature_column/__init__.py | 6 + .../feature_column/sequence_feature_column.py | 259 +++------------------ .../feature_column/sequence_feature_column_test.py | 194 ++++++++------- tensorflow/python/feature_column/feature_column.py | 163 ++++++++++++- 5 files changed, 298 insertions(+), 326 deletions(-) diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD index 8ba0823..3614b2b 100644 --- a/tensorflow/contrib/feature_column/BUILD +++ b/tensorflow/contrib/feature_column/BUILD @@ -26,6 +26,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":sequence_feature_column", + "//tensorflow/python:util", ], ) @@ -38,7 +39,6 @@ py_library( "//tensorflow/python:check_ops", "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", "//tensorflow/python:parsing_ops", "//tensorflow/python:sparse_ops", "//tensorflow/python:tensor_shape", diff --git a/tensorflow/contrib/feature_column/__init__.py b/tensorflow/contrib/feature_column/__init__.py index 650a801..baa8c15 100644 --- a/tensorflow/contrib/feature_column/__init__.py +++ b/tensorflow/contrib/feature_column/__init__.py @@ -25,6 +25,12 @@ from tensorflow.python.util.all_util import remove_undocumented # pylint: enable=unused-import,line-too-long,wildcard-import _allowed_symbols = [ + 'sequence_categorical_column_with_hash_bucket', + 'sequence_categorical_column_with_identity', + 'sequence_categorical_column_with_vocabulary_list', + 'sequence_categorical_column_with_vocabulary_file', + 'sequence_input_layer', + 'sequence_numeric_column', ] remove_undocumented(__name__, allowed_exception_list=_allowed_symbols) diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py index f57557c..e601169 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py @@ -19,7 +19,6 @@ from __future__ import division from __future__ import print_function -import abc import collections @@ -29,7 +28,6 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops -from tensorflow.python.ops import math_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import variable_scope @@ -99,9 +97,11 @@ def sequence_input_layer( """ feature_columns = fc._clean_feature_columns(feature_columns) for c in feature_columns: - if not isinstance(c, _SequenceDenseColumn): + if not isinstance(c, fc._SequenceDenseColumn): raise ValueError( 'All feature_columns must be of type _SequenceDenseColumn. ' + 'You can wrap a sequence_categorical_column with an embedding_column ' + 'or indicator_column. ' 'Given (type {}): {}'.format(type(c), c)) with variable_scope.variable_scope( @@ -136,6 +136,10 @@ def sequence_categorical_column_with_identity( key, num_buckets, default_value=None): """Returns a feature column that represents sequences of integers. + Pass this to `embedding_column` or `indicator_column` to convert sequence + categorical data into dense representation for input to sequence NN, such as + RNN. + Example: ```python @@ -163,7 +167,7 @@ def sequence_categorical_column_with_identity( Returns: A `_SequenceCategoricalColumn`. """ - return _SequenceCategoricalColumn( + return fc._SequenceCategoricalColumn( fc.categorical_column_with_identity( key=key, num_buckets=num_buckets, @@ -174,6 +178,10 @@ def sequence_categorical_column_with_hash_bucket( key, hash_bucket_size, dtype=dtypes.string): """A sequence of categorical terms where ids are set by hashing. + Pass this to `embedding_column` or `indicator_column` to convert sequence + categorical data into dense representation for input to sequence NN, such as + RNN. + Example: ```python @@ -198,7 +206,7 @@ def sequence_categorical_column_with_hash_bucket( Returns: A `_SequenceCategoricalColumn`. """ - return _SequenceCategoricalColumn( + return fc._SequenceCategoricalColumn( fc.categorical_column_with_hash_bucket( key=key, hash_bucket_size=hash_bucket_size, @@ -210,6 +218,10 @@ def sequence_categorical_column_with_vocabulary_file( default_value=None, dtype=dtypes.string): """A sequence of categorical terms where ids use a vocabulary file. + Pass this to `embedding_column` or `indicator_column` to convert sequence + categorical data into dense representation for input to sequence NN, such as + RNN. + Example: ```python @@ -246,7 +258,7 @@ def sequence_categorical_column_with_vocabulary_file( Returns: A `_SequenceCategoricalColumn`. """ - return _SequenceCategoricalColumn( + return fc._SequenceCategoricalColumn( fc.categorical_column_with_vocabulary_file( key=key, vocabulary_file=vocabulary_file, @@ -260,6 +272,10 @@ def sequence_categorical_column_with_vocabulary_list( key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0): """A sequence of categorical terms where ids use an in-memory list. + Pass this to `embedding_column` or `indicator_column` to convert sequence + categorical data into dense representation for input to sequence NN, such as + RNN. + Example: ```python @@ -296,7 +312,7 @@ def sequence_categorical_column_with_vocabulary_list( Returns: A `_SequenceCategoricalColumn`. """ - return _SequenceCategoricalColumn( + return fc._SequenceCategoricalColumn( fc.categorical_column_with_vocabulary_list( key=key, vocabulary_list=vocabulary_list, @@ -305,108 +321,6 @@ def sequence_categorical_column_with_vocabulary_list( num_oov_buckets=num_oov_buckets)) -# TODO(b/73160931): Merge with embedding_column -def _sequence_embedding_column( - categorical_column, dimension, initializer=None, ckpt_to_load_from=None, - tensor_name_in_ckpt=None, max_norm=None, trainable=True): - """Returns a feature column that represents sequences of embeddings. - - Use this to convert sequence categorical data into dense representation for - input to sequence NN, such as RNN. - - Example: - - ```python - watches = sequence_categorical_column_with_identity( - 'watches', num_buckets=1000) - watches_embedding = _sequence_embedding_column(watches, dimension=10) - columns = [watches] - - features = tf.parse_example(..., features=make_parse_example_spec(columns)) - input_layer, sequence_length = sequence_input_layer(features, columns) - - rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size) - outputs, state = tf.nn.dynamic_rnn( - rnn_cell, inputs=input_layer, sequence_length=sequence_length) - ``` - - Args: - categorical_column: A `_SequenceCategoricalColumn` created with a - `sequence_cateogrical_column_with_*` function. - dimension: Integer dimension of the embedding. - initializer: Initializer function used to initialize the embeddings. - ckpt_to_load_from: String representing checkpoint name/pattern from which to - restore column weights. Required if `tensor_name_in_ckpt` is not `None`. - tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from - which to restore the column weights. Required if `ckpt_to_load_from` is - not `None`. - max_norm: If not `None`, embedding values are l2-normalized to this value. - trainable: Whether or not the embedding is trainable. Default is True. - - Returns: - A `_SequenceCategoricalToDenseColumn`. - - Raises: - ValueError: If `categorical_column` is not the right type. - """ - if not isinstance(categorical_column, _SequenceCategoricalColumn): - raise ValueError( - 'categorical_column must be of type _SequenceCategoricalColumn. ' - 'Given (type {}): {}'.format( - type(categorical_column), categorical_column)) - return _SequenceCategoricalToDenseColumn( - fc.embedding_column( - categorical_column, - dimension=dimension, - initializer=initializer, - ckpt_to_load_from=ckpt_to_load_from, - tensor_name_in_ckpt=tensor_name_in_ckpt, - max_norm=max_norm, - trainable=trainable)) - - -# TODO(b/73160931): Merge with indicator_column -def _sequence_indicator_column(categorical_column): - """Returns a feature column that represents sequences of multi-hot tensors. - - Use this to convert sequence categorical data into dense representation for - input to sequence NN, such as RNN. - - Example: - - ```python - colors = sequence_categorical_column_with_vocabulary_list( - key='colors', vocabulary_list=('R', 'G', 'B', 'Y')) - colors_indicator = _sequence_indicator_column(colors) - columns = [colors] - - features = tf.parse_example(..., features=make_parse_example_spec(columns)) - input_layer, sequence_length = sequence_input_layer(features, columns) - - rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size) - outputs, state = tf.nn.dynamic_rnn( - rnn_cell, inputs=input_layer, sequence_length=sequence_length) - ``` - - Args: - categorical_column: A `_SequenceCategoricalColumn` created with a - `sequence_cateogrical_column_with_*` function. - - Returns: - A `_SequenceCategoricalToDenseColumn`. - - Raises: - ValueError: If `categorical_column` is not the right type. - """ - if not isinstance(categorical_column, _SequenceCategoricalColumn): - raise ValueError( - 'categorical_column must be of type _SequenceCategoricalColumn. ' - 'Given (type {}): {}'.format( - type(categorical_column), categorical_column)) - return _SequenceCategoricalToDenseColumn( - fc.indicator_column(categorical_column)) - - def sequence_numeric_column( key, shape=(1,), @@ -459,129 +373,8 @@ def _assert_all_equal_and_return(tensors, name=None): return array_ops.identity(tensors[0]) -class _SequenceDenseColumn(fc._FeatureColumn): - """Represents dense sequence data.""" - - __metaclass__ = abc.ABCMeta - - TensorSequenceLengthPair = collections.namedtuple( # pylint: disable=invalid-name - 'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length']) - - @abc.abstractproperty - def _variable_shape(self): - """`TensorShape` without batch and sequence dimensions.""" - pass - - @abc.abstractmethod - def _get_sequence_dense_tensor( - self, inputs, weight_collections=None, trainable=None): - """Returns a `TensorSequenceLengthPair`.""" - pass - - -def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1): - with ops.name_scope(None, 'sequence_length') as name_scope: - row_ids = sp_tensor.indices[:, 0] - column_ids = sp_tensor.indices[:, 1] - column_ids += array_ops.ones_like(column_ids) - seq_length = math_ops.to_int64( - math_ops.segment_max(column_ids, segment_ids=row_ids) / num_elements) - # If the last n rows do not have ids, seq_length will have shape - # [batch_size - n]. Pad the remaining values with zeros. - n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1] - padding = array_ops.zeros(n_pad, dtype=seq_length.dtype) - return array_ops.concat([seq_length, padding], axis=0, name=name_scope) - - -class _SequenceCategoricalColumn( - fc._CategoricalColumn, - collections.namedtuple( - '_SequenceCategoricalColumn', ['categorical_column'])): - """Represents sequences of categorical data.""" - - @property - def name(self): - return self.categorical_column.name - - @property - def _parse_example_spec(self): - return self.categorical_column._parse_example_spec - - def _transform_feature(self, inputs): - return self.categorical_column._transform_feature(inputs) - - @property - def _num_buckets(self): - return self.categorical_column._num_buckets - - def _get_sparse_tensors(self, inputs, weight_collections=None, - trainable=None): - sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) - id_tensor = sparse_tensors.id_tensor - weight_tensor = sparse_tensors.weight_tensor - # Expands final dimension, so that embeddings are not combined during - # embedding lookup. - check_id_rank = check_ops.assert_equal( - array_ops.rank(id_tensor), 2, - data=[ - 'Column {} expected ID tensor of rank 2. '.format(self.name), - 'id_tensor shape: ', array_ops.shape(id_tensor)]) - with ops.control_dependencies([check_id_rank]): - id_tensor = sparse_ops.sparse_reshape( - id_tensor, - shape=array_ops.concat([id_tensor.dense_shape, [1]], axis=0)) - if weight_tensor is not None: - check_weight_rank = check_ops.assert_equal( - array_ops.rank(weight_tensor), 2, - data=[ - 'Column {} expected weight tensor of rank 2.'.format(self.name), - 'weight_tensor shape:', array_ops.shape(weight_tensor)]) - with ops.control_dependencies([check_weight_rank]): - weight_tensor = sparse_ops.sparse_reshape( - weight_tensor, - shape=array_ops.concat([weight_tensor.dense_shape, [1]], axis=0)) - return fc._CategoricalColumn.IdWeightPair(id_tensor, weight_tensor) - - def _sequence_length(self, inputs): - sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) - return _sequence_length_from_sparse_tensor(sparse_tensors.id_tensor) - - -class _SequenceCategoricalToDenseColumn( - _SequenceDenseColumn, - collections.namedtuple( - '_SequenceCategoricalToDenseColumn', ['dense_column'])): - """Densifies a _SequenceCategoricalColumn using the specified column.""" - - @property - def name(self): - return self.dense_column.name - - @property - def _parse_example_spec(self): - return self.dense_column._parse_example_spec - - def _transform_feature(self, inputs): - return self.dense_column._transform_feature(inputs) - - @property - def _variable_shape(self): - return self.dense_column._variable_shape - - def _get_sequence_dense_tensor( - self, inputs, weight_collections=None, trainable=None): - dense_tensor = self.dense_column._get_dense_tensor( - inputs=inputs, - weight_collections=weight_collections, - trainable=trainable) - sequence_length = self.dense_column.categorical_column._sequence_length( - inputs) - return _SequenceDenseColumn.TensorSequenceLengthPair( - dense_tensor=dense_tensor, sequence_length=sequence_length) - - class _SequenceNumericColumn( - _SequenceDenseColumn, + fc._SequenceDenseColumn, collections.namedtuple( '_SequenceNumericColumn', ['key', 'shape', 'default_value', 'dtype'])): @@ -616,9 +409,9 @@ class _SequenceNumericColumn( [array_ops.shape(dense_tensor)[:1], [-1], self._variable_shape], axis=0) dense_tensor = array_ops.reshape(dense_tensor, shape=dense_shape) - sequence_length = _sequence_length_from_sparse_tensor( + sequence_length = fc._sequence_length_from_sparse_tensor( sp_tensor, num_elements=self._variable_shape.num_elements()) - return _SequenceDenseColumn.TensorSequenceLengthPair( + return fc._SequenceDenseColumn.TensorSequenceLengthPair( dense_tensor=dense_tensor, sequence_length=sequence_length) # pylint: enable=protected-access diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py index c077f03..b64f086 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py @@ -22,6 +22,7 @@ import os import numpy as np from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc +from tensorflow.python.feature_column import feature_column as fc from tensorflow.python.feature_column.feature_column import _LazyBuilder from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -78,12 +79,12 @@ class SequenceInputLayerTest(test.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = sfc._sequence_embedding_column( + embedding_column_a = fc.embedding_column( categorical_column_a, dimension=embedding_dimension_a, initializer=_get_initializer(embedding_dimension_a, embedding_values_a)) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - embedding_column_b = sfc._sequence_embedding_column( + embedding_column_b = fc.embedding_column( categorical_column_b, dimension=embedding_dimension_b, initializer=_get_initializer(embedding_dimension_b, embedding_values_b)) @@ -107,6 +108,29 @@ class SequenceInputLayerTest(test.TestCase): self.assertAllEqual( expected_sequence_length, sequence_length.eval(session=sess)) + def test_embedding_column_with_non_sequence_categorical(self): + """Tests that error is raised for non-sequence categorical column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column_a = fc.embedding_column( + categorical_column_a, dimension=2) + + with self.assertRaisesRegexp( + ValueError, + r'In embedding_column: aaa_embedding\. categorical_column must be of ' + r'type _SequenceCategoricalColumn to use sequence_input_layer\.'): + _, _ = sfc.sequence_input_layer( + features={'aaa': sparse_input}, + feature_columns=[embedding_column_a]) + def test_indicator_column(self): vocabulary_size_a = 3 sparse_input_a = sparse_tensor.SparseTensorValue( @@ -133,10 +157,10 @@ class SequenceInputLayerTest(test.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size_a) - indicator_column_a = sfc._sequence_indicator_column(categorical_column_a) + indicator_column_a = fc.indicator_column(categorical_column_a) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size_b) - indicator_column_b = sfc._sequence_indicator_column(categorical_column_b) + indicator_column_b = fc.indicator_column(categorical_column_b) input_layer, sequence_length = sfc.sequence_input_layer( features={ 'aaa': sparse_input_a, @@ -150,6 +174,28 @@ class SequenceInputLayerTest(test.TestCase): self.assertAllEqual( expected_sequence_length, sequence_length.eval(session=sess)) + def test_indicator_column_with_non_sequence_categorical(self): + """Tests that error is raised for non-sequence categorical column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + indicator_column_a = fc.indicator_column(categorical_column_a) + + with self.assertRaisesRegexp( + ValueError, + r'In indicator_column: aaa_indicator\. categorical_column must be of ' + r'type _SequenceCategoricalColumn to use sequence_input_layer\.'): + _, _ = sfc.sequence_input_layer( + features={'aaa': sparse_input}, + feature_columns=[indicator_column_a]) + def test_numeric_column(self): sparse_input = sparse_tensor.SparseTensorValue( # example 0, values [[0.], [1]] @@ -230,6 +276,55 @@ class SequenceInputLayerTest(test.TestCase): sess.run(sequence_length) +class InputLayerTest(test.TestCase): + """Tests input_layer with sequence feature columns.""" + + def test_embedding_column(self): + """Tests that error is raised for sequence embedding column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + categorical_column_a = sfc.sequence_categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column_a = fc.embedding_column( + categorical_column_a, dimension=2) + + with self.assertRaisesRegexp( + ValueError, + r'In embedding_column: aaa_embedding\. categorical_column must not be ' + r'of type _SequenceCategoricalColumn\.'): + _ = fc.input_layer( + features={'aaa': sparse_input}, + feature_columns=[embedding_column_a]) + + def test_indicator_column(self): + """Tests that error is raised for sequence indicator column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + categorical_column_a = sfc.sequence_categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + indicator_column_a = fc.indicator_column(categorical_column_a) + + with self.assertRaisesRegexp( + ValueError, + r'In indicator_column: aaa_indicator\. categorical_column must not be ' + r'of type _SequenceCategoricalColumn\.'): + _ = fc.input_layer( + features={'aaa': sparse_input}, + feature_columns=[indicator_column_a]) + + def _assert_sparse_tensor_value(test_case, expected, actual): _assert_sparse_tensor_indices_shape(test_case, expected, actual) @@ -287,37 +382,6 @@ class SequenceCategoricalColumnWithIdentityTest(test.TestCase): with monitored_session.MonitoredSession() as sess: id_weight_pair.id_tensor.eval(session=sess) - def test_sequence_length(self): - column = sfc.sequence_categorical_column_with_identity( - 'aaa', num_buckets=3) - inputs = sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=(1, 2, 0), - dense_shape=(2, 2)) - expected_sequence_length = [1, 2] - - sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs})) - - with monitored_session.MonitoredSession() as sess: - sequence_length = sess.run(sequence_length) - self.assertAllEqual(expected_sequence_length, sequence_length) - self.assertEqual(np.int64, sequence_length.dtype) - - def test_sequence_length_with_zeros(self): - column = sfc.sequence_categorical_column_with_identity( - 'aaa', num_buckets=3) - inputs = sparse_tensor.SparseTensorValue( - indices=((1, 0), (3, 0), (3, 1)), - values=(1, 2, 0), - dense_shape=(5, 2)) - expected_sequence_length = [0, 1, 0, 2, 0] - - sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs})) - - with monitored_session.MonitoredSession() as sess: - self.assertAllEqual( - expected_sequence_length, sequence_length.eval(session=sess)) - class SequenceCategoricalColumnWithHashBucketTest(test.TestCase): @@ -344,21 +408,6 @@ class SequenceCategoricalColumnWithHashBucketTest(test.TestCase): expected_sparse_ids, id_weight_pair.id_tensor.eval(session=sess)) - def test_sequence_length(self): - column = sfc.sequence_categorical_column_with_hash_bucket( - 'aaa', hash_bucket_size=10) - inputs = sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=('omar', 'stringer', 'marlo'), - dense_shape=(2, 2)) - expected_sequence_length = [1, 2] - - sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs})) - - with monitored_session.MonitoredSession() as sess: - self.assertAllEqual( - expected_sequence_length, sequence_length.eval(session=sess)) - class SequenceCategoricalColumnWithVocabularyFileTest(test.TestCase): @@ -399,23 +448,6 @@ class SequenceCategoricalColumnWithVocabularyFileTest(test.TestCase): expected_sparse_ids, id_weight_pair.id_tensor.eval(session=sess)) - def test_sequence_length(self): - column = sfc.sequence_categorical_column_with_vocabulary_file( - key='aaa', - vocabulary_file=self._wire_vocabulary_file_name, - vocabulary_size=self._wire_vocabulary_size) - inputs = sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=('marlo', 'skywalker', 'omar'), - dense_shape=(2, 2)) - expected_sequence_length = [1, 2] - - sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs})) - - with monitored_session.MonitoredSession() as sess: - self.assertAllEqual( - expected_sequence_length, sequence_length.eval(session=sess)) - class SequenceCategoricalColumnWithVocabularyListTest(test.TestCase): @@ -441,22 +473,6 @@ class SequenceCategoricalColumnWithVocabularyListTest(test.TestCase): expected_sparse_ids, id_weight_pair.id_tensor.eval(session=sess)) - def test_sequence_length(self): - column = sfc.sequence_categorical_column_with_vocabulary_list( - key='aaa', - vocabulary_list=('omar', 'stringer', 'marlo')) - inputs = sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=('marlo', 'skywalker', 'omar'), - dense_shape=(2, 2)) - expected_sequence_length = [1, 2] - - sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs})) - - with monitored_session.MonitoredSession() as sess: - self.assertAllEqual( - expected_sequence_length, sequence_length.eval(session=sess)) - class SequenceEmbeddingColumnTest(test.TestCase): @@ -496,7 +512,7 @@ class SequenceEmbeddingColumnTest(test.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = sfc._sequence_embedding_column( + embedding_column = fc.embedding_column( categorical_column, dimension=embedding_dimension, initializer=_initializer) @@ -522,7 +538,7 @@ class SequenceEmbeddingColumnTest(test.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = sfc._sequence_embedding_column( + embedding_column = fc.embedding_column( categorical_column, dimension=2) _, sequence_length = embedding_column._get_sequence_dense_tensor( @@ -550,7 +566,7 @@ class SequenceEmbeddingColumnTest(test.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = sfc._sequence_embedding_column( + embedding_column = fc.embedding_column( categorical_column, dimension=2) _, sequence_length = embedding_column._get_sequence_dense_tensor( @@ -587,7 +603,7 @@ class SequenceIndicatorColumnTest(test.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = sfc._sequence_indicator_column(categorical_column) + indicator_column = fc.indicator_column(categorical_column) indicator_tensor, _ = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': sparse_input})) @@ -607,7 +623,7 @@ class SequenceIndicatorColumnTest(test.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = sfc._sequence_indicator_column(categorical_column) + indicator_column = fc.indicator_column(categorical_column) _, sequence_length = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': sparse_input})) @@ -634,7 +650,7 @@ class SequenceIndicatorColumnTest(test.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = sfc._sequence_indicator_column(categorical_column) + indicator_column = fc.indicator_column(categorical_column) _, sequence_length = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': sparse_input})) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 85971c9..381153c 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -1804,6 +1804,21 @@ def _create_categorical_column_weighted_sum( name='weighted_sum') +class _SequenceDenseColumn(_FeatureColumn): + """Represents dense sequence data.""" + + __metaclass__ = abc.ABCMeta + + TensorSequenceLengthPair = collections.namedtuple( # pylint: disable=invalid-name + 'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length']) + + @abc.abstractmethod + def _get_sequence_dense_tensor( + self, inputs, weight_collections=None, trainable=None): + """Returns a `TensorSequenceLengthPair`.""" + pass + + class _LazyBuilder(object): """Handles caching of transformations while building the model. @@ -2152,7 +2167,7 @@ class _BucketizedColumn(_DenseColumn, _CategoricalColumn, class _EmbeddingColumn( - _DenseColumn, + _DenseColumn, _SequenceDenseColumn, collections.namedtuple('_EmbeddingColumn', ( 'categorical_column', 'dimension', 'combiner', 'initializer', 'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable' @@ -2178,7 +2193,9 @@ class _EmbeddingColumn( self._shape = tensor_shape.vector(self.dimension) return self._shape - def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): + def _get_dense_tensor_internal( + self, inputs, weight_collections=None, trainable=None): + """Private method that follows the signature of _get_dense_tensor.""" # Get sparse IDs and weights. sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access inputs, weight_collections=weight_collections, trainable=trainable) @@ -2210,6 +2227,43 @@ class _EmbeddingColumn( name='%s_weights' % self.name, max_norm=self.max_norm) + def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): + if isinstance(self.categorical_column, _SequenceCategoricalColumn): + raise ValueError( + 'In embedding_column: {}. ' + 'categorical_column must not be of type _SequenceCategoricalColumn. ' + 'Suggested fix A: If you wish to use input_layer, use a ' + 'non-sequence categorical_column_with_*. ' + 'Suggested fix B: If you wish to create sequence input, use ' + 'sequence_input_layer instead of input_layer. ' + 'Given (type {}): {}'.format( + self.name, type(self.categorical_column), + self.categorical_column)) + return self._get_dense_tensor_internal( + inputs=inputs, weight_collections=weight_collections, + trainable=trainable) + + def _get_sequence_dense_tensor( + self, inputs, weight_collections=None, trainable=None): + if not isinstance(self.categorical_column, _SequenceCategoricalColumn): + raise ValueError( + 'In embedding_column: {}. ' + 'categorical_column must be of type _SequenceCategoricalColumn ' + 'to use sequence_input_layer. ' + 'Suggested fix: Use one of sequence_categorical_column_with_*. ' + 'Given (type {}): {}'.format( + self.name, type(self.categorical_column), + self.categorical_column)) + dense_tensor = self._get_dense_tensor_internal( # pylint: disable=protected-access + inputs=inputs, + weight_collections=weight_collections, + trainable=trainable) + sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access + sequence_length = _sequence_length_from_sparse_tensor( + sparse_tensors.id_tensor) + return _SequenceDenseColumn.TensorSequenceLengthPair( + dense_tensor=dense_tensor, sequence_length=sequence_length) + class _SharedEmbeddingColumn( _DenseColumn, @@ -2890,7 +2944,7 @@ def _prune_invalid_ids(sparse_ids, sparse_weights): return sparse_ids, sparse_weights -class _IndicatorColumn(_DenseColumn, +class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn, collections.namedtuple('_IndicatorColumn', ['categorical_column'])): """Represents a one-hot column for use in deep networks. @@ -2966,15 +3020,53 @@ class _IndicatorColumn(_DenseColumn, Returns: Dense `Tensor` created within `_transform_feature`. + + Raises: + ValueError: If `categorical_column` is a `_SequenceCategoricalColumn`. """ # Do nothing with weight_collections and trainable since no variables are # created in this function. del weight_collections del trainable + if isinstance(self.categorical_column, _SequenceCategoricalColumn): + raise ValueError( + 'In indicator_column: {}. ' + 'categorical_column must not be of type _SequenceCategoricalColumn. ' + 'Suggested fix A: If you wish to use input_layer, use a ' + 'non-sequence categorical_column_with_*. ' + 'Suggested fix B: If you wish to create sequence input, use ' + 'sequence_input_layer instead of input_layer. ' + 'Given (type {}): {}'.format( + self.name, type(self.categorical_column), + self.categorical_column)) # Feature has been already transformed. Return the intermediate # representation created by _transform_feature. return inputs.get(self) + def _get_sequence_dense_tensor( + self, inputs, weight_collections=None, trainable=None): + # Do nothing with weight_collections and trainable since no variables are + # created in this function. + del weight_collections + del trainable + if not isinstance(self.categorical_column, _SequenceCategoricalColumn): + raise ValueError( + 'In indicator_column: {}. ' + 'categorical_column must be of type _SequenceCategoricalColumn ' + 'to use sequence_input_layer. ' + 'Suggested fix: Use one of sequence_categorical_column_with_*. ' + 'Given (type {}): {}'.format( + self.name, type(self.categorical_column), + self.categorical_column)) + # Feature has been already transformed. Return the intermediate + # representation created by _transform_feature. + dense_tensor = inputs.get(self) + sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access + sequence_length = _sequence_length_from_sparse_tensor( + sparse_tensors.id_tensor) + return _SequenceDenseColumn.TensorSequenceLengthPair( + dense_tensor=dense_tensor, sequence_length=sequence_length) + def _verify_static_batch_size_equality(tensors, columns): # bath_size is a tf.Dimension object. @@ -2990,3 +3082,68 @@ def _verify_static_batch_size_equality(tensors, columns): 'Batch size of columns ({}, {}): ({}, {})'.format( columns[bath_size_column_index].name, columns[i].name, expected_batch_size, tensors[i].shape[0])) + + +def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1): + """Returns a [batch_size] Tensor with per-example sequence length.""" + with ops.name_scope(None, 'sequence_length') as name_scope: + row_ids = sp_tensor.indices[:, 0] + column_ids = sp_tensor.indices[:, 1] + column_ids += array_ops.ones_like(column_ids) + seq_length = math_ops.to_int64( + math_ops.segment_max(column_ids, segment_ids=row_ids) / num_elements) + # If the last n rows do not have ids, seq_length will have shape + # [batch_size - n]. Pad the remaining values with zeros. + n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1] + padding = array_ops.zeros(n_pad, dtype=seq_length.dtype) + return array_ops.concat([seq_length, padding], axis=0, name=name_scope) + + +class _SequenceCategoricalColumn( + _CategoricalColumn, + collections.namedtuple( + '_SequenceCategoricalColumn', ['categorical_column'])): + """Represents sequences of categorical data.""" + + @property + def name(self): + return self.categorical_column.name + + @property + def _parse_example_spec(self): + return self.categorical_column._parse_example_spec # pylint: disable=protected-access + + def _transform_feature(self, inputs): + return self.categorical_column._transform_feature(inputs) # pylint: disable=protected-access + + @property + def _num_buckets(self): + return self.categorical_column._num_buckets # pylint: disable=protected-access + + def _get_sparse_tensors(self, inputs, weight_collections=None, + trainable=None): + sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access + id_tensor = sparse_tensors.id_tensor + weight_tensor = sparse_tensors.weight_tensor + # Expands final dimension, so that embeddings are not combined during + # embedding lookup. + check_id_rank = check_ops.assert_equal( + array_ops.rank(id_tensor), 2, + data=[ + 'Column {} expected ID tensor of rank 2. '.format(self.name), + 'id_tensor shape: ', array_ops.shape(id_tensor)]) + with ops.control_dependencies([check_id_rank]): + id_tensor = sparse_ops.sparse_reshape( + id_tensor, + shape=array_ops.concat([id_tensor.dense_shape, [1]], axis=0)) + if weight_tensor is not None: + check_weight_rank = check_ops.assert_equal( + array_ops.rank(weight_tensor), 2, + data=[ + 'Column {} expected weight tensor of rank 2.'.format(self.name), + 'weight_tensor shape:', array_ops.shape(weight_tensor)]) + with ops.control_dependencies([check_weight_rank]): + weight_tensor = sparse_ops.sparse_reshape( + weight_tensor, + shape=array_ops.concat([weight_tensor.dense_shape, [1]], axis=0)) + return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor) -- 2.7.4