from tensorflow.contrib import framework
from tensorflow.contrib.factorization.python.ops import gmm_ops
from tensorflow.contrib.framework.python.framework import checkpoint_utils
-from tensorflow.python.training import training_util
from tensorflow.contrib.learn.python.learn.estimators import estimator
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import logging_ops as logging
-from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops.control_flow_ops import with_dependencies
from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
def _streaming_sum(scalar_tensor):
class GMM(estimator.Estimator):
"""An estimator for GMM clustering."""
SCORES = 'scores'
+ LOG_LIKELIHOOD = 'loss'
ASSIGNMENTS = 'assignments'
- ALL_SCORES = 'all_scores'
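+ # Keys into the predictions / eval metrics dictionaries: SCORES maps to the
+ # per-example log-likelihoods, LOG_LIKELIHOOD to the summed log-likelihood
+ # reported under 'loss', and ASSIGNMENTS to the predicted cluster ids.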
def __init__(self,
num_clusters,
yield result[GMM.ASSIGNMENTS]
def score(self, input_fn=None, batch_size=None, steps=None):
- """Predict total sum of distances to nearest clusters.
-
- Note that this function is different from the corresponding one in sklearn
- which returns the negative of the sum of distances.
+ """Predict total log-likelihood.
Args:
input_fn: see predict.
steps: see predict.
Returns:
- Total sum of distances to nearest clusters.
+ Total log-likelihood.
"""
results = self.evaluate(input_fn=input_fn, batch_size=batch_size,
steps=steps)
- return np.sum(results[GMM.SCORES])
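+ # Aggregate the evaluated per-example scores with a log-sum-exp,
+ # i.e. log(sum_i exp(score_i)).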
+ return np.log(np.sum(np.exp(results[GMM.SCORES])))
def weights(self):
"""Returns the cluster weights."""
def _model_fn(features, labels, mode, config):
"""Model function."""
assert labels is None, labels
- (all_scores,
+ (loss,
+ scores,
model_predictions,
- losses, training_op,
+ training_op,
init_op,
is_initialized) = gmm_ops.gmm(self._parse_tensor_or_dict(features),
self._training_initial_clusters,
self._covariance_type,
self._params)
incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
- loss = math_ops.reduce_sum(losses)
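+ # 'loss' is the total log-likelihood from gmm_ops.gmm(); the control
+ # dependencies make sure the training op and the global-step increment
+ # run before it is returned.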
training_op = with_dependencies([training_op, incr_step], loss)
training_hooks = [_InitializeClustersHook(
init_op, is_initialized, config.is_chief)]
predictions = {
- GMM.ALL_SCORES: all_scores[0],
GMM.ASSIGNMENTS: model_predictions[0][0],
}
eval_metric_ops = {
- GMM.SCORES: _streaming_sum(loss),
+ GMM.SCORES: scores,
+ GMM.LOG_LIKELIHOOD: _streaming_sum(loss),
}
return model_fn_lib.ModelFnOps(mode=mode, predictions=predictions,
eval_metric_ops=eval_metric_ops,
from __future__ import print_function
import numpy as np
-from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.ops.embedding_ops import embedding_lookup
-from tensorflow.python.summary import summary
# Machine epsilon.
MEPS = np.finfo(float).eps
return ret
def scores(self):
- """Returns the distances to each class.
+ """Returns the per-sample likelihood fo the data.
Returns:
- A tuple with two Tensors. The first contains the distance to
- each class. The second contains the distance to the assigned
- class.
+ Log probabilities of each data point.
"""
- return (self._all_scores, self._scores)
+ return self._scores
+
+ def log_likelihood_op(self):
+ """Returns the log-likelihood operation."""
+ return self._log_likelihood_op
def _define_graph(self, data):
"""Define graph for a single iteration.
self._define_expectation_operation(shard_id)
self._define_partial_maximization_operation(shard_id, shard)
self._define_maximization_operation(len(data))
- self._define_distance_to_clusters(data)
+ self._define_loglikelihood_operation()
+ self._define_score_samples()
def _define_full_covariance_probs(self, shard_id, shard):
"""Defines the full covariance probabilties per example in a class.
state_ops.assign(
self._covs, new_covs, validate_shape=False))
- def _define_distance_to_clusters(self, data):
- """Defines the Mahalanobis distance to the assigned Gaussian."""
- # TODO(xavigonzalvo): reuse (input - mean) * cov^-1 * (input -
- # mean) from log probability function.
- self._all_scores = []
- for shard in data:
- all_scores = []
- shard = array_ops.expand_dims(shard, 0)
- for c in xrange(self._num_classes):
- if self._covariance_type == FULL_COVARIANCE:
- cov = self._covs[c, :, :]
- elif self._covariance_type == DIAG_COVARIANCE:
- cov = array_ops.diag(self._covs[c, :])
- inverse = linalg_ops.matrix_inverse(cov + self._min_var)
- inv_cov = array_ops.tile(
- array_ops.expand_dims(inverse, 0),
- array_ops.stack([self._num_examples, 1, 1]))
- diff = array_ops.transpose(shard - self._means[c, :, :], perm=[1, 0, 2])
- m_left = math_ops.matmul(diff, inv_cov)
- all_scores.append(
- math_ops.sqrt(
- math_ops.matmul(
- m_left, array_ops.transpose(
- diff, perm=[0, 2, 1]))))
- self._all_scores.append(
- array_ops.reshape(
- array_ops.concat(all_scores, 1),
- array_ops.stack([self._num_examples, self._num_classes])))
-
- # Distance to the associated class.
- self._all_scores = array_ops.concat(self._all_scores, 0)
- assignments = array_ops.concat(self.assignments(), 0)
- rows = math_ops.to_int64(math_ops.range(0, self._num_examples))
- indices = array_ops.concat(
- [array_ops.expand_dims(rows, 1), array_ops.expand_dims(assignments, 1)],
- 1)
- self._scores = array_ops.gather_nd(self._all_scores, indices)
-
def _define_loglikelihood_operation(self):
"""Defines the total log-likelihood of current iteration."""
- self._ll_op = []
+ op = []
for prior_probs in self._prior_probs:
- self._ll_op.append(math_ops.reduce_sum(math_ops.log(prior_probs)))
- summary.scalar('ll', math_ops.reduce_sum(self._ll_op))
+ op.append(math_ops.reduce_logsumexp(prior_probs))
+ self._log_likelihood_op = math_ops.reduce_logsumexp(op)
+
+ def _define_score_samples(self):
+ """Defines the likelihood of each data sample."""
+ op = []
+ for shard_id, prior_probs in enumerate(self._prior_probs):
+ op.append(prior_probs + math_ops.log(self._w[shard_id]))
+ self._scores = array_ops.squeeze(
+ math_ops.reduce_logsumexp(op, axis=2, keep_dims=True), axis=0)
def gmm(inp,
Returns:
Note: a tuple of lists is returned to be consistent with skflow.
A tuple consisting of:
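+ loss: A scalar Tensor with the total log-likelihood of the input data.
+ scores: A matrix (or list of matrices) with the per-example log-likelihoods.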
- all_scores: A matrix (or list of matrices) of dimensions (num_input,
- num_clusters) where the value is the distance of an input vector and a
- cluster center.
assignments: A vector (or list of vectors). Each element in the vector
corresponds to an input row in 'inp' and specifies the cluster id
corresponding to the input.
- scores: Similar to assignments but specifies the distance to the
- assigned cluster instead.
training_op: an op that runs an iteration of training.
init_op: an op that runs the initialization.
"""
gmm_tool = GmmAlgorithm(inp, num_clusters, initial_means, params,
covariance_type, random_seed)
assignments = gmm_tool.assignments()
- all_scores, scores = gmm_tool.scores()
- return ([all_scores], [assignments], [scores], gmm_tool.training_ops(),
+ scores = gmm_tool.scores()
+ loss = gmm_tool.log_likelihood_op()
+ return (loss, scores, [assignments], gmm_tool.training_ops(),
gmm_tool.init_ops(), gmm_tool.is_initialized())
g.seed = 5
with self.test_session() as sess:
data = constant_op.constant(self.data, dtype=dtypes.float32)
- _, assignments, _, training_op, init_op, _ = gmm_ops.gmm(
+ loss_op, scores, assignments, training_op, init_op, _ = gmm_ops.gmm(
data, 'random', num_classes, random_seed=self.seed)
variables.global_variables_initializer().run()
sess.run(init_op)
+ first_loss = sess.run(loss_op)
for _ in xrange(self.iterations):
sess.run(training_op)
assignments = sess.run(assignments)
+ end_loss = sess.run(loss_op)
+ scores = sess.run(scores)
+ self.assertEqual((self.num_examples, 1), scores.shape)
accuracy = np.mean(
np.asarray(self.true_assignments) == np.squeeze(assignments))
logging.info('Accuracy: %f', accuracy)
+ logging.info('First loss: %f, end loss: %f', first_loss, end_loss)
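+ # The 'loss' op is the total log-likelihood, so training (EM) should
+ # increase it.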
+ self.assertGreater(end_loss, first_loss)
self.assertGreater(accuracy, 0.98)
def testParams(self):
from __future__ import print_function
import numpy as np
-from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.contrib.factorization.python.ops import gmm as gmm_lib
from tensorflow.contrib.learn.python.learn.estimators import kmeans
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import random_ops
-from tensorflow.python.platform import flags
from tensorflow.python.platform import test
from tensorflow.python.training import queue_runner
-FLAGS = flags.FLAGS
-
class GMMTest(test.TestCase):
self.batch_size = self.num_points
self.true_centers = self.make_random_centers(self.num_centers,
self.num_dims)
- self.points, self.assignments, self.scores = self.make_random_points(
+ self.points, self.assignments = self.make_random_points(
self.true_centers, self.num_points)
- self.true_score = np.add.reduce(self.scores)
# Use initial means from kmeans (just like scikit-learn does).
clusterer = kmeans.KMeansClustering(num_clusters=self.num_centers)
offsets = np.round(
np.random.randn(num_points, num_dims).astype(np.float32) * 20)
points = centers[assignments] + offsets
- means = [
- np.mean(
- points[assignments == center], axis=0)
- for center in xrange(num_centers)
- ]
- covs = [
- np.cov(points[assignments == center].T)
- for center in xrange(num_centers)
- ]
- scores = []
- for r in xrange(num_points):
- scores.append(
- np.sqrt(
- np.dot(
- np.dot(points[r, :] - means[assignments[r]],
- np.linalg.inv(covs[assignments[r]])), points[r, :] -
- means[assignments[r]])))
- return (points, assignments, scores)
+ return (points, assignments)
def test_weights(self):
"""Tests the shape of the weights."""
gmm.fit(input_fn=self.input_fn(), steps=10)
score2 = gmm.score(input_fn=self.input_fn(batch_size=self.num_points),
steps=1)
- self.assertGreater(score1, score2)
- self.assertNear(self.true_score, score2, self.true_score * 0.15)
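+ # score() now reports a log-likelihood-style quantity that should increase
+ # as the model is fit, hence assertLess rather than assertGreater.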
+ self.assertLess(score1, score2)
def test_infer(self):
gmm = gmm_lib.GMM(self.num_centers,
# Make a small test set
num_points = 40
- points, true_assignments, true_offsets = (
- self.make_random_points(clusters, num_points))
+ points, true_assignments = self.make_random_points(clusters, num_points)
assignments = []
for item in gmm.predict_assignments(
assignments = np.ravel(assignments)
self.assertAllEqual(true_assignments, assignments)
- # Test score
- score = gmm.score(input_fn=self.input_fn(points=points,
- batch_size=num_points), steps=1)
- self.assertNear(score, np.sum(true_offsets), 4.05)
-
def _compare_with_sklearn(self, cov_type):
# sklearn version.
iterations = 40