2 # Copyright (c) 2012 The Native Client Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Memoize the data produced by slow operations into Google storage.
8 Caches computations described in terms of command lines and inputs directories
9 or files, which yield a set of output files.
21 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
22 import pynacl.directory_storage
23 import pynacl.file_tools
24 import pynacl.gsd_storage
25 import pynacl.hashing_tools
26 import pynacl.log_tools
27 import pynacl.working_directory
# Pairs a stored directory item with the URL of its build log (may be None).
CloudStorageItem = collections.namedtuple('CloudStorageItem',
                                          ['dir_item', 'log_url'])
class UserError(Exception):
  # NOTE(review): the class body is elided in this excerpt; the original
  # presumably defines no members beyond the Exception base — confirm.
  pass
class HumanReadableSignature(object):
  """Accumulator of signature information in human readable form.

  A replacement for hashlib that collects the inputs for later display.
  """

  def __init__(self):
    # Ordered list of raw items fed to update().
    # NOTE(review): __init__ is elided in the excerpt; reconstructed from
    # update()'s use of self._items — confirm against the original file.
    self._items = []

  def update(self, data):
    """Add an item to the signature."""
    # Drop paranoid nulls for human readable output.
    data = data.replace('\0', '')
    self._items.append(data)

  def hexdigest(self):
    """Fake version of hexdigest that returns the inputs."""
    # NOTE(review): the excerpt ends mid-expression after the final '\n';
    # the trailing banner below is reconstructed — confirm.
    return ('*' * 30 + ' PACKAGE SIGNATURE ' + '*' * 30 + '\n' +
            '\n'.join(self._items) + '\n' +
            '*' * 79)
63 """Class to memoize slow operations."""
def __init__(self, storage, use_cached_results=True, cache_results=True,
             print_url=None, system_summary=None, extra_paths=None):
  """Constructor.

  Args:
    storage: A storage layer to read/write from (GSDStorage).
    use_cached_results: Flag indicating that cached computation results
      should be used when possible.
    cache_results: Flag that indicates if successful computations should be
      written to the cache.
    print_url: Function that accepts a CloudStorageItem for printing URL
      results, or None if no printing is needed.
    system_summary: Optional precomputed machine summary string, or None to
      compute it lazily (see SystemSummary).
    extra_paths: Extra substitution paths that can be used by commands, or
      None for none.  (BUG FIX: was a mutable default `{}` shared across
      instances; now None-sentinel with a per-instance fresh dict.)
  """
  self._storage = storage
  self._directory_storage = pynacl.directory_storage.DirectoryStorageAdapter(
      storage)
  self._use_cached_results = use_cached_results
  self._cache_results = cache_results
  # package name -> CloudStorageItem, filled in by _ProcessCloudItem.
  self._cached_cloud_items = {}
  self._print_url = print_url
  self._system_summary = system_summary
  # input path -> stable hash, used to detect output dirs reused as inputs.
  self._path_hash_cache = {}
  self._extra_paths = extra_paths if extra_paths is not None else {}
def KeyForOutput(self, package, output_hash):
  """Compute the key to store a given output in the data-store.

  Args:
    package: Package name.
    output_hash: Stable hash of the package output.
  Returns:
    Key that this instance of the package output should be stored/retrieved.
  """
  return 'object/%s_%s.tgz' % (package, output_hash)
def KeyForBuildSignature(self, build_signature):
  """Compute the key to store a computation result in the data-store.

  Args:
    build_signature: Stable hash of the computation.
  Returns:
    Key that this instance of the computation result should be
    stored/retrieved.
  """
  return 'computed/%s.txt' % build_signature
def KeyForLog(self, package, output_hash):
  """Compute the key to store a given log file in the data-store.

  Args:
    package: Package name.
    output_hash: Stable hash of the package output.
  Returns:
    Key that this instance of the package log should be stored/retrieved.
  """
  return 'log/%s_%s.log' % (package, output_hash)
def GetLogFile(self, work_dir, package):
  """Returns the local log file for a given package.

  Args:
    work_dir: The work directory for the package.
    package: The package name.
  Returns:
    Path to the local log file within the work directory.
  """
  return os.path.join(work_dir, '%s.log' % package)
def WriteOutputFromHash(self, work_dir, package, out_hash, output):
  """Write output from the cache.

  Args:
    work_dir: Working directory path.
    package: Package name (for tgz name).
    out_hash: Hash of desired output.
    output: Output path to extract the cached directory into.
  Returns:
    CloudStorageItem on success, None if not.
  """
  key = self.KeyForOutput(package, out_hash)
  dir_item = self._directory_storage.GetDirectory(key, output)
  # NOTE(review): the two early-return guards below are reconstructed; the
  # excerpt elides the `if`/`return None` lines around each log message.
  if not dir_item:
    logging.debug('Failed to retrieve %s' % key)
    return None
  if pynacl.hashing_tools.StableHashPath(output) != out_hash:
    logging.warning('Object does not match expected hash, '
                    'has hashing method changed?')
    return None

  # Fetch the matching build log next to the extracted output.
  log_key = self.KeyForLog(package, out_hash)
  log_file = self.GetLogFile(work_dir, package)
  pynacl.file_tools.RemoveFile(log_file)
  log_url = self._storage.GetFile(log_key, log_file)

  return CloudStorageItem(dir_item, log_url)
163 def _ProcessCloudItem(self, package, cloud_item):
164 """Processes cached directory storage items.
167 package: Package name for the cached directory item.
168 cloud_item: CloudStorageItem representing a memoized item in the cloud.
170 # Store the cached URL as a tuple for book keeping.
171 self._cached_cloud_items[package] = cloud_item
173 # If a print URL function has been specified, print the URL now.
174 if self._print_url is not None:
175 self._print_url(cloud_item)
def WriteResultToCache(self, work_dir, package, build_signature, output):
  """Cache a computed result by key.

  Also prints URLs when appropriate.

  Args:
    work_dir: work directory for the package builder.
    package: Package name (for tgz name).
    build_signature: The input hash of the computation.
    output: A path containing the output of the computation.
  """
  if not self._cache_results:
    return
  out_hash = pynacl.hashing_tools.StableHashPath(output)
  # NOTE(review): several control-flow lines in this body (try:, with, the
  # dir_item is None / else branches) are elided in the excerpt and have been
  # reconstructed — confirm against the original file.
  try:
    output_key = self.KeyForOutput(package, out_hash)
    # Try to get an existing copy in a temporary directory.
    wd = pynacl.working_directory.TemporaryWorkingDirectory()
    with wd as temp_dir:
      temp_output = os.path.join(temp_dir, 'out')
      dir_item = self._directory_storage.GetDirectory(output_key, temp_output)

      log_key = self.KeyForLog(package, out_hash)
      log_file = self.GetLogFile(work_dir, package)
      log_url = None

      if dir_item is None:
        # Isn't present. Cache the computed result instead.
        dir_item = self._directory_storage.PutDirectory(output, output_key)
        if os.path.isfile(log_file):
          log_url = self._storage.PutFile(log_file, log_key)
        logging.info('Computed fresh result and cached it.')
      else:
        # Cached version is present. Replace the current output with that.
        if self._use_cached_results:
          pynacl.file_tools.RemoveDirectoryIfPresent(output)
          shutil.move(temp_output, output)
          pynacl.file_tools.RemoveFile(log_file)
          log_url = self._storage.GetFile(log_key, log_file)
          logging.info('Recomputed result matches cached value, '
                       'using cached value instead.')
        else:
          log_key_exists = self._storage.Exists(log_key)
          if log_key_exists:
            log_url = log_key_exists

    # Upload an entry mapping from computation input to output hash.
    self._storage.PutData(
        out_hash, self.KeyForBuildSignature(build_signature))

    cloud_item = CloudStorageItem(dir_item, log_url)
    self._ProcessCloudItem(package, cloud_item)
  except pynacl.gsd_storage.GSDStorageError:
    # Best-effort: a failed upload must not fail the build.
    logging.info('Failed to cache result.')
def ReadMemoizedResultFromCache(self, work_dir, package,
                                build_signature, output):
  """Read a cached result (if it exists) from the cache.

  Also prints URLs when appropriate.

  Args:
    work_dir: Working directory for the build.
    package: Package name (for tgz name).
    build_signature: Build signature of the computation.
    output: Output path to extract the cached result into.
  Returns:
    Boolean indicating successful retrieval.
  """
  # Check if it's in the cache.
  if self._use_cached_results:
    out_hash = self._storage.GetData(
        self.KeyForBuildSignature(build_signature))
    if out_hash is not None:
      cloud_item = self.WriteOutputFromHash(work_dir, package,
                                            out_hash, output)
      if cloud_item is not None:
        logging.info('Retrieved cached result.')
        self._ProcessCloudItem(package, cloud_item)
        # NOTE(review): the True/False returns are elided in the excerpt and
        # reconstructed from the documented boolean contract.
        return True
  return False
def GetCachedCloudItems(self):
  """Returns the complete list of all cached cloud items for this run."""
  # list() so the documented "list" contract holds on Python 3 too, where
  # dict.values() returns a live view rather than a list.
  return list(self._cached_cloud_items.values())
def GetCachedCloudItemForPackage(self, package):
  """Returns cached cloud item for package or None if not processed."""
  # dict.get already defaults to None; the explicit sentinel was redundant.
  return self._cached_cloud_items.get(package)
def Run(self, package, inputs, output, commands, cmd_options=None,
        working_dir=None, memoize=True, signature_file=None, subdir=None):
  """Run an operation once, possibly hitting cache.

  Args:
    package: Name of the computation/module.
    inputs: A dict of names mapped to files that are inputs.
    output: An output directory.
    commands: A list of command.Command objects to run.
    cmd_options: Options passed to each command's CheckRunCond, or None.
    working_dir: Working directory to use, or None for a temp dir.
    memoize: Boolean indicating the result should be memoized.
    signature_file: File to write human readable build signatures to or None.
    subdir: If not None, use this directory instead of the output dir as the
      substituter's output path. Must be a subdirectory of output.
  """
  # NOTE(review): several elided lines in this body (the else branch, the
  # signature_file guard, the cache-hit return, and the UserError raise)
  # are reconstructed — confirm against the original file.
  if working_dir is None:
    wdm = pynacl.working_directory.TemporaryWorkingDirectory()
  else:
    wdm = pynacl.working_directory.FixedWorkingDirectory(working_dir)

  pynacl.file_tools.MakeDirectoryIfAbsent(output)

  nonpath_subst = {'package': package}

  with wdm as work_dir:
    # Compute the build signature with modified inputs.
    build_signature = self.BuildSignature(
        package, inputs=inputs, commands=commands)
    # Optionally write human readable version of signature.
    if signature_file:
      signature_file.write(self.BuildSignature(
          package, inputs=inputs, commands=commands,
          hasher=HumanReadableSignature()))
      signature_file.flush()

    # We're done if it's in the cache.
    if (memoize and self.ReadMemoizedResultFromCache(work_dir, package,
                                                     build_signature,
                                                     output)):
      return

    if subdir:
      # The substituter's output dir must live inside the declared output.
      assert subdir.startswith(output)

    # Filter out commands that have a run condition of False.
    # This must be done before any commands are invoked in case the run
    # conditions rely on any pre-existing states.
    # (The loop variable is named cmd to avoid shadowing the imported
    # `command` module.)
    commands = [cmd for cmd in commands if cmd.CheckRunCond(cmd_options)]

    # Create a logger that will save the log for each command.
    # This logger will process any messages and then pass the results
    # up to the base logger.
    base_logger = pynacl.log_tools.GetConsoleLogger()
    cmd_logger = base_logger.getChild('OnceCmdLogger')
    cmd_logger.setLevel(logging.DEBUG)

    log_file = self.GetLogFile(work_dir, package)
    file_log_handler = logging.FileHandler(log_file, 'wb')
    file_log_handler.setLevel(logging.DEBUG)
    file_log_handler.setFormatter(
        logging.Formatter(fmt='[%(levelname)s - %(asctime)s] %(message)s'))
    cmd_logger.addHandler(file_log_handler)

    # Log some helpful information; propagation is disabled around these
    # lines so they reach only the file handler, not the console.
    cmd_logger.propagate = False
    cmd_logger.debug('Hostname: %s', platform.node())
    cmd_logger.debug('Machine: %s', platform.machine())
    cmd_logger.debug('Platform: %s', sys.platform)
    cmd_logger.propagate = True

    for cmd in commands:
      paths = inputs.copy()
      paths.update(self._extra_paths)
      paths['output'] = subdir if subdir else output
      nonpath_subst['build_signature'] = build_signature
      subst = substituter.Substituter(work_dir, paths, nonpath_subst)
      cmd.Invoke(cmd_logger, subst)

    # Uninstall the file log handler
    cmd_logger.removeHandler(file_log_handler)
    file_log_handler.close()

    # Confirm that we aren't hitting something we've cached.
    for path in self._path_hash_cache:
      if not os.path.relpath(output, path).startswith(os.pardir + os.sep):
        raise UserError(
            'Package %s outputs to a directory already used as an input: %s' %
            (package, path))

    if memoize:
      self.WriteResultToCache(work_dir, package, build_signature, output)
def SystemSummary(self):
  """Gather a string describing intrinsic properties of the current machine.

  Ideally this would capture anything relevant about the current machine that
  would cause build output to vary (other than build recipe + inputs).

  Returns:
    Stringified summary; computed once and cached on the instance.
  """
  if self._system_summary is not None:
    return self._system_summary

  # Note there is no attempt to canonicalize these values. If two
  # machines that would in fact produce identical builds differ in
  # these values, it just means that a superfluous build will be
  # done once to get the mapping from new input hash to preexisting
  # output hash into the cache.
  # BUG FIX: the original wrote `assert A, B` which treats B as the assert
  # *message*, so the machine() check was never enforced; join with `and`.
  assert len(sys.platform) != 0 and len(platform.machine()) != 0
  # Use environment from command so we can access MinGW on windows.
  env = command.PlatformEnvironment([])

  def GetCompilerVersion(compiler_name):
    # Record the `-v` output of the compiler, best-effort.
    try:
      compiler_file = pynacl.file_tools.Which(
          compiler_name, paths=env['PATH'].split(os.pathsep))
      p = subprocess.Popen([compiler_file, '-v'], stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE, env=env)
      _, compiler_version = p.communicate()
      assert p.returncode == 0
    except pynacl.file_tools.ExecutableNotFound:
      # NOTE(review): this fallback value is elided in the excerpt and
      # reconstructed — confirm against the original file.
      compiler_version = 'not_found'
    return compiler_version

  items = [
      ('platform', sys.platform),
      ('machine', platform.machine()),
      ('gcc-v', GetCompilerVersion('gcc')),
      ('arm-gcc-v', GetCompilerVersion('arm-linux-gnueabihf-gcc')),
  ]
  self._system_summary = str(items)
  return self._system_summary
402 def BuildSignature(self, package, inputs, commands, hasher=None):
403 """Compute a total checksum for a computation.
405 The computed hash includes system properties, inputs, and the commands run.
407 package: The name of the package computed.
408 inputs: A dict of names -> files/directories to be included in the
410 commands: A list of command.Command objects describing the commands run
411 for this computation.
412 hasher: Optional hasher to use.
414 A hex formatted sha1 to use as a computation key or a human readable
422 h.update('package:' + package)
423 h.update('summary:' + self.SystemSummary())
424 for command in commands:
426 h.update(str(command))
427 for key in sorted(inputs.keys()):
428 h.update('item_name:' + key + '\x00')
429 if inputs[key] in self._path_hash_cache:
430 path_hash = self._path_hash_cache[inputs[key]]
432 path_hash = 'item:' + pynacl.hashing_tools.StableHashPath(inputs[key])
433 self._path_hash_cache[inputs[key]] = path_hash