2 # Copyright (c) 2012 The Native Client Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Memoize the data produced by slow operations into Google storage.
8 Caches computations described in terms of command lines and inputs directories
9 or files, which yield a set of output files.
21 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
22 import pynacl.directory_storage
23 import pynacl.file_tools
24 import pynacl.gsd_storage
25 import pynacl.hashing_tools
26 import pynacl.log_tools
27 import pynacl.working_directory
# Pairs a stored directory item with the URL of its build log (may be None).
CloudStorageItem = collections.namedtuple('CloudStorageItem',
                                          ['dir_item', 'log_url'])
class UserError(Exception):
  # NOTE(review): the class body is elided in this excerpt; the original
  # presumably defines no members beyond the Exception base — confirm.
  pass
class HumanReadableSignature(object):
  """Accumulator of signature information in human readable form.

  A replacement for hashlib that collects the inputs for later display.
  """

  def __init__(self):
    # Ordered list of raw items fed to update().
    # NOTE(review): __init__ is elided in the excerpt; reconstructed from
    # update()'s use of self._items — confirm against the original file.
    self._items = []

  def update(self, data):
    """Add an item to the signature."""
    # Drop paranoid nulls for human readable output.
    data = data.replace('\0', '')
    self._items.append(data)

  def hexdigest(self):
    """Fake version of hexdigest that returns the inputs."""
    # NOTE(review): the excerpt ends mid-expression after the final '\n';
    # the trailing banner below is reconstructed — confirm.
    return ('*' * 30 + ' PACKAGE SIGNATURE ' + '*' * 30 + '\n' +
            '\n'.join(self._items) + '\n' +
            '*' * 79)
63 """Class to memoize slow operations."""
def __init__(self, storage, use_cached_results=True, cache_results=True,
             print_url=None, system_summary=None, extra_paths=None):
  """Constructor.

  Args:
    storage: A storage layer to read/write from (GSDStorage).
    use_cached_results: Flag indicating that cached computation results
      should be used when possible.
    cache_results: Flag that indicates if successful computations should be
      written to the cache.
    print_url: Function that accepts a CloudStorageItem for printing URL
      results, or None if no printing is needed.
    system_summary: Optional precomputed machine summary string, or None to
      compute it lazily (see SystemSummary).
    extra_paths: Extra substitution paths that can be used by commands, or
      None for none.  (BUG FIX: was a mutable default `{}` shared across
      instances; now None-sentinel with a per-instance fresh dict.)
  """
  self._storage = storage
  self._directory_storage = pynacl.directory_storage.DirectoryStorageAdapter(
      storage)
  self._use_cached_results = use_cached_results
  self._cache_results = cache_results
  # package name -> CloudStorageItem, filled in by _ProcessCloudItem.
  self._cached_cloud_items = {}
  self._print_url = print_url
  self._system_summary = system_summary
  # input path -> stable hash, used to detect output dirs reused as inputs.
  self._path_hash_cache = {}
  self._extra_paths = extra_paths if extra_paths is not None else {}
def KeyForOutput(self, package, output_hash):
  """Compute the key to store a given output in the data-store.

  Args:
    package: Package name.
    output_hash: Stable hash of the package output.
  Returns:
    Key that this instance of the package output should be stored/retrieved.
  """
  return 'object/%s_%s.tgz' % (package, output_hash)
def KeyForBuildSignature(self, build_signature):
  """Compute the key to store a computation result in the data-store.

  Args:
    build_signature: Stable hash of the computation.
  Returns:
    Key that this instance of the computation result should be
    stored/retrieved.
  """
  return 'computed/%s.txt' % build_signature
def KeyForLog(self, package, output_hash):
  """Compute the key to store a given log file in the data-store.

  Args:
    package: Package name.
    output_hash: Stable hash of the package output.
  Returns:
    Key that this instance of the package log should be stored/retrieved.
  """
  return 'log/%s_%s.log' % (package, output_hash)
def GetLogFile(self, work_dir, package):
  """Returns the local log file for a given package.

  Args:
    work_dir: The work directory for the package.
    package: The package name.
  Returns:
    Path to the local log file within the work directory.
  """
  return os.path.join(work_dir, '%s.log' % package)
def WriteOutputFromHash(self, work_dir, package, out_hash, output):
  """Write output from the cache.

  Args:
    work_dir: Working directory path.
    package: Package name (for tgz name).
    out_hash: Hash of desired output.
    output: Output path to extract the cached directory into.
  Returns:
    CloudStorageItem on success, None if not.
  """
  key = self.KeyForOutput(package, out_hash)
  dir_item = self._directory_storage.GetDirectory(key, output)
  # NOTE(review): the two early-return guards below are reconstructed; the
  # excerpt elides the `if`/`return None` lines around each log message.
  if not dir_item:
    logging.debug('Failed to retrieve %s' % key)
    return None
  if pynacl.hashing_tools.StableHashPath(output) != out_hash:
    logging.warning('Object does not match expected hash, '
                    'has hashing method changed?')
    return None

  # Fetch the matching build log next to the extracted output.
  log_key = self.KeyForLog(package, out_hash)
  log_file = self.GetLogFile(work_dir, package)
  pynacl.file_tools.RemoveFile(log_file)
  log_url = self._storage.GetFile(log_key, log_file)

  return CloudStorageItem(dir_item, log_url)
163 def _ProcessCloudItem(self, package, cloud_item):
164 """Processes cached directory storage items.
167 package: Package name for the cached directory item.
168 cloud_item: CloudStorageItem representing a memoized item in the cloud.
170 # Store the cached URL as a tuple for book keeping.
171 self._cached_cloud_items[package] = cloud_item
173 # If a print URL function has been specified, print the URL now.
174 if self._print_url is not None:
175 self._print_url(cloud_item)
def WriteResultToCache(self, work_dir, package, build_signature, output):
  """Cache a computed result by key.

  Also prints URLs when appropriate.

  Args:
    work_dir: work directory for the package builder.
    package: Package name (for tgz name).
    build_signature: The input hash of the computation.
    output: A path containing the output of the computation.
  """
  if not self._cache_results:
    return
  out_hash = pynacl.hashing_tools.StableHashPath(output)
  # NOTE(review): several control-flow lines in this body (try:, with, the
  # dir_item is None / else branches) are elided in the excerpt and have been
  # reconstructed — confirm against the original file.
  try:
    output_key = self.KeyForOutput(package, out_hash)
    # Try to get an existing copy in a temporary directory.
    wd = pynacl.working_directory.TemporaryWorkingDirectory()
    with wd as temp_dir:
      temp_output = os.path.join(temp_dir, 'out')
      dir_item = self._directory_storage.GetDirectory(output_key, temp_output)

      log_key = self.KeyForLog(package, out_hash)
      log_file = self.GetLogFile(work_dir, package)
      log_url = None

      if dir_item is None:
        # Isn't present. Cache the computed result instead.
        dir_item = self._directory_storage.PutDirectory(output, output_key)
        if os.path.isfile(log_file):
          log_url = self._storage.PutFile(log_file, log_key)
        logging.info('Computed fresh result and cached it.')
      else:
        # Cached version is present. Replace the current output with that.
        if self._use_cached_results:
          pynacl.file_tools.RemoveDirectoryIfPresent(output)
          shutil.move(temp_output, output)
          pynacl.file_tools.RemoveFile(log_file)
          log_url = self._storage.GetFile(log_key, log_file)
          logging.info('Recomputed result matches cached value, '
                       'using cached value instead.')
        else:
          log_key_exists = self._storage.Exists(log_key)
          if log_key_exists:
            log_url = log_key_exists

    # Upload an entry mapping from computation input to output hash.
    self._storage.PutData(
        out_hash, self.KeyForBuildSignature(build_signature))

    cloud_item = CloudStorageItem(dir_item, log_url)
    self._ProcessCloudItem(package, cloud_item)
  except pynacl.gsd_storage.GSDStorageError:
    # Best-effort: a failed upload must not fail the build.
    logging.info('Failed to cache result.')
def ReadMemoizedResultFromCache(self, work_dir, package,
                                build_signature, output):
  """Read a cached result (if it exists) from the cache.

  Also prints URLs when appropriate.

  Args:
    work_dir: Working directory for the build.
    package: Package name (for tgz name).
    build_signature: Build signature of the computation.
    output: Output path to extract the cached result into.
  Returns:
    Boolean indicating successful retrieval.
  """
  # Check if it's in the cache.
  if self._use_cached_results:
    out_hash = self._storage.GetData(
        self.KeyForBuildSignature(build_signature))
    if out_hash is not None:
      cloud_item = self.WriteOutputFromHash(work_dir, package,
                                            out_hash, output)
      if cloud_item is not None:
        logging.info('Retrieved cached result.')
        self._ProcessCloudItem(package, cloud_item)
        # NOTE(review): the True/False returns are elided in the excerpt and
        # reconstructed from the documented boolean contract.
        return True
  return False
def GetCachedCloudItems(self):
  """Returns the complete list of all cached cloud items for this run."""
  # list() so the documented "list" contract holds on Python 3 too, where
  # dict.values() returns a live view rather than a list.
  return list(self._cached_cloud_items.values())
def GetCachedCloudItemForPackage(self, package):
  """Returns cached cloud item for package or None if not processed."""
  # dict.get already defaults to None; the explicit sentinel was redundant.
  return self._cached_cloud_items.get(package)
def Run(self, package, inputs, output, commands, cmd_options=None,
        working_dir=None, memoize=True, signature_file=None, subdir=None):
  """Run an operation once, possibly hitting cache.

  Args:
    package: Name of the computation/module.
    inputs: A dict of names mapped to files that are inputs.
    output: An output directory.
    commands: A list of command.Command objects to run.
    cmd_options: Options passed to each command's CheckRunCond, or None.
    working_dir: Working directory to use, or None for a temp dir.
    memoize: Boolean indicating the result should be memoized.
    signature_file: File to write human readable build signatures to or None.
    subdir: If not None, use this directory instead of the output dir as the
      substituter's output path. Must be a subdirectory of output.
  """
  # NOTE(review): several elided lines in this body (the else branch, the
  # signature_file guard, the cache-hit return, and the UserError raise)
  # are reconstructed — confirm against the original file.
  if working_dir is None:
    wdm = pynacl.working_directory.TemporaryWorkingDirectory()
  else:
    wdm = pynacl.working_directory.FixedWorkingDirectory(working_dir)

  pynacl.file_tools.MakeDirectoryIfAbsent(output)

  nonpath_subst = {'package': package}

  with wdm as work_dir:
    # Compute the build signature with modified inputs.
    build_signature = self.BuildSignature(
        package, inputs=inputs, commands=commands)
    # Optionally write human readable version of signature.
    if signature_file:
      signature_file.write(self.BuildSignature(
          package, inputs=inputs, commands=commands,
          hasher=HumanReadableSignature()))
      signature_file.flush()

    # We're done if it's in the cache.
    if (memoize and self.ReadMemoizedResultFromCache(work_dir, package,
                                                     build_signature,
                                                     output)):
      return

    if subdir:
      # The substituter's output dir must live inside the declared output.
      assert subdir.startswith(output)

    # Filter out commands that have a run condition of False.
    # This must be done before any commands are invoked in case the run
    # conditions rely on any pre-existing states.
    # (The loop variable is named cmd to avoid shadowing the imported
    # `command` module.)
    commands = [cmd for cmd in commands if cmd.CheckRunCond(cmd_options)]

    # Create a logger that will save the log for each command.
    # This logger will process any messages and then pass the results
    # up to the base logger.
    base_logger = pynacl.log_tools.GetConsoleLogger()
    cmd_logger = base_logger.getChild('OnceCmdLogger')
    cmd_logger.setLevel(logging.DEBUG)

    log_file = self.GetLogFile(work_dir, package)
    file_log_handler = logging.FileHandler(log_file, 'wb')
    file_log_handler.setLevel(logging.DEBUG)
    file_log_handler.setFormatter(
        logging.Formatter(fmt='[%(levelname)s - %(asctime)s] %(message)s'))
    cmd_logger.addHandler(file_log_handler)

    # Log some helpful information; propagation is disabled around these
    # lines so they reach only the file handler, not the console.
    cmd_logger.propagate = False
    cmd_logger.debug('Hostname: %s', platform.node())
    cmd_logger.debug('Machine: %s', platform.machine())
    cmd_logger.debug('Platform: %s', sys.platform)
    cmd_logger.propagate = True

    for cmd in commands:
      paths = inputs.copy()
      paths.update(self._extra_paths)
      paths['output'] = subdir if subdir else output
      nonpath_subst['build_signature'] = build_signature
      subst = substituter.Substituter(work_dir, paths, nonpath_subst)
      cmd.Invoke(cmd_logger, subst)

    # Uninstall the file log handler
    cmd_logger.removeHandler(file_log_handler)
    file_log_handler.close()

    # Confirm that we aren't hitting something we've cached.
    for path in self._path_hash_cache:
      if not os.path.relpath(output, path).startswith(os.pardir + os.sep):
        raise UserError(
            'Package %s outputs to a directory already used as an input: %s' %
            (package, path))

    if memoize:
      self.WriteResultToCache(work_dir, package, build_signature, output)
def SystemSummary(self):
  """Gather a string describing intrinsic properties of the current machine.

  Ideally this would capture anything relevant about the current machine that
  would cause build output to vary (other than build recipe + inputs).

  Returns:
    Stringified summary; computed once and cached on the instance.
  """
  if self._system_summary is not None:
    return self._system_summary

  # Note there is no attempt to canonicalize these values. If two
  # machines that would in fact produce identical builds differ in
  # these values, it just means that a superfluous build will be
  # done once to get the mapping from new input hash to preexisting
  # output hash into the cache.
  # BUG FIX: the original wrote `assert A, B` which treats B as the assert
  # *message*, so the machine() check was never enforced; join with `and`.
  assert len(sys.platform) != 0 and len(platform.machine()) != 0
  # Use environment from command so we can access MinGW on windows.
  env = command.PlatformEnvironment([])

  def GetCompilerVersion(compiler_name):
    # Record the `-v` output of the compiler, best-effort.
    try:
      compiler_file = pynacl.file_tools.Which(
          compiler_name, paths=env['PATH'].split(os.pathsep))
      p = subprocess.Popen([compiler_file, '-v'], stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE, env=env)
      _, compiler_version = p.communicate()
      assert p.returncode == 0
    except pynacl.file_tools.ExecutableNotFound:
      # NOTE(review): this fallback value is elided in the excerpt and
      # reconstructed — confirm against the original file.
      compiler_version = 'not_found'
    return compiler_version

  items = [
      ('platform', sys.platform),
      ('machine', platform.machine()),
      ('gcc-v', GetCompilerVersion('gcc')),
      ('arm-gcc-v', GetCompilerVersion('arm-linux-gnueabihf-gcc')),
  ]
  self._system_summary = str(items)
  return self._system_summary
402 def BuildSignature(self, package, inputs, commands, hasher=None):
403 """Compute a total checksum for a computation.
405 The computed hash includes system properties, inputs, and the commands run.
407 package: The name of the package computed.
408 inputs: A dict of names -> files/directories to be included in the
410 commands: A list of command.Command objects describing the commands run
411 for this computation.
412 hasher: Optional hasher to use.
414 A hex formatted sha1 to use as a computation key or a human readable
422 h.update('package:' + package)
423 h.update('summary:' + self.SystemSummary())
424 for command in commands:
426 h.update(str(command))
427 for key in sorted(inputs.keys()):
428 h.update('item_name:' + key + '\x00')
429 if inputs[key] in self._path_hash_cache:
430 path_hash = self._path_hash_cache[inputs[key]]
432 path_hash = 'item:' + pynacl.hashing_tools.StableHashPath(inputs[key])
433 self._path_hash_cache[inputs[key]] = path_hash