2 # Copyright (c) 2012 The Native Client Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Memoize the data produced by slow operations into Google storage.
8 Caches computations described in terms of command lines and input directories
9 or files, which yield a set of output files.
20 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
21 import pynacl.directory_storage
22 import pynacl.file_tools
23 import pynacl.gsd_storage
24 import pynacl.hashing_tools
25 import pynacl.working_directory
31 class UserError(Exception):
35 class HumanReadableSignature(object):
36 """Accumulator of signature information in human readable form.
38 A replacement for hashlib that collects the inputs for later display.
def update(self, data):
    """Record one piece of signature input for later display.

    Takes the place of a hashlib object's update(): the text is kept
    rather than hashed.

    Args:
      data: String to append to the collected signature items.
    """
    # NUL characters are only paranoid separators; strip them so the
    # human readable output stays clean.
    self._items.append(data.replace('\0', ''))
50 """Fake version of hexdigest that returns the inputs."""
51 return ('*' * 30 + ' PACKAGE SIGNATURE ' + '*' * 30 + '\n' +
52 '\n'.join(self._items) + '\n' +
57 """Class to memoize slow operations."""
59 def __init__(self, storage, use_cached_results=True, cache_results=True,
60 print_url=None, system_summary=None, extra_paths={}):
64 storage: A storage layer to read/write from (GSDStorage).
65 use_cached_results: Flag indicating that cached computation results
66 should be used when possible.
67 cache_results: Flag that indicates if successful computations should be
69 print_url: Function that accepts a URL for printing the build result,
71 extra_paths: Extra substitution paths that can be used by commands.
73 self._storage = storage
74 self._directory_storage = pynacl.directory_storage.DirectoryStorageAdapter(
77 self._use_cached_results = use_cached_results
78 self._cache_results = cache_results
79 self._cached_dir_items = {}
80 self._print_url = print_url
81 self._system_summary = system_summary
82 self._path_hash_cache = {}
83 self._extra_paths = extra_paths
def KeyForOutput(self, package, output_hash):
    """Compute the key used to store a given output in the data-store.

    Args:
      package: Package name.
      output_hash: Stable hash of the package output.
    Returns:
      Key under which this instance of the package output should be
      stored/retrieved, of the form 'object/<package>_<output_hash>.tgz'.
    """
    archive_name = '%s_%s.tgz' % (package, output_hash)
    return 'object/' + archive_name
def KeyForBuildSignature(self, build_signature):
    """Compute the key used to store a computation result in the data-store.

    Args:
      build_signature: Stable hash of the computation.
    Returns:
      Key under which this computation result should be stored/retrieved,
      of the form 'computed/<build_signature>.txt'.
    """
    record_name = '%s.txt' % build_signature
    return 'computed/' + record_name
107 def WriteOutputFromHash(self, package, out_hash, output):
108 """Write output from the cache.
111 package: Package name (for tgz name).
112 out_hash: Hash of desired output.
115 URL from which output was obtained if successful, or None if not.
117 key = self.KeyForOutput(package, out_hash)
118 dir_item = self._directory_storage.GetDirectory(key, output)
120 logging.debug('Failed to retrieve %s' % key)
122 if pynacl.hashing_tools.StableHashPath(output) != out_hash:
123 logging.warning('Object does not match expected hash, '
124 'has hashing method changed?')
def _ProcessCachedDir(self, package, dir_item):
    """Record a cached directory storage item and report its URL.

    Args:
      package: Package name for the cached directory item.
      dir_item: DirectoryStorageItem returned from directory_storage.
    """
    # Keep the item, keyed by package, for later book keeping.
    self._cached_dir_items[package] = dir_item

    # Only report the URL when a print function was supplied.
    report_url = self._print_url
    if report_url is None:
        return
    report_url(dir_item.url)
142 def WriteResultToCache(self, package, build_signature, output):
143 """Cache a computed result by key.
145 Also prints URLs when appropriate.
147 package: Package name (for tgz name).
148 build_signature: The input hash of the computation.
149 output: A path containing the output of the computation.
151 if not self._cache_results:
153 out_hash = pynacl.hashing_tools.StableHashPath(output)
155 output_key = self.KeyForOutput(package, out_hash)
156 # Try to get an existing copy in a temporary directory.
157 wd = pynacl.working_directory.TemporaryWorkingDirectory()
159 temp_output = os.path.join(work_dir, 'out')
160 dir_item = self._directory_storage.GetDirectory(output_key, temp_output)
162 # Isn't present. Cache the computed result instead.
163 dir_item = self._directory_storage.PutDirectory(output, output_key)
164 logging.info('Computed fresh result and cached it.')
166 # Cached version is present. Replace the current output with that.
167 if self._use_cached_results:
168 pynacl.file_tools.RemoveDirectoryIfPresent(output)
169 shutil.move(temp_output, output)
171 'Recomputed result matches cached value, '
172 'using cached value instead.')
173 # Upload an entry mapping from computation input to output hash.
174 self._storage.PutData(
175 out_hash, self.KeyForBuildSignature(build_signature))
176 self._ProcessCachedDir(package, dir_item)
177 except pynacl.gsd_storage.GSDStorageError:
178 logging.info('Failed to cache result.')
181 def ReadMemoizedResultFromCache(self, package, build_signature, output):
182 """Read a cached result (if it exists) from the cache.
184 Also prints URLs when appropriate.
186 package: Package name (for tgz name).
187 build_signature: Build signature of the computation.
190 Boolean indicating successful retrieval.
192 # Check if it's in the cache.
193 if self._use_cached_results:
194 out_hash = self._storage.GetData(
195 self.KeyForBuildSignature(build_signature))
196 if out_hash is not None:
197 dir_item = self.WriteOutputFromHash(package, out_hash, output)
198 if dir_item is not None:
199 logging.info('Retrieved cached result.')
200 self._ProcessCachedDir(package, dir_item)
def GetCachedDirItems(self):
    """Returns the complete list of all cached directory items for this run.

    Returns:
      A new list holding every directory item recorded so far. Later
      additions to the cache do not change a list already returned.
    """
    # dict.values() is a live view on Python 3; copy it so callers get the
    # list this method documents.
    return list(self._cached_dir_items.values())
def GetCachedDirItemForPackage(self, package):
    """Look up the directory item recorded for a package.

    Args:
      package: Package name to look up.
    Returns:
      The cached directory item, or None if the package was not processed.
    """
    recorded_items = self._cached_dir_items
    return recorded_items.get(package)
212 def Run(self, package, inputs, output, commands, cmd_options=None,
213 working_dir=None, memoize=True, signature_file=None, subdir=None):
214 """Run an operation once, possibly hitting cache.
217 package: Name of the computation/module.
218 inputs: A dict of names mapped to files that are inputs.
219 output: An output directory.
220 commands: A list of command.Command objects to run.
221 working_dir: Working directory to use, or None for a temp dir.
222 memoize: Boolean indicating that the result should be memoized.
223 signature_file: File to write human readable build signatures to or None.
224 subdir: If not None, use this directory instead of the output dir as the
225 substituter's output path. Must be a subdirectory of output.
227 if working_dir is None:
228 wdm = pynacl.working_directory.TemporaryWorkingDirectory()
230 wdm = pynacl.working_directory.FixedWorkingDirectory(working_dir)
232 pynacl.file_tools.MakeDirectoryIfAbsent(output)
234 nonpath_subst = { 'package': package }
236 with wdm as work_dir:
237 # Compute the build signature with modified inputs.
238 build_signature = self.BuildSignature(
239 package, inputs=inputs, commands=commands)
240 # Optionally write human readable version of signature.
242 signature_file.write(self.BuildSignature(
243 package, inputs=inputs, commands=commands,
244 hasher=HumanReadableSignature()))
245 signature_file.flush()
247 # We're done if it's in the cache.
249 self.ReadMemoizedResultFromCache(package, build_signature, output)):
253 assert subdir.startswith(output)
255 # Filter out commands that have a run condition of False.
256 # This must be done before any commands are invoked in case the run
257 # conditions rely on any pre-existing states.
258 commands = [command for command in commands
259 if command.CheckRunCond(cmd_options)]
261 for command in commands:
262 paths = inputs.copy()
263 paths.update(self._extra_paths)
264 paths['output'] = subdir if subdir else output
265 nonpath_subst['build_signature'] = build_signature
266 subst = substituter.Substituter(work_dir, paths, nonpath_subst)
267 command.Invoke(subst)
269 # Confirm that we aren't hitting something we've cached.
270 for path in self._path_hash_cache:
271 if not os.path.relpath(output, path).startswith(os.pardir + os.sep):
273 'Package %s outputs to a directory already used as an input: %s' %
277 self.WriteResultToCache(package, build_signature, output)
279 def SystemSummary(self):
280 """Gather a string describing intrinsic properties of the current machine.
282 Ideally this would capture anything relevant about the current machine that
283 would cause build output to vary (other than build recipe + inputs).
285 if self._system_summary is not None:
286 return self._system_summary
288 # Note there is no attempt to canonicalize these values. If two
289 # machines that would in fact produce identical builds differ in
290 # these values, it just means that a superfluous build will be
291 # done once to get the mapping from new input hash to preexisting
292 # output hash into the cache.
293 assert len(sys.platform) != 0, len(platform.machine()) != 0
294 # Use environment from command so we can access MinGW on windows.
295 env = command.PlatformEnvironment([])
297 def GetCompilerVersion(compiler_name):
299 compiler_file = pynacl.file_tools.Which(
300 compiler_name, paths=env['PATH'].split(os.pathsep))
301 p = subprocess.Popen([compiler_file, '-v'], stdout=subprocess.PIPE,
302 stderr=subprocess.PIPE, env=env)
303 _, compiler_version = p.communicate()
304 assert p.returncode == 0
305 except pynacl.file_tools.ExecutableNotFound:
307 return compiler_version
310 ('platform', sys.platform),
311 ('machine', platform.machine()),
312 ('gcc-v', GetCompilerVersion('gcc')),
313 ('arm-gcc-v', GetCompilerVersion('arm-linux-gnueabihf-gcc')),
315 self._system_summary = str(items)
316 return self._system_summary
318 def BuildSignature(self, package, inputs, commands, hasher=None):
319 """Compute a total checksum for a computation.
321 The computed hash includes system properties, inputs, and the commands run.
323 package: The name of the package computed.
324 inputs: A dict of names -> files/directories to be included in the
326 commands: A list of command.Command objects describing the commands run
327 for this computation.
328 hasher: Optional hasher to use.
330 A hex formatted sha1 to use as a computation key or a human readable
338 h.update('package:' + package)
339 h.update('summary:' + self.SystemSummary())
340 for command in commands:
342 h.update(str(command))
343 for key in sorted(inputs.keys()):
344 h.update('item_name:' + key + '\x00')
345 if inputs[key] in self._path_hash_cache:
346 path_hash = self._path_hash_cache[inputs[key]]
348 path_hash = 'item:' + pynacl.hashing_tools.StableHashPath(inputs[key])
349 self._path_hash_cache[inputs[key]] = path_hash