2 # Copyright (c) 2012 The Native Client Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Memoize the data produced by slow operations into Google storage.
8 Caches computations described in terms of command lines and inputs directories
9 or files, which yield a set of output files.
12 # Done first to set up the Python module path.
24 import directory_storage
29 import working_directory
32 class HumanReadableSignature(object):
33 """Accumulator of signature information in human readable form.
35 A replacement for a hashlib hasher that collects the inputs for later display.
40 def update(self, data):
41 """Add an item to the signature."""
42 # Drop the NUL separators callers add; they are noise in readable output.
43 data = data.replace('\0', '')
44 self._items.append(data)
47 """Fake version of hexdigest that returns the inputs."""
# Banner line followed by the collected items, one per line.
48 return ('*' * 30 + ' PACKAGE SIGNATURE ' + '*' * 30 + '\n' +
49 '\n'.join(self._items) + '\n' +
54 """Class to memoize slow operations."""
def __init__(self, storage, use_cached_results=True, cache_results=True,
             print_url=None, system_summary=None):
    """Set up a memoizer on top of a storage layer.

    Args:
      storage: A storage layer to read/write from (GSDStorage). It is used
          directly for small data entries and, wrapped in a
          DirectoryStorageAdapter, for whole output directories.
      use_cached_results: Flag indicating that cached computation results
          should be used when possible.
      cache_results: Flag that indicates if successful computations should be
          cached.
      print_url: Function that accepts a URL for printing the build result,
          or None.
      system_summary: Precomputed system summary string for SystemSummary()
          to return, or None to have it computed on first use.
    """
    self._storage = storage
    self._directory_storage = directory_storage.DirectoryStorageAdapter(
        storage)
    self._use_cached_results = use_cached_results
    self._cache_results = cache_results
    self._print_url = print_url
    self._system_summary = system_summary
def KeyForOutput(self, package, output_hash):
    """Compute the key to store a given output in the data-store.

    Args:
      package: Package name.
      output_hash: Stable hash of the package output.

    Returns:
      Key under which this instance of the package output should be
      stored/retrieved, of the form 'object/<package>_<output_hash>.tgz'.
    """
    return 'object/%s_%s.tgz' % (package, output_hash)
def KeyForBuildSignature(self, build_signature):
    """Compute the key to store a computation result in the data-store.

    Args:
      build_signature: Stable hash of the computation.

    Returns:
      Key under which this instance of the computation result should be
      stored/retrieved, of the form 'computed/<build_signature>.txt'.
    """
    return 'computed/%s.txt' % build_signature
98 def WriteOutputFromHash(self, package, out_hash, output):
99 """Retrieve the cached output with the given hash into a directory.
102 package: Package name (for tgz name).
103 out_hash: Hash of desired output.
106 URL from which output was obtained if successful, or None if not.
# Cached outputs are stored under a key derived from package name and hash.
108 key = self.KeyForOutput(package, out_hash)
109 url = self._directory_storage.GetDirectory(key, output)
111 logging.debug('Failed to retrieve %s' % key)
# Re-hash what was retrieved and compare it with the hash that was requested.
113 if hashing_tools.StableHashPath(output) != out_hash:
114 logging.warning('Object does not match expected hash, '
115 'has hashing method changed?')
119 def PrintDownloadURL(self, url):
120 """Print download URL if function was provided in the constructor.
123 url: The URL to print.
125 if self._print_url is not None:
128 def WriteResultToCache(self, package, build_signature, output):
129 """Cache a computed result under its output hash and build signature.
131 Also prints URLs when appropriate.
133 package: Package name (for tgz name).
134 build_signature: The input hash of the computation.
135 output: A path containing the output of the computation.
137 if not self._cache_results:
# The stored object is keyed by a stable hash of the output path.
139 out_hash = hashing_tools.StableHashPath(output)
141 output_key = self.KeyForOutput(package, out_hash)
142 # Try to get an existing copy in a temporary directory.
143 wd = working_directory.TemporaryWorkingDirectory()
145 temp_output = os.path.join(work_dir, 'out')
146 url = self._directory_storage.GetDirectory(output_key, temp_output)
148 # No cached copy exists. Cache the computed result instead.
149 url = self._directory_storage.PutDirectory(output, output_key)
150 logging.info('Computed fresh result and cached it.')
152 # Cached version is present. Replace the current output with that.
153 if self._use_cached_results:
154 file_tools.RemoveDirectoryIfPresent(output)
155 shutil.move(temp_output, output)
157 'Recomputed result matches cached value, '
158 'using cached value instead.')
159 # Upload an entry mapping from computation input to output hash.
160 self._storage.PutData(
161 out_hash, self.KeyForBuildSignature(build_signature))
162 self.PrintDownloadURL(url)
163 except gsd_storage.GSDStorageError:
164 logging.info('Failed to cache result.')
167 def ReadMemoizedResultFromCache(self, package, build_signature, output):
168 """Read a cached result (if it exists) from the cache.
170 Also prints URLs when appropriate.
172 package: Package name (for tgz name).
173 build_signature: Build signature of the computation.
176 Boolean indicating successful retrieval.
178 # Check if it's in the cache.
179 if self._use_cached_results:
# Look up the output hash recorded for this build signature.
180 out_hash = self._storage.GetData(
181 self.KeyForBuildSignature(build_signature))
182 if out_hash is not None:
# Fetch the output stored under that hash into `output`.
183 url = self.WriteOutputFromHash(package, out_hash, output)
185 logging.info('Retrieved cached result.')
186 self.PrintDownloadURL(url)
190 def Run(self, package, inputs, output, commands,
191 working_dir=None, memoize=True, signature_file=None, subdir=None):
192 """Run an operation once, possibly hitting cache.
195 package: Name of the computation/module.
196 inputs: A dict of names mapped to files that are inputs.
197 output: An output directory.
198 commands: A list of command.Command objects to run.
199 working_dir: Working directory to use, or None for a temp dir.
200 memoize: Boolean indicating that the result should be memoized.
201 signature_file: File to write human readable build signatures to or None.
202 subdir: If not None, use this directory instead of the output dir as the
203 substituter's output path. Must be a subdirectory of output.
205 if working_dir is None:
206 wdm = working_directory.TemporaryWorkingDirectory()
208 wdm = working_directory.FixedWorkingDirectory(working_dir)
210 file_tools.MakeDirectoryIfAbsent(output)
212 nonpath_subst = { 'package': package }
214 with wdm as work_dir:
215 # Compute the build signature from the package name, inputs and commands.
216 build_signature = self.BuildSignature(
217 package, inputs=inputs, commands=commands)
218 # Optionally write human readable version of signature.
220 signature_file.write(self.BuildSignature(
221 package, inputs=inputs, commands=commands,
222 hasher=HumanReadableSignature()))
223 signature_file.flush()
225 # We're done if it's in the cache.
227 self.ReadMemoizedResultFromCache(package, build_signature, output)):
# NOTE(review): this is a plain string-prefix test, so a sibling such as
# '/out2' passes for output '/out', and the check vanishes under python -O.
# Confirm that is acceptable.
231 assert subdir.startswith(output)
233 for command in commands:
# Each command gets the inputs plus an 'output' path entry.
234 paths = inputs.copy()
235 paths['output'] = subdir if subdir else output
236 nonpath_subst['build_signature'] = build_signature
237 subst = substituter.Substituter(work_dir, paths, nonpath_subst)
238 command.Invoke(subst)
241 self.WriteResultToCache(package, build_signature, output)
def SystemSummary(self):
    """Gather a string describing intrinsic properties of the current machine.

    Ideally this would capture anything relevant about the current machine that
    would cause build output to vary (other than build recipe + inputs).

    The summary is computed at most once per instance and then kept in
    self._system_summary; if that attribute is already set, it is returned
    unchanged.

    Returns:
      The string form of a list of (name, value) pairs covering sys.platform,
      platform.machine() and the text produced by running `gcc -v`.
    """
    if self._system_summary is None:
        # Note there is no attempt to canonicalize these values.  If two
        # machines that would in fact produce identical builds differ in
        # these values, it just means that a superfluous build will be
        # done once to get the mapping from new input hash to preexisting
        # output hash into the cache.
        #
        # Both values must be non-empty.  The two checks are joined with
        # `and`: written as `assert a, b` the second expression is only the
        # failure message and is never tested.
        assert len(sys.platform) != 0 and len(platform.machine()) != 0
        # Use environment from command so we can access MinGW on windows.
        env = command.PlatformEnvironment([])
        gcc = file_tools.Which('gcc', paths=env['PATH'].split(os.pathsep))
        p = subprocess.Popen(
            [gcc, '-v'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=env)
        # Only the stderr stream is kept as the version text.
        _, gcc_version = p.communicate()
        assert p.returncode == 0
        items = [
            ('platform', sys.platform),
            ('machine', platform.machine()),
            ('gcc-v', gcc_version),
        ]
        self._system_summary = str(items)
    return self._system_summary
271 def BuildSignature(self, package, inputs, commands, hasher=None):
272 """Compute a total checksum for a computation.
274 The computed hash includes system properties, inputs, and the commands run.
276 package: The name of the package computed.
277 inputs: A dict of names -> files/directories to be included in the
279 commands: A list of command.Command objects describing the commands run
280 for this computation.
281 hasher: Optional hasher to use.
283 A hex formatted sha1 to use as a computation key or a human readable
291 h.update('package:' + package)
292 h.update('summary:' + self.SystemSummary())
293 for command in commands:
295 h.update(str(command))
296 for key in sorted(inputs.keys()):
297 h.update('item_name:' + key + '\x00')
298 h.update('item:' + hashing_tools.StableHashPath(inputs[key]))