src/tools/swarming_client/isolated_format.py

   1 # Copyright 2014 The Swarming Authors. All rights reserved.
   2 # Use of this source code is governed under the Apache License, Version 2.0 that
   3 # can be found in the LICENSE file.
   4
   5 """Understands .isolated files and can do local operations on them."""
   6
   7 import hashlib
   8 import json
   9 import logging
  10 import os
  11 import re
  12 import stat
  13 import sys
  14
  15 from utils import file_path
  16 from utils import tools
  17
  18
  19 # Version stored and expected in .isolated files.
  20 ISOLATED_FILE_VERSION = '1.4'
  21
  22
  23 # Chunk size to use when doing disk I/O.
  24 DISK_FILE_CHUNK = 1024 * 1024
  25
  26
  27 # Sadly, hashlib uses 'sha1' instead of the standard 'sha-1' so explicitly
  28 # specify the names here.
  29 SUPPORTED_ALGOS = {
  30   'md5': hashlib.md5,
  31   'sha-1': hashlib.sha1,
  32   'sha-512': hashlib.sha512,
  33 }
  34
  35
  36 # Used for serialization.
  37 SUPPORTED_ALGOS_REVERSE = dict((v, k) for k, v in SUPPORTED_ALGOS.iteritems())
  38
  39
  40 class IsolatedError(ValueError):
  41   """Generic failure to load a .isolated file."""
  42   pass
  43
  44
  45 class MappingError(OSError):
  46   """Failed to recreate the tree."""
  47   pass
  48
  49
  50 def is_valid_hash(value, algo):
  51   """Returns if the value is a valid hash for the corresponding algorithm."""
  52   size = 2 * algo().digest_size
  53   return bool(re.match(r'^[a-fA-F0-9]{%d}$' % size, value))
  54
  55
  56 def get_hash_algo(_namespace):
  57   """Return hash algorithm class to use when uploading to given |namespace|."""
  58   # TODO(vadimsh): Implement this at some point.
  59   return hashlib.sha1
  60
  61
  62 def is_namespace_with_compression(namespace):
  63   """Returns True if given |namespace| stores compressed objects."""
  64   return namespace.endswith(('-gzip', '-deflate'))
  65
  66
  67 def hash_file(filepath, algo):
  68   """Calculates the hash of a file without reading it all in memory at once.
  69
  70   |algo| should be one of hashlib hashing algorithm.
  71   """
  72   digest = algo()
  73   with open(filepath, 'rb') as f:
  74     while True:
  75       chunk = f.read(DISK_FILE_CHUNK)
  76       if not chunk:
  77         break
  78       digest.update(chunk)
  79   return digest.hexdigest()
  80
  81
  82 class IsolatedFile(object):
  83   """Represents a single parsed .isolated file."""
  84
  85   def __init__(self, obj_hash, algo):
  86     """|obj_hash| is really the sha-1 of the file."""
  87     self.obj_hash = obj_hash
  88     self.algo = algo
  89
  90     # Raw data.
  91     self.data = {}
  92     # A IsolatedFile instance, one per object in self.includes.
  93     self.children = []
  94
  95     # Set once the .isolated file is loaded.
  96     self._is_loaded = False
  97
  98   def __repr__(self):
  99     return 'IsolatedFile(%s, loaded: %s)' % (self.obj_hash, self._is_loaded)
 100
 101   def load(self, content):
 102     """Verifies the .isolated file is valid and loads this object with the json
 103     data.
 104     """
 105     logging.debug('IsolatedFile.load(%s)' % self.obj_hash)
 106     assert not self._is_loaded
 107     self.data = load_isolated(content, self.algo)
 108     self.children = [
 109         IsolatedFile(i, self.algo) for i in self.data.get('includes', [])
 110     ]
 111     self._is_loaded = True
 112
 113   @property
 114   def is_loaded(self):
 115     """Returns True if 'load' was already called."""
 116     return self._is_loaded
 117
 118
 119 def walk_includes(isolated):
 120   """Walks IsolatedFile include graph and yields IsolatedFile objects.
 121
 122   Visits root node first, then recursively all children, left to right.
 123   Not yet loaded nodes are considered childless.
 124   """
 125   yield isolated
 126   for child in isolated.children:
 127     for x in walk_includes(child):
 128       yield x
 129
 130
 131 @tools.profile
 132 def expand_symlinks(indir, relfile):
 133   """Follows symlinks in |relfile|, but treating symlinks that point outside the
 134   build tree as if they were ordinary directories/files. Returns the final
 135   symlink-free target and a list of paths to symlinks encountered in the
 136   process.
 137
 138   The rule about symlinks outside the build tree is for the benefit of the
 139   Chromium OS ebuild, which symlinks the output directory to an unrelated path
 140   in the chroot.
 141
 142   Fails when a directory loop is detected, although in theory we could support
 143   that case.
 144   """
 145   is_directory = relfile.endswith(os.path.sep)
 146   done = indir
 147   todo = relfile.strip(os.path.sep)
 148   symlinks = []
 149
 150   while todo:
 151     pre_symlink, symlink, post_symlink = file_path.split_at_symlink(done, todo)
 152     if not symlink:
 153       todo = file_path.fix_native_path_case(done, todo)
 154       done = os.path.join(done, todo)
 155       break
 156     symlink_path = os.path.join(done, pre_symlink, symlink)
 157     post_symlink = post_symlink.lstrip(os.path.sep)
 158     # readlink doesn't exist on Windows.
 159     # pylint: disable=E1101
 160     target = os.path.normpath(os.path.join(done, pre_symlink))
 161     symlink_target = os.readlink(symlink_path)
 162     if os.path.isabs(symlink_target):
 163       # Absolute path are considered a normal directories. The use case is
 164       # generally someone who puts the output directory on a separate drive.
 165       target = symlink_target
 166     else:
 167       # The symlink itself could be using the wrong path case.
 168       target = file_path.fix_native_path_case(target, symlink_target)
 169
 170     if not os.path.exists(target):
 171       raise MappingError(
 172           'Symlink target doesn\'t exist: %s -> %s' % (symlink_path, target))
 173     target = file_path.get_native_path_case(target)
 174     if not file_path.path_starts_with(indir, target):
 175       done = symlink_path
 176       todo = post_symlink
 177       continue
 178     if file_path.path_starts_with(target, symlink_path):
 179       raise MappingError(
 180           'Can\'t map recursive symlink reference %s -> %s' %
 181           (symlink_path, target))
 182     logging.info('Found symlink: %s -> %s', symlink_path, target)
 183     symlinks.append(os.path.relpath(symlink_path, indir))
 184     # Treat the common prefix of the old and new paths as done, and start
 185     # scanning again.
 186     target = target.split(os.path.sep)
 187     symlink_path = symlink_path.split(os.path.sep)
 188     prefix_length = 0
 189     for target_piece, symlink_path_piece in zip(target, symlink_path):
 190       if target_piece == symlink_path_piece:
 191         prefix_length += 1
 192       else:
 193         break
 194     done = os.path.sep.join(target[:prefix_length])
 195     todo = os.path.join(
 196         os.path.sep.join(target[prefix_length:]), post_symlink)
 197
 198   relfile = os.path.relpath(done, indir)
 199   relfile = relfile.rstrip(os.path.sep) + is_directory * os.path.sep
 200   return relfile, symlinks
 201
 202
 203 @tools.profile
 204 def expand_directory_and_symlink(indir, relfile, blacklist, follow_symlinks):
 205   """Expands a single input. It can result in multiple outputs.
 206
 207   This function is recursive when relfile is a directory.
 208
 209   Note: this code doesn't properly handle recursive symlink like one created
 210   with:
 211     ln -s .. foo
 212   """
 213   if os.path.isabs(relfile):
 214     raise MappingError('Can\'t map absolute path %s' % relfile)
 215
 216   infile = file_path.normpath(os.path.join(indir, relfile))
 217   if not infile.startswith(indir):
 218     raise MappingError('Can\'t map file %s outside %s' % (infile, indir))
 219
 220   filepath = os.path.join(indir, relfile)
 221   native_filepath = file_path.get_native_path_case(filepath)
 222   if filepath != native_filepath:
 223     # Special case './'.
 224     if filepath != native_filepath + '.' + os.path.sep:
 225       # While it'd be nice to enforce path casing on Windows, it's impractical.
 226       # Also give up enforcing strict path case on OSX. Really, it's that sad.
 227       # The case where it happens is very specific and hard to reproduce:
 228       # get_native_path_case(
 229       #    u'Foo.framework/Versions/A/Resources/Something.nib') will return
 230       # u'Foo.framework/Versions/A/resources/Something.nib', e.g. lowercase 'r'.
 231       #
 232       # Note that this is really something deep in OSX because running
 233       # ls Foo.framework/Versions/A
 234       # will print out 'Resources', while file_path.get_native_path_case()
 235       # returns a lower case 'r'.
 236       #
 237       # So *something* is happening under the hood resulting in the command 'ls'
 238       # and Carbon.File.FSPathMakeRef('path').FSRefMakePath() to disagree.  We
 239       # have no idea why.
 240       if sys.platform not in ('darwin', 'win32'):
 241         raise MappingError(
 242             'File path doesn\'t equal native file path\n%s != %s' %
 243             (filepath, native_filepath))
 244
 245   symlinks = []
 246   if follow_symlinks:
 247     relfile, symlinks = expand_symlinks(indir, relfile)
 248
 249   if relfile.endswith(os.path.sep):
 250     if not os.path.isdir(infile):
 251       raise MappingError(
 252           '%s is not a directory but ends with "%s"' % (infile, os.path.sep))
 253
 254     # Special case './'.
 255     if relfile.startswith('.' + os.path.sep):
 256       relfile = relfile[2:]
 257     outfiles = symlinks
 258     try:
 259       for filename in file_path.listdir(infile):
 260         inner_relfile = os.path.join(relfile, filename)
 261         if blacklist and blacklist(inner_relfile):
 262           continue
 263         if os.path.isdir(os.path.join(indir, inner_relfile)):
 264           inner_relfile += os.path.sep
 265         outfiles.extend(
 266             expand_directory_and_symlink(indir, inner_relfile, blacklist,
 267                                          follow_symlinks))
 268       return outfiles
 269     except OSError as e:
 270       raise MappingError(
 271           'Unable to iterate over directory %s.\n%s' % (infile, e))
 272   else:
 273     # Always add individual files even if they were blacklisted.
 274     if os.path.isdir(infile):
 275       raise MappingError(
 276           'Input directory %s must have a trailing slash' % infile)
 277
 278     if not os.path.isfile(infile):
 279       raise MappingError('Input file %s doesn\'t exist' % infile)
 280
 281     return symlinks + [relfile]
 282
 283
 284 def expand_directories_and_symlinks(
 285     indir, infiles, blacklist, follow_symlinks, ignore_broken_items):
 286   """Expands the directories and the symlinks, applies the blacklist and
 287   verifies files exist.
 288
 289   Files are specified in os native path separator.
 290   """
 291   outfiles = []
 292   for relfile in infiles:
 293     try:
 294       outfiles.extend(
 295           expand_directory_and_symlink(
 296               indir, relfile, blacklist, follow_symlinks))
 297     except MappingError as e:
 298       if not ignore_broken_items:
 299         raise
 300       logging.info('warning: %s', e)
 301   return outfiles
 302
 303
 304 @tools.profile
 305 def file_to_metadata(filepath, prevdict, read_only, algo):
 306   """Processes an input file, a dependency, and return meta data about it.
 307
 308   Behaviors:
 309   - Retrieves the file mode, file size, file timestamp, file link
 310     destination if it is a file link and calcultate the SHA-1 of the file's
 311     content if the path points to a file and not a symlink.
 312
 313   Arguments:
 314     filepath: File to act on.
 315     prevdict: the previous dictionary. It is used to retrieve the cached sha-1
 316               to skip recalculating the hash. Optional.
 317     read_only: If 1 or 2, the file mode is manipulated. In practice, only save
 318                one of 4 modes: 0755 (rwx), 0644 (rw), 0555 (rx), 0444 (r). On
 319                windows, mode is not set since all files are 'executable' by
 320                default.
 321     algo:      Hashing algorithm used.
 322
 323   Returns:
 324     The necessary dict to create a entry in the 'files' section of an .isolated
 325     file.
 326   """
 327   out = {}
 328   # Always check the file stat and check if it is a link. The timestamp is used
 329   # to know if the file's content/symlink destination should be looked into.
 330   # E.g. only reuse from prevdict if the timestamp hasn't changed.
 331   # There is the risk of the file's timestamp being reset to its last value
 332   # manually while its content changed. We don't protect against that use case.
 333   try:
 334     filestats = os.lstat(filepath)
 335   except OSError:
 336     # The file is not present.
 337     raise MappingError('%s is missing' % filepath)
 338   is_link = stat.S_ISLNK(filestats.st_mode)
 339
 340   if sys.platform != 'win32':
 341     # Ignore file mode on Windows since it's not really useful there.
 342     filemode = stat.S_IMODE(filestats.st_mode)
 343     # Remove write access for group and all access to 'others'.
 344     filemode &= ~(stat.S_IWGRP | stat.S_IRWXO)
 345     if read_only:
 346       filemode &= ~stat.S_IWUSR
 347     if filemode & stat.S_IXUSR:
 348       filemode |= stat.S_IXGRP
 349     else:
 350       filemode &= ~stat.S_IXGRP
 351     if not is_link:
 352       out['m'] = filemode
 353
 354   # Used to skip recalculating the hash or link destination. Use the most recent
 355   # update time.
 356   out['t'] = int(round(filestats.st_mtime))
 357
 358   if not is_link:
 359     out['s'] = filestats.st_size
 360     # If the timestamp wasn't updated and the file size is still the same, carry
 361     # on the sha-1.
 362     if (prevdict.get('t') == out['t'] and
 363         prevdict.get('s') == out['s']):
 364       # Reuse the previous hash if available.
 365       out['h'] = prevdict.get('h')
 366     if not out.get('h'):
 367       out['h'] = hash_file(filepath, algo)
 368   else:
 369     # If the timestamp wasn't updated, carry on the link destination.
 370     if prevdict.get('t') == out['t']:
 371       # Reuse the previous link destination if available.
 372       out['l'] = prevdict.get('l')
 373     if out.get('l') is None:
 374       # The link could be in an incorrect path case. In practice, this only
 375       # happen on OSX on case insensitive HFS.
 376       # TODO(maruel): It'd be better if it was only done once, in
 377       # expand_directory_and_symlink(), so it would not be necessary to do again
 378       # here.
 379       symlink_value = os.readlink(filepath)  # pylint: disable=E1101
 380       filedir = file_path.get_native_path_case(os.path.dirname(filepath))
 381       native_dest = file_path.fix_native_path_case(filedir, symlink_value)
 382       out['l'] = os.path.relpath(native_dest, filedir)
 383   return out
 384
 385
 386 def save_isolated(isolated, data):
 387   """Writes one or multiple .isolated files.
 388
 389   Note: this reference implementation does not create child .isolated file so it
 390   always returns an empty list.
 391
 392   Returns the list of child isolated files that are included by |isolated|.
 393   """
 394   # Make sure the data is valid .isolated data by 'reloading' it.
 395   algo = SUPPORTED_ALGOS[data['algo']]
 396   load_isolated(json.dumps(data), algo)
 397   tools.write_json(isolated, data, True)
 398   return []
 399
 400
 401 def load_isolated(content, algo):
 402   """Verifies the .isolated file is valid and loads this object with the json
 403   data.
 404
 405   Arguments:
 406   - content: raw serialized content to load.
 407   - algo: hashlib algorithm class. Used to confirm the algorithm matches the
 408           algorithm used on the Isolate Server.
 409   """
 410   try:
 411     data = json.loads(content)
 412   except ValueError:
 413     raise IsolatedError('Failed to parse: %s...' % content[:100])
 414
 415   if not isinstance(data, dict):
 416     raise IsolatedError('Expected dict, got %r' % data)
 417
 418   # Check 'version' first, since it could modify the parsing after.
 419   value = data.get('version', '1.0')
 420   if not isinstance(value, basestring):
 421     raise IsolatedError('Expected string, got %r' % value)
 422   try:
 423     version = tuple(map(int, value.split('.')))
 424   except ValueError:
 425     raise IsolatedError('Expected valid version, got %r' % value)
 426
 427   expected_version = tuple(map(int, ISOLATED_FILE_VERSION.split('.')))
 428   # Major version must match.
 429   if version[0] != expected_version[0]:
 430     raise IsolatedError(
 431         'Expected compatible \'%s\' version, got %r' %
 432         (ISOLATED_FILE_VERSION, value))
 433
 434   if algo is None:
 435     # TODO(maruel): Remove the default around Jan 2014.
 436     # Default the algorithm used in the .isolated file itself, falls back to
 437     # 'sha-1' if unspecified.
 438     algo = SUPPORTED_ALGOS_REVERSE[data.get('algo', 'sha-1')]
 439
 440   for key, value in data.iteritems():
 441     if key == 'algo':
 442       if not isinstance(value, basestring):
 443         raise IsolatedError('Expected string, got %r' % value)
 444       if value not in SUPPORTED_ALGOS:
 445         raise IsolatedError(
 446             'Expected one of \'%s\', got %r' %
 447             (', '.join(sorted(SUPPORTED_ALGOS)), value))
 448       if value != SUPPORTED_ALGOS_REVERSE[algo]:
 449         raise IsolatedError(
 450             'Expected \'%s\', got %r' % (SUPPORTED_ALGOS_REVERSE[algo], value))
 451
 452     elif key == 'command':
 453       if not isinstance(value, list):
 454         raise IsolatedError('Expected list, got %r' % value)
 455       if not value:
 456         raise IsolatedError('Expected non-empty command')
 457       for subvalue in value:
 458         if not isinstance(subvalue, basestring):
 459           raise IsolatedError('Expected string, got %r' % subvalue)
 460
 461     elif key == 'files':
 462       if not isinstance(value, dict):
 463         raise IsolatedError('Expected dict, got %r' % value)
 464       for subkey, subvalue in value.iteritems():
 465         if not isinstance(subkey, basestring):
 466           raise IsolatedError('Expected string, got %r' % subkey)
 467         if not isinstance(subvalue, dict):
 468           raise IsolatedError('Expected dict, got %r' % subvalue)
 469         for subsubkey, subsubvalue in subvalue.iteritems():
 470           if subsubkey == 'l':
 471             if not isinstance(subsubvalue, basestring):
 472               raise IsolatedError('Expected string, got %r' % subsubvalue)
 473           elif subsubkey == 'm':
 474             if not isinstance(subsubvalue, int):
 475               raise IsolatedError('Expected int, got %r' % subsubvalue)
 476           elif subsubkey == 'h':
 477             if not is_valid_hash(subsubvalue, algo):
 478               raise IsolatedError('Expected sha-1, got %r' % subsubvalue)
 479           elif subsubkey == 's':
 480             if not isinstance(subsubvalue, (int, long)):
 481               raise IsolatedError('Expected int or long, got %r' % subsubvalue)
 482           else:
 483             raise IsolatedError('Unknown subsubkey %s' % subsubkey)
 484         if bool('h' in subvalue) == bool('l' in subvalue):
 485           raise IsolatedError(
 486               'Need only one of \'h\' (sha-1) or \'l\' (link), got: %r' %
 487               subvalue)
 488         if bool('h' in subvalue) != bool('s' in subvalue):
 489           raise IsolatedError(
 490               'Both \'h\' (sha-1) and \'s\' (size) should be set, got: %r' %
 491               subvalue)
 492         if bool('s' in subvalue) == bool('l' in subvalue):
 493           raise IsolatedError(
 494               'Need only one of \'s\' (size) or \'l\' (link), got: %r' %
 495               subvalue)
 496         if bool('l' in subvalue) and bool('m' in subvalue):
 497           raise IsolatedError(
 498               'Cannot use \'m\' (mode) and \'l\' (link), got: %r' %
 499               subvalue)
 500
 501     elif key == 'includes':
 502       if not isinstance(value, list):
 503         raise IsolatedError('Expected list, got %r' % value)
 504       if not value:
 505         raise IsolatedError('Expected non-empty includes list')
 506       for subvalue in value:
 507         if not is_valid_hash(subvalue, algo):
 508           raise IsolatedError('Expected sha-1, got %r' % subvalue)
 509
 510     elif key == 'os':
 511       if version >= (1, 4):
 512         raise IsolatedError('Key \'os\' is not allowed starting version 1.4')
 513
 514     elif key == 'read_only':
 515       if not value in (0, 1, 2):
 516         raise IsolatedError('Expected 0, 1 or 2, got %r' % value)
 517
 518     elif key == 'relative_cwd':
 519       if not isinstance(value, basestring):
 520         raise IsolatedError('Expected string, got %r' % value)
 521
 522     elif key == 'version':
 523       # Already checked above.
 524       pass
 525
 526     else:
 527       raise IsolatedError('Unknown key %r' % key)
 528
 529   # Automatically fix os.path.sep if necessary. While .isolated files are always
 530   # in the the native path format, someone could want to download an .isolated
 531   # tree from another OS.
 532   wrong_path_sep = '/' if os.path.sep == '\\' else '\\'
 533   if 'files' in data:
 534     data['files'] = dict(
 535         (k.replace(wrong_path_sep, os.path.sep), v)
 536         for k, v in data['files'].iteritems())
 537     for v in data['files'].itervalues():
 538       if 'l' in v:
 539         v['l'] = v['l'].replace(wrong_path_sep, os.path.sep)
 540   if 'relative_cwd' in data:
 541     data['relative_cwd'] = data['relative_cwd'].replace(
 542         wrong_path_sep, os.path.sep)
 543   return data