src/third_party/chromite/lib/filetype.py

   1 # Copyright 2014 The Chromium OS Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 """File type decoding class for Chromium OS rootfs file bucketing.
   6
   7 This file decodes the type of file based on the contents, filename and other
   8 metadata. The result is a string that represents the file type and subtypes
   9 of the file, separated by slashes (/). The first level is one of the following:
  10 "text", "binary" and "inode". The first two refer to the contents of the file
  11 for regular files, while the third one is used for special files such as
  12 directories, symlinks, block devices, etc.
  13
  14 The file type can have more than one level, for example "binary/elf/static",
  15 "binary/image/png", or "text/conf". See the filetype_unittest.py file for more
  16 examples.
  17
  18 The purpose of this module is to provide a file type that splits the contents
  19 of a Chromium OS build in small buckets, partitioning cases where other standard
  20 classifications keep in the same set.
  21 """
  22
  23 from __future__ import print_function
  24
  25 import itertools
  26 import magic
  27 import mmap
  28 import os
  29 import re
  30 import stat
  31
  32 from chromite.lib import parseelf
  33
  34
  35 # The buffer size we would use to read files from the disk.
  36 FILE_BUFFER_SIZE = 32 * 1024
  37
  38
  39 def SplitShebang(header):
  40   """Splits a shebang (#!) into command and arguments.
  41
  42   Args:
  43     header: The first line of a shebang file, for example
  44         "#!/usr/bin/env -uPWD python foo.py\n". The referenced command must be
  45         an absolute path with optionally some arguments.
  46
  47   Returns:
  48     A tuple of strings (command, args) where the first string is the called
  49     and the second is the list of arguments as passed in the header.
  50
  51   Riases:
  52     ValueError if the passed header is not a valid shebang line.
  53   """
  54   m = re.match(r'#!\s*(/[a-z/0-9\.-]+)\s*(.*)$', header)
  55   if m:
  56     return m.group(1), m.group(2).strip()
  57   raise ValueError("shebang (#!) line expected")
  58
  59
  60 class FileTypeDecoder(object):
  61   """Class to help decode the type of a file.
  62
  63   This class implements a single GetType() method that decodes the type of a
  64   file based on the contents and metadata. This class holds some global data
  65   shared between several calls to that method.
  66   """
  67
  68   # Whitelist of mime types and their mapping to file type.
  69   MIME_TYPE_MAPPING = {
  70       'application/x-gzip': 'binary/compressed/gzip',
  71       'application/x-bzip2': 'binary/compressed/bzip2',
  72       'application/x-xz': 'binary/compressed/xz',
  73
  74       # Goobuntu magic database returns 'gzip' instead of 'x-gzip'. This
  75       # supports running dep_tracker outside the chroot for development.
  76       'application/gzip': 'binary/compressed/gzip',
  77   }
  78
  79   def __init__(self, root='/'):
  80     """Initializes the internal state.
  81
  82     Args:
  83       root: Path to the root directory where all the files live. This will be
  84       assumed as the root directory for absolute symlinks.
  85     """
  86     self._root = root
  87     self._mime = magic.open(magic.MIME_TYPE)
  88     self._mime.load()
  89
  90   def __del__(self):
  91     self._mime.close()
  92
  93   def GetType(self, rel_path, st=None, elf=None):
  94     """Return the file type of the passed file.
  95
  96     Does a best-effort attempt to infer the file type of the passed file. If
  97     only rel_path is provided, the stat_struct information and parsed ELF data
  98     will be computed. If the information is already available, such as if the
  99     ELF file is already parsed, passing st and elf will speed up the file
 100     detection.
 101
 102     Args:
 103       rel_path: The path to the file, used to detect the filetype from the
 104           contents of the file.
 105       st: The stat_result struct of the file.
 106       elf: The result of parseelf.ParseELF().
 107
 108     Returns:
 109       A string with the file type classified in categories separated by /. For
 110       example, a dynamic library will return 'binary/elf/dynamic-so'. If the
 111       type can't be inferred it returns None.
 112     """
 113     # Analysis based on inode data.
 114     if st is None:
 115       st = os.lstat(os.path.join(self._root, rel_path))
 116     if stat.S_ISDIR(st.st_mode):
 117       return 'inode/directory'
 118     if stat.S_ISLNK(st.st_mode):
 119       return 'inode/symlink'
 120     if not stat.S_ISREG(st.st_mode):
 121       return 'inode/special'
 122     if st.st_size == 0:
 123       return 'inode/empty'
 124
 125     # Analysis based on the ELF header and contents.
 126     if elf:
 127       return self._GetELFType(elf)
 128
 129     # Analysis based on the file contents.
 130     try:
 131       with open(os.path.join(self._root, rel_path), 'rb') as fobj:
 132         fmap = mmap.mmap(fobj.fileno(), 0, prot=mmap.PROT_READ)
 133         result = self._GetTypeFromContent(rel_path, fobj, fmap)
 134         fmap.close()
 135         return result
 136     except IOError:
 137       return
 138
 139   def _GetTypeFromContent(self, rel_path, fobj, fmap):
 140     """Return the file path based on the file contents.
 141
 142     This helper function detect the file type based on the contents of the file.
 143
 144     Args:
 145       rel_path: The path to the file, used to detect the filetype from the
 146           contents of the file.
 147       fobj: a file() object for random access to rel_path.
 148       fmap: a mmap object mapping the whole rel_path file for reading.
 149     """
 150
 151     # Detect if the file is binary based on the presence of non-ASCII chars. We
 152     # include some the first 32 chars often used in text files but we exclude
 153     # the rest.
 154     ascii_chars = '\x07\x08\t\n\x0c\r\x1b' + ''.join(map(chr, range(32, 128)))
 155     is_binary = any(bool(chunk.translate(None, ascii_chars))
 156                     for chunk in iter(lambda: fmap.read(FILE_BUFFER_SIZE), ''))
 157
 158     # We use the first part of the file in several checks.
 159     fmap.seek(0)
 160     first_kib = fmap.read(1024)
 161
 162     # Binary files.
 163     if is_binary:
 164       # The elf argument was not passed, so compute it now if the file is an
 165       # ELF.
 166       if first_kib.startswith('\x7fELF'):
 167         return self._GetELFType(parseelf.ParseELF(self._root, rel_path,
 168                                 parse_symbols=False))
 169
 170       if first_kib.startswith('MZ\x90\0'):
 171         return 'binary/dos-bin'
 172
 173       if len(first_kib) >= 512 and first_kib[510:512] == '\x55\xaa':
 174         return 'binary/bootsector/x86'
 175
 176       # Firmware file depend on the technical details of the device they run on,
 177       # so there's no easy way to detect them. We use the filename to guess that
 178       # case.
 179       if '/firmware/' in rel_path and (
 180           rel_path.endswith('.fw') or
 181           rel_path[-4:] in ('.bin', '.cis', '.csp', '.dsp')):
 182         return 'binary/firmware'
 183
 184       # TZif (timezone) files. See tzfile(5) for details.
 185       if (first_kib.startswith('TZif' + '\0' * 16) or
 186           first_kib.startswith('TZif2' + '\0' * 15) or
 187           first_kib.startswith('TZif3' + '\0' * 15)):
 188         return 'binary/tzfile'
 189
 190       # Whitelist some binary mime types.
 191       fobj.seek(0)
 192       # _mime.descriptor() will close the passed file descriptor.
 193       mime_type = self._mime.descriptor(os.dup(fobj.fileno()))
 194       if mime_type.startswith('image/'):
 195         return 'binary/' + mime_type
 196       if mime_type in self.MIME_TYPE_MAPPING:
 197         return self.MIME_TYPE_MAPPING[mime_type]
 198
 199       # Other binary files.
 200       return 'binary'
 201
 202     # Text files.
 203     # Read the first couple of lines used in the following checks. This will
 204     # only read the required lines, with the '\n' char at the end of each line
 205     # except on the last one if it is not present on that line. At this point
 206     # we know that the file is not empty, so at least one line existst.
 207     fmap.seek(0)
 208     first_lines = list(itertools.islice(iter(fmap.readline, ''), 0, 10))
 209     head_line = first_lines[0]
 210
 211     # #! or "shebangs". Only those files with a single line are considered
 212     # shebangs. Some files start with "#!" but are other kind of files, such
 213     # as python or bash scripts.
 214     try:
 215       prog_name, args = SplitShebang(head_line)
 216       if len(first_lines) == 1:
 217         return 'text/shebang'
 218
 219       prog_name = os.path.basename(prog_name)
 220       args = args.split()
 221       if prog_name == 'env':
 222         # If "env" is called, we skip all the arguments passed to env (flags,
 223         # VAR=value) and treat the program name as the program to use.
 224         for i, arg in enumerate(args):
 225           if arg == '--' and (i + 1) < len(args):
 226             prog_name = args[i + 1]
 227             break
 228           if not arg or arg[0] == '-' or '=' in arg:
 229             continue
 230           prog_name = arg
 231           break
 232
 233       # Strip the version number from comon programs like "python2.7".
 234       prog_name = prog_name.rstrip('0123456789-.')
 235
 236       if prog_name in ('awk', 'bash', 'dash', 'ksh', 'perl', 'python', 'sh'):
 237         return 'text/script/' + prog_name
 238       # Other unknown script.
 239       return 'text/script'
 240     except ValueError:
 241       pass
 242
 243     # PEM files.
 244     if head_line.strip() == '-----BEGIN CERTIFICATE-----':
 245       return 'text/pem/cert'
 246     if head_line.strip() == '-----BEGIN RSA PRIVATE KEY-----':
 247       return 'text/pem/rsa-private'
 248
 249     # Linker script.
 250     if head_line.strip() == '/* GNU ld script':
 251       return 'text/ld-script'
 252
 253     # Protobuf files.
 254     if rel_path.endswith('.proto'):
 255       return 'text/proto'
 256
 257     if len(first_lines) == 1:
 258       if re.match(r'[0-9\.]+$', head_line):
 259         return 'text/oneline/number'
 260       return 'text/oneline'
 261
 262     return 'text'
 263
 264   @staticmethod
 265   def _GetELFType(elf):
 266     """Returns the file type for ELF files.
 267
 268     Args:
 269       elf: The result of parseelf.ParseELF().
 270     """
 271     if elf['type'] == 'ET_REL':
 272       elf_type = 'object'
 273     elif (not '.dynamic' in elf['sections'] and
 274           not 'PT_DYNAMIC' in elf['segments']):
 275       elf_type = 'static'
 276     else:
 277       if elf['is_lib']:
 278         elf_type = 'dynamic-so'
 279       else:
 280         elf_type = 'dynamic-bin'
 281     return 'binary/elf/' + elf_type
 282
 283   @classmethod
 284   def DecodeFile(cls, path):
 285     """Decodes the file type of the passed file.
 286
 287     This function is a wrapper to the FileTypeDecoder class to decode the type
 288     of a single file. If you need to decode multiple files please use
 289     FileTypeDecoder class instead.
 290
 291     Args:
 292       path: The path to the file or directory.
 293
 294     Returns:
 295       A string with the decoded file type or None if it couldn't be decoded.
 296     """
 297     return cls('.').GetType(path)