1 # Copyright 2014 The Chromium OS Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 """File type decoding class for Chromium OS rootfs file bucketing.
This file decodes the type of a file based on its contents, filename and other
metadata. The result is a string that represents the file type and subtypes
of the file, separated by slashes (/). The first level is one of the following:
"text", "binary" and "inode". The first two refer to the contents of the file
for regular files, while the third one is used for special files such as
directories, symlinks, block devices, etc.

The file type can have more than one level, for example "binary/elf/static",
"binary/image/png", or "text/conf". See the filetype_unittest.py file for more
examples.

The purpose of this module is to provide a file type that splits the contents
of a Chromium OS build into small buckets, separating cases that other standard
classifications keep in the same set.
"""
from __future__ import print_function

import itertools
import mmap
import os
import re
import stat

import magic

from chromite.lib import parseelf
# Size, in bytes, of the chunks read from a file while scanning its contents
# for non-text bytes.
FILE_BUFFER_SIZE = 32 * 1024
def SplitShebang(header):
  """Splits a shebang (#!) line into its command and argument string.

  Args:
    header: The first line of a shebang file, for example
        "#!/usr/bin/env -uPWD python foo.py\n". The referenced command must be
        an absolute path, optionally followed by some arguments.

  Returns:
    A tuple of strings (command, args) where the first string is the called
    program and the second is the rest of the line, stripped of surrounding
    whitespace, as passed in the header.

  Raises:
    ValueError: if the passed header is not a valid shebang line.
  """
  # The command must start with '/' and may only contain lowercase letters,
  # digits, '/', '.' and '-'; whatever follows is handed back as arguments.
  m = re.match(r'#!\s*(/[a-z/0-9\.-]+)\s*(.*)$', header)
  if m is None:
    # Without this guard a non-shebang line would fail with AttributeError on
    # None instead of the documented ValueError.
    raise ValueError("shebang (#!) line expected")
  return m.group(1), m.group(2).strip()
class FileTypeDecoder(object):
  """Class to help decode the type of a file.

  This class implements a single GetType() method that decodes the type of a
  file based on its contents and metadata. The instance holds the libmagic
  handle, which is shared between all the calls to that method.
  """

  # Whitelist of mime types and their mapping to file type.
  MIME_TYPE_MAPPING = {
      'application/x-gzip': 'binary/compressed/gzip',
      'application/x-bzip2': 'binary/compressed/bzip2',
      'application/x-xz': 'binary/compressed/xz',

      # Goobuntu magic database returns 'gzip' instead of 'x-gzip'. This
      # supports running dep_tracker outside the chroot for development.
      'application/gzip': 'binary/compressed/gzip',
  }

  # Bytes allowed in a text file: the few control chars (below 32) often used
  # in text files plus the 32..127 range. Kept as bytes so the scan behaves
  # the same on Python 2 and Python 3.
  _TEXT_BYTES = b'\x07\x08\t\n\x0c\r\x1b' + bytes(bytearray(range(32, 128)))

  # File name suffixes considered firmware when found under a firmware dir.
  _FIRMWARE_SUFFIXES = ('.fw', '.bin', '.cis', '.csp', '.dsp')

  # TZif (timezone) headers: magic, version byte and reserved zero padding.
  # See tzfile(5) for details.
  _TZIF_HEADERS = (
      b'TZif' + b'\0' * 16,
      b'TZif2' + b'\0' * 15,
      b'TZif3' + b'\0' * 15,
  )

  # Interpreters that get their own "text/script/<name>" bucket.
  _KNOWN_INTERPRETERS = ('awk', 'bash', 'dash', 'ksh', 'perl', 'python', 'sh')

  def __init__(self, root='/'):
    """Initializes the internal state.

    Args:
      root: Path to the root directory where all the files live. Every
          rel_path passed to GetType() is joined to this directory.
    """
    self._root = root
    # Set before opening the handle so __del__ is safe if magic.open() raises.
    self._mime = None
    self._mime = magic.open(magic.MIME_TYPE)
    # The magic database must be loaded before the handle can be queried.
    self._mime.load()

  def __del__(self):
    """Releases the libmagic handle, if one was opened."""
    mime = getattr(self, '_mime', None)
    if mime is not None:
      mime.close()

  def GetType(self, rel_path, st=None, elf=None):
    """Return the file type of the passed file.

    Does a best-effort attempt to infer the file type of the passed file. If
    only rel_path is provided, the stat_struct information and parsed ELF data
    will be computed. If the information is already available, such as if the
    ELF file is already parsed, passing st and elf will speed up the file
    detection.

    Args:
      rel_path: The path to the file, used to detect the filetype from the
          contents of the file.
      st: The stat_result struct of the file.
      elf: The result of parseelf.ParseELF().

    Returns:
      A string with the file type classified in categories separated by /. For
      example, a dynamic library will return 'binary/elf/dynamic-so'. If the
      type can't be inferred it returns None.
    """
    full_path = os.path.join(self._root, rel_path)

    # Analysis based on inode data.
    if st is None:
      st = os.lstat(full_path)
    if stat.S_ISDIR(st.st_mode):
      return 'inode/directory'
    if stat.S_ISLNK(st.st_mode):
      return 'inode/symlink'
    if not stat.S_ISREG(st.st_mode):
      return 'inode/special'
    # An empty file cannot be mmap()ed and has no content to inspect.
    if st.st_size == 0:
      return 'inode/empty'

    # Analysis based on the ELF header and contents.
    if elf:
      return self._GetELFType(elf)

    # Analysis based on the file contents.
    try:
      with open(full_path, 'rb') as fobj:
        fmap = mmap.mmap(fobj.fileno(), 0, prot=mmap.PROT_READ)
        try:
          return self._GetTypeFromContent(rel_path, fobj, fmap)
        finally:
          # Release the mapping even when the content analysis raises.
          fmap.close()
    except IOError:
      # Unreadable file: the type can't be inferred.
      return None

  def _GetTypeFromContent(self, rel_path, fobj, fmap):
    """Return the file type based on the file contents.

    This helper function detects the file type based on the contents of the
    file, which must not be empty.

    Args:
      rel_path: The path to the file, used to detect the filetype from the
          contents of the file.
      fobj: a file object, opened in binary mode, for random access to
          rel_path.
      fmap: a mmap object mapping the whole rel_path file for reading.

    Returns:
      A string with the file type, starting with "binary" or "text".
    """
    # Detect if the file is binary based on the presence of non-ASCII chars,
    # reading the map in FILE_BUFFER_SIZE chunks. The sentinel must be bytes:
    # on Python 3 read() returns b'' at EOF, which never equals ''.
    fmap.seek(0)
    is_binary = any(
        chunk.translate(None, self._TEXT_BYTES)
        for chunk in iter(lambda: fmap.read(FILE_BUFFER_SIZE), b''))

    if is_binary:
      # We use the first part of the file in several checks.
      fmap.seek(0)
      return self._GetBinaryType(rel_path, fobj, fmap.read(1024))
    return self._GetTextType(rel_path, fmap)

  def _GetBinaryType(self, rel_path, fobj, first_kib):
    """Classifies a file already known to have binary contents.

    Args:
      rel_path: The path to the file, relative to the root directory.
      fobj: a file object, opened in binary mode, for rel_path.
      first_kib: The first 1024 bytes (or fewer) of the file.

    Returns:
      A string with the file type, starting with "binary".
    """
    # The elf argument was not passed, so compute it now if the file is an
    # ELF.
    if first_kib.startswith(b'\x7fELF'):
      return self._GetELFType(
          parseelf.ParseELF(self._root, rel_path, parse_symbols=False))

    if first_kib.startswith(b'MZ\x90\0'):
      return 'binary/dos-bin'

    # Boot signature in the last two bytes of the first 512-byte sector. The
    # slice only compares equal when at least 512 bytes were read.
    if first_kib[510:512] == b'\x55\xaa':
      return 'binary/bootsector/x86'

    # Firmware file depend on the technical details of the device they run on,
    # so there's no easy way to detect them. We use the filename to guess that
    # case.
    if ('/firmware/' in rel_path and
        rel_path.endswith(self._FIRMWARE_SUFFIXES)):
      return 'binary/firmware'

    if first_kib.startswith(self._TZIF_HEADERS):
      return 'binary/tzfile'

    # Whitelist some binary mime types.
    fobj.seek(0)
    # _mime.descriptor() will close the passed file descriptor, so hand it a
    # duplicate and keep fobj usable.
    mime_type = self._mime.descriptor(os.dup(fobj.fileno()))
    if mime_type is None:
      # libmagic could not identify the file; treat it as a plain binary.
      return 'binary'
    if mime_type.startswith('image/'):
      return 'binary/' + mime_type

    # Other binary files fall back to the generic bucket.
    return self.MIME_TYPE_MAPPING.get(mime_type, 'binary')

  def _GetTextType(self, rel_path, fmap):
    """Classifies a file already known to have only text contents.

    Args:
      rel_path: The path to the file, relative to the root directory.
      fmap: a mmap object mapping the whole (non-empty) file for reading.

    Returns:
      A string with the file type, starting with "text".
    """
    # Read the first couple of lines used in the following checks. This will
    # only read the required lines, with the '\n' char at the end of each line
    # except on the last one if it is not present on that line. At this point
    # we know that the file is not empty, so at least one line exists.
    fmap.seek(0)
    first_lines = list(itertools.islice(iter(fmap.readline, b''), 10))
    head_line = first_lines[0]
    if not isinstance(head_line, str):
      # Python 3: readline() returns bytes. The file only holds ASCII chars at
      # this point, so decoding can't fail.
      head_line = head_line.decode('ascii')

    # #! or "shebangs". Only those files with a single line are considered
    # shebangs. Some files start with "#!" but are other kind of files, such
    # as python or bash scripts.
    try:
      prog_name, args = SplitShebang(head_line)
    except ValueError:
      pass
    else:
      if len(first_lines) == 1:
        return 'text/shebang'
      return self._GetScriptType(prog_name, args)

    # PEM files.
    marker = head_line.strip()
    if marker == '-----BEGIN CERTIFICATE-----':
      return 'text/pem/cert'
    if marker == '-----BEGIN RSA PRIVATE KEY-----':
      return 'text/pem/rsa-private'

    # Linker script.
    if marker == '/* GNU ld script':
      return 'text/ld-script'

    # Protobuf files.
    if rel_path.endswith('.proto'):
      return 'text/proto'

    if len(first_lines) == 1:
      if re.match(r'[0-9\.]+$', head_line):
        return 'text/oneline/number'
      return 'text/oneline'

    return 'text'

  @classmethod
  def _GetScriptType(cls, prog_name, args):
    """Returns the script file type for a parsed shebang line.

    Args:
      prog_name: The command of the shebang, as returned by SplitShebang().
      args: The argument string of the shebang, as returned by SplitShebang().

    Returns:
      'text/script/<interpreter>' for the known interpreters, otherwise
      'text/script'.
    """
    prog_name = os.path.basename(prog_name)

    if prog_name == 'env':
      # If "env" is called, we skip all the arguments passed to env (flags,
      # VAR=value) and treat the program name as the program to use.
      words = args.split()
      for i, arg in enumerate(words):
        if arg == '--' and (i + 1) < len(words):
          prog_name = words[i + 1]
          break
        if arg.startswith('-') or '=' in arg:
          continue
        prog_name = arg
        break

    # Strip the version number from common programs like "python2.7".
    prog_name = prog_name.rstrip('0123456789-.')

    if prog_name in cls._KNOWN_INTERPRETERS:
      return 'text/script/' + prog_name
    # Other unknown script.
    return 'text/script'

  @staticmethod
  def _GetELFType(elf):
    """Returns the file type for ELF files.

    Args:
      elf: The result of parseelf.ParseELF(). The 'type', 'sections' and
          'segments' entries are read, plus 'is_lib' for dynamic files.

    Returns:
      One of 'binary/elf/object', 'binary/elf/static', 'binary/elf/dynamic-so'
      or 'binary/elf/dynamic-bin'.
    """
    if elf['type'] == 'ET_REL':
      elf_type = 'object'
    elif ('.dynamic' not in elf['sections'] and
          'PT_DYNAMIC' not in elf['segments']):
      elf_type = 'static'
    elif elf['is_lib']:
      elf_type = 'dynamic-so'
    else:
      elf_type = 'dynamic-bin'
    return 'binary/elf/' + elf_type

  @classmethod
  def DecodeFile(cls, path):
    """Decodes the file type of the passed file.

    This function is a wrapper to the FileTypeDecoder class to decode the type
    of a single file. If you need to decode multiple files please use
    FileTypeDecoder class instead.

    Args:
      path: The path to the file or directory.

    Returns:
      A string with the decoded file type or None if it couldn't be decoded.
    """
    return cls('.').GetType(path)