third_party/pigweed/repo/pw_tokenizer/py/pw_tokenizer/elf_reader.py

   1 #!/usr/bin/env python3
   2 # Copyright 2020 The Pigweed Authors
   3 #
   4 # Licensed under the Apache License, Version 2.0 (the "License"); you may not
   5 # use this file except in compliance with the License. You may obtain a copy of
   6 # the License at
   7 #
   8 #     https://www.apache.org/licenses/LICENSE-2.0
   9 #
  10 # Unless required by applicable law or agreed to in writing, software
  11 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13 # License for the specific language governing permissions and limitations under
  14 # the License.
  15 """Reads data from ELF sections.
  16
  17 This module provides tools for dumping the contents of an ELF section. It can
  18 also be used to read values at a particular address. A command line interface
  19 for both of these features is provided.
  20
  21 This module supports any ELF-format file, including .o and .so files. This
  22 module also has basic support for archive (.a) files. All ELF files in an
  23 archive are read as one unit.
  24 """
  25
  26 import argparse
  27 from pathlib import Path
  28 import re
  29 import struct
  30 import sys
  31 from typing import BinaryIO, Dict, Iterable, NamedTuple, Optional
  32 from typing import Pattern, Tuple, Union
  33
  34 ARCHIVE_MAGIC = b'!<arch>\n'
  35 ELF_MAGIC = b'\x7fELF'
  36
  37
  38 def _check_next_bytes(fd: BinaryIO, expected: bytes, what: str) -> None:
  39     actual = fd.read(len(expected))
  40     if expected != actual:
  41         raise FileDecodeError(
  42             f'Invalid {what}: expected {expected!r}, found {actual!r} in file '
  43             f'{getattr(fd, "name", "(unknown")}')
  44
  45
  46 def files_in_archive(fd: BinaryIO) -> Iterable[int]:
  47     """Seeks to each file in an archive and yields its size."""
  48
  49     _check_next_bytes(fd, ARCHIVE_MAGIC, 'archive magic number')
  50
  51     while True:
  52         # In some archives, the first file ends with an additional \n. If that
  53         # is present, skip it.
  54         if fd.read(1) != b'\n':
  55             fd.seek(-1, 1)
  56
  57         # Each file in an archive is prefixed with an ASCII header:
  58         #
  59         #   16 B - file identifier (text)
  60         #   12 B - file modification timestamp (decimal)
  61         #    6 B - owner ID (decimal)
  62         #    6 B - group ID (decimal)
  63         #    8 B - file mode (octal)
  64         #   10 B - file size in bytes (decimal)
  65         #    2 B - ending characters (`\n)
  66         #
  67         # Skip the unused portions of the file header, then read the size.
  68         fd.seek(16 + 12 + 6 + 6 + 8, 1)
  69         size_str = fd.read(10)
  70         if not size_str:
  71             return
  72
  73         try:
  74             size = int(size_str, 10)
  75         except ValueError as exc:
  76             raise FileDecodeError(
  77                 'Archive file sizes must be decimal integers') from exc
  78
  79         _check_next_bytes(fd, b'`\n', 'archive file header ending')
  80         offset = fd.tell()  # Store offset in case the caller reads the file.
  81
  82         yield size
  83
  84         fd.seek(offset + size)
  85
  86
  87 def _elf_files_in_archive(fd: BinaryIO):
  88     if _bytes_match(fd, ELF_MAGIC):
  89         yield  # The value isn't used, so just yield None.
  90     else:
  91         for _ in files_in_archive(fd):
  92             if _bytes_match(fd, ELF_MAGIC):
  93                 yield
  94
  95
  96 class Field(NamedTuple):
  97     """A field in an ELF file.
  98
  99     Fields refer to a particular piece of data in an ELF file or section header.
 100     """
 101
 102     name: str
 103     offset_32: int
 104     offset_64: int
 105     size_32: int
 106     size_64: int
 107
 108
 109 class _FileHeader(NamedTuple):
 110     """Fields in the ELF file header."""
 111
 112     section_header_offset: Field = Field('e_shoff', 0x20, 0x28, 4, 8)
 113     section_count: Field = Field('e_shnum', 0x30, 0x3C, 2, 2)
 114     section_names_index: Field = Field('e_shstrndx', 0x32, 0x3E, 2, 2)
 115
 116
 117 FILE_HEADER = _FileHeader()
 118
 119
 120 class _SectionHeader(NamedTuple):
 121     """Fields in an ELF section header."""
 122
 123     section_name_offset: Field = Field('sh_name', 0x00, 0x00, 4, 4)
 124     section_address: Field = Field('sh_addr', 0x0C, 0x10, 4, 8)
 125     section_offset: Field = Field('sh_offset', 0x10, 0x18, 4, 8)
 126     section_size: Field = Field('sh_size', 0x14, 0x20, 4, 8)
 127
 128     # section_header_end records the size of the header.
 129     section_header_end: Field = Field('section end', 0x28, 0x40, 0, 0)
 130
 131
 132 SECTION_HEADER = _SectionHeader()
 133
 134
 135 def read_c_string(fd: BinaryIO) -> bytes:
 136     """Reads a null-terminated string from the provided file descriptor."""
 137     string = bytearray()
 138     while True:
 139         byte = fd.read(1)
 140         if not byte or byte == b'\0':
 141             return bytes(string)
 142         string += byte
 143
 144
 145 def _bytes_match(fd: BinaryIO, expected: bytes) -> bool:
 146     """Peeks at the next bytes to see if they match the expected."""
 147     try:
 148         offset = fd.tell()
 149         data = fd.read(len(expected))
 150         fd.seek(offset)
 151         return data == expected
 152     except IOError:
 153         return False
 154
 155
 156 def compatible_file(file: Union[BinaryIO, str, Path]) -> bool:
 157     """True if the file type is supported (ELF or archive)."""
 158     try:
 159         fd = open(file, 'rb') if isinstance(file, (str, Path)) else file
 160
 161         offset = fd.tell()
 162         fd.seek(0)
 163         result = _bytes_match(fd, ELF_MAGIC) or _bytes_match(fd, ARCHIVE_MAGIC)
 164         fd.seek(offset)
 165     finally:
 166         if isinstance(file, (str, Path)):
 167             fd.close()
 168
 169     return result
 170
 171
 172 class FileDecodeError(Exception):
 173     """Invalid data was read from an ELF file."""
 174
 175
 176 class FieldReader:
 177     """Reads ELF fields defined with a Field tuple from an ELF file."""
 178     def __init__(self, elf: BinaryIO):
 179         self._elf = elf
 180         self.file_offset = self._elf.tell()
 181
 182         _check_next_bytes(self._elf, ELF_MAGIC, 'ELF file header')
 183         size_field = self._elf.read(1)  # e_ident[EI_CLASS] (address size)
 184
 185         int_unpacker = self._determine_integer_format()
 186
 187         if size_field == b'\x01':
 188             self.offset = lambda field: field.offset_32
 189             self._size = lambda field: field.size_32
 190             self._decode = lambda f, d: int_unpacker[f.size_32].unpack(d)[0]
 191         elif size_field == b'\x02':
 192             self.offset = lambda field: field.offset_64
 193             self._size = lambda field: field.size_64
 194             self._decode = lambda f, d: int_unpacker[f.size_64].unpack(d)[0]
 195         else:
 196             raise FileDecodeError('Unknown size {!r}'.format(size_field))
 197
 198     def _determine_integer_format(self) -> Dict[int, struct.Struct]:
 199         """Returns a dict of structs used for converting bytes to integers."""
 200         endianness_byte = self._elf.read(1)  # e_ident[EI_DATA] (endianness)
 201         if endianness_byte == b'\x01':
 202             endianness = '<'
 203         elif endianness_byte == b'\x02':
 204             endianness = '>'
 205         else:
 206             raise FileDecodeError(
 207                 'Unknown endianness {!r}'.format(endianness_byte))
 208
 209         return {
 210             1: struct.Struct(endianness + 'B'),
 211             2: struct.Struct(endianness + 'H'),
 212             4: struct.Struct(endianness + 'I'),
 213             8: struct.Struct(endianness + 'Q'),
 214         }
 215
 216     def read(self, field: Field, base: int = 0) -> int:
 217         self._elf.seek(self.file_offset + base + self.offset(field))
 218         data = self._elf.read(self._size(field))
 219         return self._decode(field, data)
 220
 221     def read_string(self, offset: int) -> str:
 222         self._elf.seek(self.file_offset + offset)
 223         return read_c_string(self._elf).decode()
 224
 225
 226 class Elf:
 227     """Represents an ELF file and the sections in it."""
 228     class Section(NamedTuple):
 229         """Info about a section in an ELF file."""
 230         name: str
 231         address: int
 232         offset: int
 233         size: int
 234
 235         file_offset: int  # Starting place in the file; 0 unless in an archive.
 236
 237         def range(self) -> range:
 238             return range(self.address, self.address + self.size)
 239
 240         def __lt__(self, other) -> bool:
 241             return self.address < other.address
 242
 243     def __init__(self, elf: BinaryIO):
 244         self._elf = elf
 245         self.sections: Tuple[Elf.Section, ...] = tuple(self._list_sections())
 246
 247     def _list_sections(self) -> Iterable['Elf.Section']:
 248         """Reads the section headers to enumerate all ELF sections."""
 249         for _ in _elf_files_in_archive(self._elf):
 250             reader = FieldReader(self._elf)
 251             base = reader.read(FILE_HEADER.section_header_offset)
 252             section_header_size = reader.offset(
 253                 SECTION_HEADER.section_header_end)
 254
 255             # Find the section with the section names in it.
 256             names_section_header_base = (
 257                 base + section_header_size *
 258                 reader.read(FILE_HEADER.section_names_index))
 259             names_table_base = reader.read(SECTION_HEADER.section_offset,
 260                                            names_section_header_base)
 261
 262             base = reader.read(FILE_HEADER.section_header_offset)
 263             for _ in range(reader.read(FILE_HEADER.section_count)):
 264                 name_offset = reader.read(SECTION_HEADER.section_name_offset,
 265                                           base)
 266
 267                 yield self.Section(
 268                     reader.read_string(names_table_base + name_offset),
 269                     reader.read(SECTION_HEADER.section_address, base),
 270                     reader.read(SECTION_HEADER.section_offset, base),
 271                     reader.read(SECTION_HEADER.section_size, base),
 272                     reader.file_offset)
 273
 274                 base += section_header_size
 275
 276     def section_by_address(self, address: int) -> Optional['Elf.Section']:
 277         """Returns the section that contains the provided address, if any."""
 278         # Iterate in reverse to give priority to sections with nonzero addresses
 279         for section in sorted(self.sections, reverse=True):
 280             if address in section.range():
 281                 return section
 282
 283         return None
 284
 285     def sections_with_name(self, name: str) -> Iterable['Elf.Section']:
 286         for section in self.sections:
 287             if section.name == name:
 288                 yield section
 289
 290     def read_value(self,
 291                    address: int,
 292                    size: Optional[int] = None) -> Union[None, bytes, int]:
 293         """Reads specified bytes or null-terminated string at address."""
 294         section = self.section_by_address(address)
 295         if not section:
 296             return None
 297
 298         assert section.address <= address
 299         self._elf.seek(section.file_offset + section.offset + address -
 300                        section.address)
 301
 302         if size is None:
 303             return read_c_string(self._elf)
 304
 305         return self._elf.read(size)
 306
 307     def dump_sections(self, name: Union[str,
 308                                         Pattern[str]]) -> Dict[str, bytes]:
 309         """Dumps a binary string containing the sections matching the regex."""
 310         name_regex = re.compile(name)
 311
 312         sections: Dict[str, bytes] = {}
 313         for section in self.sections:
 314             if name_regex.match(section.name):
 315                 self._elf.seek(section.file_offset + section.offset)
 316                 sections[section.name] = self._elf.read(section.size)
 317
 318         return sections
 319
 320     def dump_section_contents(
 321             self, name: Union[str, Pattern[str]]) -> Optional[bytes]:
 322         sections = self.dump_sections(name)
 323         return b''.join(sections.values()) if sections else None
 324
 325     def summary(self) -> str:
 326         return '\n'.join(
 327             '[{0:2}] {1.address:08x} {1.offset:08x} {1.size:08x} {1.name}'.
 328             format(i, section) for i, section in enumerate(self.sections))
 329
 330     def __str__(self) -> str:
 331         return 'Elf({}\n)'.format(''.join('\n  {},'.format(s)
 332                                           for s in self.sections))
 333
 334
 335 def _read_addresses(elf, size: int, output, address: Iterable[int]) -> None:
 336     for addr in address:
 337         value = elf.read_value(addr, size)
 338
 339         if value is None:
 340             raise ValueError('Invalid address 0x{:08x}'.format(addr))
 341
 342         output(value)
 343
 344
 345 def _dump_sections(elf: Elf, output, sections: Iterable[Pattern[str]]) -> None:
 346     if not sections:
 347         output(elf.summary().encode())
 348         return
 349
 350     for section_pattern in sections:
 351         output(elf.dump_section_contents(section_pattern))
 352
 353
 354 def _parse_args() -> argparse.Namespace:
 355     """Parses and returns command line arguments."""
 356     parser = argparse.ArgumentParser(description=__doc__)
 357
 358     def hex_int(arg):
 359         return int(arg, 16)
 360
 361     parser.add_argument('-e',
 362                         '--elf',
 363                         type=argparse.FileType('rb'),
 364                         help='the ELF file to examine',
 365                         required=True)
 366
 367     parser.add_argument(
 368         '-d',
 369         '--delimiter',
 370         default=ord('\n'),
 371         type=int,
 372         help=r'delimiter to write after each value; \n by default')
 373
 374     parser.set_defaults(handler=lambda **_: parser.print_help())
 375
 376     subparsers = parser.add_subparsers(
 377         help='select whether to work with addresses or whole sections')
 378
 379     section_parser = subparsers.add_parser('section')
 380     section_parser.set_defaults(handler=_dump_sections)
 381     section_parser.add_argument(
 382         'sections',
 383         metavar='section_regex',
 384         nargs='*',
 385         type=re.compile,  # type: ignore
 386         help='section name regular expression')
 387
 388     address_parser = subparsers.add_parser('address')
 389     address_parser.set_defaults(handler=_read_addresses)
 390     address_parser.add_argument(
 391         '--size',
 392         type=int,
 393         help='the size to read; reads until a null terminator by default')
 394     address_parser.add_argument('address',
 395                                 nargs='+',
 396                                 type=hex_int,
 397                                 help='hexadecimal addresses to read')
 398
 399     return parser.parse_args()
 400
 401
 402 def _main(args):
 403     """Calls the appropriate handler for the command line options."""
 404     handler = args.handler
 405     del args.handler
 406
 407     delim = args.delimiter
 408     del args.delimiter
 409
 410     def output(value):
 411         if value is not None:
 412             sys.stdout.buffer.write(value)
 413             sys.stdout.buffer.write(bytearray([delim]))
 414             sys.stdout.flush()
 415
 416     args.output = output
 417     args.elf = Elf(args.elf)
 418
 419     handler(**vars(args))
 420
 421
 422 if __name__ == '__main__':
 423     _main(_parse_args())