1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "courgette/disassembler_win32.h"
10 #include "base/logging.h"
11 #include "courgette/assembly_program.h"
12 #include "courgette/courgette.h"
14 #if COURGETTE_HISTOGRAM_TARGETS
// Constructor. Forwards |start| and |length| unchanged to the Disassembler
// base-class constructor; the body itself does no further work.
20 DisassemblerWin32::DisassemblerWin32(const uint8_t* start, size_t length)
21 : Disassembler(start, length) {}
// Converts |file_offset| to an RVA. Walks the section table in order and, for
// the first section whose raw-data range
// [file_offset_of_raw_data, file_offset_of_raw_data + size_of_raw_data)
// contains |file_offset|, returns that section's virtual_address plus the
// offset of |file_offset| within the section.
// NOTE(review): the path taken when no section matches is not visible in this
// excerpt - confirm what is returned there.
23 RVA DisassemblerWin32::FileOffsetToRVA(FileOffset file_offset) const {
24 for (int i = 0; i < number_of_sections_; ++i) {
25 const Section* section = §ions_[i];
26 if (file_offset >= section->file_offset_of_raw_data) {
27 FileOffset offset_in_section =
28 file_offset - section->file_offset_of_raw_data;
// Unsigned subtraction is safe here: the enclosing test guarantees
// file_offset >= file_offset_of_raw_data.
29 if (offset_in_section < section->size_of_raw_data)
30 return static_cast<RVA>(section->virtual_address + offset_in_section);
// Converts |rva| to a file offset. Looks up the containing section with
// RVAToSection(); when one is found, the result is the section's
// file_offset_of_raw_data plus the offset of |rva| inside the section, provided
// that offset lies within the section's raw (on-disk) data. When no section
// contains |rva|, the special values 0 and 2 are passed through unchanged.
// NOTE(review): the values returned on the failure paths are not visible in
// this excerpt.
38 FileOffset DisassemblerWin32::RVAToFileOffset(RVA rva) const {
39 const Section* section = RVAToSection(rva);
40 if (section != nullptr) {
41 FileOffset offset_in_section = rva - section->virtual_address;
42 // Need this extra check, since an |rva| may be valid for a section, but is
43 // non-existent in an image (e.g. uninit data).
44 if (offset_in_section >= section->size_of_raw_data)
47 return static_cast<FileOffset>(section->file_offset_of_raw_data +
51 // Small RVA values point into the file header in the loaded image.
52 // RVA 0 is the module load address which Windows uses as the module handle.
53 // RVA 2 sometimes occurs, I'm not sure what it is, but it would map into the
55 if (rva == 0 || rva == 2)
56 return static_cast<FileOffset>(rva);
62 // ParseHeader attempts to match up the buffer with the Windows data
63 // structures that exist within a Windows 'Portable Executable' format file.
64 // Returns 'true' if the buffer matches, and 'false' if the data looks
65 // suspicious. Rather than try to 'map' the buffer to the numerous windows
66 // structures, we extract the information we need into the courgette::PEInfo
//
// Validation proceeds in file order: DOS header ('MZ' and the offset of the PE
// header), PE signature, COFF header, optional header (magic and fixed fields),
// data directories, and finally the section table. Every failure is reported
// through Bad() with a message naming the check that failed.
69 bool DisassemblerWin32::ParseHeader() {
// The 4-byte field at kOffsetOfFileAddressOfNewExeHeader must be readable
// before anything else is inspected.
70 if (!IsRangeInBounds(kOffsetOfFileAddressOfNewExeHeader, 4))
71 return Bad("Too small");
73 // Have 'MZ' magic for a DOS header?
74 if (start()[0] != 'M' || start()[1] != 'Z')
77 // offset from DOS header to PE header is stored in DOS header.
78 FileOffset pe_header_offset = static_cast<FileOffset>(
79 ReadU32(start(), kOffsetOfFileAddressOfNewExeHeader));
// The PE header must be 8-byte aligned, must start after the DOS-header field
// that points at it, and must fit in the buffer.
80 if (pe_header_offset % 8 != 0)
81 return Bad("Misaligned PE header");
82 if (pe_header_offset < kOffsetOfFileAddressOfNewExeHeader + 4)
83 return Bad("PE header pathological overlap");
84 if (!IsRangeInBounds(pe_header_offset, kMinPeHeaderSize))
85 return Bad("PE header past end of file");
87 const uint8_t* const pe_header = FileOffsetToPointer(pe_header_offset);
89 // The 'PE' header is an IMAGE_NT_HEADERS structure as defined in WINNT.H.
90 // See http://msdn.microsoft.com/en-us/library/ms680336(VS.85).aspx
92 // The first field of the IMAGE_NT_HEADERS is the signature.
93 if (!(pe_header[0] == 'P' && pe_header[1] == 'E' && pe_header[2] == 0 &&
95 return Bad("No PE signature");
98 // The second field of the IMAGE_NT_HEADERS is the COFF header.
99 // The COFF header is also called an IMAGE_FILE_HEADER
100 // http://msdn.microsoft.com/en-us/library/ms680313(VS.85).aspx
// The COFF header starts right after the 4-byte signature.
101 FileOffset coff_header_offset = pe_header_offset + 4;
102 if (!IsRangeInBounds(coff_header_offset, kSizeOfCoffHeader))
103 return Bad("COFF header past end of file");
104 const uint8_t* const coff_header = start() + coff_header_offset;
// 16-bit fields read at byte offsets 0, 2 and 16 of the COFF header.
105 machine_type_ = ReadU16(coff_header, 0);
106 number_of_sections_ = ReadU16(coff_header, 2);
107 size_of_optional_header_ = ReadU16(coff_header, 16);
108 // Check we can read the magic.
109 if (size_of_optional_header_ < 2)
110 return Bad("Optional header no magic");
111 // Check that we can read the rest of the fixed fields. Data directories
112 // directly follow the fixed fields of the IMAGE_OPTIONAL_HEADER.
113 if (size_of_optional_header_ < RelativeOffsetOfDataDirectories())
114 return Bad("Optional header too short");
116 // The rest of the IMAGE_NT_HEADERS is the IMAGE_OPTIONAL_HEADER(32|64)
117 FileOffset optional_header_offset = pe_header_offset + kMinPeHeaderSize;
118 if (!IsRangeInBounds(optional_header_offset, size_of_optional_header_))
119 return Bad("Optional header past end of file");
120 optional_header_ = start() + optional_header_offset;
122 uint16_t magic = ReadU16(optional_header_, 0);
// NOTE(review): the conditions selecting between the 32-bit and 64-bit checks
// below are not visible in this excerpt - confirm how the variant is chosen.
125 if (magic != kImageNtOptionalHdr32Magic)
126 return Bad("64 bit executables are not supported by this disassembler");
130 if (magic != kImageNtOptionalHdr64Magic)
131 return Bad("32 bit executables are not supported by this disassembler");
135 return Bad("Unrecognized magic");
138 // The optional header is either an IMAGE_OPTIONAL_HEADER32 or
139 // IMAGE_OPTIONAL_HEADER64
140 // http://msdn.microsoft.com/en-us/library/ms680339(VS.85).aspx
142 // Copy the fields we care about.
143 size_of_code_ = ReadU32(optional_header_, 4);
144 size_of_initialized_data_ = ReadU32(optional_header_, 8);
145 size_of_uninitialized_data_ = ReadU32(optional_header_, 12);
146 base_of_code_ = ReadU32(optional_header_, 20);
// Two alternative field layouts follow: one reads a 32-bit image base at
// offset 28 and the directory count at offset 92, the other reads a 64-bit
// image base at offset 24 and the directory count at offset 108.
// NOTE(review): presumably these are the 32-bit and 64-bit optional-header
// layouts respectively; the selecting condition is not visible here.
150 base_of_data_ = ReadU32(optional_header_, 24);
151 image_base_ = ReadU32(optional_header_, 28);
152 size_of_image_ = ReadU32(optional_header_, 56);
153 number_of_data_directories_ = ReadU32(optional_header_, 92);
158 image_base_ = ReadU64(optional_header_, 24);
159 size_of_image_ = ReadU32(optional_header_, 56);
160 number_of_data_directories_ = ReadU32(optional_header_, 108);
// Reject images whose declared size is 2 GiB (0x80000000 bytes) or larger.
167 if (size_of_image_ >= 0x80000000U)
168 return Bad("Invalid SizeOfImage");
170 if (size_of_code_ >= length() || size_of_initialized_data_ >= length() ||
171 size_of_code_ + size_of_initialized_data_ >= length()) {
172 // This validation fires on some perfectly fine executables.
173 // return Bad("code or initialized data too big");
176 // TODO(sra): we can probably get rid of most of the data directories.
178 // 'b &= ...' could be short circuit 'b = b && ...' but it is not necessary
179 // for correctness and it compiles smaller this way.
// Directory indices read: 0, 1, 2, 3, 5, 11, 12, 13 and 14; the others are
// skipped.
180 b &= ReadDataDirectory(0, &export_table_);
181 b &= ReadDataDirectory(1, &import_table_);
182 b &= ReadDataDirectory(2, &resource_table_);
183 b &= ReadDataDirectory(3, &exception_table_);
184 b &= ReadDataDirectory(5, &base_relocation_table_);
185 b &= ReadDataDirectory(11, &bound_import_table_);
186 b &= ReadDataDirectory(12, &import_address_table_);
187 b &= ReadDataDirectory(13, &delay_import_descriptor_);
188 b &= ReadDataDirectory(14, &clr_runtime_header_);
190 return Bad("Malformed data directory");
192 // Sections follow the optional header.
193 FileOffset sections_offset =
194 optional_header_offset + size_of_optional_header_;
195 if (!IsArrayInBounds(sections_offset, number_of_sections_, sizeof(Section)))
196 return Bad("Sections past end of file");
// The section table is used in place: |sections_| points into the input
// buffer rather than at a copy.
197 sections_ = reinterpret_cast<const Section*>(start() + sections_offset);
198 if (!CheckSectionRanges())
199 return Bad("Out of bound section");
// Compute the largest file offset covered by any section's raw data, and note
// whether a section named ".text" exists.
201 size_t detected_length = 0;
202 for (int i = 0; i < number_of_sections_; ++i) {
203 const Section* section = §ions_[i];
205 // TODO(sra): consider using the 'characteristics' field of the section
206 // header to see if the section contains instructions.
// Comparing 6 bytes includes the terminating NUL of ".text", so a longer name
// that merely starts with ".text" does not match.
207 if (memcmp(section->name, ".text", 6) == 0)
208 has_text_section_ = true;
210 uint32_t section_end =
211 section->file_offset_of_raw_data + section->size_of_raw_data;
212 if (section_end > detected_length)
213 detected_length = section_end;
216 // Pretend our in-memory copy is only as long as our detected length.
217 ReduceLength(detected_length);
219 if (!has_text_section()) {
220 return Bad("Resource-only executables are not yet supported");
226 ////////////////////////////////////////////////////////////////////////////////
// Parses the base relocation table described by |base_relocation_table_| and
// appends to |relocs| the RVA of every entry whose type is accepted by
// SupportsRelTableType(). The collected RVAs are sorted before returning.
// Structural problems are reported through Bad().
// NOTE(review): several early-return lines and the statement that advances
// |block| are not visible in this excerpt.
228 bool DisassemblerWin32::ParseRelocs(std::vector<RVA>* relocs) {
231 size_t relocs_size = base_relocation_table_.size_;
232 if (relocs_size == 0)
235 // The format of the base relocation table is a sequence of variable sized
236 // IMAGE_BASE_RELOCATION blocks. Search for
237 // "The format of the base relocation data is somewhat quirky"
238 // at http://msdn.microsoft.com/en-us/library/ms809762.aspx
240 const uint8_t* relocs_start = RVAToPointer(base_relocation_table_.address_);
241 const uint8_t* relocs_end = relocs_start + relocs_size;
243 // Make sure entire base relocation table is within the buffer.
244 if (relocs_start < start() || relocs_start >= end() ||
245 relocs_end <= start() || relocs_end > end()) {
246 return Bad(".relocs outside image");
249 const uint8_t* block = relocs_start;
251 // Walk the variable sized blocks.
// Each block starts with an 8-byte header: a 32-bit page RVA followed by a
// 32-bit block size. The strict '<' means a block is only visited when more
// than 8 bytes remain, i.e. a trailing header-only block is not examined.
252 while (block + 8 < relocs_end) {
253 RVA page_rva = ReadU32(block, 0);
254 uint32_t size = ReadU32(block, 4);
255 if (size < 8 || // Size includes header ...
256 size % 4 != 0) // ... and is word aligned.
257 return Bad("Unreasonable relocs block");
259 const uint8_t* end_entries = block + size;
// 'end_entries <= block' guards against pointer wrap-around from a huge size.
261 if (end_entries <= block || end_entries <= start() || end_entries > end())
262 return Bad(".relocs block outside image");
264 // Walk through the two-byte entries.
265 for (const uint8_t* p = block + 8; p < end_entries; p += 2) {
266 uint16_t entry = ReadU16(p, 0);
// Top 4 bits of an entry are the relocation type; the low 12 bits are the
// offset added to the block's page RVA.
267 int type = entry >> 12;
268 int offset = entry & 0xFFF;
270 RVA rva = page_rva + offset;
271 // Skip the relocs that live outside of the image. It might be the case
272 // if a reloc is relative to a register, e.g.:
273 // mov ecx,dword ptr [eax+044D5888h]
274 RVA target_rva = PointerToTargetRVA(RVAToPointer(rva));
275 if (target_rva == kNoRVA) {
279 if (SupportsRelTableType(type)) {
280 relocs->push_back(rva);
281 } else if (type == 0) { // IMAGE_REL_BASED_ABSOLUTE
282 // Ignore, used as padding.
284 // Does not occur in Windows x86/x64 executables.
285 return Bad("Unknown type of reloc");
292 std::sort(relocs->begin(), relocs->end());
// After sorting, the largest RVA is last; it must not be the kUnassignedRVA
// sentinel.
293 DCHECK(relocs->empty() || relocs->back() != kUnassignedRVA);
// Finds the section containing |rva|. Walks the section table in order and
// tests |rva| against each section's virtual range
// [virtual_address, virtual_address + virtual_size). Note that this uses the
// in-memory size (virtual_size), not the on-disk size (size_of_raw_data).
// NOTE(review): the return statements are not visible in this excerpt;
// RVAToFileOffset() and DescribeRVA() call this and use the result as a
// Section pointer, with RVAToFileOffset() testing it against nullptr.
298 const Section* DisassemblerWin32::RVAToSection(RVA rva) const {
299 for (int i = 0; i < number_of_sections_; ++i) {
300 const Section* section = §ions_[i];
301 if (rva >= section->virtual_address) {
302 FileOffset offset_in_section = rva - section->virtual_address;
303 if (offset_in_section < section->virtual_size)
// Returns the name of |section| as a std::string. A null |section| is handled
// as a special case. Otherwise the 8-byte name field is copied into a local
// buffer and explicitly NUL-terminated at index 8, since a name that fills all
// 8 bytes carries no terminator of its own.
// NOTE(review): the value returned for a null section and the declaration of
// |name| are not visible in this excerpt.
310 std::string DisassemblerWin32::SectionName(const Section* section) {
311 if (section == nullptr)
314 memcpy(name, section->name, 8);
315 name[8] = '\0'; // Ensure termination.
// Cheap format sniffing on a raw buffer: checks the 'MZ' DOS magic, the
// alignment and bounds of the PE header offset, the "PE\0\0" signature, and
// finally whether the 16-bit optional-header magic equals |magic|. Bounds are
// checked against |length| before every read from the buffer.
// NOTE(review): the remaining parameters of the signature (|length| and
// |magic| are used below) and the early-return lines are not visible in this
// excerpt.
320 bool DisassemblerWin32::QuickDetect(const uint8_t* start,
323 if (length < kOffsetOfFileAddressOfNewExeHeader + 4)
326 // Have 'MZ' magic for a DOS header?
327 if (start[0] != 'M' || start[1] != 'Z')
330 FileOffset pe_header_offset = static_cast<FileOffset>(
331 ReadU32(start, kOffsetOfFileAddressOfNewExeHeader));
// The bounds test is written as 'length - offset < size' after establishing
// 'offset < length', which avoids overflow in 'offset + size'.
332 if (pe_header_offset % 8 != 0 ||
333 pe_header_offset < kOffsetOfFileAddressOfNewExeHeader + 4 ||
334 pe_header_offset >= length ||
335 length - pe_header_offset < kMinPeHeaderSize) {
338 const uint8_t* pe_header = start + pe_header_offset;
339 if (!(pe_header[0] == 'P' && pe_header[1] == 'E' && pe_header[2] == 0 &&
340 pe_header[3] == 0)) {
344 FileOffset optional_header_offset = pe_header_offset + kMinPeHeaderSize;
// Only the 2-byte magic of the optional header needs to be readable.
345 if (optional_header_offset >= length || length - optional_header_offset < 2)
347 const uint8_t* optional_header = start + optional_header_offset;
348 return magic == ReadU16(optional_header, 0);
// Returns true when the RVA range [start, start + length) lies within
// [0, size_of_image_). The test is written as a subtraction, guarded by
// 'start < size_of_image_', so that 'start + length' is never computed and
// cannot overflow. A |start| equal to size_of_image_ is rejected even when
// |length| is 0.
351 bool DisassemblerWin32::IsRvaRangeInBounds(size_t start, size_t length) {
352 return start < size_of_image_ && length <= size_of_image_ - start;
// Validates every entry of the section table: the raw-data range
// (file_offset_of_raw_data, size_of_raw_data) is checked with
// IsRangeInBounds() and the virtual range (virtual_address, virtual_size) is
// checked with IsRvaRangeInBounds().
// NOTE(review): the return statements are not visible in this excerpt;
// ParseHeader() treats a false result as "Out of bound section".
355 bool DisassemblerWin32::CheckSectionRanges() {
356 for (int i = 0; i < number_of_sections_; ++i) {
357 const Section* section = §ions_[i];
358 if (!IsRangeInBounds(section->file_offset_of_raw_data,
359 section->size_of_raw_data) ||
360 !IsRvaRangeInBounds(section->virtual_address, section->virtual_size)) {
// Rebuilds |abs32_locations_| from the base relocation table: the vector is
// cleared and then refilled by ParseRelocs(). In COURGETTE_HISTOGRAM_TARGETS
// builds it additionally counts, per target RVA, how many abs32 locations
// refer to it (stored in |abs32_target_rvas_|).
// NOTE(review): the return statements are not visible in this excerpt.
367 bool DisassemblerWin32::ExtractAbs32Locations() {
368 abs32_locations_.clear();
369 if (!ParseRelocs(&abs32_locations_))
372 #if COURGETTE_HISTOGRAM_TARGETS
373 for (size_t i = 0; i < abs32_locations_.size(); ++i) {
374 RVA rva = abs32_locations_[i];
375 // The 4 bytes at the relocation are a reference to some address.
376 ++abs32_target_rvas_[PointerToTargetRVA(RVAToPointer(rva))];
// Collects rel32 locations section by section. Starting from file offset 0,
// repeatedly takes the next section returned by FindNextSection(), runs
// ParseRel32RelocsFromSection() on it, and advances past its raw data. The
// resulting |rel32_locations_| vector is sorted afterwards.
// In COURGETTE_HISTOGRAM_TARGETS builds, statistics about abs32/rel32 targets
// are logged, including a "common" figure computed by stepping through both
// target maps in key order.
// NOTE(review): loop-exit and return statements are not visible in this
// excerpt.
382 bool DisassemblerWin32::ExtractRel32Locations() {
383 FileOffset file_offset = 0;
384 while (file_offset < length()) {
385 const Section* section = FindNextSection(file_offset);
386 if (section == nullptr)
// Skip any gap between the current position and the start of the section.
388 if (file_offset < section->file_offset_of_raw_data)
389 file_offset = section->file_offset_of_raw_data;
390 ParseRel32RelocsFromSection(section);
391 file_offset += section->size_of_raw_data;
393 std::sort(rel32_locations_.begin(), rel32_locations_.end());
// After sorting, the largest RVA is last; it must not be the kUnassignedRVA
// sentinel.
394 DCHECK(rel32_locations_.empty() || rel32_locations_.back() != kUnassignedRVA);
396 #if COURGETTE_HISTOGRAM_TARGETS
397 VLOG(1) << "abs32_locations_ " << abs32_locations_.size()
398 << "\nrel32_locations_ " << rel32_locations_.size()
399 << "\nabs32_target_rvas_ " << abs32_target_rvas_.size()
400 << "\nrel32_target_rvas_ " << rel32_target_rvas_.size();
// Step through both ordered maps in parallel, comparing keys.
403 std::map<RVA, int>::iterator abs32_iter = abs32_target_rvas_.begin();
404 std::map<RVA, int>::iterator rel32_iter = rel32_target_rvas_.begin();
405 while (abs32_iter != abs32_target_rvas_.end() &&
406 rel32_iter != rel32_target_rvas_.end()) {
407 if (abs32_iter->first < rel32_iter->first) {
409 } else if (rel32_iter->first < abs32_iter->first) {
417 VLOG(1) << "common " << common;
// Returns a newly heap-allocated RvaVisitor_Abs32 constructed from
// |abs32_locations_| and this disassembler. The object is created with a raw
// 'new', so the caller receives a raw pointer.
// NOTE(review): presumably the caller takes ownership - confirm at call sites.
422 RvaVisitor* DisassemblerWin32::CreateAbs32TargetRvaVisitor() {
423 return new RvaVisitor_Abs32(abs32_locations_, *this);
// Returns a newly heap-allocated RvaVisitor_Rel32 constructed from
// |rel32_locations_| and this disassembler. The object is created with a raw
// 'new', so the caller receives a raw pointer.
// NOTE(review): presumably the caller takes ownership - confirm at call sites.
426 RvaVisitor* DisassemblerWin32::CreateRel32TargetRvaVisitor() {
427 return new RvaVisitor_Rel32(rel32_locations_, *this);
// Drops from |rel32_locations_| every location whose rel32 target has no label
// in |program|. For each location the target RVA is recomputed from the 32-bit
// displacement stored at that location, and the location is removed when
// program->FindRel32Label() returns nullptr for that target. Uses the
// erase/remove_if idiom, so the relative order of the kept entries is
// preserved.
430 void DisassemblerWin32::RemoveUnusedRel32Locations(
431 AssemblyProgram* program) {
// Predicate: true when the location should be removed.
432 auto cond = [this, program](RVA rva) -> bool {
433 // + 4 since offset is relative to start of next instruction.
434 RVA target_rva = rva + 4 + Read32LittleEndian(RVAToPointer(rva));
435 return program->FindRel32Label(target_rva) == nullptr;
437 rel32_locations_.erase(
438 std::remove_if(rel32_locations_.begin(), rel32_locations_.end(), cond),
439 rel32_locations_.end());
// Returns a repeating callback that invokes ParseFile() on this object with
// |program| pre-bound as the first argument.
// NOTE(review): |this| is bound with base::Unretained, so presumably this
// disassembler must outlive any invocation of the returned callback - confirm
// with callers.
442 InstructionGenerator DisassemblerWin32::GetInstructionGenerator(
443 AssemblyProgram* program) {
444 return base::BindRepeating(&DisassemblerWin32::ParseFile,
445 base::Unretained(this), program);
// Emits the whole file to |receptor|, in file order. Bytes that lie between
// sections are handled by ParseNonSectionFileRegion(); each section's raw data
// is handled by ParseFileRegion(), which uses |program| to resolve labels.
// In COURGETTE_HISTOGRAM_TARGETS builds, target histograms are printed as
// well.
// NOTE(review): the failure returns, the loop-exit statement and the statement
// advancing |file_offset| past a section are not visible in this excerpt.
448 CheckBool DisassemblerWin32::ParseFile(AssemblyProgram* program,
449 InstructionReceptor* receptor) const {
450 // Walk all the bytes in the file, whether or not in a section.
451 FileOffset file_offset = 0;
452 while (file_offset < length()) {
453 const Section* section = FindNextSection(file_offset);
454 if (section == nullptr) {
455 // No more sections. There should not be extra stuff following last
457 // ParseNonSectionFileRegion(file_offset, pe_info().length(), receptor);
// Emit the gap (headers or padding) that precedes the next section.
460 if (file_offset < section->file_offset_of_raw_data) {
461 FileOffset section_start_offset = section->file_offset_of_raw_data;
462 if (!ParseNonSectionFileRegion(file_offset, section_start_offset,
467 file_offset = section_start_offset;
469 FileOffset end = file_offset + section->size_of_raw_data;
470 if (!ParseFileRegion(section, file_offset, end, program, receptor))
475 #if COURGETTE_HISTOGRAM_TARGETS
476 HistogramTargets("abs32 relocs", abs32_target_rvas_);
477 HistogramTargets("rel32 relocs", rel32_target_rvas_);
// Emits the bytes in the file range [start_file_offset, end_file_offset) to
// |receptor| as one EmitMultipleBytes() call. Nothing is emitted for an empty
// or inverted range. |incomplete_disassembly_| is checked first.
// NOTE(review): the return statements, including what happens when
// |incomplete_disassembly_| is set, are not visible in this excerpt.
483 CheckBool DisassemblerWin32::ParseNonSectionFileRegion(
484 FileOffset start_file_offset,
485 FileOffset end_file_offset,
486 InstructionReceptor* receptor) const {
487 if (incomplete_disassembly_)
490 if (end_file_offset > start_file_offset) {
491 if (!receptor->EmitMultipleBytes(FileOffsetToPointer(start_file_offset),
492 end_file_offset - start_file_offset)) {
// Emits the raw data of |section|, covering the file range
// [start_file_offset, end_file_offset), to |receptor|. An origin is emitted
// first; the bytes are then walked and, at each position, the code checks in
// this order for:
//   1. the start of the base relocation table -> EmitPeRelocs(),
//   2. an abs32 location                      -> EmitAbs() with its label,
//   3. a rel32 location                       -> EmitRel32() with its label,
//   4. otherwise                              -> EmitSingleByte().
// |abs32_locations_| and |rel32_locations_| are consumed with forward-only
// iterators that step past entries below the current RVA.
// NOTE(review): the statements that advance |p| and the iterators, and all
// return statements, are not visible in this excerpt.
500 CheckBool DisassemblerWin32::ParseFileRegion(
501 const Section* section,
502 FileOffset start_file_offset,
503 FileOffset end_file_offset,
504 AssemblyProgram* program,
505 InstructionReceptor* receptor) const {
506 RVA relocs_start_rva = base_relocation_table().address_;
508 const uint8_t* start_pointer = FileOffsetToPointer(start_file_offset);
509 const uint8_t* end_pointer = FileOffsetToPointer(end_file_offset);
511 RVA start_rva = FileOffsetToRVA(start_file_offset);
// |end_rva| is derived from the section's in-memory size (virtual_size), while
// the byte loop below is bounded by |end_pointer|, i.e. by the on-disk range.
512 RVA end_rva = start_rva + section->virtual_size;
513 const int kVAWidth = AbsVAWidth();
515 // Quick way to convert from Pointer to RVA within a single Section is to
516 // subtract 'pointer_to_rva'.
517 const uint8_t* const adjust_pointer_to_rva = start_pointer - start_rva;
519 std::vector<RVA>::const_iterator rel32_pos = rel32_locations_.begin();
520 std::vector<RVA>::const_iterator abs32_pos = abs32_locations_.begin();
522 if (!receptor->EmitOrigin(start_rva))
525 const uint8_t* p = start_pointer;
527 while (p < end_pointer) {
528 RVA current_rva = static_cast<RVA>(p - adjust_pointer_to_rva);
530 // The base relocation table is usually in the .relocs section, but it could
531 // actually be anywhere. Make sure we skip it because we will regenerate it
533 if (current_rva == relocs_start_rva) {
534 if (!receptor->EmitPeRelocs())
536 uint32_t relocs_size = base_relocation_table().size_;
// Skip abs32 locations that lie before the current position.
543 while (abs32_pos != abs32_locations_.end() && *abs32_pos < current_rva)
546 if (abs32_pos != abs32_locations_.end() && *abs32_pos == current_rva) {
547 RVA target_rva = PointerToTargetRVA(p);
548 DCHECK_NE(kNoRVA, target_rva);
549 // TODO(sra): target could be Label+offset. It is not clear how to guess
550 // which it might be. We assume offset==0.
551 Label* label = program->FindAbs32Label(target_rva);
553 if (!EmitAbs(label, receptor))
// Skip rel32 locations that lie before the current position.
559 while (rel32_pos != rel32_locations_.end() && *rel32_pos < current_rva)
562 if (rel32_pos != rel32_locations_.end() && *rel32_pos == current_rva) {
563 // + 4 since offset is relative to start of next instruction.
564 RVA target_rva = current_rva + 4 + Read32LittleEndian(p);
565 Label* label = program->FindRel32Label(target_rva);
567 if (!receptor->EmitRel32(label))
// When |incomplete_disassembly_| is set, check whether any abs32 location,
// rel32 location or the relocation table still lies ahead within this
// section.
573 if (incomplete_disassembly_) {
574 if ((abs32_pos == abs32_locations_.end() || end_rva <= *abs32_pos) &&
575 (rel32_pos == rel32_locations_.end() || end_rva <= *rel32_pos) &&
576 (end_rva <= relocs_start_rva || current_rva >= relocs_start_rva)) {
577 // No more relocs in this section, don't bother encoding bytes.
582 if (!receptor->EmitSingleByte(*p))
590 #if COURGETTE_HISTOGRAM_TARGETS
591 // Histogram is printed to std::cout. It is purely for debugging the algorithm
592 // and is only enabled manually in 'exploration' builds. I don't want to add
593 // command-line configuration for this feature because this code has to be
594 // small, which means compiled-out.
//
// |kind| is a label used in the printed summary line. |map| maps a target RVA
// to its reference count (indegree). The map is inverted into
// indegree -> list of RVAs and printed from the highest indegree downwards.
// NOTE(review): the computation of |total| and |index| and some of the
// branches are not visible in this excerpt.
595 void DisassemblerWin32::HistogramTargets(const char* kind,
596 const std::map<RVA, int>& map) const {
// Invert: group target RVAs by their indegree.
598 std::map<int, std::vector<RVA>> h;
599 for (std::map<RVA, int>::const_iterator p = map.begin(); p != map.end();
601 h[p->second].push_back(p->first);
605 std::cout << total << " " << kind << " to " << map.size() << " unique targets"
608 std::cout << "indegree: #targets-with-indegree (example)" << std::endl;
609 const int kFirstN = 15;
610 bool someSkipped = false;
// Reverse iteration visits the largest indegree first.
612 for (std::map<int, std::vector<RVA>>::reverse_iterator p = h.rbegin();
613 p != h.rend(); ++p) {
// A row is selected when it is among the first kFirstN rows or its indegree
// is 3 or less.
615 if (index <= kFirstN || p->first <= 3) {
617 std::cout << "..." << std::endl;
619 size_t count = p->second.size();
620 std::cout << std::dec << p->first << ": " << count;
622 for (size_t i = 0; i < count; ++i)
623 std::cout << " " << DescribeRVA(p->second[i]);
625 std::cout << std::endl;
632 #endif // COURGETTE_HISTOGRAM_TARGETS
634 // DescribeRVA is for debugging only. I would put it under #ifdef DEBUG except
635 // that during development I'm finding I need to call it when compiled in
636 // Release mode. Hence:
637 // TODO(sra): make this compile only for debug mode.
//
// Builds a human-readable description of |rva|: the RVA in hexadecimal,
// followed, when a containing section is involved, by the section name, "+",
// the hexadecimal offset of |rva| within that section, and ")".
// NOTE(review): the null-section check, the remaining punctuation and the
// return statement are not visible in this excerpt.
638 std::string DisassemblerWin32::DescribeRVA(RVA rva) const {
639 const Section* section = RVAToSection(rva);
640 std::ostringstream s;
641 s << std::hex << rva;
644 s << SectionName(section) << "+" << std::hex
645 << (rva - section->virtual_address) << ")";
// Searches for the section with the smallest file_offset_of_raw_data that is
// at or after |file_offset|, ignoring sections that have no data in the file
// (size_of_raw_data == 0). The whole section table is scanned, so the table
// does not need to be sorted by file offset. |best| starts as nullptr and
// tracks the current candidate.
// NOTE(review): the assignment to |best| and the return statement are not
// visible in this excerpt; callers compare the result against nullptr.
650 const Section* DisassemblerWin32::FindNextSection(
651 FileOffset file_offset) const {
652 const Section* best = nullptr;
653 for (int i = 0; i < number_of_sections_; ++i) {
654 const Section* section = §ions_[i];
655 if (section->size_of_raw_data > 0) { // i.e. has data in file.
656 if (file_offset <= section->file_offset_of_raw_data) {
657 if (best == nullptr ||
658 section->file_offset_of_raw_data < best->file_offset_of_raw_data) {
// Reads data directory entry |index| from the optional header into
// |directory|. Each entry is 8 bytes (a 32-bit RVA followed by a 32-bit size)
// and the entries start RelativeOffsetOfDataDirectories() bytes into the
// optional header. When |index| is not below |number_of_data_directories_|,
// |directory| is zeroed instead. Malformed entries are reported through Bad().
// NOTE(review): the 'return true' paths are not visible in this excerpt.
667 bool DisassemblerWin32::ReadDataDirectory(int index,
668 ImageDataDirectory* directory) {
669 if (index < number_of_data_directories_) {
// Despite its name, |file_offset| is relative to the start of the optional
// header, not to the start of the file.
670 FileOffset file_offset = index * 8 + RelativeOffsetOfDataDirectories();
671 if (file_offset >= size_of_optional_header_)
672 return Bad("Number of data directories inconsistent");
673 const uint8_t* data_directory = optional_header_ + file_offset;
// NOTE(review): '>=' is conservative - an entry ending exactly at end() is
// rejected although all 8 of its bytes would be readable.
674 if (data_directory < start() || data_directory + 8 >= end())
675 return Bad("Data directory outside image");
676 RVA rva = ReadU32(data_directory, 0);
677 size_t size = ReadU32(data_directory, 4);
678 if (size > size_of_image_)
679 return Bad("Data directory size too big");
681 // TODO(sra): validate RVA.
682 directory->address_ = rva;
683 directory->size_ = static_cast<uint32_t>(size);
// Directory not present in this image: report it as empty.
686 directory->address_ = 0;
687 directory->size_ = 0;
692 } // namespace courgette