From 6f326ce75bc6424d1e01bbae525323bc5a57f2c8 Mon Sep 17 00:00:00 2001 From: Kevin Enderby Date: Thu, 23 Oct 2014 19:37:31 +0000 Subject: [PATCH] =?utf8?q?Update=20llvm-objdump=E2=80=99s=20Mach-O=20symbo?= =?utf8?q?lizer=20code=20for=20Objective-C=20references.?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This prints disassembly comments for Objective-C references to CFStrings, Selectors, Classes and method calls. llvm-svn: 220500 --- llvm/lib/Object/MachOObjectFile.cpp | 40 +- .../llvm-objdump/X86/Inputs/ObjC.exe.macho-x86_64 | Bin 0 -> 8944 bytes .../llvm-objdump/X86/Inputs/ObjC.obj.macho-x86_64 | Bin 0 -> 1732 bytes .../X86/macho-symbolized-disassembly.test | 17 + llvm/tools/llvm-objdump/MachODump.cpp | 615 +++++++++++++++++++-- 5 files changed, 626 insertions(+), 46 deletions(-) create mode 100755 llvm/test/tools/llvm-objdump/X86/Inputs/ObjC.exe.macho-x86_64 create mode 100644 llvm/test/tools/llvm-objdump/X86/Inputs/ObjC.obj.macho-x86_64 diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index 6e83d81..0bd61ce 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -2345,11 +2345,47 @@ MachOObjectFile::getDataInCodeTableEntry(uint32_t DataOffset, } MachO::symtab_command MachOObjectFile::getSymtabLoadCommand() const { - return getStruct(this, SymtabLoadCmd); + if (SymtabLoadCmd) + return getStruct(this, SymtabLoadCmd); + + // If there is no SymtabLoadCmd return a load command with zero'ed fields. 
+ MachO::symtab_command Cmd; + Cmd.cmd = MachO::LC_SYMTAB; + Cmd.cmdsize = sizeof(MachO::symtab_command); + Cmd.symoff = 0; + Cmd.nsyms = 0; + Cmd.stroff = 0; + Cmd.strsize = 0; + return Cmd; } MachO::dysymtab_command MachOObjectFile::getDysymtabLoadCommand() const { - return getStruct(this, DysymtabLoadCmd); + if (DysymtabLoadCmd) + return getStruct(this, DysymtabLoadCmd); + + // If there is no DysymtabLoadCmd return a load command with zero'ed fields. + MachO::dysymtab_command Cmd; + Cmd.cmd = MachO::LC_DYSYMTAB; + Cmd.cmdsize = sizeof(MachO::dysymtab_command); + Cmd.ilocalsym = 0; + Cmd.nlocalsym = 0; + Cmd.iextdefsym = 0; + Cmd.nextdefsym = 0; + Cmd.iundefsym = 0; + Cmd.nundefsym = 0; + Cmd.tocoff = 0; + Cmd.ntoc = 0; + Cmd.modtaboff = 0; + Cmd.nmodtab = 0; + Cmd.extrefsymoff = 0; + Cmd.nextrefsyms = 0; + Cmd.indirectsymoff = 0; + Cmd.nindirectsyms = 0; + Cmd.extreloff = 0; + Cmd.nextrel = 0; + Cmd.locreloff = 0; + Cmd.nlocrel = 0; + return Cmd; } MachO::linkedit_data_command diff --git a/llvm/test/tools/llvm-objdump/X86/Inputs/ObjC.exe.macho-x86_64 b/llvm/test/tools/llvm-objdump/X86/Inputs/ObjC.exe.macho-x86_64 new file mode 100755 index 0000000000000000000000000000000000000000..4de8a1ff60af9192bf95bffeadf34ca2252d8978 GIT binary patch literal 8944 zcmeHN&u<$=6dpH(rZjCGTA}cx(xxP+KouhdQdBN+9S0YpCXE9W1e)5;BzCdaVRxe> zND&cjC88*jPjEwUMo8t>Ge6qg`VSz;0g#bEia1t?@O`th@p_$xBbR;B+nG1-&9`sf zzMZV+<=*{2|2Zr~qDzRiLqdp?;4d(6Q&{K;@i;gKmU1fnYWAJ%^dv{yDbX(OD-mZI zH;{5Bdu68GM9)vQj|n5g710f5OOn!4%bwr&<#xR7<5&z7C1XPQp_Vy?wXi6q>n#@D zy`iXoV<&vPUSko+!Gs=z_pll8@nl(57nFmsDBhI88#MNm8TUQv@ULFHR+2U4mF+@J zb>Mjh?}!1Q9HusGD5Na8o>Q^sjZxIURf9KR^pwNYiMMDkRqR<=vFGYAjN*N6@R&m+ z$_J5@sx0RmE>%+$uW9i54G3lA?gNQ=cUI@!Sb8QMGDCi~$%!h>8{j@ z^{QO(9ISJHya0yz8}^Gh;n+!8EgXcG#C-Y};870a@qGyAwUjzfDz(DA;-iTAXBoU6 z%p*|_`{R$ct_IIlRY%R?^@!q~HF#(=Q4afucQ#OIy}>J03$9DVDBef}k8&6fAQ8OM zTsV=VcsYY->APbO>%=n}ziI7r6BjSNo*law>`?${%M-|?9TPl02d&qmLY%=q`U3lw z5R7#X+BLKRePSUWA>2YsJHjxRw+4Lss{m*~J_?=!hl#f2ASc7Rpr6D1{q)!d>5||+ 
zmRfY3RJBq}&8}2uA(Ng2(~lp1{&scqi5IVYbt8TAuaBFbai2IW9)rR3@Yc9PvlM0e&L!HSiOywPAh!t%ffQW1W>Ej6s}N1_$#*-&;ttPsIU# zW$KPP*tc}5-NxJ}?4J<7V6@#ozOlpUuy;Z{11rXR+w$|_StBZ5(?AQf}krK{o66a_iY<^E@@{ z*`3@*_mHnWuC;gdAK%=@YBRTf=ay*l8{)Qxx&FQN-gk}Np*xUvhqiE?*gX9Wnsk15 z=u5N>ert%|f7{*NEgxg=NAzs9B=Di5#7tRPrA5b4wr6p`D%i7@SE(th;-0sLM)AEM z#8X57ge(D+=alZC`{A@mEGHAm-orf&!bj+;`F(ZG zw1GPKow*zt5A;F1#Sdao@H^7dQhbRc5Jw=6KpcTM0&xW52*eSHBM?U*jzAoNI0A75 z{?7>Xok>oor^hZoI?7^@E{4k@>bWRhUv@;}zOroKCkRqT@v%dl#uv;fJ3rYmTFm*7CnVpdx7o_)Cv__4Do2BOKhJuK@B?h zf9mDG5p07|C(?8dSDt{p3+AjFU>=^_hAkRho}-@SH0n>4>b2p*!a`LIx5_~n?=to% zg=xIN{%EKK1=XG%4{5@<{Vm*%GSY*vj{w^*9}yYYbnrw%SL!tQV_No4iFh-f9-(Dd z)6=*@;CeN&Ruz^fBk~8n9K=q1NMo9^Xxd7XN-y%d-MsviW+T}xEuv5; zNJ9~N^yp2G3LZQPiuKm3M=wGS3Z6x(-|Wts?N$*7-uq_edvA8$o42$7{p-O1QQ{b3 z!eHFw$-sb|!5C)yFJkg0$WvJp+K5R!jxz~S%F`QOpa*s-zgX(pp=OK1@p_?mFeTa7 zOPdU#DAinEv6a1Qxo%5WT&1S6rEJI&)kIATCJ~2I9PK0O349^tTN1CKy_#b+H1a*1 z--g5q&oj))MMPr$_DDQeuX1*L{=@lg%gQi`QOR#Y@&kmFzfvjN^;&hOD*NL3JrVyo z!HIw7XIG*D`<6jAsZ{|+lc)-s3>|Vg^JnWe5_bz#4;Eo(=_(d$&UJh`nS zG?i-Fu-95G?RW;SGc2cUc=d)h>h3k;{A7H`2W|!tF(P_YwF5({(Pd7Hnz%ntiH&q3 zeQF@Ni&Z1=pT+oi@Y5IBPn1oiANAx0`QBN$)j+>E4Lkw74n%D-SAnQc=7RVQ@pIo} zKi3ocGD#qgkOFW&O0f;zx1px^@gg1HMvw3jc0Yc7O5tIK{iDq`9zyuP?h5|^$np#$ zg#h7uMNn`~$1l|x|26lv|g_zBE6XmbOY%WWbC)Kd(4h7w{;Z}>}4~^*> k6#{xDUozF5?7W)I&g2!XsxkDZKa-a!Zw5c}C8STk0Mr%D*Z=?k literal 0 HcmV?d00001 diff --git a/llvm/test/tools/llvm-objdump/X86/macho-symbolized-disassembly.test b/llvm/test/tools/llvm-objdump/X86/macho-symbolized-disassembly.test index 6515316..c8322bc 100644 --- a/llvm/test/tools/llvm-objdump/X86/macho-symbolized-disassembly.test +++ b/llvm/test/tools/llvm-objdump/X86/macho-symbolized-disassembly.test @@ -1,8 +1,25 @@ // RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s -check-prefix=OBJ // RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.exe.macho-x86_64 | FileCheck %s 
-check-prefix=EXE +// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/ObjC.obj.macho-x86_64 | FileCheck %s -check-prefix=ObjC-OBJ +// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/ObjC.exe.macho-x86_64 | FileCheck %s -check-prefix=ObjC-EXE OBJ: 0000000000000008 leaq L_.str(%rip), %rax ## literal pool for: "Hello world\n" OBJ: 0000000000000026 callq _printf EXE: 0000000100000f38 leaq 0x4f(%rip), %rax ## literal pool for: "Hello world\n" EXE: 0000000100000f56 callq 0x100000f6c ## symbol stub for: _printf + +ObjC-OBJ: 0000000000000008 leaq 0xb1(%rip), %rax ## Objc cfstring ref: @"The current date and time is: %@" +ObjC-OBJ: 0000000000000016 movq 0x4b(%rip), %rcx ## Objc class ref: NSObject +ObjC-OBJ: 000000000000001d movq 0x64(%rip), %rsi ## Objc selector ref: new +ObjC-OBJ: 0000000000000034 movq 0x35(%rip), %rax ## Objc class ref: NSDate +ObjC-OBJ: 000000000000003b movq 0x4e(%rip), %rsi ## Objc selector ref: date + +ObjC-EXE: 0000000100000ee8 leaq 0x159(%rip), %rax ## Objc cfstring ref: @"The current date and time is: %@" +ObjC-EXE: 0000000100000ef6 movq 0x13b(%rip), %rcx ## Objc class ref: _OBJC_CLASS_$_NSObject +ObjC-EXE: 0000000100000efd movq 0x124(%rip), %rsi ## Objc selector ref: new +ObjC-EXE: 0000000100000f0b callq 0x100000f4a ## Objc message: +[NSObject new] +ObjC-EXE: 0000000100000f14 movq 0x125(%rip), %rax ## Objc class ref: _OBJC_CLASS_$_NSDate +ObjC-EXE: 0000000100000f1b movq 0x10e(%rip), %rsi ## Objc selector ref: date +ObjC-EXE: 0000000100000f25 callq 0x100000f4a ## Objc message: +[NSDate date] +ObjC-EXE: 0000000100000f33 callq 0x100000f44 ## symbol stub for: _NSLog diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp index 54aa3f2..bf0108c 100644 --- a/llvm/tools/llvm-objdump/MachODump.cpp +++ b/llvm/tools/llvm-objdump/MachODump.cpp @@ -235,6 +235,9 @@ void llvm::DisassembleInputMachO(StringRef Filename) { } typedef DenseMap 
SymbolAddressMap; +typedef std::pair<uint64_t, const char *> BindInfoEntry; +typedef std::vector<BindInfoEntry> BindTable; +typedef BindTable::iterator bind_table_iterator; // The block of info used by the Symbolizer call backs. struct DisassembleInfo { @@ -242,6 +245,11 @@ struct DisassembleInfo { MachOObjectFile *O; SectionRef S; SymbolAddressMap *AddrMap; + std::vector<SectionRef> *Sections; + const char *class_name; + const char *selector_name; + char *method; + BindTable *BindTable; }; // SymbolizerGetOpInfo() is the operand information call back function. @@ -342,7 +350,7 @@ int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset, // TODO: // Second search the external relocation entries of a fully linked image // (if any) for an entry that matches this segment offset. - //uint64_t seg_offset = (Pc + Offset); + // uint64_t seg_offset = (Pc + Offset); return 0; } else if (Arch == Triple::arm) { return 0; @@ -445,7 +453,7 @@ static const char *GuessIndirectSymbol(uint64_t ReferenceValue, uint32_t index = Sec.reserved1 + (ReferenceValue - Sec.addr) / stride; if (index < Dysymtab.nindirectsyms) { uint32_t indirect_symbol = - info->O->getIndirectSymbolTableEntry(Dysymtab, index); + info->O->getIndirectSymbolTableEntry(Dysymtab, index); if (indirect_symbol < Symtab.nsyms) { symbol_iterator Sym = info->O->getSymbolByIndex(indirect_symbol); SymbolRef Symbol = *Sym; @@ -479,7 +487,7 @@ static const char *GuessIndirectSymbol(uint64_t ReferenceValue, uint32_t index = Sec.reserved1 + (ReferenceValue - Sec.addr) / stride; if (index < Dysymtab.nindirectsyms) { uint32_t indirect_symbol = - info->O->getIndirectSymbolTableEntry(Dysymtab, index); + info->O->getIndirectSymbolTableEntry(Dysymtab, index); if (indirect_symbol < Symtab.nsyms) { symbol_iterator Sym = info->O->getSymbolByIndex(indirect_symbol); SymbolRef Symbol = *Sym; @@ -500,6 +508,401 @@ static const char *GuessIndirectSymbol(uint64_t ReferenceValue, return nullptr; } +// method_reference() is called passing it the ReferenceName that might be +// a
reference it to an Objective-C method call. If so then it allocates and +// assembles a method call string with the values last seen and saved in +// the DisassembleInfo's class_name and selector_name fields. This is saved +// into the method field of the info and any previous string is free'ed. +// Then the class_name field in the info is set to nullptr. The method call +// string is set into ReferenceName and ReferenceType is set to +// LLVMDisassembler_ReferenceType_Out_Objc_Message. If this not a method call +// then both ReferenceType and ReferenceName are left unchanged. +static void method_reference(struct DisassembleInfo *info, + uint64_t *ReferenceType, + const char **ReferenceName) { + if (*ReferenceName != nullptr) { + if (strcmp(*ReferenceName, "_objc_msgSend") == 0) { + if (info->selector_name != NULL) { + if (info->method != nullptr) + free(info->method); + if (info->class_name != nullptr) { + info->method = (char *)malloc(5 + strlen(info->class_name) + + strlen(info->selector_name)); + if (info->method != nullptr) { + strcpy(info->method, "+["); + strcat(info->method, info->class_name); + strcat(info->method, " "); + strcat(info->method, info->selector_name); + strcat(info->method, "]"); + *ReferenceName = info->method; + *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Message; + } + } else { + info->method = (char *)malloc(9 + strlen(info->selector_name)); + if (info->method != nullptr) { + strcpy(info->method, "-[%rdi "); + strcat(info->method, info->selector_name); + strcat(info->method, "]"); + *ReferenceName = info->method; + *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Message; + } + } + info->class_name = nullptr; + } + } else if (strcmp(*ReferenceName, "_objc_msgSendSuper2") == 0) { + if (info->selector_name != NULL) { + if (info->method != nullptr) + free(info->method); + info->method = (char *)malloc(17 + strlen(info->selector_name)); + if (info->method != nullptr) { + strcpy(info->method, "-[[%rdi super] "); + 
strcat(info->method, info->selector_name); + strcat(info->method, "]"); + *ReferenceName = info->method; + *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Message; + } + info->class_name = nullptr; + } + } + } +} + +// GuessPointerPointer() is passed the address of what might be a pointer to +// a reference to an Objective-C class, selector, message ref or cfstring. +// If so the value of the pointer is returned and one of the booleans are set +// to true. If not zero is returned and all the booleans are set to false. +static uint64_t GuessPointerPointer(uint64_t ReferenceValue, + struct DisassembleInfo *info, + bool &classref, bool &selref, bool &msgref, + bool &cfstring) { + classref = false; + selref = false; + msgref = false; + cfstring = false; + uint32_t LoadCommandCount = info->O->getHeader().ncmds; + MachOObjectFile::LoadCommandInfo Load = info->O->getFirstLoadCommandInfo(); + for (unsigned I = 0;; ++I) { + if (Load.C.cmd == MachO::LC_SEGMENT_64) { + MachO::segment_command_64 Seg = info->O->getSegment64LoadCommand(Load); + for (unsigned J = 0; J < Seg.nsects; ++J) { + MachO::section_64 Sec = info->O->getSection64(Load, J); + if ((strncmp(Sec.sectname, "__objc_selrefs", 16) == 0 || + strncmp(Sec.sectname, "__objc_classrefs", 16) == 0 || + strncmp(Sec.sectname, "__objc_superrefs", 16) == 0 || + strncmp(Sec.sectname, "__objc_msgrefs", 16) == 0 || + strncmp(Sec.sectname, "__cfstring", 16) == 0) && + ReferenceValue >= Sec.addr && + ReferenceValue < Sec.addr + Sec.size) { + uint64_t sect_offset = ReferenceValue - Sec.addr; + uint64_t object_offset = Sec.offset + sect_offset; + StringRef MachOContents = info->O->getData(); + uint64_t object_size = MachOContents.size(); + const char *object_addr = (const char *)MachOContents.data(); + if (object_offset < object_size) { + uint64_t pointer_value; + memcpy(&pointer_value, object_addr + object_offset, + sizeof(uint64_t)); + if (info->O->isLittleEndian() != sys::IsLittleEndianHost) + 
sys::swapByteOrder(pointer_value); + if (strncmp(Sec.sectname, "__objc_selrefs", 16) == 0) + selref = true; + else if (strncmp(Sec.sectname, "__objc_classrefs", 16) == 0 || + strncmp(Sec.sectname, "__objc_superrefs", 16) == 0) + classref = true; + else if (strncmp(Sec.sectname, "__objc_msgrefs", 16) == 0 && + ReferenceValue + 8 < Sec.addr + Sec.size) { + msgref = true; + memcpy(&pointer_value, object_addr + object_offset + 8, + sizeof(uint64_t)); + if (info->O->isLittleEndian() != sys::IsLittleEndianHost) + sys::swapByteOrder(pointer_value); + } else if (strncmp(Sec.sectname, "__cfstring", 16) == 0) + cfstring = true; + return pointer_value; + } else { + return 0; + } + } + } + } + // TODO: Look for LC_SEGMENT for 32-bit Mach-O files. + if (I == LoadCommandCount - 1) + break; + else + Load = info->O->getNextLoadCommandInfo(Load); + } + return 0; +} + +// get_pointer_64 returns a pointer to the bytes in the object file at the +// Address from a section in the Mach-O file. And indirectly returns the +// offset into the section, number of bytes left in the section past the offset +// and which section is was being referenced. If the Address is not in a +// section nullptr is returned. 
+const char *get_pointer_64(uint64_t Address, uint32_t &offset, uint32_t &left, + SectionRef &S, DisassembleInfo *info) { + offset = 0; + left = 0; + S = SectionRef(); + for (unsigned SectIdx = 0; SectIdx != info->Sections->size(); SectIdx++) { + uint64_t SectAddress = ((*(info->Sections))[SectIdx]).getAddress(); + uint64_t SectSize = ((*(info->Sections))[SectIdx]).getSize(); + if (Address >= SectAddress && Address < SectAddress + SectSize) { + S = (*(info->Sections))[SectIdx]; + offset = Address - SectAddress; + left = SectSize - offset; + StringRef SectContents; + ((*(info->Sections))[SectIdx]).getContents(SectContents); + return SectContents.data() + offset; + } + } + return nullptr; +} + +// get_symbol_64() returns the name of a symbol (or nullptr) and the address of +// the symbol indirectly through n_value. Based on the relocation information +// for the specified section offset in the specified section reference. +const char *get_symbol_64(uint32_t sect_offset, SectionRef S, + DisassembleInfo *info, uint64_t &n_value) { + n_value = 0; + if (info->verbose == false) + return nullptr; + + // See if there is an external relocation entry at the sect_offset. + bool reloc_found = false; + DataRefImpl Rel; + MachO::any_relocation_info RE; + bool isExtern = false; + SymbolRef Symbol; + for (const RelocationRef &Reloc : S.relocations()) { + uint64_t RelocOffset; + Reloc.getOffset(RelocOffset); + if (RelocOffset == sect_offset) { + Rel = Reloc.getRawDataRefImpl(); + RE = info->O->getRelocation(Rel); + if (info->O->isRelocationScattered(RE)) + continue; + isExtern = info->O->getPlainRelocationExternal(RE); + if (isExtern) { + symbol_iterator RelocSym = Reloc.getSymbol(); + Symbol = *RelocSym; + } + reloc_found = true; + break; + } + } + // If there is an external relocation entry for a symbol in this section + // at this section_offset then use that symbol's value for the n_value + // and return its name. 
+ const char *SymbolName = nullptr; + if (reloc_found && isExtern) { + Symbol.getAddress(n_value); + StringRef name; + Symbol.getName(name); + if (!name.empty()) { + SymbolName = name.data(); + return SymbolName; + } + } + + // TODO: For fully linked images, look through the external relocation + // entries off the dynamic symtab command. For these the r_offset is from the + // start of the first writeable segment in the Mach-O file. So the offset + // to this section from that segment is passed to this routine by the caller, + // as the database_offset. Which is the difference of the section's starting + // address and the first writable segment. + // + // NOTE: need to add passing the database_offset to this routine. + + // TODO: We did not find an external relocation entry so look up the + // ReferenceValue as an address of a symbol and if found return that symbol's + // name. + // + // NOTE: need to add passing the ReferenceValue to this routine. Then that code + // would simply be this: + // + // if (ReferenceValue != 0xffffffffffffffffLLU && + // ReferenceValue != 0xfffffffffffffffeLLU) { + // StringRef name = info->AddrMap->lookup(ReferenceValue); + // if (!name.empty()) + // SymbolName = name.data(); + // } + + return SymbolName; +} + +// These are structs in the Objective-C meta data and are read to produce the +// comments for disassembly. While these are part of the ABI they have no +// public definitions. So they are here, not in include/llvm/Support/MachO.h . + +// The cfstring object in a 64-bit Mach-O file. +struct cfstring64_t { + uint64_t isa; // class64_t * (64-bit pointer) + uint64_t flags; // flag bits + uint64_t characters; // char * (64-bit pointer) + uint64_t length; // number of non-NULL characters in above +}; + +// The class object in a 64-bit Mach-O file.
+struct class64_t { + uint64_t isa; // class64_t * (64-bit pointer) + uint64_t superclass; // class64_t * (64-bit pointer) + uint64_t cache; // Cache (64-bit pointer) + uint64_t vtable; // IMP * (64-bit pointer) + uint64_t data; // class_ro64_t * (64-bit pointer) +}; + +struct class_ro64_t { + uint32_t flags; + uint32_t instanceStart; + uint32_t instanceSize; + uint32_t reserved; + uint64_t ivarLayout; // const uint8_t * (64-bit pointer) + uint64_t name; // const char * (64-bit pointer) + uint64_t baseMethods; // const method_list_t * (64-bit pointer) + uint64_t baseProtocols; // const protocol_list_t * (64-bit pointer) + uint64_t ivars; // const ivar_list_t * (64-bit pointer) + uint64_t weakIvarLayout; // const uint8_t * (64-bit pointer) + uint64_t baseProperties; // const struct objc_property_list (64-bit pointer) +}; + +inline void swapStruct(struct cfstring64_t &cfs) { + sys::swapByteOrder(cfs.isa); + sys::swapByteOrder(cfs.flags); + sys::swapByteOrder(cfs.characters); + sys::swapByteOrder(cfs.length); +} + +inline void swapStruct(struct class64_t &c) { + sys::swapByteOrder(c.isa); + sys::swapByteOrder(c.superclass); + sys::swapByteOrder(c.cache); + sys::swapByteOrder(c.vtable); + sys::swapByteOrder(c.data); +} + +inline void swapStruct(struct class_ro64_t &cro) { + sys::swapByteOrder(cro.flags); + sys::swapByteOrder(cro.instanceStart); + sys::swapByteOrder(cro.instanceSize); + sys::swapByteOrder(cro.reserved); + sys::swapByteOrder(cro.ivarLayout); + sys::swapByteOrder(cro.name); + sys::swapByteOrder(cro.baseMethods); + sys::swapByteOrder(cro.baseProtocols); + sys::swapByteOrder(cro.ivars); + sys::swapByteOrder(cro.weakIvarLayout); + sys::swapByteOrder(cro.baseProperties); +} + +static const char *get_dyld_bind_info_symbolname(uint64_t ReferenceValue, + struct DisassembleInfo *info); + +// get_objc2_64bit_class_name() is used for disassembly and is passed a pointer +// to an Objective-C class and returns the class name. 
It is also passed the +// address of the pointer, so when the pointer is zero as it can be in a .o +// file, that is used to look for an external relocation entry with a symbol +// name. +const char *get_objc2_64bit_class_name(uint64_t pointer_value, + uint64_t ReferenceValue, + struct DisassembleInfo *info) { + const char *r; + uint32_t offset, left; + SectionRef S; + + // The pointer_value can be 0 in an object file and have a relocation + // entry for the class symbol at the ReferenceValue (the address of the + // pointer). + if (pointer_value == 0) { + r = get_pointer_64(ReferenceValue, offset, left, S, info); + if (r == nullptr || left < sizeof(uint64_t)) + return nullptr; + uint64_t n_value; + const char *symbol_name = get_symbol_64(offset, S, info, n_value); + if (symbol_name == nullptr) + return nullptr; + const char *class_name = rindex(symbol_name, '$'); + if (class_name != nullptr && class_name[1] == '_' && class_name[2] != '\0') + return class_name + 2; + else + return nullptr; + } + + // The case where the pointer_value is non-zero and points to a class defined + // in this Mach-O file. + r = get_pointer_64(pointer_value, offset, left, S, info); + if (r == nullptr || left < sizeof(struct class64_t)) + return nullptr; + struct class64_t c; + memcpy(&c, r, sizeof(struct class64_t)); + if (info->O->isLittleEndian() != sys::IsLittleEndianHost) + swapStruct(c); + if (c.data == 0) + return nullptr; + r = get_pointer_64(c.data, offset, left, S, info); + if (r == nullptr || left < sizeof(struct class_ro64_t)) + return nullptr; + struct class_ro64_t cro; + memcpy(&cro, r, sizeof(struct class_ro64_t)); + if (info->O->isLittleEndian() != sys::IsLittleEndianHost) + swapStruct(cro); + if (cro.name == 0) + return nullptr; + const char *name = get_pointer_64(cro.name, offset, left, S, info); + return name; +} + +// get_objc2_64bit_cfstring_name is used for disassembly and is passed a +// pointer to a cfstring and returns its name or nullptr.
+const char *get_objc2_64bit_cfstring_name(uint64_t ReferenceValue, + struct DisassembleInfo *info) { + const char *r, *name; + uint32_t offset, left; + SectionRef S; + struct cfstring64_t cfs; + uint64_t cfs_characters; + + r = get_pointer_64(ReferenceValue, offset, left, S, info); + if (r == nullptr || left < sizeof(struct cfstring64_t)) + return nullptr; + memcpy(&cfs, r, sizeof(struct cfstring64_t)); + if (info->O->isLittleEndian() != sys::IsLittleEndianHost) + swapStruct(cfs); + if (cfs.characters == 0) { + uint64_t n_value; + const char *symbol_name = get_symbol_64( + offset + offsetof(struct cfstring64_t, characters), S, info, n_value); + if (symbol_name == nullptr) + return nullptr; + cfs_characters = n_value; + } else + cfs_characters = cfs.characters; + name = get_pointer_64(cfs_characters, offset, left, S, info); + + return name; +} + +// get_objc2_64bit_selref() is used for disassembly and is passed the address +// of a pointer to an Objective-C selector reference when the pointer value is +// zero as in a .o file and is likely to have an external relocation entry +// whose symbol's n_value is the real pointer to the selector name.
If that is +// the case the real pointer to the selector name is returned else 0 is +// returned. +uint64_t get_objc2_64bit_selref(uint64_t ReferenceValue, + struct DisassembleInfo *info) { + uint32_t offset, left; + SectionRef S; + + const char *r = get_pointer_64(ReferenceValue, offset, left, S, info); + if (r == nullptr || left < sizeof(uint64_t)) + return 0; + uint64_t n_value; + const char *symbol_name = get_symbol_64(offset, S, info, n_value); + if (symbol_name == nullptr) + return 0; + return n_value; +} + // GuessLiteralPointer returns a string which for the item in the Mach-O file // for the address passed in as ReferenceValue for printing as a comment with // the instruction and also returns the corresponding type of that item @@ -509,13 +912,20 @@ static const char *GuessIndirectSymbol(uint64_t ReferenceValue, // cstring is returned and ReferenceType is set to // LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr . // -// TODO: other literals such as Objective-C CFStrings refs, Selector refs, -// Message refs, Class refs and a Symbol address in a literal pool are yet -// to be done here. +// If ReferenceValue is an address of an Objective-C CFString, Selector ref or +// Class ref that name is returned and the ReferenceType is set accordingly. +// +// Lastly, literals which are Symbol addresses in a literal pool are looked for +// and if found the symbol name is returned and ReferenceType is set to +// LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr . +// +// If there is no item in the Mach-O file for the address passed in as +// ReferenceValue nullptr is returned and ReferenceType is unchanged. const char *GuessLiteralPointer(uint64_t ReferenceValue, uint64_t ReferencePC, uint64_t *ReferenceType, struct DisassembleInfo *info) { - // TODO: This rouine's code is only for an x86_64 Mach-O file for now. + // TODO: This routine's code and the routines it calls only work with + // x86_64 Mach-O files for now.
unsigned int Arch = info->O->getArch(); if (Arch != Triple::x86_64) return nullptr; @@ -556,20 +966,71 @@ const char *GuessLiteralPointer(uint64_t ReferenceValue, uint64_t ReferencePC, } } - // TODO: the code to look for other literals such as Objective-C CFStrings - // refs, Selector refs, Message refs, Class refs will be added here. + // Look for literals such as Objective-C CFStrings refs, Selector refs, + // Message refs and Class refs. + bool classref, selref, msgref, cfstring; + uint64_t pointer_value = GuessPointerPointer(ReferenceValue, info, classref, + selref, msgref, cfstring); + if (classref == true && pointer_value == 0) { + // Note the ReferenceValue is a pointer into the __objc_classrefs section. + // And the pointer_value in that section is typically zero as it will be + // set by dyld as part of the "bind information". + const char *name = get_dyld_bind_info_symbolname(ReferenceValue, info); + if (name != nullptr) { + *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref; + const char *class_name = rindex(name, '$'); + if (class_name != nullptr && class_name[1] == '_' && + class_name[2] != '\0') { + info->class_name = class_name + 2; + return name; + } + } + } + + if (classref == true) { + *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref; + const char *name = + get_objc2_64bit_class_name(pointer_value, ReferenceValue, info); + if (name != nullptr) + info->class_name = name; + else + name = "bad class ref"; + return name; + } + + if (cfstring == true) { + *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref; + const char *name = get_objc2_64bit_cfstring_name(ReferenceValue, info); + return name; + } + + if (selref == true && pointer_value == 0) + pointer_value = get_objc2_64bit_selref(ReferenceValue, info); + + if (pointer_value != 0) + ReferenceValue = pointer_value; const char *name = GuessCstringPointer(ReferenceValue, info); if (name) { - // TODO: note when the code is added above for Selector refs 
and Message - // refs we will need check for that here and set the ReferenceType - // accordingly. - *ReferenceType = LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr; + if (pointer_value != 0 && selref == true) { + *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref; + info->selector_name = name; + } else if (pointer_value != 0 && msgref == true) { + info->class_name = nullptr; + *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref; + info->selector_name = name; + } else + *ReferenceType = LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr; return name; } - // TODO: look for an indirect symbol with this ReferenceValue which is in - // a literal pool. + // Lastly look for an indirect symbol with this ReferenceValue which is in + // a literal pool. If found return that symbol name. + name = GuessIndirectSymbol(ReferenceValue, info); + if (name) { + *ReferenceType = LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr; + return name; + } return nullptr; } @@ -584,7 +1045,7 @@ const char *GuessLiteralPointer(uint64_t ReferenceValue, uint64_t ReferencePC, // Out type and the ReferenceName will also be set which is added as a comment // to the disassembled instruction. // -// If the symbol name is a C++ mangled name then the demangled name is +// TODO: If the symbol name is a C++ mangled name then the demangled name is // returned through ReferenceName and ReferenceType is set to // LLVMDisassembler_ReferenceType_DeMangled_Name . // @@ -599,7 +1060,7 @@ const char *GuessLiteralPointer(uint64_t ReferenceValue, uint64_t ReferencePC, // ReferenceType will be LLVMDisassembler_ReferenceType_In_PCrel_Load then the // SymbolValue is checked to be an address of literal pointer, symbol pointer, // or an Objective-C meta data reference. If so the output ReferenceType is -// set to correspond to that as well as ReferenceName. +// set to correspond to that as well as setting the ReferenceName. 
const char *SymbolizerSymbolLookUp(void *DisInfo, uint64_t ReferenceValue, uint64_t *ReferenceType, uint64_t ReferencePC, @@ -613,24 +1074,34 @@ const char *SymbolizerSymbolLookUp(void *DisInfo, uint64_t ReferenceValue, } const char *SymbolName = nullptr; - StringRef name = info->AddrMap->lookup(ReferenceValue); - if (!name.empty()) - SymbolName = name.data(); + if (ReferenceValue != 0xffffffffffffffffLLU && + ReferenceValue != 0xfffffffffffffffeLLU) { + StringRef name = info->AddrMap->lookup(ReferenceValue); + if (!name.empty()) + SymbolName = name.data(); + } if (*ReferenceType == LLVMDisassembler_ReferenceType_In_Branch) { *ReferenceName = GuessIndirectSymbol(ReferenceValue, info); + if (*ReferenceName) { + method_reference(info, ReferenceType, ReferenceName); + if (*ReferenceType != LLVMDisassembler_ReferenceType_Out_Objc_Message) + *ReferenceType = LLVMDisassembler_ReferenceType_Out_SymbolStub; + } else + // TODO: if SymbolName is not nullptr see if it is a C++ name + // and demangle it. + *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; + } else if (*ReferenceType == LLVMDisassembler_ReferenceType_In_PCrel_Load) { + *ReferenceName = + GuessLiteralPointer(ReferenceValue, ReferencePC, ReferenceType, info); if (*ReferenceName) - *ReferenceType = LLVMDisassembler_ReferenceType_Out_SymbolStub; + method_reference(info, ReferenceType, ReferenceName); else *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; } - else if (*ReferenceType == LLVMDisassembler_ReferenceType_In_PCrel_Load) { - *ReferenceName = GuessLiteralPointer(ReferenceValue, ReferencePC, - ReferenceType, info); - if (*ReferenceName == nullptr) - *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; - // TODO: other types of references to be added. - } else { + // TODO: if SymbolName is not nullptr see if it is a C++ name + // and demangle it. 
+ else { *ReferenceName = nullptr; *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; } @@ -652,8 +1123,8 @@ class DisasmMemoryObject : public MemoryObject { uint64_t Size; uint64_t BasePC; public: - DisasmMemoryObject(const uint8_t *bytes, uint64_t size, uint64_t basePC) : - Bytes(bytes), Size(size), BasePC(basePC) {} + DisasmMemoryObject(const uint8_t *bytes, uint64_t size, uint64_t basePC) + : Bytes(bytes), Size(size), BasePC(basePC) {} uint64_t getBase() const override { return BasePC; } uint64_t getExtent() const override { return Size; } @@ -917,6 +1388,11 @@ static void DisassembleInputMachO2(StringRef Filename, SymbolizerInfo.O = MachOOF; SymbolizerInfo.S = Sections[SectIdx]; SymbolizerInfo.AddrMap = &AddrMap; + SymbolizerInfo.Sections = &Sections; + SymbolizerInfo.class_name = nullptr; + SymbolizerInfo.selector_name = nullptr; + SymbolizerInfo.method = nullptr; + SymbolizerInfo.BindTable = nullptr; // Disassemble symbol by symbol. for (unsigned SymIdx = 0; SymIdx != Symbols.size(); SymIdx++) { @@ -962,6 +1438,9 @@ static void DisassembleInputMachO2(StringRef Filename, uint64_t Size; symbolTableWorked = true; + DisasmMemoryObject SectionMemoryObject((const uint8_t *)Bytes.data() + + Start, + End - Start, SectAddress + Start); DataRefImpl Symb = Symbols[SymIdx].getRawDataRefImpl(); bool isThumb = @@ -976,7 +1455,7 @@ static void DisassembleInputMachO2(StringRef Filename, if (FullLeadingAddr) { if (MachOOF->is64Bit()) outs() << format("%016" PRIx64, PC); - else + else outs() << format("%08" PRIx64, PC); } else { outs() << format("%8" PRIx64 ":", PC); @@ -1006,10 +1485,10 @@ static void DisassembleInputMachO2(StringRef Filename, bool gotInst; if (isThumb) - gotInst = ThumbDisAsm->getInstruction(Inst, Size, MemoryObject, PC, - DebugOut, Annotations); + gotInst = ThumbDisAsm->getInstruction(Inst, Size, SectionMemoryObject, + PC, DebugOut, Annotations); else - gotInst = DisAsm->getInstruction(Inst, Size, MemoryObject, PC, + gotInst = 
DisAsm->getInstruction(Inst, Size, SectionMemoryObject, PC, DebugOut, Annotations); if (gotInst) { if (!NoShowRawInsn) { @@ -1036,9 +1515,16 @@ static void DisassembleInputMachO2(StringRef Filename, } outs() << "\n"; } else { - errs() << "llvm-objdump: warning: invalid instruction encoding\n"; - if (Size == 0) - Size = 1; // skip illegible bytes + unsigned int Arch = MachOOF->getArch(); + if (Arch == Triple::x86_64 || Arch == Triple::x86){ + outs() << format("\t.byte 0x%02x #bad opcode\n", + *(Bytes.data() + Index) & 0xff); + Size = 1; // skip exactly one illegible byte and move on. + } else { + errs() << "llvm-objdump: warning: invalid instruction encoding\n"; + if (Size == 0) + Size = 1; // skip illegible bytes + } } } } @@ -1051,12 +1537,12 @@ static void DisassembleInputMachO2(StringRef Filename, MCInst Inst; uint64_t PC = SectAddress + Index; - if (DisAsm->getInstruction(Inst, InstSize, MemoryObject, PC, - DebugOut, nulls())) { + if (DisAsm->getInstruction(Inst, InstSize, MemoryObject, PC, DebugOut, + nulls())) { if (FullLeadingAddr) { if (MachOOF->is64Bit()) outs() << format("%016" PRIx64, PC); - else + else outs() << format("%08" PRIx64, PC); } else { outs() << format("%8" PRIx64 ":", PC); @@ -1068,12 +1554,23 @@ static void DisassembleInputMachO2(StringRef Filename, IP->printInst(&Inst, outs(), ""); outs() << "\n"; } else { - errs() << "llvm-objdump: warning: invalid instruction encoding\n"; - if (InstSize == 0) - InstSize = 1; // skip illegible bytes + unsigned int Arch = MachOOF->getArch(); + if (Arch == Triple::x86_64 || Arch == Triple::x86){ + outs() << format("\t.byte 0x%02x #bad opcode\n", + *(Bytes.data() + Index) & 0xff); + InstSize = 1; // skip exactly one illegible byte and move on. 
+ } else { + errs() << "llvm-objdump: warning: invalid instruction encoding\n"; + if (InstSize == 0) + InstSize = 1; // skip illegible bytes + } } } } + if (SymbolizerInfo.method != nullptr) + free(SymbolizerInfo.method); + if (SymbolizerInfo.BindTable != nullptr) + delete SymbolizerInfo.BindTable; } } @@ -2909,4 +3406,34 @@ void llvm::printMachOWeakBindTable(const object::MachOObjectFile *Obj) { } } - +// get_dyld_bind_info_symbolname() is used for disassembly and passed an +// address, ReferenceValue, in the Mach-O file and looks in the dyld bind +// information for that address. If the address is found its binding symbol +// name is returned. If not nullptr is returned. +static const char *get_dyld_bind_info_symbolname(uint64_t ReferenceValue, + struct DisassembleInfo *info) { + if (info->BindTable == nullptr) { + info->BindTable = new (BindTable); + SegInfo sectionTable(info->O); + for (const llvm::object::MachOBindEntry &Entry : info->O->bindTable()) { + uint32_t SegIndex = Entry.segmentIndex(); + uint64_t OffsetInSeg = Entry.segmentOffset(); + uint64_t Address = sectionTable.address(SegIndex, OffsetInSeg); + const char *SymbolName = nullptr; + StringRef name = Entry.symbolName(); + if (!name.empty()) + SymbolName = name.data(); + info->BindTable->push_back(std::make_pair(Address, SymbolName)); + } + } + for (bind_table_iterator BI = info->BindTable->begin(), + BE = info->BindTable->end(); + BI != BE; ++BI) { + uint64_t Address = BI->first; + if (ReferenceValue == Address) { + const char *SymbolName = BI->second; + return SymbolName; + } + } + return nullptr; +} -- 2.7.4