The access sequence for global variables in the medium and large code models uses
2 instructions to add an offset to the toc-pointer. If the offset fits within
16 bits then the instruction that sets the high 16 bits is redundant.
This patch adds the --toc-optimize option (on by default) and enables rewriting
of 2-instruction global variable accesses into 1 when the offset from the
TOC-pointer to the variable (or .got entry) fits in 16 signed bits, e.g.:
addis %r3, %r2, 0 --> nop
addi %r3, %r3, -0x8000 --> addi %r3, %r2, -0x8000
This rewriting can be disabled with the --no-toc-optimize flag.
Differential Revision: https://reviews.llvm.org/D49237
llvm-svn: 342602
enum DFormOpcd { // Values are compared against getPrimaryOpCode(Encoding).
LBZ = 34,
+ LBZU = 35, // The ...U entries are reported as update forms by isInstructionUpdateForm().
LHZ = 40,
+ LHZU = 41,
+ LHAU = 43,
LWZ = 32,
+ LWZU = 33,
+ LFSU = 49,
LD = 58, // Shared with LDU/LWA; the low two bits of the encoding tell them apart.
+ LFDU = 51,
STB = 38,
+ STBU = 39,
STH = 44,
+ STHU = 45,
STW = 36,
+ STWU = 37,
+ STFSU = 53,
+ STFDU = 55,
STD = 62, // Checked like LD: low two bits == 1 is treated as the update form.
ADDI = 14
};
}
}
+static bool isInstructionUpdateForm(uint32_t Encoding) { // True if Encoding is an update-form instruction.
+ switch (getPrimaryOpCode(Encoding)) {
+ default: // Any primary opcode not listed below is not an update form.
+ return false;
+ case LBZU:
+ case LHAU:
+ case LHZU:
+ case LWZU:
+ case LFSU:
+ case LFDU:
+ case STBU:
+ case STHU:
+ case STWU:
+ case STFSU:
+ case STFDU:
+ return true; // For these, the primary opcode alone identifies an update form.
+ // LWA has the same primary opcode as LD, so the opcode alone is not enough:
+ // the low two (DS) bits differentiate LD/LDU/LWA, and the value 1 is the update form.
+ case LD:
+ case STD: // STD encodings are tested the same way as LD encodings.
+ return (Encoding & 3) == 1;
+ }
+}
+
// There are a number of places when we either want to read or write an
// instruction when handling a half16 relocation type. On big-endian the buffer
// pointer is pointing into the middle of the word we want to extract, and on
}
}
+static bool isTocRelType(RelType Type) { // True for the TOC16 HA/LO/LO_DS relocation types.
+ return Type == R_PPC64_TOC16_HA || Type == R_PPC64_TOC16_LO_DS ||
+ Type == R_PPC64_TOC16_LO; // relocateOne() evaluates this before toAddr16Rel() rewrites Type.
+}
+
void PPC64::relocateOne(uint8_t *Loc, RelType Type, uint64_t Val) const {
// For a TOC-relative relocation, proceed in terms of the corresponding
// ADDR16 relocation type.
+ bool IsTocRelType = isTocRelType(Type);
std::tie(Type, Val) = toAddr16Rel(Type, Val);
switch (Type) {
case R_PPC64_ADDR16_HA:
case R_PPC64_REL16_HA:
case R_PPC64_TPREL16_HA:
- write16(Loc, ha(Val));
+ if (Config->TocOptimize && IsTocRelType && ha(Val) == 0)
+ writeInstrFromHalf16(Loc, 0x60000000);
+ else
+ write16(Loc, ha(Val));
break;
case R_PPC64_ADDR16_HI:
case R_PPC64_REL16_HI:
case R_PPC64_ADDR16_LO:
case R_PPC64_REL16_LO:
case R_PPC64_TPREL16_LO:
+ // When the high-adjusted part of a toc relocation evaluates to 0, it is
+ // changed into a nop. The lo part then needs to be updated to use the
+ // toc-pointer register r2 as the base register.
+ if (Config->TocOptimize && IsTocRelType && ha(Val) == 0) {
+ uint32_t Instr = readInstrFromHalf16(Loc);
+ if (isInstructionUpdateForm(Instr))
+ error(getErrorLocation(Loc) +
+ "can't toc-optimize an update instruction: 0x" +
+ utohexstr(Instr));
+ Instr = (Instr & 0xFFE00000) | 0x00020000;
+ writeInstrFromHalf16(Loc, Instr);
+ }
write16(Loc, lo(Val));
break;
case R_PPC64_ADDR16_LO_DS:
case R_PPC64_TPREL16_LO_DS: {
// DQ-form instructions use bits 28-31 as part of the instruction encoding
// DS-form instructions only use bits 30-31.
- uint16_t Mask = isDQFormInstruction(readInstrFromHalf16(Loc)) ? 0xF : 0x3;
+ uint32_t Inst = readInstrFromHalf16(Loc);
+ uint16_t Mask = isDQFormInstruction(Inst) ? 0xF : 0x3;
checkAlignment(Loc, lo(Val), Mask + 1, Type);
+ if (Config->TocOptimize && IsTocRelType && ha(Val) == 0) {
+ // When the high-adjusted part of a toc relocation evaluates to 0, it is
+ // changed into a nop. The lo part then needs to be updated to use the toc
+ // pointer register r2 as the base register.
+ if (isInstructionUpdateForm(Inst))
+ error(getErrorLocation(Loc) +
+ "Can't toc-optimize an update instruction: 0x" +
+ Twine::utohexstr(Inst));
+ Inst = (Inst & 0xFFE0000F) | 0x00020000;
+ writeInstrFromHalf16(Loc, Inst);
+ }
write16(Loc, (read16(Loc) & Mask) | lo(Val));
} break;
case R_PPC64_ADDR32:
bool Trace;
bool ThinLTOEmitImportsFiles;
bool ThinLTOIndexOnly;
+ bool TocOptimize;
bool UndefinedVersion;
bool UseAndroidRelrTags = false;
bool WarnBackrefs;
if (Config->FixCortexA53Errata843419 && Config->EMachine != EM_AARCH64)
error("--fix-cortex-a53-843419 is only supported on AArch64 targets.");
+ if (Config->TocOptimize && Config->EMachine != EM_PPC64)
+ error("--toc-optimize is only supported on the PowerPC64 target.");
+
if (Config->Pie && Config->Shared)
error("-shared and -pie may not be used together");
Config->WriteAddends = Args.hasFlag(OPT_apply_dynamic_relocs,
OPT_no_apply_dynamic_relocs, false) ||
!Config->IsRela;
+
+ Config->TocOptimize =
+ Args.hasFlag(OPT_toc_optimize, OPT_no_toc_optimize, Machine == EM_PPC64);
}
// Returns a value of "-format" option.
"Run the linker multi-threaded (default)",
"Do not run the linker multi-threaded">;
+defm toc_optimize : B<"toc-optimize", // Provides OPT_toc_optimize and OPT_no_toc_optimize (--toc-optimize / --no-toc-optimize).
+ "(PowerPC64) Enable TOC related optimizations (default)",
+ "(PowerPC64) Disable TOC related optimizations">;
+
def trace: F<"trace">, HelpText<"Print the names of the input files">;
defm trace_symbol: Eq<"trace-symbol", "Trace references to symbols">;
// CHECK: foo_external_diff:
// CHECK-NEXT: 10010080: {{.*}} addis 2, 12, 2
// CHECK-NEXT: 10010084: {{.*}} addi 2, 2, 32640
-// CHECK-NEXT: 10010088: {{.*}} addis 5, 2, 0
+// CHECK-NEXT: 10010088: {{.*}} nop
// CHECK: foo_external_same:
// CHECK-NEXT: 100100b0: {{.*}} add 3, 4, 3
# CHECK: _start:
# CHECK-NEXT: 10010000: {{.*}} addis 2, 12, 3
# CHECK-NEXT: 10010004: {{.*}} addi 2, 2, -32768
-# CHECK-NEXT: 10010008: {{.*}} addis 3, 2, 0
-# CHECK-NEXT: 1001000c: {{.*}} ld 3, -32760(3)
+# CHECK-NEXT: 10010008: {{.*}} nop
+# CHECK-NEXT: 1001000c: {{.*}} ld 3, -32760(2)
# CHECK: 1001001c: {{.*}} lwa 3, 0(3)
# CHECK-LE: Disassembly of section .data:
# CHECK: Disassembly of section .R_PPC64_TOC16_HA:
# CHECK: .FR_PPC64_TOC16_HA:
-# CHECK: 10010018: {{.*}} addis 1, 2, 0
+# CHECK: 10010018: {{.*}} nop
.section .R_PPC64_REL24,"ax",@progbits
.globl .FR_PPC64_REL24
# 0x10000190 + 0xfeb4 = 0x10010044
# CHECK: Disassembly of section .R_PPC64_REL32:
# CHECK: .FR_PPC64_REL32:
-# CHECK: 1001003c: {{.*}} addis 5, 2, 0
-# CHECK: 10010040: {{.*}} ld 5, -32736(5)
+# CHECK: 1001003c: {{.*}} nop
+# CHECK: 10010040: {{.*}} ld 5, -32736(2)
# CHECK: 10010044: {{.*}} add 3, 3, 4
.section .R_PPC64_REL64, "ax",@progbits
--- /dev/null
+# REQUIRES: ppc
+
+# RUN: llvm-readelf -relocations --wide %p/Inputs/ppc64le-quadword-ldst.o | FileCheck --check-prefix=QuadInputRelocs %s
+
+# RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %p/Inputs/shared-ppc64.s -o %t2.o
+# RUN: ld.lld -shared %t2.o -o %t2.so
+
+# RUN: ld.lld %t2.so %p/Inputs/ppc64le-quadword-ldst.o -o %t
+# RUN: llvm-objdump -D %t | FileCheck --check-prefix=Dis %s
+
+# RUN: ld.lld --no-toc-optimize %t2.so %p/Inputs/ppc64le-quadword-ldst.o -o %t
+# RUN: llvm-objdump -D %t | FileCheck --check-prefix=NoOpt %s
+
+# QuadInputRelocs: Relocation section '.rela.text'
+# QuadInputRelocs: R_PPC64_TOC16_LO_DS 0000000000000000 quadLd
+# QuadInputRelocs: R_PPC64_TOC16_LO_DS 0000000000000010 quadSt
+
+# The powerpc backend doesn't support the quadword load/store instructions yet.
+# So they are tested by linking against an object file assembled with
+# `as -mpower9 -o ppc64le-quadword-ldst.o in.s` and checking the encoding of
+# the unknown instructions in the disassembly. Source used as input:
+#quads:
+#.Lbegin_quads:
+#.Lgep_quads:
+# addis 2, 12, .TOC.-.Lgep_quads@ha
+# addi 2, 2, .TOC.-.Lgep_quads@l
+#.Llep_quads:
+#.localentry quads, .Llep_quads-.Lgep_quads
+# addis 3, 2, quadLd@toc@ha
+# lq 4, quadLd@toc@l(3)
+# addis 3, 2, quadSt@toc@ha
+# stq 4, quadSt@toc@l(3)
+# blr
+#
+# .p2align 4
+# .global quadLd
+# .lcomm quadLd, 16
+#
+# .global quadSt
+# .lcomm quadSt, 16
+
+
+# e0 82 7f 70 decodes to | 111000 | 00100 | 00010 | 16-bit imm |
+# | 56 | 4 | 2 | 32624 |
+# which is `lq r4, 32624(r2)`
+# f8 82 7f 82 decodes to | 111110 | 00100 | 00010 | 14-bit imm | 10 |
+# | 62 | 4 | 2 | 8160 | 2 |
+# The immediate represents a word offset so this disassembles to:
+# `stq r4, 32640(r2)`
+# Dis-LABEL: quads:
+# Dis-NEXT: addis
+# Dis-NEXT: addi
+# Dis-NEXT: nop
+# Dis-NEXT: 70 7f 82 e0 <unknown>
+# Dis-NEXT: nop
+# Dis-NEXT: 82 7f 82 f8 <unknown>
+# Dis-NEXT: blr
+
+# e0 83 7f 70 decodes to | 111000 | 00100 | 00011 | 16-bit imm |
+# | 56 | 4 | 3 | 32624 |
+# `lq r4, 32624(r3)`
+# f8 83 7f 82 decodes to | 111110 | 00100 | 00011 | 14-bit imm | 10 |
+# | 62 | 4 | 3 | 8160 | 2 |
+# `stq r4, 32640(r3)`
+# NoOpt-LABEL: quads:
+# NoOpt-NEXT: addis
+# NoOpt-NEXT: addi
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: 70 7f 83 e0 <unknown>
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: 82 7f 83 f8 <unknown>
+# NoOpt-NEXT: blr
+
--- /dev/null
+# REQUIRES: ppc
+
+# RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %s -o %t.o
+# RUN: llvm-readelf -relocations --wide %t.o | FileCheck --check-prefix=InputRelocs %s
+
+# RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %p/Inputs/shared-ppc64.s -o %t2.o
+# RUN: ld.lld -shared %t2.o -o %t2.so
+#
+# RUN: ld.lld %t2.so %t.o -o %t
+# RUN: llvm-objdump -D %t | FileCheck --check-prefix=Dis %s
+#
+# RUN: ld.lld --no-toc-optimize %t2.so %t.o -o %t
+# RUN: llvm-objdump -D %t | FileCheck --check-prefix=NoOpt %s
+
+# InputRelocs: Relocation section '.rela.text'
+# InputRelocs: R_PPC64_TOC16_HA
+# InputRelocs: R_PPC64_TOC16_LO
+# InputRelocs: R_PPC64_TOC16_LO_DS
+
+
+ .text
+ .abiversion 2
+
+ .global bytes
+ .p2align 4
+ .type bytes,@function
+bytes: # Byte-sized load and store, each through an addis/@toc@l pair.
+.Lbytes_gep:
+ addis 2, 12, .TOC.-.Lbytes_gep@ha
+ addi 2, 2, .TOC.-.Lbytes_gep@l
+.Lbytes_lep:
+ .localentry bytes, .Lbytes_lep-.Lbytes_gep
+ addis 3, 2, byteLd@toc@ha # Expected to become a nop when toc-optimize is on.
+ lbz 3, byteLd@toc@l(3) # Expected to use r2 as the base register when optimized.
+ addis 4, 2, byteSt@toc@ha
+ stb 3, byteSt@toc@l(4) # Without the optimization the base stays r4.
+ blr
+# Dis-LABEL: bytes
+# Dis-NEXT: addis
+# Dis-NEXT: addi
+# Dis-NEXT: nop
+# Dis-NEXT: lbz 3, 32624(2)
+# Dis-NEXT: nop
+# Dis-NEXT: stb 3, 32625(2)
+# Dis-NEXT: blr
+
+# NoOpt-LABEL: bytes
+# NoOpt-NEXT: addis
+# NoOpt-NEXT: addi
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: lbz 3, 32624(3)
+# NoOpt-NEXT: addis 4, 2, 0
+# NoOpt-NEXT: stb 3, 32625(4)
+# NoOpt-NEXT: blr
+
+ .global halfs
+ .p2align 4
+ .type halfs,@function
+halfs: # Halfword accesses: lhz and lha both read halfLd, sth writes halfSt.
+.Lhalfs_gep:
+ addis 2, 12, .TOC.-.Lhalfs_gep@ha
+ addi 2, 2, .TOC.-.Lhalfs_gep@l
+.Lhalfs_lep:
+ .localentry halfs, .Lhalfs_lep-.Lhalfs_gep
+ addis 3, 2, halfLd@toc@ha # Expected to become a nop when toc-optimize is on.
+ lhz 3, halfLd@toc@l(3) # Expected to use r2 as the base register when optimized.
+ addis 4, 2, halfLd@toc@ha
+ lha 4, halfLd@toc@l(4) # Same symbol as the lhz above, so the same offset is expected.
+ addis 5, 2, halfSt@toc@ha
+ sth 4, halfSt@toc@l(5)
+ blr
+# Dis-LABEL: halfs
+# Dis-NEXT: addis
+# Dis-NEXT: addi
+# Dis-NEXT: nop
+# Dis-NEXT: lhz 3, 32626(2)
+# Dis-NEXT: nop
+# Dis-NEXT: lha 4, 32626(2)
+# Dis-NEXT: nop
+# Dis-NEXT: sth 4, 32628(2)
+# Dis-NEXT: blr
+
+# NoOpt-LABEL: halfs
+# NoOpt-NEXT: addis
+# NoOpt-NEXT: addi
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: lhz 3, 32626(3)
+# NoOpt-NEXT: addis 4, 2, 0
+# NoOpt-NEXT: lha 4, 32626(4)
+# NoOpt-NEXT: addis 5, 2, 0
+# NoOpt-NEXT: sth 4, 32628(5)
+# NoOpt-NEXT: blr
+
+
+ .global words
+ .p2align 4
+ .type words,@function
+words: # Word accesses: lwz and lwa both read wordLd, stw writes wordSt.
+.Lwords_gep:
+ addis 2, 12, .TOC.-.Lwords_gep@ha
+ addi 2, 2, .TOC.-.Lwords_gep@l
+.Lwords_lep:
+ .localentry words, .Lwords_lep-.Lwords_gep
+ addis 3, 2, wordLd@toc@ha # Expected to become a nop when toc-optimize is on.
+ lwz 3, wordLd@toc@l(3) # Expected to use r2 as the base register when optimized.
+ addis 4, 2, wordLd@toc@ha
+ lwa 4, wordLd@toc@l(4) # Must still disassemble as lwa after the base register is rewritten.
+ addis 5, 2, wordSt@toc@ha
+ stw 4, wordSt@toc@l(5)
+ blr
+# Dis-LABEL: words
+# Dis-NEXT: addis
+# Dis-NEXT: addi
+# Dis-NEXT: nop
+# Dis-NEXT: lwz 3, 32632(2)
+# Dis-NEXT: nop
+# Dis-NEXT: lwa 4, 32632(2)
+# Dis-NEXT: nop
+# Dis-NEXT: stw 4, 32636(2)
+# Dis-NEXT: blr
+
+# NoOpt-LABEL: words
+# NoOpt-NEXT: addis
+# NoOpt-NEXT: addi
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: lwz 3, 32632(3)
+# NoOpt-NEXT: addis 4, 2, 0
+# NoOpt-NEXT: lwa 4, 32632(4)
+# NoOpt-NEXT: addis 5, 2, 0
+# NoOpt-NEXT: stw 4, 32636(5)
+# NoOpt-NEXT: blr
+
+ .global doublewords
+ .p2align 4
+ .type doublewords,@function
+doublewords: # Doubleword load (ld) from dwordLd and store (std) to dwordSt.
+.Ldoublewords_gep:
+ addis 2, 12, .TOC.-.Ldoublewords_gep@ha
+ addi 2, 2, .TOC.-.Ldoublewords_gep@l
+.Ldoublewords_lep:
+ .localentry doublewords, .Ldoublewords_lep-.Ldoublewords_gep
+ addis 3, 2, dwordLd@toc@ha # Expected to become a nop when toc-optimize is on.
+ ld 3, dwordLd@toc@l(3) # Expected to use r2 as the base register when optimized.
+ addis 4, 2, dwordSt@toc@ha
+ std 3, dwordSt@toc@l(4)
+ blr
+
+# Dis-LABEL: doublewords
+# Dis-NEXT: addis
+# Dis-NEXT: addi
+# Dis-NEXT: nop
+# Dis-NEXT: ld 3, 32640(2)
+# Dis-NEXT: nop
+# Dis-NEXT: std 3, 32648(2)
+# Dis-NEXT: blr
+
+# NoOpt-LABEL: doublewords
+# NoOpt-NEXT: addis
+# NoOpt-NEXT: addi
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: ld 3, 32640(3)
+# NoOpt-NEXT: addis 4, 2, 0
+# NoOpt-NEXT: std 3, 32648(4)
+# NoOpt-NEXT: blr
+
+ .global vec_dq
+ .p2align 4
+ .type vec_dq,@function
+vec_dq: # lxv/stxv accesses to vecLd/vecSt.
+.Lvec_dq_gep:
+ addis 2, 12, .TOC.-.Lvec_dq_gep@ha
+ addi 2, 2, .TOC.-.Lvec_dq_gep@l
+.Lvec_dq_lep:
+ .localentry vec_dq, .Lvec_dq_lep-.Lvec_dq_gep
+ addis 3, 2, vecLd@toc@ha # Expected to become a nop when toc-optimize is on.
+ lxv 3, vecLd@toc@l(3) # Expected to use r2 as the base register when optimized.
+ addis 3, 2, vecSt@toc@ha
+ stxv 3, vecSt@toc@l(3)
+ blr
+
+# Dis-LABEL: vec_dq
+# Dis-NEXT: addis
+# Dis-NEXT: addi
+# Dis-NEXT: nop
+# Dis-NEXT: lxv 3, 32656(2)
+# Dis-NEXT: nop
+# Dis-NEXT: stxv 3, 32672(2)
+# Dis-NEXT: blr
+
+# NoOpt-LABEL: vec_dq
+# NoOpt-NEXT: addis
+# NoOpt-NEXT: addi
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: lxv 3, 32656(3)
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: stxv 3, 32672(3)
+# NoOpt-NEXT: blr
+
+ .global vec_ds
+ .p2align 4
+ .type vec_ds,@function
+vec_ds: # lxsd/stxsd and lxssp/stxssp accesses to vecLd/vecSt.
+.Lvec_ds_gep:
+ addis 2, 12, .TOC.-.Lvec_ds_gep@ha
+ addi 2, 2, .TOC.-.Lvec_ds_gep@l
+.Lvec_ds_lep:
+ .localentry vec_ds, .Lvec_ds_lep-.Lvec_ds_gep # Use this function's own entry labels, not vec_dq's.
+ addis 3, 2, vecLd@toc@ha # Expected to become a nop when toc-optimize is on.
+ lxsd 3, vecLd@toc@l(3) # Expected to use r2 as the base register when optimized.
+ addis 3, 2, vecSt@toc@ha
+ stxsd 3, vecSt@toc@l(3)
+ addis 3, 2, vecLd@toc@ha
+ lxssp 3, vecLd@toc@l(3)
+ addis 3, 2, vecSt@toc@ha
+ stxssp 3, vecSt@toc@l(3)
+ blr
+# Dis-LABEL: vec_ds
+# Dis-NEXT: addis
+# Dis-NEXT: addi
+# Dis-NEXT: nop
+# Dis-NEXT: lxsd 3, 32656(2)
+# Dis-NEXT: nop
+# Dis-NEXT: stxsd 3, 32672(2)
+# Dis-NEXT: nop
+# Dis-NEXT: lxssp 3, 32656(2)
+# Dis-NEXT: nop
+# Dis-NEXT: stxssp 3, 32672(2)
+# Dis-NEXT: blr
+
+# NoOpt-LABEL: vec_ds
+# NoOpt-NEXT: addis
+# NoOpt-NEXT: addi
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: lxsd 3, 32656(3)
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: stxsd 3, 32672(3)
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: lxssp 3, 32656(3)
+# NoOpt-NEXT: addis 3, 2, 0
+# NoOpt-NEXT: stxssp 3, 32672(3)
+# NoOpt-NEXT: blr
+
+
+ .global byteLd
+ .lcomm byteLd, 1, 1 # Expected at offset 32624 from r2 in the checked disassembly.
+
+ .global byteSt
+ .lcomm byteSt, 1, 1 # Expected at offset 32625.
+
+ .global halfLd
+ .lcomm halfLd, 2, 2 # Expected at offset 32626.
+
+ .global halfSt
+ .lcomm halfSt, 2, 2 # Expected at offset 32628.
+
+ .global wordLd
+ .lcomm wordLd, 4, 4 # Expected at offset 32632.
+
+ .global wordSt
+ .lcomm wordSt, 4, 4 # Expected at offset 32636.
+
+ .global dwordLd
+ .lcomm dwordLd, 8, 8 # Expected at offset 32640.
+
+ .global dwordSt
+ .lcomm dwordSt, 8, 8 # Expected at offset 32648.
+
+ .global vecLd
+ .lcomm vecLd, 16, 16 # Expected at offset 32656.
+
+ .global vecSt
+ .lcomm vecSt, 16, 16 # Expected at offset 32672.
--- /dev/null
+# REQUIRES: x86
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
+# RUN: not ld.lld %t --toc-optimize -o /dev/null 2>&1 | FileCheck %s
+
+# CHECK: error: --toc-optimize is only supported on the PowerPC64 target.
+
+ .global __start # Minimal x86-64 input; the test only checks the --toc-optimize diagnostic.
+ .type __start,@function
+
+ .text
+ .quad 0
+ __start:
+