From 7ec1221ff7a5e3faa4e58cdfeb3722b2958499e2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 3 Mar 2010 02:10:22 -0800
Subject: [PATCH] sparc: Use ba,a,pt in PLTs and fix bugs in R_SPARC_JMP_IREL
 handling.

2010-03-03  David S. Miller  <davem@davemloft.net>

	* sysdeps/sparc/sparc32/dl-machine.h (elf_machine_lazy_rel): Must
	pass '1' for 't' argument to sparc_fixup_plt.
	* sysdeps/sparc/sparc64/dl-machine.h (elf_machine_lazy_rel):
	Likewise.
	* sysdeps/sparc/sparc32/dl-plt.h (OPCODE_BA_PT): Define.
	(sparc_fixup_plt): Document 't' argument.  Enable branch
	optimization and use v9 branches when possible.  Explain why we
	cannot unconditionally patch the branch into the first PLT
	instruction.
	* sysdeps/sparc/sparc64/dl-plt.h (sparc64_fixup_plt): Document 't'
	argument.  Use v9 branches when possible.  Explain why we can in
	fact unconditionally use a branch in the first PLT instruction
	here.
---
 ChangeLog                          | 16 +++++++++++++
 sysdeps/sparc/sparc32/dl-machine.h |  2 +-
 sysdeps/sparc/sparc32/dl-plt.h     | 47 +++++++++++++++++++++++++++++++++-----
 sysdeps/sparc/sparc64/dl-machine.h |  2 +-
 sysdeps/sparc/sparc64/dl-plt.h     | 29 +++++++++++++++++++----
 5 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index b8118dd..bd9f6ec 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2010-03-03  David S. Miller  <davem@davemloft.net>
+
+	* sysdeps/sparc/sparc32/dl-machine.h (elf_machine_lazy_rel): Must
+	pass '1' for 't' argument to sparc_fixup_plt.
+	* sysdeps/sparc/sparc64/dl-machine.h (elf_machine_lazy_rel):
+	Likewise.
+	* sysdeps/sparc/sparc32/dl-plt.h (OPCODE_BA_PT): Define.
+	(sparc_fixup_plt): Document 't' argument.  Enable branch
+	optimization and use v9 branches when possible.  Explain why we
+	cannot unconditionally patch the branch into the first PLT
+	instruction.
+	* sysdeps/sparc/sparc64/dl-plt.h (sparc64_fixup_plt): Document 't'
+	argument.  Use v9 branches when possible.  Explain why we can in
+	fact unconditionally use a branch in the first PLT instruction
+	here.
+
 2010-02-28  Roland McGrath  <roland@redhat.com>
 
 	* elf/elf.h (NT_X86_XSTATE): New macro.
diff --git a/sysdeps/sparc/sparc32/dl-machine.h b/sysdeps/sparc/sparc32/dl-machine.h
index 5325710..9631db3 100644
--- a/sysdeps/sparc/sparc32/dl-machine.h
+++ b/sysdeps/sparc/sparc32/dl-machine.h
@@ -563,7 +563,7 @@ elf_machine_lazy_rel (struct link_map *map,
     {
       Elf32_Addr value = map->l_addr + reloc->r_addend;
       value = ((Elf32_Addr (*) (void)) value) ();
-      sparc_fixup_plt (reloc, reloc_addr, value, 0, 1);
+      sparc_fixup_plt (reloc, reloc_addr, value, 1, 1);
     }
   else if (r_type == R_SPARC_NONE)
     ;
diff --git a/sysdeps/sparc/sparc32/dl-plt.h b/sysdeps/sparc/sparc32/dl-plt.h
index edcc5c1..bfb891f 100644
--- a/sysdeps/sparc/sparc32/dl-plt.h
+++ b/sysdeps/sparc/sparc32/dl-plt.h
@@ -25,19 +25,55 @@
 #define OPCODE_JMP_G1	0x81c06000 /* jmp %g1+?; add lo 10 bits of value */
 #define OPCODE_SAVE_SP	0x9de3bfa8 /* save %sp, -(16+6)*4, %sp */
 #define OPCODE_BA	0x30800000 /* b,a ?; add PC-rel word address */
+#define OPCODE_BA_PT	0x30480000 /* ba,a,pt %icc, ?; add PC-rel word address */
 
 static inline __attribute__ ((always_inline)) Elf32_Addr
 sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
 		 Elf32_Addr value, int t, int do_flush)
 {
-  Elf32_Sword disp = value - (Elf32_Addr) reloc_addr;
+  Elf32_Sword disp;
 
-  if (0 && disp >= -0x800000 && disp < 0x800000)
+  /* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap,
+     in which case we'll be resolving all PLT entries and thus can
+     optimize by overwriting instructions starting at the first PLT entry
+     instruction and we need not be mindful of thread safety.
+
+     Otherwise, 't' is '1'.  */
+  reloc_addr += t;
+  disp = value - (Elf32_Addr) reloc_addr;
+
+  if (disp >= -0x800000 && disp < 0x800000)
     {
-      /* Don't need to worry about thread safety. We're writing just one
-	 instruction.  */
+      unsigned int insn = OPCODE_BA | ((disp >> 2) & 0x3fffff);
+
+#ifdef __sparc_v9__
+      /* On V9 we can do even better by using a branch with
+	 prediction if we fit into the even smaller 19-bit
+	 displacement field.  */
+      if (disp >= -0x100000 && disp < 0x100000)
+	insn = OPCODE_BA_PT | ((disp >> 2) & 0x07ffff);
+#endif
+
+      /* Even if we are writing just a single branch, we must not
+	 ignore the 't' offset.  Consider a case where we have some
+	 PLT slots which can be optimized into a single branch and
+	 some which cannot.  Then we can end up with a PLT which looks
+	 like:
+
+		PLT4.0: sethi	%(PLT_4_INDEX), %g1
+			sethi	%(fully_resolved_sym_4), %g1
+			jmp	%g1 + %lo(fully_resolved_sym_4)
+		PLT5.0:	ba,a	fully_resolved_sym_5
+			ba,a	PLT0.0
+			...
+
+	  The delay slot of that jmp must always be either a sethi to
+	  %g1 or a nop.  But if we try to place this displacement
+	  branch there, PLT4.0 will jump to fully_resolved_sym_4 for 1
+	  instruction and then go immediately to
+	  fully_resolved_sym_5.  */
 
-      reloc_addr[0] = OPCODE_BA | ((disp >> 2) & 0x3fffff);
+      reloc_addr[0] = insn;
       if (do_flush)
 	__asm __volatile ("flush %0" : : "r"(reloc_addr));
     }
@@ -48,7 +84,6 @@ sparc_fixup_plt (const Elf32_Rela *reloc, Elf32_Addr *reloc_addr,
 	 need not be done during bootstrapping, since there are no threads.
 	 But we also can't tell if we _can_ use flush, so don't. */
 
-      reloc_addr += t;
       reloc_addr[1] = OPCODE_JMP_G1 | (value & 0x3ff);
       if (do_flush)
 	__asm __volatile ("flush %0+4" : : "r"(reloc_addr));
diff --git a/sysdeps/sparc/sparc64/dl-machine.h b/sysdeps/sparc/sparc64/dl-machine.h
index 4c915eb..fcfbb06 100644
--- a/sysdeps/sparc/sparc64/dl-machine.h
+++ b/sysdeps/sparc/sparc64/dl-machine.h
@@ -661,7 +661,7 @@ elf_machine_lazy_rel (struct link_map *map,
 	{
 	  /* 'high' is always zero, for large PLT entries the linker
 	     emits an R_SPARC_IRELATIVE.  */
-	  sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 0);
+	  sparc64_fixup_plt (map, reloc, reloc_addr, value, 0, 1);
 	}
       else
 	*reloc_addr = value;
diff --git a/sysdeps/sparc/sparc64/dl-plt.h b/sysdeps/sparc/sparc64/dl-plt.h
index e06be43..ca2fe3b 100644
--- a/sysdeps/sparc/sparc64/dl-plt.h
+++ b/sysdeps/sparc/sparc64/dl-plt.h
@@ -28,7 +28,14 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
   Elf64_Addr plt_vaddr = (Elf64_Addr) reloc_addr;
   Elf64_Sxword disp = value - plt_vaddr;
 
-  /* Now move plt_vaddr up to the call instruction.  */
+  /* 't' is '0' if we are resolving this PLT entry for RTLD bootstrap,
+     in which case we'll be resolving all PLT entries and thus can
+     optimize by overwriting instructions starting at the first PLT entry
+     instruction and we need not be mindful of thread safety.
+
+     Otherwise, 't' is '1'.
+
+     Now move plt_vaddr up to the call instruction.  */
   plt_vaddr += ((t + 1) * 4);
 
   /* PLT entries .PLT32768 and above look always the same.  */
@@ -39,10 +46,22 @@ sparc64_fixup_plt (struct link_map *map, const Elf64_Rela *reloc,
   /* Near destination.  */
   else if (disp >= -0x800000 && disp < 0x800000)
     {
-      /* As this is just one instruction, it is thread safe and so
-	 we can avoid the unnecessary sethi FOO, %g1.
-	 b,a target  */
-      insns[0] = 0x30800000 | ((disp >> 2) & 0x3fffff);
+      unsigned int insn;
+
+      /* ba,a */
+      insn = 0x30800000 | ((disp >> 2) & 0x3fffff);
+
+      if (disp >= -0x100000 && disp < 0x100000)
+	{
+	  /* ba,a,pt %icc */
+	  insn = 0x30480000  | ((disp >> 2) & 0x07ffff);
+	}
+
+      /* As this is just one instruction, it is thread safe and so we
+	 can avoid the unnecessary sethi FOO, %g1.  Each 64-bit PLT
+	 entry is 8 instructions long, so we can't run into the 'jmp'
+	 delay slot problems 32-bit PLTs can.  */
+      insns[0] = insn;
       __asm __volatile ("flush %0" : : "r" (insns));
     }
   /* 32-bit Sparc style, the target is in the lower 32-bits of
-- 
2.7.4