SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
MachineInstr *
sinkPostLoadHardenedInst(MachineInstr &MI,
- SmallPtrSetImpl<MachineInstr *> &HardenedLoads);
- bool canHardenPostLoad(MachineInstr &MI);
+ SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
+ bool canHardenRegister(unsigned Reg);
void hardenPostLoad(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
void checkReturnInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
void checkCallInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
// By default, assume that the instruction is not data invariant.
return false;
- // FIXME: For now, we just use a very boring, conservative set of unary
- // instructions because we're mostly interested in handling simple
- // transformations.
+ // Some target-independent operations that trivially lower to data-invariant
+ // instructions.
case TargetOpcode::COPY:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ return true;
+
+ // On x86 the integer multiply instructions are believed to be constant time
+ // w.r.t. the loaded data. However, they set flags and are perhaps the most
+ // surprisingly constant-time operations, so we call them out here separately.
+ case X86::IMUL16rr:
+ case X86::IMUL16rri8:
+ case X86::IMUL16rri:
+ case X86::IMUL32rr:
+ case X86::IMUL32rri8:
+ case X86::IMUL32rri:
+ case X86::IMUL64rr:
+ case X86::IMUL64rri32:
+ case X86::IMUL64rri8:
+
+ // Bit scanning and counting instructions are somewhat surprising: they scan
+ // across bits and do other fairly complex operations like popcnt, yet are
+ // believed to be constant time on x86. However, these set flags.
+ case X86::BSF16rr:
+ case X86::BSF32rr:
+ case X86::BSF64rr:
+ case X86::BSR16rr:
+ case X86::BSR32rr:
+ case X86::BSR64rr:
+ case X86::LZCNT16rr:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rr:
+ case X86::POPCNT16rr:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rr:
+ case X86::TZCNT16rr:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rr:
+
+ // Bit manipulation instructions are effectively combinations of basic
+ // arithmetic ops, and should still execute in constant time. These also
+ // set flags.
+ case X86::BLCFILL32rr:
+ case X86::BLCFILL64rr:
+ case X86::BLCI32rr:
+ case X86::BLCI64rr:
+ case X86::BLCIC32rr:
+ case X86::BLCIC64rr:
+ case X86::BLCMSK32rr:
+ case X86::BLCMSK64rr:
+ case X86::BLCS32rr:
+ case X86::BLCS64rr:
+ case X86::BLSFILL32rr:
+ case X86::BLSFILL64rr:
+ case X86::BLSI32rr:
+ case X86::BLSI64rr:
+ case X86::BLSIC32rr:
+ case X86::BLSIC64rr:
+ case X86::BLSMSK32rr:
+ case X86::BLSMSK64rr:
+ case X86::BLSR32rr:
+ case X86::BLSR64rr:
+ case X86::TZMSK32rr:
+ case X86::TZMSK64rr:
+
+ // Bit extracting and clearing instructions should execute in constant time,
+ // but set flags.
+ case X86::BEXTR32rr:
+ case X86::BEXTR64rr:
+ case X86::BEXTRI32ri:
+ case X86::BEXTRI64ri:
+ case X86::BZHI32rr:
+ case X86::BZHI64rr:
+
+ // Basic arithmetic is constant time on the input but does set flags.
+ case X86::ADC8rr: case X86::ADC8ri:
+ case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8:
+ case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8:
+ case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32:
+ case X86::ADD8rr: case X86::ADD8ri:
+ case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8:
+ case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8:
+ case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32:
+ case X86::AND8rr: case X86::AND8ri:
+ case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8:
+ case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8:
+ case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32:
+ case X86::OR8rr: case X86::OR8ri:
+ case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8:
+ case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8:
+ case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32:
+ case X86::SBB8rr: case X86::SBB8ri:
+ case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8:
+ case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8:
+ case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32:
+ case X86::SUB8rr: case X86::SUB8ri:
+ case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8:
+ case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8:
+ case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32:
+ case X86::XOR8rr: case X86::XOR8ri:
+ case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8:
+ case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8:
+ case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32:
+ // Arithmetic with just 32-bit and 64-bit variants and no immediates.
+ case X86::ADCX32rr: case X86::ADCX64rr:
+ case X86::ADOX32rr: case X86::ADOX64rr:
+ case X86::ANDN32rr: case X86::ANDN64rr:
+ // Just one operand for inc and dec.
+ case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
+ case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
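+ // Every flag-setting case above falls through to this point.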
+ // Check whether the EFLAGS implicit-def is dead. We assume that this will
+ // always find the implicit-def because this code should only be reached
+ // for instructions that do in fact implicitly define EFLAGS.
+ if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
+ // If we would clobber EFLAGS that are used, just bail for now.
+ LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
+ MI.dump(); dbgs() << "\n");
+ return false;
+ }
+
+ // Otherwise, fall through to handle these the same as instructions that
+ // don't set EFLAGS.
+ LLVM_FALLTHROUGH;
+
+ // Integer multiply w/o affecting flags is still believed to be constant
+ // time on x86. Called out separately as these are among the most surprising
+ // instructions to exhibit that behavior.
+ case X86::MULX32rr:
+ case X86::MULX64rr:
+
+ // Arithmetic instructions that are constant time and don't set flags.
+ case X86::RORX32ri:
+ case X86::RORX64ri:
+ case X86::SARX32rr:
+ case X86::SARX64rr:
+ case X86::SHLX32rr:
+ case X86::SHLX64rr:
+ case X86::SHRX32rr:
+ case X86::SHRX64rr:
+
+ // LEA doesn't actually access memory, and its arithmetic is constant time.
+ case X86::LEA16r:
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ case X86::LEA64r:
return true;
}
}
// address registers, queue it up to be hardened post-load. Notably, even
// once hardened this won't introduce a useful dependency that could prune
// out subsequent loads.
- if (EnablePostLoadHardening && canHardenPostLoad(MI) &&
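+ // We can only defer hardening to the loaded value for data-invariant
+ // loads with a single, hardenable register def (see canHardenRegister).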
+ if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
+ MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
+ canHardenRegister(MI.getOperand(0).getReg()) &&
!HardenedAddrRegs.count(BaseReg) &&
!HardenedAddrRegs.count(IndexReg)) {
HardenPostLoad.insert(&MI);
}
MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
- MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedLoads) {
+ MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
assert(isDataInvariantLoad(InitialMI) &&
"Cannot get here with a non-invariant load!");
MachineInstr *SingleUseMI = nullptr;
for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
// If we're already going to harden this use, it is data invariant and
- // within our block and we just need to check that the use isn't in an
- // address.
- if (HardenedLoads.count(&UseMI)) {
+ // within our block.
+ if (HardenedInstrs.count(&UseMI)) {
+ if (!isDataInvariantLoad(UseMI)) {
+ // If we've already decided to harden a non-load, we must have sunk
+ // some other post-load hardened instruction to it and it must itself
+ // be data-invariant.
+ assert(isDataInvariant(UseMI) &&
+ "Data variant instruction being hardened!");
+ continue;
+ }
+
+ // Otherwise, this use is itself a data-invariant load; the memory access
+ // component of a load can never be data invariant, so check how this
+ // register is being used.
const MCInstrDesc &Desc = UseMI.getDesc();
int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
assert(MemRefBeginIdx >= 0 &&
// can harden.
unsigned UseDefReg = UseMI.getOperand(0).getReg();
if (!TRI->isVirtualRegister(UseDefReg) ||
- !MRI->getRegClass(UseDefReg)->hasSubClassEq(&X86::GR64RegClass))
+ !canHardenRegister(UseDefReg))
return {};
SingleUseMI = &UseMI;
return MI;
}
-bool X86SpeculativeLoadHardeningPass::canHardenPostLoad(MachineInstr &MI) {
- if (!isDataInvariantLoad(MI))
+bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
+ auto *RC = MRI->getRegClass(Reg);
+ int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
+ if (RegBytes > 8)
+ // We don't support post-load hardening of vectors.
return false;
- auto &DefOp = MI.getOperand(0);
- unsigned OldDefReg = DefOp.getReg();
-
- auto *DefRC = MRI->getRegClass(OldDefReg);
- int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;
- if (DefRegBytes > 8)
- // We don't support post-load hardening of vectors.
+ // If this register class is explicitly constrained to a class that doesn't
+ // require a REX prefix, we may not be able to satisfy that constraint when
+ // emitting the hardening instructions, so bail out here.
+ // FIXME: This seems like a pretty lame hack. The way this comes up is when
+ // we end up with both a NOREX and a REX-only register as operands to the
+ // hardening instructions. It would be better to fix that code to handle this
+ // situation rather than hack around it in this way.
+ const TargetRegisterClass *NOREXRegClasses[] = {
+ &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
+ &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
+ if (RC == NOREXRegClasses[Log2_32(RegBytes)])
return false;
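+ // Otherwise, the register must belong to (a subclass of) one of the
+ // canonical GPR register classes for its width.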
const TargetRegisterClass *GPRRegClasses[] = {
&X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
&X86::GR64RegClass};
- return DefRC->hasSuperClassEq(GPRRegClasses[Log2_32(DefRegBytes)]);
+ return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
}
// We can harden non-leaking loads into register without touching the address
// execution and coercing them to one is sufficient.
void X86SpeculativeLoadHardeningPass::hardenPostLoad(
MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
- assert(canHardenPostLoad(MI) &&
- "Invalid instruction for post-load hardening!");
-
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc Loc = MI.getDebugLoc();
// For all of these, the def'ed register operand is operand zero.
auto &DefOp = MI.getOperand(0);
unsigned OldDefReg = DefOp.getReg();
+ assert(canHardenRegister(OldDefReg) &&
+ "Cannot harden this instruction's defined register!");
auto *DefRC = MRI->getRegClass(OldDefReg);
int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;
call void @sink_v2i64(<2 x i64> %x6)
ret void
}
+
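+; Check that hardening of a data-invariant load can be deferred: the OR with
+; the predicate state is sunk past the data-invariant uses (incl, imull, leal)
+; and applied once to the final value passed to the call.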
+define void @test_deferred_hardening(i32* %ptr1, i32* %ptr2, i32 %x) nounwind {
+; X64-LABEL: test_deferred_hardening:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %rbx
+; X64-NEXT: pushq %rax
+; X64-NEXT: movq %rsp, %rax
+; X64-NEXT: movq %rsi, %r14
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: movq $-1, %rcx
+; X64-NEXT: sarq $63, %rax
+; X64-NEXT: movl (%rdi), %edi
+; X64-NEXT: incl %edi
+; X64-NEXT: imull %edx, %edi
+; X64-NEXT: orl %eax, %edi
+; X64-NEXT: shlq $47, %rax
+; X64-NEXT: orq %rax, %rsp
+; X64-NEXT: callq sink
+; X64-NEXT: movq %rsp, %rax
+; X64-NEXT: sarq $63, %rax
+; X64-NEXT: movl (%rbx), %ecx
+; X64-NEXT: movl (%r14), %edx
+; X64-NEXT: leal 1(%rcx,%rdx), %edi
+; X64-NEXT: orl %eax, %edi
+; X64-NEXT: shlq $47, %rax
+; X64-NEXT: orq %rax, %rsp
+; X64-NEXT: callq sink
+; X64-NEXT: movq %rsp, %rax
+; X64-NEXT: sarq $63, %rax
+; X64-NEXT: shlq $47, %rax
+; X64-NEXT: orq %rax, %rsp
+; X64-NEXT: addq $8, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r14
+; X64-NEXT: retq
+;
+; X64-LFENCE-LABEL: test_deferred_hardening:
+; X64-LFENCE: # %bb.0: # %entry
+; X64-LFENCE-NEXT: pushq %r14
+; X64-LFENCE-NEXT: pushq %rbx
+; X64-LFENCE-NEXT: pushq %rax
+; X64-LFENCE-NEXT: movq %rsi, %r14
+; X64-LFENCE-NEXT: movq %rdi, %rbx
+; X64-LFENCE-NEXT: movl (%rdi), %edi
+; X64-LFENCE-NEXT: incl %edi
+; X64-LFENCE-NEXT: imull %edx, %edi
+; X64-LFENCE-NEXT: callq sink
+; X64-LFENCE-NEXT: movl (%rbx), %eax
+; X64-LFENCE-NEXT: movl (%r14), %ecx
+; X64-LFENCE-NEXT: leal 1(%rax,%rcx), %edi
+; X64-LFENCE-NEXT: callq sink
+; X64-LFENCE-NEXT: addq $8, %rsp
+; X64-LFENCE-NEXT: popq %rbx
+; X64-LFENCE-NEXT: popq %r14
+; X64-LFENCE-NEXT: retq
+entry:
+ %a1 = load i32, i32* %ptr1
+ %a2 = add i32 %a1, 1
+ %a3 = mul i32 %a2, %x
+ call void @sink(i32 %a3)
+ %b1 = load i32, i32* %ptr1
+ %b2 = add i32 %b1, 1
+ %b3 = load i32, i32* %ptr2
+ %b4 = add i32 %b2, %b3
+ call void @sink(i32 %b4)
+ ret void
+}