"amdgpu-waves-per-eu"="m,n" Specify the minimum and maximum number of waves per
execution unit. Generated by the ``amdgpu_waves_per_eu``
CLANG attribute [CLANG-ATTR]_.
+
+ "amdgpu-ieee" true/false. Specify whether the function expects
+ the IEEE field of the mode register to be set on entry. Overrides
+ the default for the calling convention.
+ "amdgpu-dx10-clamp" true/false. Specify whether the function expects
+ the DX10_CLAMP field of the mode register to be set on entry. Overrides
+ the default for the calling convention.
+
======================================= ==========================================================
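The two attributes documented above are plain string function attributes, so a front end or IR pass can attach them directly and let the backend resolve anything left unset from the calling-convention defaults. A minimal C++ sketch (the helper name is illustrative only, not part of this patch):

    #include "llvm/IR/Function.h"

    // Hypothetical helper: request IEEE mode off and DX10 clamping on for F.
    // Attributes that are left unset fall back to the calling-convention
    // defaults resolved by the backend.
    static void setAMDGPUFPModeAttrs(llvm::Function &F) {
      F.addFnAttr("amdgpu-ieee", "false");
      F.addFnAttr("amdgpu-dx10-clamp", "true");
    }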
Code Object
// register.
ProgInfo.FloatMode = getFPMode(MF);
- ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
+ const SIModeRegisterDefaults Mode = MFI->getMode();
+ ProgInfo.IEEEMode = Mode.IEEE;
// Make the clamp modifier return 0 on NaN inputs.
- ProgInfo.DX10Clamp = STM.enableDX10Clamp();
+ ProgInfo.DX10Clamp = Mode.DX10Clamp;
unsigned LDSAlignShift;
if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
SubtargetFeature <Value, "Gen", Subtarget#"::"#Value,
Value#" GPU generation", Implies>;
-def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
- "DX10Clamp",
- "true",
- "clamp modifier clamps NaNs to 0.0"
->;
-
def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
"EnablePromoteAlloca",
"true",
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
- SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
+ SmallString<256> FullFS("+promote-alloca,");
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
// Similarly we want enable-prt-strict-null to be on by default and not to
// unset everything else if it is disabled
- SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
+ SmallString<256> FullFS("+promote-alloca,+load-store-opt,");
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
HalfRate64Ops(false),
FP64FP16Denormals(false),
- DX10Clamp(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
CodeObjectV3(false),
FMA(false),
CaymanISA(false),
CFALUBug(false),
- DX10Clamp(false),
HasVertexCache(false),
R600ALUInst(false),
FP64(false),
// Dynamically set bits that enable features.
bool FP64FP16Denormals;
- bool DX10Clamp;
bool FlatForGlobal;
bool AutoWaitcntBeforeBarrier;
bool CodeObjectV3;
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
- bool enableDX10Clamp() const {
- return DX10Clamp;
- }
-
- bool enableIEEEBit(const MachineFunction &MF) const {
- return AMDGPU::isCompute(MF.getFunction().getCallingConv());
- }
-
bool useFlatForGlobal() const {
return FlatForGlobal;
}
bool FMA;
bool CaymanISA;
bool CFALUBug;
- bool DX10Clamp;
bool HasVertexCache;
bool R600ALUInst;
bool FP64;
}
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
- const Function *Callee) const {
+ const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
const FeatureBitset &CallerBits =
TM.getSubtargetImpl(*Caller)->getFeatureBits();
FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
- return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
+ if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
+ return false;
+
+ // FIXME: dx10_clamp can just take the caller setting, but there seems to be
+ // no way to support merge for backend defined attributes.
+ AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
+ AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
+ return CallerMode.isInlineCompatible(CalleeMode);
}
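As a rough sketch of the relaxation the FIXME above alludes to (hypothetical, not what this patch implements), DX10Clamp could simply follow the caller after inlining, leaving only the IEEE bit that has to agree:

    // Hypothetical relaxed check: only the IEEE bit must match; the inlined
    // code would inherit the caller's DX10Clamp setting.
    static bool isInlineCompatibleRelaxed(AMDGPU::SIModeRegisterDefaults CallerMode,
                                          AMDGPU::SIModeRegisterDefaults CalleeMode) {
      return CallerMode.IEEE == CalleeMode.IEEE;
    }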
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// omod is ignored by hardware if IEEE bit is enabled. omod also does not
// correctly handle signed zeros.
//
- bool IsIEEEMode = ST->enableIEEEBit(MF);
+ // FIXME: Also need to check strictfp
+ bool IsIEEEMode = MFI->getMode().IEEE;
bool HasNSZ = MFI->hasNoSignedZerosFPMath();
for (MachineBasicBlock *MBB : depth_first(&MF)) {
SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ bool IsIEEEMode = Info->getMode().IEEE;
// FIXME: Assert during selection that this is only selected for
// ieee_mode. Currently a combine can produce the ieee version for non-ieee
if (Cmp == APFloat::cmpGreaterThan)
return SDValue();
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
// TODO: Check IEEE bit enabled?
EVT VT = Op0.getValueType();
- if (Subtarget->enableDX10Clamp()) {
+ if (Info->getMode().DX10Clamp) {
// If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
// hardware fmed3 behavior converting to a min.
// FIXME: Should this be allowing -0.0?
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
}
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
// FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
// handling no dx10-clamp?
- if (Subtarget->enableDX10Clamp()) {
+ if (Info->getMode().DX10Clamp) {
// If NaNs is clamped to 0, we are free to reorder the inputs.
if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
if (!CSrc)
return SDValue();
+ const MachineFunction &MF = DCI.DAG.getMachineFunction();
const APFloat &F = CSrc->getValueAPF();
APFloat Zero = APFloat::getZero(F.getSemantics());
APFloat::cmpResult Cmp0 = F.compare(Zero);
if (Cmp0 == APFloat::cmpLessThan ||
- (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+ (Cmp0 == APFloat::cmpUnordered &&
+ MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
}
bool SNaN,
unsigned Depth) const {
if (Op.getOpcode() == AMDGPUISD::CLAMP) {
- if (Subtarget->enableDX10Clamp())
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (Info->getMode().DX10Clamp)
return true; // Clamped to 0.
return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
}
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
+ Mode(MF.getFunction()),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
AMDGPUFunctionArgInfo ArgInfo;
+ // State of MODE register, assumed FP mode.
+ AMDGPU::SIModeRegisterDefaults Mode;
+
// Graphics info.
unsigned PSInputAddr = 0;
unsigned PSInputEnable = 0;
return SpillVGPRs;
}
+ AMDGPU::SIModeRegisterDefaults getMode() const {
+ return Mode;
+ }
+
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
return true;
}
+SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
+ *this = getDefaultForCallingConv(F.getCallingConv());
+
+ StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
+ if (!IEEEAttr.empty())
+ IEEE = IEEEAttr == "true";
+
+ StringRef DX10ClampAttr
+ = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString();
+ if (!DX10ClampAttr.empty())
+ DX10Clamp = DX10ClampAttr == "true";
+}
+
namespace {
struct SourceOfDivergence {
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
+
+// Track defaults for fields in the MODE register.
+struct SIModeRegisterDefaults {
+ /// Floating point opcodes that support exception flag gathering, quiet and
+ /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10
+ /// become IEEE 754-2008 compliant due to signaling NaN propagation and
+ /// quieting.
+ bool IEEE : 1;
+
+ /// Used by the vector ALU to force DX10-style treatment of NaNs: when set,
+ /// clamp NaN to zero; otherwise, pass NaN through.
+ bool DX10Clamp : 1;
+
+ // TODO: FP mode fields
+
+ SIModeRegisterDefaults() :
+ IEEE(true),
+ DX10Clamp(true) {}
+
+ SIModeRegisterDefaults(const Function &F);
+
+ static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
+ SIModeRegisterDefaults Mode;
+ Mode.DX10Clamp = true;
+ Mode.IEEE = AMDGPU::isCompute(CC);
+ return Mode;
+ }
+
+ bool operator ==(const SIModeRegisterDefaults Other) const {
+ return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp;
+ }
+
+ // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should
+ // be able to override.
+ bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const {
+ return *this == CalleeMode;
+ }
+};
+
} // end namespace AMDGPU
} // end namespace llvm
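To make the calling-convention defaults concrete, the following sketch (illustration only, assuming the backend-internal Utils/AMDGPUBaseInfo.h header is visible) checks what getDefaultForCallingConv resolves to for a compute kernel versus a pixel shader:

    #include "Utils/AMDGPUBaseInfo.h"   // backend-internal header
    #include "llvm/IR/CallingConv.h"
    #include <cassert>

    void checkModeDefaults() {
      using llvm::AMDGPU::SIModeRegisterDefaults;
      // Compute calling conventions default to IEEE=1; graphics shaders to
      // IEEE=0. DX10Clamp defaults to 1 for every calling convention.
      SIModeRegisterDefaults Kernel =
          SIModeRegisterDefaults::getDefaultForCallingConv(
              llvm::CallingConv::AMDGPU_KERNEL);
      SIModeRegisterDefaults PS =
          SIModeRegisterDefaults::getDefaultForCallingConv(
              llvm::CallingConv::AMDGPU_PS);
      assert(Kernel.IEEE && Kernel.DX10Clamp);
      assert(!PS.IEEE && PS.DX10Clamp);
    }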
--- /dev/null
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}kernel_ieee_mode_default:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
+; GCN-NOT: v_mul_f32
+define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}kernel_ieee_mode_on:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
+; GCN-NOT: v_mul_f32
+define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}kernel_ieee_mode_off:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-NOT: [[VAL0]]
+; GCN-NOT: [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
+; GCN-NOT: v_mul_f32
+define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_ieee_mode_default:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
+; GCN-NOT: v_mul_f32
+define void @func_ieee_mode_default() #0 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_ieee_mode_on:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
+; GCN-NOT: v_mul_f32
+define void @func_ieee_mode_on() #1 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_ieee_mode_off:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-NOT: [[VAL0]]
+; GCN-NOT: [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
+; GCN-NOT: v_mul_f32
+define void @func_ieee_mode_off() #2 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}cs_ieee_mode_default:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
+; GCN-NOT: v_mul_f32
+define amdgpu_cs void @cs_ieee_mode_default() #0 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}cs_ieee_mode_on:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
+; GCN-NOT: v_mul_f32
+define amdgpu_cs void @cs_ieee_mode_on() #1 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}cs_ieee_mode_off:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-NOT: [[VAL0]]
+; GCN-NOT: [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
+; GCN-NOT: v_mul_f32
+define amdgpu_cs void @cs_ieee_mode_off() #2 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}ps_ieee_mode_default:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-NOT: [[VAL0]]
+; GCN-NOT: [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
+; GCN-NOT: v_mul_f32
+define amdgpu_ps void @ps_ieee_mode_default() #0 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}ps_ieee_mode_on:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
+; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
+; GCN-NOT: v_mul_f32
+define amdgpu_ps void @ps_ieee_mode_on() #1 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}ps_ieee_mode_off:
+; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
+; GCN-NOT: [[VAL0]]
+; GCN-NOT: [[VAL1]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
+; GCN-NOT: v_mul_f32
+define amdgpu_ps void @ps_ieee_mode_off() #2 {
+ %val0 = load volatile float, float addrspace(1)* undef
+ %val1 = load volatile float, float addrspace(1)* undef
+ %min = call float @llvm.minnum.f32(float %val0, float %val1)
+ store volatile float %min, float addrspace(1)* undef
+ ret void
+}
+
+declare float @llvm.minnum.f32(float, float) #3
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-ieee"="true" }
+attributes #2 = { nounwind "amdgpu-ieee"="false" }
+attributes #3 = { nounwind readnone speculatable }
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" }
-attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
-attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
+attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="-fp-exceptions" "no-nans-fp-math"="false" }
+attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "target-features"="+fp-exceptions" "no-nans-fp-math"="false" }
+attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="+fp-exceptions" "no-nans-fp-math"="false" }
ret void
}
+; GCN-LABEL: {{^}}test_no_ieee_mode_vi:
+; GCN: float_mode = 192
+; GCN: enable_dx10_clamp = 1
+; GCN: enable_ieee_mode = 0
+define amdgpu_kernel void @test_no_ieee_mode_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #7 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_no_ieee_mode_no_dx10_clamp_vi:
+; GCN: float_mode = 192
+; GCN: enable_dx10_clamp = 0
+; GCN: enable_ieee_mode = 0
+define amdgpu_kernel void @test_no_ieee_mode_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #8 {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
+
attributes #0 = { nounwind "target-cpu"="kaveri" "target-features"="-code-object-v3" }
attributes #1 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3" }
attributes #2 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,+fp64-fp16-denormals" }
attributes #3 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,-fp64-fp16-denormals" }
attributes #4 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,+fp64-fp16-denormals" }
attributes #5 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,-fp64-fp16-denormals" }
-attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3,-dx10-clamp" }
+attributes #6 = { nounwind "amdgpu-dx10-clamp"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" }
+attributes #7 = { nounwind "amdgpu-ieee"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" }
+attributes #8 = { nounwind "amdgpu-dx10-clamp"="false" "amdgpu-ieee"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" }
--- /dev/null
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -inline < %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes='cgscc(inline)' < %s | FileCheck %s
+
+define i32 @func_default() #0 {
+ ret i32 0
+}
+
+define i32 @func_dx10_clamp_enabled() #1 {
+ ret i32 0
+}
+
+define i32 @func_dx10_clamp_disabled() #2 {
+ ret i32 0
+}
+
+; CHECK-LABEL: @default_call_default(
+; CHECK-NEXT: ret i32 0
+define i32 @default_call_default() #0 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @dx10_clamp_enabled_call_default(
+; CHECK-NEXT: ret i32 0
+define i32 @dx10_clamp_enabled_call_default() #1 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @dx10_clamp_enabled_call_dx10_clamp_enabled(
+; CHECK-NEXT: ret i32 0
+define i32 @dx10_clamp_enabled_call_dx10_clamp_enabled() #1 {
+ %call = call i32 @func_dx10_clamp_enabled()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @dx10_clamp_enabled_call_dx10_clamp_disabled(
+; CHECK-NEXT: call i32 @func_dx10_clamp_disabled()
+define i32 @dx10_clamp_enabled_call_dx10_clamp_disabled() #1 {
+ %call = call i32 @func_dx10_clamp_disabled()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @dx10_clamp_disabled_call_default(
+; CHECK-NEXT: call i32 @func_default()
+define i32 @dx10_clamp_disabled_call_default() #2 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @dx10_clamp_disabled_call_dx10_clamp_enabled(
+; CHECK-NEXT: call i32 @func_dx10_clamp_enabled()
+define i32 @dx10_clamp_disabled_call_dx10_clamp_enabled() #2 {
+ %call = call i32 @func_dx10_clamp_enabled()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @dx10_clamp_disabled_call_dx10_clamp_disabled(
+; CHECK-NEXT: ret i32 0
+define i32 @dx10_clamp_disabled_call_dx10_clamp_disabled() #2 {
+ %call = call i32 @func_dx10_clamp_disabled()
+ ret i32 %call
+}
+
+; Shader calling a compute function
+; CHECK-LABEL: @amdgpu_ps_default_call_default(
+; CHECK-NEXT: call i32 @func_default()
+define amdgpu_ps i32 @amdgpu_ps_default_call_default() #0 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; Shader with dx10_clamp enabled calling a compute function. The callee's
+; default calling convention also implies ieee_mode, which the shader
+; caller lacks, so this isn't inlinable.
+; CHECK-LABEL: @amdgpu_ps_dx10_clamp_enabled_call_default(
+; CHECK-NEXT: call i32 @func_default()
+define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_enabled_call_default() #1 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @amdgpu_ps_dx10_clamp_disabled_call_default(
+; CHECK-NEXT: call i32 @func_default()
+define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_disabled_call_default() #2 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @amdgpu_ps_dx10_clamp_enabled_ieee_call_default(
+; CHECK-NEXT: ret i32 0
+define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_enabled_ieee_call_default() #3 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @amdgpu_ps_dx10_clamp_disabled_ieee_call_default(
+; CHECK-NEXT: call i32 @func_default()
+define amdgpu_ps i32 @amdgpu_ps_dx10_clamp_disabled_ieee_call_default() #4 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-dx10-clamp"="true" }
+attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" }
+attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "amdgpu-ieee"="true" }
+attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "amdgpu-ieee"="true" }
--- /dev/null
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -inline < %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes='cgscc(inline)' < %s | FileCheck %s
+
+define i32 @func_default() #0 {
+ ret i32 0
+}
+
+define i32 @func_ieee_enabled() #1 {
+ ret i32 0
+}
+
+define i32 @func_ieee_disabled() #2 {
+ ret i32 0
+}
+
+; CHECK-LABEL: @default_call_default(
+; CHECK-NEXT: ret i32 0
+define i32 @default_call_default() #0 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @ieee_enabled_call_default(
+; CHECK-NEXT: ret i32 0
+define i32 @ieee_enabled_call_default() #1 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @ieee_enabled_call_ieee_enabled(
+; CHECK-NEXT: ret i32 0
+define i32 @ieee_enabled_call_ieee_enabled() #1 {
+ %call = call i32 @func_ieee_enabled()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @ieee_enabled_call_ieee_disabled(
+; CHECK-NEXT: call i32 @func_ieee_disabled()
+define i32 @ieee_enabled_call_ieee_disabled() #1 {
+ %call = call i32 @func_ieee_disabled()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @ieee_disabled_call_default(
+; CHECK-NEXT: call i32 @func_default()
+define i32 @ieee_disabled_call_default() #2 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @ieee_disabled_call_ieee_enabled(
+; CHECK-NEXT: call i32 @func_ieee_enabled()
+define i32 @ieee_disabled_call_ieee_enabled() #2 {
+ %call = call i32 @func_ieee_enabled()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @ieee_disabled_call_ieee_disabled(
+; CHECK-NEXT: ret i32 0
+define i32 @ieee_disabled_call_ieee_disabled() #2 {
+ %call = call i32 @func_ieee_disabled()
+ ret i32 %call
+}
+
+; Shader calling a compute function
+; CHECK-LABEL: @amdgpu_ps_default_call_default(
+; CHECK-NEXT: call i32 @func_default()
+define amdgpu_ps i32 @amdgpu_ps_default_call_default() #0 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; Shader with ieee enabled calling a compute function
+; CHECK-LABEL: @amdgpu_ps_ieee_enabled_call_default(
+; CHECK-NEXT: ret i32 0
+define amdgpu_ps i32 @amdgpu_ps_ieee_enabled_call_default() #1 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+; CHECK-LABEL: @amdgpu_ps_ieee_disabled_call_default(
+; CHECK-NEXT: call i32 @func_default()
+define amdgpu_ps i32 @amdgpu_ps_ieee_disabled_call_default() #2 {
+ %call = call i32 @func_default()
+ ret i32 %call
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-ieee"="true" }
+attributes #2 = { nounwind "amdgpu-ieee"="false" }