gcc supports selecting ymm0/zmm0 for the Yz constraint when used with 256 or 512 bit vector types.
Fixes PR45806
Differential Revision: https://reviews.llvm.org/D79448
return Size <= 64;
case 'z':
case '0':
- // XMM0
- if (FeatureMap.lookup("sse"))
+ // XMM0/YMM/ZMM0
+ if (FeatureMap.lookup("avx512f"))
+ // ZMM0 can be used if target supports AVX512F.
+ return Size <= 512U;
+ else if (FeatureMap.lookup("avx"))
+ // YMM0 can be used if target supports AVX.
+ return Size <= 256U;
+ else if (FeatureMap.lookup("sse"))
return Size <= 128U;
return false;
case 'i':
#endif
return _zmm0;
}
+
+// SSE: call <4 x float> asm "pcmpeqd $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+__m128 testXMM0(void) {
+ __m128 xmm0;
+ __asm__("pcmpeqd %0, %0" :"=Yz"(xmm0));
+ return xmm0;
+}
+
+// AVX: call <8 x float> asm "vpcmpeqd $0, $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+__m256 testYMM0(void) {
+ __m256 ymm0;
+#ifdef AVX
+ __asm__("vpcmpeqd %0, %0, %0" :"=Yz"(ymm0));
+#endif
+ return ymm0;
+}
+
+// AVX512: call <16 x float> asm "vpternlogd $$255, $0, $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+__m512 testZMM0(void) {
+ __m512 zmm0;
+#ifdef AVX512
+ __asm__("vpternlogd $255, %0, %0, %0" :"=Yz"(zmm0));
+#endif
+ return zmm0;
+}
// XMM0
case 'z':
case '0':
- if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
+ if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
+ ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
return CW_SpecificReg;
return CW_Invalid;
// Conditional OpMask regs (AVX512)
if (Subtarget.hasAVX())
return std::make_pair(0U, &X86::VR256RegClass);
break;
+ case MVT::v64i8:
+ case MVT::v32i16:
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case 'z':
case '0':
if (!Subtarget.hasSSE1()) break;
- return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+ switch (VT.SimpleTy) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ return std::make_pair(X86::XMM0, &X86::FR32RegClass);
+ case MVT::f64:
+ case MVT::i64:
+ return std::make_pair(X86::XMM0, &X86::FR64RegClass);
+ case MVT::f128:
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+ // AVX types.
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ if (Subtarget.hasAVX())
+ return std::make_pair(X86::YMM0, &X86::VR256RegClass);
+ break;
+ case MVT::v8f64:
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (Subtarget.hasAVX512())
+ return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
+ break;
+ }
+ break;
case 'k':
// This register class doesn't allocate k0 for masked vector operation.
if (Subtarget.hasAVX512()) {
ret <8 x float> %0
}
+define <8 x float> @testYMM0() {
+; CHECK: vpcmpeqd %ymm0, %ymm0, %ymm0
+entry:
+ %ymm0 = alloca <8 x float>, align 32
+ %0 = call <8 x float> asm "vpcmpeqd $0, $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+ store <8 x float> %0, <8 x float>* %ymm0, align 32
+ %1 = load <8 x float>, <8 x float>* %ymm0, align 32
+ ret <8 x float> %1
+}
+
ret <16 x float> %0
}
+define <16 x float> @testZMM0() {
+entry:
+; CHECK: vpternlogd $255, %zmm0, %zmm0, %zmm0
+ %zmm0 = alloca <16 x float>, align 64
+ %0 = call <16 x float> asm "vpternlogd $$255, $0, $0, $0", "=^Yz,~{dirflag},~{fpsr},~{flags}"()
+ store <16 x float> %0, <16 x float>* %zmm0, align 64
+ %1 = load <16 x float>, <16 x float>* %zmm0, align 64
+ ret <16 x float> %1
+}