From: Simon Pilgrim Date: Tue, 9 Aug 2016 09:32:34 +0000 (+0000) Subject: [X86][SSE] Fix memory folding of (v)roundsd / (v)roundss X-Git-Tag: llvmorg-4.0.0-rc1~12977 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=54c32ddf558030dc571ba6809944a1240b663d11;p=platform%2Fupstream%2Fllvm.git [X86][SSE] Fix memory folding of (v)roundsd / (v)roundss We only had partial memory folding support for the intrinsic definitions, which (as noted on PR27481) was causing FR32/FR64/VR128 mismatch errors with the machine verifier. This patch adds missing memory folding support for both intrinsics and the ffloor/fnearbyint/fceil/frint/ftrunc patterns and in doing so fixes the failing machine verifier stack folding tests from PR27481. Differential Revision: https://reviews.llvm.org/D23276 llvm-svn: 278106 --- diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 30c963c..b8338bb 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1116,6 +1116,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, + { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, 0 }, + { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, 0 }, { X86::SBB32rr, X86::SBB32rm, 0 }, { X86::SBB64rr, X86::SBB64rm, 0 }, { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, @@ -1412,6 +1414,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPXORrr, X86::VPXORrm, 0 }, { X86::VROUNDSDr, X86::VROUNDSDm, 0 }, { X86::VROUNDSSr, X86::VROUNDSSm, 0 }, + { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, 0 }, + { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, 0 }, { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, @@ -6208,9 +6212,11 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::ROUNDSDr: case X86::ROUNDSDm: case X86::ROUNDSDr_Int: + case X86::ROUNDSDm_Int: case X86::ROUNDSSr: 
case X86::ROUNDSSm: case X86::ROUNDSSr_Int: + case X86::ROUNDSSm_Int: case X86::RSQRTSSr: case X86::RSQRTSSm: case X86::RSQRTSSr_Int: @@ -6289,9 +6295,11 @@ static bool hasUndefRegUpdate(unsigned Opcode) { case X86::VROUNDSDr: case X86::VROUNDSDm: case X86::VROUNDSDr_Int: + case X86::VROUNDSDm_Int: case X86::VROUNDSSr: case X86::VROUNDSSm: case X86::VROUNDSSr_Int: + case X86::VROUNDSSm_Int: case X86::VRSQRTSSr: case X86::VRSQRTSSr_Int: case X86::VRSQRTSSm: diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 3ee1c1e..a6d30a9 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -6409,6 +6409,17 @@ let ExeDomain = GenericDomain in { "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), []>, Sched<[WriteFAdd]>; + // Operation, mem. + let mayLoad = 1 in + def SSm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; + // Intrinsic operation, reg. let isCodeGenOnly = 1 in def SSr_Int : SS4AIi8; // Intrinsic operation, mem. - def SSm : SS4AIi8, Sched<[WriteFAdd]>; + // Operation, mem. + let mayLoad = 1 in + def SDm : SS4AIi8, Sched<[WriteFAddLd, ReadAfterLd]>; + // Intrinsic operation, reg. let isCodeGenOnly = 1 in def SDr_Int : SS4AIi8; // Intrinsic operation, mem. 
- def SDm : SS4AIi8 @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) optsize { + ;CHECK-LABEL: stack_fold_roundsd_int + ;CHECK: vroundsd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) + ret <2 x double> %2 +} declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone define float @stack_fold_roundss(float %a0) optsize { @@ -1571,7 +1577,13 @@ define float @stack_fold_roundss(float %a0) optsize { } declare float @llvm.floor.f32(float) nounwind readnone -; TODO stack_fold_roundss_int +define <4 x float> @stack_fold_roundss_int(<4 x float> %a0, <4 x float> %a1) optsize { + ;CHECK-LABEL: stack_fold_roundss_int + ;CHECK: vroundss $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) + ret <4 x float> %2 +} declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone ; TODO stack_fold_rsqrtps diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll index be0c5b7..bc346a6 100644 --- a/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-sse42.ll @@ -1,4 +1,4 @@ -; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 < %s | FileCheck %s +; RUN: llc -O3 -verify-machineinstrs -disable-peephole -mtriple=x86_64-unknown-unknown 
-mattr=+sse4.2 < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" @@ -987,7 +987,13 @@ define double @stack_fold_roundsd(double %a0) optsize { } declare double @llvm.floor.f64(double) nounwind readnone -; TODO stack_fold_roundsd_int +define <2 x double> @stack_fold_roundsd_int(<2 x double> %a0, <2 x double> %a1) optsize { + ;CHECK-LABEL: stack_fold_roundsd_int + ;CHECK: roundsd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) + ret <2 x double> %2 +} declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone define float @stack_fold_roundss(float %a0) minsize { @@ -999,7 +1005,13 @@ define float @stack_fold_roundss(float %a0) minsize { } declare float @llvm.floor.f32(float) nounwind readnone -; TODO stack_fold_roundss_int +define <4 x float> @stack_fold_roundss_int(<4 x float> %a0, <4 x float> %a1) optsize { + ;CHECK-LABEL: stack_fold_roundss_int + ;CHECK: roundss $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) + ret <4 x float> %2 +} declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone ; TODO stack_fold_rsqrtps