From: Simon Pilgrim
Date: Wed, 5 Jun 2019 16:11:57 +0000 (+0000)
Subject: [X86][SSE] Add additional nt-load test cases as discussed on D62910
X-Git-Tag: llvmorg-10-init~3696
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a0e350e640b3068717eb1522e199db99b7741984;p=platform%2Fupstream%2Fllvm.git

[X86][SSE] Add additional nt-load test cases as discussed on D62910

llvm-svn: 362616
---

diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
index 0461008..9ef0ecb 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -2,14 +2,18 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE4A
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
 ;
 ; PR42123
 ;
 
-; FIXME: AVX doesn't retain NT flag on store.
-; Should be VMOVNTPS ymm.
+; FIXME: AVX doesn't retain NT flag on load/store.
+; AVX1 load should be 2 x VMOVNTDQA xmm.
+; AVX2 load should be VMOVNTDQA ymm.
+; AVX store should be VMOVNTPS ymm.
 define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) {
 ; X86-LABEL: merge_2_v4f32_align32:
 ; X86: # %bb.0:
@@ -21,13 +25,29 @@ define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) {
 ; X86-NEXT: movntps %xmm1, 16(%eax)
 ; X86-NEXT: retl
 ;
-; X64-SSE-LABEL: merge_2_v4f32_align32:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movaps (%rdi), %xmm0
-; X64-SSE-NEXT: movaps 16(%rdi), %xmm1
-; X64-SSE-NEXT: movntps %xmm0, (%rsi)
-; X64-SSE-NEXT: movntps %xmm1, 16(%rsi)
-; X64-SSE-NEXT: retq
+; X64-SSE2-LABEL: merge_2_v4f32_align32:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movaps (%rdi), %xmm0
+; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
+; X64-SSE2-NEXT: movntps %xmm0, (%rsi)
+; X64-SSE2-NEXT: movntps %xmm1, 16(%rsi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE4A-LABEL: merge_2_v4f32_align32:
+; X64-SSE4A: # %bb.0:
+; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
+; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; X64-SSE4A-NEXT: movntps %xmm0, (%rsi)
+; X64-SSE4A-NEXT: movntps %xmm1, 16(%rsi)
+; X64-SSE4A-NEXT: retq
+;
+; X64-SSE41-LABEL: merge_2_v4f32_align32:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
+; X64-SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; X64-SSE41-NEXT: movntdq %xmm0, (%rsi)
+; X64-SSE41-NEXT: movntdq %xmm1, 16(%rsi)
+; X64-SSE41-NEXT: retq
 ;
 ; X64-AVX-LABEL: merge_2_v4f32_align32:
 ; X64-AVX: # %bb.0:
@@ -37,8 +57,8 @@ define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) {
 ; X64-AVX-NEXT: retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
   %2 = bitcast float* %1 to <4 x float>*
-  %3 = load <4 x float>, <4 x float>* %a0, align 32
-  %4 = load <4 x float>, <4 x float>* %2, align 16
+  %3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0
+  %4 = load <4 x float>, <4 x float>* %2, align 16, !nontemporal !0
   %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
   %6 = bitcast float* %5 to <4 x float>*
   store <4 x float> %3, <4 x float>* %a1, align 32, !nontemporal !0
@@ -46,10 +66,64 @@ define void @merge_2_v4f32_align32(<4 x float>* %a0, <4 x float>* %a1) {
   ret void
 }
 
+; FIXME: shouldn't attempt to merge nt and non-nt loads even if aligned.
+; Must be kept seperate as VMOVNTDQA xmm + VMOVDQA xmm.
+define void @merge_2_v4f32_align32_mix_ntload(<4 x float>* %a0, <4 x float>* %a1) {
+; X86-LABEL: merge_2_v4f32_align32_mix_ntload:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movaps (%ecx), %xmm0
+; X86-NEXT: movaps 16(%ecx), %xmm1
+; X86-NEXT: movaps %xmm0, (%eax)
+; X86-NEXT: movaps %xmm1, 16(%eax)
+; X86-NEXT: retl
+;
+; X64-SSE2-LABEL: merge_2_v4f32_align32_mix_ntload:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movaps (%rdi), %xmm0
+; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
+; X64-SSE2-NEXT: movaps %xmm0, (%rsi)
+; X64-SSE2-NEXT: movaps %xmm1, 16(%rsi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE4A-LABEL: merge_2_v4f32_align32_mix_ntload:
+; X64-SSE4A: # %bb.0:
+; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
+; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; X64-SSE4A-NEXT: movaps %xmm0, (%rsi)
+; X64-SSE4A-NEXT: movaps %xmm1, 16(%rsi)
+; X64-SSE4A-NEXT: retq
+;
+; X64-SSE41-LABEL: merge_2_v4f32_align32_mix_ntload:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
+; X64-SSE41-NEXT: movaps 16(%rdi), %xmm1
+; X64-SSE41-NEXT: movdqa %xmm0, (%rsi)
+; X64-SSE41-NEXT: movaps %xmm1, 16(%rsi)
+; X64-SSE41-NEXT: retq
+;
+; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
+; X64-AVX-NEXT: vmovaps %ymm0, (%rsi)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
+  %2 = bitcast float* %1 to <4 x float>*
+  %3 = load <4 x float>, <4 x float>* %a0, align 32, !nontemporal !0
+  %4 = load <4 x float>, <4 x float>* %2, align 16
+  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
+  %6 = bitcast float* %5 to <4 x float>*
+  store <4 x float> %3, <4 x float>* %a1, align 32
+  store <4 x float> %4, <4 x float>* %6, align 16
+  ret void
+}
+
 ; FIXME: shouldn't attempt to merge nt and non-nt stores even if aligned.
 ; Must be kept seperate as VMOVNTPS xmm + VMOVAPS xmm.
-define void @merge_2_v4f32_align32_mix(<4 x float>* %a0, <4 x float>* %a1) {
-; X86-LABEL: merge_2_v4f32_align32_mix:
+define void @merge_2_v4f32_align32_mix_ntstore(<4 x float>* %a0, <4 x float>* %a1) {
+; X86-LABEL: merge_2_v4f32_align32_mix_ntstore:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -59,7 +133,7 @@ define void @merge_2_v4f32_align32_mix(<4 x float>* %a0, <4 x float>* %a1) {
 ; X86-NEXT: movaps %xmm1, 16(%eax)
 ; X86-NEXT: retl
 ;
-; X64-SSE-LABEL: merge_2_v4f32_align32_mix:
+; X64-SSE-LABEL: merge_2_v4f32_align32_mix_ntstore:
 ; X64-SSE: # %bb.0:
 ; X64-SSE-NEXT: movaps (%rdi), %xmm0
 ; X64-SSE-NEXT: movaps 16(%rdi), %xmm1
@@ -67,7 +141,7 @@ define void @merge_2_v4f32_align32_mix(<4 x float>* %a0, <4 x float>* %a1) {
 ; X64-SSE-NEXT: movaps %xmm1, 16(%rsi)
 ; X64-SSE-NEXT: retq
 ;
-; X64-AVX-LABEL: merge_2_v4f32_align32_mix:
+; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore:
 ; X64-AVX: # %bb.0:
 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
 ; X64-AVX-NEXT: vmovaps %ymm0, (%rsi)
@@ -84,10 +158,64 @@ define void @merge_2_v4f32_align32_mix(<4 x float>* %a0, <4 x float>* %a1) {
   ret void
 }
 
+; FIXME: AVX can't perform NT-load-ymm on 16-byte aligned memory.
+; Must be kept seperate as VMOVNTDQA xmm.
+define void @merge_2_v4f32_align16_ntload(<4 x float>* %a0, <4 x float>* %a1) {
+; X86-LABEL: merge_2_v4f32_align16_ntload:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movaps (%ecx), %xmm0
+; X86-NEXT: movaps 16(%ecx), %xmm1
+; X86-NEXT: movaps %xmm0, (%eax)
+; X86-NEXT: movaps %xmm1, 16(%eax)
+; X86-NEXT: retl
+;
+; X64-SSE2-LABEL: merge_2_v4f32_align16_ntload:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movaps (%rdi), %xmm0
+; X64-SSE2-NEXT: movaps 16(%rdi), %xmm1
+; X64-SSE2-NEXT: movaps %xmm0, (%rsi)
+; X64-SSE2-NEXT: movaps %xmm1, 16(%rsi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE4A-LABEL: merge_2_v4f32_align16_ntload:
+; X64-SSE4A: # %bb.0:
+; X64-SSE4A-NEXT: movaps (%rdi), %xmm0
+; X64-SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; X64-SSE4A-NEXT: movaps %xmm0, (%rsi)
+; X64-SSE4A-NEXT: movaps %xmm1, 16(%rsi)
+; X64-SSE4A-NEXT: retq
+;
+; X64-SSE41-LABEL: merge_2_v4f32_align16_ntload:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movntdqa (%rdi), %xmm0
+; X64-SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; X64-SSE41-NEXT: movdqa %xmm0, (%rsi)
+; X64-SSE41-NEXT: movdqa %xmm1, 16(%rsi)
+; X64-SSE41-NEXT: retq
+;
+; X64-AVX-LABEL: merge_2_v4f32_align16_ntload:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
+  %2 = bitcast float* %1 to <4 x float>*
+  %3 = load <4 x float>, <4 x float>* %a0, align 16, !nontemporal !0
+  %4 = load <4 x float>, <4 x float>* %2, align 16, !nontemporal !0
+  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
+  %6 = bitcast float* %5 to <4 x float>*
+  store <4 x float> %3, <4 x float>* %a1, align 16
+  store <4 x float> %4, <4 x float>* %6, align 16
+  ret void
+}
+
 ; FIXME: AVX can't perform NT-store-ymm on 16-byte aligned memory.
 ; Must be kept seperate as VMOVNTPS xmm.
-define void @merge_2_v4f32_align16(<4 x float>* %a0, <4 x float>* %a1) {
-; X86-LABEL: merge_2_v4f32_align16:
+define void @merge_2_v4f32_align16_ntstore(<4 x float>* %a0, <4 x float>* %a1) {
+; X86-LABEL: merge_2_v4f32_align16_ntstore:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -97,7 +225,7 @@ define void @merge_2_v4f32_align16(<4 x float>* %a0, <4 x float>* %a1) {
 ; X86-NEXT: movntps %xmm1, 16(%eax)
 ; X86-NEXT: retl
 ;
-; X64-SSE-LABEL: merge_2_v4f32_align16:
+; X64-SSE-LABEL: merge_2_v4f32_align16_ntstore:
 ; X64-SSE: # %bb.0:
 ; X64-SSE-NEXT: movaps (%rdi), %xmm0
 ; X64-SSE-NEXT: movaps 16(%rdi), %xmm1
@@ -105,7 +233,7 @@ define void @merge_2_v4f32_align16(<4 x float>* %a0, <4 x float>* %a1) {
 ; X64-SSE-NEXT: movntps %xmm1, 16(%rsi)
 ; X64-SSE-NEXT: retq
 ;
-; X64-AVX-LABEL: merge_2_v4f32_align16:
+; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore:
 ; X64-AVX: # %bb.0:
 ; X64-AVX-NEXT: vmovups (%rdi), %ymm0
 ; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
@@ -122,8 +250,84 @@ define void @merge_2_v4f32_align16(<4 x float>* %a0, <4 x float>* %a1) {
   ret void
 }
 
+; FIXME: Nothing can perform NT-load-vector on 1-byte aligned memory.
+; Just perform regular loads.
+define void @merge_2_v4f32_align1_ntload(<4 x float>* %a0, <4 x float>* %a1) {
+; X86-LABEL: merge_2_v4f32_align1_ntload:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movups (%ecx), %xmm0
+; X86-NEXT: movups 16(%ecx), %xmm1
+; X86-NEXT: movups %xmm0, (%eax)
+; X86-NEXT: movups %xmm1, 16(%eax)
+; X86-NEXT: retl
+;
+; X64-SSE-LABEL: merge_2_v4f32_align1_ntload:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movups (%rdi), %xmm0
+; X64-SSE-NEXT: movups 16(%rdi), %xmm1
+; X64-SSE-NEXT: movups %xmm0, (%rsi)
+; X64-SSE-NEXT: movups %xmm1, 16(%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: merge_2_v4f32_align1_ntload:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
+  %2 = bitcast float* %1 to <4 x float>*
+  %3 = load <4 x float>, <4 x float>* %a0, align 1, !nontemporal !0
+  %4 = load <4 x float>, <4 x float>* %2, align 1, !nontemporal !0
+  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
+  %6 = bitcast float* %5 to <4 x float>*
+  store <4 x float> %3, <4 x float>* %a1, align 1
+  store <4 x float> %4, <4 x float>* %6, align 1
+  ret void
+}
+
 ; FIXME: Nothing can perform NT-store-vector on 1-byte aligned memory.
 ; Must be scalarized to use MOVTNI/MOVNTSD.
+define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) {
+; X86-LABEL: merge_2_v4f32_align1_ntstore:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movups (%ecx), %xmm0
+; X86-NEXT: movups 16(%ecx), %xmm1
+; X86-NEXT: movups %xmm0, (%eax)
+; X86-NEXT: movups %xmm1, 16(%eax)
+; X86-NEXT: retl
+;
+; X64-SSE-LABEL: merge_2_v4f32_align1_ntstore:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movups (%rdi), %xmm0
+; X64-SSE-NEXT: movups 16(%rdi), %xmm1
+; X64-SSE-NEXT: movups %xmm0, (%rsi)
+; X64-SSE-NEXT: movups %xmm1, 16(%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
+  %2 = bitcast float* %1 to <4 x float>*
+  %3 = load <4 x float>, <4 x float>* %a0, align 1
+  %4 = load <4 x float>, <4 x float>* %2, align 1
+  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
+  %6 = bitcast float* %5 to <4 x float>*
+  store <4 x float> %3, <4 x float>* %a1, align 1, !nontemporal !0
+  store <4 x float> %4, <4 x float>* %6, align 1, !nontemporal !0
+  ret void
+}
+
+; FIXME: Nothing can perform NT-load-vector on 1-byte aligned memory.
+; Just perform regular loads and scalarize NT-stores.
 define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) {
 ; X86-LABEL: merge_2_v4f32_align1:
 ; X86: # %bb.0:
@@ -151,8 +355,8 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) {
 ; X64-AVX-NEXT: retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
   %2 = bitcast float* %1 to <4 x float>*
-  %3 = load <4 x float>, <4 x float>* %a0, align 1
-  %4 = load <4 x float>, <4 x float>* %2, align 1
+  %3 = load <4 x float>, <4 x float>* %a0, align 1, !nontemporal !0
+  %4 = load <4 x float>, <4 x float>* %2, align 1, !nontemporal !0
   %5 = getelementptr inbounds <4 x float>, <4 x float>* %a1, i64 1, i64 0
   %6 = bitcast float* %5 to <4 x float>*
   store <4 x float> %3, <4 x float>* %a1, align 1, !nontemporal !0