From: Craig Topper Date: Sun, 22 Sep 2019 06:52:25 +0000 (+0000) Subject: [X86] Add test memset and memcpy testcases for D67874. NFC X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=38014c553f0d8aa8b5db4fe7b0a2d98c044580f3;p=platform%2Fupstream%2Fllvm.git [X86] Add test memset and memcpy testcases for D67874. NFC llvm-svn: 372494 --- diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll index 597b776..d2454d5 100644 --- a/llvm/test/CodeGen/X86/memcpy.ll +++ b/llvm/test/CodeGen/X86/memcpy.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=LINUX -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=DARWIN +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=DARWIN +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=LINUX +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake | FileCheck %s -check-prefix=LINUX-SKL +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx | FileCheck %s -check-prefix=LINUX-SKX +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s -check-prefix=LINUX-KNL +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512bw | FileCheck %s -check-prefix=LINUX-AVX512BW declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind declare void @llvm.memcpy.p256i8.p256i8.i64(i8 addrspace(256)* nocapture, i8 addrspace(256)* nocapture, i64, i1) nounwind @@ -8,13 +12,29 @@ declare void @llvm.memcpy.p256i8.p256i8.i64(i8 addrspace(256)* nocapture, i8 add ; Variable memcpy's should lower to calls. define i8* @test1(i8* %a, i8* %b, i64 %n) nounwind { +; DARWIN-LABEL: test1: +; DARWIN: ## %bb.0: ## %entry +; DARWIN-NEXT: jmp _memcpy ## TAILCALL +; ; LINUX-LABEL: test1: ; LINUX: # %bb.0: # %entry ; LINUX-NEXT: jmp memcpy # TAILCALL ; -; DARWIN-LABEL: test1: -; DARWIN: ## %bb.0: ## %entry -; DARWIN-NEXT: jmp _memcpy ## TAILCALL +; LINUX-SKL-LABEL: test1: +; LINUX-SKL: # %bb.0: # %entry +; LINUX-SKL-NEXT: jmp memcpy # TAILCALL +; +; LINUX-SKX-LABEL: test1: +; LINUX-SKX: # %bb.0: # %entry +; LINUX-SKX-NEXT: jmp memcpy # TAILCALL +; +; LINUX-KNL-LABEL: test1: +; LINUX-KNL: # %bb.0: # %entry +; LINUX-KNL-NEXT: jmp memcpy # TAILCALL +; +; LINUX-AVX512BW-LABEL: test1: +; LINUX-AVX512BW: # %bb.0: # %entry +; LINUX-AVX512BW-NEXT: jmp memcpy # TAILCALL entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 %n, i1 0 ) ret i8* %a @@ -22,13 +42,29 @@ entry: ; Variable memcpy's should lower to calls. define i8* @test2(i64* %a, i64* %b, i64 %n) nounwind { +; DARWIN-LABEL: test2: +; DARWIN: ## %bb.0: ## %entry +; DARWIN-NEXT: jmp _memcpy ## TAILCALL +; ; LINUX-LABEL: test2: ; LINUX: # %bb.0: # %entry ; LINUX-NEXT: jmp memcpy # TAILCALL ; -; DARWIN-LABEL: test2: -; DARWIN: ## %bb.0: ## %entry -; DARWIN-NEXT: jmp _memcpy ## TAILCALL +; LINUX-SKL-LABEL: test2: +; LINUX-SKL: # %bb.0: # %entry +; LINUX-SKL-NEXT: jmp memcpy # TAILCALL +; +; LINUX-SKX-LABEL: test2: +; LINUX-SKX: # %bb.0: # %entry +; LINUX-SKX-NEXT: jmp memcpy # TAILCALL +; +; LINUX-KNL-LABEL: test2: +; LINUX-KNL: # %bb.0: # %entry +; LINUX-KNL-NEXT: jmp memcpy # TAILCALL +; +; LINUX-AVX512BW-LABEL: test2: +; LINUX-AVX512BW: # %bb.0: # %entry +; LINUX-AVX512BW-NEXT: jmp memcpy # TAILCALL entry: %tmp14 = bitcast i64* %a to i8* %tmp25 = bitcast i64* %b to i8* @@ -43,11 +79,6 @@ entry: ; hurting performance so it should just ignore optsize when expanding memcpy. ; rdar://8821501 define void @test3(i8* nocapture %A, i8* nocapture %B) nounwind optsize noredzone { -; LINUX-LABEL: test3: -; LINUX: # %bb.0: # %entry -; LINUX-NEXT: movl $64, %edx -; LINUX-NEXT: jmp memcpy # TAILCALL -; ; DARWIN-LABEL: test3: ; DARWIN: ## %bb.0: ## %entry ; DARWIN-NEXT: movq 56(%rsi), %rax @@ -67,45 +98,169 @@ define void @test3(i8* nocapture %A, i8* nocapture %B) nounwind optsize noredzon ; DARWIN-NEXT: movq %rcx, 8(%rdi) ; DARWIN-NEXT: movq %rax, (%rdi) ; DARWIN-NEXT: retq +; +; LINUX-LABEL: test3: +; LINUX: # %bb.0: # %entry +; LINUX-NEXT: movl $64, %edx +; LINUX-NEXT: jmp memcpy # TAILCALL +; +; LINUX-SKL-LABEL: test3: +; LINUX-SKL: # %bb.0: # %entry +; LINUX-SKL-NEXT: vmovups (%rsi), %ymm0 +; LINUX-SKL-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-SKL-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-SKL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKL-NEXT: vzeroupper +; LINUX-SKL-NEXT: retq +; +; LINUX-SKX-LABEL: test3: +; LINUX-SKX: # %bb.0: # %entry +; LINUX-SKX-NEXT: vmovups (%rsi), %ymm0 +; LINUX-SKX-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-SKX-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-SKX-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKX-NEXT: vzeroupper +; LINUX-SKX-NEXT: retq +; +; LINUX-KNL-LABEL: test3: +; LINUX-KNL: # %bb.0: # %entry +; LINUX-KNL-NEXT: vmovups (%rsi), %ymm0 +; LINUX-KNL-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-KNL-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-KNL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-KNL-NEXT: retq +; +; LINUX-AVX512BW-LABEL: test3: +; LINUX-AVX512BW: # %bb.0: # %entry +; LINUX-AVX512BW-NEXT: vmovups (%rsi), %zmm0 +; LINUX-AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vzeroupper +; LINUX-AVX512BW-NEXT: retq entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false) ret void } define void @test3_minsize(i8* nocapture %A, i8* nocapture %B) nounwind minsize noredzone { +; DARWIN-LABEL: test3_minsize: +; DARWIN: ## %bb.0: +; DARWIN-NEXT: pushq $64 +; DARWIN-NEXT: popq %rdx +; DARWIN-NEXT: jmp _memcpy ## TAILCALL +; ; LINUX-LABEL: test3_minsize: ; LINUX: # %bb.0: ; LINUX-NEXT: pushq $64 ; LINUX-NEXT: popq %rdx ; LINUX-NEXT: jmp memcpy # TAILCALL ; -; DARWIN-LABEL: test3_minsize: -; DARWIN: ## %bb.0: -; DARWIN-NEXT: pushq $64 -; DARWIN-NEXT: popq %rdx -; DARWIN-NEXT: jmp _memcpy ## TAILCALL +; LINUX-SKL-LABEL: test3_minsize: +; LINUX-SKL: # %bb.0: +; LINUX-SKL-NEXT: vmovups (%rsi), %ymm0 +; LINUX-SKL-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-SKL-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-SKL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKL-NEXT: vzeroupper +; LINUX-SKL-NEXT: retq +; +; LINUX-SKX-LABEL: test3_minsize: +; LINUX-SKX: # %bb.0: +; LINUX-SKX-NEXT: vmovups (%rsi), %ymm0 +; LINUX-SKX-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-SKX-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-SKX-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKX-NEXT: vzeroupper +; LINUX-SKX-NEXT: retq +; +; LINUX-KNL-LABEL: test3_minsize: +; LINUX-KNL: # %bb.0: +; LINUX-KNL-NEXT: vmovups (%rsi), %ymm0 +; LINUX-KNL-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-KNL-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-KNL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-KNL-NEXT: retq +; +; LINUX-AVX512BW-LABEL: test3_minsize: +; LINUX-AVX512BW: # %bb.0: +; LINUX-AVX512BW-NEXT: vmovups (%rsi), %zmm0 +; LINUX-AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vzeroupper +; LINUX-AVX512BW-NEXT: retq tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false) ret void } define void @test3_minsize_optsize(i8* nocapture %A, i8* nocapture %B) nounwind optsize minsize noredzone { +; DARWIN-LABEL: test3_minsize_optsize: +; DARWIN: ## %bb.0: +; DARWIN-NEXT: pushq $64 +; DARWIN-NEXT: popq %rdx +; DARWIN-NEXT: jmp _memcpy ## TAILCALL +; ; LINUX-LABEL: test3_minsize_optsize: ; LINUX: # %bb.0: ; LINUX-NEXT: pushq $64 ; LINUX-NEXT: popq %rdx ; LINUX-NEXT: jmp memcpy # TAILCALL ; -; DARWIN-LABEL: test3_minsize_optsize: -; DARWIN: ## %bb.0: -; DARWIN-NEXT: pushq $64 -; DARWIN-NEXT: popq %rdx -; DARWIN-NEXT: jmp _memcpy ## TAILCALL +; LINUX-SKL-LABEL: test3_minsize_optsize: +; LINUX-SKL: # %bb.0: +; LINUX-SKL-NEXT: vmovups (%rsi), %ymm0 +; LINUX-SKL-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-SKL-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-SKL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKL-NEXT: vzeroupper +; LINUX-SKL-NEXT: retq +; +; LINUX-SKX-LABEL: test3_minsize_optsize: +; LINUX-SKX: # %bb.0: +; LINUX-SKX-NEXT: vmovups (%rsi), %ymm0 +; LINUX-SKX-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-SKX-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-SKX-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKX-NEXT: vzeroupper +; LINUX-SKX-NEXT: retq +; +; LINUX-KNL-LABEL: test3_minsize_optsize: +; LINUX-KNL: # %bb.0: +; LINUX-KNL-NEXT: vmovups (%rsi), %ymm0 +; LINUX-KNL-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-KNL-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-KNL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-KNL-NEXT: retq +; +; LINUX-AVX512BW-LABEL: test3_minsize_optsize: +; LINUX-AVX512BW: # %bb.0: +; LINUX-AVX512BW-NEXT: vmovups (%rsi), %zmm0 +; LINUX-AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vzeroupper +; LINUX-AVX512BW-NEXT: retq tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false) ret void } ; Large constant memcpy's should be inlined when not optimizing for size. define void @test4(i8* nocapture %A, i8* nocapture %B) nounwind noredzone { +; DARWIN-LABEL: test4: +; DARWIN: ## %bb.0: ## %entry +; DARWIN-NEXT: movq 56(%rsi), %rax +; DARWIN-NEXT: movq %rax, 56(%rdi) +; DARWIN-NEXT: movq 48(%rsi), %rax +; DARWIN-NEXT: movq %rax, 48(%rdi) +; DARWIN-NEXT: movq 40(%rsi), %rax +; DARWIN-NEXT: movq %rax, 40(%rdi) +; DARWIN-NEXT: movq 32(%rsi), %rax +; DARWIN-NEXT: movq %rax, 32(%rdi) +; DARWIN-NEXT: movq 24(%rsi), %rax +; DARWIN-NEXT: movq %rax, 24(%rdi) +; DARWIN-NEXT: movq 16(%rsi), %rax +; DARWIN-NEXT: movq %rax, 16(%rdi) +; DARWIN-NEXT: movq (%rsi), %rax +; DARWIN-NEXT: movq 8(%rsi), %rcx +; DARWIN-NEXT: movq %rcx, 8(%rdi) +; DARWIN-NEXT: movq %rax, (%rdi) +; DARWIN-NEXT: retq +; ; LINUX-LABEL: test4: ; LINUX: # %bb.0: # %entry ; LINUX-NEXT: movq 56(%rsi), %rax @@ -126,25 +281,38 @@ define void @test4(i8* nocapture %A, i8* nocapture %B) nounwind noredzone { ; LINUX-NEXT: movq %rax, (%rdi) ; LINUX-NEXT: retq ; -; DARWIN-LABEL: test4: -; DARWIN: ## %bb.0: ## %entry -; DARWIN-NEXT: movq 56(%rsi), %rax -; DARWIN-NEXT: movq %rax, 56(%rdi) -; DARWIN-NEXT: movq 48(%rsi), %rax -; DARWIN-NEXT: movq %rax, 48(%rdi) -; DARWIN-NEXT: movq 40(%rsi), %rax -; DARWIN-NEXT: movq %rax, 40(%rdi) -; DARWIN-NEXT: movq 32(%rsi), %rax -; DARWIN-NEXT: movq %rax, 32(%rdi) -; DARWIN-NEXT: movq 24(%rsi), %rax -; DARWIN-NEXT: movq %rax, 24(%rdi) -; DARWIN-NEXT: movq 16(%rsi), %rax -; DARWIN-NEXT: movq %rax, 16(%rdi) -; DARWIN-NEXT: movq (%rsi), %rax -; DARWIN-NEXT: movq 8(%rsi), %rcx -; DARWIN-NEXT: movq %rcx, 8(%rdi) -; DARWIN-NEXT: movq %rax, (%rdi) -; DARWIN-NEXT: retq +; LINUX-SKL-LABEL: test4: +; LINUX-SKL: # %bb.0: # %entry +; LINUX-SKL-NEXT: vmovups (%rsi), %ymm0 +; LINUX-SKL-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-SKL-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-SKL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKL-NEXT: vzeroupper +; LINUX-SKL-NEXT: retq +; +; LINUX-SKX-LABEL: test4: +; LINUX-SKX: # %bb.0: # %entry +; LINUX-SKX-NEXT: vmovups (%rsi), %ymm0 +; LINUX-SKX-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-SKX-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-SKX-NEXT: vmovups %ymm0, (%rdi) +; LINUX-SKX-NEXT: vzeroupper +; LINUX-SKX-NEXT: retq +; +; LINUX-KNL-LABEL: test4: +; LINUX-KNL: # %bb.0: # %entry +; LINUX-KNL-NEXT: vmovups (%rsi), %ymm0 +; LINUX-KNL-NEXT: vmovups 32(%rsi), %ymm1 +; LINUX-KNL-NEXT: vmovups %ymm1, 32(%rdi) +; LINUX-KNL-NEXT: vmovups %ymm0, (%rdi) +; LINUX-KNL-NEXT: retq +; +; LINUX-AVX512BW-LABEL: test4: +; LINUX-AVX512BW: # %bb.0: # %entry +; LINUX-AVX512BW-NEXT: vmovups (%rsi), %zmm0 +; LINUX-AVX512BW-NEXT: vmovups %zmm0, (%rdi) +; LINUX-AVX512BW-NEXT: vzeroupper +; LINUX-AVX512BW-NEXT: retq entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false) ret void @@ -154,6 +322,14 @@ entry: @.str = private unnamed_addr constant [30 x i8] c"\00aaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1 define void @test5(i8* nocapture %C) nounwind uwtable ssp { +; DARWIN-LABEL: test5: +; DARWIN: ## %bb.0: ## %entry +; DARWIN-NEXT: movabsq $7016996765293437281, %rax ## imm = 0x6161616161616161 +; DARWIN-NEXT: movq %rax, 8(%rdi) +; DARWIN-NEXT: movabsq $7016996765293437184, %rax ## imm = 0x6161616161616100 +; DARWIN-NEXT: movq %rax, (%rdi) +; DARWIN-NEXT: retq +; ; LINUX-LABEL: test5: ; LINUX: # %bb.0: # %entry ; LINUX-NEXT: movabsq $7016996765293437281, %rax # imm = 0x6161616161616161 @@ -162,13 +338,29 @@ define void @test5(i8* nocapture %C) nounwind uwtable ssp { ; LINUX-NEXT: movq %rax, (%rdi) ; LINUX-NEXT: retq ; -; DARWIN-LABEL: test5: -; DARWIN: ## %bb.0: ## %entry -; DARWIN-NEXT: movabsq $7016996765293437281, %rax ## imm = 0x6161616161616161 -; DARWIN-NEXT: movq %rax, 8(%rdi) -; DARWIN-NEXT: movabsq $7016996765293437184, %rax ## imm = 0x6161616161616100 -; DARWIN-NEXT: movq %rax, (%rdi) -; DARWIN-NEXT: retq +; LINUX-SKL-LABEL: test5: +; LINUX-SKL: # %bb.0: # %entry +; LINUX-SKL-NEXT: vmovups {{.*}}(%rip), %xmm0 +; LINUX-SKL-NEXT: vmovups %xmm0, (%rdi) +; LINUX-SKL-NEXT: retq +; +; LINUX-SKX-LABEL: test5: +; LINUX-SKX: # %bb.0: # %entry +; LINUX-SKX-NEXT: vmovups {{.*}}(%rip), %xmm0 +; LINUX-SKX-NEXT: vmovups %xmm0, (%rdi) +; LINUX-SKX-NEXT: retq +; +; LINUX-KNL-LABEL: test5: +; LINUX-KNL: # %bb.0: # %entry +; LINUX-KNL-NEXT: vmovups {{.*}}(%rip), %xmm0 +; LINUX-KNL-NEXT: vmovups %xmm0, (%rdi) +; LINUX-KNL-NEXT: retq +; +; LINUX-AVX512BW-LABEL: test5: +; LINUX-AVX512BW: # %bb.0: # %entry +; LINUX-AVX512BW-NEXT: vmovups {{.*}}(%rip), %xmm0 +; LINUX-AVX512BW-NEXT: vmovups %xmm0, (%rdi) +; LINUX-AVX512BW-NEXT: retq entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str, i64 0, i64 0), i64 16, i1 false) ret void @@ -179,17 +371,41 @@ entry: @.str2 = private unnamed_addr constant [2 x i8] c"x\00", align 1 define void @test6() nounwind uwtable { +; DARWIN-LABEL: test6: +; DARWIN: ## %bb.0: ## %entry +; DARWIN-NEXT: movw $0, 8 +; DARWIN-NEXT: movq $120, 0 +; DARWIN-NEXT: retq +; ; LINUX-LABEL: test6: ; LINUX: # %bb.0: # %entry ; LINUX-NEXT: movw $0, 8 ; LINUX-NEXT: movq $120, 0 ; LINUX-NEXT: retq ; -; DARWIN-LABEL: test6: -; DARWIN: ## %bb.0: ## %entry -; DARWIN-NEXT: movw $0, 8 -; DARWIN-NEXT: movq $120, 0 -; DARWIN-NEXT: retq +; LINUX-SKL-LABEL: test6: +; LINUX-SKL: # %bb.0: # %entry +; LINUX-SKL-NEXT: movw $0, 8 +; LINUX-SKL-NEXT: movq $120, 0 +; LINUX-SKL-NEXT: retq +; +; LINUX-SKX-LABEL: test6: +; LINUX-SKX: # %bb.0: # %entry +; LINUX-SKX-NEXT: movw $0, 8 +; LINUX-SKX-NEXT: movq $120, 0 +; LINUX-SKX-NEXT: retq +; +; LINUX-KNL-LABEL: test6: +; LINUX-KNL: # %bb.0: # %entry +; LINUX-KNL-NEXT: movw $0, 8 +; LINUX-KNL-NEXT: movq $120, 0 +; LINUX-KNL-NEXT: retq +; +; LINUX-AVX512BW-LABEL: test6: +; LINUX-AVX512BW: # %bb.0: # %entry +; LINUX-AVX512BW-NEXT: movw $0, 8 +; LINUX-AVX512BW-NEXT: movq $120, 0 +; LINUX-AVX512BW-NEXT: retq entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* null, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str2, i64 0, i64 0), i64 10, i1 false) ret void @@ -198,6 +414,16 @@ entry: define void @PR15348(i8* %a, i8* %b) { ; Ensure that alignment of '0' in an @llvm.memcpy intrinsic results in ; unaligned loads and stores. +; DARWIN-LABEL: PR15348: +; DARWIN: ## %bb.0: +; DARWIN-NEXT: movb 16(%rsi), %al +; DARWIN-NEXT: movb %al, 16(%rdi) +; DARWIN-NEXT: movq (%rsi), %rax +; DARWIN-NEXT: movq 8(%rsi), %rcx +; DARWIN-NEXT: movq %rcx, 8(%rdi) +; DARWIN-NEXT: movq %rax, (%rdi) +; DARWIN-NEXT: retq +; ; LINUX-LABEL: PR15348: ; LINUX: # %bb.0: ; LINUX-NEXT: movb 16(%rsi), %al @@ -208,15 +434,37 @@ define void @PR15348(i8* %a, i8* %b) { ; LINUX-NEXT: movq %rax, (%rdi) ; LINUX-NEXT: retq ; -; DARWIN-LABEL: PR15348: -; DARWIN: ## %bb.0: -; DARWIN-NEXT: movb 16(%rsi), %al -; DARWIN-NEXT: movb %al, 16(%rdi) -; DARWIN-NEXT: movq (%rsi), %rax -; DARWIN-NEXT: movq 8(%rsi), %rcx -; DARWIN-NEXT: movq %rcx, 8(%rdi) -; DARWIN-NEXT: movq %rax, (%rdi) -; DARWIN-NEXT: retq +; LINUX-SKL-LABEL: PR15348: +; LINUX-SKL: # %bb.0: +; LINUX-SKL-NEXT: movb 16(%rsi), %al +; LINUX-SKL-NEXT: movb %al, 16(%rdi) +; LINUX-SKL-NEXT: vmovups (%rsi), %xmm0 +; LINUX-SKL-NEXT: vmovups %xmm0, (%rdi) +; LINUX-SKL-NEXT: retq +; +; LINUX-SKX-LABEL: PR15348: +; LINUX-SKX: # %bb.0: +; LINUX-SKX-NEXT: movb 16(%rsi), %al +; LINUX-SKX-NEXT: movb %al, 16(%rdi) +; LINUX-SKX-NEXT: vmovups (%rsi), %xmm0 +; LINUX-SKX-NEXT: vmovups %xmm0, (%rdi) +; LINUX-SKX-NEXT: retq +; +; LINUX-KNL-LABEL: PR15348: +; LINUX-KNL: # %bb.0: +; LINUX-KNL-NEXT: movb 16(%rsi), %al +; LINUX-KNL-NEXT: movb %al, 16(%rdi) +; LINUX-KNL-NEXT: vmovups (%rsi), %xmm0 +; LINUX-KNL-NEXT: vmovups %xmm0, (%rdi) +; LINUX-KNL-NEXT: retq +; +; LINUX-AVX512BW-LABEL: PR15348: +; LINUX-AVX512BW: # %bb.0: +; LINUX-AVX512BW-NEXT: movb 16(%rsi), %al +; LINUX-AVX512BW-NEXT: movb %al, 16(%rdi) +; LINUX-AVX512BW-NEXT: vmovups (%rsi), %xmm0 +; LINUX-AVX512BW-NEXT: vmovups %xmm0, (%rdi) +; LINUX-AVX512BW-NEXT: retq call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 17, i1 false) ret void } @@ -224,6 +472,14 @@ define void @PR15348(i8* %a, i8* %b) { ; Memcpys from / to address space 256 should be lowered to appropriate loads / ; stores if small enough. define void @addrspace256(i8 addrspace(256)* %a, i8 addrspace(256)* %b) nounwind { +; DARWIN-LABEL: addrspace256: +; DARWIN: ## %bb.0: +; DARWIN-NEXT: movq %gs:(%rsi), %rax +; DARWIN-NEXT: movq %gs:8(%rsi), %rcx +; DARWIN-NEXT: movq %rcx, %gs:8(%rdi) +; DARWIN-NEXT: movq %rax, %gs:(%rdi) +; DARWIN-NEXT: retq +; ; LINUX-LABEL: addrspace256: ; LINUX: # %bb.0: ; LINUX-NEXT: movq %gs:(%rsi), %rax @@ -232,13 +488,29 @@ define void @addrspace256(i8 addrspace(256)* %a, i8 addrspace(256)* %b) nounwind ; LINUX-NEXT: movq %rax, %gs:(%rdi) ; LINUX-NEXT: retq ; -; DARWIN-LABEL: addrspace256: -; DARWIN: ## %bb.0: -; DARWIN-NEXT: movq %gs:(%rsi), %rax -; DARWIN-NEXT: movq %gs:8(%rsi), %rcx -; DARWIN-NEXT: movq %rcx, %gs:8(%rdi) -; DARWIN-NEXT: movq %rax, %gs:(%rdi) -; DARWIN-NEXT: retq +; LINUX-SKL-LABEL: addrspace256: +; LINUX-SKL: # %bb.0: +; LINUX-SKL-NEXT: vmovups %gs:(%rsi), %xmm0 +; LINUX-SKL-NEXT: vmovups %xmm0, %gs:(%rdi) +; LINUX-SKL-NEXT: retq +; +; LINUX-SKX-LABEL: addrspace256: +; LINUX-SKX: # %bb.0: +; LINUX-SKX-NEXT: vmovups %gs:(%rsi), %xmm0 +; LINUX-SKX-NEXT: vmovups %xmm0, %gs:(%rdi) +; LINUX-SKX-NEXT: retq +; +; LINUX-KNL-LABEL: addrspace256: +; LINUX-KNL: # %bb.0: +; LINUX-KNL-NEXT: vmovups %gs:(%rsi), %xmm0 +; LINUX-KNL-NEXT: vmovups %xmm0, %gs:(%rdi) +; LINUX-KNL-NEXT: retq +; +; LINUX-AVX512BW-LABEL: addrspace256: +; LINUX-AVX512BW: # %bb.0: +; LINUX-AVX512BW-NEXT: vmovups %gs:(%rsi), %xmm0 +; LINUX-AVX512BW-NEXT: vmovups %xmm0, %gs:(%rdi) +; LINUX-AVX512BW-NEXT: retq tail call void @llvm.memcpy.p256i8.p256i8.i64(i8 addrspace(256)* align 8 %a, i8 addrspace(256)* align 8 %b, i64 16, i1 false) ret void } diff --git a/llvm/test/CodeGen/X86/memset-nonzero.ll b/llvm/test/CodeGen/X86/memset-nonzero.ll index 8166d00..8d1eeba 100644 --- a/llvm/test/CodeGen/X86/memset-nonzero.ll +++ b/llvm/test/CodeGen/X86/memset-nonzero.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW ; https://llvm.org/bugs/show_bug.cgi?id=27100 @@ -227,6 +229,13 @@ define void @memset_16_nonconst_bytes(i8* %x, i8 %c) { ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rdi) ; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_16_nonconst_bytes: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rdi) +; AVX512-NEXT: retq tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i1 false) ret void } @@ -270,6 +279,14 @@ define void @memset_32_nonconst_bytes(i8* %x, i8 %c) { ; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_32_nonconst_bytes: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i1 false) ret void } @@ -322,6 +339,15 @@ define void @memset_64_nonconst_bytes(i8* %x, i8 %c) { ; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_64_nonconst_bytes: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX512-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i1 false) ret void } @@ -390,6 +416,17 @@ define void @memset_128_nonconst_bytes(i8* %x, i8 %c) { ; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_128_nonconst_bytes: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi) +; AVX512-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX512-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i1 false) ret void } @@ -455,6 +492,21 @@ define void @memset_256_nonconst_bytes(i8* %x, i8 %c) { ; AVX2-NEXT: vmovdqu %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq +; +; AVX512-LABEL: memset_256_nonconst_bytes: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, 224(%rdi) +; AVX512-NEXT: vmovdqu %ymm0, 192(%rdi) +; AVX512-NEXT: vmovdqu %ymm0, 160(%rdi) +; AVX512-NEXT: vmovdqu %ymm0, 128(%rdi) +; AVX512-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi) +; AVX512-NEXT: vmovdqu %ymm0, 32(%rdi) +; AVX512-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i1 false) ret void } diff --git a/llvm/test/CodeGen/X86/memset-zero.ll b/llvm/test/CodeGen/X86/memset-zero.ll index fb28742..14d7472 100644 --- a/llvm/test/CodeGen/X86/memset-zero.ll +++ b/llvm/test/CodeGen/X86/memset-zero.ll @@ -4,6 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=nehalem | FileCheck %s --check-prefix=NEHALEM ; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=sandybridge | FileCheck %s --check-prefix=SANDYBRIDGE ; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=skylake | FileCheck %s --check-prefix=SKYLAKE +; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=knl | FileCheck %s --check-prefix=KNL declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind @@ -28,6 +29,10 @@ define void @memset_0(i8* %a) nounwind { ; SKYLAKE-LABEL: memset_0: ; SKYLAKE: # %bb.0: # %entry ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_0: +; KNL: # %bb.0: # %entry +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 0, i1 false) ret void @@ -59,6 +64,11 @@ define void @memset_4(i8* %a) nounwind { ; SKYLAKE: # %bb.0: # %entry ; SKYLAKE-NEXT: movl $0, (%rdi) ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_4: +; KNL: # %bb.0: # %entry +; KNL-NEXT: movl $0, (%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 4, i1 false) ret void @@ -95,6 +105,12 @@ define void @memset_5(i8* %a) nounwind { ; SKYLAKE-NEXT: movb $0, 4(%rdi) ; SKYLAKE-NEXT: movl $0, (%rdi) ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_5: +; KNL: # %bb.0: # %entry +; KNL-NEXT: movb $0, 4(%rdi) +; KNL-NEXT: movl $0, (%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 5, i1 false) ret void @@ -131,6 +147,12 @@ define void @memset_7(i8* %a) nounwind { ; SKYLAKE-NEXT: movl $0, 3(%rdi) ; SKYLAKE-NEXT: movl $0, (%rdi) ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_7: +; KNL: # %bb.0: # %entry +; KNL-NEXT: movl $0, 3(%rdi) +; KNL-NEXT: movl $0, (%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 7, i1 false) ret void @@ -163,6 +185,11 @@ define void @memset_8(i8* %a) nounwind { ; SKYLAKE: # %bb.0: # %entry ; SKYLAKE-NEXT: movq $0, (%rdi) ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_8: +; KNL: # %bb.0: # %entry +; KNL-NEXT: movq $0, (%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 8, i1 false) ret void @@ -200,6 +227,12 @@ define void @memset_11(i8* %a) nounwind { ; SKYLAKE-NEXT: movl $0, 7(%rdi) ; SKYLAKE-NEXT: movq $0, (%rdi) ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_11: +; KNL: # %bb.0: # %entry +; KNL-NEXT: movl $0, 7(%rdi) +; KNL-NEXT: movq $0, (%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 11, i1 false) ret void @@ -238,6 +271,12 @@ define void @memset_13(i8* %a) nounwind { ; SKYLAKE-NEXT: movq $0, 5(%rdi) ; SKYLAKE-NEXT: movq $0, (%rdi) ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_13: +; KNL: # %bb.0: # %entry +; KNL-NEXT: movq $0, 5(%rdi) +; KNL-NEXT: movq $0, (%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 13, i1 false) ret void @@ -276,6 +315,12 @@ define void @memset_15(i8* %a) nounwind { ; SKYLAKE-NEXT: movq $0, 7(%rdi) ; SKYLAKE-NEXT: movq $0, (%rdi) ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_15: +; KNL: # %bb.0: # %entry +; KNL-NEXT: movq $0, 7(%rdi) +; KNL-NEXT: movq $0, (%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 15, i1 false) ret void @@ -314,6 +359,12 @@ define void @memset_16(i8* %a) nounwind { ; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; SKYLAKE-NEXT: vmovups %xmm0, (%rdi) ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_16: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovups %xmm0, (%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 16, i1 false) ret void @@ -357,6 +408,13 @@ define void @memset_17(i8* %a) nounwind { ; SKYLAKE-NEXT: vmovups %xmm0, (%rdi) ; SKYLAKE-NEXT: movb $0, 16(%rdi) ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_17: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovups %xmm0, (%rdi) +; KNL-NEXT: movb $0, 16(%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 17, i1 false) ret void @@ -400,6 +458,13 @@ define void @memset_19(i8* %a) nounwind { ; SKYLAKE-NEXT: vmovups %xmm0, (%rdi) ; SKYLAKE-NEXT: movl $0, 15(%rdi) ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_19: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovups %xmm0, (%rdi) +; KNL-NEXT: movl $0, 15(%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 19, i1 false) ret void @@ -447,6 +512,13 @@ define void @memset_31(i8* %a) nounwind { ; SKYLAKE-NEXT: vmovups %xmm0, 15(%rdi) ; SKYLAKE-NEXT: vmovups %xmm0, (%rdi) ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_31: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovups %xmm0, 15(%rdi) +; KNL-NEXT: vmovups %xmm0, (%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 31, i1 false) ret void @@ -494,6 +566,12 @@ define void @memset_32(i8* %a) nounwind { ; SKYLAKE-NEXT: vmovups %ymm0, (%rdi) ; SKYLAKE-NEXT: vzeroupper ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_32: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovups %ymm0, (%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 32, i1 false) ret void @@ -540,6 +618,12 @@ define void @memset_32_align32(i8* %a) nounwind { ; SKYLAKE-NEXT: vmovaps %ymm0, (%rdi) ; SKYLAKE-NEXT: vzeroupper ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_32_align32: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovaps %ymm0, (%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* align 32 %a, i8 0, i64 32, i1 false) ret void @@ -593,7 +677,152 @@ define void @memset_35(i8* %a) nounwind { ; SKYLAKE-NEXT: movl $0, 31(%rdi) ; SKYLAKE-NEXT: vzeroupper ; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_35: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovups %ymm0, (%rdi) +; KNL-NEXT: movl $0, 31(%rdi) +; KNL-NEXT: retq entry: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 35, i1 false) ret void } + +define void @memset_64(i8* %a) nounwind { +; X86-LABEL: memset_64: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $0, 60(%eax) +; X86-NEXT: movl $0, 56(%eax) +; X86-NEXT: movl $0, 52(%eax) +; X86-NEXT: movl $0, 48(%eax) +; X86-NEXT: movl $0, 44(%eax) +; X86-NEXT: movl $0, 40(%eax) +; X86-NEXT: movl $0, 36(%eax) +; X86-NEXT: movl $0, 32(%eax) +; X86-NEXT: movl $0, 28(%eax) +; X86-NEXT: movl $0, 24(%eax) +; X86-NEXT: movl $0, 20(%eax) +; X86-NEXT: movl $0, 16(%eax) +; X86-NEXT: movl $0, 12(%eax) +; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: movl $0, 4(%eax) +; X86-NEXT: movl $0, (%eax) +; X86-NEXT: retl +; +; CORE2-LABEL: memset_64: +; CORE2: # %bb.0: # %entry +; CORE2-NEXT: movq $0, 56(%rdi) +; CORE2-NEXT: movq $0, 48(%rdi) +; CORE2-NEXT: movq $0, 40(%rdi) +; CORE2-NEXT: movq $0, 32(%rdi) +; CORE2-NEXT: movq $0, 24(%rdi) +; CORE2-NEXT: movq $0, 16(%rdi) +; CORE2-NEXT: movq $0, 8(%rdi) +; CORE2-NEXT: movq $0, (%rdi) +; CORE2-NEXT: retq +; +; NEHALEM-LABEL: memset_64: +; NEHALEM: # %bb.0: # %entry +; NEHALEM-NEXT: xorps %xmm0, %xmm0 +; NEHALEM-NEXT: movups %xmm0, 48(%rdi) +; NEHALEM-NEXT: movups %xmm0, 32(%rdi) +; NEHALEM-NEXT: movups %xmm0, 16(%rdi) +; NEHALEM-NEXT: movups %xmm0, (%rdi) +; NEHALEM-NEXT: retq +; +; SANDYBRIDGE-LABEL: memset_64: +; SANDYBRIDGE: # %bb.0: # %entry +; SANDYBRIDGE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; SANDYBRIDGE-NEXT: vmovups %xmm0, 16(%rdi) +; SANDYBRIDGE-NEXT: vmovups %xmm0, (%rdi) +; SANDYBRIDGE-NEXT: vmovups %xmm0, 48(%rdi) +; SANDYBRIDGE-NEXT: vmovups %xmm0, 32(%rdi) +; SANDYBRIDGE-NEXT: retq +; +; SKYLAKE-LABEL: memset_64: +; SKYLAKE: # %bb.0: # %entry +; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; SKYLAKE-NEXT: vmovups %ymm0, 32(%rdi) +; SKYLAKE-NEXT: vmovups %ymm0, (%rdi) +; SKYLAKE-NEXT: vzeroupper +; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_64: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovups %ymm0, 32(%rdi) +; KNL-NEXT: vmovups %ymm0, (%rdi) +; KNL-NEXT: retq +entry: + call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 64, i1 false) + ret void +} + +define void @memset_64_align64(i8* %a) nounwind { +; X86-LABEL: memset_64_align64: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $0, 60(%eax) +; X86-NEXT: movl $0, 56(%eax) +; X86-NEXT: movl $0, 52(%eax) +; X86-NEXT: movl $0, 48(%eax) +; X86-NEXT: movl $0, 44(%eax) +; X86-NEXT: movl $0, 40(%eax) +; X86-NEXT: movl $0, 36(%eax) +; X86-NEXT: movl $0, 32(%eax) +; X86-NEXT: movl $0, 28(%eax) +; X86-NEXT: movl $0, 24(%eax) +; X86-NEXT: movl $0, 20(%eax) +; X86-NEXT: movl $0, 16(%eax) +; X86-NEXT: movl $0, 12(%eax) +; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: movl $0, 4(%eax) +; X86-NEXT: movl $0, (%eax) +; X86-NEXT: retl +; +; CORE2-LABEL: memset_64_align64: +; CORE2: # %bb.0: # %entry +; CORE2-NEXT: xorps %xmm0, %xmm0 +; CORE2-NEXT: movaps %xmm0, 48(%rdi) +; CORE2-NEXT: movaps %xmm0, 32(%rdi) +; CORE2-NEXT: movaps %xmm0, 16(%rdi) +; CORE2-NEXT: movaps %xmm0, (%rdi) +; CORE2-NEXT: retq +; +; NEHALEM-LABEL: memset_64_align64: +; NEHALEM: # %bb.0: # %entry +; NEHALEM-NEXT: xorps %xmm0, %xmm0 +; NEHALEM-NEXT: movaps %xmm0, 48(%rdi) +; NEHALEM-NEXT: movaps %xmm0, 32(%rdi) +; NEHALEM-NEXT: movaps %xmm0, 16(%rdi) +; NEHALEM-NEXT: movaps %xmm0, (%rdi) +; NEHALEM-NEXT: retq +; +; SANDYBRIDGE-LABEL: memset_64_align64: +; SANDYBRIDGE: # %bb.0: # %entry +; SANDYBRIDGE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; SANDYBRIDGE-NEXT: vmovaps %ymm0, 32(%rdi) +; SANDYBRIDGE-NEXT: vmovaps %ymm0, (%rdi) +; SANDYBRIDGE-NEXT: vzeroupper +; SANDYBRIDGE-NEXT: retq +; +; SKYLAKE-LABEL: memset_64_align64: +; SKYLAKE: # %bb.0: # %entry +; SKYLAKE-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; SKYLAKE-NEXT: vmovaps %ymm0, 32(%rdi) +; SKYLAKE-NEXT: vmovaps %ymm0, (%rdi) +; SKYLAKE-NEXT: vzeroupper +; SKYLAKE-NEXT: retq +; +; KNL-LABEL: memset_64_align64: +; KNL: # %bb.0: # %entry +; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmovaps %ymm0, 32(%rdi) +; KNL-NEXT: vmovaps %ymm0, (%rdi) +; KNL-NEXT: retq +entry: + call void @llvm.memset.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 false) + ret void +}