; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids | FileCheck %s --check-prefixes=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,ADL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,SPR
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512
define <2 x i64> @foo_reg_128(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, <2 x i64> %4, <2 x i64> %5) {
; AVX-LABEL: foo_reg_128:
; AVX: # %bb.0:
; AVX-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd %xmm3, %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd %xmm4, %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd %xmm5, %xmm1, %xmm0
+; AVX-NEXT: vpmaddwd %xmm3, %xmm1, %xmm2
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpmaddwd %xmm4, %xmm1, %xmm2
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: foo_reg_128:
; AVX512: # %bb.0:
; AVX512-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd %xmm3, %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd %xmm4, %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd %xmm5, %xmm1, %xmm0
+; AVX512-NEXT: vpmaddwd %xmm3, %xmm1, %xmm2
+; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmaddwd %xmm4, %xmm1, %xmm2
+; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%7 = bitcast <2 x i64> %0 to <4 x i32>
%8 = bitcast <2 x i64> %1 to <4 x i32>
declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) #1
define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) {
-; AVX-LABEL: foo_128:
-; AVX: # %bb.0:
-; AVX-NEXT: testl %edi, %edi
-; AVX-NEXT: jle .LBB1_6
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: movl %edi, %edx
-; AVX-NEXT: movl %edx, %eax
-; AVX-NEXT: andl $3, %eax
-; AVX-NEXT: cmpl $4, %edi
-; AVX-NEXT: jae .LBB1_7
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: jmp .LBB1_3
-; AVX-NEXT: .LBB1_7:
-; AVX-NEXT: andl $-4, %edx
-; AVX-NEXT: leaq 48(%rsi), %rdi
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd -32(%rdi), %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd -16(%rdi), %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd (%rdi), %xmm1, %xmm0
-; AVX-NEXT: addq $4, %rcx
-; AVX-NEXT: addq $64, %rdi
-; AVX-NEXT: cmpq %rcx, %rdx
-; AVX-NEXT: jne .LBB1_8
-; AVX-NEXT: .LBB1_3:
-; AVX-NEXT: testq %rax, %rax
-; AVX-NEXT: je .LBB1_6
-; AVX-NEXT: # %bb.4: # %.preheader
-; AVX-NEXT: shlq $4, %rcx
-; AVX-NEXT: addq %rcx, %rsi
-; AVX-NEXT: shlq $4, %rax
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0
-; AVX-NEXT: addq $16, %rcx
-; AVX-NEXT: cmpq %rcx, %rax
-; AVX-NEXT: jne .LBB1_5
-; AVX-NEXT: .LBB1_6:
-; AVX-NEXT: retq
+; ADL-LABEL: foo_128:
+; ADL: # %bb.0:
+; ADL-NEXT: testl %edi, %edi
+; ADL-NEXT: jle .LBB1_6
+; ADL-NEXT: # %bb.1:
+; ADL-NEXT: movl %edi, %edx
+; ADL-NEXT: movl %edx, %eax
+; ADL-NEXT: andl $3, %eax
+; ADL-NEXT: cmpl $4, %edi
+; ADL-NEXT: jae .LBB1_7
+; ADL-NEXT: # %bb.2:
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: jmp .LBB1_3
+; ADL-NEXT: .LBB1_7:
+; ADL-NEXT: andl $-4, %edx
+; ADL-NEXT: leaq 48(%rsi), %rdi
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0
+; ADL-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2
+; ADL-NEXT: vpmaddwd -16(%rdi), %xmm1, %xmm3
+; ADL-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; ADL-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; ADL-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2
+; ADL-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; ADL-NEXT: addq $4, %rcx
+; ADL-NEXT: addq $64, %rdi
+; ADL-NEXT: cmpq %rcx, %rdx
+; ADL-NEXT: jne .LBB1_8
+; ADL-NEXT: .LBB1_3:
+; ADL-NEXT: testq %rax, %rax
+; ADL-NEXT: je .LBB1_6
+; ADL-NEXT: # %bb.4: # %.preheader
+; ADL-NEXT: shlq $4, %rcx
+; ADL-NEXT: addq %rcx, %rsi
+; ADL-NEXT: shlq $4, %rax
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0
+; ADL-NEXT: addq $16, %rcx
+; ADL-NEXT: cmpq %rcx, %rax
+; ADL-NEXT: jne .LBB1_5
+; ADL-NEXT: .LBB1_6:
+; ADL-NEXT: retq
+;
+; SPR-LABEL: foo_128:
+; SPR: # %bb.0:
+; SPR-NEXT: testl %edi, %edi
+; SPR-NEXT: jle .LBB1_6
+; SPR-NEXT: # %bb.1:
+; SPR-NEXT: movl %edi, %edx
+; SPR-NEXT: movl %edx, %eax
+; SPR-NEXT: andl $3, %eax
+; SPR-NEXT: cmpl $4, %edi
+; SPR-NEXT: jae .LBB1_7
+; SPR-NEXT: # %bb.2:
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: jmp .LBB1_3
+; SPR-NEXT: .LBB1_7:
+; SPR-NEXT: andl $-4, %edx
+; SPR-NEXT: leaq 48(%rsi), %rdi
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0
+; SPR-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2
+; SPR-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; SPR-NEXT: vpmaddwd -16(%rdi), %xmm1, %xmm2
+; SPR-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; SPR-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2
+; SPR-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; SPR-NEXT: addq $4, %rcx
+; SPR-NEXT: addq $64, %rdi
+; SPR-NEXT: cmpq %rcx, %rdx
+; SPR-NEXT: jne .LBB1_8
+; SPR-NEXT: .LBB1_3:
+; SPR-NEXT: testq %rax, %rax
+; SPR-NEXT: je .LBB1_6
+; SPR-NEXT: # %bb.4: # %.preheader
+; SPR-NEXT: shlq $4, %rcx
+; SPR-NEXT: addq %rcx, %rsi
+; SPR-NEXT: shlq $4, %rax
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0
+; SPR-NEXT: addq $16, %rcx
+; SPR-NEXT: cmpq %rcx, %rax
+; SPR-NEXT: jne .LBB1_5
+; SPR-NEXT: .LBB1_6:
+; SPR-NEXT: retq
;
; AVX512-LABEL: foo_128:
; AVX512: # %bb.0:
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vpdpwssd -48(%rdi), %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd -32(%rdi), %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd -16(%rdi), %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd (%rdi), %xmm1, %xmm0
+; AVX512-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2
+; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmaddwd -16(%rdi), %xmm1, %xmm2
+; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2
+; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: addq $4, %rcx
; AVX512-NEXT: addq $64, %rdi
; AVX512-NEXT: cmpq %rcx, %rdx
}
define void @bar_128(i32 %0, ptr %1, <2 x i64> %2, ptr %3) {
-; AVX-LABEL: bar_128:
-; AVX: # %bb.0:
-; AVX-NEXT: testl %edi, %edi
-; AVX-NEXT: jle .LBB2_5
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: cmpl $1, %edi
-; AVX-NEXT: jne .LBB2_6
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: jmp .LBB2_3
-; AVX-NEXT: .LBB2_6:
-; AVX-NEXT: movl %eax, %edi
-; AVX-NEXT: andl $-2, %edi
-; AVX-NEXT: movl $16, %r8d
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: vmovdqa -16(%rsi,%r8), %xmm1
-; AVX-NEXT: vmovdqa (%rsi,%r8), %xmm2
-; AVX-NEXT: {vex} vpdpwssd -16(%rdx,%r8), %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, -16(%rsi,%r8)
-; AVX-NEXT: {vex} vpdpwssd (%rdx,%r8), %xmm0, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, (%rsi,%r8)
-; AVX-NEXT: addq $2, %rcx
-; AVX-NEXT: addq $32, %r8
-; AVX-NEXT: cmpq %rcx, %rdi
-; AVX-NEXT: jne .LBB2_7
-; AVX-NEXT: .LBB2_3:
-; AVX-NEXT: testb $1, %al
-; AVX-NEXT: je .LBB2_5
-; AVX-NEXT: # %bb.4:
-; AVX-NEXT: shlq $4, %rcx
-; AVX-NEXT: vmovdqa (%rsi,%rcx), %xmm1
-; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rsi,%rcx)
-; AVX-NEXT: .LBB2_5:
-; AVX-NEXT: retq
+; ADL-LABEL: bar_128:
+; ADL: # %bb.0:
+; ADL-NEXT: testl %edi, %edi
+; ADL-NEXT: jle .LBB2_5
+; ADL-NEXT: # %bb.1:
+; ADL-NEXT: movl %edi, %eax
+; ADL-NEXT: cmpl $1, %edi
+; ADL-NEXT: jne .LBB2_6
+; ADL-NEXT: # %bb.2:
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: jmp .LBB2_3
+; ADL-NEXT: .LBB2_6:
+; ADL-NEXT: movl %eax, %edi
+; ADL-NEXT: andl $-2, %edi
+; ADL-NEXT: movl $16, %r8d
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: vmovdqa (%rsi,%r8), %xmm1
+; ADL-NEXT: vpmaddwd -16(%rdx,%r8), %xmm0, %xmm2
+; ADL-NEXT: vpaddd -16(%rsi,%r8), %xmm2, %xmm2
+; ADL-NEXT: vmovdqa %xmm2, -16(%rsi,%r8)
+; ADL-NEXT: {vex} vpdpwssd (%rdx,%r8), %xmm0, %xmm1
+; ADL-NEXT: vmovdqa %xmm1, (%rsi,%r8)
+; ADL-NEXT: addq $2, %rcx
+; ADL-NEXT: addq $32, %r8
+; ADL-NEXT: cmpq %rcx, %rdi
+; ADL-NEXT: jne .LBB2_7
+; ADL-NEXT: .LBB2_3:
+; ADL-NEXT: testb $1, %al
+; ADL-NEXT: je .LBB2_5
+; ADL-NEXT: # %bb.4:
+; ADL-NEXT: shlq $4, %rcx
+; ADL-NEXT: vmovdqa (%rsi,%rcx), %xmm1
+; ADL-NEXT: {vex} vpdpwssd (%rdx,%rcx), %xmm0, %xmm1
+; ADL-NEXT: vmovdqa %xmm1, (%rsi,%rcx)
+; ADL-NEXT: .LBB2_5:
+; ADL-NEXT: retq
+;
+; SPR-LABEL: bar_128:
+; SPR: # %bb.0:
+; SPR-NEXT: testl %edi, %edi
+; SPR-NEXT: jle .LBB2_5
+; SPR-NEXT: # %bb.1:
+; SPR-NEXT: movl %edi, %eax
+; SPR-NEXT: cmpl $1, %edi
+; SPR-NEXT: jne .LBB2_6
+; SPR-NEXT: # %bb.2:
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: jmp .LBB2_3
+; SPR-NEXT: .LBB2_6:
+; SPR-NEXT: movl %eax, %edi
+; SPR-NEXT: andl $-2, %edi
+; SPR-NEXT: movl $16, %r8d
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: vmovdqa -16(%rsi,%r8), %xmm1
+; SPR-NEXT: vmovdqa (%rsi,%r8), %xmm2
+; SPR-NEXT: {vex} vpdpwssd -16(%rdx,%r8), %xmm0, %xmm1
+; SPR-NEXT: vmovdqa %xmm1, -16(%rsi,%r8)
+; SPR-NEXT: vpmaddwd (%rdx,%r8), %xmm0, %xmm1
+; SPR-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; SPR-NEXT: vmovdqa %xmm1, (%rsi,%r8)
+; SPR-NEXT: addq $2, %rcx
+; SPR-NEXT: addq $32, %r8
+; SPR-NEXT: cmpq %rcx, %rdi
+; SPR-NEXT: jne .LBB2_7
+; SPR-NEXT: .LBB2_3:
+; SPR-NEXT: testb $1, %al
+; SPR-NEXT: je .LBB2_5
+; SPR-NEXT: # %bb.4:
+; SPR-NEXT: shlq $4, %rcx
+; SPR-NEXT: vpmaddwd (%rdx,%rcx), %xmm0, %xmm0
+; SPR-NEXT: vpaddd (%rsi,%rcx), %xmm0, %xmm0
+; SPR-NEXT: vmovdqa %xmm0, (%rsi,%rcx)
+; SPR-NEXT: .LBB2_5:
+; SPR-NEXT: retq
;
; AVX512-LABEL: bar_128:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rsi,%r8), %xmm2
; AVX512-NEXT: vpdpwssd -16(%rdx,%r8), %xmm0, %xmm1
; AVX512-NEXT: vmovdqa %xmm1, -16(%rsi,%r8)
-; AVX512-NEXT: vpdpwssd (%rdx,%r8), %xmm0, %xmm2
-; AVX512-NEXT: vmovdqa %xmm2, (%rsi,%r8)
+; AVX512-NEXT: vpmaddwd (%rdx,%r8), %xmm0, %xmm1
+; AVX512-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, (%rsi,%r8)
; AVX512-NEXT: addq $2, %rcx
; AVX512-NEXT: addq $32, %r8
; AVX512-NEXT: cmpq %rcx, %rdi
; AVX512-NEXT: je .LBB2_5
; AVX512-NEXT: # %bb.4:
; AVX512-NEXT: shlq $4, %rcx
-; AVX512-NEXT: vmovdqa (%rsi,%rcx), %xmm1
-; AVX512-NEXT: vpdpwssd (%rdx,%rcx), %xmm0, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, (%rsi,%rcx)
+; AVX512-NEXT: vpmaddwd (%rdx,%rcx), %xmm0, %xmm0
+; AVX512-NEXT: vpaddd (%rsi,%rcx), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi,%rcx)
; AVX512-NEXT: .LBB2_5:
; AVX512-NEXT: retq
%5 = icmp sgt i32 %0, 0
; AVX-LABEL: foo_reg_256:
; AVX: # %bb.0:
; AVX-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd %ymm3, %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd %ymm4, %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd %ymm5, %ymm1, %ymm0
+; AVX-NEXT: vpmaddwd %ymm3, %ymm1, %ymm2
+; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vpmaddwd %ymm4, %ymm1, %ymm2
+; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1
+; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: foo_reg_256:
; AVX512: # %bb.0:
; AVX512-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd %ymm3, %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd %ymm4, %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd %ymm5, %ymm1, %ymm0
+; AVX512-NEXT: vpmaddwd %ymm3, %ymm1, %ymm2
+; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmaddwd %ymm4, %ymm1, %ymm2
+; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%7 = bitcast <4 x i64> %0 to <8 x i32>
%8 = bitcast <4 x i64> %1 to <8 x i32>
; }
define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) {
-; AVX-LABEL: foo_256:
-; AVX: # %bb.0:
-; AVX-NEXT: testl %edi, %edi
-; AVX-NEXT: jle .LBB4_6
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: movl %edi, %edx
-; AVX-NEXT: movl %edx, %eax
-; AVX-NEXT: andl $3, %eax
-; AVX-NEXT: cmpl $4, %edi
-; AVX-NEXT: jae .LBB4_7
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: jmp .LBB4_3
-; AVX-NEXT: .LBB4_7:
-; AVX-NEXT: andl $-4, %edx
-; AVX-NEXT: leaq 96(%rsi), %rdi
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd -64(%rdi), %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd -32(%rdi), %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd (%rdi), %ymm1, %ymm0
-; AVX-NEXT: addq $4, %rcx
-; AVX-NEXT: subq $-128, %rdi
-; AVX-NEXT: cmpq %rcx, %rdx
-; AVX-NEXT: jne .LBB4_8
-; AVX-NEXT: .LBB4_3:
-; AVX-NEXT: testq %rax, %rax
-; AVX-NEXT: je .LBB4_6
-; AVX-NEXT: # %bb.4: # %.preheader
-; AVX-NEXT: shlq $5, %rcx
-; AVX-NEXT: addq %rcx, %rsi
-; AVX-NEXT: shlq $5, %rax
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0
-; AVX-NEXT: addq $32, %rcx
-; AVX-NEXT: cmpq %rcx, %rax
-; AVX-NEXT: jne .LBB4_5
-; AVX-NEXT: .LBB4_6:
-; AVX-NEXT: retq
+; ADL-LABEL: foo_256:
+; ADL: # %bb.0:
+; ADL-NEXT: testl %edi, %edi
+; ADL-NEXT: jle .LBB4_6
+; ADL-NEXT: # %bb.1:
+; ADL-NEXT: movl %edi, %edx
+; ADL-NEXT: movl %edx, %eax
+; ADL-NEXT: andl $3, %eax
+; ADL-NEXT: cmpl $4, %edi
+; ADL-NEXT: jae .LBB4_7
+; ADL-NEXT: # %bb.2:
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: jmp .LBB4_3
+; ADL-NEXT: .LBB4_7:
+; ADL-NEXT: andl $-4, %edx
+; ADL-NEXT: leaq 96(%rsi), %rdi
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0
+; ADL-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2
+; ADL-NEXT: vpmaddwd -32(%rdi), %ymm1, %ymm3
+; ADL-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; ADL-NEXT: vpaddd %ymm3, %ymm0, %ymm0
+; ADL-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2
+; ADL-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; ADL-NEXT: addq $4, %rcx
+; ADL-NEXT: subq $-128, %rdi
+; ADL-NEXT: cmpq %rcx, %rdx
+; ADL-NEXT: jne .LBB4_8
+; ADL-NEXT: .LBB4_3:
+; ADL-NEXT: testq %rax, %rax
+; ADL-NEXT: je .LBB4_6
+; ADL-NEXT: # %bb.4: # %.preheader
+; ADL-NEXT: shlq $5, %rcx
+; ADL-NEXT: addq %rcx, %rsi
+; ADL-NEXT: shlq $5, %rax
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0
+; ADL-NEXT: addq $32, %rcx
+; ADL-NEXT: cmpq %rcx, %rax
+; ADL-NEXT: jne .LBB4_5
+; ADL-NEXT: .LBB4_6:
+; ADL-NEXT: retq
+;
+; SPR-LABEL: foo_256:
+; SPR: # %bb.0:
+; SPR-NEXT: testl %edi, %edi
+; SPR-NEXT: jle .LBB4_6
+; SPR-NEXT: # %bb.1:
+; SPR-NEXT: movl %edi, %edx
+; SPR-NEXT: movl %edx, %eax
+; SPR-NEXT: andl $3, %eax
+; SPR-NEXT: cmpl $4, %edi
+; SPR-NEXT: jae .LBB4_7
+; SPR-NEXT: # %bb.2:
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: jmp .LBB4_3
+; SPR-NEXT: .LBB4_7:
+; SPR-NEXT: andl $-4, %edx
+; SPR-NEXT: leaq 96(%rsi), %rdi
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0
+; SPR-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2
+; SPR-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; SPR-NEXT: vpmaddwd -32(%rdi), %ymm1, %ymm2
+; SPR-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; SPR-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2
+; SPR-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; SPR-NEXT: addq $4, %rcx
+; SPR-NEXT: subq $-128, %rdi
+; SPR-NEXT: cmpq %rcx, %rdx
+; SPR-NEXT: jne .LBB4_8
+; SPR-NEXT: .LBB4_3:
+; SPR-NEXT: testq %rax, %rax
+; SPR-NEXT: je .LBB4_6
+; SPR-NEXT: # %bb.4: # %.preheader
+; SPR-NEXT: shlq $5, %rcx
+; SPR-NEXT: addq %rcx, %rsi
+; SPR-NEXT: shlq $5, %rax
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0
+; SPR-NEXT: addq $32, %rcx
+; SPR-NEXT: cmpq %rcx, %rax
+; SPR-NEXT: jne .LBB4_5
+; SPR-NEXT: .LBB4_6:
+; SPR-NEXT: retq
;
; AVX512-LABEL: foo_256:
; AVX512: # %bb.0:
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vpdpwssd -96(%rdi), %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd -64(%rdi), %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd -32(%rdi), %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd (%rdi), %ymm1, %ymm0
+; AVX512-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2
+; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmaddwd -32(%rdi), %ymm1, %ymm2
+; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2
+; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX512-NEXT: addq $4, %rcx
; AVX512-NEXT: subq $-128, %rdi
; AVX512-NEXT: cmpq %rcx, %rdx
; }
; }
define void @bar_256(i32 %0, ptr %1, <4 x i64> %2, ptr %3) {
-; AVX-LABEL: bar_256:
-; AVX: # %bb.0:
-; AVX-NEXT: testl %edi, %edi
-; AVX-NEXT: jle .LBB5_5
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: cmpl $1, %edi
-; AVX-NEXT: jne .LBB5_6
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: jmp .LBB5_3
-; AVX-NEXT: .LBB5_6:
-; AVX-NEXT: movl %eax, %edi
-; AVX-NEXT: andl $-2, %edi
-; AVX-NEXT: movl $32, %r8d
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: vmovdqa -32(%rsi,%r8), %ymm1
-; AVX-NEXT: vmovdqa (%rsi,%r8), %ymm2
-; AVX-NEXT: {vex} vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1
-; AVX-NEXT: vmovdqa %ymm1, -32(%rsi,%r8)
-; AVX-NEXT: {vex} vpdpwssd (%rdx,%r8), %ymm0, %ymm2
-; AVX-NEXT: vmovdqa %ymm2, (%rsi,%r8)
-; AVX-NEXT: addq $2, %rcx
-; AVX-NEXT: addq $64, %r8
-; AVX-NEXT: cmpq %rcx, %rdi
-; AVX-NEXT: jne .LBB5_7
-; AVX-NEXT: .LBB5_3:
-; AVX-NEXT: testb $1, %al
-; AVX-NEXT: je .LBB5_5
-; AVX-NEXT: # %bb.4:
-; AVX-NEXT: shlq $5, %rcx
-; AVX-NEXT: vmovdqa (%rsi,%rcx), %ymm1
-; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %ymm0, %ymm1
-; AVX-NEXT: vmovdqa %ymm1, (%rsi,%rcx)
-; AVX-NEXT: .LBB5_5:
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; ADL-LABEL: bar_256:
+; ADL: # %bb.0:
+; ADL-NEXT: testl %edi, %edi
+; ADL-NEXT: jle .LBB5_5
+; ADL-NEXT: # %bb.1:
+; ADL-NEXT: movl %edi, %eax
+; ADL-NEXT: cmpl $1, %edi
+; ADL-NEXT: jne .LBB5_6
+; ADL-NEXT: # %bb.2:
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: jmp .LBB5_3
+; ADL-NEXT: .LBB5_6:
+; ADL-NEXT: movl %eax, %edi
+; ADL-NEXT: andl $-2, %edi
+; ADL-NEXT: movl $32, %r8d
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: vmovdqa (%rsi,%r8), %ymm1
+; ADL-NEXT: vpmaddwd -32(%rdx,%r8), %ymm0, %ymm2
+; ADL-NEXT: vpaddd -32(%rsi,%r8), %ymm2, %ymm2
+; ADL-NEXT: vmovdqa %ymm2, -32(%rsi,%r8)
+; ADL-NEXT: {vex} vpdpwssd (%rdx,%r8), %ymm0, %ymm1
+; ADL-NEXT: vmovdqa %ymm1, (%rsi,%r8)
+; ADL-NEXT: addq $2, %rcx
+; ADL-NEXT: addq $64, %r8
+; ADL-NEXT: cmpq %rcx, %rdi
+; ADL-NEXT: jne .LBB5_7
+; ADL-NEXT: .LBB5_3:
+; ADL-NEXT: testb $1, %al
+; ADL-NEXT: je .LBB5_5
+; ADL-NEXT: # %bb.4:
+; ADL-NEXT: shlq $5, %rcx
+; ADL-NEXT: vmovdqa (%rsi,%rcx), %ymm1
+; ADL-NEXT: {vex} vpdpwssd (%rdx,%rcx), %ymm0, %ymm1
+; ADL-NEXT: vmovdqa %ymm1, (%rsi,%rcx)
+; ADL-NEXT: .LBB5_5:
+; ADL-NEXT: vzeroupper
+; ADL-NEXT: retq
+;
+; SPR-LABEL: bar_256:
+; SPR: # %bb.0:
+; SPR-NEXT: testl %edi, %edi
+; SPR-NEXT: jle .LBB5_5
+; SPR-NEXT: # %bb.1:
+; SPR-NEXT: movl %edi, %eax
+; SPR-NEXT: cmpl $1, %edi
+; SPR-NEXT: jne .LBB5_6
+; SPR-NEXT: # %bb.2:
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: jmp .LBB5_3
+; SPR-NEXT: .LBB5_6:
+; SPR-NEXT: movl %eax, %edi
+; SPR-NEXT: andl $-2, %edi
+; SPR-NEXT: movl $32, %r8d
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: vmovdqa -32(%rsi,%r8), %ymm1
+; SPR-NEXT: vmovdqa (%rsi,%r8), %ymm2
+; SPR-NEXT: {vex} vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1
+; SPR-NEXT: vmovdqa %ymm1, -32(%rsi,%r8)
+; SPR-NEXT: vpmaddwd (%rdx,%r8), %ymm0, %ymm1
+; SPR-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; SPR-NEXT: vmovdqa %ymm1, (%rsi,%r8)
+; SPR-NEXT: addq $2, %rcx
+; SPR-NEXT: addq $64, %r8
+; SPR-NEXT: cmpq %rcx, %rdi
+; SPR-NEXT: jne .LBB5_7
+; SPR-NEXT: .LBB5_3:
+; SPR-NEXT: testb $1, %al
+; SPR-NEXT: je .LBB5_5
+; SPR-NEXT: # %bb.4:
+; SPR-NEXT: shlq $5, %rcx
+; SPR-NEXT: vpmaddwd (%rdx,%rcx), %ymm0, %ymm0
+; SPR-NEXT: vpaddd (%rsi,%rcx), %ymm0, %ymm0
+; SPR-NEXT: vmovdqa %ymm0, (%rsi,%rcx)
+; SPR-NEXT: .LBB5_5:
+; SPR-NEXT: vzeroupper
+; SPR-NEXT: retq
;
; AVX512-LABEL: bar_256:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rsi,%r8), %ymm2
; AVX512-NEXT: vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1
; AVX512-NEXT: vmovdqa %ymm1, -32(%rsi,%r8)
-; AVX512-NEXT: vpdpwssd (%rdx,%r8), %ymm0, %ymm2
-; AVX512-NEXT: vmovdqa %ymm2, (%rsi,%r8)
+; AVX512-NEXT: vpmaddwd (%rdx,%r8), %ymm0, %ymm1
+; AVX512-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vmovdqa %ymm1, (%rsi,%r8)
; AVX512-NEXT: addq $2, %rcx
; AVX512-NEXT: addq $64, %r8
; AVX512-NEXT: cmpq %rcx, %rdi
; AVX512-NEXT: je .LBB5_5
; AVX512-NEXT: # %bb.4:
; AVX512-NEXT: shlq $5, %rcx
-; AVX512-NEXT: vmovdqa (%rsi,%rcx), %ymm1
-; AVX512-NEXT: vpdpwssd (%rdx,%rcx), %ymm0, %ymm1
-; AVX512-NEXT: vmovdqa %ymm1, (%rsi,%rcx)
+; AVX512-NEXT: vpmaddwd (%rdx,%rcx), %ymm0, %ymm0
+; AVX512-NEXT: vpaddd (%rsi,%rcx), %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa %ymm0, (%rsi,%rcx)
; AVX512-NEXT: .LBB5_5:
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq