--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
+
+
+source_filename = "amx_api.c"
+
+%struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
+
+@buf = dso_local global [1024 x i8] zeroinitializer, align 16
+@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #0 {
+; AVX512-LABEL: test_api:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: .cfi_def_cfa_register %rbp
+; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; AVX512-NEXT: subq $25600, %rsp # imm = 0x6400
+; AVX512-NEXT: movw %dx, %ax
+; AVX512-NEXT: movw %si, %cx
+; AVX512-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: movl $1088, %edx # imm = 0x440
+; AVX512-NEXT: callq memset@PLT
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: movl $1088, %edx # imm = 0x440
+; AVX512-NEXT: callq memset@PLT
+; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: movl $1088, %edx # imm = 0x440
+; AVX512-NEXT: callq memset@PLT
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: je .LBB0_2
+; AVX512-NEXT: # %bb.1: # %if.then
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %si
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %dil
+; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %di
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r8b
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %si
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r8b
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg (%rdi)
+; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: jmp .LBB0_3
+; AVX512-NEXT: .LBB0_2: # %if.else
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %si
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %dil
+; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %di
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r8b
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw (%rax), %si
+; AVX512-NEXT: movw 2(%rax), %dx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r8b
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg (%rdi)
+; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: addq $64, %rdx
+; AVX512-NEXT: movl $64, %esi
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: .LBB0_3: # %if.end
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: movl $1088, %edx # imm = 0x440
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vmovdqa64 64(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 128(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 192(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 256(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 320(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 384(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 448(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 512(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 576(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 640(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 704(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 768(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 832(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 896(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 960(%rax), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 1024(%rax), %zmm0
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: movl $1024, %edx # imm = 0x400
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: # kill: def $r8 killed $rax
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
+; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; AVX512-NEXT: movw %r10w, %di
+; AVX512-NEXT: shrl $2, %r10d
+; AVX512-NEXT: movw %r10w, %r9w
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r8b
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: # kill: def $r10b killed $r10b killed $r10d
+; AVX512-NEXT: movb %r10b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl $64, %r8d
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: tileloadd (%r10,%r8), %tmm0
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: tileloadd (%r10,%r8), %tmm1
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT: tileloadd (%r10,%r8), %tmm2
+; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: addq $64, %rdi
+; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
+; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq memcpy@PLT
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
+; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: # kill: def $rdi killed $rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
+; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
+; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r8
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb %al, %r9b
+; AVX512-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg (%r8)
+; AVX512-NEXT: movl $64, %r8d
+; AVX512-NEXT: tileloadd (%rdi,%r8), %tmm0
+; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: .cfi_def_cfa %rsp, 8
+; AVX512-NEXT: tilerelease
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX2-LABEL: test_api:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: .cfi_def_cfa_register %rbp
+; AVX2-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; AVX2-NEXT: subq $29696, %rsp # imm = 0x7400
+; AVX2-NEXT: movw %dx, %ax
+; AVX2-NEXT: movw %si, %cx
+; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: movl $1088, %edx # imm = 0x440
+; AVX2-NEXT: callq memset@PLT
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: movl $1088, %edx # imm = 0x440
+; AVX2-NEXT: callq memset@PLT
+; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: movl $1088, %edx # imm = 0x440
+; AVX2-NEXT: callq memset@PLT
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je .LBB0_2
+; AVX2-NEXT: # %bb.1: # %if.then
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %si
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %dil
+; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %di
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r8b
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %si
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r8b
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg (%rdi)
+; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: jmp .LBB0_3
+; AVX2-NEXT: .LBB0_2: # %if.else
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %si
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %dil
+; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %di
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r8b
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw (%rax), %si
+; AVX2-NEXT: movw 2(%rax), %dx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r8b
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg (%rdi)
+; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: movl $64, %esi
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: .LBB0_3: # %if.end
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: movl $1088, %edx # imm = 0x440
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: vmovaps 64(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 96(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 128(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 160(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 192(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 224(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 256(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 288(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 320(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 352(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 384(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 416(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 448(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 480(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 512(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 544(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 576(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 608(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 640(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 672(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 704(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 736(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 768(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 800(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 832(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 864(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 896(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 928(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 960(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 992(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 1024(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps 1056(%rax), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: movl $1024, %edx # imm = 0x400
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: # kill: def $r8 killed $rax
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; AVX2-NEXT: movw %r10w, %di
+; AVX2-NEXT: shrl $2, %r10d
+; AVX2-NEXT: movw %r10w, %r9w
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r8b
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: # kill: def $r10b killed $r10b killed $r10d
+; AVX2-NEXT: movb %r10b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl $64, %r8d
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: tileloadd (%r10,%r8), %tmm0
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: tileloadd (%r10,%r8), %tmm1
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: tileloadd (%r10,%r8), %tmm2
+; AVX2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: addq $64, %rdi
+; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq memcpy@PLT
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
+; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: # kill: def $rdi killed $rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb %al, %r9b
+; AVX2-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg (%r8)
+; AVX2-NEXT: movl $64, %r8d
+; AVX2-NEXT: tileloadd (%rdi,%r8), %tmm0
+; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa %rsp, 8
+; AVX2-NEXT: tilerelease
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; SSE2-LABEL: test_api:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: .cfi_def_cfa_offset 16
+; SSE2-NEXT: .cfi_offset %rbp, -16
+; SSE2-NEXT: movq %rsp, %rbp
+; SSE2-NEXT: .cfi_def_cfa_register %rbp
+; SSE2-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; SSE2-NEXT: subq $30720, %rsp # imm = 0x7800
+; SSE2-NEXT: movw %dx, %ax
+; SSE2-NEXT: movw %si, %cx
+; SSE2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: xorl %esi, %esi
+; SSE2-NEXT: movl $1088, %edx # imm = 0x440
+; SSE2-NEXT: callq memset@PLT
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: xorl %esi, %esi
+; SSE2-NEXT: movl $1088, %edx # imm = 0x440
+; SSE2-NEXT: callq memset@PLT
+; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: xorl %esi, %esi
+; SSE2-NEXT: movl $1088, %edx # imm = 0x440
+; SSE2-NEXT: callq memset@PLT
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: je .LBB0_2
+; SSE2-NEXT: # %bb.1: # %if.then
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %si
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %dil
+; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %di
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r8b
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %si
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r8b
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg (%rdi)
+; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: jmp .LBB0_3
+; SSE2-NEXT: .LBB0_2: # %if.else
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %si
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %dil
+; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %di
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r8b
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: tileloadd (%rdx,%rdi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw (%rax), %si
+; SSE2-NEXT: movw 2(%rax), %dx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r8b
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg (%rdi)
+; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: addq $64, %rdx
+; SSE2-NEXT: movl $64, %esi
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: .LBB0_3: # %if.end
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: movl $1088, %edx # imm = 0x440
+; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movaps 64(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 80(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 96(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 112(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 128(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 144(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 160(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 176(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 192(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 208(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 224(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 240(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 256(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 272(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 288(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 304(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 320(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 336(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 352(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 368(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 384(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 400(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 416(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 432(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 448(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 464(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 480(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 496(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 512(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 528(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 544(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 560(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 576(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 592(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 608(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 624(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 640(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 656(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 672(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 688(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 704(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 720(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 736(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 752(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 768(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 784(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 800(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 816(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 832(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 848(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 864(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 880(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 896(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 912(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 928(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 944(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 960(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 976(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 992(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 1008(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 1024(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 1040(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 1056(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps 1072(%rax), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: movl $1024, %edx # imm = 0x400
+; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: # kill: def $r8 killed $rax
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; SSE2-NEXT: movw %r10w, %di
+; SSE2-NEXT: shrl $2, %r10d
+; SSE2-NEXT: movw %r10w, %r9w
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r8b
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: # kill: def $r10b killed $r10b killed $r10d
+; SSE2-NEXT: movb %r10b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movl $64, %r8d
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: tileloadd (%r10,%r8), %tmm0
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: tileloadd (%r10,%r8), %tmm1
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: tileloadd (%r10,%r8), %tmm2
+; SSE2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: addq $64, %rdi
+; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: callq memcpy@PLT
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
+; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT: # kill: def $rdi killed $rax
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r8
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb %al, %r9b
+; SSE2-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg (%r8)
+; SSE2-NEXT: movl $64, %r8d
+; SSE2-NEXT: tileloadd (%rdi,%r8), %tmm0
+; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-NEXT: movq %rbp, %rsp
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: .cfi_def_cfa %rsp, 8
+; SSE2-NEXT: tilerelease
+; SSE2-NEXT: retq
+entry:
+ %m.addr.i85 = alloca i16, align 2
+ %n.addr.i86 = alloca i16, align 2
+ %base.addr.i87 = alloca i8*, align 8
+ %stride.addr.i88 = alloca i64, align 8
+ %tile.addr.i = alloca <256 x i32>, align 64
+ %indirect-arg-temp.i5284 = alloca <256 x i32>, align 1024
+ %m.addr.i81 = alloca i16, align 2
+ %n.addr.i82 = alloca i16, align 2
+ %k.addr.i = alloca i16, align 2
+ %dst.addr.i83 = alloca <256 x i32>, align 64
+ %src1.addr.i = alloca <256 x i32>, align 64
+ %src2.addr.i = alloca <256 x i32>, align 64
+ %indirect-arg-temp5.i80 = alloca <256 x i32>, align 1024
+ %indirect-arg-temp4.i79 = alloca <256 x i32>, align 1024
+ %indirect-arg-temp.i78 = alloca <256 x i32>, align 1024
+ %m.addr.i74 = alloca i16, align 2
+ %n.addr.i75 = alloca i16, align 2
+ %base.addr.i76 = alloca i8*, align 8
+ %stride.addr.i77 = alloca i64, align 8
+ %m.addr.i70 = alloca i16, align 2
+ %n.addr.i71 = alloca i16, align 2
+ %base.addr.i72 = alloca i8*, align 8
+ %stride.addr.i73 = alloca i64, align 8
+ %m.addr.i66 = alloca i16, align 2
+ %n.addr.i67 = alloca i16, align 2
+ %base.addr.i68 = alloca i8*, align 8
+ %stride.addr.i69 = alloca i64, align 8
+ %m.addr.i62 = alloca i16, align 2
+ %n.addr.i63 = alloca i16, align 2
+ %base.addr.i64 = alloca i8*, align 8
+ %stride.addr.i65 = alloca i64, align 8
+ %m.addr.i58 = alloca i16, align 2
+ %n.addr.i59 = alloca i16, align 2
+ %base.addr.i60 = alloca i8*, align 8
+ %stride.addr.i61 = alloca i64, align 8
+ %m.addr.i = alloca i16, align 2
+ %n.addr.i = alloca i16, align 2
+ %base.addr.i56 = alloca i8*, align 8
+ %stride.addr.i57 = alloca i64, align 8
+ %base.addr.i50 = alloca i8*, align 8
+ %stride.addr.i51 = alloca i64, align 8
+ %indirect-arg-temp.i52 = alloca <256 x i32>, align 1024
+ %c49 = alloca %struct.__tile1024i_str, align 64
+ %dst.addr.i44 = alloca %struct.__tile1024i_str*, align 8
+ %indirect-arg-temp.i = alloca <256 x i32>, align 1024
+ %indirect-arg-temp4.i = alloca <256 x i32>, align 1024
+ %indirect-arg-temp5.i = alloca <256 x i32>, align 1024
+ %b43 = alloca %struct.__tile1024i_str, align 64
+ %a42 = alloca %struct.__tile1024i_str, align 64
+ %dst.addr.i35 = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i36 = alloca i8*, align 8
+ %stride.addr.i37 = alloca i64, align 8
+ %dst.addr.i28 = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i29 = alloca i8*, align 8
+ %stride.addr.i30 = alloca i64, align 8
+ %dst.addr.i21 = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i22 = alloca i8*, align 8
+ %stride.addr.i23 = alloca i64, align 8
+ %dst.addr.i14 = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i15 = alloca i8*, align 8
+ %stride.addr.i16 = alloca i64, align 8
+ %dst.addr.i7 = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i8 = alloca i8*, align 8
+ %stride.addr.i9 = alloca i64, align 8
+ %dst.addr.i = alloca %struct.__tile1024i_str*, align 8
+ %base.addr.i = alloca i8*, align 8
+ %stride.addr.i = alloca i64, align 8
+ %cond.addr = alloca i32, align 4
+ %row.addr = alloca i16, align 2
+ %col.addr = alloca i16, align 2
+ %a = alloca %struct.__tile1024i_str, align 64
+ %b = alloca %struct.__tile1024i_str, align 64
+ %c = alloca %struct.__tile1024i_str, align 64
+ store i32 %cond, i32* %cond.addr, align 4
+ store i16 %row, i16* %row.addr, align 2
+ store i16 %col, i16* %col.addr, align 2
+ %0 = bitcast %struct.__tile1024i_str* %a to i8*
+ call void @llvm.memset.p0i8.i64(i8* align 64 %0, i8 0, i64 1088, i1 false)
+ %row1 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a, i32 0, i32 0
+ %1 = load i16, i16* %row.addr, align 2
+ store i16 %1, i16* %row1, align 64
+ %col2 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a, i32 0, i32 1
+ store i16 8, i16* %col2, align 2
+ %2 = bitcast %struct.__tile1024i_str* %b to i8*
+ call void @llvm.memset.p0i8.i64(i8* align 64 %2, i8 0, i64 1088, i1 false)
+ %row3 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b, i32 0, i32 0
+ store i16 8, i16* %row3, align 64
+ %col4 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b, i32 0, i32 1
+ %3 = load i16, i16* %col.addr, align 2
+ store i16 %3, i16* %col4, align 2
+ %4 = bitcast %struct.__tile1024i_str* %c to i8*
+ call void @llvm.memset.p0i8.i64(i8* align 64 %4, i8 0, i64 1088, i1 false)
+ %row5 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c, i32 0, i32 0
+ %5 = load i16, i16* %row.addr, align 2
+ store i16 %5, i16* %row5, align 64
+ %col6 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c, i32 0, i32 1
+ %6 = load i16, i16* %col.addr, align 2
+ store i16 %6, i16* %col6, align 2
+ %7 = load i32, i32* %cond.addr, align 4
+ %tobool = icmp ne i32 %7, 0
+ br i1 %tobool, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store %struct.__tile1024i_str* %a, %struct.__tile1024i_str** %dst.addr.i35, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i36, align 8
+ store i64 32, i64* %stride.addr.i37, align 8
+ %8 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8
+ %row.i38 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %8, i32 0, i32 0
+ %9 = load i16, i16* %row.i38, align 64
+ %10 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8
+ %col.i39 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %10, i32 0, i32 1
+ %11 = load i16, i16* %col.i39, align 2
+ %12 = load i8*, i8** %base.addr.i36, align 8
+ %13 = load i64, i64* %stride.addr.i37, align 8
+ store i16 %9, i16* %m.addr.i, align 2
+ store i16 %11, i16* %n.addr.i, align 2
+ store i8* %12, i8** %base.addr.i56, align 8
+ store i64 %13, i64* %stride.addr.i57, align 8
+ %14 = load i16, i16* %m.addr.i, align 2
+ %15 = load i16, i16* %n.addr.i, align 2
+ %16 = load i8*, i8** %base.addr.i56, align 8
+ %17 = load i64, i64* %stride.addr.i57, align 8
+ %18 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %14, i16 %15, i8* %16, i64 %17) #2
+ %19 = bitcast x86_amx %18 to <256 x i32>
+ %20 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i35, align 8
+ %tile.i41 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %20, i32 0, i32 3
+ store <256 x i32> %19, <256 x i32>* %tile.i41, align 64
+ store %struct.__tile1024i_str* %b, %struct.__tile1024i_str** %dst.addr.i28, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i29, align 8
+ store i64 32, i64* %stride.addr.i30, align 8
+ %21 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, align 8
+ %row.i31 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %21, i32 0, i32 0
+ %22 = load i16, i16* %row.i31, align 64
+ %23 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, align 8
+ %col.i32 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %23, i32 0, i32 1
+ %24 = load i16, i16* %col.i32, align 2
+ %25 = load i8*, i8** %base.addr.i29, align 8
+ %26 = load i64, i64* %stride.addr.i30, align 8
+ store i16 %22, i16* %m.addr.i58, align 2
+ store i16 %24, i16* %n.addr.i59, align 2
+ store i8* %25, i8** %base.addr.i60, align 8
+ store i64 %26, i64* %stride.addr.i61, align 8
+ %27 = load i16, i16* %m.addr.i58, align 2
+ %28 = load i16, i16* %n.addr.i59, align 2
+ %29 = load i8*, i8** %base.addr.i60, align 8
+ %30 = load i64, i64* %stride.addr.i61, align 8
+ %31 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %27, i16 %28, i8* %29, i64 %30) #2
+ %32 = bitcast x86_amx %31 to <256 x i32>
+ %33 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i28, align 8
+ %tile.i34 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %33, i32 0, i32 3
+ store <256 x i32> %32, <256 x i32>* %tile.i34, align 64
+ store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i21, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i22, align 8
+ store i64 32, i64* %stride.addr.i23, align 8
+ %34 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8
+ %row.i24 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %34, i32 0, i32 0
+ %35 = load i16, i16* %row.i24, align 64
+ %36 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8
+ %col.i25 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %36, i32 0, i32 1
+ %37 = load i16, i16* %col.i25, align 2
+ %38 = load i8*, i8** %base.addr.i22, align 8
+ %39 = load i64, i64* %stride.addr.i23, align 8
+ store i16 %35, i16* %m.addr.i62, align 2
+ store i16 %37, i16* %n.addr.i63, align 2
+ store i8* %38, i8** %base.addr.i64, align 8
+ store i64 %39, i64* %stride.addr.i65, align 8
+ %40 = load i16, i16* %m.addr.i62, align 2
+ %41 = load i16, i16* %n.addr.i63, align 2
+ %42 = load i8*, i8** %base.addr.i64, align 8
+ %43 = load i64, i64* %stride.addr.i65, align 8
+ %44 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %40, i16 %41, i8* %42, i64 %43) #2
+ %45 = bitcast x86_amx %44 to <256 x i32>
+ %46 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i21, align 8
+ %tile.i27 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %46, i32 0, i32 3
+ store <256 x i32> %45, <256 x i32>* %tile.i27, align 64
+ br label %if.end
+
+if.else: ; preds = %entry
+ store %struct.__tile1024i_str* %a, %struct.__tile1024i_str** %dst.addr.i14, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i15, align 8
+ store i64 32, i64* %stride.addr.i16, align 8
+ %47 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8
+ %row.i17 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %47, i32 0, i32 0
+ %48 = load i16, i16* %row.i17, align 64
+ %49 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8
+ %col.i18 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %49, i32 0, i32 1
+ %50 = load i16, i16* %col.i18, align 2
+ %51 = load i8*, i8** %base.addr.i15, align 8
+ %52 = load i64, i64* %stride.addr.i16, align 8
+ store i16 %48, i16* %m.addr.i66, align 2
+ store i16 %50, i16* %n.addr.i67, align 2
+ store i8* %51, i8** %base.addr.i68, align 8
+ store i64 %52, i64* %stride.addr.i69, align 8
+ %53 = load i16, i16* %m.addr.i66, align 2
+ %54 = load i16, i16* %n.addr.i67, align 2
+ %55 = load i8*, i8** %base.addr.i68, align 8
+ %56 = load i64, i64* %stride.addr.i69, align 8
+ %57 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %53, i16 %54, i8* %55, i64 %56) #2
+ %58 = bitcast x86_amx %57 to <256 x i32>
+ %59 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i14, align 8
+ %tile.i20 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %59, i32 0, i32 3
+ store <256 x i32> %58, <256 x i32>* %tile.i20, align 64
+ store %struct.__tile1024i_str* %b, %struct.__tile1024i_str** %dst.addr.i7, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i8, align 8
+ store i64 32, i64* %stride.addr.i9, align 8
+ %60 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8
+ %row.i10 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %60, i32 0, i32 0
+ %61 = load i16, i16* %row.i10, align 64
+ %62 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8
+ %col.i11 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %62, i32 0, i32 1
+ %63 = load i16, i16* %col.i11, align 2
+ %64 = load i8*, i8** %base.addr.i8, align 8
+ %65 = load i64, i64* %stride.addr.i9, align 8
+ store i16 %61, i16* %m.addr.i70, align 2
+ store i16 %63, i16* %n.addr.i71, align 2
+ store i8* %64, i8** %base.addr.i72, align 8
+ store i64 %65, i64* %stride.addr.i73, align 8
+ %66 = load i16, i16* %m.addr.i70, align 2
+ %67 = load i16, i16* %n.addr.i71, align 2
+ %68 = load i8*, i8** %base.addr.i72, align 8
+ %69 = load i64, i64* %stride.addr.i73, align 8
+ %70 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %66, i16 %67, i8* %68, i64 %69) #2
+ %71 = bitcast x86_amx %70 to <256 x i32>
+ %72 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i7, align 8
+ %tile.i13 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %72, i32 0, i32 3
+ store <256 x i32> %71, <256 x i32>* %tile.i13, align 64
+ store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i, align 8
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i8** %base.addr.i, align 8
+ store i64 32, i64* %stride.addr.i, align 8
+ %73 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8
+ %row.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %73, i32 0, i32 0
+ %74 = load i16, i16* %row.i, align 64
+ %75 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8
+ %col.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %75, i32 0, i32 1
+ %76 = load i16, i16* %col.i, align 2
+ %77 = load i8*, i8** %base.addr.i, align 8
+ %78 = load i64, i64* %stride.addr.i, align 8
+ store i16 %74, i16* %m.addr.i74, align 2
+ store i16 %76, i16* %n.addr.i75, align 2
+ store i8* %77, i8** %base.addr.i76, align 8
+ store i64 %78, i64* %stride.addr.i77, align 8
+ %79 = load i16, i16* %m.addr.i74, align 2
+ %80 = load i16, i16* %n.addr.i75, align 2
+ %81 = load i8*, i8** %base.addr.i76, align 8
+ %82 = load i64, i64* %stride.addr.i77, align 8
+ %83 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %79, i16 %80, i8* %81, i64 %82) #2
+ %84 = bitcast x86_amx %83 to <256 x i32>
+ %85 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i, align 8
+ %tile.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %85, i32 0, i32 3
+ store <256 x i32> %84, <256 x i32>* %tile.i, align 64
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %86 = bitcast %struct.__tile1024i_str* %b43 to i8*
+ %87 = bitcast %struct.__tile1024i_str* %b to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %86, i8* align 1 %87, i64 1088, i1 false) #2
+ %88 = bitcast %struct.__tile1024i_str* %a42 to i8*
+ %89 = bitcast %struct.__tile1024i_str* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %88, i8* align 1 %89, i64 1088, i1 false) #2
+ store %struct.__tile1024i_str* %c, %struct.__tile1024i_str** %dst.addr.i44, align 8
+ %row.i45 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 0
+ %90 = load i16, i16* %row.i45, align 64
+ %col.i46 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b43, i32 0, i32 1
+ %91 = load i16, i16* %col.i46, align 2
+ %col1.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 1
+ %92 = load i16, i16* %col1.i, align 2
+ %93 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i44, align 8
+ %tile.i47 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %93, i32 0, i32 3
+ %94 = load <256 x i32>, <256 x i32>* %tile.i47, align 64
+ %tile2.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %a42, i32 0, i32 3
+ %95 = load <256 x i32>, <256 x i32>* %tile2.i, align 64
+ %tile3.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %b43, i32 0, i32 3
+ %96 = load <256 x i32>, <256 x i32>* %tile3.i, align 64
+ store <256 x i32> %94, <256 x i32>* %indirect-arg-temp.i, align 1024
+ store <256 x i32> %95, <256 x i32>* %indirect-arg-temp4.i, align 1024
+ store <256 x i32> %96, <256 x i32>* %indirect-arg-temp5.i, align 1024
+ %97 = bitcast <256 x i32>* %indirect-arg-temp5.i80 to i8*
+ %98 = bitcast <256 x i32>* %indirect-arg-temp5.i to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %97, i8* align 1 %98, i64 1024, i1 false) #2
+ %99 = bitcast <256 x i32>* %indirect-arg-temp4.i79 to i8*
+ %100 = bitcast <256 x i32>* %indirect-arg-temp4.i to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %99, i8* align 1 %100, i64 1024, i1 false) #2
+ %101 = bitcast <256 x i32>* %indirect-arg-temp.i78 to i8*
+ %102 = bitcast <256 x i32>* %indirect-arg-temp.i to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %101, i8* align 1 %102, i64 1024, i1 false) #2
+ %dst.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp.i78, align 1024
+ %src1.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp4.i79, align 1024
+ %src2.i = load <256 x i32>, <256 x i32>* %indirect-arg-temp5.i80, align 1024
+ store i16 %90, i16* %m.addr.i81, align 2
+ store i16 %91, i16* %n.addr.i82, align 2
+ store i16 %92, i16* %k.addr.i, align 2
+ store <256 x i32> %dst.i, <256 x i32>* %dst.addr.i83, align 64
+ store <256 x i32> %src1.i, <256 x i32>* %src1.addr.i, align 64
+ store <256 x i32> %src2.i, <256 x i32>* %src2.addr.i, align 64
+ %103 = load i16, i16* %m.addr.i81, align 2
+ %104 = load i16, i16* %n.addr.i82, align 2
+ %105 = load i16, i16* %k.addr.i, align 2
+ %106 = load <256 x i32>, <256 x i32>* %dst.addr.i83, align 64
+ %107 = bitcast <256 x i32> %106 to x86_amx
+ %108 = load <256 x i32>, <256 x i32>* %src1.addr.i, align 64
+ %109 = bitcast <256 x i32> %108 to x86_amx
+ %110 = load <256 x i32>, <256 x i32>* %src2.addr.i, align 64
+ %111 = bitcast <256 x i32> %110 to x86_amx
+ %112 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %103, i16 %104, i16 %105, x86_amx %107, x86_amx %109, x86_amx %111) #2
+ %113 = bitcast x86_amx %112 to <256 x i32>
+ %114 = load %struct.__tile1024i_str*, %struct.__tile1024i_str** %dst.addr.i44, align 8
+ %tile6.i = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %114, i32 0, i32 3
+ store <256 x i32> %113, <256 x i32>* %tile6.i, align 64
+ %115 = bitcast %struct.__tile1024i_str* %c49 to i8*
+ %116 = bitcast %struct.__tile1024i_str* %c to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %115, i8* align 1 %116, i64 1088, i1 false) #2
+ store i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i8** %base.addr.i50, align 8
+ store i64 32, i64* %stride.addr.i51, align 8
+ %row.i53 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 0
+ %117 = load i16, i16* %row.i53, align 64
+ %col.i54 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 1
+ %118 = load i16, i16* %col.i54, align 2
+ %119 = load i8*, i8** %base.addr.i50, align 8
+ %120 = load i64, i64* %stride.addr.i51, align 8
+ %tile.i55 = getelementptr inbounds %struct.__tile1024i_str, %struct.__tile1024i_str* %c49, i32 0, i32 3
+ %121 = load <256 x i32>, <256 x i32>* %tile.i55, align 64
+ store <256 x i32> %121, <256 x i32>* %indirect-arg-temp.i52, align 1024
+ %122 = bitcast <256 x i32>* %indirect-arg-temp.i5284 to i8*
+ %123 = bitcast <256 x i32>* %indirect-arg-temp.i52 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %122, i8* align 1 %123, i64 1024, i1 false) #2
+ %tile.i89 = load <256 x i32>, <256 x i32>* %indirect-arg-temp.i5284, align 1024
+ store i16 %117, i16* %m.addr.i85, align 2
+ store i16 %118, i16* %n.addr.i86, align 2
+ store i8* %119, i8** %base.addr.i87, align 8
+ store i64 %120, i64* %stride.addr.i88, align 8
+ store <256 x i32> %tile.i89, <256 x i32>* %tile.addr.i, align 64
+ %124 = load i16, i16* %m.addr.i85, align 2
+ %125 = load i16, i16* %n.addr.i86, align 2
+ %126 = load i8*, i8** %base.addr.i87, align 8
+ %127 = load i64, i64* %stride.addr.i88, align 8
+ %128 = load <256 x i32>, <256 x i32>* %tile.addr.i, align 64
+ %129 = bitcast <256 x i32> %128 to x86_amx
+ call void @llvm.x86.tilestored64.internal(i16 %124, i16 %125, i8* %126, i64 %127, x86_amx %129) #2
+ ret void
+}
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
+; Zero-fills the 1088-byte __tile1024i_str structs above (see the
+; @llvm.memset calls on %a/%b/%c); lowered to the memset@PLT calls in the CHECK lines.
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1
+
+; Function Attrs: nounwind
+; AMX tile load: (rows, cols, base, stride) -> x86_amx tile register value.
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #2
+
+; Function Attrs: nounwind
+; AMX dot-product (signed x signed -> i32 accumulate): (m, n, k, dst, src1, src2).
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2
+
+; Function Attrs: nounwind
+; AMX tile store: writes an x86_amx tile back to memory at (base, stride).
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #2
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+; Used above to copy whole tile structs (1088 bytes) and raw <256 x i32>
+; indirect-argument temporaries (1024 bytes) between stack slots.
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3
+
+; Attribute groups referenced via #N on the definitions/declarations above.
+attributes #0 = { noinline nounwind optnone uwtable }
+attributes #1 = { argmemonly nofree nosync nounwind willreturn writeonly }
+attributes #2 = { nounwind }
+attributes #3 = { argmemonly nofree nosync nounwind willreturn }