isPowerOf2_32(WidenWidth / MemVTWidth) &&
(MemVTWidth <= Width ||
(Align != 0 && MemVTWidth <= AlignInBits && MemVTWidth <= Width + WidenEx))) {
+ if (MemVTWidth == WidenWidth)
+ return MemVT;
RetVT = MemVT;
break;
}
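
The constraints this loop checks can be worked through with plain arithmetic. Below is a minimal standalone sketch (not LLVM code; the constants model an assumed v3i16-to-v4i16 widening with 8-byte alignment) showing why a 64-bit candidate passes the filter and, with this patch, is returned immediately once its width equals the widened width:

```c++
#include <cassert>

int main() {
  // Assumed scenario: widen a v3i16 load (Width = 48 bits) to v4i16
  // (WidenWidth = 64), 8-byte aligned (AlignInBits = 64), WidenEx = 16.
  unsigned Width = 48, WidenWidth = 64, AlignInBits = 64, WidenEx = 16;
  unsigned MemVTWidth = 64; // candidate type, e.g. i64

  assert(WidenWidth % MemVTWidth == 0);      // evenly divides the widened type
  unsigned Ratio = WidenWidth / MemVTWidth;
  assert(Ratio != 0 && (Ratio & (Ratio - 1)) == 0); // isPowerOf2_32 equivalent
  assert(MemVTWidth > Width);                // too wide for the plain case, but
  assert(AlignInBits != 0 && MemVTWidth <= AlignInBits &&
         MemVTWidth <= Width + WidenEx);     // the over-read is safe

  // MemVTWidth == WidenWidth, so the patched loop returns this type at once:
  // the whole widened vector is loaded/stored with a single 64-bit access.
  return 0;
}
```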
VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) {
EVT MemVT = (MVT::SimpleValueType) VT;
unsigned MemVTWidth = MemVT.getSizeInBits();
- if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() &&
+ auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT);
+ if ((Action == TargetLowering::TypeLegal ||
+ Action == TargetLowering::TypePromoteInteger) &&
+ WidenEltVT == MemVT.getVectorElementType() &&
(WidenWidth % MemVTWidth) == 0 &&
isPowerOf2_32(WidenWidth / MemVTWidth) &&
(MemVTWidth <= Width ||
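
The functional change in this loop is that a candidate vector type no longer has to be fully legal; one whose integer elements merely get promoted now also qualifies. A sketch of the relaxed predicate (it assumes the usual LLVM headers and that `TLI` and `DAG` are in scope, as in `FindMemType`; the helper name is illustrative):

```c++
// Candidates whose integers the target promotes (e.g. a v2i16 promoted to a
// wider type) are now accepted alongside fully legal types, so loads such as
// v3i16 can widen through natural vector pieces instead of scalar fallbacks.
bool candidateUsable(const TargetLowering &TLI, SelectionDAG &DAG,
                     EVT MemVT, EVT WidenEltVT) {
  auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT);
  return (Action == TargetLowering::TypeLegal ||
          Action == TargetLowering::TypePromoteInteger) &&
         WidenEltVT == MemVT.getVectorElementType();
}
```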
SDLoc DL(Op);
+ const bool TruncatingStore = StoreNode->isTruncatingStore();
+
// Neither LOCAL nor PRIVATE can do vectors at the moment
- if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
+ if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS ||
+ TruncatingStore) &&
VT.isVector()) {
- if ((AS == AMDGPUAS::PRIVATE_ADDRESS) &&
- StoreNode->isTruncatingStore()) {
+ if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
// Add an extra level of chain to isolate this vector
SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
// TODO: can the chain be replaced without creating a new store?
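
With truncating vector stores now routed through this path, each element is written with its own truncating scalar store. A minimal sketch of that expansion, assuming LLVM-internal context (helper and variable names here are illustrative, not the in-tree code):

```c++
// Scalarize a (truncating) vector store: extract each element, emit a
// per-element truncating store at the right byte offset, merge the chains.
SDValue scalarizeVecStore(SelectionDAG &DAG, StoreSDNode *St, const SDLoc &DL) {
  EVT VT = St->getValue().getValueType();
  EVT EltVT = VT.getVectorElementType();
  EVT MemEltVT = St->getMemoryVT().getVectorElementType();
  SmallVector<SDValue, 8> Chains;
  for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                              St->getValue(), DAG.getConstant(I, DL, MVT::i32));
    unsigned Off = I * MemEltVT.getStoreSize();
    SDValue Ptr = DAG.getObjectPtrOffset(DL, St->getBasePtr(), Off);
    // getTruncStore writes only MemEltVT's bits of Elt, so an i32 element
    // is narrowed to the i16/i8 it occupies in memory.
    Chains.push_back(DAG.getTruncStore(St->getChain(), DL, Elt, Ptr,
                                       St->getPointerInfo().getWithOffset(Off),
                                       MemEltVT));
  }
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
```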
if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
// It is beneficial to create MSKOR here instead of in the combiner to avoid
// the artificial dependencies introduced by RMW
- if (StoreNode->isTruncatingStore()) {
+ if (TruncatingStore) {
assert(VT.bitsLE(MVT::i32));
SDValue MaskConstant;
if (MemVT == MVT::i8) {
// Convert pointer from byte address to dword address.
Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
- if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
- llvm_unreachable("Truncated and indexed stores not supported yet");
+ if (StoreNode->isIndexed()) {
+ llvm_unreachable("Indexed stores not supported yet");
} else {
Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
}
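
For reference, the effect of the MSKOR node can be written down in scalar C++. This is a sketch of the operation's semantics (an assumed standalone model, not the lowering itself): the sub-dword store becomes a dword-wide masked OR, so the read-modify-write happens in one RAT operation rather than an explicit load/merge/store sequence whose dependencies the combiner would have to order:

```c++
#include <cstdint>

// Model of a byte store via MSKOR: mem is viewed as an array of dwords,
// byte_addr is the original byte address.
void mskor_store_u8(uint32_t *mem, uint32_t byte_addr, uint8_t value) {
  uint32_t dword_addr = byte_addr >> 2;     // dword containing the byte
  uint32_t shift = (byte_addr & 3) * 8;     // bit offset within the dword
  uint32_t mask = 0xFFu << shift;           // lanes to replace (0xFFFF for i16)
  uint32_t data = uint32_t(value) << shift; // value, pre-shifted
  // The hardware performs this merge in a single RAT operation.
  mem[dword_addr] = (mem[dword_addr] & ~mask) | data;
}
```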
; FUNC-LABEL: {{^}}constant_load_v3i16:
; GCN: s_load_dwordx2 s
-; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 2, #1
; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
entry:
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}},
; EG: CF_END
-; EG-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1
-; EG-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
-; TODO: This should use DST, but for some there are redundant MOVs
-; EG-DAG: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
+; EG-DAG: VTX_READ_16 [[ST_LO]].X, [[SRC:T[0-9]\.[XYZW]]], 0, #1
+; EG-DAG: VTX_READ_16 {{T[0-9]\.[XYZW]}}, [[SRC]], 2, #1
+; EG-DAG: VTX_READ_16 [[ST_HI]].X, [[SRC]], 4, #1
+; EG-DAG: LSHR {{[* ]*}}{{T[0-9]\.[XYZW]}}, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 16
-; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal
-; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal
-; EG-DAG: 65535
-; EG-DAG: 65535
define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}},
; v3i16 is naturally 8-byte aligned
-; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[PTR:T[0-9]\.[XYZW]]], 0, #1
-; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
-; EG-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
-; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
+; EG-DAG: VTX_READ_16 [[ST_LO]].X, [[SRC:T[0-9]\.[XYZW]]], 0, #1
+; EG-DAG: VTX_READ_16 [[DST_MID:T[0-9]\.[XYZW]]], [[SRC]], 2, #1
+; EG-DAG: VTX_READ_16 [[ST_HI]].X, [[SRC]], 4, #1
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, [[ST_LO]].X, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, [[DST_MID]], 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[ST_HI]].X, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
; GCN-NOHSA: buffer_load_dwordx2 v
; GCN-HSA: flat_load_dwordx2 v
-; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 2, #1
; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
entry:
; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}},
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}},
-; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1
-; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
-; TODO: This should use DST, but for some there are redundant MOVs
-; EGCM: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
+; EGCM-DAG: VTX_READ_16 [[ST_LO]].X, [[SRC:T[0-9]\.[XYZW]]], 0, #1
+; EGCM-DAG: VTX_READ_16 {{T[0-9]\.[XYZW]}}, [[SRC]], 2, #1
+; EGCM-DAG: VTX_READ_16 [[ST_HI]].X, [[SRC]], 4, #1
; EGCM: 16
define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
entry:
; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}},
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}},
-; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 0, #1
-; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 4, #1
-; TODO: This should use DST, but for some there are redundant MOVs
-; EGCM-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
-; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
+; EGCM-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[SRC:T[0-9].[XYZW]]], 0, #1
+; EGCM-DAG: VTX_READ_16 [[DST_MID:T[0-9]\.[XYZW]]], [[SRC]], 2, #1
+; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], [[SRC]], 4, #1
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, [[DST_LO]], 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, [[DST_MID]], 0.0, literal
; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal
; EGCM-DAG: 16
; EGCM-DAG: 16
; GCN-DAG: ds_write_b16
; EG-DAG: LDS_USHORT_READ_RET
-; EG-DAG: LDS_READ_RET
+; EG-DAG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
; GCN-DAG: ds_write_b32
; GCN-DAG: ds_write_b64
-; EG: LDS_READ_RET
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
; GCN-DAG: ds_write_b32
; GCN-DAG: ds_write_b64
-; EG: LDS_READ_RET
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
+; EG: LDS_USHORT_READ_RET
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
; EG-DAG: BFE_INT
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s
+
+; Widen a v3i1 to v4i1 for the vector load/store. Previously the legalizer
+; reconstructed the v3i1 by splatting the vector's first element into every
+; lane, which was wrong. This happened during DAG type legalization, when
+; legalizing the load.
+
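The difference between the buggy and fixed reconstruction can be modeled in a few lines of C++ (a standalone sketch; names are illustrative). The fixed path, visible in the shrl/andl sequence in the CHECK lines below, extracts each bit of the loaded byte instead of splatting bit 0:

```c++
#include <cstdint>

// Rebuild a v3i1 predicate from the low three bits of a loaded byte.
uint32_t widen_v3i1_fixed(uint8_t byte) {
  uint32_t lane0 = byte & 1;
  uint32_t lane1 = (byte >> 1) & 1;
  uint32_t lane2 = (byte >> 2) & 1;
  return lane0 | (lane1 << 1) | (lane2 << 2); // lane 3 of the v4i1 is undef
}

// The old, buggy legalization filled every lane from the first element.
uint32_t widen_v3i1_buggy(uint8_t byte) {
  uint32_t lane0 = byte & 1;
  return lane0 | (lane0 << 1) | (lane0 << 2); // wrong for masks like 0b110
}
```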
+; Function Attrs: argmemonly nounwind readonly
+declare <3 x i32> @llvm.masked.load.v3i32.p1v3i32(<3 x i32> addrspace(1)*, i32, <3 x i1>, <3 x i32>)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.masked.store.v3i32.p1v3i32(<3 x i32>, <3 x i32> addrspace(1)*, i32, <3 x i1>)
+
+define <3 x i32> @masked_load_v3(i32 addrspace(1)*, <3 x i1>) {
+entry:
+ %2 = bitcast i32 addrspace(1)* %0 to <3 x i32> addrspace(1)*
+ %3 = call <3 x i32> @llvm.masked.load.v3i32.p1v3i32(<3 x i32> addrspace(1)* %2, i32 4, <3 x i1> %1, <3 x i32> undef)
+ ret <3 x i32> %3
+}
+
+define void @masked_store4_v3(<3 x i32>, i32 addrspace(1)*, <3 x i1>) {
+entry:
+ %3 = bitcast i32 addrspace(1)* %1 to <3 x i32> addrspace(1)*
+ call void @llvm.masked.store.v3i32.p1v3i32(<3 x i32> %0, <3 x i32> addrspace(1)* %3, i32 4, <3 x i1> %2)
+ ret void
+}
+
+define void @local_load_v3i1(i32 addrspace(1)* %out, i32 addrspace(1)* %in, <3 x i1>* %predicate_ptr) nounwind {
+; CHECK-LABEL: local_load_v3i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: movzbl (%rdx), %ebp
+; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: shrl %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: movl %ebp, %ecx
+; CHECK-NEXT: andl $1, %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pinsrd $1, %eax, %xmm0
+; CHECK-NEXT: shrl $2, %ebp
+; CHECK-NEXT: andl $1, %ebp
+; CHECK-NEXT: pinsrd $2, %ebp, %xmm0
+; CHECK-NEXT: movd %xmm0, %ebx
+; CHECK-NEXT: pextrd $1, %xmm0, %r15d
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: movl %ebx, %esi
+; CHECK-NEXT: movl %r15d, %edx
+; CHECK-NEXT: movl %ebp, %ecx
+; CHECK-NEXT: callq masked_load_v3
+; CHECK-NEXT: movq %r14, %rdi
+; CHECK-NEXT: movl %ebx, %esi
+; CHECK-NEXT: movl %r15d, %edx
+; CHECK-NEXT: movl %ebp, %ecx
+; CHECK-NEXT: callq masked_store4_v3
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+ %predicate = load <3 x i1>, <3 x i1>* %predicate_ptr
+ %load1 = call <3 x i32> @masked_load_v3(i32 addrspace(1)* %in, <3 x i1> %predicate)
+ call void @masked_store4_v3(<3 x i32> %load1, i32 addrspace(1)* %out, <3 x i1> %predicate)
+ ret void
+}
; CHECK-NEXT: pushl %ebp
; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: andl $-8, %esp
-; CHECK-NEXT: subl $40, %esp
+; CHECK-NEXT: subl $32, %esp
; CHECK-NEXT: movl {{\.LCPI.*}}, %eax
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl 12(%ebp), %edx
; CHECK-NEXT: movl 8(%ebp), %ecx
-; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; CHECK-NEXT: pinsrd $2, 4(%edx,%eax,8), %xmm2
+; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; CHECK-NEXT: psubd %xmm0, %xmm2
; CHECK-NEXT: pextrw $4, %xmm2, 4(%ecx,%eax,8)
; CHECK-NEXT: pshufb %xmm1, %xmm2
; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm2
; CHECK-NEXT: psubw %xmm0, %xmm1
; CHECK-NEXT: psubw %xmm0, %xmm2
-; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax)
-; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax)
; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax)
+; CHECK-NEXT: movq %xmm2, 16(%ecx,%eax)
; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: cmpl $3, (%esp)
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: psubd %xmm1, %xmm0
; X86-NEXT: pextrd $2, %xmm0, 8(%eax)
-; X86-NEXT: pextrd $1, %xmm0, 4(%eax)
-; X86-NEXT: movd %xmm0, (%eax)
+; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: convert:
; X86-NEXT: movdqa (%edx), %xmm0
; X86-NEXT: paddd (%ecx), %xmm0
; X86-NEXT: pextrd $2, %xmm0, 8(%eax)
-; X86-NEXT: pextrd $1, %xmm0, 4(%eax)
-; X86-NEXT: movd %xmm0, (%eax)
+; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl $4
;
; X64-LABEL: add3i32:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pinsrd $1, 4(%edx), %xmm0
+; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: pinsrd $2, 8(%edx), %xmm0
-; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1
+; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1
; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: pextrd $1, %xmm1, 4(%eax)
+; X86-NEXT: movq %xmm1, (%eax)
; X86-NEXT: pextrd $2, %xmm1, 8(%eax)
-; X86-NEXT: movd %xmm1, (%eax)
; X86-NEXT: retl $4
;
; X64-LABEL: add3i32_2:
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddd (%ecx), %xmm0
; X86-NEXT: paddd 16(%ecx), %xmm1
-; X86-NEXT: movd %xmm1, 16(%eax)
-; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
; X86-NEXT: pextrd $2, %xmm1, 24(%eax)
+; X86-NEXT: movq %xmm1, 16(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: subl $8, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl 16(%ebp), %ecx
; X86-NEXT: movl 12(%ebp), %edx
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-NEXT: pinsrd $2, 4(%edx), %xmm0
-; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-NEXT: pinsrd $2, 4(%ecx), %xmm1
+; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-NEXT: paddd %xmm0, %xmm1
; X86-NEXT: pextrw $4, %xmm1, 4(%eax)
; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddw (%ecx), %xmm0
; X86-NEXT: paddw 16(%ecx), %xmm1
-; X86-NEXT: movd %xmm1, 16(%eax)
-; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
+; X86-NEXT: movq %xmm1, 16(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;
; X86-NEXT: movdqa 16(%edx), %xmm1
; X86-NEXT: paddb (%ecx), %xmm0
; X86-NEXT: paddb 16(%ecx), %xmm1
-; X86-NEXT: movd %xmm1, 16(%eax)
-; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
; X86-NEXT: pextrd $2, %xmm1, 24(%eax)
; X86-NEXT: pextrw $6, %xmm1, 28(%eax)
; X86-NEXT: pextrb $14, %xmm1, 30(%eax)
+; X86-NEXT: movq %xmm1, 16(%eax)
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: retl $4
;