From: Chandler Carruth
Date: Thu, 26 Apr 2018 03:12:17 +0000 (+0000)
Subject: [x86] NFC: Add tests for idiomatic usage patterns of SSE4.2 string
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8cc8c0a87ccf76735f8438bfdafa7487a1cf127d;p=platform%2Fupstream%2Fllvm.git

[x86] NFC: Add tests for idiomatic usage patterns of SSE4.2 string
comparison instructions (pcmp[ei]stri*).

These will help show improvements from fixes to PR37246.

I've not really covered the mask forms of this intrinsic as I don't have
as good of an intuition about the likely usage patterns there. Happy for
someone to extend this with tests covering the mask form.

llvm-svn: 330895
---

Notes (review) -- free-form text in this position is ignored by patch tools:

  * This patch creates llvm/test/CodeGen/X86/sse42.ll. Its CHECK lines are
    generated by utils/update_llc_test_checks.py (see the NOTE line at the
    top of the new file); regenerate them with that script rather than
    editing them by hand. They embed the IR basic-block names (%entry,
    %compare, %exit), so renaming blocks in the IR requires regeneration.

  * The file is run twice, once for i686 (prefix X32) and once for x86_64
    (prefix X64), both with -mattr=sse4.2 -mcpu=x86-64.

  * The 24 functions form a grid:
      {pcmpestri, pcmpistri} x {reg, mem} x {eq, idx, diff} x {i8, i16}
      - "reg" takes the vectors as arguments; "mem" loads them through
        pointers with "align 1".
      - "eq" calls the *c128 intrinsic and returns whether its result is 0
        (the CHECK lines show this lowered to "setae").
      - "idx" returns the result of the index intrinsic unchanged.
      - "diff" compares the index with a sentinel (16 or 8); when it differs,
        it extracts the element at that index from both inputs and returns
        the zero-extended difference, otherwise it returns 0.

  * The *_reg_*_i16 tests pass immediate 24 and use sentinel 16, whereas the
    *_mem_*_i16 tests pass immediate 25 and use sentinel 8. Per the Intel SDM
    imm8 encoding, bit 0 presumably selects word-sized elements, so the reg
    variants appear to compare bytes while indexing <8 x i16> values.
    NOTE(review): confirm whether this asymmetry is intentional before
    relying on the reg i16 tests as examples of word-mode usage.

diff --git a/llvm/test/CodeGen/X86/sse42.ll b/llvm/test/CodeGen/X86/sse42.ll
new file mode 100644
index 0000000..1e197dc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/sse42.ll
@@ -0,0 +1,976 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X64
+
+declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
+declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
+declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
+declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
+
+define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_eq_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_reg_eq_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+entry:
+  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %result = icmp eq i32 %c, 0
+  ret i1 %result
+}
+
+define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_idx_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_reg_idx_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+entry:
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  ret i32 %idx
+}
+
+define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_diff_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $48, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 12(%ebp), %edx
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT: cmpl $16, %ecx
+; X32-NEXT: jne .LBB2_2
+; X32-NEXT: # %bb.1:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: jmp .LBB2_3
+; X32-NEXT: .LBB2_2: # %compare
+; X32-NEXT: movdqa %xmm0, (%esp)
+; X32-NEXT: andl $15, %ecx
+; X32-NEXT: movb (%esp,%ecx), %al
+; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: subb 16(%esp,%ecx), %al
+; X32-NEXT: .LBB2_3: # %exit
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_reg_diff_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: cmpl $16, %ecx
+; X64-NEXT: jne .LBB2_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB2_2: # %compare
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl $15, %ecx
+; X64-NEXT: movb -24(%rsp,%rcx), %al
+; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: subb -40(%rsp,%rcx), %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %eq = icmp eq i32 %idx, 16
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
+  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
+  %sub = sub i8 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
+  %result_ext = zext i8 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpestri_mem_eq_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_eq_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movdqu (%esi), %xmm0
+; X32-NEXT: movdqu (%ecx), %xmm1
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_mem_eq_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm0
+; X64-NEXT: movdqu (%rdx), %xmm1
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movl %ecx, %edx
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %result = icmp eq i32 %c, 0
+  ret i1 %result
+}
+
+define i32 @pcmpestri_mem_idx_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_idx_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movdqu (%esi), %xmm0
+; X32-NEXT: movdqu (%ecx), %xmm1
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_mem_idx_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm0
+; X64-NEXT: movdqu (%rdx), %xmm1
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movl %ecx, %edx
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  ret i32 %idx
+}
+
+define i32 @pcmpestri_mem_diff_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_diff_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $48, %esp
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 20(%ebp), %edx
+; X32-NEXT: movl 16(%ebp), %ecx
+; X32-NEXT: movl 8(%ebp), %esi
+; X32-NEXT: movdqu (%esi), %xmm1
+; X32-NEXT: movdqu (%ecx), %xmm0
+; X32-NEXT: pcmpestri $24, %xmm0, %xmm1
+; X32-NEXT: cmpl $16, %ecx
+; X32-NEXT: jne .LBB5_2
+; X32-NEXT: # %bb.1:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: jmp .LBB5_3
+; X32-NEXT: .LBB5_2: # %compare
+; X32-NEXT: movdqa %xmm1, (%esp)
+; X32-NEXT: andl $15, %ecx
+; X32-NEXT: movb (%esp,%ecx), %al
+; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: subb 16(%esp,%ecx), %al
+; X32-NEXT: .LBB5_3: # %exit
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: leal -4(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_mem_diff_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm1
+; X64-NEXT: movdqu (%rdx), %xmm0
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movl %ecx, %edx
+; X64-NEXT: pcmpestri $24, %xmm0, %xmm1
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: cmpl $16, %ecx
+; X64-NEXT: jne .LBB5_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB5_2: # %compare
+; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl $15, %ecx
+; X64-NEXT: movb -24(%rsp,%rcx), %al
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: subb -40(%rsp,%rcx), %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %eq = icmp eq i32 %idx, 16
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
+  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
+  %sub = sub i8 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
+  %result_ext = zext i8 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_eq_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_reg_eq_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+entry:
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
+  %result = icmp eq i32 %c, 0
+  ret i1 %result
+}
+
+define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_idx_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_reg_idx_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
+  ret i32 %idx
+}
+
+define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_diff_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $48, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 12(%ebp), %edx
+; X32-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT: cmpl $16, %ecx
+; X32-NEXT: jne .LBB8_2
+; X32-NEXT: # %bb.1:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: jmp .LBB8_3
+; X32-NEXT: .LBB8_2: # %compare
+; X32-NEXT: movdqa %xmm0, (%esp)
+; X32-NEXT: addl %ecx, %ecx
+; X32-NEXT: andl $14, %ecx
+; X32-NEXT: movzwl (%esp,%ecx), %eax
+; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: subw 16(%esp,%ecx), %ax
+; X32-NEXT: .LBB8_3: # %exit
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_reg_diff_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: cmpl $16, %ecx
+; X64-NEXT: jne .LBB8_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB8_2: # %compare
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl $7, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
+  %eq = icmp eq i32 %idx, 16
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
+  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
+  %sub = sub i16 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
+  %result_ext = zext i16 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpestri_mem_eq_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_eq_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movdqu (%esi), %xmm0
+; X32-NEXT: movdqu (%ecx), %xmm1
+; X32-NEXT: pcmpestri $25, %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_mem_eq_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm0
+; X64-NEXT: movdqu (%rdx), %xmm1
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movl %ecx, %edx
+; X64-NEXT: pcmpestri $25, %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
+  %result = icmp eq i32 %c, 0
+  ret i1 %result
+}
+
+define i32 @pcmpestri_mem_idx_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_idx_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movdqu (%esi), %xmm0
+; X32-NEXT: movdqu (%ecx), %xmm1
+; X32-NEXT: pcmpestri $25, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_mem_idx_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm0
+; X64-NEXT: movdqu (%rdx), %xmm1
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movl %ecx, %edx
+; X64-NEXT: pcmpestri $25, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
+  ret i32 %idx
+}
+
+define i32 @pcmpestri_mem_diff_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_diff_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $48, %esp
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 20(%ebp), %edx
+; X32-NEXT: movl 16(%ebp), %ecx
+; X32-NEXT: movl 8(%ebp), %esi
+; X32-NEXT: movdqu (%esi), %xmm1
+; X32-NEXT: movdqu (%ecx), %xmm0
+; X32-NEXT: pcmpestri $25, %xmm0, %xmm1
+; X32-NEXT: cmpl $8, %ecx
+; X32-NEXT: jne .LBB11_2
+; X32-NEXT: # %bb.1:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: jmp .LBB11_3
+; X32-NEXT: .LBB11_2: # %compare
+; X32-NEXT: movdqa %xmm1, (%esp)
+; X32-NEXT: addl %ecx, %ecx
+; X32-NEXT: andl $14, %ecx
+; X32-NEXT: movzwl (%esp,%ecx), %eax
+; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: subw 16(%esp,%ecx), %ax
+; X32-NEXT: .LBB11_3: # %exit
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: leal -4(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpestri_mem_diff_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm1
+; X64-NEXT: movdqu (%rdx), %xmm0
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movl %ecx, %edx
+; X64-NEXT: pcmpestri $25, %xmm0, %xmm1
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: cmpl $8, %ecx
+; X64-NEXT: jne .LBB11_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB11_2: # %compare
+; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl $7, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
+  %eq = icmp eq i32 %idx, 8
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
+  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
+  %sub = sub i16 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
+  %result_ext = zext i16 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_eq_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_reg_eq_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+entry:
+  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %result = icmp eq i32 %c, 0
+  ret i1 %result
+}
+
+define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_idx_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_reg_idx_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+entry:
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  ret i32 %idx
+}
+
+define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_diff_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT: cmpl $16, %ecx
+; X32-NEXT: jne .LBB14_2
+; X32-NEXT: # %bb.1:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+; X32-NEXT: .LBB14_2: # %compare
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $48, %esp
+; X32-NEXT: movdqa %xmm0, (%esp)
+; X32-NEXT: andl $15, %ecx
+; X32-NEXT: movb (%esp,%ecx), %al
+; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: subb 16(%esp,%ecx), %al
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_reg_diff_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: cmpl $16, %ecx
+; X64-NEXT: jne .LBB14_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB14_2: # %compare
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl $15, %ecx
+; X64-NEXT: movb -24(%rsp,%rcx), %al
+; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: subb -40(%rsp,%rcx), %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %eq = icmp eq i32 %idx, 16
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
+  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
+  %sub = sub i8 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
+  %result_ext = zext i8 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpistri_mem_eq_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_eq_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movdqu (%ecx), %xmm0
+; X32-NEXT: movdqu (%eax), %xmm1
+; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_mem_eq_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm0
+; X64-NEXT: movdqu (%rsi), %xmm1
+; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %result = icmp eq i32 %c, 0
+  ret i1 %result
+}
+
+define i32 @pcmpistri_mem_idx_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_idx_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movdqu (%ecx), %xmm0
+; X32-NEXT: movdqu (%eax), %xmm1
+; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_mem_idx_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm0
+; X64-NEXT: movdqu (%rsi), %xmm1
+; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  ret i32 %idx
+}
+
+define i32 @pcmpistri_mem_diff_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_diff_i8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $48, %esp
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 8(%ebp), %ecx
+; X32-NEXT: movdqu (%ecx), %xmm1
+; X32-NEXT: movdqu (%eax), %xmm0
+; X32-NEXT: pcmpistri $24, %xmm0, %xmm1
+; X32-NEXT: cmpl $16, %ecx
+; X32-NEXT: jne .LBB17_2
+; X32-NEXT: # %bb.1:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: jmp .LBB17_3
+; X32-NEXT: .LBB17_2: # %compare
+; X32-NEXT: movdqa %xmm1, (%esp)
+; X32-NEXT: andl $15, %ecx
+; X32-NEXT: movb (%esp,%ecx), %al
+; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: subb 16(%esp,%ecx), %al
+; X32-NEXT: .LBB17_3: # %exit
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_mem_diff_i8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm1
+; X64-NEXT: movdqu (%rsi), %xmm0
+; X64-NEXT: pcmpistri $24, %xmm0, %xmm1
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: cmpl $16, %ecx
+; X64-NEXT: jne .LBB17_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB17_2: # %compare
+; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl $15, %ecx
+; X64-NEXT: movb -24(%rsp,%rcx), %al
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: subb -40(%rsp,%rcx), %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %eq = icmp eq i32 %idx, 16
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
+  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
+  %sub = sub i8 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
+  %result_ext = zext i8 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_eq_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_reg_eq_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+entry:
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
+  %result = icmp eq i32 %c, 0
+  ret i1 %result
+}
+
+define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_idx_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_reg_idx_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
+  ret i32 %idx
+}
+
+define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_diff_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT: cmpl $16, %ecx
+; X32-NEXT: jne .LBB20_2
+; X32-NEXT: # %bb.1:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: retl
+; X32-NEXT: .LBB20_2: # %compare
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $48, %esp
+; X32-NEXT: movdqa %xmm0, (%esp)
+; X32-NEXT: addl %ecx, %ecx
+; X32-NEXT: andl $14, %ecx
+; X32-NEXT: movzwl (%esp,%ecx), %eax
+; X32-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: subw 16(%esp,%ecx), %ax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_reg_diff_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: cmpl $16, %ecx
+; X64-NEXT: jne .LBB20_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB20_2: # %compare
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl $7, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
+  %eq = icmp eq i32 %idx, 16
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
+  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
+  %sub = sub i16 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
+  %result_ext = zext i16 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpistri_mem_eq_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_eq_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movdqu (%ecx), %xmm0
+; X32-NEXT: movdqu (%eax), %xmm1
+; X32-NEXT: pcmpistri $25, %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_mem_eq_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm0
+; X64-NEXT: movdqu (%rsi), %xmm1
+; X64-NEXT: pcmpistri $25, %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
+  %result = icmp eq i32 %c, 0
+  ret i1 %result
+}
+
+define i32 @pcmpistri_mem_idx_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_idx_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movdqu (%ecx), %xmm0
+; X32-NEXT: movdqu (%eax), %xmm1
+; X32-NEXT: pcmpistri $25, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_mem_idx_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm0
+; X64-NEXT: movdqu (%rsi), %xmm1
+; X64-NEXT: pcmpistri $25, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
+  ret i32 %idx
+}
+
+define i32 @pcmpistri_mem_diff_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_diff_i16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $48, %esp
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 8(%ebp), %ecx
+; X32-NEXT: movdqu (%ecx), %xmm1
+; X32-NEXT: movdqu (%eax), %xmm0
+; X32-NEXT: pcmpistri $25, %xmm0, %xmm1
+; X32-NEXT: cmpl $8, %ecx
+; X32-NEXT: jne .LBB23_2
+; X32-NEXT: # %bb.1:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: jmp .LBB23_3
+; X32-NEXT: .LBB23_2: # %compare
+; X32-NEXT: movdqa %xmm1, (%esp)
+; X32-NEXT: addl %ecx, %ecx
+; X32-NEXT: andl $14, %ecx
+; X32-NEXT: movzwl (%esp,%ecx), %eax
+; X32-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: subw 16(%esp,%ecx), %ax
+; X32-NEXT: .LBB23_3: # %exit
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: pcmpistri_mem_diff_i16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqu (%rdi), %xmm1
+; X64-NEXT: movdqu (%rsi), %xmm0
+; X64-NEXT: pcmpistri $25, %xmm0, %xmm1
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: cmpl $8, %ecx
+; X64-NEXT: jne .LBB23_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB23_2: # %compare
+; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl $7, %ecx
+; X64-NEXT: movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+entry:
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
+  %eq = icmp eq i32 %idx, 8
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
+  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
+  %sub = sub i16 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
+  %result_ext = zext i16 %result to i32
+  ret i32 %result_ext
+}