; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -o - %s | FileCheck %s
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-target triple = "arm64-apple-ios"
+; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-unknown-linux -o - %s | FileCheck --check-prefix=CHECK-BE %s
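; It's profitable to convert the zext to a shuffle, which in turn will be
; lowered to 4 tbl instructions. The masks are materialized outside the loop.
; Note that the big-endian expectations below show the zext expanded with
; ushll/ushll2 instead of tbl.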
; CHECK-NEXT: b.ne LBB0_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB0_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #32
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #48
+; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT: st1 { v2.4s }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v1.4s }, [x10]
+; CHECK-BE-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-BE-NEXT: st1 { v2.4s }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB0_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
entry:
br label %loop
; CHECK-NEXT: b LBB1_1
; CHECK-NEXT: LBB1_4: ; %exit
; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop_not_header:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: b .LBB1_2
+; CHECK-BE-NEXT: .LBB1_1: // %loop.latch
+; CHECK-BE-NEXT: // in Loop: Header=BB1_2 Depth=1
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: b.eq .LBB1_4
+; CHECK-BE-NEXT: .LBB1_2: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: tbz w2, #0, .LBB1_1
+; CHECK-BE-NEXT: // %bb.3: // %then
+; CHECK-BE-NEXT: // in Loop: Header=BB1_2 Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #32
+; CHECK-BE-NEXT: add x11, x1, #16
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #48
+; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-BE-NEXT: st1 { v2.4s }, [x9]
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v1.4s }, [x10]
+; CHECK-BE-NEXT: st1 { v3.4s }, [x11]
+; CHECK-BE-NEXT: st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT: b .LBB1_1
+; CHECK-BE-NEXT: .LBB1_4: // %exit
+; CHECK-BE-NEXT: ret
entry:
br label %loop
; CHECK-NEXT: stp q1, q2, [x1, #32]
; CHECK-NEXT: stp q0, q3, [x1]
; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v16i8_to_v16i32_no_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x0]
+; CHECK-BE-NEXT: add x8, x1, #48
+; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT: st1 { v2.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x1, #32
+; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x1, #16
+; CHECK-BE-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT: st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT: ret
entry:
%src.cast = bitcast i8* %src to <16 x i8>*
%load = load <16 x i8>, <16 x i8>* %src.cast
; CHECK-NEXT: b.ne LBB3_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop_optsize:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB3_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #32
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #48
+; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT: st1 { v2.4s }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v1.4s }, [x10]
+; CHECK-BE-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-BE-NEXT: st1 { v2.4s }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB3_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
entry:
br label %loop
; CHECK-NEXT: b.ne LBB4_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop_minsize:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB4_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #32
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #48
+; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT: st1 { v2.4s }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v1.4s }, [x10]
+; CHECK-BE-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-BE-NEXT: st1 { v2.4s }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB4_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
entry:
br label %loop
; CHECK-NEXT: b.ne LBB5_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v16i8_to_v16i16_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB5_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: ushll v1.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-BE-NEXT: st1 { v1.8h }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #32
+; CHECK-BE-NEXT: st1 { v0.8h }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB5_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
entry:
br label %loop
; CHECK-NEXT: b.ne LBB6_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v8i8_to_v8i32_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB6_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-BE-NEXT: st1 { v1.4s }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB6_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
entry:
br label %loop
; CHECK-NEXT: b.ne LBB7_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v16i8_to_v16i64_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB7_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #96
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #112
+; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT: ushll2 v3.2d, v2.4s, #0
+; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-BE-NEXT: st1 { v3.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #80
+; CHECK-BE-NEXT: ushll2 v3.2d, v1.4s, #0
+; CHECK-BE-NEXT: st1 { v2.2d }, [x10]
+; CHECK-BE-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-BE-NEXT: add x10, x1, #48
+; CHECK-BE-NEXT: st1 { v3.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #64
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll2 v4.2d, v2.4s, #0
+; CHECK-BE-NEXT: st1 { v1.2d }, [x9]
+; CHECK-BE-NEXT: ushll v1.2d, v0.2s, #0
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: st1 { v4.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #32
+; CHECK-BE-NEXT: st1 { v1.2d }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #128
+; CHECK-BE-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-BE-NEXT: st1 { v0.2d }, [x9]
+; CHECK-BE-NEXT: st1 { v2.2d }, [x10]
+; CHECK-BE-NEXT: b.ne .LBB7_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
entry:
br label %loop