From 242798c31c5259a47c8b14bbd960b65434b859dd Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sun, 27 Nov 2022 21:57:43 +0300 Subject: [PATCH] [NFC][InstCombine] Add test coverage for potential fold --- .../InstCombine/widen-load-of-small-alloca.ll | 1424 ++++++++++++++++++++ 1 file changed, 1424 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/widen-load-of-small-alloca.ll diff --git a/llvm/test/Transforms/InstCombine/widen-load-of-small-alloca.ll b/llvm/test/Transforms/InstCombine/widen-load-of-small-alloca.ll new file mode 100644 index 0000000..31830e9 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/widen-load-of-small-alloca.ll @@ -0,0 +1,1424 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=instcombine -data-layout="e-n8:16:32:64" -S %s | FileCheck %s --check-prefixes=CHECK-ALL,CHECK-SCALAR,CHECK-SCALAR-64,CHECK-LE-64 +; RUN: opt -passes=instcombine -data-layout="e-n8:16:32" -S %s | FileCheck %s --check-prefixes=CHECK-ALL,CHECK-SCALAR,CHECK-SCALAR-32,CHECK-LE-32 +; RUN: opt -passes=instcombine -data-layout="E-n8:16:32:64" -S %s | FileCheck %s --check-prefixes=CHECK-ALL,CHECK-SCALAR,CHECK-SCALAR-64,CHECK-BE-64 +; RUN: opt -passes=instcombine -data-layout="E-n8:16:32" -S %s | FileCheck %s --check-prefixes=CHECK-ALL,CHECK-SCALAR,CHECK-SCALAR-32,CHECK-BE-32 + +define void @load-1byte-chunk-of-1byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-ALL-LABEL: @load-1byte-chunk-of-1byte-alloca( +; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [1 x i8], align 64 +; CHECK-ALL-NEXT: [[INIT:%.*]] = load <1 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-ALL-NEXT: store <1 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-ALL-NEXT: call void @use.v1i8(<1 x i8> [[INIT]]) +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-ALL-NEXT: 
[[INTERMEDIATE_VAL:%.*]] = load <1 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-ALL-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <1 x i8> [[INTERMEDIATE_VAL]] +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-ALL-NEXT: call void @use.v1i8(<1 x i8> [[INTERMEDIATE_VAL_FROZEN]]) +; CHECK-ALL-NEXT: ret void +; + %intermediate = alloca [1 x i8], align 64 + %init = load <1 x i8>, ptr %src, align 1 + store <1 x i8> %init, ptr %intermediate, align 64 + call void @use.v1i8(<1 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <1 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v1i8(<1 x i8> %chunk) + ret void +} + +define void @load-1byte-chunk-of-2byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-1byte-chunk-of-2byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [2 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <2 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i16 +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i16 [[BYTEOFF_TR]], 3 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], 
[[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]], i64 0 +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-1byte-chunk-of-2byte-alloca( +; CHECK-LE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [2 x i8], align 64 +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: store <2 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: call void @use.v2i8(<2 x i8> [[INIT]]) +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i16 +; CHECK-LE-32-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i16 [[BYTEOFF_TR]], 3 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]], i64 0 +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) 
+; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-1byte-chunk-of-2byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [2 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <2 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i16 +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i16 [[BYTEOFF_TR]], 3 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 8 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]], i64 0 +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-1byte-chunk-of-2byte-alloca( +; CHECK-BE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [2 x i8], align 64 +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: store <2 x i8> [[INIT]], ptr 
[[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: call void @use.v2i8(<2 x i8> [[INIT]]) +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i16 +; CHECK-BE-32-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i16 [[BYTEOFF_TR]], 3 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 8 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]], i64 0 +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-BE-32-NEXT: ret void +; + %intermediate = alloca [2 x i8], align 64 + %init = load <2 x i8>, ptr %src, align 1 + store <2 x i8> %init, ptr %intermediate, align 64 + call void @use.v2i8(<2 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <1 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v1i8(<1 x i8> %chunk) + ret void +} + +define void @load-2byte-chunk-of-2byte-alloca(ptr %src, i64 %byteOff, 
ptr %escape) { +; CHECK-LE-64-LABEL: @load-2byte-chunk-of-2byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [2 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <2 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i16 +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i16 [[BYTEOFF_TR]], 3 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <2 x i8> +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-2byte-chunk-of-2byte-alloca( +; CHECK-LE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [2 x i8], align 64 +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: store <2 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: call void @use.v2i8(<2 x i8> [[INIT]]) +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 
[[BYTEOFF]] to i16 +; CHECK-LE-32-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i16 [[BYTEOFF_TR]], 3 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <2 x i8> +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-2byte-chunk-of-2byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [2 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <2 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i16 +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i16 [[BYTEOFF_TR]], 3 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = bitcast i16 
[[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <2 x i8> +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-2byte-chunk-of-2byte-alloca( +; CHECK-BE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [2 x i8], align 64 +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <2 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: store <2 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: call void @use.v2i8(<2 x i8> [[INIT]]) +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i16 +; CHECK-BE-32-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i16 [[BYTEOFF_TR]], 3 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <2 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <2 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i16 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i16 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <2 x i8> +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-BE-32-NEXT: ret void +; + %intermediate = alloca [2 x i8], align 64 + %init = load <2 x i8>, ptr %src, align 1 + store <2 x i8> %init, ptr %intermediate, align 64 + call void @use.v2i8(<2 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <2 x i8>, ptr %intermediate.off.addr, align 1 + + call 
void @use.ptr(ptr %intermediate.off.addr) + call void @use.v2i8(<2 x i8> %chunk) + ret void +} + +define void @load-1byte-chunk-of-4byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-1byte-chunk-of-4byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]], i64 0 +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-1byte-chunk-of-4byte-alloca( +; CHECK-LE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: store <4 x i8> [[INIT]], ptr 
[[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-LE-32-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]], i64 0 +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-1byte-chunk-of-4byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; 
CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 24 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]], i64 0 +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-1byte-chunk-of-4byte-alloca( +; CHECK-BE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-BE-32-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to 
i32 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 24 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]], i64 0 +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-BE-32-NEXT: ret void +; + %intermediate = alloca [4 x i8], align 64 + %init = load <4 x i8>, ptr %src, align 1 + store <4 x i8> %init, ptr %intermediate, align 64 + call void @use.v4i8(<4 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <1 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v1i8(<1 x i8> %chunk) + ret void +} + +define void @load-2byte-chunk-of-4byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-2byte-chunk-of-4byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; CHECK-LE-64-NEXT: 
[[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-2byte-chunk-of-4byte-alloca( +; CHECK-LE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-LE-32-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-32-NEXT: 
[[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-2byte-chunk-of-4byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 16 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull 
[[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-2byte-chunk-of-4byte-alloca( +; CHECK-BE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-BE-32-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 16 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-BE-32-NEXT: ret void +; + %intermediate = alloca [4 x i8], align 64 + %init = load <4 x i8>, ptr %src, align 1 + store <4 x i8> %init, ptr %intermediate, align 64 + call void @use.v4i8(<4 x i8> %init) + call 
void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <2 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v2i8(<2 x i8> %chunk) + ret void +} + +define void @load-4byte-chunk-of-4byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-4byte-chunk-of-4byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <4 x i8> +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-4byte-chunk-of-4byte-alloca( +; CHECK-LE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: store 
<4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-LE-32-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <4 x i8> +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-32-NEXT: call void @use.v4i8(<4 x i8> [[TMP1]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-4byte-chunk-of-4byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = 
freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <4 x i8> +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-4byte-chunk-of-4byte-alloca( +; CHECK-BE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [4 x i8], align 64 +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <4 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: store <4 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: call void @use.v4i8(<4 x i8> [[INIT]]) +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[BYTEOFF_TR:%.*]] = trunc i64 [[BYTEOFF]] to i32 +; CHECK-BE-32-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = shl i32 [[BYTEOFF_TR]], 3 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <4 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <4 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i32 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i32 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <4 x i8> +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-32-NEXT: call void @use.v4i8(<4 x i8> [[TMP1]]) +; CHECK-BE-32-NEXT: ret void +; + %intermediate = alloca [4 x i8], align 64 
+ %init = load <4 x i8>, ptr %src, align 1 + store <4 x i8> %init, ptr %intermediate, align 64 + call void @use.v4i8(<4 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <4 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v4i8(<4 x i8> %chunk) + ret void +} + +define void @load-1byte-chunk-of-8byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-1byte-chunk-of-8byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]], i64 0 +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void 
+; +; CHECK-LE-32-LABEL: @load-1byte-chunk-of-8byte-alloca( +; CHECK-LE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]], i64 0 +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-1byte-chunk-of-8byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr 
[[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 56 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]], i64 0 +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-1byte-chunk-of-8byte-alloca( +; CHECK-BE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = 
bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 56 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]], i64 0 +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-32-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-BE-32-NEXT: ret void +; + %intermediate = alloca [8 x i8], align 64 + %init = load <8 x i8>, ptr %src, align 1 + store <8 x i8> %init, ptr %intermediate, align 64 + call void @use.v8i8(<8 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <1 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v1i8(<1 x i8> %chunk) + ret void +} + +define void @load-2byte-chunk-of-8byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-2byte-chunk-of-8byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load 
<8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-2byte-chunk-of-8byte-alloca( +; CHECK-LE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; 
CHECK-LE-32-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-2byte-chunk-of-8byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 48 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-2byte-chunk-of-8byte-alloca( +; CHECK-BE-32-NEXT: [[INTERMEDIATE:%.*]] 
= alloca [8 x i8], align 64 +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 48 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-BE-32-NEXT: ret void +; + %intermediate = alloca [8 x i8], align 64 + %init = load <8 x i8>, ptr %src, align 1 + store <8 x i8> %init, ptr %intermediate, align 64 + call void @use.v8i8(<8 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <2 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v2i8(<2 x i8> %chunk) + ret void +} + +define 
void @load-4byte-chunk-of-8byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-4byte-chunk-of-8byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i32 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <4 x i8> +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-4byte-chunk-of-8byte-alloca( +; CHECK-LE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-32-NEXT: 
[[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i32 +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <4 x i8> +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-32-NEXT: call void @use.v4i8(<4 x i8> [[TMP1]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-4byte-chunk-of-8byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-64-NEXT: 
[[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 32 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i32 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <4 x i8> +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-4byte-chunk-of-8byte-alloca( +; CHECK-BE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 32 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 
[[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i32 +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <4 x i8> +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-32-NEXT: call void @use.v4i8(<4 x i8> [[TMP1]]) +; CHECK-BE-32-NEXT: ret void +; + %intermediate = alloca [8 x i8], align 64 + %init = load <8 x i8>, ptr %src, align 1 + store <8 x i8> %init, ptr %intermediate, align 64 + call void @use.v8i8(<8 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <4 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v4i8(<4 x i8> %chunk) + ret void +} + +define void @load-8byte-chunk-of-8byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-8byte-chunk-of-8byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = bitcast i64 
[[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <8 x i8> +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v8i8(<8 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-LE-32-LABEL: @load-8byte-chunk-of-8byte-alloca( +; CHECK-LE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-32-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-32-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-LE-32-NEXT: [[TMP1:%.*]] = bitcast i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <8 x i8> +; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-32-NEXT: call void @use.v8i8(<8 x i8> [[TMP1]]) +; CHECK-LE-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-8byte-chunk-of-8byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: 
[[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = bitcast i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <8 x i8> +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v8i8(<8 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; +; CHECK-BE-32-LABEL: @load-8byte-chunk-of-8byte-alloca( +; CHECK-BE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64 +; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-32-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: call void @use.v8i8(<8 x i8> [[INIT]]) +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-32-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64 +; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]] +; CHECK-BE-32-NEXT: [[TMP1:%.*]] = bitcast i64 
[[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <8 x i8> +; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-32-NEXT: call void @use.v8i8(<8 x i8> [[TMP1]]) +; CHECK-BE-32-NEXT: ret void +; + %intermediate = alloca [8 x i8], align 64 + %init = load <8 x i8>, ptr %src, align 1 + store <8 x i8> %init, ptr %intermediate, align 64 + call void @use.v8i8(<8 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <8 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v8i8(<8 x i8> %chunk) + ret void +} + +define void @load-1byte-chunk-of-16byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-1byte-chunk-of-16byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = zext i64 [[BYTEOFF_NUMBITS]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-64-NEXT: 
[[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i8 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]], i64 0 +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-SCALAR-32-LABEL: @load-1byte-chunk-of-16byte-alloca( +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-SCALAR-32-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-SCALAR-32-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-SCALAR-32-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-SCALAR-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-SCALAR-32-NEXT: [[CHUNK:%.*]] = load <1 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-SCALAR-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-SCALAR-32-NEXT: call void @use.v1i8(<1 x i8> [[CHUNK]]) +; CHECK-SCALAR-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-1byte-chunk-of-16byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = zext i64 [[BYTEOFF_NUMBITS]] to i128 +; CHECK-BE-64-NEXT: 
[[INTERMEDIATE_VAL:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 120 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i8 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = insertelement <1 x i8> poison, i8 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]], i64 0 +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v1i8(<1 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; + %intermediate = alloca [16 x i8], align 64 + %init = load <16 x i8>, ptr %src, align 1 + store <16 x i8> %init, ptr %intermediate, align 64 + call void @use.v16i8(<16 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <1 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v1i8(<1 x i8> %chunk) + ret void +} + +define void @load-2byte-chunk-of-16byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-2byte-chunk-of-16byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; 
CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = zext i64 [[BYTEOFF_NUMBITS]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8> +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-SCALAR-32-LABEL: @load-2byte-chunk-of-16byte-alloca( +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-SCALAR-32-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-SCALAR-32-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-SCALAR-32-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-SCALAR-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-SCALAR-32-NEXT: [[CHUNK:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-SCALAR-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-SCALAR-32-NEXT: call void @use.v2i8(<2 x i8> [[CHUNK]]) +; CHECK-SCALAR-32-NEXT: ret void 
+; +; CHECK-BE-64-LABEL: @load-2byte-chunk-of-16byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = zext i64 [[BYTEOFF_NUMBITS]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 112 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8> +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; + %intermediate = alloca [16 x i8], align 64 + %init = load <16 x i8>, ptr %src, align 1 + store <16 x i8> %init, ptr %intermediate, align 64 + call void @use.v16i8(<16 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr 
inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <2 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v2i8(<2 x i8> %chunk) + ret void +} + +define void @load-4byte-chunk-of-16byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-4byte-chunk-of-16byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = zext i64 [[BYTEOFF_NUMBITS]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i32 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <4 x i8> +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-SCALAR-32-LABEL: @load-4byte-chunk-of-16byte-alloca( +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x 
i8], align 64 +; CHECK-SCALAR-32-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-SCALAR-32-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-SCALAR-32-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-SCALAR-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-SCALAR-32-NEXT: [[CHUNK:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-SCALAR-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-SCALAR-32-NEXT: call void @use.v4i8(<4 x i8> [[CHUNK]]) +; CHECK-SCALAR-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-4byte-chunk-of-16byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = zext i64 [[BYTEOFF_NUMBITS]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 
96 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i32 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = bitcast i32 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <4 x i8> +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v4i8(<4 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; + %intermediate = alloca [16 x i8], align 64 + %init = load <16 x i8>, ptr %src, align 1 + store <16 x i8> %init, ptr %intermediate, align 64 + call void @use.v16i8(<16 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <4 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v4i8(<4 x i8> %chunk) + ret void +} + +define void @load-8byte-chunk-of-16byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-8byte-chunk-of-16byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = zext i64 [[BYTEOFF_NUMBITS]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> 
[[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i64 +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = bitcast i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <8 x i8> +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v8i8(<8 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-SCALAR-32-LABEL: @load-8byte-chunk-of-16byte-alloca( +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-SCALAR-32-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-SCALAR-32-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-SCALAR-32-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-SCALAR-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-SCALAR-32-NEXT: [[CHUNK:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-SCALAR-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-SCALAR-32-NEXT: call void @use.v8i8(<8 x i8> [[CHUNK]]) +; CHECK-SCALAR-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-8byte-chunk-of-16byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; 
CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = zext i64 [[BYTEOFF_NUMBITS]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i64 +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = bitcast i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <8 x i8> +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v8i8(<8 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; + %intermediate = alloca [16 x i8], align 64 + %init = load <16 x i8>, ptr %src, align 1 + store <16 x i8> %init, ptr %intermediate, align 64 + call void @use.v16i8(<16 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <8 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v8i8(<8 x i8> %chunk) + ret void +} + +define void @load-16byte-chunk-of-16byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-LE-64-LABEL: @load-16byte-chunk-of-16byte-alloca( +; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-LE-64-NEXT: store <16 x i8> 
[[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = zext i64 [[BYTEOFF_NUMBITS]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INTERMEDIATE_VAL]] +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-LE-64-NEXT: [[TMP1:%.*]] = bitcast i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <16 x i8> +; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-LE-64-NEXT: call void @use.v16i8(<16 x i8> [[TMP1]]) +; CHECK-LE-64-NEXT: ret void +; +; CHECK-SCALAR-32-LABEL: @load-16byte-chunk-of-16byte-alloca( +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-SCALAR-32-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-SCALAR-32-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-SCALAR-32-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-SCALAR-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-SCALAR-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-SCALAR-32-NEXT: [[CHUNK:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-SCALAR-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-SCALAR-32-NEXT: call void @use.v16i8(<16 x i8> [[CHUNK]]) +; 
CHECK-SCALAR-32-NEXT: ret void +; +; CHECK-BE-64-LABEL: @load-16byte-chunk-of-16byte-alloca( +; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [16 x i8], align 64 +; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-BE-64-NEXT: store <16 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: call void @use.v16i8(<16 x i8> [[INIT]]) +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 3 +; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS_WIDE:%.*]] = zext i64 [[BYTEOFF_NUMBITS]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <16 x i8>, ptr [[INTERMEDIATE]], align 64 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <16 x i8> [[INTERMEDIATE_VAL]] +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <16 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i128 +; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i128 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS_WIDE]] +; CHECK-BE-64-NEXT: [[TMP1:%.*]] = bitcast i128 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to <16 x i8> +; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-BE-64-NEXT: call void @use.v16i8(<16 x i8> [[TMP1]]) +; CHECK-BE-64-NEXT: ret void +; + %intermediate = alloca [16 x i8], align 64 + %init = load <16 x i8>, ptr %src, align 1 + store <16 x i8> %init, ptr %intermediate, align 64 + call void @use.v16i8(<16 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <16 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v16i8(<16 x i8> %chunk) + ret void +} + +define void @load-1byte-chunk-of-32byte-alloca(ptr %src, 
i64 %byteOff, ptr %escape) { +; CHECK-ALL-LABEL: @load-1byte-chunk-of-32byte-alloca( +; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [32 x i8], align 64 +; CHECK-ALL-NEXT: [[INIT:%.*]] = load <32 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-ALL-NEXT: store <32 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-ALL-NEXT: call void @use.v32i8(<32 x i8> [[INIT]]) +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <1 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-ALL-NEXT: call void @use.v1i8(<1 x i8> [[CHUNK]]) +; CHECK-ALL-NEXT: ret void +; + %intermediate = alloca [32 x i8], align 64 + %init = load <32 x i8>, ptr %src, align 1 + store <32 x i8> %init, ptr %intermediate, align 64 + call void @use.v32i8(<32 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <1 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v1i8(<1 x i8> %chunk) + ret void +} + +define void @load-2byte-chunk-of-32byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-ALL-LABEL: @load-2byte-chunk-of-32byte-alloca( +; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [32 x i8], align 64 +; CHECK-ALL-NEXT: [[INIT:%.*]] = load <32 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-ALL-NEXT: store <32 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-ALL-NEXT: call void @use.v32i8(<32 x i8> [[INIT]]) +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <2 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; 
CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-ALL-NEXT: call void @use.v2i8(<2 x i8> [[CHUNK]]) +; CHECK-ALL-NEXT: ret void +; + %intermediate = alloca [32 x i8], align 64 + %init = load <32 x i8>, ptr %src, align 1 + store <32 x i8> %init, ptr %intermediate, align 64 + call void @use.v32i8(<32 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <2 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v2i8(<2 x i8> %chunk) + ret void +} + +define void @load-4byte-chunk-of-32byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-ALL-LABEL: @load-4byte-chunk-of-32byte-alloca( +; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [32 x i8], align 64 +; CHECK-ALL-NEXT: [[INIT:%.*]] = load <32 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-ALL-NEXT: store <32 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-ALL-NEXT: call void @use.v32i8(<32 x i8> [[INIT]]) +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <4 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-ALL-NEXT: call void @use.v4i8(<4 x i8> [[CHUNK]]) +; CHECK-ALL-NEXT: ret void +; + %intermediate = alloca [32 x i8], align 64 + %init = load <32 x i8>, ptr %src, align 1 + store <32 x i8> %init, ptr %intermediate, align 64 + call void @use.v32i8(<32 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <4 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v4i8(<4 x i8> %chunk) + ret void +} + +define void 
@load-8byte-chunk-of-32byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-ALL-LABEL: @load-8byte-chunk-of-32byte-alloca( +; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [32 x i8], align 64 +; CHECK-ALL-NEXT: [[INIT:%.*]] = load <32 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-ALL-NEXT: store <32 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-ALL-NEXT: call void @use.v32i8(<32 x i8> [[INIT]]) +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-ALL-NEXT: call void @use.v8i8(<8 x i8> [[CHUNK]]) +; CHECK-ALL-NEXT: ret void +; + %intermediate = alloca [32 x i8], align 64 + %init = load <32 x i8>, ptr %src, align 1 + store <32 x i8> %init, ptr %intermediate, align 64 + call void @use.v32i8(<32 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <8 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v8i8(<8 x i8> %chunk) + ret void +} + +define void @load-16byte-chunk-of-32byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-ALL-LABEL: @load-16byte-chunk-of-32byte-alloca( +; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [32 x i8], align 64 +; CHECK-ALL-NEXT: [[INIT:%.*]] = load <32 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-ALL-NEXT: store <32 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-ALL-NEXT: call void @use.v32i8(<32 x i8> [[INIT]]) +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <16 x i8>, ptr 
[[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-ALL-NEXT: call void @use.v16i8(<16 x i8> [[CHUNK]]) +; CHECK-ALL-NEXT: ret void +; + %intermediate = alloca [32 x i8], align 64 + %init = load <32 x i8>, ptr %src, align 1 + store <32 x i8> %init, ptr %intermediate, align 64 + call void @use.v32i8(<32 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <16 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void @use.v16i8(<16 x i8> %chunk) + ret void +} + +define void @load-32byte-chunk-of-32byte-alloca(ptr %src, i64 %byteOff, ptr %escape) { +; CHECK-ALL-LABEL: @load-32byte-chunk-of-32byte-alloca( +; CHECK-ALL-NEXT: [[INTERMEDIATE:%.*]] = alloca [32 x i8], align 64 +; CHECK-ALL-NEXT: [[INIT:%.*]] = load <32 x i8>, ptr [[SRC:%.*]], align 1 +; CHECK-ALL-NEXT: store <32 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64 +; CHECK-ALL-NEXT: call void @use.v32i8(<32 x i8> [[INIT]]) +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]]) +; CHECK-ALL-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]] +; CHECK-ALL-NEXT: [[CHUNK:%.*]] = load <32 x i8>, ptr [[INTERMEDIATE_OFF_ADDR]], align 1 +; CHECK-ALL-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]]) +; CHECK-ALL-NEXT: call void @use.v32i8(<32 x i8> [[CHUNK]]) +; CHECK-ALL-NEXT: ret void +; + %intermediate = alloca [32 x i8], align 64 + %init = load <32 x i8>, ptr %src, align 1 + store <32 x i8> %init, ptr %intermediate, align 64 + call void @use.v32i8(<32 x i8> %init) + call void @use.ptr(ptr %intermediate) + + %intermediate.off.addr = getelementptr inbounds i8, ptr %intermediate, i64 %byteOff + %chunk = load <32 x i8>, ptr %intermediate.off.addr, align 1 + + call void @use.ptr(ptr %intermediate.off.addr) + call void 
@use.v32i8(<32 x i8> %chunk)
+  ret void
+}
+
+;; Special test: the GEP below indexes by i16 instead of i8, so the index argument (still named %byteOff) advances the address in 2-byte steps; the chunk loaded is <2 x i8> out of an 8-byte alloca. %escape is unused.
+;; Expected output: the index becomes a bit count (index << 4); the whole <8 x i8> alloca is loaded, frozen and bitcast to i64; little-endian layouts lshr by the bit count, big-endian layouts shl by it and then lshr by 48; the value is truncated to i16 and bitcast to <2 x i8>. The 64-bit and 32-bit native-width configurations expect identical output.
+define void @load-2byte-chunk-of-8byte-alloca-with-2byte-step(ptr %src, i64 %byteOff, ptr %escape) {
+; CHECK-LE-64-LABEL: @load-2byte-chunk-of-8byte-alloca-with-2byte-step(
+; CHECK-LE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64
+; CHECK-LE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1
+; CHECK-LE-64-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64
+; CHECK-LE-64-NEXT: call void @use.v8i8(<8 x i8> [[INIT]])
+; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]])
+; CHECK-LE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i16, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]]
+; CHECK-LE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 4
+; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64
+; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]]
+; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64
+; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]]
+; CHECK-LE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16
+; CHECK-LE-64-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8>
+; CHECK-LE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]])
+; CHECK-LE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]])
+; CHECK-LE-64-NEXT: ret void
+;
+; CHECK-LE-32-LABEL: @load-2byte-chunk-of-8byte-alloca-with-2byte-step(
+; CHECK-LE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64
+; CHECK-LE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1
+; CHECK-LE-32-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64
+; CHECK-LE-32-NEXT: call void @use.v8i8(<8 x i8> [[INIT]])
+; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]])
+; CHECK-LE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i16, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]]
+; CHECK-LE-32-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 4
+; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64
+; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]]
+; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64
+; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]]
+; CHECK-LE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]] to i16
+; CHECK-LE-32-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_EXTRACTED]] to <2 x i8>
+; CHECK-LE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]])
+; CHECK-LE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]])
+; CHECK-LE-32-NEXT: ret void
+;
+; CHECK-BE-64-LABEL: @load-2byte-chunk-of-8byte-alloca-with-2byte-step(
+; CHECK-BE-64-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64
+; CHECK-BE-64-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1
+; CHECK-BE-64-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64
+; CHECK-BE-64-NEXT: call void @use.v8i8(<8 x i8> [[INIT]])
+; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]])
+; CHECK-BE-64-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i16, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]]
+; CHECK-BE-64-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 4
+; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64
+; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]]
+; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64
+; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]]
+; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 48
+; CHECK-BE-64-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16
+; CHECK-BE-64-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8>
+; CHECK-BE-64-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]])
+; CHECK-BE-64-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]])
+; CHECK-BE-64-NEXT: ret void
+;
+; CHECK-BE-32-LABEL: @load-2byte-chunk-of-8byte-alloca-with-2byte-step(
+; CHECK-BE-32-NEXT: [[INTERMEDIATE:%.*]] = alloca [8 x i8], align 64
+; CHECK-BE-32-NEXT: [[INIT:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1
+; CHECK-BE-32-NEXT: store <8 x i8> [[INIT]], ptr [[INTERMEDIATE]], align 64
+; CHECK-BE-32-NEXT: call void @use.v8i8(<8 x i8> [[INIT]])
+; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE]])
+; CHECK-BE-32-NEXT: [[INTERMEDIATE_OFF_ADDR:%.*]] = getelementptr inbounds i16, ptr [[INTERMEDIATE]], i64 [[BYTEOFF:%.*]]
+; CHECK-BE-32-NEXT: [[BYTEOFF_NUMBITS:%.*]] = shl nuw nsw i64 [[BYTEOFF]], 4
+; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL:%.*]] = load <8 x i8>, ptr [[INTERMEDIATE]], align 64
+; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN:%.*]] = freeze <8 x i8> [[INTERMEDIATE_VAL]]
+; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS:%.*]] = bitcast <8 x i8> [[INTERMEDIATE_VAL_FROZEN]] to i64
+; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED:%.*]] = shl i64 [[INTERMEDIATE_VAL_FROZEN_BITS]], [[BYTEOFF_NUMBITS]]
+; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART:%.*]] = lshr i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED]], 48
+; CHECK-BE-32-NEXT: [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED:%.*]] = trunc i64 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART]] to i16
+; CHECK-BE-32-NEXT: [[TMP1:%.*]] = bitcast i16 [[INTERMEDIATE_VAL_FROZEN_BITS_POSITIONED_PART_EXTRACTED]] to <2 x i8>
+; CHECK-BE-32-NEXT: call void @use.ptr(ptr nonnull [[INTERMEDIATE_OFF_ADDR]])
+; CHECK-BE-32-NEXT: call void @use.v2i8(<2 x i8> [[TMP1]])
+; CHECK-BE-32-NEXT: ret void
+;
+  %intermediate = alloca [8 x i8], align 64
+  %init = load <8 x i8>, ptr %src, align 1
+  store <8 x i8> %init, ptr %intermediate, align 64
+  call void @use.v8i8(<8 x i8> %init)
+  call void @use.ptr(ptr %intermediate)
+  ; i16-typed GEP: each unit of %byteOff moves the address by 2 bytes, while the chunk is still loaded as <2 x i8> with align 1.
+  %intermediate.off.addr = getelementptr inbounds i16, ptr %intermediate, i64 %byteOff
+  %chunk = load <2 x i8>, ptr %intermediate.off.addr, align 1
+  ; Hand both the offset address and the 2-byte chunk to the externally declared @use.* functions.
+  call void @use.ptr(ptr %intermediate.off.addr)
+  call void @use.v2i8(<2 x i8> %chunk)
+  ret void
+}
+; Body-less declarations of the external @use.* functions that the test functions call with their pointers and vector values.
+declare void @use.ptr(ptr)
+declare void @use.v1i8(<1 x i8>)
+declare void @use.v2i8(<2 x i8>)
+declare void @use.v4i8(<4 x i8>)
+declare void @use.v8i8(<8 x i8>)
+declare void @use.v16i8(<16 x i8>)
+declare void @use.v32i8(<32 x i8>)
-- 2.7.4