From 87679b12c1b3c730173d205156e129629dab0457 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 21 Dec 2022 12:32:56 +0100 Subject: [PATCH] [ARM] Regenerate test checks (NFC) --- llvm/test/CodeGen/ARM/ParallelDSP/aliasing.ll | 507 +++++++++++++-- llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll | 198 +++++- llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll | 685 +++++++++++++++++---- .../CodeGen/ARM/ParallelDSP/inner-full-unroll.ll | 119 ++-- llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll | 350 +++++++++-- llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll | 217 ++++--- llvm/test/CodeGen/ARM/ParallelDSP/sext-acc.ll | 105 ++-- 7 files changed, 1736 insertions(+), 445 deletions(-) diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/aliasing.ll b/llvm/test/CodeGen/ARM/ParallelDSP/aliasing.ll index bc1b17a..77a3e88 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/aliasing.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/aliasing.ll @@ -1,13 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=arm-none-none-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -verify -S | FileCheck %s ; ; Alias check: check that the rewrite isn't triggered when there's a store ; instruction possibly aliasing any mul load operands; arguments are passed ; without 'restrict' enabled. ; -; CHECK-LABEL: @no_restrict -; CHECK-NOT: call i32 @llvm.arm.smlad -; define dso_local i32 @no_restrict(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { +; CHECK-LABEL: @no_restrict( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[ARG3:%.*]], align 2 +; CHECK-NEXT: [[DOTPRE27:%.*]] = load i16, i16* [[ARG2:%.*]], align 2 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[MAC1_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD11:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret i32 [[MAC1_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[MAC1_026:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[I_025]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: store i16 42, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_025]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[ADD]] +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[I_025]] +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV]], [[CONV4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[ADD]] +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[CONV7]], [[CONV8]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[MUL]], [[MAC1_026]] +; CHECK-NEXT: [[ADD11]] = add i32 [[MUL9]], [[ADD10]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[ADD]], [[ARG]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -53,10 +86,42 @@ for.body: ; aliasing one of the mul load operands. Arguments are now annotated with ; 'noalias'. ; -; CHECK-LABEL: @restrict -; CHECK-NOT: call i32 @llvm.arm.smlad -; define dso_local i32 @restrict(i32 %arg, i32* noalias %arg1, i16* noalias readonly %arg2, i16* noalias %arg3) { +; CHECK-LABEL: @restrict( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[ARG3:%.*]], align 2 +; CHECK-NEXT: [[DOTPRE27:%.*]] = load i16, i16* [[ARG2:%.*]], align 2 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[MAC1_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD11:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret i32 [[MAC1_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[MAC1_026:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[I_025]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: store i16 42, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_025]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[ADD]] +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[I_025]] +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV]], [[CONV4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[ADD]] +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[CONV7]], [[CONV8]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[MUL]], [[MAC1_026]] +; CHECK-NEXT: [[ADD11]] = add i32 [[MUL9]], [[ADD10]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[ADD]], [[ARG]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -101,14 +166,57 @@ for.body: br i1 %exitcond, label %for.body, label %for.cond.cleanup } -; CHECK-LABEL: store_dominates_all -; CHECK: store -; CHECK: load -; CHECK: load -; CHECK: load -; CHECK: load -; CHECK: smlad define dso_local i32 @store_dominates_all(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { +; CHECK-LABEL: @store_dominates_all( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[ARG3:%.*]], align 2 +; CHECK-NEXT: [[DOTPRE27:%.*]] = load i16, i16* [[ARG2:%.*]], align 2 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[MAC1_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret i32 [[MAC1_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[MAC1_026:%.*]] = phi i32 [ [[TMP13]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[I_025]] +; CHECK-NEXT: store i16 42, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[ARRAYIDX]] to i32* +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16 +; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_025]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[ADD]] +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[I_025]] +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[ARRAYIDX3]] to i32* +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i16 +; CHECK-NEXT: [[TMP13]] = call i32 @llvm.arm.smlad(i32 [[TMP11]], i32 [[TMP2]], i32 [[MAC1_026]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP11]], 16 +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[TMP16]] to i32 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP18]] to i32 +; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP17]], [[TMP7]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[MUL]], [[MAC1_026]] +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[MUL9]], [[ADD10]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[ADD]], [[ARG]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -147,13 +255,57 @@ for.body: br i1 %exitcond, label %for.body, label %for.cond.cleanup } -; CHECK-LABEL: loads_dominate -; CHECK-NOT: store -; CHECK: load i32 -; CHECK-NOT: store -; CHECK: load i32 -; CHECK: store define dso_local i32 @loads_dominate(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { +; CHECK-LABEL: @loads_dominate( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[ARG3:%.*]], align 2 +; CHECK-NEXT: [[DOTPRE27:%.*]] = load i16, i16* [[ARG2:%.*]], align 2 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[MAC1_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret i32 [[MAC1_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[MAC1_026:%.*]] = phi i32 [ [[TMP13]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[I_025]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[ARRAYIDX]] to i32* +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16 +; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_025]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[ADD]] +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[I_025]] +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[ARRAYIDX3]] to i32* +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i16 +; CHECK-NEXT: [[TMP13]] = call i32 @llvm.arm.smlad(i32 [[TMP11]], i32 [[TMP2]], i32 [[MAC1_026]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP11]], 16 +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[TMP16]] to i32 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP18]] to i32 +; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP17]], [[TMP7]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[MUL]], [[MAC1_026]] +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[MUL9]], [[ADD10]] +; CHECK-NEXT: store i16 42, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[ADD]], [[ARG]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -192,18 +344,57 @@ for.body: br i1 %exitcond, label %for.body, label %for.cond.cleanup } -; CHECK-LABEL: store_alias_arg3_legal_1 -; CHECK-NOT: store -; CHECK: phi i32 -; CHECK: [[IV:%[^ ]+]] = phi i32 [ %add -; CHECK: [[ARG3_GEP:%[^ ]+]] = getelementptr inbounds i16, i16* %arg3, i32 [[IV]] -; CHECK: [[ARG3:%[^ ]+]] = bitcast i16* [[ARG3_GEP]] to i32* -; CHECK: load i32, i32* [[ARG3]] -; CHECK: [[ARG2_GEP:%[^ ]+]] = getelementptr inbounds i16, i16* %arg2, i32 [[IV]] -; CHECK: [[ARG2:%[^ ]+]] = bitcast i16* [[ARG2_GEP]] to i32* -; CHECK: load i32, i32* [[ARG2]] -; CHECK: store define dso_local i32 @store_alias_arg3_legal_1(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* nocapture readonly %arg3) { +; CHECK-LABEL: @store_alias_arg3_legal_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[ARG3:%.*]], align 2 +; CHECK-NEXT: [[DOTPRE27:%.*]] = load i16, i16* [[ARG2:%.*]], align 2 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[MAC1_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret i32 [[MAC1_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[MAC1_026:%.*]] = phi i32 [ [[TMP13]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[I_025]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[ARRAYIDX]] to i32* +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16 +; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_025]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[ADD]] +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[I_025]] +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[ARRAYIDX3]] to i32* +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i16 +; CHECK-NEXT: [[TMP13]] = call i32 @llvm.arm.smlad(i32 [[TMP11]], i32 [[TMP2]], i32 [[MAC1_026]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP11]], 16 +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[TMP16]] to i32 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP14]], [[TMP4]] +; CHECK-NEXT: store i16 42, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP18]] to i32 +; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP17]], [[TMP7]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[MUL]], [[MAC1_026]] +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[MUL9]], [[ADD10]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[ADD]], [[ARG]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -242,15 +433,57 @@ for.body: br i1 %exitcond, label %for.body, label %for.cond.cleanup } -; CHECK-LABEL: store_alias_arg3_legal_2 -; CHECK-NOT: store -; CHECK: [[BITCAST:[^ ]+]] = bitcast i16* %arrayidx to i32* -; CHECK: load i32, i32* [[BITCAST]] -; CHECK: store i16 42, i16* %arrayidx -; CHECK: [[BITCAST3:[^ ]+]] = bitcast i16* %arrayidx3 to i32* -; CHECK: load i32, i32* [[BITCAST3]] -; CHECK: smlad define dso_local i32 @store_alias_arg3_legal_2(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* nocapture readonly %arg3) { +; CHECK-LABEL: @store_alias_arg3_legal_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[ARG3:%.*]], align 2 +; CHECK-NEXT: [[DOTPRE27:%.*]] = load i16, i16* [[ARG2:%.*]], align 2 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[MAC1_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret i32 [[MAC1_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[MAC1_026:%.*]] = phi i32 [ [[TMP13]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[I_025]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[ARRAYIDX]] to i32* +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16 +; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_025]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[ADD]] +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[I_025]] +; CHECK-NEXT: store i16 42, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[ARRAYIDX3]] to i32* +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i16 +; CHECK-NEXT: [[TMP13]] = call i32 @llvm.arm.smlad(i32 [[TMP11]], i32 [[TMP2]], i32 [[MAC1_026]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP11]], 16 +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[TMP16]] to i32 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP18]] to i32 +; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP17]], [[TMP7]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[MUL]], [[MAC1_026]] +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[MUL9]], [[ADD10]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[ADD]], [[ARG]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -289,9 +522,42 @@ for.body: br i1 %exitcond, label %for.body, label %for.cond.cleanup } -; CHECK-LABEL: store_alias_arg3_illegal_1 -; CHECK-NOT: load i32 define dso_local i32 @store_alias_arg3_illegal_1(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* noalias nocapture %arg3) { +; CHECK-LABEL: @store_alias_arg3_illegal_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[ARG3:%.*]], align 2 +; CHECK-NEXT: [[DOTPRE27:%.*]] = load i16, i16* [[ARG2:%.*]], align 2 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[MAC1_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD11:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret i32 [[MAC1_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[MAC1_026:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[I_025]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_025]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[ADD]] +; CHECK-NEXT: store i16 42, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[I_025]] +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV]], [[CONV4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[ADD]] +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[CONV7]], [[CONV8]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[MUL]], [[MAC1_026]] +; CHECK-NEXT: [[ADD11]] = add i32 [[MUL9]], [[ADD10]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[ADD]], [[ARG]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -330,9 +596,42 @@ for.body: br i1 %exitcond, label %for.body, label %for.cond.cleanup } -; CHECK-LABEL: store_alias_arg3_illegal_2 -; CHECK-NOT: load i32 define dso_local i32 @store_alias_arg3_illegal_2(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* noalias nocapture %arg3) { +; CHECK-LABEL: @store_alias_arg3_illegal_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[ARG3:%.*]], align 2 +; CHECK-NEXT: [[DOTPRE27:%.*]] = load i16, i16* [[ARG2:%.*]], align 2 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[MAC1_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD11:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret i32 [[MAC1_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[MAC1_026:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[I_025]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_025]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[ADD]] +; CHECK-NEXT: store i16 42, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[I_025]] +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV]], [[CONV4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[ADD]] +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[CONV7]], [[CONV8]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[MUL]], [[MAC1_026]] +; CHECK-NEXT: [[ADD11]] = add i32 [[MUL9]], [[ADD10]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[ADD]], [[ARG]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -371,9 +670,42 @@ for.body: br i1 %exitcond, label %for.body, label %for.cond.cleanup } -; CHECK-LABEL: store_alias_arg2_illegal_1 -; CHECK-NOT: load i32 define dso_local i32 @store_alias_arg2_illegal_1(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { +; CHECK-LABEL: @store_alias_arg2_illegal_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[ARG3:%.*]], align 2 +; CHECK-NEXT: [[DOTPRE27:%.*]] = load i16, i16* [[ARG2:%.*]], align 2 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[MAC1_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD11:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret i32 [[MAC1_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[MAC1_026:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[I_025]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_025]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[ADD]] +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[I_025]] +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV]], [[CONV4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[ADD]] +; CHECK-NEXT: store i16 42, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[CONV7]], [[CONV8]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[MUL]], [[MAC1_026]] +; CHECK-NEXT: [[ADD11]] = add i32 [[MUL9]], [[ADD10]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[ADD]], [[ARG]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -412,9 +744,42 @@ for.body: br i1 %exitcond, label %for.body, label %for.cond.cleanup } -; CHECK-LABEL: store_alias_arg2_illegal_2 -; CHECK-NOT: load i32 define dso_local i32 @store_alias_arg2_illegal_2(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { +; CHECK-LABEL: @store_alias_arg2_illegal_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[ARG3:%.*]], align 2 +; CHECK-NEXT: [[DOTPRE27:%.*]] = load i16, i16* [[ARG2:%.*]], align 2 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[MAC1_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD11:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret i32 [[MAC1_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[MAC1_026:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[I_025:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[I_025]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I_025]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i32 [[ADD]] +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[I_025]] +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV]], [[CONV4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* [[ARG2]], i32 [[ADD]] +; CHECK-NEXT: store i16 42, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[CONV7]], [[CONV8]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[MUL]], [[MAC1_026]] +; CHECK-NEXT: [[ADD11]] = add i32 [[MUL9]], [[ADD10]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[ADD]], [[ARG]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; entry: %cmp24 = icmp sgt i32 %arg, 0 br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup @@ -455,9 +820,55 @@ for.body: ; TODO: I think we should be able to generate one smlad here. The search fails ; when it finds the alias. -; CHECK-LABEL: one_pair_alias -; CHECK-NOT: call i32 @llvm.arm.smlad define i32 @one_pair_alias(i16* noalias nocapture readonly %b, i16* noalias nocapture %c) { +; CHECK-LABEL: @one_pair_alias( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[ADD26:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_050:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A_049:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD26]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ADD3:%.*]] = or i32 [[I_050]], 1 +; CHECK-NEXT: [[ADD11:%.*]] = or i32 [[I_050]], 2 +; CHECK-NEXT: [[ADD19:%.*]] = or i32 [[I_050]], 3 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[I_050]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[ADD3]] +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[ADD11]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[ADD19]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[C:%.*]], i32 [[I_050]] +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[C]], i32 [[ADD3]] +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[C]], i32 [[ADD11]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[C]], i32 [[ADD19]] +; CHECK-NEXT: [[TMP:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX12]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: store i16 43, i16* [[ARRAYIDX7]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP]] to i32 +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[A_049]] +; CHECK-NEXT: [[CONV5:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[CONV8]], [[CONV5]] +; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD]], [[MUL9]] +; CHECK-NEXT: [[CONV13:%.*]] = sext i16 [[TMP4]] to i32 +; CHECK-NEXT: [[CONV16:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV13]] +; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[ADD10]], [[MUL17]] +; CHECK-NEXT: [[CONV21:%.*]] = sext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[CONV24:%.*]] = sext i16 [[TMP7]] to i32 +; CHECK-NEXT: [[MUL25:%.*]] = mul nsw i32 [[CONV24]], [[CONV21]] +; CHECK-NEXT: [[ADD26]] = add nsw i32 [[ADD18]], [[MUL25]] +; CHECK-NEXT: [[ADD27]] = add nuw nsw i32 [[I_050]], 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD27]], 100 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] +; entry: br label %for.body diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll index adcb81e..1e9cdde 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll @@ -1,12 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -arm-parallel-dsp -dce -mtriple=armv7-a -S %s -o - | FileCheck %s -; CHECK-LABEL: single_block -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc) define i32 @single_block(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @single_block( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[B:%.*]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP3]], i32 [[ACC:%.*]]) +; CHECK-NEXT: ret i32 [[TMP4]] +; entry: %ld.a.0 = load i16, i16* %a %sext.a.0 = sext i16 %ld.a.0 to i32 @@ -25,13 +29,16 @@ entry: ret i32 %res } -; CHECK-LABEL: single_block_64 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 %acc) define i64 @single_block_64(i16* %a, i16* %b, i64 %acc) { +; CHECK-LABEL: @single_block_64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[B:%.*]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP1]], i32 [[TMP3]], i64 [[ACC:%.*]]) +; CHECK-NEXT: ret i64 [[TMP4]] +; entry: %ld.a.0 = load i16, i16* %a %sext.a.0 = sext i16 %ld.a.0 to i32 @@ -52,13 +59,19 @@ entry: ret i64 %res } -; CHECK-LABEL: multi_block -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0) define i32 @multi_block(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @multi_block( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[B:%.*]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP3]], i32 0) +; CHECK-NEXT: br label [[BB_1:%.*]] +; CHECK: bb.1: +; CHECK-NEXT: [[RES:%.*]] = add i32 [[TMP4]], [[ACC:%.*]] +; CHECK-NEXT: ret i32 [[RES]] +; entry: %ld.a.0 = load i16, i16* %a %sext.a.0 = sext i16 %ld.a.0 to i32 @@ -80,13 +93,19 @@ bb.1: ret i32 %res } -; CHECK-LABEL: multi_block_64 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 0) define i64 @multi_block_64(i16* %a, i16* %b, i64 %acc) { +; CHECK-LABEL: @multi_block_64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[B:%.*]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP1]], i32 [[TMP3]], i64 0) +; CHECK-NEXT: br label [[BB_1:%.*]] +; CHECK: bb.1: +; CHECK-NEXT: [[RES:%.*]] = add i64 [[TMP4]], [[ACC:%.*]] +; CHECK-NEXT: ret i64 [[RES]] +; entry: %ld.a.0 = load i16, i16* %a %sext.a.0 = sext i16 %ld.a.0 to i32 @@ -110,9 +129,27 @@ bb.1: ret i64 %res } -; CHECK-LABEL: multi_block_1 -; CHECK-NOT: call i32 @llvm.arm.smlad define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @multi_block_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A:%.*]], align 2 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B:%.*]], align 2 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_B_0]] +; CHECK-NEXT: br label [[BB_1:%.*]] +; CHECK: bb.1: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B]], i32 1 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[SEXT_A_1]], [[SEXT_B_1]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD]], [[ACC:%.*]] +; CHECK-NEXT: ret i32 [[RES]] +; entry: %ld.a.0 = load i16, i16* %a %sext.a.0 = sext i16 %ld.a.0 to i32 @@ -136,12 +173,44 @@ bb.1: ; TODO: Four smlads should be generated here, but mul.0 and mul.3 remain as ; scalars. -; CHECK-LABEL: num_load_limit -; CHECK: call i32 @llvm.arm.smlad -; CHECK: call i32 @llvm.arm.smlad -; CHECK: call i32 @llvm.arm.smlad -; CHECK-NOT: call i32 @llvm.arm.smlad define i32 @num_load_limit(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @num_load_limit( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B:%.*]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP10]] +; CHECK-NEXT: [[ADDR_B_3:%.*]] = getelementptr i16, i16* [[B]], i32 3 +; CHECK-NEXT: [[LD_B_3:%.*]] = load i16, i16* [[ADDR_B_3]], align 2 +; CHECK-NEXT: [[SEXT_B_3:%.*]] = sext i16 [[LD_B_3]] to i32 +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[TMP6]], [[SEXT_B_3]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[MUL_3]], [[ACC:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[MUL_0]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 [[TMP12]]) +; CHECK-NEXT: [[ADDR_A_4:%.*]] = getelementptr i16, i16* [[A]], i32 4 +; CHECK-NEXT: [[ADDR_B_4:%.*]] = getelementptr i16, i16* [[B]], i32 4 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[ADDR_A_4]] to i32* +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16* [[ADDR_B_4]] to i32* +; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP15]], i32 [[TMP17]], i32 [[TMP13]]) +; CHECK-NEXT: [[ADDR_A_6:%.*]] = getelementptr i16, i16* [[A]], i32 6 +; CHECK-NEXT: [[ADDR_B_6:%.*]] = getelementptr i16, i16* [[B]], i32 6 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16* [[ADDR_A_6]] to i32* +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 2 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16* [[ADDR_B_6]] to i32* +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 2 +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP20]], i32 [[TMP22]], i32 [[TMP18]]) +; CHECK-NEXT: ret i32 [[TMP23]] +; entry: %ld.a.0 = load i16, i16* %a %sext.a.0 = sext i16 %ld.a.0 to i32 @@ -212,9 +281,72 @@ entry: ret i32 %res } -; CHECK-LABEL: too_many_loads -; CHECK-NOT: call i32 @llvm.arm.smlad define i32 @too_many_loads(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @too_many_loads( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A:%.*]], align 2 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B:%.*]], align 2 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_B_0]] +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B]], i32 1 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[SEXT_A_1]], [[SEXT_B_1]] +; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_B_0]] +; CHECK-NEXT: [[ADDR_B_3:%.*]] = getelementptr i16, i16* [[B]], i32 3 +; CHECK-NEXT: [[LD_B_3:%.*]] = load i16, i16* [[ADDR_B_3]], align 2 +; CHECK-NEXT: [[SEXT_B_3:%.*]] = sext i16 [[LD_B_3]] to i32 +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[SEXT_A_1]], [[SEXT_B_3]] +; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[MUL_2]], [[MUL_3]] +; CHECK-NEXT: [[ADDR_A_4:%.*]] = getelementptr i16, i16* [[A]], i32 4 +; CHECK-NEXT: [[ADDR_B_4:%.*]] = getelementptr i16, i16* [[B]], i32 4 +; CHECK-NEXT: [[LD_A_4:%.*]] = load i16, i16* [[ADDR_A_4]], align 2 +; CHECK-NEXT: [[SEXT_A_4:%.*]] = sext i16 [[LD_A_4]] to i32 +; CHECK-NEXT: [[LD_B_4:%.*]] = load i16, i16* [[ADDR_B_4]], align 2 +; CHECK-NEXT: [[SEXT_B_4:%.*]] = sext i16 [[LD_B_4]] to i32 +; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[SEXT_A_4]], [[SEXT_B_4]] +; CHECK-NEXT: [[ADDR_A_5:%.*]] = getelementptr i16, i16* [[A]], i32 5 +; CHECK-NEXT: [[ADDR_B_5:%.*]] = getelementptr i16, i16* [[B]], i32 5 +; CHECK-NEXT: [[LD_A_5:%.*]] = load i16, i16* [[ADDR_A_5]], align 2 +; CHECK-NEXT: [[SEXT_A_5:%.*]] = sext i16 [[LD_A_5]] to i32 +; CHECK-NEXT: [[LD_B_5:%.*]] = load i16, i16* [[ADDR_B_5]], align 2 +; CHECK-NEXT: [[SEXT_B_5:%.*]] = sext i16 [[LD_B_5]] to i32 +; CHECK-NEXT: [[MUL_5:%.*]] = mul i32 [[SEXT_A_5]], [[SEXT_B_5]] +; CHECK-NEXT: [[ADD_5:%.*]] = add i32 [[MUL_4]], [[MUL_5]] +; CHECK-NEXT: [[ADDR_A_6:%.*]] = getelementptr i16, i16* [[A]], i32 6 +; CHECK-NEXT: [[ADDR_B_6:%.*]] = getelementptr i16, i16* [[B]], i32 6 +; CHECK-NEXT: [[LD_A_6:%.*]] = load i16, i16* [[ADDR_A_6]], align 2 +; CHECK-NEXT: [[SEXT_A_6:%.*]] = sext i16 [[LD_A_6]] to i32 +; CHECK-NEXT: [[LD_B_6:%.*]] = load i16, i16* [[ADDR_B_6]], align 2 +; CHECK-NEXT: [[SEXT_B_6:%.*]] = sext i16 [[LD_B_6]] to i32 +; CHECK-NEXT: [[MUL_6:%.*]] = mul i32 [[SEXT_A_6]], [[SEXT_B_6]] +; CHECK-NEXT: [[ADDR_A_7:%.*]] = getelementptr i16, i16* [[A]], i32 7 +; CHECK-NEXT: [[ADDR_B_7:%.*]] = getelementptr i16, i16* [[B]], i32 7 +; CHECK-NEXT: [[LD_A_7:%.*]] = load i16, i16* [[ADDR_A_7]], align 2 +; CHECK-NEXT: [[SEXT_A_7:%.*]] = sext i16 [[LD_A_7]] to i32 +; CHECK-NEXT: [[LD_B_7:%.*]] = load i16, i16* [[ADDR_B_7]], align 2 +; CHECK-NEXT: [[SEXT_B_7:%.*]] = sext i16 [[LD_B_7]] to i32 +; CHECK-NEXT: [[MUL_7:%.*]] = mul i32 [[SEXT_A_7]], [[SEXT_B_7]] +; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[MUL_6]], [[MUL_7]] +; CHECK-NEXT: [[ADDR_A_8:%.*]] = getelementptr i16, i16* [[A]], i32 7 +; CHECK-NEXT: [[ADDR_B_8:%.*]] = getelementptr i16, i16* [[B]], i32 7 +; CHECK-NEXT: [[LD_A_8:%.*]] = load i16, i16* [[ADDR_A_8]], align 2 +; CHECK-NEXT: [[SEXT_A_8:%.*]] = sext i16 [[LD_A_8]] to i32 +; CHECK-NEXT: [[LD_B_8:%.*]] = load i16, i16* [[ADDR_B_8]], align 2 +; CHECK-NEXT: [[SEXT_B_8:%.*]] = sext i16 [[LD_B_8]] to i32 +; CHECK-NEXT: [[MUL_8:%.*]] = mul i32 [[SEXT_A_8]], [[SEXT_B_8]] +; CHECK-NEXT: [[ADD_10:%.*]] = add i32 [[ADD_7]], [[ADD_5]] +; CHECK-NEXT: [[ADD_11:%.*]] = add i32 [[ADD_3]], [[ADD_0]] +; CHECK-NEXT: [[ADD_12:%.*]] = add i32 [[ADD_10]], [[ADD_11]] +; CHECK-NEXT: [[ADD_13:%.*]] = add i32 [[ADD_12]], [[ACC:%.*]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD_13]], [[MUL_8]] +; CHECK-NEXT: ret i32 [[RES]] +; entry: %ld.a.0 = load i16, i16* %a %sext.a.0 = sext i16 %ld.a.0 to i32 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll index e90284b..5fcef9d 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll @@ -1,12 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s -; CHECK-LABEL: exchange_1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @exchange_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP14]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP10]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -25,13 +53,40 @@ entry: ret i32 %res } -; CHECK-LABEL: exchange_2 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @exchange_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP11]], [[TMP6]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP10]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -50,13 +105,40 @@ entry: ret i32 %res } -; CHECK-LABEL: exchange_3 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @exchange_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP8]], i32 [[TMP1]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP14]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_1]], [[MUL_0]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP10]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -75,13 +157,40 @@ entry: ret i32 %res } -; CHECK-LABEL: exchange_4 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @exchange_4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP8]], i32 [[TMP1]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP11]], [[TMP6]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_1]], [[MUL_0]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP10]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -100,17 +209,58 @@ entry: ret i32 %res } -; CHECK-LABEL: exchange_multi_use_1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc -; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]]) define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @exchange_multi_use_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP14]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_A_3:%.*]] = getelementptr i16, i16* [[A]], i32 3 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP16]], i32 [[TMP8]], i32 [[TMP10]]) +; CHECK-NEXT: [[TMP19:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP16]], 16 +; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 +; CHECK-NEXT: [[TMP22:%.*]] = sext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[LD_A_3:%.*]] = load i16, i16* [[ADDR_A_3]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP22]], [[TMP14]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[TMP19]], [[TMP11]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]] +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD]], [[ADD_1]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP18]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -139,17 +289,59 @@ entry: ret i32 %res } -; CHECK-LABEL: exchange_multi_use_64_1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc -; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]]) define i64 @exchange_multi_use_64_1(i16* %a, i16* %b, i64 %acc) { +; CHECK-LABEL: @exchange_multi_use_64_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP1]], i32 [[TMP8]], i64 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP14]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_A_3:%.*]] = getelementptr i16, i16* [[A]], i32 3 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP16]], i32 [[TMP8]], i64 [[TMP10]]) +; CHECK-NEXT: [[TMP19:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP16]], 16 +; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 +; CHECK-NEXT: [[TMP22:%.*]] = sext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[LD_A_3:%.*]] = load i16, i16* [[ADDR_A_3]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP22]], [[TMP14]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[TMP19]], [[TMP11]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]] +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD]], [[ADD_1]] +; CHECK-NEXT: [[SEXT_ADD_2:%.*]] = sext i32 [[ADD_2]] to i64 +; CHECK-NEXT: [[RES:%.*]] = add i64 [[SEXT_ADD_2]], [[ACC]] +; CHECK-NEXT: ret i64 [[TMP18]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -179,17 +371,60 @@ entry: ret i64 %res } -; CHECK-LABEL: exchange_multi_use_64_2 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc -; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]]) define i64 @exchange_multi_use_64_2(i16* %a, i16* %b, i64 %acc) { +; CHECK-LABEL: @exchange_multi_use_64_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP1]], i32 [[TMP8]], i64 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP14]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[SEXT_ADD:%.*]] = sext i32 [[ADD]] to i64 +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_A_3:%.*]] = getelementptr i16, i16* [[A]], i32 3 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP16]], i32 [[TMP8]], i64 [[TMP10]]) +; CHECK-NEXT: [[TMP19:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP16]], 16 +; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 +; CHECK-NEXT: [[TMP22:%.*]] = sext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[LD_A_3:%.*]] = load i16, i16* [[ADDR_A_3]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP22]], [[TMP14]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[TMP19]], [[TMP11]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]] +; CHECK-NEXT: [[SEXT_ADD_1:%.*]] = sext i32 [[ADD_1]] to i64 +; CHECK-NEXT: [[ADD_2:%.*]] = add i64 [[SEXT_ADD]], [[SEXT_ADD_1]] +; CHECK-NEXT: [[RES:%.*]] = add i64 [[ADD_2]], [[ACC]] +; CHECK-NEXT: ret i64 [[TMP18]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -220,17 +455,58 @@ entry: ret i64 %res } -; CHECK-LABEL: exchange_multi_use_2 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc -; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]]) define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @exchange_multi_use_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_A_3:%.*]] = getelementptr i16, i16* [[A]], i32 3 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP8]], i32 [[TMP16]], i32 [[TMP10]]) +; CHECK-NEXT: [[TMP19:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP16]], 16 +; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 +; CHECK-NEXT: [[TMP22:%.*]] = sext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[LD_A_3:%.*]] = load i16, i16* [[ADDR_A_3]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP11]], [[TMP22]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[TMP14]], [[TMP19]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]] +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD]], [[ADD_1]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP18]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -260,15 +536,50 @@ entry: } ; TODO: Why aren't two intrinsics generated? -; CHECK-LABEL: exchange_multi_use_3 -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK-NOT: call i32 @llvm.arm.smlad -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0 define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @exchange_multi_use_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_A_3:%.*]] = getelementptr i16, i16* [[A]], i32 3 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_3:%.*]] = load i16, i16* [[ADDR_A_3]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP3]], [[TMP14]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[TMP3]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[SEXT_A_1]], [[TMP6]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[ADD]], [[TMP10]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ACC:%.*]], [[SUB]] +; CHECK-NEXT: ret i32 [[RES]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -298,17 +609,60 @@ entry: } ; TODO: Would it be better to generate a smlad and then sign extend it? -; CHECK-LABEL: exchange_multi_use_64_3 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 0) -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]]) define i64 @exchange_multi_use_64_3(i16* %a, i16* %b, i64 %acc) { +; CHECK-LABEL: @exchange_multi_use_64_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i16 +; CHECK-NEXT: [[TMP13:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_A_3:%.*]] = getelementptr i16, i16* [[A]], i32 3 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP8]], i32 [[TMP15]], i64 0) +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP1]], i32 [[TMP8]], i64 [[TMP17]]) +; CHECK-NEXT: [[TMP19:%.*]] = sext i16 [[TMP16]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP15]], 16 +; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 +; CHECK-NEXT: [[TMP22:%.*]] = sext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[LD_A_3:%.*]] = load i16, i16* [[ADDR_A_3]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP10]], [[TMP22]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP10]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP13]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]] +; CHECK-NEXT: [[SEXT_ADD:%.*]] = sext i32 [[ADD]] to i64 +; CHECK-NEXT: [[SEXT_ADD_1:%.*]] = sext i32 [[ADD_1]] to i64 +; CHECK-NEXT: [[ADD_2:%.*]] = add i64 [[SEXT_ADD]], [[SEXT_ADD_1]] +; CHECK-NEXT: [[RES:%.*]] = sub i64 [[ACC:%.*]], [[TMP18]] +; CHECK-NEXT: ret i64 [[RES]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -340,14 +694,50 @@ entry: } ; TODO: Why isn't smladx generated too? -; CHECK-LABEL: exchange_multi_use_4 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0 -; CHECK-NOT: call i32 @llvm.arm.smlad define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @exchange_multi_use_4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_A_3:%.*]] = getelementptr i16, i16* [[A]], i32 3 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[LD_A_3:%.*]] = load i16, i16* [[ADDR_A_3]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP11]], [[SEXT_A_3]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[TMP14]], [[SEXT_A_2]] +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[TMP10]], [[ADD_1]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ACC:%.*]], [[SUB]] +; CHECK-NEXT: ret i32 [[RES]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -376,13 +766,40 @@ entry: ret i32 %res } -; CHECK-LABEL: exchange_swap -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @exchange_swap( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP8]], i32 [[TMP1]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP3]], [[TMP14]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP10]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -401,13 +818,40 @@ entry: ret i32 %res } -; CHECK-LABEL: exchange_swap_2 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @exchange_swap_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP3]], [[TMP14]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_1]], [[MUL_0]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP10]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -426,13 +870,40 @@ entry: ret i32 %res } -; CHECK-LABEL: exchange_swap_3 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @exchange_swap_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP11]], [[TMP6]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_1]], [[MUL_0]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP10]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll b/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll index 542202c..38f97b9 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll @@ -1,27 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=thumbv7em -arm-parallel-dsp -dce -S %s -o - | FileCheck %s -; CHECK-LABEL: full_unroll -; CHECK: [[IV:%[^ ]+]] = phi i32 -; CHECK: [[AI:%[^ ]+]] = getelementptr inbounds i32, i32* %a, i32 [[IV]] -; CHECK: [[BI:%[^ ]+]] = getelementptr inbounds i16*, i16** %b, i32 [[IV]] -; CHECK: [[BIJ:%[^ ]+]] = load i16*, i16** %arrayidx5, align 4 -; CHECK: [[CI:%[^ ]+]] = getelementptr inbounds i16*, i16** %c, i32 [[IV]] -; CHECK: [[CIJ:%[^ ]+]] = load i16*, i16** [[CI]], align 4 -; CHECK: [[BIJ_CAST:%[^ ]+]] = bitcast i16* [[BIJ]] to i32* -; CHECK: [[BIJ_LD:%[^ ]+]] = load i32, i32* [[BIJ_CAST]], align 2 -; CHECK: [[CIJ_CAST:%[^ ]+]] = bitcast i16* [[CIJ]] to i32* -; CHECK: [[CIJ_LD:%[^ ]+]] = load i32, i32* [[CIJ_CAST]], align 2 -; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 0) -; CHECK: [[BIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[BIJ]], i32 2 -; CHECK: [[BIJ_2_CAST:%[^ ]+]] = bitcast i16* [[BIJ_2]] to i32* -; CHECK: [[BIJ_2_LD:%[^ ]+]] = load i32, i32* [[BIJ_2_CAST]], align 2 -; CHECK: [[CIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[CIJ]], i32 2 -; CHECK: [[CIJ_2_CAST:%[^ ]+]] = bitcast i16* [[CIJ_2]] to i32* -; CHECK: [[CIJ_2_LD:%[^ ]+]] = load i32, i32* [[CIJ_2_CAST]], align 2 -; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 [[SMLAD0]]) -; CHECK: store i32 [[SMLAD1]], i32* %arrayidx, align 4 - define void @full_unroll(i32* noalias nocapture %a, i16** noalias nocapture readonly %b, i16** noalias nocapture readonly %c, i32 %N) { +; CHECK-LABEL: @full_unroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP29:%.*]] = icmp eq i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP29]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_030:%.*]] = phi i32 [ [[INC12:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_030]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16*, i16** [[B:%.*]], i32 [[I_030]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16*, i16** [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16*, i16** [[C:%.*]], i32 [[I_030]] +; CHECK-NEXT: [[TMP1:%.*]] = load i16*, i16** [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP0]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP1]] to i32* +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP5]], i32 [[TMP3]], i32 0) +; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[ARRAYIDX6_2]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[ARRAYIDX8_2]] to i32* +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP10]], i32 [[TMP8]], i32 [[TMP6]]) +; CHECK-NEXT: store i32 [[TMP11]], i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INC12]] = add nuw i32 [[I_030]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC12]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; entry: %cmp29 = icmp eq i32 %N, 0 br i1 %cmp29, label %for.cond.cleanup, label %for.body @@ -71,36 +81,45 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; CHECK-LABEL: full_unroll_sub -; CHECK: [[IV:%[^ ]+]] = phi i32 -; CHECK: [[AI:%[^ ]+]] = getelementptr inbounds i32, i32* %a, i32 [[IV]] -; CHECK: [[BI:%[^ ]+]] = getelementptr inbounds i16*, i16** %b, i32 [[IV]] -; CHECK: [[BIJ:%[^ ]+]] = load i16*, i16** [[BI]], align 4 -; CHECK: [[CI:%[^ ]+]] = getelementptr inbounds i16*, i16** %c, i32 [[IV]] -; CHECK: [[CIJ:%[^ ]+]] = load i16*, i16** [[CI]], align 4 -; CHECK: [[BIJ_LD:%[^ ]+]] = load i16, i16* [[BIJ]], align 2 -; CHECK: [[BIJ_LD_SXT:%[^ ]+]] = sext i16 [[BIJ_LD]] to i32 -; CHECK: [[CIJ_LD:%[^ ]+]] = load i16, i16* [[CIJ]], align 2 -; CHECK: [[CIJ_LD_SXT:%[^ ]+]] = sext i16 [[CIJ_LD]] to i32 -; CHECK: [[SUB:%[^ ]+]] = sub nsw i32 [[CIJ_LD_SXT]], [[BIJ_LD_SXT]] -; CHECK: [[BIJ_1:%[^ ]+]] = getelementptr inbounds i16, i16* [[BIJ]], i32 1 -; CHECK: [[BIJ_1_LD:%[^ ]+]] = load i16, i16* [[BIJ_1]], align 2 -; CHECK: [[BIJ_1_LD_SXT:%[^ ]+]] = sext i16 [[BIJ_1_LD]] to i32 -; CHECK: [[CIJ_1:%[^ ]+]] = getelementptr inbounds i16, i16* [[CIJ]], i32 1 -; CHECK: [[CIJ_1_LD:%[^ ]+]] = load i16, i16* [[CIJ_1]], align 2 -; CHECK: [[CIJ_1_LD_SXT:%[^ ]+]] = sext i16 [[CIJ_1_LD]] to i32 -; CHECK: [[MUL:%[^ ]+]] = mul nsw i32 [[CIJ_1_LD_SXT]], [[BIJ_1_LD_SXT]] -; CHECK: [[ACC:%[^ ]+]] = add nsw i32 [[MUL]], [[SUB]] -; CHECK: [[BIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[BIJ]], i32 2 -; CHECK: [[BIJ_2_CAST:%[^ ]+]] = bitcast i16* [[BIJ_2]] to i32* -; CHECK: [[BIJ_2_LD:%[^ ]+]] = load i32, i32* [[BIJ_2_CAST]], align 2 -; CHECK: [[CIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[CIJ]], i32 2 -; CHECK: [[CIJ_2_CAST:%[^ ]+]] = bitcast i16* [[CIJ_2]] to i32* -; CHECK: [[CIJ_2_LD:%[^ ]+]] = load i32, i32* [[CIJ_2_CAST]], align 2 -; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 [[ACC]]) -; CHECK: store i32 [[SMLAD0]], i32* %arrayidx, align 4 - define void @full_unroll_sub(i32* noalias nocapture %a, i16** noalias nocapture readonly %b, i16** noalias nocapture readonly %c, i32 %N) { +; CHECK-LABEL: @full_unroll_sub( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP29:%.*]] = icmp eq i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP29]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_030:%.*]] = phi i32 [ [[INC12:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_030]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16*, i16** [[B:%.*]], i32 [[I_030]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16*, i16** [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16*, i16** [[C:%.*]], i32 [[I_030]] +; CHECK-NEXT: [[TMP1:%.*]] = load i16*, i16** [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 2 +; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV9]], [[CONV]] +; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX6_1]], align 2 +; CHECK-NEXT: [[CONV_1:%.*]] = sext i16 [[TMP4]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX8_1]], align 2 +; CHECK-NEXT: [[CONV9_1:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[CONV9_1]], [[CONV_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[MUL_1]], [[SUB]] +; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX6_2]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 2 +; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[ARRAYIDX8_2]] to i32* +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP9]], i32 [[TMP7]], i32 [[ADD_1]]) +; CHECK-NEXT: store i32 [[TMP10]], i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INC12]] = add nuw i32 [[I_030]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC12]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; entry: %cmp29 = icmp eq i32 %N, 0 br i1 %cmp29, label %for.cond.cleanup, label %for.body diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll index f807149..09fcf33 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll @@ -1,20 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s -; CHECK-LABEL: overlap_1 -; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 -; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc) -; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32* -; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]] -; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32* -; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 [[ACC]]) -; CHECK: ret i32 [[RES]] define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @overlap_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16* [[ADDR_A_1]] to i32* +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP16]], 16 +; CHECK-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i16 +; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[ADDR_B_1]] to i32* +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = trunc i32 [[TMP23]] to i16 +; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP16]], i32 [[TMP23]], i32 [[TMP10]]) +; CHECK-NEXT: [[TMP26:%.*]] = sext i16 [[TMP24]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = lshr i32 [[TMP23]], 16 +; CHECK-NEXT: [[TMP28:%.*]] = trunc i32 [[TMP27]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]] +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[LD_B_2:%.*]] = load i16, i16* [[ADDR_B_2]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP21]], [[TMP29]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_1]], [[MUL_2]] +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP25]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -44,21 +88,66 @@ entry: ; TODO: Is it really best to generate smlald for the first instruction? Does ; this just increase register pressure unnecessarily? -; CHECK-LABEL: overlap_64_1 -; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 -; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 %acc) -; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32* -; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]] -; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32* -; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 [[ACC]]) -; CHECK: ret i64 [[RES]] define i64 @overlap_64_1(i16* %a, i16* %b, i64 %acc) { +; CHECK-LABEL: @overlap_64_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP1]], i32 [[TMP8]], i64 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16* [[ADDR_A_1]] to i32* +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP16]], 16 +; CHECK-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i16 +; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[ADDR_B_1]] to i32* +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = trunc i32 [[TMP23]] to i16 +; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP16]], i32 [[TMP23]], i64 [[TMP10]]) +; CHECK-NEXT: [[TMP26:%.*]] = sext i16 [[TMP24]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = lshr i32 [[TMP23]], 16 +; CHECK-NEXT: [[TMP28:%.*]] = trunc i32 [[TMP27]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]] +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[LD_B_2:%.*]] = load i16, i16* [[ADDR_B_2]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP21]], [[TMP29]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_1]], [[MUL_2]] +; CHECK-NEXT: [[SEXT_ADD:%.*]] = sext i32 [[ADD]] to i64 +; CHECK-NEXT: [[SEXT_ADD_1:%.*]] = sext i32 [[ADD_1]] to i64 +; CHECK-NEXT: [[ADD_2:%.*]] = add i64 [[SEXT_ADD_1]], [[SEXT_ADD]] +; CHECK-NEXT: [[RES:%.*]] = add i64 [[ADD_2]], [[ACC]] +; CHECK-NEXT: ret i64 [[TMP25]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -88,18 +177,51 @@ entry: ret i64 %res } -; CHECK-LABEL: overlap_2 -; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 -; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[ACC1:%[^ ]+]] = add i32 %mul.1, %acc -; CHECK: [[ACC2:%[^ ]+]] = add i32 %mul.2, [[ACC1]] -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC2]]) -; CHECK: ret i32 [[RES]] define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @overlap_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i16 +; CHECK-NEXT: [[TMP13:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP10]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[MUL_1]], [[ACC:%.*]] +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[LD_B_2:%.*]] = load i16, i16* [[ADDR_B_2]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[SEXT_B_2]], [[SEXT_A_2]] +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[MUL_2]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 [[TMP15]]) +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_1]], [[MUL_2]] +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD]], [[ADD_1]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP16]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -127,21 +249,68 @@ entry: ret i32 %res } -; CHECK-LABEL: overlap_3 -; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc) -; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* -; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]]) -; CHECK: ret i32 [[RES]] define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @overlap_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16* [[ADDR_B_1]] to i32* +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP16]], 16 +; CHECK-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i16 +; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]] +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2 +; CHECK-NEXT: [[ADDR_A_3:%.*]] = getelementptr i16, i16* [[A]], i32 3 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = trunc i32 [[TMP23]] to i16 +; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP23]], i32 [[TMP16]], i32 [[TMP10]]) +; CHECK-NEXT: [[TMP26:%.*]] = sext i16 [[TMP24]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = lshr i32 [[TMP23]], 16 +; CHECK-NEXT: [[TMP28:%.*]] = trunc i32 [[TMP27]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32 +; CHECK-NEXT: [[LD_B_2:%.*]] = load i16, i16* [[ADDR_B_2]], align 2 +; CHECK-NEXT: [[LD_A_3:%.*]] = load i16, i16* [[ADDR_A_3]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32 +; CHECK-NEXT: [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP26]], [[TMP14]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[TMP29]], [[TMP21]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]] +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP25]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 @@ -173,21 +342,68 @@ entry: ret i32 %res } -; CHECK-LABEL: overlap_4 -; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc) -; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* -; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* -; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]]) -; CHECK: ret i32 [[RES]] define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @overlap_4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1 +; CHECK-NEXT: [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_A_0:%.*]] = load i16, i16* [[A]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32 +; CHECK-NEXT: [[LD_B_0:%.*]] = load i16, i16* [[B]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[B]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]], align 2 +; CHECK-NEXT: [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16* [[ADDR_B_1]] to i32* +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP16]], 16 +; CHECK-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i16 +; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32 +; CHECK-NEXT: [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32 +; CHECK-NEXT: [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]] +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2 +; CHECK-NEXT: [[ADDR_A_3:%.*]] = getelementptr i16, i16* [[A]], i32 3 +; CHECK-NEXT: [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]], align 2 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = trunc i32 [[TMP23]] to i16 +; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP23]], i32 [[TMP16]], i32 [[TMP10]]) +; CHECK-NEXT: [[TMP26:%.*]] = sext i16 [[TMP24]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = lshr i32 [[TMP23]], 16 +; CHECK-NEXT: [[TMP28:%.*]] = trunc i32 [[TMP27]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP28]] to i32 +; CHECK-NEXT: [[LD_B_2:%.*]] = load i16, i16* [[ADDR_B_2]], align 2 +; CHECK-NEXT: [[LD_A_3:%.*]] = load i16, i16* [[ADDR_A_3]], align 2 +; CHECK-NEXT: [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32 +; CHECK-NEXT: [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32 +; CHECK-NEXT: [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[TMP21]], [[TMP26]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[TMP14]], [[TMP29]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]] +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[ADD]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]] +; CHECK-NEXT: ret i32 [[TMP25]] +; entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 %addr.b.1 = getelementptr i16, i16* %b, i32 1 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll index 7620b64f..caf5bb3 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll @@ -1,30 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=thumbv7-unknown-linux-gnueabihf -arm-parallel-dsp -dce %s -S -o - | FileCheck %s -; CHECK-LABEL: first_mul_invalid -; CHECK: [[ADDR_IN_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -1 -; CHECK: [[LD_IN_MINUS_1:%[^ ]+]] = load i16, i16* [[ADDR_IN_MINUS_1]], align 2 -; CHECK: [[IN_MINUS_1:%[^ ]+]] = sext i16 [[LD_IN_MINUS_1]] to i32 -; CHECK: [[ADDR_B_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 1 -; CHECK: [[LD_B_PLUS_1:%[^ ]+]] = load i16, i16* [[ADDR_B_PLUS_1]], align 2 -; CHECK: [[B_PLUS_1:%[^ ]+]] = sext i16 [[LD_B_PLUS_1]] to i32 -; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[B_PLUS_1]], [[IN_MINUS_1]] -; CHECK: [[ADD0:%[^ ]+]] = add i32 [[MUL0]], %call -; CHECK: [[ADDR_IN_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -3 -; CHECK: [[CAST_ADDR_IN_MINUS_3:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_3]] to i32* -; CHECK: [[IN_MINUS_3:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_3]], align 2 -; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 -; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* -; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ADD0]]) -; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 -; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* -; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 -; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 -; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* -; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ACC]]) -; CHECK: ret i32 [[RES]] define i32 @first_mul_invalid(i16* nocapture readonly %in, i16* nocapture readonly %b) { +; CHECK-LABEL: @first_mul_invalid( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[IN:%.*]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[B:%.*]], align 2 +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @bar(i32 [[CONV]], i32 [[CONV2]]) +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[IN]], i32 -1 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV6]], [[CONV4]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[CALL]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[IN]], i32 -3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX13]] to i32* +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 2 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX9]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP5]], i32 [[TMP7]], i32 [[ADD]]) +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i16, i16* [[IN]], i32 -5 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[ARRAYIDX25]] to i32* +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 4 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16* [[ARRAYIDX21]] to i32* +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP10]], i32 [[TMP12]], i32 [[TMP8]]) +; CHECK-NEXT: ret i32 [[TMP13]] +; entry: %0 = load i16, i16* %in, align 2 %conv = sext i16 %0 to i32 @@ -74,30 +82,32 @@ entry: ret i32 %add30 } -; CHECK-LABEL: with_no_acc_input -; CHECK: [[ADDR_IN_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -1 -; CHECK: [[LD_IN_MINUS_1:%[^ ]+]] = load i16, i16* [[ADDR_IN_MINUS_1]], align 2 -; CHECK: [[IN_MINUS_1:%[^ ]+]] = sext i16 [[LD_IN_MINUS_1]] to i32 -; CHECK: [[ADDR_B_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 1 -; CHECK: [[LD_B_PLUS_1:%[^ ]+]] = load i16, i16* [[ADDR_B_PLUS_1]], align 2 -; CHECK: [[B_PLUS_1:%[^ ]+]] = sext i16 [[LD_B_PLUS_1]] to i32 -; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[B_PLUS_1]], [[IN_MINUS_1]] -; CHECK: [[ADDR_IN_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -3 -; CHECK: [[CAST_ADDR_IN_MINUS_3:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_3]] to i32* -; CHECK: [[IN_MINUS_3:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_3]], align 2 -; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 -; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* -; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[MUL0]]) -; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 -; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* -; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 -; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 -; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* -; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ACC]]) -; CHECK: ret i32 [[RES]] define i32 @with_no_acc_input(i16* nocapture readonly %in, i16* nocapture readonly %b) { +; CHECK-LABEL: @with_no_acc_input( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[IN:%.*]], i32 -1 +; CHECK-NEXT: [[LD_2:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[LD_2]] to i32 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 1 +; CHECK-NEXT: [[LD_3:%.*]] = load i16, i16* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[LD_3]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV6]], [[CONV4]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[IN]], i32 -3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[ARRAYIDX13]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX9]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP3]], i32 [[MUL]]) +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i16, i16* [[IN]], i32 -5 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[ARRAYIDX25]] to i32* +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[ARRAYIDX21]] to i32* +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP6]], i32 [[TMP8]], i32 [[TMP4]]) +; CHECK-NEXT: ret i32 [[TMP9]] +; entry: %arrayidx3 = getelementptr inbounds i16, i16* %in, i32 -1 %ld.2 = load i16, i16* %arrayidx3, align 2 @@ -141,32 +151,40 @@ entry: ret i32 %add30 } -; CHECK-LABEL: with_64bit_acc -; CHECK: [[ADDR_IN_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -1 -; CHECK: [[LD_IN_MINUS_1:%[^ ]+]] = load i16, i16* [[ADDR_IN_MINUS_1]], align 2 -; CHECK: [[IN_MINUS_1:%[^ ]+]] = sext i16 [[LD_IN_MINUS_1]] to i32 -; CHECK: [[ADDR_B_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 1 -; CHECK: [[LD_B_PLUS_1:%[^ ]+]] = load i16, i16* [[ADDR_B_PLUS_1]], align 2 -; CHECK: [[B_PLUS_1:%[^ ]+]] = sext i16 [[LD_B_PLUS_1]] to i32 -; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[B_PLUS_1]], [[IN_MINUS_1]] -; CHECK: [[SEXT1:%[^ ]+]] = sext i32 [[MUL0]] to i64 -; CHECK: [[ADD0:%[^ ]+]] = add i64 %sext.0, [[SEXT1]] -; CHECK: [[ADDR_IN_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -3 -; CHECK: [[CAST_ADDR_IN_MINUS_3:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_3]] to i32* -; CHECK: [[IN_MINUS_3:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_3]], align 2 -; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 -; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* -; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i64 [[ADD0]]) -; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 -; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* -; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 -; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 -; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* -; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ACC]]) -; CHECK: ret i64 [[RES]] define i64 @with_64bit_acc(i16* nocapture readonly %in, i16* nocapture readonly %b) { +; CHECK-LABEL: @with_64bit_acc( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[IN:%.*]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[B:%.*]], align 2 +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @bar(i32 [[CONV]], i32 [[CONV2]]) +; CHECK-NEXT: [[SEXT_0:%.*]] = sext i32 [[CALL]] to i64 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[IN]], i32 -1 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2 +; CHECK-NEXT: [[CONV4:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV6]], [[CONV4]] +; CHECK-NEXT: [[SEXT_1:%.*]] = sext i32 [[MUL]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[SEXT_0]], [[SEXT_1]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[IN]], i32 -3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX13]] to i32* +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 2 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX9]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP5]], i32 [[TMP7]], i64 [[ADD]]) +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i16, i16* [[IN]], i32 -5 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[ARRAYIDX25]] to i32* +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 4 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16* [[ARRAYIDX21]] to i32* +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP10]], i32 [[TMP12]], i64 [[TMP8]]) +; CHECK-NEXT: ret i64 [[TMP13]] +; entry: %0 = load i16, i16* %in, align 2 %conv = sext i16 %0 to i32 @@ -222,32 +240,37 @@ entry: ret i64 %add30 } -; CHECK: with_64bit_add_acc -; CHECK: [[ADDR_X_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %px.10756.unr, i32 1 -; CHECK: [[X:%[^ ]+]] = load i16, i16* %px.10756.unr, align 2 -; CHECK: [[SEXT_X:%[^ ]+]] = sext i16 [[X]] to i32 -; CHECK: [[ADDR_Y_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %py.8757.unr, i32 -1 -; CHECK: [[Y:%[^ ]+]] = load i16, i16* %py.8757.unr, align 2 -; CHECK: [[SEXT_Y:%[^ ]+]] = sext i16 [[Y]] to i32 -; CHECK: [[MUL0:%[^ ]+]] = mul nsw i32 [[SEXT_Y]], [[SEXT_X]] -; CHECK: [[SEXT_MUL0:%[^ ]+]] = sext i32 [[MUL0]] to i64 -; CHECK: [[ADD_1:%[^ ]+]] = add nsw i64 %sum.3758.unr, [[SEXT_MUL0]] -; CHECK: [[X_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %px.10756.unr, i32 2 -; CHECK: [[X_1:%[^ ]+]] = load i16, i16* [[ADDR_X_PLUS_1]], align 2 -; CHECK: [[SEXT_X_1:%[^ ]+]] = sext i16 [[X_1]] to i32 -; CHECK: [[Y_1:%[^ ]+]] = load i16, i16* [[ADDR_Y_MINUS_1]], align 2 -; CHECK: [[SEXT_Y_1:%[^ ]+]] = sext i16 [[Y_1]] to i32 -; CHECK: [[UNPAIRED:%[^ ]+]] = mul nsw i32 [[SEXT_Y_1]], [[SEXT_X_1]] -; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64 -; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]] -; CHECK: [[ADDR_X_PLUS_2:%[^ ]+]] = bitcast i16* [[X_PLUS_2]] to i32* -; CHECK: [[X_2:%[^ ]+]] = load i32, i32* [[ADDR_X_PLUS_2]], align 2 -; CHECK: [[Y_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %py.8757.unr, i32 -3 -; CHECK: [[ADDR_Y_MINUS_3:%[^ ]+]] = bitcast i16* [[Y_MINUS_3]] to i32* -; CHECK: [[Y_3:%[^ ]+]] = load i32, i32* [[ADDR_Y_MINUS_3]], align 2 -; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[Y_3]], i32 [[X_2]], i64 [[ACC]]) -; CHECK: ret i64 [[RES]] define i64 @with_64bit_add_acc(i16* nocapture readonly %px.10756.unr, i16* nocapture readonly %py.8757.unr, i32 %acc) { +; CHECK-LABEL: @with_64bit_add_acc( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUM_3758_UNR:%.*]] = sext i32 [[ACC:%.*]] to i64 +; CHECK-NEXT: br label [[BB_1:%.*]] +; CHECK: bb.1: +; CHECK-NEXT: [[INCDEC_PTR184_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PX_10756_UNR:%.*]], i32 1 +; CHECK-NEXT: [[TMP216:%.*]] = load i16, i16* [[PX_10756_UNR]], align 2 +; CHECK-NEXT: [[CONV185_EPIL:%.*]] = sext i16 [[TMP216]] to i32 +; CHECK-NEXT: [[INCDEC_PTR186_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PY_8757_UNR:%.*]], i32 -1 +; CHECK-NEXT: [[TMP217:%.*]] = load i16, i16* [[PY_8757_UNR]], align 2 +; CHECK-NEXT: [[CONV187_EPIL:%.*]] = sext i16 [[TMP217]] to i32 +; CHECK-NEXT: [[MUL_EPIL:%.*]] = mul nsw i32 [[CONV187_EPIL]], [[CONV185_EPIL]] +; CHECK-NEXT: [[CONV188_EPIL:%.*]] = sext i32 [[MUL_EPIL]] to i64 +; CHECK-NEXT: [[ADD189_EPIL:%.*]] = add nsw i64 [[SUM_3758_UNR]], [[CONV188_EPIL]] +; CHECK-NEXT: [[INCDEC_PTR190_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PX_10756_UNR]], i32 2 +; CHECK-NEXT: [[TMP218:%.*]] = load i16, i16* [[INCDEC_PTR184_EPIL]], align 2 +; CHECK-NEXT: [[CONV191_EPIL:%.*]] = sext i16 [[TMP218]] to i32 +; CHECK-NEXT: [[TMP219:%.*]] = load i16, i16* [[INCDEC_PTR186_EPIL]], align 2 +; CHECK-NEXT: [[CONV193_EPIL:%.*]] = sext i16 [[TMP219]] to i32 +; CHECK-NEXT: [[MUL194_EPIL:%.*]] = mul nsw i32 [[CONV193_EPIL]], [[CONV191_EPIL]] +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[MUL194_EPIL]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[ADD189_EPIL]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[INCDEC_PTR190_EPIL]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 2 +; CHECK-NEXT: [[INCDEC_PTR199_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PY_8757_UNR]], i32 -3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[INCDEC_PTR199_EPIL]] to i32* +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP5]], i32 [[TMP3]], i64 [[TMP1]]) +; CHECK-NEXT: ret i64 [[TMP6]] +; entry: %sum.3758.unr = sext i32 %acc to i64 br label %bb.1 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/sext-acc.ll b/llvm/test/CodeGen/ARM/ParallelDSP/sext-acc.ll index 192309a..6974a00 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/sext-acc.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/sext-acc.ll @@ -1,13 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -arm-parallel-dsp -dce -mtriple=armv7-a -S %s -o - | FileCheck %s -; CHECK-LABEL: sext_acc_1 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[ACC:%[^ ]+]] = sext i32 %acc to i64 -; CHECK: call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 [[ACC]]) define i64 @sext_acc_1(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @sext_acc_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[B:%.*]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[ACC:%.*]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP1]], i32 [[TMP3]], i64 [[TMP4]]) +; CHECK-NEXT: ret i64 [[TMP5]] +; entry: %ld.a.0 = load i16, i16* %a %sext.a.0 = sext i16 %ld.a.0 to i32 @@ -29,19 +33,24 @@ entry: ret i64 %res } -; CHECK-LABEL: sext_acc_2 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* %addr.a.2 to i32* -; CHECK: [[A_2:%[^ ]+]] = load i32, i32* %4 -; CHECK: [[CAST_B_2:%[^ ]+]] = bitcast i16* %addr.b.2 to i32* -; CHECK: [[B_2:%[^ ]+]] = load i32, i32* %6 -; CHECK: [[ACC:%[^ ]+]] = sext i32 %acc to i64 -; CHECK: [[SMLALD:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 [[ACC]]) -; CHECK: call i64 @llvm.arm.smlald(i32 [[A_2]], i32 [[B_2]], i64 [[SMLALD]]) define i64 @sext_acc_2(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @sext_acc_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[B:%.*]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 2 +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ADDR_B_2]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[ACC:%.*]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP1]], i32 [[TMP3]], i64 [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP5]], i32 [[TMP7]], i64 [[TMP9]]) +; CHECK-NEXT: ret i64 [[TMP10]] +; entry: %ld.a.0 = load i16, i16* %a %sext.a.0 = sext i16 %ld.a.0 to i32 @@ -81,19 +90,24 @@ entry: ret i64 %add.3 } -; CHECK-LABEL: sext_acc_3 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* %addr.a.2 to i32* -; CHECK: [[A_2:%[^ ]+]] = load i32, i32* %4 -; CHECK: [[CAST_B_2:%[^ ]+]] = bitcast i16* %addr.b.2 to i32* -; CHECK: [[B_2:%[^ ]+]] = load i32, i32* %6 -; CHECK: [[ACC:%[^ ]+]] = sext i32 %acc to i64 -; CHECK: [[SMLALD:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 [[ACC]]) -; CHECK: call i64 @llvm.arm.smlald(i32 [[A_2]], i32 [[B_2]], i64 [[SMLALD]]) define i64 @sext_acc_3(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @sext_acc_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[B:%.*]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 2 +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ADDR_B_2]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[ACC:%.*]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP1]], i32 [[TMP3]], i64 [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP5]], i32 [[TMP7]], i64 [[TMP9]]) +; CHECK-NEXT: ret i64 [[TMP10]] +; entry: %ld.a.0 = load i16, i16* %a %sext.a.0 = sext i16 %ld.a.0 to i32 @@ -133,19 +147,24 @@ entry: ret i64 %add.3 } -; CHECK-LABEL: sext_acc_4 -; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* -; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] -; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* -; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] -; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* %addr.a.2 to i32* -; CHECK: [[A_2:%[^ ]+]] = load i32, i32* %4 -; CHECK: [[CAST_B_2:%[^ ]+]] = bitcast i16* %addr.b.2 to i32* -; CHECK: [[B_2:%[^ ]+]] = load i32, i32* %6 -; CHECK: [[ACC:%[^ ]+]] = sext i32 %acc to i64 -; CHECK: [[SMLALD:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 [[ACC]]) -; CHECK: call i64 @llvm.arm.smlald(i32 [[A_2]], i32 [[B_2]], i64 [[SMLALD]]) define i64 @sext_acc_4(i16* %a, i16* %b, i32 %acc) { +; CHECK-LABEL: @sext_acc_4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i32* +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[B:%.*]] to i32* +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 2 +; CHECK-NEXT: [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2 +; CHECK-NEXT: [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ADDR_B_2]] to i32* +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[ACC:%.*]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP1]], i32 [[TMP3]], i64 [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP5]], i32 [[TMP7]], i64 [[TMP9]]) +; CHECK-NEXT: ret i64 [[TMP10]] +; entry: %ld.a.0 = load i16, i16* %a %sext.a.0 = sext i16 %ld.a.0 to i32 -- 2.7.4