From a5573bf030e81ee25b3c4e490c4f79009119aa09 Mon Sep 17 00:00:00 2001
From: Anna Thomas
Date: Mon, 17 Jul 2023 17:17:00 -0400
Subject: [PATCH] [LV] Precommit test for interleaving miscompile

Identified another miscompile while working on fixing interleaving's
current miscompile in D154309. This is different from the testcases
landed in D154309: it showcases incorrect sinking of a store, whereas
the earlier testcases (in that review and in follow-ups) showed
incorrect hoisting of loads across stores.
---
 .../interleaved-accesses-sink-store-across-load.ll | 102 +++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll

diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll
new file mode 100644
index 0000000..bf53833
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-vectorize -mcpu=skx -enable-masked-interleaved-mem-accesses=1 -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2"
+target triple = "x86_64-apple-macos"
+
+; This is currently miscompiled.
+; The %l2 load and the preceding store have a dependency. However, we
+; currently sink that store down into the last store (by creating an
+; interleaved store group), so %l2 loads an incorrect value.
+; We do not release this store group correctly because, when choosing the
+; next interleave group, we compare only the last load in program order (%l3)
+; against the dependent store location (%gep.iv.1.plus.2); since they are
+; different, we incorrectly assume there is no dependency. We need to compare
+; against all loads in that interleaved group (%l2 is part of it).
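+;
+; For illustration only (a sketch, not part of the original test): a rough C
+; equivalent of the scalar loop below, using the fact that %iv.2 is always
+; %iv.1 + 3 (both start 3 apart and step by 3):
+;
+;   for (long i = 1; i <= 49; i += 3) {
+;     arr[i + 2] = 25 * arr[i + 4];
+;     arr[i + 3] = arr[i + 3] + arr[i + 2]; /* must observe the store above */
+;   }
+;
+; Sinking the store to arr[i + 2] below the load that reads it back changes
+; the value added into arr[i + 3].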
+define void @avoid_sinking_store_across_load(ptr %arr) {
+; CHECK-LABEL: define void @avoid_sinking_store_across_load
+; CHECK-SAME: (ptr [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 4, i64 7, i64 10>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i64> [ <i64 4, i64 7, i64 10, i64 13>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 -2
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARR]], <4 x i64> [[VEC_IND2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[ARR]], <4 x i64> [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[STRIDED_VEC5]], <i32 25, i32 25, i32 25, i32 25>
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP8]], <4 x ptr> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP9]], <4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 12, i64 12, i64 12, i64 12>
+; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], <i64 12, i64 12, i64 12, i64 12>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 17, 16
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 49, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i64 [ 52, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 3
+; CHECK-NEXT:    [[IV_1_PLUS_4:%.*]] = add nuw nsw i64 [[IV_1]], 4
+; CHECK-NEXT:    [[GEP_IV_1_PLUS_4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IV_1_PLUS_4]]
+; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[GEP_IV_1_PLUS_4]], align 4
+; CHECK-NEXT:    [[GEP_IV_2:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IV_2]]
+; CHECK-NEXT:    [[IV_1_PLUS_2:%.*]] = add nuw nsw i64 [[IV_1]], 2
+; CHECK-NEXT:    [[GEP_IV_1_PLUS_2:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[IV_1_PLUS_2]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[L1]], 25
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[GEP_IV_1_PLUS_2]], align 4
+; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[GEP_IV_1_PLUS_2]], align 4
+; CHECK-NEXT:    [[L3:%.*]] = load i32, ptr [[GEP_IV_2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[L3]], [[L2]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP_IV_2]], align 4
+; CHECK-NEXT:    [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 3
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ugt i64 [[IV_2]], 50
+; CHECK-NEXT:    br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv.1 = phi i64 [ 1, %entry ], [ %iv.1.next, %loop ]
+  %iv.2 = phi i64 [ 4, %entry ], [ %iv.2.next, %loop ]
+  %iv.1.next = add nuw nsw i64 %iv.1, 3
+  %iv.1.plus.4 = add nuw nsw i64 %iv.1, 4
+  %gep.iv.1.plus.4 = getelementptr inbounds i32, ptr %arr, i64 %iv.1.plus.4
+  %l1 = load i32, ptr %gep.iv.1.plus.4
+  %gep.iv.2 = getelementptr inbounds i32, ptr %arr, i64 %iv.2
+  %iv.1.plus.2 = add nuw nsw i64 %iv.1, 2
+  %gep.iv.1.plus.2 = getelementptr inbounds i32, ptr %arr, i64 %iv.1.plus.2
+  %mul = mul i32 %l1, 25
+  store i32 %mul, ptr %gep.iv.1.plus.2
+  %l2 = load i32, ptr %gep.iv.1.plus.2
+  %l3 = load i32, ptr %gep.iv.2
+  %add = add i32 %l3, %l2
+  store i32 %add, ptr %gep.iv.2
+  %iv.2.next = add nuw nsw i64 %iv.2, 3
+  %icmp = icmp ugt i64 %iv.2, 50
+  br i1 %icmp, label %exit, label %loop
+
+exit:
+  ret void
+}
-- 
2.7.4