default: return false;
case Intrinsic::memset:
case Intrinsic::memcpy:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
// Do shorten memory intrinsics.
// FIXME: Add memmove if it's also safe to transform.
- // TODO: Add atomic memcpy/memset
return true;
}
}
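To make the end-shortening case concrete, here is a hand-written IR sketch (hypothetical values and names, not one of the in-tree tests): when a later store fully overwrites the tail of an element-atomic memset, the pass can now trim the memset's length rather than leaving the dead bytes in place:

  ; before: the final 4 bytes written by the memset are dead
  call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p, i8 0, i64 16, i32 4)
  %p32 = bitcast i8* %p to i32*
  %tail = getelementptr inbounds i32, i32* %p32, i64 3
  store atomic i32 1, i32* %tail unordered, align 4

  ; after: the memset keeps only the live prefix
  call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p, i8 0, i64 12, i32 4)
  store atomic i32 1, i32* %tail unordered, align 4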
static bool isShortenableAtTheBeginning(Instruction *I) {
// FIXME: Only memset is handled for now. Supporting memcpy/memmove should be
// straightforward, since it only requires offsetting the source address too.
- // TODO: Handle atomic memory intrinsics
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
- return II && II->getIntrinsicID() == Intrinsic::memset;
+ return isa<AnyMemSetInst>(I);
}
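The begin-shortening counterpart, in the same sketched style (again hypothetical, assuming a 16-byte element-atomic memset whose first element is killed by a later store): the destination is advanced past the dead prefix and the length is reduced to match:

  ; before: the first 4 bytes written by the memset are dead
  call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %p, i8 0, i64 16, i32 4)
  %p32 = bitcast i8* %p to i32*
  store atomic i32 1, i32* %p32 unordered, align 4

  ; after: the destination is offset by 4 and the length drops to 12
  %adj = getelementptr inbounds i8, i8* %p, i64 4
  call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %adj, i8 0, i64 12, i32 4)
  store atomic i32 1, i32* %p32 unordered, align 4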
/// Return the pointer that is being written to.
// Power-of-2 vector writes are probably always a bad idea to optimize:
// any store/memset/memcpy of that size is likely implemented with vector
// instructions, so shortening it to a non-vector size is likely to be slower.
- MemIntrinsic *EarlierIntrinsic = cast<MemIntrinsic>(EarlierWrite);
+ auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
if (!IsOverwriteEnd)
LaterOffset = int64_t(LaterOffset + LaterSize);
if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
!((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
return false;
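Concretely: for the align-4 atomic memsets in the tests below, trim points at offsets 4, 8, or 28 are multiples of the destination alignment and pass the second clause, while a hypothetical trim point at offset 3 would fail both clauses and the shortening would be abandoned here.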
+ int64_t NewLength = IsOverwriteEnd
+ ? LaterOffset - EarlierOffset
+ : EarlierSize - (LaterOffset - EarlierOffset);
+
+ if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
+ // When shortening an atomic memory intrinsic, the newly shortened
+ // length must remain an integer multiple of the element size.
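+ // For example, a 32-byte atomic memset with element size 8 whose final
+ // 4 bytes are dead would want NewLength == 28; since 28 is not a
+ // multiple of 8 we must give up rather than tear an element, whereas
+ // killing the final 16 bytes gives NewLength == 16 and the shortening
+ // can proceed.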
+ const uint32_t ElementSize = AMI->getElementSizeInBytes();
+ if (0 != NewLength % ElementSize)
+ return false;
+ }
+
DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
<< (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite
<< "\n KILLER (offset " << LaterOffset << ", " << EarlierSize
<< ")\n");
- int64_t NewLength = IsOverwriteEnd
- ? LaterOffset - EarlierOffset
- : EarlierSize - (LaterOffset - EarlierOffset);
-
Value *EarlierWriteLength = EarlierIntrinsic->getLength();
Value *TrimmedLength =
ConstantInt::get(EarlierWriteLength->getType(), NewLength);
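Not shown in this excerpt: the function then installs TrimmedLength via setLength() and, in the begin-shortening case, advances the destination with an inbounds GEP before updating EarlierOffset. That GEP is exactly the [[TMP0]] getelementptr expected by the updated FileCheck lines below.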
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4
; CHECK-NEXT: ret void
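Each begin-shortening test now expects the same shape: a [[TMP0]] GEP stepping the destination past the dead prefix, plus a correspondingly reduced length (28 down to 24 here, since the later store kills a single 4-byte element).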
; CHECK-LABEL: @write0to3_atomic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
; CHECK-NEXT: store atomic i32 1, i32* [[P]] unordered, align 4
; CHECK-NEXT: ret void
;
; CHECK-LABEL: @write0to3_atomic_weaker(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
; CHECK-NEXT: store i32 1, i32* [[P]], align 4
; CHECK-NEXT: ret void
;
; CHECK-LABEL: @write0to7_atomic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 8
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i64*
; CHECK-NEXT: store atomic i64 1, i64* [[P4]] unordered, align 8
; CHECK-NEXT: ret void
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i64*
; CHECK-NEXT: store atomic i64 1, i64* [[P4]] unordered, align 8
; CHECK-NEXT: ret void
; CHECK-NEXT: entry:
; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16
+; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8)
; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0
; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1
; CHECK-NEXT: store atomic i64 1, i64* [[BASE64_1]] unordered, align 8
; CHECK-NEXT: entry:
; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16
+; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8)
; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0
; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1
; CHECK-NEXT: store atomic i64 1, i64* [[BASE64_1]] unordered, align 8
; CHECK-NEXT: entry:
; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16
+; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8)
; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0
; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1
; CHECK-NEXT: store i64 1, i64* [[BASE64_1]], align 8
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 24, i32 4)
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4
; CHECK-NEXT: ret void
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 24, i32 4)
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4
; CHECK-NEXT: ret void
; CHECK-LABEL: @write28to32_atomic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4
; CHECK-NEXT: ret void
; CHECK-LABEL: @write32to36_atomic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2plusi* [[P:%.*]] to i8*
-; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 36, i32 4)
+; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 32, i32 4)
; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2PLUSI:%.*]], %struct.vec2plusi* [[P]], i64 0, i32 2
; CHECK-NEXT: store atomic i32 1, i32* [[C]] unordered, align 4
; CHECK-NEXT: ret void
; CHECK-LABEL: @write32to36_atomic_weaker(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2plusi* [[P:%.*]] to i8*
-; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 36, i32 4)
+; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 32, i32 4)
; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2PLUSI:%.*]], %struct.vec2plusi* [[P]], i64 0, i32 2
; CHECK-NEXT: store i32 1, i32* [[C]], align 4
; CHECK-NEXT: ret void
; CHECK-LABEL: @write16to32_atomic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2* [[P:%.*]] to i8*
-; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 32, i32 4)
+; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 16, i32 4)
; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], %struct.vec2* [[P]], i64 0, i32 1
; CHECK-NEXT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* [[C]], align 4
; CHECK-NEXT: ret void
; CHECK-NEXT: entry:
; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8)
; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2
; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3
; CHECK-NEXT: store atomic i64 3, i64* [[BASE64_2]] unordered, align 8
; CHECK-NEXT: entry:
; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8)
; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2
; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3
; CHECK-NEXT: store i64 3, i64* [[BASE64_2]], align 8
; CHECK-NEXT: entry:
; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
-; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
+; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8)
; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2
; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3
; CHECK-NEXT: store atomic i64 3, i64* [[BASE64_2]] unordered, align 8