[test][msan] Remove -DAG after fixing indeterminism

author Vitaly Buka <vitalybuka@google.com>

Sat, 10 Sep 2022 22:19:00 +0000 (15:19 -0700)

committer Vitaly Buka <vitalybuka@google.com>

Sat, 10 Sep 2022 22:28:33 +0000 (15:28 -0700)
author Vitaly Buka <vitalybuka@google.com>
Sat, 10 Sep 2022 22:19:00 +0000 (15:19 -0700)
committer Vitaly Buka <vitalybuka@google.com>
Sat, 10 Sep 2022 22:28:33 +0000 (15:28 -0700)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/avx-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/avx-intrinsics-x86.ll

index e0c832c..4472ff2 100644 (file)
--- a/llvm/test/Instrumentation/MemorySanitizer/avx-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/avx-intrinsics-x86.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux-gnu"
  
  define <4 x double> @test_x86_avx_addsub_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_addsub_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]])
@@ -22,8 +22,8 @@ declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nou
  
  define <8 x float> @test_x86_avx_addsub_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_addsub_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]])
@@ -38,9 +38,9 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi
  
  define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
  ; CHECK-LABEL: @test_x86_avx_blendv_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 64) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 64) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
@@ -56,9 +56,9 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4
  
  define <8 x float> @test_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
  ; CHECK-LABEL: @test_x86_avx_blendv_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 64) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 64) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
@@ -74,8 +74,8 @@ declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x f
  
  define <4 x double> @test_x86_avx_cmp_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_cmp_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -99,8 +99,8 @@ declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) no
  
  define <8 x float> @test_x86_avx_cmp_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_cmp_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -122,8 +122,8 @@ define <8 x float> @test_x86_avx_cmp_ps_256(<8 x float> %a0, <8 x float> %a1) #0
  
  define <8 x float> @test_x86_avx_cmp_ps_256_pseudo_op(<8 x float> %a0, <8 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_cmp_ps_256_pseudo_op(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -426,7 +426,7 @@ declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounw
  
  define <4 x float> @test_x86_avx_cvt_pd2_ps_256(<4 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_cvt_pd2_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
@@ -447,7 +447,7 @@ declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone
  
  define <4 x i32> @test_x86_avx_cvt_pd2dq_256(<4 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_cvt_pd2dq_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
@@ -468,7 +468,7 @@ declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
  
  define <8 x i32> @test_x86_avx_cvt_ps2dq_256(<8 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_cvt_ps2dq_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
@@ -489,7 +489,7 @@ declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
  
  define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_cvtt_pd2dq_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
@@ -510,7 +510,7 @@ declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
  
  define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_cvtt_ps2dq_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
@@ -531,8 +531,8 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
  
  define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_dp_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -556,8 +556,8 @@ declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwi
  
  define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_hadd_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]])
@@ -572,8 +572,8 @@ declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounw
  
  define <8 x float> @test_x86_avx_hadd_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_hadd_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]])
@@ -588,8 +588,8 @@ declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind
  
  define <4 x double> @test_x86_avx_hsub_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_hsub_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]])
@@ -604,8 +604,8 @@ declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounw
  
  define <8 x float> @test_x86_avx_hsub_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_hsub_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]])
@@ -620,7 +620,7 @@ declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind
  
  define <32 x i8> @test_x86_avx_ldu_dq_256(i8* %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_ldu_dq_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint i8* [[A0:%.*]] to i64
  ; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
@@ -644,8 +644,8 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
  
  define <2 x double> @test_x86_avx_maskload_pd(i8* %a0, <2 x i64> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx_maskload_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -668,8 +668,8 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readonly
  
  define <4 x double> @test_x86_avx_maskload_pd_256(i8* %a0, <4 x i64> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx_maskload_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -692,8 +692,8 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind read
  
  define <4 x float> @test_x86_avx_maskload_ps(i8* %a0, <4 x i32> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx_maskload_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -716,8 +716,8 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readonly
  
  define <8 x float> @test_x86_avx_maskload_ps_256(i8* %a0, <8 x i32> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx_maskload_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
@@ -740,9 +740,9 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind reado
  
  define void @test_x86_avx_maskstore_pd(i8* %a0, <2 x i64> %mask, <2 x double> %a2) #0 {
  ; CHECK-LABEL: @test_x86_avx_maskstore_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -767,9 +767,9 @@ declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind
  
  define void @test_x86_avx_maskstore_pd_256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) #0 {
  ; CHECK-LABEL: @test_x86_avx_maskstore_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -794,9 +794,9 @@ declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwi
  
  define void @test_x86_avx_maskstore_ps(i8* %a0, <4 x i32> %mask, <4 x float> %a2) #0 {
  ; CHECK-LABEL: @test_x86_avx_maskstore_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -821,9 +821,9 @@ declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind
  
  define void @test_x86_avx_maskstore_ps_256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) #0 {
  ; CHECK-LABEL: @test_x86_avx_maskstore_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
@@ -848,8 +848,8 @@ declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwin
  
  define <4 x double> @test_x86_avx_max_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_max_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]])
@@ -864,8 +864,8 @@ declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwi
  
  define <8 x float> @test_x86_avx_max_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_max_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]])
@@ -880,8 +880,8 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
  
  define <4 x double> @test_x86_avx_min_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_min_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]])
@@ -896,8 +896,8 @@ declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwi
  
  define <8 x float> @test_x86_avx_min_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_min_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]])
@@ -912,7 +912,7 @@ declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind
  
  define i32 @test_x86_avx_movmsk_pd_256(<4 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_movmsk_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
@@ -933,7 +933,7 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
  
  define i32 @test_x86_avx_movmsk_ps_256(<8 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_movmsk_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
@@ -954,8 +954,8 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
  
  define i32 @test_x86_avx_ptestc_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_ptestc_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -979,8 +979,8 @@ declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
  
  define i32 @test_x86_avx_ptestnzc_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_ptestnzc_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1004,8 +1004,8 @@ declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
  
  define i32 @test_x86_avx_ptestz_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_ptestz_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1029,7 +1029,7 @@ declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
  
  define <8 x float> @test_x86_avx_rcp_ps_256(<8 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_rcp_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> [[A0:%.*]])
  ; CHECK-NEXT:    store <8 x i32> [[TMP1]], <8 x i32>* bitcast ([100 x i64]* @__msan_retval_tls to <8 x i32>*), align 8
@@ -1043,7 +1043,7 @@ declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
  
  define <4 x double> @test_x86_avx_round_pd_256(<4 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_round_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
@@ -1064,7 +1064,7 @@ declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind read
  
  define <8 x float> @test_x86_avx_round_ps_256(<8 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_round_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
@@ -1085,7 +1085,7 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
  
  define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_rsqrt_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> [[A0:%.*]])
  ; CHECK-NEXT:    store <8 x i32> [[TMP1]], <8 x i32>* bitcast ([100 x i64]* @__msan_retval_tls to <8 x i32>*), align 8
@@ -1098,8 +1098,8 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
  
  define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vpermilvar_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1123,8 +1123,8 @@ declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwi
  
  define <4 x double> @test_x86_avx_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vpermilvar_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1147,7 +1147,7 @@ declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) no
  
  define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx_vpermilvar_pd_256_2(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
@@ -1166,8 +1166,8 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) #0 {
  
  define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1188,8 +1188,8 @@ define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) #
  }
  define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, <4 x i32>* %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps_load(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
@@ -1225,8 +1225,8 @@ declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind
  
  define <8 x float> @test_x86_avx_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1250,8 +1250,8 @@ declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) noun
  
  define i32 @test_x86_avx_vtestc_pd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestc_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1275,8 +1275,8 @@ declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnon
  
  define i32 @test_x86_avx_vtestc_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestc_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1300,8 +1300,8 @@ declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind rea
  
  define i32 @test_x86_avx_vtestc_ps(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestc_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1325,8 +1325,8 @@ declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_avx_vtestc_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestc_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1350,8 +1350,8 @@ declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readn
  
  define i32 @test_x86_avx_vtestnzc_pd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestnzc_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1375,8 +1375,8 @@ declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readn
  
  define i32 @test_x86_avx_vtestnzc_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestnzc_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1400,8 +1400,8 @@ declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind r
  
  define i32 @test_x86_avx_vtestnzc_ps(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestnzc_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1425,8 +1425,8 @@ declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnon
  
  define i32 @test_x86_avx_vtestnzc_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestnzc_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1450,8 +1450,8 @@ declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind rea
  
  define i32 @test_x86_avx_vtestz_pd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestz_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1475,8 +1475,8 @@ declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnon
  
  define i32 @test_x86_avx_vtestz_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestz_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1500,8 +1500,8 @@ declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind rea
  
  define i32 @test_x86_avx_vtestz_ps(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestz_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1525,8 +1525,8 @@ declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_avx_vtestz_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx_vtestz_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1573,8 +1573,8 @@ declare void @llvm.x86.avx.vzeroupper() nounwind
  
  define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind #0 {
  ; CHECK-LABEL: @movnt_dq(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
  ; CHECK-NEXT:    [[A2:%.*]] = add <2 x i64> [[A1:%.*]], <i64 1, i64 1>
@@ -1603,8 +1603,8 @@ declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
  
  define void @movnt_ps(i8* %p, <8 x float> %a) nounwind #0 {
  ; CHECK-LABEL: @movnt_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[CAST:%.*]] = bitcast i8* [[P:%.*]] to <8 x float>*
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1628,8 +1628,8 @@ declare void @llvm.x86.avx.movnt.ps.256(i8*, <8 x float>) nounwind
  define void @movnt_pd(i8* %p, <4 x double> %a1) nounwind #0 {
    ; add operation forces the execution domain.
  ; CHECK-LABEL: @movnt_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
  ; CHECK-NEXT:    [[A2:%.*]] = fadd <4 x double> [[A1:%.*]], zeroinitializer
@@ -1656,8 +1656,8 @@ declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind
  
  define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_pclmulqdq(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/avx2-intrinsics-x86.ll

index 37644bf..9ea9611 100644 (file)
--- a/llvm/test/Instrumentation/MemorySanitizer/avx2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/avx2-intrinsics-x86.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux-gnu"
  
  define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_packssdw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
@@ -39,8 +39,8 @@ define <16 x i16> @test_x86_avx2_packssdw_fold() #0 {
  
  define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_packsswb(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i16>
@@ -72,8 +72,8 @@ define <32 x i8> @test_x86_avx2_packsswb_fold() #0 {
  
  define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_packuswb(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i16>
@@ -105,8 +105,8 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() #0 {
  
  define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pavg_b(
-; CHECK-DAG:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
@@ -121,8 +121,8 @@ declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
  
  define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pavg_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
@@ -137,8 +137,8 @@ declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readno
  
  define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pmadd_wd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
@@ -156,13 +156,13 @@ declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readn
  
  define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pmovmskb(
-; CHECK-DAG:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <32 x i8> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]]
  ; CHECK:       3:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7:[0-9]+]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       4:
  ; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> [[A0:%.*]])
@@ -177,8 +177,8 @@ declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
  
  define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pmulh_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
@@ -193,8 +193,8 @@ declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readn
  
  define <16 x i16> @test_x86_avx2_pmulhu_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pmulhu_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
@@ -209,8 +209,8 @@ declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind read
  
  define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psad_bw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <4 x i64>
@@ -229,8 +229,8 @@ declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
  
  define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psll_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -251,8 +251,8 @@ declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
  
  define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psll_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -273,8 +273,8 @@ declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
  
  define <16 x i16> @test_x86_avx2_psll_w(<16 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psll_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -295,7 +295,7 @@ declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnon
  
  define <8 x i32> @test_x86_avx2_pslli_d(<8 x i32> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pslli_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i32> [[TMP2]], zeroinitializer
@@ -311,7 +311,7 @@ declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
  
  define <4 x i64> @test_x86_avx2_pslli_q(<4 x i64> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pslli_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i64> [[TMP2]], zeroinitializer
@@ -327,7 +327,7 @@ declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
  
  define <16 x i16> @test_x86_avx2_pslli_w(<16 x i16> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pslli_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP2]], zeroinitializer
@@ -343,8 +343,8 @@ declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
  
  define <8 x i32> @test_x86_avx2_psra_d(<8 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psra_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -365,8 +365,8 @@ declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
  
  define <16 x i16> @test_x86_avx2_psra_w(<16 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psra_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -387,7 +387,7 @@ declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnon
  
  define <8 x i32> @test_x86_avx2_psrai_d(<8 x i32> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrai_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i32> [[TMP2]], zeroinitializer
@@ -403,7 +403,7 @@ declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
  
  define <16 x i16> @test_x86_avx2_psrai_w(<16 x i16> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrai_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP2]], zeroinitializer
@@ -419,8 +419,8 @@ declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
  
  define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrl_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -441,8 +441,8 @@ declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
  
  define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrl_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -463,8 +463,8 @@ declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
  
  define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrl_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -485,13 +485,13 @@ declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnon
  
  define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, <8 x i16>* %p) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrl_w_load(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
  ; CHECK:       3:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       4:
  ; CHECK-NEXT:    [[A1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16
@@ -518,7 +518,7 @@ define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, <8 x i16>* %p) #0 {
  
  define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrli_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i32> [[TMP2]], zeroinitializer
@@ -534,7 +534,7 @@ declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
  
  define <4 x i64> @test_x86_avx2_psrli_q(<4 x i64> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrli_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i64> [[TMP2]], zeroinitializer
@@ -550,7 +550,7 @@ declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
  
  define <16 x i16> @test_x86_avx2_psrli_w(<16 x i16> %a0) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrli_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP2]], zeroinitializer
@@ -566,8 +566,8 @@ declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
  
  define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_phadd_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
@@ -582,8 +582,8 @@ declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
  
  define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_phadd_sw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
@@ -598,8 +598,8 @@ declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind read
  
  define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_phadd_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
@@ -614,8 +614,8 @@ declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readn
  
  define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_phsub_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
@@ -630,8 +630,8 @@ declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
  
  define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_phsub_sw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
@@ -646,8 +646,8 @@ declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind read
  
  define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_phsub_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
@@ -662,8 +662,8 @@ declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readn
  
  define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pmadd_ub_sw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
@@ -680,13 +680,13 @@ declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind rea
  
  define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(<32 x i8>* %ptr, <32 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pmadd_ub_sw_load_op0(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <32 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
  ; CHECK:       3:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       4:
  ; CHECK-NEXT:    [[A0:%.*]] = load <32 x i8>, <32 x i8>* [[PTR:%.*]], align 32
@@ -709,8 +709,8 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(<32 x i8>* %ptr, <32 x i8>
  
  define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pmul_hr_sw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
@@ -725,8 +725,8 @@ declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind re
  
  define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pshuf_b(
-; CHECK-DAG:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
@@ -741,8 +741,8 @@ declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
  
  define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psign_b(
-; CHECK-DAG:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
@@ -757,8 +757,8 @@ declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
  
  define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psign_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
@@ -773,8 +773,8 @@ declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
  
  define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psign_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
@@ -789,8 +789,8 @@ declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readn
  
  define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_mpsadbw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i8> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -799,7 +799,7 @@ define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) #0 {
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
  ; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       6:
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]], i8 7)
@@ -813,13 +813,13 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
  
  define <16 x i16> @test_x86_avx2_mpsadbw_load_op0(<32 x i8>* %ptr, <32 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_mpsadbw_load_op0(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <32 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
  ; CHECK:       3:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       4:
  ; CHECK-NEXT:    [[A0:%.*]] = load <32 x i8>, <32 x i8>* [[PTR:%.*]], align 32
@@ -834,7 +834,7 @@ define <16 x i16> @test_x86_avx2_mpsadbw_load_op0(<32 x i8>* %ptr, <32 x i8> %a1
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
  ; CHECK:       10:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       11:
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]], i8 7)
@@ -848,8 +848,8 @@ define <16 x i16> @test_x86_avx2_mpsadbw_load_op0(<32 x i8>* %ptr, <32 x i8> %a1
  
  define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_packusdw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
@@ -881,9 +881,9 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() #0 {
  
  define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pblendvb(
-; CHECK-DAG:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 64) to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <32 x i8>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 64) to <32 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP3]]
@@ -899,8 +899,8 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw
  
  define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pblendw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]], <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -915,8 +915,8 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind r
  
  define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pblendd_128(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 3>
  ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 3>
@@ -931,8 +931,8 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind
  
  define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_pblendd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -947,8 +947,8 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind
  
  define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_permd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
@@ -963,8 +963,8 @@ declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
  
  define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_permps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -973,7 +973,7 @@ define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) #0 {
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
  ; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       6:
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
@@ -988,8 +988,8 @@ declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind reado
  
  define <2 x i64> @test_x86_avx2_maskload_q(i8* %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_maskload_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -997,7 +997,7 @@ define <2 x i64> @test_x86_avx2_maskload_q(i8* %a0, <2 x i64> %a1) #0 {
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
  ; CHECK:       4:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       5:
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* [[A0:%.*]], <2 x i64> [[A1:%.*]])
@@ -1012,8 +1012,8 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
  
  define <4 x i64> @test_x86_avx2_maskload_q_256(i8* %a0, <4 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_maskload_q_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -1021,7 +1021,7 @@ define <4 x i64> @test_x86_avx2_maskload_q_256(i8* %a0, <4 x i64> %a1) #0 {
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
  ; CHECK:       4:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       5:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* [[A0:%.*]], <4 x i64> [[A1:%.*]])
@@ -1036,8 +1036,8 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonl
  
  define <4 x i32> @test_x86_avx2_maskload_d(i8* %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_maskload_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -1045,7 +1045,7 @@ define <4 x i32> @test_x86_avx2_maskload_d(i8* %a0, <4 x i32> %a1) #0 {
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
  ; CHECK:       4:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       5:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* [[A0:%.*]], <4 x i32> [[A1:%.*]])
@@ -1060,8 +1060,8 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
  
  define <8 x i32> @test_x86_avx2_maskload_d_256(i8* %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_maskload_d_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
@@ -1069,7 +1069,7 @@ define <8 x i32> @test_x86_avx2_maskload_d_256(i8* %a0, <8 x i32> %a1) #0 {
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
  ; CHECK:       4:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       5:
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* [[A0:%.*]], <8 x i32> [[A1:%.*]])
@@ -1084,9 +1084,9 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonl
  
  define void @test_x86_avx2_maskstore_q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) #0 {
  ; CHECK-LABEL: @test_x86_avx2_maskstore_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -1097,7 +1097,7 @@ define void @test_x86_avx2_maskstore_q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) #0
  ; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
  ; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
  ; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       7:
  ; CHECK-NEXT:    call void @llvm.x86.avx2.maskstore.q(i8* [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]])
@@ -1111,9 +1111,9 @@ declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
  
  define void @test_x86_avx2_maskstore_q_256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) #0 {
  ; CHECK-LABEL: @test_x86_avx2_maskstore_q_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -1124,7 +1124,7 @@ define void @test_x86_avx2_maskstore_q_256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2
  ; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
  ; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
  ; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       7:
  ; CHECK-NEXT:    call void @llvm.x86.avx2.maskstore.q.256(i8* [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]])
@@ -1138,9 +1138,9 @@ declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind
  
  define void @test_x86_avx2_maskstore_d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) #0 {
  ; CHECK-LABEL: @test_x86_avx2_maskstore_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -1151,7 +1151,7 @@ define void @test_x86_avx2_maskstore_d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) #0
  ; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
  ; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
  ; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       7:
  ; CHECK-NEXT:    call void @llvm.x86.avx2.maskstore.d(i8* [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]])
@@ -1165,9 +1165,9 @@ declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
  
  define void @test_x86_avx2_maskstore_d_256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) #0 {
  ; CHECK-LABEL: @test_x86_avx2_maskstore_d_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
@@ -1178,7 +1178,7 @@ define void @test_x86_avx2_maskstore_d_256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2
  ; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
  ; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
  ; CHECK:       6:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       7:
  ; CHECK-NEXT:    call void @llvm.x86.avx2.maskstore.d.256(i8* [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> [[A2:%.*]])
@@ -1192,8 +1192,8 @@ declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
  
  define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psllv_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
@@ -1231,8 +1231,8 @@ declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
  
  define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psllv_d_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
@@ -1270,8 +1270,8 @@ declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind read
  
  define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psllv_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
@@ -1301,8 +1301,8 @@ declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
  
  define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psllv_q_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i64>
@@ -1333,8 +1333,8 @@ declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind read
  
  define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrlv_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
@@ -1372,8 +1372,8 @@ declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
  
  define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrlv_d_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
@@ -1411,8 +1411,8 @@ declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind read
  
  define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrlv_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
@@ -1443,8 +1443,8 @@ declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
  
  define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrlv_q_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i64>
@@ -1476,8 +1476,8 @@ declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind read
  
  define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrav_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
@@ -1507,8 +1507,8 @@ declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
  
  define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_avx2_psrav_d_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
@@ -1538,10 +1538,10 @@ declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind read
  
  define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_d_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1555,7 +1555,7 @@ define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> [[A0:%.*]], i8* [[A1:%.*]], <4 x i32> [[IDX:%.*]], <2 x double> [[MASK:%.*]], i8 2)
@@ -1571,10 +1571,10 @@ declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
  
  define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1, <4 x i32> %idx, <4 x double> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_d_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 56) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 56) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -1588,7 +1588,7 @@ define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1, <4
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> [[A0:%.*]], i8* [[A1:%.*]], <4 x i32> [[IDX:%.*]], <4 x double> [[MASK:%.*]], i8 2)
@@ -1604,10 +1604,10 @@ declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
  
  define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1, <2 x i64> %idx, <2 x double> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_q_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1621,7 +1621,7 @@ define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1, <2 x i
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> [[A0:%.*]], i8* [[A1:%.*]], <2 x i64> [[IDX:%.*]], <2 x double> [[MASK:%.*]], i8 2)
@@ -1637,10 +1637,10 @@ declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*,
  
  define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1, <4 x i64> %idx, <4 x double> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_q_pd_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 72) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 72) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -1654,7 +1654,7 @@ define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1, <4
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> [[A0:%.*]], i8* [[A1:%.*]], <4 x i64> [[IDX:%.*]], <4 x double> [[MASK:%.*]], i8 2)
@@ -1670,10 +1670,10 @@ declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*,
  
  define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1, <4 x i32> %idx, <4 x float> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_d_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1687,7 +1687,7 @@ define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1, <4 x i32
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> [[A0:%.*]], i8* [[A1:%.*]], <4 x i32> [[IDX:%.*]], <4 x float> [[MASK:%.*]], i8 2)
@@ -1703,10 +1703,10 @@ declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
  
  define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1, <8 x i32> %idx, <8 x float> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_d_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 72) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 72) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -1720,7 +1720,7 @@ define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1, <8 x
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> [[A0:%.*]], i8* [[A1:%.*]], <8 x i32> [[IDX:%.*]], <8 x float> [[MASK:%.*]], i8 2)
@@ -1736,10 +1736,10 @@ declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
  
  define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1, <2 x i64> %idx, <4 x float> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_q_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1753,7 +1753,7 @@ define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1, <2 x i64
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> [[A0:%.*]], i8* [[A1:%.*]], <2 x i64> [[IDX:%.*]], <4 x float> [[MASK:%.*]], i8 2)
@@ -1769,10 +1769,10 @@ declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*,
  
  define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1, <4 x i64> %idx, <4 x float> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_q_ps_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 56) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 56) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1786,7 +1786,7 @@ define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1, <4 x
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> [[A0:%.*]], i8* [[A1:%.*]], <4 x i64> [[IDX:%.*]], <4 x float> [[MASK:%.*]], i8 2)
@@ -1802,10 +1802,10 @@ declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*,
  
  define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1, <4 x i32> %idx, <2 x i64> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_d_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1819,7 +1819,7 @@ define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1, <4 x i32> %id
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> [[A0:%.*]], i8* [[A1:%.*]], <4 x i32> [[IDX:%.*]], <2 x i64> [[MASK:%.*]], i8 2)
@@ -1835,10 +1835,10 @@ declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*,
  
  define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1, <4 x i32> %idx, <4 x i64> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_d_q_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 56) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 56) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -1852,7 +1852,7 @@ define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1, <4 x i32>
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> [[A0:%.*]], i8* [[A1:%.*]], <4 x i32> [[IDX:%.*]], <4 x i64> [[MASK:%.*]], i8 2)
@@ -1868,10 +1868,10 @@ declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*,
  
  define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1, <2 x i64> %idx, <2 x i64> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_q_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1885,7 +1885,7 @@ define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1, <2 x i64> %id
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> [[A0:%.*]], i8* [[A1:%.*]], <2 x i64> [[IDX:%.*]], <2 x i64> [[MASK:%.*]], i8 2)
@@ -1901,10 +1901,10 @@ declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*,
  
  define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1, <4 x i64> %idx, <4 x i64> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_q_q_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 72) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 72) to <4 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -1918,7 +1918,7 @@ define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1, <4 x i64>
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> [[A0:%.*]], i8* [[A1:%.*]], <4 x i64> [[IDX:%.*]], <4 x i64> [[MASK:%.*]], i8 2)
@@ -1934,10 +1934,10 @@ declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*,
  
  define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1, <4 x i32> %idx, <4 x i32> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_d_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1951,7 +1951,7 @@ define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1, <4 x i32> %id
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> [[A0:%.*]], i8* [[A1:%.*]], <4 x i32> [[IDX:%.*]], <4 x i32> [[MASK:%.*]], i8 2)
@@ -1967,10 +1967,10 @@ declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*,
  
  define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1, <8 x i32> %idx, <8 x i32> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_d_d_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 72) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 72) to <8 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -1984,7 +1984,7 @@ define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1, <8 x i32>
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> [[A0:%.*]], i8* [[A1:%.*]], <8 x i32> [[IDX:%.*]], <8 x i32> [[MASK:%.*]], i8 2)
@@ -2000,10 +2000,10 @@ declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*,
  
  define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1, <2 x i64> %idx, <4 x i32> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_q_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -2017,7 +2017,7 @@ define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1, <2 x i64> %id
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> [[A0:%.*]], i8* [[A1:%.*]], <2 x i64> [[IDX:%.*]], <4 x i32> [[MASK:%.*]], i8 2)
@@ -2033,10 +2033,10 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*,
  
  define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1, <4 x i64> %idx, <4 x i32> %mask) #0 {
  ; CHECK-LABEL: @test_x86_avx2_gather_q_d_256(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i64>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 56) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <4 x i64>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 56) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -2050,7 +2050,7 @@ define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1, <4 x i64>
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> [[A0:%.*]], i8* [[A1:%.*]], <4 x i64> [[IDX:%.*]], <4 x i32> [[MASK:%.*]], i8 2)
@@ -2066,11 +2066,11 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*,
  
  define <8 x float>  @test_gather_mask(<8 x float> %a0, float* %a, <8 x i32> %idx, <8 x float> %mask, float* nocapture %out) #0 {
  ; CHECK-LABEL: @test_gather_mask(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 72) to <8 x i32>*), align 8
-; CHECK-DAG:    [[TMP5:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 104) to i64*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to i64*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 72) to <8 x i32>*), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 104) to i64*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[A_I8:%.*]] = bitcast float* [[A:%.*]] to i8*
  ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
@@ -2085,7 +2085,7 @@ define <8 x float>  @test_gather_mask(<8 x float> %a0, float* %a, <8 x i32> %idx
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
  ; CHECK:       9:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       10:
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> [[A0:%.*]], i8* [[A_I8]], <8 x i32> [[IDX:%.*]], <8 x float> [[MASK:%.*]], i8 4)
@@ -2093,7 +2093,7 @@ define <8 x float>  @test_gather_mask(<8 x float> %a0, float* %a, <8 x i32> %idx
  ; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP6]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
  ; CHECK:       11:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       12:
  ; CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint <8 x float>* [[OUT_PTR]] to i64
@@ -2116,10 +2116,10 @@ define <8 x float>  @test_gather_mask(<8 x float> %a0, float* %a, <8 x i32> %idx
  
  define <2 x i64> @test_mask_demanded_bits(<2 x i64> %a0, i8* %a1, <2 x i64> %idx, <2 x i1> %mask) #0 {
  ; CHECK-LABEL: @test_mask_demanded_bits(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i1>, <2 x i1>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <2 x i1>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i1>, <2 x i1>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 40) to <2 x i1>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
  ; CHECK-NEXT:    [[MASK1:%.*]] = sext <2 x i1> [[MASK:%.*]] to <2 x i64>
@@ -2135,7 +2135,7 @@ define <2 x i64> @test_mask_demanded_bits(<2 x i64> %a0, i8* %a1, <2 x i64> %idx
  ; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
  ; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
  ; CHECK:       8:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR7]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       9:
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> [[A0:%.*]], i8* [[A1:%.*]], <2 x i64> [[IDX:%.*]], <2 x i64> [[MASK1]], i8 2)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/disambiguate-origin.ll b/llvm/test/Instrumentation/MemorySanitizer/disambiguate-origin.ll

index df94ec0..a9af4b3 100644 (file)
--- a/llvm/test/Instrumentation/MemorySanitizer/disambiguate-origin.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/disambiguate-origin.ll
@@ -24,20 +24,22 @@ define void @TestOne(i32* noundef %a)  nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @TestOne(
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1:![0-9]+]]
-; CHECK-DAG:    [[V:%.*]] = load i32, i32* [[A:%.*]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP0:%.*]] = ptrtoint i32* [[A]] to i64, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSLD:%.*]] = load i32, i32* [[TMP2]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSCMP:%.*]] = icmp ne i32 [[_MSLD]], 0, !dbg [[DBG7:![0-9]+]]
-; CHECK:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !dbg [[DBG7]], !prof [[PROF8:![0-9]+]]
-; CHECK:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP5]]) #[[ATTR2:[0-9]+]], !dbg [[DBG7]]
-; CHECK:    unreachable, !dbg [[DBG7]]
-; CHECK:    call void @OneArg(i32 noundef [[V]]), !dbg [[DBG7]]
-; CHECK:    ret void
+; CHECK-NEXT:    [[V:%.*]] = load i32, i32* [[A:%.*]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i32* [[A]] to i64, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, i32* [[TMP2]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[_MSLD]], 0, !dbg [[DBG7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !dbg [[DBG7]], !prof [[PROF8:![0-9]+]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP5]]) #[[ATTR3:[0-9]+]], !dbg [[DBG7]]
+; CHECK-NEXT:    unreachable, !dbg [[DBG7]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @OneArg(i32 noundef [[V]]), !dbg [[DBG7]]
+; CHECK-NEXT:    ret void
  ;
  entry:
    %v = load i32, i32* %a, !dbg !11
@@ -49,47 +51,53 @@ define void @TestMany(i32* noundef %a)  nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @TestMany(
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-DAG:    [[X:%.*]] = load i32, i32* [[A:%.*]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP0:%.*]] = ptrtoint i32* [[A]] to i64, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSLD:%.*]] = load i32, i32* [[TMP2]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[Y:%.*]] = load i32, i32* [[A]], align 4, !dbg [[DBG9:![0-9]+]]
-; CHECK-DAG:    [[TMP6:%.*]] = ptrtoint i32* [[A]] to i64, !dbg [[DBG9]]
-; CHECK-DAG:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080, !dbg [[DBG9]]
-; CHECK-DAG:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to i32*, !dbg [[DBG9]]
-; CHECK-DAG:    [[TMP9:%.*]] = add i64 [[TMP7]], 17592186044416, !dbg [[DBG9]]
-; CHECK-DAG:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to i32*, !dbg [[DBG9]]
-; CHECK-DAG:    [[_MSLD1:%.*]] = load i32, i32* [[TMP8]], align 4, !dbg [[DBG9]]
-; CHECK-DAG:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !dbg [[DBG9]]
-; CHECK-DAG:    [[Z:%.*]] = load i32, i32* [[A]], align 4, !dbg [[DBG10:![0-9]+]]
-; CHECK-DAG:    [[TMP12:%.*]] = ptrtoint i32* [[A]] to i64, !dbg [[DBG10]]
-; CHECK-DAG:    [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080, !dbg [[DBG10]]
-; CHECK-DAG:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to i32*, !dbg [[DBG10]]
-; CHECK-DAG:    [[TMP15:%.*]] = add i64 [[TMP13]], 17592186044416, !dbg [[DBG10]]
-; CHECK-DAG:    [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to i32*, !dbg [[DBG10]]
-; CHECK-DAG:    [[_MSLD2:%.*]] = load i32, i32* [[TMP14]], align 4, !dbg [[DBG10]]
-; CHECK-DAG:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !dbg [[DBG10]]
-; CHECK-DAG:    [[_MSCMP:%.*]] = icmp ne i32 [[_MSLD]], 0, !dbg [[DBG7]]
-; CHECK:    br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP20:%.*]], !dbg [[DBG7]], !prof [[PROF8]]
-; CHECK:    [[TMP19:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP5]]), !dbg [[DBG1]]
-; CHECK:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP19]]) #[[ATTR2]], !dbg [[DBG7]]
-; CHECK:    unreachable, !dbg [[DBG7]]
-; CHECK:    [[_MSCMP3:%.*]] = icmp ne i32 [[_MSLD1]], 0, !dbg [[DBG7]]
-; CHECK:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP23:%.*]], !dbg [[DBG7]], !prof [[PROF8]]
-; CHECK:    [[TMP22:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP11]]), !dbg [[DBG9]]
-; CHECK:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP22]]) #[[ATTR2]], !dbg [[DBG7]]
-; CHECK:    unreachable, !dbg [[DBG7]]
-; CHECK:    [[_MSCMP4:%.*]] = icmp ne i32 [[_MSLD2]], 0, !dbg [[DBG7]]
-; CHECK:    br i1 [[_MSCMP4]], label [[TMP24:%.*]], label [[TMP26:%.*]], !dbg [[DBG7]], !prof [[PROF8]]
-; CHECK:    [[TMP25:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP17]]), !dbg [[DBG10]]
-; CHECK:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP25]]) #[[ATTR2]], !dbg [[DBG7]]
-; CHECK:    unreachable, !dbg [[DBG7]]
-; CHECK:    call void @ManyArgs(i32 noundef [[X]], i32 noundef [[Y]], i32 noundef [[Z]]), !dbg [[DBG7]]
-; CHECK:    ret void
+; CHECK-NEXT:    [[X:%.*]] = load i32, i32* [[A:%.*]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i32* [[A]] to i64, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, i32* [[TMP2]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[Y:%.*]] = load i32, i32* [[A]], align 4, !dbg [[DBG9:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint i32* [[A]] to i64, !dbg [[DBG9]]
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080, !dbg [[DBG9]]
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to i32*, !dbg [[DBG9]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[TMP7]], 17592186044416, !dbg [[DBG9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to i32*, !dbg [[DBG9]]
+; CHECK-NEXT:    [[_MSLD1:%.*]] = load i32, i32* [[TMP8]], align 4, !dbg [[DBG9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !dbg [[DBG9]]
+; CHECK-NEXT:    [[Z:%.*]] = load i32, i32* [[A]], align 4, !dbg [[DBG10:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint i32* [[A]] to i64, !dbg [[DBG10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080, !dbg [[DBG10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to i32*, !dbg [[DBG10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP13]], 17592186044416, !dbg [[DBG10]]
+; CHECK-NEXT:    [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to i32*, !dbg [[DBG10]]
+; CHECK-NEXT:    [[_MSLD2:%.*]] = load i32, i32* [[TMP14]], align 4, !dbg [[DBG10]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !dbg [[DBG10]]
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[_MSLD]], 0, !dbg [[DBG7]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP20:%.*]], !dbg [[DBG7]], !prof [[PROF8]]
+; CHECK:       18:
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP5]]), !dbg [[DBG1]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP19]]) #[[ATTR3]], !dbg [[DBG7]]
+; CHECK-NEXT:    unreachable, !dbg [[DBG7]]
+; CHECK:       20:
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i32 [[_MSLD1]], 0, !dbg [[DBG7]]
+; CHECK-NEXT:    br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP23:%.*]], !dbg [[DBG7]], !prof [[PROF8]]
+; CHECK:       21:
+; CHECK-NEXT:    [[TMP22:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP11]]), !dbg [[DBG9]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP22]]) #[[ATTR3]], !dbg [[DBG7]]
+; CHECK-NEXT:    unreachable, !dbg [[DBG7]]
+; CHECK:       23:
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i32 [[_MSLD2]], 0, !dbg [[DBG7]]
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP24:%.*]], label [[TMP26:%.*]], !dbg [[DBG7]], !prof [[PROF8]]
+; CHECK:       24:
+; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP17]]), !dbg [[DBG10]]
+; CHECK-NEXT:    call void @__msan_warning_with_origin_noreturn(i32 [[TMP25]]) #[[ATTR3]], !dbg [[DBG7]]
+; CHECK-NEXT:    unreachable, !dbg [[DBG7]]
+; CHECK:       26:
+; CHECK-NEXT:    call void @ManyArgs(i32 noundef [[X]], i32 noundef [[Y]], i32 noundef [[Z]]), !dbg [[DBG7]]
+; CHECK-NEXT:    ret void
  ;
  entry:
    %x = load i32, i32* %a, !dbg !11
diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll

index d190c87..8089703 100644 (file)
--- a/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll
@@ -22,21 +22,22 @@ target triple = "x86_64-unknown-linux-gnu"
  define void @Store(i32* nocapture %p, i32 %x) nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @Store(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG1:![0-9]+]]
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = ptrtoint i32* [[P:%.*]] to i64, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP7:%.*]] = add i64 [[TMP5]], 17592186044416, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    store i32 [[TMP2]], i32* [[TMP6]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP9:%.*]] = bitcast i32* [[P]] to i8*, !dbg [[DBG1]]
-; CHECK-DAG:    call void @__msan_maybe_store_origin_4(i32 zeroext [[TMP2]], i8* [[TMP9]], i32 zeroext [[TMP3]]), !dbg [[DBG1]]
-; CHECK-DAG:    store i32 [[X:%.*]], i32* [[P]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG1:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint i32* [[P:%.*]] to i64, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 17592186044416, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TMP6]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[P]] to i8*, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @__msan_maybe_store_origin_4(i32 zeroext [[TMP2]], i8* [[TMP9]], i32 zeroext [[TMP3]]), !dbg [[DBG1]]
+; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[P]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    ret void
  ;
  entry:
    store i32 %x, i32* %p, align 4, !dbg !10
@@ -46,34 +47,35 @@ entry:
  define void @LoadAndCmp(i32* nocapture %a) nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @LoadAndCmp(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = load i32, i32* [[A:%.*]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = ptrtoint i32* [[A]] to i64, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP6:%.*]] = add i64 [[TMP4]], 17592186044416, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSLD:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP9:%.*]] = xor i32 [[TMP2]], 0, !dbg [[DBG7:![0-9]+]]
-; CHECK-DAG:    [[TMP10:%.*]] = or i32 [[_MSLD]], 0, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP12:%.*]] = xor i32 [[TMP10]], -1, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP13:%.*]] = and i32 [[TMP12]], [[TMP9]], !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0, !dbg [[DBG7]]
-; CHECK-DAG:    [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]], !dbg [[DBG7]]
-; CHECK-DAG:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP2]], 0, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP15:%.*]] = zext i1 [[_MSPROP_ICMP]] to i8, !dbg [[DBG8:![0-9]+]]
-; CHECK-DAG:    call void @__msan_maybe_warning_1(i8 zeroext [[TMP15]], i32 zeroext [[TMP8]]), !dbg [[DBG8]]
-; CHECK-DAG:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]], !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A:%.*]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint i32* [[A]] to i64, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP4]], 17592186044416, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, i32* [[TMP5]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP2]], 0, !dbg [[DBG7:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[_MSLD]], 0, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], -1, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = and i32 [[TMP12]], [[TMP9]], !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0, !dbg [[DBG7]]
+; CHECK-NEXT:    [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]], !dbg [[DBG7]]
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP2]], 0, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = zext i1 [[_MSPROP_ICMP]] to i8, !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT:    call void @__msan_maybe_warning_1(i8 zeroext [[TMP15]], i32 zeroext [[TMP8]]), !dbg [[DBG8]]
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]], !dbg [[DBG8]]
  ; CHECK:       if.then:
-; CHECK-DAG:    store i64 0, i64* @__msan_va_arg_overflow_size_tls, align 8
-; CHECK-DAG:    tail call void (...) @foo() #[[ATTR5:[0-9]+]]
-; CHECK-DAG:    br label [[IF_END]]
+; CHECK-NEXT:    store i64 0, i64* @__msan_va_arg_overflow_size_tls, align 8
+; CHECK-NEXT:    tail call void (...) @foo() #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    br label [[IF_END]]
  ; CHECK:       if.end:
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    ret void
  ;
  entry:
    %0 = load i32, i32* %a, align 4, !dbg !10
@@ -91,9 +93,10 @@ declare void @foo(...)
  define i32 @ReturnInt() nounwind uwtable readnone sanitize_memory {
  ; CHECK-LABEL: @ReturnInt(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    store i32 0, i32* @__msan_retval_origin_tls, align 4, !dbg [[DBG1]]
-; CHECK-DAG:    ret i32 123, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    store i32 0, i32* @__msan_retval_origin_tls, align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    ret i32 123, !dbg [[DBG1]]
  ;
  entry:
    ret i32 123, !dbg !10
@@ -102,23 +105,24 @@ entry:
  define void @CopyRetVal(i32* nocapture %a) nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @CopyRetVal(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[CALL:%.*]] = tail call i32 @ReturnInt() #[[ATTR5]], !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSRET:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP2:%.*]] = load i32, i32* @__msan_retval_origin_tls, align 4, !dbg [[DBG7]]
-; CHECK-DAG:    call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP3:%.*]] = ptrtoint i32* [[A:%.*]] to i64, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to i32*, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP6:%.*]] = add i64 [[TMP4]], 17592186044416, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to i32*, !dbg [[DBG7]]
-; CHECK-DAG:    store i32 [[_MSRET]], i32* [[TMP5]], align 4, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP8:%.*]] = bitcast i32* [[A]] to i8*, !dbg [[DBG7]]
-; CHECK-DAG:    call void @__msan_maybe_store_origin_4(i32 zeroext [[_MSRET]], i8* [[TMP8]], i32 zeroext [[TMP2]]), !dbg [[DBG7]]
-; CHECK-DAG:    store i32 [[CALL]], i32* [[A]], align 4, !dbg [[DBG7]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @ReturnInt() #[[ATTR5]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSRET:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* @__msan_retval_origin_tls, align 4, !dbg [[DBG7]]
+; CHECK-NEXT:    call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint i32* [[A:%.*]] to i64, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to i32*, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP4]], 17592186044416, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to i32*, !dbg [[DBG7]]
+; CHECK-NEXT:    store i32 [[_MSRET]], i32* [[TMP5]], align 4, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A]] to i8*, !dbg [[DBG7]]
+; CHECK-NEXT:    call void @__msan_maybe_store_origin_4(i32 zeroext [[_MSRET]], i8* [[TMP8]], i32 zeroext [[TMP2]]), !dbg [[DBG7]]
+; CHECK-NEXT:    store i32 [[CALL]], i32* [[A]], align 4, !dbg [[DBG7]]
+; CHECK-NEXT:    ret void
  ;
  entry:
    %call = tail call i32 @ReturnInt() nounwind, !dbg !10
@@ -131,33 +135,34 @@ entry:
  define void @SExt(i32* nocapture %a, i16* nocapture %b) nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @SExt(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i64*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = load i16, i16* [[B:%.*]], align 2, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = ptrtoint i16* [[B]] to i64, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to i16*, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP8:%.*]] = add i64 [[TMP6]], 17592186044416, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP9:%.*]] = and i64 [[TMP8]], -4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSLD:%.*]] = load i16, i16* [[TMP7]], align 2, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSPROP:%.*]] = sext i16 [[_MSLD]] to i32, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP12:%.*]] = sext i16 [[TMP4]] to i32, !dbg [[DBG7]]
-; CHECK-DAG:    call void @__msan_maybe_warning_8(i64 zeroext [[TMP2]], i32 zeroext [[TMP3]]), !dbg [[DBG8]]
-; CHECK-DAG:    [[TMP13:%.*]] = ptrtoint i32* [[A:%.*]] to i64, !dbg [[DBG8]]
-; CHECK-DAG:    [[TMP14:%.*]] = xor i64 [[TMP13]], 87960930222080, !dbg [[DBG8]]
-; CHECK-DAG:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to i32*, !dbg [[DBG8]]
-; CHECK-DAG:    [[TMP16:%.*]] = add i64 [[TMP14]], 17592186044416, !dbg [[DBG8]]
-; CHECK-DAG:    [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to i32*, !dbg [[DBG8]]
-; CHECK-DAG:    store i32 [[_MSPROP]], i32* [[TMP15]], align 4, !dbg [[DBG8]]
-; CHECK-DAG:    [[TMP18:%.*]] = bitcast i32* [[A]] to i8*, !dbg [[DBG8]]
-; CHECK-DAG:    call void @__msan_maybe_store_origin_4(i32 zeroext [[_MSPROP]], i8* [[TMP18]], i32 zeroext [[TMP11]]), !dbg [[DBG8]]
-; CHECK-DAG:    store i32 [[TMP12]], i32* [[A]], align 4, !dbg [[DBG8]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i64*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[B:%.*]], align 2, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint i16* [[B]] to i64, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to i16*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP6]], 17592186044416, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], -4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i16, i16* [[TMP7]], align 2, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = sext i16 [[_MSLD]] to i32, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sext i16 [[TMP4]] to i32, !dbg [[DBG7]]
+; CHECK-NEXT:    call void @__msan_maybe_warning_8(i64 zeroext [[TMP2]], i32 zeroext [[TMP3]]), !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint i32* [[A:%.*]] to i64, !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 87960930222080, !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to i32*, !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[TMP14]], 17592186044416, !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to i32*, !dbg [[DBG8]]
+; CHECK-NEXT:    store i32 [[_MSPROP]], i32* [[TMP15]], align 4, !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[A]] to i8*, !dbg [[DBG8]]
+; CHECK-NEXT:    call void @__msan_maybe_store_origin_4(i32 zeroext [[_MSPROP]], i8* [[TMP18]], i32 zeroext [[TMP11]]), !dbg [[DBG8]]
+; CHECK-NEXT:    store i32 [[TMP12]], i32* [[A]], align 4, !dbg [[DBG8]]
+; CHECK-NEXT:    ret void
  ;
  entry:
    %0 = load i16, i16* %b, align 2, !dbg !10
@@ -169,8 +174,9 @@ entry:
  define void @MemSet(i8* nocapture %x) nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @MemSet(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = call i8* @__msan_memset(i8* [[X:%.*]], i32 42, i64 10), !dbg [[DBG1]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i8* @__msan_memset(i8* [[X:%.*]], i32 42, i64 10), !dbg [[DBG1]]
+; CHECK-NEXT:    ret void
  ;
  entry:
    call void @llvm.memset.p0i8.i64(i8* %x, i8 42, i64 10, i1 false), !dbg !10
@@ -184,10 +190,11 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
  define void @MemCpy(i8* nocapture %x, i8* nocapture %y) nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @MemCpy(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i64*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = call i8* @__msan_memcpy(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10), !dbg [[DBG1]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i64*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8* @__msan_memcpy(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10), !dbg [[DBG1]]
+; CHECK-NEXT:    ret void
  ;
  entry:
    call void @llvm.memcpy.p0i8.p0i8.i64(i8* %x, i8* %y, i64 10, i1 false), !dbg !10
@@ -200,8 +207,9 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) n
  define void @MemSetInline(i8* nocapture %x) nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @MemSetInline(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = call i8* @__msan_memset(i8* [[X:%.*]], i32 42, i64 10), !dbg [[DBG1]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i8* @__msan_memset(i8* [[X:%.*]], i32 42, i64 10), !dbg [[DBG1]]
+; CHECK-NEXT:    ret void
  ;
  entry:
    call void @llvm.memset.inline.p0i8.i64(i8* %x, i8 42, i64 10, i1 false), !dbg !10
@@ -214,10 +222,11 @@ declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
  define void @MemCpyInline(i8* nocapture %x, i8* nocapture %y) nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @MemCpyInline(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i64*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = call i8* @__msan_memcpy(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10), !dbg [[DBG1]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i64*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8* @__msan_memcpy(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10), !dbg [[DBG1]]
+; CHECK-NEXT:    ret void
  ;
  entry:
    call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* %x, i8* %y, i64 10, i1 false), !dbg !10
@@ -230,10 +239,11 @@ declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64
  define void @MemMove(i8* nocapture %x, i8* nocapture %y) nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @MemMove(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i64*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = call i8* @__msan_memmove(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10), !dbg [[DBG1]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i64*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8* @__msan_memmove(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10), !dbg [[DBG1]]
+; CHECK-NEXT:    ret void
  ;
  entry:
    call void @llvm.memmove.p0i8.p0i8.i64(i8* %x, i8* %y, i64 10, i1 false), !dbg !10
@@ -249,8 +259,9 @@ declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture w
  
  define void @atomic_memcpy(i8* nocapture %x, i8* nocapture %y) nounwind {
  ; CHECK-LABEL: @atomic_memcpy(
-; CHECK-DAG:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[X:%.*]], i8* align 2 [[Y:%.*]], i64 16, i32 1), !dbg [[DBG1]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[X:%.*]], i8* align 2 [[Y:%.*]], i64 16, i32 1), !dbg [[DBG1]]
+; CHECK-NEXT:    ret void
  ;
    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1), !dbg !10
    ret void
@@ -258,8 +269,9 @@ define void @atomic_memcpy(i8* nocapture %x, i8* nocapture %y) nounwind {
  
  define void @atomic_memmove(i8* nocapture %x, i8* nocapture %y) nounwind {
  ; CHECK-LABEL: @atomic_memmove(
-; CHECK-DAG:    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[X:%.*]], i8* align 2 [[Y:%.*]], i64 16, i32 1), !dbg [[DBG1]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[X:%.*]], i8* align 2 [[Y:%.*]], i64 16, i32 1), !dbg [[DBG1]]
+; CHECK-NEXT:    ret void
  ;
    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %x, i8* align 2 %y, i64 16, i32 1), !dbg !10
    ret void
@@ -267,8 +279,9 @@ define void @atomic_memmove(i8* nocapture %x, i8* nocapture %y) nounwind {
  
  define void @atomic_memset(i8* nocapture %x) nounwind {
  ; CHECK-LABEL: @atomic_memset(
-; CHECK-DAG:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 [[X:%.*]], i8 88, i64 16, i32 1), !dbg [[DBG1]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 [[X:%.*]], i8 88, i64 16, i32 1), !dbg [[DBG1]]
+; CHECK-NEXT:    ret void
  ;
    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %x, i8 88, i64 16, i32 1), !dbg !10
    ret void
@@ -280,23 +293,24 @@ define void @atomic_memset(i8* nocapture %x) nounwind {
  define i32 @Select(i32 %a, i32 %b, i1 %c) nounwind uwtable readnone sanitize_memory {
  ; CHECK-LABEL: @Select(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load i1, i1* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i1*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 16) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_param_tls to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP6:%.*]] = select i1 [[C:%.*]], i32 [[TMP2]], i32 [[TMP4]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP7:%.*]] = xor i32 [[A:%.*]], [[B:%.*]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP2]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP9:%.*]] = or i32 [[TMP8]], [[TMP4]], !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSPROP_SELECT:%.*]] = select i1 [[TMP0]], i32 [[TMP9]], i32 [[TMP6]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP10:%.*]] = select i1 [[C]], i32 [[TMP3]], i32 [[TMP5]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP11:%.*]] = select i1 [[TMP0]], i32 [[TMP1]], i32 [[TMP10]], !dbg [[DBG1]]
-; CHECK-DAG:    [[COND:%.*]] = select i1 [[C]], i32 [[A]], i32 [[B]], !dbg [[DBG1]]
-; CHECK-DAG:    store i32 [[_MSPROP_SELECT]], i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
-; CHECK-DAG:    store i32 [[TMP11]], i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    ret i32 [[COND]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i1, i1* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i1*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 16) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_param_tls to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[C:%.*]], i32 [[TMP2]], i32 [[TMP4]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[A:%.*]], [[B:%.*]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP2]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i32 [[TMP8]], [[TMP4]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select i1 [[TMP0]], i32 [[TMP9]], i32 [[TMP6]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[C]], i32 [[TMP3]], i32 [[TMP5]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP0]], i32 [[TMP1]], i32 [[TMP10]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[C]], i32 [[A]], i32 [[B]], !dbg [[DBG1]]
+; CHECK-NEXT:    store i32 [[_MSPROP_SELECT]], i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
+; CHECK-NEXT:    store i32 [[TMP11]], i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret i32 [[COND]]
  ;
  entry:
    %cond = select i1 %c, i32 %a, i32 %b, !dbg !10
@@ -309,27 +323,28 @@ entry:
  define <8 x i16> @SelectVector(<8 x i16> %a, <8 x i16> %b, <8 x i1> %c) nounwind uwtable readnone sanitize_memory {
  ; CHECK-LABEL: @SelectVector(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load <8 x i1>, <8 x i1>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i1>*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 32) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 16) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP6:%.*]] = select <8 x i1> [[C:%.*]], <8 x i16> [[TMP2]], <8 x i16> [[TMP4]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP7:%.*]] = xor <8 x i16> [[A:%.*]], [[B:%.*]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP8:%.*]] = or <8 x i16> [[TMP7]], [[TMP2]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP9:%.*]] = or <8 x i16> [[TMP8]], [[TMP4]], !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP0]], <8 x i16> [[TMP9]], <8 x i16> [[TMP6]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP10:%.*]] = bitcast <8 x i1> [[C]] to i8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP12:%.*]] = bitcast <8 x i1> [[TMP0]] to i8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP13:%.*]] = icmp ne i8 [[TMP12]], 0, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP14:%.*]] = select i1 [[TMP11]], i32 [[TMP3]], i32 [[TMP5]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP15:%.*]] = select i1 [[TMP13]], i32 [[TMP1]], i32 [[TMP14]], !dbg [[DBG1]]
-; CHECK-DAG:    [[COND:%.*]] = select <8 x i1> [[C]], <8 x i16> [[A]], <8 x i16> [[B]], !dbg [[DBG1]]
-; CHECK-DAG:    store <8 x i16> [[_MSPROP_SELECT]], <8 x i16>* bitcast ([100 x i64]* @__msan_retval_tls to <8 x i16>*), align 8
-; CHECK-DAG:    store i32 [[TMP15]], i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    ret <8 x i16> [[COND]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i1>, <8 x i1>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <8 x i1>*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 32) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 16) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[C:%.*]], <8 x i16> [[TMP2]], <8 x i16> [[TMP4]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i16> [[A:%.*]], [[B:%.*]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i16> [[TMP7]], [[TMP2]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i16> [[TMP8]], [[TMP4]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP0]], <8 x i16> [[TMP9]], <8 x i16> [[TMP6]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i1> [[C]] to i8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i1> [[TMP0]] to i8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i8 [[TMP12]], 0, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP11]], i32 [[TMP3]], i32 [[TMP5]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP13]], i32 [[TMP1]], i32 [[TMP14]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[COND:%.*]] = select <8 x i1> [[C]], <8 x i16> [[A]], <8 x i16> [[B]], !dbg [[DBG1]]
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP_SELECT]], <8 x i16>* bitcast ([100 x i64]* @__msan_retval_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    store i32 [[TMP15]], i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <8 x i16> [[COND]]
  ;
  entry:
    %cond = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b, !dbg !10
@@ -342,12 +357,13 @@ entry:
  define i8* @IntToPtr(i64 %x) nounwind uwtable readnone sanitize_memory {
  ; CHECK-LABEL: @IntToPtr(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = inttoptr i64 [[X:%.*]] to i8*, !dbg [[DBG1]]
-; CHECK-DAG:    store i64 [[TMP0]], i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_retval_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    store i32 [[TMP1]], i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    ret i8* [[TMP2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[X:%.*]] to i8*, !dbg [[DBG1]]
+; CHECK-NEXT:    store i64 [[TMP0]], i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_retval_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    store i32 [[TMP1]], i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret i8* [[TMP2]]
  ;
  entry:
    %0 = inttoptr i64 %x to i8*, !dbg !10
@@ -361,15 +377,16 @@ entry:
  define i32 @Div(i32 %a, i32 %b) nounwind uwtable readnone sanitize_memory {
  ; CHECK-LABEL: @Div(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_param_tls to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    call void @__msan_maybe_warning_4(i32 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]]
-; CHECK-DAG:    [[DIV:%.*]] = udiv i32 [[A:%.*]], [[B:%.*]], !dbg [[DBG1]]
-; CHECK-DAG:    store i32 [[TMP2]], i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
-; CHECK-DAG:    store i32 [[TMP3]], i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    ret i32 [[DIV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 8) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_param_tls to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    call void @__msan_maybe_warning_4(i32 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]]
+; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[A:%.*]], [[B:%.*]], !dbg [[DBG1]]
+; CHECK-NEXT:    store i32 [[TMP2]], i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
+; CHECK-NEXT:    store i32 [[TMP3]], i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret i32 [[DIV]]
  ;
  entry:
    %div = udiv i32 %a, %b, !dbg !10
@@ -384,27 +401,28 @@ entry:
  
  define i32 @ShadowLoadAlignmentLarge() nounwind uwtable sanitize_memory {
  ; CHECK-LABEL: @ShadowLoadAlignmentLarge(
-; CHECK-DAG:    [[Y:%.*]] = alloca i32, align 64, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = ptrtoint i32* [[Y]] to i64, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to i8*, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = and i64 [[TMP4]], -4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* align 64 [[TMP3]], i8 -1, i64 4, i1 false), !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP7:%.*]] = bitcast i32* [[Y]] to i8*, !dbg [[DBG1]]
-; CHECK-DAG:    call void @__msan_set_alloca_origin_with_descr(i8* [[TMP7]], i64 4, i8* bitcast (i32* @[[GLOB0:[0-9]+]] to i8*), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @[[GLOB1:[0-9]+]], i32 0, i32 0)), !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP8:%.*]] = load volatile i32, i32* [[Y]], align 64, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP9:%.*]] = ptrtoint i32* [[Y]] to i64, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP10:%.*]] = xor i64 [[TMP9]], 87960930222080, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to i32*, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP12:%.*]] = add i64 [[TMP10]], 17592186044416, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to i32*, !dbg [[DBG7]]
-; CHECK-DAG:    [[_MSLD:%.*]] = load i32, i32* [[TMP11]], align 64, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 64, !dbg [[DBG7]]
-; CHECK-DAG:    store i32 [[_MSLD]], i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
-; CHECK-DAG:    store i32 [[TMP14]], i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    ret i32 [[TMP8]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[Y:%.*]] = alloca i32, align 64, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint i32* [[Y]] to i64, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to i8*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], -4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 64 [[TMP3]], i8 -1, i64 4, i1 false), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[Y]] to i8*, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @__msan_set_alloca_origin_with_descr(i8* [[TMP7]], i64 4, i8* bitcast (i32* @[[GLOB0:[0-9]+]] to i8*), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @[[GLOB1:[0-9]+]], i32 0, i32 0)), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load volatile i32, i32* [[Y]], align 64, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint i32* [[Y]] to i64, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP9]], 87960930222080, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to i32*, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP10]], 17592186044416, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to i32*, !dbg [[DBG7]]
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, i32* [[TMP11]], align 64, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 64, !dbg [[DBG7]]
+; CHECK-NEXT:    store i32 [[_MSLD]], i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
+; CHECK-NEXT:    store i32 [[TMP14]], i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret i32 [[TMP8]]
  ;
    %y = alloca i32, align 64, !dbg !10
    %1 = load volatile i32, i32* %y, align 64, !dbg !11
@@ -415,16 +433,17 @@ define i32 @ShadowLoadAlignmentLarge() nounwind uwtable sanitize_memory {
  
  define i32 @ExtractElement(<4 x i32> %vec, i32 %idx) sanitize_memory {
  ; CHECK-LABEL: @ExtractElement(
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 16) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP3]], i32 [[IDX:%.*]], !dbg [[DBG1]]
-; CHECK-DAG:    call void @__msan_maybe_warning_4(i32 zeroext [[TMP1]], i32 zeroext [[TMP2]]), !dbg [[DBG1]]
-; CHECK-DAG:    [[X:%.*]] = extractelement <4 x i32> [[VEC:%.*]], i32 [[IDX]], !dbg [[DBG1]]
-; CHECK-DAG:    store i32 [[_MSPROP]], i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
-; CHECK-DAG:    store i32 [[TMP4]], i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    ret i32 [[X]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 16) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP3]], i32 [[IDX:%.*]], !dbg [[DBG1]]
+; CHECK-NEXT:    call void @__msan_maybe_warning_4(i32 zeroext [[TMP1]], i32 zeroext [[TMP2]]), !dbg [[DBG1]]
+; CHECK-NEXT:    [[X:%.*]] = extractelement <4 x i32> [[VEC:%.*]], i32 [[IDX]], !dbg [[DBG1]]
+; CHECK-NEXT:    store i32 [[_MSPROP]], i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
+; CHECK-NEXT:    store i32 [[TMP4]], i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret i32 [[X]]
  ;
    %x = extractelement <4 x i32> %vec, i32 %idx, !dbg !10
    ret i32 %x
@@ -433,22 +452,23 @@ define i32 @ExtractElement(<4 x i32> %vec, i32 %idx) sanitize_memory {
  
  define <4 x i32> @InsertElement(<4 x i32> %vec, i32 %idx, i32 %x) sanitize_memory {
  ; CHECK-LABEL: @InsertElement(
-; CHECK-DAG:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 16) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP6:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 24) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP5]], i32 [[IDX:%.*]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP7:%.*]] = icmp ne i32 [[TMP5]], 0, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 [[TMP4]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP9:%.*]] = icmp ne i32 [[TMP1]], 0, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP2]], i32 [[TMP8]], !dbg [[DBG1]]
-; CHECK-DAG:    call void @__msan_maybe_warning_4(i32 zeroext [[TMP1]], i32 zeroext [[TMP2]]), !dbg [[DBG1]]
-; CHECK-DAG:    [[VEC1:%.*]] = insertelement <4 x i32> [[VEC:%.*]], i32 [[X:%.*]], i32 [[IDX]], !dbg [[DBG1]]
-; CHECK-DAG:    store <4 x i32> [[_MSPROP]], <4 x i32>* bitcast ([100 x i64]* @__msan_retval_tls to <4 x i32>*), align 8
-; CHECK-DAG:    store i32 [[TMP10]], i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    ret <4 x i32> [[VEC1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 16) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 24) to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 24) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP5]], i32 [[IDX:%.*]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP5]], 0, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 [[TMP4]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP1]], 0, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP2]], i32 [[TMP8]], !dbg [[DBG1]]
+; CHECK-NEXT:    call void @__msan_maybe_warning_4(i32 zeroext [[TMP1]], i32 zeroext [[TMP2]]), !dbg [[DBG1]]
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <4 x i32> [[VEC:%.*]], i32 [[X:%.*]], i32 [[IDX]], !dbg [[DBG1]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], <4 x i32>* bitcast ([100 x i64]* @__msan_retval_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    store i32 [[TMP10]], i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <4 x i32> [[VEC1]]
  ;
    %vec1 = insertelement <4 x i32> %vec, i32 %x, i32 %idx, !dbg !10
    ret <4 x i32> %vec1
@@ -457,18 +477,19 @@ define <4 x i32> @InsertElement(<4 x i32> %vec, i32 %idx, i32 %x) sanitize_memor
  
  define <4 x i32> @ShuffleVector(<4 x i32> %vec, <4 x i32> %vec1) sanitize_memory {
  ; CHECK-LABEL: @ShuffleVector(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 16) to i32*), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP6:%.*]] = icmp ne i128 [[TMP5]], 0, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP7:%.*]] = select i1 [[TMP6]], i32 [[TMP4]], i32 [[TMP2]], !dbg [[DBG1]]
-; CHECK-DAG:    [[VEC2:%.*]] = shufflevector <4 x i32> [[VEC:%.*]], <4 x i32> [[VEC1:%.*]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>, !dbg [[DBG1]]
-; CHECK-DAG:    store <4 x i32> [[_MSPROP]], <4 x i32>* bitcast ([100 x i64]* @__msan_retval_tls to <4 x i32>*), align 8
-; CHECK-DAG:    store i32 [[TMP7]], i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    ret <4 x i32> [[VEC2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* inttoptr (i64 add (i64 ptrtoint ([200 x i32]* @__msan_param_origin_tls to i64), i64 16) to i32*), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i128 [[TMP5]], 0, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], i32 [[TMP4]], i32 [[TMP2]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[VEC2:%.*]] = shufflevector <4 x i32> [[VEC:%.*]], <4 x i32> [[VEC1:%.*]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>, !dbg [[DBG1]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], <4 x i32>* bitcast ([100 x i64]* @__msan_retval_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    store i32 [[TMP7]], i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret <4 x i32> [[VEC2]]
  ;
    %vec2 = shufflevector <4 x i32> %vec, <4 x i32> %vec1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>, !dbg !10
    ret <4 x i32> %vec2
@@ -482,79 +503,80 @@ declare void @llvm.va_start(i8*) nounwind
  define void @VAStart(i32 %x, ...) sanitize_memory {
  ; CHECK-LABEL: @VAStart(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = load i64, i64* @__msan_va_arg_overflow_size_tls, align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = add i64 176, [[TMP0]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = alloca i8, i64 [[TMP1]], align 1, !dbg [[DBG1]]
-; CHECK-DAG:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 bitcast ([100 x i64]* @__msan_va_arg_tls to i8*), i64 [[TMP1]], i1 false), !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = alloca i8, i64 [[TMP1]], align 1, !dbg [[DBG1]]
-; CHECK-DAG:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 bitcast ([200 x i32]* @__msan_va_arg_origin_tls to i8*), i64 [[TMP1]], i1 false), !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_param_tls to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[X_ADDR:%.*]] = alloca i32, align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP6:%.*]] = ptrtoint i32* [[X_ADDR]] to i64, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to i8*, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP9:%.*]] = add i64 [[TMP7]], 17592186044416, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP10:%.*]] = and i64 [[TMP9]], -4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP8]], i8 -1, i64 4, i1 false), !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP12:%.*]] = bitcast i32* [[X_ADDR]] to i8*, !dbg [[DBG1]]
-; CHECK-DAG:    call void @__msan_set_alloca_origin_with_descr(i8* [[TMP12]], i64 4, i8* bitcast (i32* @[[GLOB2:[0-9]+]] to i8*), i8* getelementptr inbounds ([7 x i8], [7 x i8]* @[[GLOB3:[0-9]+]], i32 0, i32 0)), !dbg [[DBG1]]
-; CHECK-DAG:    [[VA:%.*]] = alloca [1 x %struct.__va_list_tag], align 16, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP13:%.*]] = ptrtoint [1 x %struct.__va_list_tag]* [[VA]] to i64, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP14:%.*]] = xor i64 [[TMP13]], 87960930222080, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to i8*, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP16:%.*]] = add i64 [[TMP14]], 17592186044416, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP17:%.*]] = and i64 [[TMP16]], -4, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to i32*, !dbg [[DBG7]]
-; CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* align 16 [[TMP15]], i8 -1, i64 24, i1 false), !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP19:%.*]] = bitcast [1 x %struct.__va_list_tag]* [[VA]] to i8*, !dbg [[DBG7]]
-; CHECK-DAG:    call void @__msan_set_alloca_origin_with_descr(i8* [[TMP19]], i64 24, i8* bitcast (i32* @[[GLOB4:[0-9]+]] to i8*), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @[[GLOB5:[0-9]+]], i32 0, i32 0)), !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP20:%.*]] = ptrtoint i32* [[X_ADDR]] to i64, !dbg [[DBG8]]
-; CHECK-DAG:    [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080, !dbg [[DBG8]]
-; CHECK-DAG:    [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to i32*, !dbg [[DBG8]]
-; CHECK-DAG:    [[TMP23:%.*]] = add i64 [[TMP21]], 17592186044416, !dbg [[DBG8]]
-; CHECK-DAG:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to i32*, !dbg [[DBG8]]
-; CHECK-DAG:    store i32 [[TMP4]], i32* [[TMP22]], align 4, !dbg [[DBG8]]
-; CHECK-DAG:    [[TMP25:%.*]] = bitcast i32* [[X_ADDR]] to i8*, !dbg [[DBG8]]
-; CHECK-DAG:    call void @__msan_maybe_store_origin_4(i32 zeroext [[TMP4]], i8* [[TMP25]], i32 zeroext [[TMP5]]), !dbg [[DBG8]]
-; CHECK-DAG:    store i32 [[X:%.*]], i32* [[X_ADDR]], align 4, !dbg [[DBG8]]
-; CHECK-DAG:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* [[VA]], i32 0, i32 0, !dbg [[DBG9:![0-9]+]]
-; CHECK-DAG:    [[ARRAYDECAY1:%.*]] = bitcast %struct.__va_list_tag* [[ARRAYDECAY]] to i8*, !dbg [[DBG10:![0-9]+]]
-; CHECK-DAG:    [[TMP26:%.*]] = ptrtoint i8* [[ARRAYDECAY1]] to i64, !dbg [[DBG11:![0-9]+]]
-; CHECK-DAG:    [[TMP27:%.*]] = xor i64 [[TMP26]], 87960930222080, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to i8*, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP29:%.*]] = add i64 [[TMP27]], 17592186044416, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to i32*, !dbg [[DBG11]]
-; CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP28]], i8 0, i64 24, i1 false), !dbg [[DBG11]]
-; CHECK-DAG:    call void @llvm.va_start(i8* [[ARRAYDECAY1]]), !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP31:%.*]] = ptrtoint i8* [[ARRAYDECAY1]] to i64, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP32:%.*]] = add i64 [[TMP31]], 16, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to i64**, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP34:%.*]] = load i64*, i64** [[TMP33]], align 8, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP35:%.*]] = ptrtoint i64* [[TMP34]] to i64, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP36:%.*]] = xor i64 [[TMP35]], 87960930222080, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP37:%.*]] = inttoptr i64 [[TMP36]] to i8*, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP38:%.*]] = add i64 [[TMP36]], 17592186044416, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP39:%.*]] = inttoptr i64 [[TMP38]] to i32*, !dbg [[DBG11]]
-; CHECK-DAG:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP37]], i8* align 16 [[TMP2]], i64 176, i1 false), !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP40:%.*]] = bitcast i32* [[TMP39]] to i8*, !dbg [[DBG11]]
-; CHECK-DAG:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP40]], i8* align 16 [[TMP3]], i64 176, i1 false), !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP41:%.*]] = ptrtoint i8* [[ARRAYDECAY1]] to i64, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP42:%.*]] = add i64 [[TMP41]], 8, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to i64**, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP44:%.*]] = load i64*, i64** [[TMP43]], align 8, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP45:%.*]] = ptrtoint i64* [[TMP44]] to i64, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP46:%.*]] = xor i64 [[TMP45]], 87960930222080, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP47:%.*]] = inttoptr i64 [[TMP46]] to i8*, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP48:%.*]] = add i64 [[TMP46]], 17592186044416, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to i32*, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP50:%.*]] = getelementptr i8, i8* [[TMP2]], i32 176, !dbg [[DBG11]]
-; CHECK-DAG:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP47]], i8* align 16 [[TMP50]], i64 [[TMP0]], i1 false), !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP51:%.*]] = getelementptr i8, i8* [[TMP3]], i32 176, !dbg [[DBG11]]
-; CHECK-DAG:    [[TMP52:%.*]] = bitcast i32* [[TMP49]] to i8*, !dbg [[DBG11]]
-; CHECK-DAG:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP52]], i8* align 16 [[TMP51]], i64 [[TMP0]], i1 false), !dbg [[DBG11]]
-; CHECK-DAG:    ret void
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* @__msan_va_arg_overflow_size_tls, align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 176, [[TMP0]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca i8, i64 [[TMP1]], align 1, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 bitcast ([100 x i64]* @__msan_va_arg_tls to i8*), i64 [[TMP1]], i1 false), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = alloca i8, i64 [[TMP1]], align 1, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 bitcast ([200 x i32]* @__msan_va_arg_origin_tls to i8*), i64 [[TMP1]], i1 false), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_param_tls to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__msan_param_origin_tls, i32 0, i32 0), align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[X_ADDR:%.*]] = alloca i32, align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint i32* [[X_ADDR]] to i64, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to i8*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[TMP7]], 17592186044416, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and i64 [[TMP9]], -4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP8]], i8 -1, i64 4, i1 false), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[X_ADDR]] to i8*, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @__msan_set_alloca_origin_with_descr(i8* [[TMP12]], i64 4, i8* bitcast (i32* @[[GLOB2:[0-9]+]] to i8*), i8* getelementptr inbounds ([7 x i8], [7 x i8]* @[[GLOB3:[0-9]+]], i32 0, i32 0)), !dbg [[DBG1]]
+; CHECK-NEXT:    [[VA:%.*]] = alloca [1 x %struct.__va_list_tag], align 16, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint [1 x %struct.__va_list_tag]* [[VA]] to i64, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 87960930222080, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to i8*, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[TMP14]], 17592186044416, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP17:%.*]] = and i64 [[TMP16]], -4, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to i32*, !dbg [[DBG7]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 16 [[TMP15]], i8 -1, i64 24, i1 false), !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast [1 x %struct.__va_list_tag]* [[VA]] to i8*, !dbg [[DBG7]]
+; CHECK-NEXT:    call void @__msan_set_alloca_origin_with_descr(i8* [[TMP19]], i64 24, i8* bitcast (i32* @[[GLOB4:[0-9]+]] to i8*), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @[[GLOB5:[0-9]+]], i32 0, i32 0)), !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP20:%.*]] = ptrtoint i32* [[X_ADDR]] to i64, !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080, !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to i32*, !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[TMP21]], 17592186044416, !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to i32*, !dbg [[DBG8]]
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[TMP22]], align 4, !dbg [[DBG8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i32* [[X_ADDR]] to i8*, !dbg [[DBG8]]
+; CHECK-NEXT:    call void @__msan_maybe_store_origin_4(i32 zeroext [[TMP4]], i8* [[TMP25]], i32 zeroext [[TMP5]]), !dbg [[DBG8]]
+; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[X_ADDR]], align 4, !dbg [[DBG8]]
+; CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* [[VA]], i32 0, i32 0, !dbg [[DBG9:![0-9]+]]
+; CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = bitcast %struct.__va_list_tag* [[ARRAYDECAY]] to i8*, !dbg [[DBG10:![0-9]+]]
+; CHECK-NEXT:    [[TMP26:%.*]] = ptrtoint i8* [[ARRAYDECAY1]] to i64, !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    [[TMP27:%.*]] = xor i64 [[TMP26]], 87960930222080, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to i8*, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP29:%.*]] = add i64 [[TMP27]], 17592186044416, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to i32*, !dbg [[DBG11]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP28]], i8 0, i64 24, i1 false), !dbg [[DBG11]]
+; CHECK-NEXT:    call void @llvm.va_start(i8* [[ARRAYDECAY1]]), !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP31:%.*]] = ptrtoint i8* [[ARRAYDECAY1]] to i64, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP32:%.*]] = add i64 [[TMP31]], 16, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to i64**, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP34:%.*]] = load i64*, i64** [[TMP33]], align 8, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP35:%.*]] = ptrtoint i64* [[TMP34]] to i64, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP36:%.*]] = xor i64 [[TMP35]], 87960930222080, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP37:%.*]] = inttoptr i64 [[TMP36]] to i8*, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP38:%.*]] = add i64 [[TMP36]], 17592186044416, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP39:%.*]] = inttoptr i64 [[TMP38]] to i32*, !dbg [[DBG11]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP37]], i8* align 16 [[TMP2]], i64 176, i1 false), !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP40:%.*]] = bitcast i32* [[TMP39]] to i8*, !dbg [[DBG11]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP40]], i8* align 16 [[TMP3]], i64 176, i1 false), !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP41:%.*]] = ptrtoint i8* [[ARRAYDECAY1]] to i64, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP42:%.*]] = add i64 [[TMP41]], 8, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to i64**, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP44:%.*]] = load i64*, i64** [[TMP43]], align 8, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP45:%.*]] = ptrtoint i64* [[TMP44]] to i64, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP46:%.*]] = xor i64 [[TMP45]], 87960930222080, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP47:%.*]] = inttoptr i64 [[TMP46]] to i8*, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP48:%.*]] = add i64 [[TMP46]], 17592186044416, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to i32*, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, i8* [[TMP2]], i32 176, !dbg [[DBG11]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP47]], i8* align 16 [[TMP50]], i64 [[TMP0]], i1 false), !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, i8* [[TMP3]], i32 176, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP52:%.*]] = bitcast i32* [[TMP49]] to i8*, !dbg [[DBG11]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP52]], i8* align 16 [[TMP51]], i64 [[TMP0]], i1 false), !dbg [[DBG11]]
+; CHECK-NEXT:    ret void
  ;
  entry:
    %x.addr = alloca i32, align 4, !dbg !10
@@ -571,19 +593,20 @@ entry:
  define i32 @NoSanitizeMemory(i32 %x) uwtable {
  ; CHECK-LABEL: @NoSanitizeMemory(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TMP0:%.*]] = xor i32 [[X:%.*]], 0, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = and i32 -1, [[TMP0]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0, !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSPROP_ICMP:%.*]] = and i1 false, [[TMP2]], !dbg [[DBG1]]
-; CHECK-DAG:    [[TOBOOL:%.*]] = icmp eq i32 [[X]], 0, !dbg [[DBG1]]
-; CHECK-DAG:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]], !dbg [[DBG7]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i32 [[X:%.*]], 0, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 -1, [[TMP0]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0, !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSPROP_ICMP:%.*]] = and i1 false, [[TMP2]], !dbg [[DBG1]]
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[X]], 0, !dbg [[DBG1]]
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]], !dbg [[DBG7]]
  ; CHECK:       if.then:
-; CHECK-DAG:    tail call void @bar(), !dbg [[DBG8]]
-; CHECK-DAG:    br label [[IF_END]]
+; CHECK-NEXT:    tail call void @bar(), !dbg [[DBG8]]
+; CHECK-NEXT:    br label [[IF_END]]
  ; CHECK:       if.end:
-; CHECK-DAG:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
-; CHECK-DAG:    store i32 0, i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    ret i32 [[X]]
+; CHECK-NEXT:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
+; CHECK-NEXT:    store i32 0, i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret i32 [[X]]
  ;
  entry:
    %tobool = icmp eq i32 %x, 0, !dbg !10
@@ -603,22 +626,23 @@ declare void @bar()
  define i32 @NoSanitizeMemoryAlloca() {
  ; CHECK-LABEL: @NoSanitizeMemoryAlloca(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[P:%.*]] = alloca i32, align 4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP0:%.*]] = ptrtoint i32* [[P]] to i64, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to i8*, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP4:%.*]] = and i64 [[TMP3]], -4, !dbg [[DBG1]]
-; CHECK-DAG:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to i32*, !dbg [[DBG1]]
-; CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP2]], i8 0, i64 4, i1 false), !dbg [[DBG1]]
-; CHECK-DAG:    store i64 0, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG7]]
-; CHECK-DAG:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8, !dbg [[DBG7]]
-; CHECK-DAG:    [[X:%.*]] = call i32 @NoSanitizeMemoryAllocaHelper(i32* [[P]]), !dbg [[DBG7]]
-; CHECK-DAG:    [[_MSRET:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
-; CHECK-DAG:    [[TMP6:%.*]] = load i32, i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
-; CHECK-DAG:    store i32 0, i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    ret i32 [[X]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[P:%.*]] = alloca i32, align 4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i32* [[P]] to i64, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to i8*, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[TMP3]], -4, !dbg [[DBG1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to i32*, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP2]], i8 0, i64 4, i1 false), !dbg [[DBG1]]
+; CHECK-NEXT:    store i64 0, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG7]]
+; CHECK-NEXT:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8, !dbg [[DBG7]]
+; CHECK-NEXT:    [[X:%.*]] = call i32 @NoSanitizeMemoryAllocaHelper(i32* [[P]]), !dbg [[DBG7]]
+; CHECK-NEXT:    [[_MSRET:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
+; CHECK-NEXT:    store i32 0, i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret i32 [[X]]
  ;
  entry:
    %p = alloca i32, align 4, !dbg !10
@@ -634,14 +658,15 @@ declare i32 @NoSanitizeMemoryAllocaHelper(i32* %p)
  define i32 @NoSanitizeMemoryUndef() {
  ; CHECK-LABEL: @NoSanitizeMemoryUndef(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    store i32 0, i32* bitcast ([100 x i64]* @__msan_param_tls to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8, !dbg [[DBG1]]
-; CHECK-DAG:    [[X:%.*]] = call i32 @NoSanitizeMemoryUndefHelper(i32 undef), !dbg [[DBG1]]
-; CHECK-DAG:    [[_MSRET:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
-; CHECK-DAG:    [[TMP0:%.*]] = load i32, i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
-; CHECK-DAG:    store i32 0, i32* @__msan_retval_origin_tls, align 4
-; CHECK-DAG:    ret i32 [[X]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    store i32 0, i32* bitcast ([100 x i64]* @__msan_param_tls to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8, !dbg [[DBG1]]
+; CHECK-NEXT:    [[X:%.*]] = call i32 @NoSanitizeMemoryUndefHelper(i32 undef), !dbg [[DBG1]]
+; CHECK-NEXT:    [[_MSRET:%.*]] = load i32, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    store i32 0, i32* bitcast ([100 x i64]* @__msan_retval_tls to i32*), align 8
+; CHECK-NEXT:    store i32 0, i32* @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret i32 [[X]]
  ;
  entry:
    %x = call i32 @NoSanitizeMemoryUndefHelper(i32 undef), !dbg !10
@@ -658,20 +683,21 @@ declare void @foo8(i8* nocapture)
  define void @msan() sanitize_memory {
  ; CHECK-LABEL: @msan(
  ; CHECK-NEXT:  entry:
-; CHECK-DAG:    [[TEXT:%.*]] = alloca i8, align 1, !dbg [[DBG1]]
-; CHECK-DAG:    call void @llvm.lifetime.start.p0i8(i64 1, i8* [[TEXT]]), !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP0:%.*]] = ptrtoint i8* [[TEXT]] to i64, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to i8*, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP4:%.*]] = and i64 [[TMP3]], -4, !dbg [[DBG7]]
-; CHECK-DAG:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to i32*, !dbg [[DBG7]]
-; CHECK-DAG:    call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP2]], i8 -1, i64 1, i1 false), !dbg [[DBG7]]
-; CHECK-DAG:    call void @__msan_set_alloca_origin_with_descr(i8* [[TEXT]], i64 1, i8* bitcast (i32* @[[GLOB6:[0-9]+]] to i8*), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @[[GLOB7:[0-9]+]], i32 0, i32 0)), !dbg [[DBG7]]
-; CHECK-DAG:    store i64 0, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG8]]
-; CHECK-DAG:    call void @foo8(i8* [[TEXT]]), !dbg [[DBG8]]
-; CHECK-DAG:    call void @llvm.lifetime.end.p0i8(i64 1, i8* [[TEXT]]), !dbg [[DBG9]]
-; CHECK-DAG:    ret void, !dbg [[DBG10]]
+; CHECK-NEXT:    call void @llvm.donothing(), !dbg [[DBG1]]
+; CHECK-NEXT:    [[TEXT:%.*]] = alloca i8, align 1, !dbg [[DBG1]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 1, i8* [[TEXT]]), !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint i8* [[TEXT]] to i64, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to i8*, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[TMP3]], -4, !dbg [[DBG7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to i32*, !dbg [[DBG7]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP2]], i8 -1, i64 1, i1 false), !dbg [[DBG7]]
+; CHECK-NEXT:    call void @__msan_set_alloca_origin_with_descr(i8* [[TEXT]], i64 1, i8* bitcast (i32* @[[GLOB6:[0-9]+]] to i8*), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @[[GLOB7:[0-9]+]], i32 0, i32 0)), !dbg [[DBG7]]
+; CHECK-NEXT:    store i64 0, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8, !dbg [[DBG8]]
+; CHECK-NEXT:    call void @foo8(i8* [[TEXT]]), !dbg [[DBG8]]
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 1, i8* [[TEXT]]), !dbg [[DBG9]]
+; CHECK-NEXT:    ret void, !dbg [[DBG10]]
  ;
  entry:
    %text = alloca i8, align 1, !dbg !10
diff --git a/llvm/test/Instrumentation/MemorySanitizer/sse-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/sse-intrinsics-x86.ll

index 8e5fecf..00ae71a 100644 (file)
--- a/llvm/test/Instrumentation/MemorySanitizer/sse-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/sse-intrinsics-x86.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux-gnu"
  
  define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_cmp_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer
@@ -24,8 +24,8 @@ declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind
  
  define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_cmp_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -44,8 +44,8 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
  
  define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_comieq_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -63,8 +63,8 @@ declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_comige_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -82,8 +82,8 @@ declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_comigt_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -101,8 +101,8 @@ declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_comile_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -120,8 +120,8 @@ declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_comilt_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -139,8 +139,8 @@ declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_comineq_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -158,7 +158,7 @@ declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse_cvtss2si(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
@@ -179,7 +179,7 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse_cvttss2si(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
@@ -200,7 +200,7 @@ declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
  
  define void @test_x86_sse_ldmxcsr(i8* %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse_ldmxcsr(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint i8* [[A0:%.*]] to i64
  ; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
@@ -226,8 +226,8 @@ declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind
  
  define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_max_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]])
@@ -242,8 +242,8 @@ declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind read
  
  define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_max_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
@@ -259,8 +259,8 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind read
  
  define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_min_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]])
@@ -275,8 +275,8 @@ declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind read
  
  define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_min_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
@@ -292,7 +292,7 @@ declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind read
  
  define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse_movmsk_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
@@ -314,7 +314,7 @@ declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
  
  define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse_rcp_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> [[A0:%.*]])
  ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([100 x i64]* @__msan_retval_tls to <4 x i32>*), align 8
@@ -328,7 +328,7 @@ declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
  
  define <4 x float> @test_x86_sse_rcp_ss(<4 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse_rcp_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> [[A0:%.*]])
  ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([100 x i64]* @__msan_retval_tls to <4 x i32>*), align 8
@@ -342,7 +342,7 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
  
  define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse_rsqrt_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> [[A0:%.*]])
  ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([100 x i64]* @__msan_retval_tls to <4 x i32>*), align 8
@@ -356,7 +356,7 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
  
  define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse_rsqrt_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> [[A0:%.*]])
  ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([100 x i64]* @__msan_retval_tls to <4 x i32>*), align 8
@@ -370,7 +370,7 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
  
  define void @test_x86_sse_stmxcsr(i8* %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse_stmxcsr(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint i8* [[A0:%.*]] to i64
  ; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
@@ -393,8 +393,8 @@ declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
  
  define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_ucomieq_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -412,8 +412,8 @@ declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_ucomige_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -431,8 +431,8 @@ declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_ucomigt_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -450,8 +450,8 @@ declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_ucomile_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -469,8 +469,8 @@ declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_ucomilt_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -488,8 +488,8 @@ declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
  
  define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse_ucomineq_ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
diff --git a/llvm/test/Instrumentation/MemorySanitizer/sse2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/sse2-intrinsics-x86.ll

index a1b937c..e57f293 100644 (file)
--- a/llvm/test/Instrumentation/MemorySanitizer/sse2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/sse2-intrinsics-x86.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux-gnu"
  
  define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_cmp_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <2 x i64> [[TMP3]], zeroinitializer
@@ -24,8 +24,8 @@ declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounw
  
  define <2 x double> @test_x86_sse2_cmp_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_cmp_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -44,8 +44,8 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounw
  
  define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_comieq_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -63,8 +63,8 @@ declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readno
  
  define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_comige_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -82,8 +82,8 @@ declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readno
  
  define i32 @test_x86_sse2_comigt_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_comigt_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -101,8 +101,8 @@ declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readno
  
  define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_comile_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -120,8 +120,8 @@ declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readno
  
  define i32 @test_x86_sse2_comilt_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_comilt_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -139,8 +139,8 @@ declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readno
  
  define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_comineq_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -158,7 +158,7 @@ declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readn
  
  define <4 x i32> @test_x86_sse2_cvtpd2dq(<2 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvtpd2dq(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
@@ -179,7 +179,7 @@ declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
  
  define <2 x i64> @test_mm_cvtpd_epi32_zext(<2 x double> %a0) nounwind #0 {
  ; CHECK-LABEL: @test_mm_cvtpd_epi32_zext(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
@@ -203,7 +203,7 @@ define <2 x i64> @test_mm_cvtpd_epi32_zext(<2 x double> %a0) nounwind #0 {
  
  define <2 x i64> @test_mm_cvtpd_epi32_zext_load(<2 x double>* %p0) nounwind #0 {
  ; CHECK-LABEL: @test_mm_cvtpd_epi32_zext_load(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]]
@@ -239,7 +239,7 @@ define <2 x i64> @test_mm_cvtpd_epi32_zext_load(<2 x double>* %p0) nounwind #0 {
  
  define <4 x float> @test_x86_sse2_cvtpd2ps(<2 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvtpd2ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
@@ -259,7 +259,7 @@ declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
  
  define <4 x float> @test_x86_sse2_cvtpd2ps_zext(<2 x double> %a0) nounwind #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvtpd2ps_zext(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
@@ -280,7 +280,7 @@ define <4 x float> @test_x86_sse2_cvtpd2ps_zext(<2 x double> %a0) nounwind #0 {
  
  define <4 x float> @test_x86_sse2_cvtpd2ps_zext_load(<2 x double>* %p0) nounwind #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvtpd2ps_zext_load(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]]
@@ -313,7 +313,7 @@ define <4 x float> @test_x86_sse2_cvtpd2ps_zext_load(<2 x double>* %p0) nounwind
  
  define <4 x i32> @test_x86_sse2_cvtps2dq(<4 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvtps2dq(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
@@ -334,7 +334,7 @@ declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
  
  define i32 @test_x86_sse2_cvtsd2si(<2 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvtsd2si(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
@@ -355,8 +355,8 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
  
  define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvtsd2ss(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
  ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
@@ -378,8 +378,8 @@ declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind
  
  define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, <2 x double>* %p1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvtsd2ss_load(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
@@ -412,8 +412,8 @@ define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, <2 x double>* %
  
  define <4 x float> @test_x86_sse2_cvtsd2ss_load_optsize(<4 x float> %a0, <2 x double>* %p1) optsize #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvtsd2ss_load_optsize(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
@@ -446,7 +446,7 @@ define <4 x float> @test_x86_sse2_cvtsd2ss_load_optsize(<4 x float> %a0, <2 x do
  
  define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvttpd2dq(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
@@ -467,7 +467,7 @@ declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
  
  define <2 x i64> @test_mm_cvttpd_epi32_zext(<2 x double> %a0) nounwind #0 {
  ; CHECK-LABEL: @test_mm_cvttpd_epi32_zext(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
@@ -491,7 +491,7 @@ define <2 x i64> @test_mm_cvttpd_epi32_zext(<2 x double> %a0) nounwind #0 {
  
  define <2 x i64> @test_mm_cvttpd_epi32_zext_load(<2 x double>* %p0) nounwind #0 {
  ; CHECK-LABEL: @test_mm_cvttpd_epi32_zext_load(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]]
@@ -527,7 +527,7 @@ define <2 x i64> @test_mm_cvttpd_epi32_zext_load(<2 x double>* %p0) nounwind #0
  
  define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvttps2dq(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
@@ -548,7 +548,7 @@ declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
  
  define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_cvttsd2si(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
@@ -569,8 +569,8 @@ declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
  
  define <2 x double> @test_x86_sse2_max_pd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_max_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]])
@@ -585,8 +585,8 @@ declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind
  
  define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_max_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP3]], <2 x i32> <i32 2, i32 1>
@@ -602,8 +602,8 @@ declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind
  
  define <2 x double> @test_x86_sse2_min_pd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_min_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]])
@@ -618,8 +618,8 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
  
  define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_min_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP3]], <2 x i32> <i32 2, i32 1>
@@ -635,7 +635,7 @@ declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind
  
  define i32 @test_x86_sse2_movmsk_pd(<2 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_movmsk_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
@@ -656,8 +656,8 @@ declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
  
  define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_packssdw_128(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
@@ -689,8 +689,8 @@ define <8 x i16> @test_x86_sse2_packssdw_128_fold() #0 {
  
  define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_packsswb_128(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
@@ -722,8 +722,8 @@ define <16 x i8> @test_x86_sse2_packsswb_128_fold() #0 {
  
  define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_packuswb_128(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
@@ -755,8 +755,8 @@ define <16 x i8> @test_x86_sse2_packuswb_128_fold() #0 {
  
  define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_pavg_b(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i8>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <16 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i8>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <16 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> [[A0:%.*]], <16 x i8> [[A1:%.*]])
@@ -771,8 +771,8 @@ declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
  
  define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_pavg_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
@@ -787,8 +787,8 @@ declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
  
  define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_pmadd_wd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
@@ -806,7 +806,7 @@ declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnon
  
  define i32 @test_x86_sse2_pmovmskb_128(<16 x i8> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_pmovmskb_128(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
@@ -827,8 +827,8 @@ declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
  
  define <8 x i16> @test_x86_sse2_pmulh_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_pmulh_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
@@ -843,8 +843,8 @@ declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
  
  define <8 x i16> @test_x86_sse2_pmulhu_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_pmulhu_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]])
@@ -859,8 +859,8 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnon
  
  define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psad_bw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i8>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <16 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i8>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <16 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
@@ -879,8 +879,8 @@ declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
  
  define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psll_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -901,8 +901,8 @@ declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
  
  define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psll_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -923,8 +923,8 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
  
  define <8 x i16> @test_x86_sse2_psll_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psll_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -945,7 +945,7 @@ declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
  
  define <4 x i32> @test_x86_sse2_pslli_d(<4 x i32> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_pslli_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer
@@ -961,7 +961,7 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
  
  define <2 x i64> @test_x86_sse2_pslli_q(<2 x i64> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_pslli_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP2]], zeroinitializer
@@ -977,7 +977,7 @@ declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
  
  define <8 x i16> @test_x86_sse2_pslli_w(<8 x i16> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_pslli_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i16> [[TMP2]], zeroinitializer
@@ -993,8 +993,8 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
  
  define <4 x i32> @test_x86_sse2_psra_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psra_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -1015,8 +1015,8 @@ declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
  
  define <8 x i16> @test_x86_sse2_psra_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psra_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -1037,7 +1037,7 @@ declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
  
  define <4 x i32> @test_x86_sse2_psrai_d(<4 x i32> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psrai_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer
@@ -1053,7 +1053,7 @@ declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
  
  define <8 x i16> @test_x86_sse2_psrai_w(<8 x i16> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psrai_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i16> [[TMP2]], zeroinitializer
@@ -1069,8 +1069,8 @@ declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
  
  define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psrl_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -1091,8 +1091,8 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
  
  define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psrl_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -1113,8 +1113,8 @@ declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
  
  define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psrl_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
  ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -1135,8 +1135,8 @@ declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
  
  define <8 x i16> @test_x86_sse2_psrl_w_load(<8 x i16> %a0, <8 x i16>* %p) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psrl_w_load(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
@@ -1168,7 +1168,7 @@ define <8 x i16> @test_x86_sse2_psrl_w_load(<8 x i16> %a0, <8 x i16>* %p) #0 {
  
  define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psrli_d(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer
@@ -1184,7 +1184,7 @@ declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
  
  define <2 x i64> @test_x86_sse2_psrli_q(<2 x i64> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psrli_q(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP2]], zeroinitializer
@@ -1200,7 +1200,7 @@ declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
  
  define <8 x i16> @test_x86_sse2_psrli_w(<8 x i16> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse2_psrli_w(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> [[TMP1]], i32 7)
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i16> [[TMP2]], zeroinitializer
@@ -1216,8 +1216,8 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
  
  define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_ucomieq_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -1235,8 +1235,8 @@ declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readn
  
  define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_ucomige_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -1254,8 +1254,8 @@ declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readn
  
  define i32 @test_x86_sse2_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_ucomigt_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -1273,8 +1273,8 @@ declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readn
  
  define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_ucomile_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -1292,8 +1292,8 @@ declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readn
  
  define i32 @test_x86_sse2_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_ucomilt_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -1311,8 +1311,8 @@ declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readn
  
  define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse2_ucomineq_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -1362,7 +1362,7 @@ declare void @llvm.x86.sse2.mfence() nounwind
  
  define void @clflush(i8* %p) nounwind #0 {
  ; CHECK-LABEL: @clflush(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/sse41-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/sse41-intrinsics-x86.ll

index 4681c0c..6a625e5 100644 (file)
--- a/llvm/test/Instrumentation/MemorySanitizer/sse41-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/sse41-intrinsics-x86.ll
@@ -6,9 +6,9 @@ target triple = "x86_64-unknown-linux-gnu"
  
  define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
  ; CHECK-LABEL: @test_x86_sse41_blendvpd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
@@ -24,9 +24,9 @@ declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x d
  
  define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
  ; CHECK-LABEL: @test_x86_sse41_blendvps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
@@ -42,8 +42,8 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
  
  define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_dppd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -52,7 +52,7 @@ define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) #0
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0:![0-9]+]]
  ; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       6:
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i8 7)
@@ -67,8 +67,8 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi
  
  define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_dpps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -77,7 +77,7 @@ define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
  ; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       6:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 7)
@@ -92,8 +92,8 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind
  
  define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_insertps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -102,7 +102,7 @@ define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) #0
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
  ; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       6:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 17)
@@ -118,8 +118,8 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounw
  
  define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_mpsadbw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i8>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <16 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i8>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <16 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -128,7 +128,7 @@ define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) #0 {
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
  ; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       6:
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> [[A0:%.*]], <16 x i8> [[A1:%.*]], i8 7)
@@ -142,13 +142,13 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
  
  define <8 x i16> @test_x86_sse41_mpsadbw_load_op0(<16 x i8>* %ptr, <16 x i8> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_mpsadbw_load_op0(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <16 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* getelementptr inbounds ([100 x i64], [100 x i64]* @__msan_param_tls, i32 0, i32 0), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 8) to <16 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
  ; CHECK:       3:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       4:
  ; CHECK-NEXT:    [[A0:%.*]] = load <16 x i8>, <16 x i8>* [[PTR:%.*]], align 16
@@ -163,7 +163,7 @@ define <8 x i16> @test_x86_sse41_mpsadbw_load_op0(<16 x i8>* %ptr, <16 x i8> %a1
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
  ; CHECK:       10:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       11:
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> [[A0]], <16 x i8> [[A1:%.*]], i8 7)
@@ -177,8 +177,8 @@ define <8 x i16> @test_x86_sse41_mpsadbw_load_op0(<16 x i8>* %ptr, <16 x i8> %a1
  
  define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_packusdw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer
  ; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
@@ -210,9 +210,9 @@ define <8 x i16> @test_x86_sse41_packusdw_fold() #0 {
  
  define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) #0 {
  ; CHECK-LABEL: @test_x86_sse41_pblendvb(
-; CHECK-DAG:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i8>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <16 x i8>*), align 8
-; CHECK-DAG:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i8>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([100 x i64]* @__msan_param_tls to <16 x i8>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <16 x i8>*), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 32) to <16 x i8>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
  ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
@@ -228,7 +228,7 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun
  
  define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse41_phminposuw(
-; CHECK-DAG:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([100 x i64]* @__msan_param_tls to <8 x i16>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> [[A0:%.*]])
  ; CHECK-NEXT:    store <8 x i16> [[TMP1]], <8 x i16>* bitcast ([100 x i64]* @__msan_retval_tls to <8 x i16>*), align 8
@@ -242,8 +242,8 @@ declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
  
  define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_ptestc(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -252,7 +252,7 @@ define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
  ; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       6:
  ; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.x86.sse41.ptestc(<2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]])
@@ -267,8 +267,8 @@ declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
  
  define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_ptestnzc(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -277,7 +277,7 @@ define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
  ; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       6:
  ; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]])
@@ -292,8 +292,8 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
  
  define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_ptestz(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -302,7 +302,7 @@ define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) #0 {
  ; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
  ; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
  ; CHECK:       5:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       6:
  ; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.x86.sse41.ptestz(<2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]])
@@ -317,13 +317,13 @@ declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
  
  define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse41_round_pd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
  ; CHECK:       3:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       4:
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> [[A0:%.*]], i32 7)
@@ -338,13 +338,13 @@ declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readno
  
  define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) #0 {
  ; CHECK-LABEL: @test_x86_sse41_round_ps(
-; CHECK-DAG:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
  ; CHECK:       3:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       4:
  ; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[A0:%.*]], i32 7)
@@ -359,8 +359,8 @@ declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
  
  define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_round_sd(
-; CHECK-DAG:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 2, i32 1>
  ; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 7)
@@ -375,13 +375,13 @@ declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) n
  
  define <2 x double> @test_x86_sse41_round_sd_load(<2 x double> %a0, <2 x double>* %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_round_sd_load(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([100 x i64]* @__msan_param_tls to <2 x i64>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
  ; CHECK:       3:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       4:
  ; CHECK-NEXT:    [[A1B:%.*]] = load <2 x double>, <2 x double>* [[A1:%.*]], align 16
@@ -402,13 +402,13 @@ define <2 x double> @test_x86_sse41_round_sd_load(<2 x double> %a0, <2 x double>
  
  define <4 x float> @test_x86_sse41_round_ss_load(<4 x float> %a0, <4 x float>* %a1) #0 {
  ; CHECK-LABEL: @test_x86_sse41_round_ss_load(
-; CHECK-DAG:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
-; CHECK-DAG:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__msan_param_tls to i64), i64 16) to i64*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([100 x i64]* @__msan_param_tls to <4 x i32>*), align 8
  ; CHECK-NEXT:    call void @llvm.donothing()
  ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
  ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
  ; CHECK:       3:
-; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR3]]
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
  ; CHECK-NEXT:    unreachable
  ; CHECK:       4:
  ; CHECK-NEXT:    [[A1B:%.*]] = load <4 x float>, <4 x float>* [[A1:%.*]], align 16
author	Vitaly Buka <vitalybuka@google.com>
	Sat, 10 Sep 2022 22:19:00 +0000 (15:19 -0700)
committer	Vitaly Buka <vitalybuka@google.com>
	Sat, 10 Sep 2022 22:28:33 +0000 (15:28 -0700)
llvm/test/Instrumentation/MemorySanitizer/avx-intrinsics-x86.ll		patch \| blob \| history
llvm/test/Instrumentation/MemorySanitizer/avx2-intrinsics-x86.ll		patch \| blob \| history
llvm/test/Instrumentation/MemorySanitizer/disambiguate-origin.ll		patch \| blob \| history
llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll		patch \| blob \| history
llvm/test/Instrumentation/MemorySanitizer/sse-intrinsics-x86.ll		patch \| blob \| history
llvm/test/Instrumentation/MemorySanitizer/sse2-intrinsics-x86.ll		patch \| blob \| history
llvm/test/Instrumentation/MemorySanitizer/sse41-intrinsics-x86.ll		patch \| blob \| history