}
define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i8* noalias nocapture readonly %d, i8* noalias nocapture %e, i32 %w) local_unnamed_addr #1 {
-; CHECK-LABEL: @bar(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> undef, i32 [[W:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[W]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[W]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[W]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[W]], i32 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[W]], i32 5
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[W]], i32 6
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[W]], i32 7
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[W]], i32 8
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[W]], i32 9
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[W]], i32 10
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[W]], i32 11
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[W]], i32 12
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[W]], i32 13
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[W]], i32 14
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[W]], i32 15
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 1
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 1
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 1
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 1
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 1
-; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 2
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 2
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 2
-; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2
-; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3
-; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3
-; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3
-; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4
-; CHECK-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 4
-; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 4
-; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 4
-; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 4
-; CHECK-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 5
-; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 5
-; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 5
-; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 5
-; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 5
-; CHECK-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 6
-; CHECK-NEXT: [[ARRAYIDX71:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 6
-; CHECK-NEXT: [[ARRAYIDX73:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 6
-; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6
-; CHECK-NEXT: [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6
-; CHECK-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7
-; CHECK-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7
-; CHECK-NEXT: [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7
-; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7
-; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7
-; CHECK-NEXT: [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8
-; CHECK-NEXT: [[ARRAYIDX95:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 8
-; CHECK-NEXT: [[ARRAYIDX97:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 8
-; CHECK-NEXT: [[ARRAYIDX100:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 8
-; CHECK-NEXT: [[ARRAYIDX104:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 8
-; CHECK-NEXT: [[ARRAYIDX105:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 9
-; CHECK-NEXT: [[ARRAYIDX107:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 9
-; CHECK-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 9
-; CHECK-NEXT: [[ARRAYIDX112:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 9
-; CHECK-NEXT: [[ARRAYIDX116:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 9
-; CHECK-NEXT: [[ARRAYIDX117:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 10
-; CHECK-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 10
-; CHECK-NEXT: [[ARRAYIDX121:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 10
-; CHECK-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10
-; CHECK-NEXT: [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10
-; CHECK-NEXT: [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11
-; CHECK-NEXT: [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11
-; CHECK-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11
-; CHECK-NEXT: [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11
-; CHECK-NEXT: [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11
-; CHECK-NEXT: [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12
-; CHECK-NEXT: [[ARRAYIDX143:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 12
-; CHECK-NEXT: [[ARRAYIDX145:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 12
-; CHECK-NEXT: [[ARRAYIDX148:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 12
-; CHECK-NEXT: [[ARRAYIDX152:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 12
-; CHECK-NEXT: [[ARRAYIDX153:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 13
-; CHECK-NEXT: [[ARRAYIDX155:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 13
-; CHECK-NEXT: [[ARRAYIDX157:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 13
-; CHECK-NEXT: [[ARRAYIDX160:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 13
-; CHECK-NEXT: [[ARRAYIDX164:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 13
-; CHECK-NEXT: [[ARRAYIDX165:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 14
-; CHECK-NEXT: [[ARRAYIDX167:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 14
-; CHECK-NEXT: [[ARRAYIDX169:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 14
-; CHECK-NEXT: [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14
-; CHECK-NEXT: [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14
-; CHECK-NEXT: [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>*
-; CHECK-NEXT: [[TMP17:%.*]] = load <16 x i8>, <16 x i8>* [[TMP16]], align 1
-; CHECK-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>*
-; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i8>, <16 x i8>* [[TMP18]], align 1
-; CHECK-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>*
-; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i8>, <16 x i8>* [[TMP20]], align 1
-; CHECK-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>*
-; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i8>, <16 x i8>* [[TMP22]], align 1
-; CHECK-NEXT: [[TMP24:%.*]] = icmp ult <16 x i8> [[TMP17]], [[TMP19]]
-; CHECK-NEXT: [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x i8> [[TMP23]], <16 x i8> [[TMP21]]
-; CHECK-NEXT: [[TMP26:%.*]] = zext <16 x i8> [[TMP25]] to <16 x i32>
-; CHECK-NEXT: [[TMP27:%.*]] = mul <16 x i32> [[TMP26]], [[TMP15]]
-; CHECK-NEXT: [[TMP28:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8>
-; CHECK-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>*
-; CHECK-NEXT: store <16 x i8> [[TMP28]], <16 x i8>* [[TMP29]], align 1
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0356]], 1
-; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16
-; CHECK-NEXT: [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16
-; CHECK-NEXT: [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16
-; CHECK-NEXT: [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16
-; CHECK-NEXT: [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; SSE-LABEL: @bar(
+; SSE-NEXT: entry:
+; SSE-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[W:%.*]], i32 0
+; SSE-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[W]], i32 1
+; SSE-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[W]], i32 2
+; SSE-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[W]], i32 3
+; SSE-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[W]], i32 0
+; SSE-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W]], i32 1
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[W]], i32 2
+; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[W]], i32 3
+; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> undef, i32 [[W]], i32 0
+; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[W]], i32 1
+; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[W]], i32 2
+; SSE-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[W]], i32 3
+; SSE-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> undef, i32 [[W]], i32 0
+; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[W]], i32 1
+; SSE-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[W]], i32 2
+; SSE-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[W]], i32 3
+; SSE-NEXT: br label [[FOR_BODY:%.*]]
+; SSE: for.body:
+; SSE-NEXT: [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT: [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT: [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT: [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT: [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT: [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 1
+; SSE-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 1
+; SSE-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 1
+; SSE-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 1
+; SSE-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 1
+; SSE-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 2
+; SSE-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 2
+; SSE-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 2
+; SSE-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2
+; SSE-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2
+; SSE-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3
+; SSE-NEXT: [[TMP16:%.*]] = bitcast i8* [[C_ADDR_0352]] to <4 x i8>*
+; SSE-NEXT: [[TMP17:%.*]] = load <4 x i8>, <4 x i8>* [[TMP16]], align 1
+; SSE-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3
+; SSE-NEXT: [[TMP18:%.*]] = bitcast i8* [[D_ADDR_0353]] to <4 x i8>*
+; SSE-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1
+; SSE-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3
+; SSE-NEXT: [[TMP20:%.*]] = bitcast i8* [[A_ADDR_0355]] to <4 x i8>*
+; SSE-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1
+; SSE-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3
+; SSE-NEXT: [[TMP22:%.*]] = bitcast i8* [[B_ADDR_0351]] to <4 x i8>*
+; SSE-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1
+; SSE-NEXT: [[TMP24:%.*]] = icmp ult <4 x i8> [[TMP17]], [[TMP19]]
+; SSE-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP24]], <4 x i8> [[TMP23]], <4 x i8> [[TMP21]]
+; SSE-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[TMP25]] to <4 x i32>
+; SSE-NEXT: [[TMP27:%.*]] = mul <4 x i32> [[TMP26]], [[TMP3]]
+; SSE-NEXT: [[TMP28:%.*]] = trunc <4 x i32> [[TMP27]] to <4 x i8>
+; SSE-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3
+; SSE-NEXT: [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <4 x i8>*
+; SSE-NEXT: store <4 x i8> [[TMP28]], <4 x i8>* [[TMP29]], align 1
+; SSE-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4
+; SSE-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 4
+; SSE-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 4
+; SSE-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 4
+; SSE-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 4
+; SSE-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 5
+; SSE-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 5
+; SSE-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 5
+; SSE-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 5
+; SSE-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 5
+; SSE-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 6
+; SSE-NEXT: [[ARRAYIDX71:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 6
+; SSE-NEXT: [[ARRAYIDX73:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 6
+; SSE-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6
+; SSE-NEXT: [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6
+; SSE-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7
+; SSE-NEXT: [[TMP30:%.*]] = bitcast i8* [[ARRAYIDX45]] to <4 x i8>*
+; SSE-NEXT: [[TMP31:%.*]] = load <4 x i8>, <4 x i8>* [[TMP30]], align 1
+; SSE-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7
+; SSE-NEXT: [[TMP32:%.*]] = bitcast i8* [[ARRAYIDX47]] to <4 x i8>*
+; SSE-NEXT: [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1
+; SSE-NEXT: [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7
+; SSE-NEXT: [[TMP34:%.*]] = bitcast i8* [[ARRAYIDX49]] to <4 x i8>*
+; SSE-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1
+; SSE-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7
+; SSE-NEXT: [[TMP36:%.*]] = bitcast i8* [[ARRAYIDX52]] to <4 x i8>*
+; SSE-NEXT: [[TMP37:%.*]] = load <4 x i8>, <4 x i8>* [[TMP36]], align 1
+; SSE-NEXT: [[TMP38:%.*]] = icmp ult <4 x i8> [[TMP31]], [[TMP33]]
+; SSE-NEXT: [[TMP39:%.*]] = select <4 x i1> [[TMP38]], <4 x i8> [[TMP37]], <4 x i8> [[TMP35]]
+; SSE-NEXT: [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i32>
+; SSE-NEXT: [[TMP41:%.*]] = mul <4 x i32> [[TMP40]], [[TMP7]]
+; SSE-NEXT: [[TMP42:%.*]] = trunc <4 x i32> [[TMP41]] to <4 x i8>
+; SSE-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7
+; SSE-NEXT: [[TMP43:%.*]] = bitcast i8* [[ARRAYIDX56]] to <4 x i8>*
+; SSE-NEXT: store <4 x i8> [[TMP42]], <4 x i8>* [[TMP43]], align 1
+; SSE-NEXT: [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8
+; SSE-NEXT: [[ARRAYIDX95:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 8
+; SSE-NEXT: [[ARRAYIDX97:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 8
+; SSE-NEXT: [[ARRAYIDX100:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 8
+; SSE-NEXT: [[ARRAYIDX104:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 8
+; SSE-NEXT: [[ARRAYIDX105:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 9
+; SSE-NEXT: [[ARRAYIDX107:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 9
+; SSE-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 9
+; SSE-NEXT: [[ARRAYIDX112:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 9
+; SSE-NEXT: [[ARRAYIDX116:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 9
+; SSE-NEXT: [[ARRAYIDX117:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 10
+; SSE-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 10
+; SSE-NEXT: [[ARRAYIDX121:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 10
+; SSE-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10
+; SSE-NEXT: [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10
+; SSE-NEXT: [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11
+; SSE-NEXT: [[TMP44:%.*]] = bitcast i8* [[ARRAYIDX93]] to <4 x i8>*
+; SSE-NEXT: [[TMP45:%.*]] = load <4 x i8>, <4 x i8>* [[TMP44]], align 1
+; SSE-NEXT: [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11
+; SSE-NEXT: [[TMP46:%.*]] = bitcast i8* [[ARRAYIDX95]] to <4 x i8>*
+; SSE-NEXT: [[TMP47:%.*]] = load <4 x i8>, <4 x i8>* [[TMP46]], align 1
+; SSE-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11
+; SSE-NEXT: [[TMP48:%.*]] = bitcast i8* [[ARRAYIDX97]] to <4 x i8>*
+; SSE-NEXT: [[TMP49:%.*]] = load <4 x i8>, <4 x i8>* [[TMP48]], align 1
+; SSE-NEXT: [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11
+; SSE-NEXT: [[TMP50:%.*]] = bitcast i8* [[ARRAYIDX100]] to <4 x i8>*
+; SSE-NEXT: [[TMP51:%.*]] = load <4 x i8>, <4 x i8>* [[TMP50]], align 1
+; SSE-NEXT: [[TMP52:%.*]] = icmp ult <4 x i8> [[TMP45]], [[TMP47]]
+; SSE-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP52]], <4 x i8> [[TMP51]], <4 x i8> [[TMP49]]
+; SSE-NEXT: [[TMP54:%.*]] = zext <4 x i8> [[TMP53]] to <4 x i32>
+; SSE-NEXT: [[TMP55:%.*]] = mul <4 x i32> [[TMP54]], [[TMP11]]
+; SSE-NEXT: [[TMP56:%.*]] = trunc <4 x i32> [[TMP55]] to <4 x i8>
+; SSE-NEXT: [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11
+; SSE-NEXT: [[TMP57:%.*]] = bitcast i8* [[ARRAYIDX104]] to <4 x i8>*
+; SSE-NEXT: store <4 x i8> [[TMP56]], <4 x i8>* [[TMP57]], align 1
+; SSE-NEXT: [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12
+; SSE-NEXT: [[ARRAYIDX143:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 12
+; SSE-NEXT: [[ARRAYIDX145:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 12
+; SSE-NEXT: [[ARRAYIDX148:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 12
+; SSE-NEXT: [[ARRAYIDX152:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 12
+; SSE-NEXT: [[ARRAYIDX153:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 13
+; SSE-NEXT: [[ARRAYIDX155:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 13
+; SSE-NEXT: [[ARRAYIDX157:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 13
+; SSE-NEXT: [[ARRAYIDX160:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 13
+; SSE-NEXT: [[ARRAYIDX164:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 13
+; SSE-NEXT: [[ARRAYIDX165:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 14
+; SSE-NEXT: [[ARRAYIDX167:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 14
+; SSE-NEXT: [[ARRAYIDX169:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 14
+; SSE-NEXT: [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14
+; SSE-NEXT: [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14
+; SSE-NEXT: [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15
+; SSE-NEXT: [[TMP58:%.*]] = bitcast i8* [[ARRAYIDX141]] to <4 x i8>*
+; SSE-NEXT: [[TMP59:%.*]] = load <4 x i8>, <4 x i8>* [[TMP58]], align 1
+; SSE-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15
+; SSE-NEXT: [[TMP60:%.*]] = bitcast i8* [[ARRAYIDX143]] to <4 x i8>*
+; SSE-NEXT: [[TMP61:%.*]] = load <4 x i8>, <4 x i8>* [[TMP60]], align 1
+; SSE-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15
+; SSE-NEXT: [[TMP62:%.*]] = bitcast i8* [[ARRAYIDX145]] to <4 x i8>*
+; SSE-NEXT: [[TMP63:%.*]] = load <4 x i8>, <4 x i8>* [[TMP62]], align 1
+; SSE-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15
+; SSE-NEXT: [[TMP64:%.*]] = bitcast i8* [[ARRAYIDX148]] to <4 x i8>*
+; SSE-NEXT: [[TMP65:%.*]] = load <4 x i8>, <4 x i8>* [[TMP64]], align 1
+; SSE-NEXT: [[TMP66:%.*]] = icmp ult <4 x i8> [[TMP59]], [[TMP61]]
+; SSE-NEXT: [[TMP67:%.*]] = select <4 x i1> [[TMP66]], <4 x i8> [[TMP65]], <4 x i8> [[TMP63]]
+; SSE-NEXT: [[TMP68:%.*]] = zext <4 x i8> [[TMP67]] to <4 x i32>
+; SSE-NEXT: [[TMP69:%.*]] = mul <4 x i32> [[TMP68]], [[TMP15]]
+; SSE-NEXT: [[TMP70:%.*]] = trunc <4 x i32> [[TMP69]] to <4 x i8>
+; SSE-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
+; SSE-NEXT: [[TMP71:%.*]] = bitcast i8* [[ARRAYIDX152]] to <4 x i8>*
+; SSE-NEXT: store <4 x i8> [[TMP70]], <4 x i8>* [[TMP71]], align 1
+; SSE-NEXT: [[INC]] = add nuw nsw i32 [[I_0356]], 1
+; SSE-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16
+; SSE-NEXT: [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16
+; SSE-NEXT: [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16
+; SSE-NEXT: [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16
+; SSE-NEXT: [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16
+; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8
+; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; SSE: for.end:
+; SSE-NEXT: ret void
+;
+; AVX512-LABEL: @bar(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> undef, i32 [[W:%.*]], i32 0
+; AVX512-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[W]], i32 1
+; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[W]], i32 2
+; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[W]], i32 3
+; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[W]], i32 4
+; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[W]], i32 5
+; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[W]], i32 6
+; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[W]], i32 7
+; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[W]], i32 8
+; AVX512-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[W]], i32 9
+; AVX512-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[W]], i32 10
+; AVX512-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[W]], i32 11
+; AVX512-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[W]], i32 12
+; AVX512-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[W]], i32 13
+; AVX512-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[W]], i32 14
+; AVX512-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[W]], i32 15
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT: [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT: [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT: [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT: [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT: [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ]
+; AVX512-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 1
+; AVX512-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 1
+; AVX512-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 1
+; AVX512-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 1
+; AVX512-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 1
+; AVX512-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 2
+; AVX512-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 2
+; AVX512-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 2
+; AVX512-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2
+; AVX512-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2
+; AVX512-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3
+; AVX512-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3
+; AVX512-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3
+; AVX512-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3
+; AVX512-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3
+; AVX512-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4
+; AVX512-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 4
+; AVX512-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 4
+; AVX512-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 4
+; AVX512-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 4
+; AVX512-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 5
+; AVX512-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 5
+; AVX512-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 5
+; AVX512-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 5
+; AVX512-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 5
+; AVX512-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 6
+; AVX512-NEXT: [[ARRAYIDX71:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 6
+; AVX512-NEXT: [[ARRAYIDX73:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 6
+; AVX512-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6
+; AVX512-NEXT: [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6
+; AVX512-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7
+; AVX512-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7
+; AVX512-NEXT: [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7
+; AVX512-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7
+; AVX512-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7
+; AVX512-NEXT: [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8
+; AVX512-NEXT: [[ARRAYIDX95:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 8
+; AVX512-NEXT: [[ARRAYIDX97:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 8
+; AVX512-NEXT: [[ARRAYIDX100:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 8
+; AVX512-NEXT: [[ARRAYIDX104:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 8
+; AVX512-NEXT: [[ARRAYIDX105:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 9
+; AVX512-NEXT: [[ARRAYIDX107:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 9
+; AVX512-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 9
+; AVX512-NEXT: [[ARRAYIDX112:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 9
+; AVX512-NEXT: [[ARRAYIDX116:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 9
+; AVX512-NEXT: [[ARRAYIDX117:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 10
+; AVX512-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 10
+; AVX512-NEXT: [[ARRAYIDX121:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 10
+; AVX512-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10
+; AVX512-NEXT: [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10
+; AVX512-NEXT: [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11
+; AVX512-NEXT: [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11
+; AVX512-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11
+; AVX512-NEXT: [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11
+; AVX512-NEXT: [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11
+; AVX512-NEXT: [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12
+; AVX512-NEXT: [[ARRAYIDX143:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 12
+; AVX512-NEXT: [[ARRAYIDX145:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 12
+; AVX512-NEXT: [[ARRAYIDX148:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 12
+; AVX512-NEXT: [[ARRAYIDX152:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 12
+; AVX512-NEXT: [[ARRAYIDX153:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 13
+; AVX512-NEXT: [[ARRAYIDX155:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 13
+; AVX512-NEXT: [[ARRAYIDX157:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 13
+; AVX512-NEXT: [[ARRAYIDX160:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 13
+; AVX512-NEXT: [[ARRAYIDX164:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 13
+; AVX512-NEXT: [[ARRAYIDX165:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 14
+; AVX512-NEXT: [[ARRAYIDX167:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 14
+; AVX512-NEXT: [[ARRAYIDX169:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 14
+; AVX512-NEXT: [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14
+; AVX512-NEXT: [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14
+; AVX512-NEXT: [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15
+; AVX512-NEXT: [[TMP16:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>*
+; AVX512-NEXT: [[TMP17:%.*]] = load <16 x i8>, <16 x i8>* [[TMP16]], align 1
+; AVX512-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15
+; AVX512-NEXT: [[TMP18:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>*
+; AVX512-NEXT: [[TMP19:%.*]] = load <16 x i8>, <16 x i8>* [[TMP18]], align 1
+; AVX512-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15
+; AVX512-NEXT: [[TMP20:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>*
+; AVX512-NEXT: [[TMP21:%.*]] = load <16 x i8>, <16 x i8>* [[TMP20]], align 1
+; AVX512-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15
+; AVX512-NEXT: [[TMP22:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>*
+; AVX512-NEXT: [[TMP23:%.*]] = load <16 x i8>, <16 x i8>* [[TMP22]], align 1
+; AVX512-NEXT: [[TMP24:%.*]] = icmp ult <16 x i8> [[TMP17]], [[TMP19]]
+; AVX512-NEXT: [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x i8> [[TMP23]], <16 x i8> [[TMP21]]
+; AVX512-NEXT: [[TMP26:%.*]] = zext <16 x i8> [[TMP25]] to <16 x i32>
+; AVX512-NEXT: [[TMP27:%.*]] = mul <16 x i32> [[TMP26]], [[TMP15]]
+; AVX512-NEXT: [[TMP28:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8>
+; AVX512-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
+; AVX512-NEXT: [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>*
+; AVX512-NEXT: store <16 x i8> [[TMP28]], <16 x i8>* [[TMP29]], align 1
+; AVX512-NEXT: [[INC]] = add nuw nsw i32 [[I_0356]], 1
+; AVX512-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16
+; AVX512-NEXT: [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16
+; AVX512-NEXT: [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16
+; AVX512-NEXT: [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16
+; AVX512-NEXT: [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16
+; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
;
entry:
br label %for.body