: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
"Prefer 256-bit AVX instructions">;
+// Tuning flag "allow-light-256-bit": as its description states, it enables
+// generation of 256-bit loads/stores even when 128-bit vectors are otherwise
+// preferred. It is recorded in the subtarget's AllowLight256Bit field.
+def TuningAllowLight256Bit
+ : SubtargetFeature<"allow-light-256-bit", "AllowLight256Bit", "true",
+ "Enable generation of 256-bit load/stores even if we prefer 128-bit">;
+
def TuningPreferMaskRegisters
: SubtargetFeature<"prefer-mask-registers", "PreferMaskRegisters", "true",
"Prefer AVX512 mask registers over PTEST/MOVMSK">;
TuningFastVariablePerLaneShuffle,
TuningPOPCNTFalseDeps,
TuningLZCNTFalseDeps,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningAllowLight256Bit];
list<SubtargetFeature> HSWFeatures =
!listconcat(IVBFeatures, HSWAdditionalFeatures);
TuningFastVariableCrossLaneShuffle,
TuningFastVariablePerLaneShuffle,
TuningPOPCNTFalseDeps,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningAllowLight256Bit];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);
TuningFastVariablePerLaneShuffle,
TuningPrefer256Bit,
TuningPOPCNTFalseDeps,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningAllowLight256Bit];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);
TuningFastVariableCrossLaneShuffle,
TuningFastVariablePerLaneShuffle,
TuningPrefer256Bit,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningAllowLight256Bit];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);
TuningFastVariableCrossLaneShuffle,
TuningFastVariablePerLaneShuffle,
TuningPrefer256Bit,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningAllowLight256Bit];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);
TuningFastMOVBE,
TuningSlowSHLD,
TuningSBBDepBreaking,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningAllowLight256Bit];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureRDPRU,
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake| FileCheck %s --check-prefixes=CHECK,PREFER256
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=sandybridge| FileCheck %s --check-prefixes=CHECK,LIGHT256
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1| FileCheck %s --check-prefixes=CHECK,PREFER256
; This tests whether or not we generate vectors larger than the preferred vector
; width when lowering memmove.
; Function Attrs: nounwind uwtable
define weak_odr dso_local void @A(ptr %src, ptr %dst) local_unnamed_addr #0 {
-; CHECK-LABEL: A:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovups (%rdi), %xmm0
-; CHECK-NEXT: vmovups 16(%rdi), %xmm1
-; CHECK-NEXT: vmovups %xmm1, 16(%rsi)
-; CHECK-NEXT: vmovups %xmm0, (%rsi)
-; CHECK-NEXT: retq
+; PREFER256-LABEL: A:
+; PREFER256: # %bb.0: # %entry
+; PREFER256-NEXT: vmovups (%rdi), %ymm0
+; PREFER256-NEXT: vmovups %ymm0, (%rsi)
+; PREFER256-NEXT: vzeroupper
+; PREFER256-NEXT: retq
+;
+; LIGHT256-LABEL: A:
+; LIGHT256: # %bb.0: # %entry
+; LIGHT256-NEXT: vmovups (%rdi), %xmm0
+; LIGHT256-NEXT: vmovups 16(%rdi), %xmm1
+; LIGHT256-NEXT: vmovups %xmm1, 16(%rsi)
+; LIGHT256-NEXT: vmovups %xmm0, (%rsi)
+; LIGHT256-NEXT: retq
entry:
call void @llvm.memmove.p0.p0.i64(ptr align 1 %dst, ptr align 1 %src, i64 32, i1 false)
ret void
; Function Attrs: nounwind uwtable
define weak_odr dso_local void @B(ptr %src, ptr %dst) local_unnamed_addr #0 {
-; CHECK-LABEL: B:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovups (%rdi), %xmm0
-; CHECK-NEXT: vmovups 16(%rdi), %xmm1
-; CHECK-NEXT: vmovups 32(%rdi), %xmm2
-; CHECK-NEXT: vmovups 48(%rdi), %xmm3
-; CHECK-NEXT: vmovups %xmm3, 48(%rsi)
-; CHECK-NEXT: vmovups %xmm2, 32(%rsi)
-; CHECK-NEXT: vmovups %xmm1, 16(%rsi)
-; CHECK-NEXT: vmovups %xmm0, (%rsi)
-; CHECK-NEXT: retq
+; PREFER256-LABEL: B:
+; PREFER256: # %bb.0: # %entry
+; PREFER256-NEXT: vmovups (%rdi), %ymm0
+; PREFER256-NEXT: vmovups 32(%rdi), %ymm1
+; PREFER256-NEXT: vmovups %ymm1, 32(%rsi)
+; PREFER256-NEXT: vmovups %ymm0, (%rsi)
+; PREFER256-NEXT: vzeroupper
+; PREFER256-NEXT: retq
+;
+; LIGHT256-LABEL: B:
+; LIGHT256: # %bb.0: # %entry
+; LIGHT256-NEXT: vmovups (%rdi), %xmm0
+; LIGHT256-NEXT: vmovups 16(%rdi), %xmm1
+; LIGHT256-NEXT: vmovups 32(%rdi), %xmm2
+; LIGHT256-NEXT: vmovups 48(%rdi), %xmm3
+; LIGHT256-NEXT: vmovups %xmm3, 48(%rsi)
+; LIGHT256-NEXT: vmovups %xmm2, 32(%rsi)
+; LIGHT256-NEXT: vmovups %xmm1, 16(%rsi)
+; LIGHT256-NEXT: vmovups %xmm0, (%rsi)
+; LIGHT256-NEXT: retq
entry:
call void @llvm.memmove.p0.p0.i64(ptr align 1 %dst, ptr align 1 %src, i64 64, i1 false)
ret void
; Function Attrs: argmemonly nounwind
declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1 immarg) #1
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }