From d553ff3e2e08f659ab9977b97d9dbe518a759af0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 15 Jul 2018 21:49:01 +0000 Subject: [PATCH] [X86] Add load patterns for cases where we select X86Movss/X86Movsd to blend instructions. This allows us to fold the load during isel without waiting for the peephole pass to do it. llvm-svn: 337136 --- llvm/lib/Target/X86/X86InstrSSE.td | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index c2e1a94..ae4b3ce 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -6421,13 +6421,29 @@ let Predicates = [HasAVX, OptForSpeed] in { def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))), + (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; + def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)), + (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), (VPBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>; + def : Pat<(v4i32 (X86Movss VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 3))>; + def : Pat<(v4i32 (X86Movss (bc_v4i32 (loadv2i64 addr:$src2)), VR128:$src1)), + (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xfc))>; def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))), + (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; + def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)), + (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), (VPBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>; + def : Pat<(v2i64 (X86Movsd VR128:$src1, (loadv2i64 addr:$src2))), + (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf))>; + def : Pat<(v2i64 (X86Movsd (loadv2i64 addr:$src2), VR128:$src1)), + (VPBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf0))>; // Move low f32 and clear high bits. def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), @@ -6465,13 +6481,29 @@ let Predicates = [UseSSE41, OptForSpeed] in { def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))), + (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; + def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)), + (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), (PBLENDWrri VR128:$src1, VR128:$src2, (i8 3))>; + def : Pat<(v4i32 (X86Movss VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))), + (PBLENDWrmi VR128:$src1, addr:$src2, (i8 3))>; + def : Pat<(v4i32 (X86Movss (bc_v4i32 (memopv2i64 addr:$src2)), VR128:$src1)), + (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xfc))>; def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))), + (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; + def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)), + (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), (PBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>; + def : Pat<(v2i64 (X86Movsd VR128:$src1, (memopv2i64 addr:$src2))), + (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf))>; + def : Pat<(v2i64 (X86Movsd (memopv2i64 addr:$src2), VR128:$src1)), + (PBLENDWrmi VR128:$src1, addr:$src2, (i8 0xf0))>; } -- 2.7.4