AMDGPU: Custom lower v2i32 loads and stores

author Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 2 May 2016 20:13:51 +0000 (20:13 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 2 May 2016 20:13:51 +0000 (20:13 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 2 May 2016 20:13:51 +0000 (20:13 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 2 May 2016 20:13:51 +0000 (20:13 +0000)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ec0a032..21ab9de 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -101,6 +101,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
    setOperationAction(ISD::LOAD, MVT::i64, Promote);
    AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
  
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+
    setOperationAction(ISD::STORE, MVT::v8i32, Custom);
    setOperationAction(ISD::STORE, MVT::v16i32, Custom);
  
@@ -113,6 +115,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
    setOperationAction(ISD::STORE, MVT::i64, Promote);
    AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
  
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SELECT, MVT::f64, Promote);
    AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
@@ -1905,10 +1909,17 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  
    assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
           "Custom lowering for non-i32 vectors hasn't been implemented.");
-  unsigned NumElements = MemVT.getVectorNumElements();
-  assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
  
-  switch (Load->getAddressSpace()) {
+  unsigned AS = Load->getAddressSpace();
+  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+                          AS, Load->getAlignment())) {
+    SDValue Ops[2];
+    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  unsigned NumElements = MemVT.getVectorNumElements();
+  switch (AS) {
    case AMDGPUAS::CONSTANT_ADDRESS:
      if (isMemOpUniform(Load))
        return SDValue();
@@ -1943,9 +1954,16 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
        llvm_unreachable("unsupported private_element_size");
      }
    }
-  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::LOCAL_ADDRESS: {
+    if (NumElements > 2)
+      return SplitVectorLoad(Op, DAG);
+
+    if (NumElements == 2)
+      return SDValue();
+
      // If properly aligned, if we split we might be able to use ds_read_b64.
      return SplitVectorLoad(Op, DAG);
+  }
    default:
      return SDValue();
    }
@@ -2150,10 +2168,17 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
         Store->getBasePtr(), MVT::i1, Store->getMemOperand());
    }
  
-  assert(Store->getValue().getValueType().getScalarType() == MVT::i32);
+  assert(VT.isVector() &&
+         Store->getValue().getValueType().getScalarType() == MVT::i32);
+
+  unsigned AS = Store->getAddressSpace();
+  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+                          AS, Store->getAlignment())) {
+    return expandUnalignedStore(Store, DAG);
+  }
  
    unsigned NumElements = VT.getVectorNumElements();
-  switch (Store->getAddressSpace()) {
+  switch (AS) {
    case AMDGPUAS::GLOBAL_ADDRESS:
    case AMDGPUAS::FLAT_ADDRESS:
      if (NumElements > 4)
@@ -2175,9 +2200,16 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
        llvm_unreachable("unsupported private_element_size");
      }
    }
-  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::LOCAL_ADDRESS: {
+    if (NumElements > 2)
+      return SplitVectorStore(Op, DAG);
+
+    if (NumElements == 2)
+      return Op;
+
      // If properly aligned, if we split we might be able to use ds_write_b64.
      return SplitVectorStore(Op, DAG);
+  }
    default:
      llvm_unreachable("unhandled address space");
    }
diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
index 82d88eb..4a77e00 100644
--- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
  ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
  
  ; SI-LABEL: {{^}}unaligned_load_store_i16_local:
@@ -56,6 +56,29 @@ define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace
    ret void
  }
  
+; SI-LABEL: {{^}}align2_load_store_i32_global:
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_store_short
+; SI: buffer_store_short
+define void @align2_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind {
+  %v = load i32, i32 addrspace(1)* %p, align 2
+  store i32 %v, i32 addrspace(1)* %r, align 2
+  ret void
+}
+
+; SI-LABEL: {{^}}align2_load_store_i32_local:
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_write_b16
+; SI: ds_write_b16
+define void @align2_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
+  %v = load i32, i32 addrspace(3)* %p, align 2
+  store i32 %v, i32 addrspace(3)* %r, align 2
+  ret void
+}
+
+; FIXME: Unnecessary packing and unpacking of bytes.
  ; SI-LABEL: {{^}}unaligned_load_store_i64_local:
  ; SI: ds_read_u8
  ; SI: ds_read_u8
@@ -65,13 +88,36 @@ define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace
  ; SI: ds_read_u8
  ; SI: ds_read_u8
  ; SI: ds_read_u8
+
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
  ; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
  ; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
  ; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
  ; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
  ; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
  ; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
  ; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
  ; SI: ds_write_b8
  ; SI: s_endpgm
  define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
@@ -80,6 +126,53 @@ define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(
    ret void
  }
  
+; SI-LABEL: {{^}}unaligned_load_store_v2i32_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
+; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
+; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
+; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
+; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
+; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+
+; SI: ds_write_b8
+; XSI-NOT: v_or_b32
+; XSI-NOT: v_lshl
+; SI: ds_write_b8
+; SI: s_endpgm
+define void @unaligned_load_store_v2i32_local(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) {
+  %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
+  store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
+  ret void
+}
+
  ; SI-LABEL: {{^}}unaligned_load_store_i64_global:
  ; SI: buffer_load_ubyte
  ; SI: buffer_load_ubyte
@@ -89,6 +182,10 @@ define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(
  ; SI: buffer_load_ubyte
  ; SI: buffer_load_ubyte
  ; SI: buffer_load_ubyte
+
+; XSI-NOT: v_or_
+; XSI-NOT: v_lshl
+
  ; SI: buffer_store_byte
  ; SI: buffer_store_byte
  ; SI: buffer_store_byte
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 2 May 2016 20:13:51 +0000 (20:13 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 2 May 2016 20:13:51 +0000 (20:13 +0000)
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch | blob | history
llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll		patch | blob | history