AMDGPU {NFC}: Add code object v5 support and generate metadata for implicit kernel...
authorChangpeng Fang <Changpeng.Fang@amd.com>
Tue, 1 Feb 2022 02:07:47 +0000 (18:07 -0800)
committerChangpeng Fang <Changpeng.Fang@amd.com>
Tue, 1 Feb 2022 02:07:47 +0000 (18:07 -0800)
Summary:
  Add code object v5 support (deafult is still v4)
  Generate metadata for implicit kernel args for the new ABI
  Set the metadata version to be 1.2

Reviewers:
  t-tye, b-sumner, arsenm, and bcahoon

Fixes:
  SWDEV-307188, SWDEV-307189

Differential Revision:
  https://reviews.llvm.org/D118272

18 files changed:
llvm/docs/AMDGPUUsage.rst
llvm/include/llvm/BinaryFormat/ELF.h
llvm/include/llvm/Support/AMDGPUMetadata.h
llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
llvm/lib/ObjectYAML/ELFYAML.cpp
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll [new file with mode: 0644]
llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll [new file with mode: 0644]
llvm/test/CodeGen/AMDGPU/hsa-metadata-reduced-hidden-args-v5.ll [new file with mode: 0644]
llvm/tools/llvm-readobj/ELFDumper.cpp

index 5f0abbe..34c3868 100644 (file)
@@ -494,7 +494,7 @@ For example:
                                                   code that can be loaded and executed in a process
                                                   with SRAMECC enabled.
 
-                                                  If not specified for code object V4, generate
+                                                  If not specified for code object V4 or above, generate
                                                   code that can be loaded and executed in a process
                                                   with either setting of SRAMECC.
 
@@ -516,7 +516,7 @@ For example:
                                                   code that can be loaded and executed in a process
                                                   with XNACK replay enabled.
 
-                                                  If not specified for code object V4, generate
+                                                  If not specified for code object V4 or above, generate
                                                   code that can be loaded and executed in a process
                                                   with either setting of XNACK replay.
 
@@ -524,10 +524,10 @@ For example:
                                                   page migration. If enabled in the device, then if
                                                   a page fault occurs the code may execute
                                                   incorrectly unless generated with XNACK replay
-                                                  enabled, or generated for code object V4 without
+                                                  enabled, or generated for code object V4 or above without
                                                   specifying XNACK replay. Executing code that was
                                                   generated with XNACK replay enabled, or generated
-                                                  for code object V4 without specifying XNACK replay,
+                                                  for code object V4 or above without specifying XNACK replay,
                                                   on a device that does not have XNACK replay
                                                   enabled will execute correctly but may be less
                                                   performant than code generated for XNACK replay
@@ -954,6 +954,7 @@ The AMDGPU backend uses the following ELF header:
      ``e_ident[EI_ABIVERSION]`` - ``ELFABIVERSION_AMDGPU_HSA_V2``
                                 - ``ELFABIVERSION_AMDGPU_HSA_V3``
                                 - ``ELFABIVERSION_AMDGPU_HSA_V4``
+                                - ``ELFABIVERSION_AMDGPU_HSA_V5``
                                 - ``ELFABIVERSION_AMDGPU_PAL``
                                 - ``ELFABIVERSION_AMDGPU_MESA3D``
      ``e_type``                 - ``ET_REL``
@@ -962,7 +963,7 @@ The AMDGPU backend uses the following ELF header:
      ``e_entry``                0
      ``e_flags``                See :ref:`amdgpu-elf-header-e_flags-v2-table`,
                                 :ref:`amdgpu-elf-header-e_flags-table-v3`,
-                                and :ref:`amdgpu-elf-header-e_flags-table-v4`
+                                and :ref:`amdgpu-elf-header-e_flags-table-v4-onwards`
      ========================== ===============================
 
 ..
@@ -981,6 +982,7 @@ The AMDGPU backend uses the following ELF header:
      ``ELFABIVERSION_AMDGPU_HSA_V2`` 0
      ``ELFABIVERSION_AMDGPU_HSA_V3`` 1
      ``ELFABIVERSION_AMDGPU_HSA_V4`` 2
+     ``ELFABIVERSION_AMDGPU_HSA_V5`` 3
      ``ELFABIVERSION_AMDGPU_PAL``    0
      ``ELFABIVERSION_AMDGPU_MESA3D`` 0
      =============================== =====
@@ -1025,6 +1027,10 @@ The AMDGPU backend uses the following ELF header:
     ``-mcode-object-version=4``. This is the default code object
     version if not specified.
 
+  * ``ELFABIVERSION_AMDGPU_HSA_V5`` is used to specify the version of AMD HSA
+    runtime ABI for code object V5. Specify using the Clang option
+    ``-mcode-object-version=5``.
+
   * ``ELFABIVERSION_AMDGPU_PAL`` is used to specify the version of AMD PAL
     runtime ABI.
 
@@ -1050,9 +1056,9 @@ The AMDGPU backend uses the following ELF header:
   :ref:`amdgpu-processor-table`). The specific processor is specified in the
   ``NT_AMD_HSA_ISA_VERSION`` note record for code object V2 (see
   :ref:`amdgpu-note-records-v2`) and in the ``EF_AMDGPU_MACH`` bit field of the
-  ``e_flags`` for code object V3 to V4 (see
+  ``e_flags`` for code object V3 and above (see
   :ref:`amdgpu-elf-header-e_flags-table-v3` and
-  :ref:`amdgpu-elf-header-e_flags-table-v4`).
+  :ref:`amdgpu-elf-header-e_flags-table-v4-onwards`).
 
 ``e_entry``
   The entry point is 0 as the entry points for individual kernels must be
@@ -1123,8 +1129,8 @@ The AMDGPU backend uses the following ELF header:
                                              :ref:`amdgpu-target-features`.
      ================================= ===== =============================
 
-  .. table:: AMDGPU ELF Header ``e_flags`` for Code Object V4
-     :name: amdgpu-elf-header-e_flags-table-v4
+  .. table:: AMDGPU ELF Header ``e_flags`` for Code Object V4 and After
+     :name: amdgpu-elf-header-e_flags-table-v4-onwards
 
      ============================================ ===== ===================================
      Name                                         Value      Description
@@ -1283,7 +1289,7 @@ Note Records
 The AMDGPU backend code object contains ELF note records in the ``.note``
 section. The set of generated notes and their semantics depend on the code
 object version; see :ref:`amdgpu-note-records-v2` and
-:ref:`amdgpu-note-records-v3-v4`.
+:ref:`amdgpu-note-records-v3-onwards`.
 
 As required by ``ELFCLASS32`` and ``ELFCLASS64``, minimal zero-byte padding
 must be generated after the ``name`` field to ensure the ``desc`` field is 4
@@ -1462,21 +1468,21 @@ are deprecated and should not be used.
      ``AMD:AMDGPU:9:0:12`` ``gfx90c:xnack-``
      ===================== ==========================
 
-.. _amdgpu-note-records-v3-v4:
+.. _amdgpu-note-records-v3-onwards:
 
-Code Object V3 to V4 Note Records
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Code Object V3 and Above Note Records
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The AMDGPU backend code object uses the following ELF note record in the
-``.note`` section when compiling for code object V3 to V4.
+``.note`` section when compiling for code object V3 and above.
 
 The note record vendor field is "AMDGPU".
 
 Additional note records may be present, but any which are not documented here
 are deprecated and should not be used.
 
-  .. table:: AMDGPU Code Object V3 to V4 ELF Note Records
-     :name: amdgpu-elf-note-records-table-v3-v4
+  .. table:: AMDGPU Code Object V3 and Above ELF Note Records
+     :name: amdgpu-elf-note-records-table-v3-onwards
 
      ======== ============================== ======================================
      Name     Type                           Description
@@ -1487,8 +1493,8 @@ are deprecated and should not be used.
 
 ..
 
-  .. table:: AMDGPU Code Object V3 to V4 ELF Note Record Enumeration Values
-     :name: amdgpu-elf-note-record-enumeration-values-table-v3-v4
+  .. table:: AMDGPU Code Object V3 and Above ELF Note Record Enumeration Values
+     :name: amdgpu-elf-note-record-enumeration-values-table-v3-onwards
 
      ============================== =====
      Name                           Value
@@ -1500,8 +1506,9 @@ are deprecated and should not be used.
 ``NT_AMDGPU_METADATA``
   Specifies extensible metadata associated with an AMDGPU code object. It is
   encoded as a map in the Message Pack [MsgPack]_ binary data format. See
-  :ref:`amdgpu-amdhsa-code-object-metadata-v3` and
-  :ref:`amdgpu-amdhsa-code-object-metadata-v4` for the map keys defined for the
+  :ref:`amdgpu-amdhsa-code-object-metadata-v3`,
+  :ref:`amdgpu-amdhsa-code-object-metadata-v4` and
+  :ref:`amdgpu-amdhsa-code-object-metadata-v5` for the map keys defined for the
   ``amdhsa`` OS.
 
 .. _amdgpu-symbols:
@@ -2548,8 +2555,9 @@ The code object metadata specifies extensible metadata associated with the code
 objects executed on HSA [HSA]_ compatible runtimes (see :ref:`amdgpu-os`). The
 encoding and semantics of this metadata depends on the code object version; see
 :ref:`amdgpu-amdhsa-code-object-metadata-v2`,
-:ref:`amdgpu-amdhsa-code-object-metadata-v3`, and
-:ref:`amdgpu-amdhsa-code-object-metadata-v4`.
+:ref:`amdgpu-amdhsa-code-object-metadata-v3`,
+:ref:`amdgpu-amdhsa-code-object-metadata-v4` and
+:ref:`amdgpu-amdhsa-code-object-metadata-v5`.
 
 Code object metadata is specified in a note record (see
 :ref:`amdgpu-note-records`) and is required when the target triple OS is
@@ -2994,8 +3002,8 @@ Code Object V3 Metadata
   Code object V3 is not the default code object version emitted by this version
   of LLVM.
 
-Code object V3 to V4 metadata is specified by the ``NT_AMDGPU_METADATA`` note
-record (see :ref:`amdgpu-note-records-v3-v4`).
+Code object V3 and above metadata is specified by the ``NT_AMDGPU_METADATA`` note
+record (see :ref:`amdgpu-note-records-v3-onwards`).
 
 The metadata is represented as Message Pack formatted binary data (see
 [MsgPack]_). The top level is a Message Pack map that includes the
@@ -3431,9 +3439,9 @@ Code Object V4 Metadata
 
 Code object V4 metadata is the same as
 :ref:`amdgpu-amdhsa-code-object-metadata-v3` with the changes and additions
-defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v3`.
+defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v4`.
 
-  .. table:: AMDHSA Code Object V4 Metadata Map Changes from :ref:`amdgpu-amdhsa-code-object-metadata-v3`
+  .. table:: AMDHSA Code Object V4 Metadata Map Changes
      :name: amdgpu-amdhsa-code-object-metadata-map-table-v4
 
      ================= ============== ========= =======================================
@@ -3454,6 +3462,133 @@ defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v3`.
                                                 and :ref:`amdgpu-target-id`.
      ================= ============== ========= =======================================
 
+.. _amdgpu-amdhsa-code-object-metadata-v5:
+
+Code Object V5 Metadata
++++++++++++++++++++++++
+
+.. warning::
+  Code object V5 is not the default code object version emitted by this version
+  of LLVM.
+
+
+Code object V5 metadata is the same as
+:ref:`amdgpu-amdhsa-code-object-metadata-v4` with the changes defined in table
+:ref:`amdgpu-amdhsa-code-object-metadata-map-table-v5` and table
+:ref:`amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v5`.
+
+  .. table:: AMDHSA Code Object V5 Metadata Map Changes
+     :name: amdgpu-amdhsa-code-object-metadata-map-table-v5
+
+     ================= ============== ========= =======================================
+     String Key        Value Type     Required? Description
+     ================= ============== ========= =======================================
+     "amdhsa.version"  sequence of    Required  - The first integer is the major
+                       2 integers                 version. Currently 1.
+                                                - The second integer is the minor
+                                                  version. Currently 2.
+     ================= ============== ========= =======================================
+
+..
+
+  .. table:: AMDHSA Code Object V5 Kernel Argument Metadata Map Additions and Changes
+     :name: amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v5
+
+     ====================== ============== ========= ================================
+     String Key             Value Type     Required? Description
+     ====================== ============== ========= ================================
+     ".value_kind"          string         Required  Kernel argument kind that
+                                                     specifies how to set up the
+                                                     corresponding argument.
+                                                     Values include:
+                                                     the same as code object V3 metadata
+                                                     (see :ref:`amdgpu-amdhsa-code-object-kernel-argument-metadata-map-table-v3`)
+                                                     with the following additions:
+
+                                                     "hidden_block_count_x"
+                                                       The grid dispatch work-group count for the X dimension
+                                                       is passed in the kernarg. Some languages, such as OpenCL,
+                                                       support a last work-group in each dimension being partial.
+                                                       This count only includes the non-partial work-group count.
+                                                       This is not the same as the value in the AQL dispatch packet,
+                                                       which has the grid size in work-items.
+
+                                                     "hidden_block_count_y"
+                                                       The grid dispatch work-group count for the Y dimension
+                                                       is passed in the kernarg. Some languages, such as OpenCL,
+                                                       support a last work-group in each dimension being partial.
+                                                       This count only includes the non-partial work-group count.
+                                                       This is not the same as the value in the AQL dispatch packet,
+                                                       which has the grid size in work-items. If the grid dimentionality
+                                                       is 1, then must be 1.
+
+                                                     "hidden_block_count_z"
+                                                       The grid dispatch work-group count for the Z dimension
+                                                       is passed in the kernarg. Some languages, such as OpenCL,
+                                                       support a last work-group in each dimension being partial.
+                                                       This count only includes the non-partial work-group count.
+                                                       This is not the same as the value in the AQL dispatch packet,
+                                                       which has the grid size in work-items. If the grid dimentionality
+                                                       is 1 or 2, then must be 1.
+
+                                                     "hidden_group_size_x"
+                                                       The grid dispatch work-group size for the X dimension is
+                                                       passed in the kernarg. This size only applies to the
+                                                       non-partial work-groups. This is the same value as the AQL
+                                                       dispatch packet work-group size.
+
+                                                     "hidden_group_size_y"
+                                                       The grid dispatch work-group size for the Y dimension is
+                                                       passed in the kernarg. This size only applies to the
+                                                       non-partial work-groups. This is the same value as the AQL
+                                                       dispatch packet work-group size. If the grid dimentionality
+                                                       is 1, then must be 1.
+
+                                                     "hidden_group_size_z"
+                                                       The grid dispatch work-group size for the Z dimension is
+                                                       passed in the kernarg. This size only applies to the
+                                                       non-partial work-groups. This is the same value as the AQL
+                                                       dispatch packet work-group size. If the grid dimentionality
+                                                       is 1, then must be 1 or 2.
+
+                                                     "hidden_remainder_x"
+                                                       The grid dispatch work group size of the the partial work group
+                                                       of the X dimension, if it exists. Must be zero if a partial
+                                                       work group does not exist in the X dimension.
+
+                                                     "hidden_remainder_y"
+                                                       The grid dispatch work group size of the the partial work group
+                                                       of the Y dimension, if it exists. Must be zero if a partial
+                                                       work group does not exist in the Y dimension.
+
+                                                     "hidden_remainder_z"
+                                                       The grid dispatch work group size of the the partial work group
+                                                       of the Z dimension, if it exists. Must be zero if a partial
+                                                       work group does not exist in the Z dimension.
+
+                                                     "hidden_grid_dims"
+                                                       The grid dispatch dimentionality. This is the same value
+                                                       as the AQL dispatch packet dimentionality. Must be a value
+                                                       between 1 and 3.
+
+                                                     "hidden_private_base"
+                                                       The high 32 bits of the flat addressing private aperture base.
+                                                       Only used by GFX8 to allow conversion between private segment
+                                                       and flat addresses. See :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`.
+
+                                                     "hidden_shared_base"
+                                                       The high 32 bits of the flat addressing shared aperture base.
+                                                       Only used by GFX8 to allow conversion between shared segment
+                                                       and flat addresses. See :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`.
+
+                                                     "hidden_queue_ptr"
+                                                       A global memory address space pointer to the ROCm runtime
+                                                       ``struct amd_queue_t`` structure for the HSA queue of the
+                                                       associated dispatch AQL packet. It is only required for pre-GFX9
+                                                       devices for the trap handler ABI (see :ref:`amdgpu-amdhsa-trap-handler-abi`).
+
+     ====================== ============== ========= ================================
+
 ..
 
 Kernel Dispatch
@@ -3585,7 +3720,7 @@ local apertures), that are outside the range of addressible global memory, to
 map from a flat address to a private or local address.
 
 FLAT instructions can take a flat address and access global, private (scratch)
-and group (LDS) memory depending in if the address is within one of the
+and group (LDS) memory depending on if the address is within one of the
 aperture ranges. Flat access to scratch requires hardware aperture setup and
 setup in the kernel prologue (see
 :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`). Flat access to LDS requires
@@ -10571,6 +10706,8 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
                                - system                  for OpenCL.*
      ============ ============ ============== ========== ================================
 
+.. _amdgpu-amdhsa-trap-handler-abi:
+
 Trap Handler ABI
 ~~~~~~~~~~~~~~~~
 
@@ -10580,7 +10717,7 @@ supports the ``s_trap`` instruction. For usage see:
 
 - :ref:`amdgpu-trap-handler-for-amdhsa-os-v2-table`
 - :ref:`amdgpu-trap-handler-for-amdhsa-os-v3-table`
-- :ref:`amdgpu-trap-handler-for-amdhsa-os-v4-table`
+- :ref:`amdgpu-trap-handler-for-amdhsa-os-v4-onwards-table`
 
   .. table:: AMDGPU Trap Handler for AMDHSA OS Code Object V2
      :name: amdgpu-trap-handler-for-amdhsa-os-v2-table
@@ -10664,8 +10801,8 @@ supports the ``s_trap`` instruction. For usage see:
 
 ..
 
-  .. table:: AMDGPU Trap Handler for AMDHSA OS Code Object V4
-     :name: amdgpu-trap-handler-for-amdhsa-os-v4-table
+  .. table:: AMDGPU Trap Handler for AMDHSA OS Code Object V4 and Above
+     :name: amdgpu-trap-handler-for-amdhsa-os-v4-onwards-table
 
      =================== =============== ================ ================= =======================================
      Usage               Code Sequence   GFX6-GFX8 Inputs GFX9-GFX10 Inputs Description
@@ -11127,7 +11264,7 @@ Code Object Metadata
   was generated the version was 2.6.*
 
 Code object metadata is specified by the ``NT_AMDGPU_METADATA`` note
-record (see :ref:`amdgpu-note-records-v3-v4`).
+record (see :ref:`amdgpu-note-records-v3-onwards`).
 
 The metadata is represented as Message Pack formatted binary data (see
 [MsgPack]_). The top level is a Message Pack map that includes the keys
@@ -11988,10 +12125,10 @@ Here is an example of a minimal assembly source file, defining one HSA kernel:
    .Lfunc_end0:
         .size   hello_world, .Lfunc_end0-hello_world
 
-.. _amdgpu-amdhsa-assembler-predefined-symbols-v3-v4:
+.. _amdgpu-amdhsa-assembler-predefined-symbols-v3-onwards:
 
-Code Object V3 to V4 Predefined Symbols
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Code Object V3 and Above Predefined Symbols
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The AMDGPU assembler defines and updates some symbols automatically. These
 symbols do not affect code generation.
@@ -12050,10 +12187,10 @@ May be used to set the `.amdhsa_next_free_spgr` directive in
 
 May be set at any time, e.g. manually set to zero at the start of each kernel.
 
-.. _amdgpu-amdhsa-assembler-directives-v3-v4:
+.. _amdgpu-amdhsa-assembler-directives-v3-onwards:
 
-Code Object V3 to V4 Directives
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Code Object V3 and Above Directives
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Directives which begin with ``.amdgcn`` are valid for all ``amdgcn``
 architecture processors, and are not OS-specific. Directives which begin with
@@ -12216,18 +12353,19 @@ terminated by an ``.end_amdhsa_kernel`` directive.
 ++++++++++++++++
 
 Optional directive which declares the contents of the ``NT_AMDGPU_METADATA``
-note record (see :ref:`amdgpu-elf-note-records-table-v3-v4`).
+note record (see :ref:`amdgpu-elf-note-records-table-v3-onwards`).
 
 The contents must be in the [YAML]_ markup format, with the same structure and
-semantics described in :ref:`amdgpu-amdhsa-code-object-metadata-v3` or
-:ref:`amdgpu-amdhsa-code-object-metadata-v4`.
+semantics described in :ref:`amdgpu-amdhsa-code-object-metadata-v3`,
+:ref:`amdgpu-amdhsa-code-object-metadata-v4` or
+:ref:`amdgpu-amdhsa-code-object-metadata-v5`.
 
 This directive is terminated by an ``.end_amdgpu_metadata`` directive.
 
-.. _amdgpu-amdhsa-assembler-example-v3-v4:
+.. _amdgpu-amdhsa-assembler-example-v3-onwards:
 
-Code Object V3 to V4 Example Source Code
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Code Object V3 and Above Example Source Code
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Here is an example of a minimal assembly source file, defining one HSA kernel:
 
index 8840929..5d3b127 100644 (file)
@@ -372,7 +372,8 @@ enum {
   // was never defined for V1.
   ELFABIVERSION_AMDGPU_HSA_V2 = 0,
   ELFABIVERSION_AMDGPU_HSA_V3 = 1,
-  ELFABIVERSION_AMDGPU_HSA_V4 = 2
+  ELFABIVERSION_AMDGPU_HSA_V4 = 2,
+  ELFABIVERSION_AMDGPU_HSA_V5 = 3
 };
 
 #define ELF_RELOC(name, value) name = value,
index 784a980..e0838a1 100644 (file)
@@ -44,6 +44,11 @@ constexpr uint32_t VersionMajorV4 = 1;
 /// HSA metadata minor version for code object V4.
 constexpr uint32_t VersionMinorV4 = 1;
 
+/// HSA metadata major version for code object V5.
+constexpr uint32_t VersionMajorV5 = 1;
+/// HSA metadata minor version for code object V5.
+constexpr uint32_t VersionMinorV5 = 2;
+
 /// HSA metadata beginning assembler directive.
 constexpr char AssemblerDirectiveBegin[] = ".amd_amdgpu_hsa_metadata";
 /// HSA metadata ending assembler directive.
index 99d2c82..0d28d93 100644 (file)
@@ -117,15 +117,28 @@ bool MetadataVerifier::verifyKernelArgs(msgpack::DocNode &Node) {
                                .Case("image", true)
                                .Case("pipe", true)
                                .Case("queue", true)
+                               .Case("hidden_block_count_x", true)
+                               .Case("hidden_block_count_y", true)
+                               .Case("hidden_block_count_z", true)
+                               .Case("hidden_group_size_x", true)
+                               .Case("hidden_group_size_y", true)
+                               .Case("hidden_group_size_z", true)
+                               .Case("hidden_remainder_x", true)
+                               .Case("hidden_remainder_y", true)
+                               .Case("hidden_remainder_z", true)
                                .Case("hidden_global_offset_x", true)
                                .Case("hidden_global_offset_y", true)
                                .Case("hidden_global_offset_z", true)
+                               .Case("hidden_grid_dims", true)
                                .Case("hidden_none", true)
                                .Case("hidden_printf_buffer", true)
                                .Case("hidden_hostcall_buffer", true)
                                .Case("hidden_default_queue", true)
                                .Case("hidden_completion_action", true)
                                .Case("hidden_multigrid_sync_arg", true)
+                               .Case("hidden_private_base", true)
+                               .Case("hidden_shared_base", true)
+                               .Case("hidden_queue_ptr", true)
                                .Default(false);
                          }))
     return false;
index ffe2599..d597148 100644 (file)
@@ -579,6 +579,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
       BCase(EF_AMDGPU_FEATURE_SRAMECC_V3);
       break;
     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+    case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
       BCaseMask(EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4,
                 EF_AMDGPU_FEATURE_XNACK_V4);
       BCaseMask(EF_AMDGPU_FEATURE_XNACK_ANY_V4,
index bb2e723..6e2984f 100644 (file)
@@ -88,6 +88,8 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
       HSAMetadataStream.reset(new HSAMD::MetadataStreamerV2());
     } else if (isHsaAbiVersion3(getGlobalSTI())) {
       HSAMetadataStream.reset(new HSAMD::MetadataStreamerV3());
+    } else if (isHsaAbiVersion5(getGlobalSTI())) {
+      HSAMetadataStream.reset(new HSAMD::MetadataStreamerV5());
     } else {
       HSAMetadataStream.reset(new HSAMD::MetadataStreamerV4());
     }
@@ -118,7 +120,7 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
       TM.getTargetTriple().getOS() != Triple::AMDPAL)
     return;
 
-  if (isHsaAbiVersion3Or4(getGlobalSTI()))
+  if (isHsaAbiVersion3AndAbove(getGlobalSTI()))
     getTargetStreamer()->EmitDirectiveAMDGCNTarget();
 
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
@@ -127,7 +129,7 @@ void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
   if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
     getTargetStreamer()->getPALMetadata()->readFromIR(M);
 
-  if (isHsaAbiVersion3Or4(getGlobalSTI()))
+  if (isHsaAbiVersion3AndAbove(getGlobalSTI()))
     return;
 
   // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2.
@@ -259,7 +261,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
 
 void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA &&
-      isHsaAbiVersion3Or4(getGlobalSTI())) {
+      isHsaAbiVersion3AndAbove(getGlobalSTI())) {
     AsmPrinter::emitFunctionEntryLabel();
     return;
   }
index 3ac7c45..f5018e3 100644 (file)
@@ -672,15 +672,15 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
     Kern[".kind"] = Kern.getDocument()->getNode("fini");
 }
 
-void MetadataStreamerV3::emitKernelArgs(const Function &Func,
-                                        const GCNSubtarget &ST,
+void MetadataStreamerV3::emitKernelArgs(const MachineFunction &MF,
                                         msgpack::MapDocNode Kern) {
+  auto &Func = MF.getFunction();
   unsigned Offset = 0;
   auto Args = HSAMetadataDoc->getArrayNode();
   for (auto &Arg : Func.args())
     emitKernelArg(Arg, Offset, Args);
 
-  emitHiddenKernelArgs(Func, ST, Offset, Args);
+  emitHiddenKernelArgs(MF, Offset, Args);
 
   Kern[".args"] = Args;
 }
@@ -789,10 +789,12 @@ void MetadataStreamerV3::emitKernelArg(
   Args.push_back(Arg);
 }
 
-void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func,
-                                              const GCNSubtarget &ST,
+void MetadataStreamerV3::emitHiddenKernelArgs(const MachineFunction &MF,
                                               unsigned &Offset,
                                               msgpack::ArrayDocNode Args) {
+  auto &Func = MF.getFunction();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
   unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func);
   if (!HiddenArgNumBytes)
     return;
@@ -910,7 +912,6 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
                                     const SIProgramInfo &ProgramInfo) {
   auto &Func = MF.getFunction();
   auto Kern = getHSAKernelProps(MF, ProgramInfo);
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
   assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
          Func.getCallingConv() == CallingConv::SPIR_KERNEL);
@@ -924,7 +925,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF,
         (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true);
     emitKernelLanguage(Func, Kern);
     emitKernelAttrs(Func, Kern);
-    emitKernelArgs(Func, ST, Kern);
+    emitKernelArgs(MF, Kern);
   }
 
   Kernels.push_back(Kern);
@@ -954,6 +955,97 @@ void MetadataStreamerV4::begin(const Module &Mod,
   getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
 }
 
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV5
+//===----------------------------------------------------------------------===//
+
+void MetadataStreamerV5::emitVersion() {
+  auto Version = HSAMetadataDoc->getArrayNode();
+  Version.push_back(Version.getDocument()->getNode(VersionMajorV5));
+  Version.push_back(Version.getDocument()->getNode(VersionMinorV5));
+  getRootMetadata("amdhsa.version") = Version;
+}
+
+void MetadataStreamerV5::emitHiddenKernelArgs(const MachineFunction &MF,
+                                              unsigned &Offset,
+                                              msgpack::ArrayDocNode Args) {
+  auto &Func = MF.getFunction();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const Module *M = Func.getParent();
+  auto &DL = M->getDataLayout();
+
+  auto Int64Ty = Type::getInt64Ty(Func.getContext());
+  auto Int32Ty = Type::getInt32Ty(Func.getContext());
+  auto Int16Ty = Type::getInt16Ty(Func.getContext());
+
+  emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args);
+  emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, Args);
+  emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args);
+
+  emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, Args);
+  emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, Args);
+  emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, Args);
+
+  emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args);
+  emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args);
+  emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args);
+
+  // Reserved for hidden_tool_correlation_id.
+  Offset += 8;
+
+  Offset += 8; // Reserved.
+
+  emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args);
+  emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, Args);
+  emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, Args);
+
+  emitKernelArg(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args);
+
+  Offset += 6; // Reserved.
+  auto Int8PtrTy =
+      Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+
+  if (M->getNamedMetadata("llvm.printf.fmts")) {
+    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
+                  Args);
+  } else
+    Offset += 8; // Skipped.
+
+  if (M->getModuleFlag("amdgpu_hostcall")) {
+    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
+                  Args);
+  } else
+    Offset += 8; // Skipped.
+
+  emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
+                Args);
+
+  // Ignore temporarily until it is implemented.
+  // emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
+  Offset += 8;
+
+  if (Func.hasFnAttribute("calls-enqueue-kernel")) {
+    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
+                  Args);
+    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
+                  Args);
+  } else
+    Offset += 16; // Skipped.
+
+  Offset += 72; // Reserved.
+
+  // hidden_private_base and hidden_shared_base are only used by GFX8.
+  if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args);
+    emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args);
+  } else
+    Offset += 8; // Skipped.
+
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  if (MFI.hasQueuePtr())
+    emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
+}
+
 } // end namespace HSAMD
 } // end namespace AMDGPU
 } // end namespace llvm
index 54ed0af..bcf7fc4 100644 (file)
@@ -53,6 +53,11 @@ public:
 
   virtual void emitKernel(const MachineFunction &MF,
                           const SIProgramInfo &ProgramInfo) = 0;
+
+protected:
+  virtual void emitVersion() = 0;
+  virtual void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
+                                    msgpack::ArrayDocNode Args) = 0;
 };
 
 // TODO: Rename MetadataStreamerV3 -> MetadataStreamerMsgPackV3.
@@ -79,7 +84,7 @@ protected:
   msgpack::MapDocNode getHSAKernelProps(const MachineFunction &MF,
                                         const SIProgramInfo &ProgramInfo) const;
 
-  void emitVersion();
+  void emitVersion() override;
 
   void emitPrintf(const Module &Mod);
 
@@ -87,8 +92,7 @@ protected:
 
   void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern);
 
-  void emitKernelArgs(const Function &Func, const GCNSubtarget &ST,
-                      msgpack::MapDocNode Kern);
+  void emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern);
 
   void emitKernelArg(const Argument &Arg, unsigned &Offset,
                      msgpack::ArrayDocNode Args);
@@ -100,8 +104,8 @@ protected:
                      StringRef BaseTypeName = "", StringRef AccQual = "",
                      StringRef TypeQual = "");
 
-  void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST,
-                            unsigned &Offset, msgpack::ArrayDocNode Args);
+  void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
+                            msgpack::ArrayDocNode Args) override;
 
   msgpack::DocNode &getRootMetadata(StringRef Key) {
     return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key];
@@ -127,9 +131,9 @@ public:
 };
 
 // TODO: Rename MetadataStreamerV4 -> MetadataStreamerMsgPackV4.
-class MetadataStreamerV4 final : public MetadataStreamerV3 {
-  void emitVersion();
-
+class MetadataStreamerV4 : public MetadataStreamerV3 {
+protected:
+  void emitVersion() override;
   void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);
 
 public:
@@ -140,6 +144,18 @@ public:
              const IsaInfo::AMDGPUTargetID &TargetID) override;
 };
 
+// TODO: Rename MetadataStreamerV5 -> MetadataStreamerMsgPackV5.
+class MetadataStreamerV5 final : public MetadataStreamerV4 {
+protected:
+  void emitVersion() override;
+  void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
+                            msgpack::ArrayDocNode Args) override;
+
+public:
+  MetadataStreamerV5() = default;
+  ~MetadataStreamerV5() = default;
+};
+
 // TODO: Rename MetadataStreamerV2 -> MetadataStreamerYamlV2.
 class MetadataStreamerV2 final : public MetadataStreamer {
 private:
@@ -167,8 +183,6 @@ private:
       const MachineFunction &MF,
       const SIProgramInfo &ProgramInfo) const;
 
-  void emitVersion();
-
   void emitPrintf(const Module &Mod);
 
   void emitKernelLanguage(const Function &Func);
@@ -191,6 +205,13 @@ private:
     return HSAMetadata;
   }
 
+protected:
+  void emitVersion() override;
+  void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
+                            msgpack::ArrayDocNode Args) override {
+    llvm_unreachable("Dummy override should not be invoked!");
+  }
+
 public:
   MetadataStreamerV2() = default;
   ~MetadataStreamerV2() = default;
index 04c6f67..645d05a 100644 (file)
@@ -4778,6 +4778,7 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
       return legalizeTrapHsaQueuePtr(MI, MRI, B);
     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+    case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
       return ST.supportsGetDoorbellID() ?
           legalizeTrapHsa(MI, MRI, B) :
           legalizeTrapHsaQueuePtr(MI, MRI, B);
index c1c88d9..c2efcb5 100644 (file)
@@ -1296,7 +1296,7 @@ public:
       // AsmParser::parseDirectiveSet() cannot be specialized for specific target.
       AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
       MCContext &Ctx = getContext();
-      if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) {
+      if (ISA.Major >= 6 && isHsaAbiVersion3AndAbove(&getSTI())) {
         MCSymbol *Sym =
             Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
         Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
@@ -1313,7 +1313,7 @@ public:
         Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
         Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
       }
-      if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) {
+      if (ISA.Major >= 6 && isHsaAbiVersion3AndAbove(&getSTI())) {
         initializeGprCountSymbol(IS_VGPR);
         initializeGprCountSymbol(IS_SGPR);
       } else
@@ -2747,7 +2747,7 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) {
   if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
     return nullptr;
   }
-  if (isHsaAbiVersion3Or4(&getSTI())) {
+  if (isHsaAbiVersion3AndAbove(&getSTI())) {
     if (!updateGprCountSymbols(RegKind, RegNum, RegWidth))
       return nullptr;
   } else
@@ -5099,7 +5099,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
   const char *AssemblerDirectiveBegin;
   const char *AssemblerDirectiveEnd;
   std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) =
-      isHsaAbiVersion3Or4(&getSTI())
+      isHsaAbiVersion3AndAbove(&getSTI())
           ? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin,
                             HSAMD::V3::AssemblerDirectiveEnd)
           : std::make_tuple(HSAMD::AssemblerDirectiveBegin,
@@ -5116,7 +5116,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
                           HSAMetadataString))
     return true;
 
-  if (isHsaAbiVersion3Or4(&getSTI())) {
+  if (isHsaAbiVersion3AndAbove(&getSTI())) {
     if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
       return Error(getLoc(), "invalid HSA metadata");
   } else {
@@ -5266,7 +5266,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
 bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getString();
 
-  if (isHsaAbiVersion3Or4(&getSTI())) {
+  if (isHsaAbiVersion3AndAbove(&getSTI())) {
     if (IDVal == ".amdhsa_kernel")
      return ParseDirectiveAMDHSAKernel();
 
@@ -7440,7 +7440,7 @@ void AMDGPUAsmParser::onBeginOfFile() {
   if (!getTargetStreamer().getTargetID())
     getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString());
 
-  if (isHsaAbiVersion3Or4(&getSTI()))
+  if (isHsaAbiVersion3AndAbove(&getSTI()))
     getTargetStreamer().EmitDirectiveAMDGCNTarget();
 }
 
index 9578bdb..7aa5f1a 100644 (file)
@@ -396,6 +396,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
       break;
     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+    case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
       if (getTargetID()->isXnackSupported())
         OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n';
       break;
@@ -578,6 +579,7 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() {
     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
       return getEFlagsV3();
     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+    case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
       return getEFlagsV4();
     }
   }
index 561866b..e2f4a08 100644 (file)
@@ -5423,6 +5423,7 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
       return lowerTrapHsaQueuePtr(Op, DAG);
     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+    case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
       return Subtarget->supportsGetDoorbellID() ?
           lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG);
     }
index 1e96266..683be87 100644 (file)
@@ -99,6 +99,8 @@ Optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
     return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
   case 4:
     return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
+  case 5:
+    return ELF::ELFABIVERSION_AMDGPU_HSA_V5;
   default:
     report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") +
                        Twine(AmdhsaCodeObjectVersion));
@@ -123,8 +125,15 @@ bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
   return false;
 }
 
-bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) {
-  return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI);
+bool isHsaAbiVersion5(const MCSubtargetInfo *STI) {
+  if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+    return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V5;
+  return false;
+}
+
+bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) {
+  return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI) ||
+         isHsaAbiVersion5(STI);
 }
 
 #define GET_MIMGBaseOpcodesTable_IMPL
@@ -495,6 +504,7 @@ std::string AMDGPUTargetID::toString() const {
         Features += "+sram-ecc";
       break;
     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+    case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
       // sramecc.
       if (getSramEccSetting() == TargetIDSetting::Off)
         Features += ":sramecc-";
index 89f928e..4516b51 100644 (file)
@@ -47,9 +47,12 @@ bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
 /// \returns True if HSA OS ABI Version identification is 4,
 /// false otherwise.
 bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 5,
+/// false otherwise.
+bool isHsaAbiVersion5(const MCSubtargetInfo *STI);
 /// \returns True if HSA OS ABI Version identification is 3 or 4,
 /// false otherwise.
-bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI);
+bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI);
 
 struct GcnBufferFormatInfo {
   unsigned Format;
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll
new file mode 100644 (file)
index 0000000..580fecd
--- /dev/null
@@ -0,0 +1,123 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK %s
+
+
+; CHECK:       amdhsa.kernels:
+; CHECK-NEXT:       - .args:
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           r
+; CHECK-NEXT:         .offset:         0
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           a
+; CHECK-NEXT:         .offset:         8
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           b
+; CHECK-NEXT:         .offset:         16
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .offset:         24
+; CHECK-NEXT:         .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_x
+; CHECK-NEXT:      - .offset:         28
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_y
+; CHECK-NEXT:      - .offset:         32
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_z
+; CHECK-NEXT:      - .offset:         36
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_x
+; CHECK-NEXT:      - .offset:         38
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_y
+; CHECK-NEXT:      - .offset:         40
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_z
+; CHECK-NEXT:      - .offset:         42
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_x
+; CHECK-NEXT:      - .offset:         44
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_y
+; CHECK-NEXT:      - .offset:         46
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_z
+; CHECK-NEXT:      - .offset:         64
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:      - .offset:         72
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:      - .offset:         80
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:      - .offset:         88
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_grid_dims
+; CHECK-NEXT:      - .address_space:  global
+; CHECK-NEXT:        .offset:         96
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_printf_buffer
+; CHECK-NEXT:      - .address_space:  global
+; CHECK-NEXT:        .offset:         104
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_hostcall_buffer
+; CHECK-NEXT:      - .address_space:  global
+; CHECK-NEXT:        .offset:         112
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_multigrid_sync_arg
+; CHECK-NEXT:      - .address_space:  global
+; CHECK-NEXT:        .offset:         128
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_default_queue
+; CHECK-NEXT:      - .address_space:  global
+; CHECK-NEXT:        .offset:         136
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_completion_action
+; GFX8-NEXT:      - .offset:         216
+; GFX8-NEXT:        .size:           4
+; GFX8-NEXT:        .value_kind:     hidden_private_base
+; GFX8-NEXT:      - .offset:         220
+; GFX8-NEXT:        .size:           4
+; GFX8-NEXT:        .value_kind:     hidden_shared_base
+; CHECK-NEXT:      - .address_space:  global
+; CHECK-NEXT:        .offset:         224
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_queue_ptr
+
+; CHECK:          .name:           test_v5
+; CHECK:          .symbol:         test_v5.kd
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 2
+define amdgpu_kernel void @test_v5(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) #0 {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.printf.fmts = !{!1, !2}
+
+!0 = !{i32 1, !"amdgpu_hostcall", i32 1}
+!1 = !{!"1:1:4:%d\5Cn"}
+!2 = !{!"2:1:8:%g\5Cn"}
+
+attributes #0 = { optnone noinline "calls-enqueue-kernel" }
+
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll
new file mode 100644 (file)
index 0000000..e1ffd33
--- /dev/null
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK  %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK  %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX9  %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefixes=CHECK,GFX9 %s
+
+
+; On gfx8, the queue ptr is required for this addrspacecast.
+; CHECK: - .args:
+; PRE-GFX9:          .offset:         208
+; PRE-GFX9-NEXT:     .size:           8
+; PRE-GFX9-NEXT:     .value_kind:     hidden_queue_ptr
+; GFX9-NOT:          .value_kind:     hidden_queue_ptr
+; CHECK:             .name:           addrspacecast_requires_queue_ptr
+; CHECK:             .symbol:         addrspacecast_requires_queue_ptr.kd
+define amdgpu_kernel void @addrspacecast_requires_queue_ptr(i32 addrspace(5)* %ptr.private, i32 addrspace(3)* %ptr.local) {
+  %flat.private = addrspacecast i32 addrspace(5)* %ptr.private to i32*
+  %flat.local = addrspacecast i32 addrspace(3)* %ptr.local to i32*
+  store volatile i32 1, i32* %flat.private
+  store volatile i32 2, i32* %flat.local
+  ret void
+}
+
+; CHECK: - .args:
+; CHECK:             .offset:         208
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_queue_ptr
+; CHECK:             .name:           is_shared_requires_queue_ptr
+; CHECK:             .symbol:         is_shared_requires_queue_ptr.kd
+define amdgpu_kernel void @is_shared_requires_queue_ptr(i8* %ptr) {
+  %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
+  %zext = zext i1 %is.shared to i32
+  store volatile i32 %zext, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: - .args:
+; CHECK:             .offset:         208
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_queue_ptr
+; CHECK:             .name:           is_private_requires_queue_ptr
+; CHECK:             .symbol:         is_private_requires_queue_ptr.kd
+define amdgpu_kernel void @is_private_requires_queue_ptr(i8* %ptr) {
+  %is.private = call i1 @llvm.amdgcn.is.private(i8* %ptr)
+  %zext = zext i1 %is.private to i32
+  store volatile i32 %zext, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: - .args:
+; CHECK:             .offset:         200
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_queue_ptr
+; CHECK:             .name:           trap_requires_queue_ptr
+; CHECK:             .symbol:         trap_requires_queue_ptr.kd
+define amdgpu_kernel void @trap_requires_queue_ptr() {
+  call void @llvm.trap()
+  unreachable
+}
+
+; CHECK: - .args:
+; CHECK:             .offset:         200
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_queue_ptr
+; CHECK:             .name:           debugtrap_requires_queue_ptr
+; CHECK:             .symbol:         debugtrap_requires_queue_ptr.kd
+define amdgpu_kernel void @debugtrap_requires_queue_ptr() {
+  call void @llvm.debugtrap()
+  unreachable
+}
+
+; CHECK: - .args:
+; CHECK:             .offset:         208
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_queue_ptr
+; CHECK:             .name:           amdgcn_queue_ptr_requires_queue_ptr
+; CHECK:             .symbol:         amdgcn_queue_ptr_requires_queue_ptr.kd
+define amdgpu_kernel void @amdgcn_queue_ptr_requires_queue_ptr(i64 addrspace(1)* %ptr)  {
+  %queue.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
+  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
+  %queue.load = load volatile i8, i8 addrspace(4)* %queue.ptr
+  %implicitarg.load = load volatile i8, i8 addrspace(4)* %implicitarg.ptr
+  %dispatch.load = load volatile i8, i8 addrspace(4)* %dispatch.ptr
+  store volatile i64 %dispatch.id, i64 addrspace(1)* %ptr
+  ret void
+}
+
+
+declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
+declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+declare i64 @llvm.amdgcn.dispatch.id()
+declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+declare i1 @llvm.amdgcn.is.shared(i8*)
+declare i1 @llvm.amdgcn.is.private(i8*)
+declare void @llvm.trap()
+declare void @llvm.debugtrap()
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-reduced-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-reduced-hidden-args-v5.ll
new file mode 100644 (file)
index 0000000..c6dd98e
--- /dev/null
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK %s
+
+
+; CHECK:       amdhsa.kernels:
+; CHECK-NEXT:       - .args:
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           r
+; CHECK-NEXT:         .offset:         0
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           a
+; CHECK-NEXT:         .offset:         8
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .address_space:  global
+; CHECK-NEXT:         .name:           b
+; CHECK-NEXT:         .offset:         16
+; CHECK-NEXT:         .size:           8
+; CHECK-NEXT:         .value_kind:     global_buffer
+; CHECK-NEXT:       - .offset:         24
+; CHECK-NEXT:         .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_x
+; CHECK-NEXT:      - .offset:         28
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_y
+; CHECK-NEXT:      - .offset:         32
+; CHECK-NEXT:        .size:           4
+; CHECK-NEXT:        .value_kind:     hidden_block_count_z
+; CHECK-NEXT:      - .offset:         36
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_x
+; CHECK-NEXT:      - .offset:         38
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_y
+; CHECK-NEXT:      - .offset:         40
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_group_size_z
+; CHECK-NEXT:      - .offset:         42
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_x
+; CHECK-NEXT:      - .offset:         44
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_y
+; CHECK-NEXT:      - .offset:         46
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_remainder_z
+; CHECK-NEXT:      - .offset:         64
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_x
+; CHECK-NEXT:      - .offset:         72
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_y
+; CHECK-NEXT:      - .offset:         80
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_global_offset_z
+; CHECK-NEXT:      - .offset:         88
+; CHECK-NEXT:        .size:           2
+; CHECK-NEXT:        .value_kind:     hidden_grid_dims
+; CHECK-NEXT:      - .address_space:  global
+; CHECK-NEXT:        .offset:         112
+; CHECK-NEXT:        .size:           8
+; CHECK-NEXT:        .value_kind:     hidden_multigrid_sync_arg
+; GFX8-NEXT:      - .offset:         216
+; GFX8-NEXT:        .size:           4
+; GFX8-NEXT:        .value_kind:     hidden_private_base
+; GFX8-NEXT:      - .offset:         220
+; GFX8-NEXT:        .size:           4
+; GFX8-NEXT:        .value_kind:     hidden_shared_base
+
+; CHECK:          .name:           test_v5_reduced_hidden
+; CHECK:          .symbol:         test_v5_reduced_hidden.kd
+
+; CHECK:  amdhsa.version:
+; CHECK-NEXT: - 1
+; CHECK-NEXT: - 2
+define amdgpu_kernel void @test_v5_reduced_hidden(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
index cfb6181..04a6722 100644 (file)
@@ -6393,6 +6393,7 @@ template <class ELFT> void LLVMELFDumper<ELFT>::printFileHeaders() {
                      unsigned(ELF::EF_AMDGPU_MACH));
         break;
       case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+      case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
         W.printFlags("Flags", E.e_flags,
                      makeArrayRef(ElfHeaderAMDGPUFlagsABIVersion4),
                      unsigned(ELF::EF_AMDGPU_MACH),