From d16a0877d8ac12a49fc75ae651247f338d46fead Mon Sep 17 00:00:00 2001
From: Jose Manuel Monsalve Diaz <josem@udel.edu>
Date: Wed, 1 Jun 2022 21:49:23 +0000
Subject: [PATCH] [LIBOMPTARGET] Adding AMD to llvm-omp-device-info

Adding device information print for AMD devices on the
`llvm-omp-device-info` command line tool. The output is inspired by
the rocminfo command line tool.

This commit adds missing HSA functions, enums and structs
needed to query additional information from the HSA agents.
A generic message for the `generic-elf-64bit` plugin is also added.

Example output (the `Product Name` line and the `ISAs` section that the
code also prints are not shown):
```
llvm-omp-device-info
Device (0):
    This is a generic-elf-64bit device

Device (1):
    This is a generic-elf-64bit device

Device (2):
    This is a generic-elf-64bit device

Device (3):
    This is a generic-elf-64bit device

Device (4):
    HSA Runtime Version:                1.1
    HSA OpenMP Device Number:           0
    Device Name:                        gfx906
    Vendor Name:                        AMD
    Device Type:                        GPU
    Max Queues:                         128
    Queue Min Size:                     64
    Queue Max Size:                     131072
    Cache:
      L0:                               16384 bytes
      L1:                               8388608 bytes
    Cacheline Size:                     64
    Max Clock Freq(MHz):                1725
    Compute Units:                      60
    SIMD per CU:                        4
    Fast F16 Operation:                 TRUE
    Wavefront Size:                     64
    Workgroup Max Size:                 1024
    Workgroup Max Size per Dimension:
      x:                                1024
      y:                                1024
      z:                                1024
    Max Waves Per CU:                   40
    Max Work-item Per CU:               2560
    Grid Max Size:                      4294967295
    Grid Max Size per Dimension:
      x:                                4294967295
      y:                                4294967295
      z:                                4294967295
    Max fbarriers/Workgrp:              32
    Memory Pools:
      Pool GLOBAL; FLAGS: COARSE GRAINED, :
        Size:                            34342961152 bytes
        Allocatable:                     TRUE
        Runtime Alloc Granule:           4096 bytes
        Runtime Alloc alignment:         4096 bytes
        Accessable by all:               FALSE
      Pool GLOBAL; FLAGS: FINE GRAINED, :
        Size:                            34342961152 bytes
        Allocatable:                     TRUE
        Runtime Alloc Granule:           4096 bytes
        Runtime Alloc alignment:         4096 bytes
        Accessable by all:               FALSE
      Pool GROUP:
        Size:                            65536 bytes
        Allocatable:                     FALSE
        Runtime Alloc Granule:           0 bytes
        Runtime Alloc alignment:         0 bytes
        Accessable by all:               FALSE

Device (5):
    HSA Runtime Version:                1.1
    HSA OpenMP Device Number:           1
    Device Name:                        gfx906
    Vendor Name:                        AMD
    Device Type:                        GPU
    Max Queues:                         128
    Queue Min Size:                     64
    Queue Max Size:                     131072
    Cache:
      L0:                               16384 bytes
      L1:                               8388608 bytes
    Cacheline Size:                     64
    Max Clock Freq(MHz):                1725
    Compute Units:                      60
    SIMD per CU:                        4
    Fast F16 Operation:                 TRUE
    Wavefront Size:                     64
    Workgroup Max Size:                 1024
    Workgroup Max Size per Dimension:
      x:                                1024
      y:                                1024
      z:                                1024
    Max Waves Per CU:                   40
    Max Work-item Per CU:               2560
    Grid Max Size:                      4294967295
    Grid Max Size per Dimension:
      x:                                4294967295
      y:                                4294967295
      z:                                4294967295
    Max fbarriers/Workgrp:              32
    Memory Pools:
      Pool GLOBAL; FLAGS: COARSE GRAINED, :
        Size:                            34342961152 bytes
        Allocatable:                     TRUE
        Runtime Alloc Granule:           4096 bytes
        Runtime Alloc alignment:         4096 bytes
        Accessable by all:               FALSE
      Pool GLOBAL; FLAGS: FINE GRAINED, :
        Size:                            34342961152 bytes
        Allocatable:                     TRUE
        Runtime Alloc Granule:           4096 bytes
        Runtime Alloc alignment:         4096 bytes
        Accessable by all:               FALSE
      Pool GROUP:
        Size:                            65536 bytes
        Allocatable:                     FALSE
        Runtime Alloc Granule:           0 bytes
        Runtime Alloc alignment:         0 bytes
        Accessable by all:               FALSE

Device (6):
    HSA Runtime Version:                1.1
    HSA OpenMP Device Number:           2
    Device Name:                        gfx906
    Vendor Name:                        AMD
    Device Type:                        GPU
    Max Queues:                         128
    Queue Min Size:                     64
    Queue Max Size:                     131072
    Cache:
      L0:                               16384 bytes
      L1:                               8388608 bytes
    Cacheline Size:                     64
    Max Clock Freq(MHz):                1725
    Compute Units:                      60
    SIMD per CU:                        4
    Fast F16 Operation:                 TRUE
    Wavefront Size:                     64
    Workgroup Max Size:                 1024
    Workgroup Max Size per Dimension:
      x:                                1024
      y:                                1024
      z:                                1024
    Max Waves Per CU:                   40
    Max Work-item Per CU:               2560
    Grid Max Size:                      4294967295
    Grid Max Size per Dimension:
      x:                                4294967295
      y:                                4294967295
      z:                                4294967295
    Max fbarriers/Workgrp:              32
    Memory Pools:
      Pool GLOBAL; FLAGS: COARSE GRAINED, :
        Size:                            34342961152 bytes
        Allocatable:                     TRUE
        Runtime Alloc Granule:           4096 bytes
        Runtime Alloc alignment:         4096 bytes
        Accessable by all:               FALSE
      Pool GLOBAL; FLAGS: FINE GRAINED, :
        Size:                            34342961152 bytes
        Allocatable:                     TRUE
        Runtime Alloc Granule:           4096 bytes
        Runtime Alloc alignment:         4096 bytes
        Accessable by all:               FALSE
      Pool GROUP:
        Size:                            65536 bytes
        Allocatable:                     FALSE
        Runtime Alloc Granule:           0 bytes
        Runtime Alloc alignment:         0 bytes
        Accessable by all:               FALSE

Device (7):
    HSA Runtime Version:                1.1
    HSA OpenMP Device Number:           3
    Device Name:                        gfx906
    Vendor Name:                        AMD
    Device Type:                        GPU
    Max Queues:                         128
    Queue Min Size:                     64
    Queue Max Size:                     131072
    Cache:
      L0:                               16384 bytes
      L1:                               8388608 bytes
    Cacheline Size:                     64
    Max Clock Freq(MHz):                1725
    Compute Units:                      60
    SIMD per CU:                        4
    Fast F16 Operation:                 TRUE
    Wavefront Size:                     64
    Workgroup Max Size:                 1024
    Workgroup Max Size per Dimension:
      x:                                1024
      y:                                1024
      z:                                1024
    Max Waves Per CU:                   40
    Max Work-item Per CU:               2560
    Grid Max Size:                      4294967295
    Grid Max Size per Dimension:
      x:                                4294967295
      y:                                4294967295
      z:                                4294967295
    Max fbarriers/Workgrp:              32
    Memory Pools:
      Pool GLOBAL; FLAGS: COARSE GRAINED, :
        Size:                            34342961152 bytes
        Allocatable:                     TRUE
        Runtime Alloc Granule:           4096 bytes
        Runtime Alloc alignment:         4096 bytes
        Accessable by all:               FALSE
      Pool GLOBAL; FLAGS: FINE GRAINED, :
        Size:                            34342961152 bytes
        Allocatable:                     TRUE
        Runtime Alloc Granule:           4096 bytes
        Runtime Alloc alignment:         4096 bytes
        Accessable by all:               FALSE
      Pool GROUP:
        Size:                            65536 bytes
        Allocatable:                     FALSE
        Runtime Alloc Granule:           0 bytes
        Runtime Alloc alignment:         0 bytes
        Accessable by all:               FALSE
```

Differential Revision: https://reviews.llvm.org/D126836
---
 .../plugins/amdgpu/dynamic_hsa/hsa.cpp             |   3 +
 .../libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h  |  35 +++
 .../plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h       |  17 ++
 openmp/libomptarget/plugins/amdgpu/src/rtl.cpp     | 271 +++++++++++++++++++++
 .../plugins/generic-elf-64bit/src/rtl.cpp          |   4 +
 5 files changed, 330 insertions(+)

diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
index ea1498c..e748056 100644
--- a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
@@ -23,8 +23,11 @@ DLWRAP_INTERNAL(hsa_init, 0);
 
 DLWRAP(hsa_status_string, 2);
 DLWRAP(hsa_shut_down, 0);
+DLWRAP(hsa_system_get_info, 2);
 DLWRAP(hsa_agent_get_info, 3);
+DLWRAP(hsa_isa_get_info_alt, 3);
 DLWRAP(hsa_iterate_agents, 2);
+DLWRAP(hsa_agent_iterate_isas, 3);
 DLWRAP(hsa_signal_create, 4);
 DLWRAP(hsa_signal_destroy, 1);
 DLWRAP(hsa_signal_store_relaxed, 2);
diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h
index 54359c2..ebdd642 100644
--- a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h
+++ b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h
@@ -56,22 +56,57 @@ typedef enum {
 } hsa_device_type_t;
 
 typedef enum {
+  HSA_ISA_INFO_NAME = 1,
+} hsa_isa_info_t;
+
+typedef enum {
   HSA_AGENT_INFO_NAME = 0,
+  HSA_AGENT_INFO_VENDOR_NAME = 1,
   HSA_AGENT_INFO_PROFILE = 4,
   HSA_AGENT_INFO_WAVEFRONT_SIZE = 6,
   HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7,
+  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8,
   HSA_AGENT_INFO_GRID_MAX_DIM = 9,
+  HSA_AGENT_INFO_GRID_MAX_SIZE = 10,
+  HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11,
+  HSA_AGENT_INFO_QUEUES_MAX = 12,
+  HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13,
   HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14,
   HSA_AGENT_INFO_DEVICE = 17,
+  HSA_AGENT_INFO_CACHE_SIZE = 18,
+  HSA_AGENT_INFO_FAST_F16_OPERATION = 24,
 } hsa_agent_info_t;
 
+typedef enum {
+  HSA_SYSTEM_INFO_VERSION_MAJOR = 0,
+  HSA_SYSTEM_INFO_VERSION_MINOR = 1,
+} hsa_system_info_t;
+
+typedef struct hsa_region_s {
+  uint64_t handle;
+} hsa_region_t;
+
+typedef struct hsa_isa_s {
+  uint64_t handle;
+} hsa_isa_t;
+
+hsa_status_t hsa_system_get_info(hsa_system_info_t attribute, void *value);
+
 hsa_status_t hsa_agent_get_info(hsa_agent_t agent, hsa_agent_info_t attribute,
                                 void *value);
 
+hsa_status_t hsa_isa_get_info_alt(hsa_isa_t isa, hsa_isa_info_t attribute,
+                                  void *value);
+
 hsa_status_t hsa_iterate_agents(hsa_status_t (*callback)(hsa_agent_t agent,
                                                          void *data),
                                 void *data);
 
+hsa_status_t hsa_agent_iterate_isas(hsa_agent_t agent,
+                                    hsa_status_t (*callback)(hsa_isa_t isa,
+                                                             void *data),
+                                    void *data);
+
 typedef struct hsa_signal_s {
   uint64_t handle;
 } hsa_signal_t;
diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
index a8662ee..d27be93 100644
--- a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -29,9 +29,20 @@ typedef enum hsa_amd_memory_pool_global_flag_s {
 } hsa_amd_memory_pool_global_flag_t;
 
 typedef enum {
+  HSA_AMD_SEGMENT_GLOBAL = 0,
+  HSA_AMD_SEGMENT_READONLY = 1,
+  HSA_AMD_SEGMENT_PRIVATE = 2,
+  HSA_AMD_SEGMENT_GROUP = 3,
+} hsa_amd_segment_t;
+
+typedef enum {
+  HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0,
   HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1,
   HSA_AMD_MEMORY_POOL_INFO_SIZE = 2,
   HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5,
+  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
+  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
+  HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
 } hsa_amd_memory_pool_info_t;
 
 typedef enum {
@@ -43,7 +54,13 @@ typedef enum {
 } hsa_amd_memory_pool_access_t;
 
 typedef enum hsa_amd_agent_info_s {
+  HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
   HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
+  HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
+  HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009,
+  HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A,
+  HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B,
+  HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010
 } hsa_amd_agent_info_t;
 
 hsa_status_t hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 4b5dd0d..a7d2498 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -282,6 +282,16 @@ static void callbackQueue(hsa_status_t status, hsa_queue_t *source,
 
 namespace core {
 namespace {
+
+// REPORTs ErrMsg and the HSA status string unless Err is HSA_STATUS_SUCCESS.
+bool checkResult(hsa_status_t Err, const char *ErrMsg) {
+  if (Err != HSA_STATUS_SUCCESS) {
+    REPORT("%s", ErrMsg);
+    REPORT("%s", get_error_string(Err));
+  }
+  return Err == HSA_STATUS_SUCCESS;
+}
+
 void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest) {
   __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
 }
@@ -542,6 +552,259 @@ public:
     return freesignalpool_memcpy(dest, src, size, impl_memcpy_h2d, deviceId);
   }
 
+  /// Print the HSA runtime version and the properties of \p agent to stdout,
+  /// labelled as OpenMP device \p device_id. Failed queries are only REPORTed.
+  static void printDeviceInfo(int32_t device_id, hsa_agent_t agent) {
+    // Initialized so a failed query never makes printf read indeterminate data.
+    char TmpChar[1000] = {0};
+    uint16_t major = 0, minor = 0;
+    uint32_t TmpUInt = 0, TmpUInt2 = 0;
+    uint32_t CacheSize[4] = {0};
+    bool TmpBool = false;
+    uint16_t workgroupMaxDim[3] = {0};
+    hsa_dim3_t gridMaxDim = {};
+
+    // Getting basic information about HSA and Device
+    core::checkResult(
+        hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &major),
+        "Error from hsa_system_get_info when obtaining "
+        "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
+    core::checkResult(
+        hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &minor),
+        "Error from hsa_system_get_info when obtaining "
+        "HSA_SYSTEM_INFO_VERSION_MINOR\n");
+    printf("    HSA Runtime Version: \t\t%u.%u \n", major, minor);
+    printf("    HSA OpenMP Device Number: \t\t%d \n", device_id);
+    core::checkResult(
+        hsa_agent_get_info(
+            agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
+    printf("    Product Name: \t\t\t%s \n", TmpChar);
+    core::checkResult(hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, TmpChar),
+                      "Error returned from hsa_agent_get_info when obtaining "
+                      "HSA_AGENT_INFO_NAME\n");
+    printf("    Device Name: \t\t\t%s \n", TmpChar);
+    core::checkResult(
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AGENT_INFO_VENDOR_NAME\n");
+    printf("    Vendor Name: \t\t\t%s \n", TmpChar);
+    hsa_device_type_t devType = HSA_DEVICE_TYPE_CPU;
+    const char *devTypeStr = "UNKNOWN"; // kept if the query fails
+    if (core::checkResult(
+            hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &devType),
+            "Error returned from hsa_agent_get_info when obtaining "
+            "HSA_AGENT_INFO_DEVICE\n"))
+      devTypeStr = devType == HSA_DEVICE_TYPE_CPU   ? "CPU"
+                   : devType == HSA_DEVICE_TYPE_GPU ? "GPU"
+                   : devType == HSA_DEVICE_TYPE_DSP ? "DSP"
+                                                    : "UNKNOWN";
+    printf("    Device Type: \t\t\t%s \n", devTypeStr);
+    core::checkResult(
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AGENT_INFO_QUEUES_MAX\n");
+    printf("    Max Queues: \t\t\t%u \n", TmpUInt);
+    core::checkResult(
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
+    printf("    Queue Min Size: \t\t\t%u \n", TmpUInt);
+    core::checkResult(
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
+    printf("    Queue Max Size: \t\t\t%u \n", TmpUInt);
+
+    // Getting cache information
+    printf("    Cache:\n");
+
+    // FIXME: This is deprecated according to HSA documentation. But using
+    // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
+    // runtime.
+    core::checkResult(
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AGENT_INFO_CACHE_SIZE\n");
+
+    for (uint32_t i = 0; i < 4; i++)
+      if (CacheSize[i])
+        printf("      L%u: \t\t\t\t%u bytes\n", i, CacheSize[i]);
+
+    core::checkResult(
+        hsa_agent_get_info(agent,
+                           (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
+                           &TmpUInt),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
+    printf("    Cacheline Size: \t\t\t%u \n", TmpUInt);
+    core::checkResult(
+        hsa_agent_get_info(
+            agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
+            &TmpUInt),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
+    printf("    Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
+    core::checkResult(
+        hsa_agent_get_info(
+            agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
+            &TmpUInt),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
+    printf("    Compute Units: \t\t\t%u \n", TmpUInt);
+    core::checkResult(hsa_agent_get_info(
+                          agent,
+                          (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
+                          &TmpUInt),
+                      "Error returned from hsa_agent_get_info when obtaining "
+                      "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
+    printf("    SIMD per CU: \t\t\t%u \n", TmpUInt);
+    core::checkResult(
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AGENT_INFO_FAST_F16_OPERATION\n");
+    printf("    Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
+    core::checkResult(
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
+    printf("    Wavefront Size: \t\t\t%u \n", TmpUInt2);
+    core::checkResult(
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
+    printf("    Workgroup Max Size: \t\t%u \n", TmpUInt);
+    core::checkResult(hsa_agent_get_info(agent,
+                                         HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
+                                         workgroupMaxDim),
+                      "Error returned from hsa_agent_get_info when obtaining "
+                      "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
+    printf("    Workgroup Max Size per Dimension:\n");
+    printf("      x: \t\t\t\t%u\n", workgroupMaxDim[0]);
+    printf("      y: \t\t\t\t%u\n", workgroupMaxDim[1]);
+    printf("      z: \t\t\t\t%u\n", workgroupMaxDim[2]);
+    core::checkResult(hsa_agent_get_info(
+                          agent,
+                          (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
+                          &TmpUInt),
+                      "Error returned from hsa_agent_get_info when obtaining "
+                      "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
+    printf("    Max Waves Per CU: \t\t\t%u \n", TmpUInt);
+    printf("    Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
+    core::checkResult(
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
+    printf("    Grid Max Size: \t\t\t%u \n", TmpUInt);
+    core::checkResult(
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &gridMaxDim),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AGENT_INFO_GRID_MAX_DIM\n");
+    printf("    Grid Max Size per Dimension: \t\t\n");
+    printf("      x: \t\t\t\t%u\n", gridMaxDim.x);
+    printf("      y: \t\t\t\t%u\n", gridMaxDim.y);
+    printf("      z: \t\t\t\t%u\n", gridMaxDim.z);
+    core::checkResult(
+        hsa_agent_get_info(agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
+        "Error returned from hsa_agent_get_info when obtaining "
+        "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
+    printf("    Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);
+
+    printf("    Memory Pools:\n");
+    auto CB_mem = [](hsa_amd_memory_pool_t region, void *data) -> hsa_status_t {
+      std::string TmpStr;
+      size_t size = 0;
+      bool alloc = false, access = false;
+      hsa_amd_segment_t segment = HSA_AMD_SEGMENT_GLOBAL;
+      hsa_amd_memory_pool_global_flag_t globalFlags = {};
+      core::checkResult(
+          hsa_amd_memory_pool_get_info(
+              region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &globalFlags),
+          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
+          "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
+      core::checkResult(hsa_amd_memory_pool_get_info(
+                            region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment),
+                        "Error returned from hsa_amd_memory_pool_get_info when "
+                        "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");
+
+      switch (segment) {
+      case HSA_AMD_SEGMENT_GLOBAL:
+        TmpStr = "GLOBAL; FLAGS: ";
+        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & globalFlags)
+          TmpStr += "KERNARG, ";
+        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & globalFlags)
+          TmpStr += "FINE GRAINED, ";
+        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & globalFlags)
+          TmpStr += "COARSE GRAINED, ";
+        break;
+      case HSA_AMD_SEGMENT_READONLY:
+        TmpStr = "READONLY";
+        break;
+      case HSA_AMD_SEGMENT_PRIVATE:
+        TmpStr = "PRIVATE";
+        break;
+      case HSA_AMD_SEGMENT_GROUP:
+        TmpStr = "GROUP";
+        break;
+      default:
+        TmpStr = "unknown";
+        break;
+      }
+      printf("      Pool %s: \n", TmpStr.c_str());
+
+      core::checkResult(hsa_amd_memory_pool_get_info(
+                            region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &size),
+                        "Error returned from hsa_amd_memory_pool_get_info when "
+                        "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
+      printf("        Size: \t\t\t\t %zu bytes\n", size);
+      core::checkResult(
+          hsa_amd_memory_pool_get_info(
+              region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &alloc),
+          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
+          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
+      printf("        Allocatable: \t\t\t %s\n", (alloc ? "TRUE" : "FALSE"));
+      core::checkResult(
+          hsa_amd_memory_pool_get_info(
+              region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &size),
+          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
+          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
+      printf("        Runtime Alloc Granule: \t\t %zu bytes\n", size);
+      core::checkResult(
+          hsa_amd_memory_pool_get_info(
+              region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &size),
+          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
+          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
+      printf("        Runtime Alloc alignment: \t %zu bytes\n", size);
+      core::checkResult(
+          hsa_amd_memory_pool_get_info(
+              region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &access),
+          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
+          "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
+      printf("        Accessable by all: \t\t %s\n",
+             (access ? "TRUE" : "FALSE"));
+
+      return HSA_STATUS_SUCCESS;
+    };
+    // Iterate over all the memory regions for this agent. Get the memory region
+    // type and size
+    hsa_amd_agent_iterate_memory_pools(agent, CB_mem, nullptr);
+
+    printf("    ISAs:\n");
+    auto CB_isas = [](hsa_isa_t isa, void *data) -> hsa_status_t {
+      char TmpChar[1000] = {0};
+      core::checkResult(hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TmpChar),
+                        "Error returned from hsa_isa_get_info_alt when "
+                        "obtaining HSA_ISA_INFO_NAME\n");
+      printf("        Name: \t\t\t\t %s\n", TmpChar);
+
+      return HSA_STATUS_SUCCESS;
+    };
+    // Iterate over all the ISAs supported by this agent and print the name of
+    // each one
+    hsa_agent_iterate_isas(agent, CB_isas, nullptr);
+  }
+
   // Record entry point associated with device
   void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
     assert(device_id < (int32_t)FuncGblEntries.size() &&
@@ -2338,4 +2601,12 @@ int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *AsyncInfo) {
   }
   return OFFLOAD_SUCCESS;
 }
+
+// Print the HSA properties of the agent stored at HSAAgents[device_id].
+void __tgt_rtl_print_device_info(int32_t device_id) {
+  // TODO: Assertion to see if device_id is correct; it is used unchecked.
+  // NOTE: We don't need to set context for print device info.
+  DeviceInfo.printDeviceInfo(device_id, DeviceInfo.HSAAgents[device_id]);
+}
+
 } // extern "C"
diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
index c512e61..babe189 100644
--- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
@@ -250,6 +250,10 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
   return DeviceInfo.getOffloadEntriesTable(device_id);
 }
 
+void __tgt_rtl_print_device_info(int32_t device_id) { // device_id unused
+  printf("    This is a generic-elf-64bit device\n");
+}
+
 // Sample implementation of explicit memory allocator. For this plugin all kinds
 // are equivalent to each other.
 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr,
-- 
2.7.4