From 30c0d5b4c3f89efef8f79c47fd892d06432d87b1 Mon Sep 17 00:00:00 2001 From: Ron Lieberman Date: Thu, 18 Feb 2021 17:10:40 -0500 Subject: [PATCH] [OPENMP][AMDGCN] Improvements to print_kernel_trace (bit mask) allow bit masking to select various trace features. bit 0 => Launch tracing (stderr) bit 1 => timing of runtime (stdout) bit 2 => detailed launch tracing (stderr) bit 3 => timing goes to stdout instead of stderr example: LIBOMPTARGET_KERNEL_TRACE=7 does it all LIBOMPTARGET_KERNEL_TRACE=5 Launch + details LIBOMPTARGET_KERNEL_TRACE=2 timings + launch to stderr LIBOMPTARGET_KERNEL_TRACE=10 timings + launch to stdout Differential Revision: https://reviews.llvm.org/D96998 --- .../plugins/amdgpu/src/print_tracing.h | 21 +++++++++++++++++++ openmp/libomptarget/plugins/amdgpu/src/rtl.cpp | 24 +++++++++++++--------- 2 files changed, 35 insertions(+), 10 deletions(-) create mode 100644 openmp/libomptarget/plugins/amdgpu/src/print_tracing.h diff --git a/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h b/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h new file mode 100644 index 0000000..624b1fa --- /dev/null +++ b/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h @@ -0,0 +1,21 @@ +//===--- print_tracing.h - OpenMP interface definitions -------- AMDGPU -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LIBOMPTARGET_PLUGINS_AMDGPU_SRC_PRINT_TRACING_H_INCLUDED +#define LIBOMPTARGET_PLUGINS_AMDGPU_SRC_PRINT_TRACING_H_INCLUDED + +enum PrintTraceControlBits { + LAUNCH = 1, // Print a message for each kernel launch + RTL_TIMING = 2, // Print timing info around each RTL step + STARTUP_DETAILS = 4, // Print device and launch details (stderr) + RTL_TO_STDOUT = 8 // Redirect RTL tracing to stdout +}; + +extern int print_kernel_trace; // bit mask of the above, set by env variable + +#endif diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index ec79239..382fe5a 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -38,6 +38,7 @@ #include "Debug.h" #include "get_elf_mach_gfx_name.h" #include "omptargetplugin.h" +#include "print_tracing.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" @@ -714,7 +715,7 @@ int32_t __tgt_rtl_init_device(int device_id) { DeviceInfo.GPUName[device_id] = GetInfoName; } - if (print_kernel_trace == 4) + if (print_kernel_trace & STARTUP_DETAILS) fprintf(stderr, "Device#%-2d CU's: %2d %s\n", device_id, DeviceInfo.ComputeUnits[device_id], DeviceInfo.GPUName[device_id].c_str()); @@ -1568,7 +1569,7 @@ void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, if (Max_Teams > DeviceInfo.HardTeamLimit) Max_Teams = DeviceInfo.HardTeamLimit; - if (print_kernel_trace == 4) { + if (print_kernel_trace & STARTUP_DETAILS) { fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams); fprintf(stderr, "Max_Teams: %d\n", Max_Teams); @@ -1601,7 +1602,7 @@ void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", threadsPerGroup); } - if (print_kernel_trace == 4) + if (print_kernel_trace & 
STARTUP_DETAILS) fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); DP("Preparing %d threads\n", threadsPerGroup); @@ -1614,7 +1615,7 @@ void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, num_groups = Max_Teams; DP("Set default num of groups %d\n", num_groups); - if (print_kernel_trace == 4) { + if (print_kernel_trace & STARTUP_DETAILS) { fprintf(stderr, "num_groups: %d\n", num_groups); fprintf(stderr, "num_teams: %d\n", num_teams); } @@ -1634,7 +1635,7 @@ void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, if (num_teams > 0) { num_groups = (num_teams < num_groups) ? num_teams : num_groups; } - if (print_kernel_trace == 4) { + if (print_kernel_trace & STARTUP_DETAILS) { fprintf(stderr, "num_groups: %d\n", num_groups); fprintf(stderr, "DeviceInfo.EnvNumTeams %d\n", DeviceInfo.EnvNumTeams); fprintf(stderr, "DeviceInfo.EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit); @@ -1667,13 +1668,13 @@ void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, } if (num_groups > Max_Teams) { num_groups = Max_Teams; - if (print_kernel_trace == 4) + if (print_kernel_trace & STARTUP_DETAILS) fprintf(stderr, "Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams); } if (num_groups > num_teams && num_teams > 0) { num_groups = num_teams; - if (print_kernel_trace == 4) + if (print_kernel_trace & STARTUP_DETAILS) fprintf(stderr, "Limiting num_groups %d to clause num_teams %d \n", num_groups, num_teams); } @@ -1687,7 +1688,7 @@ void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, num_groups > DeviceInfo.EnvMaxTeamsDefault) num_groups = DeviceInfo.EnvMaxTeamsDefault; } - if (print_kernel_trace == 4) { + if (print_kernel_trace & STARTUP_DETAILS) { fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup); fprintf(stderr, "num_groups: %d\n", num_groups); fprintf(stderr, "loop_tripcount: %ld\n", loop_tripcount); @@ -1767,14 +1768,17 @@ int32_t __tgt_rtl_run_target_team_region_locked( 
loop_tripcount, // From run_region arg KernelInfo->device_id); - if (print_kernel_trace >= 1) + if (print_kernel_trace >= LAUNCH) { // enum modes are SPMD, GENERIC, NONE 0,1,2 - fprintf(stderr, + // Launch trace goes to stderr unless RTL_TO_STDOUT is requested. + bool traceToStdout = (print_kernel_trace & RTL_TO_STDOUT) != 0; + fprintf(traceToStdout ? stdout : stderr, "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " "reqd:(%4dX%4d) n:%s\n", device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, arg_num, num_groups, threadsPerGroup, num_teams, thread_limit, KernelInfo->Name); + } // Run on the device. { -- 2.7.4