// OpenMP Environment properties
int EnvNumTeams;
int EnvTeamLimit;
+ int EnvTeamThreadLimit; // parsed OMP_TEAMS_THREAD_LIMIT; -1 when unset, only applied when > 0
int EnvMaxTeamsDefault;
// OpenMP Requires Flags
} else {
EnvMaxTeamsDefault = -1;
}
+ envStr = getenv("OMP_TEAMS_THREAD_LIMIT"); // null when the variable is not set
+ if (envStr) {
+ EnvTeamThreadLimit = std::stoi(envStr); // NOTE(review): std::stoi throws on non-numeric or out-of-range text; no handler is visible here - confirm
+ DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
+ } else {
+ EnvTeamThreadLimit = -1; // "unset" sentinel: the cap is only enforced when the value is > 0
+ }
// Default state.
RequiresFlags = OMP_REQ_UNDEFINED;
DeviceInfo.GroupsPerDevice[device_id]);
}
+ // Adjust threads to the env variables
+ if (DeviceInfo.EnvTeamThreadLimit > 0 &&
+ (enforce_upper_bound(&DeviceInfo.NumThreads[device_id],
+ DeviceInfo.EnvTeamThreadLimit))) {
+ DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
+ DeviceInfo.EnvTeamThreadLimit);
+ }
+
// Set default number of threads
DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size;
DP("Default number of threads set according to library's default %d\n",
// OpenMP environment properties
int EnvNumTeams;
int EnvTeamLimit;
+ int EnvTeamThreadLimit; // parsed OMP_TEAMS_THREAD_LIMIT; stays at the constructor's -1 default when unset
// OpenMP requires flags
int64_t RequiresFlags;
DeviceRTLTy()
: NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
- RequiresFlags(OMP_REQ_UNDEFINED) {
+ EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {
DP("Start initializing CUDA\n");
EnvTeamLimit = std::stoi(EnvStr);
DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
}
+ if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
+ // OMP_TEAMS_THREAD_LIMIT has been set; when it is not, the -1 from the
+ // constructor's initializer list is left in place.
+ EnvTeamThreadLimit = std::stoi(EnvStr); // NOTE(review): std::stoi throws on malformed or out-of-range text; no handler is visible here - confirm
+ DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
+ }
if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
// OMP_NUM_TEAMS has been set
EnvNumTeams = std::stoi(EnvStr);
DP("Error getting max block dimension, use default value %d\n",
DeviceRTLTy::DefaultNumThreads);
DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
- } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
+ } else {
DP("Using %d CUDA threads per block\n", MaxBlockDimX);
DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
- } else {
- DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
- "capping at the hard limit\n",
- MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
- DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
+
+ if (EnvTeamThreadLimit > 0 && // non-positive values (e.g. the -1 default) mean no env limit
+ DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
+ DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
+ "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
+ DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
+ DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
+ }
+ if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { // checked after the env cap, so the hard limit bounds the final value
+ DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
+ "capping at the hard limit\n",
+ DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
+ DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
+ }
}
// Get and set warp size