From f05c95f10fc1d8171071735af8ad3a9e87633120 Mon Sep 17 00:00:00 2001
From: peter klausler <pklausler@nvidia.com>
Date: Wed, 6 Oct 2021 14:03:48 -0700
Subject: [PATCH] [flang] More work on SYSTEM_CLOCK runtime API and
 implementation

To get proper wrap-around behavior for the various kind parameter
values of the optional COUNT= and COUNT_MAX= dummy arguments to
the intrinsic subroutine SYSTEM_CLOCK, add an extra argument to
the APIs for lowering to pass the integer kind of the actual argument.
Avoid confusion by requiring that both actual arguments have the same
kind when both are present.  The results of the runtime functions
remain std::int64_t and lowering should still convert them before
storing to the actual argument variables.

Rework the implementation a bit to accommodate the dynamic
specification of the kind parameter, and to clean up some coding
issues with preprocessing and templates.

Use the kind of the COUNT=/COUNT_MAX= actual arguments to determine
the clock's resolution, where possible, in conformance with other
Fortran implementations.

Differential Revision: https://reviews.llvm.org/D111281
---
 flang/docs/Extensions.md                     |   7 ++
 flang/include/flang/Runtime/time-intrinsic.h |  14 ++-
 flang/lib/Evaluate/intrinsics.cpp            |   4 +-
 flang/runtime/time-intrinsic.cpp             | 157 ++++++++++++++-------------
 flang/unittests/Runtime/Time.cpp             |  45 ++++++--
 5 files changed, 133 insertions(+), 94 deletions(-)
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index f35954b..ac293ce 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -63,6 +63,13 @@ end
   not the bounds of the implied DO loop.  It is not advisable to use
   an object of the same name as the index variable in a bounds
   expression, but it will work, instead of being needlessly undefined.
+* If both the `COUNT=` and the `COUNT_MAX=` optional arguments are
+  present on the same call to the intrinsic subroutine `SYSTEM_CLOCK`,
+  we require that their types have the same integer kind, since the
+  kind of these arguments is used to select the clock rate.
+  In common with some other compilers, the clock is in milliseconds
+  for kinds <= 4 and nanoseconds otherwise, where the target system
+  supports these rates.
 
 ## Extensions, deletions, and legacy features supported by default
 
diff --git a/flang/include/flang/Runtime/time-intrinsic.h b/flang/include/flang/Runtime/time-intrinsic.h
index c48e961..4600099 100644
--- a/flang/include/flang/Runtime/time-intrinsic.h
+++ b/flang/include/flang/Runtime/time-intrinsic.h
@@ -12,8 +12,8 @@
 #ifndef FORTRAN_RUNTIME_TIME_INTRINSIC_H_
 #define FORTRAN_RUNTIME_TIME_INTRINSIC_H_
 
-#include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/entry-names.h"
+#include <cinttypes>
 
 namespace Fortran::runtime {
 
@@ -27,10 +27,14 @@ double RTNAME(CpuTime)();
 
 // Interface for the SYSTEM_CLOCK intrinsic. We break it up into 3 distinct
 // function calls, one for each of SYSTEM_CLOCK's optional output arguments.
-// Lowering will have to cast the results to whatever type it prefers.
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(SystemClockCount)();
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(SystemClockCountRate)();
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(SystemClockCountMax)();
+// Lowering converts the results to the types of the actual arguments,
+// including the case of a real argument for COUNT_RATE=.
+// The kind argument to SystemClockCount and SystemClockCountMax is the
+// kind of the integer actual arguments, which are required to be the same
+// when both appear.
+std::int64_t RTNAME(SystemClockCount)(int kind = 8);
+std::int64_t RTNAME(SystemClockCountRate)(int kind = 8);
+std::int64_t RTNAME(SystemClockCountMax)(int kind = 8);
 
 // Interface for DATE_AND_TIME intrinsic.
 void RTNAME(DateAndTime)(char *date, std::size_t dateChars, char *time,
diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp
index 2324b78..519f0e6 100644
--- a/flang/lib/Evaluate/intrinsics.cpp
+++ b/flang/lib/Evaluate/intrinsics.cpp
@@ -1127,11 +1127,11 @@ static const IntrinsicInterface intrinsicSubroutine[]{
                 common::Intent::Out}},
         {}, Rank::elemental, IntrinsicClass::impureSubroutine},
     {"system_clock",
-        {{"count", AnyInt, Rank::scalar, Optionality::optional,
+        {{"count", SameInt, Rank::scalar, Optionality::optional,
              common::Intent::Out},
             {"count_rate", AnyIntOrReal, Rank::scalar, Optionality::optional,
                 common::Intent::Out},
-            {"count_max", AnyInt, Rank::scalar, Optionality::optional,
+            {"count_max", SameInt, Rank::scalar, Optionality::optional,
                 common::Intent::Out}},
         {}, Rank::elemental, IntrinsicClass::impureSubroutine},
 };
diff --git a/flang/runtime/time-intrinsic.cpp b/flang/runtime/time-intrinsic.cpp
index 7a70f52..f7ef5be 100644
--- a/flang/runtime/time-intrinsic.cpp
+++ b/flang/runtime/time-intrinsic.cpp
@@ -11,6 +11,7 @@
 #include "flang/Runtime/time-intrinsic.h"
 #include "terminator.h"
 #include "tools.h"
+#include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <cstdint>
@@ -51,18 +52,10 @@ template <typename Unused = void> double GetCpuTime(fallback_implementation) {
   if (timestamp != static_cast<std::clock_t>(-1)) {
     return static_cast<double>(timestamp) / CLOCKS_PER_SEC;
   }
-
   // Return some negative value to represent failure.
   return -1.0;
 }
 
-// POSIX implementation using clock_gettime. This is only enabled if
-// clock_gettime is available.
-template <typename T = int, typename U = struct timespec>
-double GetCpuTime(preferred_implementation,
-    // We need some dummy parameters to pass to decltype(clock_gettime).
-    T ClockId = 0, U *Timespec = nullptr,
-    decltype(clock_gettime(ClockId, Timespec)) *Enabled = nullptr) {
 #if defined CLOCK_THREAD_CPUTIME_ID
 #define CLOCKID CLOCK_THREAD_CPUTIME_ID
 #elif defined CLOCK_PROCESS_CPUTIME_ID
@@ -72,106 +65,119 @@ double GetCpuTime(preferred_implementation,
 #else
 #define CLOCKID CLOCK_REALTIME
 #endif
+
+// POSIX implementation using clock_gettime. This is only enabled where
+// clock_gettime is available.
+template <typename T = int, typename U = struct timespec>
+double GetCpuTime(preferred_implementation,
+    // We need some dummy parameters to pass to decltype(clock_gettime).
+    T ClockId = 0, U *Timespec = nullptr,
+    decltype(clock_gettime(ClockId, Timespec)) *Enabled = nullptr) {
   struct timespec tspec;
   if (clock_gettime(CLOCKID, &tspec) == 0) {
     return tspec.tv_nsec * 1.0e-9 + tspec.tv_sec;
   }
-
   // Return some negative value to represent failure.
   return -1.0;
 }
 
-using count_t =
-    Fortran::runtime::CppTypeFor<Fortran::common::TypeCategory::Integer, 8>;
+using count_t = std::int64_t;
+using unsigned_count_t = std::uint64_t;
+
+// Computes HUGE(INT(0,kind)) as an unsigned integer value.
+static constexpr inline unsigned_count_t GetHUGE(int kind) {
+  if (kind > 8) {
+    kind = 8;
+  }
+  return (unsigned_count_t{1} << ((8 * kind) - 1)) - 1;
+}
 
 // This is the fallback implementation, which should work everywhere. Note that
 // in general we can't recover after std::clock has reached its maximum value.
 template <typename Unused = void>
-count_t GetSystemClockCount(fallback_implementation) {
+count_t GetSystemClockCount(int kind, fallback_implementation) {
   std::clock_t timestamp{std::clock()};
   if (timestamp == static_cast<std::clock_t>(-1)) {
-    // Return -HUGE() to represent failure.
-    return -std::numeric_limits<count_t>::max();
+    // Return -HUGE(COUNT) to represent failure.
+    return -static_cast<count_t>(GetHUGE(kind));
   }
-
-  // If our return type is large enough to hold any value returned by
-  // std::clock, our work is done. Otherwise, we have to wrap around.
-  static constexpr auto max{std::numeric_limits<count_t>::max()};
-  if constexpr (std::numeric_limits<std::clock_t>::max() <= max) {
-    return static_cast<count_t>(timestamp);
-  } else {
-    // Since std::clock_t could be a floating point type, we can't just use the
-    // % operator, so we have to wrap around manually.
-    return static_cast<count_t>(timestamp - max * std::floor(timestamp / max));
+  // Convert the timestamp to std::uint64_t with wrap-around. The timestamp is
+  // most likely a floating-point value (since C'11), so compute the modulus
+  // carefully when one is required.
+  constexpr auto maxUnsignedCount{std::numeric_limits<unsigned_count_t>::max()};
+  if constexpr (std::numeric_limits<std::clock_t>::max() > maxUnsignedCount) {
+    timestamp -= maxUnsignedCount * std::floor(timestamp / maxUnsignedCount);
   }
+  unsigned_count_t unsignedCount{static_cast<unsigned_count_t>(timestamp)};
+  // Return the modulus of the unsigned integral count with HUGE(COUNT)+1.
+  // The result is a signed integer but never negative.
+  return static_cast<count_t>(unsignedCount % (GetHUGE(kind) + 1));
 }
 
 template <typename Unused = void>
-count_t GetSystemClockCountRate(fallback_implementation) {
+count_t GetSystemClockCountRate(int kind, fallback_implementation) {
   return CLOCKS_PER_SEC;
 }
 
 template <typename Unused = void>
-count_t GetSystemClockCountMax(fallback_implementation) {
-  static constexpr auto max_clock_t = std::numeric_limits<std::clock_t>::max();
-  static constexpr auto max_count_t = std::numeric_limits<count_t>::max();
-  if constexpr (max_clock_t < max_count_t) {
-    return static_cast<count_t>(max_clock_t);
-  } else {
-    return max_count_t;
-  }
+count_t GetSystemClockCountMax(int kind, fallback_implementation) {
+  constexpr auto max_clock_t{std::numeric_limits<std::clock_t>::max()};
+  unsigned_count_t maxCount{GetHUGE(kind)};
+  return max_clock_t <= maxCount ? static_cast<count_t>(max_clock_t)
+                                 : static_cast<count_t>(maxCount);
 }
 
-constexpr count_t NSECS_PER_SEC{1'000'000'000};
+// POSIX implementation using clock_gettime. This is only enabled where
+// clock_gettime is available.  Use a millisecond COUNT_RATE for kinds
+// of COUNT/COUNT_MAX less than 64 bits, and nanoseconds otherwise.
+constexpr unsigned_count_t MILLIS_PER_SEC{1'000u};
+constexpr unsigned_count_t NSECS_PER_SEC{1'000'000'000u};
+constexpr unsigned_count_t maxSecs{
+    std::numeric_limits<unsigned_count_t>::max() / NSECS_PER_SEC};
+
+// Use a millisecond clock rate for smaller COUNT= kinds.
+static inline unsigned_count_t ScaleResult(unsigned_count_t nsecs, int kind) {
+  return kind >= 8 ? nsecs : nsecs / (NSECS_PER_SEC / MILLIS_PER_SEC);
+}
 
-// POSIX implementation using clock_gettime. This is only enabled if
-// clock_gettime is available.
 template <typename T = int, typename U = struct timespec>
-count_t GetSystemClockCount(preferred_implementation,
+count_t GetSystemClockCount(int kind, preferred_implementation,
     // We need some dummy parameters to pass to decltype(clock_gettime).
     T ClockId = 0, U *Timespec = nullptr,
     decltype(clock_gettime(ClockId, Timespec)) *Enabled = nullptr) {
-#if defined CLOCK_THREAD_CPUTIME_ID
-#define CLOCKID CLOCK_THREAD_CPUTIME_ID
-#elif defined CLOCK_PROCESS_CPUTIME_ID
-#define CLOCKID CLOCK_PROCESS_CPUTIME_ID
-#elif defined CLOCK_MONOTONIC
-#define CLOCKID CLOCK_MONOTONIC
-#else
-#define CLOCKID CLOCK_REALTIME
-#endif
   struct timespec tspec;
   if (clock_gettime(CLOCKID, &tspec) != 0) {
     // Return -HUGE() to represent failure.
-    return -std::numeric_limits<count_t>::max();
+    return -GetHUGE(kind);
   }
-
   // Wrap around to avoid overflows.
-  constexpr count_t max_secs{
-      std::numeric_limits<count_t>::max() / NSECS_PER_SEC};
-  count_t wrapped_secs{tspec.tv_sec % max_secs};
-
-  // At this point, wrapped_secs < max_secs, and max_secs has already been
-  // truncated by the division. Therefore, we should still have enough room to
-  // add tv_nsec, since it is < NSECS_PER_SEC.
-  return tspec.tv_nsec + wrapped_secs * NSECS_PER_SEC;
+  unsigned_count_t wrappedSecs{
+      static_cast<unsigned_count_t>(tspec.tv_sec) % maxSecs};
+  unsigned_count_t unsignedNsecs{static_cast<unsigned_count_t>(tspec.tv_nsec) +
+      wrappedSecs * NSECS_PER_SEC};
+  unsigned_count_t unsignedCount{ScaleResult(unsignedNsecs, kind)};
+  // Return the modulus of the unsigned integral count with HUGE(COUNT)+1.
+  // The result is a signed integer but never negative.
+  return static_cast<count_t>(unsignedCount % (GetHUGE(kind) + 1));
 }
 
 template <typename T = int, typename U = struct timespec>
-count_t GetSystemClockCountRate(preferred_implementation,
+count_t GetSystemClockCountRate(int kind, preferred_implementation,
     // We need some dummy parameters to pass to decltype(clock_gettime).
     T ClockId = 0, U *Timespec = nullptr,
     decltype(clock_gettime(ClockId, Timespec)) *Enabled = nullptr) {
-  return NSECS_PER_SEC;
+  return kind >= 8 ? static_cast<count_t>(NSECS_PER_SEC) : MILLIS_PER_SEC;
 }
 
 template <typename T = int, typename U = struct timespec>
-count_t GetSystemClockCountMax(preferred_implementation,
+count_t GetSystemClockCountMax(int kind, preferred_implementation,
     // We need some dummy parameters to pass to decltype(clock_gettime).
     T ClockId = 0, U *Timespec = nullptr,
     decltype(clock_gettime(ClockId, Timespec)) *Enabled = nullptr) {
-  count_t max_secs{std::numeric_limits<count_t>::max() / NSECS_PER_SEC};
-  return max_secs * NSECS_PER_SEC - 1;
+  unsigned_count_t maxClockNsec{maxSecs * NSECS_PER_SEC + NSECS_PER_SEC - 1};
+  unsigned_count_t maxClock{ScaleResult(maxClockNsec, kind)};
+  unsigned_count_t maxCount{GetHUGE(kind)};
+  return static_cast<count_t>(maxClock <= maxCount ? maxClock : maxCount);
 }
 
 // DATE_AND_TIME (Fortran 2018 16.9.59)
@@ -198,7 +204,7 @@ template <int KIND> struct StoreNegativeHugeAt {
 
 // Default implementation when date and time information is not available (set
 // strings to blanks and values to -HUGE as defined by the standard).
-void DateAndTimeUnavailable(Fortran::runtime::Terminator &terminator,
+static void DateAndTimeUnavailable(Fortran::runtime::Terminator &terminator,
     char *date, std::size_t dateChars, char *time, std::size_t timeChars,
     char *zone, std::size_t zoneChars,
     const Fortran::runtime::Descriptor *values) {
@@ -259,9 +265,9 @@ template <typename TM = struct tm> struct GmtOffsetHelper {
   };
 };
 
-// Dispatch to posix implemetation when gettimeofday and localtime_r are
+// Dispatch to posix implementation where gettimeofday and localtime_r are
 // available.
-void GetDateAndTime(Fortran::runtime::Terminator &terminator, char *date,
+static void GetDateAndTime(Fortran::runtime::Terminator &terminator, char *date,
     std::size_t dateChars, char *time, std::size_t timeChars, char *zone,
     std::size_t zoneChars, const Fortran::runtime::Descriptor *values) {
 
@@ -330,9 +336,9 @@ void GetDateAndTime(Fortran::runtime::Terminator &terminator, char *date,
 }
 
 #else
-// Fallback implementation when gettimeofday or localtime_r is not available
-// (e.g. windows).
-void GetDateAndTime(Fortran::runtime::Terminator &terminator, char *date,
+// Fallback implementation where gettimeofday or localtime_r are not both
+// available (e.g. windows).
+static void GetDateAndTime(Fortran::runtime::Terminator &terminator, char *date,
     std::size_t dateChars, char *time, std::size_t timeChars, char *zone,
     std::size_t zoneChars, const Fortran::runtime::Descriptor *values) {
   // TODO: An actual implementation for non Posix system should be added.
@@ -342,26 +348,23 @@ void GetDateAndTime(Fortran::runtime::Terminator &terminator, char *date,
       terminator, date, dateChars, time, timeChars, zone, zoneChars, values);
 }
 #endif
-} // anonymous namespace
+} // namespace
 
 namespace Fortran::runtime {
 extern "C" {
 
 double RTNAME(CpuTime)() { return GetCpuTime(0); }
 
-CppTypeFor<Fortran::common::TypeCategory::Integer, 8> RTNAME(
-    SystemClockCount)() {
-  return GetSystemClockCount(0);
+std::int64_t RTNAME(SystemClockCount)(int kind) {
+  return GetSystemClockCount(kind, 0);
 }
 
-CppTypeFor<Fortran::common::TypeCategory::Integer, 8> RTNAME(
-    SystemClockCountRate)() {
-  return GetSystemClockCountRate(0);
+std::int64_t RTNAME(SystemClockCountRate)(int kind) {
+  return GetSystemClockCountRate(kind, 0);
 }
 
-CppTypeFor<Fortran::common::TypeCategory::Integer, 8> RTNAME(
-    SystemClockCountMax)() {
-  return GetSystemClockCountMax(0);
+std::int64_t RTNAME(SystemClockCountMax)(int kind) {
+  return GetSystemClockCountMax(kind, 0);
 }
 
 void RTNAME(DateAndTime)(char *date, std::size_t dateChars, char *time,
diff --git a/flang/unittests/Runtime/Time.cpp b/flang/unittests/Runtime/Time.cpp
index 4720e7e..b6898d4 100644
--- a/flang/unittests/Runtime/Time.cpp
+++ b/flang/unittests/Runtime/Time.cpp
@@ -31,7 +31,7 @@ TEST(TimeIntrinsics, CpuTime) {
   }
 }
 
-using count_t = CppTypeFor<TypeCategory::Integer, 8>;
+using count_t = std::int64_t;
 
 TEST(TimeIntrinsics, SystemClock) {
   // We can't really test that we get the "right" result for SYSTEM_CLOCK, but
@@ -43,21 +43,46 @@ TEST(TimeIntrinsics, SystemClock) {
   // SYSTEM_CLOCK.
   EXPECT_GT(RTNAME(SystemClockCountRate)(), 0);
 
-  count_t max{RTNAME(SystemClockCountMax)()};
-  EXPECT_GT(max, 0);
+  count_t max1{RTNAME(SystemClockCountMax)(1)};
+  EXPECT_GT(max1, 0);
+  EXPECT_LE(max1, static_cast<count_t>(0x7f));
+  count_t start1{RTNAME(SystemClockCount)(1)};
+  EXPECT_GE(start1, 0);
+  EXPECT_LE(start1, max1);
 
-  count_t start{RTNAME(SystemClockCount)()};
-  EXPECT_GE(start, 0);
-  EXPECT_LE(start, max);
+  count_t max2{RTNAME(SystemClockCountMax)(2)};
+  EXPECT_GT(max2, 0);
+  EXPECT_LE(max2, static_cast<count_t>(0x7fff));
+  count_t start2{RTNAME(SystemClockCount)(2)};
+  EXPECT_GE(start2, 0);
+  EXPECT_LE(start2, max2);
+
+  count_t max4{RTNAME(SystemClockCountMax)(4)};
+  EXPECT_GT(max4, 0);
+  EXPECT_LE(max4, static_cast<count_t>(0x7fffffff));
+  count_t start4{RTNAME(SystemClockCount)(4)};
+  EXPECT_GE(start4, 0);
+  EXPECT_LE(start4, max4);
+
+  count_t max8{RTNAME(SystemClockCountMax)(8)};
+  EXPECT_GT(max8, 0);
+  count_t start8{RTNAME(SystemClockCount)(8)};
+  EXPECT_GE(start8, 0);
+  EXPECT_LT(start8, max8);
+
+  count_t max16{RTNAME(SystemClockCountMax)(16)};
+  EXPECT_GT(max16, 0);
+  count_t start16{RTNAME(SystemClockCount)(16)};
+  EXPECT_GE(start16, 0);
+  EXPECT_LT(start16, max16);
 
   // Loop until we get a different value from SystemClockCount. If we don't get
   // one before we time out, then we should probably look into an implementation
   // for SystemClokcCount with a better timer resolution on this platform.
-  for (count_t end = start; end == start; end = RTNAME(SystemClockCount)()) {
+  for (count_t end{start8}; end == start8; end = RTNAME(SystemClockCount)(8)) {
     EXPECT_GE(end, 0);
-    EXPECT_LE(end, max);
-
-    EXPECT_GE(end, start);
+    EXPECT_LE(end, max8);
+    EXPECT_GE(end, start8);
   }
 }
 
-- 
2.7.4