Change value type of `TfUtf8CodePointIterator` to `TfUtf8CodePoint`
authorMatt Kuruc <mkuruc@nvidia.com>
Fri, 29 Dec 2023 23:13:40 +0000 (15:13 -0800)
committerMatt Kuruc <mkuruc@nvidia.com>
Wed, 17 Jan 2024 16:45:49 +0000 (08:45 -0800)
pxr/base/tf/testenv/unicodeUtils.cpp
pxr/base/tf/unicodeUtils.h
pxr/usd/sdf/path.cpp
pxr/usd/sdf/tokens.h

index 19a3650572c0f21b710c7da0807194ccd76b0da0..4350e810de044536e54f6fe0d0dfa3dc462a793a 100644 (file)
@@ -75,6 +75,13 @@ TestUtf8CodePoint()
                  TfStringify(TfUtf8InvalidCodePoint));
 
     }
+    {
+        // Test ASCII character helper
+        TF_AXIOM(TfUtf8CodePointFromAscii('a') == TfUtf8CodePoint(97));
+        TF_AXIOM(TfStringify(TfUtf8CodePointFromAscii('a')) == "a");
+        TF_AXIOM(TfUtf8CodePointFromAscii(static_cast<char>(128)) ==
+                 TfUtf8InvalidCodePoint);
+    }
     return true;
 }
 
@@ -94,12 +101,13 @@ TestUtf8CodePointView()
         TfUtf8CodePointView u1{s1};
         auto i1 = std::cbegin(u1);
         TF_AXIOM(i1.GetBase() == s1.begin());
-        TF_AXIOM(*i1 == 8520);
+        TF_AXIOM(*i1 != TfUtf8InvalidCodePoint);
+        TF_AXIOM(*i1 == TfUtf8CodePoint{8520});
         std::advance(i1, 9);
         TF_AXIOM(i1 == std::cend(u1));
 
-        for (const uint32_t codePoint : u1) {
-            TF_AXIOM(codePoint != TfUtf8InvalidCodePoint.AsUInt32());
+        for (const auto codePoint : u1) {
+            TF_AXIOM(codePoint != TfUtf8InvalidCodePoint);
         }
     }
 
@@ -108,12 +116,13 @@ TestUtf8CodePointView()
         TfUtf8CodePointView u2{s2};
         auto i2 = std::cbegin(u2);
         TF_AXIOM(i2.GetBase() == s2.begin());
-        TF_AXIOM(*i2 == 14652);
+        TF_AXIOM(*i2 != TfUtf8InvalidCodePoint);
+        TF_AXIOM(*i2 == TfUtf8CodePoint{14652});
         std::advance(i2, 5);
         TF_AXIOM(i2 == std::cend(u2));
 
-        for (const uint32_t codePoint : u2) {
-            TF_AXIOM(codePoint != TfUtf8InvalidCodePoint.AsUInt32());
+        for (const auto codePoint : u2) {
+            TF_AXIOM(codePoint != TfUtf8InvalidCodePoint);
         }
     }
 
@@ -121,30 +130,25 @@ TestUtf8CodePointView()
         const std::string_view s3{"㤻üaf-∫⁇…🔗"};
         TfUtf8CodePointView u3{s3};
         auto i3a = std::cbegin(u3);
-        auto i3b = std::cbegin(u3);
-
-        // The C++20 ranges version of find_if can be used with sentinels in
-        // C++20
-        for (; i3b != std::cend(u3); ++i3b) {
-            if (*(i3b.GetBase()) == '-') {
-                break;
-            }
-        }
+        auto i3b = TfUtf8CodePointIterator{
+                std::next(std::cbegin(s3), s3.find('-')),
+                std::cend(s3)};
         TF_AXIOM(i3b != std::cend(u3));
 
         // i3a should contain all characters before the "-"
-        TF_AXIOM(*i3a == 14651);
+        TF_AXIOM(*i3a != TfUtf8InvalidCodePoint);
+        TF_AXIOM(*i3a == TfUtf8CodePoint{14651});
         std::advance(i3a, 4);
         TF_AXIOM(i3a == i3b);
         TF_AXIOM(i3a.GetBase() == i3b.GetBase());
 
         // i3b should include the "-" character
-        TF_AXIOM(*i3b == 45);
+        TF_AXIOM((*i3b) == TfUtf8CodePointFromAscii('-'));
         std::advance(i3b, 5);
         TF_AXIOM(i3b == std::cend(u3));
 
-        for (const uint32_t codePoint : u3) {
-            TF_AXIOM(codePoint != TfUtf8InvalidCodePoint.AsUInt32());
+        for (const auto codePoint : u3) {
+            TF_AXIOM(codePoint != TfUtf8InvalidCodePoint);
         }
 
     }
@@ -153,32 +157,31 @@ TestUtf8CodePointView()
         // invalid code points.
         const std::string_view sv{"\x80\x61\x62\x81\x63"};
         const TfUtf8CodePointView uv{sv};
-        TF_AXIOM(std::next(std::begin(uv), 5) == std::end(uv));
+        TF_AXIOM(std::distance(std::begin(uv), uv.EndAsIterator()) == 5);
         TF_AXIOM(std::next(std::begin(uv), 5).GetBase() == std::end(sv));
 
-        std::array<uint32_t, 5> codePoints{0};
-        const std::array<uint32_t, 5> expectedCodePoints{{
-            TfUtf8InvalidCodePoint.AsUInt32(), 0x61, 0x62,
-            TfUtf8InvalidCodePoint.AsUInt32(), 0x63}};
-        std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints));
-        TF_AXIOM(codePoints == expectedCodePoints);
+        const std::array expectedCodePoints{
+            TfUtf8InvalidCodePoint, TfUtf8CodePointFromAscii('a'),
+            TfUtf8CodePointFromAscii('b'), TfUtf8InvalidCodePoint,
+            TfUtf8CodePointFromAscii('c')};
+        TF_AXIOM(std::equal(std::cbegin(uv), uv.EndAsIterator(),
+                            std::cbegin(expectedCodePoints)));
     }
     {
         // Incomplete UTF-8 sequences should not consume valid character
         // sequences
         const std::string_view sv{"\xc0\x61\xe0\x85\x62\xf0\x83\x84\x63\xf1"};
         const TfUtf8CodePointView uv{sv};
-        TF_AXIOM(std::next(std::begin(uv), 7) == std::end(uv));
+        TF_AXIOM(std::distance(std::begin(uv), uv.EndAsIterator()) == 7);
         TF_AXIOM(std::next(std::begin(uv), 7).GetBase() == std::end(sv));
 
-        std::array<uint32_t, 7> codePoints{0};
-        const std::array<uint32_t, 7> expectedCodePoints{{
-            TfUtf8InvalidCodePoint.AsUInt32(), 0x61,
-            TfUtf8InvalidCodePoint.AsUInt32(), 0x62,
-            TfUtf8InvalidCodePoint.AsUInt32(), 0x63,
-            TfUtf8InvalidCodePoint.AsUInt32()}};
-        std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints));
-        TF_AXIOM(codePoints == expectedCodePoints);
+        const std::array expectedCodePoints{
+            TfUtf8InvalidCodePoint, TfUtf8CodePointFromAscii('a'),
+            TfUtf8InvalidCodePoint, TfUtf8CodePointFromAscii('b'),
+            TfUtf8InvalidCodePoint, TfUtf8CodePointFromAscii('c'),
+            TfUtf8InvalidCodePoint};
+        TF_AXIOM(std::equal(std::cbegin(uv), uv.EndAsIterator(),
+                            std::cbegin(expectedCodePoints)));
     }
     return true;
 }
@@ -196,7 +199,7 @@ TestUtf8CodePointReflection()
             const std::string text{TfStringify(codePoint)};
             const auto view = TfUtf8CodePointView{text};
             TF_AXIOM(std::cbegin(view) != std::cend(view));
-            TF_AXIOM(*std::cbegin(view) == codePoint.AsUInt32());
+            TF_AXIOM(*std::cbegin(view) == codePoint);
             TF_AXIOM(++std::cbegin(view) == std::cend(view));
         }
     }
@@ -356,44 +359,32 @@ TestCharacterClasses()
     TfUtf8CodePointView view4 {sv4};
 
     // s1 should start with XID_Start and then have XID_Continue
-    bool first = true;
-    for (const uint32_t codePoint : view1)
-    {
-        bool result = first ? TfIsUtf8CodePointXidStart(codePoint)
-            : TfIsUtf8CodePointXidContinue(codePoint);
-        TF_AXIOM(result);
-
-        first = false;
-    }
+    TF_AXIOM(std::distance(std::cbegin(view1), view1.EndAsIterator()) == 9);
+    TF_AXIOM(TfIsUtf8CodePointXidStart(*std::cbegin(view1)));
+    TF_AXIOM(std::all_of(std::next(std::cbegin(view1)), view1.EndAsIterator(),
+                         [](const auto c) {
+                            return TfIsUtf8CodePointXidContinue(c);
+                         }));
 
     // s2 should start with XID_Start, have three characters that are 
-    // XID_Continue, then one that isn't in either 
-    size_t count = 0;
-    for (const uint32_t codePoint : view2)
-    {
-        if (count == 0)
-        {
-            TF_AXIOM(TfIsUtf8CodePointXidStart(codePoint));
-        }
-        else if (count == 4)
-        {
-            TF_AXIOM(!TfIsUtf8CodePointXidContinue(codePoint));
-        }
-        else
-        {
-            TF_AXIOM(TfIsUtf8CodePointXidContinue(codePoint));
-        }
-
-        count++;
-    }
+    // XID_Continue, then one that isn't in either
+    TF_AXIOM(std::distance(std::cbegin(view2), view2.EndAsIterator()) == 5);
+    auto it = std::cbegin(view2);
+    TF_AXIOM(TfIsUtf8CodePointXidStart(*it++));
+    TF_AXIOM(TfIsUtf8CodePointXidContinue(*it++));
+    TF_AXIOM(TfIsUtf8CodePointXidContinue(*it++));
+    TF_AXIOM(TfIsUtf8CodePointXidContinue(*it++));
+    TF_AXIOM(it != std::cend(view2));
+    TF_AXIOM(!TfIsUtf8CodePointXidContinue(*it++));
+    TF_AXIOM(it == std::cend(view2));
 
     // s3 should have all XID_Start characters in the first set
     // (before the "-") and all invalid characters after
-    for (const uint32_t codePoint : view3)
+    for (const auto codePoint : view3)
     {
         TF_AXIOM(TfIsUtf8CodePointXidStart(codePoint));
     }
-    for (const uint32_t codePoint : view4)
+    for (const auto codePoint : view4)
     {
         TF_AXIOM(!TfIsUtf8CodePointXidContinue(codePoint));
     }
index a1386908999120b5be2bb566e9300181a4e25c5b..80a613f1452c1ceb407bd8c068a05218dcb308ba 100644 (file)
@@ -98,6 +98,14 @@ TF_API std::ostream& operator<<(std::ostream&, const TfUtf8CodePoint);
 constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint{
     TfUtf8CodePoint::ReplacementValue};
 
+/// Constructs a TfUtf8CodePoint from an ASCII charcter (0-127).
+constexpr TfUtf8CodePoint TfUtf8CodePointFromAscii(const char value)
+{
+    return static_cast<unsigned char>(value) < 128 ?
+           TfUtf8CodePoint(static_cast<unsigned char>(value)) :
+           TfUtf8InvalidCodePoint;
+}
+
 /// Defines an iterator over a UTF-8 encoded string that extracts unicode
 /// code point values.
 ///
@@ -108,10 +116,10 @@ constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint{
 class TfUtf8CodePointIterator final {
 public:
     using iterator_category = std::forward_iterator_tag;
-    using value_type = uint32_t;
+    using value_type = TfUtf8CodePoint;
     using difference_type = std::ptrdiff_t;
     using pointer = void;
-    using reference = uint32_t;
+    using reference = TfUtf8CodePoint;
 
     /// Model iteration ending when the underlying iterator's end condition
     /// has been met.
@@ -131,14 +139,14 @@ public:
         }
 
     /// Retrieves the current UTF-8 character in the sequence as its Unicode
-    /// code point value. Returns `TfUtf8InvalidCodePoint.AsUInt32()` when the
+    /// code point value. Returns `TfUtf8InvalidCodePoint` when the
     /// byte sequence pointed to by the iterator cannot be decoded.
     ///
     /// A code point might be invalid because it's incorrectly encoded, exceeds
     /// the maximum allowed value, or is in the disallowed surrogate range.
-    uint32_t operator* () const
+    value_type operator* () const
     {
-        return _GetCodePoint();
+        return TfUtf8CodePoint{_GetCodePoint()};
     }
 
     /// Retrieves the wrapped string iterator.
@@ -306,8 +314,8 @@ private:
 ///
 /// \code{.cpp}
 /// std::string value{"∫dx"};
-/// for (const uint32_t codePoint : TfUtf8CodePointView{value}) {
-///     if (codePoint == TfUtf8InvalidCodePoint.AsUInt32()) {
+/// for (const auto codePoint : TfUtf8CodePointView{value}) {
+///     if (codePoint == TfUtf8InvalidCodePoint) {
 ///         TF_WARN("String cannot be decoded.");
 ///         break;
 ///     }
@@ -322,7 +330,7 @@ private:
 ///
 /// \code{.cpp}
 /// if (std::any_of(std::cbegin(codePointView), codePointView.EndAsIterator(),
-///     [](const auto c) { return c == TfUtf8InvalidCodePoint.AsUInt32(); }))
+///     [](const auto c) { return c == TfUtf8InvalidCodePoint; }))
 /// {
 ///     TF_WARN("String cannot be decoded");
 /// }
@@ -394,6 +402,10 @@ private:
 ///
 TF_API
 bool TfIsUtf8CodePointXidStart(uint32_t codePoint);
+inline bool TfIsUtf8CodePointXidStart(const TfUtf8CodePoint codePoint)
+{
+    return TfIsUtf8CodePointXidStart(codePoint.AsUInt32());
+}
 
 /// Determines whether the given Unicode \a codePoint is in the XID_Continue
 /// character class.
@@ -406,6 +418,10 @@ bool TfIsUtf8CodePointXidStart(uint32_t codePoint);
 ///
 TF_API
 bool TfIsUtf8CodePointXidContinue(uint32_t codePoint);
+inline bool TfIsUtf8CodePointXidContinue(const TfUtf8CodePoint codePoint)
+{
+    return TfIsUtf8CodePointXidContinue(codePoint.AsUInt32());
+}
 
 PXR_NAMESPACE_CLOSE_SCOPE
 
index 949e2900dcf3e6e93a3cba3861ad3e57175d4443..b07ab3787d4f96b426b6ed7080872514896e085c 100644 (file)
@@ -1789,6 +1789,15 @@ SdfPath::MakeRelativePath(const SdfPath & anchor) const
     return result;
 }
 
+// Valid identifiers require the first character to be an `_` or in the
+// XidStart class.
+static bool
+_IsValidIdentifierStart(const TfUtf8CodePoint codePoint)
+{
+    return codePoint == TfUtf8CodePointFromAscii('_') ||
+           TfIsUtf8CodePointXidStart(codePoint);
+}
+
 static
 inline bool 
 _IsValidIdentifier(const std::string_view& name)
@@ -1799,23 +1808,20 @@ _IsValidIdentifier(const std::string_view& name)
         return false;
     }
 
-    TfUtf8CodePointView view {name};
-    bool first = true;
-    for (const uint32_t codePoint : view)
+    auto it = TfUtf8CodePointIterator{std::cbegin(name), std::cend(name)};
+    if (!_IsValidIdentifierStart(*it))
     {
-        bool result = first ? 
-            ((codePoint == SDF_UNDERSCORE_CODE_POINT) ||
-                TfIsUtf8CodePointXidStart(codePoint))
-            : TfIsUtf8CodePointXidContinue(codePoint);
-
-        if (!result)
+        return false;
+    }
+    // In C++20, this can be replaced with the `std::ranges` version of
+    // `std::all_of`
+    for (++it; it != TfUtf8CodePointIterator::PastTheEndSentinel{}; ++it)
+    {
+        if (!TfIsUtf8CodePointXidContinue(*it))
         {
             return false;
         }
-
-        first = false;
     }
-
     return true;
 }
 
@@ -1846,7 +1852,7 @@ SdfPath::IsValidNamespacedIdentifier(const std::string &name)
     // identifiers.
     std::string_view remainder {name};
     while (!remainder.empty()) {
-        const auto index = remainder.find(':');
+        const auto index = remainder.find(SDF_PATH_NS_DELIMITER_CHAR);
         
         // can't start with ':'
         if (index == 0) {
@@ -1895,8 +1901,7 @@ SdfPath::TokenizeIdentifier(const std::string &name)
     TfUtf8CodePointIterator anchor = iterator;
     
     // Check first character is in XidStart or '_'
-    if(!TfIsUtf8CodePointXidStart(*iterator) &&
-        *iterator != SDF_UNDERSCORE_CODE_POINT)
+    if(!_IsValidIdentifierStart(*iterator))
     {
         return result;
     }
@@ -1908,7 +1913,7 @@ SdfPath::TokenizeIdentifier(const std::string &name)
     for (++iterator; iterator != view.end(); ++iterator)
     {
         // Allow a namespace delimiter.
-        if (*iterator == SDF_NAMESPACE_DELIMITER_CODE_POINT)
+        if (*iterator == TfUtf8CodePointFromAscii(SDF_PATH_NS_DELIMITER_CHAR))
         {
             // Record token.
             result.push_back(std::string(anchor.GetBase(), iterator.GetBase()));
@@ -1919,8 +1924,7 @@ SdfPath::TokenizeIdentifier(const std::string &name)
             anchor = ++iterator;
 
             // First character.
-            if (!TfIsUtf8CodePointXidStart(*iterator) && 
-                *iterator != SDF_UNDERSCORE_CODE_POINT)
+            if (!_IsValidIdentifierStart(*iterator))
             {
                 TfReset(result);
                 return result;
index 1a42de86e793c2c1883111225acba877730f166c..60fc1b5ca3d87d3b6674e9f4fa779b3eb9a28689 100644 (file)
@@ -79,10 +79,6 @@ TF_DECLARE_PUBLIC_TOKENS(SdfMetadataDisplayGroupTokens,
                          SDF_API,
                          SDF_METADATA_DISPLAYGROUP_TOKENS);
 
-// constants for identifier validation
-constexpr uint32_t SDF_NAMESPACE_DELIMITER_CODE_POINT = 0x003Au;
-constexpr uint32_t SDF_UNDERSCORE_CODE_POINT = 0x005Fu;
-
 PXR_NAMESPACE_CLOSE_SCOPE
 
 #endif // PXR_USD_SDF_TOKENS_H