#include "pxr/base/tf/regTest.h"
#include "pxr/base/tf/unicodeUtils.h"
+#include <algorithm>
+#include <array>
#include <string_view>
PXR_NAMESPACE_USING_DIRECTIVE
{
TF_AXIOM(TfUtf8CodePointView{}.empty());
+ TF_AXIOM(TfUtf8CodePointView{}.cbegin() == TfUtf8CodePointView{}.EndAsIterator());
+ TF_AXIOM(std::cbegin(TfUtf8CodePointView{}) == std::cend(TfUtf8CodePointView{}));
}
// Exercise the iterator converting from UTF-8 char to code point
}
}
+ {
+ // Unexpected continuations (\x80 and \x81) should be decoded as
+ // invalid code points.
+ const std::string_view sv{"\x80\x61\x62\x81\x63"};
+ const TfUtf8CodePointView uv{sv};
+ TF_AXIOM(std::next(std::begin(uv), 5) == std::end(uv));
+ TF_AXIOM(std::next(std::begin(uv), 5).GetBase() == std::end(sv));
+
+ std::array<uint32_t, 5> codePoints{0};
+ const std::array<uint32_t, 5> expectedCodePoints{{
+ TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61, 0x62,
+ TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63}};
+ std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints));
+ TF_AXIOM(codePoints == expectedCodePoints);
+ }
+ {
+ // Incomplete UTF-8 sequences should not consume valid character
+ // sequences
+ const std::string_view sv{"\xc0\x61\xe0\x85\x62\xf0\x83\x84\x63\xf1"};
+ const TfUtf8CodePointView uv{sv};
+ TF_AXIOM(std::next(std::begin(uv), 7) == std::end(uv));
+ TF_AXIOM(std::next(std::begin(uv), 7).GetBase() == std::end(sv));
+
+ std::array<uint32_t, 7> codePoints{0};
+ const std::array<uint32_t, 7> expectedCodePoints{{
+ TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x61,
+ TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x62,
+ TfUtf8CodePointIterator::INVALID_CODE_POINT, 0x63,
+ TfUtf8CodePointIterator::INVALID_CODE_POINT}};
+ std::copy(std::cbegin(uv), uv.EndAsIterator(), std::begin(codePoints));
+ TF_AXIOM(codePoints == expectedCodePoints);
+ }
return true;
}
#include "pxr/base/tf/diagnostic.h"
#include "pxr/base/tf/unicodeCharacterClasses.h"
-#include <optional>
#include <string>
#include <string_view>
return _view.empty();
}
+ /// Returns an iterator of the same type as begin that identifies the end
+ /// of the string.
+ ///
+ /// As the end iterator is stored three times, this is slightly heavier
+ /// than using the PastTheEndSentinel and should be avoided in performance
+ /// critical code paths. It is provided for convenience when an algorithm
+ /// restricts the iterators to have the same type.
+ ///
+ /// As C++20 ranges exposes more sentinel friendly algorithms, this can
+ /// likely be deprecated in the future.
+ inline const_iterator EndAsIterator() const;
+
private:
std::string_view _view;
};
// throwing an exception, _GetCodePoint signals this is
// bad by setting the code point to 0xFFFD (this mostly happens
// when a high / low private surrogate is used)
- // TODO: note that this isn't precisely conformant, as we
- // likely consumed an entire sequence when a subset would have
- // been invalid e.g. the byte sequence C2 41 would be detected
- // as a 2-byte sequence, but it is invalid because of the
- // second byte signature by the standard, this should process
- // the C2 as invalid but consume 41 as a valid 1-byte UTF-8
- // character
return _GetCodePoint();
}
/// Advances the iterator logically one UTF-8 character sequence in
/// the string. The underlying string iterator will be advanced
/// according to the variable length encoding of the next UTF-8
- /// character. Invalid leading bytes will increment until the end
- /// of the range is reached.
+ /// character, but will never consume non-continuation bytes after
+ /// the current one.
TfUtf8CodePointIterator& operator++ ()
{
+ // The increment operator should never be called if it's past
+ // the end. The user is expected to have already checked this
+ // condition.
+ TF_DEV_AXIOM(!_IsPastTheEnd());
+ _EncodingLength increment = _GetEncodingLength();
// note that in cases where the encoding is invalid, we move to the
// next byte this is necessary because otherwise the iterator would
// never advanced and the end condition of == iterator::end() would
- // never be satisfied
- _EncodingLength encodingLength = _GetEncodingLength();
- std::advance(_it, (encodingLength != 0) ? encodingLength : 1);
- if (_IsPastTheEnd()) {
- _it = _end;
+ // never be satisfied. This means that we increment, even if the
+ // encoding length is 0.
+ ++_it;
+ // Only continuation bytes will be consumed after the the first byte.
+ // This avoid consumption of ASCII characters or other starting bytes.
+ auto isContinuation = [](const char c) {
+ const auto uc = static_cast<unsigned char>(c);
+ return (uc >= static_cast<unsigned char>('\x80')) &&
+ (uc < static_cast<unsigned char>('\xc0'));
+ };
+ while ((increment > 1) && !_IsPastTheEnd() && isContinuation(*_it)) {
+ ++_it;
+ --increment;
}
return *this;
}
- /// Advances the iterator logically one UTF-8 character sequence in the
- /// string. The underlying string iterator will be advanced according
- /// to the variable length encoding of the next UTF-8 character.
+ /// Advances the iterator logically one UTF-8 character sequence in
+ /// the string. The underlying string iterator will be advanced
+ /// according to the variable length encoding of the next UTF-8
+ /// character, but will never consume non-continuation bytes after
+ /// the current one.
TfUtf8CodePointIterator operator++ (int)
{
- // note that in cases where the encoding is invalid, we move to the
- // next byte this is necessary because otherwise the iterator would
- // never advanced and the end condition of == iterator::end() would
- // never be satisfied
auto temp = *this;
++(*this);
return temp;
// Constructs an iterator that can read UTF-8 character sequences from
// the given starting string_view iterator \a it. \a end is used as a
// guard against reading byte sequences past the end of the source string.
+ // \a end must not be in the middle of a UTF-8 character sequence.
TfUtf8CodePointIterator(
const std::string_view::const_iterator& it,
const std::string_view::const_iterator& end) : _it(it), _end(end) {
{
return 1;
}
- else if ((x >> 5) == 0x6)
+ else if ((x >= 0xc0) && (x < 0xe0))
{
return 2;
}
- else if ((x >> 4) == 0xe)
+ else if ((x >= 0xe0) && (x < 0xf0))
{
return 3;
}
- else if ((x >> 3) == 0x1e)
+ else if ((x >= 0xf0) && (x < 0xf8))
{
return 4;
}
return begin();
}
+inline TfUtf8CodePointView::const_iterator
+TfUtf8CodePointView::EndAsIterator() const
+{
+ return const_iterator(std::cend(_view), std::cend(_view));
+}
+
/// Determines whether the given Unicode \a codePoint is in the XID_Start
/// character class.
///