//
// This loop terminates with CurPtr pointing at the newline (or end of buffer)
// character that ends the line comment.
-
- // C++23 [lex.phases] p1
- // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
- // diagnostic only once per entire ill-formed subsequence to avoid
- // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
- bool UnicodeDecodingAlreadyDiagnosed = false;
-
char C;
while (true) {
C = *CurPtr;
// Skip over characters in the fast loop.
- while (isASCII(C) && C != 0 && // Potentially EOF.
- C != '\n' && C != '\r') { // Newline or DOS-style newline.
+ while (C != 0 && // Potentially EOF.
+ C != '\n' && C != '\r') // Newline or DOS-style newline.
C = *++CurPtr;
- UnicodeDecodingAlreadyDiagnosed = false;
- }
-
- if (!isASCII(C)) {
- unsigned Length = llvm::getUTF8SequenceSize(
- (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
- if (Length == 0) {
- if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
- Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
- UnicodeDecodingAlreadyDiagnosed = true;
- ++CurPtr;
- } else {
- UnicodeDecodingAlreadyDiagnosed = false;
- CurPtr += Length;
- }
- continue;
- }
const char *NextLine = CurPtr;
if (C != 0) {
if (C == '/')
C = *CurPtr++;
- // C++23 [lex.phases] p1
- // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
- // diagnostic only once per entire ill-formed subsequence to avoid
- // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
- bool UnicodeDecodingAlreadyDiagnosed = false;
-
while (true) {
// Skip over all non-interesting characters until we find end of buffer or a
// (probably ending) '/' character.
// doesn't check for '\0'.
!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
// While not aligned to a 16-byte boundary.
- while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
- if (!isASCII(C))
- goto MultiByteUTF8;
+ while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
C = *CurPtr++;
- }
+
if (C == '/') goto FoundSlash;
#ifdef __SSE2__
__m128i Slashes = _mm_set1_epi8('/');
- while (CurPtr + 16 < BufferEnd) {
- int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
- if (LLVM_UNLIKELY(Mask != 0)) {
- CurPtr += llvm::countTrailingZeros<unsigned>(Mask);
- goto MultiByteUTF8;
- }
- // look for slashes
+ while (CurPtr+16 <= BufferEnd) {
int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
Slashes));
if (cmp != 0) {
CurPtr += 16;
}
#elif __ALTIVEC__
- __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80};
__vector unsigned char Slashes = {
'/', '/', '/', '/', '/', '/', '/', '/',
'/', '/', '/', '/', '/', '/', '/', '/'
};
- while (CurPtr + 16 < BufferEnd) {
- if (LLVM_UNLIKELY(
- vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
- goto MultiByteUTF8;
- if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
- C = *CurPtr++;
- break;
- }
+ while (CurPtr + 16 <= BufferEnd &&
+ !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
CurPtr += 16;
- }
-
#else
- while (CurPtr + 16 < BufferEnd) {
- bool HasNonASCII = false;
- for (unsigned I = 0; I < 16; ++I)
- HasNonASCII |= !isASCII(CurPtr[I]);
-
- if (LLVM_UNLIKELY(HasNonASCII))
- goto MultiByteUTF8;
-
- bool HasSlash = false;
- for (unsigned I = 0; I < 16; ++I)
- HasSlash |= CurPtr[I] == '/';
- if (HasSlash)
- break;
- CurPtr += 16;
+ // Scan for '/' quickly. Many block comments are very large.
+ while (CurPtr[0] != '/' &&
+ CurPtr[1] != '/' &&
+ CurPtr[2] != '/' &&
+ CurPtr[3] != '/' &&
+ CurPtr+4 < BufferEnd) {
+ CurPtr += 4;
}
#endif
C = *CurPtr++;
}
- // Loop to scan the remainder, warning on invalid UTF-8
- // if the corresponding warning is enabled, emitting a diagnostic only once
- // per sequence that cannot be decoded.
- while (C != '/' && C != '\0') {
- if (isASCII(C)) {
- UnicodeDecodingAlreadyDiagnosed = false;
- C = *CurPtr++;
- continue;
- }
- MultiByteUTF8:
- // CurPtr is 1 code unit past C, so to decode
- // the codepoint, we need to read from the previous position.
- unsigned Length = llvm::getUTF8SequenceSize(
- (const llvm::UTF8 *)CurPtr-1, (const llvm::UTF8 *)BufferEnd);
- if (Length == 0) {
- if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
- Diag(CurPtr-1, diag::warn_invalid_utf8_in_comment);
- UnicodeDecodingAlreadyDiagnosed = true;
- }
- else {
- UnicodeDecodingAlreadyDiagnosed = false;
- CurPtr += Length - 1;
- }
+ // Loop to scan the remainder.
+ while (C != '/' && C != '\0')
C = *CurPtr++;
- }
if (C == '/') {
FoundSlash: