From a2100d755ad9f24261360fe3d1da725a1d3c8530 Mon Sep 17 00:00:00 2001 From: Jordan Rose Date: Fri, 8 Feb 2013 22:30:22 +0000 Subject: [PATCH] Pull Lexer's CharInfo table out for general use throughout Clang. Rewriting the same predicates over and over again is bad for code size and code maintenance. Using the functions in <ctype.h> is generally unsafe unless they are specified to be locale-independent (i.e. only isdigit and isxdigit). The next commit will try to clean up uses of <ctype.h> functions within Clang. llvm-svn: 174765 --- clang/include/clang/Basic/CharInfo.h | 162 ++++++++++++++ clang/lib/Basic/CMakeLists.txt | 1 + clang/lib/Basic/CharInfo.cpp | 80 +++++++ clang/lib/Lex/Lexer.cpp | 175 +-------------- clang/unittests/Basic/CMakeLists.txt | 1 + clang/unittests/Basic/CharInfoTest.cpp | 377 +++++++++++++++++++++++++++++++++ 6 files changed, 626 insertions(+), 170 deletions(-) create mode 100644 clang/include/clang/Basic/CharInfo.h create mode 100644 clang/lib/Basic/CharInfo.cpp create mode 100644 clang/unittests/Basic/CharInfoTest.cpp diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h new file mode 100644 index 0000000..f9b7b73 --- /dev/null +++ b/clang/include/clang/Basic/CharInfo.h @@ -0,0 +1,162 @@ +//===--- clang/Basic/CharInfo.h - Classifying ASCII Characters ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_BASIC_CHARINFO_H +#define CLANG_BASIC_CHARINFO_H + +#include "llvm/Support/Compiler.h" +#include "llvm/Support/DataTypes.h" + +namespace clang { +namespace charinfo { + extern const uint16_t InfoTable[256]; + + enum { + CHAR_HORZ_WS = 0x0001, // '\t', '\f', '\v'. 
Note, no '\0' + CHAR_VERT_WS = 0x0002, // '\r', '\n' + CHAR_SPACE = 0x0004, // ' ' + CHAR_DIGIT = 0x0008, // 0-9 + CHAR_XLETTER = 0x0010, // a-f,A-F + CHAR_UPPER = 0x0020, // A-Z + CHAR_LOWER = 0x0040, // a-z + CHAR_UNDER = 0x0080, // _ + CHAR_PERIOD = 0x0100, // . + CHAR_RAWDEL = 0x0200, // {}[]#<>%:;?*+-/^&|~!=,"' + CHAR_PUNCT = 0x0400 // `$@() + }; + + enum { + CHAR_XUPPER = CHAR_XLETTER | CHAR_UPPER, + CHAR_XLOWER = CHAR_XLETTER | CHAR_LOWER + }; +} // end namespace charinfo + +/// Returns true if this is an ASCII character. +LLVM_READNONE static inline bool isASCII(char c) { + return static_cast<unsigned char>(c) <= 127; +} + +/// Returns true if this is a valid first character of a C identifier, +/// which is [a-zA-Z_]. +LLVM_READONLY static inline bool isIdentifierHead(unsigned char c, + bool AllowDollar = false) { + using namespace charinfo; + if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_UNDER)) + return true; + return AllowDollar && c == '$'; +} + +/// Returns true if this is a body character of a C identifier, +/// which is [a-zA-Z0-9_]. +LLVM_READONLY static inline bool isIdentifierBody(unsigned char c, + bool AllowDollar = false) { + using namespace charinfo; + if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER)) + return true; + return AllowDollar && c == '$'; +} + +/// Returns true if this character is horizontal ASCII whitespace: +/// ' ', '\\t', '\\f', '\\v'. +/// +/// Note that this returns false for '\\0'. +LLVM_READONLY static inline bool isHorizontalWhitespace(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & (CHAR_HORZ_WS|CHAR_SPACE)) != 0; +} + +/// Returns true if this character is vertical ASCII whitespace: '\\n', '\\r'. +/// +/// Note that this returns false for '\\0'. 
+LLVM_READONLY static inline bool isVerticalWhitespace(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & CHAR_VERT_WS) != 0; +} + +/// Return true if this character is horizontal or vertical ASCII whitespace: +/// ' ', '\\t', '\\f', '\\v', '\\n', '\\r'. +/// +/// Note that this returns false for '\\0'. +LLVM_READONLY static inline bool isWhitespace(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & (CHAR_HORZ_WS|CHAR_VERT_WS|CHAR_SPACE)) != 0; +} + +/// Return true if this character is an ASCII digit: [0-9] +LLVM_READONLY static inline bool isDigit(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & CHAR_DIGIT) != 0; +} + +/// Return true if this character is a lowercase ASCII letter: [a-z] +LLVM_READONLY static inline bool isLowercase(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & CHAR_LOWER) != 0; +} + +/// Return true if this character is an uppercase ASCII letter: [A-Z] +LLVM_READONLY static inline bool isUppercase(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & CHAR_UPPER) != 0; +} + +/// Return true if this character is an ASCII letter: [a-zA-Z] +LLVM_READONLY static inline bool isLetter(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER)) != 0; +} + +/// Return true if this character is an ASCII letter or digit: [a-zA-Z0-9] +LLVM_READONLY static inline bool isAlphanumeric(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & (CHAR_DIGIT|CHAR_UPPER|CHAR_LOWER)) != 0; +} + +/// Return true if this character is an ASCII hex digit: [0-9a-fA-F] +LLVM_READONLY static inline bool isHexDigit(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & (CHAR_DIGIT|CHAR_XLETTER)) != 0; +} + +/// Return true if this character is an ASCII punctuation character. +/// +/// Note that '_' is both a punctuation character and an identifier character! 
+LLVM_READONLY static inline bool isPunctuation(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & (CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL|CHAR_PUNCT)) != 0; +} + +/// Return true if this character is an ASCII printable character; that is, a +/// character that should take exactly one column to print in a fixed-width +/// terminal. +LLVM_READONLY static inline bool isPrintable(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD|CHAR_PUNCT| + CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL|CHAR_SPACE)) != 0; +} + +/// Return true if this is the body character of a C preprocessing number, +/// which is [a-zA-Z0-9_.]. +LLVM_READONLY static inline bool isPreprocessingNumberBody(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & + (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER|CHAR_PERIOD)) != 0; +} + +/// Return true if this is the body character of a C++ raw string delimiter. +LLVM_READONLY static inline bool isRawStringDelimBody(unsigned char c) { + using namespace charinfo; + return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD| + CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL)) != 0; +} + +} // end namespace clang + +#endif diff --git a/clang/lib/Basic/CMakeLists.txt b/clang/lib/Basic/CMakeLists.txt index 7c5e42c..37efcb1 100644 --- a/clang/lib/Basic/CMakeLists.txt +++ b/clang/lib/Basic/CMakeLists.txt @@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS mc) add_clang_library(clangBasic Builtins.cpp + CharInfo.cpp Diagnostic.cpp DiagnosticIDs.cpp FileManager.cpp diff --git a/clang/lib/Basic/CharInfo.cpp b/clang/lib/Basic/CharInfo.cpp new file mode 100644 index 0000000..a1a4b39 --- /dev/null +++ b/clang/lib/Basic/CharInfo.cpp @@ -0,0 +1,80 @@ +//===--- CharInfo.cpp - Static Data for Classifying ASCII Characters ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/CharInfo.h" + +// Statically initialize CharInfo table based on ASCII character set +// Reference: FreeBSD 7.2 /usr/share/misc/ascii +const uint16_t clang::charinfo::InfoTable[256] = +{ + // 0 NUL 1 SOH 2 STX 3 ETX + // 4 EOT 5 ENQ 6 ACK 7 BEL + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , + // 8 BS 9 HT 10 NL 11 VT + //12 NP 13 CR 14 SO 15 SI + 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, + CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , + //16 DLE 17 DC1 18 DC2 19 DC3 + //20 DC4 21 NAK 22 SYN 23 ETB + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , + //24 CAN 25 EM 26 SUB 27 ESC + //28 FS 29 GS 30 RS 31 US + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , + //32 SP 33 ! 34 " 35 # + //36 $ 37 % 38 & 39 ' + CHAR_SPACE , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , + //40 ( 41 ) 42 * 43 + + //44 , 45 - 46 . 47 / + CHAR_PUNCT , CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL , + //48 0 49 1 50 2 51 3 + //52 4 53 5 54 6 55 7 + CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , + CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , + //56 8 57 9 58 : 59 ; + //60 < 61 = 62 > 63 ? 
+ CHAR_DIGIT , CHAR_DIGIT , CHAR_RAWDEL , CHAR_RAWDEL , + CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , + //64 @ 65 A 66 B 67 C + //68 D 69 E 70 F 71 G + CHAR_PUNCT , CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , + CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , CHAR_UPPER , + //72 H 73 I 74 J 75 K + //76 L 77 M 78 N 79 O + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + //80 P 81 Q 82 R 83 S + //84 T 85 U 86 V 87 W + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + //88 X 89 Y 90 Z 91 [ + //92 \ 93 ] 94 ^ 95 _ + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_RAWDEL , + CHAR_PUNCT , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER , + //96 ` 97 a 98 b 99 c + //100 d 101 e 102 f 103 g + CHAR_PUNCT , CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , + CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , CHAR_LOWER , + //104 h 105 i 106 j 107 k + //108 l 109 m 110 n 111 o + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + //112 p 113 q 114 r 115 s + //116 t 117 u 118 v 119 w + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + //120 x 121 y 122 z 123 { + //124 | 125 } 126 ~ 127 DEL + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_RAWDEL , + CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0 +}; diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index 1b064c8..6aae4e1 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -25,6 +25,7 @@ //===----------------------------------------------------------------------===// #include "clang/Lex/Lexer.h" +#include "clang/Basic/CharInfo.h" #include "clang/Basic/SourceManager.h" #include "clang/Lex/CodeCompletionHandler.h" #include "clang/Lex/LexDiagnostic.h" @@ -38,8 +39,6 @@ #include <cstring> using namespace clang; -static void InitCharacterInfo(); - 
//===----------------------------------------------------------------------===// // Token Class Implementation //===----------------------------------------------------------------------===// @@ -66,8 +65,6 @@ void Lexer::anchor() { } void Lexer::InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd) { - InitCharacterInfo(); - BufferStart = BufStart; BufferPtr = BufPtr; BufferEnd = BufEnd; @@ -408,9 +405,6 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, } - -static bool isWhitespace(unsigned char c); - /// MeasureTokenLength - Relex the token at the specified location and return /// its length in bytes in the input file. If the token needs cleaning (e.g. /// includes a trigraph or an escaped newline) then this count includes bytes @@ -1008,163 +1002,8 @@ StringRef Lexer::getImmediateMacroName(SourceLocation Loc, return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); } -//===----------------------------------------------------------------------===// -// Character information. -//===----------------------------------------------------------------------===// - -enum { - CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' - CHAR_VERT_WS = 0x02, // '\r', '\n' - CHAR_LETTER = 0x04, // a-z,A-Z - CHAR_NUMBER = 0x08, // 0-9 - CHAR_UNDER = 0x10, // _ - CHAR_PERIOD = 0x20, // . 
- CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"' -}; - -// Statically initialize CharInfo table based on ASCII character set -// Reference: FreeBSD 7.2 /usr/share/misc/ascii -static const unsigned char CharInfo[256] = -{ -// 0 NUL 1 SOH 2 STX 3 ETX -// 4 EOT 5 ENQ 6 ACK 7 BEL - 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , -// 8 BS 9 HT 10 NL 11 VT -//12 NP 13 CR 14 SO 15 SI - 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, - CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , -//16 DLE 17 DC1 18 DC2 19 DC3 -//20 DC4 21 NAK 22 SYN 23 ETB - 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , -//24 CAN 25 EM 26 SUB 27 ESC -//28 FS 29 GS 30 RS 31 US - 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , -//32 SP 33 ! 34 " 35 # -//36 $ 37 % 38 & 39 ' - CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , - 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , -//40 ( 41 ) 42 * 43 + -//44 , 45 - 46 . 47 / - 0 , 0 , CHAR_RAWDEL , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL , -//48 0 49 1 50 2 51 3 -//52 4 53 5 54 6 55 7 - CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , - CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , -//56 8 57 9 58 : 59 ; -//60 < 61 = 62 > 63 ? 
- CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , -//64 @ 65 A 66 B 67 C -//68 D 69 E 70 F 71 G - 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//72 H 73 I 74 J 75 K -//76 L 77 M 78 N 79 O - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//80 P 81 Q 82 R 83 S -//84 T 85 U 86 V 87 W - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//88 X 89 Y 90 Z 91 [ -//92 \ 93 ] 94 ^ 95 _ - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL , - 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER , -//96 ` 97 a 98 b 99 c -//100 d 101 e 102 f 103 g - 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//104 h 105 i 106 j 107 k -//108 l 109 m 110 n 111 o - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//112 p 113 q 114 r 115 s -//116 t 117 u 118 v 119 w - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , -//120 x 121 y 122 z 123 { -//124 | 125 } 126 ~ 127 DEL - CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL , - CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0 -}; - -static void InitCharacterInfo() { - static bool isInited = false; - if (isInited) return; - // check the statically-initialized CharInfo table - assert(CHAR_HORZ_WS == CharInfo[(int)' ']); - assert(CHAR_HORZ_WS == CharInfo[(int)'\t']); - assert(CHAR_HORZ_WS == CharInfo[(int)'\f']); - assert(CHAR_HORZ_WS == CharInfo[(int)'\v']); - assert(CHAR_VERT_WS == CharInfo[(int)'\n']); - assert(CHAR_VERT_WS == CharInfo[(int)'\r']); - assert(CHAR_UNDER == CharInfo[(int)'_']); - assert(CHAR_PERIOD == CharInfo[(int)'.']); - for (unsigned i = 'a'; i <= 'z'; ++i) { - assert(CHAR_LETTER == CharInfo[i]); 
- assert(CHAR_LETTER == CharInfo[i+'A'-'a']); - } - for (unsigned i = '0'; i <= '9'; ++i) - assert(CHAR_NUMBER == CharInfo[i]); - - isInited = true; -} - - -/// isIdentifierHead - Return true if this is the first character of an -/// identifier, which is [a-zA-Z_]. -static inline bool isIdentifierHead(unsigned char c) { - return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false; -} - -/// isIdentifierBody - Return true if this is the body character of an -/// identifier, which is [a-zA-Z0-9_]. -static inline bool isIdentifierBody(unsigned char c) { - return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false; -} - -/// isHorizontalWhitespace - Return true if this character is horizontal -/// whitespace: ' ', '\\t', '\\f', '\\v'. Note that this returns false for -/// '\\0'. -static inline bool isHorizontalWhitespace(unsigned char c) { - return (CharInfo[c] & CHAR_HORZ_WS) ? true : false; -} - -/// isVerticalWhitespace - Return true if this character is vertical -/// whitespace: '\\n', '\\r'. Note that this returns false for '\\0'. -static inline bool isVerticalWhitespace(unsigned char c) { - return (CharInfo[c] & CHAR_VERT_WS) ? true : false; -} - -/// isWhitespace - Return true if this character is horizontal or vertical -/// whitespace: ' ', '\\t', '\\f', '\\v', '\\n', '\\r'. Note that this returns -/// false for '\\0'. -static inline bool isWhitespace(unsigned char c) { - return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false; -} - -/// isNumberBody - Return true if this is the body character of an -/// preprocessing number, which is [a-zA-Z0-9_.]. -static inline bool isNumberBody(unsigned char c) { - return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? - true : false; -} - -/// isRawStringDelimBody - Return true if this is the body character of a -/// raw string delimiter. 
-static inline bool isRawStringDelimBody(unsigned char c) { - return (CharInfo[c] & - (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ? - true : false; -} - -// Allow external clients to make use of CharInfo. bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { - return isIdentifierBody(c) || (c == '$' && LangOpts.DollarIdents); + return isIdentifierBody(c, LangOpts.DollarIdents); } @@ -1578,10 +1417,6 @@ static bool isAllowedInitiallyIDChar(uint32_t c) { !(0xFE20 <= c && c <= 0xFE2F); } -static inline bool isASCII(char C) { - return static_cast<signed char>(C) >= 0; -} - void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] @@ -1595,8 +1430,8 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { // Fast path, no $,\,? in identifier found. '\' might be an escaped newline // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. // - // TODO: Could merge these checks into a CharInfo flag to make the comparison - // cheaper + // TODO: Could merge these checks into an InfoTable flag to make the + // comparison cheaper if (isASCII(C) && C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents)) { FinishIdentifier: @@ -1700,7 +1535,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { unsigned Size; char C = getCharAndSize(CurPtr, Size); char PrevCh = 0; - while (isNumberBody(C)) { // FIXME: UCNs in ud-suffix. + while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix. 
CurPtr = ConsumeChar(CurPtr, Size, Result); PrevCh = C; C = getCharAndSize(CurPtr, Size); diff --git a/clang/unittests/Basic/CMakeLists.txt b/clang/unittests/Basic/CMakeLists.txt index 300dcd5..51db6ce 100644 --- a/clang/unittests/Basic/CMakeLists.txt +++ b/clang/unittests/Basic/CMakeLists.txt @@ -1,4 +1,5 @@ add_clang_unittest(BasicTests + CharInfoTest.cpp FileManagerTest.cpp SourceManagerTest.cpp ) diff --git a/clang/unittests/Basic/CharInfoTest.cpp b/clang/unittests/Basic/CharInfoTest.cpp new file mode 100644 index 0000000..9b3d1b3 --- /dev/null +++ b/clang/unittests/Basic/CharInfoTest.cpp @@ -0,0 +1,377 @@ +//===- unittests/Basic/CharInfoTest.cpp -- ASCII classification tests -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/CharInfo.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace clang; + +// Check that the CharInfo table has been constructed reasonably. +TEST(CharInfoTest, validateInfoTable) { + using namespace charinfo; + EXPECT_EQ((unsigned)CHAR_SPACE, InfoTable[(unsigned)' ']); + EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\t']); + EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\f']); // ?? + EXPECT_EQ((unsigned)CHAR_HORZ_WS, InfoTable[(unsigned)'\v']); // ?? 
+ EXPECT_EQ((unsigned)CHAR_VERT_WS, InfoTable[(unsigned)'\n']); + EXPECT_EQ((unsigned)CHAR_VERT_WS, InfoTable[(unsigned)'\r']); + EXPECT_EQ((unsigned)CHAR_UNDER, InfoTable[(unsigned)'_']); + EXPECT_EQ((unsigned)CHAR_PERIOD, InfoTable[(unsigned)'.']); + + for (unsigned i = 'a'; i <= 'f'; ++i) { + EXPECT_EQ((unsigned)CHAR_XLOWER, InfoTable[i]); + EXPECT_EQ((unsigned)CHAR_XUPPER, InfoTable[i+'A'-'a']); + } + + for (unsigned i = 'g'; i <= 'z'; ++i) { + EXPECT_EQ((unsigned)CHAR_LOWER, InfoTable[i]); + EXPECT_EQ((unsigned)CHAR_UPPER, InfoTable[i+'A'-'a']); + } + + for (unsigned i = '0'; i <= '9'; ++i) + EXPECT_EQ((unsigned)CHAR_DIGIT, InfoTable[i]); +} + +// Check various predicates. +TEST(CharInfoTest, isASCII) { + EXPECT_TRUE(isASCII('\0')); + EXPECT_TRUE(isASCII('\n')); + EXPECT_TRUE(isASCII(' ')); + EXPECT_TRUE(isASCII('a')); + EXPECT_TRUE(isASCII('\x7f')); + EXPECT_FALSE(isASCII('\x80')); + EXPECT_FALSE(isASCII('\xc2')); + EXPECT_FALSE(isASCII('\xff')); +} + +TEST(CharInfoTest, isIdentifierHead) { + EXPECT_TRUE(isIdentifierHead('a')); + EXPECT_TRUE(isIdentifierHead('A')); + EXPECT_TRUE(isIdentifierHead('z')); + EXPECT_TRUE(isIdentifierHead('Z')); + EXPECT_TRUE(isIdentifierHead('_')); + + EXPECT_FALSE(isIdentifierHead('0')); + EXPECT_FALSE(isIdentifierHead('.')); + EXPECT_FALSE(isIdentifierHead('`')); + EXPECT_FALSE(isIdentifierHead('\0')); + + EXPECT_FALSE(isIdentifierHead('$')); + EXPECT_TRUE(isIdentifierHead('$', /*AllowDollar=*/true)); + + EXPECT_FALSE(isIdentifierHead('\x80')); + EXPECT_FALSE(isIdentifierHead('\xc2')); + EXPECT_FALSE(isIdentifierHead('\xff')); +} + +TEST(CharInfoTest, isIdentifierBody) { + EXPECT_TRUE(isIdentifierBody('a')); + EXPECT_TRUE(isIdentifierBody('A')); + EXPECT_TRUE(isIdentifierBody('z')); + EXPECT_TRUE(isIdentifierBody('Z')); + EXPECT_TRUE(isIdentifierBody('_')); + + EXPECT_TRUE(isIdentifierBody('0')); + EXPECT_FALSE(isIdentifierBody('.')); + EXPECT_FALSE(isIdentifierBody('`')); + EXPECT_FALSE(isIdentifierBody('\0')); + + 
EXPECT_FALSE(isIdentifierBody('$')); + EXPECT_TRUE(isIdentifierBody('$', /*AllowDollar=*/true)); + + EXPECT_FALSE(isIdentifierBody('\x80')); + EXPECT_FALSE(isIdentifierBody('\xc2')); + EXPECT_FALSE(isIdentifierBody('\xff')); +} + +TEST(CharInfoTest, isHorizontalWhitespace) { + EXPECT_FALSE(isHorizontalWhitespace('a')); + EXPECT_FALSE(isHorizontalWhitespace('_')); + EXPECT_FALSE(isHorizontalWhitespace('0')); + EXPECT_FALSE(isHorizontalWhitespace('.')); + EXPECT_FALSE(isHorizontalWhitespace('`')); + EXPECT_FALSE(isHorizontalWhitespace('\0')); + EXPECT_FALSE(isHorizontalWhitespace('\x7f')); + + EXPECT_TRUE(isHorizontalWhitespace(' ')); + EXPECT_TRUE(isHorizontalWhitespace('\t')); + EXPECT_TRUE(isHorizontalWhitespace('\f')); // ?? + EXPECT_TRUE(isHorizontalWhitespace('\v')); // ?? + + EXPECT_FALSE(isHorizontalWhitespace('\n')); + EXPECT_FALSE(isHorizontalWhitespace('\r')); + + EXPECT_FALSE(isHorizontalWhitespace('\x80')); + EXPECT_FALSE(isHorizontalWhitespace('\xc2')); + EXPECT_FALSE(isHorizontalWhitespace('\xff')); +} + +TEST(CharInfoTest, isVerticalWhitespace) { + EXPECT_FALSE(isVerticalWhitespace('a')); + EXPECT_FALSE(isVerticalWhitespace('_')); + EXPECT_FALSE(isVerticalWhitespace('0')); + EXPECT_FALSE(isVerticalWhitespace('.')); + EXPECT_FALSE(isVerticalWhitespace('`')); + EXPECT_FALSE(isVerticalWhitespace('\0')); + EXPECT_FALSE(isVerticalWhitespace('\x7f')); + + EXPECT_FALSE(isVerticalWhitespace(' ')); + EXPECT_FALSE(isVerticalWhitespace('\t')); + EXPECT_FALSE(isVerticalWhitespace('\f')); // ?? + EXPECT_FALSE(isVerticalWhitespace('\v')); // ?? 
+ + EXPECT_TRUE(isVerticalWhitespace('\n')); + EXPECT_TRUE(isVerticalWhitespace('\r')); + + EXPECT_FALSE(isVerticalWhitespace('\x80')); + EXPECT_FALSE(isVerticalWhitespace('\xc2')); + EXPECT_FALSE(isVerticalWhitespace('\xff')); +} + +TEST(CharInfoTest, isWhitespace) { + EXPECT_FALSE(isWhitespace('a')); + EXPECT_FALSE(isWhitespace('_')); + EXPECT_FALSE(isWhitespace('0')); + EXPECT_FALSE(isWhitespace('.')); + EXPECT_FALSE(isWhitespace('`')); + EXPECT_FALSE(isWhitespace('\0')); + EXPECT_FALSE(isWhitespace('\x7f')); + + EXPECT_TRUE(isWhitespace(' ')); + EXPECT_TRUE(isWhitespace('\t')); + EXPECT_TRUE(isWhitespace('\f')); + EXPECT_TRUE(isWhitespace('\v')); + + EXPECT_TRUE(isWhitespace('\n')); + EXPECT_TRUE(isWhitespace('\r')); + + EXPECT_FALSE(isWhitespace('\x80')); + EXPECT_FALSE(isWhitespace('\xc2')); + EXPECT_FALSE(isWhitespace('\xff')); +} + +TEST(CharInfoTest, isDigit) { + EXPECT_TRUE(isDigit('0')); + EXPECT_TRUE(isDigit('9')); + + EXPECT_FALSE(isDigit('a')); + EXPECT_FALSE(isDigit('A')); + + EXPECT_FALSE(isDigit('z')); + EXPECT_FALSE(isDigit('Z')); + + EXPECT_FALSE(isDigit('.')); + EXPECT_FALSE(isDigit('_')); + + EXPECT_FALSE(isDigit('/')); + EXPECT_FALSE(isDigit('\0')); + + EXPECT_FALSE(isDigit('\x80')); + EXPECT_FALSE(isDigit('\xc2')); + EXPECT_FALSE(isDigit('\xff')); +} + +TEST(CharInfoTest, isHexDigit) { + EXPECT_TRUE(isHexDigit('0')); + EXPECT_TRUE(isHexDigit('9')); + + EXPECT_TRUE(isHexDigit('a')); + EXPECT_TRUE(isHexDigit('A')); + + EXPECT_FALSE(isHexDigit('z')); + EXPECT_FALSE(isHexDigit('Z')); + + EXPECT_FALSE(isHexDigit('.')); + EXPECT_FALSE(isHexDigit('_')); + + EXPECT_FALSE(isHexDigit('/')); + EXPECT_FALSE(isHexDigit('\0')); + + EXPECT_FALSE(isHexDigit('\x80')); + EXPECT_FALSE(isHexDigit('\xc2')); + EXPECT_FALSE(isHexDigit('\xff')); +} + +TEST(CharInfoTest, isLetter) { + EXPECT_FALSE(isLetter('0')); + EXPECT_FALSE(isLetter('9')); + + EXPECT_TRUE(isLetter('a')); + EXPECT_TRUE(isLetter('A')); + + EXPECT_TRUE(isLetter('z')); + EXPECT_TRUE(isLetter('Z')); + 
+ EXPECT_FALSE(isLetter('.')); + EXPECT_FALSE(isLetter('_')); + + EXPECT_FALSE(isLetter('/')); + EXPECT_FALSE(isLetter('(')); + EXPECT_FALSE(isLetter('\0')); + + EXPECT_FALSE(isLetter('\x80')); + EXPECT_FALSE(isLetter('\xc2')); + EXPECT_FALSE(isLetter('\xff')); +} + +TEST(CharInfoTest, isLowercase) { + EXPECT_FALSE(isLowercase('0')); + EXPECT_FALSE(isLowercase('9')); + + EXPECT_TRUE(isLowercase('a')); + EXPECT_FALSE(isLowercase('A')); + + EXPECT_TRUE(isLowercase('z')); + EXPECT_FALSE(isLowercase('Z')); + + EXPECT_FALSE(isLowercase('.')); + EXPECT_FALSE(isLowercase('_')); + + EXPECT_FALSE(isLowercase('/')); + EXPECT_FALSE(isLowercase('(')); + EXPECT_FALSE(isLowercase('\0')); + + EXPECT_FALSE(isLowercase('\x80')); + EXPECT_FALSE(isLowercase('\xc2')); + EXPECT_FALSE(isLowercase('\xff')); +} + +TEST(CharInfoTest, isUppercase) { + EXPECT_FALSE(isUppercase('0')); + EXPECT_FALSE(isUppercase('9')); + + EXPECT_FALSE(isUppercase('a')); + EXPECT_TRUE(isUppercase('A')); + + EXPECT_FALSE(isUppercase('z')); + EXPECT_TRUE(isUppercase('Z')); + + EXPECT_FALSE(isUppercase('.')); + EXPECT_FALSE(isUppercase('_')); + + EXPECT_FALSE(isUppercase('/')); + EXPECT_FALSE(isUppercase('(')); + EXPECT_FALSE(isUppercase('\0')); + + EXPECT_FALSE(isUppercase('\x80')); + EXPECT_FALSE(isUppercase('\xc2')); + EXPECT_FALSE(isUppercase('\xff')); +} + +TEST(CharInfoTest, isAlphanumeric) { + EXPECT_TRUE(isAlphanumeric('0')); + EXPECT_TRUE(isAlphanumeric('9')); + + EXPECT_TRUE(isAlphanumeric('a')); + EXPECT_TRUE(isAlphanumeric('A')); + + EXPECT_TRUE(isAlphanumeric('z')); + EXPECT_TRUE(isAlphanumeric('Z')); + + EXPECT_FALSE(isAlphanumeric('.')); + EXPECT_FALSE(isAlphanumeric('_')); + + EXPECT_FALSE(isAlphanumeric('/')); + EXPECT_FALSE(isAlphanumeric('(')); + EXPECT_FALSE(isAlphanumeric('\0')); + + EXPECT_FALSE(isAlphanumeric('\x80')); + EXPECT_FALSE(isAlphanumeric('\xc2')); + EXPECT_FALSE(isAlphanumeric('\xff')); +} + +TEST(CharInfoTest, isPunctuation) { + EXPECT_FALSE(isPunctuation('0')); + 
EXPECT_FALSE(isPunctuation('9')); + + EXPECT_FALSE(isPunctuation('a')); + EXPECT_FALSE(isPunctuation('A')); + + EXPECT_FALSE(isPunctuation('z')); + EXPECT_FALSE(isPunctuation('Z')); + + EXPECT_TRUE(isPunctuation('.')); + EXPECT_TRUE(isPunctuation('_')); + + EXPECT_TRUE(isPunctuation('/')); + EXPECT_TRUE(isPunctuation('(')); + + EXPECT_FALSE(isPunctuation(' ')); + EXPECT_FALSE(isPunctuation('\n')); + EXPECT_FALSE(isPunctuation('\0')); + + EXPECT_FALSE(isPunctuation('\x80')); + EXPECT_FALSE(isPunctuation('\xc2')); + EXPECT_FALSE(isPunctuation('\xff')); +} + +TEST(CharInfoTest, isPrintable) { + EXPECT_TRUE(isPrintable('0')); + EXPECT_TRUE(isPrintable('9')); + + EXPECT_TRUE(isPrintable('a')); + EXPECT_TRUE(isPrintable('A')); + + EXPECT_TRUE(isPrintable('z')); + EXPECT_TRUE(isPrintable('Z')); + + EXPECT_TRUE(isPrintable('.')); + EXPECT_TRUE(isPrintable('_')); + + EXPECT_TRUE(isPrintable('/')); + EXPECT_TRUE(isPrintable('(')); + + EXPECT_TRUE(isPrintable(' ')); + EXPECT_FALSE(isPrintable('\t')); + EXPECT_FALSE(isPrintable('\n')); + EXPECT_FALSE(isPrintable('\0')); + + EXPECT_FALSE(isPrintable('\x80')); + EXPECT_FALSE(isPrintable('\xc2')); + EXPECT_FALSE(isPrintable('\xff')); +} + +TEST(CharInfoTest, isPreprocessingNumberBody) { + EXPECT_TRUE(isPreprocessingNumberBody('0')); + EXPECT_TRUE(isPreprocessingNumberBody('9')); + + EXPECT_TRUE(isPreprocessingNumberBody('a')); + EXPECT_TRUE(isPreprocessingNumberBody('A')); + + EXPECT_TRUE(isPreprocessingNumberBody('z')); + EXPECT_TRUE(isPreprocessingNumberBody('Z')); + EXPECT_TRUE(isPreprocessingNumberBody('.')); + EXPECT_TRUE(isPreprocessingNumberBody('_')); + + EXPECT_FALSE(isPreprocessingNumberBody('/')); + EXPECT_FALSE(isPreprocessingNumberBody('(')); + EXPECT_FALSE(isPreprocessingNumberBody('\0')); + + EXPECT_FALSE(isPreprocessingNumberBody('\x80')); + EXPECT_FALSE(isPreprocessingNumberBody('\xc2')); + EXPECT_FALSE(isPreprocessingNumberBody('\xff')); +} + +TEST(CharInfoTest, isRawStringDelimBody) { + 
EXPECT_TRUE(isRawStringDelimBody('0')); + EXPECT_TRUE(isRawStringDelimBody('9')); + + EXPECT_TRUE(isRawStringDelimBody('a')); + EXPECT_TRUE(isRawStringDelimBody('A')); + + EXPECT_TRUE(isRawStringDelimBody('z')); + EXPECT_TRUE(isRawStringDelimBody('Z')); + EXPECT_TRUE(isRawStringDelimBody('.')); + EXPECT_TRUE(isRawStringDelimBody('_')); + + EXPECT_TRUE(isRawStringDelimBody('/')); + EXPECT_FALSE(isRawStringDelimBody('(')); + EXPECT_FALSE(isRawStringDelimBody('\0')); +} -- 2.7.4