*
* The second check rejects surrogate code points U+D800..U+DFFF (category Cs).
*
- * The last two checks cover "Noncharacter": defined as:
- * "A code point that is permanently reserved for
- * internal use, and that should never be interchanged. In
- * Unicode 3.1, these consist of the values U+nFFFE and U+nFFFF
- * (where n is from 0 to 10_16) and the values U+FDD0..U+FDEF."
- *
* @param Char the character
*/
#define UNICODE_VALID(Char) \
((Char) < 0x110000 && \
- (((Char) & 0xFFFFF800) != 0xD800) && \
- ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
- ((Char) & 0xFFFE) != 0xFFFE)
+ (((Char) & 0xFFFFF800) != 0xD800))
/**
* Finds the given substring in the string,
const char * const valid_strings[] = {
"",
- "\xc2\xa9",
+ "\xc2\xa9", /* UTF-8 (c) symbol */
+ "\xef\xbf\xbe", /* U+FFFE is a noncharacter, but Unicode Corrigendum #9 clarifies that noncharacters are allowed in interchange */
NULL
};
const char * const invalid_strings[] = {
- "\xa9",
+ "\xa9", /* Latin-1 (c) symbol */
+ "\xed\xa0\x80", /* UTF-16 surrogates are not valid in UTF-8 */
NULL
};