/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
+ * Portions Copyright (c) 2009 The Go Authors. All rights reserved.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
-#include <stdarg.h>
-#include <string.h>
#include "phonenumbers/utf/utf.h"
#include "phonenumbers/utf/utfdef.h"
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
- Rune4 = (1<<(Bit4+3*Bitx))-1,
- /* 0001 1111 1111 1111 1111 1111 */
+ Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
+ SurrogateMin = 0xD800,
+ SurrogateMax = 0xDFFF,
+
Bad = Runeerror,
};
*/
c = *(uchar*)str;
if(c < Tx) {
- *rune = c;
+ *rune = (Rune)c;
return 1;
}
l = ((c << Bitx) | c1) & Rune2;
if(l <= Rune1)
goto bad;
- *rune = l;
+ *rune = (Rune)l;
return 2;
}
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
- *rune = l;
+ if (SurrogateMin <= l && l <= SurrogateMax)
+ goto bad;
+ *rune = (Rune)l;
return 3;
}
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
- if (l <= Rune3)
+ if (l <= Rune3 || l > Runemax)
goto bad;
- *rune = l;
+ *rune = (Rune)l;
return 4;
}
*/
c = *(uchar*)str;
if(c < Tx) {
- *rune = c;
+ *rune = (Rune)c;
return 1;
}
l = ((c << Bitx) | c1) & Rune2;
if(l <= Rune1)
goto bad;
- *rune = l;
+ *rune = (Rune)l;
return 2;
}
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
- *rune = l;
+ if (SurrogateMin <= l && l <= SurrogateMax)
+ goto bad;
+ *rune = (Rune)l;
return 3;
}
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
- if (l <= Rune3)
+ if (l <= Rune3 || l > Runemax)
goto bad;
- *rune = l;
+ *rune = (Rune)l;
return 4;
}
}
int
-isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
+isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed)
+{
*consumed = charntorune(rune, str, length);
return *rune != Runeerror || *consumed == 3;
}
*/
c = *rune;
if(c <= Rune1) {
- str[0] = c;
+ str[0] = (char)c;
return 1;
}
* 0080-07FF => T2 Tx
*/
if(c <= Rune2) {
- str[0] = T2 | (c >> 1*Bitx);
- str[1] = Tx | (c & Maskx);
+ str[0] = (char)(T2 | (c >> 1*Bitx));
+ str[1] = (char)(Tx | (c & Maskx));
return 2;
}
/*
- * If the Rune is out of range, convert it to the error rune.
+ * If the Rune is out of range or a surrogate half, convert it to the error rune.
* Do this test here because the error rune encodes to three bytes.
* Doing it earlier would duplicate work, since an out of range
* Rune wouldn't have fit in one or two bytes.
*/
if (c > Runemax)
c = Runeerror;
+ if (SurrogateMin <= c && c <= SurrogateMax)
+ c = Runeerror;
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
if (c <= Rune3) {
- str[0] = T3 | (c >> 2*Bitx);
- str[1] = Tx | ((c >> 1*Bitx) & Maskx);
- str[2] = Tx | (c & Maskx);
+ str[0] = (char)(T3 | (c >> 2*Bitx));
+ str[1] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
+ str[2] = (char)(Tx | (c & Maskx));
return 3;
}
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
- str[0] = T4 | (c >> 3*Bitx);
- str[1] = Tx | ((c >> 2*Bitx) & Maskx);
- str[2] = Tx | ((c >> 1*Bitx) & Maskx);
- str[3] = Tx | (c & Maskx);
+ str[0] = (char)(T4 | (c >> 3*Bitx));
+ str[1] = (char)(Tx | ((c >> 2*Bitx) & Maskx));
+ str[2] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
+ str[3] = (char)(Tx | (c & Maskx));
return 4;
}
nb = 0;
while(nrune--) {
- c = *r++;
+ c = (int)*r++;
if (c <= Rune1)
nb++;
else if (c <= Rune2)
/*
* The authors of this software are Rob Pike and Ken Thompson.
- * Copyright (c) 1998-2002 by Lucent Technologies.
- * Portions Copyright (c) 2009 The Go Authors. All rights reserved.
+ * Copyright (c) 1998-2002 by Lucent Technologies.
+ * Portions Copyright (c) 2009 The Go Authors. All rights reserved.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
- * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
- */
+ */
#ifndef _UTFH_
#define _UTFH_ 1
-// stdint.h content doesn't seem to be used in this file and doesn't exist on
-// Windows, therefore we comment it out here so that the code could be compiled
-// on Windows.
-//#include <stdint.h>
-
-typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
+typedef unsigned int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
enum
{
// n bytes of s. If the UTF sequence is incomplete within n bytes,
// charntorune will set *r to Runeerror and return 0. If it is complete
// but not in UTF format, it will set *r to Runeerror and return 1.
-//
+//
// Added 2004-09-24 by Wei-Hwa Huang
int charntorune(Rune* r, const char* s, int n);
// byte terminating a string is considered to be part of the string s.
// (cf. strchr)
-const char* utfrune(const char* s, Rune r);
+/*const*/ char* utfrune(const char* s, Rune r);
// utfrrune returns a pointer to the last occurrence of rune r in the
// byte terminating a string is considered to be part of the string s.
// (cf. strrchr)
-const char* utfrrune(const char* s, Rune r);
+/*const*/ char* utfrrune(const char* s, Rune r);
// utfutf returns a pointer to the first occurrence of the UTF string
// These functions are rune-string analogues of the corresponding
// functions in strcat (3).
-//
+//
// These routines first appeared in Plan 9.
// SEE ALSO
// memmove (3)
// isupperrune tests for upper case characters, including Unicode
// upper case letters and targets of the toupper mapping. islowerrune
-// and istitlerune are defined analogously.
-
+// and istitlerune are defined analogously.
+
int isupperrune(Rune r);
int islowerrune(Rune r);
int istitlerune(Rune r);
int isdigitrune(Rune r);
-// isideographicrune tests for ideographic characters and numbers, as
-// defined by the Unicode standard.
-
-int isideographicrune(Rune r);
-
-
// isspacerune tests for whitespace characters, including "C" locale
// whitespace, Unicode defined whitespace, and the "zero-width
// non-break space" character.