Add support for UTF-16BE and UTF-32BE
authorH. Peter Anvin <hpa@zytor.com>
Sat, 25 Feb 2012 23:29:37 +0000 (15:29 -0800)
committerH. Peter Anvin <hpa@zytor.com>
Sat, 25 Feb 2012 23:29:37 +0000 (15:29 -0800)
Add support for bigendian UTF-16 and UTF-32, and (for symmetry) add
explicitly littleendian operators.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
doc/nasmdoc.src
nasm.h
strfunc.c
test/utf.asm
tokens.dat

index 217c12a..3c912ee 100644 (file)
@@ -1596,9 +1596,12 @@ operands to \c{DW}, and so forth.
 
 \S{unicode} \I{UTF-16}\I{UTF-32}\i{Unicode} Strings
 
-The special operators \i\c{__utf16__} and \i\c{__utf32__} allows
-definition of Unicode strings.  They take a string in UTF-8 format and
-converts it to (littleendian) UTF-16 or UTF-32, respectively.
+The special operators \i\c{__utf16__}, \i\c{__utf16le__},
+\i\c{__utf16be__}, \i\c{__utf32__}, \i\c{__utf32le__} and
+\i\c{__utf32be__} allow definition of Unicode strings.  They take a
+string in UTF-8 format and convert it to UTF-16 or UTF-32,
+respectively.  Unless the \c{be} forms are specified, the output is
+littleendian.
 
 For example:
 
@@ -1608,9 +1611,9 @@ For example:
 \c       dw u('C:\WINDOWS'), 0       ; Pathname in UTF-16
 \c       dd w(`A + B = \u206a`), 0   ; String in UTF-32
 
-\c{__utf16__} and \c{__utf32__} can be applied either to strings
-passed to the \c{DB} family instructions, or to character constants in
-an expression context.
+The UTF operators can be applied either to strings passed to the
+\c{DB} family instructions, or to character constants in an expression
+context.
 
 \S{fltconst} \I{floating-point, constants}Floating-Point Constants
 
diff --git a/nasm.h b/nasm.h
index 46e4c05..5b4b5ff 100644 (file)
--- a/nasm.h
+++ b/nasm.h
@@ -224,7 +224,7 @@ enum token_type { /* token types, other than chars */
     TOKEN_SEG,          /* SEG */
     TOKEN_WRT,          /* WRT */
     TOKEN_FLOATIZE,     /* __floatX__ */
-    TOKEN_STRFUNC,      /* __utf16__, __utf32__ */
+    TOKEN_STRFUNC,      /* __utf16*__, __utf32*__ */
 };
 
 enum floatize {
@@ -241,7 +241,11 @@ enum floatize {
 /* Must match the order of str_transforms[] in string_transform(), in strfunc.c */
 enum strfunc {
     STRFUNC_UTF16,      /* slot 0: utf8_to_16le (littleendian by default) */
+    STRFUNC_UTF16LE,    /* slot 1: utf8_to_16le */
+    STRFUNC_UTF16BE,    /* slot 2: utf8_to_16be */
     STRFUNC_UTF32,      /* slot 3: utf8_to_32le (littleendian by default) */
+    STRFUNC_UTF32LE,    /* slot 4: utf8_to_32le */
+    STRFUNC_UTF32BE,    /* slot 5: utf8_to_32be */
 };
 
 size_t string_transform(char *, size_t, char **, enum strfunc);
index a34f738..4b5af40 100644 (file)
--- a/strfunc.c
+++ b/strfunc.c
@@ -111,6 +111,84 @@ static size_t utf8_to_16le(uint8_t *str, size_t len, char *op)
 }
 
 /*
+ * Convert a string in UTF-8 format to UTF-16BE
+ */
+static size_t utf8_to_16be(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x)                                 \
+    do {                                        \
+        uint16_t _y = (x);                      \
+        if (op) {                               \
+            WRITECHAR(op, _y >> 8);             \
+            WRITECHAR(op, _y);                  \
+        }                                       \
+        outlen++;                               \
+    } while (0)                                 \
+
+    size_t outlen = 0;
+    int expect = 0;
+    uint8_t c;
+    uint32_t v = 0, vmin = 0;
+
+    while (len--) {
+       c = *str++;
+
+       if (expect) {
+           if ((c & 0xc0) != 0x80) {
+               expect = 0;
+               return -1;
+           } else {
+               v = (v << 6) | (c & 0x3f);
+               if (!--expect) {
+                   if (v < vmin || v > 0x10ffff ||
+                       (v >= 0xd800 && v <= 0xdfff)) {
+                       return -1;
+                   } else if (v > 0xffff) {
+                       v -= 0x10000;
+                       EMIT(0xdc00 | (v & 0x3ff));
+                       EMIT(0xd800 | (v >> 10));
+                   } else {
+                       EMIT(v);
+                   }
+               }
+               continue;
+           }
+       }
+
+       if (c < 0x80) {
+           EMIT(c);
+       } else if (c < 0xc0 || c >= 0xfe) {
+           /* Invalid UTF-8 */
+           return -1;
+       } else if (c < 0xe0) {
+           v = c & 0x1f;
+           expect = 1;
+           vmin = 0x80;
+       } else if (c < 0xf0) {
+           v = c & 0x0f;
+           expect = 2;
+           vmin = 0x800;
+       } else if (c < 0xf8) {
+           v = c & 0x07;
+           expect = 3;
+           vmin = 0x10000;
+       } else if (c < 0xfc) {
+           v = c & 0x03;
+           expect = 4;
+           vmin = 0x200000;
+       } else {
+           v = c & 0x01;
+           expect = 5;
+           vmin = 0x4000000;
+       }
+    }
+
+    return expect ? (size_t)-1 : outlen << 1;
+
+#undef EMIT
+}
+
+/*
  * Convert a string in UTF-8 format to UTF-32LE
  */
 static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
@@ -174,6 +252,80 @@ static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
 #undef EMIT
 }
 
+/*
+ * Convert UTF-8 to UTF-32BE; returns output size in bytes, or (size_t)-1 on bad input
+ */
+static size_t utf8_to_32be(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x)                                         \
+    do {                                                \
+        uint32_t _y = (x);                              \
+        if (op) { /* NULL op: only count */             \
+            WRITECHAR(op,_y >> 24); /* high byte first */ \
+            WRITECHAR(op,_y >> 16);                     \
+            WRITECHAR(op,_y >> 8);                      \
+            WRITECHAR(op,_y);                           \
+        }                                               \
+        outlen++;                                       \
+    } while (0)
+
+    size_t outlen = 0;          /* number of 32-bit units produced */
+    int expect = 0;             /* continuation bytes still outstanding */
+    uint8_t c;
+    uint32_t v = 0, vmin = 0;   /* value so far; minimum legal value for this length */
+
+    while (len--) {
+       c = *str++;
+
+       if (expect) {
+           if ((c & 0xc0) != 0x80) {
+               return -1;          /* not a continuation byte; converts to (size_t)-1 */
+           } else {
+               v = (v << 6) | (c & 0x3f);
+               if (!--expect) {
+                   /* NOTE(review): no v > 0x10ffff check, unlike utf8_to_16be - confirm intended */
+                   if (v < vmin || (v >= 0xd800 && v <= 0xdfff)) {
+                       return -1;  /* overlong form or encoded surrogate */
+                   } else {
+                       EMIT(v);
+                   }
+               }
+               continue;
+           }
+       }
+
+       if (c < 0x80) {
+           EMIT(c);                /* plain ASCII */
+       } else if (c < 0xc0 || c >= 0xfe) {
+           /* Invalid UTF-8: stray continuation byte, or 0xfe/0xff */
+           return -1;
+       } else if (c < 0xe0) {
+           v = c & 0x1f;
+           expect = 1;
+           vmin = 0x80;            /* anything smaller is an overlong form */
+       } else if (c < 0xf0) {
+           v = c & 0x0f;
+           expect = 2;
+           vmin = 0x800;
+       } else if (c < 0xf8) {
+           v = c & 0x07;
+           expect = 3;
+           vmin = 0x10000;
+       } else if (c < 0xfc) {
+           v = c & 0x03;
+           expect = 4;
+           vmin = 0x200000;
+       } else {
+           v = c & 0x01;
+           expect = 5;
+           vmin = 0x4000000;
+       }
+    }
+
+    return expect ? (size_t)-1 : outlen << 2; /* truncated input, else 4 bytes/unit */
+
+#undef EMIT
+}
+
 typedef size_t (*transform_func)(uint8_t *, size_t, char *);
 
 /*
@@ -186,7 +338,11 @@ size_t string_transform(char *str, size_t len, char **out, enum strfunc func)
     /* This should match enum strfunc in nasm.h */
     static const transform_func str_transforms[] = {
        utf8_to_16le,
+       utf8_to_16le,
+       utf8_to_16be,
+       utf8_to_32le,
        utf8_to_32le,
+       utf8_to_32be,
     };
     transform_func transform = str_transforms[func];
     size_t outlen;
index 4b894f8..00207dc 100644 (file)
@@ -2,6 +2,10 @@
 ;Testname=error; Arguments=-fbin -outf.bin -DERROR; Files=stdout stderr utf.bin
 %define u(x) __utf16__(x)
 %define w(x) __utf32__(x)
+%define ul(x) __utf16le__(x)
+%define wl(x) __utf32le__(x)
+%define ub(x) __utf16be__(x)
+%define wb(x) __utf32be__(x)
 
        db `Test \u306a\U0001abcd\n`
        dw u(`Test \u306a\U0001abcd\n`)
        mov ebx,u(`\U0001abcd`)
        mov ecx,w(`\U0001abcd`)
 
+       db `Test \u306a\U0001abcd\n`
+       dw ul(`Test \u306a\U0001abcd\n`)
+       dd wl(`Test \u306a\U0001abcd\n`)
+
+       db `\u306a`
+       db `\xe3\x81\xaa`
+
+       dw __utf16le__ "Hello, World!"
+
+       nop
+
+       mov ax,ul(`a`)
+       mov bx,ul(`\u306a`)
+       mov cx,ul(`\xe3\x81\xaa`)
+       mov eax,ul(`ab`)
+       mov ebx,ul(`\U0001abcd`)
+       mov ecx,wl(`\U0001abcd`)
+       
+       db `Test \u306a\U0001abcd\n`
+       dw ub(`Test \u306a\U0001abcd\n`)
+       dd wb(`Test \u306a\U0001abcd\n`)
+
+       db `\u306a`
+       db `\xe3\x81\xaa`
+
+       dw __utf16be__ "Hello, World!"
+
+       nop
+
+       mov ax,ub(`a`)
+       mov bx,ub(`\u306a`)
+       mov cx,ub(`\xe3\x81\xaa`)
+       mov eax,ub(`ab`)
+       mov ebx,ub(`\U0001abcd`)
+       mov ecx,wb(`\U0001abcd`)
+
 %ifdef ERROR
        dw __utf16__ 33
        dw __utf16__, 46
        dw __utf16__("Hello, World!",16)
        dw __utf16__("Hello, World!",16
        dw u(`\xff`)
+
+       dw __utf16le__ 33
+       dw __utf16le__, 46
+       dw __utf16le__("Hello, World!",16)
+       dw __utf16le__("Hello, World!",16
+       dw ul(`\xff`)
+
+       dw __utf16be__ 33
+       dw __utf16be__, 46
+       dw __utf16be__("Hello, World!",16)
+       dw __utf16be__("Hello, World!",16
+       dw ub(`\xff`)
 %endif
index 25179fa..bb5fccb 100644 (file)
@@ -91,7 +91,11 @@ __float128h__
 
 % TOKEN_STRFUNC, 0, STRFUNC_{__*__}
 __utf16__
+__utf16le__
+__utf16be__
 __utf32__
+__utf32le__
+__utf32be__
 
 % TOKEN_*, 0, 0
 seg