Add support for UTF-16BE and UTF-32BE

author H. Peter Anvin <hpa@zytor.com>

Sat, 25 Feb 2012 23:29:37 +0000 (15:29 -0800)

committer H. Peter Anvin <hpa@zytor.com>

Sat, 25 Feb 2012 23:29:37 +0000 (15:29 -0800)
author H. Peter Anvin <hpa@zytor.com>
Sat, 25 Feb 2012 23:29:37 +0000 (15:29 -0800)
committer H. Peter Anvin <hpa@zytor.com>
Sat, 25 Feb 2012 23:29:37 +0000 (15:29 -0800)
diff --git a/doc/nasmdoc.src b/doc/nasmdoc.src

index 217c12a..3c912ee 100644 (file)
--- a/doc/nasmdoc.src
+++ b/doc/nasmdoc.src
@@ -1596,9 +1596,12 @@ operands to \c{DW}, and so forth.
  
  \S{unicode} \I{UTF-16}\I{UTF-32}\i{Unicode} Strings
  
-The special operators \i\c{__utf16__} and \i\c{__utf32__} allows
-definition of Unicode strings.  They take a string in UTF-8 format and
-converts it to (littleendian) UTF-16 or UTF-32, respectively.
+The special operators \i\c{__utf16__}, \i\c{__utf16le__},
+\i\c{__utf16be__}, \i\c{__utf32__}, \i\c{__utf32le__} and
+\i\c{__utf32be__} allow definition of Unicode strings.  They take a
+string in UTF-8 format and convert it to UTF-16 or UTF-32,
+respectively.  Unless the \c{be} forms are specified, the output is
+little-endian.
  
  For example:
  
@@ -1608,9 +1611,9 @@ For example:
  \c       dw u('C:\WINDOWS'), 0       ; Pathname in UTF-16
  \c       dd w(`A + B = \u206a`), 0   ; String in UTF-32
  
-\c{__utf16__} and \c{__utf32__} can be applied either to strings
-passed to the \c{DB} family instructions, or to character constants in
-an expression context.
+The UTF operators can be applied either to strings passed to the
+\c{DB} family instructions, or to character constants in an expression
+context.
  
  \S{fltconst} \I{floating-point, constants}Floating-Point Constants
  
diff --git a/nasm.h b/nasm.h

index 46e4c05..5b4b5ff 100644 (file)
--- a/nasm.h
+++ b/nasm.h
@@ -224,7 +224,7 @@ enum token_type { /* token types, other than chars */
      TOKEN_SEG,          /* SEG */
      TOKEN_WRT,          /* WRT */
      TOKEN_FLOATIZE,     /* __floatX__ */
-    TOKEN_STRFUNC,      /* __utf16__, __utf32__ */
+    TOKEN_STRFUNC,      /* __utf16*__, __utf32*__ */
  };
  
  enum floatize {
@@ -241,7 +241,11 @@ enum floatize {
  /* Must match the list in string_transform(), in strfunc.c */
  enum strfunc {
      STRFUNC_UTF16,
+    STRFUNC_UTF16LE,    /* __utf16le__: same transform as STRFUNC_UTF16 */
+    STRFUNC_UTF16BE,    /* __utf16be__: UTF-16, big-endian */
      STRFUNC_UTF32,
+    STRFUNC_UTF32LE,    /* __utf32le__: same transform as STRFUNC_UTF32 */
+    STRFUNC_UTF32BE,    /* __utf32be__: UTF-32, big-endian */
  };
  
  size_t string_transform(char *, size_t, char **, enum strfunc);
diff --git a/strfunc.c b/strfunc.c

index a34f738..4b5af40 100644 (file)
--- a/strfunc.c
+++ b/strfunc.c
@@ -111,6 +111,84 @@ static size_t utf8_to_16le(uint8_t *str, size_t len, char *op)
  }
  
  /*
+ * Convert a string in UTF-8 format to UTF-16BE
+ */
+static size_t utf8_to_16be(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x)                                 \
+    do {                                        \
+        uint16_t _y = (x);                      \
+        if (op) {                               \
+            WRITECHAR(op, _y >> 8);             \
+            WRITECHAR(op, _y);                  \
+        }                                       \
+        outlen++;                               \
+    } while (0)                                 \
+
+    size_t outlen = 0;
+    int expect = 0;
+    uint8_t c;
+    uint32_t v = 0, vmin = 0;
+
+    while (len--) {
+       c = *str++;
+
+       if (expect) {
+           if ((c & 0xc0) != 0x80) {
+               expect = 0;
+               return -1;
+           } else {
+               v = (v << 6) | (c & 0x3f);
+               if (!--expect) {
+                   if (v < vmin || v > 0x10ffff ||
+                       (v >= 0xd800 && v <= 0xdfff)) {
+                       return -1;
+                   } else if (v > 0xffff) {
+                       v -= 0x10000;
+                       EMIT(0xdc00 | (v & 0x3ff));
+                       EMIT(0xd800 | (v >> 10));
+                   } else {
+                       EMIT(v);
+                   }
+               }
+               continue;
+           }
+       }
+
+       if (c < 0x80) {
+           EMIT(c);
+       } else if (c < 0xc0 || c >= 0xfe) {
+           /* Invalid UTF-8 */
+           return -1;
+       } else if (c < 0xe0) {
+           v = c & 0x1f;
+           expect = 1;
+           vmin = 0x80;
+       } else if (c < 0xf0) {
+           v = c & 0x0f;
+           expect = 2;
+           vmin = 0x800;
+       } else if (c < 0xf8) {
+           v = c & 0x07;
+           expect = 3;
+           vmin = 0x10000;
+       } else if (c < 0xfc) {
+           v = c & 0x03;
+           expect = 4;
+           vmin = 0x200000;
+       } else {
+           v = c & 0x01;
+           expect = 5;
+           vmin = 0x4000000;
+       }
+    }
+
+    return expect ? (size_t)-1 : outlen << 1;
+
+#undef EMIT
+}
+
+/*
   * Convert a string in UTF-8 format to UTF-32LE
   */
  static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
@@ -174,6 +252,80 @@ static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
  #undef EMIT
  }
  
+/*
+ * UTF-8 to UTF-32BE conversion
+ *
+ * Each decoded code point is counted and, when op is non-NULL, handed
+ * to WRITECHAR as four bytes, most significant byte first.  With a
+ * NULL op only the counting is done.
+ *
+ * Returns four times the number of code points decoded, or (size_t)-1
+ * when the input has a misplaced or missing continuation byte, an
+ * overlong encoding, an encoded surrogate, a 0xfe/0xff byte, or ends
+ * in the middle of a sequence.  Lead bytes introducing 5- and 6-byte
+ * sequences are accepted, and no upper limit is placed on the value.
+ */
+static size_t utf8_to_32be(uint8_t *str, size_t len, char *op)
+{
+#define PUT_BE32(x)                                     \
+    do {                                                \
+        uint32_t _w = (x);                              \
+        if (op) {                                       \
+            WRITECHAR(op, _w >> 24);                    \
+            WRITECHAR(op, _w >> 16);                    \
+            WRITECHAR(op, _w >> 8);                     \
+            WRITECHAR(op, _w);                          \
+        }                                               \
+        count++;                                        \
+    } while (0)
+
+    /* Smallest value that needs N continuation bytes, indexed by N */
+    static const uint32_t min_value[6] = {
+        0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
+    };
+
+    size_t count = 0;           /* code points decoded so far */
+    size_t i;
+    int pending = 0;            /* continuation bytes still owed */
+    uint32_t cp = 0;            /* code point under construction */
+    uint32_t lowest = 0;        /* overlong-encoding threshold for cp */
+
+    for (i = 0; i < len; i++) {
+        uint8_t b = str[i];
+
+        if (pending) {
+            if ((b & 0xc0) != 0x80)
+                return (size_t)-1;
+
+            cp = (cp << 6) | (b & 0x3f);
+            if (--pending)
+                continue;       /* sequence not complete yet */
+
+            if (cp < lowest || (cp >= 0xd800 && cp <= 0xdfff))
+                return (size_t)-1;
+
+            PUT_BE32(cp);
+            continue;
+        }
+
+        if (b < 0x80) {
+            PUT_BE32(b);
+            continue;
+        }
+
+        if (b < 0xc0 || b >= 0xfe)
+            return (size_t)-1;  /* Invalid UTF-8 */
+
+        /* Lead byte: work out how many continuation bytes follow */
+        if (b < 0xe0)
+            pending = 1;
+        else if (b < 0xf0)
+            pending = 2;
+        else if (b < 0xf8)
+            pending = 3;
+        else if (b < 0xfc)
+            pending = 4;
+        else
+            pending = 5;
+
+        /* Payload bits in the lead byte shrink by one per extra byte */
+        cp = b & (0x3f >> pending);
+        lowest = min_value[pending];
+    }
+
+    return pending ? (size_t)-1 : count << 2;
+
+#undef PUT_BE32
+}
+
  typedef size_t (*transform_func)(uint8_t *, size_t, char *);
  
  /*
@@ -186,7 +338,11 @@ size_t string_transform(char *str, size_t len, char **out, enum strfunc func)
      /* This should match enum strfunc in nasm.h */
      static const transform_func str_transforms[] = {
         utf8_to_16le,
+       utf8_to_16le,
+       utf8_to_16be,
+       utf8_to_32le,
         utf8_to_32le,
+       utf8_to_32be,
      };
      transform_func transform = str_transforms[func];
      size_t outlen;
diff --git a/test/utf.asm b/test/utf.asm

index 4b894f8..00207dc 100644 (file)
--- a/test/utf.asm
+++ b/test/utf.asm
@@ -2,6 +2,10 @@
  ;Testname=error; Arguments=-fbin -outf.bin -DERROR; Files=stdout stderr utf.bin
  %define u(x) __utf16__(x)
  %define w(x) __utf32__(x)
+%define ul(x) __utf16le__(x)
+%define wl(x) __utf32le__(x)
+%define ub(x) __utf16be__(x)
+%define wb(x) __utf32be__(x)
  
         db `Test \u306a\U0001abcd\n`
         dw u(`Test \u306a\U0001abcd\n`)
@@ -21,10 +25,58 @@
         mov ebx,u(`\U0001abcd`)
         mov ecx,w(`\U0001abcd`)
  
+       db `Test \u306a\U0001abcd\n`
+       dw ul(`Test \u306a\U0001abcd\n`)
+       dd wl(`Test \u306a\U0001abcd\n`)
+
+       db `\u306a`
+       db `\xe3\x81\xaa`
+
+       dw __utf16le__ "Hello, World!"
+
+       nop
+
+       mov ax,ul(`a`)
+       mov bx,ul(`\u306a`)
+       mov cx,ul(`\xe3\x81\xaa`)
+       mov eax,ul(`ab`)
+       mov ebx,ul(`\U0001abcd`)
+       mov ecx,wl(`\U0001abcd`)
+       
+       db `Test \u306a\U0001abcd\n`
+       dw ub(`Test \u306a\U0001abcd\n`)
+       dd wb(`Test \u306a\U0001abcd\n`)
+
+       db `\u306a`
+       db `\xe3\x81\xaa`
+
+       dw __utf16be__ "Hello, World!"
+
+       nop
+
+       mov ax,ub(`a`)
+       mov bx,ub(`\u306a`)
+       mov cx,ub(`\xe3\x81\xaa`)
+       mov eax,ub(`ab`)
+       mov ebx,ub(`\U0001abcd`)
+       mov ecx,wb(`\U0001abcd`)
+
  %ifdef ERROR
         dw __utf16__ 33
         dw __utf16__, 46
         dw __utf16__("Hello, World!",16)
         dw __utf16__("Hello, World!",16
         dw u(`\xff`)
+
+       dw __utf16le__ 33
+       dw __utf16le__, 46
+       dw __utf16le__("Hello, World!",16)
+       dw __utf16le__("Hello, World!",16
+       dw ul(`\xff`)
+
+       dw __utf16be__ 33
+       dw __utf16be__, 46
+       dw __utf16be__("Hello, World!",16)
+       dw __utf16be__("Hello, World!",16
+       dw ub(`\xff`)
  %endif
diff --git a/tokens.dat b/tokens.dat

index 25179fa..bb5fccb 100644 (file)
--- a/tokens.dat
+++ b/tokens.dat
@@ -91,7 +91,11 @@ __float128h__
  
  % TOKEN_STRFUNC, 0, STRFUNC_{__*__}
  __utf16__
+__utf16le__
+__utf16be__
  __utf32__
+__utf32le__
+__utf32be__
  
  % TOKEN_*, 0, 0
  seg
author	H. Peter Anvin <hpa@zytor.com>
	Sat, 25 Feb 2012 23:29:37 +0000 (15:29 -0800)
committer	H. Peter Anvin <hpa@zytor.com>
	Sat, 25 Feb 2012 23:29:37 +0000 (15:29 -0800)
doc/nasmdoc.src		patch \| blob \| history
nasm.h		patch \| blob \| history
strfunc.c		patch \| blob \| history
test/utf.asm		patch \| blob \| history
tokens.dat		patch \| blob \| history