Implement __utf16__() and __utf32__() for the DB family
author H. Peter Anvin <hpa@zytor.com>
Sat, 14 Jun 2008 23:53:48 +0000 (16:53 -0700)
committer H. Peter Anvin <hpa@zytor.com>
Sat, 14 Jun 2008 23:53:48 +0000 (16:53 -0700)
Implement __utf16__() and __utf32__() for the DB family of
pseudo-instructions.  Not yet implemented for evaluation context.

Makefile.in
Mkfiles/msvc.mak
Mkfiles/netware.mak
Mkfiles/openwcom.mak
Mkfiles/owlinux.mak
assemble.c
nasm.h
parser.c
strfunc.c [new file with mode: 0644]
tokens.dat

index d207acc..7d32d06 100644 (file)
@@ -67,8 +67,8 @@ NASM =        nasm.$(O) nasmlib.$(O) raa.$(O) saa.$(O) \
        output/outobj.$(O) output/outas86.$(O) output/outrdf2.$(O) \
        output/outdbg.$(O) output/outieee.$(O) output/outmacho.$(O) \
        preproc.$(O) quote.$(O) pptok.$(O) macros.$(O) \
-       listing.$(O) eval.$(O) exprlib.$(O) stdscan.$(O) tokhash.$(O) \
-       regvals.$(O) regflags.$(O)
+       listing.$(O) eval.$(O) exprlib.$(O) stdscan.$(O) strfunc.$(O) \
+       tokhash.$(O) regvals.$(O) regflags.$(O)
 
 NDISASM = ndisasm.$(O) disasm.$(O) sync.$(O) nasmlib.$(O) \
        insnsd.$(O) insnsb.$(O) insnsn.$(O) regs.$(O) regdis.$(O)
@@ -234,7 +234,7 @@ alldeps: perlreq
 #-- Everything below is generated by mkdep.pl - do not edit --#
 assemble.$(O): assemble.c assemble.h compiler.h config.h insns.h insnsi.h \
  nasm.h nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h config.h
+crc64.$(O): crc64.c compiler.h config.h nasmlib.h
 disasm.$(O): disasm.c compiler.h config.h disasm.h insns.h insnsi.h nasm.h \
  nasmlib.h regdis.h regs.h sync.h tables.h tokens.h version.h
 eval.$(O): eval.c compiler.h config.h eval.h float.h insnsi.h labels.h \
@@ -309,6 +309,8 @@ regvals.$(O): regvals.c compiler.h config.h insnsi.h tables.h
 saa.$(O): saa.c compiler.h config.h nasmlib.h saa.h
 stdscan.$(O): stdscan.c compiler.h config.h insns.h insnsi.h nasm.h \
  nasmlib.h quote.h regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
 sync.$(O): sync.c compiler.h config.h nasmlib.h sync.h
 tokhash.$(O): tokhash.c compiler.h config.h hashtbl.h insns.h insnsi.h \
  nasm.h nasmlib.h regs.h tokens.h version.h
index c2904d8..04188bf 100644 (file)
@@ -180,7 +180,7 @@ everything: all doc rdf
 #-- Everything below is generated by mkdep.pl - do not edit --#
 assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h
+crc64.$(O): crc64.c compiler.h nasmlib.h
 disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h \
  regdis.h regs.h sync.h tables.h tokens.h version.h
 eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h \
@@ -253,6 +253,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
 saa.$(O): saa.c compiler.h nasmlib.h saa.h
 stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h \
  regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
 sync.$(O): sync.c compiler.h nasmlib.h sync.h
 tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h tokens.h version.h
index f4ec46b..c1a970f 100644 (file)
@@ -120,7 +120,7 @@ $(OBJDIR)/version.inc: $(PROOT)/version $(PROOT)/version.pl $(OBJDIR)
 #-- Everything below is generated by mkdep.pl - do not edit --#
 assemble.o: assemble.c assemble.h compiler.h config.h insns.h insnsi.h \
  nasm.h nasmlib.h regs.h tables.h tokens.h version.h
-crc64.o: crc64.c compiler.h config.h
+crc64.o: crc64.c compiler.h config.h nasmlib.h
 disasm.o: disasm.c compiler.h config.h disasm.h insns.h insnsi.h nasm.h \
  nasmlib.h regdis.h regs.h sync.h tables.h tokens.h version.h
 eval.o: eval.c compiler.h config.h eval.h float.h insnsi.h labels.h nasm.h \
@@ -193,6 +193,8 @@ regvals.o: regvals.c compiler.h config.h insnsi.h tables.h
 saa.o: saa.c compiler.h config.h nasmlib.h saa.h
 stdscan.o: stdscan.c compiler.h config.h insns.h insnsi.h nasm.h nasmlib.h \
  quote.h regs.h stdscan.h tokens.h version.h
+strfunc.o: strfunc.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
 sync.o: sync.c compiler.h config.h nasmlib.h sync.h
 tokhash.o: tokhash.c compiler.h config.h hashtbl.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h tokens.h version.h
index fb95f57..aeb42c5 100644 (file)
@@ -209,7 +209,7 @@ everything: all doc rdf
 #-- Everything below is generated by mkdep.pl - do not edit --#
 assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h &
  nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h
+crc64.$(O): crc64.c compiler.h nasmlib.h
 disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h &
  regdis.h regs.h sync.h tables.h tokens.h version.h
 eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h &
@@ -282,6 +282,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
 saa.$(O): saa.c compiler.h nasmlib.h saa.h
 stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h &
  regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h &
+ version.h
 sync.$(O): sync.c compiler.h nasmlib.h sync.h
 tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h &
  nasmlib.h regs.h tokens.h version.h
index bb10d9e..ce4dc6d 100644 (file)
@@ -219,7 +219,7 @@ everything: all doc rdf
 #-- Everything below is generated by mkdep.pl - do not edit --#
 assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h tables.h tokens.h version.h
-crc64.$(O): crc64.c compiler.h
+crc64.$(O): crc64.c compiler.h nasmlib.h
 disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h \
  regdis.h regs.h sync.h tables.h tokens.h version.h
 eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h \
@@ -292,6 +292,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
 saa.$(O): saa.c compiler.h nasmlib.h saa.h
 stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h \
  regs.h stdscan.h tokens.h version.h
+strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h \
+ version.h
 sync.$(O): sync.c compiler.h nasmlib.h sync.h
 tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h \
  nasmlib.h regs.h tokens.h version.h
index 442ed2a..7ab53ad 100644 (file)
@@ -335,7 +335,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
                         out(offset, segment, &e->offset,
                             OUT_ADDRESS, wsize, e->segment, e->wrt);
                     offset += wsize;
-                } else if (e->type == EOT_DB_STRING) {
+                } else if (e->type == EOT_DB_STRING ||
+                          e->type == EOT_DB_STRING_FREE) {
                     int align;
 
                     out(offset, segment, e->stringval,
@@ -348,6 +349,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
                             OUT_RAWDATA, align, NO_SEG, NO_SEG);
                     }
                     offset += e->stringlen + align;
+                   if (e->type == EOT_DB_STRING_FREE)
+                       nasm_free(e->stringval);
                 }
             }
             if (t > 0 && t == instruction->times - 1) {
@@ -365,15 +368,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
     }
 
     if (instruction->opcode == I_INCBIN) {
-        static char fname[FILENAME_MAX];
+        const char *fname = instruction->eops->stringval;
         FILE *fp;
-        int32_t len;
-
-        len = FILENAME_MAX - 1;
-        if (len > instruction->eops->stringlen)
-            len = instruction->eops->stringlen;
-        strncpy(fname, instruction->eops->stringval, len);
-        fname[len] = '\0';
 
        fp = fopen(fname, "rb");
        if (!fp) {
@@ -383,17 +379,18 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
             error(ERR_NONFATAL, "`incbin': unable to seek on file `%s'",
                   fname);
        } else {
-            static char buf[2048];
-            int32_t t = instruction->times;
-            int32_t base = 0;
+            static char buf[4096];
+            size_t t = instruction->times;
+            size_t base = 0;
+           size_t len;
 
             len = ftell(fp);
             if (instruction->eops->next) {
                 base = instruction->eops->next->offset;
                 len -= base;
                 if (instruction->eops->next->next &&
-                    len > instruction->eops->next->next->offset)
-                    len = instruction->eops->next->next->offset;
+                    len > (size_t)instruction->eops->next->next->offset)
+                    len = (size_t)instruction->eops->next->next->offset;
             }
             /*
              * Dummy call to list->output to give the offset to the
@@ -402,7 +399,7 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
             list->output(offset, NULL, OUT_RAWDATA, 0);
             list->uplevel(LIST_INCBIN);
             while (t--) {
-                int32_t l;
+                size_t l;
 
                 fseek(fp, base, SEEK_SET);
                 l = len;
@@ -660,7 +657,8 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
             osize = 0;
             if (e->type == EOT_DB_NUMBER)
                 osize = 1;
-            else if (e->type == EOT_DB_STRING)
+            else if (e->type == EOT_DB_STRING ||
+                    e->type == EOT_DB_STRING_FREE)
                 osize = e->stringlen;
 
             align = (-osize) % wsize;
@@ -672,16 +670,10 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
     }
 
     if (instruction->opcode == I_INCBIN) {
-        char fname[FILENAME_MAX];
+       const char *fname = instruction->eops->stringval;
         FILE *fp;
-        int32_t len;
-
-        len = FILENAME_MAX - 1;
-        if (len > instruction->eops->stringlen)
-            len = instruction->eops->stringlen;
-        strncpy(fname, instruction->eops->stringval, len);
-        fname[len] = '\0';
-       
+        size_t len;
+
        fp = fopen(fname, "rb");
        if (!fp)
             error(ERR_NONFATAL, "`incbin': unable to open file `%s'",
@@ -695,8 +687,8 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
             if (instruction->eops->next) {
                 len -= instruction->eops->next->offset;
                 if (instruction->eops->next->next &&
-                    len > instruction->eops->next->next->offset) {
-                    len = instruction->eops->next->next->offset;
+                    len > (size_t)instruction->eops->next->next->offset) {
+                    len = (size_t)instruction->eops->next->next->offset;
                 }
             }
             return instruction->times * len;
diff --git a/nasm.h b/nasm.h
index fedf858..ec44f16 100644 (file)
--- a/nasm.h
+++ b/nasm.h
@@ -182,6 +182,7 @@ enum token_type {           /* token types, other than chars */
     TOKEN_DBL_AND, TOKEN_DBL_OR, TOKEN_DBL_XOR, /* &&, || and ^^ */
     TOKEN_SEG, TOKEN_WRT,       /* SEG and WRT */
     TOKEN_FLOATIZE,            /* __floatX__ */
+    TOKEN_STRFUNC,             /* __utf16__, __utf32__ */
 };
 
 enum floatize {
@@ -195,6 +196,14 @@ enum floatize {
     FLOAT_128H,
 };
 
+/* Must match the order of str_transforms[] in string_transform(), in strfunc.c */
+enum strfunc {
+    STRFUNC_UTF16,             /* __utf16__: UTF-8 to UTF-16LE */
+    STRFUNC_UTF32,             /* __utf32__: UTF-8 to UTF-32LE */
+};
+
+size_t string_transform(char *, size_t, char **, enum strfunc);
+
 /*
  * The expression evaluator must be passed a scanner function; a
  * standard scanner is provided as part of nasmlib.c. The
@@ -605,11 +614,14 @@ enum prefixes {                   /* instruction prefixes */
     PREFIX_ENUM_LIMIT
 };
 
-enum {                          /* extended operand types */
-    EOT_NOTHING, EOT_DB_STRING, EOT_DB_NUMBER
+enum extop_type {              /* extended operand types */
+    EOT_NOTHING,
+    EOT_DB_STRING,             /* Byte string */
+    EOT_DB_STRING_FREE,                /* Byte string which should be nasm_free'd*/
+    EOT_DB_NUMBER,             /* Integer */
 };
 
-enum {                          /* special EA flags */
+enum ea_flags {                        /* special EA flags */
     EAF_BYTEOFFS =  1,          /* force offset part to byte size */
     EAF_WORDOFFS =  2,          /* force offset part to [d]word size */
     EAF_TIMESTWO =  4,          /* really do EAX*2 not EAX+EAX */
@@ -643,12 +655,12 @@ typedef struct operand {  /* operand to an instruction */
 
 typedef struct extop {          /* extended operand */
     struct extop *next;         /* linked list */
-    int32_t type;               /* defined above */
-    char *stringval;          /* if it's a string, then here it is */
-    int stringlen;              /* ... and here's how long it is */
-    int32_t segment;            /* if it's a number/address, then... */
+    char *stringval;           /* if it's a string, then here it is */
+    size_t stringlen;           /* ... and here's how long it is */
     int64_t offset;             /* ... it's given here ... */
+    int32_t segment;            /* if it's a number/address, then... */
     int32_t wrt;                /* ... and here */
+    enum extop_type type;      /* defined above */
 } extop;
 
 /* Prefix positions: each type of prefix goes in a specific slot.
index 6fb7e3c..caff1b1 100644 (file)
--- a/parser.c
+++ b/parser.c
@@ -334,6 +334,7 @@ restart_parse:
        result->opcode == I_DY || result->opcode == I_INCBIN) {
         extop *eop, **tail = &result->eops, **fixptr;
         int oper_num = 0;
+       int32_t sign;
 
         result->eops_float = false;
 
@@ -355,85 +356,114 @@ restart_parse:
             eop->next = NULL;
             eop->type = EOT_NOTHING;
             oper_num++;
+           sign = +1;
 
+           /* is_comma_next() here is to distinguish this from
+              a string used as part of an expression... */
             if (i == TOKEN_STR && is_comma_next()) {
                 eop->type = EOT_DB_STRING;
                 eop->stringval = tokval.t_charptr;
                 eop->stringlen = tokval.t_inttwo;
                 i = stdscan(NULL, &tokval);     /* eat the comma */
-                continue;
-            }
-
-            if ((i == TOKEN_FLOAT && is_comma_next())
-               || i == '-' || i == '+') {
-                int32_t sign = +1;
-
-                if (i == '+' || i == '-') {
-                    char *save = stdscan_bufptr;
-                   int token = i;
-                   sign = (i == '-') ? -1 : 1;
-                    i = stdscan(NULL, &tokval);
-                    if (i != TOKEN_FLOAT || !is_comma_next()) {
-                        stdscan_bufptr = save;
-                        i = tokval.t_type = token;
-                    }
-                }
-
-                if (i == TOKEN_FLOAT) {
-                    eop->type = EOT_DB_STRING;
-                    result->eops_float = true;
-                   switch (result->opcode) {
-                   case I_DB:
-                       eop->stringlen = 1;
-                       break;
-                   case I_DW:
-                       eop->stringlen = 2;
-                       break;
-                   case I_DD:
-                        eop->stringlen = 4;
-                       break;
-                   case I_DQ:
-                        eop->stringlen = 8;
-                       break;
-                   case I_DT:
-                        eop->stringlen = 10;
-                       break;
-                   case I_DO:
-                        eop->stringlen = 16;
-                       break;
-                   case I_DY:
-                        error(ERR_NONFATAL, "floating-point constant"
-                              " encountered in DY instruction");
-                       eop->stringlen = 0;
-                       break;
-                   default:
-                        error(ERR_NONFATAL, "floating-point constant"
-                              " encountered in unknown instruction");
-                        /*
-                         * fix suggested by Pedro Gimeno... original line
-                         * was:
-                         * eop->type = EOT_NOTHING;
-                         */
-                        eop->stringlen = 0;
-                       break;
-                    }
-                    eop = nasm_realloc(eop, sizeof(extop) + eop->stringlen);
-                    tail = &eop->next;
-                    *fixptr = eop;
-                    eop->stringval = (char *)eop + sizeof(extop);
-                    if (!eop->stringlen ||
-                        !float_const(tokval.t_charptr, sign,
-                                     (uint8_t *)eop->stringval,
-                                     eop->stringlen, error))
-                        eop->type = EOT_NOTHING;
-                    i = stdscan(NULL, &tokval); /* eat the comma */
-                    continue;
-                }
-            }
-
-            /* anything else */
-            {
+           } else if (i == TOKEN_STRFUNC) {
+               bool parens = false;
+               const char *funcname = tokval.t_charptr;
+               enum strfunc func = tokval.t_integer;
+               i = stdscan(NULL, &tokval);
+               if (i == '(') {
+                   parens = true;
+                   i = stdscan(NULL, &tokval);
+               }
+               if (i != TOKEN_STR) {
+                   error(ERR_NONFATAL,
+                         "%s must be followed by a string constant",
+                         funcname);
+                       eop->type = EOT_NOTHING;
+               } else {
+                   eop->type = EOT_DB_STRING_FREE;
+                   eop->stringlen =
+                       string_transform(tokval.t_charptr, tokval.t_inttwo,
+                                        &eop->stringval, func);
+                   if (eop->stringlen == (size_t)-1) {
+                       error(ERR_NONFATAL, "invalid string for transform");
+                       eop->type = EOT_NOTHING;
+                   }
+               }
+               if (parens && i && i != ')') {
+                   i = stdscan(NULL, &tokval);
+                   if (i != ')') {
+                       error(ERR_NONFATAL, "unterminated %s function",
+                             funcname);
+                   }
+               }
+               if (i && i != ',')
+                   i = stdscan(NULL, &tokval);
+           } else if (i == '-' || i == '+') {
+               char *save = stdscan_bufptr;
+               int token = i;
+               sign = (i == '-') ? -1 : 1;
+               i = stdscan(NULL, &tokval);
+               if (i != TOKEN_FLOAT) {
+                   stdscan_bufptr = save;
+                   i = tokval.t_type = token;
+                   goto is_expression;
+               } else {
+                   goto is_float;
+               }
+            } else if (i == TOKEN_FLOAT) {
+           is_float:
+               eop->type = EOT_DB_STRING;
+               result->eops_float = true;
+               switch (result->opcode) {
+               case I_DB:
+                   eop->stringlen = 1;
+                   break;
+               case I_DW:
+                   eop->stringlen = 2;
+                   break;
+               case I_DD:
+                   eop->stringlen = 4;
+                   break;
+               case I_DQ:
+                   eop->stringlen = 8;
+                   break;
+               case I_DT:
+                   eop->stringlen = 10;
+                   break;
+               case I_DO:
+                   eop->stringlen = 16;
+                   break;
+               case I_DY:
+                   error(ERR_NONFATAL, "floating-point constant"
+                         " encountered in DY instruction");
+                   eop->stringlen = 0;
+                   break;
+               default:
+                   error(ERR_NONFATAL, "floating-point constant"
+                         " encountered in unknown instruction");
+                   /*
+                    * fix suggested by Pedro Gimeno... original line
+                    * was:
+                    * eop->type = EOT_NOTHING;
+                    */
+                   eop->stringlen = 0;
+                   break;
+               }
+               eop = nasm_realloc(eop, sizeof(extop) + eop->stringlen);
+               tail = &eop->next;
+               *fixptr = eop;
+               eop->stringval = (char *)eop + sizeof(extop);
+               if (!eop->stringlen ||
+                   !float_const(tokval.t_charptr, sign,
+                                (uint8_t *)eop->stringval,
+                                eop->stringlen, error))
+                   eop->type = EOT_NOTHING;
+               i = stdscan(NULL, &tokval); /* eat the comma */
+           } else {
+               /* anything else, assume it is an expression */
                 expr *value;
+
+           is_expression:
                 value = evaluate(stdscan, NULL, &tokval, NULL,
                                  critical, error, NULL);
                 i = tokval.t_type;
diff --git a/strfunc.c b/strfunc.c
new file mode 100644 (file)
index 0000000..9fb7270
--- /dev/null
+++ b/strfunc.c
@@ -0,0 +1,167 @@
+/*
+ * strfunc.c
+ *
+ * String transformation functions
+ */
+
+#include "nasmlib.h"
+#include "nasm.h"
+
+/*
+ * Convert a string in UTF-8 format to UTF-16LE.  When op is NULL only
+ * the size is computed.  Returns the output size in bytes, or
+ * (size_t)-1 on malformed UTF-8 or a value UTF-16 cannot encode.
+ */
+static size_t utf8_to_16le(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x) do { if (op) { WRITESHORT(op,x); } outlen++; } while(0)
+
+    size_t outlen = 0;          /* 16-bit units emitted so far */
+    int expect = 0;             /* continuation bytes still expected */
+    uint8_t c;
+    uint32_t v = 0, vmin = 0;   /* value in progress; its minimum */
+
+    while (len--) {
+        c = *str++;
+
+        if (expect) {
+            if ((c & 0xc0) != 0x80)
+                return (size_t)-1;      /* not a continuation byte */
+            v = (v << 6) | (c & 0x3f);
+            if (!--expect) {
+                /* Reject overlong forms, > 0x10ffff, and surrogates */
+                if (v < vmin || v > 0x10ffff ||
+                    (v >= 0xd800 && v <= 0xdfff)) {
+                    return (size_t)-1;
+                } else if (v > 0xffff) {
+                    v -= 0x10000;       /* emit as a surrogate pair */
+                    EMIT(0xd800 | (v >> 10));
+                    EMIT(0xdc00 | (v & 0x3ff));
+                } else {
+                    EMIT(v);
+                }
+            }
+            continue;
+        }
+
+        if (c < 0x80) {
+            EMIT(c);
+        } else if (c < 0xc0 || c >= 0xfe) {
+            /* Stray continuation byte (0x80..0xbf), or 0xfe/0xff */
+            return (size_t)-1;
+        } else if (c < 0xe0) {
+            v = c & 0x1f;
+            expect = 1;
+            vmin = 0x80;
+        } else if (c < 0xf0) {
+            v = c & 0x0f;
+            expect = 2;
+            vmin = 0x800;
+        } else if (c < 0xf8) {
+            v = c & 0x07;
+            expect = 3;
+            vmin = 0x10000;
+        } else if (c < 0xfc) {
+            v = c & 0x03;
+            expect = 4;
+            vmin = 0x200000;
+        } else {
+            v = c & 0x01;
+            expect = 5;
+            vmin = 0x4000000;
+        }
+    }
+
+    return expect ? (size_t)-1 : outlen << 1;
+
+#undef EMIT
+}
+
+/*
+ * Convert a string in UTF-8 format to UTF-32LE.  When op is NULL only
+ * the size is computed.  Returns the output size in bytes, or
+ * (size_t)-1 on malformed UTF-8.  Values above 0x10ffff (5- and
+ * 6-byte forms) are not rejected here.
+ */
+static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
+{
+#define EMIT(x) do { if (op) { WRITELONG(op,x); } outlen++; } while(0)
+
+    size_t outlen = 0;          /* 32-bit units emitted so far */
+    int expect = 0;             /* continuation bytes still expected */
+    uint8_t c;
+    uint32_t v = 0, vmin = 0;   /* value in progress; its minimum */
+
+    while (len--) {
+        c = *str++;
+
+        if (expect) {
+            if ((c & 0xc0) != 0x80)
+                return (size_t)-1;      /* not a continuation byte */
+            v = (v << 6) | (c & 0x3f);
+            if (!--expect) {
+                /* Reject overlong forms and surrogate code points */
+                if (v < vmin || (v >= 0xd800 && v <= 0xdfff))
+                    return (size_t)-1;
+                EMIT(v);
+            }
+            continue;
+        }
+
+        if (c < 0x80) {
+            EMIT(c);
+        } else if (c < 0xc0 || c >= 0xfe) {
+            /* Stray continuation byte (0x80..0xbf), or 0xfe/0xff */
+            return (size_t)-1;
+        } else if (c < 0xe0) {
+            v = c & 0x1f;
+            expect = 1;
+            vmin = 0x80;
+        } else if (c < 0xf0) {
+            v = c & 0x0f;
+            expect = 2;
+            vmin = 0x800;
+        } else if (c < 0xf8) {
+            v = c & 0x07;
+            expect = 3;
+            vmin = 0x10000;
+        } else if (c < 0xfc) {
+            v = c & 0x03;
+            expect = 4;
+            vmin = 0x200000;
+        } else {
+            v = c & 0x01;
+            expect = 5;
+            vmin = 0x4000000;
+        }
+    }
+
+    return expect ? (size_t)-1 : outlen << 2;
+
+#undef EMIT
+}
+
+typedef size_t (*transform_func)(uint8_t *, size_t, char *);
+
+/*
+ * Apply a specific string transform and return it in a nasm_malloc'd
+ * buffer stored in *out, returning the length in bytes.  On error,
+ * returns (size_t)-1, no buffer is allocated and *out is untouched.
+ */
+size_t string_transform(char *str, size_t len, char **out, enum strfunc func)
+{
+    /* This should match enum strfunc in nasm.h */
+    static const transform_func str_transforms[] = {
+        utf8_to_16le,
+        utf8_to_32le,
+    };
+    transform_func transform = str_transforms[func];
+    uint8_t *s = (uint8_t *)str;
+    size_t outlen = transform(s, len, NULL);    /* pass 1: size only */
+
+    if (outlen == (size_t)-1)
+        return (size_t)-1;
+
+    /* +1 so an empty result never asks the allocator for 0 bytes */
+    /* pass 2: convert into the newly allocated buffer */
+    return transform(s, len, *out = nasm_malloc(outlen + 1));
+}
index 6c3ad65..128bc67 100644 (file)
@@ -53,6 +53,10 @@ __float80e__
 __float128l__
 __float128h__
 
+% TOKEN_STRFUNC, 0, STRFUNC_{__*__}
+__utf16__
+__utf32__
+
 % TOKEN_*, 0, 0
 seg
 wrt