From 3ae86bf4f45b1f110aa7bd09ea61a8fd30c2a983 Mon Sep 17 00:00:00 2001 From: Jerry DeLisle Date: Sat, 16 Aug 2008 03:38:31 +0000 Subject: [PATCH] re PR libfortran/35863 ([F2003] Implement ENCODING="UTF-8") 2008-08-15 Jerry DeLisle PR libfortran/35863 * intrinsics/selected_char_kind.c: Enable iso_10646. * io/read.c (typedef uchar): New type. (read_utf8): New function to read a single UTF-8 encoded character. (read_utf8_char1): New function to read UTF-8 into a KIND=1 string. (read_default_char1): New functio to read default into KIND=1 string. (read_utf8_char4): New function to read UTF-8 into a KIND=4 string. (read_default_char4): New function to read UTF-8 into a KIND=4 string. (read_a): Modify to use the new functions. (read_a_char4): Modify to use the new functions. * io/write.c (error.h): Add include. (typedef uchar): New type. (write_default_char4): New function to default write KIND=4 string. (write_utf8_char4): New function to UTF-8 write KIND=4 string. (write_a_char4): Modify to use new functions. (write_character): Modify to use new functions. From-SVN: r139147 --- libgfortran/ChangeLog | 18 ++ libgfortran/intrinsics/selected_char_kind.c | 2 + libgfortran/io/read.c | 239 +++++++++++++++++++++----- libgfortran/io/write.c | 253 ++++++++++++++++++++-------- 4 files changed, 405 insertions(+), 107 deletions(-) diff --git a/libgfortran/ChangeLog b/libgfortran/ChangeLog index f2eb391..3e10c2e 100644 --- a/libgfortran/ChangeLog +++ b/libgfortran/ChangeLog @@ -1,3 +1,21 @@ +2008-08-15 Jerry DeLisle + + PR libfortran/35863 + * intrinsics/selected_char_kind.c: Enable iso_10646. + * io/read.c (typedef uchar): New type. + (read_utf8): New function to read a single UTF-8 encoded character. + (read_utf8_char1): New function to read UTF-8 into a KIND=1 string. + (read_default_char1): New functio to read default into KIND=1 string. + (read_utf8_char4): New function to read UTF-8 into a KIND=4 string. + (read_default_char4): New function to read UTF-8 into a KIND=4 string. + (read_a): Modify to use the new functions. + (read_a_char4): Modify to use the new functions. + * io/write.c (error.h): Add include. (typedef uchar): New type. + (write_default_char4): New function to default write KIND=4 string. + (write_utf8_char4): New function to UTF-8 write KIND=4 string. + (write_a_char4): Modify to use new functions. + (write_character): Modify to use new functions. + 2008-08-14 H.J. Lu PR libfortran/37123 diff --git a/libgfortran/intrinsics/selected_char_kind.c b/libgfortran/intrinsics/selected_char_kind.c index c10d5b2..6866361 100644 --- a/libgfortran/intrinsics/selected_char_kind.c +++ b/libgfortran/intrinsics/selected_char_kind.c @@ -44,6 +44,8 @@ selected_char_kind (gfc_charlen_type name_len, char *name) if ((len == 5 && strncasecmp (name, "ascii", 5) == 0) || (len == 7 && strncasecmp (name, "default", 7) == 0)) return 1; + else if (len == 9 && strncasecmp (name, "iso_10646", 9) == 0) + return 1; else return -1; } diff --git a/libgfortran/io/read.c b/libgfortran/io/read.c index cb88933..8d25493 100644 --- a/libgfortran/io/read.c +++ b/libgfortran/io/read.c @@ -34,6 +34,8 @@ Boston, MA 02110-1301, USA. */ #include #include +typedef unsigned char uchar; + /* read.c -- Deal with formatted reads */ @@ -236,78 +238,239 @@ read_l (st_parameter_dt *dtp, const fnode *f, char *dest, int length) } -/* read_a()-- Read a character record. This one is pretty easy. */ - -void -read_a (st_parameter_dt *dtp, const fnode *f, char *p, int length) +static inline gfc_char4_t +read_utf8 (st_parameter_dt *dtp, size_t *nbytes) { + static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 }; + static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + static uchar buffer[6]; + size_t i, nb, nread; + gfc_char4_t c; + int status; char *s; - int m, n, wi, status; - size_t w; - wi = f->u.w; - if (wi == -1) /* '(A)' edit descriptor */ - wi = length; + *nbytes = 1; + s = (char *) &buffer[0]; + status = read_block_form (dtp, s, nbytes); + if (status == FAILURE) + return 0; - w = wi; + /* If this is a short read, just return. */ + if (*nbytes == 0) + return 0; - s = gfc_alloca (w); + c = buffer[0]; + if (c < 0x80) + return c; - dtp->u.p.sf_read_comma = 0; - status = read_block_form (dtp, s, &w); - dtp->u.p.sf_read_comma = - dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1; + /* The number of leading 1-bits in the first byte indicates how many + bytes follow. */ + for (nb = 2; nb < 7; nb++) + if ((c & ~masks[nb-1]) == patns[nb-1]) + goto found; + goto invalid; + + found: + c = (c & masks[nb-1]); + nread = nb - 1; + + s = (char *) &buffer[1]; + status = read_block_form (dtp, s, &nread); + if (status == FAILURE) + return 0; + /* Decode the bytes read. */ + for (i = 1; i < nb; i++) + { + gfc_char4_t n = *s++; + + if ((n & 0xC0) != 0x80) + goto invalid; + + c = ((c << 6) + (n & 0x3F)); + } + + /* Make sure the shortest possible encoding was used. */ + if (c <= 0x7F && nb > 1) goto invalid; + if (c <= 0x7FF && nb > 2) goto invalid; + if (c <= 0xFFFF && nb > 3) goto invalid; + if (c <= 0x1FFFFF && nb > 4) goto invalid; + if (c <= 0x3FFFFFF && nb > 5) goto invalid; + + /* Make sure the character is valid. */ + if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) + goto invalid; + + return c; + + invalid: + generate_error (&dtp->common, LIBERROR_READ_VALUE, "Invalid UTF-8 encoding"); + return (gfc_char4_t) '?'; +} + + +static void +read_utf8_char1 (st_parameter_dt *dtp, char *p, int len, size_t width) +{ + gfc_char4_t c; + char *dest; + size_t nbytes; + int i, j; + + len = ((int) width < len) ? len : (int) width; + + dest = (char *) p; + + /* Proceed with decoding one character at a time. */ + for (j = 0; j < len; j++, dest++) + { + c = read_utf8 (dtp, &nbytes); + + /* Check for a short read and if so, break out. */ + if (nbytes == 0) + break; + + *dest = c > 255 ? '?' : (uchar) c; + } + + /* If there was a short read, pad the remaining characters. */ + for (i = j; i < len; i++) + *dest++ = ' '; + return; +} + +static void +read_default_char1 (st_parameter_dt *dtp, char *p, int len, size_t width) +{ + char *s; + int m, n, status; + + s = gfc_alloca (width); + + status = read_block_form (dtp, s, &width); + if (status == FAILURE) return; - if (w > (size_t) length) - s += (w - length); + if (width > (size_t) len) + s += (width - len); - m = ((int) w > length) ? length : (int) w; + m = ((int) width > len) ? len : (int) width; memcpy (p, s, m); - n = length - w; + n = len - width; if (n > 0) memset (p + m, ' ', n); } -void -read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length) + +static void +read_utf8_char4 (st_parameter_dt *dtp, void *p, int len, size_t width) { - char *s; gfc_char4_t *dest; - int m, n, wi, status; - size_t w; + size_t nbytes; + int i, j; - wi = f->u.w; - if (wi == -1) /* '(A)' edit descriptor */ - wi = length; + len = ((int) width < len) ? len : (int) width; - w = wi; + dest = (gfc_char4_t *) p; - s = gfc_alloca (w); + /* Proceed with decoding one character at a time. */ + for (j = 0; j < len; j++, dest++) + { + *dest = read_utf8 (dtp, &nbytes); - /* Read in w bytes, treating comma as not a separator. */ - dtp->u.p.sf_read_comma = 0; - status = read_block_form (dtp, s, &w); - dtp->u.p.sf_read_comma = - dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1; + /* Check for a short read and if so, break out. */ + if (nbytes == 0) + break; + } + + /* If there was a short read, pad the remaining characters. */ + for (i = j; i < len; i++) + *dest++ = (gfc_char4_t) ' '; + return; +} + + +static void +read_default_char4 (st_parameter_dt *dtp, char *p, int len, size_t width) +{ + char *s; + gfc_char4_t *dest; + int m, n, status; + + s = gfc_alloca (width); + + status = read_block_form (dtp, s, &width); if (status == FAILURE) return; - if (w > (size_t) length) - s += (w - length); + if (width > (size_t) len) + s += (width - len); - m = ((int) w > length) ? length : (int) w; + m = ((int) width > len) ? len : (int) width; dest = (gfc_char4_t *) p; for (n = 0; n < m; n++, dest++, s++) *dest = (unsigned char ) *s; - for (n = 0; n < length - (int) w; n++, dest++) + for (n = 0; n < len - (int) width; n++, dest++) *dest = (unsigned char) ' '; } + +/* read_a()-- Read a character record into a KIND=1 character destination, + processing UTF-8 encoding if necessary. */ + +void +read_a (st_parameter_dt *dtp, const fnode *f, char *p, int length) +{ + int wi; + size_t w; + + wi = f->u.w; + if (wi == -1) /* '(A)' edit descriptor */ + wi = length; + w = wi; + + /* Read in w characters, treating comma as not a separator. */ + dtp->u.p.sf_read_comma = 0; + + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + read_utf8_char1 (dtp, p, length, w); + else + read_default_char1 (dtp, p, length, w); + + dtp->u.p.sf_read_comma = + dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1; +} + + +/* read_a_char4()-- Read a character record into a KIND=4 character destination, + processing UTF-8 encoding if necessary. */ + +void +read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length) +{ + int wi; + size_t w; + + wi = f->u.w; + if (wi == -1) /* '(A)' edit descriptor */ + wi = length; + w = wi; + + /* Read in w characters, treating comma as not a separator. */ + dtp->u.p.sf_read_comma = 0; + + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + read_utf8_char4 (dtp, p, length, w); + else + read_default_char4 (dtp, p, length, w); + + dtp->u.p.sf_read_comma = + dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1; +} + /* eat_leading_spaces()-- Given a character pointer and a width, * ignore the leading spaces. */ diff --git a/libgfortran/io/write.c b/libgfortran/io/write.c index ed50e0d..8194cf8 100644 --- a/libgfortran/io/write.c +++ b/libgfortran/io/write.c @@ -36,10 +36,161 @@ Boston, MA 02110-1301, USA. */ #include #include #include +#include #define star_fill(p, n) memset(p, '*', n) #include "write_float.def" +typedef unsigned char uchar; + +/* Write out default char4. */ + +static void +write_default_char4 (st_parameter_dt *dtp, gfc_char4_t *source, + int src_len, int w_len) +{ + char *p; + int j, k = 0; + gfc_char4_t c; + uchar d; + + /* Take care of preceding blanks. */ + if (w_len > src_len) + { + k = w_len - src_len; + p = write_block (dtp, k); + if (p == NULL) + return; + memset (p, ' ', k); + } + + /* Get ready to handle delimiters if needed. */ + + switch (dtp->u.p.delim_status) + { + case DELIM_APOSTROPHE: + d = '\''; + break; + case DELIM_QUOTE: + d = '"'; + break; + default: + d = ' '; + break; + } + + /* Now process the remaining characters, one at a time. */ + for (j = k; j < src_len; j++) + { + c = source[j]; + + /* Handle delimiters if any. */ + if (c == d && d != ' ') + { + p = write_block (dtp, 2); + if (p == NULL) + return; + *p++ = (uchar) c; + } + else + { + p = write_block (dtp, 1); + if (p == NULL) + return; + } + *p = c > 255 ? '?' : (uchar) c; + } +} + + +/* Write out UTF-8 converted from char4. */ + +static void +write_utf8_char4 (st_parameter_dt *dtp, gfc_char4_t *source, + int src_len, int w_len) +{ + char *p; + int j, k = 0; + gfc_char4_t c; + static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE }; + size_t nbytes; + uchar buf[6], d, *q; + + /* Take care of preceding blanks. */ + if (w_len > src_len) + { + k = w_len - src_len; + p = write_block (dtp, k); + if (p == NULL) + return; + memset (p, ' ', k); + } + + /* Get ready to handle delimiters if needed. */ + + switch (dtp->u.p.delim_status) + { + case DELIM_APOSTROPHE: + d = '\''; + break; + case DELIM_QUOTE: + d = '"'; + break; + default: + d = ' '; + break; + } + + /* Now process the remaining characters, one at a time. */ + for (j = k; j < src_len; j++) + { + c = source[j]; + if (c < 0x80) + { + /* Handle the delimiters if any. */ + if (c == d && d != ' ') + { + p = write_block (dtp, 2); + if (p == NULL) + return; + *p++ = (uchar) c; + } + else + { + p = write_block (dtp, 1); + if (p == NULL) + return; + } + *p = (uchar) c; + } + else + { + /* Convert to UTF-8 sequence. */ + nbytes = 1; + q = &buf[6]; + + do + { + *--q = ((c & 0x3F) | 0x80); + c >>= 6; + nbytes++; + } + while (c >= 0x3F || (c & limits[nbytes-1])); + + *--q = (c | masks[nbytes-1]); + + p = write_block (dtp, nbytes); + if (p == NULL) + return; + + while (q < &buf[6]) + *p++ = *q++; + } + } +} + + void write_a (st_parameter_dt *dtp, const fnode *f, const char *source, int len) { @@ -126,17 +277,16 @@ write_a (st_parameter_dt *dtp, const fnode *f, const char *source, int len) /* The primary difference between write_a_char4 and write_a is that we have to - deal with writing from the first byte of the 4-byte character and take care - of endianess. This currently implements encoding="default" which means we - write the lowest significant byte. If the 3 most significant bytes are - not representable emit a '?'. TODO: Implement encoding="UTF-8" - which will process all 4 bytes and translate to the encoded output. */ + deal with writing from the first byte of the 4-byte character and pay + attention to the most significant bytes. For ENCODING="default" write the + lowest significant byte. If the 3 most significant bytes contain + non-zero values, emit a '?'. For ENCODING="utf-8", convert the UCS-32 value + to the UTF-8 encoded string before writing out. */ void write_a_char4 (st_parameter_dt *dtp, const fnode *f, const char *source, int len) { int wlen; - char *p; gfc_char4_t *q; wlen = f->u.string.length < 0 @@ -173,19 +323,15 @@ write_a_char4 (st_parameter_dt *dtp, const fnode *f, const char *source, int len /* Write out the previously scanned characters in the string. */ if (bytes > 0) { - p = write_block (dtp, bytes); - if (p == NULL) - return; - for (j = 0; j < bytes; j++) - p[j] = q[j] > 255 ? '?' : (unsigned char) q[j]; + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + write_utf8_char4 (dtp, q, bytes, 0); + else + write_default_char4 (dtp, q, bytes, 0); bytes = 0; } /* Write out the CR_LF sequence. */ - p = write_block (dtp, 2); - if (p == NULL) - return; - memcpy (p, crlf, 2); + write_default_char4 (dtp, crlf, 2, 0); } else bytes++; @@ -194,32 +340,19 @@ write_a_char4 (st_parameter_dt *dtp, const fnode *f, const char *source, int len /* Write out any remaining bytes if no LF was found. */ if (bytes > 0) { - p = write_block (dtp, bytes); - if (p == NULL) - return; - for (j = 0; j < bytes; j++) - p[j] = q[j] > 255 ? '?' : (unsigned char) q[j]; + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + write_utf8_char4 (dtp, q, bytes, 0); + else + write_default_char4 (dtp, q, bytes, 0); } } else { #endif - int j; - p = write_block (dtp, wlen); - if (p == NULL) - return; - - if (wlen < len) - { - for (j = 0; j < wlen; j++) - p[j] = q[j] > 255 ? '?' : (unsigned char) q[j]; - } + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + write_utf8_char4 (dtp, q, len, wlen); else - { - memset (p, ' ', wlen - len); - for (j = wlen - len; j < wlen; j++) - p[j] = q[j] > 255 ? '?' : (unsigned char) q[j]; - } + write_default_char4 (dtp, q, len, wlen); #ifdef HAVE_CRLF } #endif @@ -745,8 +878,6 @@ write_character (st_parameter_dt *dtp, const char *source, int kind, int length) { int i, extra; char *p, d; - gfc_char4_t *q; - switch (dtp->u.p.delim_status) { @@ -769,9 +900,9 @@ write_character (st_parameter_dt *dtp, const char *source, int kind, int length) { extra = 2; - for (i = 0; i < length; i++) - if (source[i] == d) - extra++; + for (i = 0; i < length; i++) + if (source[i] == d) + extra++; } p = write_block (dtp, length + extra); @@ -796,40 +927,24 @@ write_character (st_parameter_dt *dtp, const char *source, int kind, int length) } else { - /* We have to scan the source string looking for delimiters to determine - how large the write block needs to be. */ if (d == ' ') - extra = 0; - else { - extra = 2; - - q = (gfc_char4_t *) source; - for (i = 0; i < length; i++, q++) - if (*q == (gfc_char4_t) d) - extra++; - } - - p = write_block (dtp, length + extra); - if (p == NULL) - return; - - if (d == ' ') - { - q = (gfc_char4_t *) source; - for (i = 0; i < length; i++, q++) - p[i] = *q > 255 ? '?' : (unsigned char) *q; + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + write_utf8_char4 (dtp, (gfc_char4_t *) source, length, 0); + else + write_default_char4 (dtp, (gfc_char4_t *) source, length, 0); } else { - *p++ = d; - q = (gfc_char4_t *) source; - for (i = 0; i < length; i++, q++) - { - *p++ = *q > 255 ? '?' : (unsigned char) *q; - if (*q == (gfc_char4_t) d) - *p++ = d; - } + p = write_block (dtp, 1); + *p = d; + + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + write_utf8_char4 (dtp, (gfc_char4_t *) source, length, 0); + else + write_default_char4 (dtp, (gfc_char4_t *) source, length, 0); + + p = write_block (dtp, 1); *p = d; } } -- 2.7.4