Reader option for R6RS hex escapes
authorMichael Gran <spk121@yahoo.com>
Wed, 13 Jan 2010 05:02:41 +0000 (21:02 -0800)
committerMichael Gran <spk121@yahoo.com>
Wed, 13 Jan 2010 05:02:41 +0000 (21:02 -0800)
This adds a reader option 'r6rs-hex-escapes that modifies the
behavior of numeric escapes in characters and strings.  When enabled,
variable-length character hex escapes (#\xNNN) are allowed and become
the default output format for numerically-escaped characters.  Also,
string hex escapes switch to a semicolon terminated hex escape (\xNNNN;).

* libguile/print.c (PRINT_CHAR_ESCAPE): new macro
  (iprin1): use new macro PRINT_CHAR_ESCAPE

* libguile/private-options.h (SCM_R6RS_ESCAPES_P): new #define

* libguile/read.c (scm_read_opts): add new option r6rs-hex-escapes
  (SCM_READ_HEX_ESCAPE): modify to take a terminator parameter
  (scm_read_string): parse R6RS hex string escapes
  (scm_read_character): parse R6RS hex character escapes

* test-suite/tests/chars.test (with-read-options): new procedure
  (R6RS hex escapes): new tests

* test-suite/tests/strings.test (with-read-options): new procedure
  (R6RS hex escapes): new tests

libguile/print.c
libguile/private-options.h
libguile/read.c
test-suite/tests/chars.test
test-suite/tests/strings.test

index aef575d8043e7127ae368d7bf3e26348c0a8717d..dcf28c7c26bb54ddc95b36a75037cbab3a358171 100644 (file)
@@ -409,6 +409,22 @@ SCM_GPROC(s_display, "display", 1, 1, 0, scm_display, g_display);
 
 static void iprin1 (SCM exp, SCM port, scm_print_state *pstate);
 
+
+/* Print a character as an octal or hex escape.  */
+#define PRINT_CHAR_ESCAPE(i, port)              \
+  do                                            \
+    {                                           \
+      if (!SCM_R6RS_ESCAPES_P)                  \
+        scm_intprint (i, 8, port);              \
+      else                                      \
+        {                                       \
+          scm_puts ("x", port);                 \
+          scm_intprint (i, 16, port);           \
+        }                                       \
+    }                                           \
+  while (0)
+
+  
 void 
 scm_iprin1 (SCM exp, SCM port, scm_print_state *pstate)
 {
@@ -488,7 +504,7 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
                       else
                         /* Character is graphic but unrepresentable in
                            this port's encoding.  */
-                        scm_intprint (i, 8, port);
+                        PRINT_CHAR_ESCAPE (i, port);
                     }
                   else
                     {
@@ -507,12 +523,12 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
                       else
                         /* Character is graphic but unrepresentable in
                            this port's encoding.  */
-                        scm_intprint (i, 8, port);
+                        PRINT_CHAR_ESCAPE (i, port);
                     }
                 }
               else
                 /* Character is a non-graphical character.  */
-                scm_intprint (i, 8, port);
+                PRINT_CHAR_ESCAPE (i, port);
            }
          else
            scm_i_charprint (i, port);
@@ -579,9 +595,9 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
         case scm_tc7_string:
           if (SCM_WRITINGP (pstate))
             {
-              size_t i, j, len;
+              size_t i, len;
               static char const hex[] = "0123456789abcdef";
-              char buf[8];
+              char buf[9];
 
 
               scm_putc ('"', port);
@@ -647,37 +663,61 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
                     {
                       /* Character is graphic but unrepresentable in
                          this port's encoding or is not graphic.  */
-                      if (ch <= 0xFF)
+                      if (!SCM_R6RS_ESCAPES_P)
                         {
-                          buf[0] = '\\';
-                          buf[1] = 'x';
-                          buf[2] = hex[ch / 16];
-                          buf[3] = hex[ch % 16];
-                          scm_lfwrite (buf, 4, port);
-                        }
-                      else if (ch <= 0xFFFF)
-                        {
-                          buf[0] = '\\';
-                          buf[1] = 'u';
-                          buf[2] = hex[(ch & 0xF000) >> 12];
-                          buf[3] = hex[(ch & 0xF00) >> 8];
-                          buf[4] = hex[(ch & 0xF0) >> 4];
-                          buf[5] = hex[(ch & 0xF)];
-                          scm_lfwrite (buf, 6, port);
-                          j = i + 1;
+                          if (ch <= 0xFF)
+                            {
+                              buf[0] = '\\';
+                              buf[1] = 'x';
+                              buf[2] = hex[ch / 16];
+                              buf[3] = hex[ch % 16];
+                              scm_lfwrite (buf, 4, port);
+                            }
+                          else if (ch <= 0xFFFF)
+                            {
+                              buf[0] = '\\';
+                              buf[1] = 'u';
+                              buf[2] = hex[(ch & 0xF000) >> 12];
+                              buf[3] = hex[(ch & 0xF00) >> 8];
+                              buf[4] = hex[(ch & 0xF0) >> 4];
+                              buf[5] = hex[(ch & 0xF)];
+                              scm_lfwrite (buf, 6, port);
+                            }
+                          else if (ch > 0xFFFF)
+                            {
+                              buf[0] = '\\';
+                              buf[1] = 'U';
+                              buf[2] = hex[(ch & 0xF00000) >> 20];
+                              buf[3] = hex[(ch & 0xF0000) >> 16];
+                              buf[4] = hex[(ch & 0xF000) >> 12];
+                              buf[5] = hex[(ch & 0xF00) >> 8];
+                              buf[6] = hex[(ch & 0xF0) >> 4];
+                              buf[7] = hex[(ch & 0xF)];
+                              scm_lfwrite (buf, 8, port);
+                            }
                         }
-                      else if (ch > 0xFFFF)
+                      else
                         {
-                          buf[0] = '\\';
-                          buf[1] = 'U';
-                          buf[2] = hex[(ch & 0xF00000) >> 20];
-                          buf[3] = hex[(ch & 0xF0000) >> 16];
-                          buf[4] = hex[(ch & 0xF000) >> 12];
-                          buf[5] = hex[(ch & 0xF00) >> 8];
-                          buf[6] = hex[(ch & 0xF0) >> 4];
-                          buf[7] = hex[(ch & 0xF)];
-                          scm_lfwrite (buf, 8, port);
-                          j = i + 1;
+                          scm_t_wchar ch2 = ch;
+                          
+                          /* Print an R6RS variable-length hex escape: "\xNNNN;"
+                          */
+                          int i = 8;
+                          buf[i] = ';';
+                          i --;
+                          if (ch == 0)
+                            buf[i--] = '0';
+                          else
+                            while (ch2 > 0)
+                              {
+                                buf[i] = hex[ch2 & 0xF];
+                                ch2 >>= 4;
+                                i --;
+                              }
+                          buf[i] = 'x';
+                          i --;
+                          buf[i] = '\\';
+                          scm_lfwrite (buf + i, 9 - i, port);
                         }
                     }
                 }
index 703ca8a5ba5a36c716af3354513ae03a9bb96bc1..40d40fbd7366769bb267b80f8d0b38cfca608426 100644 (file)
@@ -94,9 +94,13 @@ SCM_API scm_t_option scm_read_opts[];
 #if SCM_ENABLE_ELISP
 #define SCM_ELISP_VECTORS_P    scm_read_opts[4].val
 #define SCM_ESCAPED_PARENS_P   scm_read_opts[5].val
-#define SCM_N_READ_OPTIONS 6
+#endif
+#define SCM_R6RS_ESCAPES_P     scm_read_opts[6].val
+
+#if SCM_ENABLE_ELISP
+#define SCM_N_READ_OPTIONS 7
 #else
-#define SCM_N_READ_OPTIONS 4
+#define SCM_N_READ_OPTIONS 5
 #endif
 
 #endif  /* PRIVATE_OPTIONS */ 
index 011684b3d09cec614fef7ffa565c0bc8fc4b045f..9e66cce802cdd3ddc28ad7a3be6120ab1b3a7094 100644 (file)
@@ -76,6 +76,8 @@ scm_t_option scm_read_opts[] = {
   { SCM_OPTION_BOOLEAN, "elisp-strings", 0,
     "Support `\\(' and `\\)' in strings."},
 #endif
+  { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
+    "Use R6RS variable-length character and string hex escapes."},
   { 0, },
 };
 
@@ -412,32 +414,37 @@ scm_read_sexp (scm_t_wchar chr, SCM port)
 
 
 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
-   C.  */
-#define SCM_READ_HEX_ESCAPE(ndigits)            \
-  do                                            \
-    {                                           \
-      scm_t_wchar a;                            \
-      size_t i = 0;                             \
-      c = 0;                                    \
-      while (i < ndigits)                       \
-        {                                       \
-          a = scm_getc (port);                  \
-          if (a == EOF)                         \
-            goto str_eof;                       \
-          if ('0' <= a && a <= '9')             \
-            a -= '0';                           \
-          else if ('A' <= a && a <= 'F')        \
-            a = a - 'A' + 10;                   \
-          else if ('a' <= a && a <= 'f')        \
-            a = a - 'a' + 10;                   \
-          else                                  \
-            {                                   \
-              c = a;                            \
-              goto bad_escaped;                 \
-            }                                   \
-          c = c * 16 + a;                       \
-          i ++;                                 \
-        }                                       \
+   C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
+   found.  */
+#define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
+  do                                                               \
+    {                                                              \
+      scm_t_wchar a;                                               \
+      size_t i = 0;                                                \
+      c = 0;                                                       \
+      while (i < ndigits)                                          \
+        {                                                          \
+          a = scm_getc (port);                                     \
+          if (a == EOF)                                            \
+            goto str_eof;                                          \
+          if (terminator                                           \
+              && (a == (scm_t_wchar) terminator)                   \
+              && (i > 0))                                          \
+            break;                                                 \
+          if ('0' <= a && a <= '9')                                \
+            a -= '0';                                              \
+          else if ('A' <= a && a <= 'F')                           \
+            a = a - 'A' + 10;                                      \
+          else if ('a' <= a && a <= 'f')                           \
+            a = a - 'a' + 10;                                      \
+          else                                                     \
+            {                                                      \
+              c = a;                                               \
+              goto bad_escaped;                                    \
+            }                                                      \
+          c = c * 16 + a;                                          \
+          i ++;                                                    \
+        }                                                          \
     } while (0)
 
 static SCM
@@ -511,13 +518,16 @@ scm_read_string (int chr, SCM port)
               c = '\010';
               break;
             case 'x':
-              SCM_READ_HEX_ESCAPE (2);
+              if (SCM_R6RS_ESCAPES_P)
+                SCM_READ_HEX_ESCAPE (10, ';');
+              else
+                SCM_READ_HEX_ESCAPE (2, '\0');
               break;
             case 'u':
-              SCM_READ_HEX_ESCAPE (4);
+              SCM_READ_HEX_ESCAPE (4, '\0');
               break;
             case 'U':
-              SCM_READ_HEX_ESCAPE (6);
+              SCM_READ_HEX_ESCAPE (6, '\0');
               break;
             default:
             bad_escaped:
@@ -828,6 +838,26 @@ scm_read_character (scm_t_wchar chr, SCM port)
         }
     }
 
+  if (cp == 'x' && (charname_len > 1) && SCM_R6RS_ESCAPES_P)
+    {
+      SCM p;
+      scm_t_wchar chr;
+      
+      /* Convert from hex, skipping the initial 'x' character in CHARNAME */
+      p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
+                                scm_from_uint (16));
+      if (SCM_I_INUMP (p))
+        {
+          scm_t_wchar c = SCM_I_INUM (p);
+          if (SCM_IS_UNICODE_CHAR (c))
+            return SCM_MAKE_CHAR (c);
+          else
+            scm_i_input_error (FUNC_NAME, port,
+                               "out-of-range hex character escape: ~a",
+                               scm_list_1 (charname));
+        }
+    }
+
   /* The names of characters should never have non-Latin1
      characters.  */
   if (scm_i_is_narrow_string (charname)
index 509f07066b15609fd7041f7865413e3f071eb89f..25c82e825ececaf9bfb49ee7cace19fc918b0cda 100644 (file)
   (cons #t "out-of-range"))
 
 
+;; Run THUNK in the context of the reader options OPTS
+(define (with-read-options opts thunk)
+  (let ((saved-options (read-options)))
+    (dynamic-wind
+        (lambda ()
+          (read-options opts))
+        thunk
+        (lambda ()
+          (read-options saved-options)))))
+
 (with-test-prefix "basic char handling"
 
   (with-test-prefix "evaluator"
        (with-output-to-string (lambda () (write #\soh)))
        "#\\soh"))))
 
+(with-test-prefix "R6RS hex escapes"
+
+  (pass-if "one-digit hex escape"
+    (eqv? (with-read-options '(r6rs-hex-escapes)
+            (lambda ()
+              (with-input-from-string "#\\xA" read)))
+          (integer->char #x0A)))
+
+  (pass-if "two-digit hex escape"
+    (eqv? (with-read-options '(r6rs-hex-escapes)
+            (lambda ()
+              (with-input-from-string "#\\xFF" read)))
+          (integer->char #xFF)))
+
+  (pass-if "four-digit hex escape"
+    (eqv? (with-read-options '(r6rs-hex-escapes)
+            (lambda ()
+              (with-input-from-string "#\\x00FF" read)))
+          (integer->char #xFF)))
+
+  (pass-if "eight-digit hex escape"
+    (eqv? (with-read-options '(r6rs-hex-escapes)
+            (lambda ()
+              (with-input-from-string "#\\x00006587" read)))
+          (integer->char #x6587)))
+  (pass-if "write R6RS escapes"
+    (string=?
+     (with-read-options '(r6rs-hex-escapes)
+       (lambda ()
+         (with-output-to-string 
+           (lambda () 
+             (write (integer->char #x80))))))
+     "#\\x80")))
+
index e04c0260d0ed88dc94feecfa5dc8a23f90b05e66..47ae93ae95bcc27c1fbaee525558c779a9768df2 100644 (file)
@@ -2,23 +2,24 @@
 ;;;; Jim Blandy <jimb@red-bean.com> --- August 1999
 ;;;;
 ;;;; Copyright (C) 1999, 2001, 2004, 2005, 2006, 2008, 2009 Free Software Foundation, Inc.
-;;;; 
+;;;;
 ;;;; This library is free software; you can redistribute it and/or
 ;;;; modify it under the terms of the GNU Lesser General Public
 ;;;; License as published by the Free Software Foundation; either
 ;;;; version 3 of the License, or (at your option) any later version.
-;;;; 
+;;;;
 ;;;; This library is distributed in the hope that it will be useful,
 ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;;;; Lesser General Public License for more details.
-;;;; 
+;;;;
 ;;;; You should have received a copy of the GNU Lesser General Public
 ;;;; License along with this library; if not, write to the Free Software
 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 
 (define-module (test-strings)
-  #:use-module (test-suite lib))
+  #:use-module (test-suite lib)
+  #:use-module (srfi srfi-1))
 
 (define exception:read-only-string
   (cons 'misc-error "^string is read-only"))
 (define exception:wrong-type-arg
   (cons #t "Wrong type"))
 
+;; Run THUNK in the context of the reader options OPTS
+(define (with-read-options opts thunk)
+  (let ((saved-options (read-options)))
+    (dynamic-wind
+        (lambda ()
+          (read-options opts))
+        thunk
+        (lambda ()
+          (read-options saved-options)))))
+
 ;; Create a string from integer char values, eg. (string-ints 65) => "A"
 (define (string-ints . args)
   (apply string (map integer->char args)))
   (pass-if "Guile extensions backslash escapes"
     (string=? "\0" (string #\nul))))
 
+
+(with-test-prefix "R6RS hex escapes"
+
+  (pass-if-exception "non-hex char in two-digit hex-escape"
+    exception:illegal-escape                     
+    (with-read-options '(r6rs-hex-escapes)
+      (lambda ()
+        (with-input-from-string "\"\\x0g;\"" read))))
+
+  (pass-if-exception "non-hex char in four-digit hex-escape"
+    exception:illegal-escape                     
+    (with-read-options '(r6rs-hex-escapes)
+      (lambda ()
+        (with-input-from-string "\"\\x000g;\"" read))))
+
+  (pass-if-exception "non-hex char in six-digit hex-escape"
+    exception:illegal-escape                     
+    (with-read-options '(r6rs-hex-escapes)
+      (lambda ()
+        (with-input-from-string "\"\\x00000g;\"" read))))
+
+  (pass-if-exception "no semicolon at termination of one-digit hex-escape"
+    exception:illegal-escape                     
+    (with-read-options '(r6rs-hex-escapes)
+      (lambda ()
+        (with-input-from-string "\"\\x0\"" read))))
+
+  (pass-if-exception "no semicolon at termination of three-digit hex-escape"
+    exception:illegal-escape                     
+    (with-read-options '(r6rs-hex-escapes)
+      (lambda ()
+        (with-input-from-string "\"\\x000\"" read))))
+
+  (pass-if "two-digit hex escape"
+    (eqv? 
+     (with-read-options '(r6rs-hex-escapes)
+       (lambda ()
+         (string-ref (with-input-from-string "\"--\\xff;--\"" read) 2)))
+     (integer->char #xff)))
+
+  (pass-if "four-digit hex escape"
+    (eqv?
+     (with-read-options '(r6rs-hex-escapes)
+       (lambda ()
+         (string-ref (with-input-from-string "\"--\\x0100;--\"" read) 2)))
+     (integer->char #x0100)))
+
+  (pass-if "six-digit hex escape"
+    (eqv? 
+     (with-read-options '(r6rs-hex-escapes)
+       (lambda ()
+         (string-ref (with-input-from-string "\"--\\x010300;--\"" read) 2)))
+     (integer->char #x010300)))
+
+  (pass-if "escaped characters match non-escaped ASCII characters"
+    (string=?
+     (with-read-options '(r6rs-hex-escapes)
+       (lambda ()
+         (with-input-from-string "\"\\x41;\\x0042;\\x000043;\"" read)))
+     "ABC"))
+
+  (pass-if "write R6RS escapes"
+    
+     (let* ((s1 (apply string 
+                       (map integer->char '(#x8 ; backspace
+                                            #x20 ; space
+                                            #x30 ; zero
+                                            #x40 ; at sign
+                                            ))))
+            (s2 (with-read-options '(r6rs-hex-escapes)
+                  (lambda ()
+                    (with-output-to-string 
+                      (lambda () (write s1)))))))
+       (lset= eqv? 
+              (string->list s2)
+              (list #\" #\\ #\x #\8 #\; #\space #\0 #\@ #\")))))
+
 ;;
 ;; string?
 ;;