handy.h Special case toCTRL('?') for EBCDIC

author Karl Williamson <public@khwilliamson.com>

Sat, 13 Apr 2013 17:41:04 +0000 (11:41 -0600)

committer Karl Williamson <public@khwilliamson.com>

Wed, 5 Feb 2014 22:47:04 +0000 (15:47 -0700)
author Karl Williamson <public@khwilliamson.com>
Sat, 13 Apr 2013 17:41:04 +0000 (11:41 -0600)
committer Karl Williamson <public@khwilliamson.com>
Wed, 5 Feb 2014 22:47:04 +0000 (15:47 -0700)
diff --git a/ext/B/B.pm b/ext/B/B.pm

index ee4700f..952475d 100644 (file)
--- a/ext/B/B.pm
+++ b/ext/B/B.pm
@@ -90,11 +90,13 @@ sub B::GV::SAFENAME {
    # The regex below corresponds to the isCONTROLVAR macro
    # from toke.c
  
-  $name =~ s/^([\cA-\cZ\c\\c[\c]\c?\c_\c^])/"^".
-       chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e;
+  $name =~ s/^\c?/^?/
+    or $name =~ s/^([\cA-\cZ\c\\c[\c]\c_\c^])/
+                "^" .  chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e;
  
    # When we say unicode_to_native we really mean ascii_to_native,
-  # which matters iff this is a non-ASCII platform (EBCDIC).
+  # which matters iff this is a non-ASCII platform (EBCDIC).  '\c?' would
+  # not have to be special cased, except for non-ASCII.
  
    return $name;
  }
diff --git a/handy.h b/handy.h

index d695cf5..2f0132f 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -1633,11 +1633,23 @@ EXTCONST U32 PL_charclass[];
  #define isALNUMC_utf8(p) isALPHANUMERIC_utf8(p)
  #define isALNUMC_LC_utf8(p) isALPHANUMERIC_LC_utf8(p)
  
-/* This conversion works both ways, strangely enough. On EBCDIC platforms,
- * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't
- * necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but
- * that is EOT on ASCII;  ST on EBCDIC */
-#  define toCTRL(c)    (toUPPER(NATIVE_TO_LATIN1(c)) ^ 64)
+/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII,
+ * except that they don't necessarily mean the same characters, e.g. CTRL-D is
+ * 4 on both systems, but that is EOT on ASCII;  ST on EBCDIC.
+ * '?' is special-cased on EBCDIC to APC, which is the control there that is
+ * the outlier from the block that contains the other controls, just like
+ * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0
+ * block.  If it weren't special cased, it would yield a non-control.
+ * The conversion works both ways, so CTRL('D') is 4, and CTRL(4) is D, etc. */
+#ifndef EBCDIC
+#  define toCTRL(c)    (toUPPER(c) ^ 64)
+#else
+#  define toCTRL(c)    ((c) == '?'                               \
+                        ? LATIN1_TO_NATIVE(0x9F)                 \
+                        : (c) == LATIN1_TO_NATIVE(0x9F)          \
+                          ? '?'                                  \
+                          : (NATIVE_TO_LATIN1(toUPPER(c)) ^ 64))
+#endif
  
  /* Line numbers are unsigned, 32 bits. */
  typedef U32 line_t;
diff --git a/pod/perlebcdic.pod b/pod/perlebcdic.pod

index af0e05d..36f93e8 100644 (file)
--- a/pod/perlebcdic.pod
+++ b/pod/perlebcdic.pod
@@ -789,7 +789,7 @@ in the ASCII table is that they can "literally" be constructed
  as control characters in perl, e.g. C<(chr(0)> eq C<\c@>)>
  C<(chr(1)> eq C<\cA>)>, and so on.  Perl on EBCDIC platforms has been
  ported to take C<\c@> to chr(0) and C<\cA> to chr(1), etc. as well, but the
-thirty three characters that result depend on which code page you are
+characters that result depend on which code page you are
  using.  The table below uses the standard acronyms for the controls.
  The POSIX-BC and 1047 sets are
  identical throughout this range and differ from the 0037 set at only
@@ -799,10 +799,12 @@ platforms and cannot be generated as a C<"\c.letter."> control character on
  0037 platforms.  Note also that C<\c\> cannot be the final element in a string
  or regex, as it will absorb the terminator.   But C<\c\I<X>> is a C<FILE
  SEPARATOR> concatenated with I<X> for all I<X>.
+The outlier C<\c?> on ASCII, which yields a non-C0 control C<DEL>,
+yields the outlier control C<APC> on EBCDIC, the one that isn't in the
+block of contiguous controls.
  
   chr   ord   8859-1    0037    1047 && POSIX-BC
   -----------------------------------------------------------------------
- \c?   127   <DEL>       "            "
   \c@     0   <NUL>     <NUL>        <NUL>
   \cA     1   <SOH>     <SOH>        <SOH>
   \cB     2   <STX>     <STX>        <STX>
@@ -835,6 +837,12 @@ SEPARATOR> concatenated with I<X> for all I<X>.
   \c]    29   <GS>      <GS>         <GS>
   \c^    30   <RS>      <RS>         <RS>
   \c_    31   <US>      <US>         <US>
+ \c?    *    <DEL>     <APC>        <APC>
+
+C<*> Note: C<\c?> maps to ordinal 127 (C<DEL>) on ASCII platforms, but
+since ordinal 127 is a not a control character on EBCDIC machines,
+C<\c?> instead maps to C<APC>, which is 255 in 0037 and 1047, and 95 in
+POSIX-BC.
  
  =head1 FUNCTION DIFFERENCES
  
diff --git a/t/op/chars.t b/t/op/chars.t

index d26a632..3fa9b8f 100644 (file)
--- a/t/op/chars.t
+++ b/t/op/chars.t
@@ -77,6 +77,14 @@ is (ord($c), 30, '\c^');
  $c = "\c_";
  is (ord($c), 31, '\c_');
  $c = "\c?";
-is (ord($c), 127, '\c?');
+
+# '\c?' is an outlier, and is treated differently on each platform.
+# It's DEL on ASCII, and APC on EBCDIC
+is (ord($c), ((ord('^') == 95 || ord('^') == 175) # 1047 or 0037
+               ? 255
+               : ord('^') == 106    # Posix-BC
+                 ? 95
+                 : 127),
+              '\c?');
  $c = '';
  is (ord($c), 0, 'ord("") is 0');
author	Karl Williamson <public@khwilliamson.com>
	Sat, 13 Apr 2013 17:41:04 +0000 (11:41 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Wed, 5 Feb 2014 22:47:04 +0000 (15:47 -0700)
ext/B/B.pm		patch \| blob \| history
handy.h		patch \| blob \| history
pod/perlebcdic.pod		patch \| blob \| history
t/op/chars.t		patch \| blob \| history