# The regex below corresponds to the isCONTROLVAR macro
# from toke.c
- $name =~ s/^([\cA-\cZ\c\\c[\c]\c?\c_\c^])/"^".
- chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e;
+ $name =~ s/^\c?/^?/
+ or $name =~ s/^([\cA-\cZ\c\\c[\c]\c_\c^])/
+ "^" . chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e;
# When we say unicode_to_native we really mean ascii_to_native,
- # which matters iff this is a non-ASCII platform (EBCDIC).
+ # which matters iff this is a non-ASCII platform (EBCDIC). '\c?' would
+ # not have to be special cased, except for non-ASCII.
return $name;
}
#define isALNUMC_utf8(p) isALPHANUMERIC_utf8(p)
#define isALNUMC_LC_utf8(p) isALPHANUMERIC_LC_utf8(p)
-/* This conversion works both ways, strangely enough. On EBCDIC platforms,
- * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't
- * necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but
- * that is EOT on ASCII; ST on EBCDIC */
-# define toCTRL(c) (toUPPER(NATIVE_TO_LATIN1(c)) ^ 64)
+/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII,
+ * except that they don't necessarily mean the same characters, e.g. CTRL-D is
+ * 4 on both systems, but that is EOT on ASCII; ST on EBCDIC.
+ * '?' is special-cased on EBCDIC to APC, which is the control there that is
+ * the outlier from the block that contains the other controls, just like
+ * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0
+ * block. If it weren't special cased, it would yield a non-control.
+ * The conversion works both ways, so CTRL('D') is 4, and CTRL(4) is D, etc. */
+#ifndef EBCDIC
+# define toCTRL(c) (toUPPER(c) ^ 64)
+#else
+# define toCTRL(c) ((c) == '?' \
+ ? LATIN1_TO_NATIVE(0x9F) \
+ : (c) == LATIN1_TO_NATIVE(0x9F) \
+ ? '?' \
+ : (NATIVE_TO_LATIN1(toUPPER(c)) ^ 64))
+#endif
/* Line numbers are unsigned, 32 bits. */
typedef U32 line_t;
as control characters in perl, e.g. C<(chr(0)> eq C<\c@>)>
C<(chr(1)> eq C<\cA>)>, and so on. Perl on EBCDIC platforms has been
ported to take C<\c@> to chr(0) and C<\cA> to chr(1), etc. as well, but the
-thirty three characters that result depend on which code page you are
+characters that result depend on which code page you are
using. The table below uses the standard acronyms for the controls.
The POSIX-BC and 1047 sets are
identical throughout this range and differ from the 0037 set at only
0037 platforms. Note also that C<\c\> cannot be the final element in a string
or regex, as it will absorb the terminator. But C<\c\I<X>> is a C<FILE
SEPARATOR> concatenated with I<X> for all I<X>.
+The outlier C<\c?> on ASCII, which yields a non-C0 control C<DEL>,
+yields the outlier control C<APC> on EBCDIC, the one that isn't in the
+block of contiguous controls.
chr ord 8859-1 0037 1047 && POSIX-BC
-----------------------------------------------------------------------
- \c? 127 <DEL> " "
\c@ 0 <NUL> <NUL> <NUL>
\cA 1 <SOH> <SOH> <SOH>
\cB 2 <STX> <STX> <STX>
\c] 29 <GS> <GS> <GS>
\c^ 30 <RS> <RS> <RS>
\c_ 31 <US> <US> <US>
+ \c? * <DEL> <APC> <APC>
+
+C<*> Note: C<\c?> maps to ordinal 127 (C<DEL>) on ASCII platforms, but
+since ordinal 127 is a not a control character on EBCDIC machines,
+C<\c?> instead maps to C<APC>, which is 255 in 0037 and 1047, and 95 in
+POSIX-BC.
=head1 FUNCTION DIFFERENCES
$c = "\c_";
is (ord($c), 31, '\c_');
$c = "\c?";
-is (ord($c), 127, '\c?');
+
+# '\c?' is an outlier, and is treated differently on each platform.
+# It's DEL on ASCII, and APC on EBCDIC
+is (ord($c), ((ord('^') == 95 || ord('^') == 175) # 1047 or 0037
+ ? 255
+ : ord('^') == 106 # Posix-BC
+ ? 95
+ : 127),
+ '\c?');
$c = '';
is (ord($c), 0, 'ord("") is 0');