ApdR |I32 |looks_like_number|NN SV *const sv
Apd |UV |grok_bin |NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result
EXpR |char |grok_bslash_c |const char source|const bool output_warning
+EXpR |char* |grok_bslash_o |NN const char* s|NN UV* uv|NN STRLEN* len|const bool output_warning
Apd |UV |grok_hex |NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result
Apd |int |grok_number |NN const char *pv|STRLEN len|NULLOK UV *valuep
ApdR |bool |grok_numeric_radix|NN const char **sp|NN const char *send
#define grok_bin Perl_grok_bin
#if defined(PERL_CORE) || defined(PERL_EXT)
#define grok_bslash_c Perl_grok_bslash_c
+#define grok_bslash_o Perl_grok_bslash_o
#endif
#define grok_hex Perl_grok_hex
#define grok_number Perl_grok_number
#define grok_bin(a,b,c,d) Perl_grok_bin(aTHX_ a,b,c,d)
#if defined(PERL_CORE) || defined(PERL_EXT)
#define grok_bslash_c(a,b) Perl_grok_bslash_c(aTHX_ a,b)
+#define grok_bslash_o(a,b,c,d) Perl_grok_bslash_o(aTHX_ a,b,c,d)
#endif
#define grok_hex(a,b,c,d) Perl_grok_hex(aTHX_ a,b,c,d)
#define grok_number(a,b,c) Perl_grok_number(aTHX_ a,b,c)
Perl_looks_like_number
Perl_grok_bin
Perl_grok_bslash_c
+Perl_grok_bslash_o
Perl_grok_hex
Perl_grok_number
Perl_grok_numeric_radix
[ List each enhancement as a =head2 entry ]
+=head2 \o{...}
+
+The escape sequence C<"\o"> in double-quotish contexts is now defined. It
+must be followed by braces enclosing an octal number of at least one digit. It
+means the character whose ordinal value is that octal number. This construct
+allows large octal ordinals beyond the current max of 0777 to be represented.
+It also allows you to specify a character in octal which can safely be
+concatenated with other regex snippets without danger of changing its meaning,
+and one which won't ever be confused with being a backreference to a regex
+capture group. See L<perlre/Capture groups>
+
=head2 C<\N{I<name>}> and C<charnames> enhancements
C<\N{}> and C<charnames::vianame> now know about the abbreviated character
all double-quotish contexts have the same behavior, namely to be equivalent to
C<\x{100}> - C<\x{1FF}>, with no deprecation warning. Use of these values in the
command line option C<"-0"> retains the current meaning to slurp input files
-whole; previously, this was documented only for C<"-0777">.
+whole; previously, this was documented only for C<"-0777">. It is recommended,
+however, because of various ambiguities, to use the new L</\o{...}> construct
+to represent characters in octal.
=head1 Deprecations
This modifier does not change the requirement that the brace immediately follow
the C<\N>.
+=item Missing braces on \o{}
+
+(F) A C<\o> must be followed immediately by a C<{> in double-quotish context.
+
=item Missing comma after first argument to %s function
(F) While certain functions allow you to specify a filehandle or an
F<SYS$TIMEZONE_DIFFERENTIAL> to translate to the number of seconds which
need to be added to UTC to get local time.
+=item Non-octal character '%c'. Resolved as "%s"
+
+(W digit) In parsing an octal numeric constant, a character was unexpectedly
+encountered that isn't octal. The resulting value is as indicated.
+
=item Non-string passed as bitmask
(W misc) A number has been passed as a bitmask argument to select().
the meantime, try using scientific notation (e.g. "1e6" instead of
"1_000_000").
+=item Number with no digits
+
+(F) Perl was looking for a number but found nothing that looked like a number.
+This happens, for example with C<\o{}>, with no number between the braces.
+
=item Octal number in vector unsupported
(F) Numbers with a leading C<0> are not currently allowed in vectors.
\f form feed (FF)
\a alarm (bell) (BEL)
\e escape (think troff) (ESC)
- \033 octal char (example: ESC)
\cK control char (example: VT)
\x{}, \x00 character whose ordinal is the given hexadecimal number
\N{name} named Unicode character
\N{U+263D} Unicode character (example: FIRST QUARTER MOON)
+ \o{}, \000 character whose ordinal is the given octal number
\l lowercase next char (think vi)
\u uppercase next char (think vi)
\L lowercase till \E (think vi)
Those not usable within a bracketed character class (like C<[\da-z]>) are marked
as C<Not in [].>
- \000 Octal escape sequence.
+ \000 Octal escape sequence. See also \o{}.
\1 Absolute backreference. Not in [].
\a Alarm or bell.
\A Beginning of string. Not in [].
\n (Logical) newline character.
\N Any character but newline. Experimental. Not in [].
\N{} Named or numbered (Unicode) character.
+ \o{} Octal escape sequence.
\p{}, \pP Character with the given Unicode property.
\P{}, \PP Character without the given Unicode property.
\Q Quotemeta till \E. Not in [].
=head3 Octal escapes
-Octal escapes consist of a backslash followed by three octal digits
-matching the code point of the character you want to use. (In some contexts,
-two or even one octal digits are also accepted, sometimes with a warning.) This
-allows for 512 characters (C<\000> up to C<\777>) that can be expressed this
-way. Enough in pre-Unicode days,
-but most Unicode characters cannot be escaped this way.
+There are two forms of octal escapes. Each is used to specify a character by
+its ordinal, specified in octal notation.
+
+One form, available starting in Perl 5.14 looks like C<\o{...}>, where the dots
+represent one or more octal digits. It can be used for any Unicode character.
+
+It was introduced to avoid the potential problems with the other form,
+available in all Perls. That form consists of a backslash followed by three
+octal digits. One problem with this form is that it can look exactly like an
+old-style backreference (see
+L</Disambiguation rules between old-style octal escapes and backreferences>
+below.) You can avoid this by making the first of the three digits always a
+zero, but that makes \077 the largest ordinal unambiguously specifiable by this
+form.
+
+In some contexts, a backslash followed by two or even one octal digits may be
+interpreted as an octal escape, sometimes with a warning, and because of some
+bugs, sometimes with surprising results. Also, if you are creating a regex
+out of smaller snippets concatentated together, and you use fewer than three
+digits, the beginning of one snippet may be interpreted as adding digits to the
+ending of the snippet before it. See L</Absolute referencing> for more
+discussion and examples of the snippet problem.
Note that a character that is expressed as an octal escape is considered
as a character without special meaning by the regex engine, and will match
"as is".
-=head4 Examples (assuming an ASCII platform)
+To summarize, the C<\o{}> form is always safe to use, and the other form is
+safe to use for ordinals up through \077 when you use exactly three digits to
+specify them.
- $str = "Perl";
- $str =~ /\120/; # Match, "\120" is "P".
- $str =~ /\120+/; # Match, "\120" is "P", it is repeated at least once
- $str =~ /P\053/; # No match, "\053" is "+" and taken literally.
+Mnemonic: I<0>ctal or I<o>ctal.
-=head4 Caveat
+=head4 Examples (assuming an ASCII platform)
-Octal escapes potentially clash with old-style backreferences (see L</Absolute
-referencing> below). They both consist of a backslash followed by numbers. So
-Perl has to use heuristics to determine whether it is a backreference or an
-octal escape. You can avoid ambiguity by using the C<\g> form for
-backreferences, and by beginning octal escapes with a "0". (Since octal
-escapes are 3 digits, this latter method works only up to C<\077>.) In the
-absence of C<\g>, Perl uses the following rules:
+ $str = "Perl";
+ $str =~ /\o{120}/; # Match, "\120" is "P".
+ $str =~ /\120/; # Same.
+ $str =~ /\o{120}+/; # Match, "\120" is "P", it's repeated at least once
+ $str =~ /\120+/; # Same.
+ $str =~ /P\053/; # No match, "\053" is "+" and taken literally.
+ /\o{23073}/ # Black foreground, white background smiling face.
+ /\o{4801234567}/ # Raises a warning, and yields chr(4)
+
+=head4 Disambiguation rules between old-style octal escapes and backreferences
+
+Octal escapes of the C<\000> form outside of bracketed character classes
+potentially clash with old-style backreferences. (see L</Absolute referencing>
+below). They both consist of a backslash followed by numbers. So Perl has to
+use heuristics to determine whether it is a backreference or an octal escape.
+Perl uses the following rules to disambiguate:
=over 4
$pat .= ")" x 999;
/^($pat)\1000$/; # Matches 'aa'; there are 1000 capture groups.
/^$pat\1000$/; # Matches 'a@0'; there are 999 capture groups
- # and \1000 is seen as \100 (a '@') and a '0'.
+ # and \1000 is seen as \100 (a '@') and a '0'
=back
+You can the force a backreference interpretation always by using the C<\g{...}>
+form. You can the force an octal interpretation always by using the C<\o{...}>
+form, or for numbers up through \077 (= 63 decimal), by using three digits,
+beginning with a "0".
+
=head3 Hexadecimal escapes
-Hexadecimal escapes start with C<\x> and are then either followed by a
-two digit hexadecimal number, or a hexadecimal number of arbitrary length
-surrounded by curly braces. The hexadecimal number is the code point of
-the character you want to express.
+Like octal escapes, there are two forms of hexadecimal escapes, but both start
+with the same thing, C<\x>. This is followed by either exactly two hexadecimal
+digits forming a number, or a hexadecimal number of arbitrary length surrounded
+by curly braces. The hexadecimal number is the code point of the character you
+want to express.
-Note that a character that is expressed as a hexadecimal escape is considered
+Note that a character that is expressed as one of these escapes is considered
as a character without special meaning by the regex engine, and will match
"as is".
"1000\t2000" =~ m(0\t2) # matches
"1000\n2000" =~ /0\n20/ # matches
"1000\t2000" =~ /\000\t2/ # doesn't match, "0" ne "\000"
- "cat" =~ /\143\x61\x74/ # matches in ASCII, but a weird way to spell cat
+ "cat" =~ /\o{143}\x61\x74/ # matches in ASCII, but a weird way
+ # to spell cat
If you've been around Perl a while, all this talk of escape sequences
may seem familiar. Similar escape sequences are used in double-quoted
to know 1) how to represent Unicode characters in a regexp and 2) that
a matching operation will treat the string to be searched as a sequence
of characters, not bytes. The answer to 1) is that Unicode characters
-greater than C<chr(255)> are represented using the C<\x{hex}> notation,
-because the \0 octal and \x hex (without curly braces) don't go further
-than 255.
+greater than C<chr(255)> are represented using the C<\x{hex}> notation, because
+\x hex (without curly braces) doesn't go further than 255. (Starting in Perl
+5.14) if you're an octal fan, you can also use C<\o{oct}>.
/\x{263a}/; # match a Unicode smiley face :)
PERL_CALLCONV char Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning)
__attribute__warn_unused_result__;
+PERL_CALLCONV char* Perl_grok_bslash_o(pTHX_ const char* s, UV* uv, STRLEN* len, const bool output_warning)
+ __attribute__warn_unused_result__
+ __attribute__nonnull__(pTHX_1)
+ __attribute__nonnull__(pTHX_2)
+ __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT_GROK_BSLASH_O \
+ assert(s); assert(uv); assert(len)
+
PERL_CALLCONV UV Perl_grok_hex(pTHX_ const char* start, STRLEN* len_p, I32* flags, NV *result)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
register UV ender;
register char *p;
char *s;
+ char *error_msg;
STRLEN foldlen;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
ender = ASCII_TO_NATIVE('\007');
p++;
break;
+ case 'o':
+ {
+ STRLEN brace_len = len;
+ if ((error_msg = grok_bslash_o(p,
+ &ender,
+ &brace_len,
+ SIZE_ONLY))
+ != NULL)
+ {
+ vFAIL(error_msg);
+ }
+ p += brace_len;
+ if (PL_encoding && ender < 0x100) {
+ goto recode_encoding;
+ }
+ if (ender > 0xff) {
+ RExC_utf8 = 1;
+ }
+ break;
+ }
case 'x':
if (*++p == '{') {
char* const e = strchr(p, '}');
parseit:
while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
+ char* error_msg;
charclassloop:
case 'b': value = '\b'; break;
case 'e': value = ASCII_TO_NATIVE('\033');break;
case 'a': value = ASCII_TO_NATIVE('\007');break;
+ case 'o':
+ RExC_parse--; /* function expects to be pointed at the 'o' */
+ if ((error_msg = grok_bslash_o(RExC_parse,
+ &value,
+ &numlen,
+ SIZE_ONLY))
+ != NULL)
+ {
+ vFAIL(error_msg);
+ }
+ RExC_parse += numlen;
+ if (PL_encoding && value < 0x100) {
+ goto recode_encoding;
+ }
+ break;
case 'x':
if (*RExC_parse == '{') {
I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
Useless (?o) - use /o modifier in regex; marked by <-- HERE in m/(?o <-- HERE gc)/ at - line 12.
Useless (?g) - use /g modifier in regex; marked by <-- HERE in m/(?og <-- HERE c)/ at - line 12.
Useless (?c) - use /gc modifier in regex; marked by <-- HERE in m/(?ogc <-- HERE )/ at - line 12.
+########
+# regcomp.c [S_regatom]
+$a = qr/\o{/;
+EXPECT
+Missing right brace on \o{ in regex; marked by <-- HERE in m/\ <-- HERE o{/ at - line 2.
+########
+# regcomp.c [S_regatom]
+$a = qr/\o/;
+EXPECT
+Missing braces on \o{} in regex; marked by <-- HERE in m/\ <-- HERE o/ at - line 2.
+########
+# regcomp.c [S_regatom]
+$a = qr/\o{}/;
+EXPECT
+Number with no digits in regex; marked by <-- HERE in m/\ <-- HERE o{}/ at - line 2.
+########
+# regcomp.c [S_regclass]
+$a = qr/[\o{]/;
+EXPECT
+Missing right brace on \o{ in regex; marked by <-- HERE in m/[\ <-- HERE o{]/ at - line 2.
+########
+# regcomp.c [S_regclass]
+$a = qr/[\o]/;
+EXPECT
+Missing braces on \o{} in regex; marked by <-- HERE in m/[\ <-- HERE o]/ at - line 2.
+########
+# regcomp.c [S_regclass]
+$a = qr/[\o{}]/;
+EXPECT
+Number with no digits in regex; marked by <-- HERE in m/[\ <-- HERE o{}]/ at - line 2.
Use of := for an empty attribute list is deprecated at - line 38.
Use of := for an empty attribute list is deprecated at - line 41.
Use of := for an empty attribute list is deprecated at - line 42.
+########
+# toke.c
+use warnings 'syntax' ;
+my $a = "\o";
+my $a = "\o{";
+my $a = "\o{}";
+no warnings 'syntax' ;
+my $a = "\o";
+my $a = "\o{";
+my $a = "\o{}";
+EXPECT
+Missing braces on \o{} at - line 3, within string
+Missing right brace on \o{ at - line 4, within string
+Number with no digits at - line 5, within string
+BEGIN not safe after errors--compilation aborted at - line 6.
+########
+# toke.c
+use warnings 'digit' ;
+my $a = "\o{1238456}";
+no warnings 'digit' ;
+my $a = "\o{1238456}";
+EXPECT
+Non-octal character '8'. Resolved as "\o{123}" at - line 3.
@INC = '../lib';
}
-print q(1..26
+print q(1..29
);
# This is() function is written to avoid ""
is ("\400", chr 0x100);
is ("\600", chr 0x180);
is ("\777", chr 0x1FF);
+is ("a\o{120}b", "a" . chr(0x50) . "b");
+is ("a\o{400}b", "a" . chr(0x100) . "b");
+is ("a\o{1000}b", "a" . chr(0x200) . "b");
# These kludged tests should change when we remove the temporary fatal error
# in util.c for "\c{". And, the warning there should probably not be
[a\400] \x{100} y $& \x{100}
[b\600] \x{180} y $& \x{180}
[c\777] \x{1FF} y $& \x{1FF}
+\o{120} \x{50} y $& \x{50}
+\o{400} \x{100} y $& \x{100}
+\o{1000} \x{200} y $& \x{200}
+[a\o{120}] \x{50} y $& \x{50}
+[a\o{400}] \x{100} y $& \x{100}
+[a\o{1000}] \x{200} y $& \x{200}
# vim: softtabstop=0 noexpandtab
}
goto NUM_ESCAPE_INSERT;
+ /* eg. \o{24} indicates the octal constant \024 */
+ case 'o':
+ {
+ STRLEN len;
+
+ char* error = grok_bslash_o(s, &uv, &len, 1);
+ s += len;
+ if (error) {
+ yyerror(error);
+ continue;
+ }
+ goto NUM_ESCAPE_INSERT;
+ }
+
/* eg. \x24 indicates the hex constant 0x24 */
case 'x':
++s;
char
Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning)
{
-
+
U8 result;
if (! isASCII(source)) {
return result;
}
+char *
+Perl_grok_bslash_o(pTHX_ const char *s, UV *uv, STRLEN *len, const bool output_warning)
+{
+
+/* Documentation to be supplied when interface nailed down finally
+ * This returns NULL on success, otherwise a pointer to an internal constant
+ * error message. On input:
+ * s points to a string that begins with o, and the previous character was
+ * a backslash.
+ * uv points to a UV that will hold the output value
+ * len will point to the next character in the string past the end of this
+ * construct
+ * output_warning says whether to output any warning messages, or suppress
+ * them
+ */
+ char* e;
+ STRLEN numbers_len;
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX
+ /* XXX Until the message is improved in grok_oct, handle errors
+ * ourselves */
+ | PERL_SCAN_SILENT_ILLDIGIT;
+
+ PERL_ARGS_ASSERT_GROK_BSLASH_O;
+
+
+ assert(*s == 'o');
+ s++;
+
+ if (*s != '{') {
+ *len = 1; /* Move past the o */
+ return "Missing braces on \\o{}";
+ }
+
+ e = strchr(s, '}');
+ if (!e) {
+ *len = 2; /* Move past the o{ */
+ return "Missing right brace on \\o{";
+ }
+
+ /* Return past the '}' no matter what is inside the braces */
+ *len = e - s + 2; /* 2 = 1 for the o + 1 for the '}' */
+
+ s++; /* Point to first digit */
+
+ numbers_len = e - s;
+ if (numbers_len == 0) {
+ return "Number with no digits";
+ }
+
+ *uv = NATIVE_TO_UNI(grok_oct(s, &numbers_len, &flags, NULL));
+ /* Note that if has non-octal, will ignore everything starting with that up
+ * to the '}' */
+
+ if (output_warning && numbers_len != (STRLEN) (e - s)) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT),
+ /* diag_listed_as: Non-octal character '%c'. Resolved as "%s" */
+ "Non-octal character '%c'. Resolved as \"\\o{%.*s}\"",
+ *(s + numbers_len),
+ (int) numbers_len,
+ s);
+ }
+
+ return NULL;
+}
+
/* To workaround core dumps from the uninitialised tm_zone we get the
* system to give us a reasonable struct to copy. This fix means that
* strftime uses the tm_zone and tm_gmtoff values returned by