Add warnings for "\08", /\017/
authorKarl Williamson <public@khwilliamson.com>
Sun, 13 Jan 2013 20:33:22 +0000 (13:33 -0700)
committerKarl Williamson <public@khwilliamson.com>
Mon, 14 Jan 2013 16:05:08 +0000 (09:05 -0700)
This was discussed in thread
http://perl.markmail.org/thread/avtzvtpzemvg2ki2
but I never got around to this portion of the consensus, until now.

I did a cpan grep
http://grep.cpan.me/?q=%28^|[^\\]%29\\[0-7]{1%2C2}[8-9]&page=1

and eyeballing the results, saw three cases where this warning might
show up; one of which was for EBCDIC.  The others looked to be false
positives, such as in .css files.

dquote_static.c
embed.fnc
embed.h
pod/perldelta.pod
pod/perldiag.pod
proto.h
regcomp.c
t/re/reg_mesg.t
toke.c

index 61845cc..5a22993 100644 (file)
@@ -297,6 +297,35 @@ S_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg,
     return TRUE;
 }
 
+STATIC char*
+S_form_short_octal_warning(pTHX_
+                           const char * const s, /* Points to first non-octal */
+                           const STRLEN len      /* Length of octals string, so
+                                                    (s-len) points to first
+                                                    octal */
+) {
+    /* Return a character string consisting of a warning message for when a
+     * string constant in octal is weird, like "\078".  */
+
+    const char * sans_leading_zeros = s - len;
+
+    PERL_ARGS_ASSERT_FORM_SHORT_OCTAL_WARNING;
+
+    assert(*s == '8' || *s == '9');
+
+    /* Remove the leading zeros, retaining one zero so won't be zero length */
+    while (*sans_leading_zeros == '0') sans_leading_zeros++;
+    if (sans_leading_zeros == s) {
+        sans_leading_zeros--;
+    }
+
+    return Perl_form(aTHX_
+                     "'%.*s' resolved to '\\o{%.*s}%c'",
+                     (int) (len + 2), s - len - 1,
+                     (int) (s - sans_leading_zeros), sans_leading_zeros,
+                     *s);
+}
+
 /*
  * Local variables:
  * c-indentation-style: bsd
index 730691a..b3b931c 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -751,6 +751,8 @@ EMiR        |bool   |grok_bslash_x  |NN char** s|NN UV* uv           \
                                |const bool strict               \
                                |const bool silence_non_portable \
                                |const bool utf8
+EMsPR  |char*|form_short_octal_warning|NN const char * const s  \
+                               |const STRLEN len
 #endif
 Apd    |UV     |grok_hex       |NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result
 Apd    |int    |grok_number    |NN const char *pv|STRLEN len|NULLOK UV *valuep
diff --git a/embed.h b/embed.h
index 8289cec..8e8279f 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define _core_swash_init(a,b,c,d,e,f,g)        Perl__core_swash_init(aTHX_ a,b,c,d,e,f,g)
 #  endif
 #  if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C)
+#define form_short_octal_warning(a,b)  S_form_short_octal_warning(aTHX_ a,b)
 #define grok_bslash_c(a,b,c)   S_grok_bslash_c(aTHX_ a,b,c)
 #define grok_bslash_o(a,b,c,d,e,f,g)   S_grok_bslash_o(aTHX_ a,b,c,d,e,f,g)
 #define grok_bslash_x(a,b,c,d,e,f,g)   S_grok_bslash_x(aTHX_ a,b,c,d,e,f,g)
index ea4db73..a43cf72 100644 (file)
@@ -210,8 +210,13 @@ XXX L<message|perldiag/"message">
 
 =item *
 
+L<'%s' resolved to '\o{%s}%d'|perldiag/"'%s' resolved to '\o{%s}%d'">
+
+=item *
+
 XXX L<message|perldiag/"message">
 
+
 =back
 
 =head2 Changes to Existing Diagnostics
index 9e6ee34..2be0f79 100644 (file)
@@ -4313,6 +4313,15 @@ terminates.  You might use ^# instead.  See L<perlform>.
 search list.  So the additional elements in the replacement list
 are meaningless.
 
+=item '%s' resolved to '\o{%s}%d'
+
+(W misc, regexp)  You wrote something like C<\08>, or C<\179> in a
+double-quotish string.  All but the last digit is treated as a single
+character, specified in octal.  The last digit is the next character in
+the string.  To tell Perl that this is indeed what you want, you can use
+the C<\o{ }> syntax, or use exactly three digits to specify the octal
+for the character.
+
 =item Reversed %s= operator
 
 (W syntax) You wrote your assignment operator backwards.  The = must
diff --git a/proto.h b/proto.h
index b4d81d6..7428380 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6788,6 +6788,13 @@ PERL_CALLCONV SV*        Perl__core_swash_init(pTHX_ const char* pkg, const char* name,
 
 #endif
 #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C)
+STATIC char*   S_form_short_octal_warning(pTHX_ const char * const s, const STRLEN len)
+                       __attribute__warn_unused_result__
+                       __attribute__pure__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_FORM_SHORT_OCTAL_WARNING      \
+       assert(s)
+
 STATIC char    S_grok_bslash_c(pTHX_ const char source, const bool utf8, const bool output_warning)
                        __attribute__warn_unused_result__;
 
index c0a37b2..05e9fe5 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -535,6 +535,13 @@ static const scan_data_t zero_scan_data =
     Simple_vFAIL4(m, a1, a2, a3);                      \
 } STMT_END
 
+/* m is not necessarily a "literal string", in this macro */
+#define reg_warn_non_literal_string(loc, m) STMT_START {                \
+    const IV offset = loc - RExC_precomp;                               \
+    Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
+            m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
+} STMT_END
+
 #define        ckWARNreg(loc,m) STMT_START {                                   \
     const IV offset = loc - RExC_precomp;                              \
     Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,     \
@@ -10700,6 +10707,15 @@ tryagain:
                                REQUIRE_UTF8;
                            }
                            p += numlen;
+                            if (SIZE_ONLY   /* like \08, \178 */
+                                && numlen < 3
+                                && p < RExC_end
+                                && isDIGIT(*p) && ckWARN(WARN_REGEXP))
+                            {
+                               reg_warn_non_literal_string(
+                                         p + 1,
+                                         form_short_octal_warning(p, numlen));
+                            }
                        }
                         else {  /* Not to be treated as an octal constant, go
                                    find backref */
@@ -12166,11 +12182,25 @@ parseit:
                     numlen = (strict) ? 4 : 3;
                     value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                    RExC_parse += numlen;
-                    if (strict) {
-                        if (numlen != 3) {
+                    if (numlen != 3) {
+                        SAVEFREESV(listsv); /* In case warnings are fatalized */
+                        if (strict) {
                             RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
                             vFAIL("Need exactly 3 octal digits");
                         }
+                        else if (! SIZE_ONLY /* like \08, \178 */
+                                 && numlen < 3
+                                 && RExC_parse < RExC_end
+                                 && isDIGIT(*RExC_parse)
+                                 && ckWARN(WARN_REGEXP))
+                        {
+                            SAVEFREESV(RExC_rx_sv);
+                            reg_warn_non_literal_string(
+                                 RExC_parse + 1,
+                                 form_short_octal_warning(RExC_parse, numlen));
+                            (void)ReREFCNT_inc(RExC_rx_sv);
+                        }
+                        SvREFCNT_inc_simple_void_NN(listsv);
                     }
                    if (PL_encoding && value < 0x100)
                        goto recode_encoding;
index 7487421..14e9ace 100644 (file)
@@ -165,6 +165,10 @@ my @warning = (
     'm/[\pM-x]/' => 'False [] range "\pM-" in regex; marked by {#} in m/[\pM-{#}x]/',
     "m'\\y'"     => 'Unrecognized escape \y passed through in regex; marked by {#} in m/\y{#}/',
     '/x{3,1}/'   => 'Quantifier {n,m} with n > m can\'t match in regex; marked by {#} in m/x{3,1}{#}/',
+    '/\08/' => '\'\08\' resolved to \'\o{0}8\' in regex; marked by {#} in m/\08{#}/',
+    '/\018/' => '\'\018\' resolved to \'\o{1}8\' in regex; marked by {#} in m/\018{#}/',
+    '/[\08]/' => '\'\08\' resolved to \'\o{0}8\' in regex; marked by {#} in m/[\08{#}]/',
+    '/[\018]/' => '\'\018\' resolved to \'\o{1}8\' in regex; marked by {#} in m/[\018{#}]/',
     '/(?[ \t ])/' => 'The regex_sets feature is experimental in regex; marked by {#} in m/(?[{#} \t ])/',
 );
 
diff --git a/toke.c b/toke.c
index 8c53580..987a68d 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3277,10 +3277,16 @@ S_scan_const(pTHX_ char *start)
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
                {
-                    I32 flags = 0;
+                    I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
                     STRLEN len = 3;
                    uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
                    s += len;
+                    if (len < 3 && s < send && isDIGIT(*s)
+                        && ckWARN(WARN_MISC))
+                    {
+                        Perl_warner(aTHX_ packWARN(WARN_MISC),
+                                    "%s", form_short_octal_warning(s, len));
+                    }
                }
                goto NUM_ESCAPE_INSERT;