Comment additions, typos, white-space.

author Karl Williamson <public@khwilliamson.com>

Fri, 18 Nov 2011 15:36:43 +0000 (08:36 -0700)

committer Karl Williamson <public@khwilliamson.com>

Fri, 13 Jan 2012 16:58:32 +0000 (09:58 -0700)
author Karl Williamson <public@khwilliamson.com>
Fri, 18 Nov 2011 15:36:43 +0000 (08:36 -0700)
committer Karl Williamson <public@khwilliamson.com>
Fri, 13 Jan 2012 16:58:32 +0000 (09:58 -0700)
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 7b7af71..03f122b 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -15633,7 +15633,7 @@ sub make_property_test_script() {
  # others except DAge.txt (as data in an extracted file can be over-ridden by
  # the non-extracted.  Some other files depend on data derived from an earlier
  # file, like UnicodeData requires data from Jamo, and the case changing and
-# folding requires data from Unicode.  Mostly, it safest to order by first
+# folding requires data from Unicode.  Mostly, it is safest to order by first
  # version releases in (except the Jamo).  DAge.txt is read before the
  # extracted ones because of the rarely used feature $compare_versions.  In the
  # unlikely event that there were ever an extracted file that contained the Age
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl

index 84a8167..95758f7 100644 (file)
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -105,9 +105,8 @@ sub _loose_name ($) {
  
          if ($type)
          {
-
              # Verify that this isn't a recursive call for this property.
-            # Can't use croak, as it may try to recurse here itself.
+            # Can't use croak, as it may try to recurse to here itself.
              my $class_type = $class . "::$type";
              if (grep { $_ eq $class_type } @recursed) {
                  CORE::die "panic: Infinite recursion in SWASHNEW for '$type'\n";
diff --git a/regcomp.c b/regcomp.c

index 4669c27..6e27f1e 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -5840,8 +5840,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
   * The 1th element is the first element beyond that not in the list.  In other
   * words, the first range is
   *  invlist[0]..(invlist[1]-1)
- * The other ranges follow.  Thus every element that is divisible by two marks
- * the beginning of a range that is in the list, and every element not
+ * The other ranges follow.  Thus every element whose index is divisible by two
+ * marks the beginning of a range that is in the list, and every element not
   * divisible by two marks the beginning of a range not in the list.  A single
   * element inversion list that contains the single code point N generally
   * consists of two elements
@@ -5922,7 +5922,8 @@ S_invlist_array(pTHX_ SV* const invlist)
  
      PERL_ARGS_ASSERT_INVLIST_ARRAY;
  
-    /* Must not be empty */
+    /* Must not be empty.  If these fail, you probably didn't check for <len>
+     * being non-zero before trying to get the array */
      assert(*get_invlist_len_addr(invlist));
      assert(*get_invlist_zero_addr(invlist) == 0
            || *get_invlist_zero_addr(invlist) == 1);
@@ -5948,7 +5949,8 @@ S_get_invlist_len_addr(pTHX_ SV* invlist)
  PERL_STATIC_INLINE UV
  S_invlist_len(pTHX_ SV* const invlist)
  {
-    /* Returns the current number of elements in the inversion list's array */
+    /* Returns the current number of elements stored in the inversion list's
+     * array */
  
      PERL_ARGS_ASSERT_INVLIST_LEN;
  
@@ -6059,7 +6061,6 @@ S_invlist_trim(pTHX_ SV* const invlist)
  
  /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
   * etc */
-
  #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
  #define PREV_ELEMENT_IN_INVLIST_SET(i) (! ELEMENT_IN_INVLIST_SET(i))
  
@@ -6105,7 +6106,7 @@ Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV
             }
             else {
                 /* But if the end is the maximum representable on the machine,
-                * just let the range that this would extend have no end */
+                * just let the range that this would extend to have no end */
                 invlist_set_len(invlist, len - 1);
             }
             return;
@@ -6145,7 +6146,7 @@ void
  Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
  {
      /* Take the union of two inversion lists and point 'result' to it.  If
-     * 'result' on input points to one of the two lists, the reference count to
+     * 'output' on input points to one of the two lists, the reference count to
       * that list will be decremented.
       * The basis for this comes from "Unicode Demystified" Chapter 13 by
       * Richard Gillam, published by Addison-Wesley, and explained at some
@@ -6191,8 +6192,7 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
         }
         else if (output != &b) {
             *output = invlist_clone(b);
-       }
-       /* else *output already = b; */
+       } /* else *output already = b; */
         return;
      }
      else if ((len_b = invlist_len(b)) == 0) {
@@ -6636,8 +6636,8 @@ S_invlist_clone(pTHX_ SV* const invlist)
  void
  Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
  {
-    /* Point result to an inversion list which consists of all elements in 'a'
-     * that aren't also in 'b' */
+    /* Point <result> to an inversion list which consists of all elements in
+     * <a> that aren't also in <b> */
  
      PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
  
@@ -6687,6 +6687,13 @@ S_invlist_iterinit(pTHX_ SV* invlist)    /* Initialize iterator for invlist */
  STATIC bool
  S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
  {
+    /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
+     * This call sets in <*start> and <*end>, the next range in <invlist>.
+     * Returns <TRUE> if successful and the next call will return the next
+     * range; <FALSE> if it was already at the end of the list.  If the
+     * latter, <*start> and <*end> are unchanged, and the next call to this
+     * function will start over at the beginning of the list */
+
      UV* pos = get_invlist_iter_addr(invlist);
      UV len = invlist_len(invlist);
      UV *array;
@@ -10469,10 +10476,10 @@ parseit:
             }
         }
  
-       /* Only the characters in this class that participate in folds need
-           * be checked.  Get the intersection of this class and all the
-           * possible characters that are foldable.  This can quickly narrow
-           * down a large class */
+       /* Only the characters in this class that participate in folds need be
+        * checked.  Get the intersection of this class and all the possible
+        * characters that are foldable.  This can quickly narrow down a large
+        * class */
         _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
  
         /* Now look at the foldable characters in this class individually */
@@ -10491,23 +10498,22 @@ parseit:
  
                 if (foldlen > (STRLEN)UNISKIP(f)) {
  
-                   /* Any multicharacter foldings (disallowed in
-                       * lookbehind patterns) require the following
-                       * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
-                       * E folds into "pq" and F folds into "rst", all other
-                       * characters fold to single characters.  We save away
-                       * these multicharacter foldings, to be later saved as
-                       * part of the additional "s" data. */
+                   /* Any multicharacter foldings (disallowed in lookbehind
+                    * patterns) require the following transform: [ABCDEF] ->
+                    * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
+                    * folds into "rst", all other characters fold to single
+                    * characters.  We save away these multicharacter foldings,
+                    * to be later saved as part of the additional "s" data. */
                     if (! RExC_in_lookbehind) {
                         U8* loc = foldbuf;
                         U8* e = foldbuf + foldlen;
  
-                       /* If any of the folded characters of this are in
-                           * the Latin1 range, tell the regex engine that
-                           * this can match a non-utf8 target string.  The
-                           * only multi-byte fold whose source is in the
-                           * Latin1 range (U+00DF) applies only when the
-                           * target string is utf8, or under unicode rules */
+                       /* If any of the folded characters of this are in the
+                        * Latin1 range, tell the regex engine that this can
+                        * match a non-utf8 target string.  The only multi-byte
+                        * fold whose source is in the Latin1 range (U+00DF)
+                        * applies only when the target string is utf8, or
+                        * under unicode rules */
                         if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
                             while (loc < e) {
  
@@ -10520,8 +10526,8 @@ parseit:
                                 if (UTF8_IS_INVARIANT(*loc)
                                     || UTF8_IS_DOWNGRADEABLE_START(*loc))
                                 {
-                                   /* Can't mix above and below 256 under
-                                       * LOC */
+                                    /* Can't mix above and below 256 under LOC
+                                     */
                                     if (LOC) {
                                         goto end_multi_fold;
                                     }
@@ -10551,13 +10557,13 @@ parseit:
                 }
                 else {
                     /* Single character fold.  Add everything in its fold
-                       * closure to the list that this node should match */
+                    * closure to the list that this node should match */
                     SV** listp;
  
-                   /* The fold closures data structure is a hash with the
-                       * keys being every character that is folded to, like
-                       * 'k', and the values each an array of everything that
-                       * folds to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
+                   /* The fold closures data structure is a hash with the keys
+                    * being every character that is folded to, like 'k', and
+                    * the values each an array of everything that folds to its
+                    * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
                     if ((listp = hv_fetch(PL_utf8_foldclosures,
                                     (char *) foldbuf, foldlen, FALSE)))
                     {
@@ -10571,9 +10577,9 @@ parseit:
                             }
                             c = SvUV(*c_p);
  
-                           /* /aa doesn't allow folds between ASCII and
-                               * non-; /l doesn't allow them between above
-                               * and below 256 */
+                           /* /aa doesn't allow folds between ASCII and non-;
+                            * /l doesn't allow them between above and below
+                            * 256 */
                             if ((MORE_ASCII_RESTRICTED
                                  && (isASCII(c) != isASCII(j)))
                                     || (LOC && ((c < 256) != (j < 256))))
@@ -10587,9 +10593,9 @@ parseit:
                                         (U8) c,
                                         &l1_fold_invlist, &unicode_alternate);
                             }
-                               /* It may be that the code point is already
-                                   * in this range or already in the bitmap,
-                                   * in which case we need do nothing */
+                               /* It may be that the code point is already in
+                                * this range or already in the bitmap, in
+                                * which case we need do nothing */
                             else if ((c < start || c > end)
                                         && (c > 255
                                             || ! ANYOF_BITMAP_TEST(ret, c)))
@@ -10616,21 +10622,25 @@ parseit:
      }
  
      /* Here, we have calculated what code points should be in the character
-     * class.   Now we can see about various optimizations.  Fold calculation
-     * needs to take place before inversion.  Otherwise /[^k]/i would invert to
-     * include K, which under /i would match k. */
+     * class.
+     *
+     * Now we can see about various optimizations.  Fold calculation (which we
+     * did above) needs to take place before inversion.  Otherwise /[^k]/i
+     * would invert to include K, which under /i would match k, which it
+     * shouldn't. */
  
      /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
-     * set the FOLD flag yet, so this this does optimize those.  It doesn't
+     * set the FOLD flag yet, so this does optimize those.  It doesn't
       * optimize locale.  Doing so perhaps could be done as long as there is
       * nothing like \w in it; some thought also would have to be given to the
       * interaction with above 0x100 chars */
-    if (! LOC
-       && (ANYOF_FLAGS(ret) & ANYOF_INVERT)
+    if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
+        && ! LOC
         && ! unicode_alternate
         /* In case of /d, there are some things that should match only when in
          * not in the bitmap, i.e., they require UTF8 to match.  These are
-        * listed in nonbitmap. */
+        * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
+        * case, they don't require UTF8, so can invert here */
         && (! nonbitmap
             || ! DEPENDS_SEMANTICS
             || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
@@ -10657,6 +10667,8 @@ parseit:
                     ANYOF_BITMAP_SET(ret, value);
                 }
             }
+
+           /* And do the removal */
             _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
             SvREFCNT_dec(remove_list);
         }
@@ -10794,11 +10806,11 @@ parseit:
         /* The 0th element stores the character class description
          * in its textual form: used later (regexec.c:Perl_regclass_swash())
          * to initialize the appropriate swash (which gets stored in
-        * the 1st element), and also useful for dumping the regnode.
-        * The 2nd element stores the multicharacter foldings,
+        * element [1]), and also useful for dumping the regnode.
+        * Element [2] stores the multicharacter foldings,
          * used later (regexec.c:S_reginclass()). */
         av_store(av, 0, listsv);
-       av_store(av, 1, NULL);
+       av_store(av, 1, NULL);  /* Placeholder for generated swash */
  
          /* Store any computed multi-char folds only if we are allowing
           * them */
@@ -11610,14 +11622,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
             sv_catpvs(sv, "{outside bitmap}");
  
         if (ANYOF_NONBITMAP(o)) {
-           SV *lv;
+           SV *lv; /* Set if there is something outside the bit map */
             SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
         
             if (lv) {
                 if (sw) {
                     U8 s[UTF8_MAXBYTES_CASE+1];
  
-                   for (i = 0; i <= 256; i++) { /* just the first 256 */
+                   for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
                         uvchr_to_utf8(s, i);
                         
                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
diff --git a/regcomp.h b/regcomp.h

index 81c8a5d..0540c63 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -311,6 +311,8 @@ struct regnode_charclass_class {
   * are done to share them, as described below.  If necessary, the ANYOF_LOCALE
   * and ANYOF_CLASS bits could be shared with a space penalty for locale nodes,
   * but this isn't quite so easy, as the optimizer also uses ANYOF_CLASS.
+ * Another option would be to push them into new nodes.  E.g. there could be an
+ * ANYOF_LOCALE node that would be in place of the flag of the same name.
   * Once the planned change to compile all the above-latin1 code points is done,
   * then the UNICODE_ALL bit can be freed up, with a small performance penalty.
   * If flags need to be added that are applicable to the synthetic start class
diff --git a/utf8.c b/utf8.c

index 23308a3..61c6c23 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2586,6 +2586,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
  
      PERL_ARGS_ASSERT_SWASH_FETCH;
  
+    /* Convert to utf8 if not already */
      if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
author	Karl Williamson <public@khwilliamson.com>
	Fri, 18 Nov 2011 15:36:43 +0000 (08:36 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Fri, 13 Jan 2012 16:58:32 +0000 (09:58 -0700)
lib/unicore/mktables		patch \| blob \| history
lib/utf8_heavy.pl		patch \| blob \| history
regcomp.c		patch \| blob \| history
regcomp.h		patch \| blob \| history
utf8.c		patch \| blob \| history