Add is_utf8_char_buf()

author Karl Williamson <public@khwilliamson.com>

Sat, 11 Feb 2012 21:04:39 +0000 (14:04 -0700)

committer Karl Williamson <public@khwilliamson.com>

Sat, 11 Feb 2012 21:35:46 +0000 (14:35 -0700)
author Karl Williamson <public@khwilliamson.com>
Sat, 11 Feb 2012 21:04:39 +0000 (14:04 -0700)
committer Karl Williamson <public@khwilliamson.com>
Sat, 11 Feb 2012 21:35:46 +0000 (14:35 -0700)
diff --git a/embed.fnc b/embed.fnc

index bb40393..892a719 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -627,6 +627,7 @@ ApPR        |bool   |is_uni_punct_lc|UV c
  ApPR   |bool   |is_uni_xdigit_lc|UV c
  Anpd   |bool   |is_ascii_string|NN const U8 *s|STRLEN len
  Anpd   |STRLEN |is_utf8_char   |NN const U8 *s
+Anpd   |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
  Anpd   |bool   |is_utf8_string |NN const U8 *s|STRLEN len
  Anpdmb |bool   |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p
  Anpd   |bool   |is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
diff --git a/embed.h b/embed.h

index 2f9d47d..d429c8d 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -250,6 +250,7 @@
  #define is_utf8_alpha(a)       Perl_is_utf8_alpha(aTHX_ a)
  #define is_utf8_ascii(a)       Perl_is_utf8_ascii(aTHX_ a)
  #define is_utf8_char           Perl_is_utf8_char
+#define is_utf8_char_buf       Perl_is_utf8_char_buf
  #define is_utf8_cntrl(a)       Perl_is_utf8_cntrl(aTHX_ a)
  #define is_utf8_digit(a)       Perl_is_utf8_digit(aTHX_ a)
  #define is_utf8_graph(a)       Perl_is_utf8_graph(aTHX_ a)
diff --git a/proto.h b/proto.h

index d2022d5..dde1a43 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -1823,6 +1823,12 @@ PERL_CALLCONV STRLEN     Perl_is_utf8_char(const U8 *s)
  #define PERL_ARGS_ASSERT_IS_UTF8_CHAR  \
         assert(s)
  
+PERL_CALLCONV STRLEN   Perl_is_utf8_char_buf(const U8 *buf, const U8 *buf_end)
+                       __attribute__nonnull__(1)
+                       __attribute__nonnull__(2);
+#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF      \
+       assert(buf); assert(buf_end)
+
  PERL_CALLCONV bool     Perl_is_utf8_cntrl(pTHX_ const U8 *p)
                         __attribute__warn_unused_result__
                         __attribute__nonnull__(pTHX_1);
diff --git a/utf8.c b/utf8.c

index bfcc40c..2e0429e 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -316,6 +316,43 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len)
  }
  
  /*
+=for apidoc is_utf8_char_buf
+
+Returns the number of bytes that comprise the first UTF-8 encoded character in
+buffer C<buf>.  C<buf_end> should point to one position beyond the end of the
+buffer.  0 is returned if C<buf> does not point to a complete, valid UTF-8
+encoded character.
+
+Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut */
+
+STRLEN
+Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
+{
+
+    STRLEN len;
+
+    PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
+
+    if (buf_end <= buf) {
+       return 0;
+    }
+
+    len = buf_end - buf;
+    if (len > UTF8SKIP(buf)) {
+       len = UTF8SKIP(buf);
+    }
+
+#ifdef IS_UTF8_CHAR
+    if (IS_UTF8_CHAR_FAST(len))
+        return IS_UTF8_CHAR(buf, len) ? len : 0;
+#endif /* #ifdef IS_UTF8_CHAR */
+    return is_utf8_char_slow(buf, len);
+}
+
+/*
  =for apidoc is_utf8_char
  
  Tests if some arbitrary number of bytes begins in a valid UTF-8
@@ -330,14 +367,10 @@ UTF8SKIP(s) bytes.
  STRLEN
  Perl_is_utf8_char(const U8 *s)
  {
-    const STRLEN len = UTF8SKIP(s);
-
      PERL_ARGS_ASSERT_IS_UTF8_CHAR;
-#ifdef IS_UTF8_CHAR
-    if (IS_UTF8_CHAR_FAST(len))
-        return IS_UTF8_CHAR(s, len) ? len : 0;
-#endif /* #ifdef IS_UTF8_CHAR */
-    return is_utf8_char_slow(s, len);
+
+    /* Assumes we have enough space */
+    return is_utf8_char_buf(s, s + UTF8SKIP(s));
  }
  
  
@@ -1645,7 +1678,12 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
  
      PERL_ARGS_ASSERT_IS_UTF8_COMMON;
  
-    if (!is_utf8_char(p))
+    /* The API should have included a length for the UTF-8 character in <p>,
+     * but it doesn't.  We therefor assume that p has been validated at least
+     * as far as there being enough bytes available in it to accommodate the
+     * character without reading beyond the end, and pass that number on to the
+     * validating routine */
+    if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
         return FALSE;
      if (!*swash)
         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
author	Karl Williamson <public@khwilliamson.com>
	Sat, 11 Feb 2012 21:04:39 +0000 (14:04 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Sat, 11 Feb 2012 21:35:46 +0000 (14:35 -0700)
embed.fnc		patch \| blob \| history
embed.h		patch \| blob \| history
proto.h		patch \| blob \| history
utf8.c		patch \| blob \| history