utf8.c: Add utf8_to_uvchr_buf() and utf8_to_uvuni_buf()

author Karl Williamson <public@khwilliamson.com>

Mon, 19 Mar 2012 21:03:01 +0000 (15:03 -0600)

committer Karl Williamson <public@khwilliamson.com>

Tue, 20 Mar 2012 00:23:44 +0000 (18:23 -0600)
author Karl Williamson <public@khwilliamson.com>
Mon, 19 Mar 2012 21:03:01 +0000 (15:03 -0600)
committer Karl Williamson <public@khwilliamson.com>
Tue, 20 Mar 2012 00:23:44 +0000 (18:23 -0600)
diff --git a/embed.fnc b/embed.fnc

index c549dc9..5a49690 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1450,6 +1450,8 @@ ApMd      |U8*    |bytes_from_utf8|NN const U8 *s|NN STRLEN *len|NULLOK bool *is_utf8
  ApMd   |U8*    |bytes_to_utf8  |NN const U8 *s|NN STRLEN *len
  Apd    |UV     |utf8_to_uvchr  |NN const U8 *s|NULLOK STRLEN *retlen
  Apd    |UV     |utf8_to_uvuni  |NN const U8 *s|NULLOK STRLEN *retlen
+Apd    |UV     |utf8_to_uvchr_buf      |NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retlen
+Apd    |UV     |utf8_to_uvuni_buf      |NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retlen
  pM     |bool   |check_utf8_print       |NN const U8 *s|const STRLEN len
  
  #ifdef EBCDIC
diff --git a/embed.h b/embed.h

index 1d1e598..8a39047 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -672,7 +672,9 @@
  #define utf8_length(a,b)       Perl_utf8_length(aTHX_ a,b)
  #define utf8_to_bytes(a,b)     Perl_utf8_to_bytes(aTHX_ a,b)
  #define utf8_to_uvchr(a,b)     Perl_utf8_to_uvchr(aTHX_ a,b)
+#define utf8_to_uvchr_buf(a,b,c)       Perl_utf8_to_uvchr_buf(aTHX_ a,b,c)
  #define utf8_to_uvuni(a,b)     Perl_utf8_to_uvuni(aTHX_ a,b)
+#define utf8_to_uvuni_buf(a,b,c)       Perl_utf8_to_uvuni_buf(aTHX_ a,b,c)
  #define utf8n_to_uvuni(a,b,c,d)        Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
  #define uvchr_to_utf8_flags(a,b,c)     Perl_uvchr_to_utf8_flags(aTHX_ a,b,c)
  #define uvuni_to_utf8_flags(a,b,c)     Perl_uvuni_to_utf8_flags(aTHX_ a,b,c)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index c6fe15f..8b04237 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -104,7 +104,11 @@ The code has been refactored to reduce duplication.
  
  =item *
  
-XXX
+Two new functions C<utf8_to_uvchr_buf()> and C<utf8_to_uvuni_buf()> have
+been added.  These are the same as C<utf8_to_uvchr> and
+C<utf8_to_uvuni>, but take an extra parameter that is used to guard
+against reading beyond the end of the input string.
+See L<perlapi/utf8_to_uvchr_buf> and L<perlapi/utf8_to_uvuni_buf>.
  
  =back
  
diff --git a/proto.h b/proto.h

index b811e6b..9c91855 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -4568,11 +4568,23 @@ PERL_CALLCONV UV        Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
  #define PERL_ARGS_ASSERT_UTF8_TO_UVCHR \
         assert(s)
  
+PERL_CALLCONV UV       Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+                       __attribute__nonnull__(pTHX_1)
+                       __attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF     \
+       assert(s); assert(send)
+
  PERL_CALLCONV UV       Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
                         __attribute__nonnull__(pTHX_1);
  #define PERL_ARGS_ASSERT_UTF8_TO_UVUNI \
         assert(s)
  
+PERL_CALLCONV UV       Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+                       __attribute__nonnull__(pTHX_1)
+                       __attribute__nonnull__(pTHX_2);
+#define PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF     \
+       assert(s); assert(send)
+
  PERL_CALLCONV UV       Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
                         __attribute__nonnull__(pTHX_1);
  #define PERL_ARGS_ASSERT_UTF8N_TO_UVUNI        \
diff --git a/utf8.c b/utf8.c

index 0aede4c..1faa96d 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -563,7 +563,7 @@ All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
  warn.
  
-Most code should use L</utf8_to_uvchr>() rather than call this directly.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
  
  =cut
  */
@@ -795,6 +795,31 @@ malformed:
  }
  
  /*
+=for apidoc utf8_to_uvchr_buf
+
+Returns the native code point of the first character in the string C<s> which
+is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
+C<retlen> will be set to the length, in bytes, of that character.
+
+If C<s> does not point to a well-formed UTF-8 character, zero is
+returned and C<retlen> is set, if possible, to -1.
+
+=cut
+*/
+
+
+UV
+Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
+    /* Precondition: the buffer is non-empty (send lies strictly after s). */
+    assert(s < send);
+    /* Length passed is send - s; flags are 0 if ckWARN_d(WARN_UTF8), else UTF8_ALLOW_ANY */
+    return utf8n_to_uvchr(s, send - s, retlen,
+                         ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+}
+
+/*
  =for apidoc utf8_to_uvchr
  
  Returns the native code point of the first character in the string C<s>
@@ -817,6 +842,34 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
  }
  
  /*
+=for apidoc utf8_to_uvuni_buf
+
+Returns the Unicode code point of the first character in the string C<s> which
+is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
+C<retlen> will be set to the length, in bytes, of that character.
+
+This function should only be used when the returned UV is considered
+an index into the Unicode semantic tables (e.g. swashes).
+
+If C<s> does not point to a well-formed UTF-8 character, zero is
+returned and C<retlen> is set, if possible, to -1.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
+    /* Precondition: the buffer is non-empty (send lies strictly after s). */
+    assert(send > s);
+
+    /* Low-level call with length send - s; flags 0 (checks on) if ckWARN_d(WARN_UTF8) */
+    return Perl_utf8n_to_uvuni(aTHX_ s, send -s, retlen,
+                              ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+}
+
+/*
  =for apidoc utf8_to_uvuni
  
  Returns the Unicode code point of the first character in the string C<s>
author	Karl Williamson <public@khwilliamson.com>
	Mon, 19 Mar 2012 21:03:01 +0000 (15:03 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Tue, 20 Mar 2012 00:23:44 +0000 (18:23 -0600)
embed.fnc		patch \| blob \| history
embed.h		patch \| blob \| history
pod/perldelta.pod		patch \| blob \| history
proto.h		patch \| blob \| history
utf8.c		patch \| blob \| history