From eebe148573e5a07582cda46391148ca89c563ade Mon Sep 17 00:00:00 2001
From: Simon Cozens <simon@netthink.co.uk>
Date: Tue, 16 Jan 2001 13:42:30 +0000
Subject: [PATCH] Re: API Cleanup To: perl5-porters@perl.org Date: Tue, 16 Jan
 2001 13:42:30 +0000 Message-ID: <20010116134230.A13420@pembro26.pmb.ox.ac.uk>

Subject: [PATCH] utf8.c documentation
Date: Tue, 16 Jan 2001 13:52:48 +0000
Message-ID: <20010116135248.A13496@pembro26.pmb.ox.ac.uk>

Subject: Re: API Cleanup
From: Simon Cozens <simon@cozens.net>
Date: Tue, 16 Jan 2001 14:58:55 +0000
Message-ID: <20010116145855.A13794@pembro26.pmb.ox.ac.uk>

UTF-8 doc patches.

p4raw-id: //depot/perl@8452
---
 embed.pl        | 20 +++++++--------
 pod/perlapi.pod | 77 +++++++++++++++++++++++++++++++++++++++++----------------
 utf8.c          | 47 ++++++++++++++++++++++++++---------
 3 files changed, 101 insertions(+), 43 deletions(-)
diff --git a/embed.pl b/embed.pl
index 7621f66..371ba58 100755
--- a/embed.pl
+++ b/embed.pl
@@ -1628,8 +1628,8 @@ Ap	|bool	|is_uni_xdigit_lc|U32 c
 Ap	|U32	|to_uni_upper_lc|U32 c
 Ap	|U32	|to_uni_title_lc|U32 c
 Ap	|U32	|to_uni_lower_lc|U32 c
-Ap	|STRLEN	|is_utf8_char	|U8 *p
-Ap	|bool	|is_utf8_string	|U8 *s|STRLEN len
+Apd	|STRLEN	|is_utf8_char	|U8 *p
+Apd	|bool	|is_utf8_string	|U8 *s|STRLEN len
 Ap	|bool	|is_utf8_alnum	|U8 *p
 Ap	|bool	|is_utf8_alnumc	|U8 *p
 Ap	|bool	|is_utf8_idfirst|U8 *p
@@ -2077,14 +2077,14 @@ p	|void	|unshare_hek	|HEK* hek
 p	|void	|utilize	|int aver|I32 floor|OP* version|OP* id|OP* arg
 Ap	|U8*	|utf16_to_utf8	|U8* p|U8 *d|I32 bytelen|I32 *newlen
 Ap	|U8*	|utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen
-Ap	|STRLEN	|utf8_length	|U8* s|U8 *e
-Ap	|IV	|utf8_distance	|U8 *a|U8 *b
-Ap	|U8*	|utf8_hop	|U8 *s|I32 off
-ApM	|U8*	|utf8_to_bytes	|U8 *s|STRLEN *len
-ApM	|U8*	|bytes_to_utf8	|U8 *s|STRLEN *len
-Ap	|UV	|utf8_to_uv_simple|U8 *s|STRLEN* retlen
-Ap	|UV	|utf8_to_uv	|U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
-Ap	|U8*	|uv_to_utf8	|U8 *d|UV uv
+Adp	|STRLEN	|utf8_length	|U8* s|U8 *e
+Apd	|IV	|utf8_distance	|U8 *a|U8 *b
+Apd	|U8*	|utf8_hop	|U8 *s|I32 off
+ApMd	|U8*	|utf8_to_bytes	|U8 *s|STRLEN *len
+ApMd	|U8*	|bytes_to_utf8	|U8 *s|STRLEN *len
+Apd	|UV	|utf8_to_uv_simple|U8 *s|STRLEN* retlen
+Adp	|UV	|utf8_to_uv	|U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
+Apd	|U8*	|uv_to_utf8	|U8 *d|UV uv
 p	|void	|vivify_defelem	|SV* sv
 p	|void	|vivify_ref	|SV* sv|U32 to_what
 p	|I32	|wait4pid	|Pid_t pid|int* statusp|int flags
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index e676431..25fe18a 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -188,7 +188,10 @@ Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 Returns a pointer to the newly-created string, and sets C<len> to
 reflect the new length.
 
-	U8 *	bytes_to_utf8(U8 *s, STRLEN *len)
+NOTE: this function is experimental and may change or be
+removed without notice.
+
+	U8*	bytes_to_utf8(U8 *s, STRLEN *len)
 
 =for hackers
 Found in file utf8.c
@@ -1013,6 +1016,27 @@ character.
 =for hackers
 Found in file handy.h
 
+=item is_utf8_char
+
+Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
+The actual number of bytes in the UTF-8 character will be returned if it
+is valid, otherwise 0. 
+ 
+	STRLEN	is_utf8_char(U8 *p)
+
+=for hackers
+Found in file utf8.c
+
+=item is_utf8_string
+
+Returns true if the first C<len> bytes of the given string form a valid UTF8
+string, false otherwise.
+
+	bool	is_utf8_string(U8 *s, STRLEN len)
+
+=for hackers
+Found in file utf8.c
+
 =item items
 
 Variable which is setup by C<xsubpp> to indicate the number of 
@@ -2396,19 +2420,19 @@ false, defined or undefined.  Does not handle 'get' magic.
 =for hackers
 Found in file sv.h
 
-=item SvTYPE
-
-Returns the type of the SV.  See C<svtype>.
+=item svtype
 
-	svtype	SvTYPE(SV* sv)
+An enum of flags for Perl types.  These are found in the file B<sv.h> 
+in the C<svtype> enum.  Test these flags with the C<SvTYPE> macro.
 
 =for hackers
 Found in file sv.h
 
-=item svtype
+=item SvTYPE
 
-An enum of flags for Perl types.  These are found in the file B<sv.h> 
-in the C<svtype> enum.  Test these flags with the C<SvTYPE> macro.
+Returns the type of the SV.  See C<svtype>.
+
+	svtype	SvTYPE(SV* sv)
 
 =for hackers
 Found in file sv.h
@@ -3247,16 +3271,6 @@ Converts the specified character to uppercase.
 =for hackers
 Found in file handy.h
 
-=item U8 *s
-
-Returns true if first C<len> bytes of the given string form valid a UTF8
-string, false otherwise.
-
-	is_utf8_string	U8 *s(STRLEN len)
-
-=for hackers
-Found in file utf8.c
-
 =item utf8_distance
 
 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
@@ -3302,7 +3316,10 @@ Unlike C<bytes_to_utf8>, this over-writes the original string, and
 updates len to contain the new length.
 Returns zero on failure, setting C<len> to -1.
 
-	U8 *	utf8_to_bytes(U8 *s, STRLEN *len)
+NOTE: this function is experimental and may change or be
+removed without notice.
+
+	U8*	utf8_to_bytes(U8 *s, STRLEN *len)
 
 =for hackers
 Found in file utf8.c
@@ -3324,7 +3341,7 @@ length of the UTF-8 character in bytes, and zero will be returned.
 The C<flags> can also contain various flags to allow deviations from
 the strict UTF-8 encoding (see F<utf8.h>).
 
-	U8* s	utf8_to_uv(STRLEN curlen, STRLEN *retlen, U32 flags)
+	UV	utf8_to_uv(U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags)
 
 =for hackers
 Found in file utf8.c
@@ -3338,7 +3355,25 @@ length, in bytes, of that character.
 If C<s> does not point to a well-formed UTF8 character, zero is
 returned and retlen is set, if possible, to -1.
 
-	U8* s	utf8_to_uv_simple(STRLEN *retlen)
+	UV	utf8_to_uv_simple(U8 *s, STRLEN* retlen)
+
+=for hackers
+Found in file utf8.c
+
+=item uv_to_utf8
+
+Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
+of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words, 
+
+    d = uv_to_utf8(d, uv);
+
+is the recommended Unicode-aware way of saying
+
+    *(d++) = uv;
+
+	U8*	uv_to_utf8(U8 *d, UV uv)
 
 =for hackers
 Found in file utf8.c
diff --git a/utf8.c b/utf8.c
index 65f1096..156e63f 100644
--- a/utf8.c
+++ b/utf8.c
@@ -26,8 +26,25 @@
 
 /* Unicode support */
 
+/*
+=for apidoc A|U8*|uv_to_utf8|U8 *d|UV uv
+
+Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
+of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words, 
+
+    d = uv_to_utf8(d, uv);
+
+is the recommended Unicode-aware way of saying
+
+    *(d++) = uv;
+
+=cut
+*/
+
 U8 *
-Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) /* the d must be UTF8_MAXLEN+1 deep */
+Perl_uv_to_utf8(pTHX_ U8 *d, UV uv)
 {
     if (uv < 0x80) {
 	*d++ = uv;
@@ -101,9 +118,15 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) /* the d must be UTF8_MAXLEN+1 deep */
 #endif
 }
 
-/* Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
- * The actual number of bytes in the UTF-8 character will be returned if it
- * is valid, otherwise 0. */
+/*
+=for apidoc A|STRLEN|is_utf8_char|U8 *s
+
+Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
+The actual number of bytes in the UTF-8 character will be returned if it
+is valid, otherwise 0. 
+ 
+=cut
+*/
 STRLEN
 Perl_is_utf8_char(pTHX_ U8 *s)
 {
@@ -143,7 +166,7 @@ Perl_is_utf8_char(pTHX_ U8 *s)
 }
 
 /*
-=for apidoc Am|is_utf8_string|U8 *s|STRLEN len
+=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 
 Returns true if first C<len> bytes of the given string form valid a UTF8
 string, false otherwise.
@@ -175,7 +198,7 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 }
 
 /*
-=for apidoc Am|U8* s|utf8_to_uv|STRLEN curlen|STRLEN *retlen|U32 flags
+=for apidoc A|UV|utf8_to_uv|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 
 Returns the character value of the first character in the string C<s>
 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
@@ -390,7 +413,7 @@ malformed:
 }
 
 /*
-=for apidoc Am|U8* s|utf8_to_uv_simple|STRLEN *retlen
+=for apidoc A|UV|utf8_to_uv_simple|U8 *s|STRLEN *retlen
 
 Returns the character value of the first character in the string C<s>
 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
@@ -409,7 +432,7 @@ Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen)
 }
 
 /*
-=for apidoc Am|STRLEN|utf8_length|U8* s|U8 *e
+=for apidoc A|STRLEN|utf8_length|U8* s|U8 *e
 
 Return the length of the UTF-8 char encoded string C<s> in characters.
 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
@@ -442,7 +465,7 @@ Perl_utf8_length(pTHX_ U8* s, U8* e)
 }
 
 /*
-=for apidoc Am|IV|utf8_distance|U8 *a|U8 *b
+=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 
 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 and C<b>.
@@ -486,7 +509,7 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 }
 
 /*
-=for apidoc Am|U8*|utf8_hop|U8 *s|I32 off
+=for apidoc A|U8*|utf8_hop|U8 *s|I32 off
 
 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 forward or backward.
@@ -519,7 +542,7 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 }
 
 /*
-=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
+=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 
 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 Unlike C<bytes_to_utf8>, this over-writes the original string, and
@@ -560,7 +583,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
 }
 
 /*
-=for apidoc Am|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
+=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 
 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 Returns a pointer to the newly-created string, and sets C<len> to
-- 
2.7.4