From eebe148573e5a07582cda46391148ca89c563ade Mon Sep 17 00:00:00 2001 From: Simon Cozens Date: Tue, 16 Jan 2001 13:42:30 +0000 Subject: [PATCH] Re: API Cleanup To: perl5-porters@perl.org Date: Tue, 16 Jan 2001 13:42:30 +0000 Message-ID: <20010116134230.A13420@pembro26.pmb.ox.ac.uk> Subject: [PATCH] utf8.c documentation Date: Tue, 16 Jan 2001 13:52:48 +0000 Message-ID: <20010116135248.A13496@pembro26.pmb.ox.ac.uk> Subject: Re: API Cleanup From: Simon Cozens Date: Tue, 16 Jan 2001 14:58:55 +0000 Message-ID: <20010116145855.A13794@pembro26.pmb.ox.ac.uk> UTF-8 doc patches. p4raw-id: //depot/perl@8452 --- embed.pl | 20 +++++++-------- pod/perlapi.pod | 77 +++++++++++++++++++++++++++++++++++++++++---------------- utf8.c | 47 ++++++++++++++++++++++++++--------- 3 files changed, 101 insertions(+), 43 deletions(-) diff --git a/embed.pl b/embed.pl index 7621f66..371ba58 100755 --- a/embed.pl +++ b/embed.pl @@ -1628,8 +1628,8 @@ Ap |bool |is_uni_xdigit_lc|U32 c Ap |U32 |to_uni_upper_lc|U32 c Ap |U32 |to_uni_title_lc|U32 c Ap |U32 |to_uni_lower_lc|U32 c -Ap |STRLEN |is_utf8_char |U8 *p -Ap |bool |is_utf8_string |U8 *s|STRLEN len +Apd |STRLEN |is_utf8_char |U8 *p +Apd |bool |is_utf8_string |U8 *s|STRLEN len Ap |bool |is_utf8_alnum |U8 *p Ap |bool |is_utf8_alnumc |U8 *p Ap |bool |is_utf8_idfirst|U8 *p @@ -2077,14 +2077,14 @@ p |void |unshare_hek |HEK* hek p |void |utilize |int aver|I32 floor|OP* version|OP* id|OP* arg Ap |U8* |utf16_to_utf8 |U8* p|U8 *d|I32 bytelen|I32 *newlen Ap |U8* |utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen -Ap |STRLEN |utf8_length |U8* s|U8 *e -Ap |IV |utf8_distance |U8 *a|U8 *b -Ap |U8* |utf8_hop |U8 *s|I32 off -ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len -ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len -Ap |UV |utf8_to_uv_simple|U8 *s|STRLEN* retlen -Ap |UV |utf8_to_uv |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags -Ap |U8* |uv_to_utf8 |U8 *d|UV uv +Adp |STRLEN |utf8_length |U8* s|U8 *e +Apd |IV |utf8_distance |U8 *a|U8 *b +Apd |U8* |utf8_hop 
|U8 *s|I32 off +ApMd |U8* |utf8_to_bytes |U8 *s|STRLEN *len +ApMd |U8* |bytes_to_utf8 |U8 *s|STRLEN *len +Apd |UV |utf8_to_uv_simple|U8 *s|STRLEN* retlen +Adp |UV |utf8_to_uv |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags +Apd |U8* |uv_to_utf8 |U8 *d|UV uv p |void |vivify_defelem |SV* sv p |void |vivify_ref |SV* sv|U32 to_what p |I32 |wait4pid |Pid_t pid|int* statusp|int flags diff --git a/pod/perlapi.pod b/pod/perlapi.pod index e676431..25fe18a 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -188,7 +188,10 @@ Converts a string C of length C from ASCII into UTF8 encoding. Returns a pointer to the newly-created string, and sets C to reflect the new length. - U8 * bytes_to_utf8(U8 *s, STRLEN *len) +NOTE: this function is experimental and may change or be +removed without notice. + + U8* bytes_to_utf8(U8 *s, STRLEN *len) =for hackers Found in file utf8.c @@ -1013,6 +1016,27 @@ character. =for hackers Found in file handy.h +=item is_utf8_char + +Tests if some arbitrary number of bytes begins in a valid UTF-8 character. +The actual number of bytes in the UTF-8 character will be returned if it +is valid, otherwise 0. + + STRLEN is_utf8_char(U8 *p) + +=for hackers +Found in file utf8.c + +=item is_utf8_string + +Returns true if first C bytes of the given string form valid a UTF8 +string, false otherwise. + + bool is_utf8_string(U8 *s, STRLEN len) + +=for hackers +Found in file utf8.c + =item items Variable which is setup by C to indicate the number of @@ -2396,19 +2420,19 @@ false, defined or undefined. Does not handle 'get' magic. =for hackers Found in file sv.h -=item SvTYPE - -Returns the type of the SV. See C. +=item svtype - svtype SvTYPE(SV* sv) +An enum of flags for Perl types. These are found in the file B +in the C enum. Test these flags with the C macro. =for hackers Found in file sv.h -=item svtype +=item SvTYPE -An enum of flags for Perl types. These are found in the file B -in the C enum. Test these flags with the C macro. +Returns the type of the SV. 
See C. + + svtype SvTYPE(SV* sv) =for hackers Found in file sv.h @@ -3247,16 +3271,6 @@ Converts the specified character to uppercase. =for hackers Found in file handy.h -=item U8 *s - -Returns true if first C bytes of the given string form valid a UTF8 -string, false otherwise. - - is_utf8_string U8 *s(STRLEN len) - -=for hackers -Found in file utf8.c - =item utf8_distance Returns the number of UTF8 characters between the UTF-8 pointers C @@ -3302,7 +3316,10 @@ Unlike C, this over-writes the original string, and updates len to contain the new length. Returns zero on failure, setting C to -1. - U8 * utf8_to_bytes(U8 *s, STRLEN *len) +NOTE: this function is experimental and may change or be +removed without notice. + + U8* utf8_to_bytes(U8 *s, STRLEN *len) =for hackers Found in file utf8.c @@ -3324,7 +3341,7 @@ length of the UTF-8 character in bytes, and zero will be returned. The C can also contain various flags to allow deviations from the strict UTF-8 encoding (see F). - U8* s utf8_to_uv(STRLEN curlen, STRLEN *retlen, U32 flags) + UV utf8_to_uv(U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags) =for hackers Found in file utf8.c @@ -3338,7 +3355,25 @@ length, in bytes, of that character. If C does not point to a well-formed UTF8 character, zero is returned and retlen is set, if possible, to -1. - U8* s utf8_to_uv_simple(STRLEN *retlen) + UV utf8_to_uv_simple(U8 *s, STRLEN* retlen) + +=for hackers +Found in file utf8.c + +=item uv_to_utf8 + +Adds the UTF8 representation of the Unicode codepoint C<uv> to the end +of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free +bytes available. The return value is the pointer to the byte after the +end of the new character.
In other words, + + d = uv_to_utf8(d, uv); + +is the recommended Unicode-aware way of saying + + *(d++) = uv; + + U8* uv_to_utf8(U8 *d, UV uv) =for hackers Found in file utf8.c diff --git a/utf8.c b/utf8.c index 65f1096..156e63f 100644 --- a/utf8.c +++ b/utf8.c @@ -26,8 +26,25 @@ /* Unicode support */ +/* +=for apidoc A|U8*|uv_to_utf8|U8 *d|UV uv + +Adds the UTF8 representation of the Unicode codepoint C<uv> to the end +of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free +bytes available. The return value is the pointer to the byte after the +end of the new character. In other words, + + d = uv_to_utf8(d, uv); + +is the recommended Unicode-aware way of saying + + *(d++) = uv; + +=cut +*/ + U8 * -Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) /* the d must be UTF8_MAXLEN+1 deep */ +Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) { if (uv < 0x80) { *d++ = uv; @@ -101,9 +118,15 @@ Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) /* the d must be UTF8_MAXLEN+1 deep */ #endif } -/* Tests if some arbitrary number of bytes begins in a valid UTF-8 character. - * The actual number of bytes in the UTF-8 character will be returned if it - * is valid, otherwise 0. */ +/* +=for apidoc A|STRLEN|is_utf8_char|U8 *s + +Tests if some arbitrary number of bytes begins in a valid UTF-8 character. +The actual number of bytes in the UTF-8 character will be returned if it +is valid, otherwise 0. + +=cut +*/ STRLEN Perl_is_utf8_char(pTHX_ U8 *s) { @@ -143,7 +166,7 @@ Perl_is_utf8_char(pTHX_ U8 *s) } /* -=for apidoc Am|is_utf8_string|U8 *s|STRLEN len +=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len Returns true if first C bytes of the given string form valid a UTF8 string, false otherwise.
@@ -175,7 +198,7 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len) } /* -=for apidoc Am|U8* s|utf8_to_uv|STRLEN curlen|STRLEN *retlen|U32 flags +=for apidoc A|U8* s|utf8_to_uv|STRLEN curlen|STRLEN *retlen|U32 flags Returns the character value of the first character in the string C which is assumed to be in UTF8 encoding and no longer than C; @@ -390,7 +413,7 @@ malformed: } /* -=for apidoc Am|U8* s|utf8_to_uv_simple|STRLEN *retlen +=for apidoc A|U8* s|utf8_to_uv_simple|STRLEN *retlen Returns the character value of the first character in the string C which is assumed to be in UTF8 encoding; C will be set to the @@ -409,7 +432,7 @@ Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen) } /* -=for apidoc Am|STRLEN|utf8_length|U8* s|U8 *e +=for apidoc A|STRLEN|utf8_length|U8* s|U8 *e Return the length of the UTF-8 char encoded string C in characters. Stops at C (inclusive). If C s> or if the scan would end @@ -442,7 +465,7 @@ Perl_utf8_length(pTHX_ U8* s, U8* e) } /* -=for apidoc Am|IV|utf8_distance|U8 *a|U8 *b +=for apidoc A|IV|utf8_distance|U8 *a|U8 *b Returns the number of UTF8 characters between the UTF-8 pointers C and C. @@ -486,7 +509,7 @@ Perl_utf8_distance(pTHX_ U8 *a, U8 *b) } /* -=for apidoc Am|U8*|utf8_hop|U8 *s|I32 off +=for apidoc A|U8*|utf8_hop|U8 *s|I32 off Return the UTF-8 pointer C displaced by C characters, either forward or backward. @@ -519,7 +542,7 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off) } /* -=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN *len +=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len Converts a string C of length C from UTF8 into byte encoding. Unlike C, this over-writes the original string, and @@ -560,7 +583,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) } /* -=for apidoc Am|U8 *|bytes_to_utf8|U8 *s|STRLEN *len +=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len Converts a string C of length C from ASCII into UTF8 encoding. Returns a pointer to the newly-created string, and sets C to -- 2.7.4