From: Jarkko Hietaniemi Date: Fri, 12 Sep 2003 17:59:25 +0000 (+0000) Subject: It's UTF-8, not UTF8. (Note: not s/UTF-8/UTF8/, X-Git-Tag: accepted/trunk/20130322.191538~23008 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1e54db1a8aea187ba2e790aca2ab81fab24ff92d;p=platform%2Fupstream%2Fperl.git It's UTF-8, not UTF8. (Note: not s/UTF-8/UTF8/, since that would break a lot of code.) Also few stray UTF16s, UTF32s, and "encoded in Unicode". p4raw-id: //depot/perl@21198 --- diff --git a/doop.c b/doop.c index 89fe461..5a41f6a 100644 --- a/doop.c +++ b/doop.c @@ -42,7 +42,7 @@ S_do_trans_simple(pTHX_ SV *sv) s = (U8*)SvPV(sv, len); send = s + len; - /* First, take care of non-UTF8 input strings, because they're easy */ + /* First, take care of non-UTF-8 input strings, because they're easy */ if (!SvUTF8(sv)) { while (s < send) { if ((ch = tbl[*s]) >= 0) { diff --git a/hv.h b/hv.h index e414979..4fa4239 100644 --- a/hv.h +++ b/hv.h @@ -25,8 +25,8 @@ struct hek { I32 hek_len; /* length of hash key */ char hek_key[1]; /* variable-length hash key */ /* the hash-key is \0-terminated */ - /* after the \0 there is a byte for flags, such as whether the key is - UTF8 */ + /* after the \0 there is a byte for flags, such as whether the key + is UTF-8 */ }; /* hash structure: */ diff --git a/pod/perl570delta.pod b/pod/perl570delta.pod index 70425ef..20abcd6 100644 --- a/pod/perl570delta.pod +++ b/pod/perl570delta.pod @@ -127,7 +127,7 @@ is a NaN. Previously the behaviour was unspecified. =item * -C can now be used to force a string to UTF8. +C can now be used to force a string to UTF-8. =item * @@ -508,7 +508,7 @@ C, C, and C now match titlecase. Concatenation with the C<.> operator or via variable interpolation, C, C, C, C, the C operator, -substitution with C, single-quoted UTF8, should now work--in +substitution with C, single-quoted UTF-8, should now work--in theory. 
=item * diff --git a/pod/perl58delta.pod b/pod/perl58delta.pod index 53aa217..a29f04b 100644 --- a/pod/perl58delta.pod +++ b/pod/perl58delta.pod @@ -664,7 +664,7 @@ The template letters are C, C, C, and C. =item * -C can now be used to force a string to UTF8. +C can now be used to force a string to UTF-8. =item * @@ -2475,7 +2475,7 @@ C, C, and C now match titlecase. Concatenation with the C<.> operator or via variable interpolation, C, C, C, C, the C operator, -substitution with C, single-quoted UTF8, should now work. +substitution with C, single-quoted UTF-8, should now work. =item * diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 3d67204..07c47f9 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -3065,7 +3065,7 @@ Found in file sv.h =item SvPOK_only Tells an SV that it is a string and disables all other OK bits. -Will also turn off the UTF8 status. +Will also turn off the UTF-8 status. void SvPOK_only(SV* sv) @@ -3075,7 +3075,7 @@ Found in file sv.h =item SvPOK_only_UTF8 Tells an SV that it is a string and disables all other OK bits, -and leaves the UTF8 status as it was. +and leaves the UTF-8 status as it was. void SvPOK_only_UTF8(SV* sv) @@ -3418,7 +3418,7 @@ Found in file sv.h =item SvUTF8_off -Unsets the UTF8 status of an SV. +Unsets the UTF-8 status of an SV. void SvUTF8_off(SV *sv) @@ -3427,7 +3427,7 @@ Found in file sv.h =item SvUTF8_on -Turn on the UTF8 status of an SV (the data is not changed, just the flag). +Turn on the UTF-8 status of an SV (the data is not changed, just the flag). Do not use frivolously. void SvUTF8_on(SV *sv) @@ -3540,7 +3540,7 @@ Found in file sv.c =item sv_2pvbyte Return a pointer to the byte-encoded representation of the SV, and set *lp -to its length. May cause the SV to be downgraded from UTF8 as a +to its length. May cause the SV to be downgraded from UTF-8 as a side-effect. Usually accessed via the C macro. 
@@ -3553,7 +3553,7 @@ Found in file sv.c =item sv_2pvbyte_nolen Return a pointer to the byte-encoded representation of the SV. -May cause the SV to be downgraded from UTF8 as a side-effect. +May cause the SV to be downgraded from UTF-8 as a side-effect. Usually accessed via the C macro. @@ -3564,8 +3564,8 @@ Found in file sv.c =item sv_2pvutf8 -Return a pointer to the UTF8-encoded representation of the SV, and set *lp -to its length. May cause the SV to be upgraded to UTF8 as a side-effect. +Return a pointer to the UTF-8-encoded representation of the SV, and set *lp +to its length. May cause the SV to be upgraded to UTF-8 as a side-effect. Usually accessed via the C macro. @@ -3576,8 +3576,8 @@ Found in file sv.c =item sv_2pvutf8_nolen -Return a pointer to the UTF8-encoded representation of the SV. -May cause the SV to be upgraded to UTF8 as a side-effect. +Return a pointer to the UTF-8-encoded representation of the SV. +May cause the SV to be upgraded to UTF-8 as a side-effect. Usually accessed via the C macro. @@ -3643,8 +3643,8 @@ Found in file sv.c =item sv_catpv Concatenates the string onto the end of the string which is in the SV. -If the SV has the UTF8 status set, then the bytes appended should be -valid UTF8. Handles 'get' magic, but not 'set' magic. See C. +If the SV has the UTF-8 status set, then the bytes appended should be +valid UTF-8. Handles 'get' magic, but not 'set' magic. See C. void sv_catpv(SV* sv, const char* ptr) @@ -3678,8 +3678,8 @@ Found in file sv.c =item sv_catpvn Concatenates the string onto the end of the string which is in the SV. The -C indicates number of bytes to copy. If the SV has the UTF8 -status set, then the bytes appended should be valid UTF8. +C indicates number of bytes to copy. If the SV has the UTF-8 +status set, then the bytes appended should be valid UTF-8. Handles 'get' magic, but not 'set' magic. See C. 
void sv_catpvn(SV* sv, const char* ptr, STRLEN len) @@ -3690,8 +3690,8 @@ Found in file sv.c =item sv_catpvn_flags Concatenates the string onto the end of the string which is in the SV. The -C indicates number of bytes to copy. If the SV has the UTF8 -status set, then the bytes appended should be valid UTF8. +C indicates number of bytes to copy. If the SV has the UTF-8 +status set, then the bytes appended should be valid UTF-8. If C has C bit set, will C on C if appropriate, else not. C and C are implemented in terms of this function. @@ -3990,7 +3990,7 @@ Found in file sv.c =item sv_len_utf8 Returns the number of characters in the string in an SV, counting wide -UTF8 bytes as a single character. Handles magic and type coercion. +UTF-8 bytes as a single character. Handles magic and type coercion. STRLEN sv_len_utf8(SV* sv) @@ -4075,7 +4075,7 @@ Found in file sv.c =item sv_pos_b2u Converts the value pointed to by offsetp from a count of bytes from the -start of the string, to a count of the equivalent number of UTF8 chars. +start of the string, to a count of the equivalent number of UTF-8 chars. Handles magic and type coercion. void sv_pos_b2u(SV* sv, I32* offsetp) @@ -4085,7 +4085,7 @@ Found in file sv.c =item sv_pos_u2b -Converts the value pointed to by offsetp from a count of UTF8 chars from +Converts the value pointed to by offsetp from a count of UTF-8 chars from the start of the string, to a count of the equivalent number of bytes; if lenp is non-zero, it does the same to lenp, but this time starting from the offset, rather than from the start of the string. Handles magic and @@ -4634,7 +4634,7 @@ Found in file sv.c =item sv_utf8_downgrade -Attempt to convert the PV of an SV from UTF8-encoded to byte encoding. +Attempt to convert the PV of an SV from UTF-8-encoded to byte encoding. This may not be possible if the PV contains non-byte encoding characters; if this is the case, either returns false or, if C is not true, croaks. 
@@ -4652,7 +4652,7 @@ Found in file sv.c =item sv_utf8_encode -Convert the PV of an SV to UTF8-encoded, but then turn off the C +Convert the PV of an SV to UTF-8-encoded, but then turn off the C flag so that it looks like octets again. Used as a building block for encode_utf8 in Encode.xs @@ -4663,7 +4663,7 @@ Found in file sv.c =item sv_utf8_upgrade -Convert the PV of an SV to its UTF8-encoded form. +Convert the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. Always sets the SvUTF8 flag to avoid future validity checks even if all the bytes have hibit clear. @@ -4678,7 +4678,7 @@ Found in file sv.c =item sv_utf8_upgrade_flags -Convert the PV of an SV to its UTF8-encoded form. +Convert the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. Always sets the SvUTF8 flag to avoid future validity checks even if all the bytes have hibit clear. If C has C bit set, @@ -4739,7 +4739,7 @@ Found in file sv.c =item bytes_from_utf8 -Converts a string C of length C from UTF8 into byte encoding. +Converts a string C of length C from UTF-8 into byte encoding. Unlike but like C, returns a pointer to the newly-created string, and updates C to contain the new length. Returns the original string if no conversion occurs, C @@ -4756,11 +4756,11 @@ Found in file utf8.c =item bytes_to_utf8 -Converts a string C of length C from ASCII into UTF8 encoding. +Converts a string C of length C from ASCII into UTF-8 encoding. Returns a pointer to the newly-created string, and sets C to reflect the new length. -If you want to convert to UTF8 from other encodings than ASCII, +If you want to convert to UTF-8 from other encodings than ASCII, see sv_recode_to_utf8(). NOTE: this function is experimental and may change or be @@ -4814,9 +4814,9 @@ Found in file utf8.c =item is_utf8_string Returns true if first C bytes of the given string form a valid -UTF8 string, false otherwise. 
Note that 'a valid UTF8 string' does -not mean 'a string that contains code points above 0x7F encoded in -UTF8' because a valid ASCII string is a valid UTF8 string. +UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does +not mean 'a string that contains code points above 0x7F encoded in UTF-8' +because a valid ASCII string is a valid UTF-8 string. bool is_utf8_string(U8 *s, STRLEN len) @@ -4997,7 +4997,7 @@ Found in file utf8.c =item utf8n_to_uvchr Returns the native character value of the first character in the string C -which is assumed to be in UTF8 encoding; C will be set to the +which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. Allows length and flags to be passed to low level routine. @@ -5011,10 +5011,10 @@ Found in file utf8.c Bottom level UTF-8 decode routine. Returns the unicode code point value of the first character in the string C -which is assumed to be in UTF8 encoding and no longer than C; +which is assumed to be in UTF-8 encoding and no longer than C; C will be set to the length, in bytes, of that character. -If C does not point to a well-formed UTF8 character, the behaviour +If C does not point to a well-formed UTF-8 character, the behaviour is dependent on the value of C: if it contains UTF8_CHECK_ONLY, it is assumed that the caller will raise a warning, and this function will silently just set C to C<-1> and return zero. If the @@ -5034,7 +5034,7 @@ Found in file utf8.c =item utf8_distance -Returns the number of UTF8 characters between the UTF-8 pointers C +Returns the number of UTF-8 characters between the UTF-8 pointers C and C. WARNING: use only if you *know* that the pointers point inside the @@ -5072,7 +5072,7 @@ Found in file utf8.c =item utf8_to_bytes -Converts a string C of length C from UTF8 into byte encoding. +Converts a string C of length C from UTF-8 into byte encoding. Unlike C, this over-writes the original string, and updates len to contain the new length. 
Returns zero on failure, setting C to -1. @@ -5088,10 +5088,10 @@ Found in file utf8.c =item utf8_to_uvchr Returns the native character value of the first character in the string C -which is assumed to be in UTF8 encoding; C will be set to the +which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. -If C does not point to a well-formed UTF8 character, zero is +If C does not point to a well-formed UTF-8 character, zero is returned and retlen is set, if possible, to -1. UV utf8_to_uvchr(U8 *s, STRLEN* retlen) @@ -5102,13 +5102,13 @@ Found in file utf8.c =item utf8_to_uvuni Returns the Unicode code point of the first character in the string C -which is assumed to be in UTF8 encoding; C will be set to the +which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. This function should only be used when returned UV is considered an index into the Unicode semantic tables (e.g. swashes). -If C does not point to a well-formed UTF8 character, zero is +If C does not point to a well-formed UTF-8 character, zero is returned and retlen is set, if possible, to -1. UV utf8_to_uvuni(U8 *s, STRLEN* retlen) @@ -5118,7 +5118,7 @@ Found in file utf8.c =item uvchr_to_utf8 -Adds the UTF8 representation of the Native codepoint C to the end +Adds the UTF-8 representation of the Native codepoint C to the end of the string C; C should be have at least C free bytes available. The return value is the pointer to the byte after the end of the new character. In other words, @@ -5136,7 +5136,7 @@ Found in file utf8.c =item uvuni_to_utf8_flags -Adds the UTF8 representation of the Unicode codepoint C to the end +Adds the UTF-8 representation of the Unicode codepoint C to the end of the string C; C should be have at least C free bytes available. The return value is the pointer to the byte after the end of the new character. 
In other words, diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index 3bc93d9..d7a3fa4 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -728,9 +728,9 @@ On POSIX systems, you can detect this condition this way: Returns the character represented by that NUMBER in the character set. For example, C is C<"A"> in either ASCII or Unicode, and -chr(0x263a) is a Unicode smiley face. Note that characters from 127 -to 255 (inclusive) are by default not encoded in Unicode for backward -compatibility reasons (but see L). +chr(0x263a) is a Unicode smiley face. Note that characters from 128 +to 255 (inclusive) are by default not encoded in UTF-8 Unicode for +backward compatibility reasons (but see L). If NUMBER is omitted, uses C<$_>. @@ -3533,12 +3533,13 @@ equal $foo). =item * -If the pattern begins with a C, the resulting string will be treated -as Unicode-encoded. You can force UTF8 encoding on in a string with an -initial C, and the bytes that follow will be interpreted as Unicode -characters. If you don't want this to happen, you can begin your pattern -with C (or anything else) to force Perl not to UTF8 encode your -string, and then follow this with a C somewhere in your pattern. +If the pattern begins with a C, the resulting string will be +treated as UTF-8-encoded Unicode. You can force UTF-8 encoding on in a +string with an initial C, and the bytes that follow will be +interpreted as Unicode characters. If you don't want this to happen, +you can begin your pattern with C (or anything else) to force Perl +not to UTF-8 encode your string, and then follow this with a C +somewhere in your pattern. =item * @@ -6314,10 +6315,10 @@ extend the string with sufficiently many zero bytes. It is an error to try to write off the beginning of the string (i.e. negative OFFSET). The string should not contain any character with the value > 255 (which -can only happen if you're using UTF8 encoding). If it does, it will be -treated as something which is not UTF8 encoded. 
When the C was +can only happen if you're using UTF-8 encoding). If it does, it will be +treated as something which is not UTF-8 encoded. When the C was assigned to, other parts of your program will also no longer consider the -string to be UTF8 encoded. In other words, if you do have such characters +string to be UTF-8 encoded. In other words, if you do have such characters in your string, vec() will operate on the actual byte string, and not the conceptual character string. diff --git a/pod/perlguts.pod b/pod/perlguts.pod index b763dfe..3d1e5d8 100644 --- a/pod/perlguts.pod +++ b/pod/perlguts.pod @@ -2255,34 +2255,34 @@ to one character. To fix this, some people formed Unicode, Inc. and produced a new character set containing all the characters you can possibly think of and more. There are several ways of representing these -characters, and the one Perl uses is called UTF8. UTF8 uses +characters, and the one Perl uses is called UTF-8. UTF-8 uses a variable number of bytes to represent a character, instead of just one. You can learn more about Unicode at http://www.unicode.org/ -=head2 How can I recognise a UTF8 string? +=head2 How can I recognise a UTF-8 string? -You can't. This is because UTF8 data is stored in bytes just like -non-UTF8 data. The Unicode character 200, (C<0xC8> for you hex types) +You can't. This is because UTF-8 data is stored in bytes just like +non-UTF-8 data. The Unicode character 200, (C<0xC8> for you hex types) capital E with a grave accent, is represented by the two bytes C. Unfortunately, the non-Unicode string C has that byte sequence as well. So you can't tell just by looking - this is what makes Unicode input an interesting problem. The API function C can help; it'll tell you if a string -contains only valid UTF8 characters. However, it can't do the work for +contains only valid UTF-8 characters. However, it can't do the work for you. 
On a character-by-character basis, C will tell you -whether the current character in a string is valid UTF8. +whether the current character in a string is valid UTF-8. -=head2 How does UTF8 represent Unicode characters? +=head2 How does UTF-8 represent Unicode characters? -As mentioned above, UTF8 uses a variable number of bytes to store a +As mentioned above, UTF-8 uses a variable number of bytes to store a character. Characters with values 1...128 are stored in one byte, just like good ol' ASCII. Character 129 is stored as C; this continues up to character 191, which is C. Now we've run out of bits (191 is binary C<10111111>) so we move on; 192 is C. And so it goes on, moving to three bytes at character 2048. -Assuming you know you're dealing with a UTF8 string, you can find out +Assuming you know you're dealing with a UTF-8 string, you can find out how long the first character in it is with the C macro: char *utf = "\305\233\340\240\201"; @@ -2292,12 +2292,12 @@ how long the first character in it is with the C macro: utf += len; len = UTF8SKIP(utf); /* len is 3 here */ -Another way to skip over characters in a UTF8 string is to use +Another way to skip over characters in a UTF-8 string is to use C, which takes a string and a number of characters to skip over. You're on your own about bounds checking, though, so don't use it lightly. 
-All bytes in a multi-byte UTF8 character will have the high bit set, +All bytes in a multi-byte UTF-8 character will have the high bit set, so you can test if you need to do something special with this character like this (the UTF8_IS_INVARIANT() is a macro that tests whether the byte can be encoded as a single byte even in UTF-8): @@ -2306,7 +2306,7 @@ whether the byte can be encoded as a single byte even in UTF-8): UV uv; /* Note: a UV, not a U8, not a char */ if (!UTF8_IS_INVARIANT(*utf)) - /* Must treat this as UTF8 */ + /* Must treat this as UTF-8 */ uv = utf8_to_uv(utf); else /* OK to treat this character as a byte */ @@ -2314,7 +2314,7 @@ whether the byte can be encoded as a single byte even in UTF-8): You can also see in that example that we use C to get the value of the character; the inverse function C is available -for putting a UV into UTF8: +for putting a UV into UTF-8: if (!UTF8_IS_INVARIANT(uv)) /* Must treat this as UTF8 */ @@ -2324,14 +2324,14 @@ for putting a UV into UTF8: *utf8++ = uv; You B convert characters to UVs using the above functions if -you're ever in a situation where you have to match UTF8 and non-UTF8 -characters. You may not skip over UTF8 characters in this case. If you -do this, you'll lose the ability to match hi-bit non-UTF8 characters; -for instance, if your UTF8 string contains C, and you skip -that character, you can never match a C in a non-UTF8 string. +you're ever in a situation where you have to match UTF-8 and non-UTF-8 +characters. You may not skip over UTF-8 characters in this case. If you +do this, you'll lose the ability to match hi-bit non-UTF-8 characters; +for instance, if your UTF-8 string contains C, and you skip +that character, you can never match a C in a non-UTF-8 string. So don't do that! -=head2 How does Perl store UTF8 strings? +=head2 How does Perl store UTF-8 strings? Currently, Perl deals with Unicode strings and non-Unicode strings slightly differently. 
If a string has been identified as being UTF-8 @@ -2348,8 +2348,8 @@ C, C and other string handling operations will have undesirable results. The problem comes when you have, for instance, a string that isn't -flagged is UTF8, and contains a byte sequence that could be UTF8 - -especially when combining non-UTF8 and UTF8 strings. +flagged as UTF-8, and contains a byte sequence that could be UTF-8 - +especially when combining non-UTF-8 and UTF-8 strings. Never forget that the C flag is separate to the PV value; you need be sure you don't accidentally knock it off while you're @@ -2366,7 +2366,7 @@ manipulating SVs. More specifically, you cannot expect to do this: The C string does not tell you the whole story, and you can't copy or reconstruct an SV just by copying the string value. Check if the -old SV has the UTF8 flag set, and act accordingly: +old SV has the UTF-8 flag set, and act accordingly: p = SvPV(sv, len); frobnicate(p); @@ -2375,17 +2375,17 @@ old SV has the UTF8 flag set, and act accordingly: SvUTF8_on(nsv); In fact, your C function should be made aware of whether or -not it's dealing with UTF8 data, so that it can handle the string +not it's dealing with UTF-8 data, so that it can handle the string appropriately. Since just passing an SV to an XS function and copying the data of -the SV is not enough to copy the UTF8 flags, even less right is just +the SV is not enough to copy the UTF-8 flags, even less right is just passing a C to an XS function. -=head2 How do I convert a string to UTF8? +=head2 How do I convert a string to UTF-8? -If you're mixing UTF8 and non-UTF8 strings, you might find it necessary -to upgrade one of the strings to UTF8. +If you're mixing UTF-8 and non-UTF-8 strings, you might find it necessary +to upgrade one of the strings to UTF-8.
If you've got an SV, the easiest way to do this is: sv_utf8_upgrade(sv); @@ -2399,7 +2399,7 @@ If you do this in a binary operator, you will actually change one of the strings that came into the operator, and, while it shouldn't be noticeable by the end user, it can cause problems. -Instead, C will give you a UTF8-encoded B of its +Instead, C will give you a UTF-8-encoded B of its string argument. This is useful for having the data available for comparisons and so on, without harming the original SV. There's also C to go the other way, but naturally, this will fail if @@ -2414,27 +2414,27 @@ Not really. Just remember these things: =item * -There's no way to tell if a string is UTF8 or not. You can tell if an SV -is UTF8 by looking at is C flag. Don't forget to set the flag if -something should be UTF8. Treat the flag as part of the PV, even though +There's no way to tell if a string is UTF-8 or not. You can tell if an SV +is UTF-8 by looking at its C flag. Don't forget to set the flag if +something should be UTF-8. Treat the flag as part of the PV, even though it's not - if you pass on the PV to somewhere, pass on the flag too. =item * -If a string is UTF8, B use C to get at the value, +If a string is UTF-8, B use C to get at the value, unless C in which case you can use C<*s>. =item * -When writing a character C to a UTF8 string, B use +When writing a character C to a UTF-8 string, B use C, unless C in which case you can use C<*s = uv>. =item * -Mixing UTF8 and non-UTF8 strings is tricky. Use C to get -a new string which is UTF8 encoded. There are tricks you can use to -delay deciding whether you need to use a UTF8 string until you get to a +Mixing UTF-8 and non-UTF-8 strings is tricky. Use C to get +a new string which is UTF-8 encoded. There are tricks you can use to +delay deciding whether you need to use a UTF-8 string until you get to a high character - C is one of those.
=back diff --git a/pod/perlhack.pod b/pod/perlhack.pod index 2d05fc3..c815177 100644 --- a/pod/perlhack.pod +++ b/pod/perlhack.pod @@ -875,7 +875,7 @@ C<"\0">. Line 13 manipulates the flags; since we've changed the PV, any IV or NV values will no longer be valid: if we have C<$a=10; $a.="6";> we don't -want to use the old IV of 10. C is a special UTF8-aware +want to use the old IV of 10. C is a special UTF-8-aware version of C, a macro which turns off the IOK and NOK flags and turns on POK. The final C is a macro which launders tainted data if taint mode is turned on. @@ -1439,7 +1439,7 @@ some things you'll need to know when fiddling with them. Let's now get on and create a simple patch. Here's something Larry suggested: if a C is the first active format during a C, (for example, C) then the resulting string should be treated as -UTF8 encoded. +UTF-8 encoded. How do we prepare to fix this up? First we locate the code in question - the C happens at runtime, so it's going to be in one of the F @@ -1488,7 +1488,7 @@ of C: while (pat < patend) { Now if we see a C which was at the start of the string, we turn on -the UTF8 flag for the output SV, C: +the C flag for the output SV, C: + if (datumtype == 'U' && pat==patcopy+1) + SvUTF8_on(cat); @@ -1574,10 +1574,10 @@ this text in the description of C: =item * If the pattern begins with a C, the resulting string will be treated - as Unicode-encoded. You can force UTF8 encoding on in a string with an - initial C, and the bytes that follow will be interpreted as Unicode - characters. If you don't want this to happen, you can begin your pattern - with C (or anything else) to force Perl not to UTF8 encode your + as UTF-8-encoded Unicode. You can force UTF-8 encoding on in a string + with an initial C, and the bytes that follow will be interpreted as + Unicode characters. 
If you don't want this to happen, you can begin your + pattern with C (or anything else) to force Perl not to UTF-8 encode your string, and then follow this with a C somewhere in your pattern. All done. Now let's create the patch. F tells us diff --git a/pod/perlpodspec.pod b/pod/perlpodspec.pod index 0b35663..e42bd4f 100644 --- a/pod/perlpodspec.pod +++ b/pod/perlpodspec.pod @@ -335,7 +335,7 @@ paragraph. =item "=encoding encodingname" This command, which should occur early in the document (at least -before any non-USASCII data!), declares that this document is +before any non-US-ASCII data!), declares that this document is encoded in the encoding I, which must be an encoding name that L recognizes. (Encoding's list of supported encodings, in L, is useful here.) @@ -352,8 +352,8 @@ there are contradictory "=encoding" lines in the same document (e.g., if there is a "=encoding utf8" early in the document and "=encoding big5" later). Pod processors that recognize BOMs may also complain if they see an "=encoding" line -that contradicts the BOM (e.g., if a document with a UTF16LE BOM -has an "=encoding shiftjis" line). +that contradicts the BOM (e.g., if a document with a UTF-16LE +BOM has an "=encoding shiftjis" line). =back @@ -612,7 +612,7 @@ UTF-16. If the file begins with the three literal byte values 0xEF 0xBB 0xBF =for comment - If toke.c is modified to support UTF32, add mention of those here. + If toke.c is modified to support UTF-32, add mention of those here. =item * diff --git a/pod/perltoc.pod b/pod/perltoc.pod index e1e5d49..0610509 100644 --- a/pod/perltoc.pod +++ b/pod/perltoc.pod @@ -1389,7 +1389,7 @@ file? =item Are Perl regexes DFAs or NFAs? Are they POSIX compliant? -=item What's wrong with using grep or map in a void context? +=item What's wrong with using grep in a void context? =item How can I match strings with multibyte characters? 
@@ -1973,7 +1973,7 @@ width, size, order of arguments, sqrt EXPR, sqrt, srand EXPR, srand, stat FILEHANDLE, stat EXPR, stat, study SCALAR, study, sub NAME BLOCK, sub NAME (PROTO) BLOCK, sub NAME : ATTRS BLOCK, sub NAME (PROTO) : ATTRS BLOCK, substr EXPR,OFFSET,LENGTH,REPLACEMENT, substr EXPR,OFFSET,LENGTH, substr -EXPR,OFFSET, symlink OLDFILE,NEWFILE, syscall LIST, sysopen +EXPR,OFFSET, symlink OLDFILE,NEWFILE, syscall NUMBER, LIST, sysopen FILEHANDLE,FILENAME,MODE, sysopen FILEHANDLE,FILENAME,MODE,PERMS, sysread FILEHANDLE,SCALAR,LENGTH,OFFSET, sysread FILEHANDLE,SCALAR,LENGTH, sysseek FILEHANDLE,POSITION,WHENCE, system LIST, system PROGRAM LIST, syswrite @@ -2158,7 +2158,8 @@ PID,FLAGS, wantarray, warn LIST, write FILEHANDLE, write EXPR, write, y/// C<=head1 I>, C<=head2 I>, C<=head3 I>, C<=head4 I>, C<=over I>, C<=item I>, C<=back>, C<=cut>, C<=pod>, C<=begin I>, C<=end -I>, C<=for I I> +I>, C<=for I I>, C<=encoding +I> =item Formatting Codes @@ -2195,7 +2196,8 @@ notes =item Pod Commands "=head1", "=head2", "=head3", "=head4", "=pod", "=cut", "=over", "=item", -"=back", "=begin formatname", "=end formatname", "=for formatname text..." 
+"=back", "=begin formatname", "=end formatname", "=for formatname text...", +"=encoding encodingname" =item Pod Formatting Codes @@ -2382,7 +2384,7 @@ IO::Handle->input_record_separator(EXPR), $INPUT_RECORD_SEPARATOR, $RS, $/, HANDLE->autoflush(EXPR), $OUTPUT_AUTOFLUSH, $|, IO::Handle->output_field_separator EXPR, $OUTPUT_FIELD_SEPARATOR, $OFS, $,, IO::Handle->output_record_separator EXPR, $OUTPUT_RECORD_SEPARATOR, $ORS, -$\, $LIST_SEPARATOR, $", $SUBSCRIPT_SEPARATOR, $SUBSEP, $;, $OFMT, $#, +$\, $LIST_SEPARATOR, $", $SUBSCRIPT_SEPARATOR, $SUBSEP, $;, $#, HANDLE->format_page_number(EXPR), $FORMAT_PAGE_NUMBER, $%, HANDLE->format_lines_per_page(EXPR), $FORMAT_LINES_PER_PAGE, $=, HANDLE->format_lines_left(EXPR), $FORMAT_LINES_LEFT, $-, @LAST_MATCH_START, @@ -4483,13 +4485,13 @@ A, p, d, s, n, r, f, M, o, j, x =item What B Unicode, anyway? -=item How can I recognise a UTF8 string? +=item How can I recognise a UTF-8 string? -=item How does UTF8 represent Unicode characters? +=item How does UTF-8 represent Unicode characters? -=item How does Perl store UTF8 strings? +=item How does Perl store UTF-8 strings? -=item How do I convert a string to UTF8? +=item How do I convert a string to UTF-8? =item Is there anything else I need to know? 
@@ -4688,7 +4690,8 @@ dMARK, dORIGMARK, dSP, EXTEND, MARK, ORIGMARK, POPi, POPl, POPn, POPp, POPpbytex, POPpx, POPs, PUSHi, PUSHMARK, PUSHn, PUSHp, PUSHs, PUSHu, PUTBACK, SP, SPAGAIN, XPUSHi, XPUSHn, XPUSHp, XPUSHs, XPUSHu, XSRETURN, XSRETURN_IV, XSRETURN_NO, XSRETURN_NV, XSRETURN_PV, XSRETURN_UNDEF, -XSRETURN_YES, XST_mIV, XST_mNO, XST_mNV, XST_mPV, XST_mUNDEF, XST_mYES +XSRETURN_UV, XSRETURN_YES, XST_mIV, XST_mNO, XST_mNV, XST_mPV, XST_mUNDEF, +XST_mYES =item SV Flags @@ -4700,12 +4703,12 @@ get_sv, looks_like_number, newRV_inc, newRV_noinc, newSV, newSViv, newSVnv, newSVpv, newSVpvf, newSVpvn, newSVpvn_share, newSVrv, newSVsv, newSVuv, SvCUR, SvCUR_set, SvEND, SvGROW, SvIOK, SvIOKp, SvIOK_notUV, SvIOK_off, SvIOK_on, SvIOK_only, SvIOK_only_UV, SvIOK_UV, SvIsCOW, -SvIsCOW_shared_hash, SvIV, SvIVx, SvIVX, SvLEN, SvNIOK, SvNIOKp, -SvNIOK_off, SvNOK, SvNOKp, SvNOK_off, SvNOK_on, SvNOK_only, SvNV, SvNVX, -SvNVx, SvOK, SvOOK, SvPOK, SvPOKp, SvPOK_off, SvPOK_on, SvPOK_only, +SvIsCOW_shared_hash, SvIV, SvIVX, SvIVx, SvLEN, SvNIOK, SvNIOKp, +SvNIOK_off, SvNOK, SvNOKp, SvNOK_off, SvNOK_on, SvNOK_only, SvNV, SvNVx, +SvNVX, SvOK, SvOOK, SvPOK, SvPOKp, SvPOK_off, SvPOK_on, SvPOK_only, SvPOK_only_UTF8, SvPV, SvPVbyte, SvPVbytex, SvPVbytex_force, SvPVbyte_force, SvPVbyte_nolen, SvPVutf8, SvPVutf8x, SvPVutf8x_force, -SvPVutf8_force, SvPVutf8_nolen, SvPVx, SvPVX, SvPV_force, SvPV_force_nomg, +SvPVutf8_force, SvPVutf8_nolen, SvPVX, SvPVx, SvPV_force, SvPV_force_nomg, SvPV_nolen, SvREFCNT, SvREFCNT_dec, SvREFCNT_inc, SvROK, SvROK_off, SvROK_on, SvRV, SvSTASH, SvTAINT, SvTAINTED, SvTAINTED_off, SvTAINTED_on, SvTRUE, SvTYPE, SvUNLOCK, SvUOK, SvUPGRADE, SvUTF8, SvUTF8_off, SvUTF8_on, @@ -4733,10 +4736,10 @@ sv_utf8_upgrade_flags, sv_uv, sv_vcatpvfn, sv_vsetpvfn =item Unicode Support bytes_from_utf8, bytes_to_utf8, ibcmp_utf8, is_utf8_char, is_utf8_string, -pv_uni_display, sv_cat_decode, sv_recode_to_utf8, sv_uni_display, -to_utf8_case, to_utf8_fold, to_utf8_lower, to_utf8_title, 
to_utf8_upper, -utf8n_to_uvchr, utf8n_to_uvuni, utf8_distance, utf8_hop, utf8_length, -utf8_to_bytes, utf8_to_uvchr, utf8_to_uvuni, uvchr_to_utf8, +is_utf8_string_loc, pv_uni_display, sv_cat_decode, sv_recode_to_utf8, +sv_uni_display, to_utf8_case, to_utf8_fold, to_utf8_lower, to_utf8_title, +to_utf8_upper, utf8n_to_uvchr, utf8n_to_uvuni, utf8_distance, utf8_hop, +utf8_length, utf8_to_bytes, utf8_to_uvchr, utf8_to_uvuni, uvchr_to_utf8, uvuni_to_utf8_flags =item Variables created by C and C internal functions @@ -5330,7 +5333,7 @@ goto(LABEL) and friends =item Mmap for input -=item Byte to/from UTF8 and UTF8 to/from local conversion +=item Byte to/from UTF-8 and UTF-8 to/from local conversion =item Add sockatmark support @@ -7325,6 +7328,8 @@ a), b), c), d), a), b), c), d) =item The IBM ANSI C Compiler +=item The usenm option + =item Using GNU's gcc for building perl =item Using Large Files with Perl @@ -7469,6 +7474,8 @@ R4 x86, R4 PPC =item Floating point anomalies on BS2000 +=item Using PerlIO and different encodings on ASCII and EBCDIC partitions + =back =item AUTHORS @@ -7893,6 +7900,8 @@ op/lexassign.t, pragma/warnings.t =back +=item Starting From Scratch + =item AUTHOR =item DATE @@ -8463,6 +8472,8 @@ DATAMODEL_NATIVE specified", sh: ar: not found =back +=item SunOS 4.x + =item AUTHOR =item LAST MODIFIED @@ -8886,6 +8897,8 @@ FETCH_I_ATTRIBUTES, MODIFY_I_ATTRIBUTES =item HISTORY +=item CAVEATS + =item SEE ALSO =back @@ -9029,6 +9042,8 @@ semantics =item DESCRIPTION +=item LIMITATIONS + =item SEE ALSO =back @@ -13870,18 +13885,6 @@ B =back -=head2 ExtUtils::Miniperl, writemain - write the C code for perlmain.c - -=over 4 - -=item SYNOPSIS - -=item DESCRIPTION - -=item SEE ALSO - -=back - =head2 ExtUtils::Mkbootstrap - make a bootstrap file for use by DynaLoader =over 4 @@ -14835,6 +14838,8 @@ lock_value, unlock_value B, B +B + =over 4 =item CAVEATS @@ -16134,7 +16139,7 @@ stringify, bstr(), bdiv, Modifying and =, bpow =item DESCRIPTION -Canonical notation, 
Input, Output +Input, Output =item METHODS @@ -17502,7 +17507,7 @@ unix, stdio, perlio, crlf, utf8, bytes, raw, pop =back -=head2 PerlIO::scalar - support module for in-memory IO. +=head2 PerlIO::scalar - in-memory IO, scalar IO =over 4 @@ -17510,6 +17515,8 @@ unix, stdio, perlio, crlf, utf8, bytes, raw, pop =item DESCRIPTION +=item IMPLEMENTATION NOTE + =back =head2 PerlIO::via - Helper class for PerlIO layers implemented in perl @@ -19121,6 +19128,8 @@ C =item DESCRIPTION +=item BUGS + =back =head2 Sys::Hostname - Try every conceivable way to get hostname diff --git a/pod/perltodo.pod b/pod/perltodo.pod index e434a1d..4defa86 100644 --- a/pod/perltodo.pod +++ b/pod/perltodo.pod @@ -37,7 +37,7 @@ For displaying PVs with control characters, embedded nulls, and Unicode. This would be useful for printing warnings, or data and regex dumping, not_a_number(), and so on. -Requirements: should handle both byte and UTF8 strings. isPRINT() +Requirements: should handle both byte and UTF-8 strings. isPRINT() characters printed as-is, character less than 256 as \xHH, Unicode characters as \x{HHH}. Don't assume ASCII-like, either, get somebody on EBCDIC to test the output. @@ -806,7 +806,7 @@ Benjamin Sugars has done this. Nick Ing-Simmons' C supports an C IO method. -=head2 Byte to/from UTF8 and UTF8 to/from local conversion +=head2 Byte to/from UTF-8 and UTF-8 to/from local conversion C provides this. diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 5f9ee29..1101b5e 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -904,7 +904,7 @@ Like UTF-8 but EBCDIC-safe, in the way that UTF-8 is ASCII-safe. =item * -UTF-16, UTF-16BE, UTF16-LE, Surrogates, and BOMs (Byte Order Marks) +UTF-16, UTF-16BE, UTF-16LE, Surrogates, and BOMs (Byte Order Marks) The followings items are mostly for reference and general Unicode knowledge, Perl doesn't use these constructs internally. @@ -956,7 +956,7 @@ format". 
=item * -UTF-32, UTF-32BE, UTF32-LE +UTF-32, UTF-32BE, UTF-32LE The UTF-32 family is pretty much like the UTF-16 family, except that the units are 32-bit, and therefore the surrogate scheme is not diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index eadcedd..71d0e57 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -504,7 +504,7 @@ Yet another way would be to use the Devel::Peek module: perl -MDevel::Peek -e 'Dump(chr(0x100))' -That shows the UTF8 flag in FLAGS and both the UTF-8 bytes +That shows the C flag in FLAGS and both the UTF-8 bytes and Unicode characters in C. See also later in this document the discussion about the C function. @@ -638,7 +638,7 @@ C<$string>. If the flag is off, the bytes in the scalar are interpreted as a single byte encoding. If the flag is on, the bytes in the scalar are interpreted as the (multi-byte, variable-length) UTF-8 encoded code points of the characters. Bytes added to an UTF-8 encoded string are -automatically upgraded to UTF-8. If mixed non-UTF8 and UTF-8 scalars +automatically upgraded to UTF-8. 
If mixed non-UTF-8 and UTF-8 scalars are merged (double-quoted interpolation, explicit concatenation, and printf/sprintf parameter substitution), the result will be UTF-8 encoded as if copies of the byte strings were upgraded to UTF-8: for example, diff --git a/pp_hot.c b/pp_hot.c index 0ad2fcf..70d4e6d 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -141,7 +141,7 @@ PP(pp_concat) if (TARG == right && right != left) { right = sv_2mortal(newSVpvn(rpv, rlen)); - rpv = SvPV(right, rlen); /* no point setting UTF8 here */ + rpv = SvPV(right, rlen); /* no point setting UTF-8 here */ rcopied = TRUE; } diff --git a/pp_sys.c b/pp_sys.c index 72af678..0a3c95e 100644 --- a/pp_sys.c +++ b/pp_sys.c @@ -1579,7 +1579,7 @@ PP(pp_sysread) } if ((fp_utf8 = PerlIO_isutf8(IoIFP(io))) && !IN_BYTES) { buffer = SvPVutf8_force(bufsv, blen); - /* UTF8 may not have been set if they are all low bytes */ + /* UTF-8 may not have been set if they are all low bytes */ SvUTF8_on(bufsv); } else { diff --git a/sv.c b/sv.c index e1d7715..51bd17e 100644 --- a/sv.c +++ b/sv.c @@ -3261,7 +3261,7 @@ Perl_sv_copypv(pTHX_ SV *dsv, register SV *ssv) =for apidoc sv_2pvbyte_nolen Return a pointer to the byte-encoded representation of the SV. -May cause the SV to be downgraded from UTF8 as a side-effect. +May cause the SV to be downgraded from UTF-8 as a side-effect. Usually accessed via the C macro. @@ -3279,7 +3279,7 @@ Perl_sv_2pvbyte_nolen(pTHX_ register SV *sv) =for apidoc sv_2pvbyte Return a pointer to the byte-encoded representation of the SV, and set *lp -to its length. May cause the SV to be downgraded from UTF8 as a +to its length. May cause the SV to be downgraded from UTF-8 as a side-effect. Usually accessed via the C macro. @@ -3297,8 +3297,8 @@ Perl_sv_2pvbyte(pTHX_ register SV *sv, STRLEN *lp) /* =for apidoc sv_2pvutf8_nolen -Return a pointer to the UTF8-encoded representation of the SV. -May cause the SV to be upgraded to UTF8 as a side-effect. 
+Return a pointer to the UTF-8-encoded representation of the SV. +May cause the SV to be upgraded to UTF-8 as a side-effect. Usually accessed via the C macro. @@ -3315,8 +3315,8 @@ Perl_sv_2pvutf8_nolen(pTHX_ register SV *sv) /* =for apidoc sv_2pvutf8 -Return a pointer to the UTF8-encoded representation of the SV, and set *lp -to its length. May cause the SV to be upgraded to UTF8 as a side-effect. +Return a pointer to the UTF-8-encoded representation of the SV, and set *lp +to its length. May cause the SV to be upgraded to UTF-8 as a side-effect. Usually accessed via the C macro. @@ -3390,7 +3390,7 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv) /* =for apidoc sv_utf8_upgrade -Convert the PV of an SV to its UTF8-encoded form. +Convert the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. Always sets the SvUTF8 flag to avoid future validity checks even if all the bytes have hibit clear. @@ -3400,7 +3400,7 @@ use the Encode extension for that. =for apidoc sv_utf8_upgrade_flags -Convert the PV of an SV to its UTF8-encoded form. +Convert the PV of an SV to its UTF-8-encoded form. Forces the SV to string form if it is not already. Always sets the SvUTF8 flag to avoid future validity checks even if all the bytes have hibit clear. If C has C bit set, @@ -3470,7 +3470,7 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags) /* =for apidoc sv_utf8_downgrade -Attempt to convert the PV of an SV from UTF8-encoded to byte encoding. +Attempt to convert the PV of an SV from UTF-8-encoded to byte encoding. This may not be possible if the PV contains non-byte encoding characters; if this is the case, either returns false or, if C is not true, croaks. @@ -3514,7 +3514,7 @@ Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok) /* =for apidoc sv_utf8_encode -Convert the PV of an SV to UTF8-encoded, but then turn off the C +Convert the PV of an SV to UTF-8-encoded, but then turn off the C flag so that it looks like octets again. 
Used as a building block for encode_utf8 in Encode.xs @@ -4575,15 +4575,15 @@ Perl_sv_catpvn(pTHX_ SV *dsv, const char* sstr, STRLEN slen) =for apidoc sv_catpvn Concatenates the string onto the end of the string which is in the SV. The -C indicates number of bytes to copy. If the SV has the UTF8 -status set, then the bytes appended should be valid UTF8. +C indicates number of bytes to copy. If the SV has the UTF-8 +status set, then the bytes appended should be valid UTF-8. Handles 'get' magic, but not 'set' magic. See C. =for apidoc sv_catpvn_flags Concatenates the string onto the end of the string which is in the SV. The -C indicates number of bytes to copy. If the SV has the UTF8 -status set, then the bytes appended should be valid UTF8. +C indicates number of bytes to copy. If the SV has the UTF-8 +status set, then the bytes appended should be valid UTF-8. If C has C bit set, will C on C if appropriate, else not. C and C are implemented in terms of this function. @@ -4705,8 +4705,8 @@ Perl_sv_catsv_mg(pTHX_ SV *dsv, register SV *ssv) =for apidoc sv_catpv Concatenates the string onto the end of the string which is in the SV. -If the SV has the UTF8 status set, then the bytes appended should be -valid UTF8. Handles 'get' magic, but not 'set' magic. See C. +If the SV has the UTF-8 status set, then the bytes appended should be +valid UTF-8. Handles 'get' magic, but not 'set' magic. See C. =cut */ @@ -5652,7 +5652,7 @@ Perl_sv_len(pTHX_ register SV *sv) =for apidoc sv_len_utf8 Returns the number of characters in the string in an SV, counting wide -UTF8 bytes as a single character. Handles magic and type coercion. +UTF-8 bytes as a single character. Handles magic and type coercion. 
=cut */ @@ -5851,7 +5851,7 @@ S_utf8_mg_pos(pTHX_ SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, I32 *offsetp, I /* =for apidoc sv_pos_u2b -Converts the value pointed to by offsetp from a count of UTF8 chars from +Converts the value pointed to by offsetp from a count of UTF-8 chars from the start of the string, to a count of the equivalent number of bytes; if lenp is non-zero, it does the same to lenp, but this time starting from the offset, rather than from the start of the string. Handles magic and @@ -5931,7 +5931,7 @@ Perl_sv_pos_u2b(pTHX_ register SV *sv, I32* offsetp, I32* lenp) =for apidoc sv_pos_b2u Converts the value pointed to by offsetp from a count of bytes from the -start of the string, to a count of the equivalent number of UTF8 chars. +start of the string, to a count of the equivalent number of UTF-8 chars. Handles magic and type coercion. =cut diff --git a/sv.h b/sv.h index d2113ae..38fdb12 100644 --- a/sv.h +++ b/sv.h @@ -464,7 +464,7 @@ Unsets the PV status of an SV. =for apidoc Am|void|SvPOK_only|SV* sv Tells an SV that it is a string and disables all other OK bits. -Will also turn off the UTF8 status. +Will also turn off the UTF-8 status. =for apidoc Am|bool|SvVOK|SV* sv Returns a boolean indicating whether the SV contains a v-string. @@ -584,15 +584,15 @@ Set the length of the string which is in the SV. See C. Returns a boolean indicating whether the SV contains UTF-8 encoded data. =for apidoc Am|void|SvUTF8_on|SV *sv -Turn on the UTF8 status of an SV (the data is not changed, just the flag). +Turn on the UTF-8 status of an SV (the data is not changed, just the flag). Do not use frivolously. =for apidoc Am|void|SvUTF8_off|SV *sv -Unsets the UTF8 status of an SV. +Unsets the UTF-8 status of an SV. =for apidoc Am|void|SvPOK_only_UTF8|SV* sv Tells an SV that it is a string and disables all other OK bits, -and leaves the UTF8 status as it was. +and leaves the UTF-8 status as it was. 
=cut */ diff --git a/utf8.c b/utf8.c index ad8758e..30bb7cf 100644 --- a/utf8.c +++ b/utf8.c @@ -31,7 +31,7 @@ static char unees[] = "Malformed UTF-8 character (unexpected end of string)"; =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags -Adds the UTF8 representation of the Unicode codepoint C to the end +Adds the UTF-8 representation of the Unicode codepoint C to the end of the string C; C should be have at least C free bytes available. The return value is the pointer to the byte after the end of the new character. In other words, @@ -218,9 +218,9 @@ Perl_is_utf8_char(pTHX_ U8 *s) =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len Returns true if first C bytes of the given string form a valid -UTF8 string, false otherwise. Note that 'a valid UTF8 string' does -not mean 'a string that contains code points above 0x7F encoded in -UTF8' because a valid ASCII string is a valid UTF8 string. +UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does +not mean 'a string that contains code points above 0x7F encoded in UTF-8' +because a valid ASCII string is a valid UTF-8 string. =cut */ @@ -310,10 +310,10 @@ Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p) Bottom level UTF-8 decode routine. Returns the unicode code point value of the first character in the string C -which is assumed to be in UTF8 encoding and no longer than C; +which is assumed to be in UTF-8 encoding and no longer than C; C will be set to the length, in bytes, of that character. -If C does not point to a well-formed UTF8 character, the behaviour +If C does not point to a well-formed UTF-8 character, the behaviour is dependent on the value of C: if it contains UTF8_CHECK_ONLY, it is assumed that the caller will raise a warning, and this function will silently just set C to C<-1> and return zero. 
If the @@ -533,10 +533,10 @@ malformed: =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen Returns the native character value of the first character in the string C -which is assumed to be in UTF8 encoding; C will be set to the +which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. -If C does not point to a well-formed UTF8 character, zero is +If C does not point to a well-formed UTF-8 character, zero is returned and retlen is set, if possible, to -1. =cut @@ -553,13 +553,13 @@ Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen) =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen Returns the Unicode code point of the first character in the string C -which is assumed to be in UTF8 encoding; C will be set to the +which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. This function should only be used when returned UV is considered an index into the Unicode semantic tables (e.g. swashes). -If C does not point to a well-formed UTF8 character, zero is +If C does not point to a well-formed UTF-8 character, zero is returned and retlen is set, if possible, to -1. =cut @@ -625,7 +625,7 @@ Perl_utf8_length(pTHX_ U8 *s, U8 *e) /* =for apidoc A|IV|utf8_distance|U8 *a|U8 *b -Returns the number of UTF8 characters between the UTF-8 pointers C +Returns the number of UTF-8 characters between the UTF-8 pointers C and C. WARNING: use only if you *know* that the pointers point inside the @@ -720,7 +720,7 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off) /* =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len -Converts a string C of length C from UTF8 into byte encoding. +Converts a string C of length C from UTF-8 into byte encoding. Unlike C, this over-writes the original string, and updates len to contain the new length. Returns zero on failure, setting C to -1. 
@@ -735,7 +735,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) U8 *d; U8 *save = s; - /* ensure valid UTF8 and chars < 256 before updating string */ + /* ensure valid UTF-8 and chars < 256 before updating string */ for (send = s + *len; s < send; ) { U8 c = *s++; @@ -761,7 +761,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) /* =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8 -Converts a string C of length C from UTF8 into byte encoding. +Converts a string C of length C from UTF-8 into byte encoding. Unlike but like C, returns a pointer to the newly-created string, and updates C to contain the new length. Returns the original string if no conversion occurs, C @@ -782,7 +782,7 @@ Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8) if (!*is_utf8) return start; - /* ensure valid UTF8 and chars < 256 before converting string */ + /* ensure valid UTF-8 and chars < 256 before converting string */ for (send = s + *len; s < send;) { U8 c = *s++; if (!UTF8_IS_INVARIANT(c)) { @@ -815,11 +815,11 @@ Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8) /* =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len -Converts a string C of length C from ASCII into UTF8 encoding. +Converts a string C of length C from ASCII into UTF-8 encoding. Returns a pointer to the newly-created string, and sets C to reflect the new length. -If you want to convert to UTF8 from other encodings than ASCII, +If you want to convert to UTF-8 from other encodings than ASCII, see sv_recode_to_utf8(). 
=cut @@ -1660,7 +1660,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8) if (klen == 0) { /* If char in invariant then swatch is for all the invariant chars - * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK + * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK */ needents = UTF_CONTINUATION_MARK; off = NATIVE_TO_UTF(ptr[klen]); @@ -1764,7 +1764,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8) /* =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv -Adds the UTF8 representation of the Native codepoint C to the end +Adds the UTF-8 representation of the Native codepoint C to the end of the string C; C should be have at least C free bytes available. The return value is the pointer to the byte after the end of the new character. In other words, @@ -1798,7 +1798,7 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags Returns the native character value of the first character in the string C -which is assumed to be in UTF8 encoding; C will be set to the +which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. Allows length and flags to be passed to low level routine. diff --git a/utf8.h b/utf8.h index 72dd15f..d5a8845 100644 --- a/utf8.h +++ b/utf8.h @@ -8,7 +8,7 @@ */ /* Use UTF-8 as the default script encoding? - * Turning this on will break scripts having non-UTF8 binary + * Turning this on will break scripts having non-UTF-8 binary * data (such as Latin-1) in string literals. */ #ifdef USE_UTF8_SCRIPTS # define USE_UTF8_IN_NAMES (!IN_BYTES) @@ -162,7 +162,7 @@ encoded character. 
#define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1) #define isALNUM_lazy(p) isALNUM_lazy_if(p,1) -/* how wide can a single UTF8 encoded character become */ +/* how wide can a single UTF-8 encoded character become */ #define UTF8_MAXLEN 13 /* how wide a character can become when upper/lowercased */ #define UTF8_MAXLEN_UCLC_MULT 3 diff --git a/utfebcdic.h b/utfebcdic.h index 9659315..e47e90d 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -339,7 +339,7 @@ END_EXTERN_C #define UTF8SKIP(s) PL_utf8skip[*(U8*)s] -/* EBCDIC-happy ways of converting native code to UTF8 */ +/* EBCDIC-happy ways of converting native code to UTF-8 */ /* Native to iso-8859-1 */ #define NATIVE_TO_ASCII(ch) PL_e2a[(U8)(ch)]