From a1ca4561f20dbf547f57d39a690790cbe33210da Mon Sep 17 00:00:00 2001 From: Yitzchak Scott-Thoennes Date: Fri, 10 Nov 2000 01:47:15 -0800 Subject: [PATCH] Tweak the definition of the bit complement on UTF-8 data: if none of the characters in the string are > 0xff, the result is a complemented byte string, not a (UTF-8) char string. Based on the summary in Subject: Re: [ID 20000918.005] ~ on wide chars Message-ID: This should give us the maximum backward (pre-char string) compatibility and utf8 compatibility. The other alternative would be to limit the bit complement to be always byte only, taking the least significant byte of the chars. p4raw-id: //depot/perl@7665 --- doop.c | 2 +- pod/perlunicode.pod | 12 ++++++++++++ pp.c | 39 +++++++++++++++++++++++++++++---------- t/op/bop.t | 36 ++++++++++++++++++++++++++++++++---- 4 files changed, 74 insertions(+), 15 deletions(-) diff --git a/doop.c b/doop.c index 9fd7dfa..3d22eb4 100644 --- a/doop.c +++ b/doop.c @@ -926,7 +926,7 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right) if (left_utf && !right_utf) sv_utf8_upgrade(right); - if (!left_utf && right_utf) + else if (!left_utf && right_utf) sv_utf8_upgrade(left); if (sv != left || (optype != OP_BIT_AND && !SvOK(sv) && !SvGMAGICAL(sv))) diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index e567e18..30a4482 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -198,6 +198,18 @@ byte-oriented C and C under utf8. =item * +The bit string operators C<& | ^ ~> can operate on character data. +However, for backward compatibility reasons (bit string operations +when the characters all are less than 256 in ordinal value) one cannot +mix C<~> (the bit complement) and characters both less than 256 and +equal or greater than 256. Most importantly, the DeMorgan's laws +(C<~($x|$y) eq ~$x&~$y>, C<~($x&$y) eq ~$x|~$y>) won't hold. +Another way to look at this is that the complement cannot return +B the 8-bit (byte) wide bit complement, and the full character +wide bit complement. + +=item * + And finally, C reverses by character rather than by byte. =back diff --git a/pp.c b/pp.c index cc3f7eb..2a414b8 100644 --- a/pp.c +++ b/pp.c @@ -1476,31 +1476,50 @@ PP(pp_complement) tmps = (U8*)SvPV_force(TARG, len); anum = len; if (SvUTF8(TARG)) { - /* Calculate exact length, let's not estimate */ + /* Calculate exact length, let's not estimate. */ STRLEN targlen = 0; U8 *result; U8 *send; STRLEN l; + UV nchar = 0; + UV nwide = 0; send = tmps + len; while (tmps < send) { UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY); tmps += UTF8SKIP(tmps); targlen += UNISKIP(~c); + nchar++; + if (c > 0xff) + nwide++; } /* Now rewind strings and write them. */ tmps -= len; - Newz(0, result, targlen + 1, U8); - while (tmps < send) { - UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY); - tmps += UTF8SKIP(tmps); - result = uv_to_utf8(result,(UV)~c); + + if (nwide) { + Newz(0, result, targlen + 1, U8); + while (tmps < send) { + UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY); + tmps += UTF8SKIP(tmps); + result = uv_to_utf8(result, ~c); + } + *result = '\0'; + result -= targlen; + sv_setpvn(TARG, (char*)result, targlen); + SvUTF8_on(TARG); + } + else { + Newz(0, result, nchar + 1, U8); + while (tmps < send) { + U8 c = (U8)utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY); + tmps += UTF8SKIP(tmps); + *result++ = ~c; + } + *result = '\0'; + result -= nchar; + sv_setpvn(TARG, (char*)result, nchar); } - *result = '\0'; - result -= targlen; - sv_setpvn(TARG, (char*)result, targlen); - SvUTF8_on(TARG); Safefree(result); SETs(TARG); RETURN; diff --git a/t/op/bop.t b/t/op/bop.t index fd080e6..3fad2fd 100755 --- a/t/op/bop.t +++ b/t/op/bop.t @@ -9,7 +9,7 @@ BEGIN { @INC = '../lib'; } -print "1..38\n"; +print "1..40\n"; # numerics print ((0xdead & 0xbeef) == 0x9ead ? "ok 1\n" : "not ok 1\n"); @@ -107,7 +107,7 @@ for (0x100...0xFFF) { if $a ne chr(~$_) or length($a) != 1 or ~$a ne chr($_); } if (@not36) { - print "# test 36 failed: @not36\n"; + print "# test 36 failed\n"; print "not "; } print "ok 36\n"; @@ -120,14 +120,42 @@ for my $i (0xEEE...0xF00) { push @not37, sprintf("%#03X %#03X", $i, $j) if $a ne chr(~$i).chr(~$j) or length($a) != 2 or - ~$a ne chr($i).chr($j); + ~$a ne chr($i).chr($j); } } if (@not37) { - print "# test 37 failed: @not37\n"; + print "# test 37 failed\n"; print "not "; } print "ok 37\n"; print "not " unless ~chr(~0) eq "\0"; print "ok 38\n"; + +my @not39; + +for my $i (0x100..0x120) { + for my $j (0x100...0x120) { + push @not39, sprintf("%#03X %#03X", $i, $j) + if ~(chr($i)|chr($j)) ne (~chr($i)&~chr($j)); + } +} +if (@not39) { + print "# test 39 failed\n"; + print "not "; +} +print "ok 39\n"; + +my @not40; + +for my $i (0x100..0x120) { + for my $j (0x100...0x120) { + push @not40, sprintf("%#03X %#03X", $i, $j) + if ~(chr($i)&chr($j)) ne (~chr($i)|~chr($j)); + } +} +if (@not40) { + print "# test 40 failed\n"; + print "not "; +} +print "ok 40\n"; -- 2.7.4