From: Jarkko Hietaniemi <jhi@iki.fi>
Date: Sat, 22 Jan 2005 00:20:12 +0000 (+0200)
Subject: Re: uc($long_utf8_string) exhausts memory
X-Git-Tag: accepted/trunk/20130322.191538~21242
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=89ebb4a3f2a55825eeed13aaf58db5c73d2140ef;p=platform%2Fupstream%2Fperl.git

Re: uc($long_utf8_string) exhausts memory
Message-Id:  <41F1801C.3080201@iki.fi>

Make buffer size estimates for utf8 case conversion less maximally
pessimistic

p4raw-id: //depot/perl@23857
---

diff --git a/doop.c b/doop.c
index b1de08d..9e5a60d 100644
--- a/doop.c
+++ b/doop.c
@@ -338,7 +338,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)
 
     if (grows) {
 	/* d needs to be bigger than s, in case e.g. upgrading is required */
-	New(0, d, len*3+UTF8_MAXLEN, U8);
+	New(0, d, len * 3 + UTF8_MAXBYTES, U8);
 	dend = d + len * 3;
 	dstart = d;
     }
@@ -370,10 +370,10 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)
 
 	if (d > dend) {
 	    STRLEN clen = d - dstart;
-	    STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+	    STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
 	    if (!grows)
 		Perl_croak(aTHX_ "panic: do_trans_simple_utf8 line %d",__LINE__);
-	    Renew(dstart, nlen+UTF8_MAXLEN, U8);
+	    Renew(dstart, nlen + UTF8_MAXBYTES, U8);
 	    d = dstart + clen;
 	    dend = dstart + nlen;
 	}
@@ -480,7 +480,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv)
 
     if (grows) {
 	/* d needs to be bigger than s, in case e.g. upgrading is required */
-	New(0, d, len*3+UTF8_MAXLEN, U8);
+	New(0, d, len * 3 + UTF8_MAXBYTES, U8);
 	dend = d + len * 3;
 	dstart = d;
     }
@@ -496,10 +496,10 @@ S_do_trans_complex_utf8(pTHX_ SV *sv)
 	
 	    if (d > dend) {
 	        STRLEN clen = d - dstart;
-		STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+		STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
 		if (!grows)
 		    Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
-		Renew(dstart, nlen+UTF8_MAXLEN, U8);
+		Renew(dstart, nlen + UTF8_MAXBYTES, U8);
 		d = dstart + clen;
 		dend = dstart + nlen;
 	    }
@@ -550,10 +550,10 @@ S_do_trans_complex_utf8(pTHX_ SV *sv)
 	    uv = swash_fetch(rv, s, TRUE);
 	    if (d > dend) {
 	        STRLEN clen = d - dstart;
-		STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+		STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
 		if (!grows)
 		    Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
-		Renew(dstart, nlen+UTF8_MAXLEN, U8);
+		Renew(dstart, nlen + UTF8_MAXBYTES, U8);
 		d = dstart + clen;
 		dend = dstart + nlen;
 	    }
diff --git a/op.c b/op.c
index ab9f374..77a213e 100644
--- a/op.c
+++ b/op.c
@@ -2423,7 +2423,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 */
 
 	if (complement) {
-	    U8 tmpbuf[UTF8_MAXLEN+1];
+	    U8 tmpbuf[UTF8_MAXBYTES+1];
 	    UV *cp;
 	    UV nextmin = 0;
 	    New(1109, cp, 2*tlen, UV);
diff --git a/pp.c b/pp.c
index a90d9ee..12f5bfb 100644
--- a/pp.c
+++ b/pp.c
@@ -3344,7 +3344,7 @@ PP(pp_ord)
     }
 
     XPUSHu(DO_UTF8(argsv) ?
-	   utf8n_to_uvchr(s, UTF8_MAXLEN, 0, UTF8_ALLOW_ANYUV) :
+	   utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, UTF8_ALLOW_ANYUV) :
 	   (*s & 0xff));
 
     RETURN;
@@ -3454,7 +3454,7 @@ PP(pp_ucfirst)
     if (DO_UTF8(sv) &&
 	(s = (U8*)SvPV_nomg(sv, slen)) && slen &&
 	UTF8_IS_START(*s)) {
-	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
 	STRLEN ulen;
 	STRLEN tculen;
 
@@ -3517,7 +3517,7 @@ PP(pp_lcfirst)
 	(s = (U8*)SvPV_nomg(sv, slen)) && slen &&
 	UTF8_IS_START(*s)) {
 	STRLEN ulen;
-	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
 	U8 *tend;
 	UV uv;
 
@@ -3574,7 +3574,7 @@ PP(pp_uc)
 	STRLEN ulen;
 	register U8 *d;
 	U8 *send;
-	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+	U8 tmpbuf[UTF8_MAXBYTES+1];
 
 	s = (U8*)SvPV_nomg(sv,len);
 	if (!len) {
@@ -3583,18 +3583,28 @@ PP(pp_uc)
 	    SETs(TARG);
 	}
 	else {
-	    STRLEN nchar = utf8_length(s, s + len);
-
 	    (void)SvUPGRADE(TARG, SVt_PV);
-	    SvGROW(TARG, (nchar * UTF8_MAXLEN_UCLC) + 1);
+	    SvGROW(TARG, len + 1);
 	    (void)SvPOK_only(TARG);
 	    d = (U8*)SvPVX(TARG);
 	    send = s + len;
 	    while (s < send) {
+		STRLEN u = UTF8SKIP(s);
+
 		toUPPER_utf8(s, tmpbuf, &ulen);
+		if (ulen > u) {
+		    UV o = d - (U8*)SvPVX(TARG);
+
+		    /* If someone uppercases one million U+03B0s we
+		     * SvGROW() one million times.  Or we could try
+		     * guess how much to allocate without overdoing.
+		     * Such is life. */
+		    SvGROW(TARG, SvCUR(TARG) + ulen - u);
+		    d = (U8*)SvPVX(TARG) + o;
+		}
 		Copy(tmpbuf, d, ulen, U8);
 		d += ulen;
-		s += UTF8SKIP(s);
+		s += u;
 	    }
 	    *d = '\0';
 	    SvUTF8_on(TARG);
@@ -3643,7 +3653,7 @@ PP(pp_lc)
 	STRLEN ulen;
 	register U8 *d;
 	U8 *send;
-	U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
 
 	s = (U8*)SvPV_nomg(sv,len);
 	if (!len) {
@@ -3652,16 +3662,16 @@ PP(pp_lc)
 	    SETs(TARG);
 	}
 	else {
-	    STRLEN nchar = utf8_length(s, s + len);
-
 	    (void)SvUPGRADE(TARG, SVt_PV);
-	    SvGROW(TARG, (nchar * UTF8_MAXLEN_UCLC) + 1);
+	    SvGROW(TARG, len + 1);
 	    (void)SvPOK_only(TARG);
 	    d = (U8*)SvPVX(TARG);
 	    send = s + len;
 	    while (s < send) {
+		STRLEN u = UTF8SKIP(s);
 		UV uv = toLOWER_utf8(s, tmpbuf, &ulen);
-#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode */
+
+#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */
 		if (uv == GREEK_CAPITAL_LETTER_SIGMA) {
 		     /*
 		      * Now if the sigma is NOT followed by
@@ -3675,12 +3685,24 @@ PP(pp_lc)
 		      * then it should be mapped to 0x03C2,
 		      * (GREEK SMALL LETTER FINAL SIGMA),
 		      * instead of staying 0x03A3.
-		      * See lib/unicore/SpecCase.txt.
+		      * "should be": in other words,
+		      * this is not implemented yet.
+		      * See lib/unicore/SpecialCasing.txt.
 		      */
 		}
+		if (ulen > u) {
+		    UV o = d - (U8*)SvPVX(TARG);
+
+		    /* If someone lowercases one million U+0130s we
+		     * SvGROW() one million times.  Or we could try
+		     * guess how much to allocate without overdoing.
+		     Such is life. */
+		    SvGROW(TARG, SvCUR(TARG) + ulen - u);
+		    d = (U8*)SvPVX(TARG) + o;
+		}
 		Copy(tmpbuf, d, ulen, U8);
 		d += ulen;
-		s += UTF8SKIP(s);
+		s += u;
 	    }
 	    *d = '\0';
 	    SvUTF8_on(TARG);
diff --git a/pp_pack.c b/pp_pack.c
index 815c326..2e830f4 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -2183,7 +2183,7 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV
 	    while (len-- > 0) {
 		fromstr = NEXTFROM;
 		auint = UNI_TO_NATIVE(SvUV(fromstr));
-		SvGROW(cat, SvCUR(cat) + UTF8_MAXLEN + 1);
+		SvGROW(cat, SvCUR(cat) + UTF8_MAXBYTES + 1);
 		SvCUR_set(cat,
 			  (char*)uvchr_to_utf8_flags((U8*)SvEND(cat),
 						     auint,
diff --git a/regcomp.c b/regcomp.c
index 2a942d5..ff939b2 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3088,7 +3088,7 @@ tryagain:
 	    char *oldp, *s;
 	    STRLEN numlen;
 	    STRLEN foldlen;
-	    U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
+	    U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
 
             parse_start = RExC_parse - 1;
 
@@ -4199,7 +4199,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
 		else if (prevnatvalue == natvalue) {
 		    Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
 		    if (FOLD) {
-			 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+			 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
 			 STRLEN foldlen;
 			 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
 
@@ -4869,7 +4869,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
 	
 	    if (lv) {
 		if (sw) {
-		    U8 s[UTF8_MAXLEN+1];
+		    U8 s[UTF8_MAXBYTES_CASE+1];
 		
 		    for (i = 0; i <= 256; i++) { /* just the first 256 */
 			U8 *e = uvchr_to_utf8(s, i);
diff --git a/regexec.c b/regexec.c
index 6ed6d95..f254713 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1028,15 +1028,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    if (UTF) {
 	        STRLEN ulen1, ulen2;
 		U8 *sm = (U8 *) m;
-		U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
-		U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+		U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+		U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
 
 		to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
 		to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
 
-		c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, 
+		c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE, 
 				    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
-		c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
+		c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
 				    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 		lnc = 0;
 		while (sm < ((U8 *) m + ln)) {
@@ -1074,15 +1074,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 
 	    if (do_utf8) {
 	        UV c, f;
-	        U8 tmpbuf [UTF8_MAXLEN+1];
-		U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+	        U8 tmpbuf [UTF8_MAXBYTES+1];
+		U8 foldbuf[UTF8_MAXBYTES_CASE+1];
 		STRLEN len, foldlen;
 		
 		if (c1 == c2) {
 		    /* Upper and lower of 1st char are equal -
 		     * probably not a "letter". */
 		    while (s <= e) {
-		        c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+		        c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
 					   ckWARN(WARN_UTF8) ?
 					   0 : UTF8_ALLOW_ANY);
 			if ( c == c1
@@ -1109,7 +1109,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		}
 		else {
 		    while (s <= e) {
-		      c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+		      c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
 					   ckWARN(WARN_UTF8) ?
 					   0 : UTF8_ALLOW_ANY);
 
@@ -2459,7 +2459,7 @@ S_regmatch(pTHX_ regnode *prog)
 			if (l >= PL_regeol)
 			     sayNO;
 			if (NATIVE_TO_UNI(*(U8*)s) !=
-			    utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
+			    utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
 					   ckWARN(WARN_UTF8) ?
 					   0 : UTF8_ALLOW_ANY))
 			     sayNO;
@@ -2473,7 +2473,7 @@ S_regmatch(pTHX_ regnode *prog)
 			if (l >= PL_regeol)
 			    sayNO;
 			if (NATIVE_TO_UNI(*((U8*)l)) !=
-			    utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
+			    utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
 					   ckWARN(WARN_UTF8) ?
 					   0 : UTF8_ALLOW_ANY))
 			    sayNO;
@@ -2806,8 +2806,8 @@ S_regmatch(pTHX_ regnode *prog)
 		 */
 		if (OP(scan) == REFF) {
 		    STRLEN ulen1, ulen2;
-		    U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
-		    U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+		    U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+		    U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
 		    while (s < e) {
 			if (l >= PL_regeol)
 			    sayNO;
@@ -3580,21 +3580,21 @@ S_regmatch(pTHX_ regnode *prog)
 		    else { /* UTF */
 			if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
 			     STRLEN ulen1, ulen2;
-			     U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
-			     U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+			     U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+			     U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
 
 			     to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
 			     to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
 
-			     c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
+			     c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
 						 ckWARN(WARN_UTF8) ?
 						 0 : UTF8_ALLOW_ANY);
-			     c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
+			     c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
 						 ckWARN(WARN_UTF8) ?
 						 0 : UTF8_ALLOW_ANY);
 			}
 			else {
-			    c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
+			    c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
 						     ckWARN(WARN_UTF8) ?
 						     0 : UTF8_ALLOW_ANY);
 			}
@@ -3656,7 +3656,7 @@ S_regmatch(pTHX_ regnode *prog)
 				 * utf8_distance(old, locinput) */
 				while (locinput <= e &&
 				       utf8n_to_uvchr((U8*)locinput,
-						      UTF8_MAXLEN, &len,
+						      UTF8_MAXBYTES, &len,
 						      ckWARN(WARN_UTF8) ?
 						      0 : UTF8_ALLOW_ANY) != (UV)c1) {
 				    locinput += len;
@@ -3667,7 +3667,7 @@ S_regmatch(pTHX_ regnode *prog)
 				 * utf8_distance(old, locinput) */
 				while (locinput <= e) {
 				    UV c = utf8n_to_uvchr((U8*)locinput,
-							  UTF8_MAXLEN, &len,
+							  UTF8_MAXBYTES, &len,
 							  ckWARN(WARN_UTF8) ?
 							  0 : UTF8_ALLOW_ANY);
 				    if (c == (UV)c1 || c == (UV)c2)
@@ -3704,7 +3704,7 @@ S_regmatch(pTHX_ regnode *prog)
 		    if (c1 != -1000) {
 			if (do_utf8)
 			    c = utf8n_to_uvchr((U8*)PL_reginput,
-					       UTF8_MAXLEN, 0,
+					       UTF8_MAXBYTES, 0,
 					       ckWARN(WARN_UTF8) ?
 					       0 : UTF8_ALLOW_ANY);
 			else
@@ -3754,7 +3754,7 @@ S_regmatch(pTHX_ regnode *prog)
 			if (c1 != -1000) {
 			    if (do_utf8)
 				c = utf8n_to_uvchr((U8*)PL_reginput,
-						   UTF8_MAXLEN, 0,
+						   UTF8_MAXBYTES, 0,
 						   ckWARN(WARN_UTF8) ?
 						   0 : UTF8_ALLOW_ANY);
 			    else
@@ -3777,7 +3777,7 @@ S_regmatch(pTHX_ regnode *prog)
 			if (c1 != -1000) {
 			    if (do_utf8)
 				c = utf8n_to_uvchr((U8*)PL_reginput,
-						   UTF8_MAXLEN, 0,
+						   UTF8_MAXBYTES, 0,
 						   ckWARN(WARN_UTF8) ?
 						   0 : UTF8_ALLOW_ANY);
 			    else
@@ -4370,7 +4370,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
     STRLEN plen;
 
     if (do_utf8 && !UTF8_IS_INVARIANT(c))
-	 c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
+	 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
 			    ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 
     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
@@ -4407,7 +4407,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
 			}
 		    }
 		    if (!match) {
-		        U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+		        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
 			STRLEN tmplen;
 
 		        to_utf8_fold(p, tmpbuf, &tmplen);
diff --git a/sv.c b/sv.c
index c7211b7..343c5f0 100644
--- a/sv.c
+++ b/sv.c
@@ -9271,7 +9271,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 #endif
 
 	char esignbuf[4];
-	U8 utf8buf[UTF8_MAXLEN+1];
+	U8 utf8buf[UTF8_MAXBYTES+1];
 	STRLEN esignlen = 0;
 
 	char *eptr = Nullch;
diff --git a/t/op/lc.t b/t/op/lc.t
index cee6661..d75b64e 100644
--- a/t/op/lc.t
+++ b/t/op/lc.t
@@ -111,8 +111,8 @@ is("\u\x{587}" , "\x{535}\x{582}", "ligature titlecase");
 is("\U\x{587}" , "\x{535}\x{552}", "ligature uppercase");
 
 # mktables had problems where many-to-one case mappings didn't work right.
-# The lib/unifold.t should give the fourth folding, "casefolding", a good
-# workout. 
+# The lib/uni/fold.t should give the fourth folding, "casefolding", a good
+# workout (one cannot directly get that from Perl). 
 # \x{01C4} is LATIN CAPITAL LETTER DZ WITH CARON
 # \x{01C5} is LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
 # \x{01C6} is LATIN SMALL LETTER DZ WITH CARON
@@ -161,6 +161,5 @@ for my $a (0,1) {
 	chop $a;
 	$a =~ s/^(\s*)(\w*)/$1\u$2/;
 	is($a, v10, "[perl #18857]");
-	$test++;
     } 
 }
diff --git a/toke.c b/toke.c
index b4222f6..5d5abf4 100644
--- a/toke.c
+++ b/toke.c
@@ -1704,7 +1704,7 @@ S_scan_const(pTHX_ char *start)
 			 UV uv = utf8_to_uvchr((U8*)str, 0);
 
 			 if (uv < 0x100) {
-			      U8 tmpbuf[UTF8_MAXLEN+1], *d;
+			      U8 tmpbuf[UTF8_MAXBYTES+1], *d;
 
 			      d = uvchr_to_utf8(tmpbuf, UNI_TO_NATIVE(uv));
 			      sv_setpvn(res, (char *)tmpbuf, d - tmpbuf);
@@ -7137,7 +7137,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
     I32 brackets = 1;			/* bracket nesting level */
     bool has_utf8 = FALSE;		/* is there any utf8 content? */
     I32 termcode;			/* terminating char. code */
-    U8 termstr[UTF8_MAXLEN];		/* terminating string */
+    U8 termstr[UTF8_MAXBYTES];		/* terminating string */
     STRLEN termlen;			/* length of terminating string */
     char *last = NULL;			/* last position for nesting bracket */
 
@@ -8230,7 +8230,7 @@ Perl_scan_vstring(pTHX_ char *s, SV *sv)
 
     if (!isALPHA(*pos)) {
 	UV rev;
-	U8 tmpbuf[UTF8_MAXLEN+1];
+	U8 tmpbuf[UTF8_MAXBYTES+1];
 	U8 *tmpend;
 
 	if (*s == 'v') s++;  /* get past 'v' */
diff --git a/utf8.c b/utf8.c
index 5c6f271..5fa2aab 100644
--- a/utf8.c
+++ b/utf8.c
@@ -38,7 +38,7 @@ within non-zero characters.
 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
 
 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
 bytes available. The return value is the pointer to the byte after the
 end of the new character. In other words,
 
@@ -551,7 +551,7 @@ returned and retlen is set, if possible, to -1.
 UV
 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 {
-    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
+    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
 			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 }
 
@@ -575,7 +575,7 @@ UV
 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 {
     /* Call the low level routine asking for checks */
-    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
+    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 }
 
@@ -937,7 +937,7 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 bool
 Perl_is_uni_alnum(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_alnum(tmpbuf);
 }
@@ -945,7 +945,7 @@ Perl_is_uni_alnum(pTHX_ UV c)
 bool
 Perl_is_uni_alnumc(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_alnumc(tmpbuf);
 }
@@ -953,7 +953,7 @@ Perl_is_uni_alnumc(pTHX_ UV c)
 bool
 Perl_is_uni_idfirst(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_idfirst(tmpbuf);
 }
@@ -961,7 +961,7 @@ Perl_is_uni_idfirst(pTHX_ UV c)
 bool
 Perl_is_uni_alpha(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_alpha(tmpbuf);
 }
@@ -969,7 +969,7 @@ Perl_is_uni_alpha(pTHX_ UV c)
 bool
 Perl_is_uni_ascii(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_ascii(tmpbuf);
 }
@@ -977,7 +977,7 @@ Perl_is_uni_ascii(pTHX_ UV c)
 bool
 Perl_is_uni_space(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_space(tmpbuf);
 }
@@ -985,7 +985,7 @@ Perl_is_uni_space(pTHX_ UV c)
 bool
 Perl_is_uni_digit(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_digit(tmpbuf);
 }
@@ -993,7 +993,7 @@ Perl_is_uni_digit(pTHX_ UV c)
 bool
 Perl_is_uni_upper(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_upper(tmpbuf);
 }
@@ -1001,7 +1001,7 @@ Perl_is_uni_upper(pTHX_ UV c)
 bool
 Perl_is_uni_lower(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_lower(tmpbuf);
 }
@@ -1009,7 +1009,7 @@ Perl_is_uni_lower(pTHX_ UV c)
 bool
 Perl_is_uni_cntrl(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_cntrl(tmpbuf);
 }
@@ -1017,7 +1017,7 @@ Perl_is_uni_cntrl(pTHX_ UV c)
 bool
 Perl_is_uni_graph(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_graph(tmpbuf);
 }
@@ -1025,7 +1025,7 @@ Perl_is_uni_graph(pTHX_ UV c)
 bool
 Perl_is_uni_print(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_print(tmpbuf);
 }
@@ -1033,7 +1033,7 @@ Perl_is_uni_print(pTHX_ UV c)
 bool
 Perl_is_uni_punct(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN+1];
+    U8 tmpbuf[UTF8_MAXBYTES+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_punct(tmpbuf);
 }
@@ -1041,7 +1041,7 @@ Perl_is_uni_punct(pTHX_ UV c)
 bool
 Perl_is_uni_xdigit(pTHX_ UV c)
 {
-    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     uvchr_to_utf8(tmpbuf, c);
     return is_utf8_xdigit(tmpbuf);
 }
@@ -1166,7 +1166,7 @@ Perl_to_uni_upper_lc(pTHX_ U32 c)
     /* XXX returns only the first character -- do not use XXX */
     /* XXX no locale support yet */
     STRLEN len;
-    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     return (U32)to_uni_upper(c, tmpbuf, &len);
 }
 
@@ -1176,7 +1176,7 @@ Perl_to_uni_title_lc(pTHX_ U32 c)
     /* XXX returns only the first character XXX -- do not use XXX */
     /* XXX no locale support yet */
     STRLEN len;
-    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     return (U32)to_uni_title(c, tmpbuf, &len);
 }
 
@@ -1186,7 +1186,7 @@ Perl_to_uni_lower_lc(pTHX_ U32 c)
     /* XXX returns only the first character -- do not use XXX */
     /* XXX no locale support yet */
     STRLEN len;
-    U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     return (U32)to_uni_lower(c, tmpbuf, &len);
 }
 
@@ -1400,7 +1400,7 @@ UV
 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
 {
     UV uv0, uv1;
-    U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
     STRLEN len = 0;
 
     uv0 = utf8_to_uvchr(p, 0);
@@ -1489,9 +1489,8 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma
 
 Convert the UTF-8 encoded character at p to its uppercase version and
 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
-that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
-uppercase version may be longer than the original character (up to two
-characters).
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
+the uppercase version may be longer than the original character.
 
 The first character of the uppercased version is returned
 (but note, as explained above, that there may be more.)
@@ -1510,9 +1509,8 @@ Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 
 Convert the UTF-8 encoded character at p to its titlecase version and
 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
-that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
-titlecase version may be longer than the original character (up to two
-characters).
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
+titlecase version may be longer than the original character.
 
 The first character of the titlecased version is returned
 (but note, as explained above, that there may be more.)
@@ -1531,9 +1529,8 @@ Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 
 Convert the UTF-8 encoded character at p to its lowercase version and
 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
-that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
-lowercase version may be longer than the original character (up to two
-characters).
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
+lowercase version may be longer than the original character.
 
 The first character of the lowercased version is returned
 (but note, as explained above, that there may be more.)
@@ -1552,7 +1549,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
 
 Convert the UTF-8 encoded character at p to its foldcase version and
 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
-that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
 foldcase version may be longer than the original character (up to
 three characters).
 
@@ -1711,7 +1708,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
 	    /* We use utf8n_to_uvuni() as we want an index into
 	       Unicode tables, not a native character number.
 	     */
-	    UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
+	    UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
 					   ckWARN(WARN_UTF8) ?
 					   0 : UTF8_ALLOW_ANY);
 	    SV *errsv_save;
@@ -1778,7 +1775,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
 
 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
 bytes available. The return value is the pointer to the byte after the
 end of the new character. In other words,
 
@@ -1954,8 +1951,8 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const
      register U8 *e1 = 0, *f1 = 0, *q1 = 0;
      register U8 *e2 = 0, *f2 = 0, *q2 = 0;
      STRLEN n1 = 0, n2 = 0;
-     U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
-     U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
+     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
+     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
      U8 natbuf[1+1];
      STRLEN foldlen1, foldlen2;
      bool match;
diff --git a/utf8.h b/utf8.h
index 376280d..c206b3d 100644
--- a/utf8.h
+++ b/utf8.h
@@ -162,14 +162,26 @@ encoded character.
 #define isIDFIRST_lazy(p)	isIDFIRST_lazy_if(p,1)
 #define isALNUM_lazy(p)		isALNUM_lazy_if(p,1)
 
-/* how wide can a single UTF-8 encoded character become */
-#define UTF8_MAXLEN 13
-/* how wide a character can become when upper/lowercased */
-#define UTF8_MAXLEN_UCLC_MULT 3
-#define UTF8_MAXLEN_UCLC (UTF8_MAXLEN*UTF8_MAXLEN_UCLC_MULT)
-/* how wide a character can become when casefolded */
-#define UTF8_MAXLEN_FOLD_MULT 3
-#define UTF8_MAXLEN_FOLD (UTF8_MAXLEN*UTF8_MAXLEN_FOLD_MULT)
+#define UTF8_MAXBYTES 13
+/* How wide can a single UTF-8 encoded character become in bytes.
+ * NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8
+ * since UTF-8 is an encoding of Unicode and given Unicode's current
+ * upper limit only four bytes is possible.  Perl thinks of UTF-8
+ * as a way to encode non-negative integers in a binary format. */
+#define UTF8_MAXLEN UTF8_MAXBYTES
+
+#define UTF8_MAXLEN_UCLC 3		/* Obsolete, do not use. */
+#define UTF8_MAXLEN_UCLC_MULT 39	/* Obsolete, do not use. */
+#define UTF8_MAXLEN_FOLD 3		/* Obsolete, do not use. */
+#define UTF8_MAXLEN_FOLD_MULT 39	/* Obsolete, do not use. */
+
+/* The maximum number of UTF-8 bytes a single Unicode character can
+ * uppercase/lowercase/fold into; this number depends on the Unicode
+ * version.  An example of maximal expansion is the U+03B0 which
+ * uppercases to U+03C5 U+0308 U+0301.  The Unicode databases that
+ * tell these things are UnicodeDatabase.txt, CaseFolding.txt, and
+ * SpecialCasing.txt. */
+#define UTF8_MAXBYTES_CASE	6
 
 #define IN_BYTES (PL_curcop->op_private & HINT_BYTES)
 #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)