From 1bd51a4ce2ce8ac9d8d3503b73fce0e0fc8f8426 Mon Sep 17 00:00:00 2001 From: Inaba Hiroto Date: Mon, 10 Feb 2003 09:04:51 +0900 Subject: [PATCH] Re: [PATCH] Re: format, PerlIO and utf8 Message-ID: <3E466E12.E5349D84@st.rim.or.jp> p4raw-id: //depot/perl@18785 --- pp_ctl.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++----------- toke.c | 6 ++++++ 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/pp_ctl.c b/pp_ctl.c index 1528168..e22297e 100644 --- a/pp_ctl.c +++ b/pp_ctl.c @@ -355,7 +355,8 @@ PP(pp_formline) bool gotsome = FALSE; STRLEN len; STRLEN fudge = SvCUR(tmpForm) * (IN_BYTES ? 1 : 3) + 1; - bool item_is_utf = FALSE; + bool item_is_utf8 = FALSE; + bool targ_is_utf8 = FALSE; if (!SvMAGICAL(tmpForm) || !SvCOMPILED(tmpForm)) { if (SvREADONLY(tmpForm)) { @@ -366,8 +367,9 @@ PP(pp_formline) else doparseform(tmpForm); } - SvPV_force(PL_formtarget, len); + if (DO_UTF8(PL_formtarget)) + targ_is_utf8 = TRUE; t = SvGROW(PL_formtarget, len + fudge + 1); /* XXX SvCUR bad */ t += len; f = SvPV(tmpForm, len); @@ -414,6 +416,26 @@ PP(pp_formline) case FF_LITERAL: arg = *fpc++; + if (targ_is_utf8 && !SvUTF8(tmpForm)) { + while (arg--) { + if (!NATIVE_IS_INVARIANT(*f)) { + U8 ch = NATIVE_TO_ASCII(*f++); + *t++ = (U8)UTF8_EIGHT_BIT_HI(ch); + *t++ = (U8)UTF8_EIGHT_BIT_LO(ch); + } + else + *t++ = *f++; + } + break; + } + if (!targ_is_utf8 && DO_UTF8(tmpForm)) { + SvCUR_set(PL_formtarget, t - SvPVX(PL_formtarget)); + *t = '\0'; + sv_utf8_upgrade(PL_formtarget); + SvGROW(PL_formtarget, SvCUR(PL_formtarget) + fudge + 1); + t = SvEND(PL_formtarget); + targ_is_utf8 = TRUE; + } while (arg--) *t++ = *f++; break; @@ -458,13 +480,13 @@ PP(pp_formline) break; s++; } - item_is_utf = TRUE; + item_is_utf8 = TRUE; itemsize = s - item; sv_pos_b2u(sv, &itemsize); break; } } - item_is_utf = FALSE; + item_is_utf8 = FALSE; if (itemsize > fieldsize) itemsize = fieldsize; send = chophere = s + itemsize; @@ -519,11 +541,11 @@ PP(pp_formline) itemsize = chophere - item; sv_pos_b2u(sv, &itemsize); } - item_is_utf = TRUE; + item_is_utf8 = TRUE; break; } } - item_is_utf = FALSE; + item_is_utf8 = FALSE; if (itemsize <= fieldsize) { send = chophere = s + itemsize; while (s < send) { @@ -579,7 +601,15 @@ PP(pp_formline) case FF_ITEM: arg = itemsize; s = item; - if (item_is_utf) { + if (item_is_utf8) { + if (!targ_is_utf8) { + SvCUR_set(PL_formtarget, t - SvPVX(PL_formtarget)); + *t = '\0'; + sv_utf8_upgrade(PL_formtarget); + SvGROW(PL_formtarget, SvCUR(PL_formtarget) + fudge + 1); + t = SvEND(PL_formtarget); + targ_is_utf8 = TRUE; + } while (arg--) { if (UTF8_IS_CONTINUED(*s)) { STRLEN skip = UTF8SKIP(s); @@ -628,22 +658,32 @@ PP(pp_formline) case FF_LINEGLOB: item = s = SvPV(sv, len); itemsize = len; - item_is_utf = FALSE; /* XXX is this correct? */ + if ((item_is_utf8 = DO_UTF8(sv))) + itemsize = sv_len_utf8(sv); if (itemsize) { + bool chopped = FALSE; gotsome = TRUE; - send = s + itemsize; + send = s + len; while (s < send) { if (*s++ == '\n') { - if (s == send) + if (s == send) { itemsize--; + chopped = TRUE; + } else lines++; } } SvCUR_set(PL_formtarget, t - SvPVX(PL_formtarget)); - sv_catpvn(PL_formtarget, item, itemsize); + if (targ_is_utf8) + SvUTF8_on(PL_formtarget); + sv_catsv(PL_formtarget, sv); + if (chopped) + SvCUR_set(PL_formtarget, SvCUR(PL_formtarget) - 1); SvGROW(PL_formtarget, SvCUR(PL_formtarget) + fudge + 1); t = SvPVX(PL_formtarget) + SvCUR(PL_formtarget); + if (item_is_utf8) + targ_is_utf8 = TRUE; } break; @@ -739,6 +779,8 @@ PP(pp_formline) if (strnEQ(linemark, linemark - arg, arg)) DIE(aTHX_ "Runaway format"); } + if (targ_is_utf8) + SvUTF8_on(PL_formtarget); FmLINES(PL_formtarget) = lines; SP = ORIGMARK; RETURNOP(cLISTOP->op_first); @@ -778,6 +820,8 @@ PP(pp_formline) case FF_END: *t = '\0'; SvCUR_set(PL_formtarget, t - SvPVX(PL_formtarget)); + if (targ_is_utf8) + SvUTF8_on(PL_formtarget); FmLINES(PL_formtarget) += lines; SP = ORIGMARK; RETPUSHYES; diff --git a/toke.c b/toke.c index 3e611e9..bc84fda 100644 --- a/toke.c +++ b/toke.c @@ -7617,6 +7617,12 @@ S_scan_formline(pTHX_ register char *s) } else PL_lex_state = LEX_FORMLINE; + if (!IN_BYTES) { + if (UTF && is_utf8_string((U8*)SvPVX(stuff), SvCUR(stuff))) + SvUTF8_on(stuff); + else if (PL_encoding) + sv_recode_to_utf8(stuff, PL_encoding); + } PL_nextval[PL_nexttoke].opval = (OP*)newSVOP(OP_CONST, 0, stuff); force_next(THING); PL_nextval[PL_nexttoke].ival = OP_FORMLINE; -- 2.7.4