From a176fa2a176313dc1a9b9594d080f47292ff4070 Mon Sep 17 00:00:00 2001
From: Gurusamy Sarathy <gsar@cpan.org>
Date: Sun, 26 Jul 1998 05:07:05 +0000
Subject: [PATCH] add new files to MANIFEST; add missing prototypes to proto.h;
 s/PL_utf8skip/utf8skip/ for now, or we end up with Perl_PL_; add typecasts to
 silence warnings; tweaks for win32 builds

p4raw-id: //depot/perl@1663
---
 MANIFEST          | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 embed.h           |   1 +
 global.sym        |   1 +
 proto.h           |  22 ++++++++
 regexec.c         |  22 ++++----
 toke.c            |   2 +-
 utf8.h            |   6 +-
 win32/Makefile    |   2 +
 win32/makefile.mk |   4 +-
 9 files changed, 206 insertions(+), 16 deletions(-)

diff --git a/MANIFEST b/MANIFEST
index 056e369..192caef 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -589,6 +589,165 @@ lib/syslog.pl		Perl library supporting syslogging
 lib/tainted.pl		Old code for tainting
 lib/termcap.pl		Perl library supporting termcap usage
 lib/timelocal.pl	Perl library supporting inverse of localtime, gmtime
+lib/unicode/ArabLink.pl				Unicode character database
+lib/unicode/ArabLnkGrp.pl			Unicode character database
+lib/unicode/Bidirectional.pl			Unicode character database
+lib/unicode/Block.pl				Unicode character database
+lib/unicode/Category.pl				Unicode character database
+lib/unicode/CombiningClass.pl			Unicode character database
+lib/unicode/Decomposition.pl			Unicode character database
+lib/unicode/In/AlphabeticPresentationForms.pl	Unicode character database
+lib/unicode/In/Arabic.pl			Unicode character database
+lib/unicode/In/ArabicPresentationForms-A.pl	Unicode character database
+lib/unicode/In/ArabicPresentationForms-B.pl	Unicode character database
+lib/unicode/In/Armenian.pl			Unicode character database
+lib/unicode/In/Arrows.pl			Unicode character database
+lib/unicode/In/BasicLatin.pl			Unicode character database
+lib/unicode/In/Bengali.pl			Unicode character database
+lib/unicode/In/BlockElements.pl			Unicode character database
+lib/unicode/In/Bopomofo.pl			Unicode character database
+lib/unicode/In/BoxDrawing.pl			Unicode character database
+lib/unicode/In/CJKCompatibility.pl		Unicode character database
+lib/unicode/In/CJKCompatibilityForms.pl		Unicode character database
+lib/unicode/In/CJKCompatibilityIdeographs.pl	Unicode character database
+lib/unicode/In/CJKSymbolsandPunctuation.pl	Unicode character database
+lib/unicode/In/CJKUnifiedIdeographs.pl		Unicode character database
+lib/unicode/In/CombiningDiacriticalMarks.pl	Unicode character database
+lib/unicode/In/CombiningHalfMarks.pl		Unicode character database
+lib/unicode/In/CombiningMarksforSymbols.pl	Unicode character database
+lib/unicode/In/ControlPictures.pl		Unicode character database
+lib/unicode/In/CurrencySymbols.pl		Unicode character database
+lib/unicode/In/Cyrillic.pl			Unicode character database
+lib/unicode/In/Devanagari.pl			Unicode character database
+lib/unicode/In/Dingbats.pl			Unicode character database
+lib/unicode/In/EnclosedAlphanumerics.pl		Unicode character database
+lib/unicode/In/EnclosedCJKLettersandMonths.pl	Unicode character database
+lib/unicode/In/GeneralPunctuation.pl		Unicode character database
+lib/unicode/In/GeometricShapes.pl		Unicode character database
+lib/unicode/In/Georgian.pl			Unicode character database
+lib/unicode/In/Greek.pl				Unicode character database
+lib/unicode/In/GreekExtended.pl			Unicode character database
+lib/unicode/In/Gujarati.pl			Unicode character database
+lib/unicode/In/Gurmukhi.pl			Unicode character database
+lib/unicode/In/HalfwidthandFullwidthForms.pl	Unicode character database
+lib/unicode/In/HangulCompatibilityJamo.pl	Unicode character database
+lib/unicode/In/HangulJamo.pl			Unicode character database
+lib/unicode/In/HangulSyllables.pl		Unicode character database
+lib/unicode/In/Hebrew.pl			Unicode character database
+lib/unicode/In/HighPrivateUseSurrogates.pl	Unicode character database
+lib/unicode/In/HighSurrogates.pl		Unicode character database
+lib/unicode/In/Hiragana.pl			Unicode character database
+lib/unicode/In/IPAExtensions.pl			Unicode character database
+lib/unicode/In/Kanbun.pl			Unicode character database
+lib/unicode/In/Kannada.pl			Unicode character database
+lib/unicode/In/Katakana.pl			Unicode character database
+lib/unicode/In/Lao.pl				Unicode character database
+lib/unicode/In/Latin-1Supplement.pl		Unicode character database
+lib/unicode/In/LatinExtended-A.pl		Unicode character database
+lib/unicode/In/LatinExtended-B.pl		Unicode character database
+lib/unicode/In/LatinExtendedAdditional.pl	Unicode character database
+lib/unicode/In/LetterlikeSymbols.pl		Unicode character database
+lib/unicode/In/LowSurrogates.pl			Unicode character database
+lib/unicode/In/Malayalam.pl			Unicode character database
+lib/unicode/In/MathematicalOperators.pl		Unicode character database
+lib/unicode/In/MiscellaneousSymbols.pl		Unicode character database
+lib/unicode/In/MiscellaneousTechnical.pl	Unicode character database
+lib/unicode/In/NumberForms.pl			Unicode character database
+lib/unicode/In/OpticalCharacterRecognition.pl	Unicode character database
+lib/unicode/In/Oriya.pl				Unicode character database
+lib/unicode/In/PrivateUse.pl			Unicode character database
+lib/unicode/In/SmallFormVariants.pl		Unicode character database
+lib/unicode/In/SpacingModifierLetters.pl	Unicode character database
+lib/unicode/In/Specials.pl			Unicode character database
+lib/unicode/In/SuperscriptsandSubscripts.pl	Unicode character database
+lib/unicode/In/Tamil.pl				Unicode character database
+lib/unicode/In/Telugu.pl			Unicode character database
+lib/unicode/In/Thai.pl				Unicode character database
+lib/unicode/In/Tibetan.pl			Unicode character database
+lib/unicode/Is/Alnum.pl				Unicode character database
+lib/unicode/Is/Alpha.pl				Unicode character database
+lib/unicode/Is/BidiAN.pl			Unicode character database
+lib/unicode/Is/BidiB.pl				Unicode character database
+lib/unicode/Is/BidiCS.pl			Unicode character database
+lib/unicode/Is/BidiEN.pl			Unicode character database
+lib/unicode/Is/BidiES.pl			Unicode character database
+lib/unicode/Is/BidiET.pl			Unicode character database
+lib/unicode/Is/BidiL.pl				Unicode character database
+lib/unicode/Is/BidiON.pl			Unicode character database
+lib/unicode/Is/BidiR.pl				Unicode character database
+lib/unicode/Is/BidiS.pl				Unicode character database
+lib/unicode/Is/BidiWS.pl			Unicode character database
+lib/unicode/Is/C.pl				Unicode character database
+lib/unicode/Is/Cc.pl				Unicode character database
+lib/unicode/Is/Cn.pl				Unicode character database
+lib/unicode/Is/Co.pl				Unicode character database
+lib/unicode/Is/DCcircle.pl			Unicode character database
+lib/unicode/Is/DCcompat.pl			Unicode character database
+lib/unicode/Is/DCfinal.pl			Unicode character database
+lib/unicode/Is/DCfont.pl			Unicode character database
+lib/unicode/Is/DCinital.pl			Unicode character database
+lib/unicode/Is/DCinitial.pl			Unicode character database
+lib/unicode/Is/DCisolated.pl			Unicode character database
+lib/unicode/Is/DCnarrow.pl			Unicode character database
+lib/unicode/Is/DCnoBreak.pl			Unicode character database
+lib/unicode/Is/DCsmall.pl			Unicode character database
+lib/unicode/Is/DCsquare.pl			Unicode character database
+lib/unicode/Is/DCsub.pl				Unicode character database
+lib/unicode/Is/DCsuper.pl			Unicode character database
+lib/unicode/Is/DCvertical.pl			Unicode character database
+lib/unicode/Is/DCwide.pl			Unicode character database
+lib/unicode/Is/DecoCanon.pl			Unicode character database
+lib/unicode/Is/DecoCompat.pl			Unicode character database
+lib/unicode/Is/Digit.pl				Unicode character database
+lib/unicode/Is/L.pl				Unicode character database
+lib/unicode/Is/Ll.pl				Unicode character database
+lib/unicode/Is/Lm.pl				Unicode character database
+lib/unicode/Is/Lo.pl				Unicode character database
+lib/unicode/Is/Lower.pl				Unicode character database
+lib/unicode/Is/Lt.pl				Unicode character database
+lib/unicode/Is/Lu.pl				Unicode character database
+lib/unicode/Is/M.pl				Unicode character database
+lib/unicode/Is/Mc.pl				Unicode character database
+lib/unicode/Is/Mirrored.pl			Unicode character database
+lib/unicode/Is/Mn.pl				Unicode character database
+lib/unicode/Is/N.pl				Unicode character database
+lib/unicode/Is/Nd.pl				Unicode character database
+lib/unicode/Is/No.pl				Unicode character database
+lib/unicode/Is/P.pl				Unicode character database
+lib/unicode/Is/Pd.pl				Unicode character database
+lib/unicode/Is/Pe.pl				Unicode character database
+lib/unicode/Is/Po.pl				Unicode character database
+lib/unicode/Is/Print.pl				Unicode character database
+lib/unicode/Is/Ps.pl				Unicode character database
+lib/unicode/Is/S.pl				Unicode character database
+lib/unicode/Is/Sc.pl				Unicode character database
+lib/unicode/Is/Sm.pl				Unicode character database
+lib/unicode/Is/So.pl				Unicode character database
+lib/unicode/Is/Space.pl				Unicode character database
+lib/unicode/Is/Upper.pl				Unicode character database
+lib/unicode/Is/Z.pl				Unicode character database
+lib/unicode/Is/Zl.pl				Unicode character database
+lib/unicode/Is/Zp.pl				Unicode character database
+lib/unicode/Is/Zs.pl				Unicode character database
+lib/unicode/JamoShort.pl			Unicode character database
+lib/unicode/Makefile				Unicode character database
+lib/unicode/Name.pl				Unicode character database
+lib/unicode/Number.pl				Unicode character database
+lib/unicode/To/Digit.pl				Unicode character database
+lib/unicode/To/Lower.pl				Unicode character database
+lib/unicode/To/Title.pl				Unicode character database
+lib/unicode/To/Upper.pl				Unicode character database
+lib/unicode/UnicodeData-Latest.txt		Unicode character database
+lib/unicode/arabshp.txt				Unicode character database
+lib/unicode/blocks.txt				Unicode character database
+lib/unicode/index2.txt				Unicode character database
+lib/unicode/jamo2.txt				Unicode character database
+lib/unicode/mktables.PL				Unicode character database generator
+lib/unicode/names2.txt				Unicode character database
+lib/unicode/props2.txt				Unicode character database
+lib/unicode/readme.txt				Unicode character database info
+lib/utf8.pm					Pragma to control Unicode support
+lib/utf8_heavy.pl				Support routines for utf8 pragma
 lib/validate.pl		Perl library supporting wholesale file mode validation
 lib/vars.pm		Declare pseudo-imported global variables
 makeaperl.SH		perl script that produces a new perl binary
@@ -769,6 +928,7 @@ sv.c			Scalar value code
 sv.h			Scalar value header
 t/README		Instructions for regression tests
 t/TEST			The regression tester
+t/UTEST			Run regression tests with -Mutf8
 t/base/cond.t		See if conditionals work
 t/base/if.t		See if if works
 t/base/lex.t		See if lexical items work
@@ -971,6 +1131,8 @@ thread.sym		Symbols for threads
 toke.c			The tokener
 universal.c		The default UNIVERSAL package methods
 unixish.h		Defines that are assumed on Unix
+utf8.c			Unicode routines
+utf8.h			Unicode header
 util.c			Utility routines
 util.h			Dummy header
 utils/Makefile		Extract the utility scripts
diff --git a/embed.h b/embed.h
index 6026c18..ef19977 100644
--- a/embed.h
+++ b/embed.h
@@ -1063,6 +1063,7 @@
 #define utf8_distance		Perl_utf8_distance
 #define utf8_hop		Perl_utf8_hop
 #define utf8_to_uv		Perl_utf8_to_uv
+#define utf8skip		Perl_utf8skip
 #define utilize			Perl_utilize
 #define uv_to_utf8		Perl_uv_to_utf8
 #define varies			Perl_varies
diff --git a/global.sym b/global.sym
index ac13e65..ef16b8a 100644
--- a/global.sym
+++ b/global.sym
@@ -1102,6 +1102,7 @@ utf16_to_utf8_reversed
 utf8_distance
 utf8_hop
 utf8_to_uv
+utf8skip
 utilize
 uv_to_utf8
 wait4pid
diff --git a/proto.h b/proto.h
index acd88d0..7ee3cb4 100644
--- a/proto.h
+++ b/proto.h
@@ -194,6 +194,28 @@ VIRTUAL U32	intro_my _((void));
 VIRTUAL char*	instr _((char* big, char* little));
 VIRTUAL bool	io_close _((IO* io));
 VIRTUAL OP*	invert _((OP* cmd));
+VIRTUAL bool	is_uni_alnum _((U32 c));
+VIRTUAL bool	is_uni_idfirst _((U32 c));
+VIRTUAL bool	is_uni_alpha _((U32 c));
+VIRTUAL bool	is_uni_space _((U32 c));
+VIRTUAL bool	is_uni_digit _((U32 c));
+VIRTUAL bool	is_uni_upper _((U32 c));
+VIRTUAL bool	is_uni_lower _((U32 c));
+VIRTUAL bool	is_uni_print _((U32 c));
+VIRTUAL U32	to_uni_upper _((U32 c));
+VIRTUAL U32	to_uni_title _((U32 c));
+VIRTUAL U32	to_uni_lower _((U32 c));
+VIRTUAL bool	is_uni_alnum_lc _((U32 c));
+VIRTUAL bool	is_uni_idfirst_lc _((U32 c));
+VIRTUAL bool	is_uni_alpha_lc _((U32 c));
+VIRTUAL bool	is_uni_space_lc _((U32 c));
+VIRTUAL bool	is_uni_digit_lc _((U32 c));
+VIRTUAL bool	is_uni_upper_lc _((U32 c));
+VIRTUAL bool	is_uni_lower_lc _((U32 c));
+VIRTUAL bool	is_uni_print_lc _((U32 c));
+VIRTUAL U32	to_uni_upper_lc _((U32 c));
+VIRTUAL U32	to_uni_title_lc _((U32 c));
+VIRTUAL U32	to_uni_lower_lc _((U32 c));
 VIRTUAL bool	is_utf8_alnum _((unsigned char *p));
 VIRTUAL bool	is_utf8_idfirst _((unsigned char *p));
 VIRTUAL bool	is_utf8_alpha _((unsigned char *p));
diff --git a/regexec.c b/regexec.c
index fe9f833..400843b 100644
--- a/regexec.c
+++ b/regexec.c
@@ -420,7 +420,7 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend,
 	I32 back_min = 
 	    prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
 	I32 delta = back_max - back_min;
-	char *last = HOP(strend, -(CHR_SVLEN(must) + back_min)); /* Cannot start after this */
+	char *last = HOP(strend, 0-(CHR_SVLEN(must) + back_min)); /* Cannot start after this */
 	char *last1;		/* Last position checked before */
 
 	if (s > PL_bostr)
@@ -1130,7 +1130,7 @@ regmatch(regnode *prog)
 	    break;
 	case SANYUTF8:
 	    if (nextchr & 0x80) {
-		locinput += PL_utf8skip[nextchr];
+		locinput += utf8skip[nextchr];
 		if (locinput > PL_regeol)
 		    sayNO;
 		nextchr = UCHARAT(locinput);
@@ -1147,7 +1147,7 @@ regmatch(regnode *prog)
 	    break;
 	case ANYUTF8:
 	    if (nextchr & 0x80) {
-		locinput += PL_utf8skip[nextchr];
+		locinput += utf8skip[nextchr];
 		if (locinput > PL_regeol)
 		    sayNO;
 		nextchr = UCHARAT(locinput);
@@ -1219,7 +1219,7 @@ regmatch(regnode *prog)
 		sayNO;
 	    if (locinput >= PL_regeol)
 		sayNO;
-	    locinput += PL_utf8skip[nextchr];
+	    locinput += utf8skip[nextchr];
 	    nextchr = UCHARAT(locinput);
 	    break;
 	case ANYOF:
@@ -1253,7 +1253,7 @@ regmatch(regnode *prog)
 		if (!(OP(scan) == ALNUMUTF8
 		      ? swash_fetch(PL_utf8_alnum, locinput) : isALNUM_LC_utf8(locinput)))
 		    sayNO;
-		locinput += PL_utf8skip[nextchr];
+		locinput += utf8skip[nextchr];
 		nextchr = UCHARAT(locinput);
 		break;
 	    }
@@ -1283,7 +1283,7 @@ regmatch(regnode *prog)
 		if (OP(scan) == NALNUMUTF8
 		      ? swash_fetch(PL_utf8_alnum, locinput) : isALNUM_LC_utf8(locinput))
 		    sayNO;
-		locinput += PL_utf8skip[nextchr];
+		locinput += utf8skip[nextchr];
 		nextchr = UCHARAT(locinput);
 		break;
 	    }
@@ -1351,7 +1351,7 @@ regmatch(regnode *prog)
 		if (!(OP(scan) == SPACEUTF8
 		      ? swash_fetch(PL_utf8_space,locinput) : isSPACE_LC_utf8(locinput)))
 		    sayNO;
-		locinput += PL_utf8skip[nextchr];
+		locinput += utf8skip[nextchr];
 		nextchr = UCHARAT(locinput);
 		break;
 	    }
@@ -1381,7 +1381,7 @@ regmatch(regnode *prog)
 		if (OP(scan) == NSPACEUTF8
 		      ? swash_fetch(PL_utf8_space,locinput) : isSPACE_LC_utf8(locinput))
 		    sayNO;
-		locinput += PL_utf8skip[nextchr];
+		locinput += utf8skip[nextchr];
 		nextchr = UCHARAT(locinput);
 		break;
 	    }
@@ -1399,7 +1399,7 @@ regmatch(regnode *prog)
 	    if (nextchr & 0x80) {
 		if (!(swash_fetch(PL_utf8_digit,locinput)))
 		    sayNO;
-		locinput += PL_utf8skip[nextchr];
+		locinput += utf8skip[nextchr];
 		nextchr = UCHARAT(locinput);
 		break;
 	    }
@@ -1420,7 +1420,7 @@ regmatch(regnode *prog)
 	    if (nextchr & 0x80) {
 		if (swash_fetch(PL_utf8_digit,locinput))
 		    sayNO;
-		locinput += PL_utf8skip[nextchr];
+		locinput += utf8skip[nextchr];
 		nextchr = UCHARAT(locinput);
 		break;
 	    }
@@ -1431,7 +1431,7 @@ regmatch(regnode *prog)
 	case CLUMP:
 	    if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark, locinput))
 		sayNO;
-	    locinput += PL_utf8skip[nextchr];
+	    locinput += utf8skip[nextchr];
 	    while (locinput < PL_regeol && swash_fetch(PL_utf8_mark, locinput))
 		locinput += UTF8SKIP(locinput);
 	    if (locinput > PL_regeol)
diff --git a/toke.c b/toke.c
index 9f96319..13cc965 100644
--- a/toke.c
+++ b/toke.c
@@ -915,7 +915,7 @@ scan_const(char *start)
 	    /* range begins (ignore - as first or last char) */
 	    else if (*s == '-' && s+1 < send  && s != start) {
 		if (utf) {
-		    *d++ = 0xff;	/* use illegal utf8 byte--see pmtrans */
+		    *d++ = (char)0xff;	/* use illegal utf8 byte--see pmtrans */
 		    s++;
 		    continue;
 		}
diff --git a/utf8.h b/utf8.h
index 6f86f72..f39e340 100644
--- a/utf8.h
+++ b/utf8.h
@@ -8,7 +8,7 @@
  */
 
 #ifdef DOINIT
-EXTCONST unsigned char PL_utf8skip[] = {
+EXTCONST unsigned char utf8skip[] = {
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
@@ -19,9 +19,9 @@ EXTCONST unsigned char PL_utf8skip[] = {
 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,7,8, /* cjk etc. */
 };
 #else
-EXTCONST unsigned char PL_utf8skip[];
+EXTCONST unsigned char utf8skip[];
 #endif
 
 #define IN_UTF8 (curcop->op_private & HINT_UTF8)
 
-#define UTF8SKIP(s) PL_utf8skip[*(U8*)s]
+#define UTF8SKIP(s) utf8skip[*(U8*)s]
diff --git a/win32/Makefile b/win32/Makefile
index 8570f5d..addf487 100644
--- a/win32/Makefile
+++ b/win32/Makefile
@@ -354,6 +354,7 @@ MICROCORE_SRC	=		\
 		..\taint.c	\
 		..\toke.c	\
 		..\universal.c	\
+		..\utf8.c	\
 		..\util.c
 
 !IF "$(PERL_MALLOC)" == "define"
@@ -427,6 +428,7 @@ CORE_NOCFG_H	=		\
 		..\sv.h		\
 		..\thread.h	\
 		..\unixish.h	\
+		..\utf8.h	\
 		..\util.h	\
 		..\XSUB.h	\
 		..\EXTERN.h	\
diff --git a/win32/makefile.mk b/win32/makefile.mk
index ad24e21..50cdda9 100644
--- a/win32/makefile.mk
+++ b/win32/makefile.mk
@@ -198,7 +198,7 @@ OPTIMIZE	= -O2 $(RUNTIME)
 LINK_DBG	= 
 .ENDIF
 
-CFLAGS		= -w -d -tWM -tWD $(INCLUDES) $(DEFINES) $(LOCDEFS) \
+CFLAGS		= -K -w -d -tWM -tWD $(INCLUDES) $(DEFINES) $(LOCDEFS) \
 		$(PCHFLAGS) $(OPTIMIZE)
 LINK_FLAGS	= $(LINK_DBG) -L$(CCLIBDIR) $(EXTRALIBDIRS:^"-L")
 OBJOUT_FLAG	= -o
@@ -471,6 +471,7 @@ MICROCORE_SRC	=		\
 		..\taint.c	\
 		..\toke.c	\
 		..\universal.c	\
+		..\utf8.c	\
 		..\util.c
 
 .IF "$(PERL_MALLOC)" == "define"
@@ -544,6 +545,7 @@ CORE_NOCFG_H	=		\
 		..\sv.h		\
 		..\thread.h	\
 		..\unixish.h	\
+		..\utf8.h	\
 		..\util.h	\
 		..\XSUB.h	\
 		..\EXTERN.h	\
-- 
2.7.4