/*
 * Summary: interface for the encoding conversion functions
 * Description: interface for the encoding conversion functions needed for
 *              XML basic encoding and iconv() support.
 *
 * Related specs:
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 character codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
 *                Worldwide Character Encoding -- Version 1.0", Addison-
 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
 *                described in Unicode Technical Report #4.
 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
 *                Information Interchange, ANSI X3.4-1986.
 *
 * Copy: See Copyright for the status of this software.
 *
 * Author: Daniel Veillard
 */
22 #ifndef __XML_CHAR_ENCODING_H__
23 #define __XML_CHAR_ENCODING_H__
25 #include <libxml/xmlversion.h>
27 #ifdef LIBXML_ICONV_ENABLED
30 #ifdef LIBXML_ICU_ENABLED
31 #include <unicode/ucnv.h>
33 /* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h>
34 * to prevent unwanted ICU symbols being exposed to users of libxml2.
35 * One particular case is Qt4 conflicting on UChar32.
39 typedef struct UConverter UConverter;
41 typedef wchar_t UChar;
43 typedef uint16_t UChar;
/*
 * xmlCharEncoding:
 *
 * Predefined values for some standard encodings.
 * Libxml does not translate UTF8 and ISOLatinX input beforehand.
 * It also supports ASCII, ISO-8859-1, and UTF16 (LE and BE) by default.
 *
 * Anything else has to be translated to UTF8 before being given to the
 * parser itself. The BOM for UTF16 and the encoding declaration are
 * examined and a converter is looked up at that point. If none is found
 * the parser stops there, as required by the XML REC. A converter can be
 * registered by the user with xmlRegisterCharEncodingHandler, but the
 * current form doesn't allow stateful transcoding (a serious problem,
 * agreed!). If iconv has been found it is used automatically and allows
 * stateful transcoding, so the simplest approach is to enable iconv and
 * to provide iconv libs for the encodings that need support.
 *
 * Note that the generic "UTF-16" is not a predefined value. Instead, only
 * the specific UTF-16LE and UTF-16BE are present.
 *
 * The numeric values are part of the public interface and must not change.
 */
typedef enum {
    XML_CHAR_ENCODING_ERROR     = -1, /* Error value (no char encoding could be determined) */
    XML_CHAR_ENCODING_NONE      = 0,  /* No char encoding detected */
    XML_CHAR_ENCODING_UTF8      = 1,  /* UTF-8 */
    XML_CHAR_ENCODING_UTF16LE   = 2,  /* UTF-16 little endian */
    XML_CHAR_ENCODING_UTF16BE   = 3,  /* UTF-16 big endian */
    XML_CHAR_ENCODING_UCS4LE    = 4,  /* UCS-4 little endian */
    XML_CHAR_ENCODING_UCS4BE    = 5,  /* UCS-4 big endian */
    XML_CHAR_ENCODING_EBCDIC    = 6,  /* EBCDIC uh! */
    XML_CHAR_ENCODING_UCS4_2143 = 7,  /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS4_3412 = 8,  /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS2      = 9,  /* UCS-2 */
    XML_CHAR_ENCODING_8859_1    = 10, /* ISO-8859-1 ISO Latin 1 */
    XML_CHAR_ENCODING_8859_2    = 11, /* ISO-8859-2 ISO Latin 2 */
    XML_CHAR_ENCODING_8859_3    = 12, /* ISO-8859-3 */
    XML_CHAR_ENCODING_8859_4    = 13, /* ISO-8859-4 */
    XML_CHAR_ENCODING_8859_5    = 14, /* ISO-8859-5 */
    XML_CHAR_ENCODING_8859_6    = 15, /* ISO-8859-6 */
    XML_CHAR_ENCODING_8859_7    = 16, /* ISO-8859-7 */
    XML_CHAR_ENCODING_8859_8    = 17, /* ISO-8859-8 */
    XML_CHAR_ENCODING_8859_9    = 18, /* ISO-8859-9 */
    XML_CHAR_ENCODING_2022_JP   = 19, /* ISO-2022-JP */
    XML_CHAR_ENCODING_SHIFT_JIS = 20, /* Shift_JIS */
    XML_CHAR_ENCODING_EUC_JP    = 21, /* EUC-JP */
    XML_CHAR_ENCODING_ASCII     = 22  /* pure ASCII */
} xmlCharEncoding;
/**
 * xmlCharEncodingInputFunc:
 * @out:  a pointer to an array of bytes to store the UTF-8 result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of chars in the original encoding
 * @inlen:  the length of @in
 *
 * Take a block of chars in the original encoding and try to convert
 * it to a UTF-8 block of chars out.
 *
 * Returns the number of bytes written, -1 if lack of space, or -2
 *     if the transcoding failed.
 * The value of @inlen after return is the number of octets consumed
 *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 */
typedef int (*xmlCharEncodingInputFunc)(unsigned char *out, int *outlen,
                                        const unsigned char *in, int *inlen);
/**
 * xmlCharEncodingOutputFunc:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to another
 * encoding.
 * Note: a first call designed to produce heading info is called with
 * in = NULL. If stateful this should also initialize the encoder state.
 *
 * Returns the number of bytes written, -1 if lack of space, or -2
 *     if the transcoding failed.
 * The value of @inlen after return is the number of octets consumed
 *     if the return value is positive, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 */
typedef int (*xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
                                         const unsigned char *in, int *inlen);
/*
 * Block defining the handlers for non UTF-8 encodings.
 * If iconv is supported, there are two extra fields.
 */
#ifdef LIBXML_ICU_ENABLED
/*
 * Pair of ICU converters, only declared when ICU support is compiled in.
 * NOTE(review): the struct braces and the closing #endif were missing from
 * this copy of the header and have been restored from the member lines and
 * the typedef below -- confirm against the upstream header.
 */
struct _uconv_t {
    UConverter *uconv; /* for conversion between an encoding and UTF-16 */
    UConverter *utf8;  /* for conversion between UTF-8 and UTF-16 */
};
typedef struct _uconv_t uconv_t;
#endif /* LIBXML_ICU_ENABLED */
/*
 * xmlCharEncodingHandler names struct _xmlCharEncodingHandler;
 * xmlCharEncodingHandlerPtr is the pointer-to-handler type used by the
 * handler interfaces declared in this header.
 */
typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr;
156 struct _xmlCharEncodingHandler {
158 xmlCharEncodingInputFunc input;
159 xmlCharEncodingOutputFunc output;
160 #ifdef LIBXML_ICONV_ENABLED
163 #endif /* LIBXML_ICONV_ENABLED */
164 #ifdef LIBXML_ICU_ENABLED
167 #endif /* LIBXML_ICU_ENABLED */
173 #include <libxml/tree.h>
179 * Interfaces for encoding handlers.
181 XMLPUBFUN void XMLCALL
182 xmlInitCharEncodingHandlers (void);
183 XMLPUBFUN void XMLCALL
184 xmlCleanupCharEncodingHandlers (void);
185 XMLPUBFUN void XMLCALL
186 xmlRegisterCharEncodingHandler (xmlCharEncodingHandlerPtr handler);
187 XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL
188 xmlGetCharEncodingHandler (xmlCharEncoding enc);
189 XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL
190 xmlFindCharEncodingHandler (const char *name);
191 XMLPUBFUN xmlCharEncodingHandlerPtr XMLCALL
192 xmlNewCharEncodingHandler (const char *name,
193 xmlCharEncodingInputFunc input,
194 xmlCharEncodingOutputFunc output);
197 * Interfaces for encoding names and aliases.
199 XMLPUBFUN int XMLCALL
200 xmlAddEncodingAlias (const char *name,
202 XMLPUBFUN int XMLCALL
203 xmlDelEncodingAlias (const char *alias);
204 XMLPUBFUN const char * XMLCALL
205 xmlGetEncodingAlias (const char *alias);
206 XMLPUBFUN void XMLCALL
207 xmlCleanupEncodingAliases (void);
208 XMLPUBFUN xmlCharEncoding XMLCALL
209 xmlParseCharEncoding (const char *name);
210 XMLPUBFUN const char * XMLCALL
211 xmlGetCharEncodingName (xmlCharEncoding enc);
214 * Interfaces directly used by the parsers.
216 XMLPUBFUN xmlCharEncoding XMLCALL
217 xmlDetectCharEncoding (const unsigned char *in,
220 XMLPUBFUN int XMLCALL
221 xmlCharEncOutFunc (xmlCharEncodingHandler *handler,
225 XMLPUBFUN int XMLCALL
226 xmlCharEncInFunc (xmlCharEncodingHandler *handler,
229 XMLPUBFUN int XMLCALL
230 xmlCharEncFirstLine (xmlCharEncodingHandler *handler,
233 XMLPUBFUN int XMLCALL
234 xmlCharEncCloseFunc (xmlCharEncodingHandler *handler);
237 * Export a few useful functions
239 #ifdef LIBXML_OUTPUT_ENABLED
240 XMLPUBFUN int XMLCALL
241 UTF8Toisolat1 (unsigned char *out,
243 const unsigned char *in,
245 #endif /* LIBXML_OUTPUT_ENABLED */
246 XMLPUBFUN int XMLCALL
247 isolat1ToUTF8 (unsigned char *out,
249 const unsigned char *in,
255 #endif /* __XML_CHAR_ENCODING_H__ */