1 * Summary: interface for an HTML 4.0 non-verifying parser
2 * Description: this module implements an HTML 4.0 non-verifying parser
3 * with an API compatible with that of the XML parser. It should
4 * be able to parse "real world" HTML, even if severely
5 * broken from a specification point of view.
7 * Copy: See Copyright for the status of this software.
9 * Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A.
11 /if not defined(HTML_PARSER_H__)
12 /define HTML_PARSER_H__
14 /include "libxmlrpg/xmlversion"
15 /include "libxmlrpg/parser"
17 /if defined(LIBXML_HTML_ENABLED)
19 * Most of the back-end structures from XML and HTML are shared.
* Each html* name below is declared with like()/likeds() against the
* matching xml* definition, so it takes its type or layout from there.
* NOTE(review): the xml* definitions are presumably supplied by the
* included libxmlrpg/parser member -- confirm.
* NOTE(review): based(######typedef######) appears to be the marker used
* for type-only declarations that are never referenced as storage --
* confirm against the other libxmlrpg members.
21 d htmlParserCtxtPtr...
22 d s based(######typedef######)
23 d like(xmlParserCtxtPtr)
25 d htmlParserCtxt ds based(htmlParserCtxtPtr)
26 d likeds(xmlParserCtxt)
28 d htmlParserNodeInfoPtr...
29 d s based(######typedef######)
30 d like(xmlParserNodeInfoPtr)
32 d htmlParserNodeInfo...
33 d ds based(htmlParserNodeInfoPtr)
34 d likeds(xmlParserNodeInfo)
36 d htmlSAXHandlerPtr...
37 d s based(######typedef######)
38 d like(xmlSAXHandlerPtr)
40 d htmlSAXHandler ds based(htmlSAXHandlerPtr)
41 d likeds(xmlSAXHandler)
43 d htmlParserInputPtr...
44 d s based(######typedef######)
45 d like(xmlParserInputPtr)
48 d ds based(htmlParserInputPtr)
49 d likeds(xmlParserInput)
51 d htmlDocPtr s based(######typedef######)
54 d htmlNodePtr s based(######typedef######)
57 * Internal description of an HTML element, representing HTML 4.01
58 * and XHTML 1.0 (which share the same structure).
* The data structure is based on htmlElemDescPtr, so its storage is
* addressed through that basing pointer.
* The 3u 0 subfields are one-byte unsigned integers; the remark at the
* end of each line describes the flag.
61 d s * based(######typedef######)
63 d htmlElemDesc ds based(htmlElemDescPtr)
66 d startTag 3u 0 Start tag implied ?
67 d endTag 3u 0 End tag implied ?
68 d saveEndTag 3u 0 Save end tag ?
69 d empty 3u 0 Empty element ?
70 d depr 3u 0 Deprecated element ?
71 d dtd 3u 0 Loose DTD/Frameset
72 d isinline 3u 0 Block 0/inline elem?
75 * New fields encapsulating HTML structure
78 * This is a very limited representation. It fails to tell us when
79 * an element *requires* subelements (we only have whether they're
80 * allowed or not), and it doesn't tell us where CDATA and PCDATA
81 * are allowed. Some element relationships are not fully represented:
82 * these are flagged with the word MODIFIER
* The subfields below are pointers; the remark at the end of each line
* gives the corresponding C type.
84 d subelts * const char * *
85 d defaultsubelt * const char *
86 d attrs_opt * const char * *
87 d attrs_depr * const char * *
88 d attrs_req * const char * *
90 * Internal description of an HTML entity.
* value is a 10u 0 (four-byte unsigned integer) subfield and desc is a
* pointer; the end-of-line remarks give their meaning / C type.
92 d htmlEntityDescPtr...
93 d s * based(######typedef######)
96 d ds based(htmlEntityDescPtr)
98 d value 10u 0 Unicode char value
100 d desc * const char *
102 * There are only a few public functions.
* Each prototype (pr) binds the RPG name to the C procedure named in
* extproc(). Pointer parameters declared value options(*string) accept a
* character expression, which is passed as a null-terminated string.
* The three lookups below return like(htmlElemDescPtr) or
* like(htmlEntityDescPtr).
104 d htmlTagLookup pr extproc('htmlTagLookup')
105 d like(htmlElemDescPtr) const
106 d tag * value options(*string) const xmlChar *
108 d htmlEntityLookup...
109 d pr extproc('htmlEntityLookup')
110 d like(htmlEntityDescPtr) const
111 d name * value options(*string) const xmlChar *
113 d htmlEntityValueLookup...
114 d pr extproc('htmlEntityValueLookup')
115 d like(htmlEntityDescPtr) const
* htmlIsAutoClosed / htmlAutoCloseTag: both return a 10i 0 (four-byte
* signed integer) and receive the document and node arguments by value.
* htmlAutoCloseTag additionally takes a tag name passed as a
* null-terminated string.
118 d htmlIsAutoClosed...
119 d pr 10i 0 extproc('htmlIsAutoClosed')
120 d doc value like(htmlDocPtr)
121 d elem value like(htmlNodePtr)
123 d htmlAutoCloseTag...
124 d pr 10i 0 extproc('htmlAutoCloseTag')
125 d doc value like(htmlDocPtr)
126 d name * value options(*string) const xmlChar *
127 d elem value like(htmlNodePtr)
* Low-level parsing steps; each takes the parser context by value.
* htmlParseEntityRef returns like(htmlEntityDescPtr); its str parameter
* has no value keyword, so the pointer is passed by reference (the
* end-of-line remark gives the C type as a pointer to const xmlChar *).
* htmlParseCharRef returns a 10i 0 (four-byte signed integer).
* htmlParseElement is prototyped without a return value.
129 d htmlParseEntityRef...
130 d pr extproc('htmlParseEntityRef')
131 d like(htmlEntityDescPtr) const
132 d ctxt value like(htmlParserCtxtPtr)
133 d str * const xmlChar *(*)
135 d htmlParseCharRef...
136 d pr 10i 0 extproc('htmlParseCharRef')
137 d ctxt value like(htmlParserCtxtPtr)
139 d htmlParseElement...
140 d pr extproc('htmlParseElement')
141 d ctxt value like(htmlParserCtxtPtr)
* Parser context creation and whole-document parsing.
* htmlNewParserCtxt and htmlCreateMemoryParserCtxt return
* like(htmlParserCtxtPtr); the latter takes its buffer as a by-value
* pointer with options(*string).
* htmlParseDocument takes a context by value and returns a 10i 0
* (four-byte signed integer).
143 d htmlNewParserCtxt...
144 d pr extproc('htmlNewParserCtxt')
145 d like(htmlParserCtxtPtr)
147 d htmlCreateMemoryParserCtxt...
148 d pr extproc('htmlCreateMemoryParserCtxt')
149 d like(htmlParserCtxtPtr)
150 d buffer * value options(*string) const char *
153 d htmlParseDocument...
154 d pr 10i 0 extproc('htmlParseDocument')
155 d ctxt value like(htmlParserCtxtPtr)
* One-call parsing entry points bound to htmlSAXParseDoc, htmlParseDoc,
* htmlSAXParseFile and htmlParseFile.
* The ...Doc prototypes take the input as cur and the ...File prototypes
* take a filename; both, like encoding, are by-value pointers with
* options(*string), i.e. null-terminated strings.
* The SAX prototypes additionally take a handler pointer (sax) and an
* opaque userData pointer, both by value.
158 d pr extproc('htmlSAXParseDoc')
160 d cur * value options(*string) xmlChar *
161 d encoding * value options(*string) const char *
162 d sax value like(htmlSAXHandlerPtr)
163 d userData * value void *
165 d htmlParseDoc pr extproc('htmlParseDoc')
167 d cur * value options(*string) xmlChar *
168 d encoding * value options(*string) const char *
170 d htmlSAXParseFile...
171 d pr extproc('htmlSAXParseFile')
173 d filename * value options(*string) const char *
174 d encoding * value options(*string) const char *
175 d sax value like(htmlSAXHandlerPtr)
176 d userData * value void *
178 d htmlParseFile pr extproc('htmlParseFile')
180 d filename * value options(*string) const char *
181 d encoding * value options(*string) const char *
* Encoding helpers; both return a 10i 0 (four-byte signed integer).
* out is declared as a 65535-byte field passed by reference;
* options(*varsize) lets the caller pass a field whose length differs
* from the declared one.
* in is a by-value pointer with options(*string).
* htmlEncodeEntities additionally takes quoteChar as a 10i 0 by value.
183 d UTF8ToHtml pr 10i 0 extproc('UTF8ToHtml')
184 d out 65535 options(*varsize) unsigned char []
186 d in * value options(*string) const unsigned char*
189 d htmlEncodeEntities...
190 d pr 10i 0 extproc('htmlEncodeEntities')
191 d out 65535 options(*varsize) unsigned char []
193 d in * value options(*string) const unsigned char*
195 d quoteChar 10i 0 value
* htmlIsScriptAttribute and htmlHandleOmittedElem both return a 10i 0
* (four-byte signed integer). htmlIsScriptAttribute takes the attribute
* name as a null-terminated string.
197 d htmlIsScriptAttribute...
198 d pr 10i 0 extproc('htmlIsScriptAttribute')
199 d name * value options(*string) const xmlChar *
201 d htmlHandleOmittedElem...
202 d pr 10i 0 extproc('htmlHandleOmittedElem')
205 /if defined(LIBXML_PUSH_ENABLED)
207 * Interfaces for the Push mode.
* These prototypes are compiled only when LIBXML_PUSH_ENABLED is
* defined. htmlCreatePushParserCtxt returns like(htmlParserCtxtPtr);
* htmlParseChunk returns a 10i 0 (four-byte signed integer).
* chunk is a by-value pointer with options(*string) in both prototypes.
209 d htmlCreatePushParserCtxt...
210 d pr extproc('htmlCreatePushParserCtxt')
211 d like(htmlParserCtxtPtr)
212 d sax value like(htmlSAXHandlerPtr)
213 d user_data * value void *
214 d chunk * value options(*string) const char *
216 d filename * value options(*string) const char *
217 d enc value like(xmlCharEncoding)
219 d htmlParseChunk pr 10i 0 extproc('htmlParseChunk')
220 d ctxt value like(htmlParserCtxtPtr)
221 d chunk * value options(*string) const char *
223 d terminate 10i 0 value
224 /endif LIBXML_PUSH_ENABLED
* htmlFreeParserCtxt: prototyped without a return value; takes the
* parser context pointer by value.
226 d htmlFreeParserCtxt...
227 d pr extproc('htmlFreeParserCtxt')
228 d ctxt value like(htmlParserCtxtPtr)
230 * New set of simpler/more flexible APIs
234 * This is the set of HTML parser options that can be passed down
235 * to the htmlReadDoc() and similar calls.
* htmlParserOption is a 10i 0 (four-byte signed integer) type-only
* declaration; the HTML_PARSE_* names that follow are its members, each
* described by the remark at the end of its line.
237 d htmlParserOption...
238 d s 10i 0 based(######typedef######) enum
239 d HTML_PARSE_RECOVER... Relaxed parsing
241 d HTML_PARSE_NODEFDTD... No default doctype
243 d HTML_PARSE_NOERROR... No error reports
245 d HTML_PARSE_NOWARNING... No warning reports
247 d HTML_PARSE_PEDANTIC... Pedantic err reports
249 d HTML_PARSE_NOBLANKS... Remove blank nodes
251 d HTML_PARSE_NONET... Forbid net access
253 d HTML_PARSE_NOIMPLIED... No implied html/body
255 d HTML_PARSE_COMPACT... compact small txtnod
257 d HTML_PARSE_IGNORE_ENC... Ignore encoding hint
* htmlCtxtReset is prototyped without a return value.
* htmlCtxtUseOptions returns a 10i 0 (four-byte signed integer) and
* takes the options as a 10i 0 by value.
* Both take the parser context pointer by value.
260 d htmlCtxtReset pr extproc('htmlCtxtReset')
261 d ctxt value like(htmlParserCtxtPtr)
263 d htmlCtxtUseOptions...
264 d pr 10i 0 extproc('htmlCtxtUseOptions')
265 d ctxt value like(htmlParserCtxtPtr)
266 d options 10i 0 value
* htmlRead* family: one prototype per input source (Doc, File, Memory,
* Fd, IO). Every prototype takes URL and encoding as null-terminated
* strings and options as a 10i 0 (four-byte signed integer) by value.
* htmlReadIO takes its read/close callbacks with like() on the xml
* callback types, plus an opaque ioctx pointer.
268 d htmlReadDoc pr extproc('htmlReadDoc')
270 d cur * value options(*string) const xmlChar *
271 d URL * value options(*string) const char *
272 d encoding * value options(*string) const char *
273 d options 10i 0 value
275 d htmlReadFile pr extproc('htmlReadFile')
277 d URL * value options(*string) const char *
278 d encoding * value options(*string) const char *
279 d options 10i 0 value
281 d htmlReadMemory pr extproc('htmlReadMemory')
283 d buffer * value options(*string) const char *
285 d URL * value options(*string) const char *
286 d encoding * value options(*string) const char *
287 d options 10i 0 value
289 d htmlReadFd pr extproc('htmlReadFd')
292 d URL * value options(*string) const char *
293 d encoding * value options(*string) const char *
294 d options 10i 0 value
296 d htmlReadIO pr extproc('htmlReadIO')
298 d ioread value like(xmlInputReadCallback)
299 d ioclose value like(xmlInputCloseCallback)
300 d ioctx * value void *
301 d URL * value options(*string) const char *
302 d encoding * value options(*string) const char *
303 d options 10i 0 value
* htmlCtxtRead* family: the same input sources as htmlRead*, but each
* prototype takes a parser context as its ctxt parameter.
* ctxt is declared like(xmlParserCtxtPtr); htmlParserCtxtPtr is itself
* declared like(xmlParserCtxtPtr) in this member, so the two have the
* same type.
* encoding is a null-terminated string and options a 10i 0 (four-byte
* signed integer) by value in every prototype.
306 d pr extproc('htmlCtxtReadDoc')
308 d ctxt value like(xmlParserCtxtPtr)
309 d cur * value options(*string) const xmlChar *
310 d URL * value options(*string) const char *
311 d encoding * value options(*string) const char *
312 d options 10i 0 value
314 d htmlCtxtReadFile...
315 d pr extproc('htmlCtxtReadFile')
317 d ctxt value like(xmlParserCtxtPtr)
318 d filename * value options(*string) const char *
319 d encoding * value options(*string) const char *
320 d options 10i 0 value
322 d htmlCtxtReadMemory...
323 d pr extproc('htmlCtxtReadMemory')
325 d ctxt value like(xmlParserCtxtPtr)
326 d buffer * value options(*string) const char *
328 d URL * value options(*string) const char *
329 d encoding * value options(*string) const char *
330 d options 10i 0 value
332 d htmlCtxtReadFd pr extproc('htmlCtxtReadFd')
334 d ctxt value like(xmlParserCtxtPtr)
336 d URL * value options(*string) const char *
337 d encoding * value options(*string) const char *
338 d options 10i 0 value
340 d htmlCtxtReadIO pr extproc('htmlCtxtReadIO')
342 d ctxt value like(xmlParserCtxtPtr)
343 d ioread value like(xmlInputReadCallback)
344 d ioclose value like(xmlInputCloseCallback)
345 d ioctx * value void *
346 d URL * value options(*string) const char *
347 d encoding * value options(*string) const char *
348 d options 10i 0 value
350 * Further knowledge of HTML structure
* htmlStatus is a 10i 0 (four-byte signed integer) type-only
* declaration; the named constants below are its values, written as
* hexadecimal literals.
* X'000C' has the X'0004' bit set, so a bitwise AND of HTML_REQUIRED
* with HTML_VALID is non-zero.
352 d htmlStatus s 10i 0 based(######typedef######) enum
353 d HTML_NA c X'0000' No check at all
354 d HTML_INVALID c X'0001'
357 d HTML_VALID c X'0004'
358 d HTML_REQUIRED c X'000C' HTML_VALID ored-in
360 * Using htmlElemDesc rather than name here, to emphasise the fact
361 * that otherwise there's a lookup overhead
* The prototypes bound to htmlAttrAllowed, htmlElementAllowedHere and
* htmlElementStatusHere take an element description
* (like(htmlElemDescPtr)) by value as #param1; htmlNodeStatus takes a
* node pointer instead.
* NOTE(review): parameters are only named positionally (#param1 ...);
* their roles are presumably those of the matching C prototypes --
* confirm against the C header.
364 d pr extproc('htmlAttrAllowed')
366 d #param1 value like(htmlElemDescPtr) const
367 d #param2 * value options(*string) const xmlChar *
368 d #param3 10i 0 value
370 d htmlElementAllowedHere...
371 d pr 10i 0 extproc('htmlElementAllowedHere')
372 d #param1 value like(htmlElemDescPtr) const
373 d #param2 * value options(*string) const xmlChar *
375 d htmlElementStatusHere...
376 d pr extproc('htmlElementStatusHere')
378 d #param1 value like(htmlElemDescPtr) const
379 d #param2 value like(htmlElemDescPtr) const
381 d htmlNodeStatus pr extproc('htmlNodeStatus')
383 d #param1 value like(htmlNodePtr)
384 d #param2 10i 0 value
386 * C macros implemented as procedures for ILE/RPG support.
* The extproc() names carry a leading double underscore, so each RPG
* name binds to a wrapper procedure rather than to a C symbol of the
* same name.
* Arguments are pointers passed by value; the end-of-line remarks give
* the C types (const htmlElemDesc * in, const char * or const char * *
* out for the two prototypes that return a pointer).
388 d htmlDefaultSubelement...
389 d pr * extproc('__htmlDefaultSubelement') const char *
390 d elt * value const htmlElemDesc *
392 d htmlElementAllowedHereDesc...
394 d '__htmlElementAllowedHereDesc')
395 d parent * value const htmlElemDesc *
396 d elt * value const htmlElemDesc *
398 d htmlRequiredAttrs...
399 d pr * extproc('__htmlRequiredAttrs') const char * *
400 d elt * value const htmlElemDesc *
402 /endif LIBXML_HTML_ENABLED
403 /endif HTML_PARSER_H__