1 * Summary: interface for an HTML 4.0 non-verifying parser
2 * Description: this module implements an HTML 4.0 non-verifying parser
3 * with an API compatible with that of the XML parser. It should
4 * be able to parse "real world" HTML, even if severely
5 * broken from a specification point of view.
7 * Copy: See Copyright for the status of this software.
9 * Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A.
11 /if not defined(HTML_PARSER_H__)
12 /define HTML_PARSER_H__
14 /include "libxmlrpg/xmlversion"
16 /if defined(LIBXML_HTML_ENABLED)
18 /include "libxmlrpg/xmlTypesC"
19 /include "libxmlrpg/parser"
21 * Most of the back-end structures from XML and HTML are shared.
* Parser context pointer type, declared LIKE xmlParserCtxtPtr, followed by
* the context structure, which takes the layout of xmlParserCtxt (LIKEDS)
* and is based on htmlParserCtxtPtr.
* NOTE(review): "######typedef######" looks like a placeholder basing
* pointer so that the field only serves as a LIKE template - confirm.
23 d htmlParserCtxtPtr...
24 d s based(######typedef######)
25 d like(xmlParserCtxtPtr)
27 d htmlParserCtxt ds based(htmlParserCtxtPtr)
28 d likeds(xmlParserCtxt)
* Node-info pointer type, declared LIKE xmlParserNodeInfoPtr, followed by
* the node-info structure, which takes the layout of xmlParserNodeInfo
* (LIKEDS) and is based on htmlParserNodeInfoPtr.
30 d htmlParserNodeInfoPtr...
31 d s based(######typedef######)
32 d like(xmlParserNodeInfoPtr)
34 d htmlParserNodeInfo...
35 d ds based(htmlParserNodeInfoPtr)
36 d likeds(xmlParserNodeInfo)
* SAX handler pointer type, declared LIKE xmlSAXHandlerPtr, followed by
* the handler structure, which takes the layout of xmlSAXHandler (LIKEDS)
* and is based on htmlSAXHandlerPtr.
38 d htmlSAXHandlerPtr...
39 d s based(######typedef######)
40 d like(xmlSAXHandlerPtr)
42 d htmlSAXHandler ds based(htmlSAXHandlerPtr)
43 d likeds(xmlSAXHandler)
* Parser input pointer type, declared LIKE xmlParserInputPtr, followed by
* a data structure with the layout of xmlParserInput (LIKEDS), based on
* htmlParserInputPtr.
* NOTE(review): the name line of the data structure is not visible in
* this view (presumably htmlParserInput) - confirm against the full member.
45 d htmlParserInputPtr...
46 d s based(######typedef######)
47 d like(xmlParserInputPtr)
50 d ds based(htmlParserInputPtr)
51 d likeds(xmlParserInput)
* Document and node pointer types, used with LIKE by the prototypes in
* this copybook.
* NOTE(review): only the BASED keyword is visible here; the lines giving
* the underlying types are not shown in this view - confirm.
53 d htmlDocPtr s based(######typedef######)
56 d htmlNodePtr s based(######typedef######)
59 * Internal description of an HTML element, representing HTML 4.01
60 * and XHTML 1.0 (which share the same structure).
* Pointer type followed by the element description structure, which is
* based on htmlElemDescPtr. The flag subfields are each typed LIKE
* xmlCchar, with their meaning given by the text at the right of the line.
* The last five subfields are pointers; the text at the right records the
* corresponding C type.
* NOTE(review): the name line of the pointer type (presumably
* htmlElemDescPtr) and some lines after the DS header are not visible in
* this view - confirm against the full member.
63 d s * based(######typedef######)
65 d htmlElemDesc ds based(htmlElemDescPtr)
68 d startTag like(xmlCchar) Start tag implied ?
69 d endTag like(xmlCchar) End tag implied ?
70 d saveEndTag like(xmlCchar) Save end tag ?
71 d empty like(xmlCchar) Empty element ?
72 d depr like(xmlCchar) Deprecated element ?
73 d dtd like(xmlCchar) Loose DTD/Frameset
74 d isinline like(xmlCchar) Block 0/inline elem?
77 * New fields encapsulating HTML structure
80 * This is a very limited representation. It fails to tell us when
81 * an element *requires* subelements (we only have whether they're
82 * allowed or not), and it doesn't tell us where CDATA and PCDATA
83 * are allowed. Some element relationships are not fully represented:
84 * these are flagged with the word MODIFIER
86 d subelts * const char * *
87 d defaultsubelt * const char *
88 d attrs_opt * const char * *
89 d attrs_depr * const char * *
90 d attrs_req * const char * *
92 * Internal description of an HTML entity.
* Entity description pointer type, followed by a data structure based on
* htmlEntityDescPtr. Its subfields are "value" (LIKE xmlCuint) and two
* pointers, "name" and "desc" (noted at the right as const char *).
* NOTE(review): the name line of the data structure is not visible in
* this view (presumably htmlEntityDesc) - confirm.
94 d htmlEntityDescPtr...
95 d s * based(######typedef######)
98 d ds based(htmlEntityDescPtr)
100 d value like(xmlCuint)
101 d name * const char *
102 d desc * const char *
104 * There are only a few public functions.
* Prototype bound to the external procedure 'htmlTagLookup'.
* tag: pointer passed by value; OPTIONS(*STRING) lets the caller supply a
*      character expression, passed as a null-terminated string.
* Result is typed LIKE htmlElemDescPtr.
* NOTE(review): the trailing "const ..." texts look like right-margin
* notes recording the C type, not keywords - confirm column layout.
106 d htmlTagLookup pr extproc('htmlTagLookup')
107 d like(htmlElemDescPtr) const
108 d tag * value options(*string) const xmlChar *
* Prototype bound to the external procedure 'htmlEntityLookup'.
* name: pointer passed by value, accepting a null-terminated string
*       (OPTIONS(*STRING)).
* Result is typed LIKE htmlEntityDescPtr.
110 d htmlEntityLookup...
111 d pr extproc('htmlEntityLookup')
112 d like(htmlEntityDescPtr) const
113 d name * value options(*string) const xmlChar *
* Prototype bound to the external procedure 'htmlEntityValueLookup'.
* value: passed by value, typed LIKE xmlCuint.
* Result is typed LIKE htmlEntityDescPtr.
115 d htmlEntityValueLookup...
116 d pr extproc('htmlEntityValueLookup')
117 d like(htmlEntityDescPtr) const
118 d value value like(xmlCuint)
* Prototype bound to the external procedure 'htmlIsAutoClosed'.
* doc:  passed by value, typed LIKE htmlDocPtr.
* elem: passed by value, typed LIKE htmlNodePtr.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
120 d htmlIsAutoClosed...
121 d pr extproc('htmlIsAutoClosed')
123 d doc value like(htmlDocPtr)
124 d elem value like(htmlNodePtr)
* Prototype bound to the external procedure 'htmlAutoCloseTag'.
* doc:  passed by value, typed LIKE htmlDocPtr.
* name: pointer passed by value, accepting a null-terminated string.
* elem: passed by value, typed LIKE htmlNodePtr.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
126 d htmlAutoCloseTag...
127 d pr extproc('htmlAutoCloseTag')
129 d doc value like(htmlDocPtr)
130 d name * value options(*string) const xmlChar *
131 d elem value like(htmlNodePtr)
* Prototype bound to the external procedure 'htmlParseEntityRef'.
* ctxt: passed by value, typed LIKE htmlParserCtxtPtr.
* str:  pointer field passed by reference (no VALUE keyword), i.e. the
*       callee receives the address of the caller's pointer.
* Result is typed LIKE htmlEntityDescPtr.
133 d htmlParseEntityRef...
134 d pr extproc('htmlParseEntityRef')
135 d like(htmlEntityDescPtr) const
136 d ctxt value like(htmlParserCtxtPtr)
137 d str * const xmlChar *(*)
* Prototype bound to the external procedure 'htmlParseCharRef'.
* ctxt: passed by value, typed LIKE htmlParserCtxtPtr.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
139 d htmlParseCharRef...
140 d pr extproc('htmlParseCharRef')
142 d ctxt value like(htmlParserCtxtPtr)
* Prototype bound to the external procedure 'htmlParseElement'.
* ctxt: passed by value, typed LIKE htmlParserCtxtPtr.
* No return type is declared between the PR line and the parameter.
144 d htmlParseElement...
145 d pr extproc('htmlParseElement')
146 d ctxt value like(htmlParserCtxtPtr)
* Prototype bound to the external procedure 'htmlNewParserCtxt'.
* Takes no parameters; the result is typed LIKE htmlParserCtxtPtr.
148 d htmlNewParserCtxt...
149 d pr extproc('htmlNewParserCtxt')
150 d like(htmlParserCtxtPtr)
* Prototype bound to the external procedure 'htmlCreateMemoryParserCtxt'.
* buffer: pointer passed by value, accepting a null-terminated string
*         (OPTIONS(*STRING)).
* size:   passed by value, typed LIKE xmlCint.
* Result is typed LIKE htmlParserCtxtPtr.
152 d htmlCreateMemoryParserCtxt...
153 d pr extproc('htmlCreateMemoryParserCtxt')
154 d like(htmlParserCtxtPtr)
155 d buffer * value options(*string) const char *
156 d size value like(xmlCint)
* Prototype bound to the external procedure 'htmlParseDocument'.
* ctxt: passed by value, typed LIKE htmlParserCtxtPtr.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
158 d htmlParseDocument...
159 d pr extproc('htmlParseDocument')
161 d ctxt value like(htmlParserCtxtPtr)
* Prototype bound to the external procedure 'htmlSAXParseDoc'.
* cur, encoding: pointers passed by value, each accepting a
*                null-terminated string (OPTIONS(*STRING)).
* sax:      passed by value, typed LIKE htmlSAXHandlerPtr.
* userData: untyped pointer passed by value.
* NOTE(review): the prototype name line and the return-type line are not
* visible in this view - confirm against the full member.
164 d pr extproc('htmlSAXParseDoc')
166 d cur * value options(*string) xmlChar *
167 d encoding * value options(*string) const char *
168 d sax value like(htmlSAXHandlerPtr)
169 d userData * value void *
* Prototype bound to the external procedure 'htmlParseDoc'.
* cur, encoding: pointers passed by value, each accepting a
*                null-terminated string (OPTIONS(*STRING)).
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
171 d htmlParseDoc pr extproc('htmlParseDoc')
173 d cur * value options(*string) xmlChar *
174 d encoding * value options(*string) const char *
* Prototype bound to the external procedure 'htmlSAXParseFile'.
* filename, encoding: pointers passed by value, each accepting a
*                     null-terminated string (OPTIONS(*STRING)).
* sax:      passed by value, typed LIKE htmlSAXHandlerPtr.
* userData: untyped pointer passed by value.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
176 d htmlSAXParseFile...
177 d pr extproc('htmlSAXParseFile')
179 d filename * value options(*string) const char *
180 d encoding * value options(*string) const char *
181 d sax value like(htmlSAXHandlerPtr)
182 d userData * value void *
* Prototype bound to the external procedure 'htmlParseFile'.
* filename, encoding: pointers passed by value, each accepting a
*                     null-terminated string (OPTIONS(*STRING)).
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
184 d htmlParseFile pr extproc('htmlParseFile')
186 d filename * value options(*string) const char *
187 d encoding * value options(*string) const char *
* Prototype bound to the external procedure 'UTF8ToHtml'.
* out:    field of declared length 65535 passed by reference;
*         OPTIONS(*VARSIZE) allows a shorter actual argument.
* outlen: passed by reference (no VALUE keyword), typed LIKE xmlCint.
* in:     pointer passed by value, accepting a null-terminated string.
* inlen:  passed by reference (no VALUE keyword), typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
189 d UTF8ToHtml pr extproc('UTF8ToHtml')
191 d out 65535 options(*varsize) unsigned char []
192 d outlen like(xmlCint)
193 d in * value options(*string) const unsigned char*
194 d inlen like(xmlCint)
* Prototype bound to the external procedure 'htmlEncodeEntities'.
* out:       field of declared length 65535 passed by reference;
*            OPTIONS(*VARSIZE) allows a shorter actual argument.
* outlen:    passed by reference (no VALUE keyword), typed LIKE xmlCint.
* in:        pointer passed by value, accepting a null-terminated string.
* inlen:     passed by reference (no VALUE keyword), typed LIKE xmlCint.
* quoteChar: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
196 d htmlEncodeEntities...
197 d pr extproc('htmlEncodeEntities')
199 d out 65535 options(*varsize) unsigned char []
200 d outlen like(xmlCint)
201 d in * value options(*string) const unsigned char*
202 d inlen like(xmlCint)
203 d quoteChar value like(xmlCint)
* Prototype bound to the external procedure 'htmlIsScriptAttribute'.
* name: pointer passed by value, accepting a null-terminated string
*       (OPTIONS(*STRING)).
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
205 d htmlIsScriptAttribute...
206 d pr extproc('htmlIsScriptAttribute')
208 d name * value options(*string) const xmlChar *
* Prototype bound to the external procedure 'htmlHandleOmittedElem'.
* val: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
210 d htmlHandleOmittedElem...
211 d pr extproc('htmlHandleOmittedElem')
213 d val value like(xmlCint)
215 /if defined(LIBXML_PUSH_ENABLED)
217 * Interfaces for the Push mode.
* Prototype bound to the external procedure 'htmlCreatePushParserCtxt';
* compiled only when LIBXML_PUSH_ENABLED is defined.
* sax:       passed by value, typed LIKE htmlSAXHandlerPtr.
* user_data: untyped pointer passed by value.
* chunk:     pointer passed by value, accepting a null-terminated string.
* size:      passed by value, typed LIKE xmlCint.
* filename:  pointer passed by value, accepting a null-terminated string.
* enc:       passed by value, typed LIKE xmlCharEncoding.
* Result is typed LIKE htmlParserCtxtPtr.
219 d htmlCreatePushParserCtxt...
220 d pr extproc('htmlCreatePushParserCtxt')
221 d like(htmlParserCtxtPtr)
222 d sax value like(htmlSAXHandlerPtr)
223 d user_data * value void *
224 d chunk * value options(*string) const char *
225 d size value like(xmlCint)
226 d filename * value options(*string) const char *
227 d enc value like(xmlCharEncoding)
* Prototype bound to the external procedure 'htmlParseChunk'; compiled
* only when LIBXML_PUSH_ENABLED is defined.
* ctxt:      passed by value, typed LIKE htmlParserCtxtPtr.
* chunk:     pointer passed by value, accepting a null-terminated string.
* size:      passed by value, typed LIKE xmlCint.
* terminate: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
229 d htmlParseChunk pr extproc('htmlParseChunk')
231 d ctxt value like(htmlParserCtxtPtr)
232 d chunk * value options(*string) const char *
233 d size value like(xmlCint)
234 d terminate value like(xmlCint)
235 /endif LIBXML_PUSH_ENABLED
* Prototype bound to the external procedure 'htmlFreeParserCtxt'.
* ctxt: passed by value, typed LIKE htmlParserCtxtPtr.
* No return type is declared between the PR line and the parameter.
237 d htmlFreeParserCtxt...
238 d pr extproc('htmlFreeParserCtxt')
239 d ctxt value like(htmlParserCtxtPtr)
241 * New set of simpler/more flexible APIs
245 * This is the set of HTML parser options that can be passed down
246 * to the htmlReadDoc() and similar calls.
* Parser option type followed by the HTML_PARSE_* option names; the text
* at the right of each name summarises the option.
* NOTE(review): the underlying type of htmlParserOption and the value
* line of each constant are not visible in this view - confirm against
* the full member before relying on any particular value.
248 d htmlParserOption...
249 d s based(######typedef######)
251 d HTML_PARSE_RECOVER... Relaxed parsing
253 d HTML_PARSE_NODEFDTD... No default doctype
255 d HTML_PARSE_NOERROR... No error reports
257 d HTML_PARSE_NOWARNING... No warning reports
259 d HTML_PARSE_PEDANTIC... Pedantic err reports
261 d HTML_PARSE_NOBLANKS... Remove blank nodes
263 d HTML_PARSE_NONET... Forbid net access
265 d HTML_PARSE_NOIMPLIED... No implied html/body
267 d HTML_PARSE_COMPACT... compact small txtnod
269 d HTML_PARSE_IGNORE_ENC... Ignore encoding hint
* Prototype bound to the external procedure 'htmlCtxtReset'.
* ctxt: passed by value, typed LIKE htmlParserCtxtPtr.
* No return type is declared between the PR line and the parameter.
272 d htmlCtxtReset pr extproc('htmlCtxtReset')
273 d ctxt value like(htmlParserCtxtPtr)
* Prototype bound to the external procedure 'htmlCtxtUseOptions'.
* ctxt:    passed by value, typed LIKE htmlParserCtxtPtr.
* options: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
275 d htmlCtxtUseOptions...
276 d pr extproc('htmlCtxtUseOptions')
278 d ctxt value like(htmlParserCtxtPtr)
279 d options value like(xmlCint)
* Prototype bound to the external procedure 'htmlReadDoc'.
* cur, URL, encoding: pointers passed by value, each accepting a
*                     null-terminated string (OPTIONS(*STRING)).
* options: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
281 d htmlReadDoc pr extproc('htmlReadDoc')
283 d cur * value options(*string) const xmlChar *
284 d URL * value options(*string) const char *
285 d encoding * value options(*string) const char *
286 d options value like(xmlCint)
* Prototype bound to the external procedure 'htmlReadFile'.
* URL, encoding: pointers passed by value, each accepting a
*                null-terminated string (OPTIONS(*STRING)).
* options: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
288 d htmlReadFile pr extproc('htmlReadFile')
290 d URL * value options(*string) const char *
291 d encoding * value options(*string) const char *
292 d options value like(xmlCint)
* Prototype bound to the external procedure 'htmlReadMemory'.
* buffer, URL, encoding: pointers passed by value, each accepting a
*                        null-terminated string (OPTIONS(*STRING)).
* size, options: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
294 d htmlReadMemory pr extproc('htmlReadMemory')
296 d buffer * value options(*string) const char *
297 d size value like(xmlCint)
298 d URL * value options(*string) const char *
299 d encoding * value options(*string) const char *
300 d options value like(xmlCint)
* Prototype bound to the external procedure 'htmlReadFd'.
* fd, options: passed by value, typed LIKE xmlCint.
* URL, encoding: pointers passed by value, each accepting a
*                null-terminated string (OPTIONS(*STRING)).
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
302 d htmlReadFd pr extproc('htmlReadFd')
304 d fd value like(xmlCint)
305 d URL * value options(*string) const char *
306 d encoding * value options(*string) const char *
307 d options value like(xmlCint)
* Prototype bound to the external procedure 'htmlReadIO'.
* ioread:  passed by value, typed LIKE xmlInputReadCallback.
* ioclose: passed by value, typed LIKE xmlInputCloseCallback.
* ioctx:   untyped pointer passed by value.
* URL, encoding: pointers passed by value, each accepting a
*                null-terminated string (OPTIONS(*STRING)).
* options: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
309 d htmlReadIO pr extproc('htmlReadIO')
311 d ioread value like(xmlInputReadCallback)
312 d ioclose value like(xmlInputCloseCallback)
313 d ioctx * value void *
314 d URL * value options(*string) const char *
315 d encoding * value options(*string) const char *
316 d options value like(xmlCint)
* Prototype bound to the external procedure 'htmlCtxtReadDoc'.
* ctxt: passed by value, typed LIKE xmlParserCtxtPtr (the type that
*       htmlParserCtxtPtr is itself declared LIKE in this copybook).
* cur, URL, encoding: pointers passed by value, each accepting a
*                     null-terminated string (OPTIONS(*STRING)).
* options: passed by value, typed LIKE xmlCint.
* NOTE(review): the prototype name line and the return-type line are not
* visible in this view - confirm against the full member.
319 d pr extproc('htmlCtxtReadDoc')
321 d ctxt value like(xmlParserCtxtPtr)
322 d cur * value options(*string) const xmlChar *
323 d URL * value options(*string) const char *
324 d encoding * value options(*string) const char *
325 d options value like(xmlCint)
* Prototype bound to the external procedure 'htmlCtxtReadFile'.
* ctxt: passed by value, typed LIKE xmlParserCtxtPtr.
* filename, encoding: pointers passed by value, each accepting a
*                     null-terminated string (OPTIONS(*STRING)).
* options: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
327 d htmlCtxtReadFile...
328 d pr extproc('htmlCtxtReadFile')
330 d ctxt value like(xmlParserCtxtPtr)
331 d filename * value options(*string) const char *
332 d encoding * value options(*string) const char *
333 d options value like(xmlCint)
* Prototype bound to the external procedure 'htmlCtxtReadMemory'.
* ctxt: passed by value, typed LIKE xmlParserCtxtPtr.
* buffer, URL, encoding: pointers passed by value, each accepting a
*                        null-terminated string (OPTIONS(*STRING)).
* size, options: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
335 d htmlCtxtReadMemory...
336 d pr extproc('htmlCtxtReadMemory')
338 d ctxt value like(xmlParserCtxtPtr)
339 d buffer * value options(*string) const char *
340 d size value like(xmlCint)
341 d URL * value options(*string) const char *
342 d encoding * value options(*string) const char *
343 d options value like(xmlCint)
* Prototype bound to the external procedure 'htmlCtxtReadFd'.
* ctxt: passed by value, typed LIKE xmlParserCtxtPtr.
* fd, options: passed by value, typed LIKE xmlCint.
* URL, encoding: pointers passed by value, each accepting a
*                null-terminated string (OPTIONS(*STRING)).
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
345 d htmlCtxtReadFd pr extproc('htmlCtxtReadFd')
347 d ctxt value like(xmlParserCtxtPtr)
348 d fd value like(xmlCint)
349 d URL * value options(*string) const char *
350 d encoding * value options(*string) const char *
351 d options value like(xmlCint)
* Prototype bound to the external procedure 'htmlCtxtReadIO'.
* ctxt:    passed by value, typed LIKE xmlParserCtxtPtr.
* ioread:  passed by value, typed LIKE xmlInputReadCallback.
* ioclose: passed by value, typed LIKE xmlInputCloseCallback.
* ioctx:   untyped pointer passed by value.
* URL, encoding: pointers passed by value, each accepting a
*                null-terminated string (OPTIONS(*STRING)).
* options: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
353 d htmlCtxtReadIO pr extproc('htmlCtxtReadIO')
355 d ctxt value like(xmlParserCtxtPtr)
356 d ioread value like(xmlInputReadCallback)
357 d ioclose value like(xmlInputCloseCallback)
358 d ioctx * value void *
359 d URL * value options(*string) const char *
360 d encoding * value options(*string) const char *
361 d options value like(xmlCint)
363 * Further knowledge of HTML structure
* Status type followed by its named constants, given as hexadecimal
* literals. HTML_REQUIRED (X'000C') has the HTML_VALID bit (X'0004') set
* in addition to X'0008', as its right-margin note says.
* NOTE(review): the underlying type of htmlStatus is not visible here,
* and the value X'0002' has no constant in this view - confirm against
* the full member.
365 d htmlStatus s based(######typedef######)
367 d HTML_NA c X'0000' No check at all
368 d HTML_INVALID c X'0001'
371 d HTML_VALID c X'0004'
372 d HTML_REQUIRED c X'000C' HTML_VALID ored-in
374 * Using htmlElemDesc rather than name here, to emphasise the fact
375 * that otherwise there's a lookup overhead
* Prototype bound to the external procedure 'htmlAttrAllowed'. The
* parameters carry placeholder names (#param1 ...).
* #param1: passed by value, typed LIKE htmlElemDescPtr.
* #param2: pointer passed by value, accepting a null-terminated string.
* #param3: passed by value, typed LIKE xmlCint.
* NOTE(review): the prototype name line and the return-type line are not
* visible in this view - confirm against the full member.
378 d pr extproc('htmlAttrAllowed')
380 d #param1 value like(htmlElemDescPtr) const
381 d #param2 * value options(*string) const xmlChar *
382 d #param3 value like(xmlCint)
* Prototype bound to the external procedure 'htmlElementAllowedHere'.
* The parameters carry placeholder names.
* #param1: passed by value, typed LIKE htmlElemDescPtr.
* #param2: pointer passed by value, accepting a null-terminated string.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
384 d htmlElementAllowedHere...
385 d pr extproc('htmlElementAllowedHere')
387 d #param1 value like(htmlElemDescPtr) const
388 d #param2 * value options(*string) const xmlChar *
* Prototype bound to the external procedure 'htmlElementStatusHere'.
* The parameters carry placeholder names.
* #param1, #param2: passed by value, each typed LIKE htmlElemDescPtr.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
390 d htmlElementStatusHere...
391 d pr extproc('htmlElementStatusHere')
393 d #param1 value like(htmlElemDescPtr) const
394 d #param2 value like(htmlElemDescPtr) const
* Prototype bound to the external procedure 'htmlNodeStatus'. The
* parameters carry placeholder names.
* #param1: passed by value, typed LIKE htmlNodePtr.
* #param2: passed by value, typed LIKE xmlCint.
* NOTE(review): the return-type line is not visible in this view -
* confirm against the full member.
396 d htmlNodeStatus pr extproc('htmlNodeStatus')
398 d #param1 value like(htmlNodePtr)
399 d #param2 value like(xmlCint)
401 * C macros implemented as procedures for ILE/RPG support.
* Prototype for the procedure form of a C macro; it is bound to the
* external procedure '__htmlDefaultSubelement'.
* elt: pointer passed by value (noted at the right as const htmlElemDesc *).
* Result is a pointer (noted at the right as const char *).
403 d htmlDefaultSubelement...
404 d pr * extproc('__htmlDefaultSubelement') const char *
405 d elt * value const htmlElemDesc *
* Prototype for the procedure form of a C macro; the visible name
* fragment is '__htmlElementAllowedHereDesc'.
* parent, elt: pointers passed by value (noted at the right as
*              const htmlElemDesc *).
* NOTE(review): the PR line that opens the EXTPROC keyword, and any
* return type, are not visible in this view - confirm against the full
* member.
407 d htmlElementAllowedHereDesc...
409 d '__htmlElementAllowedHereDesc')
411 d parent * value const htmlElemDesc *
412 d elt * value const htmlElemDesc *
* Prototype for the procedure form of a C macro; it is bound to the
* external procedure '__htmlRequiredAttrs'.
* elt: pointer passed by value (noted at the right as const htmlElemDesc *).
* Result is a pointer (noted at the right as const char * *).
414 d htmlRequiredAttrs...
415 d pr * extproc('__htmlRequiredAttrs') const char * *
416 d elt * value const htmlElemDesc *
418 /endif LIBXML_HTML_ENABLED
419 /endif HTML_PARSER_H__