|   |   |   |   | libxml2 Reference Manual | 
|---|
HTMLparser - interface for an HTML 4.0 non-verifying parser
This module implements an HTML 4.0 non-verifying parser with an API compatible with that of the XML parser. It should be able to parse "real world" HTML, even if severely broken from a specification point of view.
Author(s): Daniel Veillard
#define htmlDefaultSubelement(elt); #define htmlElementAllowedHereDesc(parent, elt); #define htmlRequiredAttrs(elt); typedef xmlParserNodeInfo htmlParserNodeInfo; typedef xmlParserInput htmlParserInput; typedef xmlParserCtxtPtr htmlParserCtxtPtr; typedef struct _htmlEntityDesc htmlEntityDesc; typedef xmlDocPtr htmlDocPtr; typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; typedef enum htmlStatus; typedef xmlNodePtr htmlNodePtr; typedef htmlElemDesc * htmlElemDescPtr; typedef struct _htmlElemDesc htmlElemDesc; typedef xmlSAXHandler htmlSAXHandler; typedef xmlParserInputPtr htmlParserInputPtr; typedef enum htmlParserOption; typedef htmlEntityDesc * htmlEntityDescPtr; typedef xmlParserCtxt htmlParserCtxt; int htmlIsScriptAttribute (const xmlChar * name); int htmlHandleOmittedElem (int val); htmlDocPtr htmlReadFd (int fd,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlReadIO (xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlParseFile (const char * filename,
const char * encoding); htmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt,
const xmlChar * cur,
const char * URL,
const char * encoding,
int options); int htmlAutoCloseTag (htmlDocPtr doc,
const xmlChar * name,
htmlNodePtr elem); int htmlParseChunk (htmlParserCtxtPtr ctxt,
const char * chunk,
int size,
int terminate); const htmlElemDesc * htmlTagLookup (const xmlChar * tag); htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer,
int size); void htmlCtxtReset (htmlParserCtxtPtr ctxt); int htmlElementAllowedHere (const htmlElemDesc * parent,
const xmlChar * elt); htmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt,
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options); htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax,
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc); htmlDocPtr htmlReadMemory (const char * buffer,
int size,
const char * URL,
const char * encoding,
int options); int htmlIsAutoClosed (htmlDocPtr doc,
htmlNodePtr elem); int htmlParseCharRef (htmlParserCtxtPtr ctxt); htmlDocPtr htmlReadDoc (const xmlChar * cur,
const char * URL,
const char * encoding,
int options); int htmlEncodeEntities (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar); htmlStatus htmlNodeStatus (const htmlNodePtr node,
int legacy); htmlStatus htmlAttrAllowed (const htmlElemDesc * elt,
const xmlChar * attr,
int legacy); htmlDocPtr htmlSAXParseFile (const char * filename,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData); const htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt,
const xmlChar ** str); htmlStatus htmlElementStatusHere (const htmlElemDesc * parent,
const htmlElemDesc * elt); const htmlEntityDesc * htmlEntityValueLookup (unsigned int value); void htmlParseElement (htmlParserCtxtPtr ctxt); int UTF8ToHtml (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen); const htmlEntityDesc * htmlEntityLookup (const xmlChar * name); void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt,
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt,
int fd,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlReadFile (const char * filename,
const char * encoding,
int options); htmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt,
const char * filename,
const char * encoding,
int options); int htmlParseDocument (htmlParserCtxtPtr ctxt); htmlParserCtxtPtr htmlNewParserCtxt (void); htmlDocPtr htmlSAXParseDoc (xmlChar * cur,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData); int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
int options); htmlDocPtr htmlParseDoc (xmlChar * cur,
const char * encoding);
#define htmlDefaultSubelement(elt);
Returns the default subelement for this element
| elt: | HTML element | 
#define htmlElementAllowedHereDesc(parent, elt);
Checks whether an HTML element description may be a direct child of the specified element. Returns 1 if allowed; 0 otherwise.
| parent: | HTML parent element | 
| elt: | HTML element | 
#define htmlRequiredAttrs(elt);
Returns the attributes required for the specified element.
| elt: | HTML element | 
struct _htmlElemDesc {
    const char *	name	: The tag name
    char	startTag	: Whether the start tag can be implied
    char	endTag	: Whether the end tag can be implied
    char	saveEndTag	: Whether the end tag should be saved
    char	empty	: Is this an empty element ?
    char	depr	: Is this a deprecated element ?
    char	dtd	: 1: only in Loose DTD, 2: only Frameset one
    char	isinline	: is this a block 0 or inline 1 element
    const char *	desc	: the description. NRK Jan. 2003: the fields below are new and encapsulate HTML structure
    const char **	subelts	: allowed sub-elements of this element
    const char *	defaultsubelt	: subelement for suggested auto-repair if necessary or NULL
    const char **	attrs_opt	: Optional Attributes
    const char **	attrs_depr	: Additional deprecated attributes
    const char **	attrs_req	: Required attributes
} htmlElemDesc;
htmlElemDesc * htmlElemDescPtr;
struct _htmlEntityDesc {
    unsigned int	value	: the UNICODE value for the character
    const char *	name	: The entity name
    const char *	desc	: the description
} htmlEntityDesc;
htmlEntityDesc * htmlEntityDescPtr;
xmlNodePtr htmlNodePtr;
xmlParserCtxt htmlParserCtxt;
xmlParserCtxtPtr htmlParserCtxtPtr;
xmlParserInput htmlParserInput;
xmlParserInputPtr htmlParserInputPtr;
xmlParserNodeInfo htmlParserNodeInfo;
enum htmlParserOption { HTML_PARSE_RECOVER = 1 /* Relaxed parsing */ HTML_PARSE_NODEFDTD = 4 /* do not default a doctype if not found */ HTML_PARSE_NOERROR = 32 /* suppress error reports */ HTML_PARSE_NOWARNING = 64 /* suppress warning reports */ HTML_PARSE_PEDANTIC = 128 /* pedantic error reporting */ HTML_PARSE_NOBLANKS = 256 /* remove blank nodes */ HTML_PARSE_NONET = 2048 /* Forbid network access */ HTML_PARSE_NOIMPLIED = 8192 /* Do not add implied html/body... elements */ HTML_PARSE_COMPACT = 65536 /* compact small text nodes */ HTML_PARSE_IGNORE_ENC = 2097152 /* ignore internal document encoding hint */ };
xmlSAXHandler htmlSAXHandler;
xmlSAXHandlerPtr htmlSAXHandlerPtr;
enum htmlStatus { HTML_NA = 0 /* something we don't check at all */ HTML_INVALID = 1 HTML_DEPRECATED = 2 HTML_VALID = 4 HTML_REQUIRED = 12 /* VALID bit set so ( & HTML_VALID ) is TRUE */ };
int UTF8ToHtml (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen)
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
| out: | a pointer to an array of bytes to store the result | 
| outlen: | the length of @out | 
| in: | a pointer to an array of UTF-8 chars | 
| inlen: | the length of @in | 
| Returns: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is not negative, else unpredictable. The value of @outlen after return is the number of octets produced. | 
htmlStatus htmlAttrAllowed (const htmlElemDesc * elt,
const xmlChar * attr,
int legacy)
Checks whether an attribute is valid for an element. Has full knowledge of Required and Deprecated attributes.
| elt: | HTML element | 
| attr: | HTML attribute | 
| legacy: | whether to allow deprecated attributes | 
| Returns: | one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID | 
int htmlAutoCloseTag (htmlDocPtr doc,
const xmlChar * name,
htmlNodePtr elem)
The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.
| doc: | the HTML document | 
| name: | The tag name | 
| elem: | the HTML element | 
| Returns: | 1 if autoclose, 0 otherwise | 
htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer,
int size)
Create a parser context for an HTML in-memory document.
| buffer: | a pointer to a char array | 
| size: | the size of the array | 
| Returns: | the new parser context or NULL | 
htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax,
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc)
Create a parser context for using the HTML parser in push mode. The value of @filename is used for fetching external entities and for error/warning reports.
| sax: | a SAX handler | 
| user_data: | The user data returned on SAX callbacks | 
| chunk: | a pointer to an array of chars | 
| size: | number of chars in the array | 
| filename: | an optional file name or URI | 
| enc: | an optional encoding | 
| Returns: | the new parser context or NULL | 
htmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt,
const xmlChar * cur,
const char * URL,
const char * encoding,
int options)
Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.
| ctxt: | an HTML parser context | 
| cur: | a pointer to a zero terminated string | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt,
int fd,
const char * URL,
const char * encoding,
int options)
Parse an HTML document from a file descriptor and build a tree. This reuses the existing @ctxt parser context.
| ctxt: | an HTML parser context | 
| fd: | an open file descriptor | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt,
const char * filename,
const char * encoding,
int options)
Parse an HTML file from the filesystem or the network. This reuses the existing @ctxt parser context.
| ctxt: | an HTML parser context | 
| filename: | a file or URL | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt,
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)
parse an HTML document from I/O functions and source and build a tree. This reuses the existing @ctxt parser context
| ctxt: | an HTML parser context | 
| ioread: | an I/O read function | 
| ioclose: | an I/O close function | 
| ioctx: | an I/O handler | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt,
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options)
Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.
| ctxt: | an HTML parser context | 
| buffer: | a pointer to a char array | 
| size: | the size of the array | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
void htmlCtxtReset (htmlParserCtxtPtr ctxt)
Reset a parser context
| ctxt: | an HTML parser context | 
int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
int options)
Applies the options to the parser context
| ctxt: | an HTML parser context | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | 0 in case of success, the set of unknown or unimplemented options in case of error. | 
int htmlElementAllowedHere (const htmlElemDesc * parent,
const xmlChar * elt)
Checks whether an HTML element may be a direct child of a parent element. Note - doesn't check for deprecated elements
| parent: | HTML parent element | 
| elt: | HTML element | 
| Returns: | 1 if allowed; 0 otherwise. | 
htmlStatus htmlElementStatusHere (const htmlElemDesc * parent,
const htmlElemDesc * elt)
Checks whether an HTML element may be a direct child of a parent element and, if so, whether it is valid or deprecated.
| parent: | HTML parent element | 
| elt: | HTML element | 
| Returns: | one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID | 
int htmlEncodeEntities (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar)
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
| out: | a pointer to an array of bytes to store the result | 
| outlen: | the length of @out | 
| in: | a pointer to an array of UTF-8 chars | 
| inlen: | the length of @in | 
| quoteChar: | the quote character to escape (' or ") or zero. | 
| Returns: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is not negative, else unpredictable. The value of @outlen after return is the number of octets produced. | 
const htmlEntityDesc * htmlEntityLookup (const xmlChar * name)
Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
| name: | the entity name | 
| Returns: | the associated htmlEntityDescPtr if found, NULL otherwise. | 
const htmlEntityDesc * htmlEntityValueLookup (unsigned int value)
Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
| value: | the entity's unicode value | 
| Returns: | the associated htmlEntityDescPtr if found, NULL otherwise. | 
void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt)
Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.
| ctxt: | an HTML parser context | 
int htmlHandleOmittedElem (int val)
Set and return the previous value for handling HTML omitted tags.
| val: | int 0 or 1 | 
| Returns: | the previous value: 0 for no handling, 1 for auto insertion. | 
int htmlIsAutoClosed (htmlDocPtr doc,
htmlNodePtr elem)
The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if a tag is autoclosed by one of its children.
| doc: | the HTML document | 
| elem: | the HTML element | 
| Returns: | 1 if autoclosed, 0 otherwise | 
int htmlIsScriptAttribute (const xmlChar * name)
Check if an attribute is of content type Script
htmlParserCtxtPtr htmlNewParserCtxt (void)
Allocate and initialize a new parser context.
| Returns: | the htmlParserCtxtPtr or NULL in case of allocation error | 
htmlStatus htmlNodeStatus (const htmlNodePtr node,
int legacy)
Checks whether the tree node is valid. Experimental (the author only uses the HTML enhancements in a SAX parser)
| node: | an htmlNodePtr in a tree | 
| legacy: | whether to allow deprecated elements (YES is faster here for Element nodes) | 
| Returns: | for Element nodes, a return from htmlElementAllowedHere (if legacy allowed) or htmlElementStatusHere (otherwise); for Attribute nodes, a return from htmlAttrAllowed; for other nodes, HTML_NA (no checks performed). | 
int htmlParseCharRef (htmlParserCtxtPtr ctxt)
parse Reference declarations [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
| ctxt: | an HTML parser context | 
| Returns: | the value parsed (as an int) | 
int htmlParseChunk (htmlParserCtxtPtr ctxt,
const char * chunk,
int size,
int terminate)
Parse a Chunk of memory
| ctxt: | an HTML parser context | 
| chunk: | a char array | 
| size: | the size in bytes of the chunk | 
| terminate: | last chunk indicator | 
| Returns: | zero if no error, the xmlParserErrors otherwise. | 
htmlDocPtr htmlParseDoc (xmlChar * cur,
const char * encoding)
parse an HTML in-memory document and build a tree.
| cur: | a pointer to an array of xmlChar | 
| encoding: | a free form C string describing the HTML document encoding, or NULL | 
| Returns: | the resulting document tree | 
int htmlParseDocument (htmlParserCtxtPtr ctxt)
parse an HTML document (and build a tree if using the standard SAX interface).
| ctxt: | an HTML parser context | 
| Returns: | 0 on success, -1 in case of error. The parser context is augmented as a result of the parsing. | 
void htmlParseElement (htmlParserCtxtPtr ctxt)
Parse an HTML element. This is highly recursive, and is kept for compatibility with previous code versions. [39] element ::= EmptyElemTag | STag content ETag [41] Attribute ::= Name Eq AttValue
| ctxt: | an HTML parser context | 
const htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt,
const xmlChar ** str)
Parse an HTML entity reference. [68] EntityRef ::= '&' Name ';'
| ctxt: | an HTML parser context | 
| str: | location to store the entity name | 
| Returns: | the associated htmlEntityDescPtr if found, or NULL otherwise, if non-NULL *str will have to be freed by the caller. | 
htmlDocPtr htmlParseFile (const char * filename,
const char * encoding)
parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.
| filename: | the filename | 
| encoding: | a free form C string describing the HTML document encoding, or NULL | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlReadDoc (const xmlChar * cur,
const char * URL,
const char * encoding,
int options)
Parse an HTML in-memory document and build a tree.
| cur: | a pointer to a zero terminated string | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlReadFd (int fd,
const char * URL,
const char * encoding,
int options)
Parse an HTML document from a file descriptor and build a tree.
| fd: | an open file descriptor | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlReadFile (const char * filename,
const char * encoding,
int options)
Parse an HTML file from the filesystem or the network.
| filename: | a file or URL | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlReadIO (xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)
parse an HTML document from I/O functions and source and build a tree.
| ioread: | an I/O read function | 
| ioclose: | an I/O close function | 
| ioctx: | an I/O handler | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlReadMemory (const char * buffer,
int size,
const char * URL,
const char * encoding,
int options)
Parse an HTML in-memory document and build a tree.
| buffer: | a pointer to a char array | 
| size: | the size of the array | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlSAXParseDoc (xmlChar * cur,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.
| cur: | a pointer to an array of xmlChar | 
| encoding: | a free form C string describing the HTML document encoding, or NULL | 
| sax: | the SAX handler block | 
| userData: | if using SAX, this pointer will be provided on callbacks. | 
| Returns: | the resulting document tree unless SAX is NULL or the document is not well formed. | 
htmlDocPtr htmlSAXParseFile (const char * filename,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
Parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, it falls back to the default DOM tree building routines.
| filename: | the filename | 
| encoding: | a free form C string describing the HTML document encoding, or NULL | 
| sax: | the SAX handler block | 
| userData: | if using SAX, this pointer will be provided on callbacks. | 
| Returns: | the resulting document tree unless SAX is NULL or the document is not well formed. | 
const htmlElemDesc * htmlTagLookup (const xmlChar * tag)
Lookup the HTML tag in the ElementTable
| tag: | The tag name in lowercase | 
| Returns: | the related htmlElemDescPtr or NULL if not found. |