2 * nghttp2 - HTTP/2 C Library
4 * Copyright (c) 2012 Tatsuhiro Tsujikawa
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice shall be
15 * included in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 #include "HtmlParser.h"
27 #include <libxml/uri.h>
33 ParserData::ParserData(const std::string &base_uri) : base_uri(base_uri) {}
35 HtmlParser::HtmlParser(const std::string &base_uri)
36 : base_uri_(base_uri), parser_ctx_(nullptr), parser_data_(base_uri) {}
38 HtmlParser::~HtmlParser() { htmlFreeParserCtxt(parser_ctx_); }
// Searches the SAX attribute array |attrs| for an attribute called
// |name| and yields its value as a C string.
//
// |attrs| is walked two slots at a time: the first slot of each pair
// is compared against |name| with util::strieq, and the second slot of
// the matching pair is returned.  The walk stops at the first null
// slot.  A null |attrs| is checked for before the loop.
//
// NOTE(review): this listing omits some lines of the function (the
// body of the null check and whatever follows the loop), so the result
// for "no attributes" and "no match" is not visible here; presumably
// nullptr in both cases -- confirm against the full file.
41 const char *get_attr(const xmlChar **attrs, const char *name) {
42 if (attrs == nullptr) {
45 for (; *attrs; attrs += 2) {
// presumably a case-insensitive match, going by the helper's name --
// verify in util.h
46 if (util::strieq(reinterpret_cast<const char *>(attrs[0]), name)) {
47 return reinterpret_cast<const char *>(attrs[1]);
// Appends a (URI, priority) entry for |uri| to parser_data->links.
//
// |uri| and parser_data->base_uri are both passed, cast to xmlChar
// strings, to a call whose result |u| is the string that gets stored
// together with |pri|.
//
// NOTE(review): the line naming that call, and any check or cleanup
// around the push_back, are not part of this listing.  Given the
// <libxml/uri.h> include the call is presumably xmlBuildURI(uri, base),
// i.e. |uri| resolved against the base URI -- confirm.  If so, |u| is
// a libxml2-allocated buffer that may be null on failure and must be
// released once it has been stored.
55 void add_link(ParserData *parser_data, const char *uri, RequestPriority pri) {
57 reinterpret_cast<const xmlChar *>(uri),
58 reinterpret_cast<const xmlChar *>(parser_data->base_uri.c_str()));
60 parser_data->links.push_back(
61 std::make_pair(reinterpret_cast<char *>(u), pri));
// SAX start-element callback: picks out the elements that reference
// other resources and records those references through add_link().
//
// |user_data| is the ParserData registered with the parser, |name| is
// the element name and |attrs| its attribute array as accepted by
// get_attr().  Element names and rel values are compared with
// util::strieq.
//
// Elements handled and the priority given to each recorded link:
//   <link rel="shortcut icon" href=...>  -> REQ_PRI_LOWEST
//   <link rel="stylesheet" href=...>     -> REQ_PRI_MEDIUM
//   <img src=...>                        -> REQ_PRI_LOWEST
//   <script src=...>                     -> REQ_PRI_LOW
// Other elements, and <link> elements with any other rel value, add
// nothing.
//
// NOTE(review): some lines between each attribute lookup and its use
// are missing from this listing; presumably they return early when
// get_attr() finds no such attribute -- confirm against the full file.
68 void start_element_func(void *user_data, const xmlChar *name,
69 const xmlChar **attrs) {
70 auto parser_data = static_cast<ParserData *>(user_data);
71 if (util::strieq(reinterpret_cast<const char *>(name), "link")) {
72 auto rel_attr = get_attr(attrs, "rel");
73 auto href_attr = get_attr(attrs, "href");
// The rel value decides whether the link is recorded and how urgent
// it is.
77 if (util::strieq(rel_attr, "shortcut icon")) {
78 add_link(parser_data, href_attr, REQ_PRI_LOWEST);
79 } else if (util::strieq(rel_attr, "stylesheet")) {
80 add_link(parser_data, href_attr, REQ_PRI_MEDIUM);
82 } else if (util::strieq(reinterpret_cast<const char *>(name), "img")) {
83 auto src_attr = get_attr(attrs, "src");
87 add_link(parser_data, src_attr, REQ_PRI_LOWEST);
88 } else if (util::strieq(reinterpret_cast<const char *>(name), "script")) {
89 auto src_attr = get_attr(attrs, "src");
93 add_link(parser_data, src_attr, REQ_PRI_LOW);
// SAX callback table passed to htmlCreatePushParserCtxt() in
// HtmlParser::parse_chunk().
//
// Only the startElement slot is populated (with start_element_func);
// every other callback is null.  The initializers are positional, so
// their order has to follow the member order of xmlSAXHandler; the
// trailing comment on each line names the slot being filled.
99 xmlSAXHandler saxHandler = {
100 nullptr, // internalSubsetSAXFunc
101 nullptr, // isStandaloneSAXFunc
102 nullptr, // hasInternalSubsetSAXFunc
103 nullptr, // hasExternalSubsetSAXFunc
104 nullptr, // resolveEntitySAXFunc
105 nullptr, // getEntitySAXFunc
106 nullptr, // entityDeclSAXFunc
107 nullptr, // notationDeclSAXFunc
108 nullptr, // attributeDeclSAXFunc
109 nullptr, // elementDeclSAXFunc
110 nullptr, // unparsedEntityDeclSAXFunc
111 nullptr, // setDocumentLocatorSAXFunc
112 nullptr, // startDocumentSAXFunc
113 nullptr, // endDocumentSAXFunc
114 &start_element_func, // startElementSAXFunc
115 nullptr, // endElementSAXFunc
116 nullptr, // referenceSAXFunc
117 nullptr, // charactersSAXFunc
118 nullptr, // ignorableWhitespaceSAXFunc
119 nullptr, // processingInstructionSAXFunc
120 nullptr, // commentSAXFunc
121 nullptr, // warningSAXFunc
122 nullptr, // errorSAXFunc
123 nullptr, // fatalErrorSAXFunc
124 nullptr, // getParameterEntitySAXFunc
125 nullptr, // cdataBlockSAXFunc
126 nullptr, // externalSubsetSAXFunc
127 0, // unsigned int initialized
128 nullptr, // void * _private
129 nullptr, // startElementNsSAX2Func
130 nullptr, // endElementNsSAX2Func
131 nullptr, // xmlStructuredErrorFunc
// Feeds the next |size| bytes at |chunk| of the HTML document to the
// parser.  |fin| is forwarded unchanged to parse_chunk_internal().
//
// Two paths are visible in this listing:
//  * a push parser context is created with htmlCreatePushParserCtxt(),
//    using saxHandler as the callback table, parser_data_ as the
//    callbacks' user data, |chunk| as the initial input and base_uri_
//    in the filename argument; on that path parse_chunk_internal() is
//    called with no further data (nullptr, 0);
//  * otherwise |chunk| is handed on to parse_chunk_internal().
//
// NOTE(review): the conditions selecting between these paths, the
// assignment of the new context and the remaining return statements
// are missing from this listing.  Presumably the context is created on
// the first call only, while parser_ctx_ is still null -- confirm
// against the full file.
135 int HtmlParser::parse_chunk(const char *chunk, size_t size, int fin) {
138 htmlCreatePushParserCtxt(&saxHandler, &parser_data_, chunk, size,
139 base_uri_.c_str(), XML_CHAR_ENCODING_NONE);
// |chunk| was already supplied when the context was created above, so
// no data is passed again here.
144 return parse_chunk_internal(nullptr, 0, fin);
150 return parse_chunk_internal(chunk, size, fin);
// Passes |chunk| (|size| bytes) and the |fin| flag straight to
// htmlParseChunk() on parser_ctx_.
//
// NOTE(review): the lines that turn |rv| into this function's return
// value are missing from this listing -- confirm the mapping against
// the full file.
154 int HtmlParser::parse_chunk_internal(const char *chunk, size_t size, int fin) {
155 int rv = htmlParseChunk(parser_ctx_, chunk, size, fin);
// Gives read-only access to the (URI, priority) pairs stored in
// parser_data_.links.  The result is a reference into this object, not
// a copy.
163 const std::vector<std::pair<std::string, RequestPriority>> &
164 HtmlParser::get_links() const {
165 return parser_data_.links;
168 void HtmlParser::clear_links() { parser_data_.links.clear(); }
170 } // namespace nghttp2