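/*****************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 *****************************************************************************/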
10 // Get a web page, parse it with libxml.
12 // Written by Lars Nilsson
14 // GNU C++ compile command line suggestion (edit paths accordingly):
16 // g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cc \
17 // -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
23 #include <curl/curl.h>
24 #include <libxml/HTMLparser.h>
//
//  Case-insensitive string comparison
//
//  COMPARE(a, b) is true when the NUL-terminated strings a and b are equal
//  ignoring case. Only one definition may be active at a time: the Microsoft
//  compiler spells the comparison stricmp(), every other toolchain uses the
//  POSIX strcasecmp(). Defining both unconditionally is a conflicting macro
//  redefinition and references stricmp() where it does not exist.
//

#ifdef _MSC_VER
#define COMPARE(a, b) (!stricmp((a), (b)))
#else
#define COMPARE(a, b) (!strcasecmp((a), (b)))
#endif
//
//  libxml callback context structure
//
//  One instance is passed to the SAX callbacks as their opaque user pointer.
//  The start/end element callbacks toggle addTitle around a TITLE element,
//  and the text callbacks append to title while addTitle is set.
//

struct Context
{
  Context(): addTitle(false) { }

  bool addTitle;      // true while text should be collected into title
  std::string title;  // text accumulated while addTitle is true
};
49 // libcurl variables for error strings and returned data
// errorBuffer is registered with CURLOPT_ERRORBUFFER in init() and its text
// is printed when an option call or the transfer fails.
51 static char errorBuffer[CURL_ERROR_SIZE];
// buffer is registered with CURLOPT_WRITEDATA in init(), so writer() appends
// the received data to it; main() then hands it to parseHtml().
52 static std::string buffer;
//
//  libcurl write callback function
//
//  Appends the size*nmemb bytes at data to *writerData and returns the
//  number of bytes consumed. libcurl treats a return value different from
//  size*nmemb as an error and aborts the transfer, so returning 0 for a
//  non-empty chunk when no destination string was supplied stops it.
//
//  The return type is size_t to match libcurl's write callback type; an int
//  return cannot represent every byte count and does not match the type
//  libcurl calls the function through.
//

static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if (writerData == NULL)
    return 0;

  const size_t bytes = size * nmemb;
  writerData->append(data, bytes);

  return bytes;
}
70 // libcurl connection initialization
// Creates an easy handle in conn and configures it to fetch url: error text
// goes to errorBuffer, the redirect option is enabled, and received data is
// delivered to writer() with &buffer as its destination. The result of each
// option call is stored in code and a message is printed to stderr when a
// call fails.
// NOTE(review): the braces, the declaration of code and the statements that
// follow each message are not visible in this view — presumably failure is
// reported through the bool result that main() tests; confirm.
73 static bool init(CURL *&conn, char *url)
77 conn = curl_easy_init();
81 fprintf(stderr, "Failed to create CURL connection\n");
// The error buffer is registered first; a failure here is reported by its
// numeric code, while the later failures print the errorBuffer text.
86 code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
89 fprintf(stderr, "Failed to set error buffer [%d]\n", code);
94 code = curl_easy_setopt(conn, CURLOPT_URL, url);
97 fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
102 code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L);
103 if (code != CURLE_OK)
105 fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
110 code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
111 if (code != CURLE_OK)
113 fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
// The pointer given here is what writer() receives as writerData.
118 code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
119 if (code != CURLE_OK)
121 fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
130 // libxml start element callback function
// voidContext is cast back to a Context pointer. When the element name
// matches "TITLE" ignoring case, addTitle is set so that the text callbacks
// start collecting into the title.
// NOTE(review): the declaration of the name parameter used below is not
// visible in this view.
133 static void StartElement(void *voidContext,
135 const xmlChar **attributes)
137 Context *context = (Context *)voidContext;
139 if (COMPARE((char *)name, "TITLE"))
142 context->addTitle = true;
148 // libxml end element callback function
// voidContext is cast back to a Context pointer. When the closing element's
// name matches "TITLE" ignoring case, addTitle is cleared so that later text
// is no longer collected.
// NOTE(review): the declaration of the name parameter used below is not
// visible in this view.
151 static void EndElement(void *voidContext,
154 Context *context = (Context *)voidContext;
156 if (COMPARE((char *)name, "TITLE"))
157 context->addTitle = false;
161 // Text handling helper function
// Shared by the PCDATA and CDATA callbacks. Appends length characters
// starting at chars to context->title, but only while context->addTitle is
// set; text delivered at any other time is ignored.
// NOTE(review): the declaration of the length parameter is not visible in
// this view.
164 static void handleCharacters(Context *context,
165 const xmlChar *chars,
168 if (context->addTitle)
169 context->title.append((char *)chars, length);
173 // libxml PCDATA callback function
// Casts voidContext back to a Context pointer and forwards the text to
// handleCharacters().
176 static void Characters(void *voidContext,
177 const xmlChar *chars,
180 Context *context = (Context *)voidContext;
182 handleCharacters(context, chars, length);
186 // libxml CDATA callback function
// Casts voidContext back to a Context pointer and forwards the text to
// handleCharacters(), exactly as Characters() does.
189 static void cdata(void *voidContext,
190 const xmlChar *chars,
193 Context *context = (Context *)voidContext;
195 handleCharacters(context, chars, length);
199 // libxml SAX callback structure
// Handler table whose address parseHtml() passes to
// htmlCreatePushParserCtxt().
// NOTE(review): the initializer is not visible in this view; presumably it
// installs StartElement, EndElement, Characters and cdata — confirm.
202 static htmlSAXHandler saxHandler =
234 // Parse given (assumed to be) HTML text and return the title
// Creates a libxml push parser that uses saxHandler with &context as the
// callbacks' user data, feeds it html, frees the parser and copies
// context.title into title.
// NOTE(review): the declarations of title and context are not visible in
// this view, and no check of ctxt against NULL is visible before it is
// used — confirm.
237 static void parseHtml(const std::string &html,
240 htmlParserCtxtPtr ctxt;
243 ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
244 XML_CHAR_ENCODING_NONE);
// The first call supplies the whole document; the second, empty call passes
// a non-zero last argument to mark the end of the input.
246 htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
247 htmlParseChunk(ctxt, "", 0, 1);
249 htmlFreeParserCtxt(ctxt);
251 title = context.title;
// Program entry point: takes a URL argument, downloads it with libcurl into
// buffer, extracts the title with parseHtml() and prints it.
// NOTE(review): the local declarations (conn, code, title), the braces, the
// statements following each error message and the end of the function are
// not visible in this view.
254 int main(int argc, char *argv[])
260 // Ensure one argument is given
264 fprintf(stderr, "Usage: %s <url>\n", argv[0]);
269 curl_global_init(CURL_GLOBAL_DEFAULT);
271 // Initialize CURL connection
273 if (!init(conn, argv[1]))
// NOTE(review): the message below misspells "initialization".
275 fprintf(stderr, "Connection initializion failed\n");
280 // Retrieve content for the URL
// The handle is released right after the transfer; the result in code is
// checked afterwards and errorBuffer supplies the failure text.
282 code = curl_easy_perform(conn);
283 curl_easy_cleanup(conn);
285 if (code != CURLE_OK)
287 fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
292 // Parse the (assumed) HTML code
294 parseHtml(buffer, title);
296 // Display the extracted title
298 printf("Title: %s\n", title.c_str());