1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
5 * Copyright (C) 2012 Igalia S.L.
14 #include <glib/gi18n-lib.h>
18 #include "soup-tld-private.h"
22 * @short_description: Top-Level Domain Utilities
24 * These functions can be used to parse hostnames to attempt to determine
25 * what part of the name belongs to the domain owner, and what part is
26 * simply a "public suffix" such as ".com".
29 static void soup_tld_ensure_rules_hash_table (void);
30 static const char *soup_tld_get_base_domain_internal (const char *hostname,
31 guint additional_domains,
34 static GHashTable *rules = NULL;
35 static SoupTLDEntry tld_entries[] = {
36 #include "tld_data.inc"
39 /* Stores the entries data in a hash table to ease and speed up
43 soup_tld_ensure_rules_hash_table (void)
45 static gsize init = 0;
47 if (g_once_init_enter (&init)) {
50 rules = g_hash_table_new (g_str_hash, g_str_equal);
51 for (i = 0; i < G_N_ELEMENTS (tld_entries); ++i)
52 g_hash_table_insert (rules, tld_entries[i].domain,
53 &(tld_entries[i].flags));
54 g_once_init_leave (&init, 1);
59 * soup_tld_get_base_domain:
60 * @hostname: a hostname
61 * @error: return location for a #GError, or %NULL to ignore
62 * errors. See #SoupTLDError for the available error codes
64 * Finds the base domain for a given @hostname. The base domain is
65 * composed of the top-level domain (such as .org, .com, .co.uk, etc.)
66 * plus the second-level domain; for example, for myhost.mydomain.com
67 * it will return mydomain.com.
69 * Note that %NULL will be returned for private hostnames (those not
70 * ending with any well-known TLD) because choosing a base domain for
71 * them would be totally arbitrary.
73 * Prior to libsoup 2.46, this function required that @hostname be in
74 * UTF-8 if it was an IDN. From 2.46 on, the name can be in either
75 * UTF-8 or ASCII format (and the return value will be in the same
78 * Returns: a pointer to the start of the base domain in @hostname. If
79 * an error occurs, %NULL will be returned and @error set.
84 soup_tld_get_base_domain (const char *hostname, GError **error)
86 g_return_val_if_fail (hostname, NULL);
88 return soup_tld_get_base_domain_internal (hostname, 1, error);
92 * soup_tld_domain_is_public_suffix:
93 * @domain: a domain name
95 * Checks whether the @domain passed as argument is a public domain
96 * suffix (.org, .com, .co.uk, etc.) or not.
98 * Prior to libsoup 2.46, this function required that @domain be in
99 * UTF-8 if it was an IDN. From 2.46 on, the name can be in either
100 * UTF-8 or ASCII format (and the return value will be in the same
103 * Returns: %TRUE if @domain is a public suffix, %FALSE otherwise.
108 soup_tld_domain_is_public_suffix (const char *domain)
110 const char *base_domain;
111 GError *error = NULL;
113 g_return_val_if_fail (domain, FALSE);
115 /* Skip the leading '.' if present */
116 if (*domain == '.' && !*(++domain))
117 g_return_val_if_reached (FALSE);
119 base_domain = soup_tld_get_base_domain_internal (domain, 0, &error);
120 if (g_strcmp0 (domain, base_domain)) {
121 g_clear_error (&error);
125 if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NO_BASE_DOMAIN)) {
126 g_error_free (error);
130 if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_IS_IP_ADDRESS) ||
131 g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_INVALID_HOSTNAME)) {
132 g_error_free (error);
133 g_return_val_if_reached (FALSE);
136 g_clear_error (&error);
144 * The #GError domain for soup-tld-related errors.
150 * @SOUP_TLD_ERROR_INVALID_HOSTNAME: A hostname was syntactically
152 * @SOUP_TLD_ERROR_IS_IP_ADDRESS: The passed-in "hostname" was
153 * actually an IP address (and thus has no base domain or
155 * @SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS: The passed-in hostname
156 * did not have enough components, e.g. when calling
157 * soup_tld_get_base_domain() on <literal>"co.uk"</literal>.
158 * @SOUP_TLD_ERROR_NO_BASE_DOMAIN: The passed-in hostname has
159 * no recognized public suffix.
161 * Error codes for %SOUP_TLD_ERROR.
167 soup_tld_error_quark (void)
171 error = g_quark_from_static_string ("soup_tld_error_quark");
176 soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error)
178 char *prev_domain, *cur_domain, *next_dot;
180 const char *orig_hostname = NULL, *tld;
181 char *utf8_hostname = NULL;
183 soup_tld_ensure_rules_hash_table ();
185 if (g_hostname_is_ip_address (hostname)) {
186 g_set_error_literal (error, SOUP_TLD_ERROR,
187 SOUP_TLD_ERROR_IS_IP_ADDRESS,
188 _("Hostname is an IP address"));
192 if (g_hostname_is_ascii_encoded (hostname)) {
193 orig_hostname = hostname;
194 hostname = utf8_hostname = g_hostname_to_unicode (hostname);
196 g_set_error_literal (error, SOUP_TLD_ERROR,
197 SOUP_TLD_ERROR_INVALID_HOSTNAME,
198 _("Invalid hostname"));
203 cur_domain = (char *) hostname;
206 /* Process matching rules from longest to shortest. Logic
207 * based on Mozilla's implementation of nsEffectiveTLDService.
211 gboolean domain_found;
214 /* Valid hostnames neither start with a dot nor have more than one
217 if (*cur_domain == '.') {
218 g_set_error_literal (error, SOUP_TLD_ERROR,
219 SOUP_TLD_ERROR_INVALID_HOSTNAME,
220 _("Invalid hostname"));
221 g_free (utf8_hostname);
225 next_dot = strchr (cur_domain, '.');
226 domain_found = g_hash_table_lookup_extended (rules, cur_domain, (gpointer *) &orig_domain, (gpointer *) &flags);
227 /* We compare the keys just to be sure that we haven't hit a collision */
228 if (domain_found && !strncmp (orig_domain, cur_domain, strlen (orig_domain))) {
229 if (*flags & SOUP_TLD_RULE_MATCH_ALL) {
230 /* If we match a *. rule and there were no previous exceptions
231 * nor previous domains then treat it as an exact match.
233 tld = prev_domain ? prev_domain : cur_domain;
235 } else if (*flags == SOUP_TLD_RULE_NORMAL) {
238 } else if (*flags & SOUP_TLD_RULE_EXCEPTION) {
244 /* If we hit the top and haven't matched yet, then it
245 * has no public suffix.
248 g_set_error_literal (error, SOUP_TLD_ERROR,
249 SOUP_TLD_ERROR_NO_BASE_DOMAIN,
250 _("Hostname has no base domain"));
251 g_free (utf8_hostname);
255 prev_domain = cur_domain;
256 cur_domain = next_dot + 1;
263 /* Count the number of dots that appear after tld in
264 * utf8_hostname, and then find the corresponding spot
267 for (p = tld, dots = 0; *p; p++) {
272 for (p = orig_hostname + strlen (orig_hostname); p > orig_hostname; p--) {
273 if (*(p - 1) == '.') {
280 /* It's not possible for utf8_hostname to have had
281 * more dots than orig_hostname.
283 g_assert (dots == 0);
286 g_free (utf8_hostname);
287 hostname = orig_hostname;
290 /* Include the additional number of domains requested. */
291 add_domains = additional_domains;
292 while (tld != hostname) {
293 if (*(--tld) == '.' && (!(add_domains--))) {
300 /* If additional_domains > 0 then we haven't found enough additional domains. */
302 g_set_error_literal (error, SOUP_TLD_ERROR,
303 SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS,
304 _("Not enough domains"));