1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
5 * Copyright (C) 2012 Igalia S.L.
14 #include <glib/gi18n-lib.h>
18 #include "soup-tld-private.h"
22 * @short_description: Top-Level Domain Utilities
24 * These functions can be used to parse hostnames to attempt to determine
25 * what part of the name belongs to the domain owner, and what part is
26 * simply a "public suffix" such as ".com".
/* Forward declarations of the file-local helpers defined further down in this file. */
29 static void soup_tld_ensure_rules_hash_table (void);
30 static const char *soup_tld_get_base_domain_internal (const char *hostname,
31 guint additional_domains,
/* rules: lookup table, NULL until soup_tld_ensure_rules_hash_table() fills it.
 * tld_entries: static rule list whose contents are pulled in from tld_data.inc.
 */
34 static GHashTable *rules = NULL;
35 static SoupTLDEntry tld_entries[] = {
36 #include "tld_data.inc"
39 /* Stores the entries data in a hash table to ease and speed up
43 soup_tld_ensure_rules_hash_table (void)
/* Builds the global `rules` table from tld_entries[]. The
 * g_once_init_enter()/g_once_init_leave() pair below guards the
 * construction so that it is performed a single time.
 */
45 static gsize init = 0;
47 if (g_once_init_enter (&init)) {
/* g_hash_table_new() registers no key/value destroy functions: the
 * keys and values inserted below point into the static tld_entries[]
 * array and are not owned by the table.
 */
50 rules = g_hash_table_new (g_str_hash, g_str_equal);
51 for (i = 0; i < G_N_ELEMENTS (tld_entries); ++i)
/* Key: the entry's domain string; value: address of the entry's flags. */
52 g_hash_table_insert (rules, tld_entries[i].domain,
53 &(tld_entries[i].flags));
54 g_once_init_leave (&init, 1);
59 * soup_tld_get_base_domain:
60 * @hostname: a UTF-8 hostname in its canonical representation form
61 * @error: return location for a #GError, or %NULL to ignore
62 * errors. See #SoupTLDError for the available error codes
64 * Finds the base domain for a given @hostname. The base domain is
65 * composed of the top level domain (such as .org, .com, .co.uk, etc)
66 * plus the second level domain, for example for myhost.mydomain.com
67 * it will return mydomain.com.
69 * Note that %NULL will be returned for private URLs (those not ending
70 * with any well known TLD) because choosing a base domain for them
71 * would be totally arbitrary.
73 * This method only works for valid UTF-8 hostnames in their canonical
74 * representation form, so you should use g_hostname_to_unicode() to
75 * get the canonical representation if that is not the case.
77 * Returns: a pointer to the start of the base domain in @hostname. If
78 * an error occurs, %NULL will be returned and @error set.
83 soup_tld_get_base_domain (const char *hostname, GError **error)
/* Reject a NULL hostname and ASCII-encoded hostnames up front. The
 * function hands back a pointer into hostname, so the failure value of
 * both precondition checks is NULL.
 */
85 g_return_val_if_fail (hostname, NULL);
86 g_return_val_if_fail (!g_hostname_is_ascii_encoded (hostname), NULL);
/* Ask for one domain in addition to the public suffix. */
88 return soup_tld_get_base_domain_internal (hostname, 1, error);
92 * soup_tld_domain_is_public_suffix:
93 * @domain: a UTF-8 domain in its canonical representation form
95 * Checks whether the @domain passed as argument is a public domain
96 * suffix (.org, .com, .co.uk, etc) or not.
98 * This method only works for valid UTF-8 domains in their canonical
99 * representation form, so you should use g_hostname_to_unicode() to
100 * get the canonical representation if that is not the case.
102 * Returns: %TRUE if it is a public domain, %FALSE otherwise.
107 soup_tld_domain_is_public_suffix (const char *domain)
/* base_domain receives the lookup result; error receives the failure
 * reason, if any, from soup_tld_get_base_domain_internal().
 */
109 const char *base_domain;
110 GError *error = NULL;
112 g_return_val_if_fail (domain, FALSE);
114 /* Skip the leading '.' if present */
/* The pre-increment steps past the dot as a side effect; if nothing
 * follows it (the input was just "."), g_return_val_if_reached() is hit.
 */
115 if (*domain == '.' && !*(++domain))
116 g_return_val_if_reached (FALSE);
/* Zero additional domains requested: look up the public suffix alone. */
118 base_domain = soup_tld_get_base_domain_internal (domain, 0, &error);
/* g_strcmp0() accepts NULL, so a NULL base_domain also takes this
 * "not equal" branch.
 */
119 if (g_strcmp0 (domain, base_domain)) {
120 g_clear_error (&error);
/* NOTE(review): the error checks below are only reached when the
 * comparison above found the strings equal - confirm whether an error
 * can still be set at that point.
 */
124 if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NO_BASE_DOMAIN)) {
125 g_error_free (error);
/* IP-address and invalid-hostname errors go through
 * g_return_val_if_reached() instead of a plain return.
 */
129 if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_IS_IP_ADDRESS) ||
130 g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_INVALID_HOSTNAME)) {
131 g_error_free (error);
132 g_return_val_if_reached (FALSE);
135 g_clear_error (&error);
143 * The #GError domain for soup-tld-related errors.
149 * @SOUP_TLD_ERROR_INVALID_HOSTNAME: A hostname was syntactically
151 * @SOUP_TLD_ERROR_IS_IP_ADDRESS: The passed-in "hostname" was
152 * actually an IP address (and thus has no base domain or
154 * @SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS: The passed-in hostname
155 * did not have enough components. Eg, calling
156 * soup_tld_get_base_domain() on <literal>"co.uk"</literal>.
157 * @SOUP_TLD_ERROR_NO_BASE_DOMAIN: The passed-in hostname has
158 * no recognized public suffix.
160 * Error codes for %SOUP_TLD_ERROR.
166 soup_tld_error_quark (void)
/* The quark is obtained from the static string "soup_tld_error_quark"
 * and stored in `error`.
 */
170 error = g_quark_from_static_string ("soup_tld_error_quark");
175 soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error)
/* Worker shared by soup_tld_get_base_domain() (which passes 1) and
 * soup_tld_domain_is_public_suffix() (which passes 0).
 * additional_domains is the number of domains to include in front of
 * the matched public suffix; failures are reported through error.
 */
177 char *prev_domain, *cur_domain, *tld, *next_dot;
180 soup_tld_ensure_rules_hash_table ();
/* IP addresses are rejected before any rule matching is attempted. */
182 if (g_hostname_is_ip_address (hostname)) {
183 g_set_error_literal (error, SOUP_TLD_ERROR,
184 SOUP_TLD_ERROR_IS_IP_ADDRESS,
185 _("Hostname is an IP address"));
/* The cast drops const so the cursor has the same type as the other
 * char * locals; none of the statements visible here write through it.
 */
189 cur_domain = (char *) hostname;
192 /* Process matching rules from longest to shortest. Logic
193 * based on Mozilla's implementation of nsEffectiveTLDService.
197 gboolean domain_found;
200 /* Valid hostnames neither start with a dot nor have more than one
203 if (*cur_domain == '.') {
204 g_set_error_literal (error, SOUP_TLD_ERROR,
205 SOUP_TLD_ERROR_INVALID_HOSTNAME,
206 _("Invalid hostname"));
210 next_dot = strchr (cur_domain, '.');
211 domain_found = g_hash_table_lookup_extended (rules, cur_domain, (gpointer *) &orig_domain, (gpointer *) &flags);
212 /* We compare the keys just to be sure that we haven't hit a collision */
213 if (domain_found && !strncmp (orig_domain, cur_domain, strlen (orig_domain))) {
214 if (*flags & SOUP_TLD_RULE_MATCH_ALL) {
215 /* If we match a *. rule and there were no previous exceptions
216 * nor previous domains then treat it as an exact match.
218 tld = prev_domain ? prev_domain : cur_domain;
220 } else if (*flags == SOUP_TLD_RULE_NORMAL) {
223 } else if (*flags & SOUP_TLD_RULE_EXCEPTION) {
229 /* If we hit the top and haven't matched yet, then it
230 * has no public suffix.
233 g_set_error_literal (error, SOUP_TLD_ERROR,
234 SOUP_TLD_ERROR_NO_BASE_DOMAIN,
235 _("Hostname has no base domain"));
239 prev_domain = cur_domain;
240 cur_domain = next_dot + 1;
243 /* Include the additional number of domains requested. */
244 add_domains = additional_domains;
/* Walk backwards from the matched suffix towards the start of
 * hostname; each '.' crossed uses up one of the requested domains.
 */
245 while (tld != hostname) {
246 if (*(--tld) == '.' && (!(add_domains--))) {
253 /* If additional_domains > 0 then we haven't found enough additional domains. */
255 g_set_error_literal (error, SOUP_TLD_ERROR,
256 SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS,
257 _("Not enough domains"));