1 // Copyright 2014 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/search_engines/template_url_parser.h"
12 #include "base/functional/bind.h"
13 #include "base/logging.h"
14 #include "base/strings/string_number_conversions.h"
15 #include "base/strings/string_util.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/values.h"
18 #include "components/search_engines/search_terms_data.h"
19 #include "components/search_engines/template_url.h"
20 #include "services/data_decoder/public/cpp/data_decoder.h"
21 #include "services/data_decoder/public/cpp/safe_xml_parser.h"
22 #include "services/data_decoder/public/mojom/xml_parser.mojom.h"
23 #include "ui/gfx/favicon_size.h"
25 #include "url/url_constants.h"
29 // Element names recognized in an OpenSearch description (OSD) document:
30 const char kURLElement[] = "Url";
31 const char kParamElement[] = "Param";
32 const char kShortNameElement[] = "ShortName";
33 const char kImageElement[] = "Image";
// The two root tags accepted by SafeTemplateURLParser::OnXmlParseComplete();
// for any other root tag the parse callback is run with nullptr.
34 const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
35 const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
36 const char kInputEncodingElement[] = "InputEncoding";
37 const char kAliasElement[] = "Alias";
39 // Various XML attributes used.
// Attributes read from a Url element.
40 const char kURLTypeAttribute[] = "type";
41 const char kURLTemplateAttribute[] = "template";
// Attributes read from an Image element.
42 const char kImageTypeAttribute[] = "type";
43 const char kImageWidthAttribute[] = "width";
44 const char kImageHeightAttribute[] = "height";
// Attributes read from a Param element.
45 const char kParamNameAttribute[] = "name";
46 const char kParamValueAttribute[] = "value";
// Despite the Param prefix in its name, ParseURLs() reads this attribute from
// the Url element itself.
47 const char kParamMethodAttribute[] = "method";
49 // Mime type for search results. A Url element of this type supplies the
// search URL (see ParseURLs()).
50 const char kHTMLType[] = "text/html";
52 // Mime type for as you type suggestions. A Url element of this type supplies
// the suggestions URL (see ParseURLs()).
53 const char kSuggestionType[] = "application/x-suggestions+json";
55 // Returns true if input_encoding contains a valid input encoding string. This
56 // doesn't verify that we have a valid encoding for the string, just that the
57 // string contains characters that constitute a valid input encoding.
//
// The checks below test, in order, for: an empty string; a first character
// that is not an ASCII letter; and any later character that is not an ASCII
// letter, an ASCII digit, a dot, an underscore or a hyphen.
// NOTE(review): the statements guarded by these checks are not visible here;
// presumably each one rejects the string -- confirm.
58 bool IsValidEncodingString(const std::string& input_encoding) {
59 if (input_encoding.empty())
// The first character is held to a stricter rule than the rest: letters only.
62 if (!base::IsAsciiAlpha(input_encoding[0]))
// Scan starts at index 1 because index 0 was checked above.
65 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
66 char c = input_encoding[i];
67 if (!base::IsAsciiAlpha(c) && !base::IsAsciiDigit(c) &&
68 c != '.' && c != '_' && c != '-') {
// Helper used by SafeTemplateURLParser::ParseURLs() to add one |key|/|value|
// pair to the query string it is rebuilding; callers pass the address of a
// std::string as the third argument.
// NOTE(review): the third parameter and the body are not visible here --
// confirm the separator and escaping behavior against the full definition.
75 void AppendParamToQuery(const std::string& key,
76 const std::string& value,
87 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S].
// FinalizeTemplateURL() applies this to both the search URL and the
// suggestions URL.
88 bool IsHTTPRef(const std::string& url) {
// NOTE(review): |gurl| is presumably a GURL built from |url| on a line not
// visible here, after the empty-string case has been handled -- confirm.
92 return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) ||
93 gurl.SchemeIs(url::kHttpsScheme));
96 // SafeTemplateURLParser takes the output of the data_decoder service's
97 // XmlParser and extracts the data from the search description into a
// TemplateURL. The result is passed to the ParseCallback supplied at
// construction; nullptr is passed instead when the XML fails to parse, the
// root tag is unexpected, or no Url element is found.
99 class SafeTemplateURLParser {
106 // Key/value of a Param node.
107 using Param = std::pair<std::string, std::string>;
// Takes a snapshot of |search_terms_data| via SearchTermsData::MakeSnapshot(),
// copies |parameter_filter| and takes ownership of |callback|.
// ParseURLs() checks parameter_filter_.is_null() before running the filter,
// so a null filter is supported.
109 SafeTemplateURLParser(
110 const SearchTermsData* search_terms_data,
111 const TemplateURLParser::ParameterFilter& parameter_filter,
112 TemplateURLParser::ParseCallback callback)
113 : search_terms_data_(SearchTermsData::MakeSnapshot(search_terms_data)),
114 parameter_filter_(parameter_filter),
115 callback_(std::move(callback)) {}
// Not copyable.
117 SafeTemplateURLParser(const SafeTemplateURLParser&) = delete;
118 SafeTemplateURLParser& operator=(const SafeTemplateURLParser&) = delete;
120 // Parse callback for DataDecoder::ParseXml(). This calls the callback
121 // passed to the constructor upon completion.
122 void OnXmlParseComplete(
123 data_decoder::DataDecoder::ValueOrError value_or_error);
// Per-tag helpers called from OnXmlParseComplete(). Each receives the child
// elements found for one tag and records what it extracts:
//   ParseURLs      -> search URL, suggestions URL and suggestion_method_.
//   ParseImages    -> data_.favicon_url or derive_image_from_url_.
//   ParseEncodings -> data_.input_encodings.
//   ParseAliases   -> keyword and has_custom_keyword_.
126 void ParseURLs(const std::vector<const base::Value*>& urls);
127 void ParseImages(const std::vector<const base::Value*>& images);
128 void ParseEncodings(const std::vector<const base::Value*>& encodings);
129 void ParseAliases(const std::vector<const base::Value*>& aliases);
// Fills in derived fields (favicon, keyword, short name) and builds a
// TemplateURL from data_; its result is what OnXmlParseComplete() passes to
// the callback on the success path.
131 std::unique_ptr<TemplateURL> FinalizeTemplateURL();
133 // Returns all child elements of |elem| named |tag|, which are searched
134 // for using the XML qualified namespaces in |namespaces_|.
// NOTE(review): callers treat a false return as no matching child found --
// confirm against the full definition.
135 bool GetChildElementsByTag(const base::Value& elem,
136 const std::string& tag,
137 std::vector<const base::Value*>* children);
139 // Data that gets updated as we parse, and is converted to a TemplateURL by
140 // FinalizeTemplateURL().
141 TemplateURLData data_;
143 // The HTTP methods used.
// FinalizeTemplateURL() logs an error when method_ is POST and clears the
// suggestions URL when suggestion_method_ is POST.
144 Method method_ = GET;
145 Method suggestion_method_ = GET;
147 // If true, the user has set a keyword and we should use it. Otherwise,
148 // we generate a keyword based on the URL.
149 bool has_custom_keyword_ = false;
151 // Whether we should derive the image from the URL (when images are data
// URLs; see ParseImages()).
153 bool derive_image_from_url_ = false;
155 // The XML namespaces that were declared on the root element. These are used
156 // to search for tags by name in GetChildElementsByTag(). Will always contain
157 // at least one element, if only the empty string.
158 std::vector<std::string> namespaces_;
160 // We have to own our own snapshot, because the parse request may outlive the
161 // originally provided SearchTermsData lifetime.
162 std::unique_ptr<SearchTermsData> search_terms_data_;
// Optional predicate run on each query parameter as Run(key, value); a
// parameter is kept only when it returns true. May be null (no filtering).
164 TemplateURLParser::ParameterFilter parameter_filter_;
// Completion callback; consumed with std::move(callback_).Run(...) by
// OnXmlParseComplete().
165 TemplateURLParser::ParseCallback callback_;
// Entry point invoked with the XML parse result. Walks the parsed document,
// fills in data_ from the recognized child elements and finally hands the
// outcome to callback_: a TemplateURL from FinalizeTemplateURL() on the
// success path, or nullptr when the XML failed to parse, the root tag is not
// one of the two accepted ones, or no Url element is present.
168 void SafeTemplateURLParser::OnXmlParseComplete(
169 data_decoder::DataDecoder::ValueOrError value_or_error) {
// The decoder reported an error: log it and report failure.
170 if (!value_or_error.has_value()) {
171 DLOG(ERROR) << "Failed to parse XML: " << value_or_error.error();
172 std::move(callback_).Run(nullptr);
176 const base::Value& root = *value_or_error;
178 // Get the namespaces used in the XML document, which will be used
179 // to access nodes by tag name in GetChildElementsByTag().
180 if (const base::Value::Dict* namespaces = root.GetDict().FindDict(
181 data_decoder::mojom::XmlParser::kNamespacesKey)) {
// Only the dictionary keys are recorded; the mapped values are not used.
182 for (auto item : *namespaces) {
183 namespaces_.push_back(item.first);
// Keep the invariant documented on namespaces_: always at least one entry, so
// GetChildElementsByTag() still searches with the empty namespace.
186 if (namespaces_.empty())
187 namespaces_.emplace_back();
// Only OpenSearchDescription and SearchPlugin roots are accepted.
189 std::string root_tag;
190 if (!data_decoder::GetXmlElementTagName(root, &root_tag) ||
191 (root_tag != kOpenSearchDescriptionElement &&
192 root_tag != kFirefoxSearchDescriptionElement)) {
193 DLOG(ERROR) << "Unexpected root tag: " << root_tag;
194 std::move(callback_).Run(nullptr);
198 // The only required element is the URL.
199 std::vector<const base::Value*> urls;
200 if (!GetChildElementsByTag(root, kURLElement, &urls)) {
201 std::move(callback_).Run(nullptr);
// The remaining elements are optional; each is processed only when present.
206 std::vector<const base::Value*> images;
207 if (GetChildElementsByTag(root, kImageElement, &images))
210 std::vector<const base::Value*> encodings;
211 if (GetChildElementsByTag(root, kInputEncodingElement, &encodings))
212 ParseEncodings(encodings);
214 std::vector<const base::Value*> aliases;
215 if (GetChildElementsByTag(root, kAliasElement, &aliases))
216 ParseAliases(aliases);
// When several ShortName elements exist, the last one is used.
218 std::vector<const base::Value*> short_names;
219 if (GetChildElementsByTag(root, kShortNameElement, &short_names)) {
221 if (data_decoder::GetXmlElementText(*short_names.back(), &name))
222 data_.SetShortName(base::UTF8ToUTF16(name));
// Success path: validate and build the TemplateURL, then report it.
225 std::move(callback_).Run(FinalizeTemplateURL());
// Processes every Url element in |urls|. For each one it reads the template,
// type and method attributes, stores the template as the search URL
// (text/html) or the suggestions URL (suggestions type), collects its Param
// children, and then rewrites the stored URL query so that it contains only
// parameters accepted by parameter_filter_ plus the collected Param values.
228 void SafeTemplateURLParser::ParseURLs(
229 const std::vector<const base::Value*>& urls) {
230 for (auto* url_value : urls) {
231 std::string template_url =
232 data_decoder::GetXmlElementAttribute(*url_value, kURLTemplateAttribute);
234 data_decoder::GetXmlElementAttribute(*url_value, kURLTypeAttribute);
// The method attribute is compared case-insensitively.
// NOTE(review): the comparison operand is on a line not visible here.
235 bool is_post = base::EqualsCaseInsensitiveASCII(
236 data_decoder::GetXmlElementAttribute(*url_value, kParamMethodAttribute),
238 bool is_html_url = (type == kHTMLType);
239 bool is_suggest_url = (type == kSuggestionType);
// A text/html Url with a non-empty template becomes the search URL.
241 if (is_html_url && !template_url.empty()) {
242 data_.SetURL(template_url);
243 is_suggest_url = false;
// A suggestions-type Url becomes the suggestions URL.
246 } else if (is_suggest_url) {
247 data_.suggestions_url = template_url;
// NOTE(review): presumably guarded by is_post on a line not visible here.
249 suggestion_method_ = POST;
// Collect the Param children that have usable values and pass the filter.
252 std::vector<Param> extra_params;
254 std::vector<const base::Value*> params;
// NOTE(review): the last argument below looks like a mis-encoded address-of
// expression for the |params| vector declared above -- confirm and repair.
255 GetChildElementsByTag(*url_value, kParamElement, ¶ms);
256 for (auto* param : params) {
258 data_decoder::GetXmlElementAttribute(*param, kParamNameAttribute);
260 data_decoder::GetXmlElementAttribute(*param, kParamValueAttribute);
// With no filter every Param is kept; otherwise the filter decides.
262 (parameter_filter_.is_null() || parameter_filter_.Run(key, value))) {
263 extra_params.push_back(Param(key, value));
// The query only needs rebuilding when a filter may drop parameters or when
// there are Param values to append.
267 if (!parameter_filter_.is_null() || !extra_params.empty()) {
268 GURL url(is_suggest_url ? data_.suggestions_url : data_.url());
272 // If there is a parameter filter, parse the existing URL and remove any
273 // unwanted parameter.
274 std::string new_query;
275 bool modified = false;
276 if (!parameter_filter_.is_null()) {
277 url::Component query = url.parsed_for_possibly_invalid_spec().query;
278 url::Component key, value;
// |key| and |value| are offsets into the spec string, so the substrings are
// built from |url_spec| below.
279 const char* url_spec = url.spec().c_str();
280 while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
281 std::string key_str(url_spec, key.begin, key.len);
282 std::string value_str(url_spec, value.begin, value.len);
// Keep only the parameters the filter accepts.
283 if (parameter_filter_.Run(key_str, value_str)) {
284 AppendParamToQuery(key_str, value_str, &new_query);
// Start from the existing query unchanged.
// NOTE(review): presumably the no-filter branch; the else is not visible.
291 new_query = url.query();
293 // Add the extra parameters if any.
294 if (!extra_params.empty()) {
296 for (const auto& iter : extra_params)
297 AppendParamToQuery(iter.first, iter.second, &new_query);
// Swap the rebuilt query into the URL and store it back in the same slot it
// was read from.
301 GURL::Replacements repl;
302 repl.SetQueryStr(new_query);
303 url = url.ReplaceComponents(repl);
305 data_.suggestions_url = url.spec();
306 else if (url.is_valid())
307 data_.SetURL(url.spec());
// Examines each Image element in |images|. The element text is the image URL
// and the type, width and height attributes describe it. A data: URL only
// sets derive_image_from_url_; an http(s) URL that qualifies as a favicon is
// stored in data_.favicon_url.
313 void SafeTemplateURLParser::ParseImages(
314 const std::vector<const base::Value*>& images) {
315 for (auto* image : images) {
316 std::string url_string;
317 if (!data_decoder::GetXmlElementText(*image, &url_string))
321 data_decoder::GetXmlElementAttribute(*image, kImageTypeAttribute);
325 data_decoder::GetXmlElementAttribute(*image, kImageWidthAttribute),
328 data_decoder::GetXmlElementAttribute(*image, kImageHeightAttribute),
// A favicon candidate must be exactly gfx::kFaviconSize square and use one of
// the two ICO mime types.
331 bool image_is_valid_for_favicon =
332 (width == gfx::kFaviconSize) && (height == gfx::kFaviconSize) &&
333 ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));
335 GURL image_url(url_string);
337 if (image_url.SchemeIs(url::kDataScheme)) {
338 // TODO(jcampan): bug 1169256: when dealing with data URL, we need to
339 // decode the data URL in the renderer. For now, we'll just point to the
340 // favicon from the URL.
341 derive_image_from_url_ = true;
// Only valid http or https image URLs are accepted as the favicon.
342 } else if (image_is_valid_for_favicon && image_url.is_valid() &&
343 (image_url.SchemeIs(url::kHttpScheme) ||
344 image_url.SchemeIs(url::kHttpsScheme))) {
345 data_.favicon_url = image_url;
// NOTE(review): image_is_valid_for_favicon is declared inside the loop above,
// so this assignment looks like a dead store -- confirm.
347 image_is_valid_for_favicon = false;
// Appends the text of each InputEncoding element in |encodings| to
// data_.input_encodings, skipping elements whose text cannot be read or does
// not pass IsValidEncodingString().
351 void SafeTemplateURLParser::ParseEncodings(
352 const std::vector<const base::Value*>& encodings) {
353 for (auto* encoding : encodings) {
354 std::string encoding_value;
355 if (data_decoder::GetXmlElementText(*encoding, &encoding_value)) {
356 if (IsValidEncodingString(encoding_value))
357 data_.input_encodings.push_back(encoding_value);
// Uses the text of each Alias element in |aliases| as the keyword. Every
// readable alias overwrites the previous one, so the last one wins, and
// has_custom_keyword_ is set so FinalizeTemplateURL() does not generate a
// keyword from the URL.
362 void SafeTemplateURLParser::ParseAliases(
363 const std::vector<const base::Value*>& aliases) {
364 for (auto* alias : aliases) {
365 std::string alias_value;
366 if (data_decoder::GetXmlElementText(*alias, &alias_value)) {
367 data_.SetKeyword(base::UTF8ToUTF16(alias_value));
368 has_custom_keyword_ = true;
// Completes data_ and converts it into a TemplateURL. Checks the request
// method and URL schemes, drops a POST-only suggestions URL, derives the
// favicon, keyword and short name when they were not supplied, and finally
// validates the resulting TemplateURL against search_terms_data_.
// NOTE(review): the return statements are not visible here; presumably
// nullptr is returned after each logged error -- confirm.
373 std::unique_ptr<TemplateURL> SafeTemplateURLParser::FinalizeTemplateURL() {
374 // TODO(https://crbug.com/18107): Support engines that use POST.
// NOTE(review): the message logged below also fires when either URL is not
// empty-or-HTTP(S), not only for POST -- consider a more precise message.
375 if (method_ == POST || !IsHTTPRef(data_.url()) ||
376 !IsHTTPRef(data_.suggestions_url)) {
377 DLOG(ERROR) << "POST URLs are not supported";
// A POST suggestions URL is dropped rather than failing the whole engine.
380 if (suggestion_method_ == POST)
381 data_.suggestions_url.clear();
383 // If the image was a data URL, use the favicon from the search URL instead.
384 // (see the TODO in ParseImages()).
385 GURL search_url(data_.url());
// An explicit favicon URL already stored in data_ takes precedence.
386 if (derive_image_from_url_ && data_.favicon_url.is_empty())
387 data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url);
389 // Generate a keyword for this search engine if a custom one was not present
390 // in the imported data.
391 if (!has_custom_keyword_)
392 data_.SetKeyword(TemplateURL::GenerateKeyword(search_url));
394 // If the OSDD omits or has an empty short name, use the keyword.
395 if (data_.short_name().empty())
396 data_.SetShortName(data_.keyword());
398 // Bail if the search URL is empty or if either TemplateURLRef is invalid.
399 std::unique_ptr<TemplateURL> template_url =
400 std::make_unique<TemplateURL>(data_);
// The suggestions URL is validated only when one is present.
401 if (template_url->url().empty() ||
402 !template_url->url_ref().IsValid(*search_terms_data_) ||
403 (!template_url->suggestions_url().empty() &&
404 !template_url->suggestions_url_ref().IsValid(*search_terms_data_))) {
405 DLOG(ERROR) << "Template URL is not valid";
// Looks up the children of |elem| named |tag| once per entry in namespaces_,
// using the namespace-qualified form of the tag each time, and stores the
// matches in |children|.
// NOTE(review): how results from several namespaces are combined and what the
// bool return signifies are decided on lines not visible here -- confirm.
412 bool SafeTemplateURLParser::GetChildElementsByTag(
413 const base::Value& elem,
414 const std::string& tag,
415 std::vector<const base::Value*>* children) {
417 for (const auto& ns : namespaces_) {
// Build the qualified name for this namespace before searching.
418 std::string name = data_decoder::GetXmlQualifiedName(ns, tag);
420 data_decoder::GetAllXmlElementChildrenWithTag(elem, name, children);
427 // TemplateURLParser ----------------------------------------------------------
// Parses the OpenSearch description in |data| asynchronously using
// DataDecoder::ParseXmlIsolated(), with whitespace ignored.
// |completion_callback| is handed to a newly created SafeTemplateURLParser,
// which runs it from OnXmlParseComplete(). |parameter_filter| is forwarded to
// that parser, which checks it for null before use.
430 void TemplateURLParser::Parse(const SearchTermsData* search_terms_data,
431 const std::string& data,
432 const ParameterFilter& parameter_filter,
433 ParseCallback completion_callback) {
434 auto safe_parser = std::make_unique<SafeTemplateURLParser>(
435 search_terms_data, parameter_filter, std::move(completion_callback));
// Ownership of the parser is moved into the bound callback.
436 data_decoder::DataDecoder::ParseXmlIsolated(
437 data, data_decoder::mojom::XmlParser::WhitespaceBehavior::kIgnore,
438 base::BindOnce(&SafeTemplateURLParser::OnXmlParseComplete,
439 std::move(safe_parser)));
// Same as Parse(), except that the XML is parsed through the caller-supplied
// |data_decoder| instance via its ParseXml() method instead of
// DataDecoder::ParseXmlIsolated(). |data_decoder| is dereferenced without a
// null check.
443 void TemplateURLParser::ParseWithDataDecoder(
444 data_decoder::DataDecoder* data_decoder,
445 const SearchTermsData* search_terms_data,
446 const std::string& data,
447 const ParameterFilter& parameter_filter,
448 ParseCallback completion_callback) {
449 auto safe_parser = std::make_unique<SafeTemplateURLParser>(
450 search_terms_data, parameter_filter, std::move(completion_callback));
// Ownership of the parser is moved into the bound callback.
451 data_decoder->ParseXml(
452 data, data_decoder::mojom::XmlParser::WhitespaceBehavior::kIgnore,
453 base::BindOnce(&SafeTemplateURLParser::OnXmlParseComplete,
454 std::move(safe_parser)));