base/i18n/file_util_icu.cc

   1 // Copyright 2012 The Chromium Authors
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // File utilities that use the ICU library go in this file.
   6
   7 #include "base/i18n/file_util_icu.h"
   8
   9 #include <stdint.h>
  10
  11 #include "base/check.h"
  12 #include "base/files/file_path.h"
  13 #include "base/i18n/icu_string_conversions.h"
  14 #include "base/i18n/string_compare.h"
  15 #include "base/memory/singleton.h"
  16 #include "base/numerics/safe_conversions.h"
  17 #include "base/strings/string_util.h"
  18 #include "base/strings/sys_string_conversions.h"
  19 #include "base/strings/utf_string_conversions.h"
  20 #include "build/build_config.h"
  21 #include "build/chromeos_buildflags.h"
  22 #include "third_party/icu/source/common/unicode/uniset.h"
  23 #include "third_party/icu/source/i18n/unicode/coll.h"
  24
  25 namespace base {
  26 namespace i18n {
  27
  28 namespace {
  29
  30 class IllegalCharacters {
  31  public:
  32   IllegalCharacters(const IllegalCharacters&) = delete;
  33   IllegalCharacters& operator=(const IllegalCharacters&) = delete;
  34
  35   static IllegalCharacters* GetInstance() {
  36     return Singleton<IllegalCharacters>::get();
  37   }
  38
  39   bool IsDisallowedEverywhere(UChar32 ucs4) const {
  40     return !!illegal_anywhere_.contains(ucs4);
  41   }
  42
  43   bool IsDisallowedLeadingOrTrailing(UChar32 ucs4) const {
  44     return !!illegal_at_ends_.contains(ucs4);
  45   }
  46
  47 #if BUILDFLAG(IS_WIN)
  48   bool IsDisallowedShortNameCharacter(UChar32 ucs4) const {
  49     return !!illegal_in_short_filenames_.contains(ucs4);
  50   }
  51
  52   bool IsDisallowedIfMayBeShortName(UChar32 ucs4) const {
  53     return !!required_to_be_a_short_filename_.contains(ucs4);
  54   }
  55
  56   template <typename StringT>
  57   bool HasValidDotPositionForShortName(const StringT& s) const {
  58     auto first_dot = s.find_first_of('.');
  59     // Short names are not required to have a "." period character...
  60     if (first_dot == std::string::npos) {
  61       return s.size() <= 8;
  62     }
  63     // ...but they must not contain more than one "." period character...
  64     if (first_dot != s.find_last_of('.')) {
  65       return false;
  66     }
  67     // ... and must contain a basename of 1-8 characters, optionally with one
  68     // "." period character followed by an extension no more than 3 characters
  69     // in length.
  70     return first_dot > 0 && first_dot <= 8 && first_dot + 4 >= s.size();
  71   }
  72
  73   // Returns whether `s` could possibly be in the 8.3 name format AND contains a
  74   // '~' character, which may interact poorly with short filenames on VFAT. See
  75   // https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-cifs/09c2ccc8-4aaf-439f-9b4e-13b3fe85a4cf.
  76   bool CouldBeInvalidShortName(const std::u16string& s) const {
  77     if (s.size() > 12 ||
  78         !required_to_be_a_short_filename_.containsSome(icu::UnicodeString(
  79             /*isTerminated=*/false, s.c_str(), s.size())) ||
  80         !illegal_in_short_filenames_.containsNone(
  81             icu::UnicodeString(/*isTerminated=*/false, s.c_str(), s.size()))) {
  82       return false;
  83     }
  84     return HasValidDotPositionForShortName<std::u16string>(s);
  85   }
  86 #endif
  87
  88   bool IsAllowedName(const std::u16string& s) const {
  89     return s.empty() || (!!illegal_anywhere_.containsNone(icu::UnicodeString(
  90                              /*isTerminated=*/false, s.c_str(), s.size())) &&
  91                          !illegal_at_ends_.contains(*s.begin()) &&
  92                          !illegal_at_ends_.contains(*s.rbegin())
  93 #if BUILDFLAG(IS_WIN)
  94                          && !CouldBeInvalidShortName(s)
  95 #endif
  96                         );
  97   }
  98
  99  private:
 100   friend struct DefaultSingletonTraits<IllegalCharacters>;
 101
 102   IllegalCharacters();
 103   ~IllegalCharacters() = default;
 104
 105   // Set of characters considered invalid anywhere inside a filename.
 106   icu::UnicodeSet illegal_anywhere_;
 107
 108   // Set of characters considered invalid at either end of a filename.
 109   icu::UnicodeSet illegal_at_ends_;
 110
 111   // #if BUILDFLAG(IS_WIN)
 112   // Set of characters which are guaranteed to exist if the filename is to be of
 113   // the problematic VFAT 8.3 short filename format.
 114   icu::UnicodeSet required_to_be_a_short_filename_;
 115   // Set of characters which are not allowed in VFAT 8.3 short filenames. If
 116   // any of these characters are present, the file cannot be a short filename.
 117   icu::UnicodeSet illegal_in_short_filenames_;
 118   // #endif
 119 };
 120
 121 IllegalCharacters::IllegalCharacters() {
 122   UErrorCode status = U_ZERO_ERROR;
 123   // Control characters, formatting characters, non-characters, path separators,
 124   // and some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
 125   // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
 126   // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
 127   // Note that code points in the "Other, Format" (Cf) category are ignored on
 128   // HFS+ despite the ZERO_WIDTH_JOINER and ZERO_WIDTH_NON-JOINER being
 129   // legitimate in Arabic and some S/SE Asian scripts. In addition tilde (~) is
 130   // also excluded in some circumstances due to the possibility of interacting
 131   // poorly with short filenames on VFAT. (Related to CVE-2014-9390)
 132   illegal_anywhere_ = icu::UnicodeSet(
 133       UNICODE_STRING_SIMPLE("[[\"*/:<>?\\\\|][:Cc:][:Cf:]]"), status);
 134   DCHECK(U_SUCCESS(status));
 135   // Add non-characters. If this becomes a performance bottleneck by
 136   // any chance, do not add these to |set| and change IsFilenameLegal()
 137   // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addition to calling
 138   // IsAllowedName().
 139   illegal_anywhere_.add(0xFDD0, 0xFDEF);
 140   for (int i = 0; i <= 0x10; ++i) {
 141     int plane_base = 0x10000 * i;
 142     illegal_anywhere_.add(plane_base + 0xFFFE, plane_base + 0xFFFF);
 143   }
 144   illegal_anywhere_.freeze();
 145
 146   illegal_at_ends_ =
 147       icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[:WSpace:][.~]]"), status);
 148   DCHECK(U_SUCCESS(status));
 149   illegal_at_ends_.freeze();
 150
 151 #if BUILDFLAG(IS_WIN)
 152   required_to_be_a_short_filename_ =
 153       icu::UnicodeSet(UNICODE_STRING_SIMPLE("[[~]]"), status);
 154   DCHECK(U_SUCCESS(status));
 155   required_to_be_a_short_filename_.freeze();
 156
 157   illegal_in_short_filenames_ = icu::UnicodeSet(
 158       UNICODE_STRING_SIMPLE("[[:WSpace:][\"\\/[]:+|<>=;?,*]]"), status);
 159   DCHECK(U_SUCCESS(status));
 160   illegal_in_short_filenames_.freeze();
 161 #endif
 162 }
 163
 164 // Returns the code point at position |cursor| in |file_name|, and increments
 165 // |cursor| to the next position.
 166 UChar32 GetNextCodePoint(const FilePath::StringType* const file_name,
 167                          int& cursor) {
 168   UChar32 code_point;
 169 #if BUILDFLAG(IS_WIN)
 170   // Windows uses UTF-16 encoding for filenames.
 171   U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
 172            code_point);
 173 #elif BUILDFLAG(IS_POSIX) || BUILDFLAG(IS_FUCHSIA)
 174   // Mac and Chrome OS use UTF-8 encoding for filenames.
 175   // Linux doesn't actually define file system encoding. Try to parse as
 176   // UTF-8.
 177   U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
 178           code_point);
 179 #else
 180 #error Unsupported platform
 181 #endif
 182   return code_point;
 183 }
 184
 185 }  // namespace
 186
 187 bool IsFilenameLegal(const std::u16string& file_name) {
 188   return IllegalCharacters::GetInstance()->IsAllowedName(file_name);
 189 }
 190
 191 void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
 192                                     char replace_char) {
 193   IllegalCharacters* illegal = IllegalCharacters::GetInstance();
 194
 195   DCHECK(!(illegal->IsDisallowedEverywhere(replace_char)));
 196   const bool is_replace_char_illegal_at_ends =
 197       illegal->IsDisallowedLeadingOrTrailing(replace_char);
 198 #if BUILDFLAG(IS_WIN)
 199   bool could_be_short_name =
 200       file_name->size() <= 12 &&
 201       illegal->HasValidDotPositionForShortName<FilePath::StringType>(
 202           *file_name);
 203 #endif
 204   // Keep track of the earliest and latest legal begin/end characters and file-
 205   // extension separator encountered, -1 if none yet.
 206   int unreplaced_legal_range_begin = -1;
 207   int unreplaced_legal_range_end = -1;
 208   int last_extension_separator = -1;
 209   static const UChar32 kExtensionSeparator =
 210       checked_cast<UChar32>(FilePath::kExtensionSeparator);
 211
 212   int cursor = 0;  // The ICU macros expect an int.
 213
 214 #if BUILDFLAG(IS_WIN)
 215   // Loop through the file name, looking for any characters which are invalid in
 216   // an 8.3 short file name. If any of these characters exist, it's not an 8.3
 217   // file name and we don't need to replace the '~' character.
 218   while (could_be_short_name && cursor < static_cast<int>(file_name->size())) {
 219     const UChar32 code_point = GetNextCodePoint(file_name, cursor);
 220     could_be_short_name = !illegal->IsDisallowedShortNameCharacter(code_point);
 221   }
 222 #endif
 223
 224   cursor = 0;
 225   while (cursor < static_cast<int>(file_name->size())) {
 226     int char_begin = cursor;
 227     const UChar32 code_point = GetNextCodePoint(file_name, cursor);
 228
 229     const bool is_illegal_at_ends =
 230         illegal->IsDisallowedLeadingOrTrailing(code_point);
 231
 232     if (illegal->IsDisallowedEverywhere(code_point) ||
 233 #if BUILDFLAG(IS_WIN)
 234         (could_be_short_name &&
 235          illegal->IsDisallowedIfMayBeShortName(code_point)) ||
 236 #endif
 237         ((char_begin == 0 || cursor == static_cast<int>(file_name->length())) &&
 238          is_illegal_at_ends && !is_replace_char_illegal_at_ends)) {
 239       file_name->replace(char_begin, cursor - char_begin, 1, replace_char);
 240       // We just made the potentially multi-byte/word char into one that only
 241       // takes one byte/word, so need to adjust the cursor to point to the next
 242       // character again.
 243       cursor = char_begin + 1;
 244     } else if (!is_illegal_at_ends) {
 245       if (unreplaced_legal_range_begin == -1)
 246         unreplaced_legal_range_begin = char_begin;
 247       unreplaced_legal_range_end = cursor;
 248     }
 249
 250     if (code_point == kExtensionSeparator)
 251       last_extension_separator = char_begin;
 252   }
 253
 254   // If |replace_char| is not a legal starting/ending character, ensure that
 255   // |replace_char| is not the first nor last character in |file_name|.
 256   if (is_replace_char_illegal_at_ends) {
 257     if (unreplaced_legal_range_begin == -1) {
 258       // |file_name| has no characters that are legal at ends; enclose in '_'s.
 259       file_name->insert(file_name->begin(), FILE_PATH_LITERAL('_'));
 260       file_name->append(FILE_PATH_LITERAL("_"));
 261     } else {
 262       // Trim trailing instances of |replace_char| and other characters that are
 263       // illegal at ends.
 264       file_name->erase(unreplaced_legal_range_end, FilePath::StringType::npos);
 265
 266       // Trim leading instances of |replace_char| and other characters that are
 267       // illegal at ends, while ensuring that the file-extension separator is
 268       // not removed if present. The file-extension separator is considered the
 269       // last '.' in |file_name| followed by a legal character.
 270       if (last_extension_separator != -1 &&
 271           last_extension_separator == unreplaced_legal_range_begin - 1) {
 272         // If the file-extension separator is at the start of the resulting
 273         // |file_name|, prepend '_' instead of trimming it, e.g.,
 274         // "***.txt" -> "_.txt".
 275         file_name->erase(0, last_extension_separator);
 276         file_name->insert(file_name->begin(), FILE_PATH_LITERAL('_'));
 277       } else {
 278         file_name->erase(0, unreplaced_legal_range_begin);
 279       }
 280     }
 281     DCHECK(!file_name->empty());
 282   }
 283 }
 284
 285 bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
 286   UErrorCode error_code = U_ZERO_ERROR;
 287   // Use the default collator. The default locale should have been properly
 288   // set by the time this constructor is called.
 289   std::unique_ptr<icu::Collator> collator(
 290       icu::Collator::createInstance(error_code));
 291   DCHECK(U_SUCCESS(error_code));
 292   // Make it case-sensitive.
 293   collator->setStrength(icu::Collator::TERTIARY);
 294
 295 #if BUILDFLAG(IS_WIN)
 296   return CompareString16WithCollator(*collator, AsStringPiece16(a.value()),
 297                                      AsStringPiece16(b.value())) == UCOL_LESS;
 298
 299 #elif BUILDFLAG(IS_POSIX) || BUILDFLAG(IS_FUCHSIA)
 300   // On linux, the file system encoding is not defined. We assume
 301   // SysNativeMBToWide takes care of it.
 302   return CompareString16WithCollator(
 303              *collator, WideToUTF16(SysNativeMBToWide(a.value())),
 304              WideToUTF16(SysNativeMBToWide(b.value()))) == UCOL_LESS;
 305 #endif
 306 }
 307
 308 void NormalizeFileNameEncoding(FilePath* file_name) {
 309 #if BUILDFLAG(IS_CHROMEOS_ASH)
 310   std::string normalized_str;
 311   if (ConvertToUtf8AndNormalize(file_name->BaseName().value(), kCodepageUTF8,
 312                                 &normalized_str) &&
 313       !normalized_str.empty()) {
 314     *file_name = file_name->DirName().Append(FilePath(normalized_str));
 315   }
 316 #endif
 317 }
 318
 319 }  // namespace i18n
 320 }  // namespace base