src/declarative/qml/ftw/qhashedstring.cpp

   1 /****************************************************************************
   2 **
   3 ** Copyright (C) 2012 Nokia Corporation and/or its subsidiary(-ies).
   4 ** All rights reserved.
   5 ** Contact: http://www.qt-project.org/
   6 **
   7 ** This file is part of the QtDeclarative module of the Qt Toolkit.
   8 **
   9 ** $QT_BEGIN_LICENSE:LGPL$
  10 ** GNU Lesser General Public License Usage
  11 ** This file may be used under the terms of the GNU Lesser General Public
  12 ** License version 2.1 as published by the Free Software Foundation and
  13 ** appearing in the file LICENSE.LGPL included in the packaging of this
  14 ** file. Please review the following information to ensure the GNU Lesser
  15 ** General Public License version 2.1 requirements will be met:
  16 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
  17 **
  18 ** In addition, as a special exception, Nokia gives you certain additional
  19 ** rights. These rights are described in the Nokia Qt LGPL Exception
  20 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
  21 **
  22 ** GNU General Public License Usage
  23 ** Alternatively, this file may be used under the terms of the GNU General
  24 ** Public License version 3.0 as published by the Free Software Foundation
  25 ** and appearing in the file LICENSE.GPL included in the packaging of this
  26 ** file. Please review the following information to ensure the GNU General
  27 ** Public License version 3.0 requirements will be met:
  28 ** http://www.gnu.org/copyleft/gpl.html.
  29 **
  30 ** Other Usage
  31 ** Alternatively, this file may be used in accordance with the terms and
  32 ** conditions contained in a signed written agreement between you and Nokia.
  33 **
  34 **
  35 **
  36 **
  37 **
  38 ** $QT_END_LICENSE$
  39 **
  40 ****************************************************************************/
  41
  42 #include "qhashedstring_p.h"
  43
  44 // This is a reimplementation of V8's string hash algorithm.  It is significantly
  45 // faster to do it here than call into V8, but it adds the maintainence burden of
  46 // ensuring that the two hashes are identical.  We Q_ASSERT() that the two return
  47 // the same value.  If these asserts start to fail, the hash code needs to be
  48 // synced with V8.
  49 namespace String {
  50     static const int kMaxArrayIndexSize = 10;
  51     static const int kMaxHashCalcLength = 16383;
  52     static const int kNofHashBitFields = 2;
  53     static const int kHashShift = kNofHashBitFields;
  54     static const int kIsNotArrayIndexMask = 1 << 1;
  55     static const int kArrayIndexValueBits = 24;
  56     static const int kArrayIndexHashLengthShift = kArrayIndexValueBits + kNofHashBitFields;
  57     static const int kMaxCachedArrayIndexLength = 7;
  58 };
  59
  60 template <typename schar>
  61 uint32_t calculateHash(const schar* chars, int length) {
  62     if (length > String::kMaxHashCalcLength) {
  63         // V8 trivial hash
  64         return (length << String::kHashShift) | String::kIsNotArrayIndexMask;
  65     }
  66
  67     uint32_t raw_running_hash = 0;
  68     uint32_t array_index = 0;
  69     bool is_array_index = (0 < length && length <= String::kMaxArrayIndexSize);
  70     bool is_first_char = true;
  71
  72     int ii = 0;
  73     for (;is_array_index && ii < length; ++ii) {
  74         quint32 c = *chars++;
  75
  76         raw_running_hash += c;
  77         raw_running_hash += (raw_running_hash << 10);
  78         raw_running_hash ^= (raw_running_hash >> 6);
  79
  80         if (c < '0' || c > '9') {
  81             is_array_index = false;
  82         } else {
  83             int d = c - '0';
  84             if (is_first_char) {
  85                 is_first_char = false;
  86                 if (c == '0' && length > 1) {
  87                     is_array_index = false;
  88                     continue;
  89                 }
  90             }
  91             if (array_index > 429496729U - ((d + 2) >> 3)) {
  92                 is_array_index = false;
  93             } else {
  94                 array_index = array_index * 10 + d;
  95             }
  96         }
  97     }
  98
  99     for (;ii < length; ++ii) {
 100         raw_running_hash += *chars++;
 101         raw_running_hash += (raw_running_hash << 10);
 102         raw_running_hash ^= (raw_running_hash >> 6);
 103     }
 104
 105     if (is_array_index) {
 106         array_index <<= String::kHashShift;
 107         array_index |= length << String::kArrayIndexHashLengthShift;
 108         return array_index;
 109     } else {
 110         raw_running_hash += (raw_running_hash << 3);
 111         raw_running_hash ^= (raw_running_hash >> 11);
 112         raw_running_hash += (raw_running_hash << 15);
 113         if (raw_running_hash == 0) {
 114             raw_running_hash = 27;
 115         }
 116
 117         return (raw_running_hash << String::kHashShift) | String::kIsNotArrayIndexMask;
 118     }
 119 }
 120
 121 inline quint32 stringHash(const QChar* data, int length)
 122 {
 123     quint32 rv = calculateHash<quint16>((quint16*)data, length) >> String::kHashShift;
 124     Q_ASSERT(rv == v8::String::ComputeHash((uint16_t*)data, length));
 125     return rv;
 126 }
 127
 128 inline quint32 stringHash(const char *data, int length)
 129 {
 130     quint32 rv = calculateHash<quint8>((quint8*)data, length) >> String::kHashShift;
 131     Q_ASSERT(rv == v8::String::ComputeHash((char *)data, length));
 132     return rv;
 133 }
 134
 135 void QHashedString::computeHash() const
 136 {
 137     m_hash = stringHash(constData(), length());
 138 }
 139
 140 void QHashedStringRef::computeHash() const
 141 {
 142     m_hash = stringHash(m_data, m_length);
 143 }
 144
 145 void QHashedCStringRef::computeHash() const
 146 {
 147     m_hash = stringHash(m_data, m_length);
 148 }
 149
 150 /*
 151     A QHash has initially around pow(2, MinNumBits) buckets. For
 152     example, if MinNumBits is 4, it has 17 buckets.
 153 */
 154 const int MinNumBits = 4;
 155
 156 /*
 157     The prime_deltas array is a table of selected prime values, even
 158     though it doesn't look like one. The primes we are using are 1,
 159     2, 5, 11, 17, 37, 67, 131, 257, ..., i.e. primes in the immediate
 160     surrounding of a power of two.
 161
 162     The primeForNumBits() function returns the prime associated to a
 163     power of two. For example, primeForNumBits(8) returns 257.
 164 */
 165
 166 static const uchar prime_deltas[] = {
 167     0,  0,  1,  3,  1,  5,  3,  3,  1,  9,  7,  5,  3,  9, 25,  3,
 168     1, 21,  3, 21,  7, 15,  9,  5,  3, 29, 15,  0,  0,  0,  0,  0
 169 };
 170
 171 static inline int primeForNumBits(int numBits)
 172 {
 173     return (1 << numBits) + prime_deltas[numBits];
 174 }
 175
 176 void QStringHashData::rehashToSize(int size)
 177 {
 178     short bits = qMax(MinNumBits, (int)numBits);
 179     while (primeForNumBits(bits) < size) bits++;
 180
 181     if (bits > numBits)
 182         rehashToBits(bits);
 183 }
 184
 185 void QStringHashData::rehashToBits(short bits)
 186 {
 187     numBits = qMax(MinNumBits, (int)bits);
 188
 189     int nb = primeForNumBits(numBits);
 190     if (nb == numBuckets && buckets)
 191         return;
 192
 193     numBuckets = nb;
 194
 195     delete []  buckets;
 196     buckets = new QStringHashNode *[numBuckets];
 197     ::memset(buckets, 0, sizeof(QStringHashNode *) * numBuckets);
 198
 199     QStringHashNode *nodeList = nodes;
 200     while (nodeList) {
 201         int bucket = nodeList->hash % numBuckets;
 202         nodeList->next = buckets[bucket];
 203         buckets[bucket] = nodeList;
 204
 205         nodeList = nodeList->nlist;
 206     }
 207 }
 208
 209 // Copy of QString's qMemCompare
 210 bool QHashedString::compare(const QChar *lhs, const QChar *rhs, int length)
 211 {
 212     Q_ASSERT(lhs && rhs);
 213     const quint16 *a = (const quint16 *)lhs;
 214     const quint16 *b = (const quint16 *)rhs;
 215
 216     if (a == b || !length)
 217         return true;
 218
 219     register union {
 220         const quint16 *w;
 221         const quint32 *d;
 222         quintptr value;
 223     } sa, sb;
 224     sa.w = a;
 225     sb.w = b;
 226
 227     // check alignment
 228     if ((sa.value & 2) == (sb.value & 2)) {
 229         // both addresses have the same alignment
 230         if (sa.value & 2) {
 231             // both addresses are not aligned to 4-bytes boundaries
 232             // compare the first character
 233             if (*sa.w != *sb.w)
 234                 return false;
 235             --length;
 236             ++sa.w;
 237             ++sb.w;
 238
 239             // now both addresses are 4-bytes aligned
 240         }
 241
 242         // both addresses are 4-bytes aligned
 243         // do a fast 32-bit comparison
 244         register const quint32 *e = sa.d + (length >> 1);
 245         for ( ; sa.d != e; ++sa.d, ++sb.d) {
 246             if (*sa.d != *sb.d)
 247                 return false;
 248         }
 249
 250         // do we have a tail?
 251         return (length & 1) ? *sa.w == *sb.w : true;
 252     } else {
 253         // one of the addresses isn't 4-byte aligned but the other is
 254         register const quint16 *e = sa.w + length;
 255         for ( ; sa.w != e; ++sa.w, ++sb.w) {
 256             if (*sa.w != *sb.w)
 257                 return false;
 258         }
 259     }
 260     return true;
 261 }
 262
 263 // Unicode stuff
 264 static inline bool isUnicodeNonCharacter(uint ucs4)
 265 {
 266     // Unicode has a couple of "non-characters" that one can use internally,
 267     // but are not allowed to be used for text interchange.
 268     //
 269     // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
 270     // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
 271     // U+FDEF (inclusive)
 272
 273     return (ucs4 & 0xfffe) == 0xfffe
 274             || (ucs4 - 0xfdd0U) < 16;
 275 }
 276
 277 static int utf8LengthFromUtf16(const QChar *uc, int len)
 278 {
 279     int length = 0;
 280
 281     int surrogate_high = -1;
 282
 283     const QChar *ch = uc;
 284     int invalid = 0;
 285
 286     const QChar *end = ch + len;
 287     while (ch < end) {
 288         uint u = ch->unicode();
 289         if (surrogate_high >= 0) {
 290             if (u >= 0xdc00 && u < 0xe000) {
 291                 u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000;
 292                 surrogate_high = -1;
 293             } else {
 294                 // high surrogate without low
 295                 ++ch;
 296                 ++invalid;
 297                 surrogate_high = -1;
 298                 continue;
 299             }
 300         } else if (u >= 0xdc00 && u < 0xe000) {
 301             // low surrogate without high
 302             ++ch;
 303             ++invalid;
 304             continue;
 305         } else if (u >= 0xd800 && u < 0xdc00) {
 306             surrogate_high = u;
 307             ++ch;
 308             continue;
 309         }
 310
 311         if (u < 0x80) {
 312             ++length;
 313         } else {
 314             if (u < 0x0800) {
 315                 ++length;
 316             } else {
 317                 // is it one of the Unicode non-characters?
 318                 if (isUnicodeNonCharacter(u)) {
 319                     ++length;
 320                     ++ch;
 321                     ++invalid;
 322                     continue;
 323                 }
 324
 325                 if (u > 0xffff) {
 326                     ++length;
 327                     ++length;
 328                 } else {
 329                     ++length;
 330                 }
 331                 ++length;
 332             }
 333             ++length;
 334         }
 335         ++ch;
 336     }
 337
 338     return length;
 339 }
 340
 341 // Writes the utf8 version of uc to output.  uc is of length len.
 342 // There must be at least utf8LengthFromUtf16(uc, len) bytes in output.
 343 // A null terminator is not written.
 344 static void utf8FromUtf16(char *output, const QChar *uc, int len)
 345 {
 346     uchar replacement = '?';
 347     int surrogate_high = -1;
 348
 349     uchar* cursor = (uchar*)output;
 350     const QChar *ch = uc;
 351     int invalid = 0;
 352
 353     const QChar *end = ch + len;
 354     while (ch < end) {
 355         uint u = ch->unicode();
 356         if (surrogate_high >= 0) {
 357             if (u >= 0xdc00 && u < 0xe000) {
 358                 u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000;
 359                 surrogate_high = -1;
 360             } else {
 361                 // high surrogate without low
 362                 *cursor = replacement;
 363                 ++ch;
 364                 ++invalid;
 365                 surrogate_high = -1;
 366                 continue;
 367             }
 368         } else if (u >= 0xdc00 && u < 0xe000) {
 369             // low surrogate without high
 370             *cursor = replacement;
 371             ++ch;
 372             ++invalid;
 373             continue;
 374         } else if (u >= 0xd800 && u < 0xdc00) {
 375             surrogate_high = u;
 376             ++ch;
 377             continue;
 378         }
 379
 380         if (u < 0x80) {
 381             *cursor++ = (uchar)u;
 382         } else {
 383             if (u < 0x0800) {
 384                 *cursor++ = 0xc0 | ((uchar) (u >> 6));
 385             } else {
 386                 // is it one of the Unicode non-characters?
 387                 if (isUnicodeNonCharacter(u)) {
 388                     *cursor++ = replacement;
 389                     ++ch;
 390                     ++invalid;
 391                     continue;
 392                 }
 393
 394                 if (u > 0xffff) {
 395                     *cursor++ = 0xf0 | ((uchar) (u >> 18));
 396                     *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
 397                 } else {
 398                     *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
 399                 }
 400                 *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
 401             }
 402             *cursor++ = 0x80 | ((uchar) (u&0x3f));
 403         }
 404         ++ch;
 405     }
 406 }
 407
 408 void QHashedStringRef::computeUtf8Length() const
 409 {
 410     if (m_length)
 411         m_utf8length = utf8LengthFromUtf16(m_data, m_length);
 412     else
 413         m_utf8length = 0;
 414 }
 415
 416 QHashedStringRef QHashedStringRef::mid(int offset, int length) const
 417 {
 418     Q_ASSERT(offset < m_length);
 419     return QHashedStringRef(m_data + offset,
 420                             (length == -1 || (offset + length) > m_length)?(m_length - offset):length);
 421 }
 422
 423 bool QHashedStringRef::endsWith(const QString &s) const
 424 {
 425     return s.length() < m_length &&
 426            QHashedString::compare(s.constData(), m_data + m_length - s.length(), s.length());
 427 }
 428
 429 bool QHashedStringRef::startsWith(const QString &s) const
 430 {
 431     return s.length() < m_length &&
 432            QHashedString::compare(s.constData(), m_data, s.length());
 433 }
 434
 435 QString QHashedStringRef::toString() const
 436 {
 437     if (m_length == 0)
 438         return QString();
 439     return QString(m_data, m_length);
 440 }
 441
 442 QByteArray QHashedStringRef::toUtf8() const
 443 {
 444     if (m_length == 0)
 445         return QByteArray();
 446
 447     QByteArray result;
 448     result.resize(utf8length());
 449     writeUtf8(result.data());
 450     return result;
 451 }
 452
 453 void QHashedStringRef::writeUtf8(char *output) const
 454 {
 455     if (m_length) {
 456         int ulen = utf8length();
 457         if (ulen == m_length) {
 458             // Must be a latin1 string
 459             uchar *o = (uchar *)output;
 460             const QChar *c = m_data;
 461             while (ulen--)
 462                 *o++ = (uchar)((*c++).unicode());
 463         } else {
 464             utf8FromUtf16(output, m_data, m_length);
 465         }
 466     }
 467 }
 468
 469 QString QHashedCStringRef::toUtf16() const
 470 {
 471     if (m_length == 0)
 472         return QString();
 473
 474     QString rv;
 475     rv.resize(m_length);
 476     writeUtf16((uint16_t*)rv.data());
 477     return rv;
 478 }
 479