third_party/pigweed/repo/pw_tokenizer/detokenize.cc

   1 // Copyright 2020 The Pigweed Authors
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
   4 // use this file except in compliance with the License. You may obtain a copy of
   5 // the License at
   6 //
   7 //     https://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  12 // License for the specific language governing permissions and limitations under
  13 // the License.
  14
  15 #include "pw_tokenizer/detokenize.h"
  16
  17 #include <algorithm>
  18
  19 #include "pw_tokenizer/internal/decode.h"
  20
  21 namespace pw::tokenizer {
  22 namespace {
  23
  24 std::string UnknownTokenMessage(uint32_t value) {
  25   std::string output(PW_TOKENIZER_ARG_DECODING_ERROR_PREFIX "unknown token ");
  26
  27   // Output a hexadecimal version of the token.
  28   for (int shift = 28; shift >= 0; shift -= 4) {
  29     output.push_back("0123456789abcdef"[(value >> shift) & 0xF]);
  30   }
  31
  32   output.append(PW_TOKENIZER_ARG_DECODING_ERROR_SUFFIX);
  33   return output;
  34 }
  35
  36 // Decoding result with the date removed, for sorting.
  37 using DecodingResult = std::pair<DecodedFormatString, uint32_t>;
  38
  39 // Determines if one result is better than the other if collisions occurred.
  40 // Returns true if lhs is preferred over rhs. This logic should match the
  41 // collision resolution logic in detokenize.py.
  42 bool IsBetterResult(const DecodingResult& lhs, const DecodingResult& rhs) {
  43   // Favor the result for which decoding succeeded.
  44   if (lhs.first.ok() != rhs.first.ok()) {
  45     return lhs.first.ok();
  46   }
  47
  48   // Favor the result for which all bytes were decoded.
  49   if ((lhs.first.remaining_bytes() == 0u) !=
  50       (rhs.first.remaining_bytes() == 0u)) {
  51     return lhs.first.remaining_bytes() == 0u;
  52   }
  53
  54   // Favor the result with fewer decoding errors.
  55   if (lhs.first.decoding_errors() != rhs.first.decoding_errors()) {
  56     return lhs.first.decoding_errors() < rhs.first.decoding_errors();
  57   }
  58
  59   // Favor the result that successfully decoded the most arguments.
  60   if (lhs.first.argument_count() != rhs.first.argument_count()) {
  61     return lhs.first.argument_count() > rhs.first.argument_count();
  62   }
  63
  64   // Favor the result that was removed from the database most recently.
  65   return lhs.second > rhs.second;
  66 }
  67
  68 }  // namespace
  69
  70 DetokenizedString::DetokenizedString(
  71     uint32_t token,
  72     const std::span<const TokenizedStringEntry>& entries,
  73     const std::span<const uint8_t>& arguments)
  74     : token_(token), has_token_(true) {
  75   std::vector<DecodingResult> results;
  76
  77   for (const auto& [format, date_removed] : entries) {
  78     results.push_back(DecodingResult{format.Format(arguments), date_removed});
  79   }
  80
  81   std::sort(results.begin(), results.end(), IsBetterResult);
  82
  83   for (auto& result : results) {
  84     matches_.push_back(std::move(result.first));
  85   }
  86 }
  87
  88 std::string DetokenizedString::BestString() const {
  89   return matches_.empty() ? std::string() : matches_[0].value();
  90 }
  91
  92 std::string DetokenizedString::BestStringWithErrors() const {
  93   if (matches_.empty()) {
  94     return has_token_ ? UnknownTokenMessage(token_)
  95                       : PW_TOKENIZER_ARG_DECODING_ERROR("missing token");
  96   }
  97   return matches_[0].value_with_errors();
  98 }
  99
 100 Detokenizer::Detokenizer(const TokenDatabase& database) {
 101   for (const auto& entry : database) {
 102     database_[entry.token].emplace_back(entry.string, entry.date_removed);
 103   }
 104 }
 105
 106 DetokenizedString Detokenizer::Detokenize(
 107     const std::span<const uint8_t>& encoded) const {
 108   // The token is missing from the encoded data; there is nothing to do.
 109   if (encoded.size() < sizeof(uint32_t)) {
 110     return DetokenizedString();
 111   }
 112
 113   const uint32_t token =
 114       encoded[3] << 24 | encoded[2] << 16 | encoded[1] << 8 | encoded[0];
 115
 116   const auto result = database_.find(token);
 117
 118   return DetokenizedString(token,
 119                            result == database_.end()
 120                                ? std::span<TokenizedStringEntry>()
 121                                : std::span(result->second),
 122                            encoded.subspan(sizeof(token)));
 123 }
 124
 125 }  // namespace pw::tokenizer