1 // Copyright 2020 The Pigweed Authors
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
7 // the License at https://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under the License.
15 // This file provides the Detokenizer class, which is used to decode tokenized
16 // strings. To use a Detokenizer, load a binary format token database into
17 // memory, construct a TokenDatabase, and pass it to a Detokenizer:
19 // std::vector data = ReadFile("my_tokenized_strings.db");
20 // Detokenizer detok(TokenDatabase::Create(data));
22 // DetokenizedString result = detok.Detokenize(my_data);
23 // std::cout << result.BestString() << '\n';
31 #include <unordered_map>
35 #include "pw_tokenizer/internal/decode.h"
36 #include "pw_tokenizer/token_database.h"
38 namespace pw::tokenizer {
// One stored string for a token: a FormatString paired with a uint32_t that the
// inline comment below labels as the entry's removal date. The units and
// encoding of that date are not shown in this header.
40 using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>;
42 // A string that has been detokenized. This class tracks all possible results if
43 // there are token collisions.
//
// Every candidate decoding is held as a DecodedFormatString in matches_; the
// accessors below expose the candidates and select the best one.
44 class DetokenizedString {
// Builds the result for `token` from the candidate database `entries` and the
// encoded `arguments` bytes. Only declared here; presumably it decodes
// `arguments` against each entry to populate matches_ -- confirm against the
// implementation.
46 DetokenizedString(uint32_t token,
47 const std::span<const TokenizedStringEntry>& entries,
48 const std::span<const uint8_t>& arguments);
// Creates a result with no matches: has_token_ is set to false and matches_ is
// left default-constructed (empty).
50 DetokenizedString() : has_token_(false) {}
52 // True if there was only one valid match and it decoded successfully.
// A collision (two or more matches) or an empty result is never ok().
53 bool ok() const { return matches_.size() == 1 && matches_[0].ok(); }
55 // Returns the strings that matched the token, with the best matches first.
// The reference refers to this object's own vector, so it is only valid while
// this DetokenizedString is alive.
56 const std::vector<DecodedFormatString>& matches() const { return matches_; }
58 // Returns the detokenized string or an empty string if there were no matches.
59 // If there are multiple possible results, the DetokenizedString returns the
// best one -- presumably the first entry of matches(); confirm against the
// implementation.
61 std::string BestString() const;
63 // Returns the best match, with error messages inserted for arguments that
// failed to decode.
// NOTE(review): the end of the sentence above was reconstructed from context;
// confirm the wording against the implementation.
65 std::string BestStringWithErrors() const;
// All candidate decodings for the token; the matches() comment states they are
// ordered best first.
70 std::vector<DecodedFormatString> matches_;
73 // Decodes and detokenizes strings from a TokenDatabase. This class builds a
74 // hash table from the TokenDatabase to give O(1) token lookups.
77 // Constructs a detokenizer from a TokenDatabase. The TokenDatabase is not
78 // referenced by the Detokenizer after construction; its memory can be freed.
// NOTE(review): this single-argument constructor is not marked `explicit`, so a
// TokenDatabase converts implicitly to a Detokenizer -- confirm that is intended.
79 Detokenizer(const TokenDatabase& database);
81 // Decodes and detokenizes the encoded message. Returns a DetokenizedString
82 // that stores all possible detokenized string results.
// `encoded` is a read-only view of the message bytes. Only declared here; the
// definition lives outside this header. Being const, it can be called on a
// const Detokenizer.
83 DetokenizedString Detokenize(const std::span<const uint8_t>& encoded) const;
// Convenience overload for a message held in a std::string_view: passes the
// view's data pointer and length on to the (pointer, size) overload.
85 DetokenizedString Detokenize(const std::string_view& encoded) const {
86 return Detokenize(encoded.data(), encoded.size());
// Convenience overload for a raw buffer: casts `encoded` to const uint8_t* and
// wraps `size_bytes` bytes of it in a std::span -- presumably to call the span
// overload; confirm against the full file.
89 DetokenizedString Detokenize(const void* encoded, size_t size_bytes) const {
91 std::span(static_cast<const uint8_t*>(encoded), size_bytes));
// Maps a uint32_t key to every TokenizedStringEntry stored under it. Per the
// class comment this is the hash table used for token lookups, so the key is
// presumably the token, and several entries under one key are collisions.
95 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_;
98 } // namespace pw::tokenizer