1 // Copyright 2018 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef COMPONENTS_ASSIST_RANKER_EXAMPLE_PREPROCESSING_H_
6 #define COMPONENTS_ASSIST_RANKER_EXAMPLE_PREPROCESSING_H_
8 #include "components/assist_ranker/proto/example_preprocessor.pb.h"
9 #include "components/assist_ranker/proto/ranker_example.pb.h"
10 #include "third_party/protobuf/src/google/protobuf/map.h"
12 namespace assist_ranker {
14 // Preprocessor for converting a RankerExample into the formats that are needed
// downstream, as directed by an ExamplePreprocessorConfig. Process() is the
// entry point; the helper methods declared further down cover missing-feature
// marking, bucketization, normalization, string conversion and vectorization.
16 class ExamplePreprocessor {
18 // Error codes (bitwise) for preprocessing. Every value listed here is a
// distinct bit, so several errors can be combined into a single int result.
19 enum PreprocessErrorCode {
21 kNoFeatureIndexFound = 1,
22 kNonbucketizableFeatureType = 2,
23 kInvalidFeatureType = 4,
24 kInvalidFeatureListIndex = 8,
25 kNonNormalizableFeatureType = 16,
26 kNonConvertibleToStringFeatureType = 32,
27 kNormalizerIsZero = 64,
// Creates a preprocessor that is configured by |config|.
30 explicit ExamplePreprocessor(const ExamplePreprocessorConfig& config)
33 // Processes a RankerExample with config_; |example| is modified in place.
34 // Clears all features except kVectorizedFeatureDefaultName if
35 // clear_other_features is set to true.
36 // Returns the error code of preprocessing, which can be any sum (bitwise
37 // combination) of the error codes in PreprocessErrorCode.
38 int Process(RankerExample* example, bool clear_other_features = false) const;
40 // Default feature name for missing features.
41 static const char kMissingFeatureDefaultName[];
43 // Default feature name for vectorized features.
44 static const char kVectorizedFeatureDefaultName[];
46 // Generates a feature's fullname based on feature_name and feature_value.
47 // A feature fullname is defined as:
48 // (1) feature_name if it's bool_value, int64_value or float_value.
49 // (2) a combination of feature_name and feature_value if it's string_value
50 //     or the i-th element of a string_list.
// feature_value defaults to the empty string, so case (1) callers may pass
// feature_name alone.
// NOTE(review): the ExampleFloatIterator comment in this file lists
// int32_value rather than int64_value as the integer type; confirm which
// one the Feature proto actually defines.
51 static std::string FeatureFullname(const std::string& feature_name,
52 const std::string& feature_value = "");
55 // If a feature is specified in config_.missing_features() and missing in
56 // the example, then the feature name is added as a sparse feature value to
57 // the special sparse feature "_MissingFeature" in the example.
58 // Always returns kSuccess.
59 int AddMissingFeatures(RankerExample* example) const;
60 // If a numeric feature is specified in config_.bucketizers(), then it is
61 // bucketized based on the boundaries and reset as a one-hot feature with
62 // the bucket index as its string value.
// Presumably returns a PreprocessErrorCode combination, like Process();
// the definition is not visible here.
63 int AddBucketizedFeatures(RankerExample* example) const;
64 // Normalizes numeric features to be within [-1.0, 1.0] as float features.
65 int NormalizeFeatures(RankerExample* example) const;
66 // Converts any features in |example| that are listed in
67 // |config_.convert_to_string_features()| into string-valued features.
68 int ConvertToStringFeatures(RankerExample* example) const;
69 // Adds a new_float_list feature named kVectorizedFeatureDefaultName, then
70 // iterates over all existing features in example.features() and sets the
71 // corresponding new_float_list.float_value(config_.feature_indices(feature_value_key))
72 // to either the numeric value (for scalars) or 1.0 (for string values).
73 int Vectorization(RankerExample* example, bool clear_other_features) const;
75 // Configuration proto for the preprocessor. Held by value (a const copy),
// not by reference.
76 const ExamplePreprocessorConfig config_;
79 // An iterator that goes through all features of a RankerExample and converts
80 // each field into a struct Field{full_name, value, error}.
81 // (1) A numeric feature (bool_value, int32_value, float_value) is converted
82 //     to {feature_name, float(original_value), kSuccess}.
83 // (2) A string feature is converted to
84 //     {feature_name_string_value, 1.0, kSuccess}.
85 // (3) A string_value from a string list feature is converted to
86 //     {feature_name_string_value, 1.0, error_code}, where a non-empty list
87 //     gets error_code kSuccess and an empty list gets kInvalidFeatureListIndex.
//
// Example usage (sketch):
89 //   std::vector<float> ExampleToStdFloat(const RankerExample& example,
90 //                                        const Map& feature_indices) {
91 //     std::vector<float> vectorized(feature_indices.size());
92 //     for (const auto& field : ExampleFloatIterator(example)) {
93 //       if (field.error == ExamplePreprocessor::kSuccess) {
94 //         const int index = feature_indices[field.fullname];
95 //         vectorized[index] = field.value;
//       }
//     }
//     return vectorized;
//   }
100 class ExampleFloatIterator {
102 // A struct holding the float value of one field from a RankerExample.
// |fullname| is the field's full name as described in the class comment.
104 std::string fullname;
// Creates an iterator positioned at the first entry of example.features().
// NOTE(review): only iterators into example.features() are stored, so
// |example| presumably has to outlive this object; confirm with callers.
109 explicit ExampleFloatIterator(const RankerExample& example)
110 : feature_iterator_(example.features().begin()),
111 feature_end_iterator_(example.features().end()),
112 string_list_index_(0) {}
// begin()/end() allow an ExampleFloatIterator to be used directly as the range
// of a range-based for loop: begin() returns a copy of this iterator and
// end() returns an iterator positioned at feature_end_iterator_.
114 ExampleFloatIterator begin() const { return *this; }
115 ExampleFloatIterator end() const {
116 return ExampleFloatIterator(feature_end_iterator_);
// Returns the field at the current position as a Field value.
119 Field operator*() const;
// Pre-increment; advances this iterator and returns a reference to it.
// The definition is not part of this header section.
121 ExampleFloatIterator& operator++();
123 // Two iterators are equal if they point to the same field, with the same
124 // index if it's a string_list. feature_end_iterator_ is not compared.
125 bool operator==(const ExampleFloatIterator& other) const {
126 return feature_iterator_ == other.feature_iterator_ &&
127 string_list_index_ == other.string_list_index_;
// Negation of operator==.
130 bool operator!=(const ExampleFloatIterator& other) const {
131 return !(*this == other);
135 // Constructs the end iterator: both the current and the end position are set
// to |feature_end_iterator| and the string-list index is 0. Used by end().
136 explicit ExampleFloatIterator(
137 const google::protobuf::Map<std::string, Feature>::const_iterator&
138 feature_end_iterator)
139 : feature_iterator_(feature_end_iterator),
140 feature_end_iterator_(feature_end_iterator),
141 string_list_index_(0) {}
// Current position in the feature map.
143 google::protobuf::Map<std::string, Feature>::const_iterator feature_iterator_;
// One-past-the-last position of the feature map being iterated.
144 google::protobuf::Map<std::string, Feature>::const_iterator
145 feature_end_iterator_;
// Index within the current feature when it is a string_list (see operator==);
// initialized to 0 by both constructors.
146 int string_list_index_;
149 } // namespace assist_ranker
151 #endif // COMPONENTS_ASSIST_RANKER_EXAMPLE_PREPROCESSING_H_