media/filters/audio_renderer_algorithm.h

   1 // Copyright 2012 The Chromium Authors
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // AudioRendererAlgorithm buffers and transforms audio data. The owner of
   6 // this object provides audio data to the object through EnqueueBuffer() and
   7 // requests data from the buffer via FillBuffer().
   8 //
   9 // This class is *not* thread-safe. Calls to enqueue and retrieve data must be
  10 // locked if called from multiple threads.
  11 //
  12 // AudioRendererAlgorithm uses the Waveform Similarity Overlap and Add (WSOLA)
  13 // algorithm to stretch or compress audio data to meet playback speeds less than
  14 // or greater than the natural playback of the audio stream. The algorithm
  15 // preserves local properties of the audio, therefore, pitch and harmonics are
  16 // preserved. See audio_renderer_algorithm.cc for a more elaborate
  17 // description of the algorithm.
  18 //
  19 // Audio at very low or very high playback rates is muted to preserve quality.
  20
  21 #ifndef MEDIA_FILTERS_AUDIO_RENDERER_ALGORITHM_H_
  22 #define MEDIA_FILTERS_AUDIO_RENDERER_ALGORITHM_H_
  23
  24 #include <stdint.h>
  25
  26 #include <memory>
  27 #include <vector>
  28
  29 #include "base/memory/raw_ptr.h"
  30 #include "base/memory/ref_counted.h"
  31 #include "base/time/time.h"
  32 #include "media/base/audio_buffer.h"
  33 #include "media/base/audio_buffer_queue.h"
  34 #include "media/base/audio_parameters.h"
  35 #include "media/base/media_log.h"
  36 #include "media/base/multi_channel_resampler.h"
  37 #include "third_party/abseil-cpp/absl/types/optional.h"
  38
  39 namespace media {
  40
  41 class AudioBus;
  42
  43 class MEDIA_EXPORT AudioRendererAlgorithm {
  44  public:
  45   AudioRendererAlgorithm(MediaLog* media_log);
  46   AudioRendererAlgorithm(MediaLog* media_log,
  47                          AudioRendererAlgorithmParameters params);
  48
  49   AudioRendererAlgorithm(const AudioRendererAlgorithm&) = delete;
  50   AudioRendererAlgorithm& operator=(const AudioRendererAlgorithm&) = delete;
  51
  52   ~AudioRendererAlgorithm();
  53
  54   // Initializes this object with information about the audio stream.
  55   void Initialize(const AudioParameters& params, bool is_encrypted);
  56
  57   // Allows clients to specify which channels will be considered by the
  58   // algorithm when adapting for playback rate, other channels will be muted.
  59   // Useful to avoid performance overhead of the adapatation algorithm. Must
  60   // only be called after Initialize(); may be called multiple times if the
  61   // mask changes.
  62   //
  63   // E.g., If |channel_mask| is [true, false] only the first channel will be
  64   // used to construct the playback rate adapated signal. This is useful if
  65   // channel upmixing has been performed prior to this point.
  66   void SetChannelMask(std::vector<bool> channel_mask);
  67
  68   // Tries to fill |requested_frames| frames into |dest| with possibly scaled
  69   // data from our |audio_buffer_|. Data is scaled based on |playback_rate|,
  70   // using a variation of the Overlap-Add method to combine sample windows.
  71   //
  72   // Data from |audio_buffer_| is consumed in proportion to the playback rate.
  73   //
  74   // |dest_offset| is the offset in frames for writing into |dest|.
  75   //
  76   // Returns the number of frames copied into |dest|.
  77   int FillBuffer(AudioBus* dest,
  78                  int dest_offset,
  79                  int requested_frames,
  80                  double playback_rate);
  81
  82   // Clears |audio_buffer_|.
  83   void FlushBuffers();
  84
  85   // Enqueues a buffer. It is called from the owner of the algorithm after a
  86   // read completes.
  87   void EnqueueBuffer(scoped_refptr<AudioBuffer> buffer_in);
  88
  89   // Sets a target queue latency. This target will be clamped and stored in
  90   // |playback_threshold_|. It may also cause an increase in |capacity_|. A
  91   // value of nullopt indicates the algorithm should restore the default value.
  92   void SetLatencyHint(absl::optional<base::TimeDelta> latency_hint);
  93
  94   // Sets a flag indicating whether apply pitch adjustments when playing back
  95   // at rates other than 1.0. Concretely, we use WSOLA when this is true, and
  96   // resampling when this is false.
  97   void SetPreservesPitch(bool preserves_pitch);
  98
  99   // Returns true if the |audio_buffer_| is >= |playback_threshold_|.
 100   bool IsQueueAdequateForPlayback();
 101
 102   // Returns the required size for |audio_buffer_| to be "adequate for
 103   // playback". See IsQueueAdequateForPlayback().
 104   int QueuePlaybackThreshold() const { return playback_threshold_; }
 105
 106   // Returns true if |audio_buffer_| is >= |capacity_|.
 107   bool IsQueueFull();
 108
 109   // Returns the capacity of |audio_buffer_| in frames.
 110   int QueueCapacity() const { return capacity_; }
 111
 112   // Increase the |playback_threshold_| and |capacity_| of |audio_buffer_| if
 113   // possible. Should not be called if a custom |playback_threshold_| was
 114   // specified.
 115   void IncreasePlaybackThreshold();
 116
 117   // Sets a flag to bypass underflow detection, to read out all remaining data.
 118   void MarkEndOfStream();
 119
 120   // Returns an estimate of the amount of memory (in bytes) used for frames.
 121   int64_t GetMemoryUsage() const;
 122
 123   // Returns the total number of frames in |audio_buffer_| as well as
 124   // unconsumed input frames in the |resampler_|. The returned value may be
 125   // larger than QueueCapacity() in the event that EnqueueBuffer() delivered
 126   // more data than |audio_buffer_| was intending to hold.
 127   int BufferedFrames() const;
 128
 129   // Returns the effective delay in output frames at the given |playback rate|.
 130   // Effectively this tells the caller, if new audio is enqueued via
 131   // EnqueueBuffer(), how many frames must be read via FillBuffer() at the
 132   // |playback_rate| before the new audio is read out. Note that this is
 133   // approximate, since due to WSOLA the audio output doesn't always directly
 134   // correspond to the audio input (some samples may be duplicated or skipped).
 135   double DelayInFrames(double playback_rate) const;
 136
 137   // Returns the samples per second for this audio stream.
 138   int samples_per_second() const { return samples_per_second_; }
 139
 140   std::vector<bool> channel_mask_for_testing() { return channel_mask_; }
 141
 142  private:
 143   enum class FillBufferMode {
 144     kPassthrough,
 145     kResampler,
 146     kWSOLA,
 147   };
 148
 149   // Remove buffered data that will be outdated if we switch fill mode.
 150   void SetFillBufferMode(FillBufferMode mode);
 151
 152   // Within |search_block_|, find the block of data that is most similar to
 153   // |target_block_|, and write it in |optimal_block_|. This method assumes that
 154   // there is enough data to perform a search, i.e. |search_block_| and
 155   // |target_block_| can be extracted from the available frames.
 156   void GetOptimalBlock();
 157
 158   // Read a maximum of |requested_frames| frames from |wsola_output_|. Returns
 159   // number of frames actually read.
 160   int WriteCompletedFramesTo(
 161       int requested_frames, int output_offset, AudioBus* dest);
 162
 163   // Fill |dest| with frames from |audio_buffer_| starting from frame
 164   // |read_offset_frames|. |dest| is expected to have the same number of
 165   // channels as |audio_buffer_|. A negative offset, i.e.
 166   // |read_offset_frames| < 0, is accepted assuming that |audio_buffer| is zero
 167   // for negative indices. This might happen for few first frames. This method
 168   // assumes there is enough frames to fill |dest|, i.e. |read_offset_frames| +
 169   // |dest->frames()| does not extend to future.
 170   void PeekAudioWithZeroPrepend(int read_offset_frames, AudioBus* dest);
 171
 172   // Run one iteration of WSOLA, if there are sufficient frames. This will
 173   // overlap-and-add one block to |wsola_output_|, hence, |num_complete_frames_|
 174   // is incremented by |ola_hop_size_|.
 175   bool RunOneWsolaIteration(double playback_rate);
 176
 177   // Seek |audio_buffer_| forward to remove frames from input that are not used
 178   // any more. State of the WSOLA will be updated accordingly.
 179   void RemoveOldInputFrames(double playback_rate);
 180
 181   // Update |output_time_| by |time_change|. In turn |search_block_index_| is
 182   // updated.
 183   void UpdateOutputTime(double playback_rate, double time_change);
 184
 185   // Is |target_block_| fully within |search_block_|? If so, we don't need to
 186   // perform the search.
 187   bool TargetIsWithinSearchRegion() const;
 188
 189   // Do we have enough data to perform one round of WSOLA?
 190   bool CanPerformWsola() const;
 191
 192   // Creates or recreates |target_block_wrapper_| and |search_block_wrapper_|
 193   // after a |channel_mask_| change. May be called at anytime after a channel
 194   // mask has been specified.
 195   void CreateSearchWrappers();
 196
 197   // Uses |resampler_| to speed up or slowdown audio, by using a resampling
 198   // ratio of |playback_rate|.
 199   int ResampleAndFill(AudioBus* dest,
 200                       int dest_offset,
 201                       int requested_frames,
 202                       double playback_rate);
 203
 204   // Called by |resampler_| to get more audio data.
 205   void OnResamplerRead(int frame_delay, AudioBus* audio_bus);
 206
 207   raw_ptr<MediaLog> media_log_;
 208
 209   // Parameters.
 210   AudioRendererAlgorithmParameters audio_renderer_algorithm_params_;
 211
 212   // Number of channels in audio stream.
 213   int channels_;
 214
 215   // Sample rate of audio stream.
 216   int samples_per_second_;
 217
 218   // Is compressed audio output
 219   bool is_bitstream_format_;
 220
 221   // Buffered audio data.
 222   AudioBufferQueue audio_buffer_;
 223
 224   // Hint to adjust |playback_threshold_| as a means of controlling playback
 225   // start latency. See SetLatencyHint();
 226   absl::optional<base::TimeDelta> latency_hint_;
 227
 228   // Whether to apply pitch adjusments or not when playing back at rates other
 229   // than 1.0. In other words, we use WSOLA to preserve pitch when this is on,
 230   // and resampling when this
 231   bool preserves_pitch_ = true;
 232
 233   // How many frames to have in queue before beginning playback.
 234   int64_t playback_threshold_;
 235
 236   // Minimum allowed value for |plabyack_threshold_| calculated by Initialize().
 237   int64_t min_playback_threshold_;
 238
 239   // How many frames to have in the queue before we report the queue is full.
 240   int64_t capacity_;
 241
 242   // Book keeping of the current time of generated audio, in frames. This
 243   // should be appropriately updated when out samples are generated, regardless
 244   // of whether we push samples out when FillBuffer() is called or we store
 245   // audio in |wsola_output_| for the subsequent calls to FillBuffer().
 246   // Furthermore, if samples from |audio_buffer_| are evicted then this
 247   // member variable should be updated based on |playback_rate_|.
 248   // Note that this member should be updated ONLY by calling UpdateOutputTime(),
 249   // so that |search_block_index_| is update accordingly.
 250   double output_time_;
 251
 252   // The offset of the center frame of |search_block_| w.r.t. its first frame.
 253   int search_block_center_offset_;
 254
 255   // Index of the beginning of the |search_block_|, in frames.
 256   int search_block_index_;
 257
 258   // Number of Blocks to search to find the most similar one to the target
 259   // frame.
 260   int num_candidate_blocks_;
 261
 262   // Index of the beginning of the target block, counted in frames.
 263   int target_block_index_;
 264
 265   // Overlap-and-add window size in frames.
 266   int ola_window_size_;
 267
 268   // The hop size of overlap-and-add in frames. This implementation assumes 50%
 269   // overlap-and-add.
 270   int ola_hop_size_;
 271
 272   // Number of frames in |wsola_output_| that overlap-and-add is completed for
 273   // them and can be copied to output if FillBuffer() is called. It also
 274   // specifies the index where the next WSOLA window has to overlap-and-add.
 275   int num_complete_frames_;
 276
 277   bool reached_end_of_stream_ = false;
 278
 279   // Used to replace WSOLA algorithm at playback speeds close to 1.0. This is to
 280   // prevent noticeable audio artifacts introduced by WSOLA, at the expense of
 281   // changing the pitch of the audio.
 282   std::unique_ptr<MultiChannelResampler> resampler_;
 283
 284   // True when the last call to OnResamplerRead() only gave silence to
 285   // |resampler_|. Used to determine whether or not we have played out all the
 286   // valid audio from |resampler.BufferedFrames()|.
 287   bool resampler_only_has_silence_ = false;
 288
 289   // This stores a part of the output that is created but couldn't be rendered.
 290   // Output is generated frame-by-frame which at some point might exceed the
 291   // number of requested samples. Furthermore, due to overlap-and-add,
 292   // the last half-window of the output is incomplete, which is stored in this
 293   // buffer.
 294   std::unique_ptr<AudioBus> wsola_output_;
 295
 296   // Overlap-and-add window.
 297   std::unique_ptr<float[]> ola_window_;
 298
 299   // Transition window, used to update |optimal_block_| by a weighted sum of
 300   // |optimal_block_| and |target_block_|.
 301   std::unique_ptr<float[]> transition_window_;
 302
 303   // Auxiliary variables to avoid allocation in every iteration.
 304
 305   // Stores the optimal block in every iteration. This is the most
 306   // similar block to |target_block_| within |search_block_| and it is
 307   // overlap-and-added to |wsola_output_|.
 308   std::unique_ptr<AudioBus> optimal_block_;
 309
 310   // A block of data that search is performed over to find the |optimal_block_|.
 311   std::unique_ptr<AudioBus> search_block_;
 312
 313   // Stores the target block, denoted as |target| above. |search_block_| is
 314   // searched for a block (|optimal_block_|) that is most similar to
 315   // |target_block_|.
 316   std::unique_ptr<AudioBus> target_block_;
 317
 318   // Active channels to consider while searching. Used to speed up WSOLA
 319   // processing by ignoring always muted channels. Wrappers are always
 320   // constructed during Initialize() and have <= |channels_|.
 321   std::vector<bool> channel_mask_;
 322   std::unique_ptr<AudioBus> search_block_wrapper_;
 323   std::unique_ptr<AudioBus> target_block_wrapper_;
 324
 325   // The initial and maximum capacity calculated by Initialize().
 326   int64_t initial_capacity_;
 327   int64_t max_capacity_;
 328
 329   FillBufferMode last_mode_ = FillBufferMode::kPassthrough;
 330 };
 331
 332 }  // namespace media
 333
 334 #endif  // MEDIA_FILTERS_AUDIO_RENDERER_ALGORITHM_H_