1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "media/filters/audio_renderer_algorithm.h"
10 #include "base/bind.h"
11 #include "base/logging.h"
12 #include "cc/base/math_util.h"
13 #include "media/base/audio_bus.h"
14 #include "media/base/audio_timestamp_helper.h"
15 #include "media/base/limits.h"
16 #include "media/filters/wsola_internals.h"
20 // Waveform Similarity Overlap-and-add (WSOLA).
22 // One WSOLA iteration
24 // 1) Extract |target_block_| as input frames at indices
25 // [|target_block_index_|, |target_block_index_| + |ola_window_size_|).
26 // Note that |target_block_| is the "natural" continuation of the output.
28 // 2) Extract |search_block_| as input frames at indices
29 // [|search_block_index_|,
30 // |search_block_index_| + |num_candidate_blocks_| + |ola_window_size_|).
32 // 3) Find a block within the |search_block_| that is most similar
33 // to |target_block_|. Let |optimal_index| be the index of such block and
34 // write it to |optimal_block_|.
37 // 4) Update: |optimal_block_| = |transition_window_| * |target_block_| +
38 // (1 - |transition_window_|) * |optimal_block_|.
40 // 5) Overlap-and-add |optimal_block_| to the |wsola_output_|.
43 // 6) Update: |target_block_index_| = |optimal_index| + |ola_window_size_| / 2,
44 // |output_index_| = |output_index_| + |ola_window_size_| / 2,
45 // |search_block_center_index| = |output_index_| * |playback_rate|, and
46 // |search_block_index_| = |search_block_center_index| -
47 // |search_block_center_offset_|.
49 // Overlap-and-add window size, expressed as a duration (20 ms).
50 constexpr base::TimeDelta kOlaWindowSize = base::Milliseconds(20);
52 // Size of the search interval, expressed as a duration (30 ms). The search
53 // interval is [-delta, delta] around |output_index_| * |playback_rate|, so the
54 // search interval is 2 * delta.
55 constexpr base::TimeDelta kWsolaSearchInterval = base::Milliseconds(30);
57 // The maximum size for the |audio_buffer_|. Arbitrarily determined.
58 constexpr base::TimeDelta kMaxCapacity = base::Seconds(3);
60 // The minimum (starting) size for the |audio_buffer_|. Arbitrarily determined.
61 constexpr base::TimeDelta kStartingCapacity = base::Milliseconds(200);
63 // The minimum (starting) size for the |audio_buffer_| for encrypted streams.
64 // Set this to be larger than |kStartingCapacity| because the performance of
65 // encrypted playback is always worse than clear playback, due to decryption and
66 // potentially IPC overhead. For the context, see https://crbug.com/403462,
67 // https://crbug.com/718161 and https://crbug.com/879970.
68 constexpr base::TimeDelta kStartingCapacityForEncrypted =
69 base::Milliseconds(500);
// Convenience constructor. Delegates to the parameterized constructor using
// the file-level default capacities (kMaxCapacity, kStartingCapacity and
// kStartingCapacityForEncrypted).
// NOTE(review): the embedded numbering skips a line inside the delegated call
// below, so its first argument is not visible here; presumably it forwards
// |media_log| -- confirm against the complete file.
71 AudioRendererAlgorithm::AudioRendererAlgorithm(MediaLog* media_log)
72 : AudioRendererAlgorithm(
74 {kMaxCapacity, kStartingCapacity, kStartingCapacityForEncrypted}) {}
// Main constructor. Stores |media_log| and takes ownership of |params| (moved
// into |audio_renderer_algorithm_params_|); the remaining visible members are
// zero/false-initialized and receive their real values in Initialize().
// NOTE(review): the embedded numbering skips several lines in both the
// parameter list and the initializer list, and the constructor body is not
// visible in this listing. Do not treat the list below as complete.
76 AudioRendererAlgorithm::AudioRendererAlgorithm(
78 AudioRendererAlgorithmParameters params)
79 : media_log_(media_log),
80 audio_renderer_algorithm_params_(std::move(params)),
82 samples_per_second_(0),
83 is_bitstream_format_(false),
86 search_block_center_offset_(0),
87 search_block_index_(0),
88 num_candidate_blocks_(0),
89 target_block_index_(0),
92 num_complete_frames_(0),
// Compiler-generated destructor; no explicit teardown is performed here.
96 AudioRendererAlgorithm::~AudioRendererAlgorithm() = default;
// Configures the algorithm for the stream described by |params|: caches the
// channel count, sample rate and bitstream flag, converts the configured
// capacities and the WSOLA window/search durations into frame counts, and
// installs a default channel mask when none was set.
// |params| must be valid (enforced with CHECK).
// NOTE(review): the embedded numbering skips lines in this function (the rest
// of the parameter list, the condition of the ternary below, the first
// argument of the second std::max and one assignment target). Those parts are
// not visible in this listing.
98 void AudioRendererAlgorithm::Initialize(const AudioParameters& params,
100 CHECK(params.IsValid());
102 channels_ = params.channels();
103 samples_per_second_ = params.sample_rate();
104 is_bitstream_format_ = params.IsBitstreamFormat();
// Playback never starts with fewer than two buffers' worth of frames
// (2 * frames_per_buffer()).
105 min_playback_threshold_ = params.frames_per_buffer() * 2;
// Starting capacity and playback threshold: the configured starting capacity
// converted to frames, but never below |min_playback_threshold_|. The ternary
// picks between the encrypted and the regular starting capacity.
106 initial_capacity_ = capacity_ = playback_threshold_ = std::max(
107 min_playback_threshold_,
108 AudioTimestampHelper::TimeToFrames(
110 ? audio_renderer_algorithm_params_.starting_capacity_for_encrypted
111 : audio_renderer_algorithm_params_.starting_capacity,
112 samples_per_second_));
// Upper bound for |capacity_|: at least the configured maximum, in frames.
113 max_capacity_ = std::max(
115 AudioTimestampHelper::TimeToFrames(
116 audio_renderer_algorithm_params_.max_capacity, samples_per_second_));
// Search interval expressed in frames.
117 num_candidate_blocks_ = AudioTimestampHelper::TimeToFrames(
118 kWsolaSearchInterval, samples_per_second_);
// NOTE(review): the left-hand side of the next expression is on a line missing
// from this listing; presumably it is |ola_window_size_|, which is adjusted
// right below -- confirm.
120 AudioTimestampHelper::TimeToFrames(kOlaWindowSize, samples_per_second_);
122 // Make sure window size is an even number (adds one frame when it is odd).
123 ola_window_size_ += ola_window_size_ & 1;
// The hop is half a window.
124 ola_hop_size_ = ola_window_size_ / 2;
126 // |num_candidate_blocks_| / 2 is the offset of the center of the search
127 // block to the center of the first (left most) candidate block. The offset
128 // of the center of a candidate block to its left most point is
129 // |ola_window_size_| / 2 - 1. Note that |ola_window_size_| is even and in
130 // our convention the center belongs to the left half, so we need to subtract
131 // one frame to get the correct offset.
134 // <------------------------------------------->
136 // |ola_window_size_| / 2 - 1
139 // |num_candidate_blocks_| / 2
142 // X----X----------------X---------------X-----X
143 // <----------> <---------->
144 // Candidate ... Candidate
145 // 1, ... |num_candidate_blocks_|
146 search_block_center_offset_ =
147 num_candidate_blocks_ / 2 + (ola_window_size_ / 2 - 1);
149 // If no mask is provided, assume all channels are valid.
150 if (channel_mask_.empty())
151 SetChannelMask(std::vector<bool>(channels_, true));
// Replaces the per-channel activity mask. |channel_mask| must contain exactly
// one entry per channel (DCHECKed against |channels_|); masked-out channels
// are skipped by the per-channel loops elsewhere in this file.
154 void AudioRendererAlgorithm::SetChannelMask(std::vector<bool> channel_mask) {
155 DCHECK_EQ(channel_mask.size(), static_cast<size_t>(channels_));
156 channel_mask_ = std::move(channel_mask);
// Rebuild the search/target wrappers so they reflect the new mask.
// NOTE(review): the embedded numbering skips one line right before this call;
// it may be a guard on the call -- confirm against the complete file.
158 CreateSearchWrappers();
// Read callback bound to |resampler_| in ResampleAndFill(). Fills |audio_bus|
// with frames taken from |audio_buffer_|; when fewer frames than requested are
// available, the remainder of |audio_bus| is zeroed (silence).
// |frame_delay| is not used by the visible code.
161 void AudioRendererAlgorithm::OnResamplerRead(int frame_delay,
162 AudioBus* audio_bus) {
163 const int requested_frames = audio_bus->frames();
164 int read_frames = audio_buffer_.ReadFrames(requested_frames, 0, audio_bus);
166 if (read_frames < requested_frames) {
167 // We should only be filling up |resampler_| with silence if we are playing
168 // out all remaining frames.
169 DCHECK(reached_end_of_stream_);
170 audio_bus->ZeroFramesPartial(read_frames, requested_frames - read_frames);
// Remember whether this read delivered no frames at all from |audio_buffer_|;
// ResampleAndFill() consults this flag at end of stream.
173 resampler_only_has_silence_ = !read_frames;
// Records that the end of the stream has been reached. ResampleAndFill() and
// OnResamplerRead() read this flag to allow draining the remaining frames with
// silence padding.
176 void AudioRendererAlgorithm::MarkEndOfStream() {
177 reached_end_of_stream_ = true;
// Fills |dest| (starting at |dest_offset|) with |requested_frames| frames
// produced by running the queued audio through |resampler_| at
// |playback_rate|. FillBuffer() calls this when |preserves_pitch_| is false.
// The visible return statements return |requested_frames|.
// NOTE(review): the embedded numbering skips lines in this function (one
// parameter, the guard around the resampler creation, the bodies of the two
// early-exit branches and the condition selecting the direct path). Those
// parts are not visible in this listing.
180 int AudioRendererAlgorithm::ResampleAndFill(AudioBus* dest,
182 int requested_frames,
183 double playback_rate) {
184 SetFillBufferMode(FillBufferMode::kResampler);
// The resampler pulls its input through OnResamplerRead(). base::Unretained
// passes a raw |this| to the callback.
186 resampler_ = std::make_unique<MultiChannelResampler>(
187 channels_, playback_rate, SincResampler::kDefaultRequestSize,
188 base::BindRepeating(&AudioRendererAlgorithm::OnResamplerRead,
189 base::Unretained(this)));
192 if (reached_end_of_stream_ && resampler_only_has_silence_ &&
193 !audio_buffer_.frames()) {
194 // Previous calls to ResampleAndFill() and OnResamplerRead() have used all
195 // of the available buffers from |audio_buffer_|. We have also played out
196 // all remaining frames, and |resampler_| only contains silence.
200 // |resampler_| can request more than |requested_frames|, due to the
201 // request size not being aligned. To prevent having to fill it with silence,
202 // we find the max number of reads it could request, and make sure we have
203 // enough data to satisfy all of those reads.
204 if (!reached_end_of_stream_ &&
205 audio_buffer_.frames() <
206 resampler_->GetMaxInputFramesRequested(requested_frames)) {
207 // Exit early, forgoing at most a total of |audio_buffer_.frames()| +
208 // |resampler_->BufferedFrames()|.
209 // If we have reached the end of stream, |resampler_| will output silence
210 // after running out of frames, which is ok.
// Apply the current rate before resampling.
213 resampler_->SetRatio(playback_rate);
215 // Directly use |dest| for the most common case of having 0 offset.
217 resampler_->Resample(requested_frames, dest);
218 return requested_frames;
221 // This is only really used once, at the beginning of a stream, which means
222 // we can use a temporary variable, rather than saving it as a member.
223 // NOTE: We don't wrap |dest|'s channel data in an AudioBus wrapper, because
224 // |dest_offset| isn't always aligned with AudioBus::kChannelAlignment.
225 std::unique_ptr<AudioBus> resampler_output =
226 AudioBus::Create(channels_, requested_frames);
// Resample into the temporary bus, then copy it to |dest| at |dest_offset|.
228 resampler_->Resample(requested_frames, resampler_output.get());
229 resampler_output->CopyPartialFramesTo(0, requested_frames, dest_offset, dest);
231 return requested_frames;
// Renders up to |requested_frames| frames into |dest| starting at
// |dest_offset|, consuming queued input at |playback_rate|. Depending on the
// stream and rate it uses one of four paths:
//   * bitstream formats: frames are read straight from |audio_buffer_|;
//   * rate ~= 1: passthrough copy from |audio_buffer_|;
//   * |preserves_pitch_| false: ResampleAndFill();
//   * otherwise: WSOLA iterations.
// The WSOLA path returns the number of frames actually rendered.
// NOTE(review): the embedded numbering skips lines in this function (one
// parameter, the statement guarded by the zero-rate check, the end of the
// passthrough branch, the guard around the lazy allocations, one assignment
// target and the start of the do/while loop). Those parts are not visible in
// this listing.
234 int AudioRendererAlgorithm::FillBuffer(AudioBus* dest,
236 int requested_frames,
237 double playback_rate) {
238 if (playback_rate == 0)
241 DCHECK_GT(playback_rate, 0);
242 DCHECK_EQ(channels_, dest->channels());
244 // In case of compressed bitstream formats, no post processing is allowed.
245 if (is_bitstream_format_)
246 return audio_buffer_.ReadFrames(requested_frames, dest_offset, dest);
// Window length scaled by the rate in both directions, rounded up. Both
// comparisons below can only hold when the rate is so close to 1 that neither
// rounded value falls below |ola_window_size_|.
248 int slower_step = ceil(ola_window_size_ * playback_rate);
249 int faster_step = ceil(ola_window_size_ / playback_rate);
251 // Optimize the most common |playback_rate| ~= 1 case to use a single copy
252 // instead of copying frame by frame.
253 if (ola_window_size_ <= faster_step && slower_step >= ola_window_size_) {
254 SetFillBufferMode(FillBufferMode::kPassthrough);
// Never ask for more frames than are queued.
256 const int frames_to_copy =
257 std::min(audio_buffer_.frames(), requested_frames);
258 const int frames_read =
259 audio_buffer_.ReadFrames(frames_to_copy, dest_offset, dest);
260 DCHECK_EQ(frames_read, frames_to_copy);
264 // Use resampling when no pitch adjustments are needed.
265 if (!preserves_pitch_)
266 return ResampleAndFill(dest, dest_offset, requested_frames, playback_rate);
268 SetFillBufferMode(FillBufferMode::kWSOLA);
270 // Allocate structures on first non-1.0 playback rate; these can eat a fair
271 // chunk of memory. ~56kB for stereo 48kHz, up to ~765kB for 7.1 192kHz.
273 ola_window_.reset(new float[ola_window_size_]);
274 internal::GetPeriodicHanningWindow(ola_window_size_, ola_window_.get());
// The transition window is twice as long as the OLA window.
276 transition_window_.reset(new float[ola_window_size_ * 2]);
277 internal::GetPeriodicHanningWindow(2 * ola_window_size_,
278 transition_window_.get());
280 // Initialize for overlap-and-add of the first block.
// NOTE(review): the assignment target of the next expression is on a missing
// line; presumably |wsola_output_|, which is zeroed right after -- confirm.
282 AudioBus::Create(channels_, ola_window_size_ + ola_hop_size_);
283 wsola_output_->Zero();
285 // Auxiliary containers.
286 optimal_block_ = AudioBus::Create(channels_, ola_window_size_);
287 search_block_ = AudioBus::Create(
288 channels_, num_candidate_blocks_ + (ola_window_size_ - 1));
289 target_block_ = AudioBus::Create(channels_, ola_window_size_);
291 // Create potentially smaller wrappers for playback rate adaptation.
292 CreateSearchWrappers();
295 // Silent audio can contain non-zero samples small enough to result in
296 // subnormals internally. Disabling subnormals can be significantly faster in
// such cases.
298 cc::ScopedSubnormalFloatDisabler disable_subnormals;
// Alternate between draining completed output frames and running another
// WSOLA iteration until the request is satisfied or an iteration cannot run.
300 int rendered_frames = 0;
303 WriteCompletedFramesTo(requested_frames - rendered_frames,
304 dest_offset + rendered_frames, dest);
305 } while (rendered_frames < requested_frames &&
306 RunOneWsolaIteration(playback_rate));
307 return rendered_frames;
// Switches the active fill mode to |mode|. When the previous mode was kWSOLA,
// the visible code resets the WSOLA indices, zeroes |wsola_output_| and clears
// the count of completed frames.
// NOTE(review): the embedded numbering skips lines in this function (the
// statement guarded by the first check, some statements inside the kWSOLA
// branch, and everything after it). Those parts are not visible in this
// listing.
310 void AudioRendererAlgorithm::SetFillBufferMode(FillBufferMode mode) {
311 if (last_mode_ == mode)
314 // Clear any state from other fill modes so that we don't produce outdated
// output after the switch.
316 if (last_mode_ == FillBufferMode::kWSOLA) {
318 search_block_index_ = 0;
319 target_block_index_ = 0;
321 wsola_output_->Zero();
322 num_complete_frames_ = 0;
// Drops all queued input and resets the WSOLA state and the end-of-stream
// flag. Unless a latency hint is active, |capacity_| and
// |playback_threshold_| are reset to |initial_capacity_|.
// NOTE(review): the embedded numbering skips several lines in this function
// (e.g. right before the Zero() call and after |num_complete_frames_| is
// reset); any guards or additional resets there are not visible in this
// listing.
329 void AudioRendererAlgorithm::FlushBuffers() {
330 // Clear the queue of decoded packets (releasing the buffers).
331 audio_buffer_.Clear();
333 search_block_index_ = 0;
334 target_block_index_ = 0;
336 wsola_output_->Zero();
337 num_complete_frames_ = 0;
340 reached_end_of_stream_ = false;
342 // Reset |capacity_| and |playback_threshold_| so growth triggered by
343 // underflows doesn't penalize seek time. When |latency_hint_| is set we don't
344 // increase the queue for underflow, so avoid resetting it on flush.
345 if (!latency_hint_) {
346 capacity_ = playback_threshold_ = initial_capacity_;
// Appends |buffer_in| to the input queue |audio_buffer_|. End-of-stream
// buffers must not be passed here (DCHECKed).
350 void AudioRendererAlgorithm::EnqueueBuffer(
351 scoped_refptr<AudioBuffer> buffer_in) {
352 DCHECK(!buffer_in->end_of_stream());
353 audio_buffer_.Append(std::move(buffer_in));
// Stores |latency_hint| and derives |playback_threshold_| and |capacity_|
// from it. The hint is converted to frames and clamped to
// [|min_playback_threshold_|, |max_capacity_|]; |capacity_| becomes the larger
// of the threshold and |initial_capacity_|. The same three invariants are
// DCHECKed on entry and at the end of the visible code.
// NOTE(review): the embedded numbering skips lines in this function (the
// condition guarding the "restore default values" branch, the tails of both
// log statements, and the final else of the clamp). Judging by its log text,
// the restore branch handles a cleared hint -- confirm against the complete
// file.
356 void AudioRendererAlgorithm::SetLatencyHint(
357 absl::optional<base::TimeDelta> latency_hint) {
358 DCHECK_GE(playback_threshold_, min_playback_threshold_);
359 DCHECK_LE(playback_threshold_, capacity_);
360 DCHECK_LE(capacity_, max_capacity_);
362 latency_hint_ = latency_hint;
365 // Restore default values.
366 playback_threshold_ = capacity_ = initial_capacity_;
368 MEDIA_LOG(DEBUG, media_log_)
369 << "Audio latency hint cleared. Default buffer size ("
370 << AudioTimestampHelper::FramesToTime(playback_threshold_,
// Hint duration expressed in frames at the current sample rate.
376 int latency_hint_frames =
377 AudioTimestampHelper::TimeToFrames(*latency_hint_, samples_per_second_);
379 // Set |playback_threshold_| using hint, clamped between
380 // [min_playback_threshold_, max_capacity_].
// |clamp_string| records which bound, if any, was applied.
381 std::string clamp_string;
382 if (latency_hint_frames > max_capacity_) {
383 playback_threshold_ = max_capacity_;
384 clamp_string = " (clamped to max)";
385 } else if (latency_hint_frames < min_playback_threshold_) {
386 playback_threshold_ = min_playback_threshold_;
387 clamp_string = " (clamped to min)";
389 playback_threshold_ = latency_hint_frames;
392 // Use |initial_capacity_| if possible. Increase if needed.
393 capacity_ = std::max(playback_threshold_, initial_capacity_);
395 MEDIA_LOG(DEBUG, media_log_)
396 << "Audio latency hint set:" << *latency_hint << ". "
397 << "Effective buffering latency:"
398 << AudioTimestampHelper::FramesToTime(playback_threshold_,
402 DCHECK_GE(playback_threshold_, min_playback_threshold_);
403 DCHECK_LE(playback_threshold_, capacity_);
404 DCHECK_LE(capacity_, max_capacity_);
// Returns true when |audio_buffer_| holds at least |playback_threshold_|
// frames.
407 bool AudioRendererAlgorithm::IsQueueAdequateForPlayback() {
408 return audio_buffer_.frames() >= playback_threshold_;
// Returns true when |audio_buffer_| holds at least |capacity_| frames.
411 bool AudioRendererAlgorithm::IsQueueFull() {
412 return audio_buffer_.frames() >= capacity_;
// Doubles |capacity_| and |playback_threshold_| together, capped at
// |max_capacity_|. Must not be called while a latency hint is set, and
// expects the two values to be equal on entry (both DCHECKed).
415 void AudioRendererAlgorithm::IncreasePlaybackThreshold() {
416 DCHECK(!latency_hint_) << "Don't override the user specified latency";
417 DCHECK_EQ(playback_threshold_, capacity_);
418 DCHECK_LE(capacity_, max_capacity_);
420 playback_threshold_ = capacity_ = std::min(2 * capacity_, max_capacity_);
// Returns a byte count computed as
// BufferedFrames() * |channels_| * sizeof(float).
423 int64_t AudioRendererAlgorithm::GetMemoryUsage() const {
424 return BufferedFrames() * channels_ * sizeof(float);
// Returns the number of frames queued in |audio_buffer_| plus, when a
// resampler exists, the frames it reports as buffered.
427 int AudioRendererAlgorithm::BufferedFrames() const {
428 return audio_buffer_.frames() +
429 (resampler_ ? static_cast<int>(resampler_->BufferedFrames()) : 0);
// Returns the queued delay in frames for the given |playback_rate|. Near a
// rate of 1 (same test as the passthrough check in FillBuffer()) this is
// simply the number of frames in |audio_buffer_|. Otherwise it is the buffered
// frame count divided by the rate, minus |output_time_|, plus the completed
// frames not yet written out.
432 double AudioRendererAlgorithm::DelayInFrames(double playback_rate) const {
433 int slower_step = std::ceil(ola_window_size_ * playback_rate);
434 int faster_step = std::ceil(ola_window_size_ / playback_rate);
436 // When |playback_rate| ~= 1, we read directly from |audio_buffer_|.
437 if (ola_window_size_ <= faster_step && slower_step >= ola_window_size_)
438 return audio_buffer_.frames();
// NOTE(review): the intermediates below are float although the function
// returns double.
440 const float buffered_output_frames = BufferedFrames() / playback_rate;
441 const float unconverted_output_frames = buffered_output_frames - output_time_;
442 return unconverted_output_frames + num_complete_frames_;
// Returns true when |audio_buffer_| holds enough frames to read both the
// target block (|ola_window_size_| frames from |target_block_index_|) and the
// whole search block (from |search_block_index_|).
445 bool AudioRendererAlgorithm::CanPerformWsola() const {
// Same size the search block is created with in FillBuffer().
446 const int search_block_size = num_candidate_blocks_ + (ola_window_size_ - 1);
447 const int frames = audio_buffer_.frames();
448 return target_block_index_ + ola_window_size_ <= frames &&
449 search_block_index_ + search_block_size <= frames;
// Runs a single WSOLA step: overlap-adds |optimal_block_| onto
// |wsola_output_| for every unmasked channel, marks one more hop of output as
// complete, advances the output time and discards input that is no longer
// needed.
// NOTE(review): the embedded numbering skips lines in this function (the
// statement guarded by the CanPerformWsola() check, whatever precedes the
// channel loop, the statement guarded by the mask check, and the final
// return). Those parts, including how |optimal_block_| is filled for this
// iteration, are not visible in this listing.
452 bool AudioRendererAlgorithm::RunOneWsolaIteration(double playback_rate) {
453 if (!CanPerformWsola())
459 for (int k = 0; k < channels_; ++k) {
460 if (!channel_mask_[k])
463 const float* const ch_opt_frame = optimal_block_->channel(k);
// New audio is added after the frames that are already complete.
464 float* ch_output = wsola_output_->channel(k) + num_complete_frames_;
// First half: weight the existing output by the second half of |ola_window_|
// and the new block by the first half, then sum.
465 for (int n = 0; n < ola_hop_size_; ++n) {
466 ch_output[n] = ch_output[n] * ola_window_[ola_hop_size_ + n] +
467 ch_opt_frame[n] * ola_window_[n];
470 // Copy the second half to the output.
471 memcpy(&ch_output[ola_hop_size_], &ch_opt_frame[ola_hop_size_],
472 sizeof(*ch_opt_frame) * ola_hop_size_);
// One more hop of output is now final.
475 num_complete_frames_ += ola_hop_size_;
476 UpdateOutputTime(playback_rate, ola_hop_size_);
477 RemoveOldInputFrames(playback_rate);
// Adds |time_change| to |output_time_| (RemoveOldInputFrames() passes a
// negative value) and recomputes |search_block_index_| so that the search
// block is centered on |output_time_| * |playback_rate|.
481 void AudioRendererAlgorithm::UpdateOutputTime(double playback_rate,
482 double time_change) {
483 output_time_ += time_change;
484 // Center of the search region, in frames.
// Adding 0.5 before the integer cast rounds to the nearest frame for
// non-negative values.
485 const int search_block_center_index = static_cast<int>(
486 output_time_ * playback_rate + 0.5);
487 search_block_index_ = search_block_center_index - search_block_center_offset_;
// Discards input frames that precede both the target block and the search
// block, then shifts |target_block_index_| and the output time so they stay
// relative to the new start of |audio_buffer_|.
// NOTE(review): the embedded numbering skips the line that completes the
// division below, so the divisor is not visible in this listing.
490 void AudioRendererAlgorithm::RemoveOldInputFrames(double playback_rate) {
491 const int earliest_used_index = std::min(target_block_index_,
492 search_block_index_);
493 if (earliest_used_index <= 0)
494 return; // Nothing to remove.
496 // Remove frames from input and adjust indices accordingly.
497 audio_buffer_.SeekFrames(earliest_used_index);
498 target_block_index_ -= earliest_used_index;
500 // Adjust output index.
501 double output_time_change = static_cast<double>(earliest_used_index) /
// The output time must not become negative.
503 CHECK_GE(output_time_, output_time_change);
// UpdateOutputTime() also recomputes |search_block_index_|.
504 UpdateOutputTime(playback_rate, -output_time_change);
// Copies up to |requested_frames| completed frames from the front of
// |wsola_output_| into |dest| at |dest_offset|, then shifts the remaining
// contents of |wsola_output_| to the front. Returns the number of frames
// copied (0 when nothing is complete or nothing was requested).
// NOTE(review): the embedded numbering skips the statement guarded by the
// channel-mask check inside the loop; it is not visible in this listing.
507 int AudioRendererAlgorithm::WriteCompletedFramesTo(
508 int requested_frames, int dest_offset, AudioBus* dest) {
509 int rendered_frames = std::min(num_complete_frames_, requested_frames);
511 if (rendered_frames == 0)
512 return 0; // There is nothing to read from |wsola_output_|, return.
514 wsola_output_->CopyPartialFramesTo(0, rendered_frames, dest_offset, dest);
516 // Remove the frames which are read.
517 int frames_to_move = wsola_output_->frames() - rendered_frames;
518 for (int k = 0; k < channels_; ++k) {
519 if (!channel_mask_[k])
521 float* ch = wsola_output_->channel(k);
// memmove is used because source and destination overlap.
522 memmove(ch, &ch[rendered_frames], sizeof(*ch) * frames_to_move);
524 num_complete_frames_ -= rendered_frames;
525 return rendered_frames;
// Returns true when the target block, i.e. frames [|target_block_index_|,
// |target_block_index_| + |ola_window_size_|), lies entirely inside the
// search block that starts at |search_block_index_|.
528 bool AudioRendererAlgorithm::TargetIsWithinSearchRegion() const {
529 const int search_block_size = num_candidate_blocks_ + (ola_window_size_ - 1);
531 return target_block_index_ >= search_block_index_ &&
532 target_block_index_ + ola_window_size_ <=
533 search_block_index_ + search_block_size;
// Fills |optimal_block_| with the block to overlap-add next and advances
// |target_block_index_| to one hop past it. When the target block already
// lies inside the search region it is used as-is. The other visible path
// searches |search_block_| for the block most similar to the target (skipping
// an interval around the previous optimum) and blends the result with the
// target block using |transition_window_|.
// NOTE(review): the embedded numbering skips lines in this function (the
// boundary between the two paths, the declaration of |last_optimal|, the
// assignment target of the OptimalIndex() call, and the statement guarded by
// the channel-mask check). Those parts are not visible in this listing.
536 void AudioRendererAlgorithm::GetOptimalBlock() {
537 int optimal_index = 0;
539 // An interval around last optimal block which is excluded from the search.
540 // This is to reduce the buzzy sound. The number 160 is rather arbitrary and
541 // derived heuristically.
542 const int kExcludeIntervalLengthFrames = 160;
543 if (TargetIsWithinSearchRegion()) {
544 optimal_index = target_block_index_;
545 PeekAudioWithZeroPrepend(optimal_index, optimal_block_.get());
// Load the target block and the search block from the input queue.
547 PeekAudioWithZeroPrepend(target_block_index_, target_block_.get());
548 PeekAudioWithZeroPrepend(search_block_index_, search_block_.get());
550 target_block_index_ - ola_hop_size_ - search_block_index_;
// Exclude half the interval length on each side of |last_optimal|.
551 internal::Interval exclude_interval =
552 std::make_pair(last_optimal - kExcludeIntervalLengthFrames / 2,
553 last_optimal + kExcludeIntervalLengthFrames / 2);
555 // |optimal_index| is in frames and it is relative to the beginning of the
// search block; it is converted to a queue-relative index further below.
558 internal::OptimalIndex(search_block_wrapper_.get(),
559 target_block_wrapper_.get(), exclude_interval);
561 // Translate |index| w.r.t. the beginning of |audio_buffer_| and extract the
// optimal block.
563 optimal_index += search_block_index_;
564 PeekAudioWithZeroPrepend(optimal_index, optimal_block_.get());
566 // Make a transition from target block to the optimal block if different.
567 // Target block has the best continuation to the current output.
568 // Optimal block is the most similar block to the target, however, it might
569 // introduce some discontinuity when over-lap-added. Therefore, we combine
570 // them for a smoother transition. The length of transition window is twice
571 // as that of the optimal-block which makes it like a weighting function
572 // where target-block has higher weight close to zero (weight of 1 at index
573 // 0) and lower weight close to the end.
574 for (int k = 0; k < channels_; ++k) {
575 if (!channel_mask_[k])
577 float* ch_opt = optimal_block_->channel(k);
578 const float* const ch_target = target_block_->channel(k);
// The optimal block is weighted by the first half of |transition_window_| and
// the target block by the second half.
579 for (int n = 0; n < ola_window_size_; ++n) {
580 ch_opt[n] = ch_opt[n] * transition_window_[n] +
581 ch_target[n] * transition_window_[ola_window_size_ + n];
586 // Next target is one hop ahead of the current optimal.
587 target_block_index_ = optimal_index + ola_hop_size_;
// Copies |dest->frames()| frames from |audio_buffer_|, starting at
// |read_offset_frames|, into |dest| without consuming them. A negative offset
// is allowed: the frames that would lie before the start of the queue are
// written as zeros and the read begins at queue offset 0. The requested range
// must not extend past the end of the queue (CHECKed).
// NOTE(review): the embedded numbering skips the second argument of std::min
// and the trailing arguments of PeekFrames(); they are not visible in this
// listing.
590 void AudioRendererAlgorithm::PeekAudioWithZeroPrepend(
591 int read_offset_frames, AudioBus* dest) {
592 CHECK_LE(read_offset_frames + dest->frames(), audio_buffer_.frames());
594 int write_offset = 0;
595 int num_frames_to_read = dest->frames();
596 if (read_offset_frames < 0) {
// Number of leading frames in |dest| to fill with zeros.
597 int num_zero_frames_appended = std::min(-read_offset_frames,
599 read_offset_frames = 0;
600 num_frames_to_read -= num_zero_frames_appended;
601 write_offset = num_zero_frames_appended;
602 dest->ZeroFrames(num_zero_frames_appended);
604 audio_buffer_.PeekFrames(num_frames_to_read, read_offset_frames,
// Rebuilds |target_block_wrapper_| and |search_block_wrapper_| so that they
// reference only the channels enabled in |channel_mask_|. The wrappers are
// built from channel pointers of |target_block_| and |search_block_|, so both
// blocks must already exist when this runs.
608 void AudioRendererAlgorithm::CreateSearchWrappers() {
609 // WSOLA is quite expensive to run, so if a channel mask exists, use it to
610 // reduce the size of our search space.
611 std::vector<float*> active_target_channels;
612 std::vector<float*> active_search_channels;
613 for (int ch = 0; ch < channels_; ++ch) {
614 if (channel_mask_[ch]) {
615 active_target_channels.push_back(target_block_->channel(ch));
616 active_search_channels.push_back(search_block_->channel(ch));
// Each wrapper keeps the frame count of the block it wraps.
620 target_block_wrapper_ =
621 AudioBus::WrapVector(target_block_->frames(), active_target_channels);
622 search_block_wrapper_ =
623 AudioBus::WrapVector(search_block_->frames(), active_search_channels);
// Stores the pitch-preservation setting. FillBuffer() reads it for rates
// other than ~1: when false it fills via ResampleAndFill(), otherwise it uses
// WSOLA.
626 void AudioRendererAlgorithm::SetPreservesPitch(bool preserves_pitch) {
627 preserves_pitch_ = preserves_pitch;