From be957f6c6193592d233c70169d852e5f0c8471a1 Mon Sep 17 00:00:00 2001
From: Seungha Yang <seungha@centricular.com>
Date: Thu, 27 Jan 2022 02:20:37 +0900
Subject: [PATCH] mediafoundation: Add support for AAC decoding

See also
https://docs.microsoft.com/en-us/windows/win32/medfound/aac-decoder

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/1596>
---
 .../sys/mediafoundation/gstmfaacdec.cpp            | 380 +++++++++++++++++++++
 .../sys/mediafoundation/gstmfaacdec.h              |  30 ++
 .../sys/mediafoundation/gstmfaudiodecoder.cpp      | 285 ++++++++++++++++
 .../sys/mediafoundation/gstmfaudiodecoder.h        |  69 ++++
 .../sys/mediafoundation/gstmfutils.cpp             | 197 ++++++++++-
 .../sys/mediafoundation/gstmfutils.h               |   3 +-
 .../sys/mediafoundation/meson.build                |   2 +
 .../gst-plugins-bad/sys/mediafoundation/plugin.cpp |   2 +
 8 files changed, 966 insertions(+), 2 deletions(-)
 create mode 100644 subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaacdec.cpp
 create mode 100644 subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaacdec.h
 create mode 100644 subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaudiodecoder.cpp
 create mode 100644 subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaudiodecoder.h

diff --git a/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaacdec.cpp b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaacdec.cpp
new file mode 100644
index 0000000..9ac503f
--- /dev/null
+++ b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaacdec.cpp
@@ -0,0 +1,380 @@
+/* GStreamer
+ * Copyright (C) 2022 Seungha Yang <seungha@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/**
+ * SECTION:element-mfaacdec
+ * @title: mfaacdec
+ *
+ * This element decodes AAC compressed data into RAW audio data.
+ *
+ * Since: 1.22
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <gst/gst.h>
+#include <gst/pbutils/pbutils.h>
+#include "gstmfaudiodecoder.h"
+#include "gstmfaacdec.h"
+#include <wrl.h>
+#include <string.h>
+
+/* *INDENT-OFF* */
+using namespace Microsoft::WRL;
+/* *INDENT-ON* */
+
+GST_DEBUG_CATEGORY (gst_mf_aac_dec_debug);
+#define GST_CAT_DEFAULT gst_mf_aac_dec_debug
+
+static GstStaticPadTemplate sink_template = GST_STATIC_PAD_TEMPLATE ("sink",
+    GST_PAD_SINK,
+    GST_PAD_ALWAYS,
+    GST_STATIC_CAPS ("audio/mpeg, "
+        "mpegversion = (int) {2, 4}, "
+        "stream-format = (string) raw, framed = (boolean) true, "
+        "channels = (int) [1, 6], rate = (int) [8000, 48000]")
+    );
+
+static GstStaticPadTemplate src_template = GST_STATIC_PAD_TEMPLATE ("src",
+    GST_PAD_SRC,
+    GST_PAD_ALWAYS,
+    GST_STATIC_CAPS ("audio/x-raw, "
+        "format = (string) " GST_AUDIO_NE (S16) ", "
+        "layout = (string) interleaved, "
+        "channels = (int) [1, 6], rate = (int) [8000, 48000]")
+    );
+
+typedef struct _GstMFAacDec
+{
+  GstMFAudioDecoder parent;
+} GstMFAacDec;
+
+typedef struct _GstMFAacDecClass
+{
+  GstMFAudioDecoderClass parent_class;
+} GstMFAacDecClass;
+
+static GTypeClass *parent_class = nullptr;
+
+static gboolean gst_mf_aac_dec_set_format (GstMFAudioDecoder * decoder,
+    GstMFTransform * transform, GstCaps * caps);
+
+static void
+gst_mf_aac_dec_class_init (GstMFAacDecClass * klass, gpointer data)
+{
+  /* @data is the class_data set in GTypeInfo; it is freed below */
+  GstMFAudioDecoderClassData *params = (GstMFAudioDecoderClassData *) data;
+  GstMFAudioDecoderClass *mfdec_class = GST_MF_AUDIO_DECODER_CLASS (klass);
+  GstElementClass *elem_class = GST_ELEMENT_CLASS (klass);
+  gchar *display_name =
+      g_strdup_printf ("Media Foundation %s", params->device_name);
+
+  parent_class = (GTypeClass *) g_type_class_peek_parent (klass);
+
+  gst_element_class_set_metadata (elem_class, display_name,
+      "Codec/Decoder/Audio", "Microsoft Media Foundation AAC Decoder",
+      "Seungha Yang <seungha@centricular.com>");
+  g_free (display_name);
+
+  gst_element_class_add_static_pad_template (elem_class, &sink_template);
+  gst_element_class_add_static_pad_template (elem_class, &src_template);
+
+  /* Subclass hook plus the values the base class uses to look up the MFT */
+  mfdec_class->set_format = GST_DEBUG_FUNCPTR (gst_mf_aac_dec_set_format);
+  mfdec_class->codec_id = MFAudioFormat_AAC;
+  mfdec_class->enum_flags = params->enum_flags;
+  mfdec_class->device_index = params->device_index;
+
+  g_free (params->device_name);
+  g_free (params);
+}
+
+static void
+gst_mf_aac_dec_init (GstMFAacDec * self)
+{                               /* no per-instance state to set up */
+}
+
+/* Portion of HEAACWAVEINFO struct after wfx field
+ * plus 2 bytes AudioSpecificConfig() */
+typedef struct
+{
+  WORD wPayloadType;
+  WORD wAudioProfileLevelIndication;    /* set_format() stores 0xfe here */
+  WORD wStructType;
+  WORD wReserved1;
+  DWORD dwReserved2;
+  /* First two bytes of codec_data, copied in by set_format() */
+  WORD AudioSpecificConfig;
+} AACWaveInfo;
+
+/* GstMFAudioDecoderClass::set_format implementation for AAC.
+ *
+ * Builds the MFT input type from @caps (channels/rate plus the first two
+ * bytes of codec_data, i.e. AudioSpecificConfig), selects the first 16-bit
+ * PCM output type offered by @transform and forwards the resulting audio
+ * info to the GstAudioDecoder base class.
+ *
+ * Returns: TRUE on success, FALSE if any step fails */
+static gboolean
+gst_mf_aac_dec_set_format (GstMFAudioDecoder * decoder,
+    GstMFTransform * transform, GstCaps * caps)
+{
+  GstMFAacDec *self = (GstMFAacDec *) decoder;
+  HRESULT hr;
+  const GValue *value;
+  GstStructure *structure;
+  GstBuffer *codec_data;
+  ComPtr < IMFMediaType > in_type;
+  ComPtr < IMFMediaType > out_type;
+  AACWaveInfo wave_info;
+  GstMapInfo map_info;
+  guint channels, rate;
+  gint caps_val = 0;
+  GstAudioInfo out_audio_info;
+  GList *output_list, *iter;
+  GstCaps *out_caps;
+
+  /* wave_info must hold the 12 bytes HEAACWAVEINFO tail + 2 bytes of ASC */
+  G_STATIC_ASSERT (sizeof (AACWaveInfo) >= 14);
+
+  structure = gst_caps_get_structure (caps, 0);
+  value = gst_structure_get_value (structure, "codec_data");
+  if (!value) {
+    GST_ERROR_OBJECT (self, "Missing codec_data");
+    return FALSE;
+  }
+
+  codec_data = gst_value_get_buffer (value);
+  if (!codec_data || gst_buffer_get_size (codec_data) < 2) {
+    GST_ERROR_OBJECT (self, "Invalid codec_data");
+    return FALSE;
+  }
+
+  if (!gst_buffer_map (codec_data, &map_info, GST_MAP_READ)) {
+    GST_ERROR_OBJECT (self, "Invalid codec_data buffer");
+    return FALSE;
+  }
+
+  channels = gst_codec_utils_aac_get_channels (map_info.data, map_info.size);
+  rate = gst_codec_utils_aac_get_sample_rate (map_info.data, map_info.size);
+
+  memset (&wave_info, 0, sizeof (AACWaveInfo));
+  wave_info.wAudioProfileLevelIndication = 0xfe;
+  memcpy (&wave_info.AudioSpecificConfig, map_info.data, 2);
+
+  /* Everything needed has been copied out; unmap now so that none of the
+   * early returns below can leak the mapping */
+  gst_buffer_unmap (codec_data, &map_info);
+
+  /* Fallback to channels/rate values specified in caps. The caps are
+   * audio/mpeg, which gst_audio_info_from_caps() (raw audio only) rejects,
+   * so the fields are read from the structure directly */
+  if (channels == 0 &&
+      gst_structure_get_int (structure, "channels", &caps_val) && caps_val > 0)
+    channels = (guint) caps_val;
+
+  if (rate == 0 &&
+      gst_structure_get_int (structure, "rate", &caps_val) && caps_val > 0)
+    rate = (guint) caps_val;
+
+  if (channels == 0 || rate == 0) {
+    GST_ERROR_OBJECT (self, "Unknown channels or rate");
+    return FALSE;
+  }
+
+  hr = MFCreateMediaType (&in_type);
+  if (!gst_mf_result (hr))
+    return FALSE;
+
+  hr = in_type->SetGUID (MF_MT_MAJOR_TYPE, MFMediaType_Audio);
+  if (SUCCEEDED (hr))
+    hr = in_type->SetGUID (MF_MT_SUBTYPE, MFAudioFormat_AAC);
+  if (SUCCEEDED (hr))
+    hr = in_type->SetUINT32 (MF_MT_AAC_PAYLOAD_TYPE, 0);
+  if (SUCCEEDED (hr))
+    hr = in_type->SetUINT32 (MF_MT_AUDIO_NUM_CHANNELS, channels);
+  if (SUCCEEDED (hr))
+    hr = in_type->SetUINT32 (MF_MT_AUDIO_SAMPLES_PER_SECOND, rate);
+  /* FIXME: should parse this somehow? */
+  if (SUCCEEDED (hr))
+    hr = in_type->SetUINT32 (MF_MT_AAC_AUDIO_PROFILE_LEVEL_INDICATION, 0xfe);
+  /* 12 bytes of HEAACWAVEINFO tail followed by the 2 bytes of ASC */
+  if (SUCCEEDED (hr))
+    hr = in_type->SetBlob (MF_MT_USER_DATA, (UINT8 *) & wave_info, 14);
+  if (!gst_mf_result (hr))
+    return FALSE;
+
+  if (!gst_mf_transform_set_input_type (transform, in_type.Get ())) {
+    GST_ERROR_OBJECT (self, "Failed to set format");
+    return FALSE;
+  }
+
+  if (!gst_mf_transform_get_output_available_types (transform, &output_list)) {
+    GST_ERROR_OBJECT (self, "Failed to get output types");
+    return FALSE;
+  }
+
+  /* Pick the first 16-bit PCM audio type offered by the MFT */
+  for (iter = output_list; iter; iter = g_list_next (iter)) {
+    GUID guid;
+    IMFMediaType *type = (IMFMediaType *) iter->data;
+    UINT32 bps;
+
+    hr = type->GetGUID (MF_MT_MAJOR_TYPE, &guid);
+    if (!gst_mf_result (hr) || !IsEqualGUID (guid, MFMediaType_Audio))
+      continue;
+
+    hr = type->GetGUID (MF_MT_SUBTYPE, &guid);
+    if (!gst_mf_result (hr) || !IsEqualGUID (guid, MFAudioFormat_PCM))
+      continue;
+
+    hr = type->GetUINT32 (MF_MT_AUDIO_BITS_PER_SAMPLE, &bps);
+    if (!gst_mf_result (hr) || bps != 16)
+      continue;
+
+    out_type = type;
+    break;
+  }
+
+  /* out_type (ComPtr) keeps its own reference to the selected type */
+  g_list_free_full (output_list, (GDestroyNotify) gst_mf_media_type_release);
+
+  if (!out_type) {
+    GST_ERROR_OBJECT (self, "Failed to select output type");
+    return FALSE;
+  }
+
+  if (!gst_mf_transform_set_output_type (transform, out_type.Get ())) {
+    GST_ERROR_OBJECT (self, "Failed to select output type");
+    return FALSE;
+  }
+
+  out_caps = gst_mf_media_type_to_caps (out_type.Get ());
+  if (!out_caps) {
+    GST_ERROR_OBJECT (self, "Failed to get output caps");
+    return FALSE;
+  }
+
+  GST_DEBUG_OBJECT (self, "Output caps %" GST_PTR_FORMAT, out_caps);
+
+  if (!gst_audio_info_from_caps (&out_audio_info, out_caps)) {
+    GST_ERROR_OBJECT (self,
+        "Failed to convert caps to audio info %" GST_PTR_FORMAT, out_caps);
+    gst_caps_unref (out_caps);
+    /* Bail out: out_caps is already released and out_audio_info is unset */
+    return FALSE;
+  }
+
+  gst_caps_unref (out_caps);
+
+  return gst_audio_decoder_set_output_format (GST_AUDIO_DECODER (self),
+      &out_audio_info);
+}
+
+static void
+gst_mf_aac_dec_register (GstPlugin * plugin, guint rank,
+    const gchar * device_name, guint32 enum_flags, guint device_index)
+{
+  /* Freed by gst_mf_aac_dec_class_init(), which receives it as class_data */
+  GstMFAudioDecoderClassData *class_data =
+      g_new0 (GstMFAudioDecoderClassData, 1);
+  GTypeInfo info = {
+    sizeof (GstMFAacDecClass),  /* class_size */
+    nullptr,                    /* base_init */
+    nullptr,                    /* base_finalize */
+    (GClassInitFunc) gst_mf_aac_dec_class_init,
+    nullptr,                    /* class_finalize */
+    class_data,                 /* class_data */
+    sizeof (GstMFAacDec),       /* instance_size */
+    0,                          /* n_preallocs */
+    (GInstanceInitFunc) gst_mf_aac_dec_init,
+  };
+  GType dec_type;
+
+  class_data->device_name = g_strdup (device_name);
+  class_data->enum_flags = enum_flags;
+  class_data->device_index = device_index;
+
+  dec_type = g_type_register_static (GST_TYPE_MF_AUDIO_DECODER,
+      "GstMFAacDec", &info, (GTypeFlags) 0);
+
+  if (!gst_element_register (plugin, "mfaacdec", rank, dec_type))
+    GST_WARNING ("Failed to register plugin");
+}
+
+static gboolean
+gst_mf_aac_dec_plugin_init_internal (GstPlugin * plugin, guint rank,
+    GstMFTransform * transform, guint device_index, guint32 enum_flags)
+{
+  gchar *mft_name = nullptr;
+
+  /* Skip transforms that fail to open or do not report a device name */
+  if (!gst_mf_transform_open (transform))
+    return FALSE;
+
+  g_object_get (transform, "device-name", &mft_name, nullptr);
+  if (mft_name) {
+    gst_mf_aac_dec_register (plugin, rank, mft_name, enum_flags, device_index);
+    g_free (mft_name);
+    return TRUE;
+  }
+
+  GST_WARNING_OBJECT (transform, "Unknown device name");
+  return FALSE;
+}
+
+void
+gst_mf_aac_dec_plugin_init (GstPlugin * plugin, guint rank)
+{
+  MFT_REGISTER_TYPE_INFO aac_input;
+  GstMFTransformEnumParams params = { 0, };
+  gboolean registered = FALSE;
+
+  GST_DEBUG_CATEGORY_INIT (gst_mf_aac_dec_debug, "mfaacdec", 0, "mfaacdec");
+
+  /* Enumerate audio decoder MFTs that take AAC as input */
+  aac_input.guidMajorType = MFMediaType_Audio;
+  aac_input.guidSubtype = MFAudioFormat_AAC;
+
+  params.category = MFT_CATEGORY_AUDIO_DECODER;
+  params.input_typeinfo = &aac_input;
+  params.enum_flags = (MFT_ENUM_FLAG_SYNCMFT |
+      MFT_ENUM_FLAG_SORTANDFILTER | MFT_ENUM_FLAG_SORTANDFILTER_APPROVED_ONLY);
+
+  /* Try device indices in ascending order. The walk ends as soon as
+   * gst_mf_aac_dec_plugin_init_internal() succeeds for one MFT, or when
+   * no transform can be created for the current index. */
+  for (gint idx = 0; !registered; idx++) {
+    GstMFTransform *mft;
+
+    params.device_index = idx;
+    mft = gst_mf_transform_new (&params);
+    if (!mft)
+      break;
+
+    /* FALSE here means this MFT was skipped, so move on to the next one */
+    registered = gst_mf_aac_dec_plugin_init_internal (plugin, rank, mft,
+        params.device_index, params.enum_flags);
+
+    gst_clear_object (&mft);
+  }
+}
diff --git a/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaacdec.h b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaacdec.h
new file mode 100644
index 0000000..94f9fb9
--- /dev/null
+++ b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaacdec.h
@@ -0,0 +1,30 @@
+/* GStreamer
+ * Copyright (C) 2022 Seungha Yang <seungha@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#pragma once
+
+#include <gst/gst.h>
+
+G_BEGIN_DECLS
+
+void gst_mf_aac_dec_plugin_init (GstPlugin * plugin,
+                                 guint rank);
+
+G_END_DECLS
+
diff --git a/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaudiodecoder.cpp b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaudiodecoder.cpp
new file mode 100644
index 0000000..7064d87
--- /dev/null
+++ b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaudiodecoder.cpp
@@ -0,0 +1,285 @@
+/* GStreamer
+ * Copyright (C) 2022 Seungha Yang <seungha@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <gst/gst.h>
+#include "gstmfaudiodecoder.h"
+#include <wrl.h>
+#include <string.h>
+
+/* *INDENT-OFF* */
+using namespace Microsoft::WRL;
+/* *INDENT-ON* */
+
+GST_DEBUG_CATEGORY (gst_mf_audio_decoder_debug);
+#define GST_CAT_DEFAULT gst_mf_audio_decoder_debug
+
+#define gst_mf_audio_decoder_parent_class parent_class
+G_DEFINE_ABSTRACT_TYPE_WITH_CODE (GstMFAudioDecoder, gst_mf_audio_decoder,
+    GST_TYPE_AUDIO_DECODER,
+    GST_DEBUG_CATEGORY_INIT (gst_mf_audio_decoder_debug, "mfaudiodecoder", 0,
+        "mfaudiodecoder"));
+
+static gboolean gst_mf_audio_decoder_open (GstAudioDecoder * dec);
+static gboolean gst_mf_audio_decoder_close (GstAudioDecoder * dec);
+static gboolean gst_mf_audio_decoder_set_format (GstAudioDecoder * dec,
+    GstCaps * caps);
+static GstFlowReturn gst_mf_audio_decoder_handle_frame (GstAudioDecoder * dec,
+    GstBuffer * buffer);
+static GstFlowReturn gst_mf_audio_decoder_drain (GstAudioDecoder * dec);
+static void gst_mf_audio_decoder_flush (GstAudioDecoder * dec, gboolean hard);
+
+static void
+gst_mf_audio_decoder_class_init (GstMFAudioDecoderClass * klass)
+{
+  GstAudioDecoderClass *base = GST_AUDIO_DECODER_CLASS (klass);
+
+  /* No separate drain hook is installed: draining is reached through
+   * handle_frame(), which treats a NULL buffer as a drain request */
+  base->handle_frame = GST_DEBUG_FUNCPTR (gst_mf_audio_decoder_handle_frame);
+  base->set_format = GST_DEBUG_FUNCPTR (gst_mf_audio_decoder_set_format);
+  base->flush = GST_DEBUG_FUNCPTR (gst_mf_audio_decoder_flush);
+  base->close = GST_DEBUG_FUNCPTR (gst_mf_audio_decoder_close);
+  base->open = GST_DEBUG_FUNCPTR (gst_mf_audio_decoder_open);
+
+  gst_type_mark_as_plugin_api (GST_TYPE_MF_AUDIO_DECODER,
+      (GstPluginAPIFlags) 0);
+}
+
+static void
+gst_mf_audio_decoder_init (GstMFAudioDecoder * self)
+{                               /* opt in to drain requests from the base class */
+  gst_audio_decoder_set_drainable (GST_AUDIO_DECODER (self), TRUE);
+}
+
+static gboolean
+gst_mf_audio_decoder_open (GstAudioDecoder * dec)
+{
+  GstMFAudioDecoderClass *dec_class = GST_MF_AUDIO_DECODER_GET_CLASS (dec);
+  GstMFAudioDecoder *self = GST_MF_AUDIO_DECODER (dec);
+  MFT_REGISTER_TYPE_INFO codec_type;
+  GstMFTransformEnumParams query = { 0, };
+
+  /* Describe the MFT to create using the values kept in the class struct */
+  codec_type.guidSubtype = dec_class->codec_id;
+  codec_type.guidMajorType = MFMediaType_Audio;
+
+  query.input_typeinfo = &codec_type;
+  query.device_index = dec_class->device_index;
+  query.enum_flags = dec_class->enum_flags;
+  query.category = MFT_CATEGORY_AUDIO_DECODER;
+
+  GST_DEBUG_OBJECT (self, "Create MFT with enum flags 0x%x, device index %d",
+      dec_class->enum_flags, dec_class->device_index);
+
+  self->transform = gst_mf_transform_new (&query);
+  if (self->transform)
+    return TRUE;
+
+  GST_ERROR_OBJECT (self, "Cannot create MFT object");
+  return FALSE;
+}
+
+static gboolean
+gst_mf_audio_decoder_close (GstAudioDecoder * dec)
+{
+  GstMFAudioDecoder *mfdec = GST_MF_AUDIO_DECODER (dec);
+
+  /* Release the MFT created in open(); a NULL transform is fine here */
+  gst_clear_object (&mfdec->transform);
+  return TRUE;
+}
+
+static gboolean
+gst_mf_audio_decoder_set_format (GstAudioDecoder * dec, GstCaps * caps)
+{
+  GstMFAudioDecoderClass *dec_class = GST_MF_AUDIO_DECODER_GET_CLASS (dec);
+  GstMFAudioDecoder *self = GST_MF_AUDIO_DECODER (dec);
+
+  /* Subclasses are required to provide the set_format() hook */
+  g_assert (dec_class->set_format != nullptr);
+  GST_DEBUG_OBJECT (self, "Set format");
+
+  /* Push out pending output first; the drain result is ignored here */
+  gst_mf_audio_decoder_drain (dec);
+  if (!gst_mf_transform_open (self->transform)) {
+    GST_ERROR_OBJECT (self, "Failed to open MFT");
+    return FALSE;
+  }
+
+  /* Input/output media type setup is delegated to the subclass */
+  if (dec_class->set_format (self, self->transform, caps))
+    return TRUE;
+
+  GST_ERROR_OBJECT (self, "Failed to set format");
+  return FALSE;
+}
+
+static gboolean
+gst_mf_audio_decoder_process_input (GstMFAudioDecoder * self,
+    GstBuffer * buffer)
+{
+  ComPtr < IMFMediaBuffer > mf_buffer;
+  ComPtr < IMFSample > mf_sample;
+  GstMapInfo map;
+  gboolean ok = FALSE;
+  BYTE *dst;
+  HRESULT hr;
+
+  if (!gst_buffer_map (buffer, &map, GST_MAP_READ)) {
+    GST_ELEMENT_ERROR (self,
+        RESOURCE, READ, ("Couldn't map input buffer"), (nullptr));
+    return FALSE;
+  }
+
+  GST_TRACE_OBJECT (self, "Process buffer %" GST_PTR_FORMAT, buffer);
+
+  /* Wrap a copy of the input bytes in an IMFSample for the transform */
+  hr = MFCreateSample (&mf_sample);
+  if (!gst_mf_result (hr))
+    goto done;
+
+  hr = MFCreateMemoryBuffer (map.size, &mf_buffer);
+  if (!gst_mf_result (hr))
+    goto done;
+
+  hr = mf_buffer->Lock (&dst, nullptr, nullptr);
+  if (!gst_mf_result (hr))
+    goto done;
+
+  memcpy (dst, map.data, map.size);
+  mf_buffer->Unlock ();
+
+  hr = mf_buffer->SetCurrentLength (map.size);
+  if (!gst_mf_result (hr))
+    goto done;
+
+  hr = mf_sample->AddBuffer (mf_buffer.Get ());
+  if (!gst_mf_result (hr))
+    goto done;
+
+  if (!gst_mf_transform_process_input (self->transform, mf_sample.Get ())) {
+    GST_ERROR_OBJECT (self, "Failed to process input");
+    goto done;
+  }
+
+  ok = TRUE;
+
+done:
+  gst_buffer_unmap (buffer, &map);
+  return ok;
+}
+
+static GstFlowReturn
+gst_mf_audio_decoder_process_output (GstMFAudioDecoder * self)
+{
+  GstAudioDecoder *audiodec = GST_AUDIO_DECODER (self);
+  ComPtr < IMFSample > mf_sample;
+  ComPtr < IMFMediaBuffer > mf_buffer;
+  GstFlowReturn flow;
+  GstBuffer *outbuf;
+  BYTE *out_data = nullptr;
+  DWORD out_size = 0;
+  HRESULT hr;
+
+  /* Anything but OK (including "need more data") goes back to the caller */
+  flow = gst_mf_transform_get_output (self->transform, &mf_sample);
+  if (flow != GST_FLOW_OK)
+    return flow;
+
+  hr = mf_sample->GetBufferByIndex (0, &mf_buffer);
+  if (!gst_mf_result (hr))
+    return GST_FLOW_ERROR;
+
+  hr = mf_buffer->Lock (&out_data, nullptr, &out_size);
+  if (!gst_mf_result (hr))
+    return GST_FLOW_ERROR;
+
+  /* Can happen while draining */
+  if (out_size == 0 || !out_data) {
+    GST_DEBUG_OBJECT (self, "Empty media buffer");
+    mf_buffer->Unlock ();
+    return GST_FLOW_OK;
+  }
+
+  /* Copy the decoded bytes into a GstBuffer and finish it as one frame */
+  outbuf = gst_audio_decoder_allocate_output_buffer (audiodec, out_size);
+  gst_buffer_fill (outbuf, 0, out_data, out_size);
+  mf_buffer->Unlock ();
+  return gst_audio_decoder_finish_frame (audiodec, outbuf, 1);
+}
+
+static GstFlowReturn
+gst_mf_audio_decoder_handle_frame (GstAudioDecoder * dec, GstBuffer * buffer)
+{
+  GstMFAudioDecoder *self = GST_MF_AUDIO_DECODER (dec);
+  GstFlowReturn flow = GST_FLOW_OK;
+
+  /* A NULL buffer is a request to drain */
+  if (!buffer)
+    return gst_mf_audio_decoder_drain (dec);
+
+  if (!gst_mf_audio_decoder_process_input (self, buffer)) {
+    GST_ERROR_OBJECT (self, "Failed to process input");
+    return GST_FLOW_ERROR;
+  }
+
+  /* Pull output until the transform stops returning OK */
+  while (flow == GST_FLOW_OK)
+    flow = gst_mf_audio_decoder_process_output (self);
+
+  /* Asking for more input is the regular end of the loop, not an error;
+   * any other value is passed on as is */
+  return (flow == GST_MF_TRANSFORM_FLOW_NEED_DATA) ? GST_FLOW_OK : flow;
+}
+
+static GstFlowReturn
+gst_mf_audio_decoder_drain (GstAudioDecoder * dec)
+{
+  GstMFAudioDecoder *self = GST_MF_AUDIO_DECODER (dec);
+  GstFlowReturn flow = GST_FLOW_OK;
+
+  /* Nothing to drain when no transform has been created */
+  if (!self->transform)
+    return flow;
+
+  gst_mf_transform_drain (self->transform);
+
+  /* Collect everything the transform still has to output */
+  while (flow == GST_FLOW_OK)
+    flow = gst_mf_audio_decoder_process_output (self);
+
+  if (flow == GST_MF_TRANSFORM_FLOW_NEED_DATA)
+    return GST_FLOW_OK;
+  return flow;
+}
+
+static void
+gst_mf_audio_decoder_flush (GstAudioDecoder * dec, gboolean hard)
+{
+  GstMFAudioDecoder *mfdec = GST_MF_AUDIO_DECODER (dec);
+
+  /* @hard is not consulted: the flush is simply forwarded to the MFT,
+   * provided one has been created */
+  if (mfdec->transform)
+    gst_mf_transform_flush (mfdec->transform);
+}
diff --git a/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaudiodecoder.h b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaudiodecoder.h
new file mode 100644
index 0000000..3e54d82
--- /dev/null
+++ b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfaudiodecoder.h
@@ -0,0 +1,69 @@
+/* GStreamer
+ * Copyright (C) 2022 Seungha Yang <seungha@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#pragma once
+
+#include <gst/gst.h>
+#include <gst/audio/audio.h>
+#include "gstmfutils.h"
+#include "gstmftransform.h"
+
+G_BEGIN_DECLS
+
+#define GST_TYPE_MF_AUDIO_DECODER           (gst_mf_audio_decoder_get_type())
+#define GST_MF_AUDIO_DECODER(obj)           (G_TYPE_CHECK_INSTANCE_CAST((obj),GST_TYPE_MF_AUDIO_DECODER,GstMFAudioDecoder))
+#define GST_MF_AUDIO_DECODER_CLASS(klass)   (G_TYPE_CHECK_CLASS_CAST((klass), GST_TYPE_MF_AUDIO_DECODER,GstMFAudioDecoderClass))
+#define GST_MF_AUDIO_DECODER_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS((obj), GST_TYPE_MF_AUDIO_DECODER,GstMFAudioDecoderClass))
+#define GST_IS_MF_AUDIO_DECODER(obj)        (G_TYPE_CHECK_INSTANCE_TYPE((obj),GST_TYPE_MF_AUDIO_DECODER))
+#define GST_IS_MF_AUDIO_DECODER_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE((klass), GST_TYPE_MF_AUDIO_DECODER))
+
+typedef struct _GstMFAudioDecoder GstMFAudioDecoder;
+typedef struct _GstMFAudioDecoderClass GstMFAudioDecoderClass;
+
+typedef struct
+{
+  gchar * device_name;  /* heap copy; the AAC subclass class_init frees it */
+  guint32 enum_flags;   /* copied into GstMFAudioDecoderClass.enum_flags */
+  guint device_index;   /* copied into GstMFAudioDecoderClass.device_index */
+} GstMFAudioDecoderClassData;
+
+struct _GstMFAudioDecoder
+{
+  GstAudioDecoder parent;
+  /* MFT wrapper: created in open(), released in close() */
+  GstMFTransform *transform;
+};
+
+struct _GstMFAudioDecoderClass
+{
+  GstAudioDecoderClass parent_class;
+  /* Filled in by the subclass class_init; open() uses them to find the MFT */
+  GUID codec_id;
+  guint32 enum_flags;
+  guint device_index;
+  /* Mandatory hook (asserted non-NULL): set up @transform for @caps */
+  gboolean (*set_format)      (GstMFAudioDecoder * decoder,
+                               GstMFTransform * transform,
+                               GstCaps * caps);
+};
+
+GType gst_mf_audio_decoder_get_type (void);
+
+G_END_DECLS
+
diff --git a/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfutils.cpp b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfutils.cpp
index eac6610..3f85e04 100644
--- a/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfutils.cpp
+++ b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfutils.cpp
@@ -343,6 +343,198 @@ gst_mf_media_type_to_video_caps (IMFMediaType * media_type)
   return caps;
 }
 
+/* Desktop only defines; each one is guarded so an SDK definition wins */
+#ifndef KSAUDIO_SPEAKER_MONO
+#define KSAUDIO_SPEAKER_MONO            (SPEAKER_FRONT_CENTER)
+#endif
+#ifndef KSAUDIO_SPEAKER_1POINT1
+#define KSAUDIO_SPEAKER_1POINT1         (SPEAKER_FRONT_CENTER | SPEAKER_LOW_FREQUENCY)
+#endif
+#ifndef KSAUDIO_SPEAKER_STEREO
+#define KSAUDIO_SPEAKER_STEREO          (SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT)
+#endif
+#ifndef KSAUDIO_SPEAKER_2POINT1
+#define KSAUDIO_SPEAKER_2POINT1         (SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_LOW_FREQUENCY)
+#endif
+#ifndef KSAUDIO_SPEAKER_3POINT0
+#define KSAUDIO_SPEAKER_3POINT0         (SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_FRONT_CENTER)
+#endif
+#ifndef KSAUDIO_SPEAKER_3POINT1
+#define KSAUDIO_SPEAKER_3POINT1         (SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | \
+                                         SPEAKER_FRONT_CENTER | SPEAKER_LOW_FREQUENCY)
+#endif
+#ifndef KSAUDIO_SPEAKER_QUAD
+#define KSAUDIO_SPEAKER_QUAD            (SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_BACK_LEFT | SPEAKER_BACK_RIGHT)
+#endif
+#ifndef KSAUDIO_SPEAKER_SURROUND
+#define KSAUDIO_SPEAKER_SURROUND        (SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | \
+                                         SPEAKER_FRONT_CENTER | SPEAKER_BACK_CENTER)
+#endif
+#ifndef KSAUDIO_SPEAKER_5POINT0
+#define KSAUDIO_SPEAKER_5POINT0         (SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_FRONT_CENTER | \
+                                         SPEAKER_SIDE_LEFT  | SPEAKER_SIDE_RIGHT)
+#endif
+#ifndef KSAUDIO_SPEAKER_5POINT1
+#define KSAUDIO_SPEAKER_5POINT1         (SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_FRONT_CENTER | \
+                                         SPEAKER_LOW_FREQUENCY | SPEAKER_BACK_LEFT | SPEAKER_BACK_RIGHT)
+#endif
+#ifndef KSAUDIO_SPEAKER_7POINT0
+#define KSAUDIO_SPEAKER_7POINT0         (SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_FRONT_CENTER | \
+                                         SPEAKER_BACK_LEFT | SPEAKER_BACK_RIGHT | SPEAKER_SIDE_LEFT | SPEAKER_SIDE_RIGHT)
+#endif
+#ifndef KSAUDIO_SPEAKER_7POINT1
+#define KSAUDIO_SPEAKER_7POINT1         (SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT | SPEAKER_FRONT_CENTER | \
+                                         SPEAKER_LOW_FREQUENCY | SPEAKER_BACK_LEFT | SPEAKER_BACK_RIGHT | \
+                                         SPEAKER_FRONT_LEFT_OF_CENTER | SPEAKER_FRONT_RIGHT_OF_CENTER)
+#endif
+
+static struct                   /* one SPEAKER_* bit and its GStreamer position */
+{
+  guint64 mf_pos;
+  GstAudioChannelPosition gst_pos;
+} mf_to_gst_pos[] = {           /* walked in this order when decoding a mask */
+  {SPEAKER_FRONT_LEFT, GST_AUDIO_CHANNEL_POSITION_FRONT_LEFT},
+  {SPEAKER_FRONT_RIGHT, GST_AUDIO_CHANNEL_POSITION_FRONT_RIGHT},
+  {SPEAKER_FRONT_CENTER, GST_AUDIO_CHANNEL_POSITION_FRONT_CENTER},
+  {SPEAKER_LOW_FREQUENCY, GST_AUDIO_CHANNEL_POSITION_LFE1},
+  {SPEAKER_BACK_LEFT, GST_AUDIO_CHANNEL_POSITION_REAR_LEFT},
+  {SPEAKER_BACK_RIGHT, GST_AUDIO_CHANNEL_POSITION_REAR_RIGHT},
+  {SPEAKER_FRONT_LEFT_OF_CENTER,
+      GST_AUDIO_CHANNEL_POSITION_FRONT_LEFT_OF_CENTER},
+  {SPEAKER_FRONT_RIGHT_OF_CENTER,
+      GST_AUDIO_CHANNEL_POSITION_FRONT_RIGHT_OF_CENTER},
+  {SPEAKER_BACK_CENTER, GST_AUDIO_CHANNEL_POSITION_REAR_CENTER},
+  /* Enum values diverge from this point onwards */
+  {SPEAKER_SIDE_LEFT, GST_AUDIO_CHANNEL_POSITION_SIDE_LEFT},
+  {SPEAKER_SIDE_RIGHT, GST_AUDIO_CHANNEL_POSITION_SIDE_RIGHT},
+  {SPEAKER_TOP_CENTER, GST_AUDIO_CHANNEL_POSITION_TOP_CENTER},
+  {SPEAKER_TOP_FRONT_LEFT, GST_AUDIO_CHANNEL_POSITION_TOP_FRONT_LEFT},
+  {SPEAKER_TOP_FRONT_CENTER, GST_AUDIO_CHANNEL_POSITION_TOP_FRONT_CENTER},
+  {SPEAKER_TOP_FRONT_RIGHT, GST_AUDIO_CHANNEL_POSITION_TOP_FRONT_RIGHT},
+  {SPEAKER_TOP_BACK_LEFT, GST_AUDIO_CHANNEL_POSITION_TOP_REAR_LEFT},
+  {SPEAKER_TOP_BACK_CENTER, GST_AUDIO_CHANNEL_POSITION_TOP_REAR_CENTER},
+  {SPEAKER_TOP_BACK_RIGHT, GST_AUDIO_CHANNEL_POSITION_TOP_REAR_RIGHT}
+};
+
+/* *INDENT-OFF* */
+static DWORD default_ch_masks[] = {    /* indexed by channel count, 0 to 8 */
+  0,                                   /* 0ch: placeholder entry */
+  KSAUDIO_SPEAKER_MONO,                /* 1ch */
+  /* 2ch */
+  KSAUDIO_SPEAKER_STEREO,
+  /* 3ch, treated as 2.1 */
+  /* KSAUDIO_SPEAKER_3POINT0 ? */
+  KSAUDIO_SPEAKER_2POINT1,
+  /* 4ch */
+  /* KSAUDIO_SPEAKER_3POINT1 or KSAUDIO_SPEAKER_SURROUND ? */
+  KSAUDIO_SPEAKER_QUAD,
+  /* 5ch */
+  KSAUDIO_SPEAKER_5POINT0,
+  /* 6ch, treated as 5.1 */
+  KSAUDIO_SPEAKER_5POINT1,
+  /* 7ch */
+  KSAUDIO_SPEAKER_7POINT0,
+  /* 8ch, treated as 7.1 */
+  KSAUDIO_SPEAKER_7POINT1,
+};
+/* *INDENT-ON* */
+
+static void
+gst_mf_media_audio_channel_mask_to_position (guint channels, DWORD mask,
+    GstAudioChannelPosition * position)
+{
+  guint filled = 0;
+  /* One slot of @position per speaker bit set in @mask, in table order,
+   * never more than @channels; remaining slots are left untouched */
+  for (guint idx = 0; idx < G_N_ELEMENTS (mf_to_gst_pos); idx++) {
+    if (filled >= channels)
+      break;
+    if ((mask & mf_to_gst_pos[idx].mf_pos) != 0)
+      position[filled++] = mf_to_gst_pos[idx].gst_pos;
+  }
+}
+
+static GstCaps *
+gst_mf_media_type_to_audio_caps (IMFMediaType * media_type)
+{
+  GstAudioChannelPosition position[64];
+  GstAudioFormat format = GST_AUDIO_FORMAT_UNKNOWN;
+  GstAudioInfo info;
+  UINT32 bps, rate, channels, mask;
+  gboolean is_pcm;
+  GUID subtype;
+  HRESULT hr;
+
+  hr = media_type->GetGUID (MF_MT_SUBTYPE, &subtype);
+  if (FAILED (hr)) {
+    GST_WARNING ("failed to get subtype, hr: 0x%x", (guint) hr);
+    return nullptr;
+  }
+
+  /* Only the PCM and Float subtypes can be converted */
+  is_pcm = IsEqualGUID (subtype, MFAudioFormat_PCM);
+  if (!is_pcm && !IsEqualGUID (subtype, MFAudioFormat_Float)) {
+    GST_FIXME ("Unknown subtype");
+    return nullptr;
+  }
+
+  hr = media_type->GetUINT32 (MF_MT_AUDIO_BITS_PER_SAMPLE, &bps);
+  if (FAILED (hr)) {
+    GST_WARNING ("Failed to get bps, hr: 0x%x", (guint) hr);
+    return nullptr;
+  }
+
+  /* PCM: signed LE integer, width == depth == bps; float: 32/64 bit only */
+  if (is_pcm)
+    format = gst_audio_format_build_integer (TRUE, G_LITTLE_ENDIAN, bps, bps);
+  else if (bps == 32)
+    format = GST_AUDIO_FORMAT_F32LE;
+  else if (bps == 64)
+    format = GST_AUDIO_FORMAT_F64LE;
+
+  if (format == GST_AUDIO_FORMAT_UNKNOWN) {
+    GST_WARNING ("Unknown audio format");
+    return nullptr;
+  }
+
+  hr = media_type->GetUINT32 (MF_MT_AUDIO_NUM_CHANNELS, &channels);
+  if (FAILED (hr) || channels == 0) {
+    GST_WARNING ("Unknown channels");
+    return nullptr;
+  }
+
+  hr = media_type->GetUINT32 (MF_MT_AUDIO_SAMPLES_PER_SECOND, &rate);
+  if (FAILED (hr) || rate == 0) {
+    GST_WARNING ("Unknown rate");
+    return nullptr;
+  }
+
+  for (guint i = 0; i < G_N_ELEMENTS (position); i++)
+    position[i] = GST_AUDIO_CHANNEL_POSITION_NONE;
+
+  /* Without an explicit channel mask, fall back to a default layout for
+   * up to 8 channels and give up beyond that */
+  hr = media_type->GetUINT32 (MF_MT_AUDIO_CHANNEL_MASK, &mask);
+  if (SUCCEEDED (hr)) {
+    gst_mf_media_audio_channel_mask_to_position (channels, mask, position);
+  } else if (channels == 1) {
+    position[0] = GST_AUDIO_CHANNEL_POSITION_MONO;
+  } else if (channels == 2) {
+    position[0] = GST_AUDIO_CHANNEL_POSITION_FRONT_LEFT;
+    position[1] = GST_AUDIO_CHANNEL_POSITION_FRONT_RIGHT;
+  } else if (channels <= 8) {
+    GST_WARNING ("Unknown channel position, use default value");
+    gst_mf_media_audio_channel_mask_to_position (channels,
+        default_ch_masks[channels], position);
+  } else {
+    GST_WARNING ("Failed to determine channel position");
+    return nullptr;
+  }
+
+  gst_audio_info_set_format (&info, format, rate, channels, position);
+  return gst_audio_info_to_caps (&info);
+}
+
 GstCaps *
 gst_mf_media_type_to_caps (IMFMediaType * media_type)
 {
@@ -357,8 +549,11 @@ gst_mf_media_type_to_caps (IMFMediaType * media_type)
     return nullptr;
   }
 
-  if (IsEqualGUID (major_type, MFMediaType_Video))
+  if (IsEqualGUID (major_type, MFMediaType_Video)) {
     return gst_mf_media_type_to_video_caps (media_type);
+  } else if (IsEqualGUID (major_type, MFMediaType_Audio)) {
+    return gst_mf_media_type_to_audio_caps (media_type);
+  }
 
   return nullptr;
 }
diff --git a/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfutils.h b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfutils.h
index d5e247c..7fd57d8 100644
--- a/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfutils.h
+++ b/subprojects/gst-plugins-bad/sys/mediafoundation/gstmfutils.h
@@ -23,6 +23,7 @@
 
 #include <gst/gst.h>
 #include <gst/video/video.h>
+#include <gst/audio/audio.h>
 
 #ifndef INITGUID
 #include <initguid.h>
@@ -75,4 +76,4 @@ void           _gst_mf_dump_attributes (IMFAttributes * attr,
 
 G_END_DECLS
 
-#endif /* __GST_MF_UTILS_H__ */
\ No newline at end of file
+#endif /* __GST_MF_UTILS_H__ */
diff --git a/subprojects/gst-plugins-bad/sys/mediafoundation/meson.build b/subprojects/gst-plugins-bad/sys/mediafoundation/meson.build
index b771d3a..af4e3b5 100644
--- a/subprojects/gst-plugins-bad/sys/mediafoundation/meson.build
+++ b/subprojects/gst-plugins-bad/sys/mediafoundation/meson.build
@@ -1,5 +1,7 @@
 mf_sources = [
+  'gstmfaacdec.cpp',
   'gstmfaacenc.cpp',
+  'gstmfaudiodecoder.cpp',
   'gstmfaudioencoder.cpp',
   'gstmfdevice.cpp',
   'gstmfh264enc.cpp',
diff --git a/subprojects/gst-plugins-bad/sys/mediafoundation/plugin.cpp b/subprojects/gst-plugins-bad/sys/mediafoundation/plugin.cpp
index d0c2d8c..f4e4fad 100644
--- a/subprojects/gst-plugins-bad/sys/mediafoundation/plugin.cpp
+++ b/subprojects/gst-plugins-bad/sys/mediafoundation/plugin.cpp
@@ -72,6 +72,7 @@
 #include "gstmfvp9enc.h"
 #include "gstmfaacenc.h"
 #include "gstmfmp3enc.h"
+#include "gstmfaacdec.h"
 
 #if GST_MF_HAVE_D3D11
 #include <gst/d3d11/gstd3d11.h>
@@ -237,6 +238,7 @@ plugin_init (GstPlugin * plugin)
 
   gst_mf_aac_enc_plugin_init (plugin, GST_RANK_SECONDARY);
   gst_mf_mp3_enc_plugin_init (plugin, GST_RANK_SECONDARY);
+  gst_mf_aac_dec_plugin_init (plugin, GST_RANK_SECONDARY);
 
   /* So that call MFShutdown() when this plugin is no more used
    * (i.e., gst_deinit). Otherwise valgrind-like tools would complain
-- 
2.7.4