libavfilter/asrc_flite.c

   1 /*
   2  * Copyright (c) 2012 Stefano Sabatini
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /**
  22  * @file
  23  * flite voice synth source
  24  */
  25
  26 #include <flite/flite.h>
  27 #include "libavutil/channel_layout.h"
  28 #include "libavutil/file.h"
  29 #include "libavutil/opt.h"
  30 #include "libavutil/thread.h"
  31 #include "avfilter.h"
  32 #include "audio.h"
  33 #include "formats.h"
  34 #include "internal.h"
  35
  36 typedef struct FliteContext {
  37     const AVClass *class;
  38     char *voice_str;
  39     char *textfile;
  40     char *text;
  41     cst_wave *wave;
  42     int16_t *wave_samples;
  43     int      wave_nb_samples;
  44     int list_voices;
  45     cst_voice *voice;
  46     struct voice_entry *voice_entry;
  47     int64_t pts;
  48     int frame_nb_samples; ///< number of samples per frame
  49 } FliteContext;
  50
  51 #define OFFSET(x) offsetof(FliteContext, x)
  52 #define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
  53
  54 static const AVOption flite_options[] = {
  55     { "list_voices", "list voices and exit",              OFFSET(list_voices), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
  56     { "nb_samples",  "set number of samples per frame",   OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.i64=512}, 0, INT_MAX, FLAGS },
  57     { "n",           "set number of samples per frame",   OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.i64=512}, 0, INT_MAX, FLAGS },
  58     { "text",        "set text to speak",                 OFFSET(text),      AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS },
  59     { "textfile",    "set filename of the text to speak", OFFSET(textfile),  AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS },
  60     { "v",           "set voice",                         OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS },
  61     { "voice",       "set voice",                         OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS },
  62     { NULL }
  63 };
  64
  65 AVFILTER_DEFINE_CLASS(flite);
  66
  67 static AVMutex flite_mutex = AV_MUTEX_INITIALIZER;
  68
  69 static int flite_inited = 0;
  70
  71 /* declare functions for all the supported voices */
  72 #define DECLARE_REGISTER_VOICE_FN(name) \
  73     cst_voice *register_cmu_us_## name(const char *); \
  74     void     unregister_cmu_us_## name(cst_voice *)
  75 DECLARE_REGISTER_VOICE_FN(awb);
  76 DECLARE_REGISTER_VOICE_FN(kal);
  77 DECLARE_REGISTER_VOICE_FN(kal16);
  78 DECLARE_REGISTER_VOICE_FN(rms);
  79 DECLARE_REGISTER_VOICE_FN(slt);
  80
  81 struct voice_entry {
  82     const char *name;
  83     cst_voice * (*register_fn)(const char *);
  84     void (*unregister_fn)(cst_voice *);
  85     cst_voice *voice;
  86     unsigned usage_count;
  87 };
  88
  89 #define MAKE_VOICE_STRUCTURE(voice_name) {             \
  90     .name          =                      #voice_name, \
  91     .register_fn   =   register_cmu_us_ ## voice_name, \
  92     .unregister_fn = unregister_cmu_us_ ## voice_name, \
  93 }
  94 static struct voice_entry voice_entries[] = {
  95     MAKE_VOICE_STRUCTURE(awb),
  96     MAKE_VOICE_STRUCTURE(kal),
  97     MAKE_VOICE_STRUCTURE(kal16),
  98     MAKE_VOICE_STRUCTURE(rms),
  99     MAKE_VOICE_STRUCTURE(slt),
 100 };
 101
 102 static void list_voices(void *log_ctx, const char *sep)
 103 {
 104     int i, n = FF_ARRAY_ELEMS(voice_entries);
 105     for (i = 0; i < n; i++)
 106         av_log(log_ctx, AV_LOG_INFO, "%s%s",
 107                voice_entries[i].name, i < (n-1) ? sep : "\n");
 108 }
 109
 110 static int select_voice(struct voice_entry **entry_ret, const char *voice_name, void *log_ctx)
 111 {
 112     int i;
 113
 114     for (i = 0; i < FF_ARRAY_ELEMS(voice_entries); i++) {
 115         struct voice_entry *entry = &voice_entries[i];
 116         if (!strcmp(entry->name, voice_name)) {
 117             cst_voice *voice;
 118             pthread_mutex_lock(&flite_mutex);
 119             if (!entry->voice)
 120                 entry->voice = entry->register_fn(NULL);
 121             voice = entry->voice;
 122             if (voice)
 123                 entry->usage_count++;
 124             pthread_mutex_unlock(&flite_mutex);
 125             if (!voice) {
 126                 av_log(log_ctx, AV_LOG_ERROR,
 127                        "Could not register voice '%s'\n", voice_name);
 128                 return AVERROR_UNKNOWN;
 129             }
 130             *entry_ret = entry;
 131             return 0;
 132         }
 133     }
 134
 135     av_log(log_ctx, AV_LOG_ERROR, "Could not find voice '%s'\n", voice_name);
 136     av_log(log_ctx, AV_LOG_INFO, "Choose between the voices: ");
 137     list_voices(log_ctx, ", ");
 138
 139     return AVERROR(EINVAL);
 140 }
 141
 142 static av_cold int init(AVFilterContext *ctx)
 143 {
 144     FliteContext *flite = ctx->priv;
 145     int ret = 0;
 146
 147     if (flite->list_voices) {
 148         list_voices(ctx, "\n");
 149         return AVERROR_EXIT;
 150     }
 151
 152     pthread_mutex_lock(&flite_mutex);
 153     if (!flite_inited) {
 154         if ((ret = flite_init()) >= 0)
 155             flite_inited = 1;
 156     }
 157     pthread_mutex_unlock(&flite_mutex);
 158     if (ret < 0) {
 159         av_log(ctx, AV_LOG_ERROR, "flite initialization failed\n");
 160         return AVERROR_EXTERNAL;
 161     }
 162
 163     if ((ret = select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 0)
 164         return ret;
 165     flite->voice = flite->voice_entry->voice;
 166
 167     if (flite->textfile && flite->text) {
 168         av_log(ctx, AV_LOG_ERROR,
 169                "Both text and textfile options set: only one must be specified\n");
 170         return AVERROR(EINVAL);
 171     }
 172
 173     if (flite->textfile) {
 174         uint8_t *textbuf;
 175         size_t textbuf_size;
 176
 177         if ((ret = av_file_map(flite->textfile, &textbuf, &textbuf_size, 0, ctx)) < 0) {
 178             av_log(ctx, AV_LOG_ERROR,
 179                    "The text file '%s' could not be read: %s\n",
 180                    flite->textfile, av_err2str(ret));
 181             return ret;
 182         }
 183
 184         if (!(flite->text = av_malloc(textbuf_size+1))) {
 185             av_file_unmap(textbuf, textbuf_size);
 186             return AVERROR(ENOMEM);
 187         }
 188         memcpy(flite->text, textbuf, textbuf_size);
 189         flite->text[textbuf_size] = 0;
 190         av_file_unmap(textbuf, textbuf_size);
 191     }
 192
 193     if (!flite->text) {
 194         av_log(ctx, AV_LOG_ERROR,
 195                "No speech text specified, specify the 'text' or 'textfile' option\n");
 196         return AVERROR(EINVAL);
 197     }
 198
 199     /* synth all the file data in block */
 200     flite->wave = flite_text_to_wave(flite->text, flite->voice);
 201     flite->wave_samples    = flite->wave->samples;
 202     flite->wave_nb_samples = flite->wave->num_samples;
 203     return 0;
 204 }
 205
 206 static av_cold void uninit(AVFilterContext *ctx)
 207 {
 208     FliteContext *flite = ctx->priv;
 209
 210     if (flite->voice_entry) {
 211         pthread_mutex_lock(&flite_mutex);
 212         if (!--flite->voice_entry->usage_count) {
 213             flite->voice_entry->unregister_fn(flite->voice);
 214             flite->voice_entry->voice = NULL;
 215         }
 216         pthread_mutex_unlock(&flite_mutex);
 217     }
 218     delete_wave(flite->wave);
 219     flite->wave = NULL;
 220 }
 221
 222 static int query_formats(AVFilterContext *ctx)
 223 {
 224     FliteContext *flite = ctx->priv;
 225     int ret;
 226
 227     AVFilterChannelLayouts *chlayouts = NULL;
 228     AVFilterFormats *sample_formats = NULL;
 229     AVFilterFormats *sample_rates = NULL;
 230     AVChannelLayout chlayout = { 0 };
 231
 232     av_channel_layout_default(&chlayout, flite->wave->num_channels);
 233
 234     if ((ret = ff_add_channel_layout         (&chlayouts     , &chlayout               )) < 0 ||
 235         (ret = ff_set_common_channel_layouts (ctx            , chlayouts               )) < 0 ||
 236         (ret = ff_add_format                 (&sample_formats, AV_SAMPLE_FMT_S16       )) < 0 ||
 237         (ret = ff_set_common_formats         (ctx            , sample_formats          )) < 0 ||
 238         (ret = ff_add_format                 (&sample_rates  , flite->wave->sample_rate)) < 0 ||
 239         (ret = ff_set_common_samplerates     (ctx            , sample_rates            )) < 0)
 240         return ret;
 241
 242     return 0;
 243 }
 244
 245 static int config_props(AVFilterLink *outlink)
 246 {
 247     AVFilterContext *ctx = outlink->src;
 248     FliteContext *flite = ctx->priv;
 249
 250     outlink->sample_rate = flite->wave->sample_rate;
 251     outlink->time_base = (AVRational){1, flite->wave->sample_rate};
 252
 253     av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n",
 254            flite->voice_str,
 255            av_get_sample_fmt_name(outlink->format), outlink->sample_rate);
 256     return 0;
 257 }
 258
 259 static int request_frame(AVFilterLink *outlink)
 260 {
 261     AVFrame *samplesref;
 262     FliteContext *flite = outlink->src->priv;
 263     int nb_samples = FFMIN(flite->wave_nb_samples, flite->frame_nb_samples);
 264
 265     if (!nb_samples)
 266         return AVERROR_EOF;
 267
 268     samplesref = ff_get_audio_buffer(outlink, nb_samples);
 269     if (!samplesref)
 270         return AVERROR(ENOMEM);
 271
 272     memcpy(samplesref->data[0], flite->wave_samples,
 273            nb_samples * flite->wave->num_channels * 2);
 274     samplesref->pts = flite->pts;
 275 #if FF_API_FRAME_PKT
 276 FF_DISABLE_DEPRECATION_WARNINGS
 277     samplesref->pkt_pos = -1;
 278 FF_ENABLE_DEPRECATION_WARNINGS
 279 #endif
 280     samplesref->sample_rate = flite->wave->sample_rate;
 281     flite->pts += nb_samples;
 282     flite->wave_samples += nb_samples * flite->wave->num_channels;
 283     flite->wave_nb_samples -= nb_samples;
 284
 285     return ff_filter_frame(outlink, samplesref);
 286 }
 287
 288 static const AVFilterPad flite_outputs[] = {
 289     {
 290         .name          = "default",
 291         .type          = AVMEDIA_TYPE_AUDIO,
 292         .config_props  = config_props,
 293         .request_frame = request_frame,
 294     },
 295 };
 296
 297 const AVFilter ff_asrc_flite = {
 298     .name          = "flite",
 299     .description   = NULL_IF_CONFIG_SMALL("Synthesize voice from text using libflite."),
 300     .init          = init,
 301     .uninit        = uninit,
 302     .priv_size     = sizeof(FliteContext),
 303     .inputs        = NULL,
 304     FILTER_OUTPUTS(flite_outputs),
 305     FILTER_QUERY_FUNC(query_formats),
 306     .priv_class    = &flite_class,
 307 };