src/modules/UniSyn/UniSyn.h

   1 /*************************************************************************/
   2 /*                                                                       */
   3 /*                Centre for Speech Technology Research                  */
   4 /*                     University of Edinburgh, UK                       */
   5 /*                       Copyright (c) 1996,1997                         */
   6 /*                        All Rights Reserved.                           */
   7 /*                                                                       */
   8 /*  Permission is hereby granted, free of charge, to use and distribute  */
   9 /*  this software and its documentation without restriction, including   */
  10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
  11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
  12 /*  permit persons to whom this work is furnished to do so, subject to   */
  13 /*  the following conditions:                                            */
  14 /*   1. The code must retain the above copyright notice, this list of    */
  15 /*      conditions and the following disclaimer.                         */
  16 /*   2. Any modifications must be clearly marked as such.                */
  17 /*   3. Original authors' names are not deleted.                         */
  18 /*   4. The authors' names are not used to endorse or promote products   */
  19 /*      derived from this software without specific prior written        */
  20 /*      permission.                                                      */
  21 /*                                                                       */
  22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
  23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
  24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
  25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
  26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
  27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
  28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
  29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
  30 /*  THIS SOFTWARE.                                                       */
  31 /*                                                                       */
  32 /*************************************************************************/
  33 /*                                                                       */
  34 /*                          Author: Paul Taylor                          */
  35 /*                           Date: March 1998                            */
  36 /* --------------------------------------------------------------------- */
  37 /*                                                                       */
  38 /*************************************************************************/
  39
  40
  41 #ifndef __UNISYN_H__
  42 #define __UNISYN_H__
  43
  44 #include "festival.h"
   45 // Vector of EST_Wave objects. The time-domain synthesis functions
   46 // declared below take one as their `frames` argument.
   47 typedef EST_TVector <EST_Wave> EST_WaveVector;
   48 // TD-PSOLA entry points; only declared when HAVE_US_TDPSOLA_TM is defined.
   49 #ifdef HAVE_US_TDPSOLA_TM
   50 void us_init_tdpsola();
   51 void us_tdpsola_synthesis(EST_Utterance &utt,
   52                           const EST_String &ola_method);
   53 #endif
   54 // NOTE(review): undocumented; presumably linearly smooths amplitude over the
   55 // utterance - confirm. Takes utt by pointer, unlike the other declarations here.
   56 void us_linear_smooth_amplitude( EST_Utterance *utt );
  57
  58 /**@name Functions for Concatenating Units
  59
  60 */
  61
  62 //@{
  63
   64 /** Iterate through the Unit relation and create the
   65     <parameter>SourceCoef</parameter> relation, which contains a series of
   66     windowed frames of speech and a track of pitch-synchronous
   67     coefficients.</para>
   68
   69     <para> <parameter>SourceCoef</parameter> contains a single item
   70     with two features, <parameter>coefs</parameter> and
   71     <parameter>frame</parameter>.
   72
   73     The value of <parameter>coefs</parameter> is a track with all the
   74     concatenated pitchmarks and coefficients from the units.
   75
   76     <function>us_unit_concat</function> is where the pitch synchronous
   77     windowing of the frames in each Unit is performed and the result
   78     of this is stored as the value of <parameter>frame</parameter>.
   79     </para>
   80     <formalpara><title>Require:</title><para>Unit</para>
   81         </formalpara>
   82     <formalpara><title>Provide:</title><para>SourceCoef</para>
   83         </formalpara>
   84
   85
   86     @param utt: utterance
   87
   88     @param window_factor: This specifies
   89     how large the analysis window is in relation to the local pitch
   90     period. A value of 1.0 is often used as this means each frame
   91     approximately extends from the previous pitch mark to the next.
   92
   93     @param window_name: This specifies
   94     the type of window used. "hanning" is standard but any window type
   95     available from the signal processing library can be used.
   96
   97     @param no_waveform: (default false) if this is set to true, only
   98     the coefficients are copied into SourceCoef - no waveform analysis
   99     is performed.
  100     @param window_symmetric: (default true) if this is set to true,
  101     then symmetric analysis windows are used centred at each pitch
  102     mark, with size determined by the time difference between current
  103     and previous pitchmarks.
  104 */
  105
  106 void us_unit_concat(EST_Utterance &utt, float window_factor,
  107                     const EST_String &window_name,
  108                     bool no_waveform=false,
  109                     bool window_symmetric=true);
 110
 111
  112 /** This function provides the setup for copy resynthesis. In copy
  113 resynthesis, a natural waveform is used as the source speech for
  114 synthesis rather than diphones or other concatenated units. This is
  115 often useful for testing a prosody module or for altering the pitch or
  116 duration of a natural waveform for an experiment. (As such, this
  117 function should really be thought of as a very simple unit selection
  118 module.)</para>
  119
  120 <para> In addition to the speech waveform itself, the function
  121 requires a set of pitchmarks in the standard form, and a set of labels
  122 which mark segment boundaries. The <parameter>Segment</parameter>
  123 relation must already exist in the utterance prior to calling this
  124 function. </para>
  125 <para>
  126 First, the function creates a <parameter>Unit</parameter> relation with
  127 a single item containing the waveform and the pitchmarks. Next it adds
  128 a set of <parameter>source_end</parameter> features to each item in
  129 the <parameter>Segment</parameter> relation. It does this by
  130 calculating a mapping between the <parameter>Segment</parameter>
  131 relation and the input labels. This mapping is performed by dynamic
  132 programming, as often the two sets of labels don't match exactly.
  133 </para>
  134 <para>
  135 The final result, therefore, is a Unit relation and a Segment relation
  136 with source_end features. As this is exactly the same as the output of the
  137 standard concatenative synthesis modules, from here on the utterance
  138 can be processed as if the units were from a genuine synthesizer.
  139 </para>
  140 <para>
  141 Copy synthesis itself can be performed by ....
  142 </para>
  143
  144     <formalpara><title>Require:</title><para>Segment</para>
  145         </formalpara>
  146     <formalpara><title>Provide:</title><para>Unit</para>
  147         </formalpara>
  148
  149
  150     @param utt: utterance
  151
  152     @param source_sig: waveform
  153     @param source_pm: pitchmarks belonging to waveform
  154     @param source_seg: set of items with end times referring to points
  155     in the waveform
  156
  157 */
  158
  159 void us_get_copy_wave(EST_Utterance &utt, EST_Wave &source_sig,
  160                        EST_Track &source_pm, EST_Relation &source_seg);
 161
  162 /** This function produces a waveform from the Unit relation without
  163 prosodic modification. In effect, this function simply concatenates
  164 the waveform parts of the units in the Unit relation. An overlap-add
  165 operation is performed at unit boundaries so that waveform
  166 discontinuities don't occur.
  167
  168 @param utt: utterance containing the Unit relation
  169 */
  170 void us_unit_raw_concat(EST_Utterance &utt);
 171
  172 /** Items in the Unit relation can take an optional
  173 feature <parameter>energy_factor</parameter>, which scales the amplitude
  174 of the unit waveform. This is useful because units often have
  175 different energy levels due to different recording circumstances.  An
  176 <parameter>energy_factor</parameter> of 1.0 leaves the waveform
  177 unchanged.
  178 @param unit: relation to normalise (presumably the Unit relation - TODO confirm)
  179 */
  180
  181 void us_energy_normalise(EST_Relation &unit);
 182
 183 //@}
 184
 185
 186 /**@name Functions for Producing Mappings
 187
 188 */
 189
 190 //@{
 191
  192 /** This function produces the mapping between the SourceCoef track
  193 and TargetCoef track. The mapping is controlled by two types of
  194 modification, <emph>duration</emph> and <emph>pitch</emph>. </para>
  195
  196 <para>Duration is specified by the <parameter>Segment</parameter>
  197 relation. Each item in this relation has two features,
  198 <parameter>source_end</parameter> and
  199 <parameter>target_end</parameter>. <parameter>source_end</parameter>
  200 marks the end point of that segment in the concatenated set of source
  201 coefficients, while <parameter>target_end</parameter> marks the
  202 desired end of that segment.</para>
  203
  204 <para> Pitch modification is specified by the patterns of pitchmarks
  205 in the <parameter>SourceCoef</parameter> track and
  206 <parameter>TargetCoef</parameter> track. While these tracks actually
  207 represent periods, their reciprocal represents the source and target
  208 F0 contours.
  209 </para><para>
  210 The mapping is an integer array with one element for every pitchmark in
  211 the TargetCoef track. Therefore, every target pitchmark has a mapping
  212 element, and the value of that element is the frame number in the
  213 SourceCoef track which should be used to generate the frame of speech
  214 for that target pitchmark. Depending on the mapping, source frames can
  215 be duplicated or skipped.
  216 </para><para> If the duration is constant, a higher target pitch will
  217 mean source frames are duplicated. If the pitch is constant, a longer
  218 target duration will also mean source frames are duplicated.  The
  219 duration and pitch modifications are calculated at the same time,
  220 leading to a single mapping.
  221 </para>
  222 <formalpara><title>Require:</title><para>SourceCoef, TargetCoef, Segment</para>
  223 </formalpara>
  224 <formalpara><title>Provide:</title><para>US_map</para>
  225 </formalpara>
  226
  227 @param utt: utterance
  228 @param method: selects the mapping method (accepted names are not listed here)
  229 */
  230
  231
  232 void us_mapping(EST_Utterance &utt, const EST_String &method);
  233 // NOTE(review): undocumented; presumably fills relation r from map and the
  234 // source/target pitchmark tracks - confirm against the implementation.
  235 // for graphical display only:
  236 void map_to_relation(EST_IVector &map, EST_Relation &r,
  237                      const EST_Track &source_pm,
  238                      const EST_Track &target_pm);
 239 //@}
 240
 241 /**@name Functions for Generating Waveforms
 242
 243 */
 244
 245 //@{
 246
  247 /** Standard waveform generation function. This function generates the
  248 actual synthetic speech waveform, using information in the SourceCoef,
  249 TargetCoef and US_map relations.
  250 </para><para>
  251
  252 The first stage involves time domain processing, whereby a speech
  253 waveform or residual waveform is generated. The second (optional)
  254 stage passes this waveform through the set of filter coefficients
  255 specified in the TargetCoef track. The output synthetic waveform is
  256 put in the Wave relation.
  257 </para>
  258 <para>
  259 LPC resynthesis is performed by the
  260 <link linkend="lpc-filter-1">lpc_filter_1</link> function.
  261 </para>
  262
  263     <formalpara><title>Require:</title><para>SourceCoef, TargetCoef,
  264     US_map</para> </formalpara>
  265     <formalpara><title>Provide:</title><para>Wave</para>
  266     </formalpara>
  267
  268     @param utt: utterance
  269     @param filter_method: type of filter used - normally "lpc" or none ("")
  270     @param ola_method: type of time domain synthesis.
  271 */
  272
  273 void us_generate_wave(EST_Utterance &utt,
  274                       const EST_String &filter_method,
  275                       const EST_String &ola_method);
 276
  277 /** This copies coefficients from <parameter>source_coef</parameter>
  278 into <parameter>target_coef</parameter> according to the frame mapping
  279 specified by <parameter>map</parameter>.
  280 <parameter>target_coef</parameter> should
  281 already have been allocated, and the pitchmarks in the time array set
  282 to appropriate values. (This can be done by the <link
  283 linkend="f0-to-pitchmarks">f0_to_pitchmarks</link> function.)
  284
  285 */
  286
  287 void map_coefs(EST_Track &source_coef, EST_Track &target_coef,
  288                EST_IVector &map);
 289
  290 /** Time domain resynthesis.
  291
  292 Generate a speech waveform by copying frames into a set of time
  293 positions given by target_pm. The frame used for each time position is
  294 given by map, and the frames themselves are stored individually as
  295 waveforms in frames.
  296 </para>
  297
  298 @param frames: array containing waveforms, each representing a single analysis
  299      frame
  300 @param target_pm: new pitchmark positions
  301 @param target_sig: output waveform
  302 @param map: mapping between target_pm and frames.
  303
  304 */
  305
  306 void td_synthesis(EST_WaveVector &frames,
  307                   EST_Track &target_pm, EST_Wave &target_sig,
  308                   EST_IVector &map);
 309
 310
  311 /** Variant of td_synthesis, where each frame is re-windowed according to the
  312 size of the local synthesis pitch period.
  313 </para>
  314 @param frames: array containing waveforms, each representing a single analysis
  315      frame
  316 @param target_pm: new pitchmark positions
  317 @param target_sig: output waveform
  318 @param map: mapping between target_pm and frames.
  319
  320 */
  321
  322 void td_synthesis2(EST_WaveVector &frames,
  323                    EST_Track &target_pm, EST_Wave &target_sig,
  324                    EST_IVector &map);
 325
 326 //@}
  327 // NOTE(review): undocumented. Takes the same arguments as td_synthesis plus
  328 // frame_pm_indices; presumably an asymmetric-window variant of it - confirm.
  329 void asymmetric_window_td_synthesis(EST_WaveVector &frames,
  330                                     EST_Track &target_pm,
  331                                     EST_Wave &target_sig,
  332                                     EST_IVector &map,
  333                                     EST_IVector &frame_pm_indices);
 334
 335
 336 /**@name Pitchmark Functions
 337
 338 */
 339 //@{
 340
  341 /** This function generates the target pitchmarks from the target F0
  342 contour. The pitchmarks are generated by reading a value, \(f_{0}\)
  343 off the f0 contour at time \(t\), calculating the local pitch period
  344 \(\tau = 1/f_{0}\), and placing a pitchmark at time \(\tau + t\). The
  345 process is then repeated by reading the F0 value at this new point and
  346 so on. </para>
  347
  348 <para> The F0 contour must be continuous in all regions, that is
  349 unvoiced regions must have pseudo f0 values also. Although artificial
  350 contours are best generated in this way to begin with, the function
  351 \ref{**} can be used to interpolate through unvoiced regions for
  352 non-continuous contours.
  353 </para>
  354
  355 <para> As the last F0 value in the contour may not be the end of the
  356 utterance (for example if the last phone is unvoiced), the pitchmarks
  357 may be extended past the end of the contour.
  358 </para> <para>
  359
  360 After processing, the generated track only contains the target
  361 pitchmarks, but later functions may fill the amplitude array of the
  362 track with target coefficients, and hence the space for these can be
  363 allocated at this stage.
  364
  365 </para>
  366 @param fz: input F0 contour.
  367
  368 @param pm: set of pitchmarks to be generated. These are set to the
  369 correct size in the function.
  370
  371 @param num_channels: (optional, default 0) number of coefficients used
  372 in further processing.
  373
  374 @param default_f0: (optional, default 100.0) f0 value for interpolated
  375 end values.
  376 @param target_end: (optional, default -1) fill from the end of the contour
  377 to this point with default f0 values.
  378
  379 */
  380 void f0_to_pitchmarks(EST_Track &fz, EST_Track &pm, int num_channels=0,
  381                       float default_f0=100.0, float target_end=-1);
 382
 383
 384
  385 /** This is a utility function for converting a set of pitchmarks back
  386 to an F0 contour and is usually used in system development etc.  The
  387 generated F0 is evenly spaced.
  388
  389 @param pm: input set of pitchmarks to be converted.
  390
  391 @param fz: output F0 contour.
  392
  393 @param shift: frame shift of generated contour in seconds.
  394 */
  395
  396 void pitchmarks_to_f0(EST_Track &pm, EST_Track &fz, float shift);
 397
 398 //@}
  399 // NOTE(review): undocumented; presumably registers UniSyn features with Festival - confirm.
  400 void register_unisyn_features(void);
 401
 402 #endif // __UNISYN_H__
 403