src/modules/MultiSyn/DiphoneVoiceModule.h

   1 /*************************************************************************/
   2 /*                                                                       */
   3 /*                Centre for Speech Technology Research                  */
   4 /*                 (University of Edinburgh, UK) and                     */
   5 /*                           Korin Richmond                              */
   6 /*                         Copyright (c) 2002                            */
   7 /*                         All Rights Reserved.                          */
   8 /*                                                                       */
   9 /*  Permission is hereby granted, free of charge, to use and distribute  */
  10 /*  this software and its documentation without restriction, including   */
  11 /*  without limitation the rights to use, copy, modify, merge, publish,  */
  12 /*  distribute, sublicense, and/or sell copies of this work, and to      */
  13 /*  permit persons to whom this work is furnished to do so, subject to   */
  14 /*  the following conditions:                                            */
  15 /*                                                                       */
  16 /*   1. The code must retain the above copyright notice, this list of    */
  17 /*      conditions and the following disclaimer.                         */
  18 /*   2. Any modifications must be clearly marked as such.                */
  19 /*   3. Original authors' names are not deleted.                         */
  20 /*   4. The authors' names are not used to endorse or promote products   */
  21 /*      derived from this software without specific prior written        */
  22 /*      permission.                                                      */
  23 /*                                                                       */
  24 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
  25 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
  26 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT   */
  27 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
  28 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
  29 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
  30 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
  31 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
  32 /*  THIS SOFTWARE.                                                       */
  33 /*                                                                       */
  34 /*************************************************************************/
  35 /*                                                                       */
  36 /*                          Author: Korin Richmond                       */
  37 /*                            Date:  Aug  2002                           */
  38 /* --------------------------------------------------------------------- */
  39 /* A diphone unit selection "voice module"                               */
  40 /* (implemented using a list of utterance objects)                       */
  41 /*************************************************************************/
  42
  43 #ifndef __DIPHONEVOICEMODULE_H__
  44 #define __DIPHONEVOICEMODULE_H__
  45
  46 #include "VoiceModuleBase.h"
  47 #include "EST_DiphoneCoverage.h"
  48 #include "siod_defs.h"
  49 #include "EST_Val_defs.h"
  50 #include "EST_String.h"
  51
  52 #include "EST_viterbi.h"
  53
  54 #include "EST_types.h" // for EST_StrList
  55
  56 #include "EST_FlatTargetCost.h"
  57
  58 class EST_Utterance;
  59 class EST_Relation;
  60 class EST_VTCandidate;
  61 class EST_VTPath;
  62 class EST_Features;
  63 class EST_Track;
  64 class EST_Wave;
  65 class EST_Item;
  66
   67 // Return the standard join point time for the given segment
   68 // (a segment here being one half of a diphone).
      // NOTE(review): the units of the returned time are not stated in this
      // header - confirm against the definition.
   69 float getJoinTime( const EST_Item *seg );
  70
  71 //template<class T> class EST_TStringHash;
  72 #include "EST_THash.h"
   73 template<class T> class EST_TList;
      // List of EST_Item pointers.  In this header it is the type filled in by
      // DiphoneVoiceModule::getPhoneList() and the value type (held by pointer)
      // of the DiphoneVoiceModule catalogue hash.
   74 typedef EST_TList<EST_Item*> ItemList;
  75
   76
      // Registration declarations for DiphoneVoiceModule.  The two macros are
      // not defined in this file (presumably they come from siod_defs.h and
      // EST_Val_defs.h, and du_voicemodule is the registered type name -
      // TODO confirm).
      // NOTE(review): DiphoneCandidate below names DiphoneVoiceModule before
      // the class itself is defined, so it relies on an earlier declaration
      // of that name - presumably supplied by these macros or by an included
      // header; confirm before reordering anything here.
   77 SIOD_REGISTER_CLASS_DCLS(du_voicemodule,DiphoneVoiceModule)
   78 VAL_REGISTER_CLASS_DCLS(du_voicemodule,DiphoneVoiceModule)
  79
   80 // The following is necessary to make some of a candidate's information
   81 // available in a faster way than EST_Item feature lookups (critically
   82 // the join cost coefficient EST_FVectors, for example),
   83 // i.e. yet another temporary hack... (it would be better if the
   84 // EST_viterbi code allowed things other than just EST_Item* in order to
   85 // perform the search)
   86 class DiphoneCandidate {
   87 public:
      // Store the four pointers exactly as given (nothing is copied) and set
      // the four *_jccid / *_jccindex fields to -1.
      // NOTE(review): -1 presumably means "not yet assigned" - confirm
      // against the code that fills these fields in.
   88   DiphoneCandidate( const EST_Item *phone1,
   89                     const DiphoneVoiceModule *p,
   90                     const EST_FVector *left,
   91                     const EST_FVector *right )
   92     : ph1(phone1), dvm( p ), l_coef(left), r_coef(right),
   93     ph1_jccid(-1), ph1_jccindex(-1), ph2_jccid(-1), ph2_jccindex(-1){};
   94
      // All pointers are non-owning as far as this class is concerned: it
      // declares no destructor, so nothing is freed when a candidate dies.
   95   const EST_Item *ph1;            // constructor argument phone1
   96   const DiphoneVoiceModule *dvm;  // constructor argument p
   97   const EST_FVector *l_coef;      // constructor argument left
   98   const EST_FVector *r_coef;      // constructor argument right
      // NOTE(review): the meaning of the jccid/jccindex pairs is not visible
      // in this header; they are public and start out as -1.
   99   int ph1_jccid, ph1_jccindex;
  100   int ph2_jccid, ph2_jccindex;
  101 };
  102
      // Registration declarations for DiphoneCandidate.  The macro is not
      // defined in this file (presumably EST_Val_defs.h, with diphonecandidate
      // as the registered type name - TODO confirm).
  103 VAL_REGISTER_CLASS_DCLS(diphonecandidate,DiphoneCandidate)
  104
      // A diphone unit selection "voice module", implemented using a list of
      // utterance objects.  Derives from VoiceModuleBase.  Copying is disabled
      // (copy constructor and assignment operator are declared private below).
  105 class DiphoneVoiceModule : public VoiceModuleBase {
  106 public:
      // Parameters:
      //   basenames - list of strings (EST_StrList); presumably the base
      //               names of the voice's files, kept in fileList - TODO confirm
      //   uttDir / uttExt   - directory and extension of utterance files
      //   wavDir / wavExt   - directory and extension of waveform (or residual) files
      //   pmDir / pmExt     - directory and extension of pitch mark files
      //   coefDir / coefExt - directory and extension of coefficient files
      //   srate             - sample rate of the voice waveform data
      //                       (defaults to 16000)
      // NOTE(review): the mapping of these arguments onto the private members
      // of similar name is assumed from the names; the constructor body is
      // not in this header.
  107   DiphoneVoiceModule( const EST_StrList& basenames,
  108                       const EST_String& uttDir,
  109                       const EST_String& wavDir,
  110                       const EST_String& pmDir,
  111                       const EST_String& coefDir,
  112                       unsigned int srate = 16000,
  113                       const EST_String& uttExt  = ".utt",
  114                       const EST_String& wavExt  = ".wav",
  115                       const EST_String& pmExt   = ".pm",
  116                       const EST_String& coefExt = ".coef" );
  117
  118   virtual ~DiphoneVoiceModule();
  119
      // NOTE(review): the contracts of the four virtual functions below are
      // not stated in this header; presumably they come from VoiceModuleBase -
      // check there and in the implementation file.
  120   virtual void initialise(const EST_TargetCost *tc, bool ignore_bad_tag=false );
  121   virtual unsigned int numModuleUnits() const;
  122   virtual unsigned int numUnitTypes() const;
  123   virtual unsigned int numAvailableCandidates( const EST_String &unit ) const;
  124
  125
  126   ///// Some "debugging" functions - deliberately don't mind doing
  127   // slow things like returning copies of things.  Such functions are
  128   // not intended to do important things, but just to make it easier
  129   // to work out what's "in" the voice database object.
  130
  131   // return copy of utterance number n (result is passed back through utt)
      // NOTE(review): the bool result presumably reports success - confirm.
  132   bool getUtterance( EST_Utterance **utt, int n ) const;
  133
  134
  135   // return pointer to utterance which has feature "feat_name"
  136   // set to value "value" (result is passed back through utt)
      // NOTE(review): the bool result presumably reports whether such an
      // utterance was found - confirm.
  137   bool getUtterance( EST_Utterance **utt,
  138                      const EST_String &feat_name,
  139                      const EST_Val &value ) const;
      // NOTE(review): presumably fills *dc with diphone coverage statistics
      // for this module - behaviour is not documented here; confirm.
  140   void getDiphoneCoverageStats(EST_DiphoneCoverage *dc) const;
  141
      // Disabled older declaration of getCandidateList(); it differs from the
      // live declaration below (target cost by reference, no tcdh/tc_weight,
      // single-level head/tail pointers).
  142 //   int DiphoneVoiceModule::getCandidateList( const EST_Item& target,
  143 //                                          const EST_TargetCost& tc,
  144 //                                          EST_VTCandidate *head,
  145 //                                          EST_VTCandidate *tail ) const;
  146
  147
  148
      // NOTE(review): presumably builds the list of EST_VTCandidate objects
      // for the given target item, passing the ends of the list back through
      // head and tail, and returns a count - none of this is stated in this
      // header; confirm against the implementation.
  149   int getCandidateList( const EST_Item& target,
  150                         const EST_TargetCost *tc,
  151                         const TCDataHash *tcdh,
  152                         const float tc_weight,
  153                         EST_VTCandidate **head,
  154                         EST_VTCandidate **tail ) const;
  155
  156   // append all instances of a certain phone present in the utterances
  157   // in this voice to list.  Returns the number added
  158   int getPhoneList( const EST_String &phone, ItemList &list );
  159
  160 private:
  161   // don't allow copying of Voices (for now?)
  162   DiphoneVoiceModule( const DiphoneVoiceModule& );
  163   DiphoneVoiceModule& operator=( const DiphoneVoiceModule& );
  164
  165   // Flatpack
      // NOTE(review): the private helpers below are undocumented in this
      // header; see the implementation file for their contracts.
  166   void flatPack( EST_Relation *segs, const EST_TargetCost *tc) const;
  167
  168   void addCoefficients( EST_Relation *segs, const EST_Track& coefs );
  169   void addToCatalogue( const EST_Utterance *utt, int *num_ignored, bool ignore_bad=false );
      // extendLeft / extendRight default to 0, i.e. false.
  170   void getDiphone( const EST_Item *phone1,
  171                    EST_Track* coef, EST_Wave* sig, int* midframe,
  172                    bool extendLeft=0, bool extendRight=0 ) const;
  173
      // DiphoneUnitVoice is granted access to the private members of this class.
  174   friend class DiphoneUnitVoice;
  175
  176 private:
  177   EST_StrList fileList;
  178   EST_String utt_dir;  // utterance files
  179   EST_String utt_ext;
  180   EST_String pm_dir;   // pitch marks
  181   EST_String pm_ext;
  182   EST_String coef_dir; // for coefficients that aren't pitch synchronous
  183   EST_String coef_ext;
  184   EST_String wave_dir; // waveform (or residual)
  185   EST_String wave_ext;
  186
  187   unsigned int wav_srate; //sample rate of voice waveform data
  188
      // NOTE(review): ownership of the three pointers below (who allocates and
      // who deletes) is not visible in this header - confirm in the
      // constructor/destructor definitions.
  189   TCDataHash *tcdatahash;
  190
  191   EST_TList<EST_Utterance *> *utt_dbase;
  192   EST_TStringHash<ItemList*> *catalogue;
  193 };
 194
 195 #endif // __DIPHONEVOICEMODULE_H__
 196