src/modules/Lexicon/lex_ff.cc

   1 /*************************************************************************/
   2 /*                                                                       */
   3 /*                Centre for Speech Technology Research                  */
   4 /*                     University of Edinburgh, UK                       */
   5 /*                         Copyright (c) 1998                            */
   6 /*                        All Rights Reserved.                           */
   7 /*                                                                       */
   8 /*  Permission is hereby granted, free of charge, to use and distribute  */
   9 /*  this software and its documentation without restriction, including   */
  10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
  11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
  12 /*  permit persons to whom this work is furnished to do so, subject to   */
  13 /*  the following conditions:                                            */
  14 /*   1. The code must retain the above copyright notice, this list of    */
  15 /*      conditions and the following disclaimer.                         */
  16 /*   2. Any modifications must be clearly marked as such.                */
  17 /*   3. Original authors' names are not deleted.                         */
  18 /*   4. The authors' names are not used to endorse or promote products   */
  19 /*      derived from this software without specific prior written        */
  20 /*      permission.                                                      */
  21 /*                                                                       */
  22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
  23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
  24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
  25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
  26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
  27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
  28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
  29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
  30 /*  THIS SOFTWARE.                                                       */
  31 /*                                                                       */
  32 /*************************************************************************/
  33 /*             Author :  Alan W Black                                    */
  34 /*             Date   :  May 1998                                        */
  35 /*-----------------------------------------------------------------------*/
  36 /*                                                                       */
/* Word based feature functions                                          */
  38 /*                                                                       */
  39 /*=======================================================================*/
  40 #include <cstdio>
  41 #include "festival.h"
  42 #include "lexiconP.h"
  43
  44 static EST_String Phrase("Phrase");
  45 static EST_Val f_content("content");
  46 static EST_Val f_string0("0");
  47 static EST_Val f_string1("1");
  48
  49 static EST_Val ff_word_gpos(EST_Item *s)
  50 {
  51     /* Part of speech by guessing, returns, prep, det, aux, content */
  52     /* from simple lookup list                                      */
  53     EST_String word;
  54     LISP l;
  55     LISP guess_pos;
  56
  57     word = downcase(s->name());
  58
  59     guess_pos = siod_get_lval("guess_pos","no guess_pos set");
  60
  61     for (l=guess_pos; l != NIL; l=cdr(l))
  62         if (siod_member_str(word,cdr(car(l))))
  63             return EST_Val(get_c_string(car(car(l))));
  64
  65     return f_content;
  66 }
  67
  68 EST_Val ff_word_contentp(EST_Item *s)
  69 {
  70     /* 1 if this is a content word, 0 otherwise */
  71
  72     if (ff_word_gpos(s) == "content")
  73         return f_string1;
  74     else
  75         return f_string0;
  76 }
  77
  78 static EST_Val ff_word_n_content(EST_Item *s)
  79 {
  80     // returns the next content word after s
  81     EST_Item *p;
  82
  83     for (p=s->as_relation("Word")->next(); p != 0; p = p->next())
  84     {
  85         if (ff_word_gpos(p) == "content")
  86             return EST_Val(p->name());
  87     }
  88
  89     return f_string0;
  90 }
  91
  92 static EST_Val ff_word_nn_content(EST_Item *s)
  93 {
  94     // returns the next next content word after s
  95     int count = 0;
  96     EST_Item *p;
  97
  98     for (p=s->as_relation("Word")->next(); p != 0; p = p->next())
  99     {
 100         if (ff_word_gpos(p) == "content")
 101         {
 102             count ++;
 103             if (count == 2)
 104                 return EST_Val(p->name());
 105         }
 106     }
 107
 108     return f_string0;
 109 }
 110
 111 static EST_Val ff_word_p_content(EST_Item *s)
 112 {
 113     // returns the previous content word after s
 114     EST_Item *p;
 115
 116     for (p=s->as_relation("Word")->prev(); p != 0; p = p->prev())
 117         if (ff_word_gpos(p) == "content")
 118             return EST_Val(p->name());
 119
 120     return f_string0;
 121 }
 122
 123 static EST_Val ff_word_pp_content(EST_Item *s)
 124 {
 125     // returns the previous previous content word after s
 126     int count = 0;
 127     EST_Item *p;
 128
 129     for (p=s->as_relation("Word")->prev(); p != 0; p = p->prev())
 130     {
 131         if (ff_word_gpos(p) == "content")
 132         {
 133             count ++;
 134             if (count == 2)
 135                 return EST_Val(p->name());
 136         }
 137     }
 138
 139     return f_string0;
 140 }
 141
 142 static EST_Val ff_content_words_out(EST_Item *s)
 143 {
 144     EST_Item *nn = s->as_relation(Phrase);
 145     EST_Item *p;
 146     int pos=0;
 147
 148     for (p=nn->next(); p; p=p->next())
 149     {
 150         if (ff_word_gpos(p) == "content")
 151             pos++;
 152     }
 153     // don't think you can get here
 154     return EST_Val(pos);
 155 }
 156
 157 static EST_Val ff_content_words_in(EST_Item *s)
 158 {
 159     EST_Item *nn = s->as_relation(Phrase);
 160     EST_Item *p;
 161     int pos=0;
 162
 163     for (p=nn->prev(); p; p=p->prev())
 164     {
 165         if (ff_word_gpos(p) == "content")
 166             pos++;
 167     }
 168     // don't think you can get here
 169     return EST_Val(pos);
 170 }
 171
 172 static EST_Val ff_word_cap(EST_Item *s)
 173 {
 174     //  "1" is the word starts with a capital letter
 175     const char *word = s->name();
 176
 177     if ((word[0] >= 'A') && (word[0] <='Z'))
 178         return f_string1;
 179     else
 180         return f_string0;
 181 }
 182
 183 static EST_Val ff_syl_onset_type(EST_Item *s)
 184 {
 185     // Return van Santen's classification of onset type in to one
 186     // of three forms:
 187     //   -V    contains only voiceless consonants
 188     //   +V-S  contains voiced obstruents but no sonorants
 189     //   +S    contains just sonorants
 190     EST_Item *nn = s->as_relation("SylStructure");
 191     EST_Item *p;
 192     int vox=FALSE;
 193     int sonorant=FALSE;
 194
 195     for (p=daughter1(nn); p->next() != 0; p=p->next())
 196     {
 197         if (ph_is_vowel(p->name()))
 198             break;
 199         if (ph_is_voiced(p->name()))
 200             vox = TRUE;
 201         if (ph_is_sonorant(p->name()))
 202             sonorant = TRUE;
 203     }
 204
 205     if (p==daughter1(nn)) // null-onset case
 206         return EST_Val("+V-S");
 207     else if (sonorant)
 208         return EST_Val("+S");
 209     else if (vox)
 210         return EST_Val("+V-S");
 211     else
 212         return EST_Val("-V");
 213 }
 214
 215 static EST_Val ff_syl_coda_type(EST_Item *s)
 216 {
 217     // Return van Santen's classification of onset type in to one
 218     // of three forms:
 219     //   -V    contains only voiceless consonants
 220     //   +V-S  contains voiced obstruents but no sonorants
 221     //   +S    contains just sonorants
 222     EST_Item *nn = s->as_relation("SylStructure");
 223     EST_Item *p;
 224     int vox=FALSE;
 225     int sonorant=FALSE;
 226
 227     for (p=daughter1(nn); p->next() != 0; p=p->next())
 228     {
 229         if (ph_is_vowel(p->name()))
 230             break;
 231     }
 232
 233     if (p->next() == 0)         // empty coda
 234         return EST_Val("+S");
 235
 236     for (p=p->next(); p != 0; p=p->next())
 237     {
 238         if (ph_is_voiced(p->name()))
 239             vox = TRUE;
 240         if (ph_is_sonorant(p->name()))
 241             sonorant = TRUE;
 242     }
 243
 244     if (sonorant)
 245         return EST_Val("+S");
 246     else if (vox)
 247         return EST_Val("+V-S");
 248     else
 249         return EST_Val("-V");
 250 }
 251
 252 void festival_lex_ff_init(void)
 253 {
 254
 255     festival_def_nff("gpos","Word",ff_word_gpos,
 256     "Word.gpos\n\
 257   Returns a guess at the part of speech of this word.  The lisp a-list\n\
 258   guess_pos is used to load up this word.  If no part of speech is\n\
 259   found in there \"content\" is returned.  This allows a quick efficient\n\
 260   method for part of speech tagging into closed class and content words.");
 261     festival_def_nff("contentp","Word",ff_word_contentp,
 262     "Word.contentp\n\
 263   Returns 1 if this word is a content word as defined by gpos, 0 otherwise.");
 264     festival_def_nff("cap","Word",ff_word_cap,
 265     "Word.cap\n\
 266   Returns 1 if this word starts with a capital letter, 0 otherwise.");
 267     festival_def_nff("n_content","Word",ff_word_n_content,
 268     "Word.n_content\n\
 269   Next content word.  Note this doesn't use the standard n. notation as\n\
 270   it may have to search a number of words forward before finding a\n\
 271   non-function word.  Uses gpos to define content/function word distinction.\n\
 272   This also works for Tokens.");
 273     festival_def_nff("nn_content","Word",ff_word_nn_content,
 274     "Word.nn_content\n\
 275   Next next content word.  Note this doesn't use the standard n.n. notation\n\
 276   as it may have to search a number of words forward before finding the \n\
 277   second non-function word.  Uses gpos to define content/function word\n\
 278   distinction.  This also works for Tokens.");
 279     festival_def_nff("p_content","Word",ff_word_p_content,
 280     "Word.p_content\n\
 281   Previous content word.  Note this doesn't use the standard p. notation\n\
 282   as it may have to search a number of words backward before finding the \n\
 283   first non-function word.  Uses gpos to define content/function word\n\
 284   distinction.  This also works for Tokens.");
 285     festival_def_nff("pp_content","Word",ff_word_pp_content,
 286     "Word.pp_content\n\
 287   Previous previous content word.  Note this doesn't use the standard p.p.\n\
 288   notation as it may have to search a number of words backward before\n\
 289   finding the first non-function word.  Uses gpos to define \n\
 290   content/function word distinction.  This also works for Tokens.");
 291     festival_def_nff("content_words_out","Word",ff_content_words_out,
 292     "Word.content_words_out\n\
 293   Number of content words to end of this phrase.");
 294     festival_def_nff("content_words_in","Word",ff_content_words_in,
 295     "Word.content_words_in\n\
 296   Number of content words from start this phrase.");
 297     festival_def_nff("syl_onset_type","Syllable",ff_syl_onset_type,
 298     "Syllable.syl_onset_type\n\
 299   Return the van Santen and Hirschberg classification. -V for unvoiced,\n\
 300   +V-S for voiced but no sonorants, and +S for sonorants.");
 301     festival_def_nff("syl_coda_type","Syllable",ff_syl_coda_type,
 302     "Syllable.syl_coda_type\n\
 303   Return the van Santen and Hirschberg classification. -V for unvoiced,\n\
 304   +V-S for voiced but no sonorants, and +S for sonorants.");
 305
 306 }