1 /*************************************************************************/
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1998 */
6 /* All Rights Reserved. */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
32 /*************************************************************************/
33 /* Author : Alan W Black */
35 /*-----------------------------------------------------------------------*/
37 /* Word based ffeature functions */
39 /*=======================================================================*/
/* File-scope constants.  Phrase is the relation name handed to          */
/* as_relation() by the content_words_in/out functions below.            */
/* NOTE(review): no use of f_content, f_string0 or f_string1 is visible  */
/* in this listing -- presumably they are the values returned by the     */
/* feature functions; confirm against the full source.                   */
44 static EST_String Phrase("Phrase");
45 static EST_Val f_content("content");
46 static EST_Val f_string0("0");
47 static EST_Val f_string1("1");
/* ff_word_gpos: guess the part of speech of item s by table lookup.     */
/* The item's name is down-cased and searched for in each entry of the   */
/* Lisp variable guess_pos; the head of the first matching entry is      */
/* returned as a string value.                                           */
/* NOTE(review): the local declarations and the fall-through return are  */
/* not visible in this listing.  The doc string registered for "gpos"    */
/* says "content" is returned when nothing matches -- confirm.           */
49 static EST_Val ff_word_gpos(EST_Item *s)
51 /* Part of speech by guessing, returns, prep, det, aux, content */
52 /* from simple lookup list */
57 word = downcase(s->name());
/* fetch the Lisp variable guess_pos; the second argument is presumably  */
/* the error text used when it is unset -- TODO confirm                  */
59 guess_pos = siod_get_lval("guess_pos","no guess_pos set");
/* each entry is used as (tag . words): car gives the tag, cdr the list  */
/* of words that siod_member_str() searches                              */
61 for (l=guess_pos; l != NIL; l=cdr(l))
62 if (siod_member_str(word,cdr(car(l))))
63 return EST_Val(get_c_string(car(car(l))));
/* ff_word_contentp: test whether ff_word_gpos() classifies s as         */
/* "content".  Unlike the other feature functions in this file it is     */
/* not declared static.                                                  */
/* NOTE(review): the return statements are not visible in this listing;  */
/* the comment below and the registered doc string both say 1 for a      */
/* content word and 0 otherwise.                                         */
68 EST_Val ff_word_contentp(EST_Item *s)
70 /* 1 if this is a content word, 0 otherwise */
72 if (ff_word_gpos(s) == "content")
/* ff_word_n_content: name of the first content word following s.        */
/* Starts at the item after s in the "Word" relation (s itself is not    */
/* tested) and walks forward until ff_word_gpos() yields "content".      */
/* NOTE(review): the value returned when the end of the relation is      */
/* reached without a match is not visible in this listing.               */
78 static EST_Val ff_word_n_content(EST_Item *s)
80 // returns the next content word after s
83 for (p=s->as_relation("Word")->next(); p != 0; p = p->next())
85 if (ff_word_gpos(p) == "content")
86 return EST_Val(p->name());
/* ff_word_nn_content: name of the second content word following s       */
/* (per the comment below).  Walks forward through the "Word" relation   */
/* from the item after s, testing each item with ff_word_gpos().         */
/* NOTE(review): the statements between the visible `if` and `return`    */
/* (which must skip the first match) and the not-found return are        */
/* missing from this listing -- confirm against the full source.         */
92 static EST_Val ff_word_nn_content(EST_Item *s)
94 // returns the next next content word after s
98 for (p=s->as_relation("Word")->next(); p != 0; p = p->next())
100 if (ff_word_gpos(p) == "content")
104 return EST_Val(p->name());
/* ff_word_p_content: name of the nearest content word preceding s.      */
/* Starts at the item before s in the "Word" relation (s itself is not   */
/* tested) and walks backward until ff_word_gpos() yields "content".     */
/* NOTE(review): the value returned when the start of the relation is    */
/* reached without a match is not visible in this listing.               */
111 static EST_Val ff_word_p_content(EST_Item *s)
113 // returns the previous content word before s
116 for (p=s->as_relation("Word")->prev(); p != 0; p = p->prev())
117 if (ff_word_gpos(p) == "content")
118 return EST_Val(p->name());
/* ff_word_pp_content: name of the second content word preceding s       */
/* (per the comment below).  Walks backward through the "Word" relation  */
/* from the item before s, testing each item with ff_word_gpos().        */
/* NOTE(review): the statements between the visible `if` and `return`    */
/* (which must skip the first match) and the not-found return are        */
/* missing from this listing -- confirm against the full source.         */
123 static EST_Val ff_word_pp_content(EST_Item *s)
125 // returns the previous previous content word before s
129 for (p=s->as_relation("Word")->prev(); p != 0; p = p->prev())
131 if (ff_word_gpos(p) == "content")
135 return EST_Val(p->name());
/* ff_content_words_out: examines the items that follow s in the Phrase  */
/* relation, testing each with ff_word_gpos() for "content".  The scan   */
/* starts at nn->next(), so s itself is not examined.                    */
/* NOTE(review): the counter and the return statement are not visible    */
/* in this listing; the registered doc string describes the result as    */
/* the number of content words to the end of the phrase.                 */
142 static EST_Val ff_content_words_out(EST_Item *s)
144 EST_Item *nn = s->as_relation(Phrase);
148 for (p=nn->next(); p; p=p->next())
150 if (ff_word_gpos(p) == "content")
153 // don't think you can get here
/* ff_content_words_in: examines the items that precede s in the Phrase  */
/* relation, testing each with ff_word_gpos() for "content".  The scan   */
/* starts at nn->prev(), so s itself is not examined.                    */
/* NOTE(review): the counter and the return statement are not visible    */
/* in this listing; the registered doc string describes the result as    */
/* the number of content words from the start of the phrase.             */
157 static EST_Val ff_content_words_in(EST_Item *s)
159 EST_Item *nn = s->as_relation(Phrase);
163 for (p=nn->prev(); p; p=p->prev())
165 if (ff_word_gpos(p) == "content")
168 // don't think you can get here
/* ff_word_cap: test whether the name of s begins with a capital.        */
/* Only the first character is inspected and only the range 'A'..'Z'     */
/* counts, so accented or non-Latin capitals are not recognised.         */
/* NOTE(review): the return statements are not visible in this listing;  */
/* the registered doc string says 1 for a capital and 0 otherwise.       */
172 static EST_Val ff_word_cap(EST_Item *s)
174 // "1" if the word starts with a capital letter
175 const char *word = s->name();
177 if ((word[0] >= 'A') && (word[0] <='Z'))
/* ff_syl_onset_type: classify the onset of syllable s as "-V", "+V-S"   */
/* or "+S".  The daughters of s in the "SylStructure" relation are       */
/* scanned from the first one, testing each phone name with              */
/* ph_is_vowel(), ph_is_voiced() and ph_is_sonorant().                   */
/* NOTE(review): the statements attached to those three tests and the    */
/* conditions choosing among the last three returns are missing from     */
/* this listing -- confirm against the full source.                      */
/* NOTE(review): the loop condition calls p->next() on daughter1(nn)     */
/* with no visible null check, so a syllable without daughters would     */
/* dereference a null item -- verify callers guarantee at least one.     */
183 static EST_Val ff_syl_onset_type(EST_Item *s)
185 // Return van Santen's classification of onset type in to one
187 // -V contains only voiceless consonants
188 // +V-S contains voiced obstruents but no sonorants
189 // +S contains just sonorants
190 EST_Item *nn = s->as_relation("SylStructure");
// the last daughter is never tested: the loop stops once p->next() is 0
195 for (p=daughter1(nn); p->next() != 0; p=p->next())
197 if (ph_is_vowel(p->name()))
199 if (ph_is_voiced(p->name()))
201 if (ph_is_sonorant(p->name()))
// p still on the first daughter means no onset phones were consumed
205 if (p==daughter1(nn)) // null-onset case
206 return EST_Val("+V-S");
208 return EST_Val("+S");
210 return EST_Val("+V-S");
212 return EST_Val("-V");
/* ff_syl_coda_type: classify the coda of syllable s as "-V", "+V-S"     */
/* or "+S".  A first pass over the daughters of s in the "SylStructure"  */
/* relation tests each phone with ph_is_vowel(); a second pass tests     */
/* the phones after the stopping point with ph_is_voiced() and           */
/* ph_is_sonorant().                                                     */
/* NOTE(review): the statements attached to those tests and the          */
/* conditions choosing among the last three returns are missing from     */
/* this listing -- confirm against the full source.                      */
/* NOTE(review): the first loop calls p->next() on daughter1(nn) with    */
/* no visible null check -- verify callers guarantee a daughter.         */
215 static EST_Val ff_syl_coda_type(EST_Item *s)
217 // Return van Santen's classification of coda type in to one
219 // -V contains only voiceless consonants
220 // +V-S contains voiced obstruents but no sonorants
221 // +S contains just sonorants
222 EST_Item *nn = s->as_relation("SylStructure");
227 for (p=daughter1(nn); p->next() != 0; p=p->next())
229 if (ph_is_vowel(p->name()))
// nothing follows the stopping point, so there are no coda phones
233 if (p->next() == 0) // empty coda
234 return EST_Val("+S");
// examine every phone after the stopping point
236 for (p=p->next(); p != 0; p=p->next())
238 if (ph_is_voiced(p->name()))
240 if (ph_is_sonorant(p->name()))
245 return EST_Val("+S");
247 return EST_Val("+V-S");
249 return EST_Val("-V");
/* festival_lex_ff_init: register this file's feature functions.         */
/* Each festival_def_nff() call passes a feature name, a second string   */
/* ("Word" or "Syllable"), the implementing function and a doc string.   */
/* Features registered: gpos, contentp, cap, n_content, nn_content,      */
/* p_content, pp_content, content_words_out, content_words_in,           */
/* syl_onset_type and syl_coda_type.                                     */
/* NOTE(review): the second argument looks like the item type the        */
/* feature applies to -- confirm against festival_def_nff's declaration. */
/* NOTE(review): the opening lines of several doc strings are missing    */
/* from this listing.                                                    */
252 void festival_lex_ff_init(void)
255 festival_def_nff("gpos","Word",ff_word_gpos,
257 Returns a guess at the part of speech of this word. The lisp a-list\n\
258 guess_pos is used to load up this word. If no part of speech is\n\
259 found in there \"content\" is returned. This allows a quick efficient\n\
260 method for part of speech tagging into closed class and content words.");
261 festival_def_nff("contentp","Word",ff_word_contentp,
263 Returns 1 if this word is a content word as defined by gpos, 0 otherwise.");
264 festival_def_nff("cap","Word",ff_word_cap,
266 Returns 1 if this word starts with a capital letter, 0 otherwise.");
267 festival_def_nff("n_content","Word",ff_word_n_content,
269 Next content word. Note this doesn't use the standard n. notation as\n\
270 it may have to search a number of words forward before finding a\n\
271 non-function word. Uses gpos to define content/function word distinction.\n\
272 This also works for Tokens.");
273 festival_def_nff("nn_content","Word",ff_word_nn_content,
275 Next next content word. Note this doesn't use the standard n.n. notation\n\
276 as it may have to search a number of words forward before finding the \n\
277 second non-function word. Uses gpos to define content/function word\n\
278 distinction. This also works for Tokens.");
279 festival_def_nff("p_content","Word",ff_word_p_content,
281 Previous content word. Note this doesn't use the standard p. notation\n\
282 as it may have to search a number of words backward before finding the \n\
283 first non-function word. Uses gpos to define content/function word\n\
284 distinction. This also works for Tokens.");
285 festival_def_nff("pp_content","Word",ff_word_pp_content,
287 Previous previous content word. Note this doesn't use the standard p.p.\n\
288 notation as it may have to search a number of words backward before\n\
289 finding the first non-function word. Uses gpos to define \n\
290 content/function word distinction. This also works for Tokens.");
291 festival_def_nff("content_words_out","Word",ff_content_words_out,
292 "Word.content_words_out\n\
293 Number of content words to end of this phrase.");
294 festival_def_nff("content_words_in","Word",ff_content_words_in,
295 "Word.content_words_in\n\
296 Number of content words from start this phrase.");
297 festival_def_nff("syl_onset_type","Syllable",ff_syl_onset_type,
298 "Syllable.syl_onset_type\n\
299 Return the van Santen and Hirschberg classification. -V for unvoiced,\n\
300 +V-S for voiced but no sonorants, and +S for sonorants.");
301 festival_def_nff("syl_coda_type","Syllable",ff_syl_coda_type,
302 "Syllable.syl_coda_type\n\
303 Return the van Santen and Hirschberg classification. -V for unvoiced,\n\
304 +V-S for voiced but no sonorants, and +S for sonorants.");