1 /*************************************************************************/
3 /* Carnegie Mellon University and */
4 /* Centre for Speech Technology Research */
5 /* University of Edinburgh, UK */
6 /* Copyright (c) 1998-2001 */
7 /* All Rights Reserved. */
9 /* Permission is hereby granted, free of charge, to use and distribute */
10 /* this software and its documentation without restriction, including */
11 /* without limitation the rights to use, copy, modify, merge, publish, */
12 /* distribute, sublicense, and/or sell copies of this work, and to */
13 /* permit persons to whom this work is furnished to do so, subject to */
14 /* the following conditions: */
15 /* 1. The code must retain the above copyright notice, this list of */
16 /* conditions and the following disclaimer. */
17 /* 2. Any modifications must be clearly marked as such. */
18 /* 3. Original authors' names are not deleted. */
19 /* 4. The authors' names are not used to endorse or promote products */
20 /* derived from this software without specific prior written */
23 /* THE UNIVERSITY OF EDINBURGH, CARNEGIE MELLON UNIVERSITY AND THE */
24 /* CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES WITH REGARD TO */
25 /* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY */
26 /* AND FITNESS, IN NO EVENT SHALL THE UNIVERSITY OF EDINBURGH, CARNEGIE */
27 /* MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, */
28 /* INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER */
29 /* RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION */
30 /* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF */
31 /* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
33 /*************************************************************************/
34 /* Author : Alan W Black */
35 /* Date : April 1998 */
36 /*-----------------------------------------------------------------------*/
38 /* Yet another unit selection method. */
40 /* Using an acoustic measure find the distance between all units in the */
41 /* db. Try to minimise the mean difference between units in a cluster */
42 /* using CART technology, based on features like phonetic and prosodic */
43 /* context. This gives a bunch of CARTs for each unit type in the db */
44 /* which are acoustically close. Use these as candidates and optimise */
45 /* a path through them minimising join using a viterbi search. */
48 /* requires little or no measurements at selection time */
49 /* allows for clear method of pruning */
50 /* no weights need to be generated (well, except where they do) */
51 /* will optimise appropriately with varying numbers of example units */
54 /* Units can't cross between clusters */
56 /* Implementation of Black, A. and Taylor, P. (1997). Automatically */
57 /* clustering similar units for unit selection in speech synthesis */
58 /* Proceedings of Eurospeech 97, vol2 pp 601-604, Rhodes, Greece. */
60 /* postscript: http://www.cs.cmu.edu/~awb/papers/ES97units.ps */
61 /* http://www.cs.cmu.edu/~awb/papers/ES97units/ES97units.html */
65 /* This is a new implementation using the newer unit selection/signal */
66 /* processing architecture in festival */
68 /* This is still in development but is becoming more stable. It is robust */
69 /* for many cases, though a lot depends on the db and parameters */
72 /* This had significant new work (and bug fixes) done on it when awb */
75 /*=======================================================================*/
// Feature names set on Viterbi paths by TS_npath and passed to
// copy_feature in clunits_select.  They are runtime strings, so any code
// reading these features must use exactly the same spelling.
81 static EST_String static_unit_prev_move = "unit_prev_move";
82 static EST_String static_unit_this_move = "unit_this_move";
83 static EST_String static_jscore = "local_join_cost";
84 static EST_String static_tscore = "local_target_cost";
85 static EST_String static_cscore = "cummulative_unit_score";
// Forward declarations for the static helpers defined later in this file.
87 static void setup_clunits_params();
88 static EST_VTCandidate *TS_candlist(EST_Item *s,EST_Features &f);
89 static EST_VTPath *TS_npath(EST_VTPath *p,EST_VTCandidate *c,EST_Features &f);
90 static float naive_join_cost(CLunit *unit0, CLunit *unit1,
94 static float optimal_couple(CLunit *u0,
99 float different_prev_pen,
100 float non_consecutive_pen);
101 static void cl_parse_diphone_times(EST_Relation &diphone_stream,
102 EST_Relation &source_lab);
// Value-class registrations for candidates and units.
// NOTE(review): presumably these supply the vtcand(), clunit() and
// est_val() conversions used throughout this file - confirm against the
// EST_Val headers.
104 VAL_REGISTER_CLASS_NODEL(vtcand,EST_VTCandidate);
105 VAL_REGISTER_CLASS_NODEL(clunit,CLunit);
// Lisp-side state; both are gc_protect-ed in festival_clunits_init.
107 LISP selection_trees = NIL;
108 LISP clunits_params = NIL;
// Module parameters.  The values here are only initial defaults: each one
// is overwritten from clunits_params by setup_clunits_params().
109 static int optimal_coupling = 0;
110 static int extend_selections = 0;
111 static int clunits_debug = 0;
112 static int clunits_log_scores = 0;
113 static int clunits_smooth_frames = 0;
// The next four are not static, so they have external linkage.
// different_prev_pen and non_consecutive_pen are also the names of two
// optimal_couple parameters, which shadow these globals inside that function.
114 float continuity_weight = 1;
115 float f0_join_weight = 0.0;
116 float different_prev_pen = 1000.0;
117 float non_consecutive_pen = 100.0;
// Name of the Segment feature copied into clunit_name by clunits_select.
118 static EST_String clunit_name_feat = "name";
// Clunits_Select utterance module (registered in festival_clunits_init).
// Refreshes the module parameters, copies the configured name feature of
// every Segment item into its clunit_name feature, then runs an
// EST_Viterbi_Decoder over the Segment relation with TS_candlist and
// TS_npath as the candidate and path callbacks.
// NOTE(review): the return statement is not visible in this listing.
122 static LISP clunits_select(LISP utt)
124 // Select units from db using CARTs to index into clustered unit groups
125 EST_Utterance *u = get_c_utt(utt);
128 cldb = check_cldb(); // make sure there is one loaded
129 setup_clunits_params();
// Record the unit type name each Segment will be matched against.
131 f = u->relation("Segment")->head();
132 for (s=f; s; s=s->next())
133 s->set_val("clunit_name",ffeature(s,clunit_name_feat));
// Scores are costs, so the decoder is told that smaller is better.
137 EST_Viterbi_Decoder v(TS_candlist,TS_npath,-1);
138 v.set_big_is_good(FALSE); // big is bad
140 v.initialise(u->relation("Segment"));
// Ask the decoder for its result under the unit_id feature; a false
// return is reported as a failed search.
142 if (!v.result("unit_id"))
144 cerr << "CLUNIT: failed to find path\n";
// Request the per-path move and score features set in TS_npath.
// NOTE(review): presumably copy_feature writes them onto the Segment
// items - confirm against EST_Viterbi_Decoder.
147 v.copy_feature(static_unit_this_move);
148 v.copy_feature(static_unit_prev_move);
149 v.copy_feature(static_jscore);
150 v.copy_feature(static_tscore);
151 v.copy_feature(static_cscore);
// Clunits_Get_Units utterance module (registered in festival_clunits_init).
// Builds a Unit relation with one item per Segment, filled from the
// database unit stored in the unit_id feature of that Segment, loads the
// coefficients and signal for each unit, then builds a SourceSegments
// relation and calls cl_parse_diphone_times on the two new relations.
// NOTE(review): the return statement is not visible in this listing.
157 static LISP clunits_get_units(LISP utt)
159 // Create unit stream and loading params
160 EST_Utterance *u = get_c_utt(utt);
161 EST_Relation *units,*ss;
164 cldb = check_cldb(); // make sure there is one loaded
166 units = u->create_relation("Unit");
167 for (s=u->relation("Segment")->head(); s != 0; s=s->next())
169 EST_Item *unit = units->append();
// unit_id is the feature requested from the decoder in clunits_select.
170 CLunit *db_unit = clunit(s->f("unit_id"));
172 unit->set_name(db_unit->name);
173 unit->set("fileid",db_unit->fileid);
174 // These should be modified from the optimal coupling
// Start time: taken from unit_this_move when this Segment has a
// predecessor and carries that feature.
// NOTE(review): the fallback value of st is not visible in this listing.
175 if ((s->prev()) && (s->f_present("unit_this_move")))
176 st = s->F("unit_this_move");
// End time: taken from unit_prev_move of the following Segment when present.
// NOTE(review): the fallback value of e is not visible in this listing.
179 if (s->next() && (s->next()->f_present("unit_prev_move")))
180 e = s->next()->F("unit_prev_move");
// start and unit_start both receive st; middle, unit_middle and seg_start
// all receive the start time stored on the database unit.
185 unit->set("start",st);
186 unit->set("middle",db_unit->start);
188 unit->set("unit_start",st);
189 unit->set("unit_middle",db_unit->start);
190 unit->set("unit_end",e);
191 unit->set("seg_start",db_unit->start);
192 unit->set("seg_end",db_unit->end);
193 cldb->load_coefs_sig(unit);
// Diagnostic print of the chosen unit.
// NOTE(review): its guard and remaining arguments are not visible here.
195 printf("unit: %s fileid %s start %f end %f\n",
196 (const char *)db_unit->name,
197 (const char *)db_unit->fileid,
201 // Make it look as much like the diphones as possible for
202 // the rest of the code
// One SourceSegments item per Segment, named by its clunit_name feature.
203 ss = u->create_relation("SourceSegments");
204 for (s = u->relation("Segment")->head(); s != 0 ; s = s->next())
206 EST_Item *d = ss->append();
207 d->set_name(ffeature(s,"clunit_name"));
210 cl_parse_diphone_times(*units,*ss);
// Assign end times to the items of source_lab and diphone_stream from the
// frame times of the coefs track of each unit.
//   diphone_stream : the Unit relation built by clunits_get_units
//   source_lab     : the SourceSegments relation built by clunits_get_units
// The two relations are walked together; iteration is driven by the units.
215 static void cl_parse_diphone_times(EST_Relation &diphone_stream,
216 EST_Relation &source_lab)
220 int e_frame, m_frame = 0;
// NOTE(review): p_time has no initializer in this declaration; confirm it
// is assigned before the loop below reads it.
221 float dur_1 = 0.0, dur_2 = 0.0, p_time;
222 float t_time = 0.0, end;
225 for (s = source_lab.head(), u = diphone_stream.head(); u; u = u->next(),
228 pm = track(u->f("coefs"));
231 cerr << "CLUNIT: couldn't get pitchmarks for " << u->name() << endl;
// e_frame is the last frame of the track; m_frame comes from the
// middle_frame feature of the unit.
235 e_frame = pm->num_frames() - 1;
236 m_frame = u->I("middle_frame");
// dur_1 is the time of the middle frame; dur_2 is the remaining time from
// there to the last frame.
238 dur_1 = pm->t(m_frame);
239 dur_2 = pm->t(e_frame) - dur_1;
// The source segment ends dur_1 after the running offset p_time, which is
// then advanced by dur_2.
241 s->set("end", (dur_1 + p_time));
242 p_time = s->F("end") + dur_2;
// NOTE(review): end is computed here but the visible lines store t_time on
// the unit; confirm t_time is updated from end in a line not shown.
244 end = dur_1 + dur_2 + t_time;
246 u->set("end", t_time);
249 s->set("end", (dur_2 + p_time));
// Clunits_Simple_Wave utterance module (registered in festival_clunits_init).
// Concatenates the sig waveform of every item in the Unit relation into a
// single EST_Wave, adjusting each join towards nearby zero crossings, and
// stores the result as the wave feature of a new Wave relation item.
// NOTE(review): the return statement is not visible in this listing.
252 static LISP clunits_simple_wave(LISP utt)
254 // Naive joining of waveforms
255 EST_Utterance *u = get_c_utt(utt);
256 EST_Wave *w = new EST_Wave;
// First pass: total number of samples over all units.
262 for (size=0,s=u->relation("Unit")->head(); s != 0; s = s->next())
263 size += wave(s->f("sig"))->num_samples();
265 if (u->relation("Unit")->head())
266 { // This will copy the necessary wave features across
267 s = u->relation("Unit")->head();
268 *w = *(wave(s->f("sig")));
// i is the write position in w, starting just past the first unit.
270 i = w->num_samples();
271 w->resize(size); // its maximum size
// Append every later unit in turn.
272 for (s=u->relation("Unit")->head()->next(); s; s=s->next())
274 w1 = wave(s->f("sig"));
275 // Find last zero crossing
// Step i back by at most 40 samples, testing for a sign change between
// samples i-1 and i of the output built so far.
276 for (c=0; ((i > 0) && (c < 40)); c++,i--)
277 if (((w->a_no_check(i) < 0) && (w->a_no_check(i-1) >= 0)) ||
278 ((w->a_no_check(i) >= 0) && (w->a_no_check(i-1) < 0)))
// No crossing found within 40 samples: restore the write position.
280 if (c == 40) i += 40;
281 // Find next zero crossing
// NOTE(review): c is not advanced in this loop header (only k and i are),
// so the c < 40 bound and the c == 40 test below look ineffective unless c
// changes in a line not shown.  i also advances while scanning - confirm
// that is intended.
282 for (c=0,k=1; ((k < w1->num_samples()) && (c < 40)); k++,i++)
283 if (((w1->a_no_check(k) < 0) && (w1->a_no_check(k-1) >= 0)) ||
284 ((w1->a_no_check(k) >= 0) && (w1->a_no_check(k-1) < 0)))
286 if (c == 40) k -= 40;
// Copy the rest of this unit into the output from sample k onwards.
287 for (; k < w1->num_samples(); k++,i++)
288 w->a_no_check(i) = w1->a_no_check(k);
// Publish the joined waveform on the utterance.
292 witem = u->create_relation("Wave")->append();
293 witem->set_val("wave",est_val(w));
// Clunits_Windowed_Wave utterance module (registered in
// festival_clunits_init).  Builds one output EST_Wave from the units by
// applying a raised-cosine window around each frame time of the coefs track
// of every unit, then stores the result as the wave feature of a new Wave
// relation item.
// NOTE(review): the accumulation statement and the return statement are
// only partly visible in this listing.
298 static LISP clunits_windowed_wave(LISP utt)
300 // windowed join, no prosodic modification
301 EST_Utterance *u = get_c_utt(utt);
302 EST_Wave *w = new EST_Wave;
307 int size,i,k,wi,samp_idx, l_samp_idx;
// First pass: total number of samples over all units.
311 for (size=0,s=u->relation("Unit")->head(); s != 0; s = s->next())
312 size += wave(s->f("sig"))->num_samples();
314 if (u->relation("Unit")->head())
315 { // This will copy the necessary wave features across
316 s = u->relation("Unit")->head();
317 www = wave(s->f("sig"));
320 w->resize(size); // its maximum size
323 for (s=u->relation("Unit")->head(); s; s=s->next())
325 w1 = wave(s->f("sig"));
326 t1 = track(s->f("coefs"));
// Visit every frame of the track except the last one.
329 for (i=0; i < t1->num_frames()-1; i++)
// Frame time converted to a sample index; width is the distance in samples
// from the previous frame.
331 samp_idx = (int)(t1->t(i)*w->sample_rate());
332 width = samp_idx - l_samp_idx;
// Optional smoothing: on the first frame of a unit, average the width with
// lwidth when lwidth is nonzero.
// NOTE(review): where lwidth is updated is not visible in this listing.
333 if (clunits_smooth_frames && (i==0) && (lwidth != 0))
334 width = (width+lwidth)/2; // not sure if this is worth it
// Window spans k in [-width, width), clipped to the end of the unit wave.
336 for (k=-width; ((k<width)&&((samp_idx+k)<w1->num_samples())) ;k++)
// Weight 0.5*(1+cos(PI*k/width)): 1 at the frame centre, 0 at k = -width.
338 (int)(0.5*(1+cos((PI/(double)(width))*(double)k))*
340 l_samp_idx = samp_idx;
// Publish the joined waveform on the utterance.
346 witem = u->create_relation("Wave")->append();
347 witem->set_val("wave",est_val(w));
// Clunits_SmoothedJoin_Wave utterance module (registered in
// festival_clunits_init).  Concatenates, for every Unit item, the samples
// of its sig waveform between its samp_start and samp_end features into one
// EST_Wave, and stores it as the wave feature of a new Wave relation item.
// NOTE(review): the return statement is not visible in this listing.
352 static LISP clunits_smoothedjoin_wave(LISP utt)
354 // Actually not very smoothed yet, just joined
355 EST_Utterance *u = get_c_utt(utt);
356 EST_Wave *w = new EST_Wave;
362 int samp_end, samp_start;
// First pass: total length is the sum of the selected sample ranges.
365 for (size=0,s=u->relation("Unit")->head(); s != 0; s = s->next())
367 samp_end = s->I("samp_end");
368 samp_start = s->I("samp_start");
369 size += samp_end-samp_start;
372 if (u->relation("Unit")->head())
373 { // This will copy the necessary wave features across
374 s = u->relation("Unit")->head();
375 www = wave(s->f("sig"));
378 w->resize(size); // its maximum size
// Second pass: copy each selected range; wi is the write position in w.
// NOTE(review): the initialisation of wi is not visible in this listing.
380 for (s=u->relation("Unit")->head(); s; s=s->next())
382 samp_end = s->I("samp_end");
383 samp_start = s->I("samp_start");
384 w1 = wave(s->f("sig"));
// NOTE(review): the block comment opened on the next line has no visible
// terminator before the later debug comment.  As listed, the copy loop
// would sit inside that comment - confirm against the real file.  In the
// visible lines t1 is also assigned but never read.
385 /* printf("%s %s %f %f %d %d\n",
386 (const char *)s->S("name"),
387 (const char *)s->S("fileid"),
388 (float)samp_start/(float)w->sample_rate(),
389 (float)samp_end/(float)w->sample_rate(),
392 t1 = track(s->f("coefs"));
393 for (i=samp_start; i<samp_end; i++,wi++)
394 w->a_no_check(wi) = w1->a_no_check(i);
395 /* printf("%d %f\n",wi,(float)wi/(float)w->sample_rate()); */
// Publish the joined waveform on the utterance.
399 witem = u->create_relation("Wave")->append();
400 witem->set_val("wave",est_val(w));
// Refresh every module parameter from the Lisp variable clunits_params.
// Called at the start of clunits_select, so parameter changes made on the
// Lisp side take effect on the next selection.  Each parameter falls back
// to the default given as the last argument of its get_param_* call.
405 static void setup_clunits_params()
408 clunits_params = siod_get_lval("clunits_params",
409 "CLUNITS: no parameters set for module");
410 optimal_coupling = get_param_int("optimal_coupling",clunits_params,0);
411 different_prev_pen = get_param_float("different_prev_pen",clunits_params,1000.0);
// The key for this parameter was misspelled, so a correctly spelled
// non_consecutive_pen entry was silently ignored.  Read the correct
// spelling first and fall back to the legacy misspelling so existing
// parameter lists keep working; the default stays 100.0.
412 non_consecutive_pen = get_param_float("non_consecutive_pen",clunits_params,
    get_param_float("non_consectutive_pen",clunits_params,100.0));
413 extend_selections = get_param_int("extend_selections",clunits_params,0);
414 continuity_weight = get_param_float("continuity_weight",clunits_params,1);
415 f0_join_weight = get_param_float("f0_join_weight",clunits_params,0.0);
416 clunits_debug = get_param_int("clunits_debug",clunits_params,0);
// Note that these two keys differ from the variable names they fill.
417 clunits_log_scores = get_param_int("log_scores",clunits_params,0);
418 clunits_smooth_frames = get_param_int("smooth_frames",clunits_params,0);
419 clunit_name_feat = get_param_str("clunit_name_feat",clunits_params,"name");
// NOTE(review): as listed, the result of this lookup is discarded; confirm
// that a line not shown assigns it to selection_trees, which TS_candlist reads.
421 siod_get_lval("clunits_selection_trees",
422 "CLUNITS: clunits_selection_trees unbound");
// Viterbi candidate callback, passed to EST_Viterbi_Decoder in
// clunits_select.
//   s : the target Segment item; its clunit_name feature gives the unit type
//   f : decoder features; not referenced in the visible lines
// Builds the candidate list for s from the cluster predicted for it, with
// optional extra candidates when extend_selections is set, and also stores
// the list on s as the unit_cands feature.
// NOTE(review): the list linking and the return statement are not visible
// in this listing.
425 static EST_VTCandidate *TS_candlist(EST_Item *s,EST_Features &f)
427 // Return a list of candidate units for target s
428 // Use the appropriate CART to select a small group of candidates
429 EST_VTCandidate *all_cands = 0;
430 EST_VTCandidate *c, *gt;
431 LISP tree,group,l,pd,cc,ls;
433 EST_String lookingfor;
440 lookingfor = s->S("clunit_name");
// The prediction pd comes either from user hooks or from the selection
// tree stored under this unit type name.
// NOTE(review): the branch choosing between the two is not visible.  The
// hook variable is also looked up twice; the second lookup could reuse cc.
443 cc = siod_get_lval("clunits_cand_hooks",NULL);
445 pd = apply_hooks(siod_get_lval("clunits_cand_hooks",NULL),
449 tree = car(cdr(siod_assoc_str(lookingfor,selection_trees)));
450 pd = wagon_pd(s,tree);
454 cerr << "CLUNITS: no predicted class for " <<
455 s->S("clunit_name") << endl;
// The second element of pd is used as the cluster mean.
// NOTE(review): where group is taken from pd is not visible in this listing.
459 cluster_mean = get_c_float(car(cdr(pd)));
// One candidate per cluster member; bbb counts them for the debug print.
461 for (bbb=0,l=group; l != NIL; l=cdr(l),bbb++)
463 c = new EST_VTCandidate;
// Database unit names are the unit type, an underscore, then the first
// element of the cluster entry.
464 name = s->S("clunit_name")+"_"+get_c_string(car(car(l)));
465 u = cldb->get_unit(name);
468 cerr << "CLUNITS: failed to find unit " << name <<
472 cldb->load_join_coefs(u);
473 c->name = est_val(u);
475 // Mean distance from others in cluster (could be precalculated)
// Target score: squared difference between the second element of the
// entry and the cluster mean.
476 c->score = get_c_float(car(cdr(car(l))))-cluster_mean;
477 c->score *= c->score;
478 // Maybe this should be divided by overall mean of set
479 // to normalise this figure (?)
485 if (extend_selections)
487 // An experiment, for all candidates of the previous
488 // item whose following is of this phone type, include
489 // them as a candidate
490 EST_Item *ppp = s->prev();
// Walk the candidates stored on the previous item, bounded by
// extend_selections.
// NOTE(review): the increment of ccc is not visible in this listing.
493 EST_VTCandidate *lc = vtcand(ppp->f("unit_cands"));
494 for (ccc=0 ; lc && (ccc < extend_selections); lc = lc->next)
496 CLunit *unit = clunit(lc->name);
500 next_unit = unit->next_unit;
// Derive the unit type of next_unit from its name.
// NOTE(review): ss is the text before an underscore in the name, while the
// pattern requires two underscores.  If before() stops at the first
// underscore this test cannot succeed - confirm EST_String::before and
// whether the full name was meant to be tested.
504 ss = next_unit->name.before("_");
505 if (ss.matches(".*_.*_.*"))
508 ss += next_unit->name.after("_").before("_");
510 /* printf("%s %s\n",(const char *)ss, (const char *)lookingfor); */
// gt ends as 0 only when no existing candidate already names this unit.
511 for (gt=all_cands; gt; gt=gt->next)
512 if (clunit(gt->name)->name == next_unit->name)
513 break; /* got this one already */
514 if ((ss == lookingfor) && (gt == 0))
515 { // its the right type so add it
516 c = new EST_VTCandidate;
517 c->name = est_val(next_unit);
518 cldb->load_join_coefs(next_unit);
// Keep the list on the item so the next Segment can extend from it.
529 s->set_val("unit_cands",est_val(all_cands));
// NOTE(review): the guard on this diagnostic print is not visible here.
532 printf("cands %d (extends %d) %s\n",bbb,ccc,(const char *)lookingfor);
// Viterbi path callback, passed to EST_Viterbi_Decoder in clunits_select.
//   p : the path being extended; it or its candidate may be null
//   c : the candidate being appended
//   f : decoder features; not referenced in the visible lines
// Allocates a new EST_VTPath whose score is the target score of c plus the
// weighted join cost, and records the join, target and cumulative scores
// as path features.
// NOTE(review): the path linking and the return statement are not visible
// in this listing.
536 static EST_VTPath *TS_npath(EST_VTPath *p,EST_VTCandidate *c,EST_Features &f)
538 // Combine candidate c with previous path updating score
541 EST_VTPath *np = new EST_VTPath;
543 float u0_move=0.0, u1_move=0.0;
548 if ((p == 0) || (p->c == 0))
549 cost = 0; // nothing previous to join to
// Otherwise compute the join cost between the previous unit u0 and the
// new unit u1.
552 u0 = clunit(p->c->name);
553 u1 = clunit(c->name);
554 // printf("u0 %s u1 %s\n",
555 // (const char *)u0->name,
556 // (const char *)u1->name);
// Any nonzero optimal_coupling selects optimal_couple.
557 if (optimal_coupling)
558 cost = optimal_couple(u0,u1,u0_move,u1_move,
561 non_consecutive_pen);
562 else // naive measure
563 cost = naive_join_cost(u0,u1,c->s,u0_move,u1_move);
564 // When optimal_coupling == 2 the moves will be 0, just the scores
// The moved join points are recorded only in mode 1.
566 if (optimal_coupling == 1)
568 np->f.set(static_unit_prev_move,u0_move); // new (prev) end
569 np->f.set(static_unit_this_move,u1_move); // new start
572 // printf("cost %f continuity_weight %f\n", cost, continuity_weight);
573 cost *= continuity_weight;
574 np->state = c->pos; // "state" is candidate number
// NOTE(review): the statement guarded by this test is not visible; given
// the parameter name it presumably converts cost to a log scale.
575 if (clunits_log_scores && (cost != 0))
578 np->f.set(static_jscore,cost);
579 np->f.set(static_tscore,c->score);
// Path score: local score alone, or local score plus the score of p.
// NOTE(review): the condition selecting between the two is not visible.
581 np->score = (c->score+cost);
583 np->score = (c->score+cost) + p->score;
584 np->f.set(static_cscore,np->score);
// Although labelled as the joining cost, this prints the path score.
586 if (clunits_debug > 1)
587 printf("joining cost %f\n",np->score);
// Join cost between u0 and u1, found by comparing the join coefficient
// frames of u0 with those of the database predecessor of u1.
//   u0, u1              : previous and candidate units
//   different_prev_pen  : factor applied when that predecessor has a
//                         different base name from u0
//   non_consecutive_pen : constant added to the final cost
// Both penalty parameters shadow the file-scope globals of the same name.
// The best frame times are written to u0_move and u1_move, and the result
// is non_consecutive_pen + best_val * f.
// NOTE(review): the remaining parameters and several branches are not
// visible in this listing.
591 static float optimal_couple(CLunit *u0,
596 float different_prev_pen,
597 float non_consecutive_pen
600 // Find combination cost of u0 to u1, checking for best
601 // frame up to n frames back in u0 and u1.
602 // Note this checks the u0 with u1's predecessor, which may or may not
603 // be of the same type
604 // There is some optimisation here in unit coeff access
605 EST_Track *u0_cep, *u1_p_cep;
606 float dist, best_val;
609 int u1_p_st, u1_p_end;
610 int best_u0, best_u1;
614 u1_p = u1->prev_unit;
// NOTE(review): the result for the consecutive case is not visible here.
622 if (u1_p == u0) // they are consecutive
624 if (u1_p == 0) // hacky condition, when there is no previous we'll
625 return 0.0; // assume a good join (should be silence there)
// Load the join coefficients of the predecessor on demand.
627 if (u1_p->join_coeffs == 0)
628 cldb->load_join_coefs(u1_p);
629 // Get indexes into full cep for utterances rather than sub ceps
630 u0_cep = u0->join_coeffs;
631 u1_p_cep = u1_p->join_coeffs;
633 u0_end = u0_cep->num_frames();
634 u1_p_end = u1_p_cep->num_frames();
// Choose the first frame to compare in each track.  There are three cases:
// different base name, edge only, or the last two thirds of the frames.
// NOTE(review): the condition for the edge-only case is not visible.
636 if (!streq(u1_p->base_name,u0->base_name))
637 { /* prev(u1) is a different phone from u0 so don't slide */
638 f = different_prev_pen;
639 u0_st = u0_cep->num_frames()-1;
640 u1_p_st = u1_p_cep->num_frames()-1;
643 { /* we'll only check the edge for the join */
644 u0_st = u0_cep->num_frames()-1;
645 u1_p_st = u1_p_cep->num_frames()-1;
650 u0_st = (int)(u0_cep->num_frames() * 0.33);
651 u1_p_st = (int)(u1_p_cep->num_frames() * 0.33);
659 // Here we look for the best join without sliding the windows
// The scan length comes from comparing the two frame ranges.
// NOTE(review): which range wins in each branch is only partly visible.
660 if ((u0_end-u0_st) < (u1_p_end-u1_p_st))
663 eee = u1_p_end-u1_p_st;
664 for (i=0; i < eee; i++)
666 dist = frame_distance(*u0_cep,i+u0_st,
678 // This tries *all* possible matches in the pair, its slow
679 // and has a tendency to shorten things more than you'd like
680 // so we just use the more simple test above.
// NOTE(review): the comment above suggests this all-pairs search is
// disabled, but any preprocessor guard is not visible in this listing.
682 for (i=u0_st; i < u0_end; i++)
684 for (j=u1_p_st; j < u1_p_end; j++)
686 dist = frame_distance(*u0_cep,i,
// Report the times of the best matching frames as the new join points.
701 u0_move = u0_cep->t(best_u0);
702 u1_move = u1_p_cep->t(best_u1);
705 return non_consecutive_pen+(best_val*f);
// Fallback join cost, used by TS_npath when optimal_coupling is zero.
// Sets u0_move to the end of unit0 and u1_move to the start of unit1, then
// picks a cost from a chain of tests on the units and on the phone name of
// the item s.
// NOTE(review): the remaining parameters, the first test of the chain and
// every returned value are not visible in this listing.
708 static float naive_join_cost(CLunit *unit0, CLunit *unit1,
713 // A naive join cost, because I haven't ported the info yet
715 u0_move = unit0->end;
716 u1_move = unit1->start;
// NOTE(review): prev_unit is dereferenced here with no visible null check,
// while optimal_couple treats a null prev_unit as possible - confirm a
// guard exists in the lines not shown.
720 else if (unit1->prev_unit->name == unit0->name)
722 else if (ph_is_silence(s->name()))
724 else if (ph_is_stop(s->name()))
726 else if (ph_is_fricative(s->name()))
// Lisp primitive clunits:load_all_coefs (registered in
// festival_clunits_init).  For every file id in filelist, asks the current
// database for its coefficient and signal data and for its join
// coefficients.
// NOTE(review): how cldb is obtained and what is returned are not visible
// in this listing.
732 static LISP cldb_load_all_coeffs(LISP filelist)
737 for (f=filelist; f; f=cdr(f))
739 cldb->get_file_coefs_sig(get_c_string(car(f)));
740 cldb->get_file_join_coefs(get_c_string(car(f)));
// Module entry point: proclaims the clunits module, protects the Lisp
// globals of this file from garbage collection, and registers the
// utterance modules and Lisp primitives.  The string given with each
// registration is its user-visible documentation, so it is runtime text.
// Registered utterance modules: Clunits_Select, Clunits_Get_Units,
// Clunits_Simple_Wave, Clunits_Windowed_Wave and Clunits_SmoothedJoin_Wave.
// Registered primitives: clunits:load_db, clunits:select,
// clunits:load_all_coefs, clunits:list, acost:build_disttabs,
// acost:utt.load_coeffs, acost:file_difference and cl_mapping.
746 void festival_clunits_init(void)
748 // Initialization for clunits selection
750 proclaim_module("clunits",
751 "Copyright (C) University of Edinburgh and CMU 1997-2010\n");
// Keep the parameter list and selection trees alive across collections.
753 gc_protect(&clunits_params);
754 gc_protect(&selection_trees);
// Utterance modules implemented in this file.
756 festival_def_utt_module("Clunits_Select",clunits_select,
757 "(Clunits_Select UTT)\n\
758 Select units from current databases using cluster selection method.");
760 festival_def_utt_module("Clunits_Get_Units",clunits_get_units,
761 "(Clunits_Get_Units UTT)\n\
762 Construct Unit relation from the selected units in Segment and extract\n\
763 their parameters from the clunit db.");
765 festival_def_utt_module("Clunits_Simple_Wave",clunits_simple_wave,
766 "(Clunits_Simple_Wave UTT)\n\
767 Naively concatenate signals together into a single wave (for debugging).");
769 festival_def_utt_module("Clunits_Windowed_Wave",clunits_windowed_wave,
770 "(Clunits_Windowed_Wave UTT)\n\
771 Use hamming window over edges of units to join them, no prosodic \n\
772 modification though.");
774 festival_def_utt_module("Clunits_SmoothedJoin_Wave",clunits_smoothedjoin_wave,
775 "(Clunits_SmoothedJoin_Wave UTT)\n\
778 init_subr_1("clunits:load_db",cl_load_db,
779 "(clunits:load_db PARAMS)\n\
780 Load index file for cluster database and set up params, and select it.");
782 init_subr_1("clunits:select",cldb_select,
783 "(clunits:select NAME)\n\
784 Select a previously loaded cluster database.");
786 init_subr_1("clunits:load_all_coefs",cldb_load_all_coeffs,
787 "(clunits:load_all_coefs FILEIDLIST)\n\
788 Load in coefficients, signal and join coefficients for each named\n\
789 fileid. This is can be called at startup to to reduce the load time\n\
790 during synthesis (though may make the image large).");
792 init_subr_0("clunits:list",cldb_list,
794 List names of currently loaded cluster databases.");
// Acoustic cost primitives; their implementations are not in the visible
// part of this file.
796 init_subr_2("acost:build_disttabs",make_unit_distance_tables,
797 "(acost:build_disttabs UTTTYPES PARAMS)\n\
798 Built matrices of distances between each ling_item in each each list\n\
799 of ling_items in uttypes. Uses acoustic weights in PARAMS and save\n\
800 the result as a matrix for later use.");
802 init_subr_2("acost:utt.load_coeffs",acost_utt_load_coeffs,
803 "(acost:utt.load_coeffs UTT PARAMS)\n\
804 Load in the acoustic coefficients into UTT and set the Acoustic_Coeffs\n\
805 feature for each segment in UTT.");
807 init_subr_3("acost:file_difference",ac_distance_tracks,
808 "(acost:file_difference FILENAME1 FILENAME2 PARAMS)\n\
809 Load in the two named tracks and find the acoustic difference over all\n\
810 based on the weights in PARAMS.");
812 init_subr_2("cl_mapping", l_cl_mapping,
813 "(cl_mapping UTT PARAMS)\n\
814 Impose prosody upto some percentage, and not absolutely.");