Add Tizen.AIAvatar project (#6014)
[platform/core/csapi/tizenfx.git] / src / Tizen.AIAvatar / src / internal / Uix / TTSLipSyncer.cs
1 /*
2  * Copyright(c) 2023 Samsung Electronics Co., Ltd.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */
17
18 using System;
19 using System.Collections.Generic;
20 using Tizen.Uix.Tts;
21 using System.Runtime.InteropServices;
22
23 using static Tizen.AIAvatar.AIAvatar;
24 using System.Linq;
25 using Tizen.NUI;
26
27 namespace Tizen.AIAvatar
28 {
29     internal class TTSLipSyncer
30     {
31
// Avatar whose animator is driven by the synthesized audio.
private Avatar currentAvatar;

// Lip-sync module taken from the avatar's animator in the constructor (null when the cast fails).
private AsyncLipSyncer lipSyncer;

// TTS client and the voice used by the language-only AddText overload.
private TtsClient ttsHandle;
private VoiceInfo voiceInfo;

// Utterances queued through AddText; an entry is removed when its utterance completes.
private List<UtteranceText> textList;

// PCM bytes collected for non-async playback (consumed by PlayPreparedText).
private List<Byte> byteList;

// Working buffers for async streaming: pending PCM and the tail carried over between chunks.
private byte[] recordedBuffer;
private byte[] audioTailBuffer;

// Sample rate reported by the most recent SynthesizedPcm event.
private int sampleRate;

// Factors multiplied with the sample rate to size the async buffers.
// NOTE(review): the durations look like seconds and the multiplier like bytes per sample — confirm.
private float desiredBufferDuration = 0.175f;
private float audioTailLengthFactor = 0.015f;
private float audioBufferMultiflier = 2f;

// Buffer sizes derived from the factors above when an async utterance starts.
private int desiredBufferLength;
private int audioTailLength;
private int audioLength;

// Playback mode flags; both default to false.
private bool isPrepared;
private bool isAsync;
private bool isAsyncLipStarting;

// Not referenced anywhere in the visible part of this class.
private Action<byte[], int> bufferChangedAction;
59
60
/// <summary>
/// Binds the lip syncer to <paramref name="avatar"/>, looks up the avatar's lip-sync
/// animation module and initializes the TTS client.
/// </summary>
/// <param name="avatar">Avatar whose animator receives the lip-sync data.</param>
internal TTSLipSyncer(Avatar avatar)
{
    currentAvatar = avatar;

    // The "as" cast leaves lipSyncer null when the module is not an AsyncLipSyncer.
    var lipSyncModule = avatar.AvatarAnimator.GetAnimationModule(AnimationModuleType.LipSyncer);
    lipSyncer = lipSyncModule as AsyncLipSyncer;

    InitTts();
}
67
// Finalizer: runs the same teardown as an explicit DeinitTts() call.
~TTSLipSyncer() => DeinitTts();
72
/// <summary>
/// Invoked from the synthesized-PCM handler: when an async utterance starts,
/// or when a prepared (non-async) utterance has finished synthesizing.
/// </summary>
internal event EventHandler PlayReadyCallback;

/// <summary>
/// The underlying TTS client (null when creation failed or after DeinitTts).
/// </summary>
internal TtsClient TtsHandle => ttsHandle;

/// <summary>
/// Voice whose type is used by the language-only AddText overload.
/// </summary>
internal VoiceInfo VoiceInfo
{
    get => voiceInfo;
    set => voiceInfo = value;
}
88
/// <summary>
/// Collects every voice the TTS client reports as supported.
/// </summary>
/// <returns>
/// One <see cref="VoiceInfo"/> per supported voice; an empty list when the TTS handle is null.
/// </returns>
internal List<VoiceInfo> GetSupportedVoices()
{
    var result = new List<VoiceInfo>();

    if (ttsHandle != null)
    {
        foreach (var voice in ttsHandle.GetSupportedVoices())
        {
            Log.Info(LogTag, $"{voice.Language} & {voice.VoiceType} is supported");

            var info = new VoiceInfo() { Lang = voice.Language, Type = voice.VoiceType };
            result.Add(info);
        }
    }
    else
    {
        Log.Error(LogTag, $"ttsHandle is null");
    }

    return result;
}
107
/// <summary>
/// Checks whether any supported voice uses the language <paramref name="lang"/>.
/// </summary>
/// <param name="lang">Language string compared against each supported voice's Language.</param>
/// <returns>true when a matching voice exists; false otherwise or when the TTS handle is null.</returns>
internal bool IsSupportedVoice(string lang)
{
    if (ttsHandle == null)
    {
        Log.Error(LogTag, $"ttsHandle is null");
        return false;
    }

    // Any() stops at the first match, just like an early return from a loop.
    bool languageFound = ttsHandle.GetSupportedVoices().Any(voice => voice.Language.Equals(lang));
    if (languageFound)
    {
        Log.Info(LogTag, $"{lang} is supported");
    }

    return languageFound;
}
127
/// <summary>
/// Checks whether a supported voice matches both the language and the type of
/// <paramref name="voiceInfo"/>.
/// </summary>
/// <param name="voiceInfo">Language/type pair to look for.</param>
/// <returns>true when a matching voice exists; false otherwise or when the TTS handle is null.</returns>
internal bool IsSupportedVoice(VoiceInfo voiceInfo)
{
    if (ttsHandle == null)
    {
        Log.Error(LogTag, $"ttsHandle is null");
        return false;
    }

    foreach (var candidate in ttsHandle.GetSupportedVoices())
    {
        // Skip voices that differ in language or in voice type.
        if (!candidate.Language.Equals(voiceInfo.Lang) || candidate.VoiceType != voiceInfo.Type)
        {
            continue;
        }

        Log.Info(LogTag, $"{voiceInfo.Lang} & {voiceInfo.Type} is supported");
        return true;
    }

    return false;
}
146
147
/// <summary>
/// Queues <paramref name="txt"/> on the TTS client using the language and type of
/// <paramref name="voiceInfo"/>, and records the returned utterance id in the text list.
/// </summary>
/// <param name="txt">Text to synthesize.</param>
/// <param name="voiceInfo">Voice (language and type) to synthesize with.</param>
internal void AddText(string txt, VoiceInfo voiceInfo)
{
    bool hasNullValue = voiceInfo.Lang == null || voiceInfo.Type == null;
    if (hasNullValue)
    {
        // NOTE(review): this only logs; the request is still forwarded below — confirm that is intended.
        Log.Error(LogTag, "VoiceInfo's value is null");
    }

    if (ttsHandle == null)
    {
        Log.Error(LogTag, $"ttsHandle is null");
        return;
    }

    // Speed argument is fixed at 0.
    var utteranceId = ttsHandle.AddText(txt, voiceInfo.Lang, (int)voiceInfo.Type, 0);
    var utterance = new UtteranceText { Text = txt, UttID = utteranceId };

    try
    {
        textList.Add(utterance);
    }
    catch (Exception e)
    {
        Log.Error(LogTag, $"Error AddText" + e.Message);
    }
}
171
/// <summary>
/// Queues <paramref name="txt"/> on the TTS client in language <paramref name="lang"/>,
/// using the voice type of the current <c>voiceInfo</c> field, and records the returned
/// utterance id in the text list.
/// </summary>
/// <param name="txt">Text to synthesize.</param>
/// <param name="lang">Language passed to the TTS client.</param>
internal void AddText(string txt, string lang)
{
    if (ttsHandle == null)
    {
        Log.Error(LogTag, $"ttsHandle is null");
        return;
    }

    // The voice type comes from the instance field, not from a parameter; speed is fixed at 0.
    var utteranceId = ttsHandle.AddText(txt, lang, (int)voiceInfo.Type, 0);
    var utterance = new UtteranceText { Text = txt, UttID = utteranceId };

    try
    {
        textList.Add(utterance);
    }
    catch (Exception e)
    {
        Log.Error(LogTag, $"Error AddText" + e.Message);
    }
}
191
/// <summary>
/// Starts synthesis in prepared (non-async) mode and registers
/// <paramref name="playReadyCallback"/> to be notified when synthesis finishes.
/// </summary>
/// <param name="playReadyCallback">Handler assigned to <c>PlayReadyCallback</c>.</param>
internal void Prepare(EventHandler playReadyCallback)
{
    if (ttsHandle != null)
    {
        Log.Info(LogTag, "Prepare TTS");

        PlayReadyCallback = playReadyCallback;
        isAsync = false;
        isPrepared = true;

        Play(true);
        return;
    }

    Log.Error(LogTag, $"ttsHandle is null");
}
205
/// <summary>
/// Plays the PCM collected in <c>byteList</c> through the avatar animator's lip sync.
/// </summary>
/// <returns>true when collected audio was available; false when the list is null or empty.</returns>
internal bool PlayPreparedText()
{
    bool hasAudio = byteList != null && byteList.Count != 0;
    if (!hasAudio)
    {
        return false;
    }

    Log.Info(LogTag, "PlayPreparedText TTS");
    currentAvatar?.AvatarAnimator?.PlayLipSync(byteList.ToArray(), sampleRate);
    return true;
}
216
/// <summary>
/// Starts non-async synthesis on the TTS client.
/// </summary>
/// <param name="isPrepared">
/// Stored in the <c>isPrepared</c> field; when true the finish event raises <c>PlayReadyCallback</c>.
/// </param>
internal void Play(bool isPrepared = false)
{
    if (ttsHandle != null)
    {
        isAsync = false;
        this.isPrepared = isPrepared;
        ttsHandle.Play();
    }
    else
    {
        Log.Error(LogTag, $"ttsHandle is null");
    }
}
229
/// <summary>
/// Starts synthesis in async mode, where PCM chunks are streamed to the lip syncer as they arrive.
/// </summary>
/// <param name="playReadyCallback">Handler assigned to <c>PlayReadyCallback</c>.</param>
internal void PlayAsync(EventHandler playReadyCallback)
{
    if (ttsHandle != null)
    {
        PlayReadyCallback = playReadyCallback;
        isAsync = true;
        isPrepared = false;
        ttsHandle.Play();
    }
    else
    {
        Log.Error(LogTag, $"ttsHandle is null");
    }
}
243
/// <summary>
/// Pauses the TTS client; only logs an error when the handle is null.
/// </summary>
public void Pause()
{
    if (ttsHandle != null)
    {
        ttsHandle.Pause();
    }
    else
    {
        Log.Error(LogTag, $"ttsHandle is null");
    }
}
253
/// <summary>
/// Stops the TTS client and then the avatar animator's lip sync;
/// only logs an error when the handle is null.
/// </summary>
internal void Stop()
{
    if (ttsHandle != null)
    {
        ttsHandle.Stop();
        currentAvatar?.AvatarAnimator?.StopLipSync();
    }
    else
    {
        Log.Error(LogTag, $"ttsHandle is null");
    }
}
264
/// <summary>
/// Creates the TTS client, subscribes to its events, selects client-side playing mode,
/// prepares the client and caches its default voice. Failures are logged, not rethrown.
/// </summary>
private void InitTts()
{
    // Create the utterance list before anything that can throw: if client creation or
    // Prepare() fails, AddText and the utterance-completed callback still see a valid list
    // instead of hitting a NullReferenceException.
    textList = new List<UtteranceText>();

    try
    {
        ttsHandle = new TtsClient();

        // Register Callbacks
        ttsHandle.DefaultVoiceChanged += TtsDefaultVoiceChangedCallback;
        ttsHandle.EngineChanged += TtsEngineChangedCallback;
        ttsHandle.ErrorOccurred += TtsErrorOccuredCallback;
        ttsHandle.StateChanged += TtsStateChangedCallback;
        ttsHandle.UtteranceCompleted += TtsUtteranceCompletedCallback;
        ttsHandle.UtteranceStarted += TtsUtteranceStartedCallback;

        // Synthesized PCM is delivered to this class through the SynthesizedPcm event.
        ttsHandle.SynthesizedPcm += TtsSyntheiszedPCM;
        ttsHandle.PlayingMode = PlayingMode.ByClient;

        ttsHandle.Prepare();

        voiceInfo = new VoiceInfo
        {
            Lang = ttsHandle.DefaultVoice.Language,
            Type = ttsHandle.DefaultVoice.VoiceType
        };

        Log.Info(LogTag, voiceInfo.Lang + ", " + voiceInfo.Type.ToString());
    }
    catch (Exception e)
    {
        Log.Error(LogTag, "[ERROR] Fail to prepare Tts");
        Log.Error(LogTag, e.Message);
    }
}
300
/// <summary>
/// Tears down the TTS client and releases the buffers and the avatar reference.
/// Safe to call more than once; failures are logged, not rethrown.
/// </summary>
internal void DeinitTts()
{
    try
    {
        if (ttsHandle != null)
        {
            // Unprepare can fail on its own; a failure here must not prevent the handlers
            // from being removed and the client from being disposed.
            try
            {
                ttsHandle.Unprepare();
            }
            catch (Exception e)
            {
                Log.Error(LogTag, "[ERROR] Fail to unprepare Tts");
                Log.Error(LogTag, e.Message);
            }

            // Unregister Callbacks (every handler added in InitTts, including SynthesizedPcm)
            ttsHandle.DefaultVoiceChanged -= TtsDefaultVoiceChangedCallback;
            ttsHandle.EngineChanged -= TtsEngineChangedCallback;
            ttsHandle.ErrorOccurred -= TtsErrorOccuredCallback;
            ttsHandle.StateChanged -= TtsStateChangedCallback;
            ttsHandle.UtteranceCompleted -= TtsUtteranceCompletedCallback;
            ttsHandle.UtteranceStarted -= TtsUtteranceStartedCallback;
            ttsHandle.SynthesizedPcm -= TtsSyntheiszedPCM;

            ttsHandle.Dispose();
            ttsHandle = null;
        }
    }
    catch (Exception e)
    {
        Log.Error(LogTag, "[ERROR] Fail to unprepare Tts");
        Log.Error(LogTag, e.Message);
    }
    finally
    {
        // Release managed state even when the client teardown above failed.
        if (textList != null)
        {
            textList.Clear();
            textList = null;
        }

        if (byteList != null)
        {
            byteList.Clear();
            byteList = null;
        }

        currentAvatar = null;
    }
}
340
/// <summary>
/// Handles synthesized PCM from the TTS client.
/// In async mode the PCM is accumulated and handed to the lip syncer in chunks of at least
/// <c>desiredBufferLength</c> bytes; otherwise it is collected in <c>byteList</c> for
/// <c>PlayPreparedText</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event type, utterance id, sample rate and PCM data of this event.</param>
private void TtsSyntheiszedPCM(object sender, SynthesizedPcmEventArgs e)
{
    // e.Data is read defensively so that an event without a payload cannot throw here.
    var chunk = e.Data ?? Array.Empty<byte>();
    sampleRate = e.SampleRate;

    switch (e.EventType)
    {
        case SynthesizedPcmEvent.Start:
            Tizen.Log.Info(LogTag, "------------------Start : " + e.UtteranceId);
            Tizen.Log.Info(LogTag, "Output audio Size : " + chunk.Length);
            Tizen.Log.Info(LogTag, "SampleRate" + e.SampleRate);

            if (byteList == null)
            {
                byteList = new List<byte>();
            }
            byteList.Clear();

            if (recordedBuffer == null)
            {
                recordedBuffer = Array.Empty<byte>();
            }

            if (isAsync)
            {
                recordedBuffer = Array.Empty<byte>();

                desiredBufferLength = (int)(e.SampleRate * desiredBufferDuration * audioBufferMultiflier);
                audioTailLength = (int)(sampleRate * audioTailLengthFactor * audioBufferMultiflier);
                audioTailBuffer = new byte[audioTailLength];
                PlayReadyCallback?.Invoke(null, EventArgs.Empty);

                // lipSyncer comes from an "as" cast and may be null; UpdateBuffer guards
                // against that, so do the same here instead of throwing in the callback.
                if (lipSyncer != null)
                {
                    InitAsyncBuffer();
                    lipSyncer.SampleRate = sampleRate;
                }
                else
                {
                    Log.Error(LogTag, "avatarLipSyncer is null");
                }
            }
            break;

        case SynthesizedPcmEvent.Continue:
            if (isAsync)
            {
                // Tolerate a Continue event that arrives before any Start event.
                var pending = recordedBuffer ?? Array.Empty<byte>();

                // Append the new chunk with block copies rather than per-byte LINQ enumeration.
                var merged = new byte[pending.Length + chunk.Length];
                Buffer.BlockCopy(pending, 0, merged, 0, pending.Length);
                Buffer.BlockCopy(chunk, 0, merged, pending.Length, chunk.Length);
                recordedBuffer = merged;

                if (recordedBuffer.Length >= desiredBufferLength)
                {
                    Tizen.Log.Error(LogTag, "Current recordbuffer length :" + recordedBuffer.Length);
                    UpdateBuffer(recordedBuffer, sampleRate);

                    // The next buffer starts with the last audioTailLength bytes of this one,
                    // so consecutive chunks overlap by that tail.
                    int tailLength = Math.Min(audioTailLength, recordedBuffer.Length);
                    var tail = new byte[tailLength];
                    Buffer.BlockCopy(recordedBuffer, recordedBuffer.Length - tailLength, tail, 0, tailLength);
                    recordedBuffer = tail;
                }
            }
            else
            {
                if (byteList == null)
                {
                    byteList = new List<byte>();
                }
                byteList.AddRange(chunk);
            }
            break;

        case SynthesizedPcmEvent.Finish:
            Tizen.Log.Info(LogTag, "------------------Finish : " + e.UtteranceId);
            if (isAsync)
            {
                if (lipSyncer != null)
                {
                    lipSyncer.SetFinishAsyncLip(true);
                }
            }
            else if (isPrepared)
            {
                //Notify finished state
                Log.Info(LogTag, "Notify finished state");
                PlayReadyCallback?.Invoke(null, EventArgs.Empty);
            }
            // Non-async and not prepared: nothing is triggered here (immediate playback is disabled).
            break;

        case SynthesizedPcmEvent.Fail:
            // Surface the failure instead of dropping it silently.
            Tizen.Log.Error(LogTag, "Synthesis failed : " + e.UtteranceId);
            break;
    }
}
425
// Logs the id of the utterance that has just started.
private void TtsUtteranceStartedCallback(object sender, UtteranceEventArgs e)
    => Log.Debug(LogTag, "Utterance start now (" + e.UtteranceId + ")");
430
/// <summary>
/// Logs the completed utterance and removes its entry from the text list.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Carries the id of the completed utterance.</param>
private void TtsUtteranceCompletedCallback(object sender, UtteranceEventArgs e)
{
    Log.Debug(LogTag, "Utterance complete (" + e.UtteranceId + ")");

    // The list is null when initialization failed before it was created, or after DeinitTts.
    if (textList == null)
    {
        return;
    }

    // Remove the first entry with this id by index, instead of mutating the list inside a foreach.
    int index = textList.FindIndex(item => item.UttID == e.UtteranceId);
    if (index >= 0)
    {
        textList.RemoveAt(index);
        Log.Debug(LogTag, "TextList Count (" + textList.Count.ToString() + ")");
    }
}
445
// Logs the previous and the current state of the TTS client.
private void TtsStateChangedCallback(object sender, StateChangedEventArgs e)
{
    var message = "Current state is changed from (" + e.Previous + ") to (" + e.Current + ")";
    Log.Debug(LogTag, message);
}
450
// Logs the error message reported by the TTS client.
private void TtsErrorOccuredCallback(object sender, ErrorOccurredEventArgs e)
    => Log.Error(LogTag, "Error is occured (" + e.ErrorMessage + ")");
455
// Logs the id of the newly selected engine and the language of its voice.
private void TtsEngineChangedCallback(object sender, EngineChangedEventArgs e)
{
    var message = "Prefered engine is changed (" + e.EngineId + ") (" + e.VoiceType.Language + ")";
    Log.Debug(LogTag, message);
}
460
/// <summary>
/// Logs the language and voice type of the previous and the new default voice.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Carries the previous and the current default voice.</param>
private void TtsDefaultVoiceChangedCallback(object sender, DefaultVoiceChangedEventArgs e)
{
    // Concatenating the voice objects directly would log their type name, so the
    // language and voice type are spelled out; "?." keeps a missing voice from throwing.
    var previous = e.Previous?.Language + ", " + e.Previous?.VoiceType;
    var current = e.Current?.Language + ", " + e.Current?.VoiceType;

    Log.Debug(LogTag, "Default voice is changed from (" + previous + ") to (" + current + ")");
}
465
/// <summary>
/// Prepares the lip syncer for async streaming the first time it is needed:
/// computes the audio length passed to each enqueue and resets the async flags.
/// Does nothing when the lip syncer reports it is already initialized.
/// </summary>
internal void InitAsyncBuffer()
{
    // lipSyncer is the result of an "as" cast and may be null; UpdateBuffer logs and
    // returns in that case, so this method does the same instead of throwing.
    if (lipSyncer == null)
    {
        Log.Error(LogTag, "avatarLipSyncer is null");
        return;
    }

    if (lipSyncer.IsAsyncInit)
    {
        return;
    }

    // NOTE(review): 0.16f looks like a duration in seconds and 2f like bytes per sample — confirm.
    audioLength = (int)(sampleRate * 0.16f * 2f);

    lipSyncer.InitAsyncLipsync();
    lipSyncer.IsAsyncInit = true;

    lipSyncer.SetFinishAsyncLip(false);
    isAsyncLipStarting = false;
}
479
/// <summary>
/// Enqueues a PCM buffer on the lip syncer and starts its async play timer on the first call.
/// </summary>
/// <param name="recordBuffer">PCM bytes to enqueue.</param>
/// <param name="sampleRate">Sample rate passed along with the buffer.</param>
internal void UpdateBuffer(byte[] recordBuffer, int sampleRate)
{
    if (lipSyncer == null)
    {
        Log.Error(LogTag, "avatarLipSyncer is null");
        return;
    }

    Log.Error(LogTag, "OnTTSBufferChanged");
    lipSyncer.EnqueueAnimation(recordBuffer, sampleRate, audioLength);

    // The play timer is started only once per async session.
    if (isAsyncLipStarting)
    {
        return;
    }

    lipSyncer.StartAsyncLipPlayTimer();
    isAsyncLipStarting = true;
}
497     }
498 }