File: Internal\Synthesis\EngineSite.cs
Web Access
Project: src\src\runtime\src\libraries\System.Speech\src\System.Speech.csproj (System.Speech)
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Runtime.InteropServices;
using System.Speech.Synthesis;
using System.Speech.Synthesis.TtsEngine;
using System.Text;

// Exceptions cannot get through the COM code.
// The engine site saves the last exception before sending it back to the client.
#pragma warning disable 6500

namespace System.Speech.Internal.Synthesis
{
    internal class EngineSite : ITtsEngineSite, ITtsEventSink
    {
        #region Constructors

        internal EngineSite(ResourceLoader resourceLoader)
        {
            _resourceLoader = resourceLoader;
        }

        #endregion

        #region Internal Methods
        internal TtsEventMapper? EventMapper
        {
            get
            {
                return _eventMapper;
            }
            set
            {
                _eventMapper = value;
            }
        }

        #region ISpTTSEngineStite implementation
        /// <summary>
        /// Adds events directly to an event sink.
        /// </summary>
        public void AddEvents([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] SpeechEventInfo[] events, int ulCount)
        {
            try
            {
                foreach (SpeechEventInfo sapiEvent in events)
                {
                    int evtMask = 1 << sapiEvent.EventId;

                    if (sapiEvent.EventId == (short)TtsEventId.EndInputStream && _eventMapper != null)
                    {
                        _eventMapper.FlushEvent();
                    }

                    if ((evtMask & _eventInterest) != 0)
                    {
                        TTSEvent ttsEvent = CreateTtsEvent(sapiEvent);
                        if (_eventMapper == null)
                        {
                            AddEvent(ttsEvent);
                        }
                        else
                        {
                            _eventMapper.AddEvent(ttsEvent);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                _exception = e;
                _actions |= SPVESACTIONS.SPVES_ABORT;
            }
        }

        /// <summary>
        /// Queries the voice object to determine which real-time action(s) to perform.
        /// </summary>
        public int Write(IntPtr pBuff, int cb)
        {
            try
            {
                _audio!.Play(pBuff, cb);
            }
            catch (Exception e)
            {
                _exception = e;
                _actions |= SPVESACTIONS.SPVES_ABORT;
            }
            return cb;
        }

        /// <summary>
        /// Retrieves the number and type of items to be skipped in the text stream.
        /// </summary>
        public SkipInfo GetSkipInfo()
        {
            return new SkipInfo(1 /*BSPVSKIPTYPE.SPVST_SENTENCE */, 1);
        }

        /// <summary>
        /// Notifies that the last skip request has been completed and to pass it the results.
        /// </summary>
        public void CompleteSkip(int ulNumSkipped)
        {
            return;
        }

        /// <summary>
        /// Passes back the event interest for the voice.
        /// </summary>
        public int EventInterest
        {
            get
            {
                return _eventInterest;
            }
        }

        /// <summary>
        ///  Queries the voice object to determine which real-time action(s) to perform
        /// </summary>
        public int Actions
        {
            get
            {
                return (int)_actions;
            }
        }

        /// <summary>
        ///  Retrieves the current TTS rendering rate adjustment that should be used by the engine.
        /// </summary>
        public int Rate
        {
            get
            {
                _actions &= ~SPVESACTIONS.SPVES_RATE;
                return _defaultRate;
            }
        }

        /// <summary>
        /// Retrieves the base output volume level the engine should use during synthesis.
        /// </summary>
        public int Volume
        {
            get
            {
                _actions &= ~SPVESACTIONS.SPVES_VOLUME;
                return _volume;
            }
        }

        /// <summary>
        /// Load a file either from a local network or from the Internet.
        /// </summary>
        public Stream? LoadResource(Uri uri, string? mediaType)
        {
            try
            {
                string? localPath;
                string? mediaTypeUnused; // TODO: Should this be passed out of this function?
                Uri? baseUriUnused;
                using (Stream stream = _resourceLoader.LoadFile(uri, out mediaTypeUnused, out baseUriUnused, out localPath))
                {
                    // Read the file in memory for SES and release the original file immediately
                    // This scheme is really bad if the files being read are big but I would assume
                    // That it should not be the case.
                    int cLen = (int)stream.Length;
                    MemoryStream memStream = new(cLen);
                    byte[] ab = new byte[cLen];

                    stream.ReadExactly(ab);

                    _resourceLoader.UnloadFile(localPath);
                    memStream.Write(ab, 0, cLen);
                    memStream.Position = 0;

                    return memStream;
                }
            }
            catch (Exception e)
            {
                _exception = e;
                _actions |= SPVESACTIONS.SPVES_ABORT;
            }
            return null;
        }

        #endregion

        public void AddEvent(TTSEvent evt)
        {
            _audio!.InjectEvent(evt);
        }

        public void FlushEvent()
        {
        }

        internal void SetEventsInterest(int eventInterest)
        {
            _eventInterest = eventInterest;
            _eventMapper?.FlushEvent();
        }

        #endregion

        #region Internal Properties

        /// <summary>
        ///  Retrieves the current TTS rendering rate adjustment that should be used by the engine.
        /// </summary>
        internal int VoiceRate
        {
            get
            {
                return _defaultRate;
            }
            set
            {
                _defaultRate = value;
                _actions |= SPVESACTIONS.SPVES_RATE;
            }
        }

        /// <summary>
        /// Retrieves the base output volume level the engine should use during synthesis.
        /// </summary>
        internal int VoiceVolume
        {
            get
            {
                return _volume;
            }
            set
            {
                _volume = value;
                _actions |= SPVESACTIONS.SPVES_VOLUME;
            }
        }

        /// <summary>
        /// Set and reset the last exception
        /// </summary>
        internal Exception? LastException
        {
            get
            {
                return _exception;
            }
            set
            {
                _exception = value;
            }
        }

        internal void Abort()
        {
            _actions = SPVESACTIONS.SPVES_ABORT;
        }

        internal void InitRun(AudioBase audioDevice, int defaultRate, Prompt prompt)
        {
            _audio = audioDevice;
            _prompt = prompt;
            _defaultRate = defaultRate;
            _actions = SPVESACTIONS.SPVES_RATE | SPVESACTIONS.SPVES_VOLUME;
        }

        #endregion

        #region Private Members

        private TTSEvent CreateTtsEvent(SpeechEventInfo sapiEvent)
        {
            TTSEvent ttsEvent;
            switch ((TtsEventId)sapiEvent.EventId)
            {
                case TtsEventId.Phoneme:
                    ttsEvent = TTSEvent.CreatePhonemeEvent("" + (char)((uint)sapiEvent.Param2 & 0xFFFF), // current phoneme
                                                           "" + (char)(sapiEvent.Param1 & 0xFFFF), // next phoneme
                                                           TimeSpan.FromMilliseconds(sapiEvent.Param1 >> 16),
                                                           (SynthesizerEmphasis)((uint)sapiEvent.Param2 >> 16),
                                                           _prompt!, _audio!.Duration);
                    break;
                case TtsEventId.Bookmark:
                    // BookmarkDetected
                    string? bookmark = Marshal.PtrToStringUni(sapiEvent.Param2);
                    ttsEvent = new TTSEvent((TtsEventId)sapiEvent.EventId, _prompt!, null, null, _audio!.Duration, _audio.Position, bookmark, (uint)sapiEvent.Param1, sapiEvent.Param2);
                    break;
                default:
                    ttsEvent = new TTSEvent((TtsEventId)sapiEvent.EventId, _prompt!, null, null, _audio!.Duration, _audio.Position, null, (uint)sapiEvent.Param1, sapiEvent.Param2);
                    break;
            }
            return ttsEvent;
        }

        #endregion

        #region private Fields

        private int _eventInterest;

        private SPVESACTIONS _actions = SPVESACTIONS.SPVES_RATE | SPVESACTIONS.SPVES_VOLUME;

        private AudioBase? _audio;

        private Prompt? _prompt;

        // Last Exception
        private Exception? _exception;

        // Rate setup in the control panel
        private int _defaultRate;

        // Rate setup in the control panel
        private int _volume = 100;

        // Get a resource load
        private ResourceLoader _resourceLoader;

        // Map the TTS events to the right format
        private TtsEventMapper? _eventMapper;

        #endregion
    }

    internal interface ITtsEventSink
    {
        void AddEvent(TTSEvent evt);
        void FlushEvent();
    }

    internal abstract class TtsEventMapper : ITtsEventSink
    {
        internal TtsEventMapper(ITtsEventSink? sink)
        {
            _sink = sink;
        }

        protected virtual void SendToOutput(TTSEvent evt)
        {
            _sink?.AddEvent(evt);
        }

        public virtual void AddEvent(TTSEvent evt)
        {
            SendToOutput(evt);
        }

        public virtual void FlushEvent()
        {
            _sink?.FlushEvent();
        }

        private ITtsEventSink? _sink;
    }

    internal class PhonemeEventMapper : TtsEventMapper
    {
        public enum PhonemeConversion
        {
            IpaToSapi,
            SapiToIpa,
            NoConversion
        }

        internal PhonemeEventMapper(ITtsEventSink sink, PhonemeConversion conversion, AlphabetConverter alphabetConverter) : base(sink)
        {
            _queue = new Queue();
            _phonemeQueue = new Queue();
            _conversion = conversion;
            _alphabetConverter = alphabetConverter;
            Reset();
        }

        public override void AddEvent(TTSEvent evt)
        {
            if (_conversion == PhonemeConversion.NoConversion)
            {
                SendToOutput(evt);
            }
            else if (evt.Id == TtsEventId.Phoneme)
            {
                _phonemeQueue.Enqueue(evt);

                int prefixSeek = _phonemes.Length + 1;
                _phonemes.Append(evt.Phoneme);
                do
                {
                    string prefix = _phonemes.ToString(0, prefixSeek);
                    if (_alphabetConverter.IsPrefix(prefix, _conversion == PhonemeConversion.SapiToIpa))
                    {
                        if (_alphabetConverter.IsConvertibleUnit(prefix, _conversion == PhonemeConversion.SapiToIpa))
                        {
                            _lastComplete = prefixSeek;
                        }
                        prefixSeek++;
                    }
                    else
                    {
                        if (_lastComplete == 0)
                        {
                            Trace.TraceError("Cannot convert the phonemes correctly. Attempt to start over...");
                            Reset();
                            break;
                        }
                        ConvertCompleteUnit();
                        _lastComplete = 0;
                        prefixSeek = 1;
                    }
                } while (prefixSeek <= _phonemes.Length);
            }
            else
            {
                SendToQueue(evt);
            }
        }

        public override void FlushEvent()
        {
            ConvertCompleteUnit();
            while (_queue.Count > 0)
            {
                SendToOutput((TTSEvent)_queue.Dequeue()!);
            }
            _phonemeQueue.Clear();
            _lastComplete = 0;

            base.FlushEvent();
        }

        private void ConvertCompleteUnit()
        {
            if (_lastComplete == 0)
            {
                return;
            }
            if (_phonemeQueue.Count == 0)
            {
                Trace.TraceError("Failed to convert phonemes. Phoneme queue is empty.");
                return;
            }

            char[] source = new char[_lastComplete];
            _phonemes.CopyTo(0, source, 0, _lastComplete);
            _phonemes.Remove(0, _lastComplete);
            char[]? target;
            if (_conversion == PhonemeConversion.IpaToSapi)
            {
                target = _alphabetConverter.IpaToSapi(source);
            }
            else
            {
                target = _alphabetConverter.SapiToIpa(source);
            }

            //
            // Convert the audio duration
            // Update the next phoneme id
            // Retain any other information based on the first TTS phoneme event.
            //
            TTSEvent ttsEvent, targetEvent, basePhonemeEvent;
            long totalDurationTicks = 0;
            basePhonemeEvent = (TTSEvent)_phonemeQueue.Peek()!;
            for (int i = 0; i < _lastComplete;)
            {
                ttsEvent = (TTSEvent)_phonemeQueue.Dequeue()!;
                totalDurationTicks += ttsEvent.PhonemeDuration.Ticks;
                i += ttsEvent.Phoneme!.Length;
            }

            targetEvent = TTSEvent.CreatePhonemeEvent(new string(target), "",
                                                      TimeSpan.FromTicks(totalDurationTicks),
                                                      basePhonemeEvent.PhonemeEmphasis,
                                                      basePhonemeEvent.Prompt,
                                                      basePhonemeEvent.AudioPosition);
            SendToQueue(targetEvent);
        }

        [MemberNotNull(nameof(_phonemes))]
        private void Reset()
        {
            _phonemeQueue.Clear();
            _phonemes = new StringBuilder();
            _lastComplete = 0;
        }

        private void SendToQueue(TTSEvent evt)
        {
            if (evt.Id == TtsEventId.Phoneme)
            {
                TTSEvent firstEvent;
                if (_queue.Count > 0)
                {
                    firstEvent = (TTSEvent)_queue.Dequeue()!;
                    if (firstEvent.Id == TtsEventId.Phoneme)
                    {
                        firstEvent.NextPhoneme = evt.Phoneme;
                    }
                    else
                    {
                        Trace.TraceError("First event in the queue of the phone mapper is not a PHONEME event");
                    }
                    SendToOutput(firstEvent);
                    while (_queue.Count > 0)
                    {
                        SendToOutput((TTSEvent)_queue.Dequeue()!);
                    }
                }
                _queue.Enqueue(evt);
            }
            else
            {
                if (_queue.Count > 0)
                {
                    _queue.Enqueue(evt);
                }
                else
                {
                    SendToOutput(evt);
                }
            }
        }

        private PhonemeConversion _conversion;
        private StringBuilder _phonemes;
        private Queue _queue;
        private Queue _phonemeQueue;
        private AlphabetConverter _alphabetConverter;
        private int _lastComplete;
    }
}