RecognizedAudio.cs

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.IO;
using System.Speech.AudioFormat;
using System.Speech.Internal;

namespace System.Speech.Recognition
{
    [Serializable]
    public class RecognizedAudio
    {
        internal RecognizedAudio(byte[] rawAudioData, SpeechAudioFormatInfo audioFormat, DateTime startTime, TimeSpan audioPosition, TimeSpan audioDuration)
        {
            _audioFormat = audioFormat;
            _startTime = startTime;
            _audioPosition = audioPosition;
            _audioDuration = audioDuration;
            _rawAudioData = rawAudioData;
        }
        public SpeechAudioFormatInfo Format
        {
            get { return _audioFormat; }
        }

        // Chronological "wall-clock" time the user started speaking the result at. This is useful for latency calculations etc.
        public DateTime StartTime
        {
            get { return _startTime; }
        }

        // Position in the audio stream this audio starts at.
        // Note: the stream starts at zero when the engine first starts processing audio.
        public TimeSpan AudioPosition
        {
            get { return _audioPosition; }
        }

        // Length of this audio fragment
        public TimeSpan Duration
        {
            get { return _audioDuration; }
        }

        // Different ways to store the audio, either as a binary data stream or as a wave file.
        public void WriteToWaveStream(Stream outputStream)
        {
            ArgumentNullException.ThrowIfNull(outputStream);

            using (StreamMarshaler sm = new(outputStream))
            {
                WriteWaveHeader(sm);
            }

            // now write the raw data
            outputStream.Write(_rawAudioData, 0, _rawAudioData.Length);

            outputStream.Flush();
        }

        // Different ways to store the audio, either as a binary data stream or as a wave file.
        public void WriteToAudioStream(Stream outputStream)
        {
            ArgumentNullException.ThrowIfNull(outputStream);

            // now write the raw data
            outputStream.Write(_rawAudioData, 0, _rawAudioData.Length);

            outputStream.Flush();
        }

        // Get another audio object from this one representing a range of audio.
        public RecognizedAudio GetRange(TimeSpan audioPosition, TimeSpan duration)
        {
            if (audioPosition.Ticks < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(audioPosition), SR.Get(SRID.NegativeTimesNotSupported));
            }
            if (duration.Ticks < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(duration), SR.Get(SRID.NegativeTimesNotSupported));
            }
            if (audioPosition > _audioDuration)
            {
                throw new ArgumentOutOfRangeException(nameof(audioPosition));
            }

            if (duration > audioPosition + _audioDuration)
            {
                throw new ArgumentOutOfRangeException(nameof(duration));
            }

            // Get the position and length in bytes offset and bytes length.
            int startPosition = (int)((_audioFormat.BitsPerSample * _audioFormat.SamplesPerSecond * audioPosition.Ticks) / (TimeSpan.TicksPerSecond * 8));
            int length = (int)((_audioFormat.BitsPerSample * _audioFormat.SamplesPerSecond * duration.Ticks) / (TimeSpan.TicksPerSecond * 8));
            if (startPosition + length > _rawAudioData.Length)
            {
                length = _rawAudioData.Length - startPosition;
            }

            // Extract the data from the original stream
            byte[] audioBytes = new byte[length];
            Array.Copy(_rawAudioData, startPosition, audioBytes, 0, length);
            return new RecognizedAudio(audioBytes, _audioFormat, _startTime + audioPosition, audioPosition, duration);
        }

        #region Private Methods

        private void WriteWaveHeader(StreamMarshaler sm)
        {
            char[] riff = new char[4] { 'R', 'I', 'F', 'F' };
            byte[] formatSpecificData = _audioFormat.FormatSpecificData();
            sm.WriteArray<char>(riff, riff.Length);

            sm.WriteStream((uint)(_rawAudioData.Length + 38 + formatSpecificData.Length)); // Must be four bytes

            char[] wave = new char[4] { 'W', 'A', 'V', 'E' };
            sm.WriteArray(wave, wave.Length);

            char[] fmt = new char[4] { 'f', 'm', 't', ' ' };
            sm.WriteArray(fmt, fmt.Length);

            sm.WriteStream(18 + formatSpecificData.Length);

            sm.WriteStream((ushort)_audioFormat.EncodingFormat);
            sm.WriteStream((ushort)_audioFormat.ChannelCount);
            sm.WriteStream(_audioFormat.SamplesPerSecond);
            sm.WriteStream(_audioFormat.AverageBytesPerSecond);
            sm.WriteStream((ushort)_audioFormat.BlockAlign);
            sm.WriteStream((ushort)_audioFormat.BitsPerSample);
            sm.WriteStream((ushort)formatSpecificData.Length);

            // write codec specific data
            if (formatSpecificData.Length > 0)
            {
                sm.WriteStream(formatSpecificData);
            }

            char[] data = new char[4] { 'd', 'a', 't', 'a' };
            sm.WriteArray(data, data.Length);
            sm.WriteStream(_rawAudioData.Length);
        }

        #endregion

        #region Private Fields

        private DateTime _startTime;
        private TimeSpan _audioPosition;
        private TimeSpan _audioDuration;
        private SpeechAudioFormatInfo _audioFormat;
        private byte[] _rawAudioData;

        #endregion
    }
}
File: Result\RecognizedAudio.cs	Web Access
Project: src\src\runtime\src\libraries\System.Speech\src\System.Speech.csproj (System.Speech)