From d4ecbdf6455460984ec4df3512169403722d60e6 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 27 Oct 2024 12:24:57 +0800 Subject: [PATCH 1/2] Add C# API for Moonshine models --- .github/scripts/test-dot-net.sh | 3 ++ .../offline-decode-files/Program.cs | 29 +++++++++++++++---- .../offline-decode-files/run-moonshine.sh | 18 ++++++++++++ scripts/dotnet/OfflineModelConfig.cs | 2 ++ scripts/dotnet/OfflineMoonshineModelConfig.cs | 29 +++++++++++++++++++ 5 files changed, 76 insertions(+), 5 deletions(-) create mode 100755 dotnet-examples/offline-decode-files/run-moonshine.sh create mode 100644 scripts/dotnet/OfflineMoonshineModelConfig.cs diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index eec3b6bb4..f4bfc66c5 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -9,6 +9,9 @@ rm -fv *.wav rm -rfv sherpa-onnx-pyannote-* cd ../offline-decode-files +./run-moonshine.sh +rm -rf sherpa-onnx-* + ./run-sense-voice-ctc.sh rm -rf sherpa-onnx-* diff --git a/dotnet-examples/offline-decode-files/Program.cs b/dotnet-examples/offline-decode-files/Program.cs index d971becd3..4841305be 100644 --- a/dotnet-examples/offline-decode-files/Program.cs +++ b/dotnet-examples/offline-decode-files/Program.cs @@ -17,7 +17,7 @@ class Options { [Option("sample-rate", Required = false, Default = 16000, HelpText = "Sample rate of the data used to train the model")] - public int SampleRate { get; set; } = 16000; + public int SampleRate { get; set; } = 16000; [Option("feat-dim", Required = false, Default = 80, HelpText = "Dimension of the features used to train the model")] public int FeatureDim { get; set; } = 80; @@ -31,7 +31,7 @@ class Options [Option(Required = false, Default = "", HelpText = "Path to transducer decoder.onnx. Used only for transducer models")] public string Decoder { get; set; } = ""; - [Option(Required = false, Default = "",HelpText = "Path to transducer joiner.onnx. Used only for transducer models")] + [Option(Required = false, Default = "", HelpText = "Path to transducer joiner.onnx. Used only for transducer models")] public string Joiner { get; set; } = ""; [Option("model-type", Required = false, Default = "", HelpText = "model type")] @@ -44,10 +44,22 @@ class Options public string WhisperDecoder { get; set; } = ""; [Option("whisper-language", Required = false, Default = "", HelpText = "Language of the input file. Can be empty")] - public string WhisperLanguage{ get; set; } = ""; + public string WhisperLanguage { get; set; } = ""; [Option("whisper-task", Required = false, Default = "transcribe", HelpText = "transcribe or translate")] - public string WhisperTask{ get; set; } = "transcribe"; + public string WhisperTask { get; set; } = "transcribe"; + + [Option("moonshine-preprocessor", Required = false, Default = "", HelpText = "Path to preprocess.onnx. Used only for Moonshine models")] + public string MoonshinePreprocessor { get; set; } = ""; + + [Option("moonshine-encoder", Required = false, Default = "", HelpText = "Path to encode.onnx. Used only for Moonshine models")] + public string MoonshineEncoder { get; set; } = ""; + + [Option("moonshine-uncached-decoder", Required = false, Default = "", HelpText = "Path to uncached_decode.onnx. Used only for Moonshine models")] + public string MoonshineUncachedDecoder { get; set; } = ""; + + [Option("moonshine-cached-decoder", Required = false, Default = "", HelpText = "Path to cached_decode.onnx. Used only for Moonshine models")] + public string MoonshineCachedDecoder { get; set; } = ""; [Option("tdnn-model", Required = false, Default = "", HelpText = "Path to tdnn yesno model")] public string TdnnModel { get; set; } = ""; @@ -90,7 +102,7 @@ class Options public float HotwordsScore { get; set; } = 1.5F; [Option("files", Required = true, HelpText = "Audio files for decoding")] - public IEnumerable Files { get; set; } = new string[] {}; + public IEnumerable Files { get; set; } = new string[] { }; } static void Main(string[] args) @@ -236,6 +248,13 @@ private static void Run(Options options) config.ModelConfig.SenseVoice.Model = options.SenseVoiceModel; config.ModelConfig.SenseVoice.UseInverseTextNormalization = options.SenseVoiceUseItn; } + else if (!String.IsNullOrEmpty(options.MoonshinePreprocessor)) + { + config.ModelConfig.Moonshine.Preprocessor = options.MoonshinePreprocessor; + config.ModelConfig.Moonshine.Encoder = options.MoonshineEncoder; + config.ModelConfig.Moonshine.UncachedDecoder = options.MoonshineUncachedDecoder; + config.ModelConfig.Moonshine.CachedDecoder = options.MoonshineCachedDecoder; + } else { Console.WriteLine("Please provide a model"); diff --git a/dotnet-examples/offline-decode-files/run-moonshine.sh b/dotnet-examples/offline-decode-files/run-moonshine.sh new file mode 100755 index 000000000..025e0902d --- /dev/null +++ b/dotnet-examples/offline-decode-files/run-moonshine.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 +fi + +dotnet run \ + --num-threads=2 \ + --moonshine-preprocessor=./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx \ + --moonshine-encoder=./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx \ + --moonshine-uncached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx \ + --moonshine-cached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx \ + --tokens=./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt \ + --files ./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav diff --git a/scripts/dotnet/OfflineModelConfig.cs b/scripts/dotnet/OfflineModelConfig.cs index b24aeaf89..b7433c277 100644 --- a/scripts/dotnet/OfflineModelConfig.cs +++ b/scripts/dotnet/OfflineModelConfig.cs @@ -24,6 +24,7 @@ public OfflineModelConfig() BpeVocab = ""; TeleSpeechCtc = ""; SenseVoice = new OfflineSenseVoiceModelConfig(); + Moonshine = new OfflineMoonshineModelConfig(); } public OfflineTransducerModelConfig Transducer; public OfflineParaformerModelConfig Paraformer; @@ -54,5 +55,6 @@ public OfflineModelConfig() public string TeleSpeechCtc; public OfflineSenseVoiceModelConfig SenseVoice; + public OfflineMoonshineModelConfig Moonshine; } } diff --git a/scripts/dotnet/OfflineMoonshineModelConfig.cs b/scripts/dotnet/OfflineMoonshineModelConfig.cs new file mode 100644 index 000000000..53c205173 --- /dev/null +++ b/scripts/dotnet/OfflineMoonshineModelConfig.cs @@ -0,0 +1,29 @@ +/// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + [StructLayout(LayoutKind.Sequential)] + public struct OfflineMoonshineModelConfig + { + public OfflineMoonshineModelConfig() + { + Preprocessor = ""; + Encoder = ""; + UncachedDecoder = ""; + CachedDecoder = ""; + } + [MarshalAs(UnmanagedType.LPStr)] + public string Preprocessor; + + [MarshalAs(UnmanagedType.LPStr)] + public string Encoder; + + [MarshalAs(UnmanagedType.LPStr)] + public string UncachedDecoder; + + [MarshalAs(UnmanagedType.LPStr)] + public string CachedDecoder; + } +} From 70b77fbad68002eb34415c42feb4f06d48468b78 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 27 Oct 2024 13:06:43 +0800 Subject: [PATCH 2/2] Return timestamps for non-streaming ASR. --- .../offline-decode-files/Program.cs | 15 ++++- scripts/dotnet/OfflineRecognizerResult.cs | 55 ++++++++++++++++++- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/dotnet-examples/offline-decode-files/Program.cs b/dotnet-examples/offline-decode-files/Program.cs index 4841305be..d855da6f8 100644 --- a/dotnet-examples/offline-decode-files/Program.cs +++ b/dotnet-examples/offline-decode-files/Program.cs @@ -292,10 +292,21 @@ private static void Run(Options options) // display results for (int i = 0; i != files.Length; ++i) { - var text = streams[i].Result.Text; + var r = streams[i].Result; Console.WriteLine("--------------------"); Console.WriteLine(files[i]); - Console.WriteLine(text); + Console.WriteLine("Text: {0}", r.Text); + Console.WriteLine("Tokens: [{0}]", string.Join(", ", r.Tokens)); + if (r.Timestamps != null && r.Timestamps.Length > 0) { + Console.Write("Timestamps: ["); + var sep = ""; + for (int k = 0; k != r.Timestamps.Length; ++k) + { + Console.Write("{0}{1}", sep, r.Timestamps[k].ToString("0.00")); + sep = ", "; + } + Console.WriteLine("]"); + } } Console.WriteLine("--------------------"); } diff --git a/scripts/dotnet/OfflineRecognizerResult.cs b/scripts/dotnet/OfflineRecognizerResult.cs index 8c10b6f88..ecf682b19 100644 --- a/scripts/dotnet/OfflineRecognizerResult.cs +++ b/scripts/dotnet/OfflineRecognizerResult.cs @@ -31,17 +31,70 @@ public OfflineRecognizerResult(IntPtr handle) byte[] stringBuffer = new byte[length]; Marshal.Copy(impl.Text, stringBuffer, 0, length); _text = Encoding.UTF8.GetString(stringBuffer); + + _tokens = new String[impl.Count]; + + unsafe + { + byte* buf = (byte*)impl.Tokens; + for (int i = 0; i < impl.Count; i++) + { + length = 0; + byte* start = buf; + while (*buf != 0) + { + ++buf; + length += 1; + } + ++buf; + + stringBuffer = new byte[length]; + fixed (byte* pTarget = stringBuffer) + { + for (int k = 0; k < length; k++) + { + pTarget[k] = start[k]; + } + } + + _tokens[i] = Encoding.UTF8.GetString(stringBuffer); + } + } + + unsafe + { + if (impl.Timestamps != IntPtr.Zero) + { + float *t = (float*)impl.Timestamps; + _timestamps = new float[impl.Count]; + fixed (float* f = _timestamps) + { + for (int k = 0; k < impl.Count; k++) + { + f[k] = t[k]; + } + } + } + } + } [StructLayout(LayoutKind.Sequential)] struct Impl { public IntPtr Text; + public IntPtr Timestamps; + public int Count; + public IntPtr Tokens; } private String _text; public String Text => _text; - } + private String[] _tokens; + public String[] Tokens => _tokens; + private float[] _timestamps; + public float[] Timestamps => _timestamps; + } }