Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add APIs about max speech duration in VAD for various programming languages #1349

Merged
merged 9 commits into from
Sep 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/dot-net.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ jobs:
git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface

cd huggingface
git fetch
git pull
mkdir -p windows-for-dotnet

cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ void main(List<String> arguments) async {
model: sileroVad,
minSilenceDuration: 0.25,
minSpeechDuration: 0.5,
maxSpeechDuration: 5.0,
);

final vadConfig = sherpa_onnx.VadModelConfig(
Expand Down
3 changes: 3 additions & 0 deletions flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,9 @@ final class SherpaOnnxSileroVadModelConfig extends Struct {

@Int32()
external int windowSize;

@Float()
external double maxSpeechDuration;
}

final class SherpaOnnxVadModelConfig extends Struct {
Expand Down
7 changes: 5 additions & 2 deletions flutter/sherpa_onnx/lib/src/vad.dart
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,20 @@ class SileroVadModelConfig {
this.threshold = 0.5,
this.minSilenceDuration = 0.5,
this.minSpeechDuration = 0.25,
this.windowSize = 512});
this.windowSize = 512,
this.maxSpeechDuration = 5.0});

@override
String toString() {
return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize)';
return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize, maxSpeechDuration: $maxSpeechDuration)';
}

final String model;
final double threshold;
final double minSilenceDuration;
final double minSpeechDuration;
final int windowSize;
final double maxSpeechDuration;
}

class VadModelConfig {
Expand Down Expand Up @@ -127,6 +129,7 @@ class VoiceActivityDetector {
c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration;
c.ref.sileroVad.minSpeechDuration = config.sileroVad.minSpeechDuration;
c.ref.sileroVad.windowSize = config.sileroVad.windowSize;
c.ref.sileroVad.maxSpeechDuration = config.sileroVad.maxSpeechDuration;

c.ref.sampleRate = config.sampleRate;
c.ref.numThreads = config.numThreads;
Expand Down
1 change: 1 addition & 0 deletions go-api-examples/vad-asr-paraformer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ func main() {
config.SileroVad.MinSilenceDuration = 0.5
config.SileroVad.MinSpeechDuration = 0.25
config.SileroVad.WindowSize = 512
config.SileroVad.MaxSpeechDuration = 5.0
config.SampleRate = 16000
config.NumThreads = 1
config.Provider = "cpu"
Expand Down
1 change: 1 addition & 0 deletions go-api-examples/vad-asr-whisper/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ func main() {
config.SileroVad.MinSilenceDuration = 0.5
config.SileroVad.MinSpeechDuration = 0.25
config.SileroVad.WindowSize = 512
config.SileroVad.MaxSpeechDuration = 5.0
config.SampleRate = 16000
config.NumThreads = 1
config.Provider = "cpu"
Expand Down
1 change: 1 addition & 0 deletions java-api-examples/VadNonStreamingParaformer.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ public static Vad createVad() {
.setMinSilenceDuration(0.25f)
.setMinSpeechDuration(0.5f)
.setWindowSize(512)
.setMaxSpeechDuration(5.0f)
.build();

VadModelConfig config =
Expand Down
1 change: 1 addition & 0 deletions java-api-examples/VadNonStreamingSenseVoice.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ public static Vad createVad() {
.setMinSilenceDuration(0.25f)
.setMinSpeechDuration(0.5f)
.setWindowSize(512)
.setMaxSpeechDuration(5.0f)
.build();

VadModelConfig config =
Expand Down
1 change: 1 addition & 0 deletions java-api-examples/VadRemoveSilence.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ public static void main(String[] args) {
.setMinSilenceDuration(0.25f)
.setMinSpeechDuration(0.5f)
.setWindowSize(512)
.setMaxSpeechDuration(5.0f)
.build();

VadModelConfig config =
Expand Down
3 changes: 2 additions & 1 deletion lazarus-examples/generate_subtitles/my_init.pas
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ function CreateVad(VadFilename: AnsiString): TSherpaOnnxVoiceActivityDetector;
WindowSize := 512; {Please don't change it unless you know the details}

Config.SileroVad.Model := VadFilename;
Config.SileroVad.MinSpeechDuration := 0.5;
Config.SileroVad.MinSpeechDuration := 0.25;
Config.SileroVad.MinSilenceDuration := 0.5;
Config.SileroVad.MaxSpeechDuration := 5.0;
Config.SileroVad.Threshold := 0.5;
Config.SileroVad.WindowSize := WindowSize;
Config.NumThreads:= 2;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ function createVad() {
threshold: 0.5,
minSpeechDuration: 0.25,
minSilenceDuration: 0.5,
maxSpeechDuration: 5,
windowSize: 512,
},
sampleRate: 16000,
Expand Down
1 change: 1 addition & 0 deletions nodejs-examples/test-vad-with-non-streaming-asr-whisper.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ function createVad() {
threshold: 0.5,
minSpeechDuration: 0.25,
minSilenceDuration: 0.5,
maxSpeechDuration: 5,
windowSize: 512,
},
sampleRate: 16000,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,15 @@ def main():

config = sherpa_onnx.VadModelConfig()
config.silero_vad.model = args.silero_vad_model
config.silero_vad.threshold = 0.5
config.silero_vad.min_silence_duration = 0.25 # seconds
config.silero_vad.min_speech_duration = 0.25 # seconds

# If the current speech segment is longer than this value, the VAD
# raises the threshold to 0.9 internally. After the segment has been
# detected, it resets the threshold to its original value.
config.silero_vad.max_speech_duration = 5 # seconds

config.sample_rate = sample_rate

window_size = config.silero_vad.window_size
Expand Down
3 changes: 3 additions & 0 deletions scripts/dotnet/SileroVadModelConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ public SileroVadModelConfig()
MinSilenceDuration = 0.5F;
MinSpeechDuration = 0.25F;
WindowSize = 512;
MaxSpeechDuration = 5.0F;
}

[MarshalAs(UnmanagedType.LPStr)]
Expand All @@ -26,5 +27,7 @@ public SileroVadModelConfig()
public float MinSpeechDuration;

public int WindowSize;

public float MaxSpeechDuration;
}
}
2 changes: 2 additions & 0 deletions scripts/go/sherpa_onnx.go
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,7 @@ type SileroVadModelConfig struct {
MinSilenceDuration float32
MinSpeechDuration float32
WindowSize int
MaxSpeechDuration float32
}

type VadModelConfig struct {
Expand Down Expand Up @@ -849,6 +850,7 @@ func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float3
c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration)
c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration)
c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
c.silero_vad.max_speech_duration = C.float(config.SileroVad.MaxSpeechDuration)

c.sample_rate = C.int(config.SampleRate)
c.num_threads = C.int(config.NumThreads)
Expand Down
3 changes: 3 additions & 0 deletions scripts/node-addon-api/lib/vad.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ config = {
sileroVad: {
model: "./silero_vad.onnx",
threshold: 0.5,
minSilenceDuration: 0.5,
minSpeechDuration: 0.25,
maxSpeechDuration: 5,
}
}
*/
Expand Down
1 change: 1 addition & 0 deletions scripts/node-addon-api/src/vad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig(
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration);
SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize);
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(max_speech_duration, maxSpeechDuration);

return c;
}
Expand Down
3 changes: 3 additions & 0 deletions sherpa-onnx/c-api/c-api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,9 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
vad_config.silero_vad.window_size =
SHERPA_ONNX_OR(config->silero_vad.window_size, 512);

vad_config.silero_vad.max_speech_duration =
SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20);

vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");
Expand Down
5 changes: 5 additions & 0 deletions sherpa-onnx/c-api/c-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
float min_speech_duration;

int window_size;

// If a speech segment is longer than this value, then we increase
// the threshold to 0.9. After the segment has been detected,
// the threshold is reset to its original value.
float max_speech_duration;
} SherpaOnnxSileroVadModelConfig;

SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@ public class SileroVadModelConfig {
private final float minSilenceDuration;
private final float minSpeechDuration;
private final int windowSize;
private final float maxSpeechDuration;

private SileroVadModelConfig(Builder builder) {
this.model = builder.model;
this.threshold = builder.threshold;
this.minSilenceDuration = builder.minSilenceDuration;
this.minSpeechDuration = builder.minSpeechDuration;
this.windowSize = builder.windowSize;
this.maxSpeechDuration = builder.maxSpeechDuration;
}

public static Builder builder() {
Expand All @@ -41,12 +43,17 @@ public int getWindowSize() {
return windowSize;
}

public float getMaxSpeechDuration() {
return maxSpeechDuration;
}

public static class Builder {
private String model = "";
private float threshold = 0.5f;
private float minSilenceDuration = 0.25f;
private float minSpeechDuration = 0.5f;
private int windowSize = 512;
private float maxSpeechDuration = 5.0f;

public SileroVadModelConfig build() {
return new SileroVadModelConfig(this);
Expand Down Expand Up @@ -77,5 +84,10 @@ public Builder setWindowSize(int windowSize) {
this.windowSize = windowSize;
return this;
}

public Builder setMaxSpeechDuration(float maxSpeechDuration) {
this.maxSpeechDuration = maxSpeechDuration;
return this;
}
}
}
4 changes: 4 additions & 0 deletions sherpa-onnx/jni/voice-activity-detector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) {
fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I");
ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid);

fid = env->GetFieldID(silero_vad_config_cls, "maxSpeechDuration", "F");
ans.silero_vad.max_speech_duration =
env->GetFloatField(silero_vad_config, fid);

fid = env->GetFieldID(cls, "sampleRate", "I");
ans.sample_rate = env->GetIntField(config, fid);

Expand Down
1 change: 1 addition & 0 deletions sherpa-onnx/kotlin-api/Vad.kt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ data class SileroVadModelConfig(
var minSilenceDuration: Float = 0.25F,
var minSpeechDuration: Float = 0.25F,
var windowSize: Int = 512,
var maxSpeechDuration: Float = 5.0F,
)

data class VadModelConfig(
Expand Down
9 changes: 7 additions & 2 deletions sherpa-onnx/pascal-api/sherpa_onnx.pas
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@ TSherpaOnnxSileroVadModelConfig = record
MinSilenceDuration: Single;
MinSpeechDuration: Single;
WindowSize: Integer;
MaxSpeechDuration: Single;
function ToString: AnsiString;
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
end;
Expand Down Expand Up @@ -594,6 +595,7 @@ SherpaOnnxSileroVadModelConfig = record
MinSilenceDuration: cfloat;
MinSpeechDuration: cfloat;
WindowSize: cint32;
MaxSpeechDuration: cfloat;
end;
SherpaOnnxVadModelConfig = record
SileroVad: SherpaOnnxSileroVadModelConfig;
Expand Down Expand Up @@ -1402,10 +1404,11 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
'Threshold := %.2f, ' +
'MinSilenceDuration := %.2f, ' +
'MinSpeechDuration := %.2f, ' +
'WindowSize := %d' +
'WindowSize := %d, ' +
'MaxSpeechDuration := %.2f' +
')',
[Self.Model, Self.Threshold, Self.MinSilenceDuration,
Self.MinSpeechDuration, Self.WindowSize
Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration
]);
end;

Expand All @@ -1415,6 +1418,7 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
Dest.MinSilenceDuration := 0.5;
Dest.MinSpeechDuration := 0.25;
Dest.WindowSize := 512;
Dest.MaxSpeechDuration := 5.0;
end;

function TSherpaOnnxVadModelConfig.ToString: AnsiString;
Expand Down Expand Up @@ -1569,6 +1573,7 @@ constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelC
C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration;

C.SampleRate := Config.SampleRate;
C.NumThreads := Config.NumThreads;
Expand Down
6 changes: 4 additions & 2 deletions swift-api-examples/SherpaOnnx.swift
Original file line number Diff line number Diff line change
Expand Up @@ -550,14 +550,16 @@ func sherpaOnnxSileroVadModelConfig(
threshold: Float = 0.5,
minSilenceDuration: Float = 0.25,
minSpeechDuration: Float = 0.5,
windowSize: Int = 512
windowSize: Int = 512,
maxSpeechDuration: Float = 5.0
) -> SherpaOnnxSileroVadModelConfig {
return SherpaOnnxSileroVadModelConfig(
model: toCPointer(model),
threshold: threshold,
min_silence_duration: minSilenceDuration,
min_speech_duration: minSpeechDuration,
window_size: Int32(windowSize)
window_size: Int32(windowSize),
max_speech_duration: maxSpeechDuration
)
}

Expand Down
Loading
Loading