From 1851ff63373ed1d3ef614b431a153bcc6528e4e2 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 11 Oct 2024 16:51:40 +0800 Subject: [PATCH] Java API for speaker diarization (#1416) --- .github/workflows/run-java-test.yaml | 7 ++ .../OfflineSpeakerDiarizationDemo.java | 99 +++++++++++++++++++ java-api-examples/README.md | 6 ++ .../run-offline-speaker-diarization.sh | 45 +++++++++ sherpa-onnx/java-api/Makefile | 9 ++ .../sherpa/onnx/FastClusteringConfig.java | 44 +++++++++ .../onnx/OfflineSpeakerDiarization.java | 61 ++++++++++++ .../OfflineSpeakerDiarizationCallback.java | 8 ++ .../onnx/OfflineSpeakerDiarizationConfig.java | 79 +++++++++++++++ .../OfflineSpeakerDiarizationSegment.java | 27 +++++ ...OfflineSpeakerSegmentationModelConfig.java | 52 ++++++++++ ...peakerSegmentationPyannoteModelConfig.java | 32 ++++++ .../k2fsa/sherpa/onnx/OfflineTtsCallback.java | 2 + .../onnx/SpeakerEmbeddingExtractorConfig.java | 1 - 14 files changed, 471 insertions(+), 1 deletion(-) create mode 100644 java-api-examples/OfflineSpeakerDiarizationDemo.java create mode 100755 java-api-examples/run-offline-speaker-diarization.sh create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/FastClusteringConfig.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarization.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationCallback.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationConfig.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationSegment.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationModelConfig.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationPyannoteModelConfig.java diff --git a/.github/workflows/run-java-test.yaml b/.github/workflows/run-java-test.yaml index 3e932707c..5759ea5d8 100644 --- 
a/.github/workflows/run-java-test.yaml +++ b/.github/workflows/run-java-test.yaml @@ -107,6 +107,13 @@ jobs: make -j4 ls -lh lib + - name: Run java test (speaker diarization) + shell: bash + run: | + cd ./java-api-examples + ./run-offline-speaker-diarization.sh + rm -rfv *.onnx *.wav sherpa-onnx-pyannote-* + - name: Run java test (kws) shell: bash run: | diff --git a/java-api-examples/OfflineSpeakerDiarizationDemo.java b/java-api-examples/OfflineSpeakerDiarizationDemo.java new file mode 100644 index 000000000..a5ef8d1f4 --- /dev/null +++ b/java-api-examples/OfflineSpeakerDiarizationDemo.java @@ -0,0 +1,99 @@ +// Copyright 2024 Xiaomi Corporation + +// This file shows how to use sherpa-onnx Java API for speaker diarization. +import com.k2fsa.sherpa.onnx.*; + +public class OfflineSpeakerDiarizationDemo { + public static void main(String[] args) { + /* Please use the following commands to download files used in this file + Step 1: Download a speaker segmentation model + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models + for a list of available models. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + + Step 2: Download a speaker embedding extractor model + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models + for a list of available models. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + + Step 3. Download test wave files + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models + for a list of available test wave files. 
The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + + Step 4. Run it + */ + + String segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"; + String embeddingModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"; + String waveFilename = "./0-four-speakers-zh.wav"; + + WaveReader reader = new WaveReader(waveFilename); + + OfflineSpeakerSegmentationPyannoteModelConfig pyannote = + OfflineSpeakerSegmentationPyannoteModelConfig.builder().setModel(segmentationModel).build(); + + OfflineSpeakerSegmentationModelConfig segmentation = + OfflineSpeakerSegmentationModelConfig.builder() + .setPyannote(pyannote) + .setDebug(true) + .build(); + + SpeakerEmbeddingExtractorConfig embedding = + SpeakerEmbeddingExtractorConfig.builder().setModel(embeddingModel).setDebug(true).build(); + + // The test wave file ./0-four-speakers-zh.wav contains four speakers, so + // we use numClusters=4 here. 
If you don't know the number of speakers + // in the test wave file, please set the numClusters to -1 and provide + // threshold for clustering + FastClusteringConfig clustering = + FastClusteringConfig.builder() + .setNumClusters(4) // set it to -1 if you don't know the actual number + .setThreshold(0.5f) + .build(); + + OfflineSpeakerDiarizationConfig config = + OfflineSpeakerDiarizationConfig.builder() + .setSegmentation(segmentation) + .setEmbedding(embedding) + .setClustering(clustering) + .setMinDurationOn(0.2f) + .setMinDurationOff(0.5f) + .build(); + + OfflineSpeakerDiarization sd = new OfflineSpeakerDiarization(config); + if (sd.getSampleRate() != reader.getSampleRate()) { + System.out.printf( + "Expected sample rate: %d, given: %d\n", sd.getSampleRate(), reader.getSampleRate()); + return; + } + + // OfflineSpeakerDiarizationSegment[] segments = sd.process(reader.getSamples()); + // without callback is also ok + + // or you can use a callback to show the progress + OfflineSpeakerDiarizationSegment[] segments = + sd.processWithCallback( + reader.getSamples(), + (int numProcessedChunks, int numTotalChunks, long arg) -> { + float progress = 100.0f * numProcessedChunks / numTotalChunks; + System.out.printf("Progress: %.2f%%\n", progress); + + return 0; + }); + + for (OfflineSpeakerDiarizationSegment s : segments) { + System.out.printf("%.3f -- %.3f speaker_%02d\n", s.getStart(), s.getEnd(), s.getSpeaker()); + } + + sd.release(); + } +} diff --git a/java-api-examples/README.md b/java-api-examples/README.md index 697f0c876..779c1b254 100755 --- a/java-api-examples/README.md +++ b/java-api-examples/README.md @@ -4,6 +4,12 @@ This directory contains examples for the JAVA API of sherpa-onnx. 
# Usage +## Non-streaming speaker diarization + +```bash +./run-offline-speaker-diarization.sh +``` + ## Streaming Speech recognition ``` diff --git a/java-api-examples/run-offline-speaker-diarization.sh b/java-api-examples/run-offline-speaker-diarization.sh new file mode 100755 index 000000000..d5cd63b5f --- /dev/null +++ b/java-api-examples/run-offline-speaker-diarization.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +set -ex + +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then + mkdir -p ../build + pushd ../build + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. + + make -j4 + ls -lh lib + popd +fi + +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then + pushd ../sherpa-onnx/java-api + make + popd +fi + +if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +fi + +if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +fi + +if [ ! 
-f ./0-four-speakers-zh.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav +fi + +java \ + -Djava.library.path=$PWD/../build/lib \ + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ + ./OfflineSpeakerDiarizationDemo.java diff --git a/sherpa-onnx/java-api/Makefile b/sherpa-onnx/java-api/Makefile index 69c3631b4..6e4778ae7 100644 --- a/sherpa-onnx/java-api/Makefile +++ b/sherpa-onnx/java-api/Makefile @@ -68,6 +68,15 @@ java_files += KeywordSpotterConfig.java java_files += KeywordSpotterResult.java java_files += KeywordSpotter.java +java_files += OfflineSpeakerSegmentationPyannoteModelConfig.java +java_files += OfflineSpeakerSegmentationModelConfig.java +java_files += FastClusteringConfig.java +java_files += OfflineSpeakerDiarizationConfig.java +java_files += OfflineSpeakerDiarizationSegment.java +java_files += OfflineSpeakerDiarizationCallback.java +java_files += OfflineSpeakerDiarization.java + + class_files := $(java_files:%.java=%.class) java_files := $(addprefix src/$(package_dir)/,$(java_files)) diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/FastClusteringConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/FastClusteringConfig.java new file mode 100644 index 000000000..f2e957259 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/FastClusteringConfig.java @@ -0,0 +1,44 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class FastClusteringConfig { + private final int numClusters; + private final float threshold; + + private FastClusteringConfig(Builder builder) { + this.numClusters = builder.numClusters; + this.threshold = builder.threshold; + } + + public static Builder builder() { + return new Builder(); + } + + public int getNumClusters() { + return numClusters; + } + + public float getThreshold() { + return threshold; + } + + public static class Builder { + private int numClusters = -1; + private float 
threshold = 0.5f; + + public FastClusteringConfig build() { + return new FastClusteringConfig(this); + } + + public Builder setNumClusters(int numClusters) { + this.numClusters = numClusters; + return this; + } + + public Builder setThreshold(float threshold) { + this.threshold = threshold; + return this; + } + } +} diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarization.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarization.java new file mode 100644 index 000000000..b75cd09ea --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarization.java @@ -0,0 +1,61 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class OfflineSpeakerDiarization { + static { + System.loadLibrary("sherpa-onnx-jni"); + } + + private long ptr = 0; + + public OfflineSpeakerDiarization(OfflineSpeakerDiarizationConfig config) { + ptr = newFromFile(config); + } + + public int getSampleRate() { + return getSampleRate(ptr); + } + + // Only config.clustering is used. 
All other fields are ignored + public void setConfig(OfflineSpeakerDiarizationConfig config) { + setConfig(ptr, config); + } + + public OfflineSpeakerDiarizationSegment[] process(float[] samples) { + return process(ptr, samples); + } + + public OfflineSpeakerDiarizationSegment[] processWithCallback(float[] samples, OfflineSpeakerDiarizationCallback callback) { + return processWithCallback(ptr, samples, callback, 0); + } + + public OfflineSpeakerDiarizationSegment[] processWithCallback(float[] samples, OfflineSpeakerDiarizationCallback callback, long arg) { + return processWithCallback(ptr, samples, callback, arg); + } + + protected void finalize() throws Throwable { + release(); + } + + // You'd better call it manually if it is not used anymore + public void release() { + if (this.ptr == 0) { + return; + } + delete(this.ptr); + this.ptr = 0; + } + + private native int getSampleRate(long ptr); + + private native void delete(long ptr); + + private native long newFromFile(OfflineSpeakerDiarizationConfig config); + + private native void setConfig(long ptr, OfflineSpeakerDiarizationConfig config); + + private native OfflineSpeakerDiarizationSegment[] process(long ptr, float[] samples); + + private native OfflineSpeakerDiarizationSegment[] processWithCallback(long ptr, float[] samples, OfflineSpeakerDiarizationCallback callback, long arg); +} \ No newline at end of file diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationCallback.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationCallback.java new file mode 100644 index 000000000..7787386d3 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationCallback.java @@ -0,0 +1,8 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +@FunctionalInterface +public interface OfflineSpeakerDiarizationCallback { + Integer invoke(int numProcessedChunks, int numTotalChunks, long arg); +} diff --git 
a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationConfig.java new file mode 100644 index 000000000..9965c5742 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationConfig.java @@ -0,0 +1,79 @@ +package com.k2fsa.sherpa.onnx; + +public class OfflineSpeakerDiarizationConfig { + private final OfflineSpeakerSegmentationModelConfig segmentation; + private final SpeakerEmbeddingExtractorConfig embedding; + private final FastClusteringConfig clustering; + private final float minDurationOn; + private final float minDurationOff; + + private OfflineSpeakerDiarizationConfig(Builder builder) { + this.segmentation = builder.segmentation; + this.embedding = builder.embedding; + this.clustering = builder.clustering; + this.minDurationOff = builder.minDurationOff; + this.minDurationOn = builder.minDurationOn; + } + + public static Builder builder() { + return new Builder(); + } + + public OfflineSpeakerSegmentationModelConfig getSegmentation() { + return segmentation; + } + + public SpeakerEmbeddingExtractorConfig getEmbedding() { + return embedding; + } + + public FastClusteringConfig getClustering() { + return clustering; + } + + public float getMinDurationOff() { + return minDurationOff; + } + + public float getMinDurationOn() { + return minDurationOn; + } + + public static class Builder { + private OfflineSpeakerSegmentationModelConfig segmentation = OfflineSpeakerSegmentationModelConfig.builder().build(); + private SpeakerEmbeddingExtractorConfig embedding = SpeakerEmbeddingExtractorConfig.builder().build(); + private FastClusteringConfig clustering = FastClusteringConfig.builder().build(); + private float minDurationOn = 0.2f; + private float minDurationOff = 0.5f; + + public OfflineSpeakerDiarizationConfig build() { + return new OfflineSpeakerDiarizationConfig(this); + } + + public Builder 
setSegmentation(OfflineSpeakerSegmentationModelConfig segmentation) { + this.segmentation = segmentation; + return this; + } + + public Builder setEmbedding(SpeakerEmbeddingExtractorConfig embedding) { + this.embedding = embedding; + return this; + } + + public Builder setClustering(FastClusteringConfig clustering) { + this.clustering = clustering; + return this; + } + + public Builder setMinDurationOff(float minDurationOff) { + this.minDurationOff = minDurationOff; + return this; + } + + public Builder setMinDurationOn(float minDurationOn) { + this.minDurationOn = minDurationOn; + return this; + } + } + +} diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationSegment.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationSegment.java new file mode 100644 index 000000000..1bb1a7635 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationSegment.java @@ -0,0 +1,27 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class OfflineSpeakerDiarizationSegment { + private final float start; + private final float end; + private final int speaker; + + public OfflineSpeakerDiarizationSegment(float start, float end, int speaker) { + this.start = start; + this.end = end; + this.speaker = speaker; + } + + public float getStart() { + return start; + } + + public float getEnd() { + return end; + } + + public int getSpeaker() { + return speaker; + } +} diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationModelConfig.java new file mode 100644 index 000000000..55df6c295 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationModelConfig.java @@ -0,0 +1,52 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class OfflineSpeakerSegmentationModelConfig { + private final 
OfflineSpeakerSegmentationPyannoteModelConfig pyannote; + private final int numThreads; + private final boolean debug; + private final String provider; + + private OfflineSpeakerSegmentationModelConfig(Builder builder) { + this.pyannote = builder.pyannote; + this.numThreads = builder.numThreads; + this.debug = builder.debug; + this.provider = builder.provider; + } + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + private OfflineSpeakerSegmentationPyannoteModelConfig pyannote = OfflineSpeakerSegmentationPyannoteModelConfig.builder().build(); + private int numThreads = 1; + private boolean debug = true; + private String provider = "cpu"; + + public OfflineSpeakerSegmentationModelConfig build() { + return new OfflineSpeakerSegmentationModelConfig(this); + } + + public Builder setPyannote(OfflineSpeakerSegmentationPyannoteModelConfig pyannote) { + this.pyannote = pyannote; + return this; + } + + public Builder setNumThreads(int numThreads) { + this.numThreads = numThreads; + return this; + } + + public Builder setDebug(boolean debug) { + this.debug = debug; + return this; + } + + public Builder setProvider(String provider) { + this.provider = provider; + return this; + } + } +} \ No newline at end of file diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationPyannoteModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationPyannoteModelConfig.java new file mode 100644 index 000000000..51fd99874 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationPyannoteModelConfig.java @@ -0,0 +1,32 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class OfflineSpeakerSegmentationPyannoteModelConfig { + private final String model; + + private OfflineSpeakerSegmentationPyannoteModelConfig(Builder builder) { + this.model = builder.model; + } + + public static Builder builder() { + return new 
Builder(); + } + + public String getModel() { + return model; + } + + public static class Builder { + private String model = ""; + + public OfflineSpeakerSegmentationPyannoteModelConfig build() { + return new OfflineSpeakerSegmentationPyannoteModelConfig(this); + } + + public Builder setModel(String model) { + this.model = model; + return this; + } + } +} diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java index 396594a96..2fc1d45dd 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java @@ -1,3 +1,5 @@ +// Copyright 2024 Xiaomi Corporation + package com.k2fsa.sherpa.onnx; @FunctionalInterface diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SpeakerEmbeddingExtractorConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SpeakerEmbeddingExtractorConfig.java index ffc688f34..80f800cdc 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SpeakerEmbeddingExtractorConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SpeakerEmbeddingExtractorConfig.java @@ -50,5 +50,4 @@ public Builder setProvider(String provider) { return this; } } - }