forked from josepatino/pyBK
-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.ini
126 lines (113 loc) · 5.02 KB
/
config.ini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# AUTHORS
# Jose PATINO, EURECOM, Sophia-Antipolis, France, 2019
# http://www.eurecom.fr/en/people/patino-jose
# Contact: patino[at]eurecom[dot]fr, josempatinovillar[at]gmail[dot]com
# SYSTEM CONFIGURATION
# For INI config formatting please refer to https://docs.python.org/3/library/configparser.html
[EXPERIMENT]
# Name identifying this experiment (NOTE(review): presumably used to label the outputs of the run -- confirm against the consuming code)
name = example
[GENERAL]
# Format of the speech activity detection (SAD) files: LBL, MDTM or RTTM
# The current version does not provide automatic SAD (NOTE(review): this appears to conflict with performVAD below -- confirm which statement is current)
SADformat = LBL
# Set to 0 if you already have VAD files, or to 1 if you do not have them or want automatic VAD to be performed in any case
# Note that access to the audio files is necessary to perform VAD
performVAD = 0
# Set to 1 to extract features from the audio files ([PATH] audio is then required), or to 0 to use existing features in HTK format ([PATH] features is then required)
performFeatureExtraction = 1
[PATH]
# Paths to the respective necessary files
# Audio files, necessary if performFeatureExtraction=1
audio = ./audio/
# Features in HTK format, necessary if performFeatureExtraction=0
features = ./features/
# UEM files indicate, on a per-audio-file basis, the part of the audio to be considered. The system expects a .uem file with the same name as its audio file
UEM = ./uem/
# SAD files indicate, on a per-audio-file basis, the speech parts to be considered. The system expects a .lbl/.mdtm/.rttm file with the same name as its audio file
SAD = ./sad/
# Diarization output folder; it will contain the concatenated diarization outputs of the processed files in .mdtm/.rttm format
output = ./out/
[EXTENSION]
# File extensions; the keys mirror those of the [PATH] section
# Audio reading relies on the librosa library
# For format options please refer to https://librosa.github.io/librosa/
audio = .wav
features = .htk
UEM = .uem
SAD = .lbl
output = .rttm
[FEATURES]
# Features (Mel-Frequency Cepstral Coefficients (MFCCs)) are extracted using the librosa library
# For details and options please refer to https://librosa.github.io/librosa/
# Window length for feature extraction
# NOTE(review): the original comment gave the unit as ms, but 0.025 looks like seconds (i.e. 25 ms) -- confirm against the consuming code
framelength = 0.025
# Window shift for feature extraction
# NOTE(review): the original comment gave the unit as ms, but 0.01 looks like seconds (i.e. 10 ms) -- confirm against the consuming code
# Note that VAD is assumed to have the same resolution as feature extraction; if you need something more specific, this can easily be bypassed
frameshift = 0.01
# Number of mel filters used
nfilters = 30
# Number of MFCCs employed
ncoeff = 30
[KBM]
# This section configures the parameters of the binary key background model (KBM), which will be trained on the MFCCs from each respective audio file
# Minimum number of Gaussians in the initial pool
minimumNumberOfInitialGaussians = 1024
# Maximum window rate for Gaussian computation
maximumKBMWindowRate = 50
# Window length for computing Gaussians
windowLength = 200
# Number of final Gaussian components in the KBM (NOTE(review): presumably only applies when useRelativeKBMsize = 0 -- confirm)
kbmSize = 320
# If set to 1, the KBM size is set as a proportion, given by "relKBMsize", of the pool size
useRelativeKBMsize = 1
# Relative KBM size used if "useRelativeKBMsize = 1" (value between 0 and 1)
relKBMsize = 0.3
[SEGMENT]
# This section configures the windows of feature frames from which the binary keys/cumulative vectors are extracted
# Window size, in frames
length = 100
# Window increment applied before and after the window, in frames
increment = 100
# Window shift, in frames
rate = 100
[BINARY_KEY]
# This section configures the parameters used for the extraction of the binary keys/cumulative vectors
# Number of top selected components per frame
topGaussiansPerFrame = 5
# Proportion of bits set to 1 in the binary keys
# NOTE(review): the original comment said "percentage", but 0.2 looks like a fraction (i.e. 20%) -- confirm against the consuming code
bitsPerSegmentFactor = 0.2
[CLUSTERING]
# This section configures the parameters used for clustering
# NOTE(review): the original description was identical to the one of [BINARY_KEY] ("binary keys/cumulative vectors extraction"), apparently a copy-paste -- confirm the intended wording
# Number of initial clusters
N_init = 16
# Set to 1 to perform linkage clustering instead of clustering/reassignment
linkage = 0
# Linkage criterion used if linkage == 1 ('average', 'single', 'complete')
linkageCriterion = average
# Similarity metric: 'cosine' for cumulative vectors, and 'jaccard' for binary keys
metric = cosine
[CLUSTERING_SELECTION]
# This section configures the parameters used by the AHC algorithm
# Distance metric used in the selection of the output clustering solution ('jaccard', 'cosine')
metric_clusteringSelection = cosine
# Method employed to select the number of clusters. Can be either 'elbow' for an elbow criterion based on within-class sum of squares (WCSS) or 'spectral' for spectral clustering
bestClusteringCriterion = elbow
# Spectral clustering parameters, employed if bestClusteringCriterion == spectral
sigma = 1
percentile = 40
# If known, maximum number of speakers in a session in the database. This limits the effect of changes in very small, meaningless eigenvalues generating huge eigengaps
maxNrSpeakers = 16
[RESEGMENTATION]
# This section configures the parameters used by the Gaussian mixture model (GMM) based maximum-likelihood assignment resegmentation algorithm
# Set to 1 to perform resegmentation
resegmentation = 1
# Number of GMM components
modelSize = 6
# Number of expectation-maximization (EM) iterations
nbIter = 10
# Size of the likelihood smoothing window, in number of frames
smoothWin = 100
[OUTPUT]
# Format of the diarization output: 'MDTM' or 'RTTM'
format = RTTM
# If 0, all the segmentation outputs are stored in a single output file
# If 1, all partial clustering solutions obtained at every iteration are stored in separate files. Useful for debugging and system tuning purposes
returnAllPartialSolutions = 0