# config.ini

# AUTHORS
# Jose PATINO, EURECOM, Sophia-Antipolis, France, 2019
# http://www.eurecom.fr/en/people/patino-jose
# Contact: patino[at]eurecom[dot]fr, josempatinovillar[at]gmail[dot]com

# SYSTEM CONFIGURATION 
# For INI config formatting please refer to https://docs.python.org/3/library/configparser.html

[EXPERIMENT]
# Name identifying this experiment
# NOTE(review): presumably used to label the run and/or its output files - confirm against the consuming script
name = example

[GENERAL]  
# Format of the speech activity detection (SAD) files: LBL, MDTM or RTTM
# Current version does not provide automatic SAD
# NOTE(review): the line above appears to conflict with the performVAD option below - confirm which statement is current
SADformat = LBL
# Set to 0 if you already have VAD files, or to 1 if you do not have them or want to run automatic VAD in any case
# Note that access to the audio files is necessary to perform VAD
performVAD = 0
# Set to 1 to extract features from the audio files ([PATH] audio), or to 0 to use existing HTK features ([PATH] features)
performFeatureExtraction = 1

[PATH]
# Paths to the respective necessary files
# Audio files, necessary if performFeatureExtraction=1
audio = ./audio/
# Features in HTK format, necessary if performFeatureExtraction=0
features = ./features/
# UEM files indicate, on a per-audio-file basis, the part of the audio to be considered. The system expects a .uem file with the same name as its audio file
UEM = ./uem/
# SAD files indicate, on a per-audio-file basis, the speech parts to be considered. The system expects a .lbl/.mdtm/.rttm file with the same name as its audio file
SAD = ./sad/
# Diarization output folder; it will contain the concatenated per-file diarization outputs in .mdtm/.rttm format
output = ./out/

[EXTENSION]
# File extensions (including the leading dot) for the files found under the corresponding [PATH] entries
# Audio reading relies on the librosa library
# For format options please refer to https://librosa.github.io/librosa/
audio = .wav
features = .htk
UEM = .uem
# NOTE(review): presumably must be consistent with SADformat in [GENERAL] - confirm
SAD = .lbl
# NOTE(review): presumably must be consistent with format in [OUTPUT] - confirm
output = .rttm

[FEATURES]
# Features (Mel-Frequency Cepstral Coefficients (MFCCs)) are extracted using the librosa library
# For details and options please refer to https://librosa.github.io/librosa/
# Window length for feature extraction, in seconds (0.025 = 25 ms)
# NOTE(review): this comment previously said "ms"; the value only makes sense as seconds - confirm against the feature extraction code
framelength = 0.025
# Window shift for feature extraction, in seconds (0.01 = 10 ms)
# Note that VAD is assumed to have the same resolution as feature extraction; if you need something more specific you can bypass this easily
frameshift = 0.01
# Number of mel filters used
nfilters = 30
# Number of MFCCs employed
ncoeff = 30

[KBM]
# This section configures the parameters of the binary key background model (KBM), which is trained on the MFCCs of each respective audio file
# Minimum number of Gaussians in the initial pool
minimumNumberOfInitialGaussians = 1024
# Maximum window rate for Gaussian computation
maximumKBMWindowRate = 50
# Window length for computing Gaussians
windowLength = 200
# Number of final Gaussian components in the KBM
# NOTE(review): presumably only used when useRelativeKBMsize = 0 - confirm
kbmSize = 320
# If set to 1, the KBM size is set as a proportion of the pool size, given by "relKBMsize"
useRelativeKBMsize = 1
# Relative KBM size used if "useRelativeKBMsize = 1" (value between 0 and 1)
relKBMsize = 0.3

[SEGMENT]
# This section configures the windows of feature frames from which the binary keys/cumulative vectors are extracted
# Window size, in frames
length = 100
# Window increment before and after the window, in frames
increment = 100
# Window shift, in frames
rate = 100

[BINARY_KEY]
# This section configures the parameters used for the extraction of the binary keys/cumulative vectors
# Number of top selected components per frame
topGaussiansPerFrame = 5
# Proportion of bits set to 1 in the binary keys, expressed as a fraction (0.2 = 20%)
bitsPerSegmentFactor = 0.2

[CLUSTERING]
# This section configures the parameters used for clustering the binary keys/cumulative vectors
# NOTE(review): the previous description duplicated the [BINARY_KEY] one ("... extraction") - confirm the intended wording
# Number of initial clusters
N_init = 16
# Set to 1 to perform linkage clustering instead of clustering/reassignment
linkage = 0
# Linkage criterion used if linkage = 1 ('average', 'single', 'complete')
linkageCriterion = average
# Similarity metric: 'cosine' for cumulative vectors, and 'jaccard' for binary keys
metric = cosine

[CLUSTERING_SELECTION]
# This section configures the selection of the output clustering solution and of the number of clusters
# NOTE(review): the previous description read "parameters used by the AHC algorithm" - confirm whether that applies to this section
# Distance metric used in the selection of the output clustering solution ('jaccard', 'cosine')
metric_clusteringSelection = cosine
# Method employed for selecting the number of clusters. Can be either 'elbow' for an elbow criterion based on within-class sum of squares (WCSS) or 'spectral' for spectral clustering
bestClusteringCriterion = elbow
# Spectral clustering parameters, employed if bestClusteringCriterion = spectral
sigma = 1
percentile = 40
# If known, maximum number of speakers in a session in the database. This limits the effect of changes in very small, meaningless eigenvalues generating huge eigengaps
maxNrSpeakers = 16

[RESEGMENTATION]
# This section configures the parameters used by the resegmentation algorithm, which is based on Gaussian mixture model (GMM) maximum likelihood assignment
# Set to 1 to perform resegmentation
resegmentation = 1
# Number of GMM components
modelSize = 6
# Number of expectation-maximization (EM) iterations
nbIter = 10
# Size of the likelihood smoothing window, in number of frames
smoothWin = 100

[OUTPUT]
# Format of the diarization output: 'MDTM' or 'RTTM'
# NOTE(review): presumably must be consistent with the output extension in [EXTENSION] - confirm
format = RTTM              
# If 0, all the segmentation outputs are stored in a single output file
# If 1, all partial clustering solutions obtained at every iteration are stored in separate files. Useful for debugging and system tuning purposes
returnAllPartialSolutions = 0