Skip to content

Commit

Permalink
ticket VS-1603
Browse files Browse the repository at this point in the history
add support for audio transcribe
  • Loading branch information
rbucek committed May 24, 2024
1 parent 5f71876 commit b7ae09e
Show file tree
Hide file tree
Showing 17 changed files with 833 additions and 64 deletions.
116 changes: 116 additions & 0 deletions audio_convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include <stdio.h>
#include <math.h>

#include "config.h"

#include "audio_convert.h"
#include "tools_global.h"

Expand Down Expand Up @@ -98,6 +100,83 @@ cAudioConvert::eResult cAudioConvert::readRaw(sAudioInfo *audioInfo) {
return(rslt_write);
}

/**
 * @brief Resample a raw 16-bit PCM file to a different sample rate.
 *
 * Reads this->fileName as a stream of interleaved int16_t samples, converts it
 * from audioInfo->sampleRate to sampleRateDst and writes the result, again as
 * int16_t samples, to fileNameDst. When HAVE_LIBSAMPLERATE is set the
 * conversion is done by libsamplerate (SRC_LINEAR); otherwise the built-in
 * linear_resample() is applied to each chunk independently.
 *
 * @param audioInfo      source description; sampleRate and channels are used
 * @param fileNameDst    path of the output file (created / truncated)
 * @param sampleRateDst  requested output sample rate
 * @return _rslt_ok on success,
 *         _rslt_samplerate_failed for unusable parameters or a libsamplerate error,
 *         _rslt_open_for_read_failed / _rslt_open_for_write_failed when a file
 *         cannot be opened, _rslt_write_failed when the output cannot be written.
 */
cAudioConvert::eResult cAudioConvert::resampleRaw(sAudioInfo *audioInfo, const char *fileNameDst, unsigned sampleRateDst) {
	// Both sampleRate and channels are used as divisors below; reject values
	// that would cause a division by zero before touching any file.
	if(!audioInfo || !fileNameDst || !sampleRateDst ||
	   !(audioInfo->sampleRate > 0) || !(audioInfo->channels > 0)) {
		return(_rslt_samplerate_failed);
	}
	const unsigned channels = audioInfo->channels;
	const double src_ratio = (double)sampleRateDst / audioInfo->sampleRate;
	const unsigned input_buffer_len = 1024;
	// The output buffers live on the stack and scale with the ratio, so the
	// ratio has to be bounded. 1/256..256 is the range libsamplerate accepts.
	// A frame must also fit into the input buffer.
	if(src_ratio > 256 || src_ratio < 1.0 / 256 || channels > input_buffer_len) {
		return(_rslt_samplerate_failed);
	}
	FILE *infile = fopen(fileName.c_str(), "rb");
	if(!infile) {
		return(_rslt_open_for_read_failed);
	}
	FILE *outfile = fopen(fileNameDst, "wb");
	if(!outfile) {
		fclose(infile);
		return(_rslt_open_for_write_failed);
	}
	// Always read a whole number of frames; otherwise the trailing partial
	// frame of a chunk is lost and the channels of the next chunk are shifted.
	const unsigned read_samples = input_buffer_len - input_buffer_len % channels;
	const unsigned output_buffer_len = input_buffer_len * ((int)src_ratio + 1);
	int16_t input_buffer[input_buffer_len];
	int16_t output_buffer[output_buffer_len * 4];
	eResult rslt = _rslt_ok;
#if HAVE_LIBSAMPLERATE
	float input_buffer_float[input_buffer_len];
	float output_buffer_float[output_buffer_len * 4];
	SRC_STATE *src_state = src_new(SRC_LINEAR, (int)channels, NULL);
	if(!src_state) {
		rslt = _rslt_samplerate_failed;
	} else {
		SRC_DATA src_data;
		src_data.data_out = output_buffer_float;
		src_data.output_frames = output_buffer_len / channels;
		src_data.src_ratio = src_ratio;
		bool eof = false;
		while(!eof && rslt == _rslt_ok) {
			size_t readcount = fread(input_buffer, sizeof(int16_t), read_samples, infile);
			// A short read marks the last chunk; it is still passed to the
			// converter (possibly empty) with end_of_input set so that the
			// converter can flush whatever it still holds.
			eof = readcount < read_samples;
			for(size_t i = 0; i < readcount; i++) {
				input_buffer_float[i] = input_buffer[i] / 32768.0f;
			}
			src_data.data_in = input_buffer_float;
			src_data.input_frames = readcount / channels;
			src_data.end_of_input = eof ? 1 : 0;
			// src_process() may consume only part of the input per call, so
			// keep calling it until the chunk is used up (and, on the last
			// chunk, until no more output is produced).
			for(;;) {
				if(src_process(src_state, &src_data)) {
					rslt = _rslt_samplerate_failed;
					break;
				}
				const size_t samples_gen = (size_t)src_data.output_frames_gen * channels;
				for(size_t i = 0; i < samples_gen; i++) {
					float sample = output_buffer_float[i];
					if(sample > 1.0) sample = 1.0;
					if(sample < -1.0) sample = -1.0;
					output_buffer[i] = (int16_t)(sample * 32767.0);
				}
				// fwrite() counts samples, so compare against samples, not frames.
				if(samples_gen &&
				   fwrite(output_buffer, sizeof(int16_t), samples_gen, outfile) != samples_gen) {
					rslt = _rslt_write_failed;
					break;
				}
				const bool progress = src_data.input_frames_used > 0 || src_data.output_frames_gen > 0;
				src_data.data_in += src_data.input_frames_used * channels;
				src_data.input_frames -= src_data.input_frames_used;
				if(!progress || (!eof && src_data.input_frames <= 0)) {
					break;
				}
			}
		}
		src_delete(src_state);
	}
#else
	size_t readcount;
	while((readcount = fread(input_buffer, sizeof(int16_t), read_samples, infile)) > 0) {
		// Drop a trailing partial frame (only possible at the end of the file).
		readcount -= readcount % channels;
		if(!readcount) {
			break;
		}
		// Same formula as linear_resample() uses for the number of frames it writes.
		const int output_len = (int)(readcount * src_ratio) / (int)channels;
		linear_resample(input_buffer, output_buffer, readcount, src_ratio, channels);
		const size_t samples_out = (size_t)output_len * channels;
		if(fwrite(output_buffer, sizeof(int16_t), samples_out, outfile) != samples_out) {
			rslt = _rslt_write_failed;
			break;
		}
	}
#endif
	fclose(infile);
	// Buffered data is flushed by fclose(); a failure there is a write failure.
	if(fclose(outfile) != 0 && rslt == _rslt_ok) {
		rslt = _rslt_write_failed;
	}
	return(rslt);
}

cAudioConvert::eResult cAudioConvert::readWav() {
if(!fileHandle) {
fileHandle = fopen(fileName.c_str(), "r");
Expand Down Expand Up @@ -623,6 +702,43 @@ cAudioConvert::eResult cAudioConvert::write(u_char *data, unsigned datalen) {
return(_rslt_ok);
}

/**
 * @brief Resample one chunk of interleaved 16-bit samples.
 *
 * For ratio >= 1 every output frame is linearly interpolated between the two
 * neighbouring input frames; for ratio < 1 every output frame is the average
 * of the input frames that fall into its time slot. The chunk is processed on
 * its own, no state is carried over between calls.
 *
 * Writes (int)(input_len * ratio) / channels frames (times channels samples)
 * to output; the caller must provide a buffer of at least that size.
 *
 * @param input      interleaved input samples
 * @param output     destination for the interleaved output samples
 * @param input_len  number of samples (not frames) in input
 * @param ratio      output sample rate / input sample rate
 * @param channels   number of interleaved channels
 */
void cAudioConvert::linear_resample(int16_t* input, int16_t* output, int input_len, double ratio, int channels) {
	// channels and ratio are used as divisors; nothing sensible can be done
	// with empty input either.
	if(!input || !output || input_len <= 0 || channels <= 0 || !(ratio > 0)) {
		return;
	}
	const int input_frames = input_len / channels;
	if(input_frames <= 0) {
		return;
	}
	const int last_frame = input_frames - 1;
	const int output_len = (int)(input_len * ratio) / channels;
	if(ratio >= 1) {
		for(int i = 0; i < output_len; ++i) {
			const double src_index = i / ratio;
			int index = (int)src_index;
			const double frac = src_index - index;
			if(index + 1 < input_frames) {
				const int16_t *cur = input + index * channels;
				const int16_t *next = cur + channels;
				for(int ch = 0; ch < channels; ++ch) {
					output[i * channels + ch] = (int16_t)((1.0 - frac) * cur[ch] + frac * next[ch]);
				}
			} else {
				// No following frame to interpolate with: repeat the last
				// complete frame. The clamp keeps the read inside the input
				// when input_len is not a whole number of frames.
				if(index > last_frame) {
					index = last_frame;
				}
				for(int ch = 0; ch < channels; ++ch) {
					output[i * channels + ch] = input[index * channels + ch];
				}
			}
		}
	} else {
		const double inv_ratio = 1 / ratio;
		for(int i = 0; i < output_len; ++i) {
			const int index_start = (int)(i * inv_ratio);
			int index_end = (int)((i + 1) * inv_ratio);
			if(index_end > input_frames) {
				index_end = input_frames;
			}
			const int count = index_end - index_start;
			for(int ch = 0; ch < channels; ++ch) {
				if(count > 0) {
					double sum = 0.0;
					for(int j = index_start; j < index_end; ++j) {
						sum += input[j * channels + ch];
					}
					output[i * channels + ch] = (int16_t)(sum / count);
				} else {
					const int index = index_start > last_frame ? last_frame : index_start;
					output[i * channels + ch] = input[index * channels + ch];
				}
			}
		}
	}
}

void cAudioConvert::test() {

{
Expand Down
10 changes: 9 additions & 1 deletion audio_convert.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
#include <vorbis/codec.h>
#include <vorbis/vorbisenc.h>

#if HAVE_LIBSAMPLERATE
#include <samplerate.h>
#endif

#include "endian.h"
#include "bswap.h"

Expand Down Expand Up @@ -38,7 +42,9 @@ class cAudioConvert {
_rslt_ogg_corrupt_secondary_header,
_rslt_ogg_missing_vorbis_headers,
_rslt_ogg_failed_encode_initialization,
_rslt_unknown_format
_rslt_samplerate_failed,
_rslt_unknown_format,
_rslt_no_library_needed
};
struct sAudioInfo {
sAudioInfo() {
Expand Down Expand Up @@ -148,6 +154,7 @@ class cAudioConvert {
eResult getAudioInfo();
std::string jsonAudioInfo();
eResult readRaw(sAudioInfo *audioInfo);
eResult resampleRaw(sAudioInfo *audioInfo, const char *fileNameDst, unsigned sampleRateDst);
eResult readWav();
bool readWavHeader(sWavHeader *wavHeader);
eResult writeWavHeader(long int size = 0);
Expand All @@ -159,6 +166,7 @@ class cAudioConvert {
eResult writeOggEnd();
eResult _writeOgg();
eResult write(u_char *data, unsigned datalen);
void linear_resample(int16_t* input, int16_t* output, int input_len, double ratio, int channels);
void test();
public:
eSrcDstType srcDstType;
Expand Down
113 changes: 62 additions & 51 deletions calltable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
#include "separate_processing.h"
#include "ssl_dssl.h"
#include "diameter.h"
#include "transcribe.h"
#include "heap_chunk.h"

#if HAVE_LIBJEMALLOC
Expand Down Expand Up @@ -1971,7 +1972,7 @@ void Call::_read_rtp_srtp(CallBranch *c_branch, packet_s_process_0 *packetS, RTP
rtp->call_ipport_n_orig != c_branch->ipport_n)) &&
(opt_srtp_rtp_decrypt ||
(opt_srtp_rtp_dtls_decrypt && (exists_srtp_fingerprint || !exists_srtp_crypto_config)) ||
(opt_srtp_rtp_audio_decrypt && (flags & FLAG_SAVEAUDIO)) ||
(opt_srtp_rtp_audio_decrypt && (enable_save_audio(this) || enable_audio_transcribe(this))) ||
opt_saveRAW || opt_savewav_force)) {
int index_call_ip_port_by_src = get_index_by_ip_port_by_src(c_branch, packetS->saddr_(), packetS->source_(), iscaller);
if(opt_srtp_rtp_local_instances) {
Expand Down Expand Up @@ -3290,7 +3291,7 @@ void convertRawToWav_vmcodecs_callback(SimpleBuffer *out, string str, int fd, vo
}

int
Call::convertRawToWav() {
Call::convertRawToWav(void **transcribe_call) {

#if not EXPERIMENTAL_LITE_RTP_MOD

Expand Down Expand Up @@ -4053,60 +4054,65 @@ Call::convertRawToWav() {
bdir = 0;
}
}
if(adir == 1 && bdir == 1) {
// merge caller and called
if(!(flags & FLAG_FORMATAUDIO_OGG)) {
if(!opt_saveaudio_reversestereo) {
wav_mix(wav0, wav1, out, maxsamplerate, 0, opt_saveaudio_stereo);
if(enable_save_audio(this)) {
if(adir == 1 && bdir == 1) {
// merge caller and called
if(!(flags & FLAG_FORMATAUDIO_OGG)) {
if(!opt_saveaudio_reversestereo) {
wav_mix(wav0, wav1, out, maxsamplerate, 0, opt_saveaudio_stereo);
} else {
wav_mix(wav1, wav0, out, maxsamplerate, 0, opt_saveaudio_stereo);
}
} else {
wav_mix(wav1, wav0, out, maxsamplerate, 0, opt_saveaudio_stereo);
if(!opt_saveaudio_reversestereo) {
ogg_mix(wav0, wav1, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
} else {
ogg_mix(wav1, wav0, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
}
}
} else {
if(!opt_saveaudio_reversestereo) {
ogg_mix(wav0, wav1, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
if(!sverb.noaudiounlink) unlink(wav0);
if(!sverb.noaudiounlink) unlink(wav1);
} else if(adir == 1) {
// there is only caller sound
if(!(flags & FLAG_FORMATAUDIO_OGG)) {
wav_mix(wav0, NULL, out, maxsamplerate, 0, opt_saveaudio_stereo);
} else {
ogg_mix(wav1, wav0, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
ogg_mix(wav0, NULL, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
}
if(!sverb.noaudiounlink) unlink(wav0);
} else if(bdir == 1) {
// there is only called sound
if(!(flags & FLAG_FORMATAUDIO_OGG)) {
wav_mix(wav1, NULL, out, maxsamplerate, 1, opt_saveaudio_stereo);
} else {
ogg_mix(wav1, NULL, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 1);
}
if(!sverb.noaudiounlink) unlink(wav1);
}
string tmp;
tmp.append(out);
addtofilesqueue(tsf_audio, tmp, 0);
if(opt_cachedir[0] != '\0') {
Call::_addtocachequeue(tmp);
}
// Here we put our CURL hook
// And use it only if cacheing is turned off
if (opt_curl_hook_wav[0] != '\0' && opt_cachedir[0] == '\0') {
SimpleBuffer responseBuffer;
s_get_curl_response_params curl_params(s_get_curl_response_params::_rt_json);
curl_params.addParam("voipmonitor", "true");
curl_params.addParam("stereo", opt_saveaudio_stereo ? "false" : "true");
curl_params.addParam("wav_file_name_with_path", out);
curl_params.addParam("call_id", this->call_id.c_str());
if (!get_curl_response(opt_curl_hook_wav, &responseBuffer, &curl_params)) {
if(verbosity > 1) syslog(LOG_ERR, "FAIL: Send event to hook[%s] for call_id[%s], error[%s]\n", opt_curl_hook_wav, this->call_id.c_str(), curl_params.error.c_str());
} else {
if(verbosity > 1) syslog(LOG_INFO, "SUCCESS: Send event to hook[%s] for call_id[%s], response[%s]\n", opt_curl_hook_wav, this->call_id.c_str(), (char*)responseBuffer);
}
}
if(!sverb.noaudiounlink) unlink(wav0);
if(!sverb.noaudiounlink) unlink(wav1);
} else if(adir == 1) {
// there is only caller sound
if(!(flags & FLAG_FORMATAUDIO_OGG)) {
wav_mix(wav0, NULL, out, maxsamplerate, 0, opt_saveaudio_stereo);
} else {
ogg_mix(wav0, NULL, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
}
if(!sverb.noaudiounlink) unlink(wav0);
} else if(bdir == 1) {
// there is only called sound
if(!(flags & FLAG_FORMATAUDIO_OGG)) {
wav_mix(wav1, NULL, out, maxsamplerate, 1, opt_saveaudio_stereo);
} else {
ogg_mix(wav1, NULL, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 1);
}
if(!sverb.noaudiounlink) unlink(wav1);
}
string tmp;
tmp.append(out);
addtofilesqueue(tsf_audio, tmp, 0);
if(opt_cachedir[0] != '\0') {
Call::_addtocachequeue(tmp);
}
// Here we put our CURL hook
// And use it only if cacheing is turned off
if (opt_curl_hook_wav[0] != '\0' && opt_cachedir[0] == '\0') {
SimpleBuffer responseBuffer;
s_get_curl_response_params curl_params(s_get_curl_response_params::_rt_json);
curl_params.addParam("voipmonitor", "true");
curl_params.addParam("stereo", opt_saveaudio_stereo ? "false" : "true");
curl_params.addParam("wav_file_name_with_path", out);
curl_params.addParam("call_id", this->call_id.c_str());
if (!get_curl_response(opt_curl_hook_wav, &responseBuffer, &curl_params)) {
if(verbosity > 1) syslog(LOG_ERR, "FAIL: Send event to hook[%s] for call_id[%s], error[%s]\n", opt_curl_hook_wav, this->call_id.c_str(), curl_params.error.c_str());
} else {
if(verbosity > 1) syslog(LOG_INFO, "SUCCESS: Send event to hook[%s] for call_id[%s], response[%s]\n", opt_curl_hook_wav, this->call_id.c_str(), (char*)responseBuffer);
}
if(enable_audio_transcribe(this) && transcribe_call && (adir || bdir)) {
*transcribe_call = Transcribe::createTranscribeCall(this, adir ? wav0 : NULL, bdir ? wav1 : NULL, maxsamplerate);
}

#endif
Expand Down Expand Up @@ -12054,7 +12060,11 @@ void *Calltable::processAudioQueueThread(void *audioQueueThread) {
calltable->unlock_calls_audioqueue();
if(call) {
if(verbosity > 0) printf("converting RAW file to WAV %s\n", call->fbasename);
call->convertRawToWav();
Transcribe::sCall *transcribe_call;
call->convertRawToWav((void**)&transcribe_call);
if(enable_audio_transcribe(call) && transcribe_call) {
transcribePushCall(transcribe_call);
}
if(useChartsCacheOrCdrStatInProcessCall()) {
calltable->lock_calls_charts_cache_queue();
calltable->calls_charts_cache_queue.push_back(sChartsCallData(sChartsCallData::_call, call));
Expand Down Expand Up @@ -15794,6 +15804,7 @@ string printCallFlags(unsigned long int flags) {
if(flags & FLAG_SAVEAUDIO) outStr << "saveaudio ";
if(flags & FLAG_FORMATAUDIO_WAV) outStr << "format_wav ";
if(flags & FLAG_FORMATAUDIO_OGG) outStr << "format_ogg ";
if(flags & FLAG_AUDIOTRANSCRIBE) outStr << "audio_transcribe ";
if(flags & FLAG_SAVEGRAPH) outStr << "savegraph ";
if(flags & FLAG_SAVERTPHEADER) outStr << "savertpheader ";
if(flags & FLAG_SAVERTP_VIDEO_HEADER) outStr << "savertp_video_header ";
Expand Down
3 changes: 2 additions & 1 deletion calltable.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ typedef vector<RTP*> CALL_RTP_DYNAMIC_ARRAY_TYPE;
#define FLAG_SAVENOTIFYPCAP (1 << 25)
#define FLAG_SAVESUBSCRIBEDB (1 << 26)
#define FLAG_SAVESUBSCRIBEPCAP (1 << 27)
#define FLAG_AUDIOTRANSCRIBE (1 << 28)

#define CDR_CHANGE_SRC_PORT_CALLER (1 << 0)
#define CDR_CHANGE_SRC_PORT_CALLED (1 << 1)
Expand Down Expand Up @@ -1719,7 +1720,7 @@ class Call : public CallStructs, public Call_abstract {
* @brief convert raw files to one WAV
*
*/
int convertRawToWav();
int convertRawToWav(void **transcribe_call);

void selectRtpAB();

Expand Down
1 change: 1 addition & 0 deletions common.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ struct sVerbose {
int diameter_assign;
int rdtsc;
int suppress_drop_partitions;
int whisper;
int _debug1;
int _debug2;
int _debug3;
Expand Down
7 changes: 7 additions & 0 deletions filter_mysql.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ void filter_base::_loadBaseDataRow(SqlDb_row *sqlRow, map<string, string> *row,
baseRow->dtmf = _value_is_null(sqlRow, row, "dtmf") ? -1 : _value(sqlRow, row, "dtmf");
baseRow->graph = _value_is_null(sqlRow, row, "graph") ? -1 : _value(sqlRow, row, "graph");
baseRow->wav = _value_is_null(sqlRow, row, "wav") ? -1 : _value(sqlRow, row, "wav");
baseRow->audio_transcribe = _value_is_null(sqlRow, row, "audio_transcribe") ? -1 : _value(sqlRow, row, "audio_transcribe");
baseRow->skip = _value_is_null(sqlRow, row, "skip") ? -1 : _value(sqlRow, row, "skip");
baseRow->script = _value_is_null(sqlRow, row, "script") ? -1 : _value(sqlRow, row, "script");
baseRow->mos_lqo = _value_is_null(sqlRow, row, "mos_lqo") ? -1 : _value(sqlRow, row, "mos_lqo");
Expand Down Expand Up @@ -108,6 +109,9 @@ u_int64_t filter_base::getFlagsFromBaseData(filter_db_row_base *baseRow, u_int32
else if(baseRow->wav == 3) flags |= _FLAG_AUDIO_OGG;
else if(baseRow->wav == 0) flags |= _FLAG_NOWAV;

if(baseRow->audio_transcribe == 1) flags |= _FLAG_AUDIO_TRANSCRIBE;
else if(baseRow->audio_transcribe == 0) flags |= _FLAG_NO_AUDIO_TRANSCRIBE;

if(baseRow->skip == 1) flags |= _FLAG_SKIP;
else if(baseRow->skip == 0) flags |= _FLAG_NOSKIP;

Expand Down Expand Up @@ -175,6 +179,9 @@ void filter_base::setCallFlagsFromFilterFlags(volatile unsigned long int *callFl
if(filterFlags & _FLAG_AUDIO_OGG) {*callFlags |= FLAG_SAVEAUDIO_OGG; *callFlags &= ~FLAG_FORMATAUDIO_WAV;}
if(filterFlags & _FLAG_NOWAV) *callFlags &= ~FLAG_SAVEAUDIO;

if(filterFlags & _FLAG_AUDIO_TRANSCRIBE) *callFlags |= FLAG_AUDIOTRANSCRIBE;
if(filterFlags & _FLAG_NO_AUDIO_TRANSCRIBE) *callFlags &= ~FLAG_AUDIOTRANSCRIBE;

if(filterFlags & _FLAG_GRAPH) *callFlags |= FLAG_SAVEGRAPH;
if(filterFlags & _FLAG_NOGRAPH) *callFlags &= ~FLAG_SAVEGRAPH;

Expand Down
Loading

0 comments on commit b7ae09e

Please sign in to comment.