ticket VS-1603

add support for audio transcribe
voipmonitor · May 24, 2024 · b7ae09e · b7ae09e
1 parent 5f71876
commit b7ae09e
Show file tree

Hide file tree

Showing 17 changed files with 833 additions and 64 deletions.
diff --git a/audio_convert.cpp b/audio_convert.cpp
@@ -3,6 +3,8 @@
 #include <stdio.h>
 #include <math.h>
 
+#include "config.h"
+
 #include "audio_convert.h"
 #include "tools_global.h"
 
@@ -98,6 +100,83 @@ cAudioConvert::eResult cAudioConvert::readRaw(sAudioInfo *audioInfo) {
 	return(rslt_write);
 }
 
+// Resample the raw 16-bit PCM file 'fileName' into the file 'fileNameDst'.
+//   audioInfo     - source format; sampleRate and channels are read from it
+//   fileNameDst   - destination file, opened with "wb"
+//   sampleRateDst - requested output sample rate
+// The input is processed in chunks of 1024 samples. With HAVE_LIBSAMPLERATE
+// the conversion is done by libsamplerate (SRC_LINEAR), otherwise by
+// linear_resample(), which converts every chunk on its own.
+// Returns _rslt_ok on success, _rslt_open_for_read_failed /
+// _rslt_open_for_write_failed when a file cannot be opened,
+// _rslt_samplerate_failed for unusable parameters or a libsamplerate error
+// and _rslt_write_failed when the output cannot be written completely.
+cAudioConvert::eResult cAudioConvert::resampleRaw(sAudioInfo *audioInfo, const char *fileNameDst, unsigned sampleRateDst) {
+	// sampleRate and channels are used as divisors below - refuse zero values
+	// before any file is touched.
+	if(!audioInfo || !sampleRateDst || !audioInfo->sampleRate || !audioInfo->channels) {
+		return(_rslt_samplerate_failed);
+	}
+	FILE *infile = fopen(fileName.c_str(), "rb");
+	if(!infile) {
+		return(_rslt_open_for_read_failed);
+	}
+	FILE *outfile = fopen(fileNameDst, "wb");
+	if(!outfile) {
+		fclose(infile);
+		return(_rslt_open_for_write_failed);
+	}
+	eResult rslt = _rslt_ok;
+	double src_ratio = (double)sampleRateDst / audioInfo->sampleRate;
+	unsigned input_buffer_len = 1024;
+	unsigned output_buffer_len = input_buffer_len * ((int)src_ratio + 1);
+	int16_t input_buffer[input_buffer_len];
+	int16_t output_buffer[output_buffer_len * 4];
+	size_t readcount;
+#if HAVE_LIBSAMPLERATE
+	float input_buffer_float[input_buffer_len];
+	float output_buffer_float[output_buffer_len * 4];
+	SRC_STATE *src_state = src_new(SRC_LINEAR, audioInfo->channels, NULL);
+	if(!src_state) {
+		fclose(infile);
+		fclose(outfile);
+		return(_rslt_samplerate_failed);
+	}
+	SRC_DATA src_data;
+	src_data.data_in = input_buffer_float;
+	src_data.data_out = output_buffer_float;
+	src_data.input_frames = 0;
+	src_data.output_frames = output_buffer_len / audioInfo->channels;
+	src_data.src_ratio = src_ratio;
+	src_data.end_of_input = 0;
+	while((readcount = fread(input_buffer, sizeof(int16_t), input_buffer_len, infile)) > 0) {
+		// int16 -> float in the range [-1.0, 1.0) expected by libsamplerate
+		for(size_t i = 0; i < readcount; i++) {
+			input_buffer_float[i] = input_buffer[i] / 32768.0;
+		}
+		src_data.input_frames = readcount / audioInfo->channels;
+		src_data.data_in = input_buffer_float;
+		if(src_process(src_state, &src_data)) {
+			rslt = _rslt_samplerate_failed;
+			break;
+		}
+		// libsamplerate counts frames; the buffers and fwrite() count samples
+		size_t samples_out = (size_t)src_data.output_frames_gen * audioInfo->channels;
+		for(size_t i = 0; i < samples_out; i++) {
+			float sample = output_buffer_float[i];
+			if(sample > 1.0) sample = 1.0;
+			if(sample < -1.0) sample = -1.0;
+			output_buffer[i] = (int16_t)(sample * 32767.0);
+		}
+		// fwrite() returns the number of items written, so the expected value
+		// is the sample count (frames * channels), not the frame count.
+		if(fwrite(output_buffer, sizeof(int16_t), samples_out, outfile) != samples_out) {
+			rslt = _rslt_write_failed;
+			break;
+		}
+	}
+	src_delete(src_state);
+#else
+	while((readcount = fread(input_buffer, sizeof(int16_t), input_buffer_len, infile)) > 0) {
+		// same frame count as linear_resample() computes internally
+		int output_len = (int)(readcount * src_ratio) / audioInfo->channels;
+		linear_resample(input_buffer, output_buffer, readcount, src_ratio, audioInfo->channels);
+		size_t samples_out = (size_t)output_len * audioInfo->channels;
+		if(fwrite(output_buffer, sizeof(int16_t), samples_out, outfile) != samples_out) {
+			rslt = _rslt_write_failed;
+			break;
+		}
+	}
+#endif
+	fclose(infile);
+	// buffered data is flushed on close - a failure here means lost output
+	if(fclose(outfile) != 0 && rslt == _rslt_ok) {
+		rslt = _rslt_write_failed;
+	}
+	return(rslt);
+}
+
 cAudioConvert::eResult cAudioConvert::readWav() {
 	if(!fileHandle) {
 		fileHandle = fopen(fileName.c_str(), "r");
@@ -623,6 +702,43 @@ cAudioConvert::eResult cAudioConvert::write(u_char *data, unsigned datalen) {
 	return(_rslt_ok);
 }
 
+// Simple resampler for interleaved 16-bit PCM held in memory.
+//   input     - source samples, input_len values in total (all channels)
+//   output    - destination; receives (int)(input_len * ratio) / channels frames
+//   input_len - number of samples (not frames) in 'input'
+//   ratio     - destination rate / source rate
+//   channels  - number of interleaved channels
+// For ratio >= 1 every output frame is interpolated linearly between two
+// neighbouring input frames; for ratio < 1 every output frame is the average
+// of the input frames it covers.
+void cAudioConvert::linear_resample(int16_t* input, int16_t* output, int input_len, double ratio, int channels) {
+	const int frames_out = (int)(input_len * ratio) / channels;
+	const int frames_in = input_len / channels;
+	if(ratio >= 1) {
+		// upsampling
+		for(int frame = 0; frame < frames_out; ++frame) {
+			const double pos = frame / ratio;
+			const int base = (int)pos;
+			const double weight = pos - base;
+			const bool has_next = base + 1 < frames_in;
+			for(int ch = 0; ch < channels; ++ch) {
+				const int16_t cur = input[(base * channels) + ch];
+				if(has_next) {
+					const int16_t next = input[((base + 1) * channels) + ch];
+					output[frame * channels + ch] = (int16_t)((1.0 - weight) * cur + weight * next);
+				} else {
+					// last input frame - nothing to interpolate towards
+					output[frame * channels + ch] = cur;
+				}
+			}
+		}
+		return;
+	}
+	// downsampling
+	const double step = 1 / ratio;
+	for(int frame = 0; frame < frames_out; ++frame) {
+		const int first = (int)(frame * step);
+		const int last = (int)((frame + 1) * step);
+		// the averaged window is [first, stop), clipped to the available input
+		const int stop = last < frames_in ? last : frames_in;
+		for(int ch = 0; ch < channels; ++ch) {
+			if(first < stop) {
+				double acc = 0.0;
+				for(int j = first; j < stop; ++j) {
+					acc += input[(j * channels) + ch];
+				}
+				output[frame * channels + ch] = (int16_t)(acc / (stop - first));
+			} else {
+				output[frame * channels + ch] = input[(first * channels) + ch];
+			}
+		}
+	}
+}
+
 void cAudioConvert::test() {
 
 	{

diff --git a/audio_convert.h b/audio_convert.h
@@ -8,6 +8,10 @@
 #include <vorbis/codec.h>
 #include <vorbis/vorbisenc.h>
 
+#if HAVE_LIBSAMPLERATE
+#include <samplerate.h>
+#endif
+
 #include "endian.h"
 #include "bswap.h"
 
@@ -38,7 +42,9 @@ class cAudioConvert {
 		_rslt_ogg_corrupt_secondary_header,
 		_rslt_ogg_missing_vorbis_headers,
 		_rslt_ogg_failed_encode_initialization,
-		_rslt_unknown_format
+		_rslt_samplerate_failed,
+		_rslt_unknown_format,
+		_rslt_no_library_needed
 	};
 	struct sAudioInfo {
 		sAudioInfo() {
@@ -148,6 +154,7 @@ class cAudioConvert {
 	eResult getAudioInfo();
 	std::string jsonAudioInfo();
 	eResult readRaw(sAudioInfo *audioInfo);
+	eResult resampleRaw(sAudioInfo *audioInfo, const char *fileNameDst, unsigned sampleRateDst);
 	eResult readWav();
 	bool readWavHeader(sWavHeader *wavHeader);
 	eResult writeWavHeader(long int size = 0);
@@ -159,6 +166,7 @@ class cAudioConvert {
 	eResult writeOggEnd();
 	eResult _writeOgg();
 	eResult write(u_char *data, unsigned datalen);
+	void linear_resample(int16_t* input, int16_t* output, int input_len, double ratio, int channels);
 	void test();
 public:
 	eSrcDstType srcDstType;

diff --git a/calltable.cpp b/calltable.cpp
@@ -66,6 +66,7 @@
 #include "separate_processing.h"
 #include "ssl_dssl.h"
 #include "diameter.h"
+#include "transcribe.h"
 #include "heap_chunk.h"
 
 #if HAVE_LIBJEMALLOC
@@ -1971,7 +1972,7 @@ void Call::_read_rtp_srtp(CallBranch *c_branch, packet_s_process_0 *packetS, RTP
 	     rtp->call_ipport_n_orig != c_branch->ipport_n)) &&
 	   (opt_srtp_rtp_decrypt || 
 	    (opt_srtp_rtp_dtls_decrypt && (exists_srtp_fingerprint || !exists_srtp_crypto_config)) ||
-	    (opt_srtp_rtp_audio_decrypt && (flags & FLAG_SAVEAUDIO)) || 
+	    (opt_srtp_rtp_audio_decrypt && (enable_save_audio(this) || enable_audio_transcribe(this))) || 
 	    opt_saveRAW || opt_savewav_force)) {
 		int index_call_ip_port_by_src = get_index_by_ip_port_by_src(c_branch, packetS->saddr_(), packetS->source_(), iscaller);
 		if(opt_srtp_rtp_local_instances) {
@@ -3290,7 +3291,7 @@ void convertRawToWav_vmcodecs_callback(SimpleBuffer *out, string str, int fd, vo
 }
 
 int
-Call::convertRawToWav() {
+Call::convertRawToWav(void **transcribe_call) {
 
 #if not EXPERIMENTAL_LITE_RTP_MOD
 
@@ -4053,60 +4054,65 @@ Call::convertRawToWav() {
 			bdir = 0;
 		}
 	}
-	if(adir == 1 && bdir == 1) {
-		// merge caller and called 
-		if(!(flags & FLAG_FORMATAUDIO_OGG)) {
-			if(!opt_saveaudio_reversestereo) {
-				wav_mix(wav0, wav1, out, maxsamplerate, 0, opt_saveaudio_stereo);
+	if(enable_save_audio(this)) {
+		if(adir == 1 && bdir == 1) {
+			// merge caller and called 
+			if(!(flags & FLAG_FORMATAUDIO_OGG)) {
+				if(!opt_saveaudio_reversestereo) {
+					wav_mix(wav0, wav1, out, maxsamplerate, 0, opt_saveaudio_stereo);
+				} else {
+					wav_mix(wav1, wav0, out, maxsamplerate, 0, opt_saveaudio_stereo);
+				}
 			} else {
-				wav_mix(wav1, wav0, out, maxsamplerate, 0, opt_saveaudio_stereo);
+				if(!opt_saveaudio_reversestereo) {
+					ogg_mix(wav0, wav1, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
+				} else {
+					ogg_mix(wav1, wav0, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
+				}
 			}
-		} else {
-			if(!opt_saveaudio_reversestereo) {
-				ogg_mix(wav0, wav1, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
+			if(!sverb.noaudiounlink) unlink(wav0);
+			if(!sverb.noaudiounlink) unlink(wav1);
+		} else if(adir == 1) {
+			// there is only caller sound
+			if(!(flags & FLAG_FORMATAUDIO_OGG)) {
+				wav_mix(wav0, NULL, out, maxsamplerate, 0, opt_saveaudio_stereo);
 			} else {
-				ogg_mix(wav1, wav0, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
+				ogg_mix(wav0, NULL, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
+			}
+			if(!sverb.noaudiounlink) unlink(wav0);
+		} else if(bdir == 1) {
+			// there is only called sound
+			if(!(flags & FLAG_FORMATAUDIO_OGG)) {
+				wav_mix(wav1, NULL, out, maxsamplerate, 1, opt_saveaudio_stereo);
+			} else {
+				ogg_mix(wav1, NULL, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 1);
+			}
+			if(!sverb.noaudiounlink) unlink(wav1);
+		}
+		string tmp;
+		tmp.append(out);
+		addtofilesqueue(tsf_audio, tmp, 0);
+		if(opt_cachedir[0] != '\0') {
+			Call::_addtocachequeue(tmp);
+		}
+		// Here we put our CURL hook
+		// And use it only if cacheing is turned off
+		if (opt_curl_hook_wav[0] != '\0' && opt_cachedir[0] == '\0') {
+			SimpleBuffer responseBuffer;
+			s_get_curl_response_params curl_params(s_get_curl_response_params::_rt_json);
+			curl_params.addParam("voipmonitor", "true");
+			curl_params.addParam("stereo", opt_saveaudio_stereo ? "false" : "true");
+			curl_params.addParam("wav_file_name_with_path", out);
+			curl_params.addParam("call_id", this->call_id.c_str());
+			if (!get_curl_response(opt_curl_hook_wav, &responseBuffer, &curl_params)) {
+				if(verbosity > 1) syslog(LOG_ERR, "FAIL: Send event to hook[%s] for call_id[%s], error[%s]\n", opt_curl_hook_wav, this->call_id.c_str(), curl_params.error.c_str());
+			} else {
+				if(verbosity > 1) syslog(LOG_INFO, "SUCCESS: Send event to hook[%s] for call_id[%s], response[%s]\n", opt_curl_hook_wav, this->call_id.c_str(), (char*)responseBuffer);
 			}
 		}
-		if(!sverb.noaudiounlink) unlink(wav0);
-		if(!sverb.noaudiounlink) unlink(wav1);
-	} else if(adir == 1) {
-		// there is only caller sound
-		if(!(flags & FLAG_FORMATAUDIO_OGG)) {
-			wav_mix(wav0, NULL, out, maxsamplerate, 0, opt_saveaudio_stereo);
-		} else {
-			ogg_mix(wav0, NULL, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 0);
-		}
-		if(!sverb.noaudiounlink) unlink(wav0);
-	} else if(bdir == 1) {
-		// there is only called sound
-		if(!(flags & FLAG_FORMATAUDIO_OGG)) {
-			wav_mix(wav1, NULL, out, maxsamplerate, 1, opt_saveaudio_stereo);
-		} else {
-			ogg_mix(wav1, NULL, out, opt_saveaudio_stereo, maxsamplerate, opt_saveaudio_oggquality, 1);
-		}
-		if(!sverb.noaudiounlink) unlink(wav1);
 	}
-	string tmp;
-	tmp.append(out);
-	addtofilesqueue(tsf_audio, tmp, 0);
-	if(opt_cachedir[0] != '\0') {
-		Call::_addtocachequeue(tmp);
-	}
-	// Here we put our CURL hook
-	// And use it only if cacheing is turned off
-	if (opt_curl_hook_wav[0] != '\0' && opt_cachedir[0] == '\0') {
-		SimpleBuffer responseBuffer;
-		s_get_curl_response_params curl_params(s_get_curl_response_params::_rt_json);
-		curl_params.addParam("voipmonitor", "true");
-		curl_params.addParam("stereo", opt_saveaudio_stereo ? "false" : "true");
-		curl_params.addParam("wav_file_name_with_path", out);
-		curl_params.addParam("call_id", this->call_id.c_str());
-		if (!get_curl_response(opt_curl_hook_wav, &responseBuffer, &curl_params)) {
-			if(verbosity > 1) syslog(LOG_ERR, "FAIL: Send event to hook[%s] for call_id[%s], error[%s]\n", opt_curl_hook_wav, this->call_id.c_str(), curl_params.error.c_str());
-		} else {
-			if(verbosity > 1) syslog(LOG_INFO, "SUCCESS: Send event to hook[%s] for call_id[%s], response[%s]\n", opt_curl_hook_wav, this->call_id.c_str(), (char*)responseBuffer);
-		}
+	if(enable_audio_transcribe(this) && transcribe_call && (adir || bdir)) {
+		*transcribe_call = Transcribe::createTranscribeCall(this, adir ? wav0 : NULL, bdir ? wav1 : NULL, maxsamplerate);
 	}
 
 #endif
@@ -12054,7 +12060,11 @@ void *Calltable::processAudioQueueThread(void *audioQueueThread) {
 		calltable->unlock_calls_audioqueue();
 		if(call) {
 			if(verbosity > 0) printf("converting RAW file to WAV %s\n", call->fbasename);
-			call->convertRawToWav();
+			// Must start as NULL: convertRawToWav() stores a result through the
+			// out-pointer only when it actually creates a transcribe call, and
+			// the test below would otherwise read an indeterminate pointer.
+			Transcribe::sCall *transcribe_call = NULL;
+			call->convertRawToWav((void**)&transcribe_call);
+			if(enable_audio_transcribe(call) && transcribe_call) {
+				transcribePushCall(transcribe_call);
+			}
 			if(useChartsCacheOrCdrStatInProcessCall()) {
 				calltable->lock_calls_charts_cache_queue();
 				calltable->calls_charts_cache_queue.push_back(sChartsCallData(sChartsCallData::_call, call));
@@ -15794,6 +15804,7 @@ string printCallFlags(unsigned long int flags) {
 	if(flags & FLAG_SAVEAUDIO)		outStr << "saveaudio ";
 	if(flags & FLAG_FORMATAUDIO_WAV)	outStr << "format_wav ";
 	if(flags & FLAG_FORMATAUDIO_OGG)	outStr << "format_ogg ";
+	if(flags & FLAG_AUDIOTRANSCRIBE)	outStr << "audio_transcribe ";
 	if(flags & FLAG_SAVEGRAPH)		outStr << "savegraph ";
 	if(flags & FLAG_SAVERTPHEADER)		outStr << "savertpheader ";
 	if(flags & FLAG_SAVERTP_VIDEO_HEADER)	outStr << "savertp_video_header ";

diff --git a/calltable.h b/calltable.h
@@ -124,6 +124,7 @@ typedef vector<RTP*> CALL_RTP_DYNAMIC_ARRAY_TYPE;
 #define FLAG_SAVENOTIFYPCAP		(1 << 25)
 #define FLAG_SAVESUBSCRIBEDB		(1 << 26)
 #define FLAG_SAVESUBSCRIBEPCAP		(1 << 27)
+#define FLAG_AUDIOTRANSCRIBE		(1 << 28)
 
 #define CDR_CHANGE_SRC_PORT_CALLER	(1 << 0)
 #define CDR_CHANGE_SRC_PORT_CALLED	(1 << 1)
@@ -1719,7 +1720,7 @@ class Call : public CallStructs, public Call_abstract {
 	 * @brief convert raw files to one WAV
 	 *
 	*/
-	int convertRawToWav();
+	int convertRawToWav(void **transcribe_call);
 
 	void selectRtpAB();
 

diff --git a/common.h b/common.h
@@ -153,6 +153,7 @@ struct sVerbose {
 	int diameter_assign;
 	int rdtsc;
 	int suppress_drop_partitions;
+	int whisper;
 	int _debug1;
 	int _debug2;
 	int _debug3;

diff --git a/filter_mysql.cpp b/filter_mysql.cpp
@@ -53,6 +53,7 @@ void filter_base::_loadBaseDataRow(SqlDb_row *sqlRow, map<string, string> *row,
 	baseRow->dtmf = _value_is_null(sqlRow, row, "dtmf") ? -1 : _value(sqlRow, row, "dtmf");
 	baseRow->graph = _value_is_null(sqlRow, row, "graph") ? -1 : _value(sqlRow, row, "graph");
 	baseRow->wav = _value_is_null(sqlRow, row, "wav") ? -1 : _value(sqlRow, row, "wav");
+	baseRow->audio_transcribe = _value_is_null(sqlRow, row, "audio_transcribe") ? -1 : _value(sqlRow, row, "audio_transcribe");
 	baseRow->skip = _value_is_null(sqlRow, row, "skip") ? -1 : _value(sqlRow, row, "skip");
 	baseRow->script = _value_is_null(sqlRow, row, "script") ? -1 : _value(sqlRow, row, "script");
 	baseRow->mos_lqo = _value_is_null(sqlRow, row, "mos_lqo") ? -1 : _value(sqlRow, row, "mos_lqo");
@@ -108,6 +109,9 @@ u_int64_t filter_base::getFlagsFromBaseData(filter_db_row_base *baseRow, u_int32
 	else if(baseRow->wav == 3)		flags |= _FLAG_AUDIO_OGG;
 	else if(baseRow->wav == 0)		flags |= _FLAG_NOWAV;
 
+	if(baseRow->audio_transcribe == 1)	flags |= _FLAG_AUDIO_TRANSCRIBE;
+	else if(baseRow->audio_transcribe == 0)	flags |= _FLAG_NO_AUDIO_TRANSCRIBE;
+
 	if(baseRow->skip == 1)			flags |= _FLAG_SKIP;
 	else if(baseRow->skip == 0)		flags |= _FLAG_NOSKIP;
 
@@ -175,6 +179,9 @@ void filter_base::setCallFlagsFromFilterFlags(volatile unsigned long int *callFl
 	if(filterFlags & _FLAG_AUDIO_OGG)		{*callFlags |= FLAG_SAVEAUDIO_OGG; *callFlags &= ~FLAG_FORMATAUDIO_WAV;}
 	if(filterFlags & _FLAG_NOWAV)			*callFlags &= ~FLAG_SAVEAUDIO;
 
+	if(filterFlags & _FLAG_AUDIO_TRANSCRIBE)	*callFlags |= FLAG_AUDIOTRANSCRIBE;
+	if(filterFlags & _FLAG_NO_AUDIO_TRANSCRIBE)	*callFlags &= ~FLAG_AUDIOTRANSCRIBE;
+
 	if(filterFlags & _FLAG_GRAPH)			*callFlags |= FLAG_SAVEGRAPH;
 	if(filterFlags & _FLAG_NOGRAPH)			*callFlags &= ~FLAG_SAVEGRAPH;