From b9b99e73b2482bf568ee3ba679ec9c2f760fa350 Mon Sep 17 00:00:00 2001
From: Andy Ross <andyross@google.com>
Date: Thu, 14 Dec 2023 15:21:54 -0800
Subject: [PATCH] Hacky fixups

There were multiple bugs with the buffer copying and interpretation.
Turns out that the microphone input was NOT lying: it really is giving
us 4 channels of S16_LE, contra what the downstream ALSA endpoint has
(S32_LE).  That seems like a topology bug?

But the reference input WAS lying.  It was claiming 4 channels but
providing only two, leading to underruns.

Combined with buffer copying bugs, the end result was that we were
moving the right amount of bytes but in the wrong format.

I think this is right now, but it needs a bunch of cleanup still.
Commit so I have a reference to fall back on.
---
 app/boards/intel_adsp_ace15_mtpm.conf         |   1 -
 .../google/google_rtc_audio_processing.c      | 173 +++++++++++++-----
 2 files changed, 127 insertions(+), 47 deletions(-)

diff --git a/app/boards/intel_adsp_ace15_mtpm.conf b/app/boards/intel_adsp_ace15_mtpm.conf
index 36f645cb0d02..4f4c86c231ef 100644
--- a/app/boards/intel_adsp_ace15_mtpm.conf
+++ b/app/boards/intel_adsp_ace15_mtpm.conf
@@ -95,6 +95,5 @@ CONFIG_PROBE_DMA_MAX=2
 CONFIG_KCPS_DYNAMIC_CLOCK_CONTROL=y
 CONFIG_COMP_GOOGLE_RTC_AUDIO_PROCESSING=y
 CONFIG_COMP_GOOGLE_RTC_AUDIO_PROCESSING_NUM_CHANNELS=2
-CONFIG_GOOGLE_RTC_AUDIO_PROCESSING_MIC_BITS=32
 
 CONFIG_COMP_STUBS=y
\ No newline at end of file
diff --git a/src/audio/google/google_rtc_audio_processing.c b/src/audio/google/google_rtc_audio_processing.c
index 026248a40840..557acc8c9d16 100644
--- a/src/audio/google/google_rtc_audio_processing.c
+++ b/src/audio/google/google_rtc_audio_processing.c
@@ -91,8 +91,8 @@ struct google_rtc_audio_processing_comp_data {
 	int aec_reference_source;
 	int raw_microphone_source;
 	struct comp_buffer *ref_comp_buffer;
-	int ref_frame_bytes;
-	int out_frame_bytes;
+	int ref_framesz;
+	int cap_framesz;
 };
 
 #if CONFIG_GOOGLE_RTC_AUDIO_PROCESSING_MIC_BITS == 16
@@ -591,6 +591,8 @@ static int google_rtc_audio_processing_prepare(struct processing_module *mod,
 	int out_chan = sink_get_channels(sinks[0]);
 	int out_rate = sink_get_rate(sinks[0]);
 
+	ref_chan = 2;
+
 	/* Too many channels is a soft failure, AEC treats only the first N */
 	if (ref_chan > REF_CHAN_MAX)
 		comp_warn(dev, "Too many ref channels: %d, truncating to %d",
@@ -604,6 +606,16 @@ static int google_rtc_audio_processing_prepare(struct processing_module *mod,
 		ret = -EINVAL;
 	}
 
+	/* But workaround: the ref source on MTL claims 4 channels,
+	 * but only provides two.  Use our kconfig-derived vlaue
+	 * instead, but warn about it.
+	 */
+	if (ref_chan != REF_CHAN_MAX) {
+		comp_err(dev, "Incorrect ref channels %d, setting %d\n",
+			 ref_chan, REF_CHAN_MAX);
+		ref_chan = REF_CHAN_MAX;
+	}
+
 	cd->num_aec_reference_channels = MIN(ref_chan, REF_CHAN_MAX);
 	cd->num_capture_channels = MIN(mic_chan, MIC_CHAN_MAX);
 
@@ -624,8 +636,7 @@ static int google_rtc_audio_processing_prepare(struct processing_module *mod,
 		comp_err(dev, "Mic stream must be %d bit samples\n",
 			 CONFIG_GOOGLE_RTC_AUDIO_PROCESSING_MIC_BITS);
 
-		// FIXME: squash for now, the streams lie.  See below.
-		//ret = -EINVAL;
+		ret = -EINVAL;
 	}
 
 	if (mic_fmt != out_fmt) {
@@ -636,10 +647,10 @@ static int google_rtc_audio_processing_prepare(struct processing_module *mod,
 	// FIXME: the streams have a bad format on MTL, so we can't
 	// use this API.  Compute by hand.
 	//
-	//cd->ref_frame_bytes = source_get_frame_bytes(sources[cd->aec_reference_source]);
-	//cd->out_frame_bytes = sink_get_frame_bytes(sinks[0]);
-	cd->ref_frame_bytes = sizeof(mic_sample_t) * source_get_channels(sources[cd->aec_reference_source]);
-	cd->out_frame_bytes = cd->ref_frame_bytes;
+	//cd->ref_framesz = source_get_frame_bytes(sources[cd->aec_reference_source]);
+	//cd->cap_framesz = sink_get_frame_bytes(sinks[0]);
+	cd->ref_framesz = sizeof(ref_sample_t) * ref_chan;
+	cd->cap_framesz = sizeof(mic_sample_t) * mic_chan;
 
 	ret = GoogleRtcAudioProcessingSetStreamFormats(cd->state, mic_rate,
 						       cd->num_capture_channels,
@@ -684,6 +695,9 @@ static int google_rtc_audio_processing_reset(struct processing_module *mod)
 /* FunctionMostlyExistsToKeepLineLengthsUnderControl */
 static inline void execute_aec(struct google_rtc_audio_processing_comp_data *cd)
 {
+	static int tlast;
+	int t0 = k_cycle_get_32();
+
 	/* Note that reference input and mic output share the same
 	 * buffer for efficiency
 	 */
@@ -693,11 +707,16 @@ static inline void execute_aec(struct google_rtc_audio_processing_comp_data *cd)
 						       (const float **)cd->raw_mic_buffers,
 						       cd->refout_buffers);
 	cd->buffered_frames = 0;
+
+	int t1 = k_cycle_get_32();
+	//printk("A took %d, dt %d\n", k_cyc_to_us_floor32(t1 - t0), k_cyc_to_us_floor32(t0 - tlast));
+	tlast = t0;
 }
 
 static void mic_in_copy(struct sof_source *src, int frames, float **dst_bufs, int frame0)
 {
-	size_t chan = MIN(MIC_CHAN_MAX, source_get_channels(src));
+	size_t chan = source_get_channels(src);
+	//size_t chan = MIC_CHAN_MAX;
 	size_t samples = frames * chan;
 	size_t bytes = samples * sizeof(mic_sample_t);
 	const mic_sample_t *buf, *bufstart, *bufend;
@@ -705,19 +724,25 @@ static void mic_in_copy(struct sof_source *src, int frames, float **dst_bufs, in
 	int i, c, err;
 	size_t bufsz;
 
-	for (i = 0; i < chan; i++)
+	__ASSERT(chan == 4, "");
+	__ASSERT(sizeof(mic_sample_t) == 2, "");
+
+	assert(MIC_CHAN_MAX <= chan); /* we truncate only */
+	__ASSERT(frames + frame0 <= NUM_FRAMES, "frames %d frame0 %d", frames, frame0);
+
+	for (i = 0; i < MIC_CHAN_MAX; i++)
 		dst[i] = &dst_bufs[i][frame0];
 
 	err = source_get_data(src, bytes, (void *)&buf, (void *)&bufstart, &bufsz);
-	assert(err == 0);
+	__ASSERT(err == 0, "err not zero: %d, bytes %d", err, bytes);
 	bufend = &bufstart[bufsz];
 
 	for (i = 0; i < frames; i++) {
-		for  (c = 0; c < chan; c++) {
+		for  (c = 0; c < MIC_CHAN_MAX; c++)
 			*dst[c]++ = mic_to_float(*buf++);
-			if (buf >= bufend)
-				buf = bufstart;
-		}
+		buf += chan - MIC_CHAN_MAX; /* skip unused channels */
+		if (buf >= bufend)
+			buf = bufstart;
 	}
 	source_release_data(src, bytes);
 }
@@ -725,7 +750,9 @@ static void mic_in_copy(struct sof_source *src, int frames, float **dst_bufs, in
 /* Nearly verbatim except for types.  Needs macro/inlining attention */
 static void ref_copy(struct sof_source *src, int frames, float **dst_bufs, int frame0)
 {
-	size_t chan = MIN(REF_CHAN_MAX, source_get_channels(src));
+	// FIXME: MTL passes 4 channels, not 2
+	//size_t chan = source_get_channels(src);
+	size_t chan = REF_CHAN_MAX;
 	size_t samples = frames * chan;
 	size_t bytes = samples * sizeof(ref_sample_t);
 	const ref_sample_t *buf, *bufstart, *bufend;
@@ -733,7 +760,10 @@ static void ref_copy(struct sof_source *src, int frames, float **dst_bufs, int f
 	int i, c, err;
 	size_t bufsz;
 
-	for (i = 0; i < chan; i++)
+	__ASSERT(chan == 2, "");
+	__ASSERT(sizeof(ref_sample_t) == 2, "");
+
+	for (i = 0; i < REF_CHAN_MAX; i++)
 		dst[i] = &dst_bufs[i][frame0];
 
 	err = source_get_data(src, bytes, (void *)&buf, (void *)&bufstart, &bufsz);
@@ -741,18 +771,17 @@ static void ref_copy(struct sof_source *src, int frames, float **dst_bufs, int f
 	bufend = &bufstart[bufsz];
 
 	for (i = 0; i < frames; i++) {
-		for  (c = 0; c < chan; c++) {
+		for (c = 0; c < chan; c++)
 			*dst[c]++ = ref_to_float(*buf++);
-			if (buf >= bufend)
-				buf = bufstart;
-		}
+		if (buf >= bufend)
+			buf = bufstart;
 	}
 	source_release_data(src, bytes);
 }
 
 static void mic_out_copy(struct sof_sink *sink, int frames, float **src_bufs)
 {
-	size_t chan = MIN(MIC_CHAN_MAX, sink_get_channels(sink));
+	size_t chan = sink_get_channels(sink);
 	size_t samples = frames * chan;
 	size_t bytes = samples * sizeof(mic_sample_t);
 	mic_sample_t *buf, *bufstart, *bufend;
@@ -760,19 +789,24 @@ static void mic_out_copy(struct sof_sink *sink, int frames, float **src_bufs)
 	size_t bufsz;
 	float *src[MIC_CHAN_MAX];
 
-	for (i = 0; i < chan; i++)
+	__ASSERT(chan == 4, "");
+	__ASSERT(sizeof(mic_sample_t) == 2, "");
+
+	for (i = 0; i < MIC_CHAN_MAX; i++)
 		src[i] = src_bufs[i];
 
+	//printk("sink %d free %d\n", bytes, sink_get_free_size(sink));
 	err = sink_get_buffer(sink, bytes, (void *)&buf, (void *)&bufstart, &bufsz);
 	assert(err == 0);
 	bufend = &bufstart[bufsz];
 
 	for (i = 0; i < frames; i++) {
-		for  (c = 0; c < chan; c++) {
+		for (c = 0; c < MIC_CHAN_MAX; c++)
 			*buf++ = float_to_mic(*src[c]++);
-			if (buf >= bufend)
-				buf = bufstart;
-		}
+		for (/**/; c < chan; c++)
+			*buf++ = 0; /* clear unused channels */
+		if (buf >= bufend)
+			buf = bufstart;
 	}
 	sink_commit_buffer(sink, bytes);
 }
@@ -803,36 +837,52 @@ static int mod_process(struct processing_module *mod, struct sof_source **source
 	if (!ref_ok)
 		bzero(refoutbuf, sizeof(refoutbuf));
 
-	int fmic = source_get_data_frames_available(mic);
-	int fref = source_get_data_frames_available(ref);
+	// FIXME: SOF lies about these values
+	//int fmic = source_get_data_frames_available(mic);
+	//int fref = source_get_data_frames_available(ref);
+	int fmic = source_get_data_available(mic) / cd->cap_framesz;
+	int fref = source_get_data_available(ref) / cd->ref_framesz;
 	int frames = ref_ok ? MIN(fmic, fref) : fmic;
 	int n, frames_rem;
 
+#if 0
+	static int tlast;
+	int now = k_cycle_get_32();
+	printk("F%d (m %db r %db) dt %d\n", frames, source_get_data_available(mic), source_get_data_available(ref), k_cyc_to_us_floor32(now - tlast));
+	tlast = now;
+	if (fmic < 300) return 0; //DEBUG
+#endif
+
 	/* If fref > fmic (common at pipeline startup if
 	 * playback was already active), we should consume the early
 	 * samples so AEC compares the most recent values.
 	 */
 	if (ref_ok && fref > fmic)
-		source_release_data(ref, (fref - fmic) * cd->ref_frame_bytes);
+		source_release_data(ref, (fref - fmic) * cd->ref_framesz);
 
 	for (frames_rem = frames; frames_rem; frames_rem -= n) {
 		n = MIN(frames_rem, cd->num_frames - cd->buffered_frames);
 
-		int si = cd->buffered_frames * source_get_channels(mic);
-
-		mic_in_copy(mic, n, cd->raw_mic_buffers, si);
+		mic_in_copy(mic, n, cd->raw_mic_buffers, cd->buffered_frames);
 
 		if (ref_ok)
-			ref_copy(ref, n, cd->refout_buffers, si);
+			ref_copy(ref, n, cd->refout_buffers, cd->buffered_frames);
 
 		cd->buffered_frames += n;
 
+		__ASSERT(cd->buffered_frames <= NUM_FRAMES, "bf %d", cd->buffered_frames);
 		if (cd->buffered_frames >= cd->num_frames) {
 			execute_aec(cd);
-			mic_out_copy(out, n, cd->refout_buffers);
+			__ASSERT(cd->buffered_frames == 0, "bf %d", cd->buffered_frames);
+			mic_out_copy(out, cd->num_frames, cd->refout_buffers);
+
+			/* Safety valve. */
+			if (sink_get_free_size(out) < cd->num_frames * cd->cap_framesz) {
+				//comp_warn(mod->dev, "Two AEC's in one process()\n");
+				break;
+			}
 		}
 	}
-
 	return 0;
 }
 
@@ -841,28 +891,59 @@ static bool mod_is_ready_to_process(struct processing_module *mod,
 				    struct sof_sink **sinks, int num_of_sinks)
 {
 	struct google_rtc_audio_processing_comp_data *cd = module_get_private_data(mod);
+#if 0
+	static int count, cyc, last_cyc;
+	cyc = k_cycle_get_32();
+	if((count++ % 100) == 0)
+		printk("RDY  CPU%d us %d\n", arch_curr_cpu()->id, k_cyc_to_us_floor32(cyc-last_cyc));
+	last_cyc = cyc;
+#endif
+
+	/* AEC produces its output in a single 10ms chunk, so we need
+	 * at least that much space in the output buffer.  We're
+	 * otherwise happy to process any amount of input; it's
+	 * accumulated in a relatively cheap copy, so frontload that
+	 * to the extent possible.
+	 */
+	return sink_get_free_size(sinks[0]) >= cd->num_frames * cd->cap_framesz;
+#if 0
+	struct google_rtc_audio_processing_comp_data *cd = module_get_private_data(mod);
 	struct sof_source *mic = sources[cd->raw_microphone_source];
 	struct sof_source *ref = sources[cd->aec_reference_source];
 	struct sof_sink *out = sinks[0];
 	bool ref_ok = ref_stream_active(cd);
 
-	/* this should  source_get_min_available(ref_stream)!!!
-	 * Currently the topology sets IBS incorrectly
-	 */
-	if (ref_ok && (source_get_data_available(ref)
-		       < cd->num_frames * cd->ref_frame_bytes))
+	static int ref_need, mic_need, out_need;
+
+	if (!ref_need) {
+		ref_need = cd->num_frames * cd->ref_framesz;
+		mic_need = source_get_min_available(mic);
+		out_need = cd->num_frames * cd->cap_framesz;
+	}
+
+	static int count, cyc, last_cyc;
+	cyc = k_cycle_get_32();
+	if((count++ % 1000) == 0)
+		printk("RDY  CPU%d us %d\n", arch_curr_cpu()->id, k_cyc_to_us_floor32(cyc-last_cyc));
+	last_cyc = cyc;
+
+	int nref = source_get_data_available(ref);
+	int nmic = source_get_data_available(mic);
+	int nout = sink_get_free_size(out);
+
+#if 0
+	if (!ref_ok || !nref)
 		return false;
 
-	if (source_get_data_available(mic) < source_get_min_available(mic))
+	if (nmic == 0)
 		return false;
+#endif
 
-	/* Output comes out all at once, the output sink much have
-	 * space for the full block
-	 */
-	if (sink_get_free_size(out) < cd->num_frames * cd->out_frame_bytes)
+	if (nout < out_need)
 		return false;
 
 	return true;
+#endif
 }
 
 static struct module_interface google_rtc_audio_processing_interface = {