Hint compiler about memory aliasing restrictions

arkq · Nov 20, 2024 · 1ecfc8c
1 parent c927a9e
commit 1ecfc8c
Show file tree

Hide file tree

Showing 9 changed files with 72 additions and 75 deletions.
diff --git a/README.md b/README.md
@@ -62,7 +62,7 @@ libraries (except original Qualcomm libraries) were compiled with Clang version
 | aptxHD100                              | &mdash;   | &mdash; | 1m21.950s | 0.89616 |
 | [libopenaptx-0.2.0][2]                 | 1m22.090s | 0.89062 | 1m25.730s | 0.85429 |
 
-[1]: ./archive "Archive with Qualcomm apt-X encoding libraries"
+[1]: archive/aarch64 "Archive with Qualcomm apt-X encoding libraries"
 [2]: https://github.com/pali/libopenaptx "The apt-X encoder/decoder based on FFmpeg code"
 
 ## Resources

diff --git a/src/aptx422/processor.c b/src/aptx422/processor.c
@@ -34,7 +34,9 @@ void aptX_prediction_filtering(int32_t a, aptX_prediction_filter_422 * f) {
 	int32_t tmp1 = a + f->unk8;
 	clamp_int24_t(tmp1);
 
-	int32_t tmp2 = ((int64_t)tmp1 * f->unk2 + (int64_t)f->unk3 * f->unk6) >> 22;
+	int64_t x1 = (int64_t)f->unk3 * f->unk6;
+	int64_t x2 = (int64_t)tmp1 * f->unk2;
+	int32_t tmp2 = (x1 + x2) >> 22;
 	clamp_int24_t(tmp2);
 
 	int32_t v1 = 128;
@@ -44,24 +46,21 @@ void aptX_prediction_filtering(int32_t a, aptX_prediction_filter_422 * f) {
 		v2 = ((a >> 31) & 0xFF000000) + 8388736;
 	}
 
-	int32_t * q = &f->arr2[f->i + f->width];
+	size_t q = f->i + f->width;
 	int64_t sum = 0;
-	int32_t c = a;
-
-	f->i = (f->i + 1) % f->width;
-	f->subband_param_unk3_3 = a;
+	int64_t c = a;
 
 	for (size_t i = 0; i < (size_t)f->width; i++, q--) {
 
 		int32_t tmp;
-		if (*q >= 0)
+		if (f->arr2[q] >= 0)
 			tmp = v2 - f->arr1[i];
 		else
 			tmp = v1 - f->arr1[i];
 
 		f->arr1[i] += (tmp >> 8) - (((uint32_t)tmp) << 23 == 0x80000000);
-		sum += (int64_t)f->arr1[i] * c;
-		c = *q;
+		sum += c * f->arr1[i];
+		c = f->arr2[q];
 	}
 
 	f->unk6 = tmp1;
@@ -70,6 +69,8 @@ void aptX_prediction_filtering(int32_t a, aptX_prediction_filter_422 * f) {
 	f->unk8 = f->unk7 + tmp2;
 	clamp_int24_t(f->unk8);
 
+	f->i = (f->i + 1) % f->width;
+
 	f->arr2[f->i] = a;
 	f->arr2[f->i + f->width] = a;
 }
@@ -88,15 +89,15 @@ void aptX_process_subband(int32_t a, int32_t dither, aptX_prediction_filter_422
 		f->sign2 = f->sign1;
 		f->sign1 = -1;
 	}
-	if (tmp == 0) {
-		sign1 *= 0;
-		sign2 *= 0;
+	else if (tmp > 0) {
+		sign1 *= 1;
+		sign2 *= 1;
 		f->sign2 = f->sign1;
 		f->sign1 = 1;
 	}
-	if (tmp > 0) {
-		sign1 *= 1;
-		sign2 *= 1;
+	else {
+		sign1 *= 0;
+		sign2 *= 0;
 		f->sign2 = f->sign1;
 		f->sign1 = 1;
 	}

diff --git a/src/aptx422/qmf.c b/src/aptx422/qmf.c
@@ -61,7 +61,8 @@ void aptX_QMF_conv_inner(const int32_t s1[16], const int32_t s2[16], int32_t * o
 	*out_b = r2;
 }
 
-void aptX_QMF_analysis(aptX_QMF_analyzer_422 * qmf, const int32_t samples[4], const int32_t refs[4], int32_t diff[4]) {
+void aptX_QMF_analysis(aptX_QMF_analyzer_422 * restrict qmf, const int32_t samples[restrict 4],
+                       const int32_t refs[restrict 4], int32_t diff[restrict 4]) {
 
 	int32_t a, b, c, d;
 	int32_t tmp[4];

diff --git a/src/aptx422/search.c b/src/aptx422/search.c
@@ -16,15 +16,11 @@ static size_t aptX_search_quant_coeff(uint32_t a, int32_t x, const int32_t * dat
 	 * integer space. The search is done using a simple binary search algorithm. */
 
 	int64_t aa = (int64_t)a << 32;
+	int64_t xx = x << 8;
 	size_t i = 0;
-	size_t n;
-
-	for (n = size / 2; n > 0; n /= 2)
-		/* XXX: There might be a potential error during calculation, because it
-		 *      seems that the subtraction is performed as an unsigned operation.
-		 *      Anyway, this algorithm and the original one (from the apt-X lib)
-		 *      have been stress-tested and both return the same values. */
-		if ((int64_t)data[i + n] * (x << 8) - aa <= 0)
+
+	for (size_t n = size / 2; n > 0; n /= 2)
+		if (xx * data[i + n] <= aa)
 			i += n;
 
 	return i;

diff --git a/src/aptxhd100/processor.c b/src/aptxhd100/processor.c
@@ -9,7 +9,6 @@
  */
 
 #include "processor.h"
-#include <stdio.h>
 
 #include "mathex.h"
 
@@ -21,7 +20,7 @@ void aptXHD_invert_quantization(int32_t a, int32_t dither, aptXHD_inverter_100 *
 	int64_t tmp = (int64_t)dither * i->subband_param_dith16_sf1[i_];
 	tmp = rshift32(((int64_t)sl1 << 31) + tmp);
 	clamp_int24_t(tmp);
-	i->unk11 = (i->unk9 * tmp) >> 19;
+	i->unk11 = (tmp * i->unk9) >> 19;
 	clamp_int24_t(i->unk11);
 
 	i->unk10 = rshift15(32620 * i->unk10 + (i->subband_param_incr16[i_] << 15));
@@ -33,82 +32,82 @@ void aptXHD_invert_quantization(int32_t a, int32_t dither, aptXHD_inverter_100 *
 
 void aptXHD_prediction_filtering(int32_t a, aptXHD_prediction_filter_100 * f) {
 
-	uint64_t x1 = (unsigned)f->unk6 * (uint64_t)(unsigned)f->unk3;
-	x1 += (uint64_t)(f->unk6 * (f->unk3 >> 31) + f->unk3 * (f->unk6 >> 31)) << 32;
+	int32_t tmp1 = a + f->unk8;
+	clamp_int24_t(tmp1);
 
-	f->unk6 = a + f->unk8;
-	clamp_int24_t(f->unk6);
+	int64_t x1 = (int64_t)f->unk3 * f->unk6;
+	int64_t x2 = (int64_t)tmp1 * f->unk2;
+	int32_t tmp2 = (x1 + x2) >> 22;
+	clamp_int24_t(tmp2);
 
-	uint64_t x2 = (unsigned)f->unk2 * (uint64_t)(unsigned)f->unk6;
-	x2 += (uint64_t)(f->unk6 * (f->unk2 >> 31) + f->unk2 * (f->unk6 >> 31)) << 32;
-
-	f->unk8 = (x1 + x2) >> 22;
-	clamp_int24_t(f->unk8);
-
-	int32_t v1 = 0x80;
-	int32_t v2 = 0x80;
+	int32_t v1 = 128;
+	int32_t v2 = 128;
 	if (a) {
-		v1 = ((a >> 31) & 0x01000000) - 0x7FFF80;
-		v2 = ((a >> 31) & 0xFF000000) + 0x800080;
+		v1 = ((a >> 31) & 0x01000000) - 8388480;
+		v2 = ((a >> 31) & 0xFF000000) + 8388736;
 	}
 
+	size_t q = f->i + f->width;
 	int64_t sum = 0;
 	int64_t c = a;
 
-	for (size_t i = 0; i < (size_t)f->width; i++) {
+	for (size_t i = 0; i < (size_t)f->width; i++, q--) {
 
 		int32_t tmp;
-		if (f->arr2[f->i + f->width - i] >= 0)
+		if (f->arr2[q] >= 0)
 			tmp = v2 - f->arr1[i];
 		else
 			tmp = v1 - f->arr1[i];
 
 		f->arr1[i] += (tmp >> 8) - (((uint32_t)tmp) << 23 == 0x80000000);
-
 		sum += c * f->arr1[i];
-		c = f->arr2[f->i + f->width - i];
+		c = f->arr2[q];
 	}
 
+	f->unk6 = tmp1;
 	f->unk7 = sum >> 22;
 	clamp_int24_t(f->unk7);
-	f->unk8 = f->unk7 + f->unk8;
+	f->unk8 = f->unk7 + tmp2;
 	clamp_int24_t(f->unk8);
 
 	f->i = (f->i + 1) % f->width;
+
 	f->arr2[f->i] = a;
 	f->arr2[f->i + f->width] = a;
-	f->subband_param_unk3_3 = a;
 }
 
 void aptXHD_process_subband(int32_t a, int32_t dither, aptXHD_prediction_filter_100 * f, aptXHD_inverter_100 * i) {
 
 	aptXHD_invert_quantization(a, dither, i);
 
+	int32_t sign1 = f->sign1;
+	int32_t sign2 = f->sign2;
+
 	int32_t tmp = f->unk7 + i->unk11;
-	int sign1 = f->sign1;
-	int sign2 = f->sign2;
-	if (tmp > 0) {
-		f->sign1 = 1;
-		f->sign2 = sign1;
-	} else if (tmp < 0) {
-		f->sign1 = -1;
-		f->sign2 = sign1;
+	if (tmp < 0) {
 		sign1 *= -1;
 		sign2 *= -1;
-	} else {
+		f->sign2 = f->sign1;
+		f->sign1 = -1;
+	}
+	else if (tmp > 0) {
+		sign1 *= 1;
+		sign2 *= 1;
+		f->sign2 = f->sign1;
+		f->sign1 = 1;
+	}
+	else {
+		sign1 *= 0;
+		sign2 *= 0;
+		f->sign2 = f->sign1;
 		f->sign1 = 1;
-		f->sign2 = sign1;
-		sign1 = 0;
-		sign2 = 0;
 	}
 
 	tmp = -1 * f->unk2 * sign1;
 	tmp = ((tmp + 1) >> 1) - ((tmp & 3) == 1);
-
-	tmp = tmp + 0x80000 * sign2;
 	clip_range(tmp, -0x100000, 0x100000);
 
-	f->unk3 = 254 * f->unk3 + (tmp >> 4 << 8);
+	f->unk3 = 254 * f->unk3 + 0x800000 * sign2 + (tmp >> 4 << 8);
 	f->unk3 = rshift8(f->unk3);
 	clip_range(f->unk3, -0x300000, 0x300000);
 

diff --git a/src/aptxhd100/qmf.c b/src/aptxhd100/qmf.c
@@ -61,10 +61,11 @@ void aptXHD_QMF_conv_inner(const int32_t s1[16], const int32_t s2[16], int32_t *
 	*out_b = r2;
 }
 
-void aptXHD_QMF_analysis(aptXHD_QMF_analyzer_100 * qmf, const int32_t samples[4], const int32_t refs[4],
-                         int32_t diff[4]) {
+void aptXHD_QMF_analysis(aptXHD_QMF_analyzer_100 * restrict qmf, const int32_t samples[restrict 4],
+                         const int32_t refs[restrict 4], int32_t diff[restrict 4]) {
 
 	int32_t a, b, c, d;
+	int32_t tmp[4];
 
 	qmf->outer[0][qmf->i_outer + 0] = samples[0];
 	qmf->outer[0][qmf->i_outer + 16] = samples[0];
@@ -96,12 +97,12 @@ void aptXHD_QMF_analysis(aptXHD_QMF_analyzer_100 * qmf, const int32_t samples[4]
 
 	qmf->i_inner = (qmf->i_inner + 1) % 16;
 
-	aptXHD_QMF_conv_inner(&qmf->inner[2][qmf->i_inner + 15], &qmf->inner[0][qmf->i_inner], &diff[0], &diff[1]);
+	aptXHD_QMF_conv_inner(&qmf->inner[2][qmf->i_inner + 15], &qmf->inner[0][qmf->i_inner], &tmp[0], &tmp[1]);
 
-	aptXHD_QMF_conv_inner(&qmf->inner[1][qmf->i_inner + 15], &qmf->inner[3][qmf->i_inner], &diff[2], &diff[3]);
+	aptXHD_QMF_conv_inner(&qmf->inner[1][qmf->i_inner + 15], &qmf->inner[3][qmf->i_inner], &tmp[2], &tmp[3]);
 
 	for (size_t i = 0; i < 4; i++)
-		diff[i] -= refs[i];
+		diff[i] = tmp[i] - refs[i];
 	for (size_t i = 0; i < 4; i++)
 		clamp_int24_t(diff[i]);
 }
diff --git a/src/aptxhd100/quantizer.c b/src/aptxhd100/quantizer.c
@@ -29,10 +29,10 @@ static void aptXHD_quantize_difference(int32_t diff, int32_t dither, int32_t qua
 	int absdiff = abs32(diff);
 	clamp_int24_t(absdiff);
 
-	int64_t v3 = v2 * 16 * (int64_t)(quant * -256);
-	q->unk3 = rshift3((v3 >> 32) + absdiff);
+	int32_t v3 = rshift32((int64_t)(v2 << 4) * (quant * -1 << 8)) + absdiff;
+	q->unk3 = ((v3 + 4) >> 3) - ((uint8_t)(v3 << 5) == 0x80);
 
-	if (absdiff + (v3 >> 32) < 0) {
+	if (q->unk3 < 0) {
 		q->unk2 = q->unk1;
 		q->unk1 = q->unk1 - 1;
 		q->unk3 = -q->unk3;

diff --git a/src/aptxhd100/search.c b/src/aptxhd100/search.c
@@ -15,9 +15,8 @@ static size_t aptXHD_search_quant_coeff(uint32_t a, int32_t x, const int32_t * d
 	int64_t aa = (int64_t)a << 32;
 	int64_t xx = x << 8;
 	size_t i = 0;
-	size_t n;
 
-	for (n = size / 2; n > 0; n /= 2)
+	for (size_t n = size / 2; n > 0; n /= 2)
 		if (xx * data[i + n] <= aa)
 			i += n;
 

diff --git a/test/heval-hd100.c b/test/heval-hd100.c
@@ -67,9 +67,9 @@ static int eval_init(size_t nloops, bool errstop) {
 		aptxhdbtenc_init(&enc_100, endian);
 		aptXHD_init(&enc_new, endian);
 
-		int c, b, ret = 0;
-		for (c = 0; c < APTXHD_CHANNELS; c++)
-			for (b = 0; b < APTXHD_SUBBANDS; b++) {
+		int ret = 0;
+		for (size_t c = 0; c < APTXHD_CHANNELS; c++)
+			for (size_t b = 0; b < APTXHD_SUBBANDS; b++) {
 				for (size_t i = 0; i < param_sizes[b]; i++)
 					ret |= diffint("bit16", enc_new.encoder[c].processor[b].inverter.subband_param_bit16_sl1[i],
 					               enc_100.encoder[c].processor[b].inverter.subband_param_bit16_sl1[i]);