diff --git a/silk/LPC_analysis_filter.c b/silk/LPC_analysis_filter.c
index d34b5eb70..51a65e940 100644
--- a/silk/LPC_analysis_filter.c
+++ b/silk/LPC_analysis_filter.c
@@ -31,6 +31,8 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "SigProc_FIX.h"
 #include "celt_lpc.h"
+#include <arm_neon.h>
+#include "stack_alloc.h"
 
 /*******************************************/
 /* LPC analysis filter                     */
@@ -46,7 +48,7 @@ POSSIBILITY OF SUCH DAMAGE.
           C89-compliant. */
 #define USE_CELT_FIR 0
 
-void silk_LPC_analysis_filter(
+void silk_LPC_analysis_filter_c(
     opus_int16                  *out,               /* O    Output signal                               */
     const opus_int16            *in,                /* I    Input signal                                */
     const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order]     */
@@ -109,3 +111,68 @@ void silk_LPC_analysis_filter(
     silk_memset( out, 0, d * sizeof( opus_int16 ) );
 #endif
 }
+
+/* NEON-optimized LPC analysis filter - processes 8 coefficients at a time.
+ * Computes: out[i] = in[i] - sum(B[j] * in[i-1-j], j=0..d-1)
+ * where B is in Q12 format.
+ */
+void silk_LPC_analysis_filter(
+    opus_int16                  *out,               /* O    Output signal                               */
+    const opus_int16            *in,                /* I    Input signal                                */
+    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order]     */
+    const opus_int32            len,                /* I    Signal length                               */
+    const opus_int32            d,                  /* I    Filter order                                */
+    int                         arch                /* I    Run-time architecture                       */
+)
+{
+    int ix, j;
+    (void)arch;
+
+    celt_assert( d >= 6 );
+    celt_assert( ( d & 1 ) == 0 );
+    celt_assert( d <= len );
+
+    for( ix = d; ix < len; ix++ ) {
+        const opus_int16 *in_ptr = &in[ ix - 1 ];
+        int32x4_t acc0 = vdupq_n_s32( 0 );
+        int32x4_t acc1 = vdupq_n_s32( 0 );
+        int32x2_t sum;
+        opus_int32 out32_Q12, out32;
+
+        /* Process coefficients in groups of 8 */
+        for( j = 0; j < ( d & ~7 ); j += 8 ) {
+            int16x4_t b_vec0  = vld1_s16( &B[ j ] );
+            int16x4_t b_vec1  = vld1_s16( &B[ j + 4 ] );
+            int16x4_t in_vec0 = vld1_s16( &in_ptr[ -j - 3 ] );
+            int16x4_t in_vec1 = vld1_s16( &in_ptr[ -j - 7 ] );
+            in_vec0 = vrev64_s16( in_vec0 );
+            in_vec1 = vrev64_s16( in_vec1 );
+            acc0 = vmlal_s16( acc0, b_vec0, in_vec0 );
+            acc1 = vmlal_s16( acc1, b_vec1, in_vec1 );
+        }
+
+        acc0 = vaddq_s32( acc0, acc1 );
+        sum = vpadd_s32( vget_low_s32( acc0 ), vget_high_s32( acc0 ) );
+        out32_Q12 = vget_lane_s32( vpadd_s32( sum, sum ), 0 );
+
+        /* Handle remaining coefficients */
+        for( ; j < d; j++ ) {
+            out32_Q12 = silk_SMLABB_ovflw( out32_Q12, in_ptr[ -j ], B[ j ] );
+        }
+
+        /* Subtract prediction */
+        out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)in_ptr[ 1 ], 12 ), out32_Q12 );
+
+        /* Scale to Q0 and saturate */
+        out32 = silk_RSHIFT_ROUND( out32_Q12, 12 );
+        out[ ix ] = (opus_int16)silk_SAT16( out32 );
+    }
+
+    silk_memset( out, 0, d * sizeof( opus_int16 ) );
+
+#ifdef OPUS_CHECK_ASM
+    {
+        VARDECL( opus_int16, out_c );
+        SAVE_STACK;
+        ALLOC( out_c, len, opus_int16 );
+        silk_LPC_analysis_filter_c( out_c, in, B, len, d, arch );
+        silk_assert( !memcmp( out, out_c, len * sizeof( opus_int16 ) ) );
+        RESTORE_STACK;
+    }
+#endif
+}
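
Note: the scalar relationship the NEON kernel implements is easiest to check against a plain C reference. The sketch below is illustrative only (lpc_analysis_filter_ref is not part of the patch); it assumes Q12 coefficients and mirrors the round-and-saturate of silk_RSHIFT_ROUND/silk_SAT16:

    /* Scalar sketch: out[i] = sat16( round( (in[i] << 12) - sum_j B[j]*in[i-1-j] ) >> 12 ) */
    static void lpc_analysis_filter_ref( opus_int16 *out, const opus_int16 *in,
                                         const opus_int16 *B, int len, int d )
    {
        int ix, j;
        for( ix = d; ix < len; ix++ ) {
            opus_int32 pred_Q12 = 0;
            opus_int32 e_Q12, e;
            for( j = 0; j < d; j++ ) {
                pred_Q12 += (opus_int32)B[ j ] * in[ ix - 1 - j ];   /* Q12 * Q0 -> Q12 */
            }
            e_Q12 = ( (opus_int32)in[ ix ] << 12 ) - pred_Q12;
            e = ( e_Q12 + 2048 ) >> 12;                              /* round Q12 -> Q0 */
            if( e >  32767 ) e =  32767;                             /* saturate to int16 */
            if( e < -32768 ) e = -32768;
            out[ ix ] = (opus_int16)e;
        }
        memset( out, 0, d * sizeof( *out ) );                        /* first d outputs are zeroed */
    }
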
diff --git a/silk/arm/NSQ_del_dec_neon_intr.c b/silk/arm/NSQ_del_dec_neon_intr.c
index 668dde6dc..651824cbe 100644
--- a/silk/arm/NSQ_del_dec_neon_intr.c
+++ b/silk/arm/NSQ_del_dec_neon_intr.c
@@ -508,35 +508,93 @@ static OPUS_INLINE int32x4_t silk_SMLAWB_lane1_neon(
     return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 1 ) );
 }
 
+static OPUS_INLINE int32x4_t silk_SMLAWB_lane_0_neon(
+    const int32x4_t out_s32x4,
+    const int32x4_t in_s32x4,
+    const int32x4_t coef_s32x4
+)
+{
+    return vaddq_s32( out_s32x4, vqdmulhq_laneq_s32( in_s32x4, coef_s32x4, 0 ) );
+}
+
+static OPUS_INLINE int32x4_t silk_SMLAWB_lane_1_neon(
+    const int32x4_t out_s32x4,
+    const int32x4_t in_s32x4,
+    const int32x4_t coef_s32x4
+)
+{
+    return vaddq_s32( out_s32x4, vqdmulhq_laneq_s32( in_s32x4, coef_s32x4, 1 ) );
+}
+
+static OPUS_INLINE int32x4_t silk_SMLAWB_lane_2_neon(
+    const int32x4_t out_s32x4,
+    const int32x4_t in_s32x4,
+    const int32x4_t coef_s32x4
+)
+{
+    return vaddq_s32( out_s32x4, vqdmulhq_laneq_s32( in_s32x4, coef_s32x4, 2 ) );
+}
+
+static OPUS_INLINE int32x4_t silk_SMLAWB_lane_3_neon(
+    const int32x4_t out_s32x4,
+    const int32x4_t in_s32x4,
+    const int32x4_t coef_s32x4
+)
+{
+    return vaddq_s32( out_s32x4, vqdmulhq_laneq_s32( in_s32x4, coef_s32x4, 3 ) );
+}
+
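
Note: these laneq helpers mirror the existing lane0/lane1 pair but pull the coefficient from a full 128-bit vector, which lets the rewritten loader below keep eight coefficients in two q-registers. The reason vqdmulh can stand in for silk_SMLAWB at all is that the coefficients are preloaded shifted left by 15 (equivalently (c << 16) >> 1); a scalar statement of the identity, for reference (smlawb_ref is illustrative, not part of the patch):

    /* silk_SMLAWB( acc, x, c ) adds (x * c) >> 16, c a Q12 int16.
     * With coef = ( (opus_int32)c << 16 ) >> 1  (== c << 15):
     *   vqdmulh( x, coef ) = ( (opus_int64)2 * x * coef ) >> 32
     *                      = ( (opus_int64)x * ( c << 16 ) ) >> 32
     *                      = ( (opus_int64)x * c ) >> 16
     * The saturating case (both operands INT32_MIN) cannot occur since |coef| <= 2^30. */
    static opus_int32 smlawb_ref( opus_int32 acc, opus_int32 x, opus_int16 c )
    {
        return acc + (opus_int32)( ( (opus_int64)x * c ) >> 16 );
    }
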
 /* Note: This function has different return value than silk_noise_shape_quantizer_short_prediction_neon(). */
 /* Therefore here we append "_local" to the function name to avoid confusion. */
 static OPUS_INLINE int32x4_t silk_noise_shape_quantizer_short_prediction_neon_local(const opus_int32 *buf32, const opus_int32 *a_Q12_arch, opus_int order)
 {
-    const int32x4_t a_Q12_arch0_s32x4 = vld1q_s32( a_Q12_arch + 0 );
-    const int32x4_t a_Q12_arch1_s32x4 = vld1q_s32( a_Q12_arch + 4 );
-    const int32x4_t a_Q12_arch2_s32x4 = vld1q_s32( a_Q12_arch + 8 );
-    const int32x4_t a_Q12_arch3_s32x4 = vld1q_s32( a_Q12_arch + 12 );
+    int32x4_t LPC_pred_Q14_s32x4;
+    int32x4_t a_s32x4_0, a_s32x4_1, b0, b1, b2, b3, b4, b5, b6, b7;
 
     silk_assert( order == 10 || order == 16 );
+
     /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
     LPC_pred_Q14_s32x4 = vdupq_n_s32( silk_RSHIFT( order, 1 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  0 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  1 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  2 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  3 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  4 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  5 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  6 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  7 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  8 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  9 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 10 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 11 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 12 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 13 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 14 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );
-    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 15 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );
+
+    __asm__ __volatile__ (
+        "ldp %q[a0], %q[a1], [%[aptr]]\n"
+        "ldp %q[b0], %q[b1], [%[buf], #0]\n"
+        "ldp %q[b2], %q[b3], [%[buf], #32]\n"
+        "ldp %q[b4], %q[b5], [%[buf], #64]\n"
+        "ldp %q[b6], %q[b7], [%[buf], #96]\n"
+        : [a0]"=w"(a_s32x4_0), [a1]"=w"(a_s32x4_1), [b0]"=w"(b0), [b1]"=w"(b1), [b2]"=w"(b2), [b3]"=w"(b3),
+          [b4]"=w"(b4), [b5]"=w"(b5), [b6]"=w"(b6), [b7]"=w"(b7)
+        : [aptr]"r"(a_Q12_arch), [buf]"r"(buf32)
+        : "memory"
+    );
+    /* Block 0: coeffs 0-3 */
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_0_neon( LPC_pred_Q14_s32x4, b0, a_s32x4_0 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_1_neon( LPC_pred_Q14_s32x4, b1, a_s32x4_0 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_2_neon( LPC_pred_Q14_s32x4, b2, a_s32x4_0 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_3_neon( LPC_pred_Q14_s32x4, b3, a_s32x4_0 );
+    /* Block 1: coeffs 4-7 */
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_0_neon( LPC_pred_Q14_s32x4, b4, a_s32x4_1 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_1_neon( LPC_pred_Q14_s32x4, b5, a_s32x4_1 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_2_neon( LPC_pred_Q14_s32x4, b6, a_s32x4_1 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_3_neon( LPC_pred_Q14_s32x4, b7, a_s32x4_1 );
+
+    __asm__ __volatile__ (
+        "ldp %q[a0], %q[a1], [%[aptr], #32]\n"
+        "ldp %q[b0], %q[b1], [%[buf], #128]\n"
+        "ldp %q[b2], %q[b3], [%[buf], #160]\n"
+        "ldp %q[b4], %q[b5], [%[buf], #192]\n"
+        "ldp %q[b6], %q[b7], [%[buf], #224]\n"
+        : [a0]"=w"(a_s32x4_0), [a1]"=w"(a_s32x4_1), [b0]"=w"(b0), [b1]"=w"(b1), [b2]"=w"(b2), [b3]"=w"(b3),
+          [b4]"=w"(b4), [b5]"=w"(b5), [b6]"=w"(b6), [b7]"=w"(b7)
+        : [aptr]"r"(a_Q12_arch), [buf]"r"(buf32)
+        : "memory"
+    );
+    /* Block 2: coeffs 8-11 */
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_0_neon( LPC_pred_Q14_s32x4, b0, a_s32x4_0 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_1_neon( LPC_pred_Q14_s32x4, b1, a_s32x4_0 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_2_neon( LPC_pred_Q14_s32x4, b2, a_s32x4_0 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_3_neon( LPC_pred_Q14_s32x4, b3, a_s32x4_0 );
+    /* Block 3: coeffs 12-15 */
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_0_neon( LPC_pred_Q14_s32x4, b4, a_s32x4_1 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_1_neon( LPC_pred_Q14_s32x4, b5, a_s32x4_1 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_2_neon( LPC_pred_Q14_s32x4, b6, a_s32x4_1 );
+    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane_3_neon( LPC_pred_Q14_s32x4, b7, a_s32x4_1 );
 
     return LPC_pred_Q14_s32x4;
 }
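
Note: the two inline-asm blocks only pin the load schedule: each ldp pulls two q-registers per instruction, so the eight state vectors and two coefficient vectors arrive in five paired loads. The arithmetic is unchanged from the intrinsics it replaces; an intrinsics-only equivalent of the first block (which the compiler may or may not pair into ldp) would be:

    /* Each row of buf32 holds NEON_MAX_DEL_DEC_STATES (4) states, i.e. 16 bytes,
     * matching the #0/#16/#32/... byte offsets used by the ldp pairs: */
    int32x4_t a_s32x4_0 = vld1q_s32( a_Q12_arch + 0 );
    int32x4_t a_s32x4_1 = vld1q_s32( a_Q12_arch + 4 );
    int32x4_t b0 = vld1q_s32( buf32 + 0 * NEON_MAX_DEL_DEC_STATES );
    int32x4_t b1 = vld1q_s32( buf32 + 1 * NEON_MAX_DEL_DEC_STATES );
    int32x4_t b2 = vld1q_s32( buf32 + 2 * NEON_MAX_DEL_DEC_STATES );
    int32x4_t b3 = vld1q_s32( buf32 + 3 * NEON_MAX_DEL_DEC_STATES );
    int32x4_t b4 = vld1q_s32( buf32 + 4 * NEON_MAX_DEL_DEC_STATES );
    int32x4_t b5 = vld1q_s32( buf32 + 5 * NEON_MAX_DEL_DEC_STATES );
    int32x4_t b6 = vld1q_s32( buf32 + 6 * NEON_MAX_DEL_DEC_STATES );
    int32x4_t b7 = vld1q_s32( buf32 + 7 * NEON_MAX_DEL_DEC_STATES );
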
@@ -579,6 +637,11 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
     opus_int32 a_Q12_arch[MAX_LPC_ORDER];
     const int32x2_t warping_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( warping_Q16, 16 ) >> 1 );
     const opus_int32 LF_shp_Q29 = silk_LSHIFT32( LF_shp_Q14, 16 ) >> 1;
+    static const int numOthers = (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *)0 )->sLPC_Q14 ) )
+        / ( NEON_MAX_DEL_DEC_STATES * sizeof( opus_int32 ) ) );
+    /* Precompute Tilt_Q14_Q16 and LF_shp_Q14_Q15 to avoid repeated calculation in the loop */
+    const opus_int32 Tilt_Q14_Q16 = silk_LSHIFT32( Tilt_Q14, 16 ) >> 1;
+    const opus_int32 LF_shp_Q14_Q15 = silk_LSHIFT32( LF_shp_Q14 >> 16, 15 );
     opus_int32 AR_shp_Q28[ MAX_SHAPE_LPC_ORDER ];
     const uint32x4_t rand_multiplier_u32x4 = vdupq_n_u32( RAND_MULTIPLIER );
     const uint32x4_t rand_increment_u32x4 = vdupq_n_u32( RAND_INCREMENT );
@@ -595,25 +658,25 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
     Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
 
-    for( i = 0; i < ( MAX_SHAPE_LPC_ORDER - 7 ); i += 8 ) {
+    /* MAX_SHAPE_LPC_ORDER = 24, which is divisible by 8, so no scalar fallback is needed */
+    for( i = 0; i < MAX_SHAPE_LPC_ORDER; i += 8 ) {
         const int16x8_t t_s16x8 = vld1q_s16( AR_shp_Q13 + i );
         vst1q_s32( AR_shp_Q28 + i + 0, vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ) );
         vst1q_s32( AR_shp_Q28 + i + 4, vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ) );
     }
-    for( ; i < MAX_SHAPE_LPC_ORDER; i++ ) {
-        AR_shp_Q28[i] = silk_LSHIFT32( AR_shp_Q13[i], 15 );
-    }
-
     silk_short_prediction_create_arch_coef_neon_local( a_Q12_arch, a_Q12, predictLPCOrder );
 
     for( i = 0; i < length; i++ ) {
         int32x4_t Seed_s32x4, LPC_pred_Q14_s32x4;
         int32x4_t sign_s32x4, tmp1_s32x4, tmp2_s32x4;
-        int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4;
+        int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4, LF_AR_Q14_cached;
         int32x2_t AR_shp_Q28_s32x2;
         int16x4_t r_Q10_s16x4, rr_Q10_s16x4;
 
+        /* Cache LF_AR_Q14 to avoid repeated loads */
+        LF_AR_Q14_cached = vld1q_s32( psDelDec->LF_AR_Q14 );
+
         /* Perform common calculations used in all states */
 
         /* Long-term prediction */
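
Note: the widening loop above now covers the whole array because MAX_SHAPE_LPC_ORDER (24) is a multiple of 8. Per element it is exactly the removed scalar fallback:

    /* Scalar equivalent of the vshll_n_s16( ..., 15 ) widening: Q13 int16 -> Q28 int32 */
    for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
        AR_shp_Q28[ i ] = silk_LSHIFT32( AR_shp_Q13[ i ], 15 );
    }
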
@@ -662,27 +725,49 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
         AR_shp_Q28_s32x2 = vld1_s32( AR_shp_Q28 );
         n_AR_Q14_s32x4 = vaddq_s32( vdupq_n_s32( silk_RSHIFT( shapingLPCOrder, 1 ) ), vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
 
-        /* Loop over allpass sections */
-        for( j = 2; j < shapingLPCOrder; j += 2 ) {
-            /* Output of allpass section */
-            tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4 );
-            tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j - 1 ] ), tmp2_s32x4, warping_Q16_s32x2 );
-            vst1q_s32( psDelDec->sAR2_Q14[ j - 1 ], tmp1_s32x4 );
-            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
-            /* Output of allpass section */
-            tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 1 ] ), tmp2_s32x4 );
-            tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
-            vst1q_s32( psDelDec->sAR2_Q14[ j + 0 ], tmp2_s32x4 );
-            AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ j ] );
-            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
-        }
+        /* shapingLPCOrder is always even: 12, 14, 16, 20, 24 */
+        #define ALLPASS_SECTION( j ) do { \
+            tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ (j) + 0 ] ), tmp1_s32x4 ); \
+            tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ (j) - 1 ] ), tmp2_s32x4, warping_Q16_s32x2 ); \
+            vst1q_s32( psDelDec->sAR2_Q14[ (j) - 1 ], tmp1_s32x4 ); \
+            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) ); \
+            tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ (j) + 1 ] ), tmp2_s32x4 ); \
+            tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ (j) + 0 ] ), tmp1_s32x4, warping_Q16_s32x2 ); \
+            vst1q_s32( psDelDec->sAR2_Q14[ (j) + 0 ], tmp2_s32x4 ); \
+            AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ (j) ] ); \
+            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) ); \
+        } while( 0 )
+
+        ALLPASS_SECTION( 2 );
+        ALLPASS_SECTION( 4 );
+        ALLPASS_SECTION( 6 );
+        ALLPASS_SECTION( 8 );
+        ALLPASS_SECTION( 10 );
+        if( shapingLPCOrder > 12 ) {
+            ALLPASS_SECTION( 12 );
+            if( shapingLPCOrder > 14 ) {
+                ALLPASS_SECTION( 14 );
+                if( shapingLPCOrder > 16 ) {
+                    ALLPASS_SECTION( 16 );
+                    if( shapingLPCOrder > 18 ) {
+                        ALLPASS_SECTION( 18 );
+                        if( shapingLPCOrder > 20 ) {
+                            ALLPASS_SECTION( 20 );
+                            if( shapingLPCOrder > 22 ) {
+                                ALLPASS_SECTION( 22 );
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        #undef ALLPASS_SECTION
 
         vst1q_s32( psDelDec->sAR2_Q14[ shapingLPCOrder - 1 ], tmp1_s32x4 );
         n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
         n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 1 );      /* Q11 -> Q12 */
-        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( Tilt_Q14, 16 ) >> 1 ) ); /* Q12 */
+        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( LF_AR_Q14_cached, Tilt_Q14_Q16 ) ); /* Q12 */
         n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 2 );      /* Q12 -> Q14 */
 
         n_LF_Q14_s32x4 = vqdmulhq_n_s32( vld1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ] ), LF_shp_Q29 ); /* Q12 */
-        n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( LF_shp_Q14 >> 16 , 15 ) ) ); /* Q12 */
+        n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( LF_AR_Q14_cached, LF_shp_Q14_Q15 ) ); /* Q12 */
        n_LF_Q14_s32x4 = vshlq_n_s32( n_LF_Q14_s32x4, 2 );       /* Q12 -> Q14 */
 
         /* Input minus prediction plus noise feedback */
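
Note: replacing the data-dependent loop with a compile-time ladder keeps the iteration count identical for every even order from 12 up to 24 while letting each ALLPASS_SECTION expansion use constant indices. Per delayed-decision state, one expansion corresponds to the pair of warped allpass sections in the scalar C quantizer path, roughly:

    /* One scalar allpass pair (j = 2, 4, ...), per state: */
    tmp2 = silk_SMLAWB( sAR2_Q14[ j - 1 ], sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );
    sAR2_Q14[ j - 1 ] = tmp1;
    n_AR_Q12 = silk_SMLAWB( n_AR_Q12, tmp1, AR_shp_Q13[ j - 1 ] );
    tmp1 = silk_SMLAWB( sAR2_Q14[ j + 0 ], sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );
    sAR2_Q14[ j + 0 ] = tmp2;
    n_AR_Q12 = silk_SMLAWB( n_AR_Q12, tmp2, AR_shp_Q13[ j ] );
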
@@ -867,15 +952,34 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
         /* Replace a state if best from second set outperforms worst in first set */
         if( RDmin_Q10 < RDmax_Q10 ) {
             opus_int32 (*ptr)[NEON_MAX_DEL_DEC_STATES] = psDelDec->RandState;
-            const int numOthers = (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *)0 )->sLPC_Q14 ) )
-                / ( NEON_MAX_DEL_DEC_STATES * sizeof( opus_int32 ) ) );
+
             /* Only ( predictLPCOrder - 1 ) of sLPC_Q14 buffer need to be updated, though the first several    */
             /* useless sLPC_Q14[] will be different comparing with C when predictLPCOrder < NSQ_LPC_BUF_LENGTH. */
             /* Here just update constant ( NSQ_LPC_BUF_LENGTH - 1 ) for simplicity.                             */
             for( j = i + 1; j < i + NSQ_LPC_BUF_LENGTH; j++ ) {
                 psDelDec->sLPC_Q14[ j ][ RDmax_ind ] = psDelDec->sLPC_Q14[ j ][ RDmin_ind ];
             }
-            for( j = 0; j < numOthers; j++ ) {
+            /* Unroll with software prefetch to hide memory latency */
+            for( j = 0; j + 15 < numOthers; j += 16 ) {
+                __builtin_prefetch( &ptr[ j + 32 ][ 0 ], 0, 1 );
+                ptr[ j + 0 ][ RDmax_ind ] = ptr[ j + 0 ][ RDmin_ind ];
+                ptr[ j + 1 ][ RDmax_ind ] = ptr[ j + 1 ][ RDmin_ind ];
+                ptr[ j + 2 ][ RDmax_ind ] = ptr[ j + 2 ][ RDmin_ind ];
+                ptr[ j + 3 ][ RDmax_ind ] = ptr[ j + 3 ][ RDmin_ind ];
+                ptr[ j + 4 ][ RDmax_ind ] = ptr[ j + 4 ][ RDmin_ind ];
+                ptr[ j + 5 ][ RDmax_ind ] = ptr[ j + 5 ][ RDmin_ind ];
+                ptr[ j + 6 ][ RDmax_ind ] = ptr[ j + 6 ][ RDmin_ind ];
+                ptr[ j + 7 ][ RDmax_ind ] = ptr[ j + 7 ][ RDmin_ind ];
+                ptr[ j + 8 ][ RDmax_ind ] = ptr[ j + 8 ][ RDmin_ind ];
+                ptr[ j + 9 ][ RDmax_ind ] = ptr[ j + 9 ][ RDmin_ind ];
+                ptr[ j + 10 ][ RDmax_ind ] = ptr[ j + 10 ][ RDmin_ind ];
+                ptr[ j + 11 ][ RDmax_ind ] = ptr[ j + 11 ][ RDmin_ind ];
+                ptr[ j + 12 ][ RDmax_ind ] = ptr[ j + 12 ][ RDmin_ind ];
+                ptr[ j + 13 ][ RDmax_ind ] = ptr[ j + 13 ][ RDmin_ind ];
+                ptr[ j + 14 ][ RDmax_ind ] = ptr[ j + 14 ][ RDmin_ind ];
+                ptr[ j + 15 ][ RDmax_ind ] = ptr[ j + 15 ][ RDmin_ind ];
+            }
+            for( ; j < numOthers; j++ ) {
                 ptr[ j ][ RDmax_ind ] = ptr[ j ][ RDmin_ind ];
             }
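
Note: the swap copies one opus_int32 per struct row with a stride of NEON_MAX_DEL_DEC_STATES entries, so it is latency-bound rather than bandwidth-bound; unrolling by 16 and prefetching two iterations (32 rows) ahead overlaps the strided loads. The prefetch is only a hint, so issuing it past the last row is harmless:

    /* __builtin_prefetch( addr, rw, locality ): rw = 0 requests a read prefetch,
     * locality = 1 hints low temporal reuse. On AArch64 this lowers to a PRFM
     * hint, which does not fault on an out-of-range address. */
    __builtin_prefetch( &ptr[ j + 32 ][ 0 ], 0, 1 );
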
diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
index 6f3be025c..07dd4c245 100644
--- a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
+++ b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
@@ -199,28 +199,29 @@ void silk_warped_autocorrelation_FIX_neon(
                 stateT++;
             } while( --n );
         }
-    }
 
-    {
-        const opus_int16 *inputT = input;
-        int32x4_t t_s32x4;
-        int64x1_t t_s64x1;
-        int64x2_t t_s64x2 = vdupq_n_s64( 0 );
-        for( n = 0; n <= length - 8; n += 8 ) {
-            int16x8_t input_s16x8 = vld1q_s16( inputT );
-            t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) );
-            t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) );
-            t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) );
-            t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) );
-            inputT += 8;
-        }
-        t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) );
-        corr_QC_orderT = vget_lane_s64( t_s64x1, 0 );
-        for( ; n < length; n++ ) {
-            corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] );
-        }
-        corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC );
-        corr_QC[ orderT ] = corr_QC_orderT;
+        /* Calculate corr_QC[ orderT ] as the sum of squared inputs */
+        {
+            const opus_int32 *input_QS_ptr = input_QST + MAX_SHAPE_LPC_ORDER;
+            int64x2_t corr_QC_orderT_s64x2 = vdupq_n_s64( 0 );
+            int64x1_t sum_s64x1;
+
+            for( n = 0; n <= length - 4; n += 4 ) {
+                int32x4_t input_QS_s32x4 = vld1q_s32( input_QS_ptr );
+                int64x2_t t_s64x2_0 = vmull_s32( vget_low_s32( input_QS_s32x4 ), vget_low_s32( input_QS_s32x4 ) );
+                int64x2_t t_s64x2_1 = vmull_s32( vget_high_s32( input_QS_s32x4 ), vget_high_s32( input_QS_s32x4 ) );
+                corr_QC_orderT_s64x2 = vsraq_n_s64( corr_QC_orderT_s64x2, t_s64x2_0, 2 * QS - QC );
+                corr_QC_orderT_s64x2 = vsraq_n_s64( corr_QC_orderT_s64x2, t_s64x2_1, 2 * QS - QC );
+                input_QS_ptr += 4;
+            }
+            sum_s64x1 = vadd_s64( vget_low_s64( corr_QC_orderT_s64x2 ), vget_high_s64( corr_QC_orderT_s64x2 ) );
+            corr_QC_orderT = vget_lane_s64( sum_s64x1, 0 );
+
+            for( ; n < length; n++ ) {
+                corr_QC_orderT += silk_RSHIFT64( silk_SMULL( input_QS_ptr[ 0 ], input_QS_ptr[ 0 ] ), 2 * QS - QC );
+                input_QS_ptr++;
+            }
+            corr_QC[ orderT ] = corr_QC_orderT;
+        }
     }
 
     corr_QCT = corr_QC + orderT - order;
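
Note: the rewritten tail is bit-exact with the code it replaces. input_QS[n] is input[n] << QS, so each square carries 2*QS zero low bits, and the per-term >> (2*QS - QC) that vsraq_n_s64 folds into the accumulate is lossless, landing the sum directly in QC just like the old final << QC. Scalar form of the new accumulation, assuming QS and QC as defined in this file:

    /* vsraq_n_s64( acc, v, n ) == acc + ( v >> n ), per 64-bit lane.
     * Per term: (x << QS)^2 >> (2*QS - QC) == (x*x) << QC, exactly. */
    opus_int64 corr = 0;
    for( n = 0; n < length; n++ ) {
        corr += silk_RSHIFT64( silk_SMULL( input_QS[ n ], input_QS[ n ] ), 2 * QS - QC );
    }
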