rt-thread/bsp/stm32_radio/mp3/real/arm/asmpoly_thumb2.s

385 lines
10 KiB
ArmAsm

AREA |.text|, CODE, READONLY, ALIGN=2
THUMB
REQUIRE8
PRESERVE8
PCM RN r0
VB1 RN r1
COEF RN r2
VLO RN r0
VHI RN r3
SUM1LL RN r4
SUM1LH RN r5
SUM2LL RN r6
SUM2LH RN r7
SUM1RL RN r8
SUM1RH RN r9
SUM2RL RN r10
SUM2RH RN r11
CF1 RN r12
CF2 RN r14
SIGN RN r12
MAXPOS RN r14
I RN r12
GBLA RNDVAL
RNDVAL SETA (1 << ((32 - 12) + (6 - 1)))
; C64TOS - clip 64-bit accumulator to short (no rounding)
; xl, xh = value (lo 32, hi 32)
; input assumed to have 6 fraction bits
; sign = temp variable to use for sign
; maxPos = 0x00007fff (takes 2 instr. to generate - calculating
; once and using repeatedly saves if you do several CTOS in a row) */
MACRO
C64TOS $xl, $xh, $sign, $maxPos
mov $xl, $xl, lsr #(20+6)
orr $xl, $xl, $xh, lsl #(12-6)
mov $sign, $xl, ASR #31
cmp $sign, $xl, ASR #15
it ne
eorne $xl, $sign, $maxPos
MEND ; C64TOS
; MC0S - process 2 taps, 1 sample per channel (sample 0) */
; x = vb1 offset */
MACRO
MC0S $x
ldr CF1, [COEF], #4
ldr CF2, [COEF], #4
ldr VLO, [VB1, #(4*($x))]
ldr VHI, [VB1, #(4*(23 - $x))]
smlal SUM1LL, SUM1LH, VLO, CF1
ldr VLO, [VB1, #(4*(32 + $x))]
rsb CF2, CF2, #0
smlal SUM1LL, SUM1LH, VHI, CF2
ldr VHI, [VB1, #(4*(32 + 23 - $x))]
smlal SUM1RL, SUM1RH, VLO, CF1
smlal SUM1RL, SUM1RH, VHI, CF2
MEND ; MCOS
; MC1S - process 2 taps, 1 sample per channel (sample 16)
; x = vb1 offset
MACRO
MC1S $x
ldr CF1, [COEF], #4
ldr VLO, [VB1, #(4*($x))]
ldr VHI, [VB1, #(4*(32 + $x))]
smlal SUM1LL, SUM1LH, VLO, CF1
smlal SUM1RL, SUM1RH, VHI, CF1
MEND ; MC1S
; MC2S - process 2 taps, 2 samples per channel
; x = vb1 offset
MACRO
MC2S $x
; load data as far as possible in advance of using it
ldr CF1, [COEF], #4
ldr CF2, [COEF], #4
ldr VLO, [VB1, #(4*($x))]
ldr VHI, [VB1, #(4*(23 - $x))]
smlal SUM1LL, SUM1LH, VLO, CF1
smlal SUM2LL, SUM2LH, VLO, CF2
rsb CF2, CF2, #0
smlal SUM2LL, SUM2LH, VHI, CF1
smlal SUM1LL, SUM1LH, VHI, CF2
ldr VHI, [VB1, #(4*(32 + 23 - $x))]
ldr VLO, [VB1, #(4*(32 + $x))]
smlal SUM1RL, SUM1RH, VHI, CF2
smlal SUM2RL, SUM2RH, VHI, CF1
rsb CF2, CF2, #0
smlal SUM1RL, SUM1RH, VLO, CF1
smlal SUM2RL, SUM2RH, VLO, CF2
MEND ; MC2S
; void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
xmp3_PolyphaseStereo PROC
EXPORT xmp3_PolyphaseStereo
stmfd sp!, {r4-r11, r14}
; clear out stack space for 2 local variables (4 bytes each)
sub sp, sp, #8
str PCM , [sp, #4] ; sp[1] = pcm pointer
; special case, output sample 0
mov SUM1LL, #RNDVAL ; load rndVal (low 32)
mov SUM1RL, #RNDVAL ; load rndVal (low 32)
mov SUM1LH, #0
mov SUM1RH, #0
MC0S 0
MC0S 1
MC0S 2
MC0S 3
MC0S 4
MC0S 5
MC0S 6
MC0S 7
ldr PCM, [sp, #4] ; load pcm pointer
mov MAXPOS, #0x7f00
orr MAXPOS, MAXPOS, #0xff
C64TOS SUM1LL, SUM1LH, SIGN, MAXPOS
C64TOS SUM1RL, SUM1RH, SIGN, MAXPOS
strh SUM1LL, [PCM, #(2*0)]
strh SUM1RL, [PCM, #(2*1)]
; special case, output sample 16
add COEF, COEF, #(4*(256-16)) ; coef = coefBase + 256 (was coefBase + 16 after MC0S block)
add VB1, VB1, #(4*1024) ; vb1 = vbuf + 64*16
mov SUM1LL, #RNDVAL ; load rndVal (low 32)
mov SUM1RL, #RNDVAL ; load rndVal (low 32)
mov SUM1LH, #0
mov SUM1RH, #0
MC1S 0
MC1S 1
MC1S 2
MC1S 3
MC1S 4
MC1S 5
MC1S 6
MC1S 7
ldr PCM, [sp, #4] ; load pcm pointer
mov MAXPOS, #0x7f00
orr MAXPOS, MAXPOS, #0xff
C64TOS SUM1LL, SUM1LH, SIGN, MAXPOS
C64TOS SUM1RL, SUM1RH, SIGN, MAXPOS
strh SUM1LL, [PCM, #(2*(2*16+0))]
strh SUM1RL, [PCM, #(2*(2*16+1))]
; main convolution loop: sum1L = samples 1, 2, 3, ... 15 sum2L = samples 31, 30, ... 17
sub COEF, COEF, #(4*(264-16)) ; coef = coefBase + 16 (was coefBase + 264 after MC1S block)
sub VB1, VB1, #(4*(1024-64)) ; vb1 = vbuf + 64 (was vbuf + 64*16 after MC1S block)
mov I, #15 ; loop counter, count down
add PCM, PCM, #(2*2) ; pcm+=2
LoopPS
str I, [sp, #0] ; sp[0] = i (loop counter)
str PCM, [sp, #4] ; sp[1] = pcm (pointer to pcm buffer)
mov SUM1LL, #RNDVAL ; load rndVal (low 32)
mov SUM1RL, #RNDVAL ; load rndVal (low 32)
mov SUM2LL, #RNDVAL ; load rndVal (low 32)
mov SUM2RL, #RNDVAL ; load rndVal (low 32)
mov SUM1LH, #0
mov SUM1RH, #0
mov SUM2LH, #0
mov SUM2RH, #0
MC2S 0
MC2S 1
MC2S 2
MC2S 3
MC2S 4
MC2S 5
MC2S 6
MC2S 7
add VB1, VB1, #(4*64) ; vb1 += 64
ldr PCM, [sp, #4] ; load pcm pointer
mov MAXPOS, #0x7f00
orr MAXPOS, MAXPOS, #0xff
C64TOS SUM1LL, SUM1LH, SIGN, MAXPOS
C64TOS SUM1RL, SUM1RH, SIGN, MAXPOS
C64TOS SUM2LL, SUM2LH, SIGN, MAXPOS
C64TOS SUM2RL, SUM2RH, SIGN, MAXPOS
ldr I, [sp, #0] ; load loop counter
add CF2, PCM, I, lsl #3 ; CF2 = PCM + 4*i (short offset)
strh SUM2LL, [CF2], #2 ; *(pcm + 2*2*i + 0)
strh SUM2RL, [CF2], #2 ; *(pcm + 2*2*i + 1)
strh SUM1LL, [PCM], #2 ; *(pcm + 0)
strh SUM1RL, [PCM], #2 ; *(pcm + 1)
subs I, I, #1
bne LoopPS
; restore stack pointer
add sp, sp, #8
ldmfd sp!, {r4-r11, pc}
ENDP
; MONO PROCESSING
; MC0M - process 2 taps, 1 sample (sample 0)
; x = vb1 offset
MACRO
MC0M $x
ldr CF1, [COEF], #4
ldr CF2, [COEF], #4
ldr VLO, [VB1, #(4*($x))]
ldr VHI, [VB1, #(4*(23 - $x))]
rsb CF2, CF2, #0
smlal SUM1LL, SUM1LH, VLO, CF1
smlal SUM1LL, SUM1LH, VHI, CF2
MEND ; MC0M
; MC1M - process 2 taps, 1 sample (sample 16)
; x = vb1 offset
MACRO
MC1M $x
ldr CF1, [COEF], #4
ldr VLO, [VB1, #(4*($x))]
smlal SUM1LL, SUM1LH, VLO, CF1
MEND ; MC1M
; MC2M - process 2 taps, 2 samples
; x = vb1 offset
MACRO
MC2M $x
; load data as far as possible in advance of using it
ldr CF1, [COEF], #4
ldr CF2, [COEF], #4
ldr VLO, [VB1, #(4*($x))]
ldr VHI, [VB1, #(4*(23 - $x))]
smlal SUM1LL, SUM1LH, VLO, CF1
smlal SUM2LL, SUM2LH, VLO, CF2
rsb CF2, CF2, #0
smlal SUM1LL, SUM1LH, VHI, CF2
smlal SUM2LL, SUM2LH, VHI, CF1
MEND ; MC2M
; void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
xmp3_PolyphaseMono PROC
EXPORT xmp3_PolyphaseMono
stmfd sp!, {r4-r11, r14}
; clear out stack space for 4 local variables (4 bytes each)
sub sp, sp, #8
str PCM, [sp, #4] ; sp[1] = pcm pointer
; special case, output sample 0
mov SUM1LL, #RNDVAL ; load rndVal (low 32)
mov SUM1LH, #0
MC0M 0
MC0M 1
MC0M 2
MC0M 3
MC0M 4
MC0M 5
MC0M 6
MC0M 7
ldr PCM, [sp, #4] ; load pcm pointer
mov MAXPOS, #0x7f00
orr MAXPOS, MAXPOS, #0xff
C64TOS SUM1LL, SUM1LH, SIGN, MAXPOS
strh SUM1LL, [PCM, #(2*0)]
; special case, output sample 16
add COEF, COEF, #(4*(256-16)) ; coef = coefBase + 256 (was coefBase + 16 after MC0M block)
add VB1, VB1, #(4*1024) ; vb1 = vbuf + 64*16
mov SUM1LL, #RNDVAL ; load rndVal (low 32)
mov SUM1LH, #0
MC1M 0
MC1M 1
MC1M 2
MC1M 3
MC1M 4
MC1M 5
MC1M 6
MC1M 7
ldr PCM, [sp, #4] ; load pcm pointer
mov MAXPOS, #0x7f00
orr MAXPOS, MAXPOS, #0xff
C64TOS SUM1LL, SUM1LH, SIGN, MAXPOS
strh SUM1LL, [PCM, #(2*16)]
; main convolution loop: sum1L = samples 1, 2, 3, ... 15 sum2L = samples 31, 30, ... 17
sub COEF, COEF, #(4*(264-16)) ; coef = coefBase + 16 (was coefBase + 264 after MC1M block)
sub VB1, VB1, #(4*(1024-64)) ; vb1 = vbuf + 64 (was vbuf + 64*16 after MC1M block)
mov I, #15 ; loop counter, count down
add PCM, PCM, #(2) ; pcm++
LoopPM
str I, [sp, #0] ; sp[0] = i (loop counter)
str PCM, [sp, #4] ; sp[1] = pcm (pointer to pcm buffer)
mov SUM1LL, #RNDVAL ; load rndVal (low 32)
mov SUM2LL, #RNDVAL ; load rndVal (low 32)
mov SUM1LH, #0
mov SUM2LH, #0
MC2M 0
MC2M 1
MC2M 2
MC2M 3
MC2M 4
MC2M 5
MC2M 6
MC2M 7
add VB1, VB1, #(4*64) ; vb1 += 64
ldr PCM, [sp, #4] ; load pcm pointer
mov MAXPOS, #0x7f00
orr MAXPOS, MAXPOS, #0xff
C64TOS SUM1LL, SUM1LH, SIGN, MAXPOS
C64TOS SUM2LL, SUM2LH, SIGN, MAXPOS
ldr I, [sp, #0] ; load loop counter*/
add CF2, PCM, I, lsl #2 ; CF2 = PCM + 2*i (short offset)*/
strh SUM2LL, [CF2], #2 ; (pcm + 2*i + 0)
strh SUM1LL, [PCM], #2 ; (pcm + 0) ; pcm++
subs I, I, #1
bne LoopPM
; restore stack pointer
add sp, sp, #8
ldmfd sp!, {r4-r11, pc}
ENDP
END