@Krashan, post #60
@Don_Adan, post #61
clamped:
move.w d5,(a1)+ ; tu przeniesc, jesli do chipu/fastu
ASR.W #4,d4 ; scale residual signal down
move.w d5,(a1)+ ; albo tu
MOVE.W a2,d6
BMI.S h4neg
ADD.W d4,d2
BRA.S h3
h4neg: SUB.W d4,d2
h3: SWAP d2
MOVE.W a3,d6
BMI.S h3neg
ADD.W d4,d2
BRA.S h2
h3neg: SUB.W d4,d2
h2: MOVE.W a4,d6
BMI.S h2neg
ADD.W d4,d3
BRA.S h1
h2neg: SUB.W d4,d3
h1: SWAP d3
MOVE.W d1,d6
BMI.S h1neg
ADD.W d4,d3
BRA.S update
h1neg: SUB.W d4,d3
; update history vector
update:
move.w d5,(a1)+ ; albo tutaj jesli tylko do fastu
MOVEA.W a3,a2
MOVEA.W a4,a3
MOVEA.W d1,a4
MOVE.W d5,d1
; store output sample
; MOVE.W d5,(a1)+
ADDA.W sampoff(pc),a1
DBF d7,DecLoop
RTS@Don_Adan, post #62
;
; QOA decoder
;==============================================================================
; STEREO DECODING STRATEGY
;
; Stereo QOA files have interleaved slices in LR order. Decoding them in order
; means LMS state has to be swapped for each slice. To avoid this, there are
; two passes over a frame. The first pass loads L channel LMS state then
; decodes only even slices and stores audio samples at output buffer offset 0,
; advancing 4 bytes after each sample. The second pass loads R channel LMS
; state, then decodes only odd slices and stores audio samples at output buffer
; offset 2, advancing 4 bytes after each sample.
;==============================================================================
;==============================================================================
; Decodes QOA mono frame to a buffer.
; INPUTS:
;   d0 - number of slices available in the frame buffer (1 to 256 including)
;   a0 - frame buffer (LMS state first, slices follow)
;   a1 - output buffer
; REGISTERS:
;   d7 - 'loadlms' returns it as  high word = slice counter,
;                                 low word  = history[-1]
;        and that is the layout the sample decoder reads (it takes
;        history[-1] from d7.w). The slice counter is therefore swapped into
;        the low word only around the DBF, never before calling 'slice'.
;        'slice' has to leave the high word of d7 untouched.
; FIX: the loop used to SWAP d7 *before* 'slice', which handed the slice
;      counter to the decoder as history[-1].
;==============================================================================
                XDEF    _DecodeMonoFrame
_DecodeMonoFrame:
                MOVEM.L d2-d7/a2-a6,-(sp)
                SUBQ.L  #1,d0                   ; DBF terminates at -1
                MOVE.W  d0,d7                   ; slice counter
                BSR.S   loadlms                 ; d7 = counter:history[-1], a5 = &sampoff
                CLR.W   (a5)                    ; mono: no extra output stride
                BRA.S   monofirst               ; d7 is already in decoder layout
nextslice:      SWAP    d7                      ; back to counter:history[-1]
monofirst:      BSR.S   slice                   ; decode slice
                SWAP    d7                      ; history[-1]:counter for DBF
                DBF     d7,nextslice
                MOVEM.L (sp)+,d2-d7/a2-a6
                RTS
;==============================================================================
; Decodes QOA stereo frame to a buffer.
; INPUTS:
;   d0.w - number of slices per channel available in the frame buffer (1 to 256
;          including)
;   a0.l - frame buffer (L LMS state, R LMS state, then interleaved slices)
;   a1.l - output buffer
; REGISTERS:
;   d7 - 'loadlms' returns it as  high word = slice counter,
;                                 low word  = history[-1]
;        which is the layout the sample decoder reads. The counter is swapped
;        into the low word only around the DBF. 'slice' has to leave the high
;        word of d7 untouched.
; FIX: both passes used to SWAP d7 *before* 'slice', which handed the slice
;      counter to the decoder as history[-1].
;==============================================================================
                XDEF    _DecodeStereoFrame
_DecodeStereoFrame:
                MOVEM.L d2-d7/a2-a6,-(sp)
                SUBQ.L  #1,d0                   ; DBF terminates at -1
                MOVE.L  a0,-(sp)                ; keep inputs for the R pass
                MOVE.L  a1,-(sp)
                MOVE.W  d0,-(sp)
; L channel pass
                MOVE.W  d0,d7                   ; slice counter
                BSR.S   loadlms                 ; d7 = counter:history[-1], a5 = &sampoff
                MOVE.W  #2,(a5)                 ; stereo: skip the other channel's sample
                LEA     16(a0),a0               ; skip R channel LMS state
                BRA.S   firstleft               ; d7 is already in decoder layout
nextleft:       SWAP    d7                      ; back to counter:history[-1]
firstleft:      BSR.S   slice                   ; decode slice
                ADDQ.L  #8,a0                   ; skip R channel slice
                SWAP    d7                      ; history[-1]:counter for DBF
                DBF     d7,nextleft
; R channel pass
                MOVE.W  (sp)+,d7                ; slice counter
                MOVEA.L (sp)+,a1                ; output buffer
                MOVEA.L (sp)+,a0                ; input buffer
                ADDQ.L  #2,a1                   ; R channel samples
                LEA     16(a0),a0               ; skip L channel LMS state
                BSR.S   loadlms                 ; d7 = counter:history[-1]
                BRA.S   firstright
nextright:      SWAP    d7                      ; back to counter:history[-1]
firstright:     ADDQ.L  #8,a0                   ; skip L channel slice
                BSR.S   slice                   ; decode slice
                SWAP    d7                      ; history[-1]:counter for DBF
                DBF     d7,nextright
                MOVEM.L (sp)+,d2-d7/a2-a6
                RTS
; loadlms - loads one channel's LMS state from the frame buffer.
;   in:  a0   - points at the LMS state (4 history words, then 4 weight words)
;        d7.w - slice counter
;   out: a2,a3,a4 - first three history words, sign-extended
;        d7   - high word: the former d7.w, low word: fourth history word
;               (the one DecSamp reads as history[-1])
;        d2,d3 - weights, two 16-bit weights per register
;        a5   - address of 'sampoff'
;        a0   - advanced by 16 bytes
loadlms: MOVEA.W (a0)+,a2 ; loading LMS history
MOVEA.W (a0)+,a3
MOVEA.W (a0)+,a4
swap D7 ; park the slice counter in the high word of d7 ...
move.w (a0)+,d7 ; ... so the low word can carry the fourth history word
MOVE.L (a0)+,d2 ; loading LMS weights
MOVE.L (a0)+,d3
lea sampoff(pc),A5 ; callers store the output stride through a5
RTS
;==============================================================================
; Decodes one 8-byte QOA slice (20 samples) of a mono/stereo stream.
; Registers usage:
;   d0 - slice bits (rotated as residuals are consumed)
;   d1 - sample loop counter for DecSamp (DBF), free to MOVEQ
;   d2,d3 - LMS weights (updated by DecSamp)
;   d4 - scale factor row offset, then residual index
;   d7 - low word: history[-1], high word: caller's slice counter.
;        Neither half may be written here.
;   a0 - input data pointer (advanced by 8)
;   a1 - output data pointer (advanced by DecSamp)
;   a2,a3,a4 - LMS history[-4..-2]
;   a6 - pointer to the selected row of the 'dequant' lookup table
; FIX: the single-sample call for r[9] used "moveq #0,D7" as its loop count,
;      which wiped both the slice counter and history[-1]; the loop counter
;      of this version is d1.
;==============================================================================
slice:          MOVE.L  (a0)+,d0
                LEA     dequant(pc),a6
                ROL.L   #8,d0                   ; scale factor + r[0] into the low byte
                MOVE.B  d0,d4
                ANDI.W  #$00F0,d4               ; scale factor in bits 7:4 of d4 = row * 16 bytes
                ADDA.W  d4,a6                   ; select lookup table row
; extract 9 residuals from d0, r[0] is in position already
                MOVEQ   #8,d1
                BSR.S   DecSamp
; now the first bit of r[9] is in d0:0, pull the other two bits from the
; second longword of the slice
                MOVEQ   #1,d4
                AND.B   d0,d4                   ; d4 = top bit of r[9]
                MOVE.L  (a0)+,d0
                ADD.L   d0,d0                   ; X = next stream bit
                ADDX.B  d4,d4                   ; shift it into d4
                ADD.L   d0,d0
                ADDX.B  d4,d4                   ; d4 = r[9]
                ADD.B   d4,d4                   ; * 2 = word index into the table row
                MOVEQ   #0,d1                   ; exactly one sample (was: moveq #0,D7)
                BSR.S   OnlyOnce
                ROL.L   #4,d0                   ; r[10] into bits 3:1
; extract 10 residuals from d0
                MOVEQ   #9,d1
                BRA.S   DecSamp                 ; (ab)use RTS at end of DecSamp
;==============================================================================
; Decodes d1.w + 1 samples.
; Entry points:
;   DecLoop  - rotates the next residual into d0 bits 3:1, then decodes it
;   DecSamp  - 3-bit encoded residual is already in bits 3:1 of d0
;   OnlyOnce - residual index (residual * 2) is already in d4.w
; Registers usage:
;   d0 - slice bits
;   d1.w - sample loop counter (DBF)
;   d2 = weights[-4]:weights[-3], d3 = weights[-2]:weights[-1] (updated)
;   d4 - dequantized residual, later residual >> 4
;   d5 - predicted, then reconstructed sample
;   d6, a5 - scratch
;   d7.w - history[-1] (high word is not touched)
;   a2,a3,a4 - history[-4], history[-3], history[-2]
;   a1 - output pointer, a6 - 'dequant' row
; FIX: the history shift copied d1 (the loop counter) into a4; the previous
;      history[-1] lives in d7.w.
;==============================================================================
DecLoop:        ROL.L   #3,d0
; decode residual sample using lookup table, store in d4
DecSamp:        MOVEQ   #$E,d4
                AND.W   d0,d4                   ; extract encoded sample in d4
OnlyOnce:
                MOVE.W  (a6,d4.w),d4            ; decode with lookup table
; calculate predicted sample, store in d5
                MOVE.W  d7,d5                   ; history[-1]
                MULS.W  d3,d5                   ; *= weights[-1]
                SWAP    d3
                MOVE.W  a4,d6                   ; history[-2]
                MULS.W  d3,d6                   ; *= weights[-2]
                ADD.L   d6,d5
                MOVE.W  a3,d6                   ; history[-3]
                MULS.W  d2,d6                   ; *= weights[-3]
                ADD.L   d6,d5
                SWAP    d2
                MOVE.W  a2,d6                   ; history[-4]
                MULS.W  d2,d6                   ; *= weights[-4]
                ADD.L   d6,d5
                ASR.L   #6,d5                   ; >> 13 in two steps (immediate shift
                ASR.L   #7,d5                   ; count is limited to 8)
; add predicted sample to reconstructed residual with clamp to
; 16-bit signed range, store in d5
                MOVEA.W d4,a5                   ; sign-extend residual to 32 bits
                ADD.L   a5,d5
                MOVEA.W d5,a5                   ; low word, sign-extended
                CMP.L   a5,d5                   ; equal <=> sum fits in 16 bits
                BEQ.S   clamped
                SGT     d5                      ; ??FF positive, ??00 negative
                EXT.W   d5                      ; FFFF positive, 0000 negative
                EORI.W  #$8000,d5               ; 7FFF positive, 8000 negative
; update LMS weights, reconstructed sample in d5, decoded residual in d4
clamped:        ASR.W   #4,d4                   ; scale residual signal down
                MOVE.W  a2,d6                   ; sign of history[-4]
                BMI.S   h4neg
                ADD.W   d4,d2
                BRA.S   h3
h4neg:          SUB.W   d4,d2
h3:             SWAP    d2
                MOVE.W  a3,d6                   ; sign of history[-3]
                BMI.S   h3neg
                ADD.W   d4,d2
                BRA.S   h2
h3neg:          SUB.W   d4,d2
h2:             MOVE.W  a4,d6                   ; sign of history[-2]
                BMI.S   h2neg
                ADD.W   d4,d3
                BRA.S   h1
h2neg:          SUB.W   d4,d3
h1:             SWAP    d3
                MOVE.W  d7,d6                   ; sign of history[-1]
                BMI.S   h1neg
                ADD.W   d4,d3
                BRA.S   update
h1neg:          SUB.W   d4,d3
; store output sample and update history vector
update:         MOVE.W  d5,(a1)+
                MOVEA.W a3,a2
                MOVEA.W a4,a3
                MOVEA.W d7,a4                   ; old history[-1] (was: d1, the loop counter)
                MOVE.W  d5,d7                   ; new history[-1], high word preserved
                ADDA.W  sampoff(pc),a1          ; extra stride (0 mono, 2 stereo)
                DBF     d1,DecLoop
                RTS
; not very effective, should be stored in some register once registers
; usage is optimized
; sampoff: extra byte count added to the output pointer after every stored
; sample. The frame routines write 0 (mono) or 2 (stereo) here through a5.
sampoff: DC.W 0
; dequant: residual dequantization table. 16 rows of 8 words; 'slice' selects
; the row with (scale factor * 16) bytes and DecSamp indexes it with
; (3-bit residual * 2).
dequant: DC.W 1, -1, 3, -3, 5, -5, 7, -7
DC.W 5, -5, 18, -18, 32, -32, 49, -49
DC.W 16, -16, 53, -53, 95, -95, 147, -147
DC.W 34, -34, 113, -113, 203, -203, 315, -315
DC.W 63, -63, 210, -210, 378, -378, 588, -588
DC.W 104, -104, 345, -345, 621, -621, 966, -966
DC.W 158, -158, 528, -528, 950, -950, 1477, -1477
DC.W 228, -228, 760, -760, 1368, -1368, 2128, -2128
DC.W 316, -316, 1053, -1053, 1895, -1895, 2947, -2947
DC.W 422, -422, 1405, -1405, 2529, -2529, 3934, -3934
DC.W 548, -548, 1828, -1828, 3290, -3290, 5117, -5117
DC.W 696, -696, 2320, -2320, 4176, -4176, 6496, -6496
DC.W 868, -868, 2893, -2893, 5207, -5207, 8099, -8099
DC.W 1064, -1064, 3548, -3548, 6386, -6386, 9933, -9933
DC.W 1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005
; NOTE(review): the text fused after the last value below looks like forum
; residue, not table data - confirm and strip before assembling.
DC.W 1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336@Don_Adan, post #61
Wrzucil gdzies do sciagniecia jako paczke.Paczka
mono 2sub 10,39 s x3,06
mono 2add 10,38 s x3,06
mono bra 9,98 s x3,19
stereo 2sub 20,78 s x1,53
stereo 2add 20,80 s x1,53
stereo bra 19,99 s x1,59

AmiBerry na PC ustawiony na emulowanie 68000/28 cycle exact (8 MB fast):
mono 2sub 27,26 s x1,16
mono 2add 27,28 s x1,16
mono bra 27,48 s x1,15
stereo 2sub 54,56 s x0,58
stereo 2add 54,56 s x0,58
stereo bra 54,99 s x0,57

Amiga 1200 z kartą ACA1221 (68020/28, 64 MB fast):
mono 2sub 13,42 s x2,37
mono 2add 13,44 s x2,36
mono bra 13,65 s x2,33
stereo 2sub 26,85 s x1,18
stereo 2add 26,84 s x1,18
stereo bra 27,28 s x1,16

Jak już byłem przy Amidze, to jeszcze przeleciałem z wyłączonym cache (68020/28 nadal):
mono 2sub 16,25 s x1,96
mono 2add 16,23 s x1,96
mono bra 16,73 s x1,90
stereo 2sub 32,63 s x0,97
stereo 2add 32,61 s x0,97
stereo bra 33,55 s x0,94

Jakie wnioski? Po pierwsze nie wierzyć za bardzo emulatorom. Nawet jeżeli ich autorzy zarzekają się, że emulują procesor z dokładnością do jednej setnej cykla. Przy 68000 przynajmniej pokazał, że wersja z BRA jest trochę wolniejsza. Po drugie nieważne, czy zastosuję wersję z 2 SUB czy z 2 ADD. Co się zgadza z teorią, testowany jest znak zdekodowanych próbek audio, a w dobrze zrobionym audio (bez składowej stałej) próbek ujemnych jest tyle co dodatnich. Po trzecie cache na 020 przyspiesza plik mono x1,2 i plik stereo też x1,2. Z tego wyciągam wniosek, że dekoder stereo mieści się w cache tak samo dobrze jak mono.
@Krashan, post #65
@Krashan, post #64
@Don_Adan, post #66
@hrw, post #68
@Krashan, post #69
; the first bit of r[9] is in d0:0 here; fetch the remaining two bits from
; the next word of the slice
        swap    D0
        move.w  (A0),D0
        moveq   #0,D1
        lsl.l   #3,D0
        swap    D0
        bsr.b   DecSamp
        move.l  (A0)+,d0
        rol.l   #6,d0
@Krashan, post #69
@Krashan, post #73
@Tedy, post #74
> że testów nie zrobiłem ale powinno być git

Nie, to nie zadziała, i to z trzech powodów.
@Krashan, post #75
@Tedy, post #77
@Krashan, post #78
@Don_Adan, post #63
;
; QOA decoder
;==============================================================================
; STEREO DECODING STRATEGY
;
; Stereo QOA files have interleaved slices in LR order. Decoding them in order
; means LMS state has to be swapped for each slice. To avoid this, there are
; two passes over a frame. The first pass loads L channel LMS state then
; decodes only even slices and stores audio samples at output buffer offset 0,
; advancing 4 bytes after each sample. The second pass loads R channel LMS
; state, then decodes only odd slices and stores audio samples at output buffer
; offset 2, advancing 4 bytes after each sample.
;==============================================================================
;==============================================================================
; Decodes QOA mono frame to a buffer.
; INPUTS:
; d0 - number of slices available in the frame buffer (1 to 256 including)
; a0 - frame buffer
; a1 - output buffer
; NOTE(review): with the SWAPs around the call removed, d7.w is the only copy
; of the slice counter, so 'slice' must return with d7.w unchanged - verify
; against the 'slice' in use.
;==============================================================================
XDEF _DecodeMonoFrame
_DecodeMonoFrame:
MOVEM.L d2-d7/a2-a6,-(sp)
SUBQ.L #1,d0 ; DBF terminates at -1
LEA sampoff,a2 ; a2 is free here, loadlms reloads it
MOVE.W #0,(a2) ; mono: no extra output stride
MOVE.W d0,d7 ; slice counter
BSR.S loadlms
nextslice:
; SWAP d7 (removed)
BSR.S slice ; decode slice
; SWAP d7 (removed)
DBF d7,nextslice
MOVEM.L (sp)+,d2-d7/a2-a6
RTS
;==============================================================================
; Decodes QOA stereo frame to a buffer.
; INPUTS:
; d0.w - number of slices per channel available in the frame buffer (1 to 256
; including)
; a0.l - frame buffer
; a1.l - output buffer
; NOTE(review): with the SWAPs around the calls removed, d7.w is the only copy
; of the slice counter, so 'slice' must return with d7.w unchanged - verify
; against the 'slice' in use.
;==============================================================================
XDEF _DecodeStereoFrame
_DecodeStereoFrame:
MOVEM.L d2-d7/a2-a6,-(sp)
SUBQ.L #1,d0 ; DBF terminates at -1
LEA sampoff,a2 ; a2 is free here, loadlms reloads it
MOVE.W #2,(a2) ; stereo: skip the other channel's sample after each store
MOVE.L a0,-(sp) ; keep inputs for the R channel pass
MOVE.L a1,-(sp)
MOVE.W d0,-(sp)
; L channel pass
MOVE.W d0,d7 ; slice counter
BSR.S loadlms
LEA 16(a0),a0 ; skip R channel LMS state
nextleft:
; SWAP d7 (removed)
BSR.S slice ; decode slice
; SWAP d7 (removed)
ADDQ.L #8,a0 ; skip R channel slice
DBF d7,nextleft
; R channel pass
MOVE.W (sp)+,d7 ; slice counter
MOVEA.L (sp)+,a1 ; output buffer
MOVEA.L (sp)+,a0 ; input buffer
ADDQ.L #2,a1 ; R channel samples
LEA 16(a0),a0 ; skip L channel LMS state
BSR.S loadlms
nextright: ADDQ.L #8,a0 ; skip L channel slice (explicit .L size added)
; SWAP d7 (removed)
BSR.S slice ; decode slice
; SWAP d7 (removed)
DBF d7,nextright
MOVEM.L (sp)+,d2-d7/a2-a6
RTS
; loadlms - loads one channel's LMS state from the frame buffer.
;   in:  a0 - points at the LMS state (4 history words, then 4 weight words)
;   out: a2,a3,a4 - first three history words, sign-extended
;        d5.w - fourth history word (read by DecSamp as history[-1])
;        d2,d3 - weights, two 16-bit weights per register
;        d1.w - constant $00F0, the scale factor mask used by 'slice'
;        a0 - advanced by 16 bytes
loadlms: MOVEA.W (a0)+,a2 ; loading LMS history
MOVEA.W (a0)+,a3
MOVEA.W (a0)+,a4
MOVE.W (a0)+,d5 ; history[-1] now lives in d5 instead of d1
MOVE.L (a0)+,d2 ; loading LMS weights
MOVE.L (a0)+,d3
move.w #$00F0,d1 ; added: scale factor mask kept in a register
RTS
;==============================================================================
; Decodes one 8-byte QOA slice (20 samples) of a mono/stereo stream.
; Registers usage:
;   d0 - slice bits (rotated as residuals are consumed)
;   d1.w - constant $00F0 scale factor mask (set by loadlms, read only)
;   d2,d3 - LMS weights (updated by DecSamp)
;   d4 - scale factor row offset
;   d5 - history[-1] on entry to every sample, reconstructed sample on exit
;   d6 - scratch register
;   d7 - low word on entry/exit: caller's slice counter (preserved).
;        Inside the routine the counter is parked in the high word and the
;        low word is the DecSamp sample loop counter.
;   a0 - input data pointer (advanced by 8)
;   a1 - output data pointer (advanced by DecSamp)
;   a2,a3,a4 - LMS history[-4..-2]
;   a6 - pointer to the selected row of the 'dequant' lookup table
; FIX: the callers no longer SWAP d7 around the call, yet this routine loaded
;      its sample counts with MOVEQ into d7 and returned with d7.w = -1, so
;      the callers' DBF d7 ran on a destroyed slice counter. The counter is
;      now parked in the high word of d7 for the duration of the slice. The
;      commented-out ADDX variant of the r[9] extraction (one of its lines had
;      ':' instead of ';') has been dropped.
;==============================================================================
slice:          SWAP    d7                      ; park the slice counter in the high word
                MOVE.L  (a0)+,d0
                LEA     dequant(pc),a6
                ROL.L   #8,d0                   ; scale factor + r[0] into the low byte
                MOVE.B  d0,d4
                AND.W   d1,d4                   ; scale factor in bits 7:4 of d4 = row * 16 bytes
                ADDA.W  d4,a6                   ; select lookup table row
; extract 9 residuals from d0, r[0] is in position already
                MOVE.W  #8,d7                   ; word move: high word of d7 is in use
                BSR.S   DecSamp
; now the first bit of r[9] is in d0:0, its other two bits are the top bits of
; the second longword of the slice
                SWAP    d0                      ; r[9] top bit -> bit 16
                MOVE.W  (a0),d0                 ; next 16 stream bits below it
                CLR.W   d7                      ; exactly one sample
                LSL.L   #3,d0                   ; r[9] -> bits 19:17
                SWAP    d0                      ; r[9] -> bits 3:1
                BSR.S   DecSamp
                MOVE.L  (a0)+,d0
                ROL.L   #6,d0                   ; skip 2 bits of r[9], r[10] -> bits 3:1
; extract 10 residuals from d0
                MOVE.W  #9,d7
                BSR.S   DecSamp
                SWAP    d7                      ; slice counter back in the low word
                RTS
;==============================================================================
; Decodes d7.w + 1 samples.
; Entry points:
;   DecLoop - rotates the next residual into d0 bits 3:1, then decodes it
;   DecSamp - 3-bit encoded residual is already in bits 3:1 of d0
; Registers usage:
;   d0 - slice bits
;   d1 - not touched here
;   d2 = weights[-4]:weights[-3], d3 = weights[-2]:weights[-1] (updated)
;   d4 - copy of history[-1], later residual >> 4
;   d5 - history[-1] on entry, reconstructed sample (= new history[-1]) on exit
;   d6 - dequantized residual, later scratch
;   d7.w - sample loop counter (DBF)
;   a2,a3,a4 - history[-4], history[-3], history[-2]
;   a5 - scratch, later the previous history[-1]
;   a1 - output pointer, a6 - 'dequant' row
; FIX: the history shift copied d1 into a4, but in this version d1 holds the
;      $00F0 mask; the previous history[-1] is kept in a5.
;==============================================================================
DecLoop:        ROL.L   #3,d0
DecSamp:
; calculate predicted sample, store in d5
                MOVE.W  d5,d4                   ; keep history[-1], d5 is overwritten below
                MULS.W  d3,d5                   ; history[-1] * weights[-1]
                SWAP    d3
                MOVE.W  a4,d6                   ; history[-2]
                MULS.W  d3,d6                   ; *= weights[-2]
                ADD.L   d6,d5
                MOVE.W  a3,d6                   ; history[-3]
                MULS.W  d2,d6                   ; *= weights[-3]
                ADD.L   d6,d5
                SWAP    d2
                MOVE.W  a2,d6                   ; history[-4]
                MULS.W  d2,d6                   ; *= weights[-4]
                ADD.L   d6,d5
                ASR.L   #6,d5                   ; >> 13 in two steps (immediate shift
                ASR.L   #7,d5                   ; count is limited to 8)
; decode residual sample using lookup table, store in d6
                MOVEQ   #$E,d6
                AND.W   d0,d6                   ; extract encoded sample in d6
                MOVE.W  (a6,d6.w),d6            ; decode with lookup table
; add predicted sample to reconstructed residual with clamp to
; 16-bit signed range, store in d5
                MOVEA.W d6,a5                   ; sign-extend residual to 32 bits
                ADD.L   a5,d5
                MOVEA.W d5,a5                   ; low word, sign-extended
                CMP.L   a5,d5                   ; equal <=> sum fits in 16 bits
                BEQ.S   clamped
                SGT     d5                      ; ??FF positive, ??00 negative
                EXT.W   d5                      ; FFFF positive, 0000 negative
                EORI.W  #$8000,d5               ; 7FFF positive, 8000 negative
; update LMS weights, reconstructed sample in d5, decoded residual in d6
clamped:        ASR.W   #4,d6                   ; scale residual signal down
                EXG     d6,d4                   ; d4 = delta, d6 = old history[-1]
                MOVEA.W d6,a5                   ; keep old history[-1], d6 is scratch again
; each weight: subtract delta first, then add it back twice unless the
; matching history sample is negative
                SUB.W   d4,d2
                MOVE.W  a2,d6
                BMI.S   hist3
                ADD.W   d4,d2
                ADD.W   d4,d2
hist3:          SWAP    d2
                SUB.W   d4,d2
                MOVE.W  a3,d6
                BMI.S   hist2
                ADD.W   d4,d2
                ADD.W   d4,d2
hist2:          SUB.W   d4,d3
                MOVE.W  a4,d6
                BMI.S   hist1
                ADD.W   d4,d3
                ADD.W   d4,d3
hist1:          SWAP    d3
                SUB.W   d4,d3
                MOVE.W  a5,d6                   ; old history[-1]
                BMI.S   update
                ADD.W   d4,d3
                ADD.W   d4,d3
; update history vector; d5 already holds the new history[-1]
update:         MOVEA.W a3,a2
                MOVEA.W a4,a3
                MOVEA.W a5,a4                   ; old history[-1] (was: d1, the $00F0 mask)
; store output sample
                MOVE.W  d5,(a1)+
                ADDA.W  sampoff(pc),a1          ; extra stride (0 mono, 2 stereo)
                DBF     d7,DecLoop
                RTS
; not very effective, should be stored in some register once registers
; usage is optimized
; sampoff: extra byte count added to the output pointer after every stored
; sample. The frame routines write 0 (mono) or 2 (stereo) here.
sampoff: DC.W 0
; dequant: residual dequantization table. 16 rows of 8 words; 'slice' selects
; the row with (scale factor * 16) bytes and DecSamp indexes it with
; (3-bit residual * 2).
dequant: DC.W 1, -1, 3, -3, 5, -5, 7, -7
DC.W 5, -5, 18, -18, 32, -32, 49, -49
DC.W 16, -16, 53, -53, 95, -95, 147, -147
DC.W 34, -34, 113, -113, 203, -203, 315, -315
DC.W 63, -63, 210, -210, 378, -378, 588, -588
DC.W 104, -104, 345, -345, 621, -621, 966, -966
DC.W 158, -158, 528, -528, 950, -950, 1477, -1477
DC.W 228, -228, 760, -760, 1368, -1368, 2128, -2128
DC.W 316, -316, 1053, -1053, 1895, -1895, 2947, -2947
DC.W 422, -422, 1405, -1405, 2529, -2529, 3934, -3934
DC.W 548, -548, 1828, -1828, 3290, -3290, 5117, -5117
DC.W 696, -696, 2320, -2320, 4176, -4176, 6496, -6496
DC.W 868, -868, 2893, -2893, 5207, -5207, 8099, -8099
DC.W 1064, -1064, 3548, -3548, 6386, -6386, 9933, -9933
DC.W 1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005
; NOTE(review): the text after the last value below looks like forum residue,
; not table data - confirm and strip before assembling.
DC.W 1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336 @Don_Adan, post #80
;
; QOA decoder
;==============================================================================
; STEREO DECODING STRATEGY
;
; Stereo QOA files have interleaved slices in LR order. Decoding them in order
; means LMS state has to be swapped for each slice. To avoid this, there are
; two passes over a frame. The first pass loads L channel LMS state then
; decodes only even slices and stores audio samples at output buffer offset 0,
; advancing 4 bytes after each sample. The second pass loads R channel LMS
; state, then decodes only odd slices and stores audio samples at output buffer
; offset 2, advancing 4 bytes after each sample.
;==============================================================================
;==============================================================================
; Decodes QOA mono frame to a buffer.
; INPUTS:
; d0 - number of slices available in the frame buffer (1 to 256 including)
; a0 - frame buffer
; a1 - output buffer
; d7 holds the slice counter; it is swapped into the high word around each
; call so that 'slice' can use d7.w as its sample counter.
;==============================================================================
XDEF _DecodeMonoFrame
_DecodeMonoFrame:
MOVEM.L d2-d7/a2-a6,-(sp)
SUBQ.L #1,d0 ; DBF terminates at -1
MOVE.W d0,d7 ; slice counter
BSR.S loadlms ; also returns a5 = address of sampoff
clr.w (a5) ; mono: no extra output stride
nextslice: SWAP d7 ; slice counter to the high word
BSR.S slice ; decode slice
SWAP d7 ; slice counter back to the low word for DBF
DBF d7,nextslice
MOVEM.L (sp)+,d2-d7/a2-a6
RTS
;==============================================================================
; Decodes QOA stereo frame to a buffer.
; INPUTS:
; d0.w - number of slices per channel available in the frame buffer (1 to 256
; including)
; a0.l - frame buffer
; a1.l - output buffer
; d7 holds the slice counter; it is swapped into the high word around each
; call so that 'slice' can use d7.w as its sample counter.
;==============================================================================
XDEF _DecodeStereoFrame
_DecodeStereoFrame:
MOVEM.L d2-d7/a2-a6,-(sp)
SUBQ.L #1,d0 ; DBF terminates at -1
MOVE.L a0,-(sp) ; keep inputs for the R channel pass
MOVE.L a1,-(sp)
MOVE.W d0,-(sp)
; L channel pass
MOVE.W d0,d7 ; slice counter
BSR.S loadlms ; also returns a5 = address of sampoff
move.w #2,(a5) ; stereo: skip the other channel's sample (stays set for the R pass)
LEA 16(a0),a0 ; skip R channel LMS state
nextleft: SWAP d7 ; slice counter to the high word
BSR.S slice ; decode slice
SWAP d7 ; slice counter back to the low word for DBF
ADDQ.L #8,a0 ; skip R channel slice
DBF d7,nextleft
; R channel pass
MOVE.W (sp)+,d7 ; slice counter
MOVEA.L (sp)+,a1 ; output buffer
MOVEA.L (sp)+,a0 ; input buffer
ADDQ.L #2,a1 ; R channel samples
LEA 16(a0),a0 ; skip L channel LMS state
BSR.S loadlms
nextright: ADDQ.l #8,a0 ; skip L channel slice
SWAP d7 ; slice counter to the high word
BSR.S slice ; decode slice
SWAP d7 ; slice counter back to the low word for DBF
DBF d7,nextright
MOVEM.L (sp)+,d2-d7/a2-a6
RTS
; loadlms - loads one channel's LMS state from the frame buffer.
;   in:  a0 - points at the LMS state (4 history words, then 4 weight words)
;   out: a2,a3,a4 - first three history words, sign-extended
;        d5.w - fourth history word (read by DecSamp as history[-1])
;        d2,d3 - weights, two 16-bit weights per register
;        a5 - address of 'sampoff'
;        a0 - advanced by 16 bytes
loadlms: MOVEA.W (a0)+,a2 ; loading LMS history
MOVEA.W (a0)+,a3
MOVEA.W (a0)+,a4
move.w (a0)+,d5 ; history[-1] lives in d5 in this version
MOVE.L (a0)+,d2 ; loading LMS weights
MOVE.L (a0)+,d3
lea sampoff(pc),a5 ; callers store the output stride through a5
RTS
;==============================================================================
; Decodes one 8-byte QOA slice (20 samples) of a mono/stereo stream.
; Registers usage:
; d0 - slice bits (rotated as residuals are consumed)
; d2,d3 - LMS weights (updated)
; d4 - scale factor row offset, then residual index
; d5 - history[-1] on entry to every sample (loaded by loadlms)
; d6 - scratch register
; d7 - low word: sample loop counter for DecSamp; the high word is left alone
;      (the frame routines keep the slice counter there)
; a0 - input data pointer (advanced by 8)
; a1 - output data pointer (advanced)
; a2,a3,a4 - LMS history[-4..-2] (updated)
; a6 - pointer to 'dequant' lookup table (modified)
;==============================================================================
slice: MOVE.L (a0)+,d0
LEA dequant(pc),a6
ROL.L #8,d0 ; scale factor + r[0] into the low byte
MOVE.B d0,d4
ANDI.W #$00F0,d4 ; scale factor in bits 7:4 of d4
ADDA.W d4,a6 ; select lookup table row
;extract 9 residuals from d0, r[0] is in position already
MOVE.W #8,d7 ; can't MOVEQ, upper half in use
BSR.S DecSamp
; now the first bit of r[9] is in d0:0, pull the other two bits from the
; second longword of the slice
moveq #1,D4
and.b D0,D4 ; d4 = top bit of r[9]
move.l (A0)+,D0
add.l D0,D0 ; X = next stream bit
addx.b D4,D4 ; shift it into d4
add.l D0,D0
addx.b D4,D4 ; d4 = r[9]
add.b D4,D4 ; * 2 = word index into the table row
clr.w D7 ; exactly one sample, high word kept
bsr.b OnlyOnce ; index is already in d4, skip the extraction
rol.l #4,D0 ; r[10] into bits 3:1
; extract 10 residuals from d0
MOVE.W #9,d7
BRA.S DecSamp ; (ab)use RTS at end of DecSamp
;==============================================================================
; Decodes d7.w + 1 samples.
; Entry points:
;   DecLoop  - rotates the next residual into d0 bits 3:1, then decodes it
;   DecSamp  - 3-bit encoded residual is already in bits 3:1 of d0
;   OnlyOnce - residual index (residual * 2) is already in d4.w
; Registers usage:
;   d0 - slice bits
;   d2 = weights[-4]:weights[-3], d3 = weights[-2]:weights[-1] (updated)
;   d4 - residual index, copy of history[-1], then residual >> 4
;   d5 - history[-1] on entry, reconstructed sample (= new history[-1]) on exit
;   d6 - scratch
;   d7.w - sample loop counter (DBF), high word untouched
;   a2,a3,a4 - history[-4], history[-3], history[-2]
;   a5 - dequantized residual, clamp scratch, then the previous history[-1]
;   a1 - output pointer, a6 - 'dequant' row
; FIX: the history shift copied d1 into a4, but nothing in this version ever
;      loads d1; the previous history[-1] is kept in a5.
;==============================================================================
DecLoop:        ROL.L   #3,d0
; decode residual sample using lookup table, store in a5
DecSamp:        MOVEQ   #$E,d4
                AND.W   d0,d4                   ; extract encoded sample in d4
OnlyOnce:
                MOVEA.W (a6,d4.w),a5            ; decode with lookup table (sign-extended)
; calculate predicted sample, store in d5
                MOVE.W  d5,d4                   ; keep history[-1], d5 is overwritten below
                MULS.W  d3,d5                   ; history[-1] * weights[-1]
                SWAP    d3
                MOVE.W  a4,d6                   ; history[-2]
                MULS.W  d3,d6                   ; *= weights[-2]
                ADD.L   d6,d5
                MOVE.W  a3,d6                   ; history[-3]
                MULS.W  d2,d6                   ; *= weights[-3]
                ADD.L   d6,d5
                SWAP    d2
                MOVE.W  a2,d6                   ; history[-4]
                MULS.W  d2,d6                   ; *= weights[-4]
                ADD.L   d6,d5
                ASR.L   #6,d5                   ; >> 13 in two steps (immediate shift
                ASR.L   #7,d5                   ; count is limited to 8)
; add predicted sample to reconstructed residual with clamp to
; 16-bit signed range, store in d5
                ADD.L   a5,d5
                MOVE.W  d4,d6                   ; d6 = old history[-1]
                MOVE.L  a5,d4                   ; d4 = dequantized residual
                MOVEA.W d5,a5                   ; low word, sign-extended
                CMP.L   a5,d5                   ; equal <=> sum fits in 16 bits
                BEQ.S   clamped
                SGT     d5                      ; ??FF positive, ??00 negative
                EXT.W   d5                      ; FFFF positive, 0000 negative
                EORI.W  #$8000,d5               ; 7FFF positive, 8000 negative
; update LMS weights, reconstructed sample in d5, decoded residual in d4
clamped:        ASR.W   #4,d4                   ; scale residual signal down
                MOVEA.W d6,a5                   ; keep old history[-1], d6 is scratch again
; each weight: subtract delta first, then add it back twice unless the
; matching history sample is negative
                SUB.W   d4,d2
                MOVE.W  a2,d6
                BMI.S   hist3
                ADD.W   d4,d2
                ADD.W   d4,d2
hist3:          SWAP    d2
                SUB.W   d4,d2
                MOVE.W  a3,d6
                BMI.S   hist2
                ADD.W   d4,d2
                ADD.W   d4,d2
hist2:          SUB.W   d4,d3
                MOVE.W  a4,d6
                BMI.S   hist1
                ADD.W   d4,d3
                ADD.W   d4,d3
hist1:          SWAP    d3
                SUB.W   d4,d3
                MOVE.W  a5,d6                   ; old history[-1]
                BMI.S   update
                ADD.W   d4,d3
                ADD.W   d4,d3
; store output sample and update history vector; d5 already holds the new
; history[-1]
update:
                MOVE.W  d5,(a1)+
                MOVEA.W a3,a2
                MOVEA.W a4,a3
                MOVEA.W a5,a4                   ; old history[-1] (was: d1, never loaded)
                ADDA.W  sampoff(pc),a1          ; extra stride (0 mono, 2 stereo)
                DBF     d7,DecLoop
                RTS
; not very effective, should be stored in some register once registers
; usage is optimized
; sampoff: extra byte count added to the output pointer after every stored
; sample. The frame routines write 0 (mono) or 2 (stereo) here through a5.
sampoff: DC.W 0
; dequant: residual dequantization table. 16 rows of 8 words; 'slice' selects
; the row with (scale factor * 16) bytes and DecSamp indexes it with
; (3-bit residual * 2).
dequant: DC.W 1, -1, 3, -3, 5, -5, 7, -7
DC.W 5, -5, 18, -18, 32, -32, 49, -49
DC.W 16, -16, 53, -53, 95, -95, 147, -147
DC.W 34, -34, 113, -113, 203, -203, 315, -315
DC.W 63, -63, 210, -210, 378, -378, 588, -588
DC.W 104, -104, 345, -345, 621, -621, 966, -966
DC.W 158, -158, 528, -528, 950, -950, 1477, -1477
DC.W 228, -228, 760, -760, 1368, -1368, 2128, -2128
DC.W 316, -316, 1053, -1053, 1895, -1895, 2947, -2947
DC.W 422, -422, 1405, -1405, 2529, -2529, 3934, -3934
DC.W 548, -548, 1828, -1828, 3290, -3290, 5117, -5117
DC.W 696, -696, 2320, -2320, 4176, -4176, 6496, -6496
DC.W 868, -868, 2893, -2893, 5207, -5207, 8099, -8099
DC.W 1064, -1064, 3548, -3548, 6386, -6386, 9933, -9933
DC.W 1286, -1286, 4288, -4288, 7718, -7718, 12005, -12005
; NOTE(review): the text after the last value below looks like forum residue,
; not table data - confirm and strip before assembling.
DC.W 1536, -1536, 5120, -5120, 9216, -9216, 14336, -14336 @Tedy, post #79
@Krashan, post #82
@Krashan, post #82
Ostatecznie ta wersja wyciąga na 68020@28 1,19 czasu rzeczywistego (44,1 kHz stereo)
czy ten niewielki zapas mocy obliczeniowej wystarczy do płynnego odtwarzania, choćby i w prostym trybie Pauli 8-bit.
@Krashan, post #82
@snifferman, post #84
> To przy dobrych wiatrach Wicher 500i (68000) na 50Mhz (4.4Mips w sysinfo) powinien też dać radę.

Może być ciężko. Podczas gdy 68020 potrzebuje około 200 cykli zegara na jedną próbkę dźwięku, 68000 potrzebuje ich ponad 600. Niestety nie mam maszyny z 68000 na chodzie, żeby dokładnie sprawdzić. Na emulatorze "cycle exact" 68000/28 wyciąga szybkość x0,55. Zatem 50 MHz to za mało. Ale teraz już nie wierzę emulatorom w kwestii czasu wykonania kodu. Jeżeli masz taki konfig, możesz pobrać i sprawdzić. A dokładniej będziesz mógł, gdy wersja 1.2 pojawi się w recencie Aminetu.
> Czy tutaj chipset będzie miał znaczenie? A1200 ma szerszą magistralę pamięci jeśli dobrze pamiętam i pchanie danych do Chip będzie wydajniejsze?

Tak, jeżeli zrobię zapis do chip RAM 32-bitowymi słowami. Na A1200 jest to jeden dostęp do pamięci, na A500 dwa. Na razie jest zapisywane bajtami.
> i wycisnąć ostatnie soki z Pauli

Wiadomo. Ale zacząć trzeba od trybu 8-bitowego.
@alt_, post #85
> A da się zrobić tak, żeby dekodować co drugą próbkę (22kHz)?

Nie. Zdecydowana większość kodeków kompresujących w dziedzinie czasu (np. ADPCM, QOA, FLAC) musi mieć zdekodowane wszystkie próbki, ponieważ wartości próbek poprzednich są używane do dekodowania kolejnych. Inaczej kodeki pracujące w dziedzinie częstotliwości (MP3, Vorbis, AAC). Tutaj można po prostu nie dekodować górnej połowy pasma, z czego się często na Amidze korzysta. Ale coś za coś, przejście w dekoderze z dziedziny częstotliwości do czasu, jakie w takich kodekach musi być wykonane, też kosztuje grube takty procesora.