static
swscale
swscale_alpha
+ thumb
vaapi
vdpau
version3
elif enabled arm; then
- check_cflags -marm
+ enabled thumb && check_cflags -mthumb || check_cflags -marm
nogas=die
if check_cpp_condition stddef.h "defined __ARM_PCS_VFP"; then
"vmov d1, %2, %3 \n\t"
"lsls %6, %6, #1 \n\t"
"and %0, %5, #1<<31 \n\t"
+ "it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %1, %5, #1<<31 \n\t"
+ "it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %2, %5, #1<<31 \n\t"
+ "it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"vmov d4, %0, %1 \n\t"
"and %3, %5, #1<<31 \n\t"
lsl r3, lr, #1
ldrh r12, [r0, r3]
subs r2, r2, #1
+ it gt
ldrbgt lr, [r1], #1
add r12, r12, #1
strh r12, [r0, r3]
mov r11, r10
ldrb r10, [r4], #1 @ band_start_tab[band++]
subs r9, r9, r5 @ - floor
+ it lt
movlt r9, #0
cmp r10, r3 @ - end
and r9, r9, r8 @ & 0x1fe0
+ ite gt
subgt r8, r3, r11
suble r8, r10, r11
add r9, r9, r5 @ + floor => m
function ff_ac3_exponent_min_neon, export=1
cmp r1, #0
+ it eq
bxeq lr
push {lr}
mov r12, #256
# define ELF @
#endif
+#if CONFIG_THUMB
+# define A @
+# define T
+#else
+# define A
+# define T @
+#endif
+
.syntax unified
+T .thumb
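The `A` and `T` markers defined above act as column guards: in a Thumb build `A` expands to `@`, turning the rest of its line into a comment, and `T` expands to nothing; an ARM build reverses the two. One source line can then carry an ARM-only encoding with its Thumb-2 replacement alongside. A sketch of the idiom, assuming these definitions (post-indexed loads with a register offset have no Thumb-2 encoding):

A   ldr r4, [r1], r2 @ ARM only: post-indexed load in one instruction
T   ldr r4, [r1] @ Thumb-2 only: same effect in two steps
T   add r1, r1, r2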
.macro require8 val=1
ELF .eabi_attribute 24, \val
#endif
.endm
+.macro ldr_pre rt, rn, rm:vararg
+A ldr \rt, [\rn, \rm]!
+T add \rn, \rn, \rm
+T ldr \rt, [\rn]
+.endm
+
+.macro ldr_post rt, rn, rm:vararg
+A ldr \rt, [\rn], \rm
+T ldr \rt, [\rn]
+T add \rn, \rn, \rm
+.endm
+
+.macro ldrd_reg rt, rt2, rn, rm
+A ldrd \rt, \rt2, [\rn, \rm]
+T add \rt, \rn, \rm
+T ldrd \rt, \rt2, [\rt]
+.endm
+
+.macro ldrd_post rt, rt2, rn, rm
+A ldrd \rt, \rt2, [\rn], \rm
+T ldrd \rt, \rt2, [\rn]
+T add \rn, \rn, \rm
+.endm
+
+.macro ldrh_pre rt, rn, rm
+A ldrh \rt, [\rn, \rm]!
+T add \rn, \rn, \rm
+T ldrh \rt, [\rn]
+.endm
+
+.macro ldrh_dpre rt, rn, rm
+A ldrh \rt, [\rn, -\rm]!
+T sub \rn, \rn, \rm
+T ldrh \rt, [\rn]
+.endm
+
+.macro ldrh_post rt, rn, rm
+A ldrh \rt, [\rn], \rm
+T ldrh \rt, [\rn]
+T add \rn, \rn, \rm
+.endm
+
+.macro str_post rt, rn, rm:vararg
+A str \rt, [\rn], \rm
+T str \rt, [\rn]
+T add \rn, \rn, \rm
+.endm
+
+.macro strb_post rt, rn, rm:vararg
+A strb \rt, [\rn], \rm
+T strb \rt, [\rn]
+T add \rn, \rn, \rm
+.endm
+
+.macro strd_post rt, rt2, rn, rm
+A strd \rt, \rt2, [\rn], \rm
+T strd \rt, \rt2, [\rn]
+T add \rn, \rn, \rm
+.endm
+
+.macro strh_pre rt, rn, rm
+A strh \rt, [\rn, \rm]!
+T add \rn, \rn, \rm
+T strh \rt, [\rn]
+.endm
+
+.macro strh_dpre rt, rn, rm
+A strh \rt, [\rn, -\rm]!
+T sub \rn, \rn, \rm
+T strh \rt, [\rn]
+.endm
+
+.macro strh_post rt, rn, rm
+A strh \rt, [\rn], \rm
+T strh \rt, [\rn]
+T add \rn, \rn, \rm
+.endm
+
+.macro strh_dpost rt, rn, rm
+A strh \rt, [\rn], -\rm
+T strh \rt, [\rn]
+T sub \rn, \rn, \rm
+.endm
+
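The macros above wrap each pre/post-indexed and register-offset load/store form that exists in ARM mode but not in Thumb-2, so call sites stay mode-neutral. Hypothetical usage, registers illustrative: copy one 8-byte row while advancing both pointers by the stride in r2.

    ldrd_post r4, r5, r1, r2 @ r4:r5 = [r1], then r1 += r2
    strd_post r4, r5, r0, r2 @ [r0] = r4:r5, then r0 += r2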
#if HAVE_VFP_ARGS
.eabi_attribute 28, 1
# define VFP
add r5, r2, #256*4-16 @ cf1
sub r1, r1, #12
cmp r3, #32
+ ite eq
moveq r6, #256/32
movne r6, #256/64
NOVFP vldr s0, [sp, #16] @ scale
and r9, r5, r14
and r10, r6, r14
and r11, r7, r14
+ it eq
andeq r14, r14, r14, \rnd #1
add r8, r8, r10
add r9, r9, r11
ldr r12, =0xfcfcfcfc >> 2
+ itt eq
addeq r8, r8, r14
addeq r9, r9, r14
and r4, r12, r4, lsr #2
mvn r5, r5
mvn r7, r7
tst r6, #0x100
+ it ne
movne r6, r5, lsr #24
tst r8, #0x100
+ it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #4] /* moved from [A] */
mvn r5, r5
mvn r7, r7
tst r6, #0x100
+ it ne
movne r6, r5, lsr #24
tst r8, #0x100
+ it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
ldr r4, [r1, #4] /* moved from [B] */
mvn r5, r5
mvn r7, r7
tst r6, #0x100
+ it ne
movne r6, r5, lsr #24
tst r8, #0x100
+ it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #12] /* moved from [D] */
mvn r5, r5
mvn r7, r7
tst r6, #0x100
+ it ne
movne r6, r5, lsr #24
tst r8, #0x100
+ it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
add r0, r0, #16 /* moved from [E] */
ldr r5, [r1, #4]
ldr r6, [r1, #8]
ldr r7, [r1, #12]
- ldr r4, [r1], r2
+ ldr_post r4, r1, r2
strd r6, r7, [r0, #8]
ldr r9, [r1, #4]
- strd r4, r5, [r0], r2
+ strd_post r4, r5, r0, r2
ldr r10, [r1, #8]
ldr r11, [r1, #12]
- ldr r8, [r1], r2
+ ldr_post r8, r1, r2
strd r10, r11, [r0, #8]
subs r3, r3, #2
- strd r8, r9, [r0], r2
+ strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11}
push {r4-r7}
1:
ldr r5, [r1, #4]
- ldr r4, [r1], r2
+ ldr_post r4, r1, r2
ldr r7, [r1, #4]
- strd r4, r5, [r0], r2
- ldr r6, [r1], r2
+ strd_post r4, r5, r0, r2
+ ldr_post r6, r1, r2
subs r3, r3, #2
- strd r6, r7, [r0], r2
+ strd_post r6, r7, r0, r2
bne 1b
pop {r4-r7}
ldr r5, [r1, #4]
ldr r7, [r1, #5]
lsr r6, r4, #8
- ldr r8, [r1, r2]!
+ ldr_pre r8, r1, r2
orr r6, r6, r5, lsl #24
ldr r9, [r1, #4]
ldr r11, [r1, #5]
uhadd8 r9, r9, r11
and r6, r6, r12
uadd8 r8, r8, r14
- strd r4, r5, [r0], r2
+ strd_post r4, r5, r0, r2
uadd8 r9, r9, r6
- strd r8, r9, [r0], r2
+ strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11, pc}
orr r12, r12, r12, lsl #16
ldr r4, [r1]
ldr r5, [r1, #4]
- ldr r6, [r1, r2]!
+ ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
uhadd8 r9, r5, r7
eor r11, r5, r7
and r10, r10, r12
- ldr r4, [r1, r2]!
+ ldr_pre r4, r1, r2
uadd8 r8, r8, r10
and r11, r11, r12
uadd8 r9, r9, r11
eor r7, r5, r7
uadd8 r10, r10, r6
and r7, r7, r12
- ldr r6, [r1, r2]!
+ ldr_pre r6, r1, r2
uadd8 r11, r11, r7
- strd r8, r9, [r0], r2
+ strd_post r8, r9, r0, r2
ldr r7, [r1, #4]
- strd r10, r11, [r0], r2
+ strd_post r10, r11, r0, r2
bne 1b
pop {r4-r11}
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r7, [r1, #5]
- ldr r8, [r1, r2]!
+ ldr_pre r8, r1, r2
ldr r9, [r1, #4]
ldr r14, [r1, #5]
add r1, r1, r2
push {r4-r9, lr}
ldr r4, [r1]
ldr r5, [r1, #4]
- ldr r6, [r1, r2]!
+ ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
uhadd8 r8, r4, r6
- ldr r4, [r1, r2]!
+ ldr_pre r4, r1, r2
uhadd8 r9, r5, r7
ldr r5, [r1, #4]
uhadd8 r12, r4, r6
- ldr r6, [r1, r2]!
+ ldr_pre r6, r1, r2
uhadd8 r14, r5, r7
ldr r7, [r1, #4]
stm r0, {r8,r9}
orr lr, lr, lr, lsl #16
ldrd r4, r5, [r0]
ldr r10, [r1, #4]
- ldr r9, [r1], r2
+ ldr_post r9, r1, r2
subs r3, r3, #2
1:
pld [r1, r2]
eor r8, r4, r9
uhadd8 r4, r4, r9
eor r12, r5, r10
- ldrd r6, r7, [r0, r2]
+ ldrd_reg r6, r7, r0, r2
uhadd8 r5, r5, r10
and r8, r8, lr
ldr r10, [r1, #4]
and r12, r12, lr
uadd8 r4, r4, r8
- ldr r9, [r1], r2
+ ldr_post r9, r1, r2
eor r8, r6, r9
uadd8 r5, r5, r12
pld [r1, r2, lsl #1]
eor r12, r7, r10
uhadd8 r6, r6, r9
- strd r4, r5, [r0], r2
+ strd_post r4, r5, r0, r2
uhadd8 r7, r7, r10
beq 2f
and r8, r8, lr
- ldrd r4, r5, [r0, r2]
+ ldrd_reg r4, r5, r0, r2
uadd8 r6, r6, r8
ldr r10, [r1, #4]
and r12, r12, lr
subs r3, r3, #2
uadd8 r7, r7, r12
- ldr r9, [r1], r2
- strd r6, r7, [r0], r2
+ ldr_post r9, r1, r2
+ strd_post r6, r7, r0, r2
b 1b
2:
and r8, r8, lr
and r12, r12, lr
uadd8 r6, r6, r8
uadd8 r7, r7, r12
- strd r6, r7, [r0], r2
+ strd_post r6, r7, r0, r2
pop {r4-r10, pc}
endfunc
orr r6, r8, r5, lsl #8
orr r7, r4, lr, lsl #8
subs r3, r3, #1
- strd r6, r7, [r1], r2
+ strd_post r6, r7, r1, r2
bgt 1b
pop {r4-r8,pc}
endfunc
push {r4-r8, lr}
mov lr, #8
1:
- ldrd r4, r5, [r1], r2
+ ldrd_post r4, r5, r1, r2
subs lr, lr, #1
uxtb16 r6, r4
uxtb16 r4, r4, ror #8
push {r4-r9, lr}
mov lr, #8
1:
- ldrd r4, r5, [r1], r3
- ldrd r6, r7, [r2], r3
+ ldrd_post r4, r5, r1, r3
+ ldrd_post r6, r7, r2, r3
uxtb16 r8, r4
uxtb16 r4, r4, ror #8
uxtb16 r9, r6
push {r4-r9, lr}
mov r0, #0
mov lr, #0
- ldrd r4, r5, [r1], r3
+ ldrd_post r4, r5, r1, r3
1:
subs r12, r12, #2
ldr r7, [r2, #4]
- ldr r6, [r2], r3
- ldrd r8, r9, [r1], r3
+ ldr_post r6, r2, r3
+ ldrd_post r8, r9, r1, r3
usada8 r0, r4, r6, r0
pld [r2, r3]
usada8 lr, r5, r7, lr
ldr r7, [r2, #4]
- ldr r6, [r2], r3
+ ldr_post r6, r2, r3
beq 2f
- ldrd r4, r5, [r1], r3
+ ldrd_post r4, r5, r1, r3
usada8 r0, r8, r6, r0
pld [r2, r3]
usada8 lr, r9, r7, lr
ldr r7, [r0, #12]
usada8 r2, r6, lr, r2
beq 2f
- ldr r4, [r0, r1]!
+ ldr_pre r4, r0, r1
usada8 r3, r7, lr, r3
bgt 1b
2:
2: vst1.32 {d2-d3}, [r3, :128]!
vst1.32 {d0-d1}, [r12,:128]!
+ it lt
bxlt lr
3: vld1.32 {d2-d3}, [r1,:128]
2: vst1.32 {q2},[r0,:128]!
vst1.32 {q3},[r0,:128]!
ands len, len, #15
+ it eq
bxeq lr
3: vld1.32 {q0},[r1,:128]!
vmul.f32 q0, q0, q8
2: vst1.32 {q8},[r0,:128]!
vst1.32 {q9},[r0,:128]!
ands r3, r3, #7
+ it eq
popeq {pc}
3: vld1.32 {q0},[r1,:128]!
ldr r12, [r2], #4
1:
subs r3, r3, #16
vmul.f32 s12, s4, s12
+ itttt ge
vldmiage r1!, {s16-s19}
vldmiage r2!, {s24-s27}
vldmiage r1!, {s20-s23}
vldmiage r2!, {s28-s31}
+ it ge
vmulge.f32 s24, s16, s24
vstmia r0!, {s8-s11}
vstmia r0!, {s12-s15}
+ it ge
vmulge.f32 s28, s20, s28
+ itttt gt
vldmiagt r1!, {s0-s3}
vldmiagt r2!, {s8-s11}
vldmiagt r1!, {s4-s7}
vldmiagt r2!, {s12-s15}
+ ittt ge
vmulge.f32 s8, s0, s8
vstmiage r0!, {s24-s27}
vstmiage r0!, {s28-s31}
vmul.f32 s11, s0, s11
1:
subs r3, r3, #16
+ it ge
vldmdbge r2!, {s16-s19}
vmul.f32 s12, s7, s12
+ it ge
vldmiage r1!, {s24-s27}
vmul.f32 s13, s6, s13
+ it ge
vldmdbge r2!, {s20-s23}
vmul.f32 s14, s5, s14
+ it ge
vldmiage r1!, {s28-s31}
vmul.f32 s15, s4, s15
+ it ge
vmulge.f32 s24, s19, s24
+ it gt
vldmdbgt r2!, {s0-s3}
+ it ge
vmulge.f32 s25, s18, s25
vstmia r0!, {s8-s13}
+ it ge
vmulge.f32 s26, s17, s26
+ it gt
vldmiagt r1!, {s8-s11}
+ itt ge
vmulge.f32 s27, s16, s27
vmulge.f32 s28, s23, s28
+ it gt
vldmdbgt r2!, {s4-s7}
+ it ge
vmulge.f32 s29, s22, s29
vstmia r0!, {s14-s15}
+ ittt ge
vmulge.f32 s30, s21, s30
vmulge.f32 s31, s20, s31
vmulge.f32 s8, s3, s8
+ it gt
vldmiagt r1!, {s12-s15}
+ itttt ge
vmulge.f32 s9, s2, s9
vmulge.f32 s10, s1, s10
vstmiage r0!, {s24-s27}
vmulge.f32 s11, s0, s11
+ it ge
vstmiage r0!, {s28-s31}
bgt 1b
function ff_float_to_int16_interleave_neon, export=1
cmp r3, #2
+ itt lt
ldrlt r1, [r1]
blt ff_float_to_int16_neon
bne 4f
vst1.64 {d3}, [r8], ip
vst1.64 {d7}, [r8], ip
subs r3, r3, #4
+ it eq
popeq {r4-r8,pc}
cmp r3, #4
add r0, r0, #8
vst1.32 {d23[1]}, [r8], ip
8: subs r3, r3, #2
add r0, r0, #4
+ it eq
popeq {r4-r8,pc}
@ 1 channel
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
+ it eq
popeq {r4-r8,pc}
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vmov r5, r6, s2, s3
vmov r7, r8, s4, s5
vmov ip, lr, s6, s7
+ it gt
vldmiagt r1!, {s16-s23}
ssat r4, #16, r4
ssat r3, #16, r3
ssat r5, #16, r5
pkhbt r3, r3, r4, lsl #16
pkhbt r4, r5, r6, lsl #16
+ itttt gt
vcvtgt.s32.f32 s0, s16
vcvtgt.s32.f32 s1, s17
vcvtgt.s32.f32 s2, s18
vcvtgt.s32.f32 s3, s19
+ itttt gt
vcvtgt.s32.f32 s4, s20
vcvtgt.s32.f32 s5, s21
vcvtgt.s32.f32 s6, s22
pld [r1]
pld [r1, r2]
- muls r7, r4, r5
+A muls r7, r4, r5
+T mul r7, r4, r5
+T cmp r7, #0
rsb r6, r7, r5, lsl #3
rsb ip, r7, r4, lsl #3
sub r4, r7, r4, lsl #3
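The `muls` split is needed because Thumb-2 has no flag-setting multiply over three distinct registers: its only flag-setting form is the 16-bit `muls Rdm, Rn, Rdm`, whose destination must equal a source operand, and the 32-bit `mul` never sets flags. The replacement recreates the N and Z flags explicitly:

A   muls r7, r4, r5 @ ARM: MULS allows three distinct registers
T   mul r7, r4, r5 @ Thumb-2: 32-bit MUL does not set flags...
T   cmp r7, #0 @ ...so compare to recover N and Z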
pld [r1]
pld [r1, r2]
- muls r7, r4, r5
+A muls r7, r4, r5
+T mul r7, r4, r5
+T cmp r7, #0
rsb r6, r7, r5, lsl #3
rsb ip, r7, r4, lsl #3
sub r4, r7, r4, lsl #3
pop {r4-r6, pc}
2:
.ifc \type,put
- ldrh r5, [r1], r2
- strh r5, [r0], r2
- ldrh r6, [r1], r2
- strh r6, [r0], r2
+ ldrh_post r5, r1, r2
+ strh_post r5, r0, r2
+ ldrh_post r6, r1, r2
+ strh_post r6, r0, r2
.else
vld1.16 {d16[0]}, [r1], r2
vld1.16 {d16[1]}, [r1], r2
ldr ip, [sp]
tst r2, r2
ldr ip, [ip]
+ it ne
tstne r3, r3
vmov.32 d24[0], ip
and ip, ip, ip, lsl #16
+ it eq
bxeq lr
ands ip, ip, ip, lsl #8
+ it lt
bxlt lr
.endm
- .macro align_push_regs
- and ip, sp, #15
- add ip, ip, #32
- sub sp, sp, ip
- vst1.64 {d12-d15}, [sp,:128]
- sub sp, sp, #32
- vst1.64 {d8-d11}, [sp,:128]
- .endm
-
- .macro align_pop_regs
- vld1.64 {d8-d11}, [sp,:128]!
- vld1.64 {d12-d15}, [sp,:128], ip
- .endm
-
.macro h264_loop_filter_luma
vdup.8 q11, r2 @ alpha
vmovl.u8 q12, d24
vld1.64 {d18,d19}, [r0,:128], r1
vld1.64 {d16,d17}, [r0,:128], r1
- align_push_regs
+ vpush {d8-d15}
h264_loop_filter_luma
vst1.64 {d0, d1}, [r0,:128], r1
vst1.64 {d10,d11}, [r0,:128]
- align_pop_regs
+ vpop {d8-d15}
bx lr
endfunc
transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
- align_push_regs
+ vpush {d8-d15}
h264_loop_filter_luma
vst1.32 {d1[1]}, [r0], r1
vst1.32 {d11[1]}, [r0], r1
- align_pop_regs
+ vpop {d8-d15}
bx lr
endfunc
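Replacing `align_push_regs`/`align_pop_regs` with `vpush`/`vpop` drops the manual stack alignment: the removed macros aligned sp only so the `:128` alignment hints on `vst1.64`/`vld1.64` were legal, whereas `vpush {d8-d15}`/`vpop {d8-d15}` carry no alignment requirement and assemble in both ARM and Thumb-2. Sketch:

    vpush {d8-d15} @ sp -= 64, store d8-d15, no alignment dance
    @ ... code clobbering q4-q7 ...
    vpop {d8-d15} @ restore d8-d15 and pop in one instruction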
vrhadd.u8 d11, d11, d7
sub r0, r0, r2, lsl #3
.endif
+
vst1.64 {d12}, [r0,:64], r2
vst1.64 {d13}, [r0,:64], r2
vst1.64 {d14}, [r0,:64], r2
\type\()_h264_qpel8_mc11:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #64
mov r0, sp
sub r1, r1, #2
mov ip, #8
vpush {d8-d15}
bl put_h264_qpel8_h_lowpass_neon
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
mov r3, r2
add ip, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #8
bl \type\()_h264_qpel8_v_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r11, pc}
endfunc
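Two Thumb-2 restrictions drive this prologue/epilogue change. First, `bic` may not write sp in Thumb-2, so the aligned value is computed in a scratch register and then moved to sp (a plain `mov sp, Rn` is allowed). Second, `ldrd r0, [r11], #8` adds post-index writeback, so the epilogue's `mov sp, r11` yields the same sp that `add sp, r11, #8` used to produce. A sketch of the prologue pattern, assuming the `A`/`T` markers from asm.S:

    mov r11, sp
A   bic sp, sp, #15 @ ARM: align sp in place
T   bic r0, r11, #15 @ Thumb-2: align a copy...
T   mov sp, r0 @ ...then move it to sp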
\type\()_h264_qpel8_mc21:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #(8*8+16*12)
sub r1, r1, #2
mov r3, #8
vpush {d8-d15}
bl put_h264_qpel8_h_lowpass_neon
mov r4, r0
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
sub r2, r4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r4, r10, r11, pc}
endfunc
\type\()_h264_qpel8_mc12:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #(8*8+16*12)
sub r1, r1, r2, lsl #1
mov r3, r2
vpush {d8-d15}
bl put_h264_qpel8_v_lowpass_neon
mov r4, r0
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
sub r1, r1, r3, lsl #1
sub r1, r1, #2
sub r2, r4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r4, r10, r11, pc}
endfunc
function ff_\type\()_h264_qpel8_mc22_neon, export=1
push {r4, r10, r11, lr}
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r4, r11, #15
+T mov sp, r4
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
\type\()_h264_qpel16_mc11:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #256
mov r0, sp
sub r1, r1, #2
mov r3, #16
vpush {d8-d15}
bl put_h264_qpel16_h_lowpass_neon
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
mov r3, r2
add ip, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #16
bl \type\()_h264_qpel16_v_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r4, r11, pc}
endfunc
\type\()_h264_qpel16_mc21:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #(16*16+16*12)
sub r1, r1, #2
mov r0, sp
vpush {d8-d15}
bl put_h264_qpel16_h_lowpass_neon_packed
mov r4, r0
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r4-r5, r9-r11, pc}
endfunc
\type\()_h264_qpel16_mc12:
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
sub sp, sp, #(16*16+16*12)
sub r1, r1, r2, lsl #1
mov r0, sp
vpush {d8-d15}
bl put_h264_qpel16_v_lowpass_neon_packed
mov r4, r0
- ldrd r0, [r11]
+ ldrd r0, [r11], #8
sub r1, r1, r3, lsl #1
sub r1, r1, #2
mov r2, r3
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
- add sp, r11, #8
+ mov sp, r11
pop {r4-r5, r9-r11, pc}
endfunc
push {r4, r9-r11, lr}
lowpass_const r3
mov r11, sp
- bic sp, sp, #15
+A bic sp, sp, #15
+T bic r4, r11, #15
+T mov sp, r4
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
blt 2f
ldrsh lr, [r1]
add r0, r0, r4
+ it ne
movne lr, #0
cmp lr, #0
- adrne lr, ff_h264_idct_dc_add_neon
- adreq lr, ff_h264_idct_add_neon
+ ite ne
+ adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
blx lr
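`blx` to a register selects the target instruction set from bit 0 of the address, and `adr` on these labels produces an even address; adding `CONFIG_THUMB`, which is 1 in a Thumb build and 0 otherwise, sets the interworking bit so the indirect call stays in Thumb state. Sketch, label hypothetical:

    adr lr, handler_func + CONFIG_THUMB @ bit 0 set iff built as Thumb
    blx lr @ branches in the matching state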
2: subs ip, ip, #1
add r1, r1, #32
add r0, r0, r4
cmp r8, #0
ldrsh r8, [r1]
- adrne lr, ff_h264_idct_add_neon
- adreq lr, ff_h264_idct_dc_add_neon
+ iteet ne
+ adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0
blxne lr
subs ip, ip, #1
add r1, r3, r12, lsl #5
cmp r8, #0
ldrsh r8, [r1]
- adrne lr, ff_h264_idct_add_neon
- adreq lr, ff_h264_idct_dc_add_neon
+ iteet ne
+ adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0
blxne lr
add r12, r12, #1
cmp r12, #4
+ itt eq
moveq r12, #16
moveq r4, r9
cmp r12, #20
blt 2f
ldrsh lr, [r1]
add r0, r0, r4
+ it ne
movne lr, #0
cmp lr, #0
- adrne lr, ff_h264_idct8_dc_add_neon
- adreq lr, ff_h264_idct8_add_neon
+ ite ne
+ adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
blx lr
2: subs r12, r12, #4
add r1, r1, #128
__asm__ (
"mov %0, %2 \n\t"
"cmp %1, %2 \n\t"
+ "itt gt \n\t"
"movgt %0, %1 \n\t"
"movgt %1, %2 \n\t"
"cmp %1, %3 \n\t"
+ "it le \n\t"
"movle %1, %3 \n\t"
"cmp %0, %1 \n\t"
+ "it gt \n\t"
"movgt %0, %1 \n\t"
: "=&r"(m), "+r"(a)
: "r"(b), "r"(c)
vadd.f32 d17, d17, d3 @ in2u+in1d -I
1:
vmul.f32 d7, d0, d21 @ I*s
- ldr r10, [r3, lr, lsr #1]
+A ldr r10, [r3, lr, lsr #1]
+T lsr r10, lr, #1
+T ldr r10, [r3, r10]
vmul.f32 d6, d1, d20 @ -R*c
ldr r6, [r3, #4]!
vmul.f32 d4, d1, d21 @ -R*s
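Thumb-2 register-offset addressing accepts only an index shifted left by 0 to 3, so the `lsr #1` index above has no Thumb-2 encoding and must be materialized first:

A   ldr r10, [r3, lr, lsr #1] @ ARM: any immediate-shifted index register
T   lsr r10, lr, #1 @ Thumb-2: compute the index...
T   ldr r10, [r3, r10] @ ...then use a plain register offset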
sum8 r8, r9, r1, r0, r10, r11, r12, lr
sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32
round r10, r8, r9
- strh r10, [r3], r4
+ strh_post r10, r3, r4
mov lr, #15
1:
round r10, r8, r9
adds r8, r8, r4
adc r9, r9, r7
- strh r10, [r3], r12
+ strh_post r10, r3, r12
round r11, r8, r9
subs lr, lr, #1
- strh r11, [r5], -r12
+ strh_dpost r11, r5, r12
bgt 1b
sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33
.macro dequant_t dst, src, mul, add, tmp
rsbs \tmp, ip, \src, asr #16
+ it gt
addgt \tmp, \add, #0
+ it lt
rsblt \tmp, \add, #0
+ it ne
smlatbne \dst, \src, \mul, \tmp
.endm
.macro dequant_b dst, src, mul, add, tmp
rsbs \tmp, ip, \src, lsl #16
+ it gt
addgt \tmp, \add, #0
+ it lt
rsblt \tmp, \add, #0
+ it ne
smlabbne \dst, \src, \mul, \tmp
.endm
strh lr, [r0], #2
subs r3, r3, #8
+ it gt
ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
bgt 1b
adds r3, r3, #2
+ it le
pople {r4-r9,pc}
2:
ldrsh r9, [r0, #0]
ldrsh lr, [r0, #2]
mov r8, r2
cmp r9, #0
+ it lt
rsblt r8, r2, #0
+ it ne
smlabbne r9, r9, r1, r8
mov r8, r2
cmp lr, #0
+ it lt
rsblt r8, r2, #0
+ it ne
smlabbne lr, lr, r1, r8
strh r9, [r0], #2
strh lr, [r0], #2
subs r3, r3, #16
vst1.16 {q0}, [r1,:128]!
vst1.16 {q8}, [r1,:128]!
+ it le
bxle lr
cmp r3, #8
bgt 1b
ldr r6, [r0, #AC_PRED]
add lr, r0, #INTER_SCANTAB_RASTER_END
cmp r6, #0
+ it ne
movne r12, #63
bne 1f
ldr r12, [r12, r2, lsl #2]
ldrsh r4, [r1]
cmp r5, #0
mov r5, r1
+ it ne
movne r2, #0
bne 2f
cmp r2, #4
+ it ge
addge r0, r0, #4
sub r2, r3, #1
ldr r6, [r0, #Y_DC_SCALE]
vst1.32 {d22}, [r5,:64]
cmp r6, #0
+ it eq
popeq {r4-r8,pc}
vmul.f32 d22, d22, d18
ldr r11, [r12, #offW7] @ R11=W7
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- teq r2, #0 @ if null avoid muls
- mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ teq r2, #0 @ if null avoid muls
+ itttt ne
+ mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
@@ MAC16(b3, -W1, row[7]);
@@ MAC16(b1, -W5, row[7]);
mov r3, r3, asr #16 @ R3=ROWr16[5]
- teq r3, #0 @ if null avoid muls
+ teq r3, #0 @ if null avoid muls
+ it ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
mov r4, r4, asr #16 @ R4=ROWr16[7]
+ itttt ne
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5]
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
@@ R3 is free now
- teq r4, #0 @ if null avoid muls
+ teq r4, #0 @ if null avoid muls
+ itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
+ it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
@@ R4 is free now
__end_b_evaluation:
@@ a2 -= W4*row[4]
@@ a3 += W4*row[4]
ldrsh r11, [r14, #8] @ R11=ROWr16[4]
- teq r11, #0 @ if null avoid muls
+ teq r11, #0 @ if null avoid muls
+ it ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
ldrsh r9, [r14, #12] @ R9=ROWr16[6]
+ itttt ne
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no longer useful, so save W2*ROWr16[6] in it instead
- teq r9, #0 @ if null avoid muls
+ teq r9, #0 @ if null avoid muls
+ itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+ itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
ldrsh r2, [r14, #48]
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if 0, then avoid muls
+ itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
@@ MAC16(b1, -W5, col[7x8]);
ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
teq r3, #0 @ if 0 then avoid muls
+ itttt ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
+ it ne
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1
@@ R3 is free now
teq r4, #0 @ if 0 then avoid muls
+ itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
+ it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
@@ R4 is free now
__end_b_evaluation2:
@@ a3 += W4*row[4]
ldrsh r11, [r14, #64] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls
+ itttt ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
ldrsh r9, [r14, #96] @ R9=ROWr16[6]
+ it ne
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no longer useful, so save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls
+ itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+ itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
__end_a_evaluation2:
ldrd v1, [a1, #8]
ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */
orrs v1, v1, v2
+ itt eq
cmpeq v1, a4
cmpeq v1, a3, lsr #16
beq row_dc_only
ldmfd sp!, {a3, a4}
adds a2, a3, v1
mov a2, a2, lsr #20
+ it mi
orrmi a2, a2, #0xf000
add ip, a4, v2
mov ip, ip, asr #20
str a2, [a1]
subs a3, a3, v1
mov a2, a3, lsr #20
+ it mi
orrmi a2, a2, #0xf000
sub a4, a4, v2
mov a4, a4, asr #20
subs a2, a3, v3
mov a2, a2, lsr #20
+ it mi
orrmi a2, a2, #0xf000
sub ip, a4, v4
mov ip, ip, asr #20
str a2, [a1, #(16*1)]
adds a3, a3, v3
mov a2, a3, lsr #20
+ it mi
orrmi a2, a2, #0xf000
add a4, a4, v4
mov a4, a4, asr #20
adds a2, a3, v5
mov a2, a2, lsr #20
+ it mi
orrmi a2, a2, #0xf000
add ip, a4, v6
mov ip, ip, asr #20
str a2, [a1, #(16*2)]
subs a3, a3, v5
mov a2, a3, lsr #20
+ it mi
orrmi a2, a2, #0xf000
sub a4, a4, v6
mov a4, a4, asr #20
adds a2, a3, v7
mov a2, a2, lsr #20
+ it mi
orrmi a2, a2, #0xf000
add ip, a4, fp
mov ip, ip, asr #20
str a2, [a1, #(16*3)]
subs a3, a3, v7
mov a2, a3, lsr #20
+ it mi
orrmi a2, a2, #0xf000
sub a4, a4, fp
mov a4, a4, asr #20
.macro clip dst, src:vararg
movs \dst, \src
+ it mi
movmi \dst, #0
cmp \dst, #255
+ it gt
movgt \dst, #255
.endm
.macro aclip dst, src:vararg
adds \dst, \src
+ it mi
movmi \dst, #0
cmp \dst, #255
+ it gt
movgt \dst, #255
.endm
orr a2, a3, a4, lsl #8
rsb v2, lr, lr, lsl #3
ldmfd sp!, {a3, a4}
- strh a2, [v2, v1]!
+ strh_pre a2, v2, v1
sub a2, a3, v3
clip a2, a2, asr #20
sub ip, a4, v4
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
- strh a2, [v1, lr]!
+ strh_pre a2, v1, lr
add a3, a3, v3
clip a2, a3, asr #20
add a4, a4, v4
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4}
- strh a2, [v2, -lr]!
+ strh_dpre a2, v2, lr
add a2, a3, v5
clip a2, a2, asr #20
add ip, a4, v6
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
- strh a2, [v1, lr]!
+ strh_pre a2, v1, lr
sub a3, a3, v5
clip a2, a3, asr #20
sub a4, a4, v6
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4}
- strh a2, [v2, -lr]!
+ strh_dpre a2, v2, lr
add a2, a3, v7
clip a2, a2, asr #20
sub a4, a4, fp
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
- strh a2, [v2, -lr]
+ strh_dpre a2, v2, lr
ldr pc, [sp], #4
endfunc
ldr v1, [sp, #32]
sub a4, a4, v2
rsb v2, v1, v1, lsl #3
- ldrh ip, [v2, lr]!
+ ldrh_pre ip, v2, lr
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
strh a2, [v2]
ldmfd sp!, {a3, a4}
- ldrh ip, [lr, v1]!
+ ldrh_pre ip, lr, v1
sub a2, a3, v3
add a3, a3, v3
and v3, ip, #255
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
add a4, a4, v4
- ldrh ip, [v2, -v1]!
+ ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
strh a2, [v2]
ldmfd sp!, {a3, a4}
- ldrh ip, [lr, v1]!
+ ldrh_pre ip, lr, v1
add a2, a3, v5
sub a3, a3, v5
and v3, ip, #255
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
sub a4, a4, v6
- ldrh ip, [v2, -v1]!
+ ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
strh a2, [v2]
ldmfd sp!, {a3, a4}
- ldrh ip, [lr, v1]!
+ ldrh_pre ip, lr, v1
add a2, a3, v7
sub a3, a3, v7
and v3, ip, #255
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
sub a4, a4, fp
- ldrh ip, [v2, -v1]!
+ ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
ldr r3, [r0, #8] /* r3 = row[3,1] */
ldr r2, [r0] /* r2 = row[2,0] */
orrs lr, lr, ip
+ itt eq
cmpeq lr, r3
cmpeq lr, r2, lsr #16
beq 1f
pop {r1, r2}
idct_finish_shift_sat COL_SHIFT
- strb r4, [r1], r2
- strb r5, [r1], r2
- strb r6, [r1], r2
- strb r7, [r1], r2
- strb r11,[r1], r2
- strb r10,[r1], r2
- strb r9, [r1], r2
- strb r8, [r1], r2
+ strb_post r4, r1, r2
+ strb_post r5, r1, r2
+ strb_post r6, r1, r2
+ strb_post r7, r1, r2
+ strb_post r11,r1, r2
+ strb_post r10,r1, r2
+ strb_post r9, r1, r2
+ strb_post r8, r1, r2
sub r1, r1, r2, lsl #3
add ip, r3, ip, asr #COL_SHIFT
usat ip, #8, ip
add r4, r7, r4, asr #COL_SHIFT
- strb ip, [r1], r2
+ strb_post ip, r1, r2
ldrb ip, [r1, r2]
usat r4, #8, r4
ldrb r11,[r1, r2, lsl #2]
add r5, ip, r5, asr #COL_SHIFT
usat r5, #8, r5
- strb r4, [r1], r2
+ strb_post r4, r1, r2
ldrb r3, [r1, r2]
ldrb ip, [r1, r2, lsl #2]
- strb r5, [r1], r2
+ strb_post r5, r1, r2
ldrb r7, [r1, r2]
ldrb r4, [r1, r2, lsl #2]
add r6, r3, r6, asr #COL_SHIFT
usat r8, #8, r8
add lr, r4, lr, asr #COL_SHIFT
usat lr, #8, lr
- strb r6, [r1], r2
- strb r10,[r1], r2
- strb r9, [r1], r2
- strb r8, [r1], r2
- strb lr, [r1], r2
+ strb_post r6, r1, r2
+ strb_post r10,r1, r2
+ strb_post r9, r1, r2
+ strb_post r8, r1, r2
+ strb_post lr, r1, r2
sub r1, r1, r2, lsl #3
add r3, r0, r1, lsl #2
pld [r0, r1]
pld [r0, r1, lsl #1]
- pld [r3, -r1]
+A pld [r3, -r1]
pld [r3]
pld [r3, r1]
add r3, r3, r1, lsl #1
orrs r4, r4, r5
idct_col4_top
+ it eq
addeq r2, r2, #16
beq 1f
1: orrs r6, r6, r7
ldrd r4, [r2, #16]
+ it eq
addeq r2, r2, #16
beq 2f
2: orrs r4, r4, r5
ldrd r4, [r2, #16]
+ it eq
addeq r2, r2, #16
beq 3f
vadd.i32 q13, q13, q8
3: orrs r4, r4, r5
+ it eq
addeq r2, r2, #16
beq 4f
vst1.32 {q9}, [r2,:128]
subs r1, r1, #1
+ it eq
popeq {r4-r11,pc}
cmp r4, #0
+ itt eq
subeq r8, r8, #512*4
subeq r9, r9, #512*4
sub r5, r5, #512*4
#ifndef AVCODEC_ARM_VP56_ARITH_H
#define AVCODEC_ARM_VP56_ARITH_H
+#if CONFIG_THUMB
+# define A(x)
+# define T(x) x
+#else
+# define A(x) x
+# define T(x)
+#endif
+
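These are function-like variants of the `A`/`T` markers from asm.S: the assembly here lives inside C string literals, which the preprocessor cannot comment out line by line, so whole fragments are kept or dropped as macro arguments instead. They also absorb a syntax difference, since ARM divided syntax spells the conditional halfword load `ldrcsh` (condition before width) while UAL, which Thumb-2 requires, spells it `ldrhcs`. A sketch of the two spellings the macros select between, only one of which assembles in a given mode:

    ldrcsh r2, [r1], #2 @ ARM divided syntax: condition, then width
    ldrhcs r2, [r1], #2 @ UAL (Thumb-2): width, then condition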
#if HAVE_ARMV6 && HAVE_INLINE_ASM
#define vp56_rac_get_prob vp56_rac_get_prob_armv6
unsigned bit;
__asm__ ("adds %3, %3, %0 \n"
+ "itt cs \n"
"cmpcs %7, %4 \n"
- "ldrcsh %2, [%4], #2 \n"
+ A("ldrcsh %2, [%4], #2 \n")
+ T("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n"
+ T("itttt cs \n")
"rev16cs %2, %2 \n"
- "orrcs %1, %1, %2, lsl %3 \n"
+ T("lslcs %2, %2, %3 \n")
+ T("orrcs %1, %1, %2 \n")
+ A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n"
"cmp %1, %0, lsl #16 \n"
+ "ittte ge \n"
"subge %1, %1, %0, lsl #16 \n"
"subge %0, %5, %0 \n"
"movge %2, #1 \n"
unsigned tmp;
__asm__ ("adds %3, %3, %0 \n"
+ "itt cs \n"
"cmpcs %7, %4 \n"
- "ldrcsh %2, [%4], #2 \n"
+ A("ldrcsh %2, [%4], #2 \n")
+ T("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n"
+ T("itttt cs \n")
"rev16cs %2, %2 \n"
- "orrcs %1, %1, %2, lsl %3 \n"
+ T("lslcs %2, %2, %3 \n")
+ T("orrcs %1, %1, %2 \n")
+ A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n"
"lsl %2, %0, #16 \n"
lsl \cw, \cw, \t0
lsl \t0, \h, \t0
rsb \h, \pr, #256
+ it cs
ldrhcs \t1, [\buf], #2
smlabb \h, \t0, \pr, \h
+T itttt cs
rev16cs \t1, \t1
- orrcs \cw, \cw, \t1, lsl \bs
+A orrcs \cw, \cw, \t1, lsl \bs
+T lslcs \t1, \t1, \bs
+T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16
lsr \h, \h, #8
cmp \cw, \h, lsl #16
+ itt ge
subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h
.endm
adds \bs, \bs, \t0
lsl \cw, \cw, \t0
lsl \t0, \h, \t0
+ it cs
ldrhcs \t1, [\buf], #2
mov \h, #128
+ it cs
rev16cs \t1, \t1
add \h, \h, \t0, lsl #7
- orrcs \cw, \cw, \t1, lsl \bs
+A orrcs \cw, \cw, \t1, lsl \bs
+T ittt cs
+T lslcs \t1, \t1, \bs
+T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16
lsr \h, \h, #8
cmp \cw, \h, lsl #16
+ itt ge
subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h
.endm
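Same theme as the loads: a flexible second operand in Thumb-2 may only use an immediate shift, so `orrcs \cw, \cw, \t1, lsl \bs` (shift amount in a register) cannot be encoded and is split into a register-controlled `lslcs` plus a plain `orrcs`, with the IT block widened to cover the extra instruction. Sketch, registers illustrative:

A   orrcs r8, r8, r10, lsl r6 @ ARM: shift-by-register in the operand
T   itt cs
T   lslcs r10, r10, r6 @ Thumb-2: explicit register-controlled shift...
T   orrcs r8, r8, r10 @ ...then a plain ORR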
cmp r3, #0
ldr r11, [r5]
ldm r0, {r5-r7} @ high, bits, buf
+ it ne
pkhtbne r11, r11, r11, asr #16
ldr r8, [r0, #16] @ code_word
0:
adds r6, r6, r9
add r4, r4, #11
lsl r8, r8, r9
+ it cs
ldrhcs r10, [r7], #2
lsl r9, r5, r9
mov r5, #128
+ it cs
rev16cs r10, r10
add r5, r5, r9, lsl #7
- orrcs r8, r8, r10, lsl r6
+T ittt cs
+T lslcs r10, r10, r6
+T orrcs r8, r8, r10
+A orrcs r8, r8, r10, lsl r6
subcs r6, r6, #16
lsr r5, r5, #8
cmp r8, r5, lsl #16
movrel r10, zigzag_scan-1
+ itt ge
subge r8, r8, r5, lsl #16
subge r5, r9, r5
ldrb r10, [r10, r3]
+ it ge
rsbge r12, r12, #0
cmp r3, #16
strh r12, [r1, r10]
ldr r0, [sp]
ldr r9, [r0, #12]
cmp r7, r9
+ it hi
movhi r7, r9
stm r0, {r5-r7} @ high, bits, buf
str r8, [r0, #16] @ code_word
mov r12, #2
ldrb r0, [r4, #4]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r12, #1
ldrb r9, [lr, r5]
blt 4f
ldrb r0, [r4, #5]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r12, #1
ldrb r9, [lr, r5]
b 4f
mov r12, #5
mov r0, #159
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r12, r12, #1
ldrb r9, [lr, r5]
b 4f
mov r12, #7
mov r0, #165
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r12, r12, #2
ldrb r9, [lr, r5]
mov r0, #145
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r12, r12, #1
ldrb r9, [lr, r5]
b 4f
3:
ldrb r0, [r4, #8]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
addge r4, r4, #1
ldrb r9, [lr, r5]
+ ite ge
movge r12, #2
movlt r12, #0
ldrb r0, [r4, #9]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
mov r9, #8
+ it ge
addge r12, r12, #1
movrel r4, X(ff_vp8_dct_cat_prob)
lsl r9, r9, r12
lsl r1, r1, #1
rac_get_prob r5, r6, r7, r8, r0, r9, r10
ldrb r0, [r4], #1
+ it ge
addge r1, r1, #1
cmp r0, #0
bne 1b
add r4, r2, r4
add r4, r4, #22
rac_get_128 r5, r6, r7, r8, r9, r10
+ it ge
rsbge r12, r12, #0
smulbb r12, r12, r11
movrel r9, zigzag_scan-1
push {r4-r6,lr}
1:
subs r12, r12, #4
- ldr r4, [r2], r3
- ldr r5, [r2], r3
- ldr r6, [r2], r3
- ldr lr, [r2], r3
- str r4, [r0], r1
- str r5, [r0], r1
- str r6, [r0], r1
- str lr, [r0], r1
+ ldr_post r4, r2, r3
+ ldr_post r5, r2, r3
+ ldr_post r6, r2, r3
+ ldr_post lr, r2, r3
+ str_post r4, r0, r1
+ str_post r5, r0, r1
+ str_post r6, r0, r1
+ str_post lr, r0, r1
bgt 1b
pop {r4-r6,pc}
endfunc
int r;
__asm__ ("cmp %2, #2 \n\t"
"ldr %0, [%3, %2, lsl #2] \n\t"
+ "ite le \n\t"
"lsrle %0, %1, #1 \n\t"
"smmulgt %0, %0, %1 \n\t"
: "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
{
int x, y;
__asm__ ("adds %1, %R2, %Q2, lsr #31 \n\t"
+ "itet ne \n\t"
"mvnne %1, #1<<31 \n\t"
"moveq %0, %Q2 \n\t"
"eorne %0, %1, %R2, asr #31 \n\t"