avcodec/x86/mpegvideo: Don't duplicate register

Currently, several inline ASM blocks use a value as
an input operand and rax as a clobber register. The input
value is just moved into that register, which then serves
as the loop counter. This is wasteful, as one can use the
value's register directly as the loop counter.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt
2025-11-03 19:17:16 +01:00
parent 1fa8ffc1db
commit a9a23925df

View File

@@ -183,19 +183,19 @@ static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
block0 = block[0] * s->c_dc_scale;
/* XXX: only MPEG-1 */
quant_matrix = s->intra_matrix;
x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"movd %3, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
"movq (%2, %0), %%mm4 \n\t"
"movq 8(%2, %0), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
@@ -210,8 +210,8 @@ __asm__ volatile(
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw (%1, %0), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%1, %0), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
@@ -224,13 +224,14 @@ __asm__ volatile(
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"movq %%mm4, (%1, %0) \n\t"
"movq %%mm5, 8(%1, %0) \n\t"
"add $16, %%"FF_REG_a" \n\t"
"add $16, %0 \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
: "+r" (offset)
: "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
: "memory"
);
block[0]= block0;
}
@@ -246,19 +247,19 @@ static void dct_unquantize_mpeg1_inter_mmx(const MPVContext *s,
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
quant_matrix = s->inter_matrix;
x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"movd %3, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
"movq (%2, %0), %%mm4 \n\t"
"movq 8(%2, %0), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
@@ -277,8 +278,8 @@ __asm__ volatile(
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw (%1, %0), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%1, %0), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
@@ -291,13 +292,14 @@ __asm__ volatile(
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"movq %%mm4, (%1, %0) \n\t"
"movq %%mm5, 8(%1, %0) \n\t"
"add $16, %%"FF_REG_a" \n\t"
"add $16, %0 \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
: "+r" (offset)
: "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
: "memory"
);
}
@@ -320,17 +322,17 @@ static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
else
block0 = block[0] * s->c_dc_scale;
quant_matrix = s->intra_matrix;
x86_reg offset = -2 * nCoeffs;
__asm__ volatile(
"movd %2, %%mm6 \n\t"
"movd %3, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
"movq (%2, %0), %%mm4 \n\t"
"movq 8(%2, %0), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"movq %%mm0, %%mm2 \n\t"
@@ -343,13 +345,14 @@ __asm__ volatile(
"paddw %%mm3, %%mm1 \n\t" // so that a right-shift
"psraw $4, %%mm0 \n\t" // is equivalent to divide
"psraw $4, %%mm1 \n\t" // with rounding towards zero
"movq %%mm0, (%0, %%"FF_REG_a") \n\t"
"movq %%mm1, 8(%0, %%"FF_REG_a")\n\t"
"movq %%mm0, (%1, %0) \n\t"
"movq %%mm1, 8(%1, %0) \n\t"
"add $16, %%"FF_REG_a" \n\t"
"add $16, %0 \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
: "+r" (offset)
: "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
: "memory"
);
block[0]= block0;
//Note, we do not do mismatch control for intra as errors cannot accumulate
@@ -358,30 +361,27 @@ __asm__ volatile(
static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
av_assert2(s->block_last_index[n]>=0);
if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
else qscale <<= 1;
x86_reg qscale2 = s->q_scale_type ? ff_mpeg2_non_linear_qscale[qscale] : (unsigned)qscale << 1;
x86_reg offset = s->intra_scantable.raster_end[s->block_last_index[n]] << 1;
const void *quant_matrix = (const char*)s->inter_matrix + offset;
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
quant_matrix = s->inter_matrix;
__asm__ volatile(
"movd %k1, %%mm6 \n\t"
"lea (%2, %0), %1 \n\t"
"neg %0 \n\t"
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlq $48, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"FF_REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
"movq (%3, %0), %%mm4 \n\t"
"movq 8(%3, %0), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
@@ -400,8 +400,8 @@ __asm__ volatile(
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw (%1, %0), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%1, %0), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
"psrlw $5, %%mm0 \n\t"
"psrlw $5, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
@@ -412,12 +412,12 @@ __asm__ volatile(
"pandn %%mm1, %%mm5 \n\t"
"pxor %%mm4, %%mm7 \n\t"
"pxor %%mm5, %%mm7 \n\t"
"movq %%mm4, (%0, %%"FF_REG_a") \n\t"
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
"movq %%mm4, (%1, %0) \n\t"
"movq %%mm5, 8(%1, %0) \n\t"
"add $16, %%"FF_REG_a" \n\t"
"add $16, %0 \n\t"
"jng 1b \n\t"
"movd 124(%0, %3), %%mm0 \n\t"
"movd 124(%2), %%mm0 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $32, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
@@ -427,10 +427,11 @@ __asm__ volatile(
"pslld $31, %%mm7 \n\t"
"psrlq $15, %%mm7 \n\t"
"pxor %%mm7, %%mm0 \n\t"
"movd %%mm0, 124(%0, %3) \n\t"
"movd %%mm0, 124(%2) \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
: "%"FF_REG_a, "memory"
: "+r"(offset), "+r" (qscale2)
: "r" (block), "r"(quant_matrix)
: "memory"
);
}