Review notes for this patch. Everything ahead of the first "diff --git" header
is descriptive text only and is not part of any hunk.

Purpose: replace the MMXEXT 8-pixel-wide "l2_shift5" routine with an SSE2 one
and adjust the C templates that reference it.

libavcodec/x86/h264_qpel.c
  * In the DEF_QPEL prototype list, the pixels8_l2_shift5 prototype changes
    its suffix from _mmxext to _sse2; the pixels4 (_mmxext) and pixels16
    (_sse2) prototypes are unchanged context.
  * The two #defines that aliased ff_{put,avg}_pixels16_l2_shift5_mmxext to
    the _sse2 symbols are removed. Two new #defines alias
    ff_{put,avg}_pixels4_l2_shift5_sse2 to the existing _mmxext symbols.
    NOTE(review): presumably this keeps SIZE=4 instantiations of the mc12/mc32
    templates linking now that they paste an _sse2 suffix; confirm against
    the H264_MC instantiations, which are outside this patch.
  * The mc12 and mc32 templates now call
    ff_<OPNAME>pixels<SIZE>_l2_shift5_sse2 instead of ..._mmxext; the
    arguments (dst, halfV+2 or halfV+3, halfHV, stride) are unchanged.

libavcodec/x86/h264_qpel_8bit.asm
  * New macro PIXELS8_L2_SHIFT5 <op>, declared with
    "cglobal %1_pixels8_l2_shift5, 5, 5, 3". Per loop iteration it:
      1. loads two vectors with movu from [r1] and [r1+48],
      2. shifts both right arithmetically by 5 (psraw) and packs them into
         one register (packuswb),
      3. averages the packed bytes with [r2] (pavgb),
      4. copies the high half into the low half of m1 (pshufd 0xee), and
      5. emits m0 to [r0] and m1 to [r0+r3] through op_%1h (that macro is
         defined outside this patch).
    It then advances r1 by 48*2, r2 by 8*2 and r0 by 2*r3. The counter r4d
    starts at 8 and is decremented by 2 until it reaches zero.
  * PIXELS_L2_SHIFT5 is renamed PIXELS16_L2_SHIFT5. Its "%if cpuflag(sse2)"
    selection between two cglobal declarations is removed, leaving only the
    declaration that requests 4 vector registers.
  * The "INIT_MMX mmxext" instantiations for size 8 are removed. Under
    "INIT_XMM sse2" the patch instantiates PIXELS8_L2_SHIFT5 put/avg followed
    by PIXELS16_L2_SHIFT5 put/avg with size 16.

diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 48dd9d8766..52a6bfd5bf 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -70,7 +70,7 @@ void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_ void ff_ ## OPNAME ## _h264_qpel8_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\ void ff_ ## OPNAME ## _h264_qpel16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride);\ void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ -void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ +void ff_ ## OPNAME ## _pixels8_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ void ff_ ## OPNAME ## _pixels16_l2_shift5_sse2(uint8_t *dst, const int16_t *src16, const uint8_t *src8, int dstStride);\ DEF_QPEL(avg) @@ -186,8 +186,8 @@ SSSE3_HV2_LOWPASS_WRAPPER(put) #define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext #define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext -#define ff_put_pixels16_l2_shift5_mmxext ff_put_pixels16_l2_shift5_sse2 -#define ff_avg_pixels16_l2_shift5_mmxext ff_avg_pixels16_l2_shift5_sse2 +#define ff_put_pixels4_l2_shift5_sse2 ff_put_pixels4_l2_shift5_mmxext +#define ff_avg_pixels4_l2_shift5_sse2 ff_avg_pixels4_l2_shift5_mmxext #define H264_MC_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \ H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ @@ -309,7 +309,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uin int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ - ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride);\ + ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_sse2(dst, halfV+2, 
halfHV, stride);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\ @@ -319,7 +319,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uin int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ av_assert2(((uintptr_t)temp & 7) == 0);\ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, stride);\ - ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride);\ + ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_sse2(dst, halfV+3, halfHV, stride);\ }\ #define H264_MC(QPEL, SIZE, MMX, ALIGN)\ diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm index 101ab21647..9ca78b0775 100644 --- a/libavcodec/x86/h264_qpel_8bit.asm +++ b/libavcodec/x86/h264_qpel_8bit.asm @@ -781,13 +781,30 @@ INIT_MMX mmxext PIXELS4_L2_SHIFT5 put PIXELS4_L2_SHIFT5 avg +%macro PIXELS8_L2_SHIFT5 1 +cglobal %1_pixels8_l2_shift5, 5, 5, 3 ; dst, src16, src8, dstStride + movsxdifnidn r3, r3d + mov r4d, 8 +.loop: + movu m0, [r1] + movu m1, [r1+48] + psraw m0, 5 + psraw m1, 5 + packuswb m0, m1 + pavgb m0, [r2] + pshufd m1, m0, 0xee ; low half of m1 is high half of m0 + op_%1h m0, [r0], m2 + op_%1h m1, [r0+r3], m2 + add r1, 48*2 + add r2, 8*2 + lea r0, [r0+2*r3] + sub r4d, 2 + jne .loop + RET +%endmacro -%macro PIXELS_L2_SHIFT5 2 -%if cpuflag(sse2) +%macro PIXELS16_L2_SHIFT5 2 cglobal %1_pixels%2_l2_shift5, 5, 5, 4 ; dst, src16, src8, dstStride -%else -cglobal %1_pixels%2_l2_shift5, 5, 5 ; dst, src16, src8, dstStride -%endif movsxdifnidn r3, r3d mov r4d, %2 .loop: @@ -813,13 +830,12 @@ cglobal %1_pixels%2_l2_shift5, 5, 5 ; dst, src16, src8, dstStride RET %endmacro -INIT_MMX mmxext -PIXELS_L2_SHIFT5 put, 8 -PIXELS_L2_SHIFT5 avg, 8 - INIT_XMM sse2 -PIXELS_L2_SHIFT5 put, 16 -PIXELS_L2_SHIFT5 avg, 16 +PIXELS8_L2_SHIFT5 put +PIXELS8_L2_SHIFT5 avg + +PIXELS16_L2_SHIFT5 put, 16 +PIXELS16_L2_SHIFT5 avg, 16 %if ARCH_X86_64 %macro QPEL16_H_LOWPASS_L2_OP 1