diff --git a/veejay-current/utils/Makefile.am b/veejay-current/utils/Makefile.am new file mode 100644 index 00000000..9b604b11 --- /dev/null +++ b/veejay-current/utils/Makefile.am @@ -0,0 +1,41 @@ +# Process this file with Automake to produce Makefile.in + + + +INCLUDES = -I$(top_srcdir) + + +noinst_LIBRARIES = libmotion.a libmjpegutils.a +#lib_LIBRARIES = libmjpegutils.a + +libmotion_MMXSSE = mblock_sumsq_mmx.s mblock_bsumsq_mmx.s mblock_bsad_mmx.s \ + mblock_sad_mmxe.s mblock_sad_mmx.s \ + mblock_sub44_sads_x86.c attributes.h + +libmjpegutils_a_SOURCES = mjpeg_logging.c cpu_accel.c \ + mjpeg_logging.h mjpeg_types.h cpu_accel.h \ + yuv4mpeg.c yuv4mpeg_ratio.c \ + yuv4mpeg.h yuv4mpeg_intern.h \ + mpegconsts.c mpegconsts.h \ + mpegtimecode.c mpegtimecode.h + +#include_HEADERS = yuv4mpeg.h mpegconsts.h mjpeg_logging.h mjpeg_types.h mpegtimecode.h + +if HAVE_ASM_NASM +if HAVE_ASM_MMX +libmotion_SIMD = $(libmotion_MMXSSE) +else +mpeg2enc_SIMD = +endif +else +mpeg2enc_SIMD = +endif + +libmotion_a_SOURCES = motionsearch.c $(libmotion_SIMD) mmx.h motionsearch.h fastintfns.h + +EXTRA_DIST = videodev_mjpeg.h format_codes.h mblock_sub44_sads_x86_h.c mblock_sub44_sads_x86.h + +MAINTAINERCLEANFILES = Makefile.in + +%.o: %.s ; $(AS) $(ASFLAGS) -o $@ $< + diff --git a/veejay-current/utils/attributes.h b/veejay-current/utils/attributes.h new file mode 100644 index 00000000..8ed7aef9 --- /dev/null +++ b/veejay-current/utils/attributes.h @@ -0,0 +1,27 @@ +/* + * attributes.h + * Copyright (C) 1999-2000 Aaron Holtzman + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +//use gcc attribs to align critical data structures +#ifdef ATTRIBUTE_ALIGNED_MAX +#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < align) ? ATTRIBUTE_ALIGNED_MAX : align))) +#else +#define ATTR_ALIGN(align) +#endif diff --git a/veejay-current/utils/cpu_accel.c b/veejay-current/utils/cpu_accel.c new file mode 100644 index 00000000..9528a674 --- /dev/null +++ b/veejay-current/utils/cpu_accel.c @@ -0,0 +1,161 @@ +/* + * cpu_accel.c + * Copyright (C) 1999-2000 Aaron Holtzman + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include "cpu_accel.h" + +#ifdef HAVE_X86CPU + +#include "mjpeg_types.h" + +/* Some miscelaneous stuff to allow checking whether SSE instructions cause + illegal instruction errors. +*/ + +static sigjmp_buf sigill_recover; + +static RETSIGTYPE sigillhandler(int sig ) +{ + siglongjmp( sigill_recover, 1 ); +} + +typedef RETSIGTYPE (*__sig_t)(int); + +static int testsseill() +{ + int illegal; +#if defined(__CYGWIN__) + /* SSE causes a crash on CYGWIN, apparently. + Perhaps the wrong signal is being caught or something along + those line ;-) or maybe SSE itself won't work... + */ + illegal = 1; +#else + __sig_t old_handler = signal( SIGILL, sigillhandler); + if( sigsetjmp( sigill_recover, 1 ) == 0 ) + { + asm ( "movups %xmm0, %xmm0" ); + illegal = 0; + } + else + illegal = 1; + signal( SIGILL, old_handler ); +#endif + return illegal; +} + +static int x86_accel (void) +{ + int32_t eax, ebx, ecx, edx; + int32_t AMD; + int32_t caps; + +#define cpuid(op,eax,ebx,ecx,edx) \ + asm ("cpuid" \ + : "=a" (eax), \ + "=b" (ebx), \ + "=c" (ecx), \ + "=d" (edx) \ + : "a" (op) \ + : "cc") + + asm ("pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl $0x200000,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0" + : "=a" (eax), + "=b" (ebx) + : + : "cc"); + + + if (eax == ebx) // no cpuid + return 0; + + cpuid (0x00000000, eax, ebx, ecx, edx); + if (!eax) // vendor string only + return 0; + + AMD = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65); + + cpuid (0x00000001, eax, ebx, ecx, edx); + if (! (edx & 0x00800000)) // no MMX + return 0; + + caps = ACCEL_X86_MMX; + /* If SSE capable CPU has same MMX extensions as AMD + and then some. However, to use SSE O.S. must have signalled + it use of FXSAVE/FXRSTOR through CR4.OSFXSR and hence FXSR (bit 24) + here + */ + if ((edx & 0x02000000)) + caps = ACCEL_X86_MMX | ACCEL_X86_MMXEXT; + if( (edx & 0x03000000) == 0x03000000 ) + { + /* Check whether O.S. has SSE support... has to be done with + exception 'cos those Intel morons put the relevant bit + in a reg that is only accesible in ring 0... doh! + */ + if( !testsseill() ) + caps |= ACCEL_X86_SSE; + } + + cpuid (0x80000000, eax, ebx, ecx, edx); + if (eax < 0x80000001) // no extended capabilities + return caps; + + cpuid (0x80000001, eax, ebx, ecx, edx); + + if (edx & 0x80000000) + caps |= ACCEL_X86_3DNOW; + + if (AMD && (edx & 0x00400000)) // AMD MMX extensions + { + caps |= ACCEL_X86_MMXEXT; + } + + return caps; +} +#endif + +int cpu_accel (void) +{ +#ifdef HAVE_X86CPU + static int got_accel = 0; + static int accel; + + if (!got_accel) { + got_accel = 1; + accel = x86_accel (); + } + + return accel; +#else + return 0; +#endif +} diff --git a/veejay-current/utils/cpu_accel.h b/veejay-current/utils/cpu_accel.h new file mode 100644 index 00000000..4cc5c23b --- /dev/null +++ b/veejay-current/utils/cpu_accel.h @@ -0,0 +1,30 @@ +/* + * cpu_accel.h + * Copyright (C) 1999-2000 Aaron Holtzman + * + * This file was part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + + +// x86 accelerations +#define ACCEL_X86_MMX 0x80000000 +#define ACCEL_X86_3DNOW 0x40000000 +#define ACCEL_X86_MMXEXT 0x20000000 +#define ACCEL_X86_SSE 0x10000000 + +int cpu_accel (void); diff --git a/veejay-current/utils/fastintfns.h b/veejay-current/utils/fastintfns.h new file mode 100644 index 00000000..db78af1e --- /dev/null +++ b/veejay-current/utils/fastintfns.h @@ -0,0 +1,32 @@ +/* fast int primitives. min,max,abs,samesign + * + * WARNING: Assumes 2's complement arithmetic. + * + */ + + +static __inline__ int intmax( register int x, register int y ) +{ + return x < y ? y : x; +} + +static __inline__ int intmin( register int x, register int y ) +{ + return x < y ? x : y; +} + +static __inline__ int intabs( register int x ) +{ + return x < 0 ? -x : x; +} + +#define fabsshift ((8*sizeof(unsigned int))-1) + +#define signmask(x) (((int)x)>>fabsshift) +static __inline__ int intsamesign(int x, int y) +{ + return (y+(signmask(x) & -(y<<1))); +} +#undef signmask +#undef fabsshift + diff --git a/veejay-current/utils/format_codes.h b/veejay-current/utils/format_codes.h new file mode 100644 index 00000000..73d7afc7 --- /dev/null +++ b/veejay-current/utils/format_codes.h @@ -0,0 +1,38 @@ +/* + $Id: format_codes.h,v 1.1.1.1 2004/10/27 23:48:57 niels Exp $ + + Copyright (C) 2001 Andrew Stevens + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __FORMAT_CODES_H__ +#define __FORMAT_CODES_H__ + +#define MPEG_FORMAT_MPEG1 0 +#define MPEG_FORMAT_VCD 1 +#define MPEG_FORMAT_VCD_NSR 2 +#define MPEG_FORMAT_MPEG2 3 +#define MPEG_FORMAT_SVCD 4 +#define MPEG_FORMAT_SVCD_NSR 5 +#define MPEG_FORMAT_VCD_STILL 6 +#define MPEG_FORMAT_SVCD_STILL 7 +#define MPEG_FORMAT_DVD 8 + +#define MPEG_FORMAT_FIRST 0 +#define MPEG_FORMAT_LAST 8 + +#define MPEG_STILLS_FORMAT(x) (x==MPEG_FORMAT_VCD_STILL||x==MPEG_FORMAT_SVCD_STILL) +#endif /* __FORMAT_CODES_H__ */ diff --git a/veejay-current/utils/mblock_bsad_mmx.s b/veejay-current/utils/mblock_bsad_mmx.s new file mode 100644 index 00000000..a5ffdcff --- /dev/null +++ b/veejay-current/utils/mblock_bsad_mmx.s @@ -0,0 +1,329 @@ +; +; bdist1_mmx.s: mmX optimized bidirectional absolute distance sum +; +; Original believed to be Copyright (C) 2000 Brent Byeler +; +; This program is free software; you can reaxstribute it and/or +; modify it under the terms of the GNU General Public License +; as published by the Free Software Foundation; either version 2 +; of the License, or (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +; + +;/* +; * absolute difference error between a (16*h) block and a bidirectional +; * prediction +; * +; * p2: address of top left pel of block +; * pf,hxf,hyf: address and half pel flags of forward ref. block +; * pb,hxb,hyb: address and half pel flags of backward ref. block +; * h: height of block +; * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb +; * mmX version +; */ + +;int bsad_mmx( +;unsigned char *pf, unsigned char *pb, unsigned char *p2, +;int lx, int hxf, int hyf, int hxb, int hyb, int h) +;{ +; unsigned char *pfa,*pfb,*pfc,*pba,*pbb,*pbc; + +; Handy macros for readbility + +%define pf [ebp+8] +%define pb [ebp+12] +%define p2 [ebp+16] +%define lx [ebp+20] +%define hxf [ebp+24] +%define hyf [ebp+28] +%define hxb [ebp+32] +%define hyb [ebp+36] +%define h [ebp+40] + + +%define pfa [esp+4] +%define pfb [esp+8] +%define pfc [esp+12] +%define pba [esp+16] +%define pbb [esp+20] +%define pbc [esp+24] + +SECTION .text +global bsad_mmx + +align 32 +bsad_mmx: + push ebp ; save frame pointer + mov ebp, esp ; link + push ebx + push ecx + push edx + push esi + push edi + + ;; + ;; Make space for local variables on stack + sub esp, 32 + + + mov edx, hxb + mov eax, hxf + mov esi, lx + + mov ecx, pf + add ecx, eax + mov pfa, ecx + mov ecx, esi + imul ecx, hyf + mov ebx, pf + add ecx, ebx + mov pfb, ecx + add eax, ecx + mov pfc, eax + mov eax, pb + add eax, edx + mov pba, eax + mov eax, esi + imul eax, hyb + mov ecx, pb + add eax, ecx + mov pbb, eax + add edx, eax + mov pbc, edx + xor esi, esi ; esi is "s" the accumulator + mov eax, esi + + mov edi, h + test edi, edi ; h = 0? + jle near bsadexit + + pxor mm7, mm7 + pxor mm6, mm6 + pcmpeqw mm5, mm5 + psubw mm6, mm5 + psllw mm6, 1 + +bsadtop: + mov eax, pf + mov ebx, pfa + mov ecx, pfb + mov edx, pfc + movq mm0, [eax] + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + movq mm2, [ebx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + movq mm2, [ecx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + movq mm2, [edx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + paddw mm0, mm6 + paddw mm1, mm6 + psrlw mm0, 2 + psrlw mm1, 2 + mov eax, pb + mov ebx, pba + mov ecx, pbb + mov edx, pbc + movq mm2, [eax] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + movq mm4, [ebx] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + movq mm4, [ecx] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + movq mm4, [edx] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + paddw mm2, mm6 + paddw mm3, mm6 + psrlw mm2, 2 + psrlw mm3, 2 + paddw mm0, mm2 + paddw mm1, mm3 + psrlw mm6, 1 + paddw mm0, mm6 + paddw mm1, mm6 + psllw mm6, 1 + psrlw mm0, 1 + psrlw mm1, 1 + packuswb mm0, mm1 + + mov eax, p2 + movq mm1, [eax] + movq mm2, mm0 + psubusb mm0, mm1 + psubusb mm1, mm2 + por mm0, mm1 + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + paddw mm0, mm1 + movq mm1, mm0 + punpcklwd mm0, mm7 + punpckhwd mm1, mm7 + + paddd mm0, mm1 + movd eax, mm0 + psrlq mm0, 32 + movd ebx, mm0 + add esi, eax + add esi, ebx + mov eax, pf + mov ebx, pfa + mov ecx, pfb + mov edx, pfc + movq mm0, [eax+8] + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + movq mm2, [ebx+8] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + movq mm2, [ecx+8] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + movq mm2, [edx+8] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + paddw mm0, mm6 + paddw mm1, mm6 + psrlw mm0, 2 + psrlw mm1, 2 + mov eax, pb + mov ebx, pba + mov ecx, pbb + mov edx, pbc + movq mm2, [eax+8] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + movq mm4, [ebx+8] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + movq mm4, [ecx+8] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + movq mm4, [edx+8] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + paddw mm2, mm6 + paddw mm3, mm6 + psrlw mm2, 2 + psrlw mm3, 2 + paddw mm0, mm2 + paddw mm1, mm3 + psrlw mm6, 1 + paddW mm0, mm6 + paddw mm1, mm6 + psllw mm6, 1 + psrlw mm0, 1 + psrlw mm1, 1 + packuswb mm0, mm1 + mov eax, p2 + movq mm1, [eax+8] + movq mm2, mm0 + psubusb mm0, mm1 + psubusb mm1, mm2 + por mm0, mm1 + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + paddw mm0, mm1 + movq mm1, mm0 + punpcklwd mm0, mm7 + punpckhwd mm1, mm7 + paddd mm0, mm1 + movd eax, mm0 + psrlq mm0, 32 + movd ebx, mm0 + add esi, eax + add esi, ebx + + mov eax, lx + add p2, eax + add pf, eax + add pfa, eax + add pfb, eax + add pfc, eax + add pb, eax + add pba, eax + add pbb, eax + add pbc, eax + + dec edi + jg near bsadtop + mov eax, esi + +bsadexit: + + + ;; + ;; Get rid of local variables + add esp, 32 + + ;; Retore (callee saves convention...) + ;; + pop edi + pop esi + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret + + diff --git a/veejay-current/utils/mblock_bsumsq_mmx.s b/veejay-current/utils/mblock_bsumsq_mmx.s new file mode 100644 index 00000000..7cec9a4b --- /dev/null +++ b/veejay-current/utils/mblock_bsumsq_mmx.s @@ -0,0 +1,327 @@ +; +; bdist2_mmx.s: MMX optimized bidirectional squared distance sum +; +; Original believed to be Copyright (C) 2000 Brent Byeler +; +; This program is free software; you can reaxstribute it and/or +; modify it under the terms of the GNU General Public License +; as published by the Free Software Foundation; either version 2 +; of the License, or (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +; + +;/* +; * squared error between a (16*h) block and a bidirectional +; * prediction +; * +; * p2: address of top left pel of block +; * pf,hxf,hyf: address and half pel flags of forward ref. block +; * pb,hxb,hyb: address and half pel flags of backward ref. block +; * h: height of block +; * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb +; * mmX version +; */ + +;int bsumsq_mmx( +;unsigned char *pf, unsigned char *pb, unsigned char *p2, +;int lx, int hxf, int hyf, int hxb, int hyb, int h) +;{ +; unsigned char *pfa,*pfb,*pfc,*pba,*pbb,*pbc; +; int s; + +; Handy macros for readbility + +%define pf [ebp+8] +%define pb [ebp+12] +%define p2 [ebp+16] +%define lx [ebp+20] +%define hxf [ebp+24] +%define hyf [ebp+28] +%define hxb [ebp+32] +%define hyb [ebp+36] +%define h [ebp+40] + + +%define pfa [esp+4] +%define pfb [esp+8] +%define pfc [esp+12] +%define pba [esp+16] +%define pbb [esp+20] +%define pbc [esp+24] + +SECTION .text +global bsumsq_mmx + +align 32 +bsumsq_mmx: + push ebp ; save frame pointer + mov ebp, esp ; link + push ebx + push ecx + push edx + push esi + push edi + + ;; + ;; Make space for local variables on stack + sub esp, 32 + + mov edx, hxb + mov eax, hxf + mov esi, lx + + mov ecx, pf + add ecx, eax + mov pfa, ecx + mov ecx, esi + imul ecx, hyf + mov ebx, pf + add ecx, ebx + mov pfb, ecx + add eax, ecx + mov pfc, eax + mov eax, pb + add eax, edx + mov pba, eax + mov eax, esi + imul eax, hyb + mov ecx, pb + add eax, ecx + mov pbb, eax + add edx, eax + mov pbc, edx + xor esi, esi ; esi = s (accumulated sym) + mov eax, esi + + mov edi, h + test edi, edi ; h = 0? + jle near bsumsqexit + + pxor mm7, mm7 + pxor mm6, mm6 + pcmpeqw mm5, mm5 + psubw mm6, mm5 + psllw mm6, 1 + +bsumsqtop: + mov eax, pf + mov ebx, pfa + mov ecx, pfb + mov edx, pfc + movq mm0, [eax] + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + movq mm2, [ebx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + movq mm2, [ecx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + movq mm2, [edx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + paddw mm0, mm6 + paddw mm1, mm6 + psrlw mm0, 2 + psrlw mm1, 2 + + mov eax, pb + mov ebx, pba + mov ecx, pbb + mov edx, pbc + movq mm2, [eax] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + movq mm4, [ebx] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + movq mm4, [ecx] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + movq mm4, [edx] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + + paddw mm2, mm6 + paddw mm3, mm6 + psrlw mm2, 2 + psrlw mm3, 2 + + paddw mm0, mm2 + paddw mm1, mm3 + psrlw mm6, 1 + paddw mm0, mm6 + paddw mm1, mm6 + psllw mm6, 1 + psrlw mm0, 1 + psrlw mm1, 1 + + mov eax, p2 + movq mm2, [eax] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + + psubw mm0, mm2 + psubw mm1, mm3 + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + paddd mm0, mm1 + + movd eax, mm0 + psrlq mm0, 32 + movd ebx, mm0 + add esi, eax + add esi, ebx + + mov eax, pf + mov ebx, pfa + mov ecx, pfb + mov edx, pfc + movq mm0, [eax+8] + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + movq mm2, [ebx+8] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + movq mm2, [ecx+8] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + movq mm2, [edx+8] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + paddw mm0, mm6 + paddw mm1, mm6 + psrlw mm0, 2 + psrlw mm1, 2 + + mov eax, pb + mov ebx, pba + mov ecx, pbb + mov edx, pbc + movq mm2, [eax+8] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + movq mm4, [ebx+8] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + movq mm4, [ecx+8] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + movq mm4, [edx+8] + movq mm5, mm4 + punpcklbw mm4, mm7 + punpckhbw mm5, mm7 + paddw mm2, mm4 + paddw mm3, mm5 + paddw mm2, mm6 + paddw mm3, mm6 + psrlw mm2, 2 + psrlw mm3, 2 + + paddw mm0, mm2 + paddw mm1, mm3 + psrlw mm6, 1 + paddW mm0, mm6 + paddw mm1, mm6 + psllw mm6, 1 + psrlw mm0, 1 + psrlw mm1, 1 + + mov eax, p2 + movq mm2, [eax+8] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + + psubw mm0, mm2 + psubw mm1, mm3 + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + paddd mm0, mm1 + + movd eax, mm0 + psrlq mm0, 32 + movd ebx, mm0 + add esi, eax + add esi, ebx + + mov eax, lx + add p2, eax + add pf, eax + add pfa, eax + add pfb, eax + add pfc, eax + add pb, eax + add pba, eax + add pbb, eax + add pbc, eax + + dec edi + jg near bsumsqtop + mov eax, esi + +bsumsqexit: + + ;; + ;; Get rid of local variables + add esp, 32 + + ;; Retore (callee saves convention...) + ;; + pop edi + pop esi + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret + diff --git a/veejay-current/utils/mblock_sad_mmx.s b/veejay-current/utils/mblock_sad_mmx.s new file mode 100644 index 00000000..c89c636b --- /dev/null +++ b/veejay-current/utils/mblock_sad_mmx.s @@ -0,0 +1,1094 @@ +;;; +;;; mblock_sad_mmxe.s: +;;; +;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks +;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel) +; +; sad_* Original Copyright (C) 2000 Chris Atenasio +; Enhancements and rest Copyright (C) 2000 Andrew Stevens + +; +; This program is free software; you can redistribute it and/or +; modify it under the terms of the GNU General Public License +; as published by the Free Software Foundation; either version 2 +; of the License, or (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +; +; +; + +SECTION .text + +global sad_00_mmx + +; int sad_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h, int distlim); +; N.b. distlim is *ignored* as testing for it is more expensive than the +; occasional saving by aborting the computionation early... +; esi = p1 (init: blk1) +; edi = p2 (init: blk2) +; ebx = distlim +; ecx = rowsleft (init: h) +; edx = lx; + +; mm0 = distance accumulators (4 words) +; mm1 = temp +; mm2 = temp +; mm3 = temp +; mm4 = temp +; mm5 = temp +; mm6 = 0 +; mm7 = temp + + +align 32 +sad_00_mmx: + push ebp ; save frame pointer + mov ebp, esp + + push ebx ; Saves registers (called saves convention in + push ecx ; x86 GCC it seems) + push edx ; + push esi + push edi + + pxor mm0, mm0 ; zero acculumators + pxor mm6, mm6 + mov esi, [ebp+8] ; get p1 + mov edi, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + mov ecx, [ebp+20] ; get rowsleft + ;mov ebx, [ebp+24] ; distlim + jmp nextrowmm00 +align 32 +nextrowmm00: + movq mm4, [esi] ; load first 8 bytes of p1 row + movq mm5, [edi] ; load first 8 bytes of p2 row + + movq mm7, mm4 ; mm5 = abs(mm4-mm5) + psubusb mm7, mm5 + psubusb mm5, mm4 + paddb mm5, mm7 + + ;; Add the abs(mm4-mm5) bytes to the accumulators + movq mm2, [esi+8] ; load second 8 bytes of p1 row (interleaved) + movq mm7,mm5 ; mm7 := [i : B0..3, mm1]W + punpcklbw mm7,mm6 + movq mm3, [edi+8] ; load second 8 bytes of p2 row (interleaved) + paddw mm0, mm7 + punpckhbw mm5,mm6 + paddw mm0, mm5 + + ;; This is logically where the mm2, mm3 loads would go... + + movq mm7, mm2 ; mm3 = abs(mm2-mm3) + psubusb mm7, mm3 + psubusb mm3, mm2 + paddb mm3, mm7 + + ;; Add the abs(mm4-mm5) bytes to the accumulators + movq mm7,mm3 ; mm7 := [i : B0..3, mm1]W + punpcklbw mm7,mm6 + punpckhbw mm3,mm6 + paddw mm0, mm7 + + add esi, edx ; update pointer to next row + add edi, edx ; ditto + + paddw mm0, mm3 + + + + sub ecx,1 + jnz near nextrowmm00 + +returnmm00: + + ;; Sum the Accumulators + movq mm5, mm0 ; mm5 := [W0+W2,W1+W3, mm0 + psrlq mm5, 32 + movq mm4, mm0 + paddw mm4, mm5 + + movq mm7, mm4 ; mm6 := [W0+W2+W1+W3, mm0] + psrlq mm7, 16 + paddw mm4, mm7 + movd eax, mm4 ; store return value + and eax, 0xffff + + pop edi + pop esi + pop edx + pop ecx + pop ebx + + pop ebp + + emms ; clear mmx registers + ret +; +;;; sad_01_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum +;;; We're reduce to seven bits as otherwise we also have to mess +;;; horribly with carries and signed only comparisons make the code +;;; simply enormous (and probably barely faster than a simple loop). +;;; Since signals with a bona-fide 8bit res will be rare we simply +;;; take the precision hit... +;;; Actually we don't worry about carries from the low-order bits +;;; either so 1/4 of the time we'll be 1 too low... +;;; +; Copyright (C) 2000 Andrew Stevens + +; +; This program is free software; you can redistribute it and/or +; modify it under the terms of the GNU General Public License +; as published by the Free Software Foundation; either version 2 +; of the License, or (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +; +; +; + + +global sad_01_mmx + +; int sad_01_mmx(unsigned char *p1,unsigned char *p2,int lx,int h); + +; esi = p1 (init: blk1) +; edi = p2 (init: blk2) +; ecx = rowsleft (init: h) +; edx = lx; + +; mm0 = distance accumulators (4 words) +; mm1 = bytes p2 +; mm2 = bytes p1 +; mm3 = bytes p1+1 +; mm4 = temp 4 bytes in words interpolating p1, p1+1 +; mm5 = temp 4 bytes in words from p2 +; mm6 = temp comparison bit mask p1,p2 +; mm7 = temp comparison bit mask p2,p1 + + +align 32 +sad_01_mmx: + push ebp ; save stack pointer + mov ebp, esp ; so that we can do this + + push ebx ; Saves registers (called saves convention in + push ecx ; x86 GCC it seems) + push edx ; + push esi + push edi + + pxor mm0, mm0 ; zero acculumators + + mov esi, [ebp+8] ; get p1 + mov edi, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + mov ecx, [ebp+20] ; rowsleft := h + jmp nextrowmm01 ; snap to it +align 32 +nextrowmm01: + + ;; + ;; First 8 bytes of row + ;; + + ;; First 4 bytes of 8 + + movq mm4, [esi] ; mm4 := first 4 bytes p1 + pxor mm7, mm7 + movq mm2, mm4 ; mm2 records all 8 bytes + punpcklbw mm4, mm7 ; First 4 bytes p1 in Words... + + movq mm6, [esi+1] ; mm6 := first 4 bytes p1+1 + movq mm3, mm6 ; mm3 records all 8 bytes + punpcklbw mm6, mm7 + paddw mm4, mm6 ; mm4 := First 4 bytes interpolated in words + psrlw mm4, 1 + + movq mm5, [edi] ; mm5:=first 4 bytes of p2 in words + movq mm1, mm5 + punpcklbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + ;; Second 4 bytes of 8 + + movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words + pxor mm7, mm7 + punpckhbw mm4, mm7 + movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words + punpckhbw mm6, mm7 + + paddw mm4, mm6 ; mm4 := First 4 Interpolated bytes in words + psrlw mm4, 1 + + movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words + punpckhbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + + ;; + ;; Second 8 bytes of row + ;; + ;; First 4 bytes of 8 + + movq mm4, [esi+8] ; mm4 := first 4 bytes p1+8 + pxor mm7, mm7 + movq mm2, mm4 ; mm2 records all 8 bytes + punpcklbw mm4, mm7 ; First 4 bytes p1 in Words... + + movq mm6, [esi+9] ; mm6 := first 4 bytes p1+9 + movq mm3, mm6 ; mm3 records all 8 bytes + punpcklbw mm6, mm7 + paddw mm4, mm6 ; mm4 := First 4 bytes interpolated in words + psrlw mm4, 1 + + movq mm5, [edi+8] ; mm5:=first 4 bytes of p2+8 in words + movq mm1, mm5 + punpcklbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + ;; Second 4 bytes of 8 + + movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words + pxor mm7, mm7 + punpckhbw mm4, mm7 + movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words + punpckhbw mm6, mm7 + + paddw mm4, mm6 ; mm4 := First 4 Interpolated bytes in words + psrlw mm4, 1 + + movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words + punpckhbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + + ;; + ;; Loop termination condition... and stepping + ;; + + add esi, edx ; update pointer to next row + add edi, edx ; ditto + + sub ecx,1 + test ecx, ecx ; check rowsleft + jnz near nextrowmm01 + + + ;; Sum the Accumulators + movq mm4, mm0 + psrlq mm4, 32 + paddw mm0, mm4 + movq mm6, mm0 + psrlq mm6, 16 + paddw mm0, mm6 + movd eax, mm0 ; store return value + and eax, 0xffff + + pop edi + pop esi + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret ; we now return you to your regular programming +; +;;; sad_01_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum +;;; We're reduce to seven bits as otherwise we also have to mess +;;; horribly with carries and signed only comparisons make the code +;;; simply enormous (and probably barely faster than a simple loop). +;;; Since signals with a bona-fide 8bit res will be rare we simply +;;; take the precision hit... +;;; Actually we don't worry about carries from the low-order bits +;;; either so 1/4 of the time we'll be 1 too low... +;;; +; Copyright (C) 2000 Andrew Stevens + +; +; This program is free software; you can redistribute it and/or +; modify it under the terms of the GNU General Public License +; as published by the Free Software Foundation; either version 2 +; of the License, or (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +; +; +; + + +global sad_10_mmx + +; int sad_10_mmx(unsigned char *p1,unsigned char *p2,int lx,int h); + +; esi = p1 (init: blk1) +; edi = p2 (init: blk2) +; ebx = p1+lx +; ecx = rowsleft (init: h) +; edx = lx; + +; mm0 = distance accumulators (4 words) +; mm1 = bytes p2 +; mm2 = bytes p1 +; mm3 = bytes p1+1 +; mm4 = temp 4 bytes in words interpolating p1, p1+1 +; mm5 = temp 4 bytes in words from p2 +; mm6 = temp comparison bit mask p1,p2 +; mm7 = temp comparison bit mask p2,p1 + + +align 32 +sad_10_mmx: + push ebp ; save stack pointer + mov ebp, esp ; so that we can do this + + push ebx ; Saves registers (called saves convention in + push ecx ; x86 GCC it seems) + push edx ; + push esi + push edi + + pxor mm0, mm0 ; zero acculumators + + mov esi, [ebp+8] ; get p1 + mov edi, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + mov ecx, [ebp+20] ; rowsleft := h + mov ebx, esi + add ebx, edx + jmp nextrowmm10 ; snap to it +align 32 +nextrowmm10: + + ;; + ;; First 8 bytes of row + ;; + + ;; First 4 bytes of 8 + + movq mm4, [esi] ; mm4 := first 4 bytes p1 + pxor mm7, mm7 + movq mm2, mm4 ; mm2 records all 8 bytes + punpcklbw mm4, mm7 ; First 4 bytes p1 in Words... + + movq mm6, [ebx] ; mm6 := first 4 bytes p1+lx + movq mm3, mm6 ; mm3 records all 8 bytes + punpcklbw mm6, mm7 + paddw mm4, mm6 ; mm4 := First 4 bytes interpolated in words + psrlw mm4, 1 + + movq mm5, [edi] ; mm5:=first 4 bytes of p2 in words + movq mm1, mm5 + punpcklbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + ;; Second 4 bytes of 8 + + movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words + pxor mm7, mm7 + punpckhbw mm4, mm7 + movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words + punpckhbw mm6, mm7 + + paddw mm4, mm6 ; mm4 := First 4 Interpolated bytes in words + psrlw mm4, 1 + + movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words + punpckhbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + + ;; + ;; Second 8 bytes of row + ;; + ;; First 4 bytes of 8 + + movq mm4, [esi+8] ; mm4 := first 4 bytes p1+8 + pxor mm7, mm7 + movq mm2, mm4 ; mm2 records all 8 bytes + punpcklbw mm4, mm7 ; First 4 bytes p1 in Words... + + movq mm6, [ebx+8] ; mm6 := first 4 bytes p1+lx+8 + movq mm3, mm6 ; mm3 records all 8 bytes + punpcklbw mm6, mm7 + paddw mm4, mm6 ; mm4 := First 4 bytes interpolated in words + psrlw mm4, 1 + + movq mm5, [edi+8] ; mm5:=first 4 bytes of p2+8 in words + movq mm1, mm5 + punpcklbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + ;; Second 4 bytes of 8 + + movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words + pxor mm7, mm7 + punpckhbw mm4, mm7 + movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words + punpckhbw mm6, mm7 + + paddw mm4, mm6 ; mm4 := First 4 Interpolated bytes in words + psrlw mm4, 1 + + movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words + punpckhbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + + ;; + ;; Loop termination condition... and stepping + ;; + + add esi, edx ; update pointer to next row + add edi, edx ; ditto + add ebx, edx + + sub ecx,1 + test ecx, ecx ; check rowsleft + jnz near nextrowmm10 + + ;; Sum the Accumulators + movq mm4, mm0 + psrlq mm4, 32 + paddw mm0, mm4 + movq mm6, mm0 + psrlq mm6, 16 + paddw mm0, mm6 + movd eax, mm0 ; store return value + and eax, 0xffff + + + pop edi + pop esi + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret ; we now return you to your regular programming +; +;;; sad_01_mmx.s: +;;; +; Copyright (C) 2000 Andrew Stevens + +; +; This program is free software; you can redistribute it and/or +; modify it under the terms of the GNU General Public License +; as published by the Free Software Foundation; either version 2 +; of the License, or (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +; +; +; + + +global sad_11_mmx + +; int sad_11_mmx(unsigned char *p1,unsigned char *p2,int lx,int h); + +; esi = p1 (init: blk1) +; edi = p2 (init: blk2) +; ebx = p1+lx +; ecx = rowsleft (init: h) +; edx = lx; + +; mm0 = distance accumulators (4 words) +; mm1 = bytes p2 +; mm2 = bytes p1 +; mm3 = bytes p1+lx +; I'd love to find someplace to stash p1+1 and p1+lx+1's bytes +; but I don't think thats going to happen in iA32-land... +; mm4 = temp 4 bytes in words interpolating p1, p1+1 +; mm5 = temp 4 bytes in words from p2 +; mm6 = temp comparison bit mask p1,p2 +; mm7 = temp comparison bit mask p2,p1 + + +align 32 +sad_11_mmx: + push ebp ; save stack pointer + mov ebp, esp ; so that we can do this + + push ebx ; Saves registers (called saves convention in + push ecx ; x86 GCC it seems) + push edx ; + push esi + push edi + + pxor mm0, mm0 ; zero acculumators + + mov esi, [ebp+8] ; get p1 + mov edi, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + mov ecx, [ebp+20] ; rowsleft := h + mov ebx, esi + add ebx, edx + jmp nextrowmm11 ; snap to it +align 32 +nextrowmm11: + + ;; + ;; First 8 bytes of row + ;; + + ;; First 4 bytes of 8 + + movq mm4, [esi] ; mm4 := first 4 bytes p1 + pxor mm7, mm7 + movq mm2, mm4 ; mm2 records all 8 bytes + punpcklbw mm4, mm7 ; First 4 bytes p1 in Words... + + movq mm6, [ebx] ; mm6 := first 4 bytes p1+lx + movq mm3, mm6 ; mm3 records all 8 bytes + punpcklbw mm6, mm7 + paddw mm4, mm6 + + + movq mm5, [esi+1] ; mm5 := first 4 bytes p1+1 + punpcklbw mm5, mm7 ; First 4 bytes p1 in Words... + paddw mm4, mm5 + movq mm6, [ebx+1] ; mm6 := first 4 bytes p1+lx+1 + punpcklbw mm6, mm7 + paddw mm4, mm6 + + psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words + + movq mm5, [edi] ; mm5:=first 4 bytes of p2 in words + movq mm1, mm5 + punpcklbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + ;; Second 4 bytes of 8 + + movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words + pxor mm7, mm7 + punpckhbw mm4, mm7 + movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words + punpckhbw mm6, mm7 + paddw mm4, mm6 + + movq mm5, [esi+1] ; mm5 := first 4 bytes p1+1 + punpckhbw mm5, mm7 ; First 4 bytes p1 in Words... + paddw mm4, mm5 + movq mm6, [ebx+1] ; mm6 := first 4 bytes p1+lx+1 + punpckhbw mm6, mm7 + paddw mm4, mm6 + + psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words + + movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words + punpckhbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + + ;; + ;; Second 8 bytes of row + ;; + ;; First 4 bytes of 8 + + movq mm4, [esi+8] ; mm4 := first 4 bytes p1+8 + pxor mm7, mm7 + movq mm2, mm4 ; mm2 records all 8 bytes + punpcklbw mm4, mm7 ; First 4 bytes p1 in Words... + + movq mm6, [ebx+8] ; mm6 := first 4 bytes p1+lx+8 + movq mm3, mm6 ; mm3 records all 8 bytes + punpcklbw mm6, mm7 + paddw mm4, mm6 + + + movq mm5, [esi+9] ; mm5 := first 4 bytes p1+9 + punpcklbw mm5, mm7 ; First 4 bytes p1 in Words... + paddw mm4, mm5 + movq mm6, [ebx+9] ; mm6 := first 4 bytes p1+lx+9 + punpcklbw mm6, mm7 + paddw mm4, mm6 + + psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words + + movq mm5, [edi+8] ; mm5:=first 4 bytes of p2+8 in words + movq mm1, mm5 + punpcklbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + ;; Second 4 bytes of 8 + + movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words + pxor mm7, mm7 + punpckhbw mm4, mm7 + movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words + punpckhbw mm6, mm7 + paddw mm4, mm6 + + movq mm5, [esi+9] ; mm5 := first 4 bytes p1+1 + punpckhbw mm5, mm7 ; First 4 bytes p1 in Words... + paddw mm4, mm5 + movq mm6, [ebx+9] ; mm6 := first 4 bytes p1+lx+1 + punpckhbw mm6, mm7 + paddw mm4, mm6 + + psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words + + movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words + punpckhbw mm5, mm7 + + movq mm7,mm4 + pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5] + + movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)] + psubw mm6,mm5 + pand mm6, mm7 + + paddw mm0, mm6 ; Add to accumulator + + movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4] + pcmpgtw mm6,mm4 + psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)] + pand mm5, mm6 + + paddw mm0, mm5 ; Add to accumulator + + + ;; + ;; Loop termination condition... and stepping + ;; + + add esi, edx ; update pointer to next row + add edi, edx ; ditto + add ebx, edx + + sub ecx,1 + test ecx, ecx ; check rowsleft + jnz near nextrowmm11 + + ;; Sum the Accumulators + movq mm4, mm0 + psrlq mm4, 32 + paddw mm0, mm4 + movq mm6, mm0 + psrlq mm6, 16 + paddw mm0, mm6 + movd eax, mm0 ; store return value + and eax, 0xffff + + pop edi + pop esi + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret ; we now return you to your regular programming + + +global sad_sub22_mmx + +; int sad_sub22_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h); + +; eax = p1 (init: blk1) +; ebx = p2 (init: blk2) +; ecx = rowsleft (init: h) +; edx = lx; + +; mm0 = distance accumulators (4 words) +; mm1 = temp +; mm2 = temp +; mm3 = temp +; mm4 = temp +; mm5 = temp +; mm6 = 0 +; mm7 = temp + + +align 32 +sad_sub22_mmx: + push ebp ; save stack pointer + mov ebp, esp ; so that we can do this + + push ebx ; Saves registers (called saves convention in + push ecx ; x86 GCC it seems) + push edx ; + + pxor mm0, mm0 ; zero acculumators + pxor mm6, mm6 + mov eax, [ebp+8] ; get p1 + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + + mov ecx, [ebp+20] ; get rowsleft + + jmp nextrow ; snap to it +align 32 +nextrow: + movq mm4, [eax] ; load 8 bytes of p1 + movq mm5, [ebx] ; load 8 bytes of p2 + + movq mm7, mm4 ; mm5 = abs(*p1-*p2) + psubusb mm7, mm5 + psubusb mm5, mm4 + add eax, edx ; update pointer to next row + paddb mm5,mm7 + + ;; Add the mm5 bytes to the accumulatores + movq mm7,mm5 + punpcklbw mm7,mm6 + paddw mm0, mm7 + punpckhbw mm5,mm6 + add ebx, edx ; update pointer to next row + paddw mm0, mm5 + + movq mm4, [eax] ; load 8 bytes of p1 (next row) + movq mm5, [ebx] ; load 8 bytes of p2 (next row) + + movq mm7, mm4 ; mm5 = abs(*p1-*p2) + psubusb mm7, mm5 + psubusb mm5, mm4 + add eax, edx ; update pointer to next row + paddb mm5,mm7 + + ;; Add the mm5 bytes to the accumulatores + movq mm7,mm5 + punpcklbw mm7,mm6 + add ebx, edx ; update pointer to next row + paddw mm0, mm7 + punpckhbw mm5,mm6 + sub ecx,2 + paddw mm0, mm5 + + + jnz nextrow + + ;; Sum the Accumulators + movq mm1, mm0 + psrlq mm1, 16 + movq mm2, mm0 + psrlq mm2, 32 + movq mm3, mm0 + psrlq mm3, 48 + paddw mm0, mm1 + paddw mm2, mm3 + paddw mm0, mm2 + + movd eax, mm0 ; store return value + and eax, 0xffff + + pop edx ; pop pop + pop ecx ; fizz fizz + pop ebx ; ia86 needs a fizz instruction + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret ; we now return you to your regular programming + + + + +global sad_sub44_mmx + +; int sad_sub44_mmx(unsigned char *blk1,unsigned char *blk2,int qlx,int qh); + +; eax = p1 +; ebx = p2 +; ecx = temp +; edx = qlx; +; esi = rowsleft + +; mm0 = distance accumulator left block p1 +; mm1 = distance accumulator right block p1 +; mm2 = 0 +; mm3 = right block of p1 +; mm4 = left block of p1 +; mm5 = p2 +; mm6 = temp +; mm7 = temp + +align 32 +sad_sub44_mmx: + push ebp ; save stack pointer + mov ebp, esp ; so that we can do this + + push ebx + push ecx + push edx + push esi + + pxor mm0, mm0 ; zero acculumator + pxor mm1, mm1 + pxor mm2, mm2 + mov eax, [ebp+8] ; get p1 + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get qlx + mov esi, [ebp+20] ; get rowsleft + jmp nextrowqd ; snap to it +align 32 +nextrowqd: + + ;; + ;; Beware loop obfuscated by interleaving to try to + ;; hide latencies... + ;; + movq mm4, [eax] ; mm4 = first 4 bytes of p1 in words + movq mm5, [ebx] ; mm5 = 4 bytes of p2 in words + movq mm3, mm4 + punpcklbw mm4, mm2 + punpcklbw mm5, mm2 + + movq mm7, mm4 + movq mm6, mm5 + psubusw mm7, mm5 + psubusw mm6, mm4 + + add eax, edx ; update a pointer to next row +; punpckhbw mm3, mm2 ; mm3 = 2nd 4 bytes of p1 in words + + paddw mm7, mm6 + paddw mm0, mm7 ; Add absolute differences to left block accumulators + +; movq mm7,mm3 +; psubusw mm7, mm5 +; psubusw mm5, mm3 + + add ebx, edx ; update a pointer to next row + sub esi, 1 + +; paddw mm7, mm5 +; paddw mm1, mm7 ; Add absolute differences to right block accumulators + + + + jnz nextrowqd + + ;; Sum the accumulators + + movq mm4, mm0 + psrlq mm4, 32 + paddw mm0, mm4 + movq mm6, mm0 + psrlq mm6, 16 + paddw mm0, mm6 + movd eax, mm0 ; store return value + +; movq mm4, mm1 +; psrlq mm4, 32 +; paddw mm1, mm4 +; movq mm6, mm1 +; psrlq mm6, 16 +; paddw mm1, mm6 +; movd ebx, mm1 + + and eax, 0xffff +; sal ebx, 16 +; or eax, ebx + + pop esi + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret ; we now return you to your regular programming diff --git a/veejay-current/utils/mblock_sad_mmxe.s b/veejay-current/utils/mblock_sad_mmxe.s new file mode 100644 index 00000000..460ed0fc --- /dev/null +++ b/veejay-current/utils/mblock_sad_mmxe.s @@ -0,0 +1,1002 @@ +;;; +;;; mblock_sad_mmxe.s: +;;; +;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks +;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel) +; +; Original MMX sad_* Copyright (C) 2000 Chris Atenasio +; Enhanced MMX and rest Copyright (C) 2000 Andrew Stevens + + ;; Yes, I tried prefetch-ing. It makes no difference or makes + ;; stuff *slower*. + +; +; This program is free software; you can reaxstribute it and/or +; modify it under the terms of the GNU General Public License +; as published by the Free Software Foundation; either version 2 +; of the License, or (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +; +; +; + +SECTION .text + +global sad_00_mmxe + +; int sad_00(char *blk1,char *blk2,int lx,int h,int distlim); +; distlim unused - costs more to check than the savings of +; aborting the computation early from time to time... +; eax = p1 +; ebx = p2 +; ecx = rowsleft +; edx = lx; + +; mm0 = distance accumulator +; mm1 = temp +; mm2 = temp +; mm3 = temp +; mm4 = temp +; mm5 = temp +; mm6 = temp + + +align 32 +sad_00_mmxe: + push ebp ; save frame pointer + mov ebp, esp ; link + + push ebx + push ecx + push edx + + pxor mm0, mm0 ; zero acculumator + + mov eax, [ebp+8] ; get p1 +sad_00_0misalign: + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + + mov ecx, [ebp+20] ; get rowsleft + jmp nextrow00sse +align 32 +nextrow00sse: + movq mm4, [eax] ; load first 8 bytes of p1 (row 1) + psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1) + movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1) + add eax, edx ; update pointer to next row + paddd mm0, mm4 ; accumulate difference + + psadbw mm5, [ebx+8] ; compare to next 8 bytes of p2 (row 1) + add ebx, edx ; ditto + paddd mm0, mm5 ; accumulate difference + + + movq mm6, [eax] ; load first 8 bytes of p1 (row 2) + psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2) + movq mm4, [eax+8] ; load next 8 bytes of p1 (row 2) + add eax, edx ; update pointer to next row + paddd mm0, mm6 ; accumulate difference + + psadbw mm4, [ebx+8] ; compare to next 8 bytes of p2 (row 2) + add ebx, edx ; ditto + paddd mm0, mm4 ; accumulate difference + + ;psubd mm2, mm3 ; decrease rowsleft + ;movq mm5, mm1 ; copy distlim + ;pcmpgtd mm5, mm0 ; distlim > dist? + ;pand mm2, mm5 ; mask rowsleft with answer + ;movd ecx, mm2 ; move rowsleft to ecx + + ;add eax, edx ; update pointer to next row + ;add ebx, edx ; ditto + + ;test ecx, ecx ; check rowsleft + sub ecx, 2 + jnz nextrow00sse + + movd eax, mm0 ; store return value + + pop edx + pop ecx + pop ebx + + pop ebp + emms + ret + + + + +global sad_00_Ammxe + ;; This is a special version that only does aligned accesses... + ;; Wonder if it'll make it faster on a P-III + ;; ANSWER: NO its slower hence no longer used. + +; int sad_00(char *blk1,char *blk2,int lx,int h,int distlim); +; distlim unused - costs more to check than the savings of +; aborting the computation early from time to time... +; eax = p1 +; ebx = p2 +; ecx = rowsleft +; edx = lx; + +; mm0 = distance accumulator +; mm1 = temp +; mm2 = right shift to adjust for mis-align +; mm3 = left shift to adjust for mis-align +; mm4 = temp +; mm5 = temp +; mm6 = temp + + +align 32 +sad_00_Ammxe: + push ebp ; save frame pointer + mov ebp, esp ; link + + push ebx + push ecx + push edx + + pxor mm0, mm0 ; zero acculumator + + mov eax, [ebp+8] ; get p1 + mov ebx, eax + and ebx, 7 ; Misalignment! + cmp ebx, 0 + jz near sad_00_0misalign + sub eax, ebx ; Align eax + mov ecx, 8 ; ecx = 8-misalignment + sub ecx, ebx + shl ebx, 3 ; Convert into bit-shifts... + shl ecx, 3 + movd mm2, ebx ; mm2 = shift to start msb + movd mm3, ecx ; mm3 = shift to end lsb + + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + mov ecx, [ebp+20] ; get rowsleft + jmp nextrow00ssea +align 32 +nextrow00ssea: + movq mm4, [eax] ; load first 8 bytes of aligned p1 (row 1) + movq mm5, [eax+8] ; load next 8 bytes of aligned p1 (row 1) + movq mm6, mm5 + psrlq mm4, mm2 ; mm4 first 8 bytes of p1 proper + psllq mm5, mm3 + por mm4, mm5 + psadbw mm4, [ebx] ; compare to first 8 bytes of p2 + + movq mm7, [eax+16] ; load last 8 bytes of aligned p1 + add eax, edx ; update pointer to next row + psrlq mm6, mm2 ; mm6 2nd 8 bytes of p1 proper + psllq mm7, mm3 + por mm6, mm7 + + + paddd mm0, mm4 ; accumulate difference + + psadbw mm6, [ebx+8] ; compare to next 8 bytes of p2 (row 1) + add ebx, edx ; ditto + paddd mm0, mm6 ; accumulate difference + + sub ecx, 1 + jnz nextrow00ssea + + movd eax, mm0 ; store return value + + pop edx + pop ecx + pop ebx + + pop ebp + emms + ret + + +global sad_01_mmxe + +; int sad_01(char *blk1,char *blk2,int lx,int h); + +; eax = p1 +; ebx = p2 +; ecx = counter temp +; edx = lx; + +; mm0 = distance accumulator +; mm1 = distlim +; mm2 = rowsleft +; mm3 = 2 (rows per loop) +; mm4 = temp +; mm5 = temp +; mm6 = temp + + +align 32 +sad_01_mmxe: + push ebp + mov ebp, esp + + push ebx + push ecx + push edx + + pxor mm0, mm0 ; zero acculumator + + mov eax, [ebp+8] ; get p1 + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + + mov ecx, [ebp+20] ; get rowsleft + jmp nextrow01 ; snap to it +align 32 +nextrow01: + movq mm4, [eax] ; load first 8 bytes of p1 (row 1) + pavgb mm4, [eax+1] ; Interpolate... + psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1) + paddd mm0, mm4 ; accumulate difference + + movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1) + pavgb mm5, [eax+9] ; Interpolate + psadbw mm5, [ebx+8] ; compare to next 8 bytes of p2 (row 1) + paddd mm0, mm5 ; accumulate difference + + add eax, edx ; update pointer to next row + add ebx, edx ; ditto + + movq mm6, [eax] ; load first 8 bytes of p1 (row 2) + pavgb mm6, [eax+1] ; Interpolate + psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2) + paddd mm0, mm6 ; accumulate difference + + movq mm7, [eax+8] ; load next 8 bytes of p1 (row 2) + pavgb mm7, [eax+9] + psadbw mm7, [ebx+8] ; compare to next 8 bytes of p2 (row 2) + paddd mm0, mm7 ; accumulate difference + + add eax, edx ; update pointer to next row + add ebx, edx ; ditto + + sub ecx, 2 ; check rowsleft + jnz nextrow01 ; rinse and repeat + + movd eax, mm0 ; store return value + + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret ; we now return you to your regular programming + + +global sad_10_mmxe + +; int sad_10(char *blk1,char *blk2,int lx,int h); + +; eax = p1 +; ebx = p2 +; ecx = counter temp +; edx = lx; +; edi = p1+lx + +; mm0 = distance accumulator +; mm2 = rowsleft +; mm3 = 2 (rows per loop) +; mm4 = temp +; mm5 = temp +; mm6 = temp + + +align 32 +sad_10_mmxe: + push ebp ; save stack pointer + mov ebp, esp + + push ebx + push ecx + push edx + push edi + + pxor mm0, mm0 ; zero acculumator + + mov eax, [ebp+8] ; get p1 + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + mov edi, eax + add edi, edx + mov ecx, [ebp+20] ; get rowsleft + jmp nextrow10 ; snap to it +align 32 +nextrow10: + movq mm4, [eax] ; load first 8 bytes of p1 (row 1) + pavgb mm4, [edi] ; Interpolate... + psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1) + paddd mm0, mm4 ; accumulate difference + + movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1) + pavgb mm5, [edi+8] ; Interpolate + psadbw mm5, [ebx+8] ; compare to next 8 bytes of p2 (row 1) + paddd mm0, mm5 ; accumulate difference + + add eax, edx ; update pointer to next row + add ebx, edx ; ditto + add edi, edx + + movq mm6, [eax] ; load first 8 bytes of p1 (row 2) + pavgb mm6, [edi] ; Interpolate + psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2) + paddd mm0, mm6 ; accumulate difference + + movq mm7, [eax+8] ; load next 8 bytes of p1 (row 2) + pavgb mm7, [edi+8] + psadbw mm7, [ebx+8] ; compare to next 8 bytes of p2 (row 2) + paddd mm0, mm7 ; accumulate difference + + psubd mm2, mm3 ; decrease rowsleft + + add eax, edx ; update pointer to next row + add ebx, edx ; ditto + add edi, edx + + sub ecx, 2 ; check rowsleft (we're doing 2 at a time) + jnz nextrow10 ; rinse and repeat + + movd eax, mm0 ; store return value + + pop edi + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret ; we now return you to your regular programming + + +global sad_11_mmxe + +; int sad_11(char *blk1,char *blk2,int lx,int h); + +; eax = p1 +; ebx = p2 +; ecx = counter temp +; edx = lx; +; edi = p1+lx + + +; mm0 = distance accumulator +; mm2 = rowsleft +; mm3 = 2 (rows per loop) +; mm4 = temp +; mm5 = temp +; mm6 = temp + + +align 32 +sad_11_mmxe: + push ebp ; save stack pointer + mov ebp, esp ; so that we can do this + + push ebx ; save the pigs + push ecx ; make them squeal + push edx ; lets have pigs for every meal + push edi + + pxor mm0, mm0 ; zero acculumator + + mov eax, [ebp+8] ; get p1 + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + mov edi, eax + add edi, edx + mov ecx, [ebp+20] ; get rowsleft + jmp nextrow11 ; snap to it +align 32 +nextrow11: + movq mm4, [eax] ; load first 8 bytes of p1 (row 1) + pavgb mm4, [edi] ; Interpolate... + movq mm5, [eax+1] + pavgb mm5, [edi+1] + pavgb mm4, mm5 + psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1) + paddd mm0, mm4 ; accumulate difference + + movq mm6, [eax+8] ; load next 8 bytes of p1 (row 1) + pavgb mm6, [edi+8] ; Interpolate + movq mm7, [eax+9] + pavgb mm7, [edi+9] + pavgb mm6, mm7 + psadbw mm6, [ebx+8] ; compare to next 8 bytes of p2 (row 1) + paddd mm0, mm6 ; accumulate difference + + add eax, edx ; update pointer to next row + add ebx, edx ; ditto + add edi, edx + + movq mm4, [eax] ; load first 8 bytes of p1 (row 1) + pavgb mm4, [edi] ; Interpolate... + movq mm5, [eax+1] + pavgb mm5, [edi+1] + pavgb mm4, mm5 + psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1) + paddd mm0, mm4 ; accumulate difference + + movq mm6, [eax+8] ; load next 8 bytes of p1 (row 1) + pavgb mm6, [edi+8] ; Interpolate + movq mm7, [eax+9] + pavgb mm7, [edi+9] + pavgb mm6, mm7 + psadbw mm6, [ebx+8] ; compare to next 8 bytes of p2 (row 1) + paddd mm0, mm6 ; accumulate difference + + add eax, edx ; update pointer to next row + add ebx, edx ; ditto + add edi, edx + + + sub ecx, 2 ; check rowsleft + jnz near nextrow11 ; rinse and repeat + + movd eax, mm0 ; store return value + + pop edi + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret ; we now return you to your regular programming + +global sad_sub22_mmxe + +; int sad_sub22_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh); + +; eax = p1 +; ebx = p2 +; ecx = counter temp +; edx = flx; + +; mm0 = distance accumulator +; mm2 = rowsleft +; mm3 = 2 (rows per loop) +; mm4 = temp +; mm5 = temp +; mm6 = temp + + +align 32 +sad_sub22_mmxe: + push ebp ; save frame pointer + mov ebp, esp + + push ebx + push ecx + push edx + + pxor mm0, mm0 ; zero acculumator + + mov eax, [ebp+8] ; get p1 + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + + mov ecx, [ebp+20] + jmp nextrowfd +align 32 +nextrowfd: + movq mm4, [eax] ; load first 8 bytes of p1 (row 1) + add eax, edx ; update pointer to next row + psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1) + add ebx, edx ; ditto + paddd mm0, mm4 ; accumulate difference + + + movq mm6, [eax] ; load first 8 bytes of p1 (row 2) + add eax, edx ; update pointer to next row + psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2) + add ebx, edx ; ditto + paddd mm0, mm6 ; accumulate difference + + + sub ecx, 2 + jnz nextrowfd + + movd eax, mm0 + + pop edx + pop ecx + pop ebx + + pop ebp + + emms + ret + + + + + +global sad_sub44_mmxe + +; int sad_sub44_mmxe(unsigned char *blk1,unsigned char *blk2,int qlx,int qh); + +; eax = p1 +; ebx = p2 +; ecx = temp +; edx = qlx; +; esi = rowsleft + +; mm0 = distance accumulator left block p1 +; mm1 = distance accumulator right block p1 +; mm2 = 0 +; mm3 = 0 +; mm4 = temp +; mm5 = temp +; mm6 = temp + + +align 32 +sad_sub44_mmxe: + push ebp + mov ebp, esp + + push ebx + push ecx + push edx + push esi + + pxor mm0, mm0 ; zero acculumator + pxor mm1, mm1 + pxor mm2, mm2 + mov eax, [ebp+8] ; get p1 + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get qlx + + mov esi, [ebp+20] ; get rowsleft + jmp nextrowqd ; snap to it +align 32 +nextrowqd: + movq mm4, [eax] ; load 8 bytes of p1 (two blocks!) + add eax, edx ; update pointer to next row + movq mm6, mm4 ; + mov ecx, [ebx] ; load 4 bytes of p2 + punpcklbw mm4, mm2 ; mm4 = bytes 0..3 p1 (spaced out) + movd mm5, ecx + punpcklbw mm5, mm2 ; mm5 = bytes 0..3 p2 (spaced out) + psadbw mm4, mm5 ; compare to left block + add ebx, edx ; ditto + +; punpckhbw mm6, mm2 ; mm6 = bytes 4..7 p1 (spaced out) + + paddd mm0, mm4 ; accumulate difference left block + +; psadbw mm6,mm5 ; compare to right block + + +; paddd mm1, mm6 ; accumulate difference right block + + sub esi, 1 + jnz nextrowqd + + movd eax, mm0 +; movd ebx, mm1 +; sal ebx, 16 +; or eax, ebx + + pop esi + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret ; we now return you to your regular programming + + +;;; +;;; mblock_*nearest4_sad_mmxe.s: +;;; +;;; Enhanced MMX optimized Sum Absolute Differences routines for +;;; quads macroblocks offset by (0,0) (0,1) (1,0) (1,1) pel +;;; + +;;; Explanation: the motion compensation search at 1-pel and 2*2 sub-sampled +;;; evaluates macroblock quads. A lot of memory accesses can be saved +;;; if each quad is done together rather than each macroblock in the +;;; quad handled individually. + +;;; TODO: Really there ought to be MMX versions and the function's +;;; specification should be documented... +; +; Copyright (C) 2000 Andrew Stevens + + +;;; CURRENTLY not used but used in testing as reference for tweaks... +global mblockq_sad_REF + +; void mblockq_sad_REF(char *blk1,char *blk2,int lx,int h,int *weightvec); +; eax = p1 +; ebx = p2 +; ecx = unused +; edx = lx; +; edi = rowsleft +; esi = h + +; mm0 = SAD (x+0,y+0) +; mm1 = SAD (x+2,y+0) +; mm2 = SAD (x+0,y+2) +; mm3 = SAD (x+2,y+2) +; mm4 = temp +; mm5 = temp +; mm6 = temp +; mm7 = temp + +align 32 +mblockq_sad_REF: + push ebp ; save frame pointer + mov ebp, esp ; link + push eax + push ebx + push ecx + push edx + push edi + push esi + + pxor mm0, mm0 ; zero accumulators + pxor mm1, mm1 + pxor mm2, mm2 + pxor mm3, mm3 + mov eax, [ebp+8] ; get p1 + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + + mov edi, [ebp+20] ; get rowsleft + mov esi, edi + + jmp nextrow_block_d1 +align 32 +nextrow_block_d1: + + ;; Do the (+0,+0) SAD + + movq mm4, [eax] ; load 1st 8 bytes of p1 + movq mm6, mm4 + movq mm5, [ebx] + psadbw mm4, mm5 ; compare to 1st 8 bytes of p2 + paddd mm0, mm4 ; accumulate difference + movq mm4, [eax+8] ; load 2nd 8 bytes of p1 + movq mm7, mm4 + psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2 + paddd mm0, mm4 ; accumulate difference + + + cmp edi, esi + jz firstrow0 + + ;; Do the (0,+2) SAD + sub ebx, edx + psadbw mm6, [ebx] ; compare to next 8 bytes of p2 (row 1) + paddd mm2, mm6 ; accumulate difference + psadbw mm7, [ebx+8] ; next 8 bytes of p1 (row 1) + add ebx, edx + paddd mm2, mm7 + +firstrow0: + + ;; Do the (+2,0) SAD + + movq mm4, [eax+1] + + movq mm6, mm4 + psadbw mm4, mm5 ; compare to 1st 8 bytes of p2 + paddd mm1, mm4 ; accumulate difference + movq mm4, [eax+9] + movq mm7, mm4 + psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2 + paddd mm1, mm4 ; accumulate difference + + cmp edi, esi + jz firstrow1 + + ;; Do the (+2, +2 ) SAD + sub ebx, edx + psadbw mm6, [ebx] ; compare to 1st 8 bytes of prev p2 + psadbw mm7, [ebx+8] ; 2nd 8 bytes of prev p2 + add ebx, edx + paddd mm3, mm6 ; accumulate difference + paddd mm3, mm7 +firstrow1: + + add eax, edx ; update pointer to next row + add ebx, edx ; ditto + + sub edi, 1 + jnz near nextrow_block_d1 + + ;; Do the last row of the (0,+2) SAD + + movq mm4, [eax] ; load 1st 8 bytes of p1 + movq mm5, [eax+8] ; load 2nd 8 bytes of p1 + sub ebx, edx + psadbw mm4, [ebx] ; compare to next 8 bytes of p2 (row 1) + psadbw mm5, [ebx+8] ; next 8 bytes of p1 (row 1) + paddd mm2, mm4 ; accumulate difference + paddd mm2, mm5 + + movq mm4, [eax+1] + movq mm5, [eax+9] + + ;; Do the last row of rhw (+2, +2) SAD + psadbw mm4, [ebx] ; compare to 1st 8 bytes of prev p2 + psadbw mm5, [ebx+8] ; 2nd 8 bytes of prev p2 + paddd mm3, mm4 ; accumulate difference + paddd mm3, mm5 + + + mov eax, [ebp+24] ; Weightvec + movd [eax+0], mm0 + movd [eax+4], mm1 + movd [eax+8], mm2 + movd [eax+12], mm3 + + pop esi + pop edi + pop edx + pop ecx + pop ebx + pop eax + + pop ebp + emms + ret + + + +global mblock_nearest4_sads_mmxe + +; void mblock_nearest4_sads_mmxe(char *blk1,char *blk2,int lx,int h,int *weightvec); + +; eax = p1 +; ebx = p2 +; ecx = unused +; edx = lx; +; edi = rowsleft +; esi = h + +; mm0 = SAD (x+0,y+0),SAD (x+0,y+2) +; mm1 = SAD (x+2,y+0),SAD (x+2,y+2) + +; mm4 = temp +; mm5 = temp +; mm6 = temp +; mm7 = temp + +align 32 +mblock_nearest4_sads_mmxe: + push ebp ; save frame pointer + mov ebp, esp ; link + push eax + push ebx + push ecx + push edx + push edi + push esi + + mov eax, [ebp+8] ; get p1 + prefetcht0 [eax] + pxor mm0, mm0 ; zero accumulators + pxor mm1, mm1 + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + + mov edi, [ebp+20] ; get rowsleft + mov esi, edi + + jmp nextrow_block_e1 +align 32 +nextrow_block_e1: + + ;; Do the (+0,+0) SAD + prefetcht0 [eax+edx] + movq mm4, [eax] ; load 1st 8 bytes of p1 + movq mm6, mm4 + movq mm5, [ebx] + psadbw mm4, mm5 ; compare to 1st 8 bytes of p2 + paddd mm0, mm4 ; accumulate difference + movq mm4, [eax+8] ; load 2nd 8 bytes of p1 + movq mm7, mm4 + psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2 + paddd mm0, mm4 ; accumulate difference + + + cmp edi, esi + jz firstrowe0 + + ;; Do the (0,+2) SAD + sub ebx, edx + pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 + movq mm2, [ebx] + psadbw mm6, mm2 ; compare to next 8 bytes of p2 (row 1) + paddd mm0, mm6 ; accumulate difference + movq mm3, [ebx+8] + psadbw mm7, mm3 ; next 8 bytes of p1 (row 1) + add ebx, edx + paddd mm0, mm7 + pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 +firstrowe0: + + ;; Do the (+2,0) SAD + + movq mm4, [eax+1] + movq mm6, mm4 + + psadbw mm4, mm5 ; compare to 1st 8 bytes of p2 + paddd mm1, mm4 ; accumulate difference + + movq mm4, [eax+9] + movq mm7, mm4 + + psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2 + paddd mm1, mm4 ; accumulate difference + + cmp edi, esi + jz firstrowe1 + + ;; Do the (+2, +2 ) SAD + sub ebx, edx + pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 + psadbw mm6, mm2 ; compare to 1st 8 bytes of prev p2 + psadbw mm7, mm3 ; 2nd 8 bytes of prev p2 + add ebx, edx + paddd mm1, mm6 ; accumulate difference + paddd mm1, mm7 + pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 +firstrowe1: + + add eax, edx ; update pointer to next row + add ebx, edx ; ditto + + sub edi, 1 + jnz near nextrow_block_e1 + + ;; Do the last row of the (0,+2) SAD + pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 + movq mm4, [eax] ; load 1st 8 bytes of p1 + movq mm5, [eax+8] ; load 2nd 8 bytes of p1 + sub ebx, edx + psadbw mm4, [ebx] ; compare to next 8 bytes of p2 (row 1) + psadbw mm5, [ebx+8] ; next 8 bytes of p1 (row 1) + paddd mm0, mm4 ; accumulate difference + paddd mm0, mm5 + + + ;; Do the last row of rhw (+2, +2) SAD + pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 + movq mm4, [eax+1] + movq mm5, [eax+9] + + psadbw mm4, [ebx] ; compare to 1st 8 bytes of prev p2 + psadbw mm5, [ebx+8] ; 2nd 8 bytes of prev p2 + paddd mm1, mm4 ; accumulate difference + paddd mm1, mm5 + + + mov eax, [ebp+24] ; Weightvec + movd [eax+8], mm0 + pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 + movd [eax+12], mm1 + pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 + movd [eax+0], mm0 + movd [eax+4], mm1 + + pop esi + pop edi + pop edx + pop ecx + pop ebx + pop eax + + pop ebp + emms + ret + +global mblock_sub22_nearest4_sads_mmxe + +; void mblock_sub22_nearest4_sads_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh, int* resvec); + +; eax = p1 +; ebx = p2 +; ecx = counter temp +; edx = flx; + +; mm0 = distance accumulator +; mm1 = distance accumulator +; mm2 = previous p1 row +; mm3 = previous p1 displaced by 1 byte... +; mm4 = temp +; mm5 = temp +; mm6 = temp +; mm7 = temp / 0 if first row 0xff otherwise + + +align 32 +mblock_sub22_nearest4_sads_mmxe: + push ebp ; save frame pointer + mov ebp, esp + push eax + push ebx + push ecx + push edx + + pxor mm0, mm0 ; zero acculumator + pxor mm1, mm1 ; zero acculumator + pxor mm2, mm2 ; zero acculumator + pxor mm3, mm3 ; zero acculumator + + mov eax, [ebp+8] ; get p1 + mov ebx, [ebp+12] ; get p2 + mov edx, [ebp+16] ; get lx + mov ecx, [ebp+20] + movq mm2, [eax+edx] + movq mm3, [eax+edx+1] + jmp nextrowbd22 +align 32 +nextrowbd22: + movq mm5, [ebx] ; load previous row reference block + ; mm2 /mm3 containts current row target block + + psadbw mm2, mm5 ; Comparse (x+0,y+2) + paddd mm1, mm2 + + psadbw mm3, mm5 ; Compare (x+2,y+2) + pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 + paddd mm1, mm3 + + pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 + + movq mm2, [eax] ; Load current row traget block into mm2 / mm3 + movq mm6, mm2 + movq mm3, [eax+1] + sub eax, edx + sub ebx, edx + prefetcht0 [eax] + movq mm7, mm3 + + psadbw mm6, mm5 ; Compare (x+0,y+0) + paddd mm0, mm6 + pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 + psadbw mm7, mm5 ; Compare (x+2,y+0) + paddd mm0, mm7 + pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 + + sub ecx, 1 + jnz nextrowbd22 + + mov eax, [ebp+24] + movq [eax+0], mm0 + movq [eax+8], mm1 + pop edx + pop ecx + pop ebx + pop eax + pop ebp + + emms + ret + + diff --git a/veejay-current/utils/mblock_sub44_sads_x86.c b/veejay-current/utils/mblock_sub44_sads_x86.c new file mode 100644 index 00000000..35b304da --- /dev/null +++ b/veejay-current/utils/mblock_sub44_sads_x86.c @@ -0,0 +1,245 @@ +/* + * + * mblock_sub44_sads.c + * Copyright (C) 2000 Andrew Stevens + * + * Fast block sum-absolute difference computation for a rectangular area 4*x + * by y where y > h against a 4 by h block. + * + * Used for 4*4 sub-sampled motion compensation calculations. + * + * This is actually just a shell that uses templates from the included + * file "mblock_sub44_sads_x86_h.c". I didn't trust the compiler to do a good + * job on nested inlining. One day I'll experiment. + * + * + * This file is part of mpeg2enc, a free MPEG-2 video stream encoder + * based on the original MSSG reference design + * + * mpeg2enc is free software; you can redistribute new parts + * and/or modify under the terms of the GNU General Public License + * as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * See the files for those sections (c) MSSG + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include "mjpeg_types.h" +#include "attributes.h" +#include "mmx.h" +#include "fastintfns.h" +#include "motionsearch.h" +#include "mblock_sub44_sads_x86.h" + +/* + Register usage: + mm0-mm3 Hold the current row + mm4 Used for accumulating partial SAD + mm7 Holds zero + */ + +static inline void mmx_zero_reg (void) +{ + /* load 0 into mm7 */ + pxor_r2r (mm7, mm7); +} + +/* + * Load a 8*4 block of 4*4 sub-sampled pels (qpels) into the MMX + * registers + * + */ + +static __inline__ void load_blk(uint8_t *blk,uint32_t rowstride,int h) +{ + movq_m2r( *blk, mm0); + blk += rowstride; + movq_m2r( *blk, mm1); + if( h == 2 ) + return; + blk += rowstride; + movq_m2r( *blk, mm2); + blk += rowstride; + movq_m2r( *blk, mm3); +} + +/* + * Do a shift right on the 4*4 block in the MMX registers + * We do this as a macro as otherwise we get warnings from the + * pre-optimised asm generated. + */ +#define shift_blk(shift)\ + psrlq_i2r( shift,mm0);\ + psrlq_i2r( shift,mm1);\ + psrlq_i2r( shift,mm2);\ + psrlq_i2r( shift,mm3); + +/* + * Compute the Sum absolute differences between the 4*h block in + * the MMX registers (4 least sig. bytes in mm0..mm3) + * + * and the 4*h block pointed to by refblk + * + * h == 2 || h == 4 + * + * TODO: Currently always loads and shifts 4*4 even if 4*2 is required. + * + */ + +static __inline__ int qblock_sad_mmxe(uint8_t *refblk, + uint32_t h, + uint32_t rowstride) +{ + int res; + pxor_r2r (mm4,mm4); + + movq_r2r (mm0,mm5); /* First row */ + movd_m2r (*refblk, mm6); + pxor_r2r ( mm7, mm7); + refblk += rowstride; + punpcklbw_r2r ( mm7, mm5); + punpcklbw_r2r ( mm7, mm6); + psadbw_r2r ( mm5, mm6); + paddw_r2r ( mm6, mm4 ); + + + + movq_r2r (mm1,mm5); /* Second row */ + movd_m2r (*refblk, mm6); + refblk += rowstride; + punpcklbw_r2r ( mm7, mm5); + punpcklbw_r2r ( mm7, mm6); + psadbw_r2r ( mm5, mm6); + paddw_r2r ( mm6, mm4 ); + + if( h == 4 ) + { + + movq_r2r (mm2,mm5); /* Third row */ + movd_m2r (*refblk, mm6); + refblk += rowstride; + punpcklbw_r2r ( mm7, mm5); + punpcklbw_r2r ( mm7, mm6); + psadbw_r2r ( mm5, mm6); + paddw_r2r ( mm6, mm4 ); + + + movq_r2r (mm3,mm5); /* Fourth row */ + movd_m2r (*refblk, mm6); + punpcklbw_r2r ( mm7, mm5); + punpcklbw_r2r ( mm7, mm6); + psadbw_r2r ( mm5, mm6); + paddw_r2r ( mm6, mm4 ); + + } + movd_r2m ( mm4, res ); + + return res; +} + + + +static __inline__ int qblock_sad_mmx(uint8_t *refblk, + uint32_t h, + uint32_t rowstride) +{ + int res; + pxor_r2r (mm4,mm4); + + movq_r2r (mm0,mm5); /* First row */ + movd_m2r (*refblk, mm6); + pxor_r2r ( mm7, mm7); + refblk += rowstride; + punpcklbw_r2r ( mm7, mm5); + + punpcklbw_r2r ( mm7, mm6); + + movq_r2r ( mm5, mm7); + psubusw_r2r ( mm6, mm5); + + psubusw_r2r ( mm7, mm6); + + paddw_r2r ( mm5, mm4); + paddw_r2r ( mm6, mm4 ); + + + + movq_r2r (mm1,mm5); /* Second row */ + movd_m2r (*refblk, mm6); + pxor_r2r ( mm7, mm7); + refblk += rowstride; + punpcklbw_r2r ( mm7, mm5); + punpcklbw_r2r ( mm7, mm6); + movq_r2r ( mm5, mm7); + psubusw_r2r ( mm6, mm5); + psubusw_r2r ( mm7, mm6); + paddw_r2r ( mm5, mm4); + paddw_r2r ( mm6, mm4 ); + + if( h == 4 ) + { + + movq_r2r (mm2,mm5); /* Third row */ + movd_m2r (*refblk, mm6); + pxor_r2r ( mm7, mm7); + refblk += rowstride; + punpcklbw_r2r ( mm7, mm5); + punpcklbw_r2r ( mm7, mm6); + movq_r2r ( mm5, mm7); + psubusw_r2r ( mm6, mm5); + psubusw_r2r ( mm7, mm6); + paddw_r2r ( mm5, mm4); + paddw_r2r ( mm6, mm4 ); + + movq_r2r (mm3,mm5); /* Fourth row */ + movd_m2r (*refblk, mm6); + pxor_r2r ( mm7, mm7); + punpcklbw_r2r ( mm7, mm5); + punpcklbw_r2r ( mm7, mm6); + movq_r2r ( mm5, mm7); + psubusw_r2r ( mm6, mm5); + psubusw_r2r ( mm7, mm6); + paddw_r2r ( mm5, mm4); + paddw_r2r ( mm6, mm4 ); + } + + + movq_r2r ( mm4, mm5 ); + psrlq_i2r ( 32, mm5 ); + paddw_r2r ( mm5, mm4 ); + movq_r2r ( mm4, mm6 ); + psrlq_i2r ( 16, mm6 ); + paddw_r2r ( mm6, mm4 ); + movd_r2m ( mm4, res ); + + return res & 0xffff; +} + + +/* + * Do the Extended MMX versions + */ +#define SIMD_SUFFIX(x) x##_mmxe +#include "mblock_sub44_sads_x86_h.c" +#undef SIMD_SUFFIX +/* + * Do the original MMX versions + */ +#define SIMD_SUFFIX(x) x##_mmx +#include "mblock_sub44_sads_x86_h.c" +#undef SIMD_SUFFIX + + + + diff --git a/veejay-current/utils/mblock_sub44_sads_x86.h b/veejay-current/utils/mblock_sub44_sads_x86.h new file mode 100644 index 00000000..11d35074 --- /dev/null +++ b/veejay-current/utils/mblock_sub44_sads_x86.h @@ -0,0 +1,39 @@ +/* (C) 2000/2002 Andrew Stevens */ + +/* This software is free software; you can redistribute it + * and/or modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + * + */ + +#ifndef __MBLOCK_SUB44_SADS_X86_H__ +#define __MBLOCK_SUB44_SADS_X86__ + +#include +#include "mjpeg_types.h" + +int mblocks_sub44_mests_mmxe( uint8_t *blk, uint8_t *ref, + int ilow, int jlow, + int ihigh, int jhigh, + int h, int rowstride, + int threshold, + me_result_s *resvec); +int mblocks_sub44_mests_mmx( uint8_t *blk, uint8_t *ref, + int ilow, int jlow, + int ihigh, int jhigh, + int threshold, + int h, int rowstride, + me_result_s *resvec); + +#endif diff --git a/veejay-current/utils/mblock_sub44_sads_x86_h.c b/veejay-current/utils/mblock_sub44_sads_x86_h.c new file mode 100644 index 00000000..a2916a83 --- /dev/null +++ b/veejay-current/utils/mblock_sub44_sads_x86_h.c @@ -0,0 +1,98 @@ +/* + * + * mblock_sub44_sads_x86_h.c + * Copyright (C) 2000 Andrew Stevens + * + * Fast block sum-absolute difference computation for a rectangular area 4*x + * by y where y > h against a 4 by h block. + * + * Used for 4*4 sub-sampled motion compensation calculations. + * + * + * This file is part of mpeg2enc, a free MPEG-2 video stream encoder + * based on the original MSSG reference design + * + * mpeg2enc is free software; you can redistribute new parts + * and/or modify under the terms of the GNU General Public License + * as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2enc is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * See the files for those sections (c) MSSG + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * + * Generates a vector sad's for 4*4 sub-sampled pel (qpel) data (with + * co-ordinates and top-left qpel address) from specified rectangle + * against a specified 16*h pel (4*4 qpel) reference block. The + * generated vector contains results only for those sad's that fall + * below twice the running best sad and are aligned on 8-pel + * boundaries + * + * Invariant: blk points to top-left sub-sampled pel for macroblock + * at (ilow,ihigh) + * i{low,high) j(low,high) must be multiples of 4. + * + * sad = Sum Absolute Differences + * + * NOTES: for best efficiency i{low,high) should be multiples of 16. + * + * */ + +int SIMD_SUFFIX(mblocks_sub44_mests)( uint8_t *blk, uint8_t *ref, + int ilow,int jlow, + int ihigh, int jhigh, + int h, int rowstride, + int threshold, + me_result_s *resvec) +{ + int32_t x,y; + uint8_t *currowblk = blk; + uint8_t *curblk; + me_result_s *cres = resvec; + int gridrowstride = (rowstride); + int weight; + + for( y=jlow; y <= jhigh ; y+=4) + { + curblk = currowblk; + for( x = ilow; x <= ihigh; x += 4) + { + if( (x & 15) == (ilow & 15) ) + { + load_blk( curblk, rowstride, h ); + } + weight = SIMD_SUFFIX(qblock_sad)(ref, h, rowstride); + if( weight <= threshold ) + { + threshold = intmin(weight<<2,threshold); + /* Rough and-ready absolute distance penalty */ + /* NOTE: This penalty is *vital* to correct operation + as otherwise the sub-mean filtering won't work on very + uniform images. + */ + cres->weight = (uint16_t)(weight+(intmax(intabs(x),intabs(y))<<2)); + cres->x = (uint8_t)x; + cres->y = (uint8_t)y; + ++cres; + } + curblk += 1; + shift_blk(8); + } + currowblk += gridrowstride; + } + emms(); + return cres - resvec; +} + +#undef concat diff --git a/veejay-current/utils/mblock_sumsq_mmx.s b/veejay-current/utils/mblock_sumsq_mmx.s new file mode 100644 index 00000000..6ee326b9 --- /dev/null +++ b/veejay-current/utils/mblock_sumsq_mmx.s @@ -0,0 +1,673 @@ +; +; dist2_mmx.s: mmX optimized squared distance sum +; +; Original believed to be Copyright (C) 2000 Brent Byeler +; +; This program is free software; you can reaxstribute it and/or +; modify it under the terms of the GNU General Public License +; as published by the Free Software Foundation; either version 2 +; of the License, or (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +; + +; total squared difference between two (16*h) blocks +; including optional half pel interpolation of [ebp+8] ; blk1 (hx,hy) +; blk1,blk2: addresses of top left pels of both blocks +; lx: distance (in bytes) of vertically adjacent pels +; hx,hy: flags for horizontal and/or vertical interpolation +; h: height of block (usually 8 or 16) +; mmX version + +global sumsq_mmx +; int sumsq_mmx(unsigned char *blk1, unsigned char *blk2, +; int lx, int hx, int hy, int h) + +; mm7 = 0 + +; eax = pblk1 +; ebx = pblk2 +; ecx = temp +; edx = distance_sum +; edi = h +; esi = lx + + ;; + ;; private constants needed + ;; + +SECTION .data +align 16 +twos: + dw 2 + dw 2 + dw 2 + dw 2 + +SECTION .text + +align 32 +sumsq_mmx: + push ebp ; save frame pointer + mov ebp, esp ; link + push ebx + push ecx + push edx + push esi + push edi + + mov esi, [ebp+16] ; lx + mov eax, [ebp+20] ; hx + mov edx, [ebp+24] ; hy + mov edi, [ebp+28] ; h + + pxor mm5, mm5 ; sum + test edi, edi ; h = 0? + jle near d2exit + + pxor mm7, mm7 ; get zeros i mm7 + + test eax, eax ; hx != 0? + jne near d2is10 + test edx, edx ; hy != 0? + jne near d2is10 + + mov eax, [ebp+8] + mov ebx, [ebp+12] + jmp d2top00 + +align 32 +d2top00: + movq mm0, [eax] + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + + movq mm2, [ebx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + + psubw mm0, mm2 + psubw mm1, mm3 + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + paddd mm0, mm1 + + movq mm1, [eax+8] + movq mm2, mm1 + punpcklbw mm1, mm7 + punpckhbw mm2, mm7 + + movq mm3, [ebx+8] + movq mm4, mm3 + punpcklbw mm3, mm7 + punpckhbw mm4, mm7 + + psubw mm1, mm3 + psubw mm2, mm4 + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + paddd mm1, mm2 + + paddd mm0, mm1 + + ;; Accumulate sum in edx... we use mm5 + ;movd ecx, mm0 + ;add edx, ecx + ;psrlq mm0, 32 + ;movd ecx, mm0 + ;add edx, ecx + paddd mm5, mm0 + + add eax, esi + add ebx, esi + dec edi + jg d2top00 + jmp near d2exit + ;jmp d2exit changes made because of problems with nasm 0.98.23 + + +d2is10: + test eax, eax + je near d2is01 + test edx, edx + jne near d2is01 + + + mov eax, [ebp+8] ; blk1 + mov ebx, [ebp+12] ; blk1 + + pxor mm6, mm6 ; mm6 = 0 and isn't changed anyplace in the loop.. + pcmpeqw mm1, mm1 + psubw mm6, mm1 + jmp d2top10 + +align 32 +d2top10: + movq mm0, [eax] + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + movq mm2, [eax+1] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + paddw mm0, mm6 ; here we add mm6 = 0.... weird... + paddw mm1, mm6 + psrlw mm0, 1 + psrlw mm1, 1 + + movq mm2, [ebx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + + psubw mm0, mm2 + psubw mm1, mm3 + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + paddd mm0, mm1 + + movq mm1, [eax+8] + movq mm2, mm1 + punpcklbw mm1, mm7 + punpckhbw mm2, mm7 + movq mm3, [eax+9] + movq mm4, mm3 + punpcklbw mm3, mm7 + punpckhbw mm4, mm7 + paddw mm1, mm3 + paddw mm2, mm4 + paddw mm1, mm6 + paddw mm2, mm6 + psrlw mm1, 1 + psrlw mm2, 1 + + movq mm3, [ebx+8] + movq mm4, mm3 + punpcklbw mm3, mm7 + punpckhbw mm4, mm7 + + psubw mm1, mm3 + psubw mm2, mm4 + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + paddd mm1, mm2 + + + paddd mm0, mm1 + ; Accumulate mm0 sum on edx... we'll use mm5 for this and add up at the end + ; movd ecx, mm0 + ; add edx, ecx + ; psrlq mm0, 32 + ; movd ecx, mm0 + ; add edx, ecx + paddd mm5, mm0 + add eax, esi + add ebx, esi + dec edi + jg near d2top10 + + + jmp near d2exit + ;jmp d2exit changes made because of problems with nasm 0.98.23 + +d2is01: + test eax, eax + jne near d2is11 + test edx, edx + je near d2is11 + + mov eax, [ebp+8] ; blk1 + mov edx, [ebp+12] ; blk2 + mov ebx, eax + add ebx, esi ; blk1 + lx + + pxor mm6, mm6 + pcmpeqw mm1, mm1 + psubw mm6, mm1 ; mm6 = 1 + jmp d2top01 + +align 32 +d2top01: + movq mm0, [eax] + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + movq mm2, [ebx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + paddw mm0, mm6 + paddw mm1, mm6 + psrlw mm0, 1 + psrlw mm1, 1 + + movq mm2, [edx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + + psubw mm0, mm2 + psubw mm1, mm3 + + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + paddd mm0, mm1 + + movq mm1, [eax+8] + movq mm2, mm1 + punpcklbw mm1, mm7 + punpckhbw mm2, mm7 + + movq mm3, [ebx+8] + movq mm4, mm3 + punpcklbw mm3, mm7 + punpckhbw mm4, mm7 + + paddw mm1, mm3 + paddw mm2, mm4 + paddw mm1, mm6 + paddw mm2, mm6 + psrlw mm1, 1 + psrlw mm2, 1 + + movq mm3, [edx+8] + movq mm4, mm3 + punpcklbw mm3, mm7 + punpckhbw mm4, mm7 + + psubw mm1, mm3 + psubw mm2, mm4 + + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + paddd mm0, mm1 + paddd mm0, mm2 + + paddd mm5, mm0 + + mov eax, ebx ; eax = eax + lx + add edx, esi ; edx = edx + lx + add ebx, esi ; ebx = ebx + lx + dec edi + jg near d2top01 + jmp near d2exit + ;jmp d2exit changes made because of problems with nasm 0.98.23 + +d2is11: + mov eax, [ebp+8] ; blk1 + mov edx, [ebp+12] ; blk2 + mov ebx, eax ; blk1 + add ebx, esi ; ebx = blk1 + lx + jmp d2top11 + +align 32 +d2top11: + movq mm0, [eax] + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + movq mm2, [eax+1] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + movq mm2, [ebx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + movq mm4, [ebx+1] + movq mm6, mm4 + punpcklbw mm4, mm7 + punpckhbw mm6, mm7 + paddw mm2, mm4 + paddw mm3, mm6 + paddw mm0, mm2 + paddw mm1, mm3 + movq mm6, [twos] + paddw mm0, mm6 ; round mm0 + paddw mm1, mm6 ; round mm1 + psrlw mm0, 2 + psrlw mm1, 2 + + movq mm2, [edx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + + psubw mm0, mm2 + psubw mm1, mm3 + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + paddd mm0, mm1 + + movq mm1, [eax+8] + movq mm2, mm1 + punpcklbw mm1, mm7 + punpckhbw mm2, mm7 + + movq mm3, [eax+9] + movq mm4, mm3 + punpcklbw mm3, mm7 + punpckhbw mm4, mm7 + + paddw mm1, mm3 + paddw mm2, mm4 + + movq mm3, [ebx+8] + movq mm4, mm3 + punpcklbw mm3, mm7 + punpckhbw mm4, mm7 + paddw mm1, mm3 + paddw mm2, mm4 + + movq mm3, [ebx+9] + movq mm4, mm3 + punpcklbw mm3, mm7 + punpckhbw mm4, mm7 + + paddw mm1, mm3 + paddw mm2, mm4 + + movq mm6, [twos] + paddw mm1, mm6 + paddw mm2, mm6 + + psrlw mm1, 2 + psrlw mm2, 2 + + movq mm3, [edx+8] + movq mm4, mm3 + punpcklbw mm3, mm7 + punpckhbw mm4, mm7 + + psubw mm1, mm3 + psubw mm2, mm4 + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + paddd mm1, mm2 + + paddd mm0, mm1 + + ;; + ;; Accumulate the result in "s" we use mm6 for the purpose... + paddd mm5, mm0 + + mov eax, ebx ; ahem ebx = eax at start of loop and wasn't changed... + add ebx, esi + add edx, esi + dec edi + jg near d2top11 + + +d2exit: + ;; Put the final sum in eax for return... + movd eax, mm5 + psrlq mm5, 32 + movd ecx, mm5 + add eax, ecx + + pop edi + pop esi + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret + + +; total squared difference between two (8*h) blocks +; blk1,blk2: addresses of top left pels of both blocks +; lx: distance (in bytes) of vertically adjacent pels +; h: height of block (usually 4, or 8) +; mmX version + +global sumsq_sub22_mmx +; int sumsq_sub22_mmx(unsigned char *blk1, unsigned char *blk2, +; int lx, int h) + +; mm7 = 0 + +; eax = pblk1 +; ebx = pblk2 +; ecx = temp +; edx = distance_sum +; edi = h +; esi = lx + +align 32 +sumsq_sub22_mmx: + push ebp ; save frame pointer + mov ebp, esp ; link + push ebx + push ecx + push edx + push esi + push edi + + mov esi, [ebp+16] ; lx + mov edi, [ebp+20] ; h + + pxor mm5, mm5 ; sum + test edi, edi ; h = 0? + jle near d2exit + + pxor mm7, mm7 ; get zeros i mm7 + + mov eax, [ebp+8] ; blk1 + mov ebx, [ebp+12] ; blk2 + jmp d2top22 + +align 32 +d2top22: + movq mm0, [eax] + movq mm1, mm0 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + + movq mm2, [ebx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + + psubw mm0, mm2 + psubw mm1, mm3 + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + paddd mm5, mm0 + paddd mm5, mm1 + + add eax, esi + add ebx, esi + dec edi + jg d2top22 + jmp d2exit + + +; total squared difference between interpolation of two (8*h) blocks and +; another 8*h block +; blk1,blk2: addresses of top left pels of both blocks +; lx: distance (in bytes) of vertically adjacent pels +; h: height of block (usually 4, or 8) +; mmX version + +global bsumsq_sub22_mmx +; int bsumsq_sub22_mmx(unsigned char *blk1f, unsigned char*blk1b, +; unsigned char *blk2, +; int lx, int h) + +; mm7 = 0 + +; eax = pblk1f +; ebx = pblk2 +; ecx = pblk1b +; edx = distance_sum +; edi = h +; esi = lx + +align 32 +bsumsq_sub22_mmx: + push ebp ; save frame pointer + mov ebp, esp ; link + push ebx + push ecx + push edx + push esi + push edi + + mov esi, [ebp+20] ; lx + mov edi, [ebp+24] ; h + + pxor mm5, mm5 ; sum + test edi, edi ; h = 0? + jle near d2exit + + pxor mm7, mm7 ; get zeros i mm7 + + mov eax, [ebp+8] ; blk1f + mov ebx, [ebp+12] ; blk1b + mov ecx, [ebp+16] ; blk2 + jmp bd2top22 + +align 32 +bd2top22: + movq mm0, [eax] + movq mm1, mm0 + movq mm4, [ebx] + movq mm6, mm4 + punpcklbw mm0, mm7 + punpckhbw mm1, mm7 + punpcklbw mm4, mm7 + punpckhbw mm6, mm7 + + movq mm2, [ecx] + movq mm3, mm2 + punpcklbw mm2, mm7 + punpckhbw mm3, mm7 + + paddw mm0, mm4 + psrlw mm0, 1 + psubw mm0, mm2 + pmaddwd mm0, mm0 + paddw mm1, mm6 + psrlw mm1, 1 + psubw mm1, mm3 + pmaddwd mm1, mm1 + paddd mm5, mm0 + paddd mm5, mm1 + + add eax, esi + add ebx, esi + add ecx, esi + dec edi + jg bd2top22 + jmp d2exit + + +global variance_mmx + +;;; variance of a (size*size) block, multiplied by 256 +;;; p: address of top left pel of block +;;; lx: seperation (in bytes) of vertically adjacent pels +;;; NOTE: size is 8 or 16 + +;;; int variance_mmx(uint8_t *p, int size, int lx) + + +variance_mmx: + push ebp ; save frame pointer + mov ebp, esp ; link + push ebx + push ecx + push edx + + mov eax, 0 ; col offset... + mov edx, [ebp+16] ; lx + + pxor mm0, mm0 + pxor mm7, mm7 ; Zero sum accumulator (4 words) + pxor mm6, mm6 ; Zero squares accumulator (2 dwords) +varcols: + mov ebx, [ebp+8] ; p + mov ecx, [ebp+12] ; size +varrows: + + movq mm2, [ebx+eax] + movq mm3, mm2 + punpcklbw mm2, mm0 + punpckhbw mm3, mm0 + + movq mm4, mm2 + movq mm5, mm3 + + pmaddwd mm4, mm2 ; Squares 0:3 + paddw mm7, mm2 ; Accumulate sum 0:3 (words) + paddd mm6, mm4 ; Accumulate sum squares 0:3 (dwords) + + pmaddwd mm5, mm3 ; Squares 4:7 (words) + paddw mm7, mm3 ; Accumulate sum 0:3 (words) + paddd mm6, mm5 ; Accumulate sum squares 4:7 (dwords) + + + add ebx, edx ; Next row + dec ecx + jnz varrows + + add eax, 8 + cmp eax, [ebp+12] + jl varcols + + ;; Sum squared -> eax + + movq mm1, mm7 + psrlq mm1, 32 + paddw mm7, mm1 + movq mm1, mm7 + psrlq mm1, 16 + paddw mm7, mm1 + movd eax, mm7 + and eax, 0xffff + imul eax, eax + + ;; Squares sum -> ecx + + movq mm1, mm6 + psrlq mm1, 32 + paddd mm6, mm1 + movd ecx, mm6 + + mov ebx, [ebp+12] + shr eax, 6 ; Divide sum squared by 64 for 8*8 + cmp ebx, 8 + jz var8_8 ; If 16 * 16 divide again by 4 (256) + shr eax, 2 +var8_8: + + sub ecx, eax + mov eax, ecx + + pop edx + pop ecx + pop ebx + + pop ebp ; restore stack pointer + + emms ; clear mmx registers + ret diff --git a/veejay-current/utils/mjpeg_logging.c b/veejay-current/utils/mjpeg_logging.c new file mode 100644 index 00000000..891647fa --- /dev/null +++ b/veejay-current/utils/mjpeg_logging.c @@ -0,0 +1,218 @@ +/* + $Id: mjpeg_logging.c,v 1.1.1.1 2004/10/27 23:48:57 niels Exp $ + + Copyright (C) 2000 Herbert Valerio Riedel + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include +#include + +extern int fred; +#include "mjpeg_logging.h" + +static const char _rcsid[] = "$Id: "; + +#define MAX_DEFAULT_ID_SIZE 16 +#define DEFAULT_DEFAULT_ID "???" + +#ifdef HAVE___PROGNAME +extern const char *__progname; +#endif + +static log_level_t mjpeg_log_verbosity = 0; +static char default_handler_id[MAX_DEFAULT_ID_SIZE]; +static char default_handler_id_is_set = 0; + +static int default_mjpeg_log_filter( log_level_t level ) +{ + int verb_from_env; + if( mjpeg_log_verbosity == 0 ) + { + char *mjpeg_verb_env = getenv("MJPEG_VERBOSITY"); + if( mjpeg_verb_env != NULL ) + { + verb_from_env = LOG_WARN-atoi(mjpeg_verb_env); + if( verb_from_env >= LOG_DEBUG && verb_from_env <= LOG_ERROR ) + mjpeg_log_verbosity = verb_from_env; + } + } + return (level < LOG_WARN && level < mjpeg_log_verbosity); +} + +static mjpeg_log_filter_t _filter = default_mjpeg_log_filter; + +static void +default_mjpeg_log_handler(log_level_t level, const char message[]) +{ + if( (*_filter)( level ) ) + return; + switch(level) { + case LOG_ERROR: + fprintf(stderr, "E: %s\n", message); + break; + case LOG_DEBUG: + fprintf(stderr, "D: %s\n", message); + break; + case LOG_WARN: + fprintf(stderr, "W: %s\n", message); + break; + case LOG_INFO: + fprintf(stderr, "I: %s\n", message); + break; + default: + assert(0); + } +} + +static mjpeg_log_handler_t _handler = default_mjpeg_log_handler; + + +mjpeg_log_handler_t +mjpeg_log_set_handler(mjpeg_log_handler_t new_handler) +{ + mjpeg_log_handler_t old_handler = _handler; + + _handler = new_handler; + + return old_handler; +} + +/*************** + * + * Set default log handlers degree of verboseity. + * 0 = quiet, 1 = info, 2 = debug + * + *************/ + +int +mjpeg_default_handler_verbosity(int verbosity) +{ + int prev_verb = mjpeg_log_verbosity; + mjpeg_log_verbosity = LOG_WARN - verbosity; + return prev_verb; +} + +/* + * Set identifier string used by default handler + * + */ +int +mjpeg_default_handler_identifier(const char *new_id) +{ + const char *s; + if (new_id == NULL) { + default_handler_id_is_set = 0; + return 0; + } + /* find basename of new_id (remove any directory prefix) */ + if ((s = strrchr(new_id, '/')) == NULL) + s = new_id; + else + s = s + 1; + strncpy(default_handler_id, s, MAX_DEFAULT_ID_SIZE); + default_handler_id[MAX_DEFAULT_ID_SIZE-1] = '\0'; + default_handler_id_is_set = 1; + return 0; +} + + +static void +mjpeg_logv(log_level_t level, const char format[], va_list args) +{ + char buf[1024] = { 0, }; + + /* TODO: Original had a re-entrancy error trap to assist bug + finding. To make this work with multi-threaded applications a + lock is needed hence delete. + */ + + + vsnprintf(buf, sizeof(buf)-1, format, args); + + _handler(level, buf); +} + +void +mjpeg_log(log_level_t level, const char format[], ...) +{ + va_list args; + va_start (args, format); + mjpeg_logv(level, format, args); + va_end (args); +} + +void +mjpeg_debug(const char format[], ...) +{ + va_list args; + va_start (args, format); + mjpeg_logv(LOG_DEBUG, format, args); + va_end (args); +} + +void +mjpeg_info(const char format[], ...) +{ + va_list args; + va_start (args, format); + mjpeg_logv(LOG_INFO, format, args); + va_end (args); +} + +void +mjpeg_warn(const char format[], ...) +{ + va_list args; + va_start (args, format); + mjpeg_logv(LOG_WARN, format, args); + va_end (args); +} + +void +mjpeg_error(const char format[], ...) +{ + va_list args; + va_start (args, format); + mjpeg_logv(LOG_ERROR, format, args); + va_end (args); +} + +void +mjpeg_error_exit1(const char format[], ...) +{ + va_list args; + va_start( args, format ); + mjpeg_logv( LOG_ERROR, format, args); + va_end(args); + exit(EXIT_FAILURE); +} + + +/* + * Local variables: + * c-file-style: "gnu" + * tab-width: 8 + * indent-tabs-mode: nil + * End: + */ diff --git a/veejay-current/utils/mjpeg_logging.h b/veejay-current/utils/mjpeg_logging.h new file mode 100644 index 00000000..7b1d5d89 --- /dev/null +++ b/veejay-current/utils/mjpeg_logging.h @@ -0,0 +1,79 @@ +/* + $Id: mjpeg_logging.h,v 1.1.1.1 2004/10/27 23:48:57 niels Exp $ + + Copyright (C) 2000 Herbert Valerio Riedel + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __MJPEG_LOGGING_H__ +#define __MJPEG_LOGGING_H__ + +#include "mjpeg_types.h" + +typedef enum { + LOG_DEBUG = 1, + LOG_INFO, + LOG_WARN, + LOG_ERROR +} log_level_t; + +#ifdef __cplusplus +extern "C" { +#endif +void +mjpeg_log(log_level_t level, const char format[], ...) GNUC_PRINTF(2, 3); + +typedef int(*mjpeg_log_filter_t)(log_level_t level); + +typedef void(*mjpeg_log_handler_t)(log_level_t level, const char message[]); + +mjpeg_log_handler_t +mjpeg_log_set_handler(mjpeg_log_handler_t new_handler); + +int +mjpeg_default_handler_identifier(const char *new_id); + +int +mjpeg_default_handler_verbosity(int verbosity); + +void +mjpeg_debug(const char format[], ...) GNUC_PRINTF(1,2); + +void +mjpeg_info(const char format[], ...) GNUC_PRINTF(1,2); + +void +mjpeg_warn(const char format[], ...) GNUC_PRINTF(1,2); + +void +mjpeg_error(const char format[], ...) GNUC_PRINTF(1,2); + +void +mjpeg_error_exit1(const char format[], ...) GNUC_PRINTF(1,2); + +#ifdef __cplusplus +} +#endif +#endif /* __MJPEG_LOGGING_H__ */ + + +/* + * Local variables: + * c-file-style: "gnu" + * tab-width: 8 + * indent-tabs-mode: nil + * End: + */ diff --git a/veejay-current/utils/mjpeg_types.h b/veejay-current/utils/mjpeg_types.h new file mode 100644 index 00000000..b40d24ab --- /dev/null +++ b/veejay-current/utils/mjpeg_types.h @@ -0,0 +1,109 @@ +/* + $Id: mjpeg_types.h,v 1.1.1.1 2004/10/27 23:48:57 niels Exp $ + + Copyright (C) 2000 Herbert Valerio Riedel + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __MJPEG_TYPES_H__ +#define __MJPEG_TYPES_H__ +#include + +#if defined(HAVE_STDINT_H) +# include +#elif defined(HAVE_INTTYPES_H) +# include +#elif defined(__CYGWIN__) +# include +typedef u_int8_t uint8_t; +typedef u_int16_t uint16_t; +typedef u_int32_t uint32_t; +typedef u_int64_t uint64_t; +# define INT8_C(c) c +# define INT16_C(c) c +# define INT32_C(c) c +# define INT64_C(c) c ## LL +# define UINT8_C(c) c ## U +# define UINT16_C(c) c ## U +# define UINT32_C(c) c ## U +# define UINT64_C(c) c ## ULL +#else +/* warning ISO/IEC 9899:1999 was missing and even */ +/* fixme */ +#endif /* HAVE_STDINT_H */ + +#if defined(__FreeBSD__) +#include /* FreeBSD - ssize_t */ +#endif + +#if defined(HAVE_STDBOOL_H) && !defined(__cplusplus) +#include +#else +/* ISO/IEC 9899:1999 missing -- enabling workaround */ + +# ifndef __cplusplus +typedef enum + { + false = 0, + true = 1 + } locBool; + +# define false false +# define true true +# define bool locBool +# endif +#endif + +#ifndef PRId64 +#define PRId64 PRID64_STRING_FORMAT +#endif + +#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4) +#define GNUC_PRINTF( format_idx, arg_idx ) \ + __attribute__((format (printf, format_idx, arg_idx))) +#define GNUC_SCANF( format_idx, arg_idx ) \ + __attribute__((format (scanf, format_idx, arg_idx))) +#define GNUC_FORMAT( arg_idx ) \ + __attribute__((format_arg (arg_idx))) +#define GNUC_NORETURN \ + __attribute__((noreturn)) +#define GNUC_CONST \ + __attribute__((const)) +#define GNUC_UNUSED \ + __attribute__((unused)) +#define GNUC_PACKED \ + __attribute__((packed)) +#else /* !__GNUC__ */ +#define GNUC_PRINTF( format_idx, arg_idx ) +#define GNUC_SCANF( format_idx, arg_idx ) +#define GNUC_FORMAT( arg_idx ) +#define GNUC_NORETURN +#define GNUC_CONST +#define GNUC_UNUSED +#define GNUC_PACKED +#endif /* !__GNUC__ */ + + +#endif /* __MJPEG_TYPES_H__ */ + + +/* + * Local variables: + * c-file-style: "gnu" + * tab-width: 8 + * indent-tabs-mode: nil + * End: + */ diff --git a/veejay-current/utils/mmx.h b/veejay-current/utils/mmx.h new file mode 100644 index 00000000..87c2c6ae --- /dev/null +++ b/veejay-current/utils/mmx.h @@ -0,0 +1,271 @@ +/* + * mmx.h + * Copyright (C) 1997-1999 H. Dietz and R. Fisher + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "attributes.h" + +/* + * The type of an value that fits in an MMX register (note that long + * long constant values MUST be suffixed by LL and unsigned long long + * values by ULL, lest they be truncated by the compiler) + */ + +typedef union { + int64_t q; /* Quadword (64-bit) value */ + uint64_t uq; /* Unsigned Quadword */ + int32_t d[2]; /* 2 Doubleword (32-bit) values */ + uint32_t ud[2]; /* 2 Unsigned Doubleword */ + int16_t w[4]; /* 4 Word (16-bit) values */ + uint16_t uw[4]; /* 4 Unsigned Word */ + int8_t b[8]; /* 8 Byte (8-bit) values */ + uint8_t ub[8]; /* 8 Unsigned Byte */ + float s[2]; /* Single-precision (32-bit) value */ +} ATTR_ALIGN(8) mmx_t; /* On an 8-byte (64-bit) boundary */ + + +#define mmx_si2r(op,imm,reg) \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "J" (imm) ) + +#define mmx_m2r(op,mem,reg) \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (mem)) + +#define mmx_r2m(op,reg,mem) \ + __asm__ __volatile__ (#op " %%" #reg ", %0" \ + : "=X" (mem) \ + : /* nothing */ ) + +#define mmx_r2r(op,regs,regd) \ + __asm__ __volatile__ (#op " %" #regs ", %" #regd) + + +#define emms() __asm__ __volatile__ ("emms") +#define femms() __asm__ __volatile__ ("femms") + +#define movd_m2r(var,reg) mmx_m2r (movd, var, reg) +#define movd_r2m(reg,var) mmx_r2m (movd, reg, var) +#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd) + +#define movq_m2r(var,reg) mmx_m2r (movq, var, reg) +#define movq_r2m(reg,var) mmx_r2m (movq, reg, var) +#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd) + +#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg) +#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd) +#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg) +#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd) + +#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg) +#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd) + +#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg) +#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd) +#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg) +#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd) +#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg) +#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd) + +#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg) +#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd) +#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg) +#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd) + +#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg) +#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd) +#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg) +#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd) + +#define pand_m2r(var,reg) mmx_m2r (pand, var, reg) +#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd) + +#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg) +#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd) + +#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg) +#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd) +#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg) +#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd) +#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg) +#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd) + +#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg) +#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd) +#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg) +#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd) +#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg) +#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd) + +#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg) +#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd) + +#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg) +#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd) + +#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg) +#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd) + +#define por_m2r(var,reg) mmx_m2r (por, var, reg) +#define por_r2r(regs,regd) mmx_r2r (por, regs, regd) + +#define pslld_i2r(imm,reg) mmx_si2r (pslld, imm, reg) +#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg) +#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd) +#define psllq_i2r(imm,reg) mmx_si2r (psllq, imm, reg) +#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg) +#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd) +#define psllw_i2r(imm,reg) mmx_si2r (psllw, imm, reg) +#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg) +#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd) + +#define psrad_i2r(imm,reg) mmx_si2r (psrad, imm, reg) +#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg) +#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd) +#define psraw_i2r(imm,reg) mmx_si2r (psraw, imm, reg) +#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg) +#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd) + +#define psrld_i2r(imm,reg) mmx_si2r (psrld, imm, reg) +#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg) +#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd) +#define psrlq_i2r(imm,reg) mmx_si2r (psrlq, imm, reg) +#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg) +#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd) +#define psrlw_i2r(imm,reg) mmx_si2r (psrlw, imm, reg) +#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg) +#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd) + +#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg) +#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd) +#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg) +#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd) +#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg) +#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd) + +#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg) +#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd) +#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg) +#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd) + +#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg) +#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd) +#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg) +#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd) + +#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg) +#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd) +#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg) +#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd) +#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg) +#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd) + +#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg) +#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd) +#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg) +#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd) +#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg) +#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd) + +#define pxor_m2r(var, reg) mmx_m2r (pxor, var, reg) +#define pxor_r2r(regs, regd) mmx_r2r (pxor, regs, regd) + +/* AMD MMX extensions - also available in intel SSE */ + + +#define mmx_m2ri(op,mem,reg,imm) \ + __asm__ __volatile__ (#op " %1, %0, %%" #reg \ + : /* nothing */ \ + : "X" (mem), "X" (imm)) +#define mmx_r2ri(op,regs,regd,imm) \ + __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \ + : /* nothing */ \ + : "X" (imm) ) + +#define mmx_fetch(mem,hint) \ + __asm__ __volatile__ ("prefetch" #hint " %0" \ + : /* nothing */ \ + : "X" (mem)) + +/* 3DNow goodies */ +#define pfmul_m2r(var,reg) mmx_m2r( pfmul, var, reg ) +#define pfmul_r2r(regs,regd) mmx_r2r( pfmul, regs, regd ) +#define pfadd_m2r(var,reg) mmx_m2r( pfadd, var, reg ) +#define pfadd_r2r(regs,regd) mmx_r2r( pfadd, regs, regd ) +#define pf2id_r2r(regs,regd) mmx_r2r( pf2id, regs, regd ) +#define pi2fd_r2r(regs,regd) mmx_r2r( pi2fd, regs, regd ) + +/* SSE goodies */ +#define mulps_r2r( regs, regd) mmx_r2r( mulps, regs, regd ) +#define mulps_m2r( var, regd) mmx_m2r( mulps, var, regd ) +#define movups_r2r(reg1, reg2) mmx_r2r( movups, reg1, reg2 ) +#define movups_m2r(var, reg) mmx_m2r( movups, var, reg ) +#define movaps_m2r(var, reg) mmx_m2r( movaps, var, reg ) +#define movups_r2m(reg, var) mmx_r2m( movups, reg, var ) +#define movaps_r2m(reg, var) mmx_r2m( movaps, reg, var ) +#define cvtps2pi_r2r(regs,regd) mmx_r2r( cvtps2pi, regs, regd ) +#define cvtpi2ps_r2r(regs,regd) mmx_r2r( cvtpi2ps, regs, regd ) +#define shufps_r2ri(regs, regd, imm) mmx_r2ri( shufps, regs, regd, imm ) + + +#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg) + +#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var) + +#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg) +#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd) +#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg) +#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd) + +#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm) + +#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm) + +#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg) +#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd) + +#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg) +#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd) + +#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg) +#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd) + +#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg) +#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd) + +#define pmovmskb(mmreg,reg) \ + __asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg) + +#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg) +#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd) + +#define prefetcht0(mem) mmx_fetch (mem, t0) +#define prefetcht1(mem) mmx_fetch (mem, t1) +#define prefetcht2(mem) mmx_fetch (mem, t2) +#define prefetchnta(mem) mmx_fetch (mem, nta) + +#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg) +#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd) + +#define pshufw_m2ri(var,reg,imm) mmx_m2ri( pshufw, var, reg, imm) +#define pshufw_r2ri(regs,regd,imm) mmx_r2ri( pshufw, regs, regd, imm) + +#define sfence() __asm__ __volatile__ ("sfence\n\t") diff --git a/veejay-current/utils/motionsearch.c b/veejay-current/utils/motionsearch.c new file mode 100644 index 00000000..cc44cafc --- /dev/null +++ b/veejay-current/utils/motionsearch.c @@ -0,0 +1,1237 @@ +/* motion.c, motion estimation */ + + +/* (C) 2000/2001 Andrew Stevens */ + +/* These modifications are free software; you can redistribute it + * and/or modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + * + */ + +#include +#include +#include +#include +#include "math.h" +#include "cpu_accel.h" +#include "fastintfns.h" +#include "motionsearch.h" +#include "mjpeg_logging.h" + + +#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM) + +#include "mblock_sub44_sads_x86.h" + +static int (*pmblocks_sub44_mests)( uint8_t *blk, uint8_t *ref, + int ilow, int jlow, + int ihigh, int jhigh, + int h, int rowstride, + int threshold, + me_result_s *resvec); + + +void mblock_sub22_nearest4_sads_mmxe(uint8_t *blk1,uint8_t *blk2, + int frowstride,int fh, int* resvec) + __asm__ ("mblock_sub22_nearest4_sads_mmxe"); + +void mblock_nearest4_sads_mmxe(uint8_t *blk1, uint8_t *blk2, + int rowstride, int h, int *resvec) + __asm__ ("mblock_nearest4_sads_mmxe"); + +int sad_00_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h, int distlim) __asm__ ("sad_00_mmxe"); +int sad_01_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_01_mmxe"); +int sad_10_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_10_mmxe"); +int sad_11_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_11_mmxe"); + + +int sad_sub22_mmxe ( uint8_t *blk1, uint8_t *blk2, int frowstride, int fh) __asm__ ("sad_sub22_mmxe"); +int sad_sub44_mmxe ( uint8_t *blk1, uint8_t *blk2, int qrowstride, int qh) __asm__ ("sad_sub44_mmxe"); +int sumsq_mmx( uint8_t *blk1, uint8_t *blk2, + int rowstride, int hx, int hy, int h) __asm__ ("sumsq_mmx"); +int sumsq_sub22_mmx( uint8_t *blk1, uint8_t *blk2, + int rowstride, int h) __asm__ ("sumsq_sub22_mmx"); +int bsumsq_sub22_mmx( uint8_t *blk1f, uint8_t *blk1b, + uint8_t *blk2, + int rowstride, int h) __asm__ ("bsumsq_sub22_mmx"); +int bsumsq_mmx (uint8_t *pf, uint8_t *pb, + uint8_t *p2, int rowstride, + int hxf, int hyf, int hxb, int hyb, int h) __asm__ ("bsumsq_mmx"); +int bsad_mmx (uint8_t *pf, uint8_t *pb, + uint8_t *p2, int rowstride, + int hxf, int hyf, int hxb, int hyb, int h) __asm__ ("bsad_mmx"); + +int variance_mmx( uint8_t *p, int size, int rowstride) __asm__ ("variance_mmx"); + +int sad_00_mmx ( uint8_t *blk1, uint8_t *blk2, int rowstride, int h, int distlim) __asm__ ("sad_00_mmx"); +int sad_01_mmx(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_01_mmx"); +int sad_10_mmx(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_10_mmx"); +int sad_11_mmx(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_11_mmx"); +int sad_sub22_mmx ( uint8_t *blk1, uint8_t *blk2, int frowstride, int fh) __asm__ ("sad_sub22_mmx"); +int sad_sub44_mmx (uint8_t *blk1, uint8_t *blk2, int qrowstride, int qh) __asm__ ("sad_sub44_mmx"); + +#endif + + +/* + * Function pointers for selecting CPU specific implementations + * + */ + +void (*pfind_best_one_pel)( me_result_set *sub22set, + uint8_t *org, uint8_t *blk, + int i0, int j0, + int ihigh, int jhigh, + int rowstride, int h, + me_result_s *res + ); +int (*pbuild_sub22_mests)( me_result_set *sub44set, + me_result_set *sub22set, + int i0, int j0, int ihigh, int jhigh, + int null_mc_sad, + uint8_t *s22org, uint8_t *s22blk, + int frowstride, int fh, + int reduction + ); + +int (*pbuild_sub44_mests)( me_result_set *sub44set, + int ilow, int jlow, int ihigh, int jhigh, + int i0, int j0, + int null_mc_sad, + uint8_t *s44org, uint8_t *s44blk, + int qrowstride, int qh, + int reduction ); + +int (*psumsq_sub22)( uint8_t *blk1, uint8_t *blk2, + int rowstride, int h); +int (*pbsumsq_sub22)( uint8_t *blk1f, uint8_t *blk1b, + uint8_t *blk2, + int rowstride, int h); + +int (*pvariance)(uint8_t *mb, int size, int rowstride); + + +int (*psad_sub22) ( uint8_t *blk1, uint8_t *blk2, int frowstride, int fh); +int (*psad_sub44) ( uint8_t *blk1, uint8_t *blk2, int qrowstride, int qh); +int (*psad_00) ( uint8_t *blk1, uint8_t *blk2, int rowstride, int h, int distlim); +int (*psad_01) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h); +int (*psad_10) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h); +int (*psad_11) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h); + +int (*psumsq) (uint8_t *blk1, uint8_t *blk2, + int rowstride, int hx, int hy, int h); + + +int (*pbsumsq) (uint8_t *pf, uint8_t *pb, + uint8_t *p2, int rowstride, int hxf, int hyf, int hxb, int hyb, int h); + +int (*pbsad) (uint8_t *pf, uint8_t *pb, + uint8_t *p2, int rowstride, int hxf, int hyf, int hxb, int hyb, int h); + + + + + +/* + * Round search radius to suit the search algorithm. + * Currently radii must be multiples of 8. + * + */ + +int round_search_radius( int radius ) +{ + return intmax(8,((radius+4) /8)*8); +} + + + +/* + Take a vector of motion estimations and repeatedly make passes + discarding all elements whose sad "weight" is above the current mean weight. +*/ + +static void sub_mean_reduction( me_result_set *matchset, + int times, + int *minweight_res) +{ + me_result_s *matches = matchset->mests; + int len = matchset->len; + int i,j; + int weight_sum; + int mean_weight; + int min_weight = 100000; + if( len == 0 ) + { + *minweight_res = 100000; + matchset->len = 0; + return; + } + + for(;;) + { + weight_sum = 0; + for( i = 0; i < len ; ++i ) + weight_sum += matches[i].weight; + mean_weight = weight_sum / len; + + if( times <= 0) + break; + + j = 0; + for( i =0; i < len; ++i ) + { + if( matches[i].weight <= mean_weight ) + { + if( times == 1) + { + min_weight = matches[i].weight ; + } + matches[j] = matches[i]; + ++j; + } + } + len = j; + --times; + } + matchset->len = len; + *minweight_res = mean_weight; +} + +/* + * Build a vector of the top 4*4 sub-sampled motion estimations in + * the box (ilow,jlow) to (ihigh,jhigh). + * + * The algorithm is as follows: + * + * 1. Matches on an 4*4 pel grid are collected. All those matches + * whose that is over a (conservative) threshold (basically 50% more + * than moving average of the mean sad of such matches) are discarded. + * + * 2. Multiple passes are made discarding worse than-average matches. + * The number of passes is specified by the user. The default it 2 + * (leaving roughly 1/4 of the matches). + * + * The intial threshold and discard passes are controlled by reduction + * [1..4]. The intial SAD threshold is calculated as 6 / reduction of + * a reference SAD passed as a parameter. For reduction == 1 one + * discard pass is made otherwise two are made. + * + * The net result is very fast and find good matches if they're to be + * found. I.e. the penalty over exhaustive search is pretty low. + * + * NOTE: The "discard below average" trick depends critically on + * having some variation in the matches. The slight penalty imposed + * for distant matches (reasonable since the motion vectors have to + * be encoded) is *vital* as otherwise pathologically bad performance + * results on highly uniform images. + * + * TODO: We should probably allow the user to eliminate the initial + * thinning of 4*4 grid matches if ultimate quality is demanded + * (e.g. for low bit-rate applications). + * + */ + +static int build_sub44_mests( me_result_set *sub44set, + int ilow, int jlow, int ihigh, int jhigh, + int i0, int j0, + int null_ctl_sad, + uint8_t *s44org, uint8_t *s44blk, + int qrowstride, int qh, + int reduction + ) +{ + uint8_t *s44orgblk; + me_result_s *sub44_mests = sub44set->mests; + int istrt = ilow-i0; + int jstrt = jlow-j0; + int iend = ihigh-i0; + int jend = jhigh-j0; + int mean_weight; + int threshold; + + int i,j; + int s1; + uint8_t *old_s44orgblk; + int sub44_num_mests; + + /* N.b. we may ignore the right hand block of the pair going over the + right edge as we have carefully allocated the buffer oversize to ensure + no memory faults. The later motion estimation calculations + performed on the results of this pass will filter out + out-of-range blocks... + */ + + threshold = 6*null_ctl_sad / (4*4*reduction); + s44orgblk = s44org+(ilow>>2)+qrowstride*(jlow>>2); + + /* Exhaustive search on 4*4 sub-sampled data. This is affordable because + (a) it is only 16th of the size of the real 1-pel data + (b) we ignore those matches with an sad above our threshold. + */ + + sub44_num_mests = 0; + + /* Invariant: s44orgblk = s44org+(i>>2)+qrowstride*(j>>2) */ + s44orgblk = s44org+(ilow>>2)+qrowstride*(jlow>>2); + for( j = jstrt; j <= jend; j += 4 ) + { + old_s44orgblk = s44orgblk; + for( i = istrt; i <= iend; i += 4 ) + { + s1 = ((*psad_sub44)( s44orgblk,s44blk,qrowstride,qh) & 0xffff); + if( s1 < threshold ) + { + threshold = intmin(s1<<2,threshold); + sub44_mests[sub44_num_mests].x = i; + sub44_mests[sub44_num_mests].y = j; + sub44_mests[sub44_num_mests].weight = s1 + + (intmax(intabs(i-i0),intabs(j-j0))<<1); + ++sub44_num_mests; + } + s44orgblk += 1; + } + s44orgblk = old_s44orgblk + qrowstride; + } + sub44set->len = sub44_num_mests; + + sub_mean_reduction( sub44set, 1+(reduction>1), &mean_weight); + + + return sub44set->len; +} + +#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM) + +static int build_sub44_mests_mmx( me_result_set *sub44set, + int ilow, int jlow, int ihigh, int jhigh, + int i0, int j0, + int null_ctl_sad, + uint8_t *s44org, uint8_t *s44blk, + int qrowstride, int qh, + int reduction) +{ + uint8_t *s44orgblk; + me_result_s *sub44_mests = sub44set->mests; + int istrt = ilow-i0; + int jstrt = jlow-j0; + int iend = ihigh-i0; + int jend = jhigh-j0; + int mean_weight; + int threshold; + + + threshold = 6*null_ctl_sad / (4*4*reduction); + s44orgblk = s44org+(ilow>>2)+qrowstride*(jlow>>2); + + sub44set->len = (*pmblocks_sub44_mests)( s44orgblk, s44blk, + istrt, jstrt, + iend, jend, + qh, qrowstride, + threshold, + sub44_mests); + + /* If we're really pushing quality we reduce once otherwise twice. */ + + sub_mean_reduction( sub44set, 1+(reduction>1), &mean_weight); + + + return sub44set->len; +} +#endif + + + +/* Build a vector of the best 2*2 sub-sampled motion estimations for + * the 16*16 macroblock at i0,j0 using a set of 4*4 sub-sampled matches as + * starting points. As with with the 4*4 matches We don't collect + * them densely as they're just search starting points for 1-pel + * search and ones that are 1 out * should still give better than + * average matches... + * + * The resulting candidate motion vectors are thinned by thresholding + * and discarding worse than-average matches. + * + * The intial threshold and number of discard passes are controlled by reduction + * [1..4]: the intial SAD threshold is calculated as 6 / reduction of + * a reference SAD passed as a parameter and then reduction discard passes + * are made. + * + * A super-fast version using MMX assembly code for X86 follows. + * Other CPU's could/should be handled the same way. */ + + +static int build_sub22_mests( me_result_set *sub44set, + me_result_set *sub22set, + int i0, int j0, int ihigh, int jhigh, + int null_ctl_sad, + uint8_t *s22org, uint8_t *s22blk, + int frowstride, int fh, + int reduction) +{ + int i,k,s; + int threshold = 6*null_ctl_sad / (2 * 2*reduction); + + int min_weight; + int ilim = ihigh-i0; + int jlim = jhigh-j0; + int x,y; + uint8_t *s22orgblk; + + sub22set->len = 0; + for( k = 0; k < sub44set->len; ++k ) + { + + x = sub44set->mests[k].x; + y = sub44set->mests[k].y; + + s22orgblk = s22org +((y+j0)>>1)*frowstride +((x+i0)>>1); + for( i = 0; i < 4; ++i ) + { + if( x <= ilim && y <= jlim ) + { + s = (*psad_sub22)( s22orgblk,s22blk,frowstride,fh)+ + (intmax(intabs(x),intabs(y))<<3); + if( s < threshold ) + { + me_result_s *mc = &sub22set->mests[sub22set->len]; + mc->x = (int8_t)x; + mc->y = (int8_t)y; + mc->weight = s; + ++(sub22set->len); + } + } + + if( i == 1 ) + { + s22orgblk += frowstride-1; + x -= 2; + y += 2; + } + else + { + s22orgblk += 1; + x += 2; + + } + } + + } + + sub_mean_reduction( sub22set, reduction, &min_weight ); + return sub22set->len; +} + +#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM) +static int build_sub22_mests_mmxe( me_result_set *sub44set, + me_result_set *sub22set, + int i0, int j0, int ihigh, int jhigh, + int null_ctl_sad, + uint8_t *s22org, uint8_t *s22blk, + int frowstride, int fh, + int reduction) +{ + int i,k,s; + int threshold = 6*null_ctl_sad / (2 * 2*reduction); + + int min_weight; + int ilim = ihigh-i0; + int jlim = jhigh-j0; + int x,y; + uint8_t *s22orgblk; + int resvec[4]; + + /* TODO: The calculation of the lstrow offset really belongs in + asm code... */ + int lstrow=(fh-1)*frowstride; + + sub22set->len = 0; + for( k = 0; k < sub44set->len; ++k ) + { + + x = sub44set->mests[k].x; + y = sub44set->mests[k].y; + + s22orgblk = s22org +((y+j0)>>1)*frowstride +((x+i0)>>1); + /* + Get SAD for 2*2 subsampled macroblocks: orgblk,orgblk(+2,0), + orgblk(0,+2), and orgblk(+2,+2) Done all in one go to reduce + memory bandwidth demand + */ + mblock_sub22_nearest4_sads_mmxe(s22orgblk+lstrow, s22blk+lstrow, frowstride, fh, resvec); + for( i = 0; i < 4; ++i ) + { + if( x <= ilim && y <= jlim ) + { + s =resvec[i]+(intmax(intabs(x),intabs(y))<<3); + if( s < threshold ) + { + me_result_s *mc = &sub22set->mests[sub22set->len]; + mc->x = (int8_t)x; + mc->y = (int8_t)y; + mc->weight = s; + ++(sub22set->len); + } + } + + if( i == 1 ) + { + x -= 2; + y += 2; + } + else + { + x += 2; + } + } + + } + + + sub_mean_reduction( sub22set, reduction, &min_weight ); + return sub22set->len; +} + +#endif + +/* + * Search for the best 1-pel match within 1-pel of a good 2*2-pel + * match. + * + * N.b. best_so_far must be initialised by the caller! + */ + + +static void find_best_one_pel( me_result_set *sub22set, + uint8_t *org, uint8_t *blk, + int i0, int j0, + int ihigh, int jhigh, + int rowstride, int h, + me_result_s *best_so_far + ) + +{ + int i,k; + int d; + me_result_s minpos = *best_so_far; + int dmin = INT_MAX; + int ilim = ihigh-i0; + int jlim = jhigh-j0; + uint8_t *orgblk; + int penalty; + me_result_s matchrec; + + for( k = 0; k < sub22set->len; ++k ) + { + + matchrec = sub22set->mests[k]; + orgblk = org + (i0+matchrec.x)+rowstride*(j0+matchrec.y); + penalty = intmax(intabs(matchrec.x),intabs(matchrec.y))<<5; + + for( i = 0; i < 4; ++i ) + { + if( matchrec.x <= ilim && matchrec.y <= jlim ) + { + + d = penalty+(*psad_00)(orgblk,blk,rowstride,h, dmin); + if (dlen; ++k ) + { + matchrec = sub22set->mests[k]; + orgblk = org + (i0+matchrec.x)+rowstride*(j0+matchrec.y); + penalty = intmax(abs(matchrec.x),abs(matchrec.y))<<5; + + /* Get SAD for macroblocks: orgblk,orgblk(+1,0), + orgblk(0,+1), and orgblk(+1,+1) + Done all in one go to reduce memory bandwidth demand + */ + mblock_nearest4_sads_mmxe(orgblk,blk,rowstride,h, + resvec); + + for( i = 0; i < 4; ++i ) + { + if( matchrec.x <= ilim && matchrec.y <= jlim ) + { + + d = penalty+resvec[i]; + if (d= distlim) + break; + + p1+= rowstride; + p2+= rowstride; + } + return s; +} + +static int sad_01(uint8_t *blk1,uint8_t *blk2,int rowstride, int h) +{ + uint8_t *p1,*p2; + int i,j; + int s; + register int v; + + s = 0; + p1 = blk1; + p2 = blk2; + for (j=0; j>1) - p2[i]; + s+=intabs(v); + } + p1+= rowstride; + p2+= rowstride; + } + return s; +} + +static int sad_10(uint8_t *blk1,uint8_t *blk2, int rowstride, int h) +{ + uint8_t *p1,*p1a,*p2; + int i,j; + int s; + register int v; + + s = 0; + p1 = blk1; + p2 = blk2; + p1a = p1 + rowstride; + for (j=0; j>1) - p2[i]; + s+= intabs(v); + } + p1 = p1a; + p1a+= rowstride; + p2+= rowstride; + } + + return s; +} + +static int sad_11(uint8_t *blk1,uint8_t *blk2, int rowstride, int h) +{ + uint8_t *p1,*p1a,*p2; + int i,j; + int s; + register int v; + + s = 0; + p1 = blk1; + p2 = blk2; + p1a = p1 + rowstride; + + for (j=0; j>2) - p2[i]; + s+=intabs(v); + } + p1 = p1a; + p1a+= rowstride; + p2+= rowstride; + } + return s; +} + + +/* + * Compute subsampled images for fast motion compensation search + * N.b. rowstride should be *two* line widths for interlace images + */ + +void subsample_image( uint8_t *image, int rowstride, + uint8_t *sub22_image, + uint8_t *sub44_image) +{ + uint8_t *blk = image; + uint8_t *b, *nb; + uint8_t *pb; + uint8_t *qb; + uint8_t *start_s22blk, *start_s44blk; + int i; + int nextfieldline = rowstride; + + start_s22blk = sub22_image; + start_s44blk = sub44_image; + b = blk; + nb = (blk+nextfieldline); + + pb = (uint8_t *) start_s22blk; + + while( nb < start_s22blk ) + { + for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */ + { + pb[0] = ((b[0]+b[1])+(nb[0]+nb[1])+2)>>2; + pb[1] = ((b[2]+b[3])+(nb[2]+nb[3])+2)>>2; + pb += 2; + b += 4; + nb += 4; + } + b += nextfieldline; + nb = b + nextfieldline; + } + + + /* Now create the 4*4 sub-sampled data from the 2*2 + N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the + original. Albeit half as many lines and pixels... + */ + + nextfieldline = nextfieldline >> 1; + + qb = start_s44blk; + b = start_s22blk; + nb = (start_s22blk+nextfieldline); + + while( nb < start_s44blk ) + { + for( i = 0; i < nextfieldline/4; ++i ) + { + qb[0] = ((b[0]+b[1])+(nb[0]+nb[1])+2)>>2; + qb[1] = ((b[2]+b[3])+(nb[2]+nb[3])+2)>>2; + qb += 2; + b += 4; + nb += 4; + } + b += nextfieldline; + nb = b + nextfieldline; + } + +} + +/* + * Same as sad_00 except for 2*2 subsampled data so only 8 wide! + * + */ + + +static int sad_sub22( uint8_t *s22blk1, uint8_t *s22blk2,int frowstride,int fh) +{ + uint8_t *p1 = s22blk1; + uint8_t *p2 = s22blk2; + int s = 0; + int j; + + for( j = 0; j < fh; ++j ) + { + register int diff; +#define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) + pipestep(0); pipestep(1); + pipestep(2); pipestep(3); + pipestep(4); pipestep(5); + pipestep(6); pipestep(7); + p1 += frowstride; + p2 += frowstride; +#undef pipestep + } + + return s; +} + + + +/* + * Same as sad_00 except for 4*4 sub-sampled data. + * + * N.b.: currently assumes only 16*16 or 16*8 motion estimation will + * be used... I.e. 4*4 or 4*2 sub-sampled blocks will be compared. + * + * + */ + + +static int sad_sub44( uint8_t *s44blk1, uint8_t *s44blk2,int qrowstride,int qh) +{ + register uint8_t *p1 = s44blk1; + register uint8_t *p2 = s44blk2; + int s = 0; + register int diff; + + /* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */ +#define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff; + pipestep(0); pipestep(1); pipestep(2); pipestep(3); + if( qh > 1 ) + { + p1 += qrowstride; p2 += qrowstride; + pipestep(0); pipestep(1); pipestep(2); pipestep(3); + if( qh > 2 ) + { + p1 += qrowstride; p2 += qrowstride; + pipestep(0); pipestep(1); pipestep(2); pipestep(3); + p1 += qrowstride; p2 += qrowstride; + pipestep(0); pipestep(1); pipestep(2); pipestep(3); + } + } + + + return s; +} + +/* + * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels + * blk1,blk2: addresses of top left pels of both blocks + * rowstride: distance (in bytes) of vertically adjacent pels + * h: height of block (usually 8 or 16) + */ + +static int sumsq_sub22(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) +{ + uint8_t *p1 = blk1, *p2 = blk2; + int i,j,v; + int s = 0; + for (j=0; j>1) - p2[i]; + s+= v*v; + } + p1f+= rowstride; + p1b+= rowstride; + p2+= rowstride; + } + return s; +} + +/* + * total squared difference between two (16*h) blocks + * including optional half pel interpolation of blk1 (hx,hy) + * blk1,blk2: addresses of top left pels of both blocks + * rowstride: distance (in bytes) of vertically adjacent pels + * hx,hy: flags for horizontal and/or vertical interpolation + * h: height of block (usually 8 or 16) + */ + + +static int sumsq(blk1,blk2,rowstride,hx,hy,h) + uint8_t *blk1,*blk2; + int rowstride,hx,hy,h; +{ + uint8_t *p1,*p1a,*p2; + int i,j; + int s,v; + + s = 0; + p1 = blk1; + p2 = blk2; + if (!hx && !hy) + for (j=0; j>1) - p2[i]; + s+= v*v; + } + p1+= rowstride; + p2+= rowstride; + } + else if (!hx && hy) + { + p1a = p1 + rowstride; + for (j=0; j>1) - p2[i]; + s+= v*v; + } + p1 = p1a; + p1a+= rowstride; + p2+= rowstride; + } + } + else /* if (hx && hy) */ + { + p1a = p1 + rowstride; + for (j=0; j>2) - p2[i]; + s+= v*v; + } + p1 = p1a; + p1a+= rowstride; + p2+= rowstride; + } + } + + return s; +} + + +/* + * absolute difference error between a (16*h) block and a bidirectional + * prediction + * + * p2: address of top left pel of block + * pf,hxf,hyf: address and half pel flags of forward ref. block + * pb,hxb,hyb: address and half pel flags of backward ref. block + * h: height of block + * rowstride: distance (in bytes) of vertically adjacent pels in p2,pf,pb + */ + + +static int bsad(pf,pb,p2,rowstride,hxf,hyf,hxb,hyb,h) + uint8_t *pf,*pb,*p2; + int rowstride,hxf,hyf,hxb,hyb,h; +{ + uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc; + int i,j; + int s,v; + + pfa = pf + hxf; + pfb = pf + rowstride*hyf; + pfc = pfb + hxf; + + pba = pb + hxb; + pbb = pb + rowstride*hyb; + pbc = pbb + hxb; + + s = 0; + + for (j=0; j>2) + + ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1) + - *p2++; + s += abs(v); + } + p2+= rowstride-16; + pf+= rowstride-16; + pfa+= rowstride-16; + pfb+= rowstride-16; + pfc+= rowstride-16; + pb+= rowstride-16; + pba+= rowstride-16; + pbb+= rowstride-16; + pbc+= rowstride-16; + } + + return s; +} + +/* + * squared error between a (16*h) block and a bidirectional + * prediction + * + * p2: address of top left pel of block + * pf,hxf,hyf: address and half pel flags of forward ref. block + * pb,hxb,hyb: address and half pel flags of backward ref. block + * h: height of block + * rowstride: distance (in bytes) of vertically adjacent pels in p2,pf,pb + */ + + +static int bsumsq(pf,pb,p2,rowstride,hxf,hyf,hxb,hyb,h) + uint8_t *pf,*pb,*p2; + int rowstride,hxf,hyf,hxb,hyb,h; +{ + uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc; + int i,j; + int s,v; + + pfa = pf + hxf; + pfb = pf + rowstride*hyf; + pfc = pfb + hxf; + + pba = pb + hxb; + pbb = pb + rowstride*hyb; + pbc = pbb + hxb; + + s = 0; + + for (j=0; j>2) + + ((ui(*pb++) + ui(*pba++) + ui(*pbb++) + ui(*pbc++) + 2)>>2) + + 1 + )>>1) - ui(*p2++); +#undef ui + s+=v*v; + } + p2+= rowstride-16; + pf+= rowstride-16; + pfa+= rowstride-16; + pfb+= rowstride-16; + pfc+= rowstride-16; + pb+= rowstride-16; + pba+= rowstride-16; + pbb+= rowstride-16; + pbc+= rowstride-16; + } + + return s; +} + + +/* + * variance of a (size*size) block, multiplied by 256 + * p: address of top left pel of block + * rowstride: distance (in bytes) of vertically adjacent pels + * SIZE is a multiple of 8. + */ +static int variance(uint8_t *p, int size, int rowstride) +{ + int i,j; + unsigned int v,s,s2; + int var; + s = s2 = 0; + + for (j=0; j +#include "mjpeg_types.h" + +struct me_result +{ + uint16_t weight; + int8_t x; + int8_t y; +}; + +typedef struct me_result me_result_s; + +#define MAX_MATCHES (256*256/4) + +typedef struct _me_result_vec { + int len; + me_result_s mests[MAX_MATCHES]; +} me_result_set; + +/* + * Function pointers for selecting CPU specific implementations + * + * Top-level motion estimation entry points. + * + */ + +/* + * Use given set of 2*2 sub-sampled motion estimates generated using + * pbuild_sub22_mests to generate a best 1-pel motion estimate. + * + */ + +extern void (*pfind_best_one_pel)( me_result_set *sub22set, + uint8_t *org, uint8_t *blk, + int i0, int j0, + int ihigh, int jhigh, + int rowstride, int h, + me_result_s *res + ); + +/* + * Use given set of 4*4 sub-sampled motion estimates generated using + * pbuild_sub44_mests to generate a set of best 2*2 sub-sampled motion + * estimates. + */ + +extern int (*pbuild_sub22_mests)( me_result_set *sub44set, + me_result_set *sub22set, + int i0, int j0, int ihigh, int jhigh, + int null_mc_sad, + uint8_t *s22org, uint8_t *s22blk, + int frowstride, int fh, + int reduction + ); + +/* + * Generate a set the best 4*4 sub-sampled motion estimates + * from the specifed rectangle of candidates in s44org with respect + * to the reference block s44blk + */ + +extern int (*pbuild_sub44_mests)( me_result_set *sub44set, + int ilow, int jlow, int ihigh, int jhigh, + int i0, int j0, + int null_mc_sad, + uint8_t *s44org, uint8_t *s44blk, + int qrowstride, int qh, + int reduction ); + + +/* + * Lower level difference comparison routines + * + * Naming convention: for block difference metric functions + * sad - Sum Absolute difference + * sumsq - Sum of squares + * sub44 - 4*4 Subsampled macroblocks (i.e. 4x4 subsampled pels) + * sub22 2*2 submsapled macroblocks (i.e. 8x8 subsampled pels) + * Ordinary macroblocks (i.e. 16x16 pels) + * "b" prefix: Difference of bi-directionally interpolated block + * (i.e. mean of two blocks) + * _xy suffix: Difference with half-pel sub-sampling offset. + * (i.e. _00 is just ordinary no sub-sampling) + * mests - Motion estimate results relative motion vector with sad. + */ + +extern int (*psumsq_sub22)( uint8_t *blk1, uint8_t *blk2, + int rowstride, int h); +extern int (*pbsumsq_sub22)( uint8_t *blk1f, uint8_t *blk1b, + uint8_t *blk2, + int rowstride, int h); + +extern int (*pvariance)(uint8_t *mb, int size, int rowstride); + + +extern int (*psad_sub22) ( uint8_t *blk1, uint8_t *blk2, int frowstride, int fh); +extern int (*psad_sub44) ( uint8_t *blk1, uint8_t *blk2, int qrowstride, int qh); +extern int (*psad_00) ( uint8_t *blk1, uint8_t *blk2, int rowstride, int h, int distlim); +extern int (*psad_01) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h); +extern int (*psad_10) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h); +extern int (*psad_11) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h); + +extern int (*psumsq) (uint8_t *blk1, uint8_t *blk2, + int rowstride, int hx, int hy, int h); + + +extern int (*pbsumsq) (uint8_t *pf, uint8_t *pb, + uint8_t *p2, int rowstride, int hxf, int hyf, int hxb, int hyb, int h); + +extern int (*pbsad) (uint8_t *pf, uint8_t *pb, + uint8_t *p2, int rowstride, int hxf, int hyf, int hxb, int hyb, int h); + +void init_motion_search(void); +int round_search_radius( int radius ); +void subsample_image( uint8_t *image, int rowstride, + uint8_t *sub22_image, + uint8_t *sub44_image); + +#endif /* __MOTIONSEARCH_H__ */ diff --git a/veejay-current/utils/mpegconsts.c b/veejay-current/utils/mpegconsts.c new file mode 100644 index 00000000..008ac503 --- /dev/null +++ b/veejay-current/utils/mpegconsts.c @@ -0,0 +1,450 @@ + +/* + * mpegconsts.c: Video format constants for MPEG and utilities for display + * and conversion to format used for yuv4mpeg + * + * Copyright (C) 2001 Andrew Stevens + * Copyright (C) 2001 Matthew Marjanovic + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#include +#include "mpegconsts.h" +#include "yuv4mpeg.h" +#include "yuv4mpeg_intern.h" + +static y4m_ratio_t +mpeg_framerates[] = { + Y4M_FPS_UNKNOWN, + Y4M_FPS_NTSC_FILM, + Y4M_FPS_FILM, + Y4M_FPS_PAL, + Y4M_FPS_NTSC, + Y4M_FPS_30, + Y4M_FPS_PAL_FIELD, + Y4M_FPS_NTSC_FIELD, + Y4M_FPS_60 +}; + + +#define MPEG_NUM_RATES (sizeof(mpeg_framerates)/sizeof(mpeg_framerates[0])) +const mpeg_framerate_code_t mpeg_num_framerates = MPEG_NUM_RATES; + +static const char * +framerate_definitions[MPEG_NUM_RATES] = +{ + "illegal", + "24000.0/1001.0 (NTSC 3:2 pulldown converted FILM)", + "24.0 (NATIVE FILM)", + "25.0 (PAL/SECAM VIDEO / converted FILM)", + "30000.0/1001.0 (NTSC VIDEO)", + "30.0", + "50.0 (PAL FIELD RATE)", + "60000.0/1001.0 (NTSC FIELD RATE)", + "60.0" +}; + + +static const char *mpeg1_aspect_ratio_definitions[] = +{ + "1:1 (square pixels)", + "1:0.6735", + "1:0.7031 (16:9 Anamorphic PAL/SECAM for 720x578/352x288 images)", + "1:0.7615", + "1:0.8055", + "1:0.8437 (16:9 Anamorphic NTSC for 720x480/352x240 images)", + "1:0.8935", + "1:0.9375 (4:3 PAL/SECAM for 720x578/352x288 images)", + "1:0.9815", + "1:1.0255", + "1:1:0695", + "1:1.1250 (4:3 NTSC for 720x480/352x240 images)", + "1:1.1575", + "1:1.2015" +}; + +static const y4m_ratio_t mpeg1_aspect_ratios[] = +{ + Y4M_SAR_MPEG1_1, + Y4M_SAR_MPEG1_2, + Y4M_SAR_MPEG1_3, /* Anamorphic 16:9 PAL */ + Y4M_SAR_MPEG1_4, + Y4M_SAR_MPEG1_5, + Y4M_SAR_MPEG1_6, /* Anamorphic 16:9 NTSC */ + Y4M_SAR_MPEG1_7, + Y4M_SAR_MPEG1_8, /* PAL/SECAM 4:3 */ + Y4M_SAR_MPEG1_9, + Y4M_SAR_MPEG1_10, + Y4M_SAR_MPEG1_11, + Y4M_SAR_MPEG1_12, /* NTSC 4:3 */ + Y4M_SAR_MPEG1_13, + Y4M_SAR_MPEG1_14, +}; + +static const char *mpeg2_aspect_ratio_definitions[] = +{ + "1:1 pixels", + "4:3 display", + "16:9 display", + "2.21:1 display" +}; + + +static const y4m_ratio_t mpeg2_aspect_ratios[] = +{ + Y4M_DAR_MPEG2_1, + Y4M_DAR_MPEG2_2, + Y4M_DAR_MPEG2_3, + Y4M_DAR_MPEG2_4 +}; + +static const char **aspect_ratio_definitions[2] = +{ + mpeg1_aspect_ratio_definitions, + mpeg2_aspect_ratio_definitions +}; + +static const y4m_ratio_t *mpeg_aspect_ratios[2] = +{ + mpeg1_aspect_ratios, + mpeg2_aspect_ratios +}; + +const mpeg_aspect_code_t mpeg_num_aspect_ratios[2] = +{ + sizeof(mpeg1_aspect_ratios)/sizeof(mpeg1_aspect_ratios[0]), + sizeof(mpeg2_aspect_ratios)/sizeof(mpeg2_aspect_ratios[0]) +}; + +/* + * Convert MPEG frame-rate code to corresponding frame-rate + */ + +y4m_ratio_t +mpeg_framerate( mpeg_framerate_code_t code ) +{ + if( code == 0 || code > mpeg_num_framerates ) + return y4m_fps_UNKNOWN; + else + return mpeg_framerates[code]; +} + +/* + * Look-up MPEG frame rate code for a (exact) frame rate. + */ + + +mpeg_framerate_code_t +mpeg_framerate_code( y4m_ratio_t framerate ) +{ + mpeg_framerate_code_t i; + + y4m_ratio_reduce(&framerate); + for (i = 1; i < mpeg_num_framerates; ++i) { + if (Y4M_RATIO_EQL(framerate, mpeg_framerates[i])) + return i; + } + return 0; +} + + +/* small enough to distinguish 1/1000 from 1/1001 */ +#define MPEG_FPS_TOLERANCE 0.0001 + + +y4m_ratio_t +mpeg_conform_framerate( double fps ) +{ + mpeg_framerate_code_t i; + y4m_ratio_t result; + + /* try to match it to a standard frame rate */ + for (i = 1; i < mpeg_num_framerates; i++) + { + double deviation = 1.0 - (Y4M_RATIO_DBL(mpeg_framerates[i]) / fps); + if ( (deviation > -MPEG_FPS_TOLERANCE) && + (deviation < +MPEG_FPS_TOLERANCE) ) + return mpeg_framerates[i]; + } + /* no luck? just turn it into a ratio (6 decimal place accuracy) */ + result.n = (fps * 1000000.0) + 0.5; + result.d = 1000000; + y4m_ratio_reduce(&result); + return result; +} + + + +/* + * Convert MPEG aspect-ratio code to corresponding aspect-ratio + */ + +y4m_ratio_t +mpeg_aspect_ratio( int mpeg_version, mpeg_aspect_code_t code ) +{ + y4m_ratio_t ratio; + if( mpeg_version < 1 || mpeg_version > 2 ) + return y4m_sar_UNKNOWN; + if( code == 0 || code > mpeg_num_aspect_ratios[mpeg_version-1] ) + return y4m_sar_UNKNOWN; + else + { + ratio = mpeg_aspect_ratios[mpeg_version-1][code-1]; + y4m_ratio_reduce(&ratio); + return ratio; + } +} + +/* + * Look-up corresponding MPEG aspect ratio code given an exact aspect ratio. + * + * WARNING: The semantics of aspect ratio coding *changed* between + * MPEG1 and MPEG2. In MPEG1 it is the *pixel* aspect ratio. In + * MPEG2 it is the (far more sensible) aspect ratio of the eventual + * display. + * + */ + +mpeg_aspect_code_t +mpeg_frame_aspect_code( int mpeg_version, y4m_ratio_t aspect_ratio ) +{ + mpeg_aspect_code_t i; + y4m_ratio_t red_ratio = aspect_ratio; + y4m_ratio_reduce( &red_ratio ); + if( mpeg_version < 1 || mpeg_version > 2 ) + return 0; + for( i = 1; i < mpeg_num_aspect_ratios[mpeg_version-1]; ++i ) + { + y4m_ratio_t red_entry = mpeg_aspect_ratios[mpeg_version-1][i-1]; + y4m_ratio_reduce( &red_entry ); + if( Y4M_RATIO_EQL( red_entry, red_ratio) ) + return i; + } + + return 0; + +} + + + +/* + * Guess the correct MPEG aspect ratio code, + * given the true sample aspect ratio and frame size of a video stream + * (and the MPEG version, 1 or 2). + * + * Returns 0 if it has no good guess. + * + */ + + +/* this is big enough to accommodate the difference between 720 and 704 */ +#define GUESS_ASPECT_TOLERANCE 0.03 + +mpeg_aspect_code_t +mpeg_guess_mpeg_aspect_code(int mpeg_version, y4m_ratio_t sampleaspect, + int frame_width, int frame_height) +{ + if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_UNKNOWN)) + { + return 0; + } + switch (mpeg_version) { + case 1: + if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_SQUARE)) + { + return 1; + } + else if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_NTSC_CCIR601)) + { + return 12; + } + else if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_NTSC_16_9)) + { + return 6; + } + else if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_PAL_CCIR601)) + { + return 8; + } + else if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_PAL_16_9)) + { + return 3; + } + return 0; + break; + case 2: + if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_SQUARE)) + { + return 1; /* '1' means square *pixels* in MPEG-2; go figure. */ + } + else + { + int i; + double true_far; /* true frame aspect ratio */ + true_far = + (double)(sampleaspect.n * frame_width) / + (double)(sampleaspect.d * frame_height); + /* start at '2'... */ + for (i = 2; i < mpeg_num_aspect_ratios[mpeg_version-1]; i++) + { + double ratio = + true_far / Y4M_RATIO_DBL(mpeg_aspect_ratios[mpeg_version-1][i-1]); + if ( (ratio > (1.0 - GUESS_ASPECT_TOLERANCE)) && + (ratio < (1.0 + GUESS_ASPECT_TOLERANCE)) ) + return i; + } + return 0; + } + break; + default: + return 0; + break; + } +} + + + + +/* + * Guess the true sample aspect ratio of a video stream, + * given the MPEG aspect ratio code and the actual frame size + * (and the MPEG version, 1 or 2). + * + * Returns y4m_sar_UNKNOWN if it has no good guess. + * + */ +y4m_ratio_t +mpeg_guess_sample_aspect_ratio(int mpeg_version, + mpeg_aspect_code_t code, + int frame_width, int frame_height) +{ + switch (mpeg_version) + { + case 1: + /* MPEG-1 codes turn into SAR's, just not quite the right ones. + For the common/known values, we provide the ratio used in practice, + otherwise say we don't know.*/ + switch (code) + { + case 1: return y4m_sar_SQUARE; break; + case 3: return y4m_sar_PAL_16_9; break; + case 6: return y4m_sar_NTSC_16_9; break; + case 8: return y4m_sar_PAL_CCIR601; break; + case 12: return y4m_sar_NTSC_CCIR601; break; + default: + return y4m_sar_UNKNOWN; break; + } + break; + case 2: + /* MPEG-2 codes turn into Frame Aspect Ratios, though not exactly the + FAR's used in practice. For common/standard frame sizes, we provide + the original SAR; otherwise, we say we don't know. */ + if (code == 1) + { + return y4m_sar_SQUARE; /* '1' means square *pixels* in MPEG-2 */ + } + else if ((code >= 2) && (code <= 4)) + { + return y4m_guess_sar(frame_width, frame_height, + mpeg2_aspect_ratios[code-1]); + } + else + { + return y4m_sar_UNKNOWN; + } + break; + default: + return y4m_sar_UNKNOWN; + break; + } +} + + + + + +/* + * Look-up MPEG explanatory definition string for frame rate code + * + */ + + +const char * +mpeg_framerate_code_definition( mpeg_framerate_code_t code ) +{ + if( code == 0 || code >= mpeg_num_framerates ) + return "UNDEFINED: illegal/reserved frame-rate ratio code"; + + return framerate_definitions[code]; +} + +/* + * Look-up MPEG explanatory definition string aspect ratio code for an + * aspect ratio code + * + */ + +const char * +mpeg_aspect_code_definition( int mpeg_version, mpeg_aspect_code_t code ) +{ + if( mpeg_version < 1 || mpeg_version > 2 ) + return "UNDEFINED: illegal MPEG version"; + + if( code < 1 || code > mpeg_num_aspect_ratios[mpeg_version-1] ) + return "UNDEFINED: illegal aspect ratio code"; + + return aspect_ratio_definitions[mpeg_version-1][code-1]; +} + + +/* + * Look-up explanatory definition of interlace field order code + * + */ + +const char * +mpeg_interlace_code_definition( int yuv4m_interlace_code ) +{ + const char *def; + switch( yuv4m_interlace_code ) + { + case Y4M_UNKNOWN : + def = "unknown"; + break; + case Y4M_ILACE_NONE : + def = "none/progressive"; + break; + case Y4M_ILACE_TOP_FIRST : + def = "top-field-first"; + break; + case Y4M_ILACE_BOTTOM_FIRST : + def = "bottom-field-first"; + break; + default : + def = "UNDEFINED: illegal video interlacing type-code!"; + break; + } + return def; +} + + +/* + * Local variables: + * c-file-style: "stroustrup" + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/veejay-current/utils/mpegconsts.h b/veejay-current/utils/mpegconsts.h new file mode 100644 index 00000000..461fd2db --- /dev/null +++ b/veejay-current/utils/mpegconsts.h @@ -0,0 +1,149 @@ + +/* + * mpegconsts.c: Video format constants for MPEG and utilities for display + * and conversion to format used for yuv4mpeg + * + * Copyright (C) 2001 Andrew Stevens + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __MPEGCONSTS_H__ +#define __MPEGCONSTS_H__ + + +#include "yuv4mpeg.h" + + +typedef unsigned int mpeg_framerate_code_t; +typedef unsigned int mpeg_aspect_code_t; + +extern const mpeg_framerate_code_t mpeg_num_framerates; +extern const mpeg_aspect_code_t mpeg_num_aspect_ratios[2]; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Convert MPEG frame-rate code to corresponding frame-rate + * y4m_fps_UNKNOWN = { 0, 0 } = Undefined/resrerved code. + */ + +y4m_ratio_t +mpeg_framerate( mpeg_framerate_code_t code ); + + +/* + * Look-up MPEG frame rate code for a (exact) frame rate. + * 0 = No MPEG code defined for frame-rate + */ + +mpeg_framerate_code_t +mpeg_framerate_code( y4m_ratio_t framerate ); + + +/* + * Convert floating-point framerate to an exact ratio. + * Uses a standard MPEG rate, if it finds one within MPEG_FPS_TOLERANCE + * (see mpegconsts.c), otherwise uses "fps:1000000" as the ratio. + */ + +y4m_ratio_t +mpeg_conform_framerate( double fps ); + + +/* + * Convert MPEG aspect ratio code to corresponding aspect ratio + * + * WARNING: The semantics of aspect ratio coding *changed* between + * MPEG1 and MPEG2. In MPEG1 it is the *pixel* aspect ratio. In + * MPEG2 it is the (far more sensible) aspect ratio of the eventual + * display. + * + */ + +y4m_ratio_t +mpeg_aspect_ratio( int mpeg_version, mpeg_aspect_code_t code ); + +/* + * Look-up MPEG aspect ratio code for an aspect ratio - tolerance + * is Y4M_ASPECT_MULT used by YUV4MPEG (see yuv4mpeg_intern.h) + * + * WARNING: The semantics of aspect ratio coding *changed* between + * MPEG1 and MPEG2. In MPEG1 it is the *pixel* aspect ratio. In + * MPEG2 it is the (far more sensible) aspect ratio of the eventual + * display. + * + */ + +mpeg_aspect_code_t +mpeg_frame_aspect_code( int mpeg_version, y4m_ratio_t aspect_ratio ); + +/* + * Look-up MPEG explanatory definition string aspect ratio code for an + * aspect ratio code + * + */ + +const char * +mpeg_aspect_code_definition( int mpeg_version, mpeg_aspect_code_t code ); + +/* + * Look-up MPEG explanatory definition string aspect ratio code for an + * frame rate code + * + */ + +const char * +mpeg_framerate_code_definition( mpeg_framerate_code_t code ); + +const char * +mpeg_interlace_code_definition( int yuv4m_interlace_code ); + + +/* + * Guess the correct MPEG aspect ratio code, + * given the true sample aspect ratio and frame size of a video stream + * (and the MPEG version, 1 or 2). + * + * Returns 0 if it has no good answer. + * + */ +mpeg_aspect_code_t +mpeg_guess_mpeg_aspect_code(int mpeg_version, y4m_ratio_t sampleaspect, + int frame_width, int frame_height); + +/* + * Guess the true sample aspect ratio of a video stream, + * given the MPEG aspect ratio code and the actual frame size + * (and the MPEG version, 1 or 2). + * + * Returns y4m_sar_UNKNOWN if it has no good answer. + * + */ +y4m_ratio_t +mpeg_guess_sample_aspect_ratio(int mpeg_version, + mpeg_aspect_code_t code, + int frame_width, int frame_height); + + +#ifdef __cplusplus +}; +#endif + + + +#endif /* __MPEGCONSTS_H__ */ diff --git a/veejay-current/utils/mpegtimecode.c b/veejay-current/utils/mpegtimecode.c new file mode 100644 index 00000000..09986739 --- /dev/null +++ b/veejay-current/utils/mpegtimecode.c @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2001 Kawamata/Hitoshi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include "mpegtimecode.h" + +/************************************************************** + * // NTSC DROP FRAME TIMECODE / 29.97fps (SMTPE) + * // hh:mm:ss:ff + * // hh: 0.. + * // mm: 0..59 + * // ss: 0..59 + * // ff: 0..29 # ss != 0 || mm % 10 == 0 + * // 2..29 # ss == 0 && mm % 10 != 0 + * // + * // 00:00:00:00 00:00:00:01 00:00:00:02 ... 00:00:00:29 + * // 00:00:01:00 00:00:01:01 00:00:01:02 ... 00:00:01:29 + * // : + * // 00:00:59:00 00:00:59:01 00:00:59:02 ... 00:00:59:29 + * // 00:01:00:02 ... 00:01:00:29 + * // 00:01:01:00 00:01:01:01 00:01:01:02 ... 00:01:00:29 + * // : + * // 00:01:59:00 00:01:59:01 00:01:59:02 ... 00:01:59:29 + * // 00:02:00:02 ... 00:02:00:29 + * // 00:02:01:00 00:02:01:01 00:02:01:02 ... 00:02:00:29 + * // : + * // : + * // 00:09:59:00 00:09:59:01 00:09:59:02 ... 00:09:59:29 + * // 00:10:00:00 00:10:00:01 00:10:00:02 ... 00:10:00:29 + * // 00:10:01:00 00:10:01:01 00:10:01:02 ... 00:10:01:29 + * // : + * // 00:10:59:00 00:10:59:01 00:10:59:02 ... 00:10:59:29 + * // 00:11:00:02 ... 00:11:00:29 + * // 00:11:01:00 00:11:01:01 00:11:01:02 ... 00:11:00:29 + * // : + * // : + * // DROP FRAME / 59.94fps (no any standard) + * // DROP FRAME / 23.976fps (no any standard) + ***************************************************************/ + +int dropframetimecode = -1; + +/* mpeg_timecode() return -tc->f on first frame in the minute, tc->f on other. */ +int +mpeg_timecode(MPEG_timecode_t *tc, int f, int fpscode, double fps) +{ + static const int ifpss[] = { 0, 24, 24, 25, 30, 30, 50, 60, 60, }; + int h, m, s; + + if (dropframetimecode < 0) { + char *env = getenv("MJPEG_DROP_FRAME_TIME_CODE"); + dropframetimecode = (env && *env != '0' && *env != 'n' && *env != 'N'); + } + if (dropframetimecode && + 0 < fpscode && fpscode + 1 < sizeof ifpss / sizeof ifpss[0] && + ifpss[fpscode] == ifpss[fpscode + 1]) { + int topinmin = 0, k = (30*4) / ifpss[fpscode]; + f *= k; /* frame# when 119.88fps */ + h = (f / ((10*60*30-18)*4)); /* # of 10min. */ + f %= ((10*60*30-18)*4); /* frame# in 10min. */ + f -= (2*4); /* frame# in 10min. - (2*4) */ + m = (f / ((60*30-2)*4)); /* min. in 10min. */ + topinmin = ((f - k) / ((60*30-2)*4) < m); + m += (h % 6 * 10); /* min. */ + h /= 6; /* hour */ + f %= ((60*30-2)*4); /* frame# in min. - (2*4)*/ + f += (2*4); /* frame# in min. */ + s = f / (30*4); /* sec. */ + f %= (30*4); /* frame# in sec. */ + f /= k; /* frame# in sec. on original fps */ + tc->f = f; + if (topinmin) + f = -f; + } else { + int ifps = ((0 < fpscode && fpscode < sizeof ifpss / sizeof ifpss[0])? + ifpss[fpscode]: (int)(fps + .5)); + s = f / ifps; + f %= ifps; + m = s / 60; + s %= 60; + h = m / 60; + m %= 60; + tc->f = f; + } + tc->s = s; + tc->m = m; + tc->h = h; + return f; +} diff --git a/veejay-current/utils/mpegtimecode.h b/veejay-current/utils/mpegtimecode.h new file mode 100644 index 00000000..e702a9b7 --- /dev/null +++ b/veejay-current/utils/mpegtimecode.h @@ -0,0 +1,39 @@ +/* -*- mode:C -*- */ +/* + * Copyright (C) 2001 Kawamata/Hitoshi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __MPEGTIMECODE_H__ +#define __MPEGTIMECODE_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + char h, m, s, f; +} MPEG_timecode_t; + +extern int dropframetimecode; +extern int mpeg_timecode(MPEG_timecode_t *tc, int f, int fpscode, double fps); +/* mpeg_timecode() return -tc->f on first frame in the minute, tc->f on other. */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/veejay-current/utils/videodev_mjpeg.h b/veejay-current/utils/videodev_mjpeg.h new file mode 100644 index 00000000..68fd79c5 --- /dev/null +++ b/veejay-current/utils/videodev_mjpeg.h @@ -0,0 +1,118 @@ +/* These are the MJPEG API extensions for the Video4Linux API, + first introduced by the Iomega Buz driver by Rainer Johanni + +*/ + +/* This is identical with the mgavideo internal params struct, + please tell me if you change this struct here ! top-field-first */ + + int APPn; /* Number of APP segment to be written, must be 0..15 */ + int APP_len; /* Length of data in JPEG APPn segment */ + char APP_data[60]; /* Data in the JPEG APPn segment. */ + + int COM_len; /* Length of data in JPEG COM segment */ + char COM_data[60]; /* Data in JPEG COM segment */ + + unsigned long jpeg_markers; /* Which markers should go into the JPEG output. + Unless you exactly know what you do, leave them untouched. + Inluding less markers will make the resulting code + smaller, but there will be fewer aplications + which can read it. + The presence of the APP and COM marker is + influenced by APP0_len and COM_len ONLY! */ +#define JPEG_MARKER_DHT (1<<3) /* Define Huffman Tables */ +#define JPEG_MARKER_DQT (1<<4) /* Define Quantization Tables */ +#define JPEG_MARKER_DRI (1<<5) /* Define Restart Interval */ +#define JPEG_MARKER_COM (1<<6) /* Comment segment */ +#define JPEG_MARKER_APP (1<<7) /* App segment, driver will allways use APP0 */ + + int VFIFO_FB; /* Flag for enabling Video Fifo Feedback. + If this flag is turned on and JPEG decompressing + is going to the screen, the decompress process + is stopped every time the Video Fifo is full. + This enables a smooth decompress to the screen + but the video output signal will get scrambled */ + + /* Misc */ + + char reserved[312]; /* Makes 512 bytes for this structure */ +}; + +struct mjpeg_requestbuffers +{ + unsigned long count; /* Number of buffers for MJPEG grabbing */ + unsigned long size; /* Size PER BUFFER in bytes */ +}; + +struct mjpeg_sync +{ + unsigned long frame; /* Frame (0 - n) for double buffer */ + unsigned long length; /* number of code bytes in buffer (capture only) */ + unsigned long seq; /* frame sequence number */ + struct timeval timestamp; /* timestamp */ +}; + +struct mjpeg_status +{ + int input; /* Input channel, has to be set prior to BUZIOC_G_STATUS */ + int signal; /* Returned: 1 if valid video signal detected */ + int norm; /* Returned: VIDEO_MODE_PAL or VIDEO_MODE_NTSC */ + int color; /* Returned: 1 if color signal detected */ +}; + +/* +Private IOCTL to set up for displaying MJPEG +*/ +#define MJPIOC_G_PARAMS _IOR ('v', BASE_VIDIOCPRIVATE+0, struct mjpeg_params) +#define MJPIOC_S_PARAMS _IOWR('v', BASE_VIDIOCPRIVATE+1, struct mjpeg_params) +#define MJPIOC_REQBUFS _IOWR('v', BASE_VIDIOCPRIVATE+2, struct mjpeg_requestbuffers) +#define MJPIOC_QBUF_CAPT _IOW ('v', BASE_VIDIOCPRIVATE+3, int) +#define MJPIOC_QBUF_PLAY _IOW ('v', BASE_VIDIOCPRIVATE+4, int) +#define MJPIOC_SYNC _IOR ('v', BASE_VIDIOCPRIVATE+5, struct mjpeg_sync) +#define MJPIOC_G_STATUS _IOWR('v', BASE_VIDIOCPRIVATE+6, struct mjpeg_status) diff --git a/veejay-current/utils/yuv4mpeg.c b/veejay-current/utils/yuv4mpeg.c new file mode 100644 index 00000000..5c7cd8fd --- /dev/null +++ b/veejay-current/utils/yuv4mpeg.c @@ -0,0 +1,774 @@ +/* + * yuv4mpeg.c: Functions for reading and writing "new" YUV4MPEG streams + * + * Copyright (C) 2001 Matthew J. Marjanovic + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ + +#include + +#include +#include +#include +#include +#include "yuv4mpeg.h" +#include "yuv4mpeg_intern.h" +#include "mjpeg_logging.h" + + +static int _y4mparam_allow_unknown_tags = 1; /* default is forgiveness */ + +static void *(*_y4m_alloc)(size_t bytes) = malloc; +static void (*_y4m_free)(void *ptr) = free; + + + +int y4m_allow_unknown_tags(int yn) +{ + int old = _y4mparam_allow_unknown_tags; + if (yn >= 0) + _y4mparam_allow_unknown_tags = (yn) ? 1 : 0; + return old; +} + + + +/************************************************************************* + * + * Convenience functions for fd read/write + * + * - guaranteed to transfer entire payload (or fail) + * - returns: + * 0 on complete success + * +(# of remaining bytes) on eof (for y4m_read) + * -(# of rem. bytes) on error (and ERRNO should be set) + * + *************************************************************************/ + + +ssize_t y4m_read(int fd, void *buf, size_t len) +{ + ssize_t n; + uint8_t *ptr = (uint8_t *)buf; + + while (len > 0) { + n = read(fd, ptr, len); + if (n <= 0) { + /* return amount left to read */ + if (n == 0) + return len; /* n == 0 --> eof */ + else + return -len; /* n < 0 --> error */ + } + ptr += n; + len -= n; + } + return 0; +} + + +ssize_t y4m_write(int fd, const void *buf, size_t len) +{ + ssize_t n; + const uint8_t *ptr = (const uint8_t *)buf; + + while (len > 0) { + n = write(fd, ptr, len); + if (n <= 0) return -len; /* return amount left to write */ + ptr += n; + len -= n; + } + return 0; +} + + + +/************************************************************************* + * + * "Extra tags" handling + * + *************************************************************************/ + + +static char *y4m_new_xtag(void) +{ + return _y4m_alloc(Y4M_MAX_XTAG_SIZE * sizeof(char)); +} + + +void y4m_init_xtag_list(y4m_xtag_list_t *xtags) +{ + int i; + xtags->count = 0; + for (i = 0; i < Y4M_MAX_XTAGS; i++) { + xtags->tags[i] = NULL; + } +} + + +void y4m_fini_xtag_list(y4m_xtag_list_t *xtags) +{ + int i; + for (i = 0; i < Y4M_MAX_XTAGS; i++) { + if (xtags->tags[i] != NULL) { + _y4m_free(xtags->tags[i]); + xtags->tags[i] = NULL; + } + } + xtags->count = 0; +} + + +void y4m_copy_xtag_list(y4m_xtag_list_t *dest, const y4m_xtag_list_t *src) +{ + int i; + for (i = 0; i < src->count; i++) { + if (dest->tags[i] == NULL) + dest->tags[i] = y4m_new_xtag(); + strncpy(dest->tags[i], src->tags[i], Y4M_MAX_XTAG_SIZE); + } + dest->count = src->count; +} + + + +static int y4m_snprint_xtags(char *s, int maxn, const y4m_xtag_list_t *xtags) +{ + int i, room; + + for (i = 0, room = maxn - 1; i < xtags->count; i++) { + int n = snprintf(s, room + 1, " %s", xtags->tags[i]); + if ((n < 0) || (n > room)) return Y4M_ERR_HEADER; + s += n; + room -= n; + } + s[0] = '\n'; /* finish off header with newline */ + s[1] = '\0'; /* ...and end-of-string */ + return Y4M_OK; +} + + +int y4m_xtag_count(const y4m_xtag_list_t *xtags) +{ + return xtags->count; +} + + +const char *y4m_xtag_get(const y4m_xtag_list_t *xtags, int n) +{ + if (n >= xtags->count) + return NULL; + else + return xtags->tags[n]; +} + + +int y4m_xtag_add(y4m_xtag_list_t *xtags, const char *tag) +{ + if (xtags->count >= Y4M_MAX_XTAGS) return Y4M_ERR_XXTAGS; + if (xtags->tags[xtags->count] == NULL) + xtags->tags[xtags->count] = y4m_new_xtag(); + strncpy(xtags->tags[xtags->count], tag, Y4M_MAX_XTAG_SIZE); + (xtags->count)++; + return Y4M_OK; +} + + +int y4m_xtag_remove(y4m_xtag_list_t *xtags, int n) +{ + int i; + char *q; + + if ((n < 0) || (n >= xtags->count)) return Y4M_ERR_RANGE; + q = xtags->tags[n]; + for (i = n; i < (xtags->count - 1); i++) + xtags->tags[i] = xtags->tags[i+1]; + xtags->tags[i] = q; + (xtags->count)--; + return Y4M_OK; +} + + +int y4m_xtag_clearlist(y4m_xtag_list_t *xtags) +{ + xtags->count = 0; + return Y4M_OK; +} + + +int y4m_xtag_addlist(y4m_xtag_list_t *dest, const y4m_xtag_list_t *src) +{ + int i, j; + + if ((dest->count + src->count) > Y4M_MAX_XTAGS) return Y4M_ERR_XXTAGS; + for (i = dest->count, j = 0; + j < src->count; + i++, j++) { + if (dest->tags[i] == NULL) + dest->tags[i] = y4m_new_xtag(); + strncpy(dest->tags[i], src->tags[i], Y4M_MAX_XTAG_SIZE); + } + dest->count += src->count; + return Y4M_OK; +} + + +/************************************************************************* + * + * Creators/destructors for y4m_*_info_t structures + * + *************************************************************************/ + + +void y4m_init_stream_info(y4m_stream_info_t *info) +{ + if (info == NULL) return; + /* initialize info */ + info->width = Y4M_UNKNOWN; + info->height = Y4M_UNKNOWN; + info->interlace = Y4M_UNKNOWN; + info->framerate = y4m_fps_UNKNOWN; + info->sampleaspect = y4m_sar_UNKNOWN; + y4m_init_xtag_list(&(info->x_tags)); +} + + +void y4m_copy_stream_info(y4m_stream_info_t *dest, + const y4m_stream_info_t *src) +{ + if ((dest == NULL) || (src == NULL)) return; + /* copy info */ + dest->width = src->width; + dest->height = src->height; + dest->interlace = src->interlace; + dest->framerate = src->framerate; + dest->sampleaspect = src->sampleaspect; + dest->framelength = src->framelength; + y4m_copy_xtag_list(&(dest->x_tags), &(src->x_tags)); +} + + +void y4m_fini_stream_info(y4m_stream_info_t *info) +{ + if (info == NULL) return; + y4m_fini_xtag_list(&(info->x_tags)); +} + + +void y4m_si_set_width(y4m_stream_info_t *si, int width) +{ + si->width = width; + si->framelength = (si->height * si->width) * 3 / 2; +} + +int y4m_si_get_width(const y4m_stream_info_t *si) +{ return si->width; } + +void y4m_si_set_height(y4m_stream_info_t *si, int height) +{ + si->height = height; + si->framelength = (si->height * si->width) * 3 / 2; +} + +int y4m_si_get_height(const y4m_stream_info_t *si) +{ return si->height; } + +void y4m_si_set_interlace(y4m_stream_info_t *si, int interlace) +{ si->interlace = interlace; } + +int y4m_si_get_interlace(const y4m_stream_info_t *si) +{ return si->interlace; } + +void y4m_si_set_framerate(y4m_stream_info_t *si, y4m_ratio_t framerate) +{ si->framerate = framerate; } + +y4m_ratio_t y4m_si_get_framerate(const y4m_stream_info_t *si) +{ return si->framerate; } + +void y4m_si_set_sampleaspect(y4m_stream_info_t *si, y4m_ratio_t sar) +{ si->sampleaspect = sar; } + +y4m_ratio_t y4m_si_get_sampleaspect(const y4m_stream_info_t *si) +{ return si->sampleaspect; } + +int y4m_si_get_framelength(const y4m_stream_info_t *si) +{ return si->framelength; } + +y4m_xtag_list_t *y4m_si_xtags(y4m_stream_info_t *si) +{ return &(si->x_tags); } + + + +void y4m_init_frame_info(y4m_frame_info_t *info) +{ + if (info == NULL) return; + /* initialize info */ + y4m_init_xtag_list(&(info->x_tags)); +} + + +void y4m_copy_frame_info(y4m_frame_info_t *dest, const y4m_frame_info_t *src) +{ + if ((dest == NULL) || (src == NULL)) return; + /* copy info */ + y4m_copy_xtag_list(&(dest->x_tags), &(src->x_tags)); +} + + +void y4m_fini_frame_info(y4m_frame_info_t *info) +{ + if (info == NULL) return; + y4m_fini_xtag_list(&(info->x_tags)); +} + + + +/************************************************************************* + * + * Tag parsing + * + *************************************************************************/ + +int y4m_parse_stream_tags(char *s, y4m_stream_info_t *i) +{ + char *token, *value; + char tag; + int err; + + /* parse fields */ + for (token = strtok(s, Y4M_DELIM); + token != NULL; + token = strtok(NULL, Y4M_DELIM)) { + if (token[0] == '\0') continue; /* skip empty strings */ + tag = token[0]; + value = token + 1; + switch (tag) { + case 'W': /* width */ + i->width = atoi(value); + if (i->width <= 0) return Y4M_ERR_RANGE; + break; + case 'H': /* height */ + i->height = atoi(value); + if (i->height <= 0) return Y4M_ERR_RANGE; + break; + case 'F': /* frame rate (fps) */ + if ((err = y4m_parse_ratio(&(i->framerate), value)) != Y4M_OK) + return err; + if (i->framerate.n < 0) return Y4M_ERR_RANGE; + break; + case 'I': /* interlacing */ + switch (value[0]) { + case 'p': i->interlace = Y4M_ILACE_NONE; break; + case 't': i->interlace = Y4M_ILACE_TOP_FIRST; break; + case 'b': i->interlace = Y4M_ILACE_BOTTOM_FIRST; break; + case '?': + default: + i->interlace = Y4M_UNKNOWN; break; + } + break; + case 'A': /* sample (pixel) aspect ratio */ + if ((err = y4m_parse_ratio(&(i->sampleaspect), value)) != Y4M_OK) + return err; + if (i->sampleaspect.n < 0) return Y4M_ERR_RANGE; + break; + case 'X': /* 'X' meta-tag */ + if ((err = y4m_xtag_add(&(i->x_tags), token)) != Y4M_OK) return err; + break; + default: + /* possible error on unknown options */ + if (_y4mparam_allow_unknown_tags) { + /* unknown tags ok: store in xtag list and warn... */ + if ((err = y4m_xtag_add(&(i->x_tags), token)) != Y4M_OK) return err; + mjpeg_warn("Unknown stream tag encountered: '%s'", token); + } else { + /* unknown tags are *not* ok */ + return Y4M_ERR_BADTAG; + } + break; + } + } + /* Error checking... width and height must be known since we can't + * parse without them + */ + if( i->width == Y4M_UNKNOWN || i->height == Y4M_UNKNOWN ) + return Y4M_ERR_HEADER; + /* ta da! done. */ + return Y4M_OK; +} + + + +static int y4m_parse_frame_tags(char *s, y4m_frame_info_t *i) +{ + char *token, *value; + char tag; + int err; + + /* parse fields */ + for (token = strtok(s, Y4M_DELIM); + token != NULL; + token = strtok(NULL, Y4M_DELIM)) { + if (token[0] == '\0') continue; /* skip empty strings */ + tag = token[0]; + value = token + 1; + switch (tag) { + case 'X': /* 'X' meta-tag */ + if ((err = y4m_xtag_add(&(i->x_tags), token)) != Y4M_OK) return err; + break; + default: + /* possible error on unknown options */ + if (_y4mparam_allow_unknown_tags) { + /* unknown tags ok: store in xtag list and warn... */ + if ((err = y4m_xtag_add(&(i->x_tags), token)) != Y4M_OK) return err; + mjpeg_warn("Unknown frame tag encountered: '%s'", token); + } else { + /* unknown tags are *not* ok */ + return Y4M_ERR_BADTAG; + } + break; + } + } + /* ta da! done. */ + return Y4M_OK; +} + + + + + +/************************************************************************* + * + * Read/Write stream header + * + *************************************************************************/ + + +int y4m_read_stream_header(int fd, y4m_stream_info_t *i) +{ + char line[Y4M_LINE_MAX]; + char *p; + int n; + int err; + + /* read the header line */ + for (n = 0, p = line; n < Y4M_LINE_MAX; n++, p++) { + if (read(fd, p, 1) < 1) + return Y4M_ERR_SYSTEM; + if (*p == '\n') { + *p = '\0'; /* Replace linefeed by end of string */ + break; + } + } + if (n >= Y4M_LINE_MAX) + return Y4M_ERR_HEADER; + /* look for keyword in header */ + if (strncmp(line, Y4M_MAGIC, strlen(Y4M_MAGIC))) + return Y4M_ERR_MAGIC; + if ((err = y4m_parse_stream_tags(line + strlen(Y4M_MAGIC), i)) != Y4M_OK) + return err; + + i->framelength = (i->height * i->width) * 3 / 2; + return Y4M_OK; +} + + + +int y4m_write_stream_header(int fd, const y4m_stream_info_t *i) +{ + char s[Y4M_LINE_MAX+1]; + int n; + int err; + y4m_ratio_t rate = i->framerate; + y4m_ratio_t aspect = i->sampleaspect; + + y4m_ratio_reduce(&rate); + y4m_ratio_reduce(&aspect); + n = snprintf(s, sizeof(s), "%s W%d H%d F%d:%d I%s A%d:%d", + Y4M_MAGIC, + i->width, + i->height, + rate.n, rate.d, + (i->interlace == Y4M_ILACE_NONE) ? "p" : + (i->interlace == Y4M_ILACE_TOP_FIRST) ? "t" : + (i->interlace == Y4M_ILACE_BOTTOM_FIRST) ? "b" : "?", + aspect.n, aspect.d); + if ((n < 0) || (n > Y4M_LINE_MAX)) return Y4M_ERR_HEADER; + if ((err = y4m_snprint_xtags(s + n, sizeof(s) - n - 1, &(i->x_tags))) + != Y4M_OK) + return err; + /* non-zero on error */ + return (y4m_write(fd, s, strlen(s)) ? Y4M_ERR_SYSTEM : Y4M_OK); +} + + + + + +/************************************************************************* + * + * Read/Write frame header + * + *************************************************************************/ + +int y4m_read_frame_header(int fd, y4m_frame_info_t *i) +{ + char line[Y4M_LINE_MAX]; + char *p; + int n; + ssize_t remain; + + /* This is more clever than read_stream_header... + Try to read "FRAME\n" all at once, and don't try to parse + if nothing else is there... + */ + remain = y4m_read(fd, line, sizeof(Y4M_FRAME_MAGIC)-1+1); /* -'\0', +'\n' */ + if (remain < 0) return Y4M_ERR_SYSTEM; + if (remain > 0) { + /* A clean EOF should end exactly at a frame-boundary */ + if (remain == sizeof(Y4M_FRAME_MAGIC)) + return Y4M_ERR_EOF; + else + return Y4M_ERR_BADEOF; + } + if (strncmp(line, Y4M_FRAME_MAGIC, sizeof(Y4M_FRAME_MAGIC)-1)) + return Y4M_ERR_MAGIC; + if (line[sizeof(Y4M_FRAME_MAGIC)-1] == '\n') + return Y4M_OK; /* done -- no tags: that was the end-of-line. */ + + if (line[sizeof(Y4M_FRAME_MAGIC)-1] != Y4M_DELIM[0]) { + return Y4M_ERR_MAGIC; /* wasn't a space -- what was it? */ + } + + /* proceed to get the tags... (overwrite the magic) */ + for (n = 0, p = line; n < Y4M_LINE_MAX; n++, p++) { + if (y4m_read(fd, p, 1)) + return Y4M_ERR_SYSTEM; + if (*p == '\n') { + *p = '\0'; /* Replace linefeed by end of string */ + break; + } + } + if (n >= Y4M_LINE_MAX) return Y4M_ERR_HEADER; + /* non-zero on error */ + return y4m_parse_frame_tags(line, i); +} + + +int y4m_write_frame_header(int fd, const y4m_frame_info_t *i) +{ + char s[Y4M_LINE_MAX+1]; + int n; + int err; + + n = snprintf(s, sizeof(s), "%s", Y4M_FRAME_MAGIC); + if ((n < 0) || (n > Y4M_LINE_MAX)) return Y4M_ERR_HEADER; + if ((err = y4m_snprint_xtags(s + n, sizeof(s) - n - 1, &(i->x_tags))) + != Y4M_OK) + return err; + /* non-zero on error */ + return (y4m_write(fd, s, strlen(s)) ? Y4M_ERR_SYSTEM : Y4M_OK); +} + + + +/************************************************************************* + * + * Read/Write entire frame + * + *************************************************************************/ + +int y4m_read_frame(int fd, const y4m_stream_info_t *si, + y4m_frame_info_t *fi, uint8_t * const yuv[3]) +{ + int err; + int w = si->width; + int h = si->height; + + /* Read frame header */ + if ((err = y4m_read_frame_header(fd, fi)) != Y4M_OK) return err; + /* Read luminance scanlines */ + if (y4m_read(fd, yuv[0], w*h)) return Y4M_ERR_SYSTEM; + /* Read chrominance scanlines */ + if (y4m_read(fd, yuv[1], w*h/4)) return Y4M_ERR_SYSTEM; + if (y4m_read(fd, yuv[2], w*h/4)) return Y4M_ERR_SYSTEM; + + return Y4M_OK; +} + + + + +int y4m_write_frame(int fd, const y4m_stream_info_t *si, + const y4m_frame_info_t *fi, uint8_t * const yuv[3]) +{ + int err; + int w = si->width; + int h = si->height; + + /* Write frame header */ + if ((err = y4m_write_frame_header(fd, fi)) != Y4M_OK) return err; + /* Write luminance,chrominance scanlines */ + if (y4m_write(fd, yuv[0], w*h) || + y4m_write(fd, yuv[1], w*h/4) || + y4m_write(fd, yuv[2], w*h/4)) + return Y4M_ERR_SYSTEM; + return Y4M_OK; +} + + + +/************************************************************************* + * + * Read/Write entire frame, (de)interleaved (to)from two separate fields + * + *************************************************************************/ + + +int y4m_read_fields(int fd, const y4m_stream_info_t *si, y4m_frame_info_t *fi, + uint8_t * const upper_field[3], + uint8_t * const lower_field[3]) +{ + int i, y, err; + int width = si->width; + int height = si->height; + + /* Read frame header */ + if ((err = y4m_read_frame_header(fd, fi)) != Y4M_OK) return err; + /* Read Y', Cb, and Cr planes */ + for (i = 0; i < 3; i++) { + uint8_t *srctop = upper_field[i]; + uint8_t *srcbot = lower_field[i]; + /* alternately write one line from each */ + for (y = 0; y < height; y += 2) { + if (y4m_read(fd, srctop, width)) return Y4M_ERR_SYSTEM; + srctop += width; + if (y4m_read(fd, srcbot, width)) return Y4M_ERR_SYSTEM; + srcbot += width; + } + /* for chroma, width/height are half as big */ + if (i == 0) { + width /= 2; + height /= 2; + } + } + return Y4M_OK; +} + + + +int y4m_write_fields(int fd, const y4m_stream_info_t *si, + const y4m_frame_info_t *fi, + uint8_t * const upper_field[3], + uint8_t * const lower_field[3]) +{ + int i, y, err; + int width = si->width; + int height = si->height; + + /* Write frame header */ + if ((err = y4m_write_frame_header(fd, fi)) != Y4M_OK) return err; + /* Write Y', Cb, and Cr planes */ + for (i = 0; i < 3; i++) { + uint8_t *srctop = upper_field[i]; + uint8_t *srcbot = lower_field[i]; + /* alternately write one line from each */ + for (y = 0; y < height; y += 2) { + if (y4m_write(fd, srctop, width)) return Y4M_ERR_SYSTEM; + srctop += width; + if (y4m_write(fd, srcbot, width)) return Y4M_ERR_SYSTEM; + srcbot += width; + } + /* for chroma, width/height are half as big */ + if (i == 0) { + width /= 2; + height /= 2; + } + } + return Y4M_OK; +} + + + +/************************************************************************* + * + * Handy logging of stream info + * + *************************************************************************/ + +void y4m_log_stream_info(log_level_t level, const char *prefix, + const y4m_stream_info_t *i) +{ + char s[256]; + + snprintf(s, sizeof(s), " frame size: "); + if (i->width == Y4M_UNKNOWN) + snprintf(s+strlen(s), sizeof(s)-strlen(s), "(?)x"); + else + snprintf(s+strlen(s), sizeof(s)-strlen(s), "%dx", i->width); + if (i->height == Y4M_UNKNOWN) + snprintf(s+strlen(s), sizeof(s)-strlen(s), "(?) pixels "); + else + snprintf(s+strlen(s), sizeof(s)-strlen(s), "%d pixels ", i->height); + if (i->framelength == Y4M_UNKNOWN) + snprintf(s+strlen(s), sizeof(s)-strlen(s), "(? bytes)"); + else + snprintf(s+strlen(s), sizeof(s)-strlen(s), "(%d bytes)", i->framelength); + mjpeg_log(level, "%s%s", prefix, s); + if ((i->framerate.n == 0) && (i->framerate.d == 0)) + mjpeg_log(level, "%s frame rate: ??? fps", prefix); + else + mjpeg_log(level, "%s frame rate: %d/%d fps (~%f)", prefix, + i->framerate.n, i->framerate.d, + (double) i->framerate.n / (double) i->framerate.d); + mjpeg_log(level, "%s interlace: %s", prefix, + (i->interlace == Y4M_ILACE_NONE) ? "none/progressive" : + (i->interlace == Y4M_ILACE_TOP_FIRST) ? "top-field-first" : + (i->interlace == Y4M_ILACE_BOTTOM_FIRST) ? "bottom-field-first" : + "anyone's guess"); + if ((i->sampleaspect.n == 0) && (i->sampleaspect.d == 0)) + mjpeg_log(level, "%ssample aspect ratio: ?:?", prefix); + else + mjpeg_log(level, "%ssample aspect ratio: %d:%d", prefix, + i->sampleaspect.n, i->sampleaspect.d); +} + + +/************************************************************************* + * + * Convert error code to string + * + *************************************************************************/ + +const char *y4m_strerr(int err) +{ + switch (err) { + case Y4M_OK: return "no error"; + case Y4M_ERR_RANGE: return "parameter out of range"; + case Y4M_ERR_SYSTEM: return "system error (failed read/write)"; + case Y4M_ERR_HEADER: return "bad stream or frame header"; + case Y4M_ERR_BADTAG: return "unknown header tag"; + case Y4M_ERR_MAGIC: return "bad header magic"; + case Y4M_ERR_XXTAGS: return "too many xtags"; + case Y4M_ERR_EOF: return "end-of-file"; + case Y4M_ERR_BADEOF: return "stream ended unexpectedly (EOF)"; + default: + return "unknown error code"; + } +} + + diff --git a/veejay-current/utils/yuv4mpeg.h b/veejay-current/utils/yuv4mpeg.h new file mode 100644 index 00000000..6028ea88 --- /dev/null +++ b/veejay-current/utils/yuv4mpeg.h @@ -0,0 +1,473 @@ +/* + * yuv4mpeg.h: Functions for reading and writing "new" YUV4MPEG2 streams. + * + * Stream format is described at the end of this file. + * + * + * Copyright (C) 2001 Matthew J. Marjanovic + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __YUV4MPEG_H__ +#define __YUV4MPEG_H__ + +#include +#include +#include +#include + + +/************************************************************************ + * error codes returned by y4m_* functions + ************************************************************************/ +#define Y4M_OK 0 +#define Y4M_ERR_RANGE 1 /* argument or tag value out of range */ +#define Y4M_ERR_SYSTEM 2 /* failed system call, check errno */ +#define Y4M_ERR_HEADER 3 /* illegal/malformed header */ +#define Y4M_ERR_BADTAG 4 /* illegal tag character */ +#define Y4M_ERR_MAGIC 5 /* bad header magic */ +#define Y4M_ERR_EOF 6 /* end-of-file (clean) */ +#define Y4M_ERR_XXTAGS 7 /* too many xtags */ +#define Y4M_ERR_BADEOF 8 /* unexpected end-of-file */ + + +/* generic 'unknown' value for integer parameters (e.g. interlace, height) */ +#define Y4M_UNKNOWN -1 + + + +/************************************************************************ + * 'ratio' datatype, for rational numbers + * (see 'ratio' functions down below) + ************************************************************************/ +typedef struct _y4m_ratio { + int n; /* numerator */ + int d; /* denominator */ +} y4m_ratio_t; + + +/************************************************************************ + * useful standard framerates (as ratios) + ************************************************************************/ +extern const y4m_ratio_t y4m_fps_UNKNOWN; +extern const y4m_ratio_t y4m_fps_NTSC_FILM; /* 24000/1001 film (in NTSC) */ +extern const y4m_ratio_t y4m_fps_FILM; /* 24fps film */ +extern const y4m_ratio_t y4m_fps_PAL; /* 25fps PAL */ +extern const y4m_ratio_t y4m_fps_NTSC; /* 30000/1001 NTSC */ +extern const y4m_ratio_t y4m_fps_30; /* 30fps */ +extern const y4m_ratio_t y4m_fps_PAL_FIELD; /* 50fps PAL field rate */ +extern const y4m_ratio_t y4m_fps_NTSC_FIELD; /* 60000/1001 NTSC field rate */ +extern const y4m_ratio_t y4m_fps_60; /* 60fps */ + +/************************************************************************ + * useful standard sample (pixel) aspect ratios (W:H) + ************************************************************************/ +extern const y4m_ratio_t y4m_sar_UNKNOWN; +extern const y4m_ratio_t y4m_sar_SQUARE; /* square pixels */ +extern const y4m_ratio_t y4m_sar_NTSC_CCIR601; /* 525-line (NTSC) Rec.601 */ +extern const y4m_ratio_t y4m_sar_NTSC_16_9; /* 16:9 NTSC/Rec.601 */ +extern const y4m_ratio_t y4m_sar_NTSC_SVCD_4_3; /* NTSC SVCD 4:3 */ +extern const y4m_ratio_t y4m_sar_NTSC_SVCD_16_9;/* NTSC SVCD 16:9 */ +extern const y4m_ratio_t y4m_sar_PAL_CCIR601; /* 625-line (PAL) Rec.601 */ +extern const y4m_ratio_t y4m_sar_PAL_16_9; /* 16:9 PAL/Rec.601 */ +extern const y4m_ratio_t y4m_sar_PAL_SVCD_4_3; /* PAL SVCD 4:3 */ +extern const y4m_ratio_t y4m_sar_PAL_SVCD_16_9; /* PAL SVCD 16:9 */ +extern const y4m_ratio_t y4m_sar_SQR_ANA16_9; /* anamorphic 16:9 sampled */ + /* from 4:3 with square pixels */ + +/************************************************************************ + * useful standard display aspect ratios (W:H) + ************************************************************************/ +extern const y4m_ratio_t y4m_dar_UNKNOWN; +extern const y4m_ratio_t y4m_dar_4_3; /* standard TV */ +extern const y4m_ratio_t y4m_dar_16_9; /* widescreen TV */ +extern const y4m_ratio_t y4m_dar_221_100; /* word-to-your-mother TV */ + + +/************************************************************************ + * 'xtag_list' --- list of unparsed and/or meta/X header tags + * + * Do not touch this structure directly! + * + * Use the y4m_xtag_*() functions (see below). + * You must initialize/finalize this structure before/after use. + ************************************************************************/ +#define Y4M_MAX_XTAGS 32 /* maximum number of xtags in list */ +#define Y4M_MAX_XTAG_SIZE 32 /* max length of an xtag (including 'X') */ +typedef struct _y4m_xtag_list { + int count; + char *tags[Y4M_MAX_XTAGS]; +} y4m_xtag_list_t; + + + +/************************************************************************ + * 'stream_info' --- stream header information + * + * Do not touch this structure directly! + * + * Use the y4m_si_*() functions (see below). + * You must initialize/finalize this structure before/after use. + ************************************************************************/ +typedef struct _y4m_stream_info { + /* values from header */ + int width; + int height; + int interlace; /* see Y4M_ILACE_* definitions below */ + y4m_ratio_t framerate; /* frames-per-second; 0:0 == unknown */ + y4m_ratio_t sampleaspect; /* pixel width/height; 0:0 == unknown */ + /* computed/derivative values */ + int framelength; /* bytes of data per frame (not including header) */ + /* mystical X tags */ + y4m_xtag_list_t x_tags; +} y4m_stream_info_t; + +/* possible options for the interlace parameter */ +#define Y4M_ILACE_NONE 0 /* non-interlaced, progressive frame */ +#define Y4M_ILACE_TOP_FIRST 1 /* interlaced, top-field first */ +#define Y4M_ILACE_BOTTOM_FIRST 2 /* interlaced, bottom-field first */ + + +/************************************************************************ + * 'frame_info' --- frame header information + * + * Do not touch this structure directly! + * + * Use the y4m_fi_*() functions (see below). + * You must initialize/finalize this structure before/after use. + ************************************************************************/ +typedef struct _y4m_frame_info { + /* mystical X tags */ + y4m_xtag_list_t x_tags; +} y4m_frame_info_t; + + + +#ifdef __cplusplus +extern "C" { +#else +#endif + + +/************************************************************************ + * 'ratio' functions + ************************************************************************/ + +/* 'normalize' a ratio (remove common factors) */ +void y4m_ratio_reduce(y4m_ratio_t *r); + +/* parse "nnn:ddd" into a ratio (returns Y4M_OK or Y4M_ERR_RANGE) */ +int y4m_parse_ratio(y4m_ratio_t *r, const char *s); + +/* quick test of two ratios for equality (i.e. identical components) */ +#define Y4M_RATIO_EQL(a,b) ( ((a).n == (b).n) && ((a).d == (b).d) ) + +/* quick conversion of a ratio to a double (no divide-by-zero check!) */ +#define Y4M_RATIO_DBL(r) ((double)(r).n / (double)(r).d) + +/************************************************************************* + * + * Guess the true SAR (sample aspect ratio) from a list of commonly + * encountered values, given the "suggested" display aspect ratio (DAR), + * and the true frame width and height. + * + * Returns y4m_sar_UNKNOWN if no match is found. + * + *************************************************************************/ +y4m_ratio_t y4m_guess_sar(int width, int height, y4m_ratio_t dar); + + + +/************************************************************************ + * 'xtag' functions + * + * o Before using an xtag_list (but after the structure/memory has been + * allocated), you must initialize it via y4m_init_xtag_list(). + * o After using an xtag_list (but before the structure is released), + * call y4m_fini_xtag_list() to free internal memory. + * + ************************************************************************/ + +/* initialize an xtag_list structure */ +void y4m_init_xtag_list(y4m_xtag_list_t *xtags); + +/* finalize an xtag_list structure */ +void y4m_fini_xtag_list(y4m_xtag_list_t *xtags); + +/* make one xtag_list into a copy of another */ +void y4m_copy_xtag_list(y4m_xtag_list_t *dest, const y4m_xtag_list_t *src); + +/* return number of tags in an xtag_list */ +int y4m_xtag_count(const y4m_xtag_list_t *xtags); + +/* access n'th tag in an xtag_list */ +const char *y4m_xtag_get(const y4m_xtag_list_t *xtags, int n); + +/* append a new tag to an xtag_list + returns: Y4M_OK - success + Y4M_ERR_XXTAGS - list is already full */ +int y4m_xtag_add(y4m_xtag_list_t *xtags, const char *tag); + +/* remove a tag from an xtag_list + returns: Y4M_OK - success + Y4M_ERR_RANGE - n is out of range */ +int y4m_xtag_remove(y4m_xtag_list_t *xtags, int n); + +/* remove all tags from an xtag_list + returns: Y4M_OK - success */ +int y4m_xtag_clearlist(y4m_xtag_list_t *xtags); + +/* append copies of tags from src list to dest list + returns: Y4M_OK - success + Y4M_ERR_XXTAGS - operation would overfill dest list */ +int y4m_xtag_addlist(y4m_xtag_list_t *dest, const y4m_xtag_list_t *src); + + + +/************************************************************************ + * '*_info' functions + * + * o Before using a *_info structure (but after the structure/memory has + * been allocated), you must initialize it via y4m_init_*_info(). + * o After using a *_info structure (but before the structure is released), + * call y4m_fini_*_info() to free internal memory. + * o Use the 'set' and 'get' accessors to modify or access the fields in + * the structures; don't touch the structure directly. (Ok, so there + * is no really convenient C syntax to prevent you from doing this, + * but we are all responsible programmers here, so just don't do it!) + * + ************************************************************************/ + +/* initialize a stream_info structure */ +void y4m_init_stream_info(y4m_stream_info_t *i); + +/* finalize a stream_info structure */ +void y4m_fini_stream_info(y4m_stream_info_t *i); + +/* make one stream_info into a copy of another */ +void y4m_copy_stream_info(y4m_stream_info_t *dest, + const y4m_stream_info_t *src); + +/* access or set stream_info fields */ +void y4m_si_set_width(y4m_stream_info_t *si, int width); +int y4m_si_get_width(const y4m_stream_info_t *si); +void y4m_si_set_height(y4m_stream_info_t *si, int height); +int y4m_si_get_height(const y4m_stream_info_t *si); +void y4m_si_set_interlace(y4m_stream_info_t *si, int interlace); +int y4m_si_get_interlace(const y4m_stream_info_t *si); +void y4m_si_set_framerate(y4m_stream_info_t *si, y4m_ratio_t framerate); +y4m_ratio_t y4m_si_get_framerate(const y4m_stream_info_t *si); +void y4m_si_set_sampleaspect(y4m_stream_info_t *si, y4m_ratio_t sar); +y4m_ratio_t y4m_si_get_sampleaspect(const y4m_stream_info_t *si); +int y4m_si_get_framelength(const y4m_stream_info_t *si); + +/* access stream_info xtag_list */ +y4m_xtag_list_t *y4m_si_xtags(y4m_stream_info_t *si); + + +/* initialize a frame_info structure */ +void y4m_init_frame_info(y4m_frame_info_t *i); + +/* finalize a frame_info structure */ +void y4m_fini_frame_info(y4m_frame_info_t *i); + +/* make one frame_info into a copy of another */ +void y4m_copy_frame_info(y4m_frame_info_t *dest, + const y4m_frame_info_t *src); + +/* access frame_info xtag_list */ +y4m_xtag_list_t *y4m_fi_xtags(y4m_frame_info_t *fi); + + + +/************************************************************************ + * blocking read and write functions + * + * o guaranteed to transfer entire payload (or fail) + * o return values: + * 0 (zero) complete success + * -(# of remaining bytes) error (and errno left set) + * +(# of remaining bytes) EOF (for y4m_read only) + * + ************************************************************************/ + +/* read len bytes from fd into buf */ +ssize_t y4m_read(int fd, void *buf, size_t len); + +/* write len bytes from fd into buf */ +ssize_t y4m_write(int fd, const void *buf, size_t len); + + + +/************************************************************************ + * stream header processing functions + * + * o return values: + * Y4M_OK - success + * Y4M_ERR_* - error (see y4m_strerr() for descriptions) + * + ************************************************************************/ + +/* parse a string of stream header tags */ +int y4m_parse_stream_tags(char *s, y4m_stream_info_t *i); + +/* read a stream header from file descriptor fd */ +int y4m_read_stream_header(int fd, y4m_stream_info_t *i); + +/* write a stream header to file descriptor fd */ +int y4m_write_stream_header(int fd, const y4m_stream_info_t *i); + + + +/************************************************************************ + * frame processing functions + * + * o return values: + * Y4M_OK - success + * Y4M_ERR_* - error (see y4m_strerr() for descriptions) + * + ************************************************************************/ + +/* read a frame header from file descriptor fd */ +int y4m_read_frame_header(int fd, y4m_frame_info_t *i); + +/* write a frame header to file descriptor fd */ +int y4m_write_frame_header(int fd, const y4m_frame_info_t *i); + +/* read a complete frame (header + data) + o yuv[3] points to three buffers, one each for Y, U, V planes */ +int y4m_read_frame(int fd, const y4m_stream_info_t *si, + y4m_frame_info_t *fi, uint8_t * const yuv[3]); + +/* write a complete frame (header + data) + o yuv[3] points to three buffers, one each for Y, U, V planes */ +int y4m_write_frame(int fd, const y4m_stream_info_t *si, + const y4m_frame_info_t *fi, uint8_t * const yuv[3]); + + +/* read a complete frame (header + data), but de-interleave fields + into two separate buffers + o upper_field[3] same as yuv[3] above, but for upper field + o lower_field[3] same as yuv[3] above, but for lower field +*/ +int y4m_read_fields(int fd, const y4m_stream_info_t *si, + y4m_frame_info_t *fi, + uint8_t * const upper_field[3], + uint8_t * const lower_field[3]); + +/* write a complete frame (header + data), but interleave fields + from two separate buffers + o upper_field[3] same as yuv[3] above, but for upper field + o lower_field[3] same as yuv[3] above, but for lower field +*/ +int y4m_write_fields(int fd, const y4m_stream_info_t *si, + const y4m_frame_info_t *fi, + uint8_t * const upper_field[3], + uint8_t * const lower_field[3]); + + + +/************************************************************************ + * miscellaneous functions + ************************************************************************/ + +/* convenient dump of stream header info via mjpeg_log facility + * - each logged/printed line is prefixed by 'prefix' + */ +void y4m_log_stream_info(log_level_t level, const char *prefix, + const y4m_stream_info_t *i); + +/* convert a Y4M_ERR_* error code into mildly explanatory string */ +const char *y4m_strerr(int err); + +/* set 'allow_unknown_tag' flag for library... + o yn = 0 : unknown header tags will produce a parsing error + o yn = 1 : unknown header tags/values will produce a warning, but + are otherwise passed along via the xtags list + o yn = -1: don't change, just return current setting + + return value: previous setting of flag +*/ +int y4m_allow_unknown_tags(int yn); + + +#ifdef __cplusplus +} +#endif + +/************************************************************************ + ************************************************************************ + + Description of the (new!, forever?) YUV4MPEG2 stream format: + + STREAM consists of + o one '\n' terminated STREAM-HEADER + o unlimited number of FRAMEs + + FRAME consists of + o one '\n' terminated FRAME-HEADER + o "length" octets of planar YCrCb 4:2:0 image data + (if frame is interlaced, then the two fields are interleaved) + + + STREAM-HEADER consists of + o string "YUV4MPEG2 " (note the space after the '2') + o unlimited number of ' ' separated TAGGED-FIELDs + o '\n' line terminator + + FRAME-HEADER consists of + o string "FRAME " (note the space after the 'E') + o unlimited number of ' ' separated TAGGED-FIELDs + o '\n' line terminator + + + TAGGED-FIELD consists of + o single ascii character tag + o VALUE (which does not contain whitespace) + + VALUE consists of + o integer (base 10 ascii representation) + or o RATIO + or o single ascii character + or o generic ascii string + + RATIO consists of + o numerator (integer) + o ':' (a colon) + o denominator (integer) + + + The currently supported tags for the STREAM-HEADER: + W - [integer] frame width, pixels, should be > 0 + H - [integer] frame height, pixels, should be > 0 + I - [char] interlacing: p - progressive (none) + t - top-field-first + b - bottom-field-first + ? - unknown + F - [ratio] frame-rate, 0:0 == unknown + A - [ratio] sample (pixel) aspect ratio, 0:0 == unknown + X - [character string] 'metadata' (unparsed, but passed around) + + The currently supported tags for the FRAME-HEADER: + X - character string 'metadata' (unparsed, but passed around) + + ************************************************************************ + ************************************************************************/ + +#endif /* __YUV4MPEG_H__ */ + + diff --git a/veejay-current/utils/yuv4mpeg_intern.h b/veejay-current/utils/yuv4mpeg_intern.h new file mode 100644 index 00000000..140f9d62 --- /dev/null +++ b/veejay-current/utils/yuv4mpeg_intern.h @@ -0,0 +1,85 @@ +/* + * yuv4mpeg_intern.h: Internal constants for "new" YUV4MPEG streams + * + * Copyright (C) 2001 Andrew Stevens + * Copyright (C) 2001 Matthew J. Marjanovic + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef __YUV4MPEG_INTERN_H__ +#define __YUV4MPEG_INTERN_H__ + + +#define Y4M_MAGIC "YUV4MPEG2" +#define Y4M_FRAME_MAGIC "FRAME" + +#define Y4M_DELIM " " /* single-character(space) separating tagged fields */ + +#define Y4M_LINE_MAX 256 /* max number of characters in a header line + (including the '\n', but not the '\0') */ + + +/* standard framerate ratios */ +#define Y4M_FPS_UNKNOWN { 0, 0 } +#define Y4M_FPS_NTSC_FILM { 24000, 1001 } +#define Y4M_FPS_FILM { 24, 1 } +#define Y4M_FPS_PAL { 25, 1 } +#define Y4M_FPS_NTSC { 30000, 1001 } +#define Y4M_FPS_30 { 30, 1 } +#define Y4M_FPS_PAL_FIELD { 50, 1 } +#define Y4M_FPS_NTSC_FIELD { 60000, 1001 } +#define Y4M_FPS_60 { 60, 1 } + +/* standard sample/pixel aspect ratios */ +#define Y4M_SAR_UNKNOWN { 0, 0 } +#define Y4M_SAR_SQUARE { 1, 1 } +#define Y4M_SAR_SQR_ANA_16_9 { 4, 3 } +#define Y4M_SAR_NTSC_CCIR601 { 10, 11 } +#define Y4M_SAR_NTSC_16_9 { 40, 33 } +#define Y4M_SAR_NTSC_SVCD_4_3 { 15, 11 } +#define Y4M_SAR_NTSC_SVCD_16_9 { 20, 11 } +#define Y4M_SAR_PAL_CCIR601 { 59, 54 } +#define Y4M_SAR_PAL_16_9 { 118, 81 } +#define Y4M_SAR_PAL_SVCD_4_3 { 59, 36 } +#define Y4M_SAR_PAL_SVCD_16_9 { 59, 27 } + +#define Y4M_SAR_MPEG1_1 Y4M_SAR_SQUARE +#define Y4M_SAR_MPEG1_2 { 10000, 6735 } +#define Y4M_SAR_MPEG1_3 { 10000, 7031 } /* Anamorphic 16:9 PAL */ +#define Y4M_SAR_MPEG1_4 { 10000, 7615 } +#define Y4M_SAR_MPEG1_5 { 10000, 8055 } +#define Y4M_SAR_MPEG1_6 { 10000, 8437 } /* Anamorphic 16:9 NTSC */ +#define Y4M_SAR_MPEG1_7 { 10000, 8935 } +#define Y4M_SAR_MPEG1_8 { 10000, 9375 } /* PAL/SECAM 4:3 */ +#define Y4M_SAR_MPEG1_9 { 10000, 9815 } +#define Y4M_SAR_MPEG1_10 { 10000, 10255 } +#define Y4M_SAR_MPEG1_11 { 10000, 10695 } +#define Y4M_SAR_MPEG1_12 { 10000, 11250 } /* NTSC 4:3 */ +#define Y4M_SAR_MPEG1_13 { 10000, 11575 } +#define Y4M_SAR_MPEG1_14 { 10000, 12015 } + +#define Y4M_DAR_UNKNOWN { 0, 0 } +#define Y4M_DAR_4_3 { 4, 3 } +#define Y4M_DAR_16_9 { 16, 9 } +#define Y4M_DAR_221_100 { 221, 100 } + +#define Y4M_DAR_MPEG2_1 { 1, 1 } +#define Y4M_DAR_MPEG2_2 { 4, 3 } +#define Y4M_DAR_MPEG2_3 { 16, 9 } +#define Y4M_DAR_MPEG2_4 { 221, 100 } + +#endif /* __YUV4MPEG_INTERN_H__ */ + diff --git a/veejay-current/utils/yuv4mpeg_ratio.c b/veejay-current/utils/yuv4mpeg_ratio.c new file mode 100644 index 00000000..d2685709 --- /dev/null +++ b/veejay-current/utils/yuv4mpeg_ratio.c @@ -0,0 +1,156 @@ +/* + * yuv4mpeg_ratio.c: Functions for dealing with y4m_ratio_t datatype. + * + * Copyright (C) 2001 Matthew J. Marjanovic + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ + +#include + +#include +#include "yuv4mpeg.h" +#include "yuv4mpeg_intern.h" + + +/* useful list of standard framerates */ +const y4m_ratio_t y4m_fps_UNKNOWN = Y4M_FPS_UNKNOWN; +const y4m_ratio_t y4m_fps_NTSC_FILM = Y4M_FPS_NTSC_FILM; +const y4m_ratio_t y4m_fps_FILM = Y4M_FPS_FILM; +const y4m_ratio_t y4m_fps_PAL = Y4M_FPS_PAL; +const y4m_ratio_t y4m_fps_NTSC = Y4M_FPS_NTSC; +const y4m_ratio_t y4m_fps_30 = Y4M_FPS_30; +const y4m_ratio_t y4m_fps_PAL_FIELD = Y4M_FPS_PAL_FIELD; +const y4m_ratio_t y4m_fps_NTSC_FIELD = Y4M_FPS_NTSC_FIELD; +const y4m_ratio_t y4m_fps_60 = Y4M_FPS_60; + +/* useful list of standard sample aspect ratios */ +const y4m_ratio_t y4m_sar_UNKNOWN = Y4M_SAR_UNKNOWN; +const y4m_ratio_t y4m_sar_SQUARE = Y4M_SAR_SQUARE; +const y4m_ratio_t y4m_sar_SQR_ANA_16_9 = Y4M_SAR_SQR_ANA_16_9; +const y4m_ratio_t y4m_sar_NTSC_CCIR601 = Y4M_SAR_NTSC_CCIR601; +const y4m_ratio_t y4m_sar_NTSC_16_9 = Y4M_SAR_NTSC_16_9; +const y4m_ratio_t y4m_sar_NTSC_SVCD_4_3 = Y4M_SAR_NTSC_SVCD_4_3; +const y4m_ratio_t y4m_sar_NTSC_SVCD_16_9 = Y4M_SAR_NTSC_SVCD_16_9; +const y4m_ratio_t y4m_sar_PAL_CCIR601 = Y4M_SAR_PAL_CCIR601; +const y4m_ratio_t y4m_sar_PAL_16_9 = Y4M_SAR_PAL_16_9; +const y4m_ratio_t y4m_sar_PAL_SVCD_4_3 = Y4M_SAR_PAL_SVCD_4_3; +const y4m_ratio_t y4m_sar_PAL_SVCD_16_9 = Y4M_SAR_PAL_SVCD_16_9; + +/* useful list of standard display aspect ratios */ +const y4m_ratio_t y4m_dar_4_3 = Y4M_DAR_4_3; +const y4m_ratio_t y4m_dar_16_9 = Y4M_DAR_16_9; +const y4m_ratio_t y4m_dar_221_100 = Y4M_DAR_221_100; + +/* + * Euler's algorithm for greatest common divisor + */ + +static int gcd(int a, int b) +{ + a = (a >= 0) ? a : -a; + b = (b >= 0) ? b : -b; + + while (b > 0) { + int x = b; + b = a % b; + a = x; + } + return a; +} + + +/************************************************************************* + * + * Remove common factors from a ratio + * + *************************************************************************/ + + +void y4m_ratio_reduce(y4m_ratio_t *r) +{ + int d; + if ((r->n == 0) && (r->d == 0)) return; /* "unknown" */ + d = gcd(r->n, r->d); + r->n /= d; + r->d /= d; +} + + + +/************************************************************************* + * + * Parse "nnn:ddd" into a ratio + * + * returns: Y4M_OK - success + * Y4M_ERR_RANGE - range error + * + *************************************************************************/ + +int y4m_parse_ratio(y4m_ratio_t *r, const char *s) +{ + char *t = strchr(s, ':'); + if (t == NULL) return Y4M_ERR_RANGE; + r->n = atoi(s); + r->d = atoi(t+1); + if (r->d < 0) return Y4M_ERR_RANGE; + /* 0:0 == unknown, so that is ok, otherwise zero denominator is bad */ + if ((r->d == 0) && (r->n != 0)) return Y4M_ERR_RANGE; + y4m_ratio_reduce(r); + return Y4M_OK; +} + + + +/************************************************************************* + * + * Guess the true SAR (sample aspect ratio) from a list of commonly + * encountered values, given the "suggested" display aspect ratio, and + * the true frame width and height. + * + * Returns y4m_sar_UNKNOWN if no match is found. + * + *************************************************************************/ + +/* this is big enough to accommodate the difference between 720 and 704 */ +#define GUESS_ASPECT_TOLERANCE 0.03 + +y4m_ratio_t y4m_guess_sar(int width, int height, y4m_ratio_t dar) +{ + int i; + double implicit_sar = (double)(dar.n * height) / (double)(dar.d * width); + y4m_ratio_t sarray[] = + { + y4m_sar_SQUARE, + y4m_sar_NTSC_CCIR601, + y4m_sar_NTSC_16_9, + y4m_sar_NTSC_SVCD_4_3, + y4m_sar_NTSC_SVCD_16_9, + y4m_sar_PAL_CCIR601, + y4m_sar_PAL_16_9, + y4m_sar_PAL_SVCD_4_3, + y4m_sar_PAL_SVCD_16_9, + y4m_sar_UNKNOWN + }; + for (i = 0; !(Y4M_RATIO_EQL(sarray[i],y4m_sar_UNKNOWN)); i++) { + double ratio = implicit_sar / Y4M_RATIO_DBL(sarray[i]); + if ( (ratio > (1.0 - GUESS_ASPECT_TOLERANCE)) && + (ratio < (1.0 + GUESS_ASPECT_TOLERANCE)) ) + return sarray[i]; + } + return y4m_sar_UNKNOWN; +}