diff --git a/veejay-current/utils/Makefile.am b/veejay-current/utils/Makefile.am
new file mode 100644
index 00000000..9b604b11
--- /dev/null
+++ b/veejay-current/utils/Makefile.am
@@ -0,0 +1,41 @@
+# Process this file with Automake to produce Makefile.in
+
+
+
+INCLUDES = -I$(top_srcdir)
+
+
+noinst_LIBRARIES = libmotion.a libmjpegutils.a
+#lib_LIBRARIES = libmjpegutils.a
+
+libmotion_MMXSSE =  mblock_sumsq_mmx.s mblock_bsumsq_mmx.s mblock_bsad_mmx.s \
+                    mblock_sad_mmxe.s mblock_sad_mmx.s  \
+                    mblock_sub44_sads_x86.c attributes.h
+
+libmjpegutils_a_SOURCES = mjpeg_logging.c cpu_accel.c \
+                          mjpeg_logging.h mjpeg_types.h cpu_accel.h \
+                          yuv4mpeg.c yuv4mpeg_ratio.c \
+                          yuv4mpeg.h yuv4mpeg_intern.h \
+                          mpegconsts.c mpegconsts.h \
+                          mpegtimecode.c mpegtimecode.h
+
+#include_HEADERS = yuv4mpeg.h mpegconsts.h mjpeg_logging.h mjpeg_types.h mpegtimecode.h
+
+if HAVE_ASM_NASM
+if HAVE_ASM_MMX
+libmotion_SIMD = $(libmotion_MMXSSE)
+else
+mpeg2enc_SIMD = 
+endif
+else
+mpeg2enc_SIMD = 
+endif
+
+libmotion_a_SOURCES = motionsearch.c $(libmotion_SIMD) mmx.h motionsearch.h fastintfns.h
+
+EXTRA_DIST = videodev_mjpeg.h format_codes.h mblock_sub44_sads_x86_h.c mblock_sub44_sads_x86.h
+
+MAINTAINERCLEANFILES = Makefile.in
+
+%.o: %.s ; 	$(AS) $(ASFLAGS) -o $@ $<
+
diff --git a/veejay-current/utils/attributes.h b/veejay-current/utils/attributes.h
new file mode 100644
index 00000000..8ed7aef9
--- /dev/null
+++ b/veejay-current/utils/attributes.h
@@ -0,0 +1,27 @@
+/*
+ * attributes.h
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+//use gcc attribs to align critical data structures
+#ifdef ATTRIBUTE_ALIGNED_MAX
+#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < align) ? ATTRIBUTE_ALIGNED_MAX : align)))
+#else
+#define ATTR_ALIGN(align)
+#endif
diff --git a/veejay-current/utils/cpu_accel.c b/veejay-current/utils/cpu_accel.c
new file mode 100644
index 00000000..9528a674
--- /dev/null
+++ b/veejay-current/utils/cpu_accel.c
@@ -0,0 +1,161 @@
+/*
+ * cpu_accel.c
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <signal.h>
+#include <setjmp.h>
+#include "cpu_accel.h"
+
+#ifdef HAVE_X86CPU 
+
+#include "mjpeg_types.h"
+
+/* Some miscelaneous stuff to allow checking whether SSE instructions cause
+   illegal instruction errors.
+*/
+
+static sigjmp_buf sigill_recover;
+
+static RETSIGTYPE sigillhandler(int sig )
+{
+	siglongjmp( sigill_recover, 1 );
+}
+
+typedef RETSIGTYPE (*__sig_t)(int);
+
+static int testsseill()
+{
+	int illegal;
+#if defined(__CYGWIN__)
+	/* SSE causes a crash on CYGWIN, apparently.
+	   Perhaps the wrong signal is being caught or something along
+	   those line ;-) or maybe SSE itself won't work...
+	*/
+	illegal = 1;
+#else
+	__sig_t old_handler = signal( SIGILL, sigillhandler);
+	if( sigsetjmp( sigill_recover, 1 ) == 0 )
+	{
+		asm ( "movups %xmm0, %xmm0" );
+		illegal = 0;
+	}
+	else
+		illegal = 1;
+	signal( SIGILL, old_handler );
+#endif
+	return illegal;
+}
+
+static int x86_accel (void)
+{
+    int32_t eax, ebx, ecx, edx;
+    int32_t AMD;
+    int32_t caps;
+
+#define cpuid(op,eax,ebx,ecx,edx)	\
+    asm ("cpuid"			\
+	 : "=a" (eax),			\
+	   "=b" (ebx),			\
+	   "=c" (ecx),			\
+	   "=d" (edx)			\
+	 : "a" (op)			\
+	 : "cc")
+
+    asm ("pushfl\n\t"
+	 "popl %0\n\t"
+	 "movl %0,%1\n\t"
+	 "xorl $0x200000,%0\n\t"
+	 "pushl %0\n\t"
+	 "popfl\n\t"
+	 "pushfl\n\t"
+	 "popl %0"
+         : "=a" (eax),
+	       "=b" (ebx)
+	 :
+	 : "cc");
+
+
+    if (eax == ebx)		// no cpuid
+	return 0;
+
+    cpuid (0x00000000, eax, ebx, ecx, edx);
+    if (!eax)			// vendor string only
+	return 0;
+
+    AMD = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
+
+    cpuid (0x00000001, eax, ebx, ecx, edx);
+    if (! (edx & 0x00800000))	// no MMX
+	return 0;
+
+    caps = ACCEL_X86_MMX;
+    /* If SSE capable CPU has same MMX extensions as AMD
+	   and then some. However, to use SSE O.S. must have signalled
+	   it use of FXSAVE/FXRSTOR through CR4.OSFXSR and hence FXSR (bit 24)
+	   here
+	*/
+    if ((edx & 0x02000000))	
+		caps = ACCEL_X86_MMX | ACCEL_X86_MMXEXT;
+	if( (edx & 0x03000000) == 0x03000000 )
+	{
+		/* Check whether O.S. has SSE support... has to be done with
+		   exception 'cos those Intel morons put the relevant bit
+		   in a reg that is only accesible in ring 0... doh! 
+		*/
+		if( !testsseill() )
+			caps |= ACCEL_X86_SSE;
+	}
+
+    cpuid (0x80000000, eax, ebx, ecx, edx);
+    if (eax < 0x80000001)	// no extended capabilities
+		return caps;
+
+    cpuid (0x80000001, eax, ebx, ecx, edx);
+
+    if (edx & 0x80000000)
+	caps |= ACCEL_X86_3DNOW;
+
+    if (AMD && (edx & 0x00400000))	// AMD MMX extensions
+	{
+		caps |= ACCEL_X86_MMXEXT;
+	}
+
+    return caps;
+}
+#endif
+
+int cpu_accel (void)
+{
+#ifdef HAVE_X86CPU 
+    static int got_accel = 0;
+    static int accel;
+
+    if (!got_accel) {
+	got_accel = 1;
+	accel = x86_accel ();
+    }
+
+    return accel;
+#else
+    return 0;
+#endif
+}
diff --git a/veejay-current/utils/cpu_accel.h b/veejay-current/utils/cpu_accel.h
new file mode 100644
index 00000000..4cc5c23b
--- /dev/null
+++ b/veejay-current/utils/cpu_accel.h
@@ -0,0 +1,30 @@
+/*
+ * cpu_accel.h
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file was part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+
+
+// x86 accelerations
+#define ACCEL_X86_MMX	0x80000000
+#define ACCEL_X86_3DNOW	0x40000000
+#define ACCEL_X86_MMXEXT 0x20000000
+#define ACCEL_X86_SSE   0x10000000
+
+int cpu_accel (void);
diff --git a/veejay-current/utils/fastintfns.h b/veejay-current/utils/fastintfns.h
new file mode 100644
index 00000000..db78af1e
--- /dev/null
+++ b/veejay-current/utils/fastintfns.h
@@ -0,0 +1,32 @@
+/* fast int primitives. min,max,abs,samesign
+ *
+ * WARNING: Assumes 2's complement arithmetic.
+ *
+ */
+
+
+static __inline__ int intmax( register int x, register int y )
+{
+	return x < y ? y : x;
+}
+
+static __inline__ int intmin( register int x, register int y )
+{
+	return x < y ? x : y;
+}
+
+static __inline__ int intabs( register int x )
+{
+	return x < 0 ? -x : x;
+}
+
+#define fabsshift ((8*sizeof(unsigned int))-1)
+
+#define signmask(x) (((int)x)>>fabsshift)
+static __inline__ int intsamesign(int x, int y)
+{
+	return (y+(signmask(x) & -(y<<1)));
+}
+#undef signmask
+#undef fabsshift
+
diff --git a/veejay-current/utils/format_codes.h b/veejay-current/utils/format_codes.h
new file mode 100644
index 00000000..73d7afc7
--- /dev/null
+++ b/veejay-current/utils/format_codes.h
@@ -0,0 +1,38 @@
+/*
+    $Id: format_codes.h,v 1.1.1.1 2004/10/27 23:48:57 niels Exp $
+
+    Copyright (C) 2001 Andrew Stevens <andrew.stevens@planet-interkom.de>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#ifndef __FORMAT_CODES_H__
+#define __FORMAT_CODES_H__
+
+#define MPEG_FORMAT_MPEG1   0
+#define MPEG_FORMAT_VCD     1
+#define MPEG_FORMAT_VCD_NSR 2
+#define MPEG_FORMAT_MPEG2   3
+#define MPEG_FORMAT_SVCD     4
+#define MPEG_FORMAT_SVCD_NSR 5
+#define MPEG_FORMAT_VCD_STILL 6
+#define MPEG_FORMAT_SVCD_STILL 7
+#define MPEG_FORMAT_DVD      8
+
+#define MPEG_FORMAT_FIRST 0
+#define MPEG_FORMAT_LAST 8
+
+#define MPEG_STILLS_FORMAT(x) (x==MPEG_FORMAT_VCD_STILL||x==MPEG_FORMAT_SVCD_STILL)
+#endif /* __FORMAT_CODES_H__ */
diff --git a/veejay-current/utils/mblock_bsad_mmx.s b/veejay-current/utils/mblock_bsad_mmx.s
new file mode 100644
index 00000000..a5ffdcff
--- /dev/null
+++ b/veejay-current/utils/mblock_bsad_mmx.s
@@ -0,0 +1,329 @@
+;
+;  bdist1_mmx.s:  mmX optimized bidirectional absolute distance sum
+;
+;  Original believed to be Copyright (C) 2000 Brent Byeler
+;
+;  This program is free software; you can reaxstribute it and/or
+;  modify it under the terms of the GNU General Public License
+;  as published by the Free Software Foundation; either version 2
+;  of the License, or (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program; if not, write to the Free Software
+;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+;
+
+;/*
+; * absolute difference error between a (16*h) block and a bidirectional
+; * prediction
+; *
+; * p2: address of top left pel of block
+; * pf,hxf,hyf: address and half pel flags of forward ref. block
+; * pb,hxb,hyb: address and half pel flags of backward ref. block
+; * h: height of block
+; * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
+; * mmX version
+; */
+ 
+;int bsad_mmx(
+;unsigned char *pf, unsigned char *pb, unsigned char *p2,
+;int lx, int hxf, int hyf, int hxb, int hyb, int h)
+;{
+;  unsigned char *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
+
+; Handy macros for readbility
+
+%define pf [ebp+8]
+%define pb [ebp+12]
+%define p2 [ebp+16]
+%define lx [ebp+20]
+%define hxf [ebp+24]
+%define hyf [ebp+28]
+%define hxb [ebp+32]
+%define hyb [ebp+36]
+%define h   [ebp+40]
+
+
+%define pfa [esp+4]
+%define pfb [esp+8]
+%define pfc [esp+12]
+%define pba [esp+16]
+%define pbb [esp+20]
+%define pbc [esp+24]
+
+SECTION .text
+global bsad_mmx
+
+align 32
+bsad_mmx:
+	push ebp			; save frame pointer
+	mov ebp, esp		; link
+	push ebx
+	push ecx
+	push edx
+	push esi     
+	push edi
+
+	;;
+	;; Make space for local variables on stack
+	sub       esp, 32
+
+
+	mov       edx, hxb
+	mov       eax, hxf
+	mov       esi, lx
+
+	mov       ecx, pf
+	add       ecx, eax
+	mov       pfa, ecx
+	mov       ecx, esi
+	imul      ecx, hyf
+	mov       ebx, pf
+	add       ecx, ebx
+	mov       pfb, ecx
+	add       eax, ecx
+	mov       pfc, eax
+	mov       eax, pb
+	add       eax, edx
+	mov       pba, eax
+	mov       eax, esi
+	imul      eax, hyb
+	mov       ecx, pb
+	add       eax, ecx
+	mov       pbb, eax
+	add       edx, eax
+	mov       pbc, edx
+	xor       esi, esi         ; esi is "s" the accumulator
+	mov       eax, esi
+
+	mov       edi, h
+	test      edi, edi  ; h = 0?
+	jle       near bsadexit
+
+	pxor	  mm7, mm7
+	pxor	  mm6, mm6
+	pcmpeqw	  mm5, mm5
+	psubw	  mm6, mm5
+	psllw	  mm6, 1
+
+bsadtop:
+	mov	  eax, pf
+	mov	  ebx, pfa
+	mov	  ecx, pfb
+	mov	  edx, pfc
+	movq	  mm0, [eax]
+	movq	  mm1, mm0
+	punpcklbw mm0, mm7
+	punpckhbw mm1, mm7
+	movq	  mm2, [ebx]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	movq	  mm2, [ecx]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	movq	  mm2, [edx]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	paddw	  mm0, mm6
+	paddw	  mm1, mm6
+	psrlw	  mm0, 2
+	psrlw	  mm1, 2
+	mov	  eax, pb
+	mov	  ebx, pba
+	mov	  ecx, pbb
+	mov	  edx, pbc
+	movq	  mm2, [eax]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	movq	  mm4, [ebx]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+	movq	  mm4, [ecx]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+	movq	  mm4, [edx]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+	paddw	  mm2, mm6
+	paddw	  mm3, mm6
+	psrlw	  mm2, 2
+	psrlw	  mm3, 2
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	psrlw	  mm6, 1
+	paddw	  mm0, mm6
+	paddw	  mm1, mm6
+	psllw	  mm6, 1
+	psrlw	  mm0, 1
+	psrlw	  mm1, 1
+	packuswb  mm0, mm1
+
+	mov	  eax, p2
+	movq	  mm1, [eax]
+	movq	  mm2, mm0
+	psubusb	  mm0, mm1
+	psubusb	  mm1, mm2
+	por	  mm0, mm1
+	movq	  mm1, mm0
+	punpcklbw mm0, mm7
+	punpckhbw mm1, mm7
+	paddw	  mm0, mm1
+	movq	  mm1, mm0
+	punpcklwd mm0, mm7
+	punpckhwd mm1, mm7
+
+	paddd	  mm0, mm1
+	movd	  eax, mm0
+	psrlq	  mm0, 32
+	movd	  ebx, mm0
+	add	  esi, eax
+	add	  esi, ebx
+	mov	  eax, pf
+	mov	  ebx, pfa
+	mov	  ecx, pfb
+	mov	  edx, pfc
+	movq	  mm0, [eax+8]
+	movq	  mm1, mm0
+	punpcklbw mm0, mm7
+	punpckhbw mm1, mm7
+	movq	  mm2, [ebx+8]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	movq	  mm2, [ecx+8]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	movq	  mm2, [edx+8]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	paddw	  mm0, mm6
+	paddw	  mm1, mm6
+	psrlw	  mm0, 2
+	psrlw	  mm1, 2
+	mov	  eax, pb
+	mov	  ebx, pba
+	mov	  ecx, pbb
+	mov	  edx, pbc
+	movq	  mm2, [eax+8]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	movq	  mm4, [ebx+8]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+	movq	  mm4, [ecx+8]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+	movq	  mm4, [edx+8]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+	paddw	  mm2, mm6
+	paddw	  mm3, mm6
+	psrlw	  mm2, 2
+	psrlw	  mm3, 2
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	psrlw	  mm6, 1
+	paddW	  mm0, mm6
+	paddw	  mm1, mm6
+	psllw	  mm6, 1
+	psrlw	  mm0, 1
+	psrlw	  mm1, 1
+	packuswb  mm0, mm1
+	mov	  eax, p2
+	movq	  mm1, [eax+8]
+	movq	  mm2, mm0
+	psubusb	  mm0, mm1
+	psubusb	  mm1, mm2
+	por	  mm0, mm1
+	movq	  mm1, mm0
+	punpcklbw mm0, mm7
+	punpckhbw mm1, mm7
+	paddw	  mm0, mm1
+	movq	  mm1, mm0
+	punpcklwd mm0, mm7
+	punpckhwd mm1, mm7
+	paddd	  mm0, mm1
+	movd	  eax, mm0
+	psrlq	  mm0, 32
+	movd	  ebx, mm0
+	add	  esi, eax
+	add	  esi, ebx
+
+    mov       eax, lx
+	add       p2, eax
+	add       pf, eax
+	add       pfa, eax
+	add       pfb, eax
+	add       pfc, eax
+	add       pb, eax
+	add       pba, eax
+	add       pbb, eax
+	add       pbc, eax
+
+	dec       edi
+	jg        near bsadtop
+    mov       eax, esi
+
+bsadexit:
+ 
+ 	
+	;;
+	;; Get rid of local variables
+	add esp, 32
+	
+	;; Retore (callee saves convention...)
+	;;
+	pop edi
+	pop esi
+	pop edx
+	pop ecx
+	pop ebx
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret	
+	
+
diff --git a/veejay-current/utils/mblock_bsumsq_mmx.s b/veejay-current/utils/mblock_bsumsq_mmx.s
new file mode 100644
index 00000000..7cec9a4b
--- /dev/null
+++ b/veejay-current/utils/mblock_bsumsq_mmx.s
@@ -0,0 +1,327 @@
+;
+;  bdist2_mmx.s:  MMX optimized bidirectional squared distance sum
+;
+;  Original believed to be Copyright (C) 2000 Brent Byeler
+;
+;  This program is free software; you can reaxstribute it and/or
+;  modify it under the terms of the GNU General Public License
+;  as published by the Free Software Foundation; either version 2
+;  of the License, or (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program; if not, write to the Free Software
+;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+;
+
+;/*
+; * squared error between a (16*h) block and a bidirectional
+; * prediction
+; *
+; * p2: address of top left pel of block
+; * pf,hxf,hyf: address and half pel flags of forward ref. block
+; * pb,hxb,hyb: address and half pel flags of backward ref. block
+; * h: height of block
+; * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
+; * mmX version
+; */
+
+;int bsumsq_mmx(
+;unsigned char *pf, unsigned char *pb, unsigned char *p2,
+;int lx, int hxf, int hyf, int hxb, int hyb, int h)
+;{
+;  unsigned char *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
+;  int s;
+
+; Handy macros for readbility
+
+%define pf [ebp+8]
+%define pb [ebp+12]
+%define p2 [ebp+16]
+%define lx [ebp+20]
+%define hxf [ebp+24]
+%define hyf [ebp+28]
+%define hxb [ebp+32]
+%define hyb [ebp+36]
+%define h   [ebp+40]
+
+
+%define pfa [esp+4]
+%define pfb [esp+8]
+%define pfc [esp+12]
+%define pba [esp+16]
+%define pbb [esp+20]
+%define pbc [esp+24]
+
+SECTION .text
+global bsumsq_mmx
+
+align 32
+bsumsq_mmx:
+	push ebp			; save frame pointer
+	mov ebp, esp		; link
+	push ebx
+	push ecx
+	push edx
+	push esi     
+	push edi
+
+	;;
+	;; Make space for local variables on stack
+	sub       esp, 32
+	
+	mov       edx, hxb
+	mov       eax, hxf
+	mov       esi, lx
+
+	mov       ecx, pf
+	add       ecx, eax
+	mov       pfa, ecx
+	mov       ecx, esi
+	imul      ecx, hyf
+	mov       ebx, pf
+	add       ecx, ebx
+	mov       pfb, ecx
+	add       eax, ecx
+	mov       pfc, eax
+	mov       eax, pb
+	add       eax, edx
+	mov       pba, eax
+	mov       eax, esi
+	imul      eax, hyb
+	mov       ecx, pb
+	add       eax, ecx
+	mov       pbb, eax
+	add       edx, eax
+	mov       pbc, edx
+	xor       esi, esi	; esi = s (accumulated sym)
+	mov       eax, esi
+
+	mov       edi, h
+	test      edi, edi  ; h = 0?
+	jle       near bsumsqexit
+
+	pxor	  mm7, mm7
+	pxor	  mm6, mm6
+	pcmpeqw	  mm5, mm5
+	psubw	  mm6, mm5
+	psllw	  mm6, 1
+
+bsumsqtop:
+	mov	  eax, pf
+	mov	  ebx, pfa
+	mov	  ecx, pfb
+	mov	  edx, pfc
+	movq	  mm0, [eax]
+	movq	  mm1, mm0
+	punpcklbw mm0, mm7
+	punpckhbw mm1, mm7
+	movq	  mm2, [ebx]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	movq	  mm2, [ecx]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	movq	  mm2, [edx]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	paddw	  mm0, mm6
+	paddw	  mm1, mm6
+	psrlw	  mm0, 2
+	psrlw	  mm1, 2
+
+	mov	  eax, pb
+	mov	  ebx, pba
+	mov	  ecx, pbb
+	mov	  edx, pbc
+	movq	  mm2, [eax]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	movq	  mm4, [ebx]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+	movq	  mm4, [ecx]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+	movq	  mm4, [edx]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+
+	paddw	  mm2, mm6
+	paddw	  mm3, mm6
+	psrlw	  mm2, 2
+	psrlw	  mm3, 2
+
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	psrlw	  mm6, 1
+	paddw	  mm0, mm6
+	paddw	  mm1, mm6
+	psllw	  mm6, 1
+	psrlw	  mm0, 1
+	psrlw	  mm1, 1
+
+	mov	  eax, p2
+	movq	  mm2, [eax]
+	movq	  mm3, mm2
+        punpcklbw mm2, mm7
+        punpckhbw mm3, mm7
+
+        psubw     mm0, mm2
+        psubw     mm1, mm3
+        pmaddwd   mm0, mm0
+        pmaddwd   mm1, mm1
+        paddd     mm0, mm1
+
+	movd	  eax, mm0
+	psrlq	  mm0, 32
+	movd	  ebx, mm0
+	add	  esi, eax
+	add	  esi, ebx
+
+	mov	  eax, pf
+	mov	  ebx, pfa
+	mov	  ecx, pfb
+	mov	  edx, pfc
+	movq	  mm0, [eax+8]
+	movq	  mm1, mm0
+	punpcklbw mm0, mm7
+	punpckhbw mm1, mm7
+	movq	  mm2, [ebx+8]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	movq	  mm2, [ecx+8]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	movq	  mm2, [edx+8]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	paddw	  mm0, mm6
+	paddw	  mm1, mm6
+	psrlw	  mm0, 2
+	psrlw	  mm1, 2
+
+	mov	  eax, pb
+	mov	  ebx, pba
+	mov	  ecx, pbb
+	mov	  edx, pbc
+	movq	  mm2, [eax+8]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	movq	  mm4, [ebx+8]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+	movq	  mm4, [ecx+8]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+	movq	  mm4, [edx+8]
+	movq	  mm5, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm5, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm5
+	paddw	  mm2, mm6
+	paddw	  mm3, mm6
+	psrlw	  mm2, 2
+	psrlw	  mm3, 2
+
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	psrlw	  mm6, 1
+	paddW	  mm0, mm6
+	paddw	  mm1, mm6
+	psllw	  mm6, 1
+	psrlw	  mm0, 1
+	psrlw	  mm1, 1
+
+	mov	  eax, p2
+	movq	  mm2, [eax+8]
+	movq	  mm3, mm2
+        punpcklbw mm2, mm7
+        punpckhbw mm3, mm7
+
+        psubw     mm0, mm2
+        psubw     mm1, mm3
+        pmaddwd   mm0, mm0
+        pmaddwd   mm1, mm1
+        paddd     mm0, mm1
+
+	movd	  eax, mm0
+	psrlq	  mm0, 32
+	movd	  ebx, mm0
+	add	  esi, eax
+	add	  esi, ebx
+
+    mov       eax, lx
+	add       p2, eax
+	add       pf, eax
+	add       pfa, eax
+	add       pfb, eax
+	add       pfc, eax
+	add       pb, eax
+	add       pba, eax
+	add       pbb, eax
+	add       pbc, eax
+
+	dec       edi
+	jg        near bsumsqtop
+    mov       eax, esi
+
+bsumsqexit:
+	
+	;;
+	;; Get rid of local variables
+	add esp, 32
+	
+	;; Retore (callee saves convention...)
+	;;
+	pop edi
+	pop esi
+	pop edx
+	pop ecx
+	pop ebx
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret	
+	
diff --git a/veejay-current/utils/mblock_sad_mmx.s b/veejay-current/utils/mblock_sad_mmx.s
new file mode 100644
index 00000000..c89c636b
--- /dev/null
+++ b/veejay-current/utils/mblock_sad_mmx.s
@@ -0,0 +1,1094 @@
+;;; 
+;;;  mblock_sad_mmxe.s:  
+;;; 
+;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
+;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
+;
+;  sad_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
+;  Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+
+;
+;  This program is free software; you can redistribute it and/or
+;  modify it under the terms of the GNU General Public License
+;  as published by the Free Software Foundation; either version 2
+;  of the License, or (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program; if not, write to the Free Software
+;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+;
+;
+;
+
+SECTION .text
+
+global sad_00_mmx
+
+; int sad_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h, int distlim);
+; N.b. distlim is *ignored* as testing for it is more expensive than the
+; occasional saving by aborting the computionation early...
+; esi = p1 (init:		blk1)
+; edi = p2 (init:		blk2)
+; ebx = distlim  
+; ecx = rowsleft (init:	 h)
+; edx = lx;
+
+; mm0 = distance accumulators (4 words)
+; mm1 = temp 
+; mm2 = temp 
+; mm3 = temp
+; mm4 = temp
+; mm5 = temp 
+; mm6 = 0
+; mm7 = temp
+
+
+align 32
+sad_00_mmx:
+	push ebp		; save frame pointer
+	mov ebp, esp
+
+	push ebx		; Saves registers (called saves convention in
+	push ecx		; x86 GCC it seems)
+	push edx		; 
+	push esi
+	push edi
+		
+	pxor mm0, mm0				; zero acculumators
+	pxor mm6, mm6
+	mov esi, [ebp+8]			; get p1
+	mov edi, [ebp+12]			; get p2
+	mov edx, [ebp+16]			; get lx
+	mov  ecx, [ebp+20]			; get rowsleft
+	;mov ebx, [ebp+24]	        ; distlim
+	jmp nextrowmm00
+align 32
+nextrowmm00:
+	movq mm4, [esi]		; load first 8 bytes of p1 row 
+	movq mm5, [edi]		; load first 8 bytes of p2 row
+		
+	movq mm7, mm4       ; mm5 = abs(mm4-mm5)
+	psubusb mm7, mm5
+	psubusb mm5, mm4
+	paddb   mm5, mm7
+
+	;;  Add the abs(mm4-mm5) bytes to the accumulators
+	movq mm2, [esi+8]		; load second 8 bytes of p1 row (interleaved)
+	movq  mm7,mm5				; mm7 := [i :	B0..3, mm1]W
+	punpcklbw mm7,mm6
+	movq mm3, [edi+8]		; load second 8 bytes of p2 row (interleaved)
+	paddw mm0, mm7
+	punpckhbw mm5,mm6
+	paddw mm0, mm5
+
+		;; This is logically where the mm2, mm3 loads would go...
+		
+	movq mm7, mm2       ; mm3 = abs(mm2-mm3)
+	psubusb mm7, mm3
+	psubusb mm3, mm2
+	paddb   mm3, mm7
+
+	;;  Add the abs(mm4-mm5) bytes to the accumulators
+	movq  mm7,mm3				; mm7 := [i :	B0..3, mm1]W
+	punpcklbw mm7,mm6
+	punpckhbw mm3,mm6
+	paddw mm0, mm7
+	
+	add esi, edx		; update pointer to next row
+	add edi, edx		; ditto	
+
+	paddw mm0, mm3
+
+
+
+	sub  ecx,1
+	jnz near nextrowmm00
+		
+returnmm00:	
+
+		;; Sum the Accumulators
+	movq  mm5, mm0				; mm5 := [W0+W2,W1+W3, mm0
+	psrlq mm5, 32
+	movq  mm4, mm0
+	paddw mm4, mm5
+
+	movq  mm7, mm4              ; mm6 := [W0+W2+W1+W3, mm0]
+	psrlq mm7, 16
+	paddw mm4, mm7
+	movd eax, mm4		; store return value
+	and  eax, 0xffff
+
+	pop edi
+	pop esi	
+	pop edx	
+	pop ecx	
+	pop ebx	
+
+	pop ebp	
+
+	emms			; clear mmx registers
+	ret	
+;
+;;;  sad_01_mmx.s:  mmx1 optimised 7bit*8 word absolute difference sum
+;;;     We're reduce to seven bits as otherwise we also have to mess
+;;;     horribly with carries and signed only comparisons make the code
+;;;     simply enormous (and probably barely faster than a simple loop).
+;;;     Since signals with a bona-fide 8bit res will be rare we simply
+;;;     take the precision hit...
+;;;     Actually we don't worry about carries from the low-order bits
+;;;     either so 1/4 of the time we'll be 1 too low...
+;;; 
+;  Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+
+;
+;  This program is free software; you can redistribute it and/or
+;  modify it under the terms of the GNU General Public License
+;  as published by the Free Software Foundation; either version 2
+;  of the License, or (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program; if not, write to the Free Software
+;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+;
+;
+;
+
+
+global sad_01_mmx
+
+; int sad_01_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
+
+; esi = p1 (init:		blk1)
+; edi = p2 (init:		blk2)
+; ecx = rowsleft (init:	 h)
+; edx = lx;
+
+; mm0 = distance accumulators (4 words)
+; mm1 = bytes p2
+; mm2 = bytes p1
+; mm3 = bytes p1+1
+; mm4 = temp 4 bytes in words interpolating p1, p1+1
+; mm5 = temp 4 bytes in words from p2
+; mm6 = temp comparison bit mask p1,p2
+; mm7 = temp comparison bit mask p2,p1
+
+
+align 32
+sad_01_mmx:
+	push ebp		; save stack pointer
+	mov ebp, esp	; so that we can do this
+
+	push ebx		; Saves registers (called saves convention in
+	push ecx		; x86 GCC it seems)
+	push edx		; 
+	push esi
+	push edi
+		
+	pxor mm0, mm0				; zero acculumators
+
+	mov esi, [ebp+8]			; get p1
+	mov edi, [ebp+12]			; get p2
+	mov edx, [ebp+16]			; get lx
+	mov ecx, [ebp+20]			; rowsleft := h
+	jmp nextrowmm01					; snap to it
+align 32
+nextrowmm01:
+
+		;; 
+		;; First 8 bytes of row
+		;; 
+		
+		;; First 4 bytes of 8
+
+	movq mm4, [esi]             ; mm4 := first 4 bytes p1
+	pxor mm7, mm7
+	movq mm2, mm4				;  mm2 records all 8 bytes
+	punpcklbw mm4, mm7            ;  First 4 bytes p1 in Words...
+	
+	movq mm6, [esi+1]			;  mm6 := first 4 bytes p1+1
+	movq mm3, mm6               ;  mm3 records all 8 bytes
+	punpcklbw mm6, mm7
+	paddw mm4, mm6              ;  mm4 := First 4 bytes interpolated in words
+	psrlw mm4, 1
+		
+	movq mm5, [edi]				; mm5:=first 4 bytes of p2 in words
+	movq mm1, mm5
+	punpcklbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+		;; Second 4 bytes of 8
+	
+	movq mm4, mm2		    ; mm4 := Second 4 bytes p1 in words
+	pxor  mm7, mm7
+	punpckhbw mm4, mm7			
+	movq mm6, mm3			; mm6 := Second 4 bytes p1+1 in words  
+	punpckhbw mm6, mm7
+		
+	paddw mm4, mm6          ;  mm4 := First 4 Interpolated bytes in words
+	psrlw mm4, 1
+
+	movq mm5, mm1			; mm5:= second 4 bytes of p2 in words
+	punpckhbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+
+ 		;;
+		;; Second 8 bytes of row
+		;; 
+		;; First 4 bytes of 8
+
+	movq mm4, [esi+8]             ; mm4 := first 4 bytes p1+8
+	pxor mm7, mm7
+	movq mm2, mm4				;  mm2 records all 8 bytes
+	punpcklbw mm4, mm7            ;  First 4 bytes p1 in Words...
+	
+	movq mm6, [esi+9]			;  mm6 := first 4 bytes p1+9
+	movq mm3, mm6               ;  mm3 records all 8 bytes
+	punpcklbw mm6, mm7
+	paddw mm4, mm6              ;  mm4 := First 4 bytes interpolated in words
+	psrlw mm4, 1
+		
+	movq mm5, [edi+8]				; mm5:=first 4 bytes of p2+8 in words
+	movq mm1, mm5
+	punpcklbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+		;; Second 4 bytes of 8
+	
+	movq mm4, mm2		    ; mm4 := Second 4 bytes p1 in words
+	pxor  mm7, mm7
+	punpckhbw mm4, mm7			
+	movq mm6, mm3			; mm6 := Second 4 bytes p1+1 in words  
+	punpckhbw mm6, mm7
+		
+	paddw mm4, mm6          ;  mm4 := First 4 Interpolated bytes in words
+	psrlw mm4, 1
+
+	movq mm5, mm1			; mm5:= second 4 bytes of p2 in words
+	punpckhbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+
+		;;
+		;;	Loop termination condition... and stepping
+		;;		
+
+	add esi, edx		; update pointer to next row
+	add edi, edx		; ditto
+			
+	sub  ecx,1
+	test ecx, ecx		; check rowsleft
+	jnz near nextrowmm01
+		
+
+		;; Sum the Accumulators
+	movq  mm4, mm0
+	psrlq mm4, 32
+	paddw mm0, mm4
+	movq  mm6, mm0
+	psrlq mm6, 16
+	paddw mm0, mm6
+	movd eax, mm0		; store return value
+	and  eax, 0xffff
+
+	pop edi
+	pop esi	
+	pop edx			
+	pop ecx			
+	pop ebx			
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret			; we now return you to your regular programming
+;
+;;;  sad_01_mmx.s:  mmx1 optimised 7bit*8 word absolute difference sum
+;;;     We're reduce to seven bits as otherwise we also have to mess
+;;;     horribly with carries and signed only comparisons make the code
+;;;     simply enormous (and probably barely faster than a simple loop).
+;;;     Since signals with a bona-fide 8bit res will be rare we simply
+;;;     take the precision hit...
+;;;     Actually we don't worry about carries from the low-order bits
+;;;     either so 1/4 of the time we'll be 1 too low...
+;;; 
+;  Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+
+;
+;  This program is free software; you can redistribute it and/or
+;  modify it under the terms of the GNU General Public License
+;  as published by the Free Software Foundation; either version 2
+;  of the License, or (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program; if not, write to the Free Software
+;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+;
+;
+;
+
+
+global sad_10_mmx
+
+; int sad_10_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
+
+; esi = p1 (init:		blk1)
+; edi = p2 (init:		blk2)
+; ebx = p1+lx
+; ecx = rowsleft (init:	 h)
+; edx = lx;
+
+; mm0 = distance accumulators (4 words)
+; mm1 = bytes p2
+; mm2 = bytes p1
+; mm3 = bytes p1+1
+; mm4 = temp 4 bytes in words interpolating p1, p1+1
+; mm5 = temp 4 bytes in words from p2
+; mm6 = temp comparison bit mask p1,p2
+; mm7 = temp comparison bit mask p2,p1
+
+
+align 32
+sad_10_mmx:
+	push ebp		; save stack pointer
+	mov ebp, esp	; so that we can do this
+
+	push ebx		; Saves registers (called saves convention in
+	push ecx		; x86 GCC it seems)
+	push edx		; 
+	push esi
+	push edi
+		
+	pxor mm0, mm0				; zero acculumators
+
+	mov esi, [ebp+8]			; get p1
+	mov edi, [ebp+12]			; get p2
+	mov edx, [ebp+16]			; get lx
+	mov ecx, [ebp+20]			; rowsleft := h
+	mov ebx, esi
+    add ebx, edx		
+	jmp nextrowmm10					; snap to it
+align 32
+nextrowmm10:
+
+		;; 
+		;; First 8 bytes of row
+		;; 
+		
+		;; First 4 bytes of 8
+
+	movq mm4, [esi]             ; mm4 := first 4 bytes p1
+	pxor mm7, mm7
+	movq mm2, mm4				;  mm2 records all 8 bytes
+	punpcklbw mm4, mm7            ;  First 4 bytes p1 in Words...
+	
+	movq mm6, [ebx]			    ;  mm6 := first 4 bytes p1+lx
+	movq mm3, mm6               ;  mm3 records all 8 bytes
+	punpcklbw mm6, mm7
+	paddw mm4, mm6              ;  mm4 := First 4 bytes interpolated in words
+	psrlw mm4, 1
+		
+	movq mm5, [edi]				; mm5:=first 4 bytes of p2 in words
+	movq mm1, mm5
+	punpcklbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+		;; Second 4 bytes of 8
+	
+	movq mm4, mm2		    ; mm4 := Second 4 bytes p1 in words
+	pxor  mm7, mm7
+	punpckhbw mm4, mm7			
+	movq mm6, mm3			; mm6 := Second 4 bytes p1+1 in words  
+	punpckhbw mm6, mm7
+		
+	paddw mm4, mm6          ;  mm4 := First 4 Interpolated bytes in words
+	psrlw mm4, 1
+
+	movq mm5, mm1			; mm5:= second 4 bytes of p2 in words
+	punpckhbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+
+ 		;;
+		;; Second 8 bytes of row
+		;; 
+		;; First 4 bytes of 8
+
+	movq mm4, [esi+8]             ; mm4 := first 4 bytes p1+8
+	pxor mm7, mm7
+	movq mm2, mm4				;  mm2 records all 8 bytes
+	punpcklbw mm4, mm7            ;  First 4 bytes p1 in Words...
+	
+	movq mm6, [ebx+8]			;  mm6 := first 4 bytes p1+lx+8
+	movq mm3, mm6               ;  mm3 records all 8 bytes
+	punpcklbw mm6, mm7
+	paddw mm4, mm6              ;  mm4 := First 4 bytes interpolated in words
+	psrlw mm4, 1
+		
+	movq mm5, [edi+8]				; mm5:=first 4 bytes of p2+8 in words
+	movq mm1, mm5
+	punpcklbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+		;; Second 4 bytes of 8
+	
+	movq mm4, mm2		    ; mm4 := Second 4 bytes p1 in words
+	pxor  mm7, mm7
+	punpckhbw mm4, mm7			
+	movq mm6, mm3			; mm6 := Second 4 bytes p1+1 in words  
+	punpckhbw mm6, mm7
+		
+	paddw mm4, mm6          ;  mm4 := First 4 Interpolated bytes in words
+	psrlw mm4, 1
+
+	movq mm5, mm1			; mm5:= second 4 bytes of p2 in words
+	punpckhbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+
+		;;
+		;;	Loop termination condition... and stepping
+		;;		
+
+	add esi, edx		; update pointer to next row
+	add edi, edx		; ditto
+	add ebx, edx
+
+	sub  ecx,1
+	test ecx, ecx		; check rowsleft
+	jnz near nextrowmm10
+		
+		;; Sum the Accumulators
+	movq  mm4, mm0
+	psrlq mm4, 32
+	paddw mm0, mm4
+	movq  mm6, mm0
+	psrlq mm6, 16
+	paddw mm0, mm6
+	movd eax, mm0		; store return value
+	and  eax, 0xffff
+
+		
+	pop edi
+	pop esi	
+	pop edx			
+	pop ecx			
+	pop ebx			
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret			; we now return you to your regular programming
+;
+;;;  sad_01_mmx.s:  
+;;; 
+;  Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+
+;
+;  This program is free software; you can redistribute it and/or
+;  modify it under the terms of the GNU General Public License
+;  as published by the Free Software Foundation; either version 2
+;  of the License, or (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program; if not, write to the Free Software
+;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+;
+;
+;
+
+
+global sad_11_mmx
+
+; int sad_11_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
+
+; esi = p1 (init:		blk1)
+; edi = p2 (init:		blk2)
+; ebx = p1+lx
+; ecx = rowsleft (init:	 h)
+; edx = lx;
+
+; mm0 = distance accumulators (4 words)
+; mm1 = bytes p2
+; mm2 = bytes p1
+; mm3 = bytes p1+lx
+; I'd love to find someplace to stash p1+1 and p1+lx+1's bytes
+; but I don't think thats going to happen in iA32-land...
+; mm4 = temp 4 bytes in words interpolating p1, p1+1
+; mm5 = temp 4 bytes in words from p2
+; mm6 = temp comparison bit mask p1,p2
+; mm7 = temp comparison bit mask p2,p1
+
+
+align 32
+sad_11_mmx:
+	push ebp		; save stack pointer
+	mov ebp, esp	; so that we can do this
+
+	push ebx		; Saves registers (called saves convention in
+	push ecx		; x86 GCC it seems)
+	push edx		; 
+	push esi
+	push edi
+		
+	pxor mm0, mm0				; zero acculumators
+
+	mov esi, [ebp+8]			; get p1
+	mov edi, [ebp+12]			; get p2
+	mov edx, [ebp+16]			; get lx
+	mov ecx, [ebp+20]			; rowsleft := h
+	mov ebx, esi
+    add ebx, edx		
+	jmp nextrowmm11					; snap to it
+align 32
+nextrowmm11:
+
+		;; 
+		;; First 8 bytes of row
+		;; 
+		
+		;; First 4 bytes of 8
+
+	movq mm4, [esi]             ; mm4 := first 4 bytes p1
+	pxor mm7, mm7
+	movq mm2, mm4				;  mm2 records all 8 bytes
+	punpcklbw mm4, mm7            ;  First 4 bytes p1 in Words...
+	
+	movq mm6, [ebx]			    ;  mm6 := first 4 bytes p1+lx
+	movq mm3, mm6               ;  mm3 records all 8 bytes
+	punpcklbw mm6, mm7
+	paddw mm4, mm6              
+
+
+	movq mm5, [esi+1]			; mm5 := first 4 bytes p1+1
+	punpcklbw mm5, mm7            ;  First 4 bytes p1 in Words...
+	paddw mm4, mm5		
+	movq mm6, [ebx+1]           ;  mm6 := first 4 bytes p1+lx+1
+	punpcklbw mm6, mm7
+	paddw mm4, mm6
+
+	psrlw mm4, 2	            ; mm4 := First 4 bytes interpolated in words
+		
+	movq mm5, [edi]				; mm5:=first 4 bytes of p2 in words
+	movq mm1, mm5
+	punpcklbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+		;; Second 4 bytes of 8
+	
+	movq mm4, mm2		    ; mm4 := Second 4 bytes p1 in words
+	pxor  mm7, mm7
+	punpckhbw mm4, mm7			
+	movq mm6, mm3			; mm6 := Second 4 bytes p1+1 in words  
+	punpckhbw mm6, mm7
+	paddw mm4, mm6          
+
+	movq mm5, [esi+1]			; mm5 := first 4 bytes p1+1
+	punpckhbw mm5, mm7          ;  First 4 bytes p1 in Words...
+	paddw mm4, mm5
+	movq mm6, [ebx+1]           ;  mm6 := first 4 bytes p1+lx+1
+	punpckhbw mm6, mm7
+	paddw mm4, mm6
+
+	psrlw mm4, 2	            ; mm4 := First 4 bytes interpolated in words
+		
+	movq mm5, mm1			; mm5:= second 4 bytes of p2 in words
+	punpckhbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+
+ 		;;
+		;; Second 8 bytes of row
+		;; 
+		;; First 4 bytes of 8
+
+	movq mm4, [esi+8]             ; mm4 := first 4 bytes p1+8
+	pxor mm7, mm7
+	movq mm2, mm4				;  mm2 records all 8 bytes
+	punpcklbw mm4, mm7            ;  First 4 bytes p1 in Words...
+	
+	movq mm6, [ebx+8]			    ;  mm6 := first 4 bytes p1+lx+8
+	movq mm3, mm6               ;  mm3 records all 8 bytes
+	punpcklbw mm6, mm7
+	paddw mm4, mm6              
+
+
+	movq mm5, [esi+9]			; mm5 := first 4 bytes p1+9
+	punpcklbw mm5, mm7            ;  First 4 bytes p1 in Words...
+	paddw mm4, mm5
+	movq mm6, [ebx+9]           ;  mm6 := first 4 bytes p1+lx+9
+	punpcklbw mm6, mm7
+	paddw mm4, mm6
+
+	psrlw mm4, 2	            ; mm4 := First 4 bytes interpolated in words
+		
+	movq mm5, [edi+8]				; mm5:=first 4 bytes of p2+8 in words
+	movq mm1, mm5
+	punpcklbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+		;; Second 4 bytes of 8
+	
+	movq mm4, mm2		    ; mm4 := Second 4 bytes p1 in words
+	pxor  mm7, mm7
+	punpckhbw mm4, mm7			
+	movq mm6, mm3			; mm6 := Second 4 bytes p1+1 in words  
+	punpckhbw mm6, mm7
+	paddw mm4, mm6          
+
+	movq mm5, [esi+9]			; mm5 := first 4 bytes p1+1
+	punpckhbw mm5, mm7          ;  First 4 bytes p1 in Words...
+	paddw mm4, mm5	
+	movq mm6, [ebx+9]           ;  mm6 := first 4 bytes p1+lx+1
+	punpckhbw mm6, mm7
+	paddw mm4, mm6
+		
+	psrlw mm4, 2	            ; mm4 := First 4 bytes interpolated in words
+
+	movq mm5, mm1			; mm5:= second 4 bytes of p2 in words
+	punpckhbw mm5, mm7
+			
+	movq  mm7,mm4
+	pcmpgtw mm7,mm5		; mm7 := [i : W0..3,mm4>mm5]
+
+	movq  mm6,mm4		; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ 	psubw mm6,mm5
+	pand  mm6, mm7
+
+	paddw mm0, mm6				; Add to accumulator
+
+	movq  mm6,mm5       ; mm6 := [i : W0..3,mm5>mm4]
+	pcmpgtw mm6,mm4	    
+ 	psubw mm5,mm4		; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+	pand  mm5, mm6		
+
+	paddw mm0, mm5				; Add to accumulator
+
+
+		;;
+		;;	Loop termination condition... and stepping
+		;;		
+
+	add esi, edx		; update pointer to next row
+	add edi, edx		; ditto
+	add ebx, edx
+
+	sub  ecx,1
+	test ecx, ecx		; check rowsleft
+	jnz near nextrowmm11
+		
+		;; Sum the Accumulators
+	movq  mm4, mm0
+	psrlq mm4, 32
+	paddw mm0, mm4
+	movq  mm6, mm0
+	psrlq mm6, 16
+	paddw mm0, mm6
+	movd eax, mm0		; store return value
+	and  eax, 0xffff
+		
+	pop edi
+	pop esi	
+	pop edx			
+	pop ecx			
+	pop ebx			
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret			; we now return you to your regular programming
+
+
+global sad_sub22_mmx
+
+; int sad_sub22_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h);
+
+; eax = p1 (init:		blk1)
+; ebx = p2 (init:		 blk2)
+; ecx = rowsleft (init:	 h)
+; edx = lx;
+
+; mm0 = distance accumulators (4 words)
+; mm1 = temp 
+; mm2 = temp 
+; mm3 = temp
+; mm4 = temp
+; mm5 = temp 
+; mm6 = 0
+; mm7 = temp
+
+
+align 32
+sad_sub22_mmx:
+	push ebp		; save stack pointer
+	mov ebp, esp	; so that we can do this
+
+	push ebx		; Saves registers (called saves convention in
+	push ecx		; x86 GCC it seems)
+	push edx		; 
+
+	pxor mm0, mm0				; zero acculumators
+	pxor mm6, mm6
+	mov eax, [ebp+8]			; get p1
+	mov ebx, [ebp+12]			; get p2
+	mov edx, [ebp+16]			; get lx
+
+	mov  ecx, [ebp+20]			; get rowsleft
+
+	jmp nextrow					; snap to it
+align 32
+nextrow:
+	movq mm4, [eax]		; load 8 bytes of p1 
+	movq mm5, [ebx]		; load 8 bytes of p2
+		
+	movq mm7, mm4       ; mm5 = abs(*p1-*p2)
+	psubusb mm7, mm5
+	psubusb mm5, mm4
+	add eax, edx		; update pointer to next row
+	paddb   mm5,mm7	
+
+		;;  Add the mm5 bytes to the accumulatores
+	movq  mm7,mm5			
+	punpcklbw mm7,mm6
+	paddw mm0, mm7
+	punpckhbw mm5,mm6
+	add ebx, edx		; update pointer to next row
+	paddw mm0, mm5
+	
+	movq mm4, [eax]		; load 8 bytes of p1 (next row) 
+	movq mm5, [ebx]		; load 8 bytes of p2 (next row)
+		
+	movq mm7, mm4       ; mm5 = abs(*p1-*p2)
+	psubusb mm7, mm5
+	psubusb mm5, mm4
+	add eax, edx		; update pointer to next row
+	paddb   mm5,mm7	
+
+		;;  Add the mm5 bytes to the accumulatores
+	movq  mm7,mm5				
+	punpcklbw mm7,mm6
+	add ebx, edx		; update pointer to next row
+	paddw mm0, mm7              
+	punpckhbw mm5,mm6
+	sub  ecx,2
+	paddw mm0, mm5
+
+
+	jnz nextrow
+
+		;; Sum the Accumulators
+	movq  mm1, mm0
+	psrlq mm1, 16
+	movq  mm2, mm0
+	psrlq mm2, 32
+	movq  mm3, mm0
+	psrlq mm3, 48
+	paddw mm0, mm1
+	paddw mm2, mm3
+	paddw mm0, mm2
+		
+	movd eax, mm0		; store return value
+	and  eax, 0xffff
+	
+	pop edx			; pop pop
+	pop ecx			; fizz fizz
+	pop ebx			; ia86 needs a fizz instruction
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret			; we now return you to your regular programming
+
+
+
+
+global sad_sub44_mmx
+
+; int sad_sub44_mmx(unsigned char *blk1,unsigned char *blk2,int qlx,int qh);
+
+; eax = p1
+; ebx = p2
+; ecx = temp
+; edx = qlx;
+; esi = rowsleft
+
+; mm0 = distance accumulator left block p1
+; mm1 = distance accumulator right block p1
+; mm2 = 0
+; mm3 = right block of p1
+; mm4 = left block of p1
+; mm5 = p2
+; mm6 = temp
+; mm7 = temp
+
+align 32
+sad_sub44_mmx:
+	push ebp		; save stack pointer
+	mov ebp, esp		; so that we can do this
+
+	push ebx
+	push ecx
+	push edx
+	push esi     
+
+	pxor mm0, mm0		; zero acculumator
+	pxor mm1, mm1
+	pxor mm2, mm2
+	mov eax, [ebp+8]	; get p1
+	mov ebx, [ebp+12]	; get p2
+	mov edx, [ebp+16]	; get qlx
+	mov esi, [ebp+20]	; get rowsleft
+	jmp nextrowqd		; snap to it
+align 32
+nextrowqd:
+
+		;;
+		;; Beware loop obfuscated by interleaving to try to
+		;; hide latencies...
+		;; 
+	movq mm4, [eax]				; mm4 =  first 4 bytes of p1 in words
+	movq mm5, [ebx]             ; mm5 = 4 bytes of p2 in words
+	movq mm3, mm4
+	punpcklbw mm4, mm2			
+	punpcklbw mm5, mm2
+	
+	movq mm7, mm4
+	movq mm6, mm5
+	psubusw mm7, mm5
+	psubusw mm6, mm4
+	
+	add eax, edx		        ; update a pointer to next row
+;	punpckhbw mm3, mm2			; mm3 = 2nd 4 bytes of p1 in words
+
+	paddw   mm7, mm6
+	paddw mm0, mm7				; Add absolute differences to left block accumulators
+		
+;	movq mm7,mm3
+;	psubusw mm7, mm5
+;	psubusw mm5, mm3
+
+	add ebx, edx		; update a pointer to next row
+	sub   esi, 1
+
+;	paddw   mm7, mm5
+;	paddw mm1, mm7				; Add absolute differences to right block accumulators
+	
+
+		
+	jnz nextrowqd		
+
+		;;		Sum the accumulators
+
+	movq  mm4, mm0
+	psrlq mm4, 32
+	paddw mm0, mm4
+	movq  mm6, mm0
+	psrlq mm6, 16
+	paddw mm0, mm6
+	movd eax, mm0		; store return value
+
+;	movq  mm4, mm1
+;	psrlq mm4, 32
+;	paddw mm1, mm4
+;	movq  mm6, mm1
+;	psrlq mm6, 16
+;	paddw mm1, mm6
+;	movd ebx, mm1
+
+	and  eax, 0xffff		
+;	sal  ebx, 16
+;	or   eax, ebx
+		
+	pop esi
+	pop edx
+	pop ecx
+	pop ebx
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret			; we now return you to your regular programming
diff --git a/veejay-current/utils/mblock_sad_mmxe.s b/veejay-current/utils/mblock_sad_mmxe.s
new file mode 100644
index 00000000..460ed0fc
--- /dev/null
+++ b/veejay-current/utils/mblock_sad_mmxe.s
@@ -0,0 +1,1002 @@
+;;; 
+;;;  mblock_sad_mmxe.s:  
+;;; 
+;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
+;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
+;
+;  Original MMX sad_* Copyright (C) 2000 Chris Atenasio <chris@crud.net>
+;  Enhanced MMX and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+
+		;; Yes, I tried prefetch-ing.  It makes no difference or makes
+		;; stuff *slower*.
+
+;
+;  This program is free software; you can reaxstribute it and/or
+;  modify it under the terms of the GNU General Public License
+;  as published by the Free Software Foundation; either version 2
+;  of the License, or (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program; if not, write to the Free Software
+;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+;
+;
+;
+
+SECTION .text
+
+global sad_00_mmxe
+
+; int sad_00(char *blk1,char *blk2,int lx,int h,int distlim);
+; distlim unused - costs more to check than the savings of
+; aborting the computation early from time to time...
+; eax = p1
+; ebx = p2
+; ecx = rowsleft
+; edx = lx;
+
+; mm0 = distance accumulator
+; mm1 = temp
+; mm2 = temp
+; mm3 = temp
+; mm4 = temp
+; mm5 = temp
+; mm6 = temp
+
+
+align 32
+sad_00_mmxe:
+	push ebp					; save frame pointer
+	mov ebp, esp				; link
+
+	push ebx
+	push ecx
+	push edx
+
+	pxor mm0, mm0		; zero acculumator
+
+	mov eax, [ebp+8]	; get p1
+sad_00_0misalign:		
+	mov ebx, [ebp+12]	; get p2
+	mov edx, [ebp+16]	; get lx
+
+	mov ecx, [ebp+20]	; get rowsleft
+	jmp nextrow00sse
+align 32
+nextrow00sse:
+	movq mm4, [eax]		; load first 8 bytes of p1 (row 1)
+	psadbw mm4, [ebx]	; compare to first 8 bytes of p2 (row 1)
+	movq mm5, [eax+8]	; load next 8 bytes of p1 (row 1)
+	add eax, edx		; update pointer to next row
+	paddd mm0, mm4		; accumulate difference
+	
+	psadbw mm5, [ebx+8]	; compare to next 8 bytes of p2 (row 1)
+	add ebx, edx		; ditto
+	paddd mm0, mm5		; accumulate difference
+
+
+	movq mm6, [eax]		; load first 8 bytes of p1 (row 2)
+	psadbw mm6, [ebx]	; compare to first 8 bytes of p2 (row 2)
+	movq mm4, [eax+8]	; load next 8 bytes of p1 (row 2)
+	add eax, edx		; update pointer to next row
+	paddd mm0, mm6		; accumulate difference
+	
+	psadbw mm4, [ebx+8]	; compare to next 8 bytes of p2 (row 2)
+	add ebx, edx		; ditto
+	paddd mm0, mm4		; accumulate difference
+
+	;psubd mm2, mm3		; decrease rowsleft
+	;movq mm5, mm1		; copy distlim
+	;pcmpgtd mm5, mm0	; distlim > dist?
+	;pand mm2, mm5		; mask rowsleft with answer
+	;movd ecx, mm2		; move rowsleft to ecx
+
+	;add eax, edx		; update pointer to next row
+	;add ebx, edx		; ditto
+	
+	;test ecx, ecx		; check rowsleft
+	sub  ecx, 2
+	jnz nextrow00sse
+
+	movd eax, mm0		; store return value
+	
+	pop edx	
+	pop ecx	
+	pop ebx	
+
+	pop ebp	
+	emms
+	ret	
+
+
+				
+
+global sad_00_Ammxe
+		;; This is a special version that only does aligned accesses...
+		;; Wonder if it'll make it faster on a P-III
+		;; ANSWER:		 NO its slower hence no longer used.
+
+; int sad_00(char *blk1,char *blk2,int lx,int h,int distlim);
+; distlim unused - costs more to check than the savings of
+; aborting the computation early from time to time...
+; eax = p1
+; ebx = p2
+; ecx = rowsleft
+; edx = lx;
+
+; mm0 = distance accumulator
+; mm1 = temp
+; mm2 = right shift to adjust for mis-align
+; mm3 = left shift to adjust for mis-align
+; mm4 = temp
+; mm5 = temp
+; mm6 = temp
+
+
+align 32
+sad_00_Ammxe:
+	push ebp					; save frame pointer
+	mov ebp, esp				; link
+
+	push ebx
+	push ecx
+	push edx
+		
+	pxor mm0, mm0		; zero acculumator
+
+	mov eax, [ebp+8]	; get p1
+	mov ebx, eax
+	and ebx, 7					; Misalignment!
+	cmp ebx, 0
+	jz	near sad_00_0misalign
+	sub eax, ebx				; Align eax
+	mov ecx, 8					; ecx = 8-misalignment
+	sub ecx, ebx
+	shl ebx, 3					; Convert into bit-shifts...
+	shl ecx, 3					
+	movd mm2, ebx				; mm2 = shift to start msb
+	movd mm3, ecx				; mm3 = shift to end lsb
+
+	mov ebx, [ebp+12]	; get p2
+	mov edx, [ebp+16]	; get lx
+	mov ecx, [ebp+20]	; get rowsleft
+	jmp nextrow00ssea
+align 32
+nextrow00ssea:
+	movq mm4, [eax]				; load first 8 bytes of aligned p1 (row 1)
+	movq mm5, [eax+8]			; load next 8 bytes of aligned p1 (row 1)
+	movq mm6, mm5
+	psrlq mm4, mm2				; mm4 first 8 bytes of p1 proper
+	psllq mm5, mm3
+	por	  mm4, mm5
+	psadbw mm4, [ebx]	; compare to first 8 bytes of p2 
+
+	movq mm7, [eax+16]			; load last 8 bytes of aligned p1
+	add eax, edx		; update pointer to next row
+	psrlq mm6, mm2				; mm6 2nd 8 bytes of p1 proper
+	psllq mm7, mm3
+	por   mm6, mm7
+
+
+	paddd mm0, mm4		; accumulate difference
+	
+	psadbw mm6, [ebx+8]	; compare to next 8 bytes of p2 (row 1)
+	add ebx, edx		; ditto
+	paddd mm0, mm6		; accumulate difference
+
+	sub  ecx, 1
+	jnz nextrow00ssea
+
+	movd eax, mm0		; store return value
+
+	pop edx	
+	pop ecx	
+	pop ebx	
+
+	pop ebp	
+	emms
+	ret	
+
+
+global sad_01_mmxe
+
+; int sad_01(char *blk1,char *blk2,int lx,int h);
+
+; eax = p1
+; ebx = p2
+; ecx = counter temp
+; edx = lx;
+
+; mm0 = distance accumulator
+; mm1 = distlim
+; mm2 = rowsleft
+; mm3 = 2 (rows per loop)
+; mm4 = temp
+; mm5 = temp
+; mm6 = temp
+
+
+align 32
+sad_01_mmxe:
+	push ebp	
+	mov ebp, esp
+
+	push ebx
+	push ecx
+	push edx
+
+	pxor mm0, mm0		; zero acculumator
+
+	mov eax, [ebp+8]	; get p1
+	mov ebx, [ebp+12]	; get p2
+	mov edx, [ebp+16]	; get lx
+
+	mov ecx, [ebp+20]	; get rowsleft
+	jmp nextrow01		; snap to it
+align 32
+nextrow01:
+	movq mm4, [eax]				; load first 8 bytes of p1 (row 1)
+	pavgb mm4, [eax+1]			; Interpolate...
+	psadbw mm4, [ebx]			; compare to first 8 bytes of p2 (row 1)
+	paddd mm0, mm4				; accumulate difference
+
+	movq mm5, [eax+8]			; load next 8 bytes of p1 (row 1)
+	pavgb mm5, [eax+9]			; Interpolate
+	psadbw mm5, [ebx+8]			; compare to next 8 bytes of p2 (row 1)
+	paddd mm0, mm5				; accumulate difference
+
+	add eax, edx				; update pointer to next row
+	add ebx, edx				; ditto
+
+	movq mm6, [eax]				; load first 8 bytes of p1 (row 2)
+	pavgb mm6, [eax+1]			; Interpolate
+	psadbw mm6, [ebx]	; compare to first 8 bytes of p2 (row 2)
+	paddd mm0, mm6		; accumulate difference
+	
+	movq mm7, [eax+8]	; load next 8 bytes of p1 (row 2)
+	pavgb mm7, [eax+9]
+	psadbw mm7, [ebx+8]	; compare to next 8 bytes of p2 (row 2)
+	paddd mm0, mm7		; accumulate difference
+
+	add eax, edx		; update pointer to next row
+	add ebx, edx		; ditto
+	
+	sub ecx, 2			; check rowsleft
+	jnz nextrow01		; rinse and repeat
+
+	movd eax, mm0		; store return value
+	
+	pop edx	
+	pop ecx		
+	pop ebx	
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret			; we now return you to your regular programming
+
+
+global sad_10_mmxe
+
+; int sad_10(char *blk1,char *blk2,int lx,int h);
+
+; eax = p1
+; ebx = p2
+; ecx = counter temp
+; edx = lx;
+; edi = p1+lx  
+
+; mm0 = distance accumulator
+; mm2 = rowsleft
+; mm3 = 2 (rows per loop)
+; mm4 = temp
+; mm5 = temp
+; mm6 = temp
+
+
+align 32
+sad_10_mmxe:
+	push ebp		; save stack pointer
+	mov ebp, esp
+
+	push ebx
+	push ecx
+	push edx
+	push edi
+
+	pxor mm0, mm0		; zero acculumator
+
+	mov eax, [ebp+8]	; get p1
+	mov ebx, [ebp+12]	; get p2
+	mov edx, [ebp+16]	; get lx
+	mov edi, eax
+	add edi, edx
+	mov ecx, [ebp+20]	; get rowsleft
+	jmp nextrow10		; snap to it
+align 32
+nextrow10:
+	movq mm4, [eax]				; load first 8 bytes of p1 (row 1)
+	pavgb mm4, [edi]			; Interpolate...
+	psadbw mm4, [ebx]			; compare to first 8 bytes of p2 (row 1)
+	paddd mm0, mm4				; accumulate difference
+
+	movq mm5, [eax+8]			; load next 8 bytes of p1 (row 1)
+	pavgb mm5, [edi+8]			; Interpolate
+	psadbw mm5, [ebx+8]			; compare to next 8 bytes of p2 (row 1)
+	paddd mm0, mm5				; accumulate difference
+
+	add eax, edx				; update pointer to next row
+	add ebx, edx				; ditto
+	add edi, edx
+
+	movq mm6, [eax]				; load first 8 bytes of p1 (row 2)
+	pavgb mm6, [edi]			; Interpolate
+	psadbw mm6, [ebx]	; compare to first 8 bytes of p2 (row 2)
+	paddd mm0, mm6		; accumulate difference
+	
+	movq mm7, [eax+8]	; load next 8 bytes of p1 (row 2)
+	pavgb mm7, [edi+8]
+	psadbw mm7, [ebx+8]	; compare to next 8 bytes of p2 (row 2)
+	paddd mm0, mm7		; accumulate difference
+
+	psubd mm2, mm3		; decrease rowsleft
+
+	add eax, edx		; update pointer to next row
+	add ebx, edx		; ditto
+	add edi, edx
+	
+	sub ecx, 2			; check rowsleft (we're doing 2 at a time)
+	jnz nextrow10		; rinse and repeat
+
+	movd eax, mm0		; store return value
+	
+	pop edi
+	pop edx	
+	pop ecx	
+	pop ebx	
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret			; we now return you to your regular programming
+
+
+global sad_11_mmxe
+
+; int sad_11(char *blk1,char *blk2,int lx,int h);
+
+; eax = p1
+; ebx = p2
+; ecx = counter temp
+; edx = lx;
+; edi = p1+lx  
+
+		  
+; mm0 = distance accumulator
+; mm2 = rowsleft
+; mm3 = 2 (rows per loop)
+; mm4 = temp
+; mm5 = temp
+; mm6 = temp
+
+
+align 32
+sad_11_mmxe:
+	push ebp		; save stack pointer
+	mov ebp, esp		; so that we can do this
+
+	push ebx		; save the pigs
+	push ecx		; make them squeal
+	push edx		; lets have pigs for every meal
+	push edi
+
+	pxor mm0, mm0		; zero acculumator
+
+	mov eax, [ebp+8]	; get p1
+	mov ebx, [ebp+12]	; get p2
+	mov edx, [ebp+16]	; get lx
+	mov edi, eax
+	add edi, edx
+	mov ecx, [ebp+20]	; get rowsleft
+	jmp nextrow11		; snap to it
+align 32
+nextrow11:
+	movq mm4, [eax]				; load first 8 bytes of p1 (row 1)
+	pavgb mm4, [edi]			; Interpolate...
+	movq mm5, [eax+1]
+	pavgb mm5, [edi+1]
+	pavgb mm4, mm5
+	psadbw mm4, [ebx]			; compare to first 8 bytes of p2 (row 1)
+	paddd mm0, mm4				; accumulate difference
+
+	movq mm6, [eax+8]			; load next 8 bytes of p1 (row 1)
+	pavgb mm6, [edi+8]			; Interpolate
+	movq mm7, [eax+9]
+	pavgb mm7, [edi+9]
+	pavgb mm6, mm7
+	psadbw mm6, [ebx+8]			; compare to next 8 bytes of p2 (row 1)
+	paddd mm0, mm6				; accumulate difference
+
+	add eax, edx				; update pointer to next row
+	add ebx, edx				; ditto
+	add edi, edx
+
+	movq mm4, [eax]				; load first 8 bytes of p1 (row 1)
+	pavgb mm4, [edi]			; Interpolate...
+	movq mm5, [eax+1]
+	pavgb mm5, [edi+1]
+	pavgb mm4, mm5
+	psadbw mm4, [ebx]			; compare to first 8 bytes of p2 (row 1)
+	paddd mm0, mm4				; accumulate difference
+
+	movq mm6, [eax+8]			; load next 8 bytes of p1 (row 1)
+	pavgb mm6, [edi+8]			; Interpolate
+	movq mm7, [eax+9]
+	pavgb mm7, [edi+9]
+	pavgb mm6, mm7
+	psadbw mm6, [ebx+8]			; compare to next 8 bytes of p2 (row 1)
+	paddd mm0, mm6				; accumulate difference
+		
+	add eax, edx		; update pointer to next row
+	add ebx, edx		; ditto
+	add edi, edx
+
+			
+	sub ecx, 2				; check rowsleft
+	jnz near nextrow11			; rinse and repeat
+
+	movd eax, mm0				; store return value
+	
+	pop edi
+	pop edx	
+	pop ecx		
+	pop ebx			
+	
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret			; we now return you to your regular programming
+
+global sad_sub22_mmxe
+
+; int sad_sub22_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh);
+
+; eax = p1
+; ebx = p2
+; ecx = counter temp
+; edx = flx;
+
+; mm0 = distance accumulator
+; mm2 = rowsleft
+; mm3 = 2 (rows per loop)
+; mm4 = temp
+; mm5 = temp
+; mm6 = temp
+
+
+align 32
+sad_sub22_mmxe:
+	push ebp		; save frame pointer
+	mov ebp, esp
+
+	push ebx	
+	push ecx
+	push edx	
+
+	pxor mm0, mm0		; zero acculumator
+
+	mov eax, [ebp+8]	; get p1
+	mov ebx, [ebp+12]	; get p2
+	mov edx, [ebp+16]	; get lx
+
+	mov ecx, [ebp+20]
+	jmp nextrowfd
+align 32
+nextrowfd:
+	movq   mm4, [eax]	 ; load first 8 bytes of p1 (row 1) 
+	add eax, edx		; update pointer to next row
+	psadbw mm4, [ebx]	; compare to first 8 bytes of p2 (row 1)
+	add ebx, edx		; ditto
+	paddd  mm0, mm4		; accumulate difference
+	
+
+	movq mm6, [eax]		; load first 8 bytes of p1 (row 2)
+	add eax, edx		; update pointer to next row
+	psadbw mm6, [ebx]	; compare to first 8 bytes of p2 (row 2)
+	add ebx, edx		; ditto
+	paddd mm0, mm6		; accumulate difference
+	
+
+	sub ecx, 2
+	jnz nextrowfd
+
+	movd eax, mm0
+	
+	pop edx	
+	pop ecx	
+	pop ebx	
+
+	pop ebp
+
+	emms
+	ret
+
+
+
+
+
+global sad_sub44_mmxe
+
+; int sad_sub44_mmxe(unsigned char *blk1,unsigned char *blk2,int qlx,int qh);
+
+; eax = p1
+; ebx = p2
+; ecx = temp
+; edx = qlx;
+; esi = rowsleft
+
+; mm0 = distance accumulator left block p1
+; mm1 = distance accumulator right block p1
+; mm2 = 0
+; mm3 = 0
+; mm4 = temp
+; mm5 = temp
+; mm6 = temp
+
+
+align 32
+sad_sub44_mmxe:
+	push ebp
+	mov ebp, esp
+
+	push ebx
+	push ecx
+	push edx
+	push esi     
+
+	pxor mm0, mm0		; zero acculumator
+	pxor mm1, mm1				
+	pxor mm2, mm2
+	mov eax, [ebp+8]	; get p1
+	mov ebx, [ebp+12]	; get p2
+	mov edx, [ebp+16]	; get qlx
+
+	mov esi, [ebp+20]	; get rowsleft
+	jmp nextrowqd		; snap to it
+align 32
+nextrowqd: 
+	movq mm4, [eax]				; load 8 bytes of p1 (two blocks!)
+	add eax, edx		; update pointer to next row
+	movq mm6, mm4				  ;
+	mov  ecx, [ebx]       ; load 4 bytes of p2
+    punpcklbw mm4, mm2			; mm4 = bytes 0..3 p1 (spaced out)
+	movd mm5, ecx
+	punpcklbw mm5, mm2      ; mm5 = bytes 0..3 p2  (spaced out)
+	psadbw mm4, mm5	    		; compare to left block
+	add ebx, edx		; ditto
+
+;	punpckhbw mm6, mm2          ; mm6 = bytes 4..7 p1 (spaced out)
+
+	paddd mm0, mm4				; accumulate difference left block
+
+;	psadbw mm6,mm5				; compare to right block
+	
+
+;	paddd mm1, mm6				; accumulate difference right block
+		
+	sub esi, 1
+	jnz nextrowqd
+
+	movd eax, mm0
+;	movd ebx, mm1				
+;	sal  ebx, 16
+;	or   eax, ebx
+	
+	pop esi
+	pop edx
+	pop ecx
+	pop ebx
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret			; we now return you to your regular programming
+
+
+;;; 
+;;;  mblock_*nearest4_sad_mmxe.s:  
+;;; 
+;;; Enhanced MMX optimized Sum Absolute Differences routines for
+;;; quads macroblocks offset by (0,0) (0,1) (1,0) (1,1) pel
+;;; 
+
+;;; Explanation: the motion compensation search at 1-pel and 2*2 sub-sampled
+;;; evaluates macroblock quads.  A lot of memory accesses can be saved
+;;; if each quad is done together rather than each macroblock in the
+;;; quad handled individually.
+
+;;; TODO:		Really there ought to be MMX versions and the function's
+;;; specification should be documented...
+;
+; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>	
+
+
+;;; CURRENTLY not used but used in testing as reference for tweaks...
+global mblockq_sad_REF
+
+; void mblockq_sad_REF(char *blk1,char *blk2,int lx,int h,int *weightvec);
+; eax = p1
+; ebx = p2
+; ecx = unused
+; edx = lx;
+; edi = rowsleft
+; esi = h
+		
+; mm0 = SAD (x+0,y+0)
+; mm1 = SAD (x+2,y+0)
+; mm2 = SAD (x+0,y+2)
+; mm3 = SAD (x+2,y+2)
+; mm4 = temp
+; mm5 = temp
+; mm6 = temp
+; mm7 = temp						
+
+align 32
+mblockq_sad_REF:
+	push ebp					; save frame pointer
+	mov ebp, esp				; link
+	push eax
+	push ebx
+	push ecx
+	push edx
+	push edi
+	push esi
+
+	pxor mm0, mm0		; zero accumulators
+	pxor mm1, mm1
+	pxor mm2, mm2
+	pxor mm3, mm3
+	mov eax, [ebp+8]	; get p1
+	mov ebx, [ebp+12]	; get p2
+	mov edx, [ebp+16]	; get lx
+	
+	mov edi, [ebp+20]	; get rowsleft
+	mov esi, edi
+
+	jmp nextrow_block_d1
+align 32
+nextrow_block_d1:		
+
+		;; Do the (+0,+0) SAD
+		
+	movq mm4, [eax]		; load 1st 8 bytes of p1
+	movq mm6, mm4
+	movq mm5, [ebx]
+	psadbw mm4, mm5	; compare to 1st 8 bytes of p2 
+	paddd mm0, mm4		; accumulate difference
+	movq mm4, [eax+8]	; load 2nd 8 bytes of p1
+	movq mm7, mm4		
+	psadbw mm4, [ebx+8]	; compare to 2nd 8 bytes of p2 
+	paddd mm0, mm4		; accumulate difference
+
+		
+    cmp edi, esi
+	jz  firstrow0
+
+		;; Do the (0,+2) SAD
+	sub ebx, edx
+	psadbw mm6, [ebx]	; compare to next 8 bytes of p2 (row 1)
+	paddd mm2, mm6		; accumulate difference
+	psadbw mm7, [ebx+8]	;  next 8 bytes of p1 (row 1)
+	add ebx, edx
+	paddd mm2, mm7	
+
+firstrow0:
+
+		;; Do the (+2,0) SAD
+	
+	movq mm4, [eax+1]
+				
+	movq mm6, mm4
+	psadbw mm4, mm5	; compare to 1st 8 bytes of p2
+	paddd mm1, mm4		; accumulate difference
+	movq mm4, [eax+9]
+	movq mm7, mm4
+	psadbw mm4, [ebx+8]	; compare to 2nd 8 bytes of p2
+	paddd mm1, mm4		; accumulate difference
+
+    cmp edi, esi
+	jz  firstrow1
+
+		;; Do the (+2, +2 ) SAD
+	sub ebx, edx
+	psadbw mm6, [ebx]	; compare to 1st 8 bytes of prev p2 
+	psadbw mm7, [ebx+8]	;  2nd 8 bytes of prev p2
+	add ebx, edx
+	paddd mm3, mm6		; accumulate difference
+	paddd mm3, mm7	
+firstrow1:		
+
+	add eax, edx				; update pointer to next row
+	add ebx, edx		; ditto
+		
+	sub edi, 1
+	jnz near nextrow_block_d1
+
+		;; Do the last row of the (0,+2) SAD
+
+	movq mm4, [eax]		; load 1st 8 bytes of p1
+	movq mm5, [eax+8]	; load 2nd 8 bytes of p1
+	sub  ebx, edx
+	psadbw mm4, [ebx]	; compare to next 8 bytes of p2 (row 1)
+	psadbw mm5, [ebx+8]	;  next 8 bytes of p1 (row 1)
+	paddd mm2, mm4		; accumulate difference
+	paddd mm2, mm5
+		
+	movq mm4, [eax+1]
+	movq mm5, [eax+9]
+		
+		;; Do the last row of rhw (+2, +2) SAD
+	psadbw mm4, [ebx]	; compare to 1st 8 bytes of prev p2 
+	psadbw mm5, [ebx+8]	;  2nd 8 bytes of prev p2
+	paddd mm3, mm4		; accumulate difference
+	paddd mm3, mm5
+		
+
+	mov eax, [ebp+24]			; Weightvec
+	movd [eax+0], mm0
+	movd [eax+4], mm1
+	movd [eax+8], mm2
+	movd [eax+12], mm3
+		
+	pop esi
+	pop edi
+	pop edx	
+	pop ecx	
+	pop ebx	
+	pop eax
+		
+	pop ebp	
+	emms
+	ret	
+
+
+
+global mblock_nearest4_sads_mmxe
+
+; void mblock_nearest4_sads_mmxe(char *blk1,char *blk2,int lx,int h,int *weightvec);
+
+; eax = p1
+; ebx = p2
+; ecx = unused
+; edx = lx;
+; edi = rowsleft
+; esi = h
+		
+; mm0 = SAD (x+0,y+0),SAD (x+0,y+2)
+; mm1 = SAD (x+2,y+0),SAD (x+2,y+2)
+		
+; mm4 = temp
+; mm5 = temp
+; mm6 = temp
+; mm7 = temp						
+
+align 32
+mblock_nearest4_sads_mmxe:
+	push ebp					; save frame pointer
+	mov ebp, esp				; link
+	push eax
+	push ebx
+	push ecx
+	push edx
+	push edi
+	push esi
+
+	mov eax, [ebp+8]	; get p1
+	prefetcht0 [eax]
+	pxor mm0, mm0		; zero accumulators
+	pxor mm1, mm1
+	mov ebx, [ebp+12]	; get p2
+	mov edx, [ebp+16]	; get lx
+	
+	mov edi, [ebp+20]	; get rowsleft
+	mov esi, edi
+
+	jmp nextrow_block_e1
+align 32
+nextrow_block_e1:		
+
+		;; Do the (+0,+0) SAD
+	prefetcht0 [eax+edx]		
+	movq mm4, [eax]		; load 1st 8 bytes of p1
+	movq mm6, mm4
+	movq mm5, [ebx]
+	psadbw mm4, mm5	; compare to 1st 8 bytes of p2 
+	paddd mm0, mm4		; accumulate difference
+	movq mm4, [eax+8]	; load 2nd 8 bytes of p1
+	movq mm7, mm4		
+	psadbw mm4, [ebx+8]	; compare to 2nd 8 bytes of p2 
+	paddd mm0, mm4		; accumulate difference
+
+		
+    cmp edi, esi
+	jz  firstrowe0
+
+		;; Do the (0,+2) SAD
+	sub ebx, edx
+	pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
+	movq   mm2, [ebx]
+	psadbw mm6, mm2	    ; compare to next 8 bytes of p2 (row 1)
+	paddd mm0, mm6		; accumulate difference
+	movq  mm3, [ebx+8]
+	psadbw mm7, mm3	;  next 8 bytes of p1 (row 1)
+	add ebx, edx
+	paddd mm0, mm7	
+	pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 
+firstrowe0:
+
+		;; Do the (+2,0) SAD
+	
+	movq mm4, [eax+1]
+	movq mm6, mm4
+
+	psadbw mm4, mm5	; compare to 1st 8 bytes of p2
+	paddd mm1, mm4		; accumulate difference
+
+	movq mm4, [eax+9]
+	movq mm7, mm4
+
+	psadbw mm4, [ebx+8]	; compare to 2nd 8 bytes of p2
+	paddd mm1, mm4		; accumulate difference
+
+    cmp edi, esi
+	jz  firstrowe1
+
+		;; Do the (+2, +2 ) SAD
+	sub ebx, edx
+	pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 
+	psadbw mm6, mm2	; compare to 1st 8 bytes of prev p2 
+	psadbw mm7, mm3	;  2nd 8 bytes of prev p2
+	add ebx, edx
+	paddd mm1, mm6		; accumulate difference
+	paddd mm1, mm7
+	pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 
+firstrowe1:		
+
+	add eax, edx				; update pointer to next row
+	add ebx, edx		; ditto
+		
+	sub edi, 1
+	jnz near nextrow_block_e1
+
+		;; Do the last row of the (0,+2) SAD
+	pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
+	movq mm4, [eax]		; load 1st 8 bytes of p1
+	movq mm5, [eax+8]	; load 2nd 8 bytes of p1
+	sub  ebx, edx
+	psadbw mm4, [ebx]	; compare to next 8 bytes of p2 (row 1)
+	psadbw mm5, [ebx+8]	;  next 8 bytes of p1 (row 1)
+	paddd mm0, mm4		; accumulate difference
+	paddd mm0, mm5
+
+		
+		;; Do the last row of rhw (+2, +2) SAD
+	pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64				
+	movq mm4, [eax+1]
+	movq mm5, [eax+9]
+
+	psadbw mm4, [ebx]	; compare to 1st 8 bytes of prev p2 
+	psadbw mm5, [ebx+8]	;  2nd 8 bytes of prev p2
+	paddd mm1, mm4		; accumulate difference
+	paddd mm1, mm5
+		
+
+	mov eax, [ebp+24]			; Weightvec
+	movd [eax+8], mm0
+	pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
+	movd [eax+12], mm1
+	pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
+	movd [eax+0], mm0
+	movd [eax+4], mm1
+		
+	pop esi
+	pop edi
+	pop edx	
+	pop ecx	
+	pop ebx	
+	pop eax
+		
+	pop ebp	
+	emms
+	ret
+
+global mblock_sub22_nearest4_sads_mmxe
+
+; void mblock_sub22_nearest4_sads_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh, int* resvec);
+
+; eax = p1
+; ebx = p2
+; ecx = counter temp
+; edx = flx;
+
+; mm0 = distance accumulator
+; mm1 = distance accumulator
+; mm2 = previous p1 row
+; mm3 = previous p1 displaced by 1 byte...
+; mm4 = temp
+; mm5 = temp
+; mm6 = temp
+; mm7 = temp / 0 if first row 0xff otherwise
+
+
+align 32
+mblock_sub22_nearest4_sads_mmxe:
+	push ebp		; save frame pointer
+	mov ebp, esp
+	push eax
+	push ebx	
+	push ecx
+	push edx	
+
+	pxor mm0, mm0		; zero acculumator
+	pxor mm1, mm1		; zero acculumator
+	pxor mm2, mm2		; zero acculumator
+	pxor mm3, mm3		; zero acculumator						
+
+	mov eax, [ebp+8]	; get p1
+	mov ebx, [ebp+12]	; get p2
+	mov edx, [ebp+16]	; get lx
+	mov ecx, [ebp+20]
+	movq mm2, [eax+edx]
+	movq mm3, [eax+edx+1]
+	jmp nextrowbd22
+align 32
+nextrowbd22:
+	movq   mm5, [ebx]			; load previous row reference block
+								; mm2 /mm3 containts current row target block
+		
+	psadbw mm2, mm5				; Comparse (x+0,y+2)
+	paddd  mm1, mm2
+		
+	psadbw mm3, mm5				; Compare (x+2,y+2)
+	pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
+	paddd  mm1, mm3
+
+	pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 					
+
+	movq mm2, [eax]				; Load current row traget block into mm2 / mm3
+	movq mm6, mm2
+	movq mm3, [eax+1]
+	sub	   eax, edx
+	sub	   ebx, edx
+	prefetcht0 [eax]
+	movq mm7, mm3		
+
+	psadbw	mm6, mm5			; Compare (x+0,y+0)
+	paddd   mm0, mm6
+	pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
+	psadbw  mm7, mm5			; Compare (x+2,y+0)
+	paddd   mm0, mm7
+	pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
+
+	sub ecx, 1
+	jnz nextrowbd22
+
+	mov  eax, [ebp+24]
+	movq [eax+0], mm0
+	movq [eax+8], mm1
+	pop edx	
+	pop ecx	
+	pop ebx	
+	pop eax
+	pop ebp
+
+	emms
+	ret
+
+		
diff --git a/veejay-current/utils/mblock_sub44_sads_x86.c b/veejay-current/utils/mblock_sub44_sads_x86.c
new file mode 100644
index 00000000..35b304da
--- /dev/null
+++ b/veejay-current/utils/mblock_sub44_sads_x86.c
@@ -0,0 +1,245 @@
+/*
+ *
+ * mblock_sub44_sads.c
+ * Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+ *
+ * Fast block sum-absolute difference computation for a rectangular area 4*x
+ * by y where y > h against a 4 by h block.
+ *
+ * Used for 4*4 sub-sampled motion compensation calculations.
+ *
+ * This is actually just a shell that uses templates from the included
+ * file "mblock_sub44_sads_x86_h.c".  I didn't trust the compiler to do a good
+ * job on nested inlining.  One day I'll experiment.
+ * 
+ *
+ * This file is part of mpeg2enc, a free MPEG-2 video stream encoder
+ * based on the original MSSG reference design
+ *
+ * mpeg2enc is free software; you can redistribute new parts
+ * and/or modify under the terms of the GNU General Public License 
+ * as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * See the files for those sections (c) MSSG
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <config.h>
+#include "mjpeg_types.h"
+#include "attributes.h"
+#include "mmx.h"
+#include "fastintfns.h"
+#include "motionsearch.h"
+#include "mblock_sub44_sads_x86.h"
+
+/*
+  Register usage:
+  mm0-mm3  Hold the current row 
+  mm4      Used for accumulating partial SAD
+  mm7      Holds zero
+ */
+
+static inline void mmx_zero_reg (void)
+{
+	/*  load 0 into mm7	 */
+	pxor_r2r (mm7, mm7);
+}
+
+/*
+ * Load a 8*4 block of 4*4 sub-sampled pels (qpels) into the MMX
+ * registers
+ *
+ */
+
+static __inline__ void load_blk(uint8_t *blk,uint32_t rowstride,int h)
+{
+	movq_m2r( *blk, mm0);
+	blk += rowstride;
+	movq_m2r( *blk, mm1);
+	if( h == 2 )
+		return;
+	blk += rowstride;
+	movq_m2r( *blk, mm2);
+	blk += rowstride;
+	movq_m2r( *blk, mm3);
+}
+
+/*
+ * Do a shift right on the 4*4 block in the MMX registers
+ * We do this as a macro as otherwise we get warnings from the
+ * pre-optimised asm generated. 
+ */
+#define shift_blk(shift)\
+	psrlq_i2r( shift,mm0);\
+	psrlq_i2r( shift,mm1);\
+	psrlq_i2r( shift,mm2);\
+	psrlq_i2r( shift,mm3);
+
+/*
+ * Compute the Sum absolute differences between the 4*h block in
+ * the MMX registers (4 least sig. bytes in mm0..mm3)
+ *
+ * and the 4*h block pointed to by refblk
+ *
+ * h == 2 || h == 4
+ *
+ * TODO: Currently always loads and shifts 4*4 even if 4*2 is required.
+ *
+ */
+
+static __inline__ int qblock_sad_mmxe(uint8_t *refblk, 
+								  uint32_t h,
+								  uint32_t rowstride)
+{
+	int res;
+	pxor_r2r 	(mm4,mm4);
+			
+	movq_r2r	(mm0,mm5);		/* First row */
+	movd_m2r	(*refblk, mm6);
+	pxor_r2r    ( mm7, mm7);
+	refblk += rowstride;
+	punpcklbw_r2r	( mm7, mm5);
+	punpcklbw_r2r	( mm7, mm6);
+	psadbw_r2r      ( mm5, mm6);
+	paddw_r2r     ( mm6, mm4 );
+	
+
+
+	movq_r2r	(mm1,mm5);		/* Second row */
+	movd_m2r	(*refblk, mm6);
+	refblk += rowstride;
+	punpcklbw_r2r	( mm7, mm5);
+	punpcklbw_r2r	( mm7, mm6);
+	psadbw_r2r      ( mm5, mm6);
+	paddw_r2r     ( mm6, mm4 );
+
+	if( h == 4 )
+	{
+
+		movq_r2r	(mm2,mm5);		/* Third row */
+		movd_m2r	(*refblk, mm6);
+		refblk += rowstride;
+		punpcklbw_r2r	( mm7, mm5);
+		punpcklbw_r2r	( mm7, mm6);
+		psadbw_r2r      ( mm5, mm6);
+		paddw_r2r     ( mm6, mm4 );
+
+		
+		movq_r2r	(mm3,mm5);		/* Fourth row */
+		movd_m2r	(*refblk, mm6);
+		punpcklbw_r2r	( mm7, mm5);
+		punpcklbw_r2r	( mm7, mm6);
+		psadbw_r2r      ( mm5, mm6);
+		paddw_r2r     ( mm6, mm4 );
+		
+	}
+	movd_r2m      ( mm4, res );
+
+	return res;
+}
+
+
+
+static __inline__ int qblock_sad_mmx(uint8_t *refblk, 
+								  uint32_t h,
+								  uint32_t rowstride)
+{
+	int res;
+	pxor_r2r 	(mm4,mm4);
+			
+	movq_r2r	(mm0,mm5);		/* First row */
+	movd_m2r	(*refblk, mm6);
+	pxor_r2r    ( mm7, mm7);
+	refblk += rowstride;
+	punpcklbw_r2r	( mm7, mm5);
+
+	punpcklbw_r2r	( mm7, mm6);
+
+	movq_r2r		( mm5, mm7);
+	psubusw_r2r	( mm6, mm5);
+
+	psubusw_r2r   ( mm7, mm6);
+
+	paddw_r2r     ( mm5, mm4);
+	paddw_r2r     ( mm6, mm4 );
+	
+
+
+	movq_r2r	(mm1,mm5);		/* Second row */
+	movd_m2r	(*refblk, mm6);
+	pxor_r2r    ( mm7, mm7);
+	refblk += rowstride;
+	punpcklbw_r2r	( mm7, mm5);
+	punpcklbw_r2r	( mm7, mm6);
+	movq_r2r		( mm5, mm7);
+	psubusw_r2r	( mm6, mm5);
+	psubusw_r2r   ( mm7, mm6);
+	paddw_r2r     ( mm5, mm4);
+	paddw_r2r     ( mm6, mm4 );
+
+	if( h == 4 )
+	{
+
+		movq_r2r	(mm2,mm5);		/* Third row */
+		movd_m2r	(*refblk, mm6);
+		pxor_r2r    ( mm7, mm7);
+		refblk += rowstride;
+		punpcklbw_r2r	( mm7, mm5);
+		punpcklbw_r2r	( mm7, mm6);
+		movq_r2r		( mm5, mm7);
+		psubusw_r2r	( mm6, mm5);
+		psubusw_r2r   ( mm7, mm6);
+		paddw_r2r     ( mm5, mm4);
+		paddw_r2r     ( mm6, mm4 );
+		
+		movq_r2r	(mm3,mm5);		/* Fourth row */
+		movd_m2r	(*refblk, mm6);
+		pxor_r2r    ( mm7, mm7);
+		punpcklbw_r2r	( mm7, mm5);
+		punpcklbw_r2r	( mm7, mm6);
+		movq_r2r		( mm5, mm7);
+		psubusw_r2r	( mm6, mm5);
+		psubusw_r2r   ( mm7, mm6);
+		paddw_r2r     ( mm5, mm4);
+		paddw_r2r     ( mm6, mm4 );
+	}
+
+
+	movq_r2r      ( mm4, mm5 );
+    psrlq_i2r     ( 32, mm5 );
+    paddw_r2r     ( mm5, mm4 );
+	movq_r2r      ( mm4, mm6 );
+    psrlq_i2r     ( 16, mm6 );
+    paddw_r2r     ( mm6, mm4 );
+	movd_r2m      ( mm4, res );
+
+	return res & 0xffff;
+}
+
+
+/*
+ * Do the Extended MMX versions
+ */
+#define SIMD_SUFFIX(x) x##_mmxe
+#include "mblock_sub44_sads_x86_h.c"
+#undef SIMD_SUFFIX
+/*
+ * Do the original MMX versions
+ */
+#define SIMD_SUFFIX(x) x##_mmx
+#include "mblock_sub44_sads_x86_h.c"
+#undef SIMD_SUFFIX
+
+
+
+
diff --git a/veejay-current/utils/mblock_sub44_sads_x86.h b/veejay-current/utils/mblock_sub44_sads_x86.h
new file mode 100644
index 00000000..11d35074
--- /dev/null
+++ b/veejay-current/utils/mblock_sub44_sads_x86.h
@@ -0,0 +1,39 @@
+/* (C) 2000/2002 Andrew Stevens */
+
+/*  This software is free software; you can redistribute it
+ *  and/or modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2 of
+ *  the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ *
+ */
+
+#ifndef __MBLOCK_SUB44_SADS_X86_H__
+#define __MBLOCK_SUB44_SADS_X86__
+
+#include <config.h>
+#include "mjpeg_types.h"
+
+int mblocks_sub44_mests_mmxe( uint8_t *blk,  uint8_t *ref,
+							 int ilow, int jlow,
+							 int ihigh, int jhigh, 
+							 int h, int rowstride, 
+							 int threshold,
+							 me_result_s *resvec);
+int mblocks_sub44_mests_mmx( uint8_t *blk,  uint8_t *ref,
+							int ilow, int jlow,
+							int ihigh, int jhigh, 
+							int threshold,
+							int h, int rowstride, 
+							me_result_s *resvec);
+
+#endif
diff --git a/veejay-current/utils/mblock_sub44_sads_x86_h.c b/veejay-current/utils/mblock_sub44_sads_x86_h.c
new file mode 100644
index 00000000..a2916a83
--- /dev/null
+++ b/veejay-current/utils/mblock_sub44_sads_x86_h.c
@@ -0,0 +1,98 @@
+/*
+ *
+ * mblock_sub44_sads_x86_h.c
+ * Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+ *
+ * Fast block sum-absolute difference computation for a rectangular area 4*x
+ * by y where y > h against a 4 by h block.
+ *
+ * Used for 4*4 sub-sampled motion compensation calculations.
+ * 
+ *
+ * This file is part of mpeg2enc, a free MPEG-2 video stream encoder
+ * based on the original MSSG reference design
+ *
+ * mpeg2enc is free software; you can redistribute new parts
+ * and/or modify under the terms of the GNU General Public License 
+ * as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2enc is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * See the files for those sections (c) MSSG
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ *
+ * Generates a vector sad's for 4*4 sub-sampled pel (qpel) data (with
+ * co-ordinates and top-left qpel address) from specified rectangle
+ * against a specified 16*h pel (4*4 qpel) reference block.  The
+ * generated vector contains results only for those sad's that fall
+ * below twice the running best sad and are aligned on 8-pel
+ * boundaries
+ *
+ * Invariant: blk points to top-left sub-sampled pel for macroblock
+ * at (ilow,ihigh)
+ * i{low,high) j(low,high) must be multiples of 4.
+ *
+ * sad = Sum Absolute Differences
+ *
+ * NOTES: for best efficiency i{low,high) should be multiples of 16.
+ *
+ * */
+
+int SIMD_SUFFIX(mblocks_sub44_mests)( uint8_t *blk,  uint8_t *ref,
+									  int ilow,int jlow,
+									  int ihigh, int jhigh, 
+									  int h, int rowstride, 
+									  int threshold,
+									  me_result_s *resvec)
+{
+	int32_t x,y;
+	uint8_t *currowblk = blk;
+	uint8_t *curblk;
+	me_result_s *cres = resvec;
+	int      gridrowstride = (rowstride);
+	int weight;
+
+	for( y=jlow; y <= jhigh ; y+=4)
+	{
+		curblk = currowblk;
+		for( x = ilow; x <= ihigh; x += 4)
+		{
+			if( (x & 15) == (ilow & 15) )
+			{
+				load_blk( curblk, rowstride, h );
+			}
+			weight = SIMD_SUFFIX(qblock_sad)(ref, h, rowstride);
+			if( weight <= threshold )
+			{
+				threshold = intmin(weight<<2,threshold);
+				/* Rough and-ready absolute distance penalty */
+				/* NOTE: This penalty is *vital* to correct operation 
+				   as otherwise the sub-mean filtering won't work on very
+				   uniform images.
+				 */
+				cres->weight = (uint16_t)(weight+(intmax(intabs(x),intabs(y))<<2));
+				cres->x = (uint8_t)x;
+				cres->y = (uint8_t)y;
+				++cres;
+			}
+			curblk += 1;
+			shift_blk(8);
+		}
+		currowblk += gridrowstride;
+	}
+	emms();
+	return cres - resvec;
+}
+
+#undef concat
diff --git a/veejay-current/utils/mblock_sumsq_mmx.s b/veejay-current/utils/mblock_sumsq_mmx.s
new file mode 100644
index 00000000..6ee326b9
--- /dev/null
+++ b/veejay-current/utils/mblock_sumsq_mmx.s
@@ -0,0 +1,673 @@
+;
+;  dist2_mmx.s:  mmX optimized squared distance sum
+;
+;  Original believed to be Copyright (C) 2000 Brent Byeler
+;
+;  This program is free software; you can reaxstribute it and/or
+;  modify it under the terms of the GNU General Public License
+;  as published by the Free Software Foundation; either version 2
+;  of the License, or (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program; if not, write to the Free Software
+;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+;
+
+; total squared difference between two (16*h) blocks
+; including optional half pel interpolation of [ebp+8] ; blk1 (hx,hy)
+; blk1,blk2: addresses of top left pels of both blocks
+; lx:        distance (in bytes) of vertically adjacent pels
+; hx,hy:     flags for horizontal and/or vertical interpolation
+; h:         height of block (usually 8 or 16)
+; mmX version
+
+global sumsq_mmx
+; int sumsq_mmx(unsigned char *blk1, unsigned char *blk2,
+;                 int lx, int hx, int hy, int h)
+
+; mm7 = 0
+
+; eax = pblk1 
+; ebx = pblk2
+; ecx = temp
+; edx = distance_sum
+; edi = h
+; esi = lx
+
+		;; 
+		;;  private constants needed
+		;; 
+
+SECTION .data
+align 16
+twos:	
+			dw	2
+			dw	2
+			dw	2
+			dw	2
+
+SECTION .text
+
+align 32
+sumsq_mmx:
+	push ebp			; save frame pointer
+	mov ebp, esp		; link
+	push ebx
+	push ecx
+	push edx
+	push esi     
+	push edi
+
+	mov		esi, [ebp+16] ; lx
+	mov     eax, [ebp+20] ; hx
+	mov     edx, [ebp+24] ; hy
+	mov     edi, [ebp+28] ; h
+
+    pxor      mm5, mm5      ; sum
+	test      edi, edi     ; h = 0?
+	jle       near d2exit
+
+	pxor	  mm7, mm7     ; get zeros i mm7
+
+	test      eax, eax     ; hx != 0?
+	jne       near d2is10
+	test      edx, edx     ; hy != 0?
+	jne       near d2is10
+
+	mov       eax, [ebp+8]
+    mov       ebx, [ebp+12]
+	jmp 	 d2top00
+
+align 32
+d2top00:
+        movq      mm0, [eax]
+        movq      mm1, mm0
+        punpcklbw mm0, mm7
+        punpckhbw mm1, mm7
+
+        movq      mm2, [ebx]
+        movq      mm3, mm2
+        punpcklbw mm2, mm7
+        punpckhbw mm3, mm7
+
+        psubw     mm0, mm2
+        psubw     mm1, mm3
+        pmaddwd   mm0, mm0
+        pmaddwd   mm1, mm1
+        paddd     mm0, mm1
+
+        movq      mm1, [eax+8]
+        movq      mm2, mm1
+        punpcklbw mm1, mm7
+        punpckhbw mm2, mm7
+
+        movq      mm3, [ebx+8]
+        movq      mm4, mm3
+        punpcklbw mm3, mm7
+        punpckhbw mm4, mm7
+
+        psubw     mm1, mm3
+        psubw     mm2, mm4
+        pmaddwd   mm1, mm1
+        pmaddwd   mm2, mm2
+        paddd     mm1, mm2
+
+        paddd     mm0, mm1
+	
+		;; Accumulate sum in edx... we use mm5
+		;movd	  ecx, mm0
+        ;add       edx, ecx
+	    ;psrlq	  mm0, 32
+	    ;movd	  ecx, mm0
+        ;add       edx, ecx
+		paddd    mm5, mm0
+
+	add       eax, esi
+	add       ebx, esi
+	dec       edi
+	jg        d2top00
+        jmp       near d2exit
+	;jmp         d2exit changes made because of problems with nasm 0.98.23
+
+
+d2is10:
+	test      eax, eax
+	je        near d2is01
+	test      edx, edx
+	jne       near d2is01
+
+ 
+	mov       eax, [ebp+8] ; blk1
+	mov       ebx, [ebp+12] ; blk1
+
+	pxor	  mm6, mm6    ; mm6 = 0 and isn't changed anyplace in the loop..
+	pcmpeqw	  mm1, mm1
+	psubw	  mm6, mm1
+	jmp		  d2top10
+	
+align 32
+d2top10:
+	movq	  mm0, [eax]
+	movq	  mm1, mm0
+	punpcklbw mm0, mm7
+	punpckhbw mm1, mm7
+	movq	  mm2, [eax+1]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	paddw	  mm0, mm6   ; here we add mm6 = 0.... weird...
+	paddw	  mm1, mm6
+	psrlw	  mm0, 1
+	psrlw	  mm1, 1
+
+	movq	  mm2, [ebx]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+
+        psubw     mm0, mm2
+        psubw     mm1, mm3
+        pmaddwd   mm0, mm0
+        pmaddwd   mm1, mm1
+        paddd     mm0, mm1
+
+	movq	  mm1, [eax+8]
+	movq	  mm2, mm1
+	punpcklbw mm1, mm7
+	punpckhbw mm2, mm7
+	movq	  mm3, [eax+9]
+	movq	  mm4, mm3
+	punpcklbw mm3, mm7
+	punpckhbw mm4, mm7
+	paddw	  mm1, mm3
+	paddw	  mm2, mm4
+	paddw	  mm1, mm6
+	paddw	  mm2, mm6
+	psrlw	  mm1, 1
+	psrlw	  mm2, 1
+
+	movq	  mm3, [ebx+8]
+	movq	  mm4, mm3
+        punpcklbw mm3, mm7
+        punpckhbw mm4, mm7
+
+        psubw     mm1, mm3
+        psubw     mm2, mm4
+        pmaddwd   mm1, mm1
+        pmaddwd   mm2, mm2
+        paddd     mm1, mm2
+
+
+	paddd	  mm0, mm1
+		; Accumulate mm0 sum on edx... we'll use mm5 for this and add up at the end
+		; movd	  ecx, mm0
+        ; add       edx, ecx
+		; psrlq	  mm0, 32
+		; movd	  ecx, mm0
+        ; add       edx, ecx
+	paddd     mm5, mm0
+	add       eax, esi
+	add       ebx, esi
+	dec       edi
+	jg        near d2top10
+	
+	
+	jmp       near d2exit
+	;jmp       d2exit   changes made because of problems with nasm 0.98.23
+
+d2is01:
+	test      eax, eax
+	jne       near d2is11
+	test      edx, edx
+	je        near d2is11
+
+	mov       eax, [ebp+8] ; blk1
+	mov       edx, [ebp+12] ; blk2
+	mov       ebx, eax
+	add       ebx, esi ;  blk1 + lx
+
+	pxor	  mm6, mm6
+	pcmpeqw	  mm1, mm1
+	psubw	  mm6, mm1  ; mm6 = 1
+	jmp		  d2top01
+	
+align 32
+d2top01:
+	movq	  mm0, [eax]
+	movq	  mm1, mm0
+	punpcklbw mm0, mm7
+	punpckhbw mm1, mm7
+	movq	  mm2, [ebx]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	paddw	  mm0, mm6
+	paddw	  mm1, mm6
+	psrlw	  mm0, 1
+	psrlw	  mm1, 1
+
+	movq	  mm2, [edx]
+	movq	  mm3, mm2
+    punpcklbw mm2, mm7
+    punpckhbw mm3, mm7
+
+    psubw     mm0, mm2
+    psubw     mm1, mm3
+
+	pmaddwd   mm0, mm0
+    pmaddwd   mm1, mm1
+    paddd     mm0, mm1
+
+	movq	  mm1, [eax+8]
+	movq	  mm2, mm1
+	punpcklbw mm1, mm7
+	punpckhbw mm2, mm7
+	
+	movq	  mm3, [ebx+8]
+	movq	  mm4, mm3
+	punpcklbw mm3, mm7
+	punpckhbw mm4, mm7
+
+	paddw	  mm1, mm3
+	paddw	  mm2, mm4
+	paddw	  mm1, mm6
+	paddw	  mm2, mm6
+	psrlw	  mm1, 1
+	psrlw	  mm2, 1
+
+	movq	  mm3, [edx+8]
+	movq	  mm4, mm3
+    punpcklbw mm3, mm7
+    punpckhbw mm4, mm7
+
+    psubw     mm1, mm3
+    psubw     mm2, mm4
+
+	pmaddwd   mm1, mm1
+    pmaddwd   mm2, mm2
+    paddd     mm0, mm1
+	paddd	  mm0, mm2
+
+	paddd     mm5, mm0
+
+	mov       eax, ebx    ; eax = eax + lx
+	add       edx, esi    ; edx = edx + lx
+	add       ebx, esi    ; ebx = ebx + lx
+	dec       edi
+	jg        near d2top01
+	jmp       near d2exit
+	;jmp       d2exit    changes made because of problems with nasm 0.98.23
+
+d2is11:
+	mov       eax, [ebp+8] ; blk1
+	mov       edx, [ebp+12] ; blk2
+	mov       ebx, eax  ;  blk1
+	add       ebx, esi  ; ebx = blk1 + lx
+	jmp		  d2top11
+	
+align 32
+d2top11:
+	movq	  mm0, [eax]
+	movq	  mm1, mm0
+	punpcklbw mm0, mm7
+	punpckhbw mm1, mm7
+	movq	  mm2, [eax+1]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	movq	  mm2, [ebx]
+	movq	  mm3, mm2
+	punpcklbw mm2, mm7
+	punpckhbw mm3, mm7
+	movq	  mm4, [ebx+1]
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm6, mm7
+	paddw	  mm2, mm4
+	paddw	  mm3, mm6
+	paddw	  mm0, mm2
+	paddw	  mm1, mm3
+	movq      mm6, [twos]
+	paddw	  mm0, mm6    ; round mm0
+	paddw	  mm1, mm6    ; round mm1
+	psrlw	  mm0, 2
+	psrlw	  mm1, 2
+
+	movq	  mm2, [edx]
+	movq	  mm3, mm2
+        punpcklbw mm2, mm7
+        punpckhbw mm3, mm7
+
+        psubw     mm0, mm2
+        psubw     mm1, mm3
+        pmaddwd   mm0, mm0
+        pmaddwd   mm1, mm1
+        paddd     mm0, mm1
+
+	movq	  mm1, [eax+8]
+	movq	  mm2, mm1
+	punpcklbw mm1, mm7
+	punpckhbw mm2, mm7
+	
+	movq	  mm3, [eax+9]
+	movq	  mm4, mm3
+	punpcklbw mm3, mm7
+	punpckhbw mm4, mm7
+	
+	paddw	  mm1, mm3
+	paddw	  mm2, mm4
+	
+	movq	  mm3, [ebx+8]
+	movq	  mm4, mm3
+	punpcklbw mm3, mm7
+	punpckhbw mm4, mm7
+	paddw     mm1, mm3
+	paddw     mm2, mm4 
+	
+	movq	  mm3, [ebx+9]
+	movq	  mm4, mm3
+	punpcklbw mm3, mm7
+	punpckhbw mm4, mm7
+
+	paddw	  mm1, mm3
+	paddw	  mm2, mm4
+
+	movq      mm6, [twos]
+	paddw     mm1, mm6
+	paddw	  mm2, mm6
+	
+	psrlw	  mm1, 2
+	psrlw	  mm2, 2
+
+	movq	  mm3, [edx+8]
+	movq	  mm4, mm3
+        punpcklbw mm3, mm7
+        punpckhbw mm4, mm7
+
+        psubw     mm1, mm3
+        psubw     mm2, mm4
+        pmaddwd   mm1, mm1
+        pmaddwd   mm2, mm2
+        paddd     mm1, mm2
+
+	paddd	  mm0, mm1
+	
+	;;
+	;; Accumulate the result in "s" we use mm6 for the purpose...
+	paddd     mm5, mm0
+
+	mov       eax, ebx    ; ahem ebx = eax at start of loop and wasn't changed...
+	add       ebx, esi   
+	add       edx, esi
+	dec       edi
+	jg        near d2top11
+
+
+d2exit:
+	;; Put the final sum in eax for return...
+	movd	  eax, mm5
+	psrlq	  mm5, 32
+	movd	  ecx, mm5
+    add       eax, ecx
+
+	pop edi
+	pop esi
+	pop edx
+	pop ecx
+	pop ebx
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret	
+
+
+; total squared difference between two (8*h) blocks
+; blk1,blk2: addresses of top left pels of both blocks
+; lx:        distance (in bytes) of vertically adjacent pels
+; h:         height of block (usually 4, or 8)
+; mmX version
+
+global sumsq_sub22_mmx
+; int sumsq_sub22_mmx(unsigned char *blk1, unsigned char *blk2,
+;                 int lx, int h)
+
+; mm7 = 0
+
+; eax = pblk1 
+; ebx = pblk2
+; ecx = temp
+; edx = distance_sum
+; edi = h
+; esi = lx
+
+align 32
+sumsq_sub22_mmx:
+	push ebp			; save frame pointer
+	mov ebp, esp		; link
+	push ebx
+	push ecx
+	push edx
+	push esi     
+	push edi
+
+	mov		esi, [ebp+16] ; lx
+	mov     edi, [ebp+20] ; h
+
+    pxor      mm5, mm5      ; sum
+	test      edi, edi     ; h = 0?
+	jle       near d2exit
+
+	pxor	  mm7, mm7     ; get zeros i mm7
+
+	mov       eax, [ebp+8]		; blk1
+    mov       ebx, [ebp+12]		; blk2
+	jmp 	 d2top22
+
+align 32
+d2top22:
+        movq      mm0, [eax]
+        movq      mm1, mm0
+        punpcklbw mm0, mm7
+        punpckhbw mm1, mm7
+
+        movq      mm2, [ebx]
+        movq      mm3, mm2
+        punpcklbw mm2, mm7
+        punpckhbw mm3, mm7
+
+        psubw     mm0, mm2
+        psubw     mm1, mm3
+        pmaddwd   mm0, mm0
+        pmaddwd   mm1, mm1
+        paddd     mm5, mm0
+		paddd    mm5, mm1
+
+			add       eax, esi
+	add       ebx, esi
+	dec       edi
+	jg        d2top22
+	jmp       d2exit
+
+
+; total squared difference between interpolation of two (8*h) blocks and
+; another 8*h block		
+; blk1,blk2: addresses of top left pels of both blocks
+; lx:        distance (in bytes) of vertically adjacent pels
+; h:         height of block (usually 4, or 8)
+; mmX version
+		
+global bsumsq_sub22_mmx
+; int bsumsq_sub22_mmx(unsigned char *blk1f, unsigned char*blk1b,
+;				   unsigned char *blk2,
+;                 int lx, int h)
+
+; mm7 = 0
+
+; eax = pblk1f 
+; ebx = pblk2
+; ecx = pblk1b
+; edx = distance_sum
+; edi = h
+; esi = lx
+
+align 32
+bsumsq_sub22_mmx:
+	push ebp			; save frame pointer
+	mov ebp, esp		; link
+	push ebx
+	push ecx
+	push edx
+	push esi     
+	push edi
+
+	mov		esi, [ebp+20] ; lx
+	mov     edi, [ebp+24] ; h
+
+    pxor      mm5, mm5      ; sum
+	test      edi, edi     ; h = 0?
+	jle       near d2exit
+
+	pxor	  mm7, mm7     ; get zeros i mm7
+
+	mov       eax, [ebp+8]		; blk1f
+    mov       ebx, [ebp+12]		; blk1b
+    mov       ecx, [ebp+16]		; blk2		
+	jmp 	 bd2top22
+
+align 32
+bd2top22:
+        movq      mm0, [eax]
+        movq      mm1, mm0
+		movq      mm4, [ebx]
+		movq      mm6, mm4
+        punpcklbw mm0, mm7
+        punpckhbw mm1, mm7
+		punpcklbw mm4, mm7
+		punpckhbw mm6, mm7
+
+        movq      mm2, [ecx]
+        movq      mm3, mm2
+        punpcklbw mm2, mm7
+        punpckhbw mm3, mm7
+
+		paddw	  mm0, mm4
+		psrlw     mm0, 1
+        psubw     mm0, mm2
+        pmaddwd   mm0, mm0
+		paddw     mm1, mm6
+		psrlw	  mm1, 1
+        psubw     mm1, mm3
+        pmaddwd   mm1, mm1
+        paddd     mm5, mm0
+		paddd    mm5, mm1
+
+		add       eax, esi
+		add       ebx, esi
+		add		  ecx, esi
+		dec       edi
+		jg        bd2top22
+		jmp       d2exit
+
+
+global variance_mmx
+		
+;;;  variance of a (size*size) block, multiplied by 256
+;;; p:  address of top left pel of block
+;;; lx: seperation (in bytes) of vertically adjacent pels
+;;; NOTE:		 size  is 8 or 16
+
+;;; int variance_mmx(uint8_t *p, int size,	int lx)
+
+		
+variance_mmx:
+	push ebp			; save frame pointer
+	mov ebp, esp		; link
+	push ebx
+	push ecx
+	push edx
+
+	mov     eax, 0              ; col offset...
+	mov		edx, [ebp+16]       ; lx
+
+	pxor    mm0, mm0
+	pxor    mm7, mm7			; Zero sum accumulator (4 words) 
+	pxor    mm6, mm6			; Zero squares accumulator (2 dwords)
+varcols:
+	mov		ebx, [ebp+8]		; p
+	mov     ecx, [ebp+12]		; size
+varrows:		
+	
+	movq		mm2, [ebx+eax]
+	movq		mm3, mm2 
+	punpcklbw   mm2, mm0
+	punpckhbw   mm3, mm0
+	
+	movq		mm4, mm2
+	movq		mm5, mm3
+
+	pmaddwd		mm4, mm2		;  Squares 0:3
+	paddw		mm7, mm2		; Accumulate sum 0:3 (words)
+	paddd		mm6, mm4		; Accumulate sum squares 0:3 (dwords)
+
+	pmaddwd		mm5, mm3		;  Squares 4:7 (words)
+	paddw		mm7, mm3		; Accumulate sum 0:3 (words)
+	paddd		mm6, mm5		; Accumulate sum squares 4:7 (dwords)
+
+	
+	add			ebx, edx		; Next row
+	dec			ecx
+	jnz			varrows
+
+	add			eax, 8
+	cmp			eax, [ebp+12]
+	jl			varcols
+
+		;; Sum squared -> eax
+
+	movq		mm1, mm7
+	psrlq		mm1, 32
+	paddw		mm7, mm1
+	movq		mm1, mm7
+	psrlq		mm1, 16
+	paddw		mm7, mm1
+	movd		eax, mm7
+	and			eax, 0xffff
+	imul		eax, eax
+
+		;; Squares sum -> ecx
+
+	movq		mm1, mm6
+	psrlq		mm1, 32
+	paddd		mm6, mm1
+	movd		ecx, mm6
+	
+	mov			ebx, [ebp+12]
+	shr			eax, 6			; Divide sum squared by 64 for 8*8
+	cmp			ebx, 8
+	jz			var8_8			; If 16 * 16 divide again by 4 (256)
+	shr			eax, 2
+var8_8:			
+
+	sub			ecx, eax
+	mov			eax, ecx
+
+	pop edx
+	pop ecx
+	pop ebx
+
+	pop ebp			; restore stack pointer
+
+	emms			; clear mmx registers
+	ret	
diff --git a/veejay-current/utils/mjpeg_logging.c b/veejay-current/utils/mjpeg_logging.c
new file mode 100644
index 00000000..891647fa
--- /dev/null
+++ b/veejay-current/utils/mjpeg_logging.c
@@ -0,0 +1,218 @@
+/*
+    $Id: mjpeg_logging.c,v 1.1.1.1 2004/10/27 23:48:57 niels Exp $
+
+    Copyright (C) 2000 Herbert Valerio Riedel <hvr@gnu.org>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+extern int fred;
+#include "mjpeg_logging.h"
+
+static const char _rcsid[] = "$Id: ";
+
+#define MAX_DEFAULT_ID_SIZE 16
+#define DEFAULT_DEFAULT_ID "???"
+
+#ifdef HAVE___PROGNAME
+extern const char *__progname;
+#endif
+
+static log_level_t mjpeg_log_verbosity = 0;
+static char default_handler_id[MAX_DEFAULT_ID_SIZE];
+static char default_handler_id_is_set = 0;
+
+static int default_mjpeg_log_filter( log_level_t level )
+{
+  int verb_from_env;
+  if( mjpeg_log_verbosity == 0 )
+    {
+      char *mjpeg_verb_env = getenv("MJPEG_VERBOSITY");
+      if( mjpeg_verb_env != NULL )
+        {
+          verb_from_env = LOG_WARN-atoi(mjpeg_verb_env);
+          if( verb_from_env >= LOG_DEBUG && verb_from_env <= LOG_ERROR )
+            mjpeg_log_verbosity = verb_from_env;
+        }
+    }
+  return (level < LOG_WARN && level < mjpeg_log_verbosity);
+}
+
+static mjpeg_log_filter_t _filter = default_mjpeg_log_filter;
+
+static void
+default_mjpeg_log_handler(log_level_t level, const char message[])
+{
+  if( (*_filter)( level ) )
+    return;
+  switch(level) {
+  case LOG_ERROR:
+    fprintf(stderr, "E: %s\n", message);
+    break;
+  case LOG_DEBUG:
+    fprintf(stderr, "D: %s\n", message);
+    break;
+  case LOG_WARN:
+    fprintf(stderr, "W: %s\n", message);
+    break;
+  case LOG_INFO:
+    fprintf(stderr, "I: %s\n", message);
+    break;
+  default:
+    assert(0);
+  }
+}
+
+static mjpeg_log_handler_t _handler = default_mjpeg_log_handler;
+
+
+mjpeg_log_handler_t
+mjpeg_log_set_handler(mjpeg_log_handler_t new_handler)
+{
+  mjpeg_log_handler_t old_handler = _handler;
+
+  _handler = new_handler;
+
+  return old_handler;
+}
+
+/***************
+ *
+ * Set default log handlers degree of verboseity.
+ * 0 = quiet, 1 = info, 2 = debug
+ *
+ *************/
+
+int
+mjpeg_default_handler_verbosity(int verbosity)
+{
+  int prev_verb = mjpeg_log_verbosity;
+  mjpeg_log_verbosity = LOG_WARN - verbosity;
+  return prev_verb;
+}
+
+/*
+ * Set identifier string used by default handler
+ *
+ */
+int
+mjpeg_default_handler_identifier(const char *new_id)
+{
+  const char *s;
+  if (new_id == NULL) {
+    default_handler_id_is_set = 0;
+    return 0;
+  }
+  /* find basename of new_id (remove any directory prefix) */
+  if ((s = strrchr(new_id, '/')) == NULL)
+    s = new_id;
+  else
+    s = s + 1;
+  strncpy(default_handler_id, s, MAX_DEFAULT_ID_SIZE);
+  default_handler_id[MAX_DEFAULT_ID_SIZE-1] = '\0';
+  default_handler_id_is_set = 1;
+  return 0;
+}
+
+
+static void
+mjpeg_logv(log_level_t level, const char format[], va_list args)
+{
+  char buf[1024] = { 0, };
+
+  /* TODO: Original had a re-entrancy error trap to assist bug
+     finding.  To make this work with multi-threaded applications a
+     lock is needed hence delete.
+  */
+
+  
+  vsnprintf(buf, sizeof(buf)-1, format, args);
+
+  _handler(level, buf);
+}
+
+void
+mjpeg_log(log_level_t level, const char format[], ...)
+{
+  va_list args;
+  va_start (args, format);
+  mjpeg_logv(level, format, args);
+  va_end (args);
+}
+
+void
+mjpeg_debug(const char format[], ...)
+{
+  va_list args;
+  va_start (args, format);
+  mjpeg_logv(LOG_DEBUG, format, args);
+  va_end (args);
+}
+
+void
+mjpeg_info(const char format[], ...)
+{
+  va_list args;
+  va_start (args, format);
+  mjpeg_logv(LOG_INFO, format, args);
+  va_end (args);
+}
+
+void
+mjpeg_warn(const char format[], ...)
+{
+  va_list args;
+  va_start (args, format);
+  mjpeg_logv(LOG_WARN, format, args);
+  va_end (args);
+}
+
+void
+mjpeg_error(const char format[], ...)
+{
+  va_list args;
+  va_start (args, format);
+  mjpeg_logv(LOG_ERROR, format, args);
+  va_end (args);
+}
+
+void
+mjpeg_error_exit1(const char format[], ...)
+{
+  va_list args;
+  va_start( args, format );
+  mjpeg_logv( LOG_ERROR, format, args);
+  va_end(args);           
+  exit(EXIT_FAILURE);
+}
+
+
+/* 
+ * Local variables:
+ *  c-file-style: "gnu"
+ *  tab-width: 8
+ *  indent-tabs-mode: nil
+ * End:
+ */
diff --git a/veejay-current/utils/mjpeg_logging.h b/veejay-current/utils/mjpeg_logging.h
new file mode 100644
index 00000000..7b1d5d89
--- /dev/null
+++ b/veejay-current/utils/mjpeg_logging.h
@@ -0,0 +1,79 @@
+/*
+    $Id: mjpeg_logging.h,v 1.1.1.1 2004/10/27 23:48:57 niels Exp $
+
+    Copyright (C) 2000 Herbert Valerio Riedel <hvr@gnu.org>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#ifndef __MJPEG_LOGGING_H__
+#define __MJPEG_LOGGING_H__
+
+#include "mjpeg_types.h"
+
+typedef enum {
+  LOG_DEBUG = 1,
+  LOG_INFO,
+  LOG_WARN,
+  LOG_ERROR
+} log_level_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void
+mjpeg_log(log_level_t level, const char format[], ...) GNUC_PRINTF(2, 3);
+
+typedef int(*mjpeg_log_filter_t)(log_level_t level);
+    
+typedef void(*mjpeg_log_handler_t)(log_level_t level, const char message[]);
+
+mjpeg_log_handler_t
+mjpeg_log_set_handler(mjpeg_log_handler_t new_handler);
+
+int
+mjpeg_default_handler_identifier(const char *new_id);
+
+int
+mjpeg_default_handler_verbosity(int verbosity);
+
+void
+mjpeg_debug(const char format[], ...) GNUC_PRINTF(1,2);
+
+void
+mjpeg_info(const char format[], ...) GNUC_PRINTF(1,2);
+
+void
+mjpeg_warn(const char format[], ...) GNUC_PRINTF(1,2);
+
+void
+mjpeg_error(const char format[], ...) GNUC_PRINTF(1,2);
+
+void
+mjpeg_error_exit1(const char format[], ...) GNUC_PRINTF(1,2);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __MJPEG_LOGGING_H__ */
+
+
+/* 
+ * Local variables:
+ *  c-file-style: "gnu"
+ *  tab-width: 8
+ *  indent-tabs-mode: nil
+ * End:
+ */
diff --git a/veejay-current/utils/mjpeg_types.h b/veejay-current/utils/mjpeg_types.h
new file mode 100644
index 00000000..b40d24ab
--- /dev/null
+++ b/veejay-current/utils/mjpeg_types.h
@@ -0,0 +1,109 @@
+/*
+    $Id: mjpeg_types.h,v 1.1.1.1 2004/10/27 23:48:57 niels Exp $
+
+    Copyright (C) 2000 Herbert Valerio Riedel <hvr@gnu.org>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#ifndef __MJPEG_TYPES_H__
+#define __MJPEG_TYPES_H__
+#include <config.h>
+
+#if defined(HAVE_STDINT_H)
+# include <stdint.h>
+#elif defined(HAVE_INTTYPES_H)
+# include <inttypes.h>
+#elif defined(__CYGWIN__)
+# include <sys/types.h>
+typedef u_int8_t uint8_t;
+typedef u_int16_t uint16_t;
+typedef u_int32_t uint32_t;
+typedef u_int64_t uint64_t;
+# define INT8_C(c)     c
+# define INT16_C(c)    c
+# define INT32_C(c)    c
+# define INT64_C(c)    c ## LL
+# define UINT8_C(c)    c ## U
+# define UINT16_C(c)   c ## U
+# define UINT32_C(c)   c ## U
+# define UINT64_C(c)   c ## ULL
+#else
+/* warning ISO/IEC 9899:1999 <stdint.h> was missing and even <inttypes.h> */
+/* fixme */
+#endif /* HAVE_STDINT_H */
+
+#if defined(__FreeBSD__)
+#include <sys/types.h> /* FreeBSD - ssize_t */
+#endif
+
+#if defined(HAVE_STDBOOL_H) && !defined(__cplusplus)
+#include <stdbool.h>
+#else
+/* ISO/IEC 9899:1999 <stdbool.h> missing -- enabling workaround */
+
+# ifndef __cplusplus
+typedef enum
+  {
+    false = 0,
+    true = 1
+  } locBool;
+
+#  define false   false
+#  define true    true
+#  define bool locBool
+# endif
+#endif
+
+#ifndef PRId64
+#define PRId64 PRID64_STRING_FORMAT
+#endif
+
+#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
+#define GNUC_PRINTF( format_idx, arg_idx )    \
+  __attribute__((format (printf, format_idx, arg_idx)))
+#define GNUC_SCANF( format_idx, arg_idx )     \
+  __attribute__((format (scanf, format_idx, arg_idx)))
+#define GNUC_FORMAT( arg_idx )                \
+  __attribute__((format_arg (arg_idx)))
+#define GNUC_NORETURN                         \
+  __attribute__((noreturn))
+#define GNUC_CONST                            \
+  __attribute__((const))
+#define GNUC_UNUSED                           \
+  __attribute__((unused))
+#define GNUC_PACKED                           \
+  __attribute__((packed))
+#else   /* !__GNUC__ */
+#define GNUC_PRINTF( format_idx, arg_idx )
+#define GNUC_SCANF( format_idx, arg_idx )
+#define GNUC_FORMAT( arg_idx )
+#define GNUC_NORETURN
+#define GNUC_CONST
+#define GNUC_UNUSED
+#define GNUC_PACKED
+#endif  /* !__GNUC__ */
+
+
+#endif /* __MJPEG_TYPES_H__ */
+
+
+/* 
+ * Local variables:
+ *  c-file-style: "gnu"
+ *  tab-width: 8
+ *  indent-tabs-mode: nil
+ * End:
+ */
diff --git a/veejay-current/utils/mmx.h b/veejay-current/utils/mmx.h
new file mode 100644
index 00000000..87c2c6ae
--- /dev/null
+++ b/veejay-current/utils/mmx.h
@@ -0,0 +1,271 @@
+/*
+ * mmx.h
+ * Copyright (C) 1997-1999 H. Dietz and R. Fisher
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "attributes.h"
+
+/*
+ * The type of an value that fits in an MMX register (note that long
+ * long constant values MUST be suffixed by LL and unsigned long long
+ * values by ULL, lest they be truncated by the compiler)
+ */
+
+typedef	union {
+	int64_t  		q;	/* Quadword (64-bit) value */
+	uint64_t          	uq;	/* Unsigned Quadword */
+	int32_t                 d[2];	/* 2 Doubleword (32-bit) values */
+	uint32_t    		ud[2];	/* 2 Unsigned Doubleword */
+	int16_t         	w[4];	/* 4 Word (16-bit) values */
+	uint16_t      		uw[4];	/* 4 Unsigned Word */
+	int8_t          	b[8];	/* 8 Byte (8-bit) values */
+	uint8_t      		ub[8];	/* 8 Unsigned Byte */
+	float			s[2];	/* Single-precision (32-bit) value */
+} ATTR_ALIGN(8) mmx_t;	/* On an 8-byte (64-bit) boundary */
+
+
+#define	mmx_si2r(op,imm,reg) \
+	__asm__ __volatile__ (#op " %0, %%" #reg \
+			      : /* nothing */ \
+			      : "J" (imm) )
+
+#define	mmx_m2r(op,mem,reg) \
+	__asm__ __volatile__ (#op " %0, %%" #reg \
+			      : /* nothing */ \
+			      : "X" (mem))
+
+#define	mmx_r2m(op,reg,mem) \
+	__asm__ __volatile__ (#op " %%" #reg ", %0" \
+			      : "=X" (mem) \
+			      : /* nothing */ )
+
+#define	mmx_r2r(op,regs,regd) \
+	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
+
+
+#define	emms() __asm__ __volatile__ ("emms")
+#define	femms() __asm__ __volatile__ ("femms")
+
+#define	movd_m2r(var,reg)	mmx_m2r (movd, var, reg)
+#define	movd_r2m(reg,var)	mmx_r2m (movd, reg, var)
+#define	movd_r2r(regs,regd)	mmx_r2r (movd, regs, regd)
+
+#define	movq_m2r(var,reg)	mmx_m2r (movq, var, reg)
+#define	movq_r2m(reg,var)	mmx_r2m (movq, reg, var)
+#define	movq_r2r(regs,regd)	mmx_r2r (movq, regs, regd)
+
+#define	packssdw_m2r(var,reg)	mmx_m2r (packssdw, var, reg)
+#define	packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
+#define	packsswb_m2r(var,reg)	mmx_m2r (packsswb, var, reg)
+#define	packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
+
+#define	packuswb_m2r(var,reg)	mmx_m2r (packuswb, var, reg)
+#define	packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
+
+#define	paddb_m2r(var,reg)	mmx_m2r (paddb, var, reg)
+#define	paddb_r2r(regs,regd)	mmx_r2r (paddb, regs, regd)
+#define	paddd_m2r(var,reg)	mmx_m2r (paddd, var, reg)
+#define	paddd_r2r(regs,regd)	mmx_r2r (paddd, regs, regd)
+#define	paddw_m2r(var,reg)	mmx_m2r (paddw, var, reg)
+#define	paddw_r2r(regs,regd)	mmx_r2r (paddw, regs, regd)
+
+#define	paddsb_m2r(var,reg)	mmx_m2r (paddsb, var, reg)
+#define	paddsb_r2r(regs,regd)	mmx_r2r (paddsb, regs, regd)
+#define	paddsw_m2r(var,reg)	mmx_m2r (paddsw, var, reg)
+#define	paddsw_r2r(regs,regd)	mmx_r2r (paddsw, regs, regd)
+
+#define	paddusb_m2r(var,reg)	mmx_m2r (paddusb, var, reg)
+#define	paddusb_r2r(regs,regd)	mmx_r2r (paddusb, regs, regd)
+#define	paddusw_m2r(var,reg)	mmx_m2r (paddusw, var, reg)
+#define	paddusw_r2r(regs,regd)	mmx_r2r (paddusw, regs, regd)
+
+#define	pand_m2r(var,reg)	mmx_m2r (pand, var, reg)
+#define	pand_r2r(regs,regd)	mmx_r2r (pand, regs, regd)
+
+#define	pandn_m2r(var,reg)	mmx_m2r (pandn, var, reg)
+#define	pandn_r2r(regs,regd)	mmx_r2r (pandn, regs, regd)
+
+#define	pcmpeqb_m2r(var,reg)	mmx_m2r (pcmpeqb, var, reg)
+#define	pcmpeqb_r2r(regs,regd)	mmx_r2r (pcmpeqb, regs, regd)
+#define	pcmpeqd_m2r(var,reg)	mmx_m2r (pcmpeqd, var, reg)
+#define	pcmpeqd_r2r(regs,regd)	mmx_r2r (pcmpeqd, regs, regd)
+#define	pcmpeqw_m2r(var,reg)	mmx_m2r (pcmpeqw, var, reg)
+#define	pcmpeqw_r2r(regs,regd)	mmx_r2r (pcmpeqw, regs, regd)
+
+#define	pcmpgtb_m2r(var,reg)	mmx_m2r (pcmpgtb, var, reg)
+#define	pcmpgtb_r2r(regs,regd)	mmx_r2r (pcmpgtb, regs, regd)
+#define	pcmpgtd_m2r(var,reg)	mmx_m2r (pcmpgtd, var, reg)
+#define	pcmpgtd_r2r(regs,regd)	mmx_r2r (pcmpgtd, regs, regd)
+#define	pcmpgtw_m2r(var,reg)	mmx_m2r (pcmpgtw, var, reg)
+#define	pcmpgtw_r2r(regs,regd)	mmx_r2r (pcmpgtw, regs, regd)
+
+#define	pmaddwd_m2r(var,reg)	mmx_m2r (pmaddwd, var, reg)
+#define	pmaddwd_r2r(regs,regd)	mmx_r2r (pmaddwd, regs, regd)
+
+#define	pmulhw_m2r(var,reg)	mmx_m2r (pmulhw, var, reg)
+#define	pmulhw_r2r(regs,regd)	mmx_r2r (pmulhw, regs, regd)
+
+#define	pmullw_m2r(var,reg)	mmx_m2r (pmullw, var, reg)
+#define	pmullw_r2r(regs,regd)	mmx_r2r (pmullw, regs, regd)
+
+#define	por_m2r(var,reg)	mmx_m2r (por, var, reg)
+#define	por_r2r(regs,regd)	mmx_r2r (por, regs, regd)
+
+#define	pslld_i2r(imm,reg)	mmx_si2r (pslld, imm, reg)
+#define	pslld_m2r(var,reg)	mmx_m2r (pslld, var, reg)
+#define	pslld_r2r(regs,regd)	mmx_r2r (pslld, regs, regd)
+#define	psllq_i2r(imm,reg)	mmx_si2r (psllq, imm, reg)
+#define	psllq_m2r(var,reg)	mmx_m2r (psllq, var, reg)
+#define	psllq_r2r(regs,regd)	mmx_r2r (psllq, regs, regd)
+#define	psllw_i2r(imm,reg)	mmx_si2r (psllw, imm, reg)
+#define	psllw_m2r(var,reg)	mmx_m2r (psllw, var, reg)
+#define	psllw_r2r(regs,regd)	mmx_r2r (psllw, regs, regd)
+
+#define	psrad_i2r(imm,reg)	mmx_si2r (psrad, imm, reg)
+#define	psrad_m2r(var,reg)	mmx_m2r (psrad, var, reg)
+#define	psrad_r2r(regs,regd)	mmx_r2r (psrad, regs, regd)
+#define	psraw_i2r(imm,reg)	mmx_si2r (psraw, imm, reg)
+#define	psraw_m2r(var,reg)	mmx_m2r (psraw, var, reg)
+#define	psraw_r2r(regs,regd)	mmx_r2r (psraw, regs, regd)
+
+#define	psrld_i2r(imm,reg)	mmx_si2r (psrld, imm, reg)
+#define	psrld_m2r(var,reg)	mmx_m2r (psrld, var, reg)
+#define	psrld_r2r(regs,regd)	mmx_r2r (psrld, regs, regd)
+#define	psrlq_i2r(imm,reg)	mmx_si2r (psrlq, imm, reg)
+#define	psrlq_m2r(var,reg)	mmx_m2r (psrlq, var, reg)
+#define	psrlq_r2r(regs,regd)	mmx_r2r (psrlq, regs, regd)
+#define	psrlw_i2r(imm,reg)	mmx_si2r (psrlw, imm, reg)
+#define	psrlw_m2r(var,reg)	mmx_m2r (psrlw, var, reg)
+#define	psrlw_r2r(regs,regd)	mmx_r2r (psrlw, regs, regd)
+
+#define	psubb_m2r(var,reg)	mmx_m2r (psubb, var, reg)
+#define	psubb_r2r(regs,regd)	mmx_r2r (psubb, regs, regd)
+#define	psubd_m2r(var,reg)	mmx_m2r (psubd, var, reg)
+#define	psubd_r2r(regs,regd)	mmx_r2r (psubd, regs, regd)
+#define	psubw_m2r(var,reg)	mmx_m2r (psubw, var, reg)
+#define	psubw_r2r(regs,regd)	mmx_r2r (psubw, regs, regd)
+
+#define	psubsb_m2r(var,reg)	mmx_m2r (psubsb, var, reg)
+#define	psubsb_r2r(regs,regd)	mmx_r2r (psubsb, regs, regd)
+#define	psubsw_m2r(var,reg)	mmx_m2r (psubsw, var, reg)
+#define	psubsw_r2r(regs,regd)	mmx_r2r (psubsw, regs, regd)
+
+#define	psubusb_m2r(var,reg)	mmx_m2r (psubusb, var, reg)
+#define	psubusb_r2r(regs,regd)	mmx_r2r (psubusb, regs, regd)
+#define	psubusw_m2r(var,reg)	mmx_m2r (psubusw, var, reg)
+#define	psubusw_r2r(regs,regd)	mmx_r2r (psubusw, regs, regd)
+
+#define	punpckhbw_m2r(var,reg)		mmx_m2r (punpckhbw, var, reg)
+#define	punpckhbw_r2r(regs,regd)	mmx_r2r (punpckhbw, regs, regd)
+#define	punpckhdq_m2r(var,reg)		mmx_m2r (punpckhdq, var, reg)
+#define	punpckhdq_r2r(regs,regd)	mmx_r2r (punpckhdq, regs, regd)
+#define	punpckhwd_m2r(var,reg)		mmx_m2r (punpckhwd, var, reg)
+#define	punpckhwd_r2r(regs,regd)	mmx_r2r (punpckhwd, regs, regd)
+
+#define	punpcklbw_m2r(var,reg) 		mmx_m2r (punpcklbw, var, reg)
+#define	punpcklbw_r2r(regs,regd)	mmx_r2r (punpcklbw, regs, regd)
+#define	punpckldq_m2r(var,reg)		mmx_m2r (punpckldq, var, reg)
+#define	punpckldq_r2r(regs,regd)	mmx_r2r (punpckldq, regs, regd)
+#define	punpcklwd_m2r(var,reg)		mmx_m2r (punpcklwd, var, reg)
+#define	punpcklwd_r2r(regs,regd)	mmx_r2r (punpcklwd, regs, regd)
+
+#define	pxor_m2r(var, reg)	mmx_m2r (pxor, var, reg)
+#define	pxor_r2r(regs, regd)	mmx_r2r (pxor, regs, regd)
+
+/* AMD MMX extensions - also available in intel SSE */
+
+
+#define mmx_m2ri(op,mem,reg,imm) \
+        __asm__ __volatile__ (#op " %1, %0, %%" #reg \
+                              : /* nothing */ \
+                              : "X" (mem), "X" (imm))
+#define mmx_r2ri(op,regs,regd,imm) \
+        __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
+                              : /* nothing */ \
+                              : "X" (imm) )
+
+#define	mmx_fetch(mem,hint) \
+	__asm__ __volatile__ ("prefetch" #hint " %0" \
+			      : /* nothing */ \
+			      : "X" (mem))
+
+/* 3DNow goodies */
+#define pfmul_m2r(var,reg) mmx_m2r( pfmul, var, reg )
+#define pfmul_r2r(regs,regd) mmx_r2r( pfmul, regs, regd )
+#define pfadd_m2r(var,reg) mmx_m2r( pfadd, var, reg )
+#define pfadd_r2r(regs,regd) mmx_r2r( pfadd, regs, regd )
+#define pf2id_r2r(regs,regd) mmx_r2r( pf2id, regs, regd )
+#define pi2fd_r2r(regs,regd) mmx_r2r( pi2fd, regs, regd )
+
+/* SSE goodies */
+#define mulps_r2r( regs, regd) mmx_r2r( mulps, regs, regd )
+#define mulps_m2r( var, regd) mmx_m2r( mulps, var, regd )
+#define movups_r2r(reg1, reg2) mmx_r2r( movups, reg1, reg2 )
+#define movups_m2r(var, reg) mmx_m2r( movups, var, reg )
+#define movaps_m2r(var, reg) mmx_m2r( movaps, var, reg )
+#define movups_r2m(reg, var) mmx_r2m( movups, reg, var )
+#define movaps_r2m(reg, var) mmx_r2m( movaps, reg, var )
+#define cvtps2pi_r2r(regs,regd) mmx_r2r( cvtps2pi, regs, regd )
+#define cvtpi2ps_r2r(regs,regd) mmx_r2r( cvtpi2ps, regs, regd )
+#define shufps_r2ri(regs, regd, imm) mmx_r2ri( shufps, regs, regd, imm )
+
+
+#define	maskmovq(regs,maskreg)		mmx_r2ri (maskmovq, regs, maskreg)
+
+#define	movntq_r2m(mmreg,var)		mmx_r2m (movntq, mmreg, var)
+
+#define	pavgb_m2r(var,reg)		mmx_m2r (pavgb, var, reg)
+#define	pavgb_r2r(regs,regd)		mmx_r2r (pavgb, regs, regd)
+#define	pavgw_m2r(var,reg)		mmx_m2r (pavgw, var, reg)
+#define	pavgw_r2r(regs,regd)		mmx_r2r (pavgw, regs, regd)
+
+#define	pextrw_r2r(mmreg,reg,imm)	mmx_r2ri (pextrw, mmreg, reg, imm)
+
+#define	pinsrw_r2r(reg,mmreg,imm)	mmx_r2ri (pinsrw, reg, mmreg, imm)
+
+#define	pmaxsw_m2r(var,reg)		mmx_m2r (pmaxsw, var, reg)
+#define	pmaxsw_r2r(regs,regd)		mmx_r2r (pmaxsw, regs, regd)
+
+#define	pmaxub_m2r(var,reg)		mmx_m2r (pmaxub, var, reg)
+#define	pmaxub_r2r(regs,regd)		mmx_r2r (pmaxub, regs, regd)
+
+#define	pminsw_m2r(var,reg)		mmx_m2r (pminsw, var, reg)
+#define	pminsw_r2r(regs,regd)		mmx_r2r (pminsw, regs, regd)
+
+#define	pminub_m2r(var,reg)		mmx_m2r (pminub, var, reg)
+#define	pminub_r2r(regs,regd)		mmx_r2r (pminub, regs, regd)
+
+#define	pmovmskb(mmreg,reg) \
+	__asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg)
+
+#define	pmulhuw_m2r(var,reg)		mmx_m2r (pmulhuw, var, reg)
+#define	pmulhuw_r2r(regs,regd)		mmx_r2r (pmulhuw, regs, regd)
+
+#define	prefetcht0(mem)			mmx_fetch (mem, t0)
+#define	prefetcht1(mem)			mmx_fetch (mem, t1)
+#define	prefetcht2(mem)			mmx_fetch (mem, t2)
+#define	prefetchnta(mem)		mmx_fetch (mem, nta)
+
+#define	psadbw_m2r(var,reg)		mmx_m2r (psadbw, var, reg)
+#define	psadbw_r2r(regs,regd)		mmx_r2r (psadbw, regs, regd)
+
+#define	pshufw_m2ri(var,reg,imm)	mmx_m2ri( pshufw, var, reg, imm)
+#define	pshufw_r2ri(regs,regd,imm)	mmx_r2ri( pshufw, regs, regd, imm)
+
+#define	sfence() __asm__ __volatile__ ("sfence\n\t")
diff --git a/veejay-current/utils/motionsearch.c b/veejay-current/utils/motionsearch.c
new file mode 100644
index 00000000..cc44cafc
--- /dev/null
+++ b/veejay-current/utils/motionsearch.c
@@ -0,0 +1,1237 @@
+/* motion.c, motion estimation                                              */
+
+
+/* (C) 2000/2001 Andrew Stevens */
+
+/* These modifications are free software; you can redistribute it
+ *  and/or modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2 of
+ *  the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "math.h"
+#include "cpu_accel.h"
+#include "fastintfns.h"
+#include "motionsearch.h"
+#include "mjpeg_logging.h"
+
+
+#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM)
+
+#include "mblock_sub44_sads_x86.h"
+
+static int (*pmblocks_sub44_mests)( uint8_t *blk,  uint8_t *ref,
+							int ilow, int jlow,
+							int ihigh, int jhigh, 
+							int h, int rowstride, 
+							int threshold,
+							me_result_s *resvec);
+
+
+void mblock_sub22_nearest4_sads_mmxe(uint8_t *blk1,uint8_t *blk2,
+				     int frowstride,int fh, int* resvec) 
+                                     __asm__ ("mblock_sub22_nearest4_sads_mmxe");
+
+void mblock_nearest4_sads_mmxe(uint8_t *blk1, uint8_t *blk2, 
+			       int rowstride, int h, int *resvec)
+                               __asm__ ("mblock_nearest4_sads_mmxe");
+
+int sad_00_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h, int distlim) __asm__ ("sad_00_mmxe");
+int sad_01_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_01_mmxe");
+int sad_10_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_10_mmxe");
+int sad_11_mmxe(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_11_mmxe");
+
+
+int sad_sub22_mmxe ( uint8_t *blk1, uint8_t *blk2,  int frowstride, int fh) __asm__ ("sad_sub22_mmxe");
+int sad_sub44_mmxe ( uint8_t *blk1, uint8_t *blk2,  int qrowstride, int qh) __asm__ ("sad_sub44_mmxe");
+int sumsq_mmx( uint8_t *blk1, uint8_t *blk2,
+	       int rowstride, int hx, int hy, int h) __asm__ ("sumsq_mmx");
+int sumsq_sub22_mmx( uint8_t *blk1, uint8_t *blk2,
+		     int rowstride, int h) __asm__ ("sumsq_sub22_mmx");
+int bsumsq_sub22_mmx( uint8_t *blk1f, uint8_t *blk1b, 
+		      uint8_t *blk2,
+		      int rowstride, int h) __asm__ ("bsumsq_sub22_mmx");
+int bsumsq_mmx (uint8_t *pf, uint8_t *pb,
+		uint8_t *p2, int rowstride,
+		int hxf, int hyf, int hxb, int hyb, int h) __asm__ ("bsumsq_mmx");
+int bsad_mmx (uint8_t *pf, uint8_t *pb,
+	      uint8_t *p2, int rowstride,
+	      int hxf, int hyf, int hxb, int hyb, int h) __asm__ ("bsad_mmx");
+
+int variance_mmx( uint8_t *p, int size,	int rowstride) __asm__ ("variance_mmx");
+
+int sad_00_mmx ( uint8_t *blk1, uint8_t *blk2,  int rowstride, int h, int distlim) __asm__ ("sad_00_mmx");
+int sad_01_mmx(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_01_mmx");
+int sad_10_mmx(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_10_mmx");
+int sad_11_mmx(uint8_t *blk1, uint8_t *blk2, int rowstride, int h) __asm__ ("sad_11_mmx");
+int sad_sub22_mmx ( uint8_t *blk1, uint8_t *blk2,  int frowstride, int fh) __asm__ ("sad_sub22_mmx");
+int sad_sub44_mmx (uint8_t *blk1, uint8_t *blk2,  int qrowstride, int qh) __asm__ ("sad_sub44_mmx");
+
+#endif
+
+
+/*
+ * Function pointers for selecting CPU specific implementations
+ *
+ */
+
+void (*pfind_best_one_pel)( me_result_set *sub22set,
+							uint8_t *org, uint8_t *blk,
+							int i0, int j0,
+							int ihigh, int jhigh,
+							int rowstride, int h, 
+							me_result_s *res
+	);
+int (*pbuild_sub22_mests)( me_result_set *sub44set,
+							me_result_set *sub22set,
+							int i0,  int j0, int ihigh, int jhigh, 
+							int null_mc_sad,
+							uint8_t *s22org,  uint8_t *s22blk, 
+							int frowstride, int fh,
+							int reduction
+							);
+
+int (*pbuild_sub44_mests)( me_result_set *sub44set,
+							int ilow, int jlow, int ihigh, int jhigh, 
+							int i0, int j0,
+							int null_mc_sad,
+							uint8_t *s44org, uint8_t *s44blk, 
+							int qrowstride, int qh,
+							int reduction );
+
+int (*psumsq_sub22)( uint8_t *blk1, uint8_t *blk2,
+						 int rowstride, int h);
+int (*pbsumsq_sub22)( uint8_t *blk1f, uint8_t *blk1b, 
+						  uint8_t *blk2,
+						  int rowstride, int h);
+
+int (*pvariance)(uint8_t *mb, int size, int rowstride);
+
+
+int (*psad_sub22) ( uint8_t *blk1, uint8_t *blk2,  int frowstride, int fh);
+int (*psad_sub44) ( uint8_t *blk1, uint8_t *blk2,  int qrowstride, int qh);
+int (*psad_00) ( uint8_t *blk1, uint8_t *blk2,  int rowstride, int h, int distlim);
+int (*psad_01) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h);
+int (*psad_10) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h);
+int (*psad_11) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h);
+
+int (*psumsq) (uint8_t *blk1, uint8_t *blk2,
+					  int rowstride, int hx, int hy, int h);
+  
+  
+int (*pbsumsq) (uint8_t *pf, uint8_t *pb,
+				uint8_t *p2, int rowstride, int hxf, int hyf, int hxb, int hyb, int h);
+
+int (*pbsad) (uint8_t *pf, uint8_t *pb,
+					   uint8_t *p2, int rowstride, int hxf, int hyf, int hxb, int hyb, int h);
+
+
+
+
+
+/*
+ * Round search radius to suit the search algorithm.
+ * Currently radii must be multiples of 8.
+ *
+ */
+
+int round_search_radius( int radius )
+{
+	return intmax(8,((radius+4) /8)*8);
+}
+
+
+
+/*
+	Take a vector of motion estimations and repeatedly make passes
+	discarding all elements whose sad "weight" is above the current mean weight.
+*/
+
+static void sub_mean_reduction( me_result_set *matchset, 
+								int times,
+							    int *minweight_res)
+{
+	me_result_s *matches = matchset->mests;
+	int len = matchset->len;
+	int i,j;
+	int weight_sum;
+	int mean_weight;
+	int min_weight = 100000;
+	if( len == 0 )
+	{
+		*minweight_res = 100000;
+		matchset->len = 0;
+		return;
+	}
+
+	for(;;)
+	{
+		weight_sum = 0;
+		for( i = 0; i < len ; ++i )
+			weight_sum += matches[i].weight;
+		mean_weight = weight_sum / len;
+		
+		if( times <= 0)
+			break;
+			
+		j = 0;
+		for( i =0; i < len; ++i )
+		{
+			if( matches[i].weight <= mean_weight )
+			{
+				if( times == 1)
+				{
+					min_weight = matches[i].weight ;
+				}
+				matches[j] = matches[i];
+				++j;
+			}
+		}
+		len = j;
+		--times;
+	}
+	matchset->len = len;
+	*minweight_res = mean_weight;
+}
+
+/* 
+ * Build a vector of the top 4*4 sub-sampled motion estimations in
+ * the box (ilow,jlow) to (ihigh,jhigh).
+ *
+ *	The algorithm is as follows: 
+ * 
+ *  1. Matches on an 4*4 pel grid are collected. All those matches
+ * whose that is over a (conservative) threshold (basically 50% more
+ * than moving average of the mean sad of such matches) are discarded.
+ *	
+ *	2. Multiple passes are made discarding worse than-average matches.
+ *	The number of passes is specified by the user.  The default it 2
+ *	(leaving roughly 1/4 of the matches).
+ *
+ * The intial threshold and discard passes are controlled by reduction
+ * [1..4].  The intial SAD threshold is calculated as 6 / reduction of
+ * a reference SAD passed as a parameter.  For reduction == 1 one
+ * discard pass is made otherwise two are made.
+ *	
+ *	The net result is very fast and find good matches if they're to be
+ *	found.  I.e. the penalty over exhaustive search is pretty low.
+ *	
+ *	NOTE: The "discard below average" trick depends critically on
+ *	having some variation in the matches.  The slight penalty imposed
+ *	for distant matches (reasonable since the motion vectors have to
+ *	be encoded) is *vital* as otherwise pathologically bad performance
+ *	results on highly uniform images.
+ *	
+ *	TODO: We should probably allow the user to eliminate the initial
+ *	thinning of 4*4 grid matches if ultimate quality is demanded
+ *	(e.g. for low bit-rate applications).
+ * 
+ */
+
+static int build_sub44_mests( me_result_set *sub44set,
+							   int ilow, int jlow, int ihigh, int jhigh, 
+							   int i0, int j0,
+							   int null_ctl_sad,
+							   uint8_t *s44org, uint8_t *s44blk, 
+							   int qrowstride, int qh,
+							   int reduction
+							   )
+{
+	uint8_t *s44orgblk;
+	me_result_s *sub44_mests = sub44set->mests;
+	int istrt = ilow-i0;
+	int jstrt = jlow-j0;
+	int iend = ihigh-i0;
+	int jend = jhigh-j0;
+	int mean_weight;
+	int threshold;
+
+	int i,j;
+	int s1;
+	uint8_t *old_s44orgblk;
+	int sub44_num_mests;
+
+	/* N.b. we may ignore the right hand block of the pair going over the
+	   right edge as we have carefully allocated the buffer oversize to ensure
+	   no memory faults.  The later motion estimation calculations
+	   performed on the results of this pass will filter out
+	   out-of-range blocks...
+	*/
+	
+	threshold = 6*null_ctl_sad / (4*4*reduction);
+	s44orgblk = s44org+(ilow>>2)+qrowstride*(jlow>>2);
+	
+	/* Exhaustive search on 4*4 sub-sampled data.  This is affordable because
+		(a)	it is only 16th of the size of the real 1-pel data 
+		(b) we ignore those matches with an sad above our threshold.	
+	*/
+
+	sub44_num_mests = 0;
+
+	/* Invariant:  s44orgblk = s44org+(i>>2)+qrowstride*(j>>2) */
+	s44orgblk = s44org+(ilow>>2)+qrowstride*(jlow>>2);
+	for( j = jstrt; j <= jend; j += 4 )
+	{
+		old_s44orgblk = s44orgblk;
+		for( i = istrt; i <= iend; i += 4 )
+		{
+			s1 = ((*psad_sub44)( s44orgblk,s44blk,qrowstride,qh) & 0xffff);
+			if( s1 < threshold )
+			{
+				threshold = intmin(s1<<2,threshold);
+				sub44_mests[sub44_num_mests].x = i;
+				sub44_mests[sub44_num_mests].y = j;
+				sub44_mests[sub44_num_mests].weight = s1 + 
+					(intmax(intabs(i-i0),intabs(j-j0))<<1);
+				++sub44_num_mests;
+			}
+			s44orgblk += 1;
+		}
+		s44orgblk = old_s44orgblk + qrowstride;
+	}
+	sub44set->len = sub44_num_mests;
+			
+	sub_mean_reduction( sub44set, 1+(reduction>1),  &mean_weight);
+
+
+	return sub44set->len;
+}
+
+#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM)
+
+static int build_sub44_mests_mmx( me_result_set *sub44set,
+								   int ilow, int jlow, int ihigh, int jhigh, 
+								   int i0, int j0,
+								   int null_ctl_sad,
+								   uint8_t *s44org, uint8_t *s44blk, 
+								   int qrowstride, int qh,
+								   int reduction)
+{
+	uint8_t *s44orgblk;
+	me_result_s *sub44_mests = sub44set->mests;
+	int istrt = ilow-i0;
+	int jstrt = jlow-j0;
+	int iend = ihigh-i0;
+	int jend = jhigh-j0;
+	int mean_weight;
+	int threshold;
+
+	
+	threshold = 6*null_ctl_sad / (4*4*reduction);
+	s44orgblk = s44org+(ilow>>2)+qrowstride*(jlow>>2);
+	
+	sub44set->len = (*pmblocks_sub44_mests)( s44orgblk, s44blk,
+											  istrt, jstrt,
+											  iend, jend, 
+											  qh, qrowstride, 
+											  threshold,
+											  sub44_mests);
+	
+   /* If we're really pushing quality we reduce once otherwise twice. */
+			
+	sub_mean_reduction( sub44set, 1+(reduction>1),  &mean_weight);
+
+
+	return sub44set->len;
+}
+#endif
+
+
+
+/*  Build a vector of the best 2*2 sub-sampled motion estimations for
+ *  the 16*16 macroblock at i0,j0 using a set of 4*4 sub-sampled matches as
+ *  starting points.  As with with the 4*4 matches We don't collect
+ *  them densely as they're just search starting points for 1-pel
+ *  search and ones that are 1 out * should still give better than
+ *  average matches...
+ *
+ *	 The resulting candidate motion vectors are thinned by thresholding
+ *   and discarding worse than-average matches.
+ *
+ * The intial threshold and number of discard passes are controlled by reduction
+ * [1..4]:  the intial SAD threshold is calculated as 6 / reduction of
+ * a reference SAD passed as a parameter and then reduction discard passes
+ * are made.
+ *
+ * A super-fast version using MMX assembly code for X86 follows.
+ * Other CPU's could/should be handled the same way.  */
+
+
+static int build_sub22_mests( me_result_set *sub44set,
+							   me_result_set *sub22set,
+							   int i0,  int j0, int ihigh, int jhigh, 
+							   int null_ctl_sad,
+							   uint8_t *s22org,  uint8_t *s22blk, 
+							   int frowstride, int fh,
+							   int reduction)
+{
+	int i,k,s;
+	int threshold = 6*null_ctl_sad / (2 * 2*reduction);
+	
+	int min_weight;
+	int ilim = ihigh-i0;
+	int jlim = jhigh-j0;
+	int x,y;
+	uint8_t *s22orgblk;
+	
+	sub22set->len = 0;
+	for( k = 0; k < sub44set->len; ++k )
+	{
+
+		x = sub44set->mests[k].x;
+		y = sub44set->mests[k].y;
+
+		s22orgblk =  s22org +((y+j0)>>1)*frowstride +((x+i0)>>1);
+		for( i = 0; i < 4; ++i )
+		{
+			if( x <= ilim && y <= jlim )
+			{	
+				s = (*psad_sub22)( s22orgblk,s22blk,frowstride,fh)+
+					(intmax(intabs(x),intabs(y))<<3);
+				if( s < threshold )
+				{
+					me_result_s *mc = &sub22set->mests[sub22set->len];
+					mc->x = (int8_t)x;
+					mc->y = (int8_t)y;
+					mc->weight = s;
+					++(sub22set->len);
+				}
+			}
+
+			if( i == 1 )
+			{
+				s22orgblk += frowstride-1;
+				x -= 2;
+				y += 2;
+			}
+			else
+			{
+				s22orgblk += 1;
+				x += 2;
+				
+			}
+		}
+
+	}
+
+	sub_mean_reduction( sub22set,  reduction, &min_weight );
+	return sub22set->len;
+}
+
+#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM) 
+static int build_sub22_mests_mmxe( me_result_set *sub44set,
+							 me_result_set *sub22set,
+							 int i0,  int j0, int ihigh, int jhigh, 
+							 int null_ctl_sad,
+							 uint8_t *s22org,  uint8_t *s22blk, 
+							 int frowstride, int fh,
+							 int reduction)
+{
+	int i,k,s;
+	int threshold = 6*null_ctl_sad / (2 * 2*reduction);
+
+	int min_weight;
+	int ilim = ihigh-i0;
+	int jlim = jhigh-j0;
+	int x,y;
+	uint8_t *s22orgblk;
+	int resvec[4];
+
+	/* TODO: The calculation of the lstrow offset really belongs in
+       asm code... */
+	int lstrow=(fh-1)*frowstride;
+
+	sub22set->len = 0;
+	for( k = 0; k < sub44set->len; ++k )
+	{
+
+		x = sub44set->mests[k].x;
+		y = sub44set->mests[k].y;
+
+		s22orgblk =  s22org +((y+j0)>>1)*frowstride +((x+i0)>>1);
+		/*
+		  Get SAD for 2*2 subsampled macroblocks: orgblk,orgblk(+2,0),
+		  orgblk(0,+2), and orgblk(+2,+2) Done all in one go to reduce
+		  memory bandwidth demand
+		*/
+		mblock_sub22_nearest4_sads_mmxe(s22orgblk+lstrow, s22blk+lstrow, frowstride, fh, resvec);
+		for( i = 0; i < 4; ++i )
+		{
+			if( x <= ilim && y <= jlim )
+			{	
+				s =resvec[i]+(intmax(intabs(x),intabs(y))<<3);
+				if( s < threshold )
+				{
+					me_result_s *mc = &sub22set->mests[sub22set->len];
+					mc->x = (int8_t)x;
+					mc->y = (int8_t)y;
+					mc->weight = s;
+					++(sub22set->len);
+				}
+			}
+
+			if( i == 1 )
+			{
+				x -= 2;
+				y += 2;
+			}
+			else
+			{
+				x += 2;
+			}
+		}
+
+	}
+
+	
+	sub_mean_reduction( sub22set, reduction, &min_weight );
+	return sub22set->len;
+}
+
+#endif
+
+/*
+ * Search for the best 1-pel match within 1-pel of a good 2*2-pel
+ * match.  
+ * 
+ * N.b. best_so_far must be initialised by the caller!
+ */
+
+
+static void find_best_one_pel( me_result_set *sub22set,
+							   uint8_t *org, uint8_t *blk,
+							   int i0, int j0,
+							   int ihigh, int jhigh,
+							   int rowstride, int h, 
+							   me_result_s *best_so_far
+	)
+
+{
+	int i,k;
+	int d;
+	me_result_s minpos = *best_so_far;
+	int dmin = INT_MAX;
+	int ilim = ihigh-i0;
+	int jlim = jhigh-j0;
+	uint8_t *orgblk;
+	int penalty;
+	me_result_s matchrec;
+ 
+	for( k = 0; k < sub22set->len; ++k )
+	{	
+
+		matchrec = sub22set->mests[k];
+		orgblk = org + (i0+matchrec.x)+rowstride*(j0+matchrec.y);
+		penalty = intmax(intabs(matchrec.x),intabs(matchrec.y))<<5;
+
+		for( i = 0; i < 4; ++i )
+		{
+			if( matchrec.x <= ilim && matchrec.y <= jlim )
+			{
+		
+				d = penalty+(*psad_00)(orgblk,blk,rowstride,h, dmin);
+				if (d<dmin)
+				{
+					dmin = d;
+					minpos = matchrec;
+				}
+			}
+			if( i == 1 )
+			{
+				orgblk += rowstride-1;
+				matchrec.x -= 1;
+				matchrec.y += 1;
+			}
+			else
+			{
+				orgblk += 1;
+				matchrec.x += 1;
+			}
+		}
+	}
+
+	minpos.weight = (uint16_t)intmin(255*255, dmin);
+	*best_so_far = minpos;
+}
+
+#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM) 
+static void find_best_one_pel_mmxe( me_result_set *sub22set,
+							 uint8_t *org, uint8_t *blk,
+							 int i0, int j0,
+							 int ihigh, int jhigh,
+							 int rowstride, int h,
+							 me_result_s *best_so_far
+	)
+
+{
+	int i,k;
+	int d;
+	me_result_s minpos = *best_so_far;
+	int ilim = ihigh-i0;
+	int jlim = jhigh-j0;
+	int dmin = INT_MAX;
+	uint8_t *orgblk;
+	int penalty;
+	me_result_s matchrec;
+	int resvec[4];
+
+	for( k = 0; k < sub22set->len; ++k )
+	{	
+		matchrec = sub22set->mests[k];
+		orgblk = org + (i0+matchrec.x)+rowstride*(j0+matchrec.y);
+		penalty = intmax(abs(matchrec.x),abs(matchrec.y))<<5;
+		
+		/* Get SAD for macroblocks: 	orgblk,orgblk(+1,0),
+		   orgblk(0,+1), and orgblk(+1,+1)
+		   Done all in one go to reduce memory bandwidth demand
+		*/
+		mblock_nearest4_sads_mmxe(orgblk,blk,rowstride,h,
+		resvec);
+
+		for( i = 0; i < 4; ++i )
+		{
+			if( matchrec.x <= ilim && matchrec.y <= jlim )
+			{
+		
+				d = penalty+resvec[i];
+				if (d<dmin)
+				{
+					dmin = d;
+					minpos = matchrec;
+				}
+			}
+			if( i == 1 )
+			{
+				orgblk += rowstride-1;
+				matchrec.x -= 1;
+				matchrec.y += 1;
+			}
+			else
+			{
+				orgblk += 1;
+				matchrec.x += 1;
+			}
+		}
+	}
+
+	minpos.weight = (uint16_t)intmin(255*255, dmin);
+	*best_so_far = minpos;
+
+}
+#endif 
+ 
+
+/* 
+ * sum absolute difference between two (16*h) blocks Four variations
+ * depending on the required half pel interpolation of blk1 (hx,hy)
+ *
+ * blk1,blk2: addresses of top left pels of both blocks
+ * rowstride:        distance (in bytes) of vertically adjacent pels
+ * hx,hy:     flags for horizontal and/or vertical interpolation
+ * h:         height of block (usually 8 or 16)
+ * distlim: bail out if sum exceeds this value 
+ *
+ **/
+
+
+static int sad_00(uint8_t *blk1,uint8_t *blk2,
+					int rowstride, int h,int distlim)
+{
+	uint8_t *p1,*p2;
+	int j;
+	int s;
+	register int v;
+
+	s = 0;
+	p1 = blk1;
+	p2 = blk2;
+	for (j=0; j<h; j++)
+	{
+
+#define pipestep(o) v = p1[o]-p2[o]; s+= abs(v);
+		pipestep(0);  pipestep(1);  pipestep(2);  pipestep(3);
+		pipestep(4);  pipestep(5);  pipestep(6);  pipestep(7);
+		pipestep(8);  pipestep(9);  pipestep(10); pipestep(11);
+		pipestep(12); pipestep(13); pipestep(14); pipestep(15);
+#undef pipestep
+
+		if (s >= distlim)
+			break;
+			
+		p1+= rowstride;
+		p2+= rowstride;
+	}
+	return s;
+}
+
+static int sad_01(uint8_t *blk1,uint8_t *blk2,int rowstride, int h)
+{
+	uint8_t *p1,*p2;
+	int i,j;
+	int s;
+	register int v;
+
+	s = 0;
+	p1 = blk1;
+	p2 = blk2;
+	for (j=0; j<h; j++)
+	{
+		for (i=0; i<16; i++)
+		{
+
+			v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
+			s+=intabs(v);
+		}
+		p1+= rowstride;
+		p2+= rowstride;
+	}
+	return s;
+}
+
+static int sad_10(uint8_t *blk1,uint8_t *blk2, int rowstride, int h)
+{
+	uint8_t *p1,*p1a,*p2;
+	int i,j;
+	int s;
+	register int v;
+
+	s = 0;
+	p1 = blk1;
+	p2 = blk2;
+	p1a = p1 + rowstride;
+	for (j=0; j<h; j++)
+	{
+		for (i=0; i<16; i++)
+		{
+			v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
+			s+= intabs(v);
+		}
+		p1 = p1a;
+		p1a+= rowstride;
+		p2+= rowstride;
+	}
+
+	return s;
+}
+
+static int sad_11(uint8_t *blk1,uint8_t *blk2, int rowstride, int h)
+{
+	uint8_t *p1,*p1a,*p2;
+	int i,j;
+	int s;
+	register int v;
+
+	s = 0;
+	p1 = blk1;
+	p2 = blk2;
+	p1a = p1 + rowstride;
+	  
+	for (j=0; j<h; j++)
+	{
+		for (i=0; i<16; i++)
+		{
+			v = ((unsigned int)((p1[i]+p1[i+1])+(p1a[i]+p1a[i+1])+2)>>2) - p2[i];
+			s+=intabs(v);
+		}
+		p1 = p1a;
+		p1a+= rowstride;
+		p2+= rowstride;
+	}
+	return s;
+}
+
+
+/* 
+ *  Compute subsampled images for fast motion compensation search
+ *  N.b. rowstride should be *two* line widths for interlace images
+ */
+
+void subsample_image( uint8_t *image, int rowstride, 
+					  uint8_t *sub22_image, 
+					  uint8_t *sub44_image)
+{
+	uint8_t *blk = image;
+	uint8_t *b, *nb;
+	uint8_t *pb;
+	uint8_t *qb;
+	uint8_t *start_s22blk, *start_s44blk;
+	int i;
+	int nextfieldline = rowstride;
+
+	start_s22blk   = sub22_image;
+	start_s44blk   = sub44_image;
+	b = blk;
+	nb = (blk+nextfieldline);
+
+	pb = (uint8_t *) start_s22blk;
+
+	while( nb < start_s22blk )
+	{
+		for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */
+		{
+			pb[0] = ((b[0]+b[1])+(nb[0]+nb[1])+2)>>2;
+			pb[1] = ((b[2]+b[3])+(nb[2]+nb[3])+2)>>2;	
+			pb += 2;
+			b += 4;
+			nb += 4;
+		}
+		b += nextfieldline;
+		nb = b + nextfieldline;
+	}
+
+
+	/* Now create the 4*4 sub-sampled data from the 2*2 
+	   N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the
+	   original.  Albeit half as many lines and pixels...
+	*/
+
+	nextfieldline = nextfieldline >> 1;
+
+	qb = start_s44blk;
+	b  = start_s22blk;
+	nb = (start_s22blk+nextfieldline);
+
+	while( nb < start_s44blk )
+	{
+		for( i = 0; i < nextfieldline/4; ++i )
+		{
+			qb[0] = ((b[0]+b[1])+(nb[0]+nb[1])+2)>>2;
+			qb[1] = ((b[2]+b[3])+(nb[2]+nb[3])+2)>>2;
+			qb += 2;
+			b += 4;
+			nb += 4;
+		}
+		b += nextfieldline;
+		nb = b + nextfieldline;
+	}
+
+}
+
+/*
+ * Same as sad_00 except for 2*2 subsampled data so only 8 wide!
+ *
+ */
+ 
+
+static int sad_sub22( uint8_t *s22blk1, uint8_t *s22blk2,int frowstride,int fh)
+{
+	uint8_t *p1 = s22blk1;
+	uint8_t *p2 = s22blk2;
+	int s = 0;
+	int j;
+
+	for( j = 0; j < fh; ++j )
+	{
+		register int diff;
+#define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff)
+		pipestep(0); pipestep(1);
+		pipestep(2); pipestep(3);
+		pipestep(4); pipestep(5);
+		pipestep(6); pipestep(7);
+		p1 += frowstride;
+		p2 += frowstride;
+#undef pipestep
+	}
+
+	return s;
+}
+
+
+
+/*
+ * Same as sad_00 except for 4*4 sub-sampled data.  
+ *
+ * N.b.: currently assumes only 16*16 or 16*8 motion estimation will
+ * be used...  I.e. 4*4 or 4*2 sub-sampled blocks will be compared.  
+ *
+ *
+ */
+
+
+static int sad_sub44( uint8_t *s44blk1, uint8_t *s44blk2,int qrowstride,int qh)
+{
+	register uint8_t *p1 = s44blk1;
+	register uint8_t *p2 = s44blk2;
+	int s = 0;
+	register int diff;
+
+	/* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */
+#define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff;
+	pipestep(0); pipestep(1);	 pipestep(2); pipestep(3);
+	if( qh > 1 )
+	{
+		p1 += qrowstride; p2 += qrowstride;
+		pipestep(0); pipestep(1);	 pipestep(2); pipestep(3);
+		if( qh > 2 )
+		{
+			p1 += qrowstride; p2 += qrowstride;
+			pipestep(0); pipestep(1);	 pipestep(2); pipestep(3);
+			p1 += qrowstride; p2 += qrowstride;
+			pipestep(0); pipestep(1);	 pipestep(2); pipestep(3);
+		}
+	}
+
+
+	return s;
+}
+
+/*
+ * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels
+ * blk1,blk2: addresses of top left pels of both blocks
+ * rowstride:        distance (in bytes) of vertically adjacent pels
+ * h:         height of block (usually 8 or 16)
+ */
+ 
+static int sumsq_sub22(uint8_t *blk1, uint8_t *blk2, int rowstride, int h)
+{
+	uint8_t *p1 = blk1, *p2 = blk2;
+	int i,j,v;
+	int s = 0;
+	for (j=0; j<h; j++)
+	{
+		for (i=0; i<8; i++)
+		{
+			v = p1[i] - p2[i];
+			s+= v*v;
+		}
+		p1+= rowstride;
+		p2+= rowstride;
+	}
+	return s;
+}
+
+/* total squared difference between bidirection prediction of (8*h)
+ * blocks of 2*2 sub-sampled pels and reference 
+ * blk1f, blk1b,blk2: addresses of top left
+ * pels of blocks 
+ * rowstride: distance (in bytes) of vertically adjacent
+ * pels 
+ * h: height of block (usually 4 or 8)
+ */
+ 
+static int bsumsq_sub22(uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2, 
+					 int rowstride, int h)
+{
+	uint8_t *p1f = blk1f,*p1b = blk1b,*p2 = blk2;
+	int i,j,v;
+	int s = 0;
+	for (j=0; j<h; j++)
+	{
+		for (i=0; i<8; i++)
+		{
+			v = ((p1f[i]+p1b[i]+1)>>1) - p2[i];
+			s+= v*v;
+		}
+		p1f+= rowstride;
+		p1b+= rowstride;
+		p2+= rowstride;
+	}
+	return s;
+}
+
+/*
+ * total squared difference between two (16*h) blocks
+ * including optional half pel interpolation of blk1 (hx,hy)
+ * blk1,blk2: addresses of top left pels of both blocks
+ * rowstride:        distance (in bytes) of vertically adjacent pels
+ * hx,hy:     flags for horizontal and/or vertical interpolation
+ * h:         height of block (usually 8 or 16)
+ */
+ 
+
+static int sumsq(blk1,blk2,rowstride,hx,hy,h)
+	uint8_t *blk1,*blk2;
+	int rowstride,hx,hy,h;
+{
+	uint8_t *p1,*p1a,*p2;
+	int i,j;
+	int s,v;
+
+	s = 0;
+	p1 = blk1;
+	p2 = blk2;
+	if (!hx && !hy)
+		for (j=0; j<h; j++)
+		{
+			for (i=0; i<16; i++)
+			{
+				v = p1[i] - p2[i];
+				s+= v*v;
+			}
+			p1+= rowstride;
+			p2+= rowstride;
+		}
+	else if (hx && !hy)
+		for (j=0; j<h; j++)
+		{
+			for (i=0; i<16; i++)
+			{
+				v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
+				s+= v*v;
+			}
+			p1+= rowstride;
+			p2+= rowstride;
+		}
+	else if (!hx && hy)
+	{
+		p1a = p1 + rowstride;
+		for (j=0; j<h; j++)
+		{
+			for (i=0; i<16; i++)
+			{
+				v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
+				s+= v*v;
+			}
+			p1 = p1a;
+			p1a+= rowstride;
+			p2+= rowstride;
+		}
+	}
+	else /* if (hx && hy) */
+	{
+		p1a = p1 + rowstride;
+		for (j=0; j<h; j++)
+		{
+			for (i=0; i<16; i++)
+			{
+				v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
+				s+= v*v;
+			}
+			p1 = p1a;
+			p1a+= rowstride;
+			p2+= rowstride;
+		}
+	}
+ 
+	return s;
+}
+
+
+/*
+ * absolute difference error between a (16*h) block and a bidirectional
+ * prediction
+ *
+ * p2: address of top left pel of block
+ * pf,hxf,hyf: address and half pel flags of forward ref. block
+ * pb,hxb,hyb: address and half pel flags of backward ref. block
+ * h: height of block
+ * rowstride: distance (in bytes) of vertically adjacent pels in p2,pf,pb
+ */
+ 
+
+static int bsad(pf,pb,p2,rowstride,hxf,hyf,hxb,hyb,h)
+	uint8_t *pf,*pb,*p2;
+	int rowstride,hxf,hyf,hxb,hyb,h;
+{
+	uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
+	int i,j;
+	int s,v;
+
+	pfa = pf + hxf;
+	pfb = pf + rowstride*hyf;
+	pfc = pfb + hxf;
+
+	pba = pb + hxb;
+	pbb = pb + rowstride*hyb;
+	pbc = pbb + hxb;
+
+	s = 0;
+
+	for (j=0; j<h; j++)
+	{
+		for (i=0; i<16; i++)
+		{
+			v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
+				  ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
+				- *p2++;
+			s += abs(v);
+		}
+		p2+= rowstride-16;
+		pf+= rowstride-16;
+		pfa+= rowstride-16;
+		pfb+= rowstride-16;
+		pfc+= rowstride-16;
+		pb+= rowstride-16;
+		pba+= rowstride-16;
+		pbb+= rowstride-16;
+		pbc+= rowstride-16;
+	}
+
+	return s;
+}
+
+/*
+ * squared error between a (16*h) block and a bidirectional
+ * prediction
+ *
+ * p2: address of top left pel of block
+ * pf,hxf,hyf: address and half pel flags of forward ref. block
+ * pb,hxb,hyb: address and half pel flags of backward ref. block
+ * h: height of block
+ * rowstride: distance (in bytes) of vertically adjacent pels in p2,pf,pb
+ */
+ 
+
+static int bsumsq(pf,pb,p2,rowstride,hxf,hyf,hxb,hyb,h)
+	uint8_t *pf,*pb,*p2;
+	int rowstride,hxf,hyf,hxb,hyb,h;
+{
+	uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
+	int i,j;
+	int s,v;
+	
+	pfa = pf + hxf;
+	pfb = pf + rowstride*hyf;
+	pfc = pfb + hxf;
+
+	pba = pb + hxb;
+	pbb = pb + rowstride*hyb;
+	pbc = pbb + hxb;
+
+	s = 0;
+
+	for (j=0; j<h; j++)
+	{
+		for (i=0; i<16; i++)
+		{
+#define ui(x) ((unsigned int)x)
+			v = ((((ui(*pf++) + ui(*pfa++) + ui(*pfb++) + ui(*pfc++) + 2)>>2) +
+				  ((ui(*pb++) + ui(*pba++) + ui(*pbb++) + ui(*pbc++) + 2)>>2)
+				  + 1
+				)>>1) - ui(*p2++);
+#undef ui
+			s+=v*v;
+		}
+		p2+= rowstride-16;
+		pf+= rowstride-16;
+		pfa+= rowstride-16;
+		pfb+= rowstride-16;
+		pfc+= rowstride-16;
+		pb+= rowstride-16;
+		pba+= rowstride-16;
+		pbb+= rowstride-16;
+		pbc+= rowstride-16;
+	}
+
+	return s;
+}
+
+
+/*
+ * variance of a (size*size) block, multiplied by 256
+ * p:  address of top left pel of block
+ * rowstride: distance (in bytes) of vertically adjacent pels
+ * SIZE is a multiple of 8.
+ */
+static int variance(uint8_t *p, int size,	int rowstride)
+{
+	int i,j;
+	unsigned int v,s,s2;
+	int var;
+	s = s2 = 0;
+
+	for (j=0; j<size; j++)
+	{
+		for (i=0; i<size; i++)
+		{
+			v = *p++;
+			s+= v;
+			s2+= v*v;
+		}
+		p+= rowstride-size;
+	}
+	var = s2 - (s*s)/(size*size);
+
+	return var;
+}
+
+
+
+
+/*
+ *  Initialise motion estimation - currently only selection of which
+ * versions of the various low level computation routines to use
+ * 
+ */
+
+void init_motion_search(void)
+{
+	int cpucap = cpu_accel();
+
+	if( cpucap == 0 )	/* No MMX/SSE etc support available */
+	{
+		psad_sub22 = sad_sub22;
+		psad_sub44 = sad_sub44;
+		psad_00 = sad_00;
+		psad_01 = sad_01;
+		psad_10 = sad_10;
+		psad_11 = sad_11;
+		pbsad = bsad;
+		pvariance = variance;
+		psumsq = sumsq;
+		pbsumsq = bsumsq;
+		psumsq_sub22 = sumsq_sub22;
+		pbsumsq_sub22 = bsumsq_sub22;
+		pfind_best_one_pel = find_best_one_pel;
+		pbuild_sub22_mests	= build_sub22_mests;
+		pbuild_sub44_mests	= build_sub44_mests;
+	 }
+#if defined(HAVE_ASM_MMX) && defined(HAVE_ASM_NASM)
+	else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */
+	{
+		mjpeg_info( "SETTING EXTENDED MMX for MOTION!");
+		psad_sub22 = sad_sub22_mmxe;
+		psad_sub44 = sad_sub44_mmxe;
+		psad_00 = sad_00_mmxe;
+		psad_01 = sad_01_mmxe;
+		psad_10 = sad_10_mmxe;
+		psad_11 = sad_11_mmxe;
+		pbsad = bsad_mmx;
+		pvariance = variance_mmx;
+		psumsq = sumsq_mmx;
+		pbsumsq = bsumsq_mmx;
+		psumsq_sub22 = sumsq_sub22_mmx;
+		pbsumsq_sub22 = bsumsq_sub22_mmx;
+		pfind_best_one_pel = find_best_one_pel_mmxe;
+		pbuild_sub22_mests	= build_sub22_mests_mmxe;
+		pbuild_sub44_mests	= build_sub44_mests_mmx;
+		pmblocks_sub44_mests = mblocks_sub44_mests_mmxe;
+	}
+	else if(cpucap & ACCEL_X86_MMX) /* Ordinary MMX CPU */
+	{
+		mjpeg_info( "SETTING MMX for MOTION!");
+		psad_sub22 = sad_sub22_mmx;
+		psad_sub44 = sad_sub44_mmx;
+		psad_00 = sad_00_mmx;
+		psad_01 = sad_01_mmx;
+		psad_10 = sad_10_mmx;
+		psad_11 = sad_11_mmx;
+		pbsad = bsad_mmx;
+		pvariance = variance_mmx;
+		psumsq = sumsq_mmx;
+		pbsumsq = bsumsq_mmx;
+		psumsq_sub22 = sumsq_sub22_mmx;
+		pbsumsq_sub22 = bsumsq_sub22_mmx;
+		pfind_best_one_pel = find_best_one_pel;
+		pbuild_sub22_mests	= build_sub22_mests;
+		pbuild_sub44_mests	= build_sub44_mests_mmx;
+		pmblocks_sub44_mests = mblocks_sub44_mests_mmx;
+	}
+#endif
+
+}
diff --git a/veejay-current/utils/motionsearch.h b/veejay-current/utils/motionsearch.h
new file mode 100644
index 00000000..624f7ab5
--- /dev/null
+++ b/veejay-current/utils/motionsearch.h
@@ -0,0 +1,141 @@
+/* (C) 2000/2001 Andrew Stevens */
+
+/*  This software is free software; you can redistribute it
+ *  and/or modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2 of
+ *  the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ *
+ */
+
+#ifndef __MOTIONSEARCH_H__
+#define __MOTIONSEARCH_H__
+
+#include <config.h>
+#include "mjpeg_types.h"
+
+struct me_result
+{
+	uint16_t weight;
+	int8_t x;
+	int8_t y;
+};
+
+typedef struct me_result me_result_s;
+
+#define MAX_MATCHES (256*256/4)
+
+typedef struct _me_result_vec {
+	int len;
+	me_result_s mests[MAX_MATCHES];
+} me_result_set;
+
+/*
+ * Function pointers for selecting CPU specific implementations
+ *
+ * Top-level motion estimation entry points.
+ *
+ */
+
+/*
+ * Use given set of 2*2 sub-sampled motion estimates generated using
+ * pbuild_sub22_mests to generate a best 1-pel motion estimate.
+ *
+ */
+
+extern void (*pfind_best_one_pel)( me_result_set *sub22set,
+								   uint8_t *org, uint8_t *blk,
+								   int i0, int j0,
+								   int ihigh, int jhigh,
+								   int rowstride, int h, 
+								   me_result_s *res
+	);
+
+/*
+ * Use given set of 4*4 sub-sampled motion estimates generated using
+ * pbuild_sub44_mests to generate a set of best 2*2 sub-sampled motion
+ * estimates.
+ */
+
+extern int (*pbuild_sub22_mests)( me_result_set *sub44set,
+								  me_result_set *sub22set,
+								  int i0,  int j0, int ihigh, int jhigh, 
+								  int null_mc_sad,
+								  uint8_t *s22org,  uint8_t *s22blk, 
+								  int frowstride, int fh,
+								  int reduction
+	);
+
+/*
+ * Generate a set the best 4*4 sub-sampled motion estimates
+ * from the specifed rectangle of candidates in s44org with respect
+ * to the reference block s44blk
+ */
+
+extern int (*pbuild_sub44_mests)( me_result_set *sub44set,
+								  int ilow, int jlow, int ihigh, int jhigh, 
+								  int i0, int j0,
+								  int null_mc_sad,
+								  uint8_t *s44org, uint8_t *s44blk, 
+								  int qrowstride, int qh,
+								  int reduction );
+
+
+/*
+ * Lower level difference comparison routines
+ *
+ * Naming convention: for block difference metric functions
+ * sad - Sum Absolute difference
+ * sumsq - Sum of squares
+ * sub44 - 4*4 Subsampled macroblocks (i.e. 4x4 subsampled pels)
+ * sub22 2*2 submsapled macroblocks (i.e. 8x8 subsampled pels)
+ * <nothing> Ordinary macroblocks (i.e. 16x16 pels)
+ * "b" prefix:  Difference of bi-directionally interpolated block 
+ *              (i.e. mean of two blocks)
+ * _xy suffix:  Difference with half-pel sub-sampling offset.
+ *              (i.e. _00 is just ordinary no sub-sampling)
+ * mests - Motion estimate results relative motion vector with sad.
+ */
+
+extern int (*psumsq_sub22)( uint8_t *blk1, uint8_t *blk2,
+						 int rowstride, int h);
+extern int (*pbsumsq_sub22)( uint8_t *blk1f, uint8_t *blk1b, 
+						  uint8_t *blk2,
+						  int rowstride, int h);
+
+extern int (*pvariance)(uint8_t *mb, int size, int rowstride);
+
+
+extern int (*psad_sub22) ( uint8_t *blk1, uint8_t *blk2,  int frowstride, int fh);
+extern int (*psad_sub44) ( uint8_t *blk1, uint8_t *blk2,  int qrowstride, int qh);
+extern int (*psad_00) ( uint8_t *blk1, uint8_t *blk2,  int rowstride, int h, int distlim);
+extern int (*psad_01) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h);
+extern int (*psad_10) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h);
+extern int (*psad_11) (uint8_t *blk1, uint8_t *blk2, int rowstride, int h);
+
+extern int (*psumsq) (uint8_t *blk1, uint8_t *blk2,
+					  int rowstride, int hx, int hy, int h);
+  
+  
+extern int (*pbsumsq) (uint8_t *pf, uint8_t *pb,
+					   uint8_t *p2, int rowstride, int hxf, int hyf, int hxb, int hyb, int h);
+
+extern int (*pbsad) (uint8_t *pf, uint8_t *pb,
+					 uint8_t *p2, int rowstride, int hxf, int hyf, int hxb, int hyb, int h);
+
+void init_motion_search(void);
+int round_search_radius( int radius );
+void subsample_image( uint8_t *image, int rowstride, 
+					  uint8_t *sub22_image, 
+					  uint8_t *sub44_image);
+
+#endif  /* __MOTIONSEARCH_H__ */
diff --git a/veejay-current/utils/mpegconsts.c b/veejay-current/utils/mpegconsts.c
new file mode 100644
index 00000000..008ac503
--- /dev/null
+++ b/veejay-current/utils/mpegconsts.c
@@ -0,0 +1,450 @@
+
+/*
+ *  mpegconsts.c:  Video format constants for MPEG and utilities for display
+ *                 and conversion to format used for yuv4mpeg
+ *
+ *  Copyright (C) 2001 Andrew Stevens <andrew.stevens@philips.com>
+ *  Copyright (C) 2001 Matthew Marjanovic <maddog@mir.com>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of version 2 of the GNU General Public License
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+#include <config.h>
+#include "mpegconsts.h"
+#include "yuv4mpeg.h"
+#include "yuv4mpeg_intern.h"
+
+static y4m_ratio_t
+mpeg_framerates[] = {
+  Y4M_FPS_UNKNOWN,
+  Y4M_FPS_NTSC_FILM,
+  Y4M_FPS_FILM,
+  Y4M_FPS_PAL,
+  Y4M_FPS_NTSC,
+  Y4M_FPS_30,
+  Y4M_FPS_PAL_FIELD,
+  Y4M_FPS_NTSC_FIELD,
+  Y4M_FPS_60
+};
+
+
+#define MPEG_NUM_RATES (sizeof(mpeg_framerates)/sizeof(mpeg_framerates[0]))
+const mpeg_framerate_code_t mpeg_num_framerates = MPEG_NUM_RATES;
+
+static const char *
+framerate_definitions[MPEG_NUM_RATES] =
+{
+   "illegal", 
+  "24000.0/1001.0 (NTSC 3:2 pulldown converted FILM)",
+  "24.0 (NATIVE FILM)",
+  "25.0 (PAL/SECAM VIDEO / converted FILM)",
+  "30000.0/1001.0 (NTSC VIDEO)",
+  "30.0",
+  "50.0 (PAL FIELD RATE)",
+  "60000.0/1001.0 (NTSC FIELD RATE)",
+  "60.0"
+};
+
+
+static const char *mpeg1_aspect_ratio_definitions[] =
+{
+	"1:1 (square pixels)",
+	"1:0.6735",
+	"1:0.7031 (16:9 Anamorphic PAL/SECAM for 720x578/352x288 images)",
+	"1:0.7615",
+	"1:0.8055",
+	"1:0.8437 (16:9 Anamorphic NTSC for 720x480/352x240 images)",
+	"1:0.8935",
+	"1:0.9375 (4:3 PAL/SECAM for 720x578/352x288 images)",
+	"1:0.9815",
+	"1:1.0255",
+	"1:1:0695",
+	"1:1.1250 (4:3 NTSC for 720x480/352x240 images)",
+	"1:1.1575",
+	"1:1.2015"
+};
+
+static const y4m_ratio_t mpeg1_aspect_ratios[] =
+{
+	Y4M_SAR_MPEG1_1,
+	Y4M_SAR_MPEG1_2,
+	Y4M_SAR_MPEG1_3, /* Anamorphic 16:9 PAL */
+	Y4M_SAR_MPEG1_4,
+	Y4M_SAR_MPEG1_5,
+	Y4M_SAR_MPEG1_6, /* Anamorphic 16:9 NTSC */
+	Y4M_SAR_MPEG1_7,
+	Y4M_SAR_MPEG1_8, /* PAL/SECAM 4:3 */
+	Y4M_SAR_MPEG1_9,
+	Y4M_SAR_MPEG1_10,
+	Y4M_SAR_MPEG1_11,
+	Y4M_SAR_MPEG1_12, /* NTSC 4:3 */
+	Y4M_SAR_MPEG1_13,
+	Y4M_SAR_MPEG1_14,
+};
+
+static const char *mpeg2_aspect_ratio_definitions[] = 
+{
+	"1:1 pixels",
+	"4:3 display",
+	"16:9 display",
+	"2.21:1 display"
+};
+
+
+static const y4m_ratio_t mpeg2_aspect_ratios[] =
+{
+	Y4M_DAR_MPEG2_1,
+	Y4M_DAR_MPEG2_2,
+ 	Y4M_DAR_MPEG2_3,
+	Y4M_DAR_MPEG2_4
+};
+
+static const char **aspect_ratio_definitions[2] = 
+{
+	mpeg1_aspect_ratio_definitions,
+	mpeg2_aspect_ratio_definitions
+};
+
+static const y4m_ratio_t *mpeg_aspect_ratios[2] = 
+{
+	mpeg1_aspect_ratios,
+	mpeg2_aspect_ratios
+};
+
+const mpeg_aspect_code_t mpeg_num_aspect_ratios[2] = 
+{
+  sizeof(mpeg1_aspect_ratios)/sizeof(mpeg1_aspect_ratios[0]),
+  sizeof(mpeg2_aspect_ratios)/sizeof(mpeg2_aspect_ratios[0])
+};
+
+/*
+ * Convert MPEG frame-rate code to corresponding frame-rate
+ */
+
+y4m_ratio_t
+mpeg_framerate( mpeg_framerate_code_t code )
+{
+	if( code == 0 || code > mpeg_num_framerates )
+		return y4m_fps_UNKNOWN;
+	else
+		return mpeg_framerates[code];
+}
+
+/*
+ * Look-up MPEG frame rate code for a (exact) frame rate.
+ */
+
+
+mpeg_framerate_code_t 
+mpeg_framerate_code( y4m_ratio_t framerate )
+{
+	mpeg_framerate_code_t i;
+  
+	y4m_ratio_reduce(&framerate);
+	for (i = 1; i < mpeg_num_framerates; ++i) {
+		if (Y4M_RATIO_EQL(framerate, mpeg_framerates[i]))
+			return i;
+	}
+	return 0;
+}
+
+
+/* small enough to distinguish 1/1000 from 1/1001 */
+#define MPEG_FPS_TOLERANCE 0.0001
+
+
+y4m_ratio_t
+mpeg_conform_framerate( double fps )
+{
+	mpeg_framerate_code_t i;
+	y4m_ratio_t result;
+
+	/* try to match it to a standard frame rate */
+	for (i = 1; i < mpeg_num_framerates; i++) 
+	{
+		double deviation = 1.0 - (Y4M_RATIO_DBL(mpeg_framerates[i]) / fps);
+		if ( (deviation > -MPEG_FPS_TOLERANCE) &&
+			 (deviation < +MPEG_FPS_TOLERANCE) )
+			return mpeg_framerates[i];
+	}
+	/* no luck?  just turn it into a ratio (6 decimal place accuracy) */
+	result.n = (fps * 1000000.0) + 0.5;
+	result.d = 1000000;
+	y4m_ratio_reduce(&result);
+	return result;
+}
+
+  
+
+/*
+ * Convert MPEG aspect-ratio code to corresponding aspect-ratio
+ */
+
+y4m_ratio_t 
+mpeg_aspect_ratio( int mpeg_version,  mpeg_aspect_code_t code )
+{
+	y4m_ratio_t ratio;
+	if( mpeg_version < 1 || mpeg_version > 2 )
+		return y4m_sar_UNKNOWN;
+	if( code == 0 || code > mpeg_num_aspect_ratios[mpeg_version-1] )
+		return y4m_sar_UNKNOWN;
+	else
+	{
+		ratio = mpeg_aspect_ratios[mpeg_version-1][code-1];
+		y4m_ratio_reduce(&ratio);
+		return ratio;
+	}
+}
+
+/*
+ * Look-up corresponding MPEG aspect ratio code given an exact aspect ratio.
+ *
+ * WARNING: The semantics of aspect ratio coding *changed* between
+ * MPEG1 and MPEG2.  In MPEG1 it is the *pixel* aspect ratio. In
+ * MPEG2 it is the (far more sensible) aspect ratio of the eventual
+ * display.
+ *
+ */
+
+mpeg_aspect_code_t 
+mpeg_frame_aspect_code( int mpeg_version, y4m_ratio_t aspect_ratio )
+{
+	mpeg_aspect_code_t i;
+	y4m_ratio_t red_ratio = aspect_ratio;
+	y4m_ratio_reduce( &red_ratio );
+	if( mpeg_version < 1 || mpeg_version > 2 )
+		return 0;
+	for( i = 1; i < mpeg_num_aspect_ratios[mpeg_version-1]; ++i )
+	{
+		y4m_ratio_t red_entry =  mpeg_aspect_ratios[mpeg_version-1][i-1];
+		y4m_ratio_reduce( &red_entry );
+		if(  Y4M_RATIO_EQL( red_entry, red_ratio) )
+			return i;
+	}
+
+	return 0;
+			
+}
+
+
+
+/*
+ * Guess the correct MPEG aspect ratio code,
+ *  given the true sample aspect ratio and frame size of a video stream
+ *  (and the MPEG version, 1 or 2).
+ *
+ * Returns 0 if it has no good guess.
+ *
+ */
+
+
+/* this is big enough to accommodate the difference between 720 and 704 */
+#define GUESS_ASPECT_TOLERANCE 0.03
+
+mpeg_aspect_code_t 
+mpeg_guess_mpeg_aspect_code(int mpeg_version, y4m_ratio_t sampleaspect,
+							int frame_width, int frame_height)
+{
+	if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_UNKNOWN))
+    {
+		return 0;
+    }
+	switch (mpeg_version) {
+	case 1:
+		if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_SQUARE))
+		{
+			return 1;
+		} 
+		else if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_NTSC_CCIR601))
+		{
+			return 12;
+		} 
+		else if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_NTSC_16_9))
+		{
+			return 6;
+		} 
+		else if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_PAL_CCIR601))
+		{
+			return 8;
+		} 
+		else if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_PAL_16_9))
+		{
+			return 3;
+		} 
+		return 0;
+		break;
+	case 2:
+		if (Y4M_RATIO_EQL(sampleaspect, y4m_sar_SQUARE))
+		{
+			return 1;  /* '1' means square *pixels* in MPEG-2; go figure. */
+		}
+		else
+		{
+			int i;
+			double true_far;  /* true frame aspect ratio */
+			true_far = 
+				(double)(sampleaspect.n * frame_width) /
+				(double)(sampleaspect.d * frame_height);
+			/* start at '2'... */
+			for (i = 2; i < mpeg_num_aspect_ratios[mpeg_version-1]; i++) 
+			{
+				double ratio = 
+					true_far / Y4M_RATIO_DBL(mpeg_aspect_ratios[mpeg_version-1][i-1]);
+				if ( (ratio > (1.0 - GUESS_ASPECT_TOLERANCE)) &&
+					 (ratio < (1.0 + GUESS_ASPECT_TOLERANCE)) )
+					return i;
+			}
+			return 0;
+		}
+		break;
+	default:
+		return 0;
+		break;
+	}
+}
+
+
+
+
+/*
+ * Guess the true sample aspect ratio of a video stream,
+ *  given the MPEG aspect ratio code and the actual frame size
+ *  (and the MPEG version, 1 or 2).
+ *
+ * Returns y4m_sar_UNKNOWN if it has no good guess.
+ *
+ */
+y4m_ratio_t 
+mpeg_guess_sample_aspect_ratio(int mpeg_version,
+							   mpeg_aspect_code_t code,
+							   int frame_width, int frame_height)
+{
+	switch (mpeg_version) 
+	{
+	case 1:
+		/* MPEG-1 codes turn into SAR's, just not quite the right ones.
+		   For the common/known values, we provide the ratio used in practice,
+		   otherwise say we don't know.*/
+		switch (code)
+		{
+		case 1:  return y4m_sar_SQUARE;        break;
+		case 3:  return y4m_sar_PAL_16_9;      break;
+		case 6:  return y4m_sar_NTSC_16_9;     break;
+		case 8:  return y4m_sar_PAL_CCIR601;   break;
+		case 12: return y4m_sar_NTSC_CCIR601;  break;
+		default:
+			return y4m_sar_UNKNOWN;       break;
+		}
+		break;
+	case 2:
+		/* MPEG-2 codes turn into Frame Aspect Ratios, though not exactly the
+		   FAR's used in practice.  For common/standard frame sizes, we provide
+		   the original SAR; otherwise, we say we don't know. */
+		if (code == 1) 
+		{
+			return y4m_sar_SQUARE; /* '1' means square *pixels* in MPEG-2 */
+		}
+		else if ((code >= 2) && (code <= 4))
+		{
+            return y4m_guess_sar(frame_width, frame_height,
+                                 mpeg2_aspect_ratios[code-1]);
+		} 
+		else
+		{
+			return y4m_sar_UNKNOWN;
+		}
+		break;
+	default:
+		return y4m_sar_UNKNOWN;
+		break;
+	}
+}
+
+
+
+
+
+/*
+ * Look-up MPEG explanatory definition string for frame rate code
+ *
+ */
+
+
+const char *
+mpeg_framerate_code_definition(   mpeg_framerate_code_t code  )
+{
+	if( code == 0 || code >=  mpeg_num_framerates )
+		return "UNDEFINED: illegal/reserved frame-rate ratio code";
+
+	return framerate_definitions[code];
+}
+
+/*
+ * Look-up MPEG explanatory definition string aspect ratio code for an
+ * aspect ratio code
+ *
+ */
+
+const char *
+mpeg_aspect_code_definition( int mpeg_version,  mpeg_aspect_code_t code  )
+{
+	if( mpeg_version < 1 || mpeg_version > 2 )
+		return "UNDEFINED: illegal MPEG version";
+	
+	if( code < 1 || code >  mpeg_num_aspect_ratios[mpeg_version-1] )
+		return "UNDEFINED: illegal aspect ratio code";
+
+	return aspect_ratio_definitions[mpeg_version-1][code-1];
+}
+
+
+/*
+ * Look-up explanatory definition of interlace field order code
+ *
+ */
+
+const char *
+mpeg_interlace_code_definition( int yuv4m_interlace_code )
+{
+	const char *def;
+	switch( yuv4m_interlace_code )
+	{
+	case Y4M_UNKNOWN :
+		def = "unknown";
+		break;
+	case Y4M_ILACE_NONE :
+		def = "none/progressive";
+		break;
+	case Y4M_ILACE_TOP_FIRST :
+		def = "top-field-first";
+		break;
+	case Y4M_ILACE_BOTTOM_FIRST :
+		def = "bottom-field-first";
+		break;
+	default :
+		def = "UNDEFINED: illegal video interlacing type-code!";
+		break;
+	}
+	return def;
+}
+
+
+/* 
+ * Local variables:
+ *  c-file-style: "stroustrup"
+ *  tab-width: 4
+ *  indent-tabs-mode: nil
+ * End:
+ */
diff --git a/veejay-current/utils/mpegconsts.h b/veejay-current/utils/mpegconsts.h
new file mode 100644
index 00000000..461fd2db
--- /dev/null
+++ b/veejay-current/utils/mpegconsts.h
@@ -0,0 +1,149 @@
+
+/*
+ *  mpegconsts.c:  Video format constants for MPEG and utilities for display
+ *                 and conversion to format used for yuv4mpeg
+ *
+ *  Copyright (C) 2001 Andrew Stevens <andrew.stevens@philips.com>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of version 2 of the GNU General Public License
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+#ifndef __MPEGCONSTS_H__
+#define __MPEGCONSTS_H__
+
+
+#include "yuv4mpeg.h"
+
+
+typedef unsigned int mpeg_framerate_code_t;
+typedef unsigned int mpeg_aspect_code_t;
+
+extern const mpeg_framerate_code_t mpeg_num_framerates;
+extern const mpeg_aspect_code_t mpeg_num_aspect_ratios[2];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Convert MPEG frame-rate code to corresponding frame-rate
+ *  y4m_fps_UNKNOWN = { 0, 0 } = Undefined/resrerved code.
+ */
+
+y4m_ratio_t
+mpeg_framerate( mpeg_framerate_code_t code );
+
+
+/*
+ * Look-up MPEG frame rate code for a (exact) frame rate.
+ *  0 = No MPEG code defined for frame-rate
+ */
+
+mpeg_framerate_code_t 
+mpeg_framerate_code( y4m_ratio_t framerate );
+
+
+/*
+ * Convert floating-point framerate to an exact ratio.
+ *  Uses a standard MPEG rate, if it finds one within MPEG_FPS_TOLERANCE
+ *  (see mpegconsts.c), otherwise uses "fps:1000000" as the ratio.
+ */
+
+y4m_ratio_t
+mpeg_conform_framerate( double fps );
+
+
+/*
+ * Convert MPEG aspect ratio code to corresponding aspect ratio
+ *
+ * WARNING: The semantics of aspect ratio coding *changed* between
+ * MPEG1 and MPEG2.  In MPEG1 it is the *pixel* aspect ratio. In
+ * MPEG2 it is the (far more sensible) aspect ratio of the eventual
+ * display.
+ *
+ */
+
+y4m_ratio_t
+mpeg_aspect_ratio( int mpeg_version,  mpeg_aspect_code_t code );
+
+/*
+ * Look-up MPEG aspect ratio code for an aspect ratio - tolerance
+ * is Y4M_ASPECT_MULT used by YUV4MPEG (see yuv4mpeg_intern.h)
+ *
+ * WARNING: The semantics of aspect ratio coding *changed* between
+ * MPEG1 and MPEG2.  In MPEG1 it is the *pixel* aspect ratio. In
+ * MPEG2 it is the (far more sensible) aspect ratio of the eventual
+ * display.
+ *
+ */
+
+mpeg_aspect_code_t 
+mpeg_frame_aspect_code( int mpeg_version, y4m_ratio_t aspect_ratio );
+
+/*
+ * Look-up MPEG explanatory definition string aspect ratio code for an
+ * aspect ratio code
+ *
+ */
+
+const char *
+mpeg_aspect_code_definition( int mpeg_version,  mpeg_aspect_code_t code  );
+
+/*
+ * Look-up MPEG explanatory definition string aspect ratio code for an
+ * frame rate code
+ *
+ */
+
+const char *
+mpeg_framerate_code_definition( mpeg_framerate_code_t code  );
+
+const char *
+mpeg_interlace_code_definition( int yuv4m_interlace_code );
+
+
+/*
+ * Guess the correct MPEG aspect ratio code,
+ *  given the true sample aspect ratio and frame size of a video stream
+ *  (and the MPEG version, 1 or 2).
+ *
+ * Returns 0 if it has no good answer.
+ *
+ */
+mpeg_aspect_code_t 
+mpeg_guess_mpeg_aspect_code(int mpeg_version, y4m_ratio_t sampleaspect,
+			    int frame_width, int frame_height);
+
+/*
+ * Guess the true sample aspect ratio of a video stream,
+ *  given the MPEG aspect ratio code and the actual frame size
+ *  (and the MPEG version, 1 or 2).
+ *
+ * Returns y4m_sar_UNKNOWN if it has no good answer.
+ *
+ */
+y4m_ratio_t 
+mpeg_guess_sample_aspect_ratio(int mpeg_version,
+			       mpeg_aspect_code_t code,
+			       int frame_width, int frame_height);
+
+
+#ifdef __cplusplus
+};
+#endif
+
+
+
+#endif /* __MPEGCONSTS_H__ */
diff --git a/veejay-current/utils/mpegtimecode.c b/veejay-current/utils/mpegtimecode.c
new file mode 100644
index 00000000..09986739
--- /dev/null
+++ b/veejay-current/utils/mpegtimecode.c
@@ -0,0 +1,105 @@
+/*
+ *  Copyright (C) 2001 Kawamata/Hitoshi <hitoshi.kawamata@nifty.ne.jp>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <config.h>
+#include <stdlib.h>
+#include "mpegtimecode.h"
+
+/**************************************************************
+ * // NTSC DROP FRAME TIMECODE / 29.97fps (SMTPE)
+ * //    hh:mm:ss:ff
+ * //       hh: 0..
+ * //       mm: 0..59
+ * //       ss: 0..59
+ * //       ff: 0..29 # ss != 0 || mm % 10 == 0
+ * //           2..29 # ss == 0 && mm % 10 != 0
+ * //
+ * // 00:00:00:00 00:00:00:01 00:00:00:02 ... 00:00:00:29
+ * // 00:00:01:00 00:00:01:01 00:00:01:02 ... 00:00:01:29
+ * //                        :
+ * // 00:00:59:00 00:00:59:01 00:00:59:02 ... 00:00:59:29
+ * //                         00:01:00:02 ... 00:01:00:29
+ * // 00:01:01:00 00:01:01:01 00:01:01:02 ... 00:01:00:29
+ * //                        :
+ * // 00:01:59:00 00:01:59:01 00:01:59:02 ... 00:01:59:29
+ * //                         00:02:00:02 ... 00:02:00:29
+ * // 00:02:01:00 00:02:01:01 00:02:01:02 ... 00:02:00:29
+ * //                        :
+ * //                        :
+ * // 00:09:59:00 00:09:59:01 00:09:59:02 ... 00:09:59:29
+ * // 00:10:00:00 00:10:00:01 00:10:00:02 ... 00:10:00:29
+ * // 00:10:01:00 00:10:01:01 00:10:01:02 ... 00:10:01:29
+ * //                        :
+ * // 00:10:59:00 00:10:59:01 00:10:59:02 ... 00:10:59:29
+ * //                         00:11:00:02 ... 00:11:00:29
+ * // 00:11:01:00 00:11:01:01 00:11:01:02 ... 00:11:00:29
+ * //                        :
+ * //                        :
+ * // DROP FRAME / 59.94fps (no any standard)
+ * // DROP FRAME / 23.976fps (no any standard)
+ ***************************************************************/
+
+int dropframetimecode = -1;
+
+/* mpeg_timecode() return -tc->f on first frame in the minute, tc->f on other. */
+int
+mpeg_timecode(MPEG_timecode_t *tc, int f, int fpscode, double fps)
+{
+  static const int ifpss[] = { 0, 24, 24, 25, 30, 30, 50, 60, 60, };
+  int h, m, s;
+
+  if (dropframetimecode < 0) {
+    char *env = getenv("MJPEG_DROP_FRAME_TIME_CODE");
+    dropframetimecode = (env && *env != '0' && *env != 'n' && *env != 'N');
+  }
+  if (dropframetimecode &&
+      0 < fpscode && fpscode + 1 < sizeof ifpss / sizeof ifpss[0] &&
+      ifpss[fpscode] == ifpss[fpscode + 1]) {
+    int topinmin = 0, k = (30*4) / ifpss[fpscode];
+    f *= k;			/* frame# when 119.88fps */
+    h = (f / ((10*60*30-18)*4)); /* # of 10min. */
+    f %= ((10*60*30-18)*4);	/* frame# in 10min. */
+    f -= (2*4);			/* frame# in 10min. - (2*4) */
+    m = (f / ((60*30-2)*4));	/* min. in 10min. */
+    topinmin = ((f - k) / ((60*30-2)*4) < m);
+    m += (h % 6 * 10);		/* min. */
+    h /= 6;			/* hour */
+    f %= ((60*30-2)*4);		/* frame# in min. - (2*4)*/
+    f += (2*4);			/* frame# in min. */
+    s = f / (30*4);		/* sec. */
+    f %= (30*4);		/* frame# in sec. */
+    f /= k;			/* frame# in sec. on original fps */
+    tc->f = f;
+    if (topinmin)
+      f = -f;
+  } else {
+    int ifps = ((0 < fpscode && fpscode < sizeof ifpss / sizeof ifpss[0])?
+		ifpss[fpscode]: (int)(fps + .5));
+    s = f / ifps;
+    f %= ifps;
+    m = s / 60;
+    s %= 60;
+    h = m / 60;
+    m %= 60;
+    tc->f = f;
+  }
+  tc->s = s;
+  tc->m = m;
+  tc->h = h;
+  return f;
+}
diff --git a/veejay-current/utils/mpegtimecode.h b/veejay-current/utils/mpegtimecode.h
new file mode 100644
index 00000000..e702a9b7
--- /dev/null
+++ b/veejay-current/utils/mpegtimecode.h
@@ -0,0 +1,39 @@
+/* -*- mode:C -*- */
+/*
+ *  Copyright (C) 2001 Kawamata/Hitoshi <hitoshi.kawamata@nifty.ne.jp>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __MPEGTIMECODE_H__
+#define __MPEGTIMECODE_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  char h, m, s, f;
+} MPEG_timecode_t;
+
+extern int dropframetimecode;
+extern int mpeg_timecode(MPEG_timecode_t *tc, int f, int fpscode, double fps);
+/* mpeg_timecode() return -tc->f on first frame in the minute, tc->f on other. */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/veejay-current/utils/videodev_mjpeg.h b/veejay-current/utils/videodev_mjpeg.h
new file mode 100644
index 00000000..68fd79c5
--- /dev/null
+++ b/veejay-current/utils/videodev_mjpeg.h
@@ -0,0 +1,118 @@
+/* These are the MJPEG API extensions for the Video4Linux API,
+   first introduced by the Iomega Buz driver by Rainer Johanni 
+   <rainer@johanni.de>
+*/
+
+/* This is identical with the mgavideo internal params struct, 
+   please tell me if you change this struct here ! <gz@lysator.liu.se) */
+struct mjpeg_params
+{
+
+   /* The following parameters can only be queried */
+
+   int major_version;            /* Major version number of driver */
+   int minor_version;            /* Minor version number of driver */
+
+   /* Main control parameters */
+
+   int input;                    /* Input channel: 0 = Composite, 1 = S-VHS */
+   int norm;                     /* Norm: VIDEO_MODE_PAL or VIDEO_MODE_NTSC */
+   int decimation;               /* decimation of captured video,
+                                    enlargement of video played back.
+                                    Valid values are 1, 2, 4 or 0.
+                                    0 is a special value where the user
+                                    has full control over video scaling */
+
+   /* The following parameters only have to be set if decimation==0,
+      for other values of decimation they provide the data how the image is captured */
+
+   int HorDcm;                    /* Horizontal decimation: 1, 2 or 4 */
+   int VerDcm;                    /* Vertical decimation: 1 or 2 */
+   int TmpDcm;                    /* Temporal decimation: 1 or 2,
+                                     if TmpDcm==2 in capture every second frame is dropped,
+                                     in playback every frame is played twice */
+   int field_per_buff;            /* Number of fields per buffer: 1 or 2 */
+   int img_x;                     /* start of image in x direction */
+   int img_y;                     /* start of image in y direction */
+   int img_width;                 /* image width BEFORE decimation,
+                                     must be a multiple of HorDcm*16 */
+   int img_height;                /* image height BEFORE decimation,
+                                     must be a multiple of VerDcm*8 */
+
+   /* --- End of parameters for decimation==0 only --- */
+
+   /* JPEG control parameters */
+
+   int  quality;                  /* Measure for quality of compressed images.
+                                     Scales linearly with the size of the compressed images.
+                                     Must be beetween 0 and 100, 100 is a compression
+                                     ratio of 1:4 */
+
+   int  odd_even;                 /* Which field should come first ???
+                                     This is more aptly named "top_first",
+                                     i.e. (odd_even==1) --> top-field-first */
+
+   int  APPn;                     /* Number of APP segment to be written, must be 0..15 */
+   int  APP_len;                  /* Length of data in JPEG APPn segment */
+   char APP_data[60];             /* Data in the JPEG APPn segment. */
+
+   int  COM_len;                  /* Length of data in JPEG COM segment */
+   char COM_data[60];             /* Data in JPEG COM segment */
+
+   unsigned long jpeg_markers;    /* Which markers should go into the JPEG output.
+                                     Unless you exactly know what you do, leave them untouched.
+                                     Inluding less markers will make the resulting code
+                                     smaller, but there will be fewer aplications
+                                     which can read it.
+                                     The presence of the APP and COM marker is
+                                     influenced by APP0_len and COM_len ONLY! */
+#define JPEG_MARKER_DHT (1<<3)    /* Define Huffman Tables */
+#define JPEG_MARKER_DQT (1<<4)    /* Define Quantization Tables */
+#define JPEG_MARKER_DRI (1<<5)    /* Define Restart Interval */
+#define JPEG_MARKER_COM (1<<6)    /* Comment segment */
+#define JPEG_MARKER_APP (1<<7)    /* App segment, driver will allways use APP0 */
+
+   int  VFIFO_FB;                 /* Flag for enabling Video Fifo Feedback.
+                                     If this flag is turned on and JPEG decompressing
+                                     is going to the screen, the decompress process
+                                     is stopped every time the Video Fifo is full.
+                                     This enables a smooth decompress to the screen
+                                     but the video output signal will get scrambled */
+
+   /* Misc */
+
+	char reserved[312];  /* Makes 512 bytes for this structure */
+};
+
+struct mjpeg_requestbuffers
+{
+   unsigned long count;      /* Number of buffers for MJPEG grabbing */
+   unsigned long size;       /* Size PER BUFFER in bytes */
+};
+
+struct mjpeg_sync
+{
+   unsigned long frame;      /* Frame (0 - n) for double buffer */
+   unsigned long length;     /* number of code bytes in buffer (capture only) */
+   unsigned long seq;        /* frame sequence number */
+   struct timeval timestamp; /* timestamp */
+};
+
+struct mjpeg_status
+{
+   int input;                /* Input channel, has to be set prior to BUZIOC_G_STATUS */
+   int signal;               /* Returned: 1 if valid video signal detected */
+   int norm;                 /* Returned: VIDEO_MODE_PAL or VIDEO_MODE_NTSC */
+   int color;                /* Returned: 1 if color signal detected */
+};
+
+/*
+Private IOCTL to set up for displaying MJPEG
+*/
+#define MJPIOC_G_PARAMS       _IOR ('v', BASE_VIDIOCPRIVATE+0,  struct mjpeg_params)
+#define MJPIOC_S_PARAMS       _IOWR('v', BASE_VIDIOCPRIVATE+1,  struct mjpeg_params)
+#define MJPIOC_REQBUFS        _IOWR('v', BASE_VIDIOCPRIVATE+2,  struct mjpeg_requestbuffers)
+#define MJPIOC_QBUF_CAPT      _IOW ('v', BASE_VIDIOCPRIVATE+3,  int)
+#define MJPIOC_QBUF_PLAY      _IOW ('v', BASE_VIDIOCPRIVATE+4,  int)
+#define MJPIOC_SYNC           _IOR ('v', BASE_VIDIOCPRIVATE+5,  struct mjpeg_sync)
+#define MJPIOC_G_STATUS       _IOWR('v', BASE_VIDIOCPRIVATE+6,  struct mjpeg_status)
diff --git a/veejay-current/utils/yuv4mpeg.c b/veejay-current/utils/yuv4mpeg.c
new file mode 100644
index 00000000..5c7cd8fd
--- /dev/null
+++ b/veejay-current/utils/yuv4mpeg.c
@@ -0,0 +1,774 @@
+/*
+ *  yuv4mpeg.c:  Functions for reading and writing "new" YUV4MPEG streams
+ *
+ *  Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ */
+
+#include <config.h>
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "yuv4mpeg.h"
+#include "yuv4mpeg_intern.h"
+#include "mjpeg_logging.h"
+
+
+static int _y4mparam_allow_unknown_tags = 1;  /* default is forgiveness */
+
+static void *(*_y4m_alloc)(size_t bytes) = malloc;
+static void (*_y4m_free)(void *ptr) = free;
+
+
+
+int y4m_allow_unknown_tags(int yn)
+{
+  int old = _y4mparam_allow_unknown_tags;
+  if (yn >= 0)
+    _y4mparam_allow_unknown_tags = (yn) ? 1 : 0;
+  return old;
+}
+
+
+
+/*************************************************************************
+ *
+ * Convenience functions for fd read/write
+ *
+ *   - guaranteed to transfer entire payload (or fail)
+ *   - returns:
+ *               0 on complete success
+ *               +(# of remaining bytes) on eof (for y4m_read)
+ *               -(# of rem. bytes) on error (and ERRNO should be set)
+ *     
+ *************************************************************************/
+
+
+ssize_t y4m_read(int fd, void *buf, size_t len)
+{
+   ssize_t n;
+   uint8_t *ptr = (uint8_t *)buf;
+
+   while (len > 0) {
+     n = read(fd, ptr, len);
+     if (n <= 0) {
+       /* return amount left to read */
+       if (n == 0)
+	 return len;  /* n == 0 --> eof */
+       else
+	 return -len; /* n < 0 --> error */
+     }
+     ptr += n;
+     len -= n;
+   }
+   return 0;
+}
+
+
+ssize_t y4m_write(int fd, const void *buf, size_t len)
+{
+   ssize_t n;
+   const uint8_t *ptr = (const uint8_t *)buf;
+
+   while (len > 0) {
+     n = write(fd, ptr, len);
+     if (n <= 0) return -len;  /* return amount left to write */
+     ptr += n;
+     len -= n;
+   }
+   return 0;
+}
+
+
+
+/*************************************************************************
+ *
+ * "Extra tags" handling
+ *
+ *************************************************************************/
+
+
+static char *y4m_new_xtag(void)
+{
+  return _y4m_alloc(Y4M_MAX_XTAG_SIZE * sizeof(char));
+}
+
+
+void y4m_init_xtag_list(y4m_xtag_list_t *xtags)
+{
+  int i;
+  xtags->count = 0;
+  for (i = 0; i < Y4M_MAX_XTAGS; i++) {
+    xtags->tags[i] = NULL;
+  }
+}
+
+
+void y4m_fini_xtag_list(y4m_xtag_list_t *xtags)
+{
+  int i;
+  for (i = 0; i < Y4M_MAX_XTAGS; i++) {
+    if (xtags->tags[i] != NULL) {
+      _y4m_free(xtags->tags[i]);
+      xtags->tags[i] = NULL;
+    }
+  }
+  xtags->count = 0;
+}
+
+
+void y4m_copy_xtag_list(y4m_xtag_list_t *dest, const y4m_xtag_list_t *src)
+{
+  int i;
+  for (i = 0; i < src->count; i++) {
+    if (dest->tags[i] == NULL) 
+      dest->tags[i] = y4m_new_xtag();
+    strncpy(dest->tags[i], src->tags[i], Y4M_MAX_XTAG_SIZE);
+  }
+  dest->count = src->count;
+}
+
+
+
+static int y4m_snprint_xtags(char *s, int maxn, const y4m_xtag_list_t *xtags)
+{
+  int i, room;
+  
+  for (i = 0, room = maxn - 1; i < xtags->count; i++) {
+    int n = snprintf(s, room + 1, " %s", xtags->tags[i]);
+    if ((n < 0) || (n > room)) return Y4M_ERR_HEADER;
+    s += n;
+    room -= n;
+  }
+  s[0] = '\n';  /* finish off header with newline */
+  s[1] = '\0';  /* ...and end-of-string           */
+  return Y4M_OK;
+}
+
+
+int y4m_xtag_count(const y4m_xtag_list_t *xtags)
+{
+  return xtags->count;
+}
+
+
+const char *y4m_xtag_get(const y4m_xtag_list_t *xtags, int n)
+{
+  if (n >= xtags->count)
+    return NULL;
+  else
+    return xtags->tags[n];
+}
+
+
+int y4m_xtag_add(y4m_xtag_list_t *xtags, const char *tag)
+{
+  if (xtags->count >= Y4M_MAX_XTAGS) return Y4M_ERR_XXTAGS;
+  if (xtags->tags[xtags->count] == NULL) 
+    xtags->tags[xtags->count] = y4m_new_xtag();
+  strncpy(xtags->tags[xtags->count], tag, Y4M_MAX_XTAG_SIZE);
+  (xtags->count)++;
+  return Y4M_OK;
+}
+
+
+int y4m_xtag_remove(y4m_xtag_list_t *xtags, int n)
+{
+  int i;
+  char *q;
+
+  if ((n < 0) || (n >= xtags->count)) return Y4M_ERR_RANGE;
+  q = xtags->tags[n];
+  for (i = n; i < (xtags->count - 1); i++)
+    xtags->tags[i] = xtags->tags[i+1];
+  xtags->tags[i] = q;
+  (xtags->count)--;
+  return Y4M_OK;
+}
+
+
+int y4m_xtag_clearlist(y4m_xtag_list_t *xtags)
+{
+  xtags->count = 0;
+  return Y4M_OK;
+}
+
+
+int y4m_xtag_addlist(y4m_xtag_list_t *dest, const y4m_xtag_list_t *src)
+{
+  int i, j;
+
+  if ((dest->count + src->count) > Y4M_MAX_XTAGS) return Y4M_ERR_XXTAGS;
+  for (i = dest->count, j = 0;
+       j < src->count;
+       i++, j++) {
+    if (dest->tags[i] == NULL) 
+      dest->tags[i] = y4m_new_xtag();
+    strncpy(dest->tags[i], src->tags[i], Y4M_MAX_XTAG_SIZE);
+  }
+  dest->count += src->count;
+  return Y4M_OK;
+}  
+
+
+/*************************************************************************
+ *
+ * Creators/destructors for y4m_*_info_t structures
+ *
+ *************************************************************************/
+
+
+void y4m_init_stream_info(y4m_stream_info_t *info)
+{
+  if (info == NULL) return;
+  /* initialize info */
+  info->width = Y4M_UNKNOWN;
+  info->height = Y4M_UNKNOWN;
+  info->interlace = Y4M_UNKNOWN;
+  info->framerate = y4m_fps_UNKNOWN;
+  info->sampleaspect = y4m_sar_UNKNOWN;
+  y4m_init_xtag_list(&(info->x_tags));
+}
+
+
+void y4m_copy_stream_info(y4m_stream_info_t *dest,
+			  const y4m_stream_info_t *src)
+{
+  if ((dest == NULL) || (src == NULL)) return;
+  /* copy info */
+  dest->width = src->width;
+  dest->height = src->height;
+  dest->interlace = src->interlace;
+  dest->framerate = src->framerate;
+  dest->sampleaspect = src->sampleaspect;
+  dest->framelength = src->framelength;
+  y4m_copy_xtag_list(&(dest->x_tags), &(src->x_tags));
+}
+
+
+void y4m_fini_stream_info(y4m_stream_info_t *info)
+{
+  if (info == NULL) return;
+  y4m_fini_xtag_list(&(info->x_tags));
+}
+
+
+void y4m_si_set_width(y4m_stream_info_t *si, int width)
+{
+  si->width = width;
+  si->framelength = (si->height * si->width) * 3 / 2;
+}
+
+int y4m_si_get_width(const y4m_stream_info_t *si)
+{ return si->width; }
+
+void y4m_si_set_height(y4m_stream_info_t *si, int height)
+{
+  si->height = height; 
+  si->framelength = (si->height * si->width) * 3 / 2;
+}
+
+int y4m_si_get_height(const y4m_stream_info_t *si)
+{ return si->height; }
+
+void y4m_si_set_interlace(y4m_stream_info_t *si, int interlace)
+{ si->interlace = interlace; }
+
+int y4m_si_get_interlace(const y4m_stream_info_t *si)
+{ return si->interlace; }
+
+void y4m_si_set_framerate(y4m_stream_info_t *si, y4m_ratio_t framerate)
+{ si->framerate = framerate; }
+
+y4m_ratio_t y4m_si_get_framerate(const y4m_stream_info_t *si)
+{ return si->framerate; }
+
+void y4m_si_set_sampleaspect(y4m_stream_info_t *si, y4m_ratio_t sar)
+{ si->sampleaspect = sar; }
+
+y4m_ratio_t y4m_si_get_sampleaspect(const y4m_stream_info_t *si)
+{ return si->sampleaspect; }
+
+int y4m_si_get_framelength(const y4m_stream_info_t *si)
+{ return si->framelength; }
+
+y4m_xtag_list_t *y4m_si_xtags(y4m_stream_info_t *si)
+{ return &(si->x_tags); }
+
+
+
+void y4m_init_frame_info(y4m_frame_info_t *info)
+{
+  if (info == NULL) return;
+  /* initialize info */
+  y4m_init_xtag_list(&(info->x_tags));
+}
+
+
+void y4m_copy_frame_info(y4m_frame_info_t *dest, const y4m_frame_info_t *src)
+{
+  if ((dest == NULL) || (src == NULL)) return;
+  /* copy info */
+  y4m_copy_xtag_list(&(dest->x_tags), &(src->x_tags));
+}
+
+
+void y4m_fini_frame_info(y4m_frame_info_t *info)
+{
+  if (info == NULL) return;
+  y4m_fini_xtag_list(&(info->x_tags));
+}
+
+
+
+/*************************************************************************
+ *
+ * Tag parsing 
+ *
+ *************************************************************************/
+
+int y4m_parse_stream_tags(char *s, y4m_stream_info_t *i)
+{
+  char *token, *value;
+  char tag;
+  int err;
+
+  /* parse fields */
+  for (token = strtok(s, Y4M_DELIM); 
+       token != NULL; 
+       token = strtok(NULL, Y4M_DELIM)) {
+    if (token[0] == '\0') continue;   /* skip empty strings */
+    tag = token[0];
+    value = token + 1;
+    switch (tag) {
+    case 'W':  /* width */
+      i->width = atoi(value);
+      if (i->width <= 0) return Y4M_ERR_RANGE;
+      break;
+    case 'H':  /* height */
+      i->height = atoi(value); 
+      if (i->height <= 0) return Y4M_ERR_RANGE;
+      break;
+    case 'F':  /* frame rate (fps) */
+      if ((err = y4m_parse_ratio(&(i->framerate), value)) != Y4M_OK)
+	return err;
+      if (i->framerate.n < 0) return Y4M_ERR_RANGE;
+      break;
+    case 'I':  /* interlacing */
+      switch (value[0]) {
+      case 'p':  i->interlace = Y4M_ILACE_NONE; break;
+      case 't':  i->interlace = Y4M_ILACE_TOP_FIRST; break;
+      case 'b':  i->interlace = Y4M_ILACE_BOTTOM_FIRST; break;
+      case '?':
+      default:
+	i->interlace = Y4M_UNKNOWN; break;
+      }
+      break;
+    case 'A':  /* sample (pixel) aspect ratio */
+      if ((err = y4m_parse_ratio(&(i->sampleaspect), value)) != Y4M_OK)
+	return err;
+      if (i->sampleaspect.n < 0) return Y4M_ERR_RANGE;
+      break;
+    case 'X':  /* 'X' meta-tag */
+      if ((err = y4m_xtag_add(&(i->x_tags), token)) != Y4M_OK) return err;
+      break;
+    default:
+      /* possible error on unknown options */
+      if (_y4mparam_allow_unknown_tags) {
+	/* unknown tags ok:  store in xtag list and warn... */
+	if ((err = y4m_xtag_add(&(i->x_tags), token)) != Y4M_OK) return err;
+	mjpeg_warn("Unknown stream tag encountered:  '%s'", token);
+      } else {
+	/* unknown tags are *not* ok */
+	return Y4M_ERR_BADTAG;
+      }
+      break;
+    }
+  }
+  /* Error checking... width and height must be known since we can't
+   * parse without them
+   */
+  if( i->width == Y4M_UNKNOWN || i->height == Y4M_UNKNOWN )
+	  return Y4M_ERR_HEADER;
+  /* ta da!  done. */
+  return Y4M_OK;
+}
+
+
+
+static int y4m_parse_frame_tags(char *s, y4m_frame_info_t *i)
+{
+  char *token, *value;
+  char tag;
+  int err;
+
+  /* parse fields */
+  for (token = strtok(s, Y4M_DELIM); 
+       token != NULL; 
+       token = strtok(NULL, Y4M_DELIM)) {
+    if (token[0] == '\0') continue;   /* skip empty strings */
+    tag = token[0];
+    value = token + 1;
+    switch (tag) {
+    case 'X':  /* 'X' meta-tag */
+      if ((err = y4m_xtag_add(&(i->x_tags), token)) != Y4M_OK) return err;
+      break;
+    default:
+      /* possible error on unknown options */
+      if (_y4mparam_allow_unknown_tags) {
+	/* unknown tags ok:  store in xtag list and warn... */
+	if ((err = y4m_xtag_add(&(i->x_tags), token)) != Y4M_OK) return err;
+	mjpeg_warn("Unknown frame tag encountered:  '%s'", token);
+      } else {
+	/* unknown tags are *not* ok */
+	return Y4M_ERR_BADTAG;
+      }
+      break;
+    }
+  }
+  /* ta da!  done. */
+  return Y4M_OK;
+}
+
+
+
+
+
+/*************************************************************************
+ *
+ * Read/Write stream header
+ *
+ *************************************************************************/
+
+
+int y4m_read_stream_header(int fd, y4m_stream_info_t *i)
+{
+   char line[Y4M_LINE_MAX];
+   char *p;
+   int n;
+   int err;
+
+   /* read the header line */
+   for (n = 0, p = line; n < Y4M_LINE_MAX; n++, p++) {
+     if (read(fd, p, 1) < 1) 
+       return Y4M_ERR_SYSTEM;
+     if (*p == '\n') {
+       *p = '\0';           /* Replace linefeed by end of string */
+       break;
+     }
+   }
+   if (n >= Y4M_LINE_MAX)
+      return Y4M_ERR_HEADER;
+   /* look for keyword in header */
+   if (strncmp(line, Y4M_MAGIC, strlen(Y4M_MAGIC)))
+    return Y4M_ERR_MAGIC;
+   if ((err = y4m_parse_stream_tags(line + strlen(Y4M_MAGIC), i)) != Y4M_OK)
+     return err;
+
+   i->framelength = (i->height * i->width) * 3 / 2;
+   return Y4M_OK;
+}
+
+
+
+int y4m_write_stream_header(int fd, const y4m_stream_info_t *i)
+{
+  char s[Y4M_LINE_MAX+1];
+  int n;
+  int err;
+  y4m_ratio_t rate = i->framerate;
+  y4m_ratio_t aspect = i->sampleaspect;
+
+  y4m_ratio_reduce(&rate);
+  y4m_ratio_reduce(&aspect);
+  n = snprintf(s, sizeof(s), "%s W%d H%d F%d:%d I%s A%d:%d",
+	       Y4M_MAGIC,
+	       i->width,
+	       i->height,
+	       rate.n, rate.d,
+	       (i->interlace == Y4M_ILACE_NONE) ? "p" :
+	       (i->interlace == Y4M_ILACE_TOP_FIRST) ? "t" :
+	       (i->interlace == Y4M_ILACE_BOTTOM_FIRST) ? "b" : "?",
+	       aspect.n, aspect.d);
+  if ((n < 0) || (n > Y4M_LINE_MAX)) return Y4M_ERR_HEADER;
+  if ((err = y4m_snprint_xtags(s + n, sizeof(s) - n - 1, &(i->x_tags))) 
+      != Y4M_OK) 
+    return err;
+  /* non-zero on error */
+  return (y4m_write(fd, s, strlen(s)) ? Y4M_ERR_SYSTEM : Y4M_OK);
+}
+
+
+
+
+
+/*************************************************************************
+ *
+ * Read/Write frame header
+ *
+ *************************************************************************/
+
+int y4m_read_frame_header(int fd, y4m_frame_info_t *i)
+{
+  char line[Y4M_LINE_MAX];
+  char *p;
+  int n;
+  ssize_t remain;
+  
+  /* This is more clever than read_stream_header...
+     Try to read "FRAME\n" all at once, and don't try to parse
+     if nothing else is there...
+  */
+  remain = y4m_read(fd, line, sizeof(Y4M_FRAME_MAGIC)-1+1); /* -'\0', +'\n' */
+  if (remain < 0) return Y4M_ERR_SYSTEM;
+  if (remain > 0) {
+    /* A clean EOF should end exactly at a frame-boundary */
+    if (remain == sizeof(Y4M_FRAME_MAGIC))
+      return Y4M_ERR_EOF;
+    else
+      return Y4M_ERR_BADEOF;
+  }
+  if (strncmp(line, Y4M_FRAME_MAGIC, sizeof(Y4M_FRAME_MAGIC)-1))
+    return Y4M_ERR_MAGIC;
+  if (line[sizeof(Y4M_FRAME_MAGIC)-1] == '\n')
+    return Y4M_OK; /* done -- no tags:  that was the end-of-line. */
+
+  if (line[sizeof(Y4M_FRAME_MAGIC)-1] != Y4M_DELIM[0]) {
+    return Y4M_ERR_MAGIC; /* wasn't a space -- what was it? */
+  }
+
+  /* proceed to get the tags... (overwrite the magic) */
+  for (n = 0, p = line; n < Y4M_LINE_MAX; n++, p++) {
+    if (y4m_read(fd, p, 1))
+      return Y4M_ERR_SYSTEM;
+    if (*p == '\n') {
+      *p = '\0';           /* Replace linefeed by end of string */
+      break;
+    }
+  }
+  if (n >= Y4M_LINE_MAX) return Y4M_ERR_HEADER;
+  /* non-zero on error */
+  return y4m_parse_frame_tags(line, i);
+}
+
+
+int y4m_write_frame_header(int fd, const y4m_frame_info_t *i)
+{
+  char s[Y4M_LINE_MAX+1];
+  int n;
+  int err;
+  
+  n = snprintf(s, sizeof(s), "%s", Y4M_FRAME_MAGIC);
+  if ((n < 0) || (n > Y4M_LINE_MAX)) return Y4M_ERR_HEADER;
+  if ((err = y4m_snprint_xtags(s + n, sizeof(s) - n - 1, &(i->x_tags))) 
+      != Y4M_OK) 
+    return err;
+  /* non-zero on error */
+  return (y4m_write(fd, s, strlen(s)) ? Y4M_ERR_SYSTEM : Y4M_OK);
+}
+
+
+
+/*************************************************************************
+ *
+ * Read/Write entire frame
+ *
+ *************************************************************************/
+
+int y4m_read_frame(int fd, const y4m_stream_info_t *si, 
+		   y4m_frame_info_t *fi, uint8_t * const yuv[3])
+{
+  int err;
+  int w = si->width;
+  int h = si->height;
+  
+  /* Read frame header */
+  if ((err = y4m_read_frame_header(fd, fi)) != Y4M_OK) return err;
+  /* Read luminance scanlines */
+  if (y4m_read(fd, yuv[0], w*h)) return Y4M_ERR_SYSTEM;
+  /* Read chrominance scanlines */
+  if (y4m_read(fd, yuv[1], w*h/4)) return Y4M_ERR_SYSTEM;
+  if (y4m_read(fd, yuv[2], w*h/4)) return Y4M_ERR_SYSTEM;
+
+  return Y4M_OK;
+}
+
+
+
+
+int y4m_write_frame(int fd, const y4m_stream_info_t *si, 
+		    const y4m_frame_info_t *fi, uint8_t * const yuv[3])
+{
+  int err;
+  int w = si->width;
+  int h = si->height;
+
+  /* Write frame header */
+  if ((err = y4m_write_frame_header(fd, fi)) != Y4M_OK) return err;
+  /* Write luminance,chrominance scanlines */
+  if (y4m_write(fd, yuv[0], w*h) ||
+      y4m_write(fd, yuv[1], w*h/4) ||
+      y4m_write(fd, yuv[2], w*h/4))
+    return Y4M_ERR_SYSTEM;
+  return Y4M_OK;
+}
+
+
+
+/*************************************************************************
+ *
+ * Read/Write entire frame, (de)interleaved (to)from two separate fields
+ *
+ *************************************************************************/
+
+
+int y4m_read_fields(int fd, const y4m_stream_info_t *si, y4m_frame_info_t *fi,
+                    uint8_t * const upper_field[3], 
+                    uint8_t * const lower_field[3])
+{
+  int i, y, err;
+  int width = si->width;
+  int height = si->height;
+  
+  /* Read frame header */
+  if ((err = y4m_read_frame_header(fd, fi)) != Y4M_OK) return err;
+  /* Read Y', Cb, and Cr planes */
+  for (i = 0; i < 3; i++) {
+    uint8_t *srctop = upper_field[i];
+    uint8_t *srcbot = lower_field[i];
+    /* alternately write one line from each */
+    for (y = 0; y < height; y += 2) {
+      if (y4m_read(fd, srctop, width)) return Y4M_ERR_SYSTEM;
+      srctop += width;
+      if (y4m_read(fd, srcbot, width)) return Y4M_ERR_SYSTEM;
+      srcbot += width;
+    }
+    /* for chroma, width/height are half as big */
+    if (i == 0) {
+      width /= 2;
+      height /= 2;
+    }
+  }
+  return Y4M_OK;
+}
+
+
+
+int y4m_write_fields(int fd, const y4m_stream_info_t *si,
+		     const y4m_frame_info_t *fi,
+                     uint8_t * const upper_field[3], 
+                     uint8_t * const lower_field[3])
+{
+  int i, y, err;
+  int width = si->width;
+  int height = si->height;
+
+  /* Write frame header */
+  if ((err = y4m_write_frame_header(fd, fi)) != Y4M_OK) return err;
+  /* Write Y', Cb, and Cr planes */
+  for (i = 0; i < 3; i++) {
+    uint8_t *srctop = upper_field[i];
+    uint8_t *srcbot = lower_field[i];
+    /* alternately write one line from each */
+    for (y = 0; y < height; y += 2) {
+      if (y4m_write(fd, srctop, width)) return Y4M_ERR_SYSTEM;
+      srctop += width;
+      if (y4m_write(fd, srcbot, width)) return Y4M_ERR_SYSTEM;
+      srcbot += width;
+    }
+    /* for chroma, width/height are half as big */
+    if (i == 0) {
+      width /= 2;
+      height /= 2;
+    }
+  }
+  return Y4M_OK;
+}
+
+
+
+/*************************************************************************
+ *
+ * Handy logging of stream info
+ *
+ *************************************************************************/
+
+void y4m_log_stream_info(log_level_t level, const char *prefix,
+			 const y4m_stream_info_t *i)
+{
+  char s[256];
+
+  snprintf(s, sizeof(s), "  frame size:  ");
+  if (i->width == Y4M_UNKNOWN)
+    snprintf(s+strlen(s), sizeof(s)-strlen(s), "(?)x");
+  else
+    snprintf(s+strlen(s), sizeof(s)-strlen(s), "%dx", i->width);
+  if (i->height == Y4M_UNKNOWN)
+    snprintf(s+strlen(s), sizeof(s)-strlen(s), "(?) pixels ");
+  else
+    snprintf(s+strlen(s), sizeof(s)-strlen(s), "%d pixels ", i->height);
+  if (i->framelength == Y4M_UNKNOWN)
+    snprintf(s+strlen(s), sizeof(s)-strlen(s), "(? bytes)");
+  else
+    snprintf(s+strlen(s), sizeof(s)-strlen(s), "(%d bytes)", i->framelength);
+  mjpeg_log(level, "%s%s", prefix, s);
+  if ((i->framerate.n == 0) && (i->framerate.d == 0))
+    mjpeg_log(level, "%s  frame rate:  ??? fps", prefix);
+  else
+    mjpeg_log(level, "%s  frame rate:  %d/%d fps (~%f)", prefix,
+	      i->framerate.n, i->framerate.d, 
+	      (double) i->framerate.n / (double) i->framerate.d);
+  mjpeg_log(level, "%s   interlace:  %s", prefix,
+	  (i->interlace == Y4M_ILACE_NONE) ? "none/progressive" :
+	  (i->interlace == Y4M_ILACE_TOP_FIRST) ? "top-field-first" :
+	  (i->interlace == Y4M_ILACE_BOTTOM_FIRST) ? "bottom-field-first" :
+	  "anyone's guess");
+  if ((i->sampleaspect.n == 0) && (i->sampleaspect.d == 0))
+    mjpeg_log(level, "%ssample aspect ratio:  ?:?", prefix);
+  else
+    mjpeg_log(level, "%ssample aspect ratio:  %d:%d", prefix,
+	      i->sampleaspect.n, i->sampleaspect.d);
+}
+
+
+/*************************************************************************
+ *
+ * Convert error code to string
+ *
+ *************************************************************************/
+
+const char *y4m_strerr(int err)
+{
+  switch (err) {
+  case Y4M_OK:          return "no error";
+  case Y4M_ERR_RANGE:   return "parameter out of range";
+  case Y4M_ERR_SYSTEM:  return "system error (failed read/write)";
+  case Y4M_ERR_HEADER:  return "bad stream or frame header";
+  case Y4M_ERR_BADTAG:  return "unknown header tag";
+  case Y4M_ERR_MAGIC:   return "bad header magic";
+  case Y4M_ERR_XXTAGS:  return "too many xtags";
+  case Y4M_ERR_EOF:     return "end-of-file";
+  case Y4M_ERR_BADEOF:  return "stream ended unexpectedly (EOF)";
+  default: 
+    return "unknown error code";
+  }
+}
+
+
diff --git a/veejay-current/utils/yuv4mpeg.h b/veejay-current/utils/yuv4mpeg.h
new file mode 100644
index 00000000..6028ea88
--- /dev/null
+++ b/veejay-current/utils/yuv4mpeg.h
@@ -0,0 +1,473 @@
+/*
+ *  yuv4mpeg.h:  Functions for reading and writing "new" YUV4MPEG2 streams.
+ *
+ *               Stream format is described at the end of this file.
+ *
+ *
+ *  Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+#ifndef __YUV4MPEG_H__
+#define __YUV4MPEG_H__
+
+#include <stdlib.h>
+#include <mjpeg_types.h>
+#include <unistd.h>
+#include <mjpeg_logging.h>
+
+
+/************************************************************************
+ *  error codes returned by y4m_* functions
+ ************************************************************************/
+#define Y4M_OK          0
+#define Y4M_ERR_RANGE   1  /* argument or tag value out of range */
+#define Y4M_ERR_SYSTEM  2  /* failed system call, check errno */
+#define Y4M_ERR_HEADER  3  /* illegal/malformed header */
+#define Y4M_ERR_BADTAG  4  /* illegal tag character */
+#define Y4M_ERR_MAGIC   5  /* bad header magic */
+#define Y4M_ERR_EOF     6  /* end-of-file (clean) */
+#define Y4M_ERR_XXTAGS  7  /* too many xtags */
+#define Y4M_ERR_BADEOF  8  /* unexpected end-of-file */
+
+
+/* generic 'unknown' value for integer parameters (e.g. interlace, height) */
+#define Y4M_UNKNOWN -1
+
+
+
+/************************************************************************
+ *  'ratio' datatype, for rational numbers
+ *                                     (see 'ratio' functions down below)
+ ************************************************************************/
+typedef struct _y4m_ratio {
+  int n;  /* numerator   */
+  int d;  /* denominator */
+} y4m_ratio_t;
+
+
+/************************************************************************
+ *  useful standard framerates (as ratios)
+ ************************************************************************/
+extern const y4m_ratio_t y4m_fps_UNKNOWN;
+extern const y4m_ratio_t y4m_fps_NTSC_FILM;  /* 24000/1001 film (in NTSC)  */
+extern const y4m_ratio_t y4m_fps_FILM;       /* 24fps film                 */
+extern const y4m_ratio_t y4m_fps_PAL;        /* 25fps PAL                  */
+extern const y4m_ratio_t y4m_fps_NTSC;       /* 30000/1001 NTSC            */
+extern const y4m_ratio_t y4m_fps_30;         /* 30fps                      */
+extern const y4m_ratio_t y4m_fps_PAL_FIELD;  /* 50fps PAL field rate       */
+extern const y4m_ratio_t y4m_fps_NTSC_FIELD; /* 60000/1001 NTSC field rate */
+extern const y4m_ratio_t y4m_fps_60;         /* 60fps                      */
+
+/************************************************************************
+ *  useful standard sample (pixel) aspect ratios (W:H)
+ ************************************************************************/
+extern const y4m_ratio_t y4m_sar_UNKNOWN; 
+extern const y4m_ratio_t y4m_sar_SQUARE;        /* square pixels */
+extern const y4m_ratio_t y4m_sar_NTSC_CCIR601;  /* 525-line (NTSC) Rec.601 */
+extern const y4m_ratio_t y4m_sar_NTSC_16_9;     /* 16:9 NTSC/Rec.601       */
+extern const y4m_ratio_t y4m_sar_NTSC_SVCD_4_3; /* NTSC SVCD 4:3           */
+extern const y4m_ratio_t y4m_sar_NTSC_SVCD_16_9;/* NTSC SVCD 16:9          */
+extern const y4m_ratio_t y4m_sar_PAL_CCIR601;   /* 625-line (PAL) Rec.601  */
+extern const y4m_ratio_t y4m_sar_PAL_16_9;      /* 16:9 PAL/Rec.601        */
+extern const y4m_ratio_t y4m_sar_PAL_SVCD_4_3;  /* PAL SVCD 4:3            */
+extern const y4m_ratio_t y4m_sar_PAL_SVCD_16_9; /* PAL SVCD 16:9           */
+extern const y4m_ratio_t y4m_sar_SQR_ANA16_9;   /* anamorphic 16:9 sampled */
+                                            /* from 4:3 with square pixels */
+
+/************************************************************************
+ *  useful standard display aspect ratios (W:H)
+ ************************************************************************/
+extern const y4m_ratio_t y4m_dar_UNKNOWN; 
+extern const y4m_ratio_t y4m_dar_4_3;     /* standard TV   */
+extern const y4m_ratio_t y4m_dar_16_9;    /* widescreen TV */
+extern const y4m_ratio_t y4m_dar_221_100; /* word-to-your-mother TV */
+
+
+/************************************************************************
+ *  'xtag_list' --- list of unparsed and/or meta/X header tags
+ *
+ *     Do not touch this structure directly!
+ *
+ *     Use the y4m_xtag_*() functions (see below).
+ *     You must initialize/finalize this structure before/after use.
+ ************************************************************************/
+#define Y4M_MAX_XTAGS 32        /* maximum number of xtags in list       */
+#define Y4M_MAX_XTAG_SIZE 32    /* max length of an xtag (including 'X') */
+typedef struct _y4m_xtag_list {
+  int count;
+  char *tags[Y4M_MAX_XTAGS];
+} y4m_xtag_list_t;
+
+
+
+/************************************************************************
+ *  'stream_info' --- stream header information
+ *
+ *     Do not touch this structure directly!
+ *
+ *     Use the y4m_si_*() functions (see below).
+ *     You must initialize/finalize this structure before/after use.
+ ************************************************************************/
+typedef struct _y4m_stream_info {
+  /* values from header */
+  int width;
+  int height;
+  int interlace;            /* see Y4M_ILACE_* definitions below   */
+  y4m_ratio_t framerate;    /* frames-per-second;   0:0 == unknown */
+  y4m_ratio_t sampleaspect; /* pixel width/height;  0:0 == unknown */
+  /* computed/derivative values */
+  int framelength;    /* bytes of data per frame (not including header) */
+  /* mystical X tags */
+  y4m_xtag_list_t x_tags;
+} y4m_stream_info_t;
+
+/* possible options for the interlace parameter */
+#define Y4M_ILACE_NONE          0   /* non-interlaced, progressive frame */
+#define Y4M_ILACE_TOP_FIRST     1   /* interlaced, top-field first       */
+#define Y4M_ILACE_BOTTOM_FIRST  2   /* interlaced, bottom-field first    */
+
+
+/************************************************************************
+ *  'frame_info' --- frame header information
+ *
+ *     Do not touch this structure directly!
+ *
+ *     Use the y4m_fi_*() functions (see below).
+ *     You must initialize/finalize this structure before/after use.
+ ************************************************************************/
+typedef struct _y4m_frame_info {
+  /* mystical X tags */
+  y4m_xtag_list_t x_tags;
+} y4m_frame_info_t;
+
+
+
+#ifdef __cplusplus
+extern "C" {
+#else
+#endif
+
+
+/************************************************************************
+ *  'ratio' functions
+ ************************************************************************/
+
+/* 'normalize' a ratio (remove common factors) */
+void y4m_ratio_reduce(y4m_ratio_t *r);
+
+/* parse "nnn:ddd" into a ratio (returns Y4M_OK or Y4M_ERR_RANGE) */
+int y4m_parse_ratio(y4m_ratio_t *r, const char *s);
+
+/* quick test of two ratios for equality (i.e. identical components) */
+#define Y4M_RATIO_EQL(a,b) ( ((a).n == (b).n) && ((a).d == (b).d) )
+
+/* quick conversion of a ratio to a double (no divide-by-zero check!) */
+#define Y4M_RATIO_DBL(r) ((double)(r).n / (double)(r).d)
+
+/*************************************************************************
+ *
+ * Guess the true SAR (sample aspect ratio) from a list of commonly 
+ * encountered values, given the "suggested" display aspect ratio (DAR),
+ * and the true frame width and height.
+ *
+ * Returns y4m_sar_UNKNOWN if no match is found.
+ *
+ *************************************************************************/
+y4m_ratio_t y4m_guess_sar(int width, int height, y4m_ratio_t dar);
+
+
+
+/************************************************************************
+ *  'xtag' functions
+ *
+ * o Before using an xtag_list (but after the structure/memory has been
+ *    allocated), you must initialize it via y4m_init_xtag_list().
+ * o After using an xtag_list (but before the structure is released),
+ *    call y4m_fini_xtag_list() to free internal memory.
+ *
+ ************************************************************************/
+
+/* initialize an xtag_list structure */
+void y4m_init_xtag_list(y4m_xtag_list_t *xtags);
+
+/* finalize an xtag_list structure */
+void y4m_fini_xtag_list(y4m_xtag_list_t *xtags);
+
+/* make one xtag_list into a copy of another */
+void y4m_copy_xtag_list(y4m_xtag_list_t *dest, const y4m_xtag_list_t *src);
+
+/* return number of tags in an xtag_list */
+int y4m_xtag_count(const y4m_xtag_list_t *xtags);
+
+/* access n'th tag in an xtag_list */
+const char *y4m_xtag_get(const y4m_xtag_list_t *xtags, int n);
+
+/* append a new tag to an xtag_list
+    returns:          Y4M_OK - success
+              Y4M_ERR_XXTAGS - list is already full */
+int y4m_xtag_add(y4m_xtag_list_t *xtags, const char *tag);
+
+/* remove a tag from an xtag_list 
+    returns:         Y4M_OK - success
+              Y4M_ERR_RANGE - n is out of range */
+int y4m_xtag_remove(y4m_xtag_list_t *xtags, int n);
+
+/* remove all tags from an xtag_list 
+    returns:   Y4M_OK - success       */
+int y4m_xtag_clearlist(y4m_xtag_list_t *xtags);
+
+/* append copies of tags from src list to dest list
+    returns:          Y4M_OK - success
+              Y4M_ERR_XXTAGS - operation would overfill dest list */
+int y4m_xtag_addlist(y4m_xtag_list_t *dest, const y4m_xtag_list_t *src);
+
+
+
+/************************************************************************
+ *  '*_info' functions
+ *
+ * o Before using a *_info structure (but after the structure/memory has
+ *    been allocated), you must initialize it via y4m_init_*_info().
+ * o After using a *_info structure (but before the structure is released),
+ *    call y4m_fini_*_info() to free internal memory.
+ * o Use the 'set' and 'get' accessors to modify or access the fields in
+ *    the structures; don't touch the structure directly.  (Ok, so there
+ *    is no really convenient C syntax to prevent you from doing this,
+ *    but we are all responsible programmers here, so just don't do it!)
+ *
+ ************************************************************************/
+
+/* initialize a stream_info structure */
+void y4m_init_stream_info(y4m_stream_info_t *i);
+
+/* finalize a stream_info structure */
+void y4m_fini_stream_info(y4m_stream_info_t *i);
+
+/* make one stream_info into a copy of another */
+void y4m_copy_stream_info(y4m_stream_info_t *dest,
+			  const y4m_stream_info_t *src);
+
+/* access or set stream_info fields */
+void y4m_si_set_width(y4m_stream_info_t *si, int width);
+int y4m_si_get_width(const y4m_stream_info_t *si);
+void y4m_si_set_height(y4m_stream_info_t *si, int height);
+int y4m_si_get_height(const y4m_stream_info_t *si);
+void y4m_si_set_interlace(y4m_stream_info_t *si, int interlace);
+int y4m_si_get_interlace(const y4m_stream_info_t *si);
+void y4m_si_set_framerate(y4m_stream_info_t *si, y4m_ratio_t framerate);
+y4m_ratio_t y4m_si_get_framerate(const y4m_stream_info_t *si);
+void y4m_si_set_sampleaspect(y4m_stream_info_t *si, y4m_ratio_t sar);
+y4m_ratio_t y4m_si_get_sampleaspect(const y4m_stream_info_t *si);
+int y4m_si_get_framelength(const y4m_stream_info_t *si);
+
+/* access stream_info xtag_list */
+y4m_xtag_list_t *y4m_si_xtags(y4m_stream_info_t *si);
+
+
+/* initialize a frame_info structure */
+void y4m_init_frame_info(y4m_frame_info_t *i);
+
+/* finalize a frame_info structure */
+void y4m_fini_frame_info(y4m_frame_info_t *i);
+
+/* make one frame_info into a copy of another */
+void y4m_copy_frame_info(y4m_frame_info_t *dest,
+			 const y4m_frame_info_t *src);
+
+/* access frame_info xtag_list */
+y4m_xtag_list_t *y4m_fi_xtags(y4m_frame_info_t *fi);
+
+
+
+/************************************************************************
+ *  blocking read and write functions
+ *
+ *  o guaranteed to transfer entire payload (or fail)
+ *  o return values:
+ *                         0 (zero)   complete success
+ *          -(# of remaining bytes)   error (and errno left set)
+ *          +(# of remaining bytes)   EOF (for y4m_read only)
+ *
+ ************************************************************************/
+
+/* read len bytes from fd into buf */
+ssize_t y4m_read(int fd, void *buf, size_t len);
+
+/* write len bytes from fd into buf */
+ssize_t y4m_write(int fd, const void *buf, size_t len);
+
+
+
+/************************************************************************
+ *  stream header processing functions
+ *  
+ *  o return values:
+ *                   Y4M_OK - success
+ *                Y4M_ERR_* - error (see y4m_strerr() for descriptions)
+ *
+ ************************************************************************/
+
+/* parse a string of stream header tags */
+int y4m_parse_stream_tags(char *s, y4m_stream_info_t *i);
+
+/* read a stream header from file descriptor fd */
+int y4m_read_stream_header(int fd, y4m_stream_info_t *i);
+
+/* write a stream header to file descriptor fd */
+int y4m_write_stream_header(int fd, const y4m_stream_info_t *i);
+
+
+
+/************************************************************************
+ *  frame processing functions
+ *  
+ *  o return values:
+ *                   Y4M_OK - success
+ *                Y4M_ERR_* - error (see y4m_strerr() for descriptions)
+ *
+ ************************************************************************/
+
+/* read a frame header from file descriptor fd */
+int y4m_read_frame_header(int fd, y4m_frame_info_t *i);
+
+/* write a frame header to file descriptor fd */
+int y4m_write_frame_header(int fd, const y4m_frame_info_t *i);
+
+/* read a complete frame (header + data)
+   o yuv[3] points to three buffers, one each for Y, U, V planes */
+int y4m_read_frame(int fd, const y4m_stream_info_t *si, 
+		   y4m_frame_info_t *fi, uint8_t * const yuv[3]);
+
+/* write a complete frame (header + data)
+   o yuv[3] points to three buffers, one each for Y, U, V planes */
+int y4m_write_frame(int fd, const y4m_stream_info_t *si, 
+		    const y4m_frame_info_t *fi, uint8_t * const yuv[3]);
+
+
+/* read a complete frame (header + data), but de-interleave fields
+    into two separate buffers
+   o upper_field[3] same as yuv[3] above, but for upper field
+   o lower_field[3] same as yuv[3] above, but for lower field
+*/
+int y4m_read_fields(int fd, const y4m_stream_info_t *si, 
+		    y4m_frame_info_t *fi,
+		    uint8_t * const upper_field[3], 
+		    uint8_t * const lower_field[3]);
+
+/* write a complete frame (header + data), but interleave fields
+    from two separate buffers
+   o upper_field[3] same as yuv[3] above, but for upper field
+   o lower_field[3] same as yuv[3] above, but for lower field
+*/
+int y4m_write_fields(int fd, const y4m_stream_info_t *si, 
+		     const y4m_frame_info_t *fi,
+		     uint8_t * const upper_field[3], 
+		     uint8_t * const lower_field[3]);
+
+
+
+/************************************************************************
+ *  miscellaneous functions
+ ************************************************************************/
+
+/* convenient dump of stream header info via mjpeg_log facility
+ *  - each logged/printed line is prefixed by 'prefix'
+ */
+void y4m_log_stream_info(log_level_t level, const char *prefix,
+			 const y4m_stream_info_t *i);
+
+/* convert a Y4M_ERR_* error code into mildly explanatory string */
+const char *y4m_strerr(int err);
+
+/* set 'allow_unknown_tag' flag for library...
+    o yn = 0 :  unknown header tags will produce a parsing error
+    o yn = 1 :  unknown header tags/values will produce a warning, but
+                 are otherwise passed along via the xtags list
+    o yn = -1:  don't change, just return current setting
+
+   return value:  previous setting of flag
+*/
+int y4m_allow_unknown_tags(int yn);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+/************************************************************************
+ ************************************************************************
+
+  Description of the (new!, forever?) YUV4MPEG2 stream format:
+
+  STREAM consists of
+    o one '\n' terminated STREAM-HEADER
+    o unlimited number of FRAMEs
+
+  FRAME consists of
+    o one '\n' terminated FRAME-HEADER
+    o "length" octets of planar YCrCb 4:2:0 image data
+        (if frame is interlaced, then the two fields are interleaved)
+
+
+  STREAM-HEADER consists of
+     o string "YUV4MPEG2 "  (note the space after the '2')
+     o unlimited number of ' ' separated TAGGED-FIELDs
+     o '\n' line terminator
+
+  FRAME-HEADER consists of
+     o string "FRAME "  (note the space after the 'E')
+     o unlimited number of ' ' separated TAGGED-FIELDs
+     o '\n' line terminator
+
+
+  TAGGED-FIELD consists of
+     o single ascii character tag
+     o VALUE (which does not contain whitespace)
+
+  VALUE consists of
+     o integer (base 10 ascii representation)
+  or o RATIO
+  or o single ascii character
+  or o generic ascii string
+
+  RATIO consists of
+     o numerator (integer)
+     o ':' (a colon)
+     o denominator (integer)
+
+
+  The currently supported tags for the STREAM-HEADER:
+     W - [integer] frame width, pixels, should be > 0
+     H - [integer] frame height, pixels, should be > 0
+     I - [char] interlacing:  p - progressive (none)
+                            t - top-field-first
+                            b - bottom-field-first
+		            ? - unknown
+     F - [ratio] frame-rate, 0:0 == unknown
+     A - [ratio] sample (pixel) aspect ratio, 0:0 == unknown
+     X - [character string] 'metadata' (unparsed, but passed around)
+
+  The currently supported tags for the FRAME-HEADER:
+     X - character string 'metadata' (unparsed, but passed around)
+
+ ************************************************************************
+ ************************************************************************/
+
+#endif /* __YUV4MPEG_H__ */
+
+
diff --git a/veejay-current/utils/yuv4mpeg_intern.h b/veejay-current/utils/yuv4mpeg_intern.h
new file mode 100644
index 00000000..140f9d62
--- /dev/null
+++ b/veejay-current/utils/yuv4mpeg_intern.h
@@ -0,0 +1,85 @@
+/*
+ *  yuv4mpeg_intern.h:  Internal constants for "new" YUV4MPEG streams
+ *
+ *  Copyright (C) 2001 Andrew Stevens <andrew.stevens@philips.com>
+ *  Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of version 2 of the GNU General Public License
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+#ifndef __YUV4MPEG_INTERN_H__
+#define __YUV4MPEG_INTERN_H__
+
+
+#define Y4M_MAGIC "YUV4MPEG2"
+#define Y4M_FRAME_MAGIC "FRAME"
+
+#define Y4M_DELIM " "  /* single-character(space) separating tagged fields */
+
+#define Y4M_LINE_MAX 256   /* max number of characters in a header line
+                               (including the '\n', but not the '\0') */
+
+
+/* standard framerate ratios */
+#define Y4M_FPS_UNKNOWN    { 0, 0 }
+#define Y4M_FPS_NTSC_FILM  { 24000, 1001 }
+#define Y4M_FPS_FILM       { 24, 1 }
+#define Y4M_FPS_PAL        { 25, 1 }
+#define Y4M_FPS_NTSC       { 30000, 1001 }
+#define Y4M_FPS_30         { 30, 1 }
+#define Y4M_FPS_PAL_FIELD  { 50, 1 }
+#define Y4M_FPS_NTSC_FIELD { 60000, 1001 }
+#define Y4M_FPS_60         { 60, 1 }
+
+/* standard sample/pixel aspect ratios */
+#define Y4M_SAR_UNKNOWN        {   0, 0  }
+#define Y4M_SAR_SQUARE         {   1, 1  }
+#define Y4M_SAR_SQR_ANA_16_9   {   4, 3  }
+#define Y4M_SAR_NTSC_CCIR601   {  10, 11 }
+#define Y4M_SAR_NTSC_16_9      {  40, 33 }
+#define Y4M_SAR_NTSC_SVCD_4_3  {  15, 11 }
+#define Y4M_SAR_NTSC_SVCD_16_9 {  20, 11 }
+#define Y4M_SAR_PAL_CCIR601    {  59, 54 }
+#define Y4M_SAR_PAL_16_9       { 118, 81 }
+#define Y4M_SAR_PAL_SVCD_4_3   {  59, 36 }
+#define Y4M_SAR_PAL_SVCD_16_9  {  59, 27 }
+
+#define Y4M_SAR_MPEG1_1 Y4M_SAR_SQUARE
+#define Y4M_SAR_MPEG1_2  { 10000, 6735 }
+#define Y4M_SAR_MPEG1_3  { 10000, 7031 } /* Anamorphic 16:9 PAL */
+#define Y4M_SAR_MPEG1_4  { 10000, 7615 }
+#define Y4M_SAR_MPEG1_5  { 10000, 8055 }
+#define Y4M_SAR_MPEG1_6  { 10000, 8437 } /* Anamorphic 16:9 NTSC */
+#define Y4M_SAR_MPEG1_7  { 10000, 8935 } 
+#define Y4M_SAR_MPEG1_8  { 10000, 9375 } /* PAL/SECAM 4:3 */
+#define Y4M_SAR_MPEG1_9  { 10000, 9815 }
+#define Y4M_SAR_MPEG1_10 { 10000, 10255 }
+#define Y4M_SAR_MPEG1_11 { 10000, 10695 }
+#define Y4M_SAR_MPEG1_12 { 10000, 11250 } /* NTSC 4:3 */
+#define Y4M_SAR_MPEG1_13 { 10000, 11575 }
+#define Y4M_SAR_MPEG1_14 { 10000, 12015 }
+
+#define Y4M_DAR_UNKNOWN        {   0, 0  }
+#define Y4M_DAR_4_3      {   4, 3   }
+#define Y4M_DAR_16_9     {  16, 9   }
+#define Y4M_DAR_221_100  { 221, 100 }
+
+#define Y4M_DAR_MPEG2_1 {   1, 1   }
+#define Y4M_DAR_MPEG2_2 {   4, 3   }
+#define Y4M_DAR_MPEG2_3 {  16, 9   }
+#define Y4M_DAR_MPEG2_4 { 221, 100 }
+
+#endif /* __YUV4MPEG_INTERN_H__ */
+
diff --git a/veejay-current/utils/yuv4mpeg_ratio.c b/veejay-current/utils/yuv4mpeg_ratio.c
new file mode 100644
index 00000000..d2685709
--- /dev/null
+++ b/veejay-current/utils/yuv4mpeg_ratio.c
@@ -0,0 +1,156 @@
+/*
+ *  yuv4mpeg_ratio.c:  Functions for dealing with y4m_ratio_t datatype.
+ *
+ *  Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ */
+
+#include <config.h>
+
+#include <string.h>
+#include "yuv4mpeg.h"
+#include "yuv4mpeg_intern.h"
+
+
+/* useful list of standard framerates */
+const y4m_ratio_t y4m_fps_UNKNOWN    = Y4M_FPS_UNKNOWN;
+const y4m_ratio_t y4m_fps_NTSC_FILM  = Y4M_FPS_NTSC_FILM;
+const y4m_ratio_t y4m_fps_FILM       = Y4M_FPS_FILM;
+const y4m_ratio_t y4m_fps_PAL        = Y4M_FPS_PAL;
+const y4m_ratio_t y4m_fps_NTSC       = Y4M_FPS_NTSC;
+const y4m_ratio_t y4m_fps_30         = Y4M_FPS_30;
+const y4m_ratio_t y4m_fps_PAL_FIELD  = Y4M_FPS_PAL_FIELD;
+const y4m_ratio_t y4m_fps_NTSC_FIELD = Y4M_FPS_NTSC_FIELD;
+const y4m_ratio_t y4m_fps_60         = Y4M_FPS_60;
+
+/* useful list of standard sample aspect ratios */
+const y4m_ratio_t y4m_sar_UNKNOWN        = Y4M_SAR_UNKNOWN;
+const y4m_ratio_t y4m_sar_SQUARE         = Y4M_SAR_SQUARE;
+const y4m_ratio_t y4m_sar_SQR_ANA_16_9   = Y4M_SAR_SQR_ANA_16_9;
+const y4m_ratio_t y4m_sar_NTSC_CCIR601   = Y4M_SAR_NTSC_CCIR601;
+const y4m_ratio_t y4m_sar_NTSC_16_9      = Y4M_SAR_NTSC_16_9;
+const y4m_ratio_t y4m_sar_NTSC_SVCD_4_3  = Y4M_SAR_NTSC_SVCD_4_3;
+const y4m_ratio_t y4m_sar_NTSC_SVCD_16_9 = Y4M_SAR_NTSC_SVCD_16_9;
+const y4m_ratio_t y4m_sar_PAL_CCIR601    = Y4M_SAR_PAL_CCIR601;
+const y4m_ratio_t y4m_sar_PAL_16_9       = Y4M_SAR_PAL_16_9;
+const y4m_ratio_t y4m_sar_PAL_SVCD_4_3   = Y4M_SAR_PAL_SVCD_4_3;
+const y4m_ratio_t y4m_sar_PAL_SVCD_16_9  = Y4M_SAR_PAL_SVCD_16_9;
+
+/* useful list of standard display aspect ratios */
+const y4m_ratio_t y4m_dar_4_3            = Y4M_DAR_4_3;
+const y4m_ratio_t y4m_dar_16_9           = Y4M_DAR_16_9;
+const y4m_ratio_t y4m_dar_221_100        = Y4M_DAR_221_100;
+
+/*
+ *  Euler's algorithm for greatest common divisor
+ */
+
+static int gcd(int a, int b)
+{
+  a = (a >= 0) ? a : -a;
+  b = (b >= 0) ? b : -b;
+
+  while (b > 0) {
+    int x = b;
+    b = a % b;
+    a = x;
+  }
+  return a;
+}
+    
+
+/*************************************************************************
+ *
+ * Remove common factors from a ratio
+ *
+ *************************************************************************/
+
+
+void y4m_ratio_reduce(y4m_ratio_t *r)
+{
+  int d;
+  if ((r->n == 0) && (r->d == 0)) return;  /* "unknown" */
+  d = gcd(r->n, r->d);
+  r->n /= d;
+  r->d /= d;
+}
+
+
+
+/*************************************************************************
+ *
+ * Parse "nnn:ddd" into a ratio
+ *
+ * returns:         Y4M_OK  - success
+ *           Y4M_ERR_RANGE  - range error 
+ *
+ *************************************************************************/
+
+int y4m_parse_ratio(y4m_ratio_t *r, const char *s)
+{
+  char *t = strchr(s, ':');
+  if (t == NULL) return Y4M_ERR_RANGE;
+  r->n = atoi(s);
+  r->d = atoi(t+1);
+  if (r->d < 0) return Y4M_ERR_RANGE;
+  /* 0:0 == unknown, so that is ok, otherwise zero denominator is bad */
+  if ((r->d == 0) && (r->n != 0)) return Y4M_ERR_RANGE;
+  y4m_ratio_reduce(r);
+  return Y4M_OK;
+}
+
+
+
+/*************************************************************************
+ *
+ * Guess the true SAR (sample aspect ratio) from a list of commonly 
+ * encountered values, given the "suggested" display aspect ratio, and
+ * the true frame width and height.
+ *
+ * Returns y4m_sar_UNKNOWN if no match is found.
+ *
+ *************************************************************************/
+
+/* this is big enough to accommodate the difference between 720 and 704 */
+#define GUESS_ASPECT_TOLERANCE 0.03
+
+y4m_ratio_t y4m_guess_sar(int width, int height, y4m_ratio_t dar)
+{
+  int i;
+  double implicit_sar = (double)(dar.n * height) / (double)(dar.d * width);
+  y4m_ratio_t sarray[] =
+    {
+      y4m_sar_SQUARE,
+      y4m_sar_NTSC_CCIR601,
+      y4m_sar_NTSC_16_9,
+      y4m_sar_NTSC_SVCD_4_3,
+      y4m_sar_NTSC_SVCD_16_9,
+      y4m_sar_PAL_CCIR601,
+      y4m_sar_PAL_16_9,
+      y4m_sar_PAL_SVCD_4_3,
+      y4m_sar_PAL_SVCD_16_9,
+      y4m_sar_UNKNOWN
+    };
+  for (i = 0; !(Y4M_RATIO_EQL(sarray[i],y4m_sar_UNKNOWN)); i++) {
+    double ratio = implicit_sar / Y4M_RATIO_DBL(sarray[i]);
+    if ( (ratio > (1.0 - GUESS_ASPECT_TOLERANCE)) &&
+	 (ratio < (1.0 + GUESS_ASPECT_TOLERANCE)) )
+      return sarray[i];
+  }
+  return y4m_sar_UNKNOWN;
+}