Initial checkin of veejay 1.4

git-svn-id: svn://code.dyne.org/veejay/trunk@1172 eb8d1916-c9e9-0310-b8de-cf0c9472ead5
Niels Elburg
2008-11-10 20:16:24 +00:00
parent d81258c54c
commit d8e6f98d53
793 changed files with 244409 additions and 0 deletions


@@ -0,0 +1,9 @@
# Makefile for veejay
MAINTAINERCLEANFILES = Makefile.in
AM_CFLAGS=$(YUVCFLAGS)
INCLUDES = -I$(top_srcdir) -I$(includedir) -I$(top_srcdir)/aclib -I$(top_srcdir)/vjmem -I$(top_srcdir)/vjmsg -I$(top_srcdir)/libpostproc
YUV_LIB_FILE = libyuv.la
pkginclude_HEADERS = yuvconv.h
noinst_LTLIBRARIES = $(YUV_LIB_FILE)
libyuv_la_SOURCES = subsample.c yuvconv.c


@@ -0,0 +1,265 @@
/*
* mmx.h
* Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
*
* This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
* See http://libmpeg2.sourceforge.net/ for updates.
*
* mpeg2dec is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* mpeg2dec is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
 * The type of a value that fits in an MMX register (note that long
* long constant values MUST be suffixed by LL and unsigned long long
* values by ULL, lest they be truncated by the compiler)
*/
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
typedef union {
long long q; /* Quadword (64-bit) value */
unsigned long long uq; /* Unsigned Quadword */
int d[2]; /* 2 Doubleword (32-bit) values */
unsigned int ud[2]; /* 2 Unsigned Doubleword */
short w[4]; /* 4 Word (16-bit) values */
unsigned short uw[4]; /* 4 Unsigned Word */
char b[8]; /* 8 Byte (8-bit) values */
unsigned char ub[8]; /* 8 Unsigned Byte */
float s[2]; /* 2 Single-precision (32-bit) values */
} ATTR_ALIGN(8) mmx_t; /* On an 8-byte (64-bit) boundary */
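/* For instance, constants of this type can be written as below (an
 * illustrative sketch, not part of the original header -- note the ULL
 * suffix rule from the comment above):
 */
#if 0
static const mmx_t lsb_mask = { .uq = 0x00ff00ff00ff00ffULL }; /* word-LSB mask */
static const mmx_t round_w = { .uw = { 8, 8, 8, 8 } }; /* per-word rounding term */
#endif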
#define mmx_i2r(op,imm,reg) \
__asm__ __volatile__ (#op " %0, %%" #reg \
: /* nothing */ \
: "i" (imm) )
#define mmx_m2r(op,mem,reg) \
__asm__ __volatile__ (#op " %0, %%" #reg \
: /* nothing */ \
: "m" (mem))
#define mmx_r2m(op,reg,mem) \
__asm__ __volatile__ (#op " %%" #reg ", %0" \
: "=m" (mem) \
: /* nothing */ )
#define mmx_r2r(op,regs,regd) \
__asm__ __volatile__ (#op " %" #regs ", %" #regd)
#define emms() __asm__ __volatile__ ("emms")
#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
#define movd_r2m(reg,var) mmx_r2m (movd, reg, var)
#define movd_v2r(var,reg) __asm__ __volatile__ ("movd %0, %%" #reg \
: /* nothing */ \
: "rm" (var))
#define movd_r2v(reg,var) __asm__ __volatile__ ("movd %%" #reg ", %0" \
: "=rm" (var) \
: /* nothing */ )
#define movq_m2r(var,reg) mmx_m2r (movq, var, reg)
#define movq_r2m(reg,var) mmx_r2m (movq, reg, var)
#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd)
#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg)
#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg)
#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg)
#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg)
#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd)
#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg)
#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd)
#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg)
#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd)
#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg)
#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd)
#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg)
#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd)
#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg)
#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd)
#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg)
#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd)
#define pand_m2r(var,reg) mmx_m2r (pand, var, reg)
#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd)
#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg)
#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd)
#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg)
#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd)
#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg)
#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd)
#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg)
#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd)
#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg)
#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd)
#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg)
#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd)
#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg)
#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd)
#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg)
#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd)
#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg)
#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd)
#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg)
#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd)
#define por_m2r(var,reg) mmx_m2r (por, var, reg)
#define por_r2r(regs,regd) mmx_r2r (por, regs, regd)
#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg)
#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg)
#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd)
#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg)
#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg)
#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd)
#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg)
#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg)
#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd)
#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg)
#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg)
#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd)
#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg)
#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg)
#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd)
#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg)
#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg)
#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd)
#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg)
#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg)
#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd)
#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg)
#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg)
#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd)
#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg)
#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd)
#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg)
#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd)
#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg)
#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd)
#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg)
#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd)
#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg)
#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd)
#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg)
#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd)
#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg)
#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd)
#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg)
#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd)
#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg)
#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd)
#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg)
#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd)
#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg)
#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd)
#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg)
#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd)
#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg)
#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd)
#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
/* 3DNOW extensions */
#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg)
#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd)
/* AMD MMX extensions - also available in Intel SSE */
#define mmx_m2ri(op,mem,reg,imm) \
__asm__ __volatile__ (#op " %1, %0, %%" #reg \
: /* nothing */ \
: "m" (mem), "i" (imm))
#define mmx_r2ri(op,regs,regd,imm) \
__asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
: /* nothing */ \
: "i" (imm) )
#define mmx_fetch(mem,hint) \
__asm__ __volatile__ ("prefetch" #hint " %0" \
: /* nothing */ \
: "m" (mem))
#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg)
#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg)
#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd)
#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg)
#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd)
#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm)
#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm)
#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg)
#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd)
#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg)
#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd)
#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg)
#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd)
#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg)
#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd)
#define pmovmskb(mmreg,reg) \
__asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg)
#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg)
#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd)
#define prefetcht0(mem) mmx_fetch (mem, t0)
#define prefetcht1(mem) mmx_fetch (mem, t1)
#define prefetcht2(mem) mmx_fetch (mem, t2)
#define prefetchnta(mem) mmx_fetch (mem, nta)
#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg)
#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd)
#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm)
#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm)
#define sfence() __asm__ __volatile__ ("sfence\n\t")
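/* Usage sketch (illustrative only, not part of the original header):
 * averaging two blocks of 8 pixels with the wrappers above. pavgb is an
 * MMX-extension/SSE instruction, so a CPU with that support is assumed;
 * emms() must be issued before returning to floating-point code.
 */
#if 0
static inline void average8( unsigned char *dst, const unsigned char *a, const unsigned char *b )
{
movq_m2r( *a, mm0 ); /* mm0 = 8 pixels from a */
movq_m2r( *b, mm1 ); /* mm1 = 8 pixels from b */
pavgb_r2r( mm1, mm0 ); /* mm0 = rounded byte-wise average */
movq_r2m( mm0, *dst ); /* store 8 averaged pixels */
emms();
}
#endif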


@@ -0,0 +1,46 @@
#include <stdlib.h> /* size_t */
#include <config.h>
/* MMX memcpy stuff taken from MPlayer (http://www.mplayerhq.hu) */
#define BLOCK_SIZE 4096
#define CONFUSION_FACTOR 0
// Feel free to fine-tune the above two; it might be possible to get some speedup with them :)
#undef HAVE_MMX1
#ifndef MMXEXT
/* means: mmx v.1. Note: since we added alignment of the destination, it
speeds up memory copying on Pentium MMX, Celeron-1 and P2 by up to 12%
versus the standard (non MMX-optimized) version.
Note: on K6-2+ it speeds up memory copying by up to 25%, and
on K7 and P3 by about 500% (5 times). */
#define HAVE_MMX1
#endif
#undef MMREG_SIZE
#define MMREG_SIZE 64 //8
#undef PREFETCH
#undef EMMS
#ifdef MMXEXT
#define PREFETCH "prefetchnta"
#else
#define PREFETCH "/nop"
#endif
#define EMMS "emms"
#undef MOVNTQ
#ifdef MMXEXT
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif
#undef MIN_LEN
#ifdef HAVE_MMX1
#define MIN_LEN 0x800 /* 2K blocks */
#else
#define MIN_LEN 0x40 /* 64-byte blocks */
#endif
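/* A minimal sketch of the copy loop these macros are meant to drive
* (illustrative only -- the real MPlayer routine additionally aligns the
* destination and applies the BLOCK_SIZE/CONFUSION_FACTOR tuning; the
* function name is hypothetical).
*/
#if 0
#include <string.h> /* memcpy for the tail */
static void *fast_memcpy_sketch( void *to, const void *from, size_t len )
{
void *retval = to;
if( len >= MIN_LEN )
{
size_t i = len >> 6; /* 64 bytes per iteration */
len &= 63;
for( ; i > 0; i -- )
{
__asm__ __volatile__ (
#ifdef MMXEXT
PREFETCH" 320(%0)\n"
#endif
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"movq 16(%0), %%mm2\n"
"movq 24(%0), %%mm3\n"
MOVNTQ" %%mm0, (%1)\n"
MOVNTQ" %%mm1, 8(%1)\n"
MOVNTQ" %%mm2, 16(%1)\n"
MOVNTQ" %%mm3, 24(%1)\n"
"movq 32(%0), %%mm0\n"
"movq 40(%0), %%mm1\n"
"movq 48(%0), %%mm2\n"
"movq 56(%0), %%mm3\n"
MOVNTQ" %%mm0, 32(%1)\n"
MOVNTQ" %%mm1, 40(%1)\n"
MOVNTQ" %%mm2, 48(%1)\n"
MOVNTQ" %%mm3, 56(%1)\n"
:: "r" (from), "r" (to) : "memory" );
from = (const unsigned char*) from + 64;
to = (unsigned char*) to + 64;
}
#ifdef MMXEXT
__asm__ __volatile__ ("sfence":::"memory"); /* flush the movntq stores */
#endif
__asm__ __volatile__ (EMMS:::"memory");
}
if( len )
memcpy( to, from, len ); /* plain copy for the tail */
return retval;
}
#endif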


@@ -0,0 +1,856 @@
/*
* subsample.c: Routines to do chroma subsampling. ("Work In Progress")
*
*
* Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com>
* 2004 Niels Elburg <nwelburg@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
#include <config.h>
#ifdef HAVE_ASM_MMX
#include "mmx.h"
#endif
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <mjpegtools/mjpeg_types.h>
#include <libvjmem/vjmem.h>
#include <libvjmsg/vj-msg.h>
#include <libvje/vje.h>
#include <libyuv/yuvconv.h>
const char *ssm_id[SSM_COUNT] = {
"unknown",
"420_jpeg",
"420_mpeg2",
#if 0
"420_dv_pal",
"411_dv_ntsc"
#endif
};
const char *ssm_description[SSM_COUNT] = {
"unknown/illegal",
"4:2:0, JPEG/MPEG-1, interstitial siting",
"4:2:0, MPEG-2, horizontal cositing",
#if 0
"4:2:0, DV-PAL, cosited, Cb/Cr line alternating",
"4:1:1, DV-NTSC"
"4:2:2",
#endif
};
#define RUP8(num) (((num)+7)&~7) /* round up to a multiple of 8 */
// forward decl
void ss_420_to_422(uint8_t *buffer, int width, int height);
void ss_422_to_420(uint8_t *buffer, int width, int height);
typedef struct
{
uint8_t *buf;
} yuv_sampler_t;
void *subsample_init(int len)
{
yuv_sampler_t *s = (yuv_sampler_t*) vj_malloc(sizeof(yuv_sampler_t) );
if(!s)
return NULL;
s->buf = (uint8_t*) vj_malloc(sizeof(uint8_t) * RUP8(len*2) );
if(!s->buf)
{
free(s); /* don't leak the sampler struct itself */
return NULL;
}
return (void*) s;
}
void subsample_free(void *data)
{
yuv_sampler_t *sampler = (yuv_sampler_t*) data;
if(sampler)
{
if(sampler->buf)
free(sampler->buf);
free(sampler);
}
sampler = NULL;
}
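/* Typical lifecycle (an illustrative sketch; 'planes' is a hypothetical
* uint8_t*[3] holding the Y/Cb/Cr planes, error handling omitted):
*
* void *s = subsample_init( width );
* chroma_subsample( SSM_422_444, s, planes, width, height );
* ... work on the 4:2:2 chroma ...
* chroma_supersample( SSM_422_444, s, planes, width, height );
* subsample_free( s );
*/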
/*************************************************************************
* Chroma Subsampling
*************************************************************************/
/* vertical/horizontal interstitial siting
*
* Y Y Y Y
* C C
* Y Y Y Y
*
* Y Y Y Y
* C C
* Y Y Y Y
*
*/
/*
static void ss_444_to_420jpeg(uint8_t *buffer, int width, int height)
{
uint8_t *in0, *in1, *out;
int x, y;
in0 = buffer;
in1 = buffer + width;
out = buffer;
for (y = 0; y < height; y += 2) {
for (x = 0; x < width; x += 2) {
*out = (in0[0] + in0[1] + in1[0] + in1[1]) >> 2;
in0 += 2;
in1 += 2;
out++;
}
in0 += width;
in1 += width;
}
}
*/
/*
Weighted averaging for subsampling 2x2 -> 1x1:
four output pixels are filled per inner-loop iteration,
weighting 16 source pixels in total.
*/
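/* The 1-3-3-9 weights below are the bilinear products of (1/4, 3/4) in
each direction, i.e. each output chroma sample is taken 3/4 of the way
toward in1[odd]; the +8 term rounds before the >>4 divide by 16. */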
static void ss_444_to_420jpeg(uint8_t *buffer, int width, int height)
{
const uint8_t *in0, *in1;
uint8_t *out;
int x, y;
in0 = buffer;
in1 = buffer + width;
out = buffer;
for (y = 0; y < height; y += 4) {
for (x = 0; x < width; x += 4) {
out[0] = (in0[0] + 3 * (in0[1] + in1[0]) + (9 * in1[1]) + 8) >> 4;
out[1] = (in0[2] + 3 * (in0[3] + in1[2]) + (9 * in1[3]) + 8) >> 4;
out[2] = (in0[4] + 3 * (in0[5] + in1[4]) + (9 * in1[5]) + 8) >> 4;
out[3] = (in0[6] + 3 * (in0[7] + in1[6]) + (9 * in1[7]) + 8) >> 4;
in0 += 8;
in1 += 8;
out += 4;
}
for ( ; x < width; x +=2 )
{
out[0] = (in0[0] + 3 * (in0[1] + in1[0]) + (9 * in1[1]) + 8) >> 4;
in0 += 2;
in1 += 2;
out++;
}
in0 += width*2;
in1 += width*2;
}
}
static void ss_444_to_420jpeg_cp(uint8_t *buffer,uint8_t *dest, int width, int height)
{
const uint8_t *in0, *in1;
uint8_t *out;
int x, y;
in0 = buffer;
in1 = buffer + width;
out = dest;
for (y = 0; y < height; y += 4) {
for (x = 0; x < width; x += 4) {
out[0] = (in0[0] + 3 * (in0[1] + in1[0]) + (9 * in1[1]) + 8) >> 4;
out[1] = (in0[2] + 3 * (in0[3] + in1[2]) + (9 * in1[3]) + 8) >> 4;
out[2] = (in0[4] + 3 * (in0[5] + in1[4]) + (9 * in1[5]) + 8) >> 4;
out[3] = (in0[6] + 3 * (in0[7] + in1[6]) + (9 * in1[7]) + 8) >> 4;
in0 += 8;
in1 += 8;
out += 4;
}
for ( ; x < width; x +=2 )
{
out[0] = (in0[0] + 3 * (in0[1] + in1[0]) + (9 * in1[1]) + 8) >> 4;
in0 += 2;
in1 += 2;
out++;
}
in0 += width*2;
in1 += width*2;
}
}
/* horizontal interstitial siting
*
* Y Y Y Y
* C C C C in0
* Y Y Y Y
* C C C C
*
* Y Y Y Y
* C C out0
* Y Y Y Y
* C C
*
*
*/
/* vertical/horizontal interstitial siting
*
* Y Y Y Y
* C C C inm
* Y Y Y Y
*
* Y Y Y - Y out0
* C | C | C in0
* Y Y Y - Y out1
*
*
* C C C inp
*
*
* Each iteration through the loop reconstitutes one 2x2 block of
* pixels from the "surrounding" 3x3 block of samples...
* Boundary conditions are handled by cheap reflection; i.e. the
* center sample is simply reused.
*
*/
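/* Concretely, each output pixel is a bilinear blend of the four nearest
* chroma samples with weights 9/16, 3/16, 3/16 and 1/16 -- see the
* (9*c00 + 3*(...) + 1*...) expressions below; +8 rounds before the >>4.
*/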
#define BLANK_CRB in0[1]
#define BLANK_CRB_2 (in0[1] << 1)
static void tr_420jpeg_to_444(void *data, uint8_t *buffer, int width, int height)
{
uint8_t *inm, *in0, *inp, *out0, *out1;
uint8_t cmm, cm0, cmp, c0m, c00, c0p, cpm, cp0, cpp;
int x, y;
yuv_sampler_t *sampler = (yuv_sampler_t*) data;
uint8_t *saveme = sampler->buf;
veejay_memcpy(saveme, buffer, width);
in0 = buffer + ( width * height /4) - 2;
inm = in0 - width/2;
inp = in0 + width/2;
out1 = buffer + (width * height) - 1;
out0 = out1 - width;
for (y = height; y > 0; y -= 2) {
if (y == 2) {
in0 = saveme + width/2 - 2;
inp = in0 + width/2;
}
for (x = width; x > 0; x -= 2) {
#if 0
if ((x == 2) && (y == 2)) {
cmm = in0[1];
cm0 = in0[1];
cmp = in0[2];
c0m = in0[1];
c0p = in0[2];
cpm = inp[1];
cp0 = inp[1];
cpp = inp[2];
} else if ((x == 2) && (y == height)) {
cmm = inm[1];
cm0 = inm[1];
cmp = inm[2];
c0m = in0[1];
c0p = in0[2];
cpm = in0[1];
cp0 = in0[1];
cpp = in0[2];
} else if ((x == width) && (y == height)) {
cmm = inm[0];
cm0 = inm[1];
cmp = inm[1];
c0m = in0[0];
c0p = in0[1];
cpm = in0[0];
cp0 = in0[1];
cpp = in0[1];
} else if ((x == width) && (y == 2)) {
cmm = in0[0];
cm0 = (y == 2) ? BLANK_CRB : inm[1];
cmp = ((x == width) || (y == 2)) ? BLANK_CRB : inm[2];
c0m = (x == 2) ? BLANK_CRB : in0[0];
c0p = (x == width) ? BLANK_CRB : in0[2];
cpm = ((x == 2) || (y == height)) ? BLANK_CRB : inp[0];
cp0 = (y == height) ? BLANK_CRB : inp[1];
cpp = ((x == width) || (y == height)) ? BLANK_CRB : inp[2];
} else if (x == 2) {
cmm = ((x == 2) || (y == 2)) ? BLANK_CRB : inm[0];
cm0 = (y == 2) ? BLANK_CRB : inm[1];
cmp = ((x == width) || (y == 2)) ? BLANK_CRB : inm[2];
c0m = (x == 2) ? BLANK_CRB : in0[0];
c0p = (x == width) ? BLANK_CRB : in0[2];
cpm = ((x == 2) || (y == height)) ? BLANK_CRB : inp[0];
cp0 = (y == height) ? BLANK_CRB : inp[1];
cpp = ((x == width) || (y == height)) ? BLANK_CRB : inp[2];
} else if (y == 2) {
cmm = ((x == 2) || (y == 2)) ? BLANK_CRB : inm[0];
cm0 = (y == 2) ? BLANK_CRB : inm[1];
cmp = ((x == width) || (y == 2)) ? BLANK_CRB : inm[2];
c0m = (x == 2) ? BLANK_CRB : in0[0];
c0p = (x == width) ? BLANK_CRB : in0[2];
cpm = ((x == 2) || (y == height)) ? BLANK_CRB : inp[0];
cp0 = (y == height) ? BLANK_CRB : inp[1];
cpp = ((x == width) || (y == height)) ? BLANK_CRB : inp[2];
} else if (x == width) {
cmm = ((x == 2) || (y == 2)) ? BLANK_CRB : inm[0];
cm0 = (y == 2) ? BLANK_CRB : inm[1];
cmp = ((x == width) || (y == 2)) ? BLANK_CRB : inm[2];
c0m = (x == 2) ? BLANK_CRB : in0[0];
c0p = (x == width) ? BLANK_CRB : in0[2];
cpm = ((x == 2) || (y == height)) ? BLANK_CRB : inp[0];
cp0 = (y == height) ? BLANK_CRB : inp[1];
cpp = ((x == width) || (y == height)) ? BLANK_CRB : inp[2];
} else if (y == height) {
cmm = ((x == 2) || (y == 2)) ? BLANK_CRB : inm[0];
cm0 = (y == 2) ? BLANK_CRB : inm[1];
cmp = ((x == width) || (y == 2)) ? BLANK_CRB : inm[2];
c0m = (x == 2) ? BLANK_CRB : in0[0];
c0p = (x == width) ? BLANK_CRB : in0[2];
cpm = ((x == 2) || (y == height)) ? BLANK_CRB : inp[0];
cp0 = (y == height) ? BLANK_CRB : inp[1];
cpp = ((x == width) || (y == height)) ? BLANK_CRB : inp[2];
} else {
cmm = ((x == 2) || (y == 2)) ? BLANK_CRB : inm[0];
cm0 = (y == 2) ? BLANK_CRB : inm[1];
cmp = ((x == width) || (y == 2)) ? BLANK_CRB : inm[2];
c0m = (x == 2) ? BLANK_CRB : in0[0];
c0p = (x == width) ? BLANK_CRB : in0[2];
cpm = ((x == 2) || (y == height)) ? BLANK_CRB : inp[0];
cp0 = (y == height) ? BLANK_CRB : inp[1];
cpp = ((x == width) || (y == height)) ? BLANK_CRB : inp[2];
}
c00 = in0[1];
cmm = ((x == 2) || (y == 2)) ? BLANK_CRB : inm[0];
cm0 = (y == 2) ? BLANK_CRB : inm[1];
cmp = ((x == width) || (y == 2)) ? BLANK_CRB : inm[2];
c0m = (x == 2) ? BLANK_CRB : in0[0];
c0p = (x == width) ? BLANK_CRB : in0[2];
cpm = ((x == 2) || (y == height)) ? BLANK_CRB : inp[0];
cp0 = (y == height) ? BLANK_CRB : inp[1];
cpp = ((x == width) || (y == height)) ? BLANK_CRB : inp[2];
#else
cmm = ((x == 2) || (y == 2)) ? BLANK_CRB : inm[0];
cm0 = (y == 2) ? BLANK_CRB : inm[1];
cmp = ((x == width) || (y == 2)) ? BLANK_CRB : inm[2];
c0m = (x == 2) ? BLANK_CRB : in0[0];
c00 = in0[1];
c0p = (x == width) ? BLANK_CRB : in0[2];
cpm = ((x == 2) || (y == height)) ? BLANK_CRB : inp[0];
cp0 = (y == height) ? BLANK_CRB : inp[1];
cpp = ((x == width) || (y == height)) ? BLANK_CRB : inp[2];
#endif
inm--;
in0--;
inp--;
*(out1--) = (1*cpp + 3*(cp0+c0p) + 9*c00 + 8) >> 4;
*(out1--) = (1*cpm + 3*(cp0+c0m) + 9*c00 + 8) >> 4;
*(out0--) = (1*cmp + 3*(cm0+c0p) + 9*c00 + 8) >> 4;
*(out0--) = (1*cmm + 3*(cm0+c0m) + 9*c00 + 8) >> 4;
}
out1 -= width;
out0 -= width;
}
}
// lame box filter
// the dampening of high frequencies depends
// on the direction in which these frequencies occur in the
// image, resulting in clear edges between certain
// groups of pixels.
static void ss_420jpeg_to_444(uint8_t *buffer, int width, int height)
{
#ifndef HAVE_ASM_MMX
uint8_t *in, *out0, *out1;
int x, y;
in = buffer + (width * height / 4) - 1;
out1 = buffer + (width * height) - 1;
out0 = out1 - width;
for (y = height - 1; y >= 0; y -= 2) {
for (x = width - 1; x >= 0; x -=2) {
uint8_t val = *(in--);
*(out1--) = val;
*(out1--) = val;
*(out0--) = val;
*(out0--) = val;
}
out0 -= width;
out1 -= width;
}
#else
int x,y;
/* each step expands 8 chroma samples into 16 output pixels; walking
backwards keeps the in-place expansion from overwriting unread input */
const int mmx_stride = width >> 4;
uint8_t *src = buffer + (width * height / 4) - 8;
uint8_t *dst = buffer + (width * height) - 16;
uint8_t *dst2 = dst - width;
for( y = height-1; y >= 0; y -= 2)
{
for( x = 0; x < mmx_stride; x ++ )
{
movq_m2r( *src,mm0 );
movq_r2r( mm0,mm1 );
punpcklbw_r2r( mm0,mm0 ); /* duplicate low 4 pixels: aabbccdd */
punpckhbw_r2r( mm1,mm1 ); /* duplicate high 4 pixels: eeffgghh */
movq_r2m(mm0, *dst );
movq_r2m(mm1, *(dst+8) );
movq_r2m(mm0, *dst2 );
movq_r2m(mm1, *(dst2+8) );
dst -= 16;
dst2 -= 16;
src -= 8;
}
dst -= width;
dst2 -= width;
}
#endif
}
void ss_420_to_422(uint8_t *buffer, int width, int height)
{
//todo, 1x2 super sampling (box)
}
void ss_422_to_420(uint8_t *buffer, int width, int height )
{
//todo 2x1 down sampling (box)
}
#ifdef HAVE_ASM_MMX
#undef HAVE_K6_2PLUS
#if !defined( HAVE_ASM_MMX2) && defined( HAVE_ASM_3DNOW )
#define HAVE_K6_2PLUS
#endif
#undef _EMMS
#ifdef HAVE_K6_2PLUS
/* On K6, femms is faster than emms. On K7, femms maps directly onto emms. */
#define _EMMS "femms"
#else
#define _EMMS "emms"
#endif
#endif
#ifdef HAVE_ASM_MMX
/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
register unsigned long int dummy;\
__asm__ __volatile__(\
"rep; movsb"\
:"=&D"(to), "=&S"(from), "=&c"(dummy)\
:"0" (to), "1" (from),"2" (n)\
: "memory");\
}
static inline void copy8( uint8_t *dst, uint8_t *in )
{
__asm__ __volatile__ (
"movq (%0), %%mm0\n"
"movq %%mm0, (%1)\n"
:: "r" (in), "r" (dst) : "memory" );
}
static inline void copy16( uint8_t *dst, uint8_t *in)
{
__asm__ __volatile__ (
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"movq %%mm0, (%1)\n"
"movq %%mm1, 8(%1)\n"
:: "r" (in), "r" (dst) : "memory" );
}
static inline void copy_width( uint8_t *dst, uint8_t *in, int width )
{
int w = width >> 4;
int x;
uint8_t *d = dst;
uint8_t *i = in;
for( x = 0; x < w; x ++ )
{
copy16( d, i );
d += 16;
i += 16;
}
x = (width % 16);
if( x )
small_memcpy( d, i, x);
}
static inline void load_mask16to8()
{
const uint64_t mask = 0x00ff00ff00ff00ffLL;
const uint8_t *m = (uint8_t*)&mask;
__asm __volatile(
"movq (%0), %%mm4\n\t"
:: "r" (m)
);
}
static inline void down_sample16to8( uint8_t *out, uint8_t *in )
{
//@ down sample by dropping the odd (right) pixels; mm4 holds the
//@ 0x00ff word mask loaded by load_mask16to8()
__asm__ __volatile__(
"movq (%0), %%mm1\n\t" /* mm1 = pixels 0..7 */
"movq 8(%0),%%mm3\n\t" /* mm3 = pixels 8..15 */
"pxor %%mm5,%%mm5\n\t"
"pand %%mm4,%%mm1\n\t" /* keep even pixels, widened to words */
"pand %%mm4,%%mm3\n\t"
"packuswb %%mm1,%%mm2\n\t" /* high half of mm2 = packed mm1 (low half is junk) */
"packuswb %%mm3,%%mm5\n\t" /* high half of mm5 = packed mm3, low half = 0 */
"psrlq $32, %%mm2\n\t" /* shift the junk out; packed mm1 now in low half */
"por %%mm5,%%mm2\n\t" /* combine into 8 output pixels */
"movq %%mm2, (%1)\n\t"
:: "r" (in), "r" (out)
);
}
#endif
static void ss_444_to_422_cp(void *data, uint8_t *buffer, uint8_t *dest, int width, int height)
{
const int dst_stride = width >> 1;
int x,y;
#ifdef HAVE_ASM_MMX
int mmxdst_stride=dst_stride >> 3;
int left = dst_stride % 8;
#endif
yuv_sampler_t *sampler = (yuv_sampler_t*) data;
uint8_t *src = sampler->buf;
uint8_t *dst;
#ifdef HAVE_ASM_MMX
load_mask16to8();
#endif
for(y = 0; y < height; y ++)
{
src = buffer + (y*width);
dst = dest + (y*dst_stride);
#ifdef HAVE_ASM_MMX
for( x= 0; x < mmxdst_stride; x++ )
{
down_sample16to8( dst, src );
src += 16;
dst += 8;
}
for(x=0; x < left; x++)
{
*(dst++) = ( src[0] + src[1] + 1 ) >> 1;
src += 2;
}
#else
for(x=0; x < dst_stride; x++)
{
*(dst++) = ( src[0] + src[1] + 1 ) >> 1;
src += 2;
}
#endif
}
}
static void ss_444_to_422(void *data, uint8_t *buffer, int width, int height)
{
const int dst_stride = width >> 1;
int x,y;
#ifdef HAVE_ASM_MMX
int mmxdst_stride=dst_stride >> 3;
int left = dst_stride % 8;
#endif
yuv_sampler_t *sampler = (yuv_sampler_t*) data;
uint8_t *src = sampler->buf;
uint8_t *dst;
#ifdef HAVE_ASM_MMX
load_mask16to8();
#endif
for(y = 0; y < height; y ++)
{
src = sampler->buf;
dst = buffer + (y*dst_stride);
#ifdef HAVE_ASM_MMX
copy_width( src, buffer + (y*width), width );
for( x= 0; x < mmxdst_stride; x++ )
{
down_sample16to8( dst, src );
src += 16;
dst += 8;
}
for(x=0; x < left; x++)
{
*(dst++) = ( src[0] + src[1] + 1 ) >> 1;
src += 2;
}
#else
src = buffer + (y*width); /* non-MMX path reads the row in place */
for( x = 0; x < dst_stride; x ++ )
{
*(dst++) = (src[0] + src[1] + 1 ) >> 1;
src += 2;
}
#endif
}
}
#ifdef HAVE_ASM_MMX
static inline void super_sample8to16( uint8_t *in, uint8_t *out )
{
//@ super sample by duplicating pixels: abcdefgh -> aabbccdd eeffgghh
__asm__ __volatile__ (
"\n\tpxor %%mm2,%%mm2"
"\n\tpxor %%mm4,%%mm4"
"\n\tmovq (%0), %%mm1" /* mm1 = 8 source pixels */
"\n\tpunpcklbw %%mm1,%%mm2" /* mm2 = 0,a,0,b,0,c,0,d */
"\n\tpunpckhbw %%mm1,%%mm4" /* mm4 = 0,e,0,f,0,g,0,h */
"\n\tmovq %%mm2,%%mm5"
"\n\tmovq %%mm4,%%mm6"
"\n\tpsrlq $8, %%mm5" /* mm5 = a,0,b,0,c,0,d,0 */
"\n\tpsrlq $8, %%mm6"
"\n\tpor %%mm5,%%mm2" /* mm2 = a,a,b,b,c,c,d,d */
"\n\tpor %%mm6,%%mm4" /* mm4 = e,e,f,f,g,g,h,h */
"\n\tmovq %%mm2, (%1)"
"\n\tmovq %%mm4, 8(%1)"
:: "r" (in), "r" (out)
);
}
#endif
static void tr_422_to_444(void *data, uint8_t *buffer, int width, int height)
{
int x,y;
const int stride = width >> 1;
#ifndef HAVE_ASM_MMX
for( y = height-1; y >= 0 ; y -- )
{
uint8_t *dst = buffer + (y * width);
uint8_t *src = buffer + (y * stride);
for( x = stride-1; x >= 0; x -- )
{ /* duplicate each chroma sample; walking right to left keeps
the in-place expansion of row 0 from clobbering unread input */
dst[2*x] = src[x];
dst[2*x+1] = src[x];
}
}
#else
const int mmx_stride = stride >> 3;
for( y = height-1; y > 0 ; y -- )
{
uint8_t *src = buffer + (y * stride);
uint8_t *dst = buffer + (y * width);
for(x=0; x < mmx_stride; x++) // 8 chroma samples -> 16 pixels
{
super_sample8to16(src,dst );
src += 8;
dst += 16;
}
}
/* row 0 overlaps itself in-place, so expand it right-to-left in plain C */
for( x = stride-1; x >= 0; x -- )
{
buffer[2*x] = buffer[x];
buffer[2*x+1] = buffer[x];
}
#endif
}
/* vertical interstitial siting; horizontal cositing
*
* Y Y Y Y
* C C
* Y Y Y Y
*
* Y Y Y Y
* C C
* Y Y Y Y
*
* [1,2,1] kernel for horizontal subsampling:
*
* inX[0] [1] [2]
* | | |
* C C C C
* \ | /
* \ | /
* C
*/
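/* i.e. out = (C[x-1] + 2*C[x] + C[x+1] + C'[x-1] + 2*C'[x] + C'[x+1]) >> 3,
where C and C' are the two source rows; the weights sum to 8, and note
the code below truncates (there is no +4 rounding term before the shift). */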
static void ss_444_to_420mpeg2(uint8_t *buffer, int width, int height)
{
uint8_t *in0, *in1, *out;
int x, y;
in0 = buffer; /* first of a pair of lines */
in1 = buffer + width; /* second of the pair */
out = buffer;
for (y = 0; y < height; y += 2) {
/* first column boundary condition -- just repeat it to right */
*out = (in0[0] + (2 * in0[0]) + in0[1] +
in1[0] + (2 * in1[0]) + in1[1]) >> 3;
out++;
in0++;
in1++;
/* rest of columns just loop */
for (x = 2; x < width; x += 2) {
*out = (in0[0] + (2 * in0[1]) + in0[2] +
in1[0] + (2 * in1[1]) + in1[2]) >> 3;
in0 += 2;
in1 += 2;
out++;
}
in0 += width + 1;
in1 += width + 1;
}
}
void chroma_subsample_cp(subsample_mode_t mode, void *data, uint8_t *ycbcr[], uint8_t *dcbcr[],
int width, int height)
{
switch (mode) {
case SSM_420_JPEG_BOX:
case SSM_420_JPEG_TR:
ss_444_to_420jpeg_cp(ycbcr[1],dcbcr[1], width, height);
ss_444_to_420jpeg_cp(ycbcr[2],dcbcr[2], width, height);
break;
case SSM_420_MPEG2:
break;
case SSM_422_444:
ss_444_to_422_cp(data,ycbcr[1],dcbcr[1],width,height);
ss_444_to_422_cp(data,ycbcr[2],dcbcr[2],width,height);
#ifdef HAVE_ASM_MMX
__asm__ __volatile__ ( _EMMS:::"memory");
#endif
break;
case SSM_420_422:
break;
default:
break;
}
}
void chroma_subsample(subsample_mode_t mode, void *data, uint8_t *ycbcr[],
int width, int height)
{
switch (mode) {
case SSM_420_JPEG_BOX:
case SSM_420_JPEG_TR:
ss_444_to_420jpeg(ycbcr[1], width, height);
ss_444_to_420jpeg(ycbcr[2], width, height);
#ifdef HAVE_ASM_MMX
__asm__ __volatile__ ( _EMMS:::"memory");
#endif
break;
case SSM_420_MPEG2:
ss_444_to_420mpeg2(ycbcr[1], width, height);
ss_444_to_420mpeg2(ycbcr[2], width, height);
break;
case SSM_422_444:
ss_444_to_422(data,ycbcr[1],width,height);
ss_444_to_422(data,ycbcr[2],width,height);
#ifdef HAVE_ASM_MMX
__asm__ __volatile__ ( _EMMS:::"memory");
#endif
break;
case SSM_420_422:
ss_422_to_420(ycbcr[1],width,height);
ss_422_to_420(ycbcr[2],width,height);
break;
default:
break;
}
}
void chroma_supersample(subsample_mode_t mode,void *data, uint8_t *ycbcr[],
int width, int height)
{
switch (mode) {
case SSM_420_JPEG_BOX:
ss_420jpeg_to_444(ycbcr[1], width, height);
ss_420jpeg_to_444(ycbcr[2], width, height);
#ifdef HAVE_ASM_MMX
__asm__ __volatile__ ( _EMMS:::"memory");
#endif
break;
case SSM_420_JPEG_TR:
tr_420jpeg_to_444(data,ycbcr[1], width, height);
tr_420jpeg_to_444(data,ycbcr[2], width, height);
break;
case SSM_422_444:
tr_422_to_444(data,ycbcr[1],width,height);
tr_422_to_444(data,ycbcr[2],width,height);
#ifdef HAVE_ASM_MMX
__asm__ __volatile__ ( _EMMS:::"memory");
#endif
break;
case SSM_420_422:
ss_420_to_422( ycbcr[1], width, height );
ss_420_to_422( ycbcr[2], width, height );
break;
case SSM_420_MPEG2:
// ss_420mpeg2_to_444(ycbcr[1], width, height);
// ss_420mpeg2_to_444(ycbcr[2], width, height);
break;
default:
break;
}
}

File diff suppressed because it is too large.


@@ -0,0 +1,139 @@
#ifndef YUVCONF_H
#define YUVCONF_H
/* Veejay - A visual instrument and realtime video sampler
* Copyright (C) 2004 Niels Elburg <nwelburg@gmail.com>
*
* YUV library for veejay.
*
* Mjpegtools, (C) The Mjpegtools Development Team (http://mjpeg.sourceforge.net)
* Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
typedef enum subsample_mode {
SSM_UNKNOWN = 0,
SSM_420_JPEG_TR = 1,
SSM_420_JPEG_BOX = 2,
SSM_420_MPEG2 = 3,
SSM_422_444 = 4,
SSM_420_422 = 5,
SSM_COUNT = 6,
} subsample_mode_t;
extern const char *ssm_id[SSM_COUNT];
extern const char *ssm_description[SSM_COUNT];
void *subsample_init(int buf_len);
void subsample_free(void *sampler);
void chroma_subsample(subsample_mode_t mode, void *sampler, uint8_t * ycbcr[],
int width, int height);
void chroma_subsample_cp(subsample_mode_t mode, void *data, uint8_t *ycbcr[], uint8_t *dcbcr[],
int width, int height);
void chroma_supersample(subsample_mode_t mode, void *sampler, uint8_t * ycbcr[],
int width, int height);
// yuv 4:2:2 packed to yuv 4:2:0 planar
void vj_yuy2toyv12( uint8_t *y, uint8_t *u, uint8_t *v, uint8_t *in, int w, int h);
// yuv 4:2:2 packed to yuv 4:2:2 planar
void yuy2toyv16( uint8_t *y, uint8_t *u, uint8_t *v, uint8_t *in, int w, int h);
// yuv 4:2:2 planar to yuv 4:2:2 packed
void yuv422p_to_yuv422( uint8_t *yuv422[3], uint8_t *dst, int w, int h );
// yuv 4:2:0 planar to yuv 4:2:2 packed
void yuv420p_to_yuv422( uint8_t *yuv420[3], uint8_t *dst, int w, int h );
// yuv 4:2:2 planar to YUYV
void yuv422_to_yuyv( uint8_t *yuv422[3], uint8_t *dst, int w, int h );
// scene detection
int luminance_mean(uint8_t * frame[], int w, int h);
/* software scaler from ffmpeg project: */
typedef struct
{
float lumaGBlur;
float chromaGBlur;
float lumaSarpen;
float chromaSharpen;
float chromaHShift;
float chromaVShift;
int verbose;
int flags;
int use_filter;
} sws_template;
void yuv_init_lib();
void* yuv_init_swscaler(VJFrame *src, VJFrame *dst, sws_template *templ, int cpu_flags);
void yuv_convert_and_scale( void *sws, VJFrame *src, VJFrame *dst );
void yuv_convert_and_scale_rgb( void *sws, VJFrame *src, VJFrame *dst );
void yuv_convert_and_scale_gray_rgb(void *sws,VJFrame *src, VJFrame *dst);
void yuv_convert_and_scale_from_rgb(void *sws , VJFrame *src, VJFrame *dst);
void yuv_convert_and_scale_grey(void *sws , VJFrame *src, VJFrame *dst);
int yuv_sws_get_cpu_flags(void);
void yuv_free_swscaler(void *sws);
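/* Typical scaler lifecycle (an illustrative sketch; the zeroed template,
* the flags value and calling yuv_init_lib() first are assumptions, not
* prescribed by this header):
*
* sws_template t;
* memset( &t, 0, sizeof(t) );
* t.flags = 1;
* yuv_init_lib();
* void *scaler = yuv_init_swscaler( src, dst, &t, yuv_sws_get_cpu_flags() );
* yuv_convert_and_scale( scaler, src, dst );
* yuv_free_swscaler( scaler );
*/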
void yuv_crop(VJFrame *src, VJFrame *dst, VJRectangle *rect );
VJFrame *yuv_allocate_crop_image( VJFrame *src, VJRectangle *rect );
void yuv_deinterlace(
uint8_t *data[3],
const int width,
const int height,
int out_pix_fmt,
int shift,
uint8_t *Y,uint8_t *U, uint8_t *V );
void yuv_free_lib();
void yuv_convert_ac( VJFrame *src, VJFrame *dst, int a, int b );
//void yuv_convert_any( VJFrame *src, VJFrame *dst, int a, int b );
void yuv_convert_any_ac_packed( VJFrame *src, uint8_t *dst, int src_fmt, int dst_fmt );
void yuv_convert_any3( VJFrame *src,int strides[], VJFrame *dst, int a, int b );
VJFrame *yuv_rgb_template( uint8_t *rgb_buffer, int w, int h, int fmt );
VJFrame *yuv_yuv_template( uint8_t *Y, uint8_t *U, uint8_t *V, int w, int h, int fmt );
char *yuv_get_scaler_name(int id);
void yuv_convert_any_ac( VJFrame *src, VJFrame *dst, int src_fmt, int dst_fmt );
void *yuv_fx_context_create( VJFrame *src, VJFrame *dst, int src_fmt, int dst_fmt );
void yuv_fx_context_process( void *ctx, VJFrame *src, VJFrame *dst );
void yuv_fx_context_destroy( void *ctx );
#endif