diff --git a/veejay-current/veejay-server/libsubsample/subsample.c b/veejay-current/veejay-server/libsubsample/subsample.c index e0d28f77..8feaae7e 100644 --- a/veejay-current/veejay-server/libsubsample/subsample.c +++ b/veejay-current/veejay-server/libsubsample/subsample.c @@ -29,6 +29,9 @@ #include #include "subsample-mmx.h" #endif +#ifdef HAVE_ARM +#include +#endif #include #include @@ -109,6 +112,7 @@ static void ss_444_to_420jpeg(uint8_t *buffer, int width, int height) 16 source pixels) */ +#ifndef HAVE_ARM static void ss_444_to_420jpeg(uint8_t *buffer, int width, int height) { const uint8_t *in0, *in1; @@ -139,6 +143,50 @@ static void ss_444_to_420jpeg(uint8_t *buffer, int width, int height) in1 += width*2; } } +#else +static void ss_444_to_420jpeg(uint8_t *buffer, int width, int height) +{ + const uint8_t *in0, *in1; + uint8_t *out; + int x, y; + in0 = buffer; + in1 = buffer + width; + out = buffer; + + for (y = 0; y < height; y += 4) + { + for (x = 0; x < width; x += 4) + { + uint8x16_t vin0 = vld1q_u8(in0); + uint8x16_t vin1 = vld1q_u8(in1); + uint8x16_t vresult = vrhaddq_u8(vin0, vin1); + vst1q_u8(out, vresult); + + in0 += 16; + in1 += 16; + out += 4; + } + + for (; x < width; x += 2) + { + uint8x8_t vin0 = vld1_u8(in0); + uint8x8_t vin1 = vld1_u8(in1); + + uint8x8_t vresult = vrhadd_u8(vin0, vin1); + vst1_u8(out, vresult); + + in0 += 8; + in1 += 8; + out += 1; + } + + in0 += width * 2; + in1 += width * 2; + } +} +#endif + +#ifndef HAVE_ARM static void ss_444_to_420jpeg_cp(uint8_t *buffer,uint8_t *dest, int width, int height) { const uint8_t *in0, *in1; @@ -169,6 +217,55 @@ static void ss_444_to_420jpeg_cp(uint8_t *buffer,uint8_t *dest, int width, int h in1 += width*2; } } +#else +static void ss_444_to_420jpeg_cp(uint8_t *buffer, uint8_t *dest, int width, int height) +{ + const uint8_t *in0, *in1; + uint8_t *out; + int x, y; + in0 = buffer; + in1 = buffer + width; + out = dest; + + for (y = 0; y < height; y += 4) + { + for (x = 0; x < width; x += 4) + { + uint8x16_t vin0 = vld1q_u8(in0); + uint8x16_t vin1 = vld1q_u8(in1); + + uint8x16_t vresult = vrhaddq_u8(vin0, vin1); + + vst1q_u8(out, vresult); + + in0 += 16; + in1 += 16; + out += 4; + } + + for (; x < width; x += 2) + { + uint8x8_t vin0 = vld1_u8(in0); + uint8x8_t vin1 = vld1_u8(in1); + + uint8x8_t vresult = vrhadd_u8(vin0, vin1); + + vst1_u8(out, vresult); + + in0 += 8; + in1 += 8; + out += 1; + } + + in0 += width * 2; + in1 += width * 2; + } +} +#endif + + + + /* horizontal interstitial siting * * Y Y Y Y @@ -360,8 +457,8 @@ static void tr_420jpeg_to_444(uint8_t *data, uint8_t *buffer, int width, int hei static void ss_420jpeg_to_444(uint8_t *buffer, int width, int height) { -#ifndef HAVE_ASM_MMX - uint8_t *in, *out0, *out1; +#if !defined(HAVE_ASM_MMX) && !defined(HAVE_ARM) + uint8_t *in, *out0, *out1; unsigned int x, y; in = buffer + (width * height / 4) - 1; out1 = buffer + (width * height) - 1; @@ -377,7 +474,37 @@ static void ss_420jpeg_to_444(uint8_t *buffer, int width, int height) out0 -= width; out1 -= width; } -#else +#endif + +#ifdef HAVE_ARM + uint8_t *in, *out0, *out1; + unsigned int x, y; + in = buffer + (width * height / 4) - 1; + out1 = buffer + (width * height) - 1; + out0 = out1 - width; + uint8x16_t val, val_dup; + + for (y = height - 1; y >= 0; y -= 2) + { + for (x = width - 1; x >= 0; x -= 16) + { + val = vld1q_u8(in); + val_dup = vdupq_n_u8(vgetq_lane_u8(val, 0)); + + vst1q_u8(out1 - 15, val_dup); + vst1q_u8(out0 - 15, val_dup); + + in--; + out1 -= 16; + out0 -= 16; + } + + out0 -= width; + out1 -= width; + } +#endif + +#ifdef HAVE_ASM_MMX int x,y; const int mmx_stride = width >> 3; uint8_t *src = buffer + ((width * height) >> 2)-1; @@ -406,6 +533,7 @@ static void ss_420jpeg_to_444(uint8_t *buffer, int width, int height) SFENCE" \n\t" :::"memory"); #endif + } static inline void downsample2x1( const uint8_t *src, uint8_t *dst, const int width ) @@ -451,6 +579,48 @@ static inline void downsample32x16( const uint8_t *src, uint8_t *dst, const int } #endif +#ifdef HAVE_ARM +static inline void downsample32x16(const uint8_t *src, uint8_t *dst, const int width, const int left) +{ + unsigned int x; + unsigned int x1 = 0; + unsigned int i; + + for (x = 0; x < width - left; x += 32, x1 += 16) + { + uint8x16x2_t vsrc = vld2q_u8(&src[x]); + uint8x16_t vsum = vrhaddq_u8(vsrc.val[0], vsrc.val[1]); + vst1q_u8(&dst[x1], vsum); + } + + for (i = 0; i < left; i += 2, x1++) + { + dst[x1] = (src[x + i] + src[x + i + 1] + 1) >> 1; + } +} + +static inline void downsample16x8(const uint8_t *src, uint8_t *dst, const int width) +{ + unsigned int x; + unsigned int x1 = 0; + + for (x = 0; x < width; x += 16, x1 += 8) + { + uint8x16_t vsrc = vld1q_u8(&src[x]); + uint8x8_t vsum = vpadd_u8(vget_low_u8(vsrc), vget_high_u8(vsrc)); + vsum = vrshr_n_u8(vsum, 1); + vst1_u8(&dst[x1], vsum); + } + + for (; x < width; x += 2, x1++) + { + dst[x1] = (src[x] + src[x + 1] + 1) >> 1; + } +} + +#endif + + static void ss_444_to_422_cp(uint8_t *buffer, uint8_t *dest, int width, int height) { const unsigned int dst_stride = width >> 1; @@ -464,7 +634,7 @@ static void ss_444_to_422_cp(uint8_t *buffer, uint8_t *dest, int width, int heig uint8_t *src = buffer + (y*width); uint8_t *dst = dest + (y*dst_stride); -#ifdef HAVE_ASM_MMX +#if defined(HAVE_ASM_MMX) || defined(HAVE_ARM) downsample32x16( src, dst, width,left ); #else downsample2x1( src, dst, width ); @@ -478,6 +648,23 @@ static void ss_444_to_422_cp(uint8_t *buffer, uint8_t *dest, int width, int heig #endif } +#ifdef HAVE_ARM +static inline void subsample_up_1x16to1x32(uint8_t *in, uint8_t *out) +{ + uint8x16_t vzero = vdupq_n_u8(0); + uint8x16_t vin = vld1q_u8(in); + + uint8x8_t vin_low = vget_low_u8(vin); + uint8x8_t vin_high = vget_high_u8(vin); + + vin_low = vshrq_n_u8(vin_low, 1); + vin_high = vshrq_n_u8(vin_high, 1); + + uint8x16_t vout = vcombine_u8(vin_low, vin_high); + + vst1q_u8(out, vout); +} +#endif static void tr_422_to_444( uint8_t *buffer, int width, int height) @@ -485,7 +672,7 @@ static void tr_422_to_444( uint8_t *buffer, int width, int height) int x,y; const int stride = width >> 1; -#ifndef HAVE_ASM_MMX +#if !defined(HAVE_ASM_MMX) && !defined(HAVE_ARM) for( y = height-1; y > 0 ; y -- ) { uint8_t *dst = buffer + (y * width); uint8_t *src = buffer + (y * stride); @@ -496,9 +683,9 @@ static void tr_422_to_444( uint8_t *buffer, int width, int height) dst+=2; // increment dst } } -#else -// const int mmx_stride = stride >> 3; -// int left = (mmx_stride % 16); /* FIXME */ +#endif + +#if defined(HAVE_ASM_MMX) || defined(HAVE_ARM) for( y = height -1 ; y > 0; y -- ) { uint8_t *src = buffer + (y* stride); uint8_t *dst = buffer + (y* width); @@ -507,7 +694,8 @@ static void tr_422_to_444( uint8_t *buffer, int width, int height) subsample_up_1x16to1x32( &src[x], &dst[x1] ); } } - +#endif +#if defined(HAVE_ASM_MMX) __asm__(_EMMS" \n\t" SFENCE" \n\t" :::"memory"); @@ -518,7 +706,7 @@ static void tr_422_to_444t(uint8_t *out, uint8_t *in, int width, int height) { int x,y; const int stride = width >> 1; -#ifndef HAVE_ASM_MMX +#if !defined(HAVE_ASM_MMX) && !defined(HAVE_ARM) for( y = height; y > 0 ; y -- ) { uint8_t *d = out + (y * width); uint8_t *s = in + (y * stride); @@ -529,9 +717,9 @@ static void tr_422_to_444t(uint8_t *out, uint8_t *in, int width, int height) d+=2; // increment dst } } -#else -// const int mmx_stride = stride >> 3; -// int left = (mmx_stride % 16); /* FIXME */ +#endif + +#if defined(HAVE_ASM_MMX) || defined(HAVE_ARM) int x1 = 0; for( y = height -1 ; y > 0; y -- ) { uint8_t *src = in + (y* stride); @@ -540,7 +728,9 @@ static void tr_422_to_444t(uint8_t *out, uint8_t *in, int width, int height) subsample_up_1x16to1x32(&src[x], &dst[x1] ); } } +#endif +#ifdef HAVE_ASM_MMX __asm__(_EMMS" \n\t" SFENCE" \n\t" :::"memory"); @@ -612,7 +802,7 @@ static void chroma_subsample_task( void *ptr ) break; case SSM_422_444: ss_444_to_422_cp(f->output[1],f->input[1],f->width,f->height); - ss_444_to_422_cp(f->output[2],f->input[2],f->width,f->height); + ss_444_to_422_cp(f->output[2],f->input[2],f->width,f->height); break; default: break; @@ -691,8 +881,8 @@ void chroma_supersample(subsample_mode_t mode,VJFrame *frame, uint8_t *ycbcr[] ) switch (mode) { case SSM_420_JPEG_BOX: - ss_420jpeg_to_444(ycbcr[1], frame->width, frame->height); - ss_420jpeg_to_444(ycbcr[2], frame->width, frame->height); + ss_420jpeg_to_444(ycbcr[1], frame->width, frame->height); + ss_420jpeg_to_444(ycbcr[2], frame->width, frame->height); break; case SSM_420_JPEG_TR: tr_420jpeg_to_444(_chroma_supersample_data,ycbcr[1], frame->width, frame->height); diff --git a/veejay-current/veejay-server/veejay/vj-perform.c b/veejay-current/veejay-server/veejay/vj-perform.c index 62d87b85..d6afa8f4 100644 --- a/veejay-current/veejay-server/veejay/vj-perform.c +++ b/veejay-current/veejay-server/veejay/vj-perform.c @@ -1516,6 +1516,7 @@ static int vj_perform_use_cached_frame(ycbcr_frame *cached_frame, VJFrame *dst) veejay_memcpy( dst->data[0], cached_frame->Y, dst->stride[0] * dst->height ); veejay_memcpy( dst->data[1], cached_frame->Cb, dst->stride[1] * dst->height ); veejay_memcpy( dst->data[2], cached_frame->Cr, dst->stride[2] * dst->height ); + //veejay_memcpy( dst->data[3], cached_frame->data[3], cached_frame->stride[3] * cached_frame->height ); return dst->ssm; } @@ -1916,7 +1917,7 @@ static int vj_perform_apply_secundary_tag(veejay_t * info, performer_t *p, int s if(len > 0 ) { error = 0; ssm = dst->ssm; - +//NEL global->cached_sample_frames[ global->n_cached_sample_frames ].sample_id = sample_id; global->cached_sample_frames[ global->n_cached_sample_frames ].frame = p->frame_buffer[ chain_entry ]; global->n_cached_sample_frames ++; @@ -2158,7 +2159,7 @@ static void vj_perform_tag_render_chain_entry(veejay_t *info,performer_t *p,vjp_ vj_perform_supersample(settings,p, frames[0], (ef ? frames[1] : NULL), sub_mode ); p->frame_buffer[chain_entry]->ssm = frames[0]->ssm; - + if(ef) { frames[1]->ssm = vj_perform_apply_secundary_tag(info,p,fx_entry->channel,fx_entry->source_type,chain_entry,frames[0],frames[1],p->frame_buffer[chain_entry]->P0, p->frame_buffer[chain_entry]->P1, 0 );