diff --git a/veejay-current/veejay-server/libsubsample/subsample.c b/veejay-current/veejay-server/libsubsample/subsample.c
index 6290a732..eadc3aef 100644
--- a/veejay-current/veejay-server/libsubsample/subsample.c
+++ b/veejay-current/veejay-server/libsubsample/subsample.c
@@ -731,36 +731,12 @@ static void ss_420jpeg_to_444(uint8_t *buffer, int width, int height)
 /*
  * subsample YUV 4:4:4 to YUV 4:2:2 using drop method
  */
-void ss_444_to_422_drop(uint8_t *U, uint8_t *V, int width, int height) {
+void ss_444_to_422_drop(uint8_t *restrict U, uint8_t *restrict V, int width, int height) {
     const int dest_width = width - 1;
     const int stride = width >> 1;
     for (int y = 0; y < height; y++) {
-        int x = 0;
-#ifdef HAVE_ASM_SSE2
-        for( ; x < dest_width - 15; x += 16 ) {
-            __m128i u_values = _mm_loadu_si128((__m128i*)(U + y * width + x));
-            __m128i v_values = _mm_loadu_si128((__m128i*)(V + y * width + x));
-            __m128i u_low = _mm_unpacklo_epi64(u_values, u_values);
-            __m128i v_low = _mm_unpacklo_epi64(v_values, v_values);
-            _mm_storeu_si128((__m128i*)(U + y * stride + (x >> 1)), u_low);
-            _mm_storeu_si128((__m128i*)(V + y * stride + (x >> 1)), v_low);
-
-        }
-#else
-#ifdef HAVE_ARM_ASIMD
-        for(; x < dest_width - 15; x += 16) {
-            uint8x16_t u_values = vld1q_u8(U + y * width + x);
-            uint8x16_t v_values = vld1q_u8(V + y * width + x);
-
-            uint8x8_t u_low = vget_low_u8(u_values);
-            uint8x8_t v_low = vget_low_u8(v_values);
-
-            vst1_u8(U + y * stride + (x >> 1), u_low);
-            vst1_u8(V + y * stride + (x >> 1), v_low);
-        }
-#endif
-#endif
-        for (; x < dest_width; x += 2) {
+#pragma omp simd
+        for (int x = 0; x < dest_width; x += 2) {
             U[y * stride + (x >> 1)] = U[y * width + x];
             V[y * stride + (x >> 1)] = V[y * width + x];
         }
@@ -770,71 +746,14 @@ void ss_444_to_422_drop(uint8_t *U, uint8_t *V, int width, int height) {
 /*
  * subsample YUV 4:4:4 to YUV 4:2:2 using average method
  */
-void ss_444_to_422_average(uint8_t *U, uint8_t *V, int width, int height) {
+void ss_444_to_422_average(uint8_t *restrict U, uint8_t *restrict V, int width, int height) {
     const int dest_width = width >> 1;
     const int stride = width >> 1;
-
-    for (int y = 0; y < height; y++) {
-        int x = 0;
-#ifdef HAVE_ASM_SSE2
-        for (; x < dest_width - 15; x += 16) {
-            __m128i U_values1 = _mm_loadu_si128((__m128i *)(U + y * width + x * 2));
-            __m128i V_values1 = _mm_loadu_si128((__m128i *)(V + y * width + x * 2));
-            __m128i U_values2 = _mm_loadu_si128((__m128i *)(U + y * width + (x + 8) * 2));
-            __m128i V_values2 = _mm_loadu_si128((__m128i *)(V + y * width + (x + 8) * 2));
-
-            __m128i U_sum1 = _mm_add_epi16(_mm_unpacklo_epi8(U_values1, _mm_setzero_si128()),
-                                           _mm_unpackhi_epi8(U_values1, _mm_setzero_si128()));
-            __m128i V_sum1 = _mm_add_epi16(_mm_unpacklo_epi8(V_values1, _mm_setzero_si128()),
-                                           _mm_unpackhi_epi8(V_values1, _mm_setzero_si128()));
-            __m128i U_sum2 = _mm_add_epi16(_mm_unpacklo_epi8(U_values2, _mm_setzero_si128()),
-                                           _mm_unpackhi_epi8(U_values2, _mm_setzero_si128()));
-            __m128i V_sum2 = _mm_add_epi16(_mm_unpacklo_epi8(V_values2, _mm_setzero_si128()),
-                                           _mm_unpackhi_epi8(V_values2, _mm_setzero_si128()));
-
-            U_sum1 = _mm_srli_epi16(_mm_add_epi16(U_sum1, _mm_set1_epi16(1)), 1);
-            V_sum1 = _mm_srli_epi16(_mm_add_epi16(V_sum1, _mm_set1_epi16(1)), 1);
-            U_sum2 = _mm_srli_epi16(_mm_add_epi16(U_sum2, _mm_set1_epi16(1)), 1);
-            V_sum2 = _mm_srli_epi16(_mm_add_epi16(V_sum2, _mm_set1_epi16(1)), 1);
-            U_sum1 = _mm_packus_epi16(U_sum1, U_sum2);
-            V_sum1 = _mm_packus_epi16(V_sum1, V_sum2);
-
-            _mm_storeu_si128((__m128i *)(U + y * stride + x), U_sum1);
-            _mm_storeu_si128((__m128i *)(V + y * stride + x), V_sum1);
-        }
-#else
-#ifdef HAVE_ARM_ASIMD
-        for (; x < dest_width - 15; x += 16) {
-            uint8x16_t U_values1 = vld1q_u8(U + y * width + x * 2);
-            uint8x16_t V_values1 = vld1q_u8(V + y * width + x * 2);
-            uint8x16_t U_values2 = vld1q_u8(U + y * width + (x + 8) * 2);
-            uint8x16_t V_values2 = vld1q_u8(V + y * width + (x + 8) * 2);
-
-            uint16x8_t U_sum1 = vmovl_u8(vget_low_u8(U_values1));
-            uint16x8_t V_sum1 = vmovl_u8(vget_low_u8(V_values1));
-            U_sum1 = vaddq_u16(U_sum1, vmovl_u8(vget_high_u8(U_values1)));
-            V_sum1 = vaddq_u16(V_sum1, vmovl_u8(vget_high_u8(V_values1)));
-
-            uint16x8_t U_sum2 = vmovl_u8(vget_low_u8(U_values2));
-            uint16x8_t V_sum2 = vmovl_u8(vget_low_u8(V_values2));
-            U_sum2 = vaddq_u16(U_sum2, vmovl_u8(vget_high_u8(U_values2)));
-            V_sum2 = vaddq_u16(V_sum2, vmovl_u8(vget_high_u8(V_values2)));
-
-            U_sum1 = vshrq_n_u16(vaddq_u16(U_sum1, vdupq_n_u16(1)), 1);
-            V_sum1 = vshrq_n_u16(vaddq_u16(V_sum1, vdupq_n_u16(1)), 1);
-            U_sum2 = vshrq_n_u16(vaddq_u16(U_sum2, vdupq_n_u16(1)), 1);
-            V_sum2 = vshrq_n_u16(vaddq_u16(V_sum2, vdupq_n_u16(1)), 1);
-
-            uint8x8_t U_result = vqmovn_u16(vcombine_u16(vget_low_u16(U_sum1), vget_low_u16(U_sum2)));
-            uint8x8_t V_result = vqmovn_u16(vcombine_u16(vget_low_u16(V_sum1), vget_low_u16(V_sum2)));
-
-            vst1_u8(U + y * stride + x, U_result);
-            vst1_u8(V + y * stride + x, V_result);
-        }
-#endif
-#endif
-        for (; x < dest_width; x++) {
-            int src_index = y * width + x * 2;
+    int x,y;
+    for (y = 0; y < height; y++) {
+#pragma omp simd
+        for (x = 0; x < dest_width; x++) {
+            const int src_index = y * width + x * 2;
             U[y * stride + x] = (U[src_index] + U[src_index + 1] + 1) >> 1;
             V[y * stride + x] = (V[src_index] + V[src_index + 1] + 1) >> 1;
         }
@@ -850,6 +769,7 @@ static void ss_444_to_422_bilinear(uint8_t *restrict U, uint8_t *restrict V, con
     const int dest_width = width >> 1;

     for (int i = 0; i < height; i++) {
+#pragma omp simd
         for (int j = 0; j < dest_width; j++) {
             const int src_idx = 2 * j;

@@ -881,38 +801,8 @@ static void ss_444_to_422_bilinear(uint8_t *restrict U, uint8_t *restrict V, con

 /*
  * subsample YUV 4:4:4 to YUV 4:2:2 using mitchell netravali
- * without lookup table, keep for reference
  *
- *
-static void ss_444_to_422_in_mitchell_netravali(uint8_t *restrict U, uint8_t *restrict V, const int width, const int height) {
-    const int dest_width = width >> 1;
-
-    for (int i = 0; i < height; i++) {
-        for (int j = 0; j < dest_width; j++) {
-            const int src_idx = 2 * j;
-            int totalU = 0, totalV = 0;
-
-            for (int u = -1; u <= 2; u++) {
-                int dest_idx = i + u;
-                dest_idx = (dest_idx < 0) ? 0 : ((dest_idx >= height) ? height - 1 : dest_idx);
-
-                int fu = ((2 * j + u) - src_idx) * ((2 * j + u) - src_idx) << 16 >> 2;
-                int weightInt = (((1 << 17) - fu) * B1 + (fu * B1 >> 16)) >> 16;
-
-                int srcU = U[dest_idx * width + src_idx] - 128;
-                int srcV = V[dest_idx * width + src_idx] - 128;
-
-                totalU += (srcU * weightInt) + ((srcU * weightInt >> 31) & 1);
-                totalV += (srcV * weightInt) + ((srcV * weightInt >> 31) & 1);
-            }
-
-            U[i * dest_width + j] = (uint8_t)((totalU + (1 << 14)) >> 15) + 128;
-            V[i * dest_width + j] = (uint8_t)((totalV + (1 << 14)) >> 15) + 128;
-        }
-    }
-}
 */
-
 static void ss_444_to_422_in_mitchell_netravali(uint8_t *restrict U, uint8_t *restrict V, const int width, const int height) {
     const int output_width = width >> 1;
     int i,j;
@@ -927,6 +817,7 @@ static void ss_444_to_422_in_mitchell_netravali(uint8_t *restrict U, uint8_t *re
     }

     for (; i < height; i++) {
+#pragma omp simd
         for (int j = 0; j < output_width; j++) {
             const int chroma_col = 2 * j;
             const int output_idx = i * output_width + j;
@@ -962,35 +853,43 @@ static void ss_444_to_422_in_mitchell_netravali(uint8_t *restrict U, uint8_t *re
 }

 void tr_422_to_444_dup(uint8_t *chromaChannel, int width, int height) {
-	const int src_width = width >> 1;
-	const int hei = height - 1;
-	const int wid = src_width - 1;
-	for (int y = hei; y >= 0; y--) {
-		int x = wid;
+    const int src_width = width >> 1;
+    const int hei = height - 1;
+    const int wid = src_width - 1;
+
+    // duplicate the last pixel in the row
+    chromaChannel[hei * width + 1] = chromaChannel[hei * src_width + wid];
+
+    for (int y = hei; y >= 0; y--) {
+        int x = wid - 1;
+        // upsample the pixels using a bilinear filter.
+        chromaChannel[y * width + 2 * wid + 1] = (chromaChannel[y * src_width + wid] + chromaChannel[y * src_width + wid + 1]) >> 1;
+        chromaChannel[y * width + 2 * wid] = (chromaChannel[y * src_width + wid] + chromaChannel[y * src_width + wid - 1]) >> 1;
+
 #ifdef HAVE_ASM_SSE2
-	for (int x = src_width - 1; x >= 0; x -= 16) {
-		__m128i pixels = _mm_loadu_si128((__m128i*)&chromaChannel[y * src_width + x]);
-		__m128i result = _mm_unpacklo_epi8(pixels, pixels);
-		_mm_storeu_si128((__m128i*)&chromaChannel[y * width + 2 * x], result);
-	}
-#else
-#ifdef HAVE_ARM_SIMD
-	for (int x = src_width - 1; x >= 0; x -= 16) {
-		uint8x16_t pixels = vld1q_u8(&chromaChannel[y * src_width + x]);
-		uint8x16_t result = vdupq_n_u8(vgetq_lane_u8(pixels, 0));
-		vst1q_u8(&chromaChannel[y * width + 2 * x], result);
-	}
-#endif
-#endif
-	for (; x >= 0; x--) {
-		const uint8_t pixel = chromaChannel[y * src_width + x];
-
-		chromaChannel[y * width + 2 * x] = pixel;
-		chromaChannel[y * width + 2 * x + 1] = pixel;
-	}
-
-	chromaChannel[y * width] = chromaChannel[y * width + 1];
+        for (; x >= 0; x -= 8) {
+            __m128i pixels = _mm_loadl_epi64((__m128i*)&chromaChannel[y * src_width + x]);
+            __m128i duplicated_pixels = _mm_unpacklo_epi8(pixels, pixels);
+            _mm_storeu_si128((__m128i*)&chromaChannel[y * width + 2 * x], duplicated_pixels);
         }
+#else
+#ifdef HAVE_ARM_ASIMD
+        for (; x >= 0; x -= 8) {
+            uint8x8_t pixels = vld1_u8(&chromaChannel[y * src_width + x]);
+            uint8x8x2_t duplicated_pixels;
+            duplicated_pixels.val[0] = pixels;
+            duplicated_pixels.val[1] = pixels;
+            vst2_u8(&chromaChannel[y * width + 2 * x], duplicated_pixels);
+        }
+#endif
+#endif
+
+        for ( ; x >= 0; x--) {
+            const uint8_t pixel = chromaChannel[y * src_width + x];
+            chromaChannel[y * width + 2 * x + 1] = pixel;
+            chromaChannel[y * width + 2 * x] = pixel;
+        }
+    }
 }

 /* vertical intersitial siting; horizontal cositing
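
Note on the pattern: the diff drops the hand-written SSE2/NEON paths in favour of plain scalar loops annotated with `#pragma omp simd`, and qualifies the chroma pointers `restrict` so the compiler knows U and V never alias and is free to auto-vectorize. Below is a minimal standalone sketch of that pattern, mirroring the inner loop of ss_444_to_422_average(); the file name, toy frame size, and compile flags are illustrative assumptions, not part of the veejay build:

    #include <stdint.h>
    #include <stdio.h>

    /* 4:4:4 -> 4:2:2 chroma averaging with the same loop shape as
     * ss_444_to_422_average() in the diff: each output sample is the
     * rounded mean of a horizontal pair, written back in place. */
    static void chroma_444_to_422_average(uint8_t *restrict U, uint8_t *restrict V,
                                          int width, int height)
    {
        const int dest_width = width >> 1;
        const int stride = width >> 1;

        for (int y = 0; y < height; y++) {
    #pragma omp simd
            for (int x = 0; x < dest_width; x++) {
                const int src_index = y * width + x * 2;
                U[y * stride + x] = (U[src_index] + U[src_index + 1] + 1) >> 1;
                V[y * stride + x] = (V[src_index] + V[src_index + 1] + 1) >> 1;
            }
        }
    }

    int main(void)
    {
        enum { W = 16, H = 2 };          /* toy frame, assumed size */
        uint8_t U[W * H], V[W * H];

        for (int i = 0; i < W * H; i++) {
            U[i] = (uint8_t)(2 * i);     /* ramp makes the averages easy to check */
            V[i] = (uint8_t)(255 - i);
        }

        chroma_444_to_422_average(U, V, W, H);

        for (int x = 0; x < W / 2; x++)  /* first subsampled row: 1 5 9 ... */
            printf("%d ", U[x]);
        printf("\n");
        return 0;
    }

Compiled with something like `gcc -O3 -fopenmp-simd demo.c`, the (a + b + 1) >> 1 idiom can lower to the packed rounding-average instructions (pavgb on SSE2, vrhadd on NEON) that the deleted intrinsics spelled out by hand, while the C source stays portable across both architectures.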
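
The rewritten tr_422_to_444_dup() expresses the 2x horizontal duplication as an interleave: `_mm_unpacklo_epi8(pixels, pixels)` on SSE2 and `vst2_u8()` with the same 8-byte vector in both lanes on NEON each emit every source byte twice. A scalar sketch of just that duplication step (leaving aside the bilinear row-end handling the diff adds, and with an assumed toy row width):

    #include <stdint.h>
    #include <stdio.h>

    /* Widen one 4:2:2 chroma row to 4:4:4 by writing every source byte
     * twice, walking right to left so the row can grow in place, as the
     * scalar tail of tr_422_to_444_dup() does. */
    static void dup_row_in_place(uint8_t *row, int width)
    {
        const int src_width = width >> 1;

        for (int x = src_width - 1; x >= 0; x--) {
            const uint8_t pixel = row[x];
            row[2 * x + 1] = pixel;  /* same byte lands in both output slots, */
            row[2 * x] = pixel;      /* i.e. one pair of the unpacklo/vst2 interleave */
        }
    }

    int main(void)
    {
        enum { W = 8 };                       /* toy row, assumed width */
        uint8_t row[W] = { 10, 20, 30, 40 };  /* 4 chroma samples in front */

        dup_row_in_place(row, W);

        for (int x = 0; x < W; x++)
            printf("%d ", row[x]);            /* prints: 10 10 20 20 30 30 40 40 */
        printf("\n");
        return 0;
    }

Walking right to left matters: the write at 2 * x never lands left of a not-yet-read source byte at a smaller x, so the expansion is safe in place, which is also why the vectorized loops in the diff step x downwards.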