diff options
| author | Carl Hetherington <cth@carlh.net> | 2020-06-02 19:47:30 +0200 |
|---|---|---|
| committer | Carl Hetherington <cth@carlh.net> | 2020-06-02 19:47:30 +0200 |
| commit | 6636b550a4b69bf3d77b51aa033159036d88f33b (patch) | |
| tree | 740ed4b2615cfbeeeb29c0ecf458400ea8f3b43a /src | |
| parent | 5f2db276b59257b05e351c1ca5665de9cc3f78f4 (diff) | |
Add SSE/AVX2 version of rgb_to_xyz(). (branch: sse2)
Diffstat (limited to 'src')
| -rw-r--r-- | src/rgb_xyz.cc | 187 | ||||
| -rw-r--r-- | src/rgb_xyz.h | 8 |
2 files changed, 195 insertions, 0 deletions
diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc index 36185324..fd398ae8 100644 --- a/src/rgb_xyz.cc +++ b/src/rgb_xyz.cc @@ -38,6 +38,7 @@ #include "dcp_assert.h" #include "compose.hpp" #include <cmath> +#include <immintrin.h> using std::min; using std::max; @@ -337,3 +338,189 @@ dcp::rgb_to_xyz ( return xyz; } + + +/** @param rgb RGB data arranged as 64 bits per pixel: 16 red, 16 green, 16 blue, 16 dummy (ignored). + * Each 2-byte value is little endian; this is effectively AV_PIX_FMT_RGBA64LE with the A + * being ignored. There must be no padding bytes between lines of the image (i.e. + * stride must equal width). The pointer must be aligned to a 16-byte boundary. + * + * @param size size of RGB image in pixels. + */ +shared_ptr<dcp::OpenJPEGImage> +dcp::rgb_to_xyz_avx2 ( + uint8_t const * rgb, + dcp::Size size, + ColourConversion const & conversion + ) +{ + /* There must be no padding in the RGB as there *can* be no padding in the XYZ output + * and we have to keep the RGB and XYZ data pointers similarly aligned. 
+ */ + + shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size)); + + float const * lut_in = conversion.in()->lut_float (12, false); + int const * lut_out = conversion.out()->lut_int (16, true, 4095); + + /* This is is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */ + double transform[9]; + combined_rgb_to_xyz (conversion, transform); + + __m256i* xyz_x = reinterpret_cast<__m256i*>(xyz->data(0)); + DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_x) % 32)); + __m256i* xyz_y = reinterpret_cast<__m256i*>(xyz->data(1)); + DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_y) % 32)); + __m256i* xyz_z = reinterpret_cast<__m256i*>(xyz->data(2)); + DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_z) % 32)); + + __m256 transform_x = _mm256_set_ps ( + 0, transform[2], transform[1], transform[0], 0, transform[2], transform[1], transform[0] + ); + + __m256 transform_y = _mm256_set_ps ( + 0, transform[5], transform[4], transform[3], 0, transform[5], transform[4], transform[3] + ); + + __m256 transform_z = _mm256_set_ps ( + 0, transform[8], transform[7], transform[6], 0, transform[8], transform[7], transform[6] + ); + + int const pixels = size.width * size.height; + int const fast_loops = pixels / 8; + int const slow_loops = pixels - fast_loops * 8; + + __m128i const * p = reinterpret_cast<__m128i const *> (rgb); + DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16)); + + for (int i = 0; i < fast_loops; ++i) { + // 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words + __m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p + 0)); + __m256i rgb_B = _mm256_cvtepu16_epi32(_mm_load_si128(p + 1)); + __m256i rgb_C = _mm256_cvtepu16_epi32(_mm_load_si128(p + 2)); + __m256i rgb_D = _mm256_cvtepu16_epi32(_mm_load_si128(p + 3)); + p += 4; + + // shift right to truncate 16-bit inputs to 12-bit + rgb_A = _mm256_srli_epi32 (rgb_A, 4); + rgb_B = _mm256_srli_epi32 (rgb_B, 4); + rgb_C = _mm256_srli_epi32 (rgb_C, 4); + rgb_D = 
_mm256_srli_epi32 (rgb_D, 4); + + // input gamma LUT + __m256 lut_1_A = _mm256_i32gather_ps (lut_in, rgb_A, 4); + __m256 lut_1_B = _mm256_i32gather_ps (lut_in, rgb_B, 4); + __m256 lut_1_C = _mm256_i32gather_ps (lut_in, rgb_C, 4); + __m256 lut_1_D = _mm256_i32gather_ps (lut_in, rgb_D, 4); + + // multiply for X + __m256 x_A = _mm256_mul_ps (lut_1_A, transform_x); + __m256 x_B = _mm256_mul_ps (lut_1_B, transform_x); + __m256 x_C = _mm256_mul_ps (lut_1_C, transform_x); + __m256 x_D = _mm256_mul_ps (lut_1_D, transform_x); + + // accumulate + + // B7 B6 B5 B4 B3 B2 B1 B0 + A7 A6 A5 A4 A3 A2 A1 A0 + // = B67 B45 A67 A45 B23 B01 A23 A01 + x_A = _mm256_hadd_ps (x_A, x_B); + + // D7 D6 D5 D4 D3 D2 D1 D0 + C7 C6 C5 C4 C3 C2 C1 C0 + // = D67 D45 C67 C45 D23 D01 C23 C01 + x_C = _mm256_hadd_ps (x_C, x_D); + + // D67 D45 C67 C45 D23 D01 C23 C01 + B67 B45 A67 A45 B23 B01 A23 A01 + // = D47 C47 B47 A47 D03 C03 B03 A03 + x_A = _mm256_hadd_ps (x_A, x_C); + + // same for Y + __m256 y_A = _mm256_mul_ps (lut_1_A, transform_y); + __m256 y_B = _mm256_mul_ps (lut_1_B, transform_y); + __m256 y_C = _mm256_mul_ps (lut_1_C, transform_y); + __m256 y_D = _mm256_mul_ps (lut_1_D, transform_y); + y_A = _mm256_hadd_ps (y_A, y_B); + y_C = _mm256_hadd_ps (y_C, y_D); + y_A = _mm256_hadd_ps (y_A, y_C); + + // and for Z + __m256 z_A = _mm256_mul_ps (lut_1_A, transform_z); + __m256 z_B = _mm256_mul_ps (lut_1_B, transform_z); + __m256 z_C = _mm256_mul_ps (lut_1_C, transform_z); + __m256 z_D = _mm256_mul_ps (lut_1_D, transform_z); + z_A = _mm256_hadd_ps (z_A, z_B); + z_C = _mm256_hadd_ps (z_C, z_D); + z_A = _mm256_hadd_ps (z_A, z_C); + + // clamp + x_A = _mm256_min_ps(x_A, _mm256_set1_ps(65535.0)); + x_A = _mm256_max_ps(x_A, _mm256_set1_ps(0.0)); + y_A = _mm256_min_ps(y_A, _mm256_set1_ps(65535.0)); + y_A = _mm256_max_ps(y_A, _mm256_set1_ps(0.0)); + z_A = _mm256_min_ps(z_A, _mm256_set1_ps(65535.0)); + z_A = _mm256_max_ps(z_A, _mm256_set1_ps(0)); + + // round to int + __m256i lut_2_x = 
_mm256_cvtps_epi32(_mm256_floor_ps(x_A)); + __m256i lut_2_y = _mm256_cvtps_epi32(_mm256_floor_ps(y_A)); + __m256i lut_2_z = _mm256_cvtps_epi32(_mm256_floor_ps(z_A)); + + // out gamma LUT + lut_2_x = _mm256_i32gather_epi32 (lut_out, lut_2_x, 4); + lut_2_y = _mm256_i32gather_epi32 (lut_out, lut_2_y, 4); + lut_2_z = _mm256_i32gather_epi32 (lut_out, lut_2_z, 4); + + // shuffle + lut_2_x = _mm256_permutevar8x32_epi32 (lut_2_x, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + lut_2_y = _mm256_permutevar8x32_epi32 (lut_2_y, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + lut_2_z = _mm256_permutevar8x32_epi32 (lut_2_z, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + + // write to memory + _mm256_store_si256 (xyz_x, lut_2_x); + _mm256_store_si256 (xyz_y, lut_2_y); + _mm256_store_si256 (xyz_z, lut_2_z); + xyz_x++; + xyz_y++; + xyz_z++; + } + + struct { + float r, g, b; + } s; + + struct { + float x, y, z; + } d; + + uint16_t const * p_slow = reinterpret_cast<uint16_t const *> (p); + int* xyz_x_slow = reinterpret_cast<int*> (xyz_x); + int* xyz_y_slow = reinterpret_cast<int*> (xyz_y); + int* xyz_z_slow = reinterpret_cast<int*> (xyz_z); + + for (int i = 0; i < slow_loops; ++i) { + /* In gamma LUT (converting 16-bit to 12-bit) */ + s.r = lut_in[*p_slow++ >> 4]; + s.g = lut_in[*p_slow++ >> 4]; + s.b = lut_in[*p_slow++ >> 4]; + p_slow++; + + /* RGB to XYZ, Bradford transform and DCI companding */ + d.x = s.r * transform[0] + s.g * transform[1] + s.b * transform[2]; + d.y = s.r * transform[3] + s.g * transform[4] + s.b * transform[5]; + d.z = s.r * transform[6] + s.g * transform[7] + s.b * transform[8]; + + /* Clamp */ + d.x = max (0.0f, d.x); + d.y = max (0.0f, d.y); + d.z = max (0.0f, d.z); + d.x = min (65535.0f, d.x); + d.y = min (65535.0f, d.y); + d.z = min (65535.0f, d.z); + + /* Out gamma LUT */ + *xyz_x_slow++ = lut_out[lrint(d.x)]; + *xyz_y_slow++ = lut_out[lrint(d.y)]; + *xyz_z_slow++ = lut_out[lrint(d.z)]; + } + + return xyz; +} diff --git a/src/rgb_xyz.h b/src/rgb_xyz.h index 
e414d07e..1012c424 100644 --- a/src/rgb_xyz.h +++ b/src/rgb_xyz.h @@ -64,6 +64,14 @@
extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz (
	ColourConversion const & conversion
	);
+
+/* AVX2-accelerated variant of rgb_to_xyz().  rgb must point to 16-byte-aligned
+ * RGBA64LE-style data (16 bits each of R, G, B plus a 16-bit dummy word per
+ * pixel) with no padding between lines, i.e. stride == width.
+ */
+extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz_avx2 (
+	uint8_t const * rgb,
+	dcp::Size size,
+	ColourConversion const & conversion
+	);
+
+
extern void combined_rgb_to_xyz (ColourConversion const & conversion, double* matrix);
}
|
