diff options
| author | Carl Hetherington <cth@carlh.net> | 2020-06-02 00:52:57 +0200 |
|---|---|---|
| committer | Carl Hetherington <cth@carlh.net> | 2020-06-02 00:52:57 +0200 |
| commit | 4a5609265010b702b0222620bde9f64f1ec7269e (patch) | |
| tree | ee417c06dc616d03fcf20d32f96d74b6f22465e8 | |
| parent | 4e885dc225c5c12e11367602cc4bf429520d9b3b (diff) | |
Basically maybe-working float version of rgb_to_xyz using SSE/AVX2.
| -rw-r--r-- | benchmark/rgb_to_xyz.cc | 6 | ||||
| -rw-r--r-- | src/rgb_xyz.cc | 193 | ||||
| -rw-r--r-- | test/rgb_xyz_test.cc | 17 | ||||
| -rw-r--r-- | wscript | 2 |
4 files changed, 199 insertions, 19 deletions
diff --git a/benchmark/rgb_to_xyz.cc b/benchmark/rgb_to_xyz.cc index 7b4400ec..bdd1e426 100644 --- a/benchmark/rgb_to_xyz.cc +++ b/benchmark/rgb_to_xyz.cc @@ -49,9 +49,9 @@ main () dcp::Size size(1998, 1080); - scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]); + scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]); for (int y = 0; y < size.height; ++y) { - uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6); + uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8); for (int x = 0; x < size.width; ++x) { for (int c = 0; c < 3; ++c) { *p = (rand() & 0xfff) << 4; @@ -62,7 +62,7 @@ main () shared_ptr<dcp::OpenJPEGImage> xyz; for (int i = 0; i < trials; ++i) { - xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 6, dcp::ColourConversion::srgb_to_xyz()); + xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 8, dcp::ColourConversion::srgb_to_xyz()); } } diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc index d0774e7c..4d8fc09c 100644 --- a/src/rgb_xyz.cc +++ b/src/rgb_xyz.cc @@ -38,6 +38,7 @@ #include "dcp_assert.h" #include "compose.hpp" #include <cmath> +#include <immintrin.h> using std::min; using std::max; @@ -272,9 +273,11 @@ dcp::combined_rgb_to_xyz (ColourConversion const & conversion, double* matrix) * DCI_COEFFICIENT * 65535; } -/** @param rgb RGB data; packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, - * with the 2-byte value for each R/G/B component stored as - * little-endian; i.e. AV_PIX_FMT_RGB48LE. +/** @param rgb RGBA data; packed RGBA 16:16:16:16, 48bpp, 16R, 16G, 16B, 16A + * with the 2-byte value for each R/G/B/A component stored as + * little-endian; i.e. AV_PIX_FMT_RGB48LE. A is ignored but necessary for + * SSE/AVX efficiency. + * * @param size size of RGB image in pixels. * @param size stride of RGB data in pixels. */ @@ -300,28 +303,187 @@ dcp::rgb_to_xyz ( double const * lut_in = conversion.in()->lut (12, false); double const * lut_out = conversion.out()->lut (16, true); + float* lut_in_float = new float[4096]; + for (int i = 0; i < 4096; ++i) { + lut_in_float[i] = lut_in[i]; + } + + int* lut_out_int = new int[65536]; + for (int i = 0; i < 65536; ++i) { + lut_out_int[i] = lrint(lut_out[i] * 4095); + } + /* This is is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */ double fast_matrix[9]; combined_rgb_to_xyz (conversion, fast_matrix); int clamped = 0; - int* xyz_x = xyz->data (0); - int* xyz_y = xyz->data (1); - int* xyz_z = xyz->data (2); + int* xyz_x = (int *) aligned_alloc(32, size.height * size.width * 4); + memcpy (xyz_x, xyz->data(0), size.height * size.width * 4); + int* xyz_y = (int *) aligned_alloc(32, size.height * size.width * 4); + memcpy (xyz_y, xyz->data(1), size.height * size.width * 4); + int* xyz_z = (int *) aligned_alloc(32, size.height * size.width * 4); + memcpy (xyz_z, xyz->data(2), size.height * size.width * 4); + + __m256i* xyz_x_sse = (__m256i *) xyz_x; + __m256i* xyz_y_sse = (__m256i *) xyz_y; + __m256i* xyz_z_sse = (__m256i *) xyz_z; + + __m256 transform_x = _mm256_set_ps ( + 0, fast_matrix[2], fast_matrix[1], fast_matrix[0], 0, fast_matrix[2], fast_matrix[1], fast_matrix[0] + ); + + __m256 transform_y = _mm256_set_ps ( + 0, fast_matrix[5], fast_matrix[4], fast_matrix[3], 0, fast_matrix[5], fast_matrix[4], fast_matrix[3] + ); + + __m256 transform_z = _mm256_set_ps ( + 0, fast_matrix[8], fast_matrix[7], fast_matrix[6], 0, fast_matrix[8], fast_matrix[7], fast_matrix[6] + ); + + /* in-line these, or keep them const? */ + __m256 const_65535 = _mm256_set1_ps (65535.0); + __m256 const_0 = _mm256_set1_ps (0.0); + __m256i shuffle = _mm256_set_epi32 (7, 3, 6, 2, 5, 1, 4, 0); +#if 0 + __m256 zero = _mm256_setzero_ps (); + + __m256i red_mask_1 = _mm256_set_epi32 (0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000); + __m256i red_mask_2 = _mm256_set_epi32 (0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff); + __m256i red_mask_3 = _mm256_set_epi32 (0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000); +#endif + for (int y = 0; y < size.height; ++y) { uint16_t const * p = reinterpret_cast<uint16_t const *> (rgb + y * stride); - for (int x = 0; x < size.width; ++x) { - + __m128i const * p_sse = reinterpret_cast<__m128i const *> (rgb + y * stride); + DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16)); + for (int x = 0; x < size.width / 8; ++x) { + + // 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words + __m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 0)); + __m256i rgb_B = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 1)); + __m256i rgb_C = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 2)); + __m256i rgb_D = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 3)); + p_sse += 4; + + // shift right to truncate 16-bit inputs to 12-bit + rgb_A = _mm256_srli_epi32 (rgb_A, 4); + rgb_B = _mm256_srli_epi32 (rgb_B, 4); + rgb_C = _mm256_srli_epi32 (rgb_C, 4); + rgb_D = _mm256_srli_epi32 (rgb_D, 4); + + // input gamma LUT + __m256 lut_1_A = _mm256_i32gather_ps (lut_in_float, rgb_A, 4); + __m256 lut_1_B = _mm256_i32gather_ps (lut_in_float, rgb_B, 4); + __m256 lut_1_C = _mm256_i32gather_ps (lut_in_float, rgb_C, 4); + __m256 lut_1_D = _mm256_i32gather_ps (lut_in_float, rgb_D, 4); + + // multiply for X + __m256 x_A = _mm256_mul_ps (lut_1_A, transform_x); + __m256 x_B = _mm256_mul_ps (lut_1_B, transform_x); + __m256 x_C = _mm256_mul_ps (lut_1_C, transform_x); + __m256 x_D = _mm256_mul_ps (lut_1_D, transform_x); + + // accumulate + + //float* dbg_x_A = (float *) &x_A; + //float* dbg_x_B = (float *) &x_B; +#if 0 + printf("A:\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%2.f\n", dbg_x_A[0], dbg_x_A[1], dbg_x_A[2], dbg_x_A[3], dbg_x_A[4], dbg_x_A[5], dbg_x_A[6], dbg_x_A[7]); + printf("B:\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%2.f\n", dbg_x_B[0], dbg_x_B[1], dbg_x_B[2], dbg_x_B[3], dbg_x_B[4], dbg_x_B[5], dbg_x_B[6], dbg_x_B[7]); +#endif + + // B7 B6 B5 B4 B3 B2 B1 B0 + A7 A6 A5 A4 A3 A2 A1 A0 + // = B67 B45 A67 A45 B23 B01 A23 A01 + x_A = _mm256_hadd_ps (x_A, x_B); + +#if 0 + printf("sum A:\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%2.f\n", dbg_x_A[0], dbg_x_A[1], dbg_x_A[2], dbg_x_A[3], dbg_x_A[4], dbg_x_A[5], dbg_x_A[6], dbg_x_A[7]); + +#endif + // D7 D6 D5 D4 D3 D2 D1 D0 + C7 C6 C5 C4 C3 C2 C1 C0 + // = D67 D45 C67 C45 D23 D01 C23 C01 + x_C = _mm256_hadd_ps (x_C, x_D); + + // D67 D45 C67 C45 D23 D01 C23 C01 + B67 B45 A67 A45 B23 B01 A23 A01 + // = D47 C47 B47 A47 D03 C03 B03 A03 + x_A = _mm256_hadd_ps (x_A, x_C); + + // now x values for the 8 pixels are in x_A + + //float* fucker = (float *) &x_A; + //printf("8 pixels: %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f", fucker[0], fucker[4], fucker[1], fucker[5], fucker[2], fucker[6], fucker[3], fucker[7]); + + // multiply for Y + __m256 y_A = _mm256_mul_ps (lut_1_A, transform_y); + __m256 y_B = _mm256_mul_ps (lut_1_B, transform_y); + __m256 y_C = _mm256_mul_ps (lut_1_C, transform_y); + __m256 y_D = _mm256_mul_ps (lut_1_D, transform_y); + + // accumulate + y_A = _mm256_hadd_ps (y_A, y_B); + y_C = _mm256_hadd_ps (y_C, y_D); + y_A = _mm256_hadd_ps (y_A, y_C); + + // multiply for Z + __m256 z_A = _mm256_mul_ps (lut_1_A, transform_z); + __m256 z_B = _mm256_mul_ps (lut_1_B, transform_z); + __m256 z_C = _mm256_mul_ps (lut_1_C, transform_z); + __m256 z_D = _mm256_mul_ps (lut_1_D, transform_z); + + // accumulate + z_A = _mm256_hadd_ps (z_A, z_B); + z_C = _mm256_hadd_ps (z_C, z_D); + z_A = _mm256_hadd_ps (z_A, z_C); + + // clamp + x_A = _mm256_min_ps(x_A, const_65535); + x_A = _mm256_max_ps(x_A, const_0); + y_A = _mm256_min_ps(y_A, const_65535); + y_A = _mm256_max_ps(y_A, const_0); + z_A = _mm256_min_ps(z_A, const_65535); + z_A = _mm256_max_ps(z_A, const_0); + + // round to int + __m256i lut_x = _mm256_cvtps_epi32(_mm256_floor_ps(x_A)); + __m256i lut_y = _mm256_cvtps_epi32(_mm256_floor_ps(y_A)); + __m256i lut_z = _mm256_cvtps_epi32(_mm256_floor_ps(z_A)); + + // out gamma LUT + __m256i lut_x_out = _mm256_i32gather_epi32 (lut_out_int, lut_x, 4); + __m256i lut_y_out = _mm256_i32gather_epi32 (lut_out_int, lut_y, 4); + __m256i lut_z_out = _mm256_i32gather_epi32 (lut_out_int, lut_z, 4); + + //int* res = (int *) &lut_x_out; + //printf("result 1 = %d\n", res[0]); + + // shuffle + lut_x_out = _mm256_permutevar8x32_epi32 (lut_x_out, shuffle); + lut_y_out = _mm256_permutevar8x32_epi32 (lut_y_out, shuffle); + lut_z_out = _mm256_permutevar8x32_epi32 (lut_z_out, shuffle); + + // write to memory + _mm256_store_si256 (xyz_x_sse, lut_x_out); + _mm256_store_si256 (xyz_y_sse, lut_y_out); + _mm256_store_si256 (xyz_z_sse, lut_z_out); + xyz_x_sse++; + xyz_y_sse++; + xyz_z_sse++; + +#if 0 /* In gamma LUT (converting 16-bit to 12-bit) */ s.r = lut_in[*p++ >> 4]; s.g = lut_in[*p++ >> 4]; s.b = lut_in[*p++ >> 4]; + p++; /* RGB to XYZ, Bradford transform and DCI companding */ d.x = s.r * fast_matrix[0] + s.g * fast_matrix[1] + s.b * fast_matrix[2]; d.y = s.r * fast_matrix[3] + s.g * fast_matrix[4] + s.b * fast_matrix[5]; d.z = s.r * fast_matrix[6] + s.g * fast_matrix[7] + s.b * fast_matrix[8]; + printf("actual x %.2f\n", d.x); + /* Clamp */ if (d.x < 0 || d.y < 0 || d.z < 0 || d.x > 65535 || d.y > 65535 || d.z > 65535) { @@ -336,9 +498,13 @@ dcp::rgb_to_xyz ( d.z = min (65535.0, d.z); /* Out gamma LUT */ + // 4095 should be in the LUT and LUT should be integer *xyz_x++ = lrint (lut_out[lrint(d.x)] * 4095); *xyz_y++ = lrint (lut_out[lrint(d.y)] * 4095); *xyz_z++ = lrint (lut_out[lrint(d.z)] * 4095); + + printf("c calc %d\n", xyz_x[-1]); +#endif } } @@ -346,5 +512,16 @@ dcp::rgb_to_xyz ( note.get() (DCP_NOTE, String::compose ("%1 XYZ value(s) clamped", clamped)); } + delete[] lut_in_float; + delete[] lut_out_int; + + memcpy (xyz->data(0), xyz_x, size.width * size.height * 4); + memcpy (xyz->data(1), xyz_y, size.width * size.height * 4); + memcpy (xyz->data(2), xyz_z, size.width * size.height * 4); + + free (xyz_x); + free (xyz_y); + free (xyz_z); + return xyz; } diff --git a/test/rgb_xyz_test.cc b/test/rgb_xyz_test.cc index 5dcfe673..18b7b5df 100644 --- a/test/rgb_xyz_test.cc +++ b/test/rgb_xyz_test.cc @@ -52,9 +52,9 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test) srand (0); dcp::Size const size (640, 480); - scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]); + scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]); for (int y = 0; y < size.height; ++y) { - uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6); + uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8); for (int x = 0; x < size.width; ++x) { /* Write a 12-bit random number for each component */ for (int c = 0; c < 3; ++c) { @@ -64,15 +64,16 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test) } } - shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 6, dcp::ColourConversion::srgb_to_xyz ()); + shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 8, dcp::ColourConversion::srgb_to_xyz()); for (int y = 0; y < size.height; ++y) { - uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6); + uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8); for (int x = 0; x < size.width; ++x) { double cr = *p++ / 65535.0; double cg = *p++ / 65535.0; double cb = *p++ / 65535.0; + p++; /* Input gamma */ @@ -112,9 +113,11 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test) cy = pow (cy, 1 / 2.6); cz = pow (cz, 1 / 2.6); - BOOST_REQUIRE_CLOSE (cx * 4095, xyz->data(0)[y * size.width + x], 1); - BOOST_REQUIRE_CLOSE (cy * 4095, xyz->data(1)[y * size.width + x], 1); - BOOST_REQUIRE_CLOSE (cz * 4095, xyz->data(2)[y * size.width + x], 1); + //std::cout << "x=" << x << " y=" << y << " " << (cx * 4095) << " " << xyz->data(0)[y * size.width + x] << "\n"; + + BOOST_REQUIRE_CLOSE (cx * 4095, xyz->data(0)[y * size.width + x], 3); + BOOST_REQUIRE_CLOSE (cy * 4095, xyz->data(1)[y * size.width + x], 3); + BOOST_REQUIRE_CLOSE (cz * 4095, xyz->data(2)[y * size.width + x], 3); } } } @@ -67,7 +67,7 @@ def options(opt): def configure(conf): conf.load('compiler_cxx') conf.load('clang_compilation_database', tooldir=['waf-tools']) - conf.env.append_value('CXXFLAGS', ['-Wall', '-Wextra', '-D_FILE_OFFSET_BITS=64', '-D__STDC_FORMAT_MACROS']) + conf.env.append_value('CXXFLAGS', ['-Wall', '-Wextra', '-D_FILE_OFFSET_BITS=64', '-D__STDC_FORMAT_MACROS', '-mavx2']) if conf.options.force_cpp11: conf.env.append_value('CXXFLAGS', ['-std=c++11', '-DBOOST_NO_CXX11_SCOPED_ENUMS']) gcc = conf.env['CC_VERSION'] |
