diff options
| author | Carl Hetherington <cth@carlh.net> | 2020-06-02 16:01:57 +0200 |
|---|---|---|
| committer | Carl Hetherington <cth@carlh.net> | 2020-06-02 16:01:57 +0200 |
| commit | 7c9ebaab68c80a14839e400cdd509b847357a794 (patch) | |
| tree | a539f1e944354c98ec4a9af35d163fe1edba5b78 | |
| parent | 6016e4b24205e11b1f006e43a6ed4b7f2bde732a (diff) | |
Hackzzz.sse
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/rgb_xyz.cc | 119 |
| -rw-r--r-- | src/rgb_xyz.h | 1 |
| -rw-r--r-- | test/rgb_xyz_test.cc | 14 |
3 files changed, 85 insertions, 49 deletions
diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc index 219dc675..b29dd637 100644 --- a/src/rgb_xyz.cc +++ b/src/rgb_xyz.cc @@ -284,56 +284,44 @@ dcp::rgb_to_xyz ( { shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size)); - struct { - double r, g, b; - } s; - - struct { - double x, y, z; - } d; + RGBPixel<double> rgb_pixel; + XYZPixel<double> xyz_pixel; double const * lut_in = conversion.in()->lut (12, false); - double const * lut_out = conversion.out()->lut (16, true); + int const * lut_out = conversion.out()->lut_int (16, true, 4095); /* This is is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */ double fast_matrix[9]; combined_rgb_to_xyz (conversion, fast_matrix); - int clamped = 0; int* xyz_x = xyz->data (0); int* xyz_y = xyz->data (1); int* xyz_z = xyz->data (2); for (int y = 0; y < size.height; ++y) { uint16_t const * p = reinterpret_cast<uint16_t const *> (rgb + y * stride); for (int x = 0; x < size.width; ++x) { + /* In gamma LUT (converting 16-bit to 12-bit) */ + rgb_pixel.r = lut_in[*rgb++ >> 4]; + rgb_pixel.g = lut_in[*rgb++ >> 4]; + rgb_pixel.b = lut_in[*rgb++ >> 4]; + + /* RGB to XYZ, Bradford transform and DCI companding */ + xyz_pixel.x = rgb_pixel.r * fast_matrix[0] + rgb_pixel.g * fast_matrix[1] + rgb_pixel.b * fast_matrix[2]; + xyz_pixel.y = rgb_pixel.r * fast_matrix[3] + rgb_pixel.g * fast_matrix[4] + rgb_pixel.b * fast_matrix[5]; + xyz_pixel.z = rgb_pixel.r * fast_matrix[6] + rgb_pixel.g * fast_matrix[7] + rgb_pixel.b * fast_matrix[8]; + + /* Clamp */ + xyz_pixel.x = max (0.0, xyz_pixel.x); + xyz_pixel.y = max (0.0, xyz_pixel.y); + xyz_pixel.z = max (0.0, xyz_pixel.z); + xyz_pixel.x = min (65535.0, xyz_pixel.x); + xyz_pixel.y = min (65535.0, xyz_pixel.y); + xyz_pixel.z = min (65535.0, xyz_pixel.z); - /* In gamma LUT (converting 16-bit to 12-bit) */ - s.r = lut_in[*p++ >> 4]; - s.g = lut_in[*p++ >> 4]; - s.b = lut_in[*p++ >> 4]; - - /* RGB to XYZ, Bradford transform and DCI companding */ - d.x = s.r * 
fast_matrix[0] + s.g * fast_matrix[1] + s.b * fast_matrix[2]; - d.y = s.r * fast_matrix[3] + s.g * fast_matrix[4] + s.b * fast_matrix[5]; - d.z = s.r * fast_matrix[6] + s.g * fast_matrix[7] + s.b * fast_matrix[8]; - - /* Clamp */ - - if (d.x < 0 || d.y < 0 || d.z < 0 || d.x > 65535 || d.y > 65535 || d.z > 65535) { - ++clamped; - } - - d.x = max (0.0, d.x); - d.y = max (0.0, d.y); - d.z = max (0.0, d.z); - d.x = min (65535.0, d.x); - d.y = min (65535.0, d.y); - d.z = min (65535.0, d.z); - - /* Out gamma LUT */ - *xyz_x++ = lrint (lut_out[lrint(d.x)] * 4095); - *xyz_y++ = lrint (lut_out[lrint(d.y)] * 4095); - *xyz_z++ = lrint (lut_out[lrint(d.z)] * 4095); + /* Out gamma LUT */ + *xyz_x++ = lut_out[lrint(xyz_pixel.x)]; + *xyz_y++ = lut_out[lrint(xyz_pixel.y)]; + *xyz_z++ = lut_out[lrint(xyz_pixel.z)]; } } @@ -341,24 +329,31 @@ dcp::rgb_to_xyz ( } -/** @param rgb RGBA data; packed RGBA 16:16:16:16, 48bpp, 16R, 16G, 16B, 16A +/** @param rgb RGBA data; packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A * with the 2-byte value for each R/G/B/A component stored as * little-endian; i.e. AV_PIX_FMT_RGB48LE. A is ignored but necessary for - * SSE/AVX efficiency. + * SSE/AVX efficiency. The stride must be equal to the width of the image (i.e. + * no padding bytes). * * @param size size of RGB image in pixels. - * @param size stride of RGB data in pixels. 
*/ shared_ptr<dcp::OpenJPEGImage> dcp::rgb_to_xyz_avx2 ( uint8_t const * rgba, dcp::Size size, - int stride, ColourConversion const & conversion ) { shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size)); + struct { + float r, g, b; + } s; + + struct { + float x, y, z; + } d; + float const * lut_in = conversion.in()->lut_float (12, false); int const * lut_out = conversion.out()->lut_int (16, true, 4095); @@ -385,10 +380,18 @@ dcp::rgb_to_xyz_avx2 ( 0, fast_matrix[8], fast_matrix[7], fast_matrix[6], 0, fast_matrix[8], fast_matrix[7], fast_matrix[6] ); + int pixel_count = 0; + for (int y = 0; y < size.height; ++y) { - __m128i const * p = reinterpret_cast<__m128i const *> (rgba + y * stride); + uint16_t const* rgba_p = reinterpret_cast<uint16_t const *>(rgba + y * size.width * 8); + for (int x = 0; x < pixel_count % 8; ++x) { + rgb_to_xyz_pixel (rgba_p, lut_in, fast_matrix, lut_out, xyz_x, xyz_y, xyz_z, rgb_pixel, xyz_pixel); + } + + __m128i const * p = reinterpret_cast<__m128i const *> (rgba + y * size.width * 8); + DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16)); - for (int x = 0; x < size.width / 8; ++x) { + for (int x = 0; x < sse_width; ++x) { // 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words __m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p + 0)); @@ -478,6 +481,38 @@ dcp::rgb_to_xyz_avx2 ( xyz_y++; xyz_z++; } + + uint16_t const * p_extra = reinterpret_cast<uint16_t const *> (p); + int* xyz_x_extra = reinterpret_cast<int *> (xyz_x); + int* xyz_y_extra = reinterpret_cast<int *> (xyz_y); + int* xyz_z_extra = reinterpret_cast<int *> (xyz_z); + + std::cout << "doing " << extra_width << " more.\n"; + for (int x = 0; x < extra_width; ++x) { + + /* In gamma LUT (converting 16-bit to 12-bit) */ + s.r = lut_in[*p_extra++ >> 4]; + s.g = lut_in[*p_extra++ >> 4]; + s.b = lut_in[*p_extra++ >> 4]; + + /* RGB to XYZ, Bradford transform and DCI companding */ + d.x = s.r * fast_matrix[0] + s.g * fast_matrix[1] + s.b * 
fast_matrix[2]; + d.y = s.r * fast_matrix[3] + s.g * fast_matrix[4] + s.b * fast_matrix[5]; + d.z = s.r * fast_matrix[6] + s.g * fast_matrix[7] + s.b * fast_matrix[8]; + + /* Clamp */ + d.x = max (0.0f, d.x); + d.y = max (0.0f, d.y); + d.z = max (0.0f, d.z); + d.x = min (65535.0f, d.x); + d.y = min (65535.0f, d.y); + d.z = min (65535.0f, d.z); + + /* Out gamma LUT */ + *xyz_x_extra++ = lut_out[lrint(d.x)]; + *xyz_y_extra++ = lut_out[lrint(d.y)]; + *xyz_z_extra++ = lut_out[lrint(d.z)]; + } } return xyz; diff --git a/src/rgb_xyz.h b/src/rgb_xyz.h index 59b2400d..ef4f2ac1 100644 --- a/src/rgb_xyz.h +++ b/src/rgb_xyz.h @@ -69,7 +69,6 @@ extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz ( extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz_avx2 ( uint8_t const * rgba, dcp::Size size, - int stride, ColourConversion const & conversion ); diff --git a/test/rgb_xyz_test.cc b/test/rgb_xyz_test.cc index d9c27ae2..8a5ad205 100644 --- a/test/rgb_xyz_test.cc +++ b/test/rgb_xyz_test.cc @@ -51,7 +51,7 @@ using boost::scoped_array; BOOST_AUTO_TEST_CASE (rgb_xyz_test) { srand (0); - dcp::Size const size (640, 480); + dcp::Size const size (647, 485); scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]); for (int y = 0; y < size.height; ++y) { @@ -125,11 +125,13 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test) BOOST_AUTO_TEST_CASE (rgb_xyz_test_avx2) { srand (0); - dcp::Size const size (640, 480); + dcp::Size const size (647, 485); + + int stride = (size.width + (16 - size.width % 16)) * 8; - scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]); + scoped_array<uint8_t> rgb (new uint8_t[size.height * stride]); for (int y = 0; y < size.height; ++y) { - uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8); + uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * stride); for (int x = 0; x < size.width; ++x) { /* Write a 12-bit random number for each component */ for (int c = 0; c < 3; ++c) { @@ -139,10 +141,10 @@ BOOST_AUTO_TEST_CASE 
(rgb_xyz_test_avx2) } } - shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, size.width * 8, dcp::ColourConversion::srgb_to_xyz()); + shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, stride, dcp::ColourConversion::srgb_to_xyz()); for (int y = 0; y < size.height; ++y) { - uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8); + uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * stride); for (int x = 0; x < size.width; ++x) { double cr = *p++ / 65535.0; |
