diff options
| author | Carl Hetherington <cth@carlh.net> | 2020-06-02 19:47:30 +0200 |
|---|---|---|
| committer | Carl Hetherington <cth@carlh.net> | 2020-06-02 19:47:30 +0200 |
| commit | 6636b550a4b69bf3d77b51aa033159036d88f33b (patch) | |
| tree | 740ed4b2615cfbeeeb29c0ecf458400ea8f3b43a | |
| parent | 5f2db276b59257b05e351c1ca5665de9cc3f78f4 (diff) | |
Add SSE/AVX2 version of rgb_to_xyz().sse2
| -rw-r--r-- | benchmark/rgb_to_xyz_avx2.cc | 69 | ||||
| -rw-r--r-- | benchmark/wscript | 2 | ||||
| -rw-r--r-- | src/rgb_xyz.cc | 187 | ||||
| -rw-r--r-- | src/rgb_xyz.h | 8 | ||||
| -rw-r--r-- | test/rgb_xyz_test.cc | 108 |
5 files changed, 346 insertions, 28 deletions
diff --git a/benchmark/rgb_to_xyz_avx2.cc b/benchmark/rgb_to_xyz_avx2.cc new file mode 100644 index 00000000..65abe2ba --- /dev/null +++ b/benchmark/rgb_to_xyz_avx2.cc @@ -0,0 +1,69 @@ +/* + Copyright (C) 2020 Carl Hetherington <cth@carlh.net> + + This file is part of libdcp. + + libdcp is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + libdcp is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libdcp. If not, see <http://www.gnu.org/licenses/>. + + In addition, as a special exception, the copyright holders give + permission to link the code of portions of this program with the + OpenSSL library under certain conditions as described in each + individual source file, and distribute linked combinations + including the two. + + You must obey the GNU General Public License in all respects + for all of the code used other than OpenSSL. If you modify + file(s) with this exception, you may extend this exception to your + version of the file(s), but you are not obligated to do so. If you + do not wish to do so, delete this exception statement from your + version. If you delete this exception statement from all source + files in the program, then also delete it here. +*/ + +#include "openjpeg_image.h" +#include "rgb_xyz.h" +#include "colour_conversion.h" +#include <boost/scoped_array.hpp> +#include <stdint.h> + +using boost::scoped_array; +using boost::shared_ptr; + +int const trials = 256; + +int +main () +{ + srand (1); + + dcp::Size size(1998, 1080); + + scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]); + for (int y = 0; y < size.height; ++y) { + uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8); + for (int x = 0; x < size.width; ++x) { + for (int c = 0; c < 4; ++c) { + *p = (rand() & 0xfff) << 4; + ++p; + } + } + } + + shared_ptr<dcp::OpenJPEGImage> xyz; + for (int i = 0; i < trials; ++i) { + xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, dcp::ColourConversion::srgb_to_xyz()); + } +} + + diff --git a/benchmark/wscript b/benchmark/wscript index 3e748972..0b6994b9 100644 --- a/benchmark/wscript +++ b/benchmark/wscript @@ -32,7 +32,7 @@ # def build(bld): - for p in ['rgb_to_xyz']: + for p in ['rgb_to_xyz', 'rgb_to_xyz_avx2']: obj = bld(features='cxx cxxprogram') obj.name = p obj.uselib = 'BOOST_FILESYSTEM' diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc index 36185324..fd398ae8 100644 --- a/src/rgb_xyz.cc +++ b/src/rgb_xyz.cc @@ -38,6 +38,7 @@ #include "dcp_assert.h" #include "compose.hpp" #include <cmath> +#include <immintrin.h> using std::min; using std::max; @@ -337,3 +338,189 @@ dcp::rgb_to_xyz ( return xyz; } + + +/** @param rgb RGB data arranged as 64 bits per pixel: 16 red, 16 green, 16 blue, 16 dummy (ignored). + * Each 2-byte value is little endian; this is effectively AV_PIX_FMT_RGBA64LE with the A + * being ignored. There must be no padding bytes between lines of the image (i.e. + * stride must equal width). The pointer must be aligned to a 16-byte boundary. + * + * @param size size of RGB image in pixels. + */ +shared_ptr<dcp::OpenJPEGImage> +dcp::rgb_to_xyz_avx2 ( + uint8_t const * rgb, + dcp::Size size, + ColourConversion const & conversion + ) +{ + /* There must be no padding in the RGB as there *can* be no padding in the XYZ output + * and we have to keep the RGB and XYZ data pointers similarly aligned. + */ + + shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size)); + + float const * lut_in = conversion.in()->lut_float (12, false); + int const * lut_out = conversion.out()->lut_int (16, true, 4095); + + /* This is is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */ + double transform[9]; + combined_rgb_to_xyz (conversion, transform); + + __m256i* xyz_x = reinterpret_cast<__m256i*>(xyz->data(0)); + DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_x) % 32)); + __m256i* xyz_y = reinterpret_cast<__m256i*>(xyz->data(1)); + DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_y) % 32)); + __m256i* xyz_z = reinterpret_cast<__m256i*>(xyz->data(2)); + DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_z) % 32)); + + __m256 transform_x = _mm256_set_ps ( + 0, transform[2], transform[1], transform[0], 0, transform[2], transform[1], transform[0] + ); + + __m256 transform_y = _mm256_set_ps ( + 0, transform[5], transform[4], transform[3], 0, transform[5], transform[4], transform[3] + ); + + __m256 transform_z = _mm256_set_ps ( + 0, transform[8], transform[7], transform[6], 0, transform[8], transform[7], transform[6] + ); + + int const pixels = size.width * size.height; + int const fast_loops = pixels / 8; + int const slow_loops = pixels - fast_loops * 8; + + __m128i const * p = reinterpret_cast<__m128i const *> (rgb); + DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16)); + + for (int i = 0; i < fast_loops; ++i) { + // 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words + __m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p + 0)); + __m256i rgb_B = _mm256_cvtepu16_epi32(_mm_load_si128(p + 1)); + __m256i rgb_C = _mm256_cvtepu16_epi32(_mm_load_si128(p + 2)); + __m256i rgb_D = _mm256_cvtepu16_epi32(_mm_load_si128(p + 3)); + p += 4; + + // shift right to truncate 16-bit inputs to 12-bit + rgb_A = _mm256_srli_epi32 (rgb_A, 4); + rgb_B = _mm256_srli_epi32 (rgb_B, 4); + rgb_C = _mm256_srli_epi32 (rgb_C, 4); + rgb_D = _mm256_srli_epi32 (rgb_D, 4); + + // input gamma LUT + __m256 lut_1_A = _mm256_i32gather_ps (lut_in, rgb_A, 4); + __m256 lut_1_B = _mm256_i32gather_ps (lut_in, rgb_B, 4); + __m256 lut_1_C = _mm256_i32gather_ps (lut_in, rgb_C, 4); + __m256 lut_1_D = _mm256_i32gather_ps (lut_in, rgb_D, 4); + + // multiply for X + __m256 x_A = _mm256_mul_ps (lut_1_A, transform_x); + __m256 x_B = _mm256_mul_ps (lut_1_B, transform_x); + __m256 x_C = _mm256_mul_ps (lut_1_C, transform_x); + __m256 x_D = _mm256_mul_ps (lut_1_D, transform_x); + + // accumulate + + // B7 B6 B5 B4 B3 B2 B1 B0 + A7 A6 A5 A4 A3 A2 A1 A0 + // = B67 B45 A67 A45 B23 B01 A23 A01 + x_A = _mm256_hadd_ps (x_A, x_B); + + // D7 D6 D5 D4 D3 D2 D1 D0 + C7 C6 C5 C4 C3 C2 C1 C0 + // = D67 D45 C67 C45 D23 D01 C23 C01 + x_C = _mm256_hadd_ps (x_C, x_D); + + // D67 D45 C67 C45 D23 D01 C23 C01 + B67 B45 A67 A45 B23 B01 A23 A01 + // = D47 C47 B47 A47 D03 C03 B03 A03 + x_A = _mm256_hadd_ps (x_A, x_C); + + // same for Y + __m256 y_A = _mm256_mul_ps (lut_1_A, transform_y); + __m256 y_B = _mm256_mul_ps (lut_1_B, transform_y); + __m256 y_C = _mm256_mul_ps (lut_1_C, transform_y); + __m256 y_D = _mm256_mul_ps (lut_1_D, transform_y); + y_A = _mm256_hadd_ps (y_A, y_B); + y_C = _mm256_hadd_ps (y_C, y_D); + y_A = _mm256_hadd_ps (y_A, y_C); + + // and for Z + __m256 z_A = _mm256_mul_ps (lut_1_A, transform_z); + __m256 z_B = _mm256_mul_ps (lut_1_B, transform_z); + __m256 z_C = _mm256_mul_ps (lut_1_C, transform_z); + __m256 z_D = _mm256_mul_ps (lut_1_D, transform_z); + z_A = _mm256_hadd_ps (z_A, z_B); + z_C = _mm256_hadd_ps (z_C, z_D); + z_A = _mm256_hadd_ps (z_A, z_C); + + // clamp + x_A = _mm256_min_ps(x_A, _mm256_set1_ps(65535.0)); + x_A = _mm256_max_ps(x_A, _mm256_set1_ps(0.0)); + y_A = _mm256_min_ps(y_A, _mm256_set1_ps(65535.0)); + y_A = _mm256_max_ps(y_A, _mm256_set1_ps(0.0)); + z_A = _mm256_min_ps(z_A, _mm256_set1_ps(65535.0)); + z_A = _mm256_max_ps(z_A, _mm256_set1_ps(0)); + + // round to int + __m256i lut_2_x = _mm256_cvtps_epi32(_mm256_floor_ps(x_A)); + __m256i lut_2_y = _mm256_cvtps_epi32(_mm256_floor_ps(y_A)); + __m256i lut_2_z = _mm256_cvtps_epi32(_mm256_floor_ps(z_A)); + + // out gamma LUT + lut_2_x = _mm256_i32gather_epi32 (lut_out, lut_2_x, 4); + lut_2_y = _mm256_i32gather_epi32 (lut_out, lut_2_y, 4); + lut_2_z = _mm256_i32gather_epi32 (lut_out, lut_2_z, 4); + + // shuffle + lut_2_x = _mm256_permutevar8x32_epi32 (lut_2_x, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + lut_2_y = _mm256_permutevar8x32_epi32 (lut_2_y, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + lut_2_z = _mm256_permutevar8x32_epi32 (lut_2_z, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)); + + // write to memory + _mm256_store_si256 (xyz_x, lut_2_x); + _mm256_store_si256 (xyz_y, lut_2_y); + _mm256_store_si256 (xyz_z, lut_2_z); + xyz_x++; + xyz_y++; + xyz_z++; + } + + struct { + float r, g, b; + } s; + + struct { + float x, y, z; + } d; + + uint16_t const * p_slow = reinterpret_cast<uint16_t const *> (p); + int* xyz_x_slow = reinterpret_cast<int*> (xyz_x); + int* xyz_y_slow = reinterpret_cast<int*> (xyz_y); + int* xyz_z_slow = reinterpret_cast<int*> (xyz_z); + + for (int i = 0; i < slow_loops; ++i) { + /* In gamma LUT (converting 16-bit to 12-bit) */ + s.r = lut_in[*p_slow++ >> 4]; + s.g = lut_in[*p_slow++ >> 4]; + s.b = lut_in[*p_slow++ >> 4]; + p_slow++; + + /* RGB to XYZ, Bradford transform and DCI companding */ + d.x = s.r * transform[0] + s.g * transform[1] + s.b * transform[2]; + d.y = s.r * transform[3] + s.g * transform[4] + s.b * transform[5]; + d.z = s.r * transform[6] + s.g * transform[7] + s.b * transform[8]; + + /* Clamp */ + d.x = max (0.0f, d.x); + d.y = max (0.0f, d.y); + d.z = max (0.0f, d.z); + d.x = min (65535.0f, d.x); + d.y = min (65535.0f, d.y); + d.z = min (65535.0f, d.z); + + /* Out gamma LUT */ + *xyz_x_slow++ = lut_out[lrint(d.x)]; + *xyz_y_slow++ = lut_out[lrint(d.y)]; + *xyz_z_slow++ = lut_out[lrint(d.z)]; + } + + return xyz; +} diff --git a/src/rgb_xyz.h b/src/rgb_xyz.h index e414d07e..1012c424 100644 --- a/src/rgb_xyz.h +++ b/src/rgb_xyz.h @@ -64,6 +64,14 @@ extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz ( ColourConversion const & conversion ); + +extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz_avx2 ( + uint8_t const * rgb, + dcp::Size size, + ColourConversion const & conversion + ); + + extern void combined_rgb_to_xyz (ColourConversion const & conversion, double* matrix); } diff --git a/test/rgb_xyz_test.cc b/test/rgb_xyz_test.cc index 5dcfe673..961edc95 100644 --- a/test/rgb_xyz_test.cc +++ b/test/rgb_xyz_test.cc @@ -45,25 +45,46 @@ using std::cout; using boost::shared_ptr; using boost::optional; using boost::scoped_array; +using boost::shared_array; -/** Convert a test image from sRGB to XYZ and check that the transforms are right */ -BOOST_AUTO_TEST_CASE (rgb_xyz_test) +template <class T> +void +input_gamma (T& v) +{ + if (v < 0.04045) { + v /= 12.92; + } else { + v = pow ((v + 0.055) / 1.055, 2.4); + } +} + + +shared_array<uint8_t> +random_data (dcp::Size size, int components) { srand (0); - dcp::Size const size (640, 480); + shared_array<uint8_t> data (new uint8_t[size.width * size.height * components * 2]); - scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]); for (int y = 0; y < size.height; ++y) { - uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6); + uint16_t* p = reinterpret_cast<uint16_t*> (data.get() + y * size.width * components * 2); for (int x = 0; x < size.width; ++x) { /* Write a 12-bit random number for each component */ - for (int c = 0; c < 3; ++c) { - *p = (rand () & 0xfff) << 4; - ++p; + for (int c = 0; c < components; ++c) { + *p++ = (rand () & 0xfff) << 4; } } } + return data; +} + + +/** Convert a test image from sRGB to XYZ and check that the transforms are right */ +BOOST_AUTO_TEST_CASE (rgb_xyz_test) +{ + dcp::Size const size (640, 480); + + shared_array<uint8_t> rgb = random_data (size, 3); shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 6, dcp::ColourConversion::srgb_to_xyz ()); for (int y = 0; y < size.height; ++y) { @@ -74,25 +95,9 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test) double cg = *p++ / 65535.0; double cb = *p++ / 65535.0; - /* Input gamma */ - - if (cr < 0.04045) { - cr /= 12.92; - } else { - cr = pow ((cr + 0.055) / 1.055, 2.4); - } - - if (cg < 0.04045) { - cg /= 12.92; - } else { - cg = pow ((cg + 0.055) / 1.055, 2.4); - } - - if (cb < 0.04045) { - cb /= 12.92; - } else { - cb = pow ((cb + 0.055) / 1.055, 2.4); - } + input_gamma (cr); + input_gamma (cg); + input_gamma (cb); /* Matrix */ @@ -119,6 +124,55 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test) } } + +/** Convert a test image from sRGB to XYZ using the SSE/AVX code and check that the transforms are right */ +BOOST_AUTO_TEST_CASE (rgb_xyz_test_avx2) +{ + srand (0); + dcp::Size const size (647, 481); + + shared_array<uint8_t> rgb = random_data (size, 4); + shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, dcp::ColourConversion::srgb_to_xyz()); + + uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get()); + for (int y = 0; y < size.height; ++y) { + for (int x = 0; x < size.width; ++x) { + + float cr = *p++ / 65535.0; + float cg = *p++ / 65535.0; + float cb = *p++ / 65535.0; + p++; + + input_gamma (cr); + input_gamma (cg); + input_gamma (cb); + + /* Matrix */ + + float cx = cr * 0.4124564 + cg * 0.3575761 + cb * 0.1804375; + float cy = cr * 0.2126729 + cg * 0.7151522 + cb * 0.0721750; + float cz = cr * 0.0193339 + cg * 0.1191920 + cb * 0.9503041; + + /* Compand */ + + cx *= 48 / 52.37; + cy *= 48 / 52.37; + cz *= 48 / 52.37; + + /* Output gamma */ + + cx = pow (cx, 1 / 2.6); + cy = pow (cy, 1 / 2.6); + cz = pow (cz, 1 / 2.6); + + BOOST_REQUIRE_CLOSE (cx * 4095, xyz->data(0)[y * size.width + x], 3); + BOOST_REQUIRE_CLOSE (cy * 4095, xyz->data(1)[y * size.width + x], 3); + BOOST_REQUIRE_CLOSE (cz * 4095, xyz->data(2)[y * size.width + x], 3); + } + } +} + + static list<string> notes; static void |
