summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--benchmark/rgb_to_xyz_avx2.cc69
-rw-r--r--benchmark/wscript2
-rw-r--r--src/rgb_xyz.cc187
-rw-r--r--src/rgb_xyz.h8
-rw-r--r--test/rgb_xyz_test.cc108
5 files changed, 346 insertions, 28 deletions
diff --git a/benchmark/rgb_to_xyz_avx2.cc b/benchmark/rgb_to_xyz_avx2.cc
new file mode 100644
index 00000000..65abe2ba
--- /dev/null
+++ b/benchmark/rgb_to_xyz_avx2.cc
@@ -0,0 +1,69 @@
+/*
+ Copyright (C) 2020 Carl Hetherington <cth@carlh.net>
+
+ This file is part of libdcp.
+
+ libdcp is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ libdcp is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libdcp. If not, see <http://www.gnu.org/licenses/>.
+
+ In addition, as a special exception, the copyright holders give
+ permission to link the code of portions of this program with the
+ OpenSSL library under certain conditions as described in each
+ individual source file, and distribute linked combinations
+ including the two.
+
+ You must obey the GNU General Public License in all respects
+ for all of the code used other than OpenSSL. If you modify
+ file(s) with this exception, you may extend this exception to your
+ version of the file(s), but you are not obligated to do so. If you
+ do not wish to do so, delete this exception statement from your
+ version. If you delete this exception statement from all source
+ files in the program, then also delete it here.
+*/
+
+#include "openjpeg_image.h"
+#include "rgb_xyz.h"
+#include "colour_conversion.h"
+#include <boost/scoped_array.hpp>
+#include <stdint.h>
+
+using boost::scoped_array;
+using boost::shared_ptr;
+
+int const trials = 256;
+
+int
+main ()
+{
+ srand (1);
+
+ dcp::Size size(1998, 1080);
+
+ scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]);
+ for (int y = 0; y < size.height; ++y) {
+ uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
+ for (int x = 0; x < size.width; ++x) {
+ for (int c = 0; c < 4; ++c) {
+ *p = (rand() & 0xfff) << 4;
+ ++p;
+ }
+ }
+ }
+
+ shared_ptr<dcp::OpenJPEGImage> xyz;
+ for (int i = 0; i < trials; ++i) {
+ xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, dcp::ColourConversion::srgb_to_xyz());
+ }
+}
+
+
diff --git a/benchmark/wscript b/benchmark/wscript
index 3e748972..0b6994b9 100644
--- a/benchmark/wscript
+++ b/benchmark/wscript
@@ -32,7 +32,7 @@
#
def build(bld):
- for p in ['rgb_to_xyz']:
+ for p in ['rgb_to_xyz', 'rgb_to_xyz_avx2']:
obj = bld(features='cxx cxxprogram')
obj.name = p
obj.uselib = 'BOOST_FILESYSTEM'
diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc
index 36185324..fd398ae8 100644
--- a/src/rgb_xyz.cc
+++ b/src/rgb_xyz.cc
@@ -38,6 +38,7 @@
#include "dcp_assert.h"
#include "compose.hpp"
#include <cmath>
+#include <immintrin.h>
using std::min;
using std::max;
@@ -337,3 +338,189 @@ dcp::rgb_to_xyz (
return xyz;
}
+
+
+/** @param rgb RGB data arranged as 64 bits per pixel: 16 red, 16 green, 16 blue, 16 dummy (ignored).
+ * Each 2-byte value is little endian; this is effectively AV_PIX_FMT_RGBA64LE with the A
+ * being ignored. There must be no padding bytes between lines of the image (i.e.
+ * stride must equal width). The pointer must be aligned to a 16-byte boundary.
+ *
+ * @param size size of RGB image in pixels.
+ */
+shared_ptr<dcp::OpenJPEGImage>
+dcp::rgb_to_xyz_avx2 (
+ uint8_t const * rgb,
+ dcp::Size size,
+ ColourConversion const & conversion
+ )
+{
+ /* There must be no padding in the RGB as there *can* be no padding in the XYZ output
+ * and we have to keep the RGB and XYZ data pointers similarly aligned.
+ */
+
+ shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size));
+
+ float const * lut_in = conversion.in()->lut_float (12, false);
+ int const * lut_out = conversion.out()->lut_int (16, true, 4095);
+
+ /* This is is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */
+ double transform[9];
+ combined_rgb_to_xyz (conversion, transform);
+
+ __m256i* xyz_x = reinterpret_cast<__m256i*>(xyz->data(0));
+ DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_x) % 32));
+ __m256i* xyz_y = reinterpret_cast<__m256i*>(xyz->data(1));
+ DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_y) % 32));
+ __m256i* xyz_z = reinterpret_cast<__m256i*>(xyz->data(2));
+ DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_z) % 32));
+
+ __m256 transform_x = _mm256_set_ps (
+ 0, transform[2], transform[1], transform[0], 0, transform[2], transform[1], transform[0]
+ );
+
+ __m256 transform_y = _mm256_set_ps (
+ 0, transform[5], transform[4], transform[3], 0, transform[5], transform[4], transform[3]
+ );
+
+ __m256 transform_z = _mm256_set_ps (
+ 0, transform[8], transform[7], transform[6], 0, transform[8], transform[7], transform[6]
+ );
+
+ int const pixels = size.width * size.height;
+ int const fast_loops = pixels / 8;
+ int const slow_loops = pixels - fast_loops * 8;
+
+ __m128i const * p = reinterpret_cast<__m128i const *> (rgb);
+ DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16));
+
+ for (int i = 0; i < fast_loops; ++i) {
+ // 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words
+ __m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p + 0));
+ __m256i rgb_B = _mm256_cvtepu16_epi32(_mm_load_si128(p + 1));
+ __m256i rgb_C = _mm256_cvtepu16_epi32(_mm_load_si128(p + 2));
+ __m256i rgb_D = _mm256_cvtepu16_epi32(_mm_load_si128(p + 3));
+ p += 4;
+
+ // shift right to truncate 16-bit inputs to 12-bit
+ rgb_A = _mm256_srli_epi32 (rgb_A, 4);
+ rgb_B = _mm256_srli_epi32 (rgb_B, 4);
+ rgb_C = _mm256_srli_epi32 (rgb_C, 4);
+ rgb_D = _mm256_srli_epi32 (rgb_D, 4);
+
+ // input gamma LUT
+ __m256 lut_1_A = _mm256_i32gather_ps (lut_in, rgb_A, 4);
+ __m256 lut_1_B = _mm256_i32gather_ps (lut_in, rgb_B, 4);
+ __m256 lut_1_C = _mm256_i32gather_ps (lut_in, rgb_C, 4);
+ __m256 lut_1_D = _mm256_i32gather_ps (lut_in, rgb_D, 4);
+
+ // multiply for X
+ __m256 x_A = _mm256_mul_ps (lut_1_A, transform_x);
+ __m256 x_B = _mm256_mul_ps (lut_1_B, transform_x);
+ __m256 x_C = _mm256_mul_ps (lut_1_C, transform_x);
+ __m256 x_D = _mm256_mul_ps (lut_1_D, transform_x);
+
+ // accumulate
+
+ // B7 B6 B5 B4 B3 B2 B1 B0 + A7 A6 A5 A4 A3 A2 A1 A0
+ // = B67 B45 A67 A45 B23 B01 A23 A01
+ x_A = _mm256_hadd_ps (x_A, x_B);
+
+ // D7 D6 D5 D4 D3 D2 D1 D0 + C7 C6 C5 C4 C3 C2 C1 C0
+ // = D67 D45 C67 C45 D23 D01 C23 C01
+ x_C = _mm256_hadd_ps (x_C, x_D);
+
+ // D67 D45 C67 C45 D23 D01 C23 C01 + B67 B45 A67 A45 B23 B01 A23 A01
+ // = D47 C47 B47 A47 D03 C03 B03 A03
+ x_A = _mm256_hadd_ps (x_A, x_C);
+
+ // same for Y
+ __m256 y_A = _mm256_mul_ps (lut_1_A, transform_y);
+ __m256 y_B = _mm256_mul_ps (lut_1_B, transform_y);
+ __m256 y_C = _mm256_mul_ps (lut_1_C, transform_y);
+ __m256 y_D = _mm256_mul_ps (lut_1_D, transform_y);
+ y_A = _mm256_hadd_ps (y_A, y_B);
+ y_C = _mm256_hadd_ps (y_C, y_D);
+ y_A = _mm256_hadd_ps (y_A, y_C);
+
+ // and for Z
+ __m256 z_A = _mm256_mul_ps (lut_1_A, transform_z);
+ __m256 z_B = _mm256_mul_ps (lut_1_B, transform_z);
+ __m256 z_C = _mm256_mul_ps (lut_1_C, transform_z);
+ __m256 z_D = _mm256_mul_ps (lut_1_D, transform_z);
+ z_A = _mm256_hadd_ps (z_A, z_B);
+ z_C = _mm256_hadd_ps (z_C, z_D);
+ z_A = _mm256_hadd_ps (z_A, z_C);
+
+ // clamp
+ x_A = _mm256_min_ps(x_A, _mm256_set1_ps(65535.0));
+ x_A = _mm256_max_ps(x_A, _mm256_set1_ps(0.0));
+ y_A = _mm256_min_ps(y_A, _mm256_set1_ps(65535.0));
+ y_A = _mm256_max_ps(y_A, _mm256_set1_ps(0.0));
+ z_A = _mm256_min_ps(z_A, _mm256_set1_ps(65535.0));
+ z_A = _mm256_max_ps(z_A, _mm256_set1_ps(0));
+
+ // round to int
+ __m256i lut_2_x = _mm256_cvtps_epi32(_mm256_floor_ps(x_A));
+ __m256i lut_2_y = _mm256_cvtps_epi32(_mm256_floor_ps(y_A));
+ __m256i lut_2_z = _mm256_cvtps_epi32(_mm256_floor_ps(z_A));
+
+ // out gamma LUT
+ lut_2_x = _mm256_i32gather_epi32 (lut_out, lut_2_x, 4);
+ lut_2_y = _mm256_i32gather_epi32 (lut_out, lut_2_y, 4);
+ lut_2_z = _mm256_i32gather_epi32 (lut_out, lut_2_z, 4);
+
+ // shuffle
+ lut_2_x = _mm256_permutevar8x32_epi32 (lut_2_x, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+ lut_2_y = _mm256_permutevar8x32_epi32 (lut_2_y, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+ lut_2_z = _mm256_permutevar8x32_epi32 (lut_2_z, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+
+ // write to memory
+ _mm256_store_si256 (xyz_x, lut_2_x);
+ _mm256_store_si256 (xyz_y, lut_2_y);
+ _mm256_store_si256 (xyz_z, lut_2_z);
+ xyz_x++;
+ xyz_y++;
+ xyz_z++;
+ }
+
+ struct {
+ float r, g, b;
+ } s;
+
+ struct {
+ float x, y, z;
+ } d;
+
+ uint16_t const * p_slow = reinterpret_cast<uint16_t const *> (p);
+ int* xyz_x_slow = reinterpret_cast<int*> (xyz_x);
+ int* xyz_y_slow = reinterpret_cast<int*> (xyz_y);
+ int* xyz_z_slow = reinterpret_cast<int*> (xyz_z);
+
+ for (int i = 0; i < slow_loops; ++i) {
+ /* In gamma LUT (converting 16-bit to 12-bit) */
+ s.r = lut_in[*p_slow++ >> 4];
+ s.g = lut_in[*p_slow++ >> 4];
+ s.b = lut_in[*p_slow++ >> 4];
+ p_slow++;
+
+ /* RGB to XYZ, Bradford transform and DCI companding */
+ d.x = s.r * transform[0] + s.g * transform[1] + s.b * transform[2];
+ d.y = s.r * transform[3] + s.g * transform[4] + s.b * transform[5];
+ d.z = s.r * transform[6] + s.g * transform[7] + s.b * transform[8];
+
+ /* Clamp */
+ d.x = max (0.0f, d.x);
+ d.y = max (0.0f, d.y);
+ d.z = max (0.0f, d.z);
+ d.x = min (65535.0f, d.x);
+ d.y = min (65535.0f, d.y);
+ d.z = min (65535.0f, d.z);
+
+ /* Out gamma LUT */
+ *xyz_x_slow++ = lut_out[lrint(d.x)];
+ *xyz_y_slow++ = lut_out[lrint(d.y)];
+ *xyz_z_slow++ = lut_out[lrint(d.z)];
+ }
+
+ return xyz;
+}
diff --git a/src/rgb_xyz.h b/src/rgb_xyz.h
index e414d07e..1012c424 100644
--- a/src/rgb_xyz.h
+++ b/src/rgb_xyz.h
@@ -64,6 +64,14 @@ extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz (
ColourConversion const & conversion
);
+
+extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz_avx2 (
+ uint8_t const * rgb,
+ dcp::Size size,
+ ColourConversion const & conversion
+ );
+
+
extern void combined_rgb_to_xyz (ColourConversion const & conversion, double* matrix);
}
diff --git a/test/rgb_xyz_test.cc b/test/rgb_xyz_test.cc
index 5dcfe673..961edc95 100644
--- a/test/rgb_xyz_test.cc
+++ b/test/rgb_xyz_test.cc
@@ -45,25 +45,46 @@ using std::cout;
using boost::shared_ptr;
using boost::optional;
using boost::scoped_array;
+using boost::shared_array;
-/** Convert a test image from sRGB to XYZ and check that the transforms are right */
-BOOST_AUTO_TEST_CASE (rgb_xyz_test)
+template <class T>
+void
+input_gamma (T& v)
+{
+ if (v < 0.04045) {
+ v /= 12.92;
+ } else {
+ v = pow ((v + 0.055) / 1.055, 2.4);
+ }
+}
+
+
+shared_array<uint8_t>
+random_data (dcp::Size size, int components)
{
srand (0);
- dcp::Size const size (640, 480);
+ shared_array<uint8_t> data (new uint8_t[size.width * size.height * components * 2]);
- scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]);
for (int y = 0; y < size.height; ++y) {
- uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6);
+ uint16_t* p = reinterpret_cast<uint16_t*> (data.get() + y * size.width * components * 2);
for (int x = 0; x < size.width; ++x) {
/* Write a 12-bit random number for each component */
- for (int c = 0; c < 3; ++c) {
- *p = (rand () & 0xfff) << 4;
- ++p;
+ for (int c = 0; c < components; ++c) {
+ *p++ = (rand () & 0xfff) << 4;
}
}
}
+ return data;
+}
+
+
+/** Convert a test image from sRGB to XYZ and check that the transforms are right */
+BOOST_AUTO_TEST_CASE (rgb_xyz_test)
+{
+ dcp::Size const size (640, 480);
+
+ shared_array<uint8_t> rgb = random_data (size, 3);
shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 6, dcp::ColourConversion::srgb_to_xyz ());
for (int y = 0; y < size.height; ++y) {
@@ -74,25 +95,9 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
double cg = *p++ / 65535.0;
double cb = *p++ / 65535.0;
- /* Input gamma */
-
- if (cr < 0.04045) {
- cr /= 12.92;
- } else {
- cr = pow ((cr + 0.055) / 1.055, 2.4);
- }
-
- if (cg < 0.04045) {
- cg /= 12.92;
- } else {
- cg = pow ((cg + 0.055) / 1.055, 2.4);
- }
-
- if (cb < 0.04045) {
- cb /= 12.92;
- } else {
- cb = pow ((cb + 0.055) / 1.055, 2.4);
- }
+ input_gamma (cr);
+ input_gamma (cg);
+ input_gamma (cb);
/* Matrix */
@@ -119,6 +124,55 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
}
}
+
+/** Convert a test image from sRGB to XYZ using the SSE/AVX code and check that the transforms are right */
+BOOST_AUTO_TEST_CASE (rgb_xyz_test_avx2)
+{
+ srand (0);
+ dcp::Size const size (647, 481);
+
+ shared_array<uint8_t> rgb = random_data (size, 4);
+ shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, dcp::ColourConversion::srgb_to_xyz());
+
+ uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get());
+ for (int y = 0; y < size.height; ++y) {
+ for (int x = 0; x < size.width; ++x) {
+
+ float cr = *p++ / 65535.0;
+ float cg = *p++ / 65535.0;
+ float cb = *p++ / 65535.0;
+ p++;
+
+ input_gamma (cr);
+ input_gamma (cg);
+ input_gamma (cb);
+
+ /* Matrix */
+
+ float cx = cr * 0.4124564 + cg * 0.3575761 + cb * 0.1804375;
+ float cy = cr * 0.2126729 + cg * 0.7151522 + cb * 0.0721750;
+ float cz = cr * 0.0193339 + cg * 0.1191920 + cb * 0.9503041;
+
+ /* Compand */
+
+ cx *= 48 / 52.37;
+ cy *= 48 / 52.37;
+ cz *= 48 / 52.37;
+
+ /* Output gamma */
+
+ cx = pow (cx, 1 / 2.6);
+ cy = pow (cy, 1 / 2.6);
+ cz = pow (cz, 1 / 2.6);
+
+ BOOST_REQUIRE_CLOSE (cx * 4095, xyz->data(0)[y * size.width + x], 3);
+ BOOST_REQUIRE_CLOSE (cy * 4095, xyz->data(1)[y * size.width + x], 3);
+ BOOST_REQUIRE_CLOSE (cz * 4095, xyz->data(2)[y * size.width + x], 3);
+ }
+ }
+}
+
+
static list<string> notes;
static void