Add SSE/AVX2 version of rgb_to_xyz().sse2

author: Carl Hetherington <cth@carlh.net> 2020-06-02 19:47:30 +0200
committer: Carl Hetherington <cth@carlh.net> 2020-06-02 19:47:30 +0200
commit: 6636b550a4b69bf3d77b51aa033159036d88f33b (patch)
tree: 740ed4b2615cfbeeeb29c0ecf458400ea8f3b43a
parent: 5f2db276b59257b05e351c1ca5665de9cc3f78f4 (diff)
5 files changed, 346 insertions, 28 deletions
diff --git a/benchmark/rgb_to_xyz_avx2.cc b/benchmark/rgb_to_xyz_avx2.cc
new file mode 100644
index 00000000..65abe2ba
--- /dev/null
+++ b/benchmark/rgb_to_xyz_avx2.cc
@@ -0,0 +1,69 @@
+/*
+    Copyright (C) 2020 Carl Hetherington <cth@carlh.net>
+
+    This file is part of libdcp.
+
+    libdcp is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    libdcp is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with libdcp.  If not, see <http://www.gnu.org/licenses/>.
+
+    In addition, as a special exception, the copyright holders give
+    permission to link the code of portions of this program with the
+    OpenSSL library under certain conditions as described in each
+    individual source file, and distribute linked combinations
+    including the two.
+
+    You must obey the GNU General Public License in all respects
+    for all of the code used other than OpenSSL.  If you modify
+    file(s) with this exception, you may extend this exception to your
+    version of the file(s), but you are not obligated to do so.  If you
+    do not wish to do so, delete this exception statement from your
+    version.  If you delete this exception statement from all source
+    files in the program, then also delete it here.
+*/
+
+#include "openjpeg_image.h"
+#include "rgb_xyz.h"
+#include "colour_conversion.h"
+#include <boost/scoped_array.hpp>
+#include <stdint.h>
+
+using boost::scoped_array;
+using boost::shared_ptr;
+
+int const trials = 256;
+
+int
+main ()
+{
+	srand (1);
+
+	dcp::Size size(1998, 1080);
+
+	scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]);
+	for (int y = 0; y < size.height; ++y) {
+		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
+		for (int x = 0; x < size.width; ++x) {
+			for (int c = 0; c < 4; ++c) {
+				*p = (rand() & 0xfff) << 4;
+				++p;
+			}
+		}
+	}
+
+	shared_ptr<dcp::OpenJPEGImage> xyz;
+	for (int i = 0; i < trials; ++i) {
+		xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, dcp::ColourConversion::srgb_to_xyz());
+	}
+}
+
+
diff --git a/benchmark/wscript b/benchmark/wscript
index 3e748972..0b6994b9 100644
--- a/benchmark/wscript
+++ b/benchmark/wscript
@@ -32,7 +32,7 @@
 #
 
 def build(bld):
-    for p in ['rgb_to_xyz']:
+    for p in ['rgb_to_xyz', 'rgb_to_xyz_avx2']:
         obj = bld(features='cxx cxxprogram')
         obj.name = p
         obj.uselib = 'BOOST_FILESYSTEM'
diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc
index 36185324..fd398ae8 100644
--- a/src/rgb_xyz.cc
+++ b/src/rgb_xyz.cc
@@ -38,6 +38,7 @@
 #include "dcp_assert.h"
 #include "compose.hpp"
 #include <cmath>
+#include <immintrin.h>
 
 using std::min;
 using std::max;
@@ -337,3 +338,189 @@ dcp::rgb_to_xyz (
 
 	return xyz;
 }
+
+
+/** @param rgb RGB data arranged as 64 bits per pixel: 16 red, 16 green, 16 blue, 16 dummy (ignored).
+ *  Each 2-byte value is little endian; this is effectively AV_PIX_FMT_RGBA64LE with the A
+ *  being ignored.  There must be no padding bytes between lines of the image (i.e.
+ *  stride must equal width).  The pointer must be aligned to a 16-byte boundary.
+ *
+ *  @param size size of RGB image in pixels.
+ */
+shared_ptr<dcp::OpenJPEGImage>
+dcp::rgb_to_xyz_avx2 (
+	uint8_t const * rgb,
+	dcp::Size size,
+	ColourConversion const & conversion
+	)
+{
+	/* There must be no padding in the RGB as there *can* be no padding in the XYZ output
+	 * and we have to keep the RGB and XYZ data pointers similarly aligned.
+	 */
+
+	shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size));
+
+	float const * lut_in = conversion.in()->lut_float (12, false);
+	int const * lut_out = conversion.out()->lut_int (16, true, 4095);
+
+	/* This is is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */
+	double transform[9];
+	combined_rgb_to_xyz (conversion, transform);
+
+	__m256i* xyz_x = reinterpret_cast<__m256i*>(xyz->data(0));
+	DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_x) % 32));
+	__m256i* xyz_y = reinterpret_cast<__m256i*>(xyz->data(1));
+	DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_y) % 32));
+	__m256i* xyz_z = reinterpret_cast<__m256i*>(xyz->data(2));
+	DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_z) % 32));
+
+	__m256 transform_x = _mm256_set_ps (
+		0, transform[2], transform[1], transform[0], 0, transform[2], transform[1], transform[0]
+		);
+
+	__m256 transform_y = _mm256_set_ps (
+		0, transform[5], transform[4], transform[3], 0, transform[5], transform[4], transform[3]
+		);
+
+	__m256 transform_z = _mm256_set_ps (
+		0, transform[8], transform[7], transform[6], 0, transform[8], transform[7], transform[6]
+		);
+
+	int const pixels = size.width * size.height;
+	int const fast_loops = pixels / 8;
+	int const slow_loops = pixels - fast_loops * 8;
+
+	__m128i const * p = reinterpret_cast<__m128i const *> (rgb);
+	DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16));
+
+	for (int i = 0; i < fast_loops; ++i) {
+		// 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words
+		__m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p + 0));
+		__m256i rgb_B = _mm256_cvtepu16_epi32(_mm_load_si128(p + 1));
+		__m256i rgb_C = _mm256_cvtepu16_epi32(_mm_load_si128(p + 2));
+		__m256i rgb_D = _mm256_cvtepu16_epi32(_mm_load_si128(p + 3));
+		p += 4;
+
+		// shift right to truncate 16-bit inputs to 12-bit
+		rgb_A = _mm256_srli_epi32 (rgb_A, 4);
+		rgb_B = _mm256_srli_epi32 (rgb_B, 4);
+		rgb_C = _mm256_srli_epi32 (rgb_C, 4);
+		rgb_D = _mm256_srli_epi32 (rgb_D, 4);
+
+		// input gamma LUT
+		__m256 lut_1_A = _mm256_i32gather_ps (lut_in, rgb_A, 4);
+		__m256 lut_1_B = _mm256_i32gather_ps (lut_in, rgb_B, 4);
+		__m256 lut_1_C = _mm256_i32gather_ps (lut_in, rgb_C, 4);
+		__m256 lut_1_D = _mm256_i32gather_ps (lut_in, rgb_D, 4);
+
+		// multiply for X
+		__m256 x_A = _mm256_mul_ps (lut_1_A, transform_x);
+		__m256 x_B = _mm256_mul_ps (lut_1_B, transform_x);
+		__m256 x_C = _mm256_mul_ps (lut_1_C, transform_x);
+		__m256 x_D = _mm256_mul_ps (lut_1_D, transform_x);
+
+		// accumulate
+
+		// B7  B6  B5  B4  B3  B2  B1  B0  +  A7  A6  A5  A4  A3  A2  A1  A0
+		//                                 =  B67 B45 A67 A45 B23 B01 A23 A01
+		x_A = _mm256_hadd_ps (x_A, x_B);
+
+		// D7  D6  D5  D4  D3  D2  D1  D0  +  C7  C6  C5  C4  C3  C2  C1  C0
+		//                                 =  D67 D45 C67 C45 D23 D01 C23 C01
+		x_C = _mm256_hadd_ps (x_C, x_D);
+
+		// D67 D45 C67 C45 D23 D01 C23 C01 +  B67 B45 A67 A45 B23 B01 A23 A01
+		//                                 =  D47 C47 B47 A47 D03 C03 B03 A03
+		x_A = _mm256_hadd_ps (x_A, x_C);
+
+		// same for Y
+		__m256 y_A = _mm256_mul_ps (lut_1_A, transform_y);
+		__m256 y_B = _mm256_mul_ps (lut_1_B, transform_y);
+		__m256 y_C = _mm256_mul_ps (lut_1_C, transform_y);
+		__m256 y_D = _mm256_mul_ps (lut_1_D, transform_y);
+		y_A = _mm256_hadd_ps (y_A, y_B);
+		y_C = _mm256_hadd_ps (y_C, y_D);
+		y_A = _mm256_hadd_ps (y_A, y_C);
+
+		// and for Z
+		__m256 z_A = _mm256_mul_ps (lut_1_A, transform_z);
+		__m256 z_B = _mm256_mul_ps (lut_1_B, transform_z);
+		__m256 z_C = _mm256_mul_ps (lut_1_C, transform_z);
+		__m256 z_D = _mm256_mul_ps (lut_1_D, transform_z);
+		z_A = _mm256_hadd_ps (z_A, z_B);
+		z_C = _mm256_hadd_ps (z_C, z_D);
+		z_A = _mm256_hadd_ps (z_A, z_C);
+
+		// clamp
+		x_A = _mm256_min_ps(x_A, _mm256_set1_ps(65535.0));
+		x_A = _mm256_max_ps(x_A, _mm256_set1_ps(0.0));
+		y_A = _mm256_min_ps(y_A, _mm256_set1_ps(65535.0));
+		y_A = _mm256_max_ps(y_A, _mm256_set1_ps(0.0));
+		z_A = _mm256_min_ps(z_A, _mm256_set1_ps(65535.0));
+		z_A = _mm256_max_ps(z_A, _mm256_set1_ps(0));
+
+		// round to int
+		__m256i lut_2_x = _mm256_cvtps_epi32(_mm256_floor_ps(x_A));
+		__m256i lut_2_y = _mm256_cvtps_epi32(_mm256_floor_ps(y_A));
+		__m256i lut_2_z = _mm256_cvtps_epi32(_mm256_floor_ps(z_A));
+
+		// out gamma LUT
+		lut_2_x = _mm256_i32gather_epi32 (lut_out, lut_2_x, 4);
+		lut_2_y = _mm256_i32gather_epi32 (lut_out, lut_2_y, 4);
+		lut_2_z = _mm256_i32gather_epi32 (lut_out, lut_2_z, 4);
+
+		// shuffle
+		lut_2_x = _mm256_permutevar8x32_epi32 (lut_2_x, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+		lut_2_y = _mm256_permutevar8x32_epi32 (lut_2_y, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+		lut_2_z = _mm256_permutevar8x32_epi32 (lut_2_z, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+
+		// write to memory
+		_mm256_store_si256 (xyz_x, lut_2_x);
+		_mm256_store_si256 (xyz_y, lut_2_y);
+		_mm256_store_si256 (xyz_z, lut_2_z);
+		xyz_x++;
+		xyz_y++;
+		xyz_z++;
+	}
+
+        struct {
+                float r, g, b;
+        } s;
+
+        struct {
+                float x, y, z;
+        } d;
+
+	uint16_t const * p_slow = reinterpret_cast<uint16_t const *> (p);
+	int* xyz_x_slow = reinterpret_cast<int*> (xyz_x);
+	int* xyz_y_slow = reinterpret_cast<int*> (xyz_y);
+	int* xyz_z_slow = reinterpret_cast<int*> (xyz_z);
+
+	for (int i = 0; i < slow_loops; ++i) {
+		/* In gamma LUT (converting 16-bit to 12-bit) */
+		s.r = lut_in[*p_slow++ >> 4];
+		s.g = lut_in[*p_slow++ >> 4];
+		s.b = lut_in[*p_slow++ >> 4];
+		p_slow++;
+
+		/* RGB to XYZ, Bradford transform and DCI companding */
+		d.x = s.r * transform[0] + s.g * transform[1] + s.b * transform[2];
+		d.y = s.r * transform[3] + s.g * transform[4] + s.b * transform[5];
+		d.z = s.r * transform[6] + s.g * transform[7] + s.b * transform[8];
+
+		/* Clamp */
+		d.x = max (0.0f, d.x);
+		d.y = max (0.0f, d.y);
+		d.z = max (0.0f, d.z);
+		d.x = min (65535.0f, d.x);
+		d.y = min (65535.0f, d.y);
+		d.z = min (65535.0f, d.z);
+
+		/* Out gamma LUT */
+		*xyz_x_slow++ = lut_out[lrint(d.x)];
+		*xyz_y_slow++ = lut_out[lrint(d.y)];
+		*xyz_z_slow++ = lut_out[lrint(d.z)];
+	}
+
+	return xyz;
+}
diff --git a/src/rgb_xyz.h b/src/rgb_xyz.h
index e414d07e..1012c424 100644
--- a/src/rgb_xyz.h
+++ b/src/rgb_xyz.h
@@ -64,6 +64,14 @@ extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz (
 	ColourConversion const & conversion
 	);
 
+
+extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz_avx2 (
+	uint8_t const * rgb,
+	dcp::Size size,
+	ColourConversion const & conversion
+	);
+
+
 extern void combined_rgb_to_xyz (ColourConversion const & conversion, double* matrix);
 
 }
diff --git a/test/rgb_xyz_test.cc b/test/rgb_xyz_test.cc
index 5dcfe673..961edc95 100644
--- a/test/rgb_xyz_test.cc
+++ b/test/rgb_xyz_test.cc
@@ -45,25 +45,46 @@ using std::cout;
 using boost::shared_ptr;
 using boost::optional;
 using boost::scoped_array;
+using boost::shared_array;
 
-/** Convert a test image from sRGB to XYZ and check that the transforms are right */
-BOOST_AUTO_TEST_CASE (rgb_xyz_test)
+template <class T>
+void
+input_gamma (T& v)
+{
+	if (v < 0.04045) {
+		v /= 12.92;
+	} else {
+		v = pow ((v + 0.055) / 1.055, 2.4);
+	}
+}
+
+
+shared_array<uint8_t>
+random_data (dcp::Size size, int components)
 {
 	srand (0);
-	dcp::Size const size (640, 480);
+	shared_array<uint8_t> data (new uint8_t[size.width * size.height * components * 2]);
 
-	scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]);
 	for (int y = 0; y < size.height; ++y) {
-		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6);
+		uint16_t* p = reinterpret_cast<uint16_t*> (data.get() + y * size.width * components * 2);
 		for (int x = 0; x < size.width; ++x) {
 			/* Write a 12-bit random number for each component */
-			for (int c = 0; c < 3; ++c) {
-				*p = (rand () & 0xfff) << 4;
-				++p;
+			for (int c = 0; c < components; ++c) {
+				*p++ = (rand () & 0xfff) << 4;
 			}
 		}
 	}
 
+	return data;
+}
+
+
+/** Convert a test image from sRGB to XYZ and check that the transforms are right */
+BOOST_AUTO_TEST_CASE (rgb_xyz_test)
+{
+	dcp::Size const size (640, 480);
+
+	shared_array<uint8_t> rgb = random_data (size, 3);
 	shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 6, dcp::ColourConversion::srgb_to_xyz ());
 
 	for (int y = 0; y < size.height; ++y) {
@@ -74,25 +95,9 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
 			double cg = *p++ / 65535.0;
 			double cb = *p++ / 65535.0;
 
-			/* Input gamma */
-
-			if (cr < 0.04045) {
-				cr /= 12.92;
-			} else {
-				cr = pow ((cr + 0.055) / 1.055, 2.4);
-			}
-
-			if (cg < 0.04045) {
-				cg /= 12.92;
-			} else {
-				cg = pow ((cg + 0.055) / 1.055, 2.4);
-			}
-
-			if (cb < 0.04045) {
-				cb /= 12.92;
-			} else {
-				cb = pow ((cb + 0.055) / 1.055, 2.4);
-			}
+			input_gamma (cr);
+			input_gamma (cg);
+			input_gamma (cb);
 
 			/* Matrix */
 
@@ -119,6 +124,55 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
 	}
 }
 
+
+/** Convert a test image from sRGB to XYZ using the SSE/AVX code and check that the transforms are right */
+BOOST_AUTO_TEST_CASE (rgb_xyz_test_avx2)
+{
+	srand (0);
+	dcp::Size const size (647, 481);
+
+	shared_array<uint8_t> rgb = random_data (size, 4);
+	shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, dcp::ColourConversion::srgb_to_xyz());
+
+	uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get());
+	for (int y = 0; y < size.height; ++y) {
+		for (int x = 0; x < size.width; ++x) {
+
+			float cr = *p++ / 65535.0;
+			float cg = *p++ / 65535.0;
+			float cb = *p++ / 65535.0;
+			p++;
+
+			input_gamma (cr);
+			input_gamma (cg);
+			input_gamma (cb);
+
+			/* Matrix */
+
+			float cx = cr * 0.4124564 + cg * 0.3575761 + cb * 0.1804375;
+			float cy = cr * 0.2126729 + cg * 0.7151522 + cb * 0.0721750;
+			float cz = cr * 0.0193339 + cg * 0.1191920 + cb * 0.9503041;
+
+			/* Compand */
+
+			cx *= 48 / 52.37;
+			cy *= 48 / 52.37;
+			cz *= 48 / 52.37;
+
+			/* Output gamma */
+
+			cx = pow (cx, 1 / 2.6);
+			cy = pow (cy, 1 / 2.6);
+			cz = pow (cz, 1 / 2.6);
+
+			BOOST_REQUIRE_CLOSE (cx * 4095, xyz->data(0)[y * size.width + x], 3);
+			BOOST_REQUIRE_CLOSE (cy * 4095, xyz->data(1)[y * size.width + x], 3);
+			BOOST_REQUIRE_CLOSE (cz * 4095, xyz->data(2)[y * size.width + x], 3);
+		}
+	}
+}
+
+
 static list<string> notes;
 
 static void
author	Carl Hetherington <cth@carlh.net>	2020-06-02 19:47:30 +0200
committer	Carl Hetherington <cth@carlh.net>	2020-06-02 19:47:30 +0200
commit	6636b550a4b69bf3d77b51aa033159036d88f33b (patch)
tree	740ed4b2615cfbeeeb29c0ecf458400ea8f3b43a
parent	5f2db276b59257b05e351c1ca5665de9cc3f78f4 (diff)