Basically maybe-working float version of rgb_to_xyz using SSE/AVX2.

author: Carl Hetherington <cth@carlh.net> 2020-06-02 00:52:57 +0200
committer: Carl Hetherington <cth@carlh.net> 2020-06-02 00:52:57 +0200
commit: 4a5609265010b702b0222620bde9f64f1ec7269e (patch)
tree: ee417c06dc616d03fcf20d32f96d74b6f22465e8
parent: 4e885dc225c5c12e11367602cc4bf429520d9b3b (diff)
4 files changed, 199 insertions, 19 deletions
diff --git a/benchmark/rgb_to_xyz.cc b/benchmark/rgb_to_xyz.cc
index 7b4400ec..bdd1e426 100644
--- a/benchmark/rgb_to_xyz.cc
+++ b/benchmark/rgb_to_xyz.cc
@@ -49,9 +49,9 @@ main ()
 
 	dcp::Size size(1998, 1080);
 
-	scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]);
+	scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]);
 	for (int y = 0; y < size.height; ++y) {
-		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6);
+		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
 		for (int x = 0; x < size.width; ++x) {
 			for (int c = 0; c < 3; ++c) {
 				*p = (rand() & 0xfff) << 4;
@@ -62,7 +62,7 @@ main ()
 
 	shared_ptr<dcp::OpenJPEGImage> xyz;
 	for (int i = 0; i < trials; ++i) {
-		xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 6, dcp::ColourConversion::srgb_to_xyz());
+		xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 8, dcp::ColourConversion::srgb_to_xyz());
 	}
 }
 
diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc
index d0774e7c..4d8fc09c 100644
--- a/src/rgb_xyz.cc
+++ b/src/rgb_xyz.cc
@@ -38,6 +38,7 @@
 #include "dcp_assert.h"
 #include "compose.hpp"
 #include <cmath>
+#include <immintrin.h>
 
 using std::min;
 using std::max;
@@ -272,9 +273,11 @@ dcp::combined_rgb_to_xyz (ColourConversion const & conversion, double* matrix)
 		* DCI_COEFFICIENT * 65535;
 }
 
-/** @param rgb RGB data; packed RGB 16:16:16, 48bpp, 16R, 16G, 16B,
- *  with the 2-byte value for each R/G/B component stored as
- *  little-endian; i.e. AV_PIX_FMT_RGB48LE.
+/** @param rgb RGBA data; packed RGBA 16:16:16:16, 48bpp, 16R, 16G, 16B, 16A
+ *  with the 2-byte value for each R/G/B/A component stored as
+ *  little-endian; i.e. AV_PIX_FMT_RGB48LE.  A is ignored but necessary for
+ *  SSE/AVX efficiency.
+ *
  *  @param size size of RGB image in pixels.
  *  @param size stride of RGB data in pixels.
  */
@@ -300,28 +303,187 @@ dcp::rgb_to_xyz (
 	double const * lut_in = conversion.in()->lut (12, false);
 	double const * lut_out = conversion.out()->lut (16, true);
 
+	float* lut_in_float = new float[4096];
+	for (int i = 0; i < 4096; ++i) {
+		lut_in_float[i] = lut_in[i];
+	}
+
+	int* lut_out_int = new int[65536];
+	for (int i = 0; i < 65536; ++i) {
+		lut_out_int[i] = lrint(lut_out[i] * 4095);
+	}
+
 	/* This is is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */
 	double fast_matrix[9];
 	combined_rgb_to_xyz (conversion, fast_matrix);
 
 	int clamped = 0;
-	int* xyz_x = xyz->data (0);
-	int* xyz_y = xyz->data (1);
-	int* xyz_z = xyz->data (2);
+	int* xyz_x = (int *) aligned_alloc(32, size.height * size.width * 4);
+	memcpy (xyz_x, xyz->data(0), size.height * size.width * 4);
+	int* xyz_y = (int *) aligned_alloc(32, size.height * size.width * 4);
+	memcpy (xyz_y, xyz->data(1), size.height * size.width * 4);
+	int* xyz_z = (int *) aligned_alloc(32, size.height * size.width * 4);
+	memcpy (xyz_z, xyz->data(2), size.height * size.width * 4);
+
+	__m256i* xyz_x_sse = (__m256i *) xyz_x;
+	__m256i* xyz_y_sse = (__m256i *) xyz_y;
+	__m256i* xyz_z_sse = (__m256i *) xyz_z;
+
+	__m256 transform_x = _mm256_set_ps (
+		0, fast_matrix[2], fast_matrix[1], fast_matrix[0], 0, fast_matrix[2], fast_matrix[1], fast_matrix[0]
+		);
+
+	__m256 transform_y = _mm256_set_ps (
+		0, fast_matrix[5], fast_matrix[4], fast_matrix[3], 0, fast_matrix[5], fast_matrix[4], fast_matrix[3]
+		);
+
+	__m256 transform_z = _mm256_set_ps (
+		0, fast_matrix[8], fast_matrix[7], fast_matrix[6], 0, fast_matrix[8], fast_matrix[7], fast_matrix[6]
+		);
+
+	/* in-line these, or keep them const? */
+	__m256 const_65535 = _mm256_set1_ps (65535.0);
+	__m256 const_0 = _mm256_set1_ps (0.0);
+	__m256i shuffle = _mm256_set_epi32 (7, 3, 6, 2, 5, 1, 4, 0);
+#if 0
+	__m256 zero = _mm256_setzero_ps ();
+
+	__m256i red_mask_1 = _mm256_set_epi32 (0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000);
+	__m256i red_mask_2 = _mm256_set_epi32 (0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff);
+	__m256i red_mask_3 = _mm256_set_epi32 (0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000);
+#endif
+
 	for (int y = 0; y < size.height; ++y) {
 		uint16_t const * p = reinterpret_cast<uint16_t const *> (rgb + y * stride);
-		for (int x = 0; x < size.width; ++x) {
-
+		__m128i const * p_sse = reinterpret_cast<__m128i const *> (rgb + y * stride);
+		DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16));
+		for (int x = 0; x < size.width / 8; ++x) {
+
+			// 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words
+			__m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 0));
+			__m256i rgb_B = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 1));
+			__m256i rgb_C = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 2));
+			__m256i rgb_D = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 3));
+			p_sse += 4;
+
+			// shift right to truncate 16-bit inputs to 12-bit
+			rgb_A = _mm256_srli_epi32 (rgb_A, 4);
+			rgb_B = _mm256_srli_epi32 (rgb_B, 4);
+			rgb_C = _mm256_srli_epi32 (rgb_C, 4);
+			rgb_D = _mm256_srli_epi32 (rgb_D, 4);
+
+			// input gamma LUT
+			__m256 lut_1_A = _mm256_i32gather_ps (lut_in_float, rgb_A, 4);
+			__m256 lut_1_B = _mm256_i32gather_ps (lut_in_float, rgb_B, 4);
+			__m256 lut_1_C = _mm256_i32gather_ps (lut_in_float, rgb_C, 4);
+			__m256 lut_1_D = _mm256_i32gather_ps (lut_in_float, rgb_D, 4);
+
+			// multiply for X
+			__m256 x_A = _mm256_mul_ps (lut_1_A, transform_x);
+			__m256 x_B = _mm256_mul_ps (lut_1_B, transform_x);
+			__m256 x_C = _mm256_mul_ps (lut_1_C, transform_x);
+			__m256 x_D = _mm256_mul_ps (lut_1_D, transform_x);
+
+			// accumulate
+
+			//float* dbg_x_A = (float *) &x_A;
+			//float* dbg_x_B = (float *) &x_B;
+#if 0
+			printf("A:\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%2.f\n", dbg_x_A[0], dbg_x_A[1], dbg_x_A[2], dbg_x_A[3], dbg_x_A[4], dbg_x_A[5], dbg_x_A[6], dbg_x_A[7]);
+			printf("B:\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%2.f\n", dbg_x_B[0], dbg_x_B[1], dbg_x_B[2], dbg_x_B[3], dbg_x_B[4], dbg_x_B[5], dbg_x_B[6], dbg_x_B[7]);
+#endif
+
+			// B7  B6  B5  B4  B3  B2  B1  B0  +  A7  A6  A5  A4  A3  A2  A1  A0
+			//                                 =  B67 B45 A67 A45 B23 B01 A23 A01
+			x_A = _mm256_hadd_ps (x_A, x_B);
+
+#if 0
+			printf("sum A:\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%2.f\n", dbg_x_A[0], dbg_x_A[1], dbg_x_A[2], dbg_x_A[3], dbg_x_A[4], dbg_x_A[5], dbg_x_A[6], dbg_x_A[7]);
+
+#endif
+			// D7  D6  D5  D4  D3  D2  D1  D0  +  C7  C6  C5  C4  C3  C2  C1  C0
+			//                                 =  D67 D45 C67 C45 D23 D01 C23 C01
+			x_C = _mm256_hadd_ps (x_C, x_D);
+
+			// D67 D45 C67 C45 D23 D01 C23 C01 +  B67 B45 A67 A45 B23 B01 A23 A01
+			//                                 =  D47 C47 B47 A47 D03 C03 B03 A03
+			x_A = _mm256_hadd_ps (x_A, x_C);
+
+			// now x values for the 8 pixels are in x_A
+
+			//float* fucker = (float *) &x_A;
+			//printf("8 pixels: %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f", fucker[0], fucker[4], fucker[1], fucker[5], fucker[2], fucker[6], fucker[3], fucker[7]);
+
+			// multiply for Y
+			__m256 y_A = _mm256_mul_ps (lut_1_A, transform_y);
+			__m256 y_B = _mm256_mul_ps (lut_1_B, transform_y);
+			__m256 y_C = _mm256_mul_ps (lut_1_C, transform_y);
+			__m256 y_D = _mm256_mul_ps (lut_1_D, transform_y);
+
+			// accumulate
+			y_A = _mm256_hadd_ps (y_A, y_B);
+			y_C = _mm256_hadd_ps (y_C, y_D);
+			y_A = _mm256_hadd_ps (y_A, y_C);
+
+			// multiply for Z
+			__m256 z_A = _mm256_mul_ps (lut_1_A, transform_z);
+			__m256 z_B = _mm256_mul_ps (lut_1_B, transform_z);
+			__m256 z_C = _mm256_mul_ps (lut_1_C, transform_z);
+			__m256 z_D = _mm256_mul_ps (lut_1_D, transform_z);
+
+			// accumulate
+			z_A = _mm256_hadd_ps (z_A, z_B);
+			z_C = _mm256_hadd_ps (z_C, z_D);
+			z_A = _mm256_hadd_ps (z_A, z_C);
+
+			// clamp
+			x_A = _mm256_min_ps(x_A, const_65535);
+			x_A = _mm256_max_ps(x_A, const_0);
+			y_A = _mm256_min_ps(y_A, const_65535);
+			y_A = _mm256_max_ps(y_A, const_0);
+			z_A = _mm256_min_ps(z_A, const_65535);
+			z_A = _mm256_max_ps(z_A, const_0);
+
+			// round to int
+			__m256i lut_x = _mm256_cvtps_epi32(_mm256_floor_ps(x_A));
+			__m256i lut_y = _mm256_cvtps_epi32(_mm256_floor_ps(y_A));
+			__m256i lut_z = _mm256_cvtps_epi32(_mm256_floor_ps(z_A));
+
+			// out gamma LUT
+			__m256i lut_x_out = _mm256_i32gather_epi32 (lut_out_int, lut_x, 4);
+			__m256i lut_y_out = _mm256_i32gather_epi32 (lut_out_int, lut_y, 4);
+			__m256i lut_z_out = _mm256_i32gather_epi32 (lut_out_int, lut_z, 4);
+
+			//int* res = (int *) &lut_x_out;
+			//printf("result 1 = %d\n", res[0]);
+
+			// shuffle
+			lut_x_out = _mm256_permutevar8x32_epi32 (lut_x_out, shuffle);
+			lut_y_out = _mm256_permutevar8x32_epi32 (lut_y_out, shuffle);
+			lut_z_out = _mm256_permutevar8x32_epi32 (lut_z_out, shuffle);
+
+			// write to memory
+			_mm256_store_si256 (xyz_x_sse, lut_x_out);
+			_mm256_store_si256 (xyz_y_sse, lut_y_out);
+			_mm256_store_si256 (xyz_z_sse, lut_z_out);
+			xyz_x_sse++;
+			xyz_y_sse++;
+			xyz_z_sse++;
+
+#if 0
 			/* In gamma LUT (converting 16-bit to 12-bit) */
 			s.r = lut_in[*p++ >> 4];
 			s.g = lut_in[*p++ >> 4];
 			s.b = lut_in[*p++ >> 4];
+			p++;
 
 			/* RGB to XYZ, Bradford transform and DCI companding */
 			d.x = s.r * fast_matrix[0] + s.g * fast_matrix[1] + s.b * fast_matrix[2];
 			d.y = s.r * fast_matrix[3] + s.g * fast_matrix[4] + s.b * fast_matrix[5];
 			d.z = s.r * fast_matrix[6] + s.g * fast_matrix[7] + s.b * fast_matrix[8];
 
+			printf("actual x %.2f\n", d.x);
+
 			/* Clamp */
 
 			if (d.x < 0 || d.y < 0 || d.z < 0 || d.x > 65535 || d.y > 65535 || d.z > 65535) {
@@ -336,9 +498,13 @@ dcp::rgb_to_xyz (
 			d.z = min (65535.0, d.z);
 
 			/* Out gamma LUT */
+			// 4095 should be in the LUT and LUT should be integer
 			*xyz_x++ = lrint (lut_out[lrint(d.x)] * 4095);
 			*xyz_y++ = lrint (lut_out[lrint(d.y)] * 4095);
 			*xyz_z++ = lrint (lut_out[lrint(d.z)] * 4095);
+
+			printf("c calc %d\n", xyz_x[-1]);
+#endif
 		}
 	}
 
@@ -346,5 +512,16 @@ dcp::rgb_to_xyz (
 		note.get() (DCP_NOTE, String::compose ("%1 XYZ value(s) clamped", clamped));
 	}
 
+	delete[] lut_in_float;
+	delete[] lut_out_int;
+
+	memcpy (xyz->data(0), xyz_x, size.width * size.height * 4);
+	memcpy (xyz->data(1), xyz_y, size.width * size.height * 4);
+	memcpy (xyz->data(2), xyz_z, size.width * size.height * 4);
+
+	free (xyz_x);
+	free (xyz_y);
+	free (xyz_z);
+
 	return xyz;
 }
diff --git a/test/rgb_xyz_test.cc b/test/rgb_xyz_test.cc
index 5dcfe673..18b7b5df 100644
--- a/test/rgb_xyz_test.cc
+++ b/test/rgb_xyz_test.cc
@@ -52,9 +52,9 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
 	srand (0);
 	dcp::Size const size (640, 480);
 
-	scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]);
+	scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]);
 	for (int y = 0; y < size.height; ++y) {
-		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6);
+		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
 		for (int x = 0; x < size.width; ++x) {
 			/* Write a 12-bit random number for each component */
 			for (int c = 0; c < 3; ++c) {
@@ -64,15 +64,16 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
 		}
 	}
 
-	shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 6, dcp::ColourConversion::srgb_to_xyz ());
+	shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 8, dcp::ColourConversion::srgb_to_xyz());
 
 	for (int y = 0; y < size.height; ++y) {
-		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6);
+		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
 		for (int x = 0; x < size.width; ++x) {
 
 			double cr = *p++ / 65535.0;
 			double cg = *p++ / 65535.0;
 			double cb = *p++ / 65535.0;
+			p++;
 
 			/* Input gamma */
 
@@ -112,9 +113,11 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
 			cy = pow (cy, 1 / 2.6);
 			cz = pow (cz, 1 / 2.6);
 
-			BOOST_REQUIRE_CLOSE (cx * 4095, xyz->data(0)[y * size.width + x], 1);
-			BOOST_REQUIRE_CLOSE (cy * 4095, xyz->data(1)[y * size.width + x], 1);
-			BOOST_REQUIRE_CLOSE (cz * 4095, xyz->data(2)[y * size.width + x], 1);
+			//std::cout << "x=" << x << " y=" << y << " " << (cx * 4095) << " " << xyz->data(0)[y * size.width + x] << "\n";
+
+			BOOST_REQUIRE_CLOSE (cx * 4095, xyz->data(0)[y * size.width + x], 3);
+			BOOST_REQUIRE_CLOSE (cy * 4095, xyz->data(1)[y * size.width + x], 3);
+			BOOST_REQUIRE_CLOSE (cz * 4095, xyz->data(2)[y * size.width + x], 3);
 		}
 	}
 }
diff --git a/wscript b/wscript
index 84fcfa71..746a7dce 100644
--- a/wscript
+++ b/wscript
@@ -67,7 +67,7 @@ def options(opt):
 def configure(conf):
     conf.load('compiler_cxx')
     conf.load('clang_compilation_database', tooldir=['waf-tools'])
-    conf.env.append_value('CXXFLAGS', ['-Wall', '-Wextra', '-D_FILE_OFFSET_BITS=64', '-D__STDC_FORMAT_MACROS'])
+    conf.env.append_value('CXXFLAGS', ['-Wall', '-Wextra', '-D_FILE_OFFSET_BITS=64', '-D__STDC_FORMAT_MACROS', '-mavx2'])
     if conf.options.force_cpp11:
         conf.env.append_value('CXXFLAGS', ['-std=c++11', '-DBOOST_NO_CXX11_SCOPED_ENUMS'])
     gcc = conf.env['CC_VERSION']
author	Carl Hetherington <cth@carlh.net>	2020-06-02 00:52:57 +0200
committer	Carl Hetherington <cth@carlh.net>	2020-06-02 00:52:57 +0200
commit	4a5609265010b702b0222620bde9f64f1ec7269e (patch)
tree	ee417c06dc616d03fcf20d32f96d74b6f22465e8
parent	4e885dc225c5c12e11367602cc4bf429520d9b3b (diff)