Add SSE/AVX2 version of rgb_to_xyz().sse2

author: Carl Hetherington <cth@carlh.net> 2020-06-02 19:47:30 +0200
committer: Carl Hetherington <cth@carlh.net> 2020-06-02 19:47:30 +0200
commit: 6636b550a4b69bf3d77b51aa033159036d88f33b (patch)
tree: 740ed4b2615cfbeeeb29c0ecf458400ea8f3b43a /src
parent: 5f2db276b59257b05e351c1ca5665de9cc3f78f4 (diff)
2 files changed, 195 insertions, 0 deletions
diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc
index 36185324..fd398ae8 100644
--- a/src/rgb_xyz.cc
+++ b/src/rgb_xyz.cc
@@ -38,6 +38,7 @@
 #include "dcp_assert.h"
 #include "compose.hpp"
 #include <cmath>
+#include <immintrin.h>
 
 using std::min;
 using std::max;
@@ -337,3 +338,189 @@ dcp::rgb_to_xyz (
 
 	return xyz;
 }
+
+
+/** @param rgb RGB data arranged as 64 bits per pixel: 16 red, 16 green, 16 blue, 16 dummy (ignored).
+ *  Each 2-byte value is little endian; this is effectively AV_PIX_FMT_RGBA64LE with the A
+ *  being ignored.  There must be no padding bytes between lines of the image (i.e.
+ *  stride must equal width).  The pointer must be aligned to a 16-byte boundary.
+ *
+ *  @param size size of RGB image in pixels.
+ */
+shared_ptr<dcp::OpenJPEGImage>
+dcp::rgb_to_xyz_avx2 (
+	uint8_t const * rgb,
+	dcp::Size size,
+	ColourConversion const & conversion
+	)
+{
+	/* There must be no padding in the RGB as there *can* be no padding in the XYZ output
+	 * and we have to keep the RGB and XYZ data pointers similarly aligned.
+	 */
+
+	shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size));
+
+	float const * lut_in = conversion.in()->lut_float (12, false);
+	int const * lut_out = conversion.out()->lut_int (16, true, 4095);
+
+	/* This is is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */
+	double transform[9];
+	combined_rgb_to_xyz (conversion, transform);
+
+	__m256i* xyz_x = reinterpret_cast<__m256i*>(xyz->data(0));
+	DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_x) % 32));
+	__m256i* xyz_y = reinterpret_cast<__m256i*>(xyz->data(1));
+	DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_y) % 32));
+	__m256i* xyz_z = reinterpret_cast<__m256i*>(xyz->data(2));
+	DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_z) % 32));
+
+	__m256 transform_x = _mm256_set_ps (
+		0, transform[2], transform[1], transform[0], 0, transform[2], transform[1], transform[0]
+		);
+
+	__m256 transform_y = _mm256_set_ps (
+		0, transform[5], transform[4], transform[3], 0, transform[5], transform[4], transform[3]
+		);
+
+	__m256 transform_z = _mm256_set_ps (
+		0, transform[8], transform[7], transform[6], 0, transform[8], transform[7], transform[6]
+		);
+
+	int const pixels = size.width * size.height;
+	int const fast_loops = pixels / 8;
+	int const slow_loops = pixels - fast_loops * 8;
+
+	__m128i const * p = reinterpret_cast<__m128i const *> (rgb);
+	DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16));
+
+	for (int i = 0; i < fast_loops; ++i) {
+		// 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words
+		__m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p + 0));
+		__m256i rgb_B = _mm256_cvtepu16_epi32(_mm_load_si128(p + 1));
+		__m256i rgb_C = _mm256_cvtepu16_epi32(_mm_load_si128(p + 2));
+		__m256i rgb_D = _mm256_cvtepu16_epi32(_mm_load_si128(p + 3));
+		p += 4;
+
+		// shift right to truncate 16-bit inputs to 12-bit
+		rgb_A = _mm256_srli_epi32 (rgb_A, 4);
+		rgb_B = _mm256_srli_epi32 (rgb_B, 4);
+		rgb_C = _mm256_srli_epi32 (rgb_C, 4);
+		rgb_D = _mm256_srli_epi32 (rgb_D, 4);
+
+		// input gamma LUT
+		__m256 lut_1_A = _mm256_i32gather_ps (lut_in, rgb_A, 4);
+		__m256 lut_1_B = _mm256_i32gather_ps (lut_in, rgb_B, 4);
+		__m256 lut_1_C = _mm256_i32gather_ps (lut_in, rgb_C, 4);
+		__m256 lut_1_D = _mm256_i32gather_ps (lut_in, rgb_D, 4);
+
+		// multiply for X
+		__m256 x_A = _mm256_mul_ps (lut_1_A, transform_x);
+		__m256 x_B = _mm256_mul_ps (lut_1_B, transform_x);
+		__m256 x_C = _mm256_mul_ps (lut_1_C, transform_x);
+		__m256 x_D = _mm256_mul_ps (lut_1_D, transform_x);
+
+		// accumulate
+
+		// B7  B6  B5  B4  B3  B2  B1  B0  +  A7  A6  A5  A4  A3  A2  A1  A0
+		//                                 =  B67 B45 A67 A45 B23 B01 A23 A01
+		x_A = _mm256_hadd_ps (x_A, x_B);
+
+		// D7  D6  D5  D4  D3  D2  D1  D0  +  C7  C6  C5  C4  C3  C2  C1  C0
+		//                                 =  D67 D45 C67 C45 D23 D01 C23 C01
+		x_C = _mm256_hadd_ps (x_C, x_D);
+
+		// D67 D45 C67 C45 D23 D01 C23 C01 +  B67 B45 A67 A45 B23 B01 A23 A01
+		//                                 =  D47 C47 B47 A47 D03 C03 B03 A03
+		x_A = _mm256_hadd_ps (x_A, x_C);
+
+		// same for Y
+		__m256 y_A = _mm256_mul_ps (lut_1_A, transform_y);
+		__m256 y_B = _mm256_mul_ps (lut_1_B, transform_y);
+		__m256 y_C = _mm256_mul_ps (lut_1_C, transform_y);
+		__m256 y_D = _mm256_mul_ps (lut_1_D, transform_y);
+		y_A = _mm256_hadd_ps (y_A, y_B);
+		y_C = _mm256_hadd_ps (y_C, y_D);
+		y_A = _mm256_hadd_ps (y_A, y_C);
+
+		// and for Z
+		__m256 z_A = _mm256_mul_ps (lut_1_A, transform_z);
+		__m256 z_B = _mm256_mul_ps (lut_1_B, transform_z);
+		__m256 z_C = _mm256_mul_ps (lut_1_C, transform_z);
+		__m256 z_D = _mm256_mul_ps (lut_1_D, transform_z);
+		z_A = _mm256_hadd_ps (z_A, z_B);
+		z_C = _mm256_hadd_ps (z_C, z_D);
+		z_A = _mm256_hadd_ps (z_A, z_C);
+
+		// clamp
+		x_A = _mm256_min_ps(x_A, _mm256_set1_ps(65535.0));
+		x_A = _mm256_max_ps(x_A, _mm256_set1_ps(0.0));
+		y_A = _mm256_min_ps(y_A, _mm256_set1_ps(65535.0));
+		y_A = _mm256_max_ps(y_A, _mm256_set1_ps(0.0));
+		z_A = _mm256_min_ps(z_A, _mm256_set1_ps(65535.0));
+		z_A = _mm256_max_ps(z_A, _mm256_set1_ps(0));
+
+		// round to int
+		__m256i lut_2_x = _mm256_cvtps_epi32(_mm256_floor_ps(x_A));
+		__m256i lut_2_y = _mm256_cvtps_epi32(_mm256_floor_ps(y_A));
+		__m256i lut_2_z = _mm256_cvtps_epi32(_mm256_floor_ps(z_A));
+
+		// out gamma LUT
+		lut_2_x = _mm256_i32gather_epi32 (lut_out, lut_2_x, 4);
+		lut_2_y = _mm256_i32gather_epi32 (lut_out, lut_2_y, 4);
+		lut_2_z = _mm256_i32gather_epi32 (lut_out, lut_2_z, 4);
+
+		// shuffle
+		lut_2_x = _mm256_permutevar8x32_epi32 (lut_2_x, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+		lut_2_y = _mm256_permutevar8x32_epi32 (lut_2_y, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+		lut_2_z = _mm256_permutevar8x32_epi32 (lut_2_z, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+
+		// write to memory
+		_mm256_store_si256 (xyz_x, lut_2_x);
+		_mm256_store_si256 (xyz_y, lut_2_y);
+		_mm256_store_si256 (xyz_z, lut_2_z);
+		xyz_x++;
+		xyz_y++;
+		xyz_z++;
+	}
+
+        struct {
+                float r, g, b;
+        } s;
+
+        struct {
+                float x, y, z;
+        } d;
+
+	uint16_t const * p_slow = reinterpret_cast<uint16_t const *> (p);
+	int* xyz_x_slow = reinterpret_cast<int*> (xyz_x);
+	int* xyz_y_slow = reinterpret_cast<int*> (xyz_y);
+	int* xyz_z_slow = reinterpret_cast<int*> (xyz_z);
+
+	for (int i = 0; i < slow_loops; ++i) {
+		/* In gamma LUT (converting 16-bit to 12-bit) */
+		s.r = lut_in[*p_slow++ >> 4];
+		s.g = lut_in[*p_slow++ >> 4];
+		s.b = lut_in[*p_slow++ >> 4];
+		p_slow++;
+
+		/* RGB to XYZ, Bradford transform and DCI companding */
+		d.x = s.r * transform[0] + s.g * transform[1] + s.b * transform[2];
+		d.y = s.r * transform[3] + s.g * transform[4] + s.b * transform[5];
+		d.z = s.r * transform[6] + s.g * transform[7] + s.b * transform[8];
+
+		/* Clamp */
+		d.x = max (0.0f, d.x);
+		d.y = max (0.0f, d.y);
+		d.z = max (0.0f, d.z);
+		d.x = min (65535.0f, d.x);
+		d.y = min (65535.0f, d.y);
+		d.z = min (65535.0f, d.z);
+
+		/* Out gamma LUT */
+		*xyz_x_slow++ = lut_out[lrint(d.x)];
+		*xyz_y_slow++ = lut_out[lrint(d.y)];
+		*xyz_z_slow++ = lut_out[lrint(d.z)];
+	}
+
+	return xyz;
+}
diff --git a/src/rgb_xyz.h b/src/rgb_xyz.h
index e414d07e..1012c424 100644
--- a/src/rgb_xyz.h
+++ b/src/rgb_xyz.h
@@ -64,6 +64,14 @@ extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz (
 	ColourConversion const & conversion
 	);
 
+
+extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz_avx2 (
+	uint8_t const * rgb,
+	dcp::Size size,
+	ColourConversion const & conversion
+	);
+
+
 extern void combined_rgb_to_xyz (ColourConversion const & conversion, double* matrix);
 
 }
author	Carl Hetherington <cth@carlh.net>	2020-06-02 19:47:30 +0200
committer	Carl Hetherington <cth@carlh.net>	2020-06-02 19:47:30 +0200
commit	6636b550a4b69bf3d77b51aa033159036d88f33b (patch)
tree	740ed4b2615cfbeeeb29c0ecf458400ea8f3b43a /src
parent	5f2db276b59257b05e351c1ca5665de9cc3f78f4 (diff)