Hackzzz.sse

author: Carl Hetherington <cth@carlh.net> 2020-06-02 16:01:57 +0200
committer: Carl Hetherington <cth@carlh.net> 2020-06-02 16:01:57 +0200
commit: 7c9ebaab68c80a14839e400cdd509b847357a794 (patch)
tree: a539f1e944354c98ec4a9af35d163fe1edba5b78
parent: 6016e4b24205e11b1f006e43a6ed4b7f2bde732a (diff)
3 files changed, 85 insertions, 49 deletions
diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc
index 219dc675..b29dd637 100644
--- a/src/rgb_xyz.cc
+++ b/src/rgb_xyz.cc
@@ -284,56 +284,44 @@ dcp::rgb_to_xyz (
 {
         shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size));
 
-        struct {
-                double r, g, b;
-        } s;
-
-        struct {
-                double x, y, z;
-        } d;
+	RGBPixel<double> rgb_pixel;
+	XYZPixel<double> xyz_pixel;
 
         double const * lut_in = conversion.in()->lut (12, false);
-        double const * lut_out = conversion.out()->lut (16, true);
+        int const * lut_out = conversion.out()->lut_int (16, true, 4095);
 
         /* This is is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */
         double fast_matrix[9];
         combined_rgb_to_xyz (conversion, fast_matrix);
 
-        int clamped = 0;
         int* xyz_x = xyz->data (0);
         int* xyz_y = xyz->data (1);
         int* xyz_z = xyz->data (2);
         for (int y = 0; y < size.height; ++y) {
                 uint16_t const * p = reinterpret_cast<uint16_t const *> (rgb + y * stride);
                 for (int x = 0; x < size.width; ++x) {
+			/* In gamma LUT (converting 16-bit to 12-bit) */
+			rgb_pixel.r = lut_in[*rgb++ >> 4];
+			rgb_pixel.g = lut_in[*rgb++ >> 4];
+			rgb_pixel.b = lut_in[*rgb++ >> 4];
+
+			/* RGB to XYZ, Bradford transform and DCI companding */
+			xyz_pixel.x = rgb_pixel.r * fast_matrix[0] + rgb_pixel.g * fast_matrix[1] + rgb_pixel.b * fast_matrix[2];
+			xyz_pixel.y = rgb_pixel.r * fast_matrix[3] + rgb_pixel.g * fast_matrix[4] + rgb_pixel.b * fast_matrix[5];
+			xyz_pixel.z = rgb_pixel.r * fast_matrix[6] + rgb_pixel.g * fast_matrix[7] + rgb_pixel.b * fast_matrix[8];
+
+			/* Clamp */
+			xyz_pixel.x = max (0.0, xyz_pixel.x);
+			xyz_pixel.y = max (0.0, xyz_pixel.y);
+			xyz_pixel.z = max (0.0, xyz_pixel.z);
+			xyz_pixel.x = min (65535.0, xyz_pixel.x);
+			xyz_pixel.y = min (65535.0, xyz_pixel.y);
+			xyz_pixel.z = min (65535.0, xyz_pixel.z);
 
-                        /* In gamma LUT (converting 16-bit to 12-bit) */
-                        s.r = lut_in[*p++ >> 4];
-                        s.g = lut_in[*p++ >> 4];
-                        s.b = lut_in[*p++ >> 4];
-
-                        /* RGB to XYZ, Bradford transform and DCI companding */
-                        d.x = s.r * fast_matrix[0] + s.g * fast_matrix[1] + s.b * fast_matrix[2];
-                        d.y = s.r * fast_matrix[3] + s.g * fast_matrix[4] + s.b * fast_matrix[5];
-                        d.z = s.r * fast_matrix[6] + s.g * fast_matrix[7] + s.b * fast_matrix[8];
-
-                        /* Clamp */
-
-                        if (d.x < 0 || d.y < 0 || d.z < 0 || d.x > 65535 || d.y > 65535 || d.z > 65535) {
-                                ++clamped;
-                        }
-
-                        d.x = max (0.0, d.x);
-                        d.y = max (0.0, d.y);
-                        d.z = max (0.0, d.z);
-                        d.x = min (65535.0, d.x);
-                        d.y = min (65535.0, d.y);
-                        d.z = min (65535.0, d.z);
-
-                        /* Out gamma LUT */
-                        *xyz_x++ = lrint (lut_out[lrint(d.x)] * 4095);
-                        *xyz_y++ = lrint (lut_out[lrint(d.y)] * 4095);
-                        *xyz_z++ = lrint (lut_out[lrint(d.z)] * 4095);
+			/* Out gamma LUT */
+			*xyz_x++ = lut_out[lrint(xyz_pixel.x)];
+			*xyz_y++ = lut_out[lrint(xyz_pixel.y)];
+			*xyz_z++ = lut_out[lrint(xyz_pixel.z)];
                 }
         }
 
@@ -341,24 +329,31 @@ dcp::rgb_to_xyz (
 }
 
 
-/** @param rgb RGBA data; packed RGBA 16:16:16:16, 48bpp, 16R, 16G, 16B, 16A
+/** @param rgba RGBA data; packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A
  *  with the 2-byte value for each R/G/B/A component stored as
  *  little-endian; i.e. AV_PIX_FMT_RGB48LE.  A is ignored but necessary for
- *  SSE/AVX efficiency.
+ *  SSE/AVX efficiency.  Rows must be tightly packed, i.e. each row is exactly
+ *  size.width * 8 bytes with no padding bytes between rows.
  *
  *  @param size size of RGB image in pixels.
- *  @param size stride of RGB data in pixels.
  */
 shared_ptr<dcp::OpenJPEGImage>
 dcp::rgb_to_xyz_avx2 (
 	uint8_t const * rgba,
 	dcp::Size size,
-	int stride,
 	ColourConversion const & conversion
 	)
 {
 	shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size));
 
+        struct {
+                float r, g, b;
+        } s;
+
+        struct {
+                float x, y, z;
+        } d;
+
 	float const * lut_in = conversion.in()->lut_float (12, false);
 	int const * lut_out = conversion.out()->lut_int (16, true, 4095);
 
@@ -385,10 +380,18 @@ dcp::rgb_to_xyz_avx2 (
 		0, fast_matrix[8], fast_matrix[7], fast_matrix[6], 0, fast_matrix[8], fast_matrix[7], fast_matrix[6]
 		);
 
+	int pixel_count = 0;
+
 	for (int y = 0; y < size.height; ++y) {
-		__m128i const * p = reinterpret_cast<__m128i const *> (rgba + y * stride);
+		uint16_t const* rgba_p = reinterpret_cast<uint16_t const *>(rgba + y * size.width * 8);
+		for (int x = 0; x < pixel_count % 8; ++x) {
+			rgb_to_xyz_pixel (rgba_p, lut_in, fast_matrix, lut_out, xyz_x, xyz_y, xyz_z, rgb_pixel, xyz_pixel);
+		}
+
+		__m128i const * p = reinterpret_cast<__m128i const *> (rgba + y * size.width * 8);
+
 		DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16));
-		for (int x = 0; x < size.width / 8; ++x) {
+		for (int x = 0; x < sse_width; ++x) {
 
 			// 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words
 			__m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p + 0));
@@ -478,6 +481,38 @@ dcp::rgb_to_xyz_avx2 (
 			xyz_y++;
 			xyz_z++;
 		}
+
+		uint16_t const * p_extra = reinterpret_cast<uint16_t const *> (p);
+		int* xyz_x_extra = reinterpret_cast<int *> (xyz_x);
+		int* xyz_y_extra = reinterpret_cast<int *> (xyz_y);
+		int* xyz_z_extra = reinterpret_cast<int *> (xyz_z);
+
+		std::cout << "doing " << extra_width << " more.\n";
+		for (int x = 0; x < extra_width; ++x) {
+
+                        /* In gamma LUT (converting 16-bit to 12-bit) */
+                        s.r = lut_in[*p_extra++ >> 4];
+                        s.g = lut_in[*p_extra++ >> 4];
+                        s.b = lut_in[*p_extra++ >> 4];
+
+                        /* RGB to XYZ, Bradford transform and DCI companding */
+                        d.x = s.r * fast_matrix[0] + s.g * fast_matrix[1] + s.b * fast_matrix[2];
+                        d.y = s.r * fast_matrix[3] + s.g * fast_matrix[4] + s.b * fast_matrix[5];
+                        d.z = s.r * fast_matrix[6] + s.g * fast_matrix[7] + s.b * fast_matrix[8];
+
+                        /* Clamp */
+                        d.x = max (0.0f, d.x);
+                        d.y = max (0.0f, d.y);
+                        d.z = max (0.0f, d.z);
+                        d.x = min (65535.0f, d.x);
+                        d.y = min (65535.0f, d.y);
+                        d.z = min (65535.0f, d.z);
+
+                        /* Out gamma LUT */
+                        *xyz_x_extra++ = lut_out[lrint(d.x)];
+                        *xyz_y_extra++ = lut_out[lrint(d.y)];
+                        *xyz_z_extra++ = lut_out[lrint(d.z)];
+		}
 	}
 
 	return xyz;
diff --git a/src/rgb_xyz.h b/src/rgb_xyz.h
index 59b2400d..ef4f2ac1 100644
--- a/src/rgb_xyz.h
+++ b/src/rgb_xyz.h
@@ -69,7 +69,6 @@ extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz (
 extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz_avx2 (
 	uint8_t const * rgba,
 	dcp::Size size,
-	int stride,
 	ColourConversion const & conversion
 	);
 
diff --git a/test/rgb_xyz_test.cc b/test/rgb_xyz_test.cc
index d9c27ae2..8a5ad205 100644
--- a/test/rgb_xyz_test.cc
+++ b/test/rgb_xyz_test.cc
@@ -51,7 +51,7 @@ using boost::scoped_array;
 BOOST_AUTO_TEST_CASE (rgb_xyz_test)
 {
         srand (0);
-        dcp::Size const size (640, 480);
+        dcp::Size const size (647, 485);
 
         scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]);
         for (int y = 0; y < size.height; ++y) {
@@ -125,11 +125,13 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
 BOOST_AUTO_TEST_CASE (rgb_xyz_test_avx2)
 {
 	srand (0);
-	dcp::Size const size (640, 480);
+	dcp::Size const size (647, 485);
+
+	int stride = (size.width + (16 - size.width % 16)) * 8;
 
-	scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]);
+	scoped_array<uint8_t> rgb (new uint8_t[size.height * stride]);
 	for (int y = 0; y < size.height; ++y) {
-		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
+		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * stride);
 		for (int x = 0; x < size.width; ++x) {
 			/* Write a 12-bit random number for each component */
 			for (int c = 0; c < 3; ++c) {
@@ -139,10 +141,10 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test_avx2)
 		}
 	}
 
-	shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, size.width * 8, dcp::ColourConversion::srgb_to_xyz());
+	shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, stride, dcp::ColourConversion::srgb_to_xyz());
 
 	for (int y = 0; y < size.height; ++y) {
-		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
+		uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * stride);
 		for (int x = 0; x < size.width; ++x) {
 
 			double cr = *p++ / 65535.0;
author	Carl Hetherington <cth@carlh.net>	2020-06-02 16:01:57 +0200
committer	Carl Hetherington <cth@carlh.net>	2020-06-02 16:01:57 +0200
commit	7c9ebaab68c80a14839e400cdd509b847357a794 (patch)
tree	a539f1e944354c98ec4a9af35d163fe1edba5b78
parent	6016e4b24205e11b1f006e43a6ed4b7f2bde732a (diff)