summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCarl Hetherington <cth@carlh.net>2020-06-02 00:52:57 +0200
committerCarl Hetherington <cth@carlh.net>2020-06-02 00:52:57 +0200
commit4a5609265010b702b0222620bde9f64f1ec7269e (patch)
treeee417c06dc616d03fcf20d32f96d74b6f22465e8
parent4e885dc225c5c12e11367602cc4bf429520d9b3b (diff)
Basically maybe-working float version of rgb_to_xyz using SSE/AVX2.
-rw-r--r--benchmark/rgb_to_xyz.cc6
-rw-r--r--src/rgb_xyz.cc193
-rw-r--r--test/rgb_xyz_test.cc17
-rw-r--r--wscript2
4 files changed, 199 insertions, 19 deletions
diff --git a/benchmark/rgb_to_xyz.cc b/benchmark/rgb_to_xyz.cc
index 7b4400ec..bdd1e426 100644
--- a/benchmark/rgb_to_xyz.cc
+++ b/benchmark/rgb_to_xyz.cc
@@ -49,9 +49,9 @@ main ()
dcp::Size size(1998, 1080);
- scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]);
+ scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]);
for (int y = 0; y < size.height; ++y) {
- uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6);
+ uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
for (int x = 0; x < size.width; ++x) {
for (int c = 0; c < 3; ++c) {
*p = (rand() & 0xfff) << 4;
@@ -62,7 +62,7 @@ main ()
shared_ptr<dcp::OpenJPEGImage> xyz;
for (int i = 0; i < trials; ++i) {
- xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 6, dcp::ColourConversion::srgb_to_xyz());
+ xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 8, dcp::ColourConversion::srgb_to_xyz());
}
}
diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc
index d0774e7c..4d8fc09c 100644
--- a/src/rgb_xyz.cc
+++ b/src/rgb_xyz.cc
@@ -38,6 +38,7 @@
#include "dcp_assert.h"
#include "compose.hpp"
#include <cmath>
+#include <immintrin.h>
using std::min;
using std::max;
@@ -272,9 +273,11 @@ dcp::combined_rgb_to_xyz (ColourConversion const & conversion, double* matrix)
* DCI_COEFFICIENT * 65535;
}
-/** @param rgb RGB data; packed RGB 16:16:16, 48bpp, 16R, 16G, 16B,
- * with the 2-byte value for each R/G/B component stored as
- * little-endian; i.e. AV_PIX_FMT_RGB48LE.
+/** @param rgb RGBA data; packed RGBA 16:16:16:16, 48bpp, 16R, 16G, 16B, 16A
+ * with the 2-byte value for each R/G/B/A component stored as
+ * little-endian; i.e. AV_PIX_FMT_RGB48LE. A is ignored but necessary for
+ * SSE/AVX efficiency.
+ *
* @param size size of RGB image in pixels.
* @param size stride of RGB data in pixels.
*/
@@ -300,28 +303,187 @@ dcp::rgb_to_xyz (
double const * lut_in = conversion.in()->lut (12, false);
double const * lut_out = conversion.out()->lut (16, true);
+ float* lut_in_float = new float[4096];
+ for (int i = 0; i < 4096; ++i) {
+ lut_in_float[i] = lut_in[i];
+ }
+
+ int* lut_out_int = new int[65536];
+ for (int i = 0; i < 65536; ++i) {
+ lut_out_int[i] = lrint(lut_out[i] * 4095);
+ }
+
/* This is is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */
double fast_matrix[9];
combined_rgb_to_xyz (conversion, fast_matrix);
int clamped = 0;
- int* xyz_x = xyz->data (0);
- int* xyz_y = xyz->data (1);
- int* xyz_z = xyz->data (2);
+ int* xyz_x = (int *) aligned_alloc(32, size.height * size.width * 4);
+ memcpy (xyz_x, xyz->data(0), size.height * size.width * 4);
+ int* xyz_y = (int *) aligned_alloc(32, size.height * size.width * 4);
+ memcpy (xyz_y, xyz->data(1), size.height * size.width * 4);
+ int* xyz_z = (int *) aligned_alloc(32, size.height * size.width * 4);
+ memcpy (xyz_z, xyz->data(2), size.height * size.width * 4);
+
+ __m256i* xyz_x_sse = (__m256i *) xyz_x;
+ __m256i* xyz_y_sse = (__m256i *) xyz_y;
+ __m256i* xyz_z_sse = (__m256i *) xyz_z;
+
+ __m256 transform_x = _mm256_set_ps (
+ 0, fast_matrix[2], fast_matrix[1], fast_matrix[0], 0, fast_matrix[2], fast_matrix[1], fast_matrix[0]
+ );
+
+ __m256 transform_y = _mm256_set_ps (
+ 0, fast_matrix[5], fast_matrix[4], fast_matrix[3], 0, fast_matrix[5], fast_matrix[4], fast_matrix[3]
+ );
+
+ __m256 transform_z = _mm256_set_ps (
+ 0, fast_matrix[8], fast_matrix[7], fast_matrix[6], 0, fast_matrix[8], fast_matrix[7], fast_matrix[6]
+ );
+
+ /* in-line these, or keep them const? */
+ __m256 const_65535 = _mm256_set1_ps (65535.0);
+ __m256 const_0 = _mm256_set1_ps (0.0);
+ __m256i shuffle = _mm256_set_epi32 (7, 3, 6, 2, 5, 1, 4, 0);
+#if 0
+ __m256 zero = _mm256_setzero_ps ();
+
+ __m256i red_mask_1 = _mm256_set_epi32 (0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000);
+ __m256i red_mask_2 = _mm256_set_epi32 (0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff);
+ __m256i red_mask_3 = _mm256_set_epi32 (0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000);
+#endif
+
for (int y = 0; y < size.height; ++y) {
uint16_t const * p = reinterpret_cast<uint16_t const *> (rgb + y * stride);
- for (int x = 0; x < size.width; ++x) {
-
+ __m128i const * p_sse = reinterpret_cast<__m128i const *> (rgb + y * stride);
+ DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16));
+ for (int x = 0; x < size.width / 8; ++x) {
+
+ // 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words
+ __m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 0));
+ __m256i rgb_B = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 1));
+ __m256i rgb_C = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 2));
+ __m256i rgb_D = _mm256_cvtepu16_epi32(_mm_load_si128(p_sse + 3));
+ p_sse += 4;
+
+ // shift right to truncate 16-bit inputs to 12-bit
+ rgb_A = _mm256_srli_epi32 (rgb_A, 4);
+ rgb_B = _mm256_srli_epi32 (rgb_B, 4);
+ rgb_C = _mm256_srli_epi32 (rgb_C, 4);
+ rgb_D = _mm256_srli_epi32 (rgb_D, 4);
+
+ // input gamma LUT
+ __m256 lut_1_A = _mm256_i32gather_ps (lut_in_float, rgb_A, 4);
+ __m256 lut_1_B = _mm256_i32gather_ps (lut_in_float, rgb_B, 4);
+ __m256 lut_1_C = _mm256_i32gather_ps (lut_in_float, rgb_C, 4);
+ __m256 lut_1_D = _mm256_i32gather_ps (lut_in_float, rgb_D, 4);
+
+ // multiply for X
+ __m256 x_A = _mm256_mul_ps (lut_1_A, transform_x);
+ __m256 x_B = _mm256_mul_ps (lut_1_B, transform_x);
+ __m256 x_C = _mm256_mul_ps (lut_1_C, transform_x);
+ __m256 x_D = _mm256_mul_ps (lut_1_D, transform_x);
+
+ // accumulate
+
+ //float* dbg_x_A = (float *) &x_A;
+ //float* dbg_x_B = (float *) &x_B;
+#if 0
+ printf("A:\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%2.f\n", dbg_x_A[0], dbg_x_A[1], dbg_x_A[2], dbg_x_A[3], dbg_x_A[4], dbg_x_A[5], dbg_x_A[6], dbg_x_A[7]);
+ printf("B:\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%2.f\n", dbg_x_B[0], dbg_x_B[1], dbg_x_B[2], dbg_x_B[3], dbg_x_B[4], dbg_x_B[5], dbg_x_B[6], dbg_x_B[7]);
+#endif
+
+ // B7 B6 B5 B4 B3 B2 B1 B0 + A7 A6 A5 A4 A3 A2 A1 A0
+ // = B67 B45 A67 A45 B23 B01 A23 A01
+ x_A = _mm256_hadd_ps (x_A, x_B);
+
+#if 0
+ printf("sum A:\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%.2f\n\t%2.f\n", dbg_x_A[0], dbg_x_A[1], dbg_x_A[2], dbg_x_A[3], dbg_x_A[4], dbg_x_A[5], dbg_x_A[6], dbg_x_A[7]);
+
+#endif
+ // D7 D6 D5 D4 D3 D2 D1 D0 + C7 C6 C5 C4 C3 C2 C1 C0
+ // = D67 D45 C67 C45 D23 D01 C23 C01
+ x_C = _mm256_hadd_ps (x_C, x_D);
+
+ // D67 D45 C67 C45 D23 D01 C23 C01 + B67 B45 A67 A45 B23 B01 A23 A01
+ // = D47 C47 B47 A47 D03 C03 B03 A03
+ x_A = _mm256_hadd_ps (x_A, x_C);
+
+ // now x values for the 8 pixels are in x_A
+
+ //float* fucker = (float *) &x_A;
+ //printf("8 pixels: %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f", fucker[0], fucker[4], fucker[1], fucker[5], fucker[2], fucker[6], fucker[3], fucker[7]);
+
+ // multiply for Y
+ __m256 y_A = _mm256_mul_ps (lut_1_A, transform_y);
+ __m256 y_B = _mm256_mul_ps (lut_1_B, transform_y);
+ __m256 y_C = _mm256_mul_ps (lut_1_C, transform_y);
+ __m256 y_D = _mm256_mul_ps (lut_1_D, transform_y);
+
+ // accumulate
+ y_A = _mm256_hadd_ps (y_A, y_B);
+ y_C = _mm256_hadd_ps (y_C, y_D);
+ y_A = _mm256_hadd_ps (y_A, y_C);
+
+ // multiply for Z
+ __m256 z_A = _mm256_mul_ps (lut_1_A, transform_z);
+ __m256 z_B = _mm256_mul_ps (lut_1_B, transform_z);
+ __m256 z_C = _mm256_mul_ps (lut_1_C, transform_z);
+ __m256 z_D = _mm256_mul_ps (lut_1_D, transform_z);
+
+ // accumulate
+ z_A = _mm256_hadd_ps (z_A, z_B);
+ z_C = _mm256_hadd_ps (z_C, z_D);
+ z_A = _mm256_hadd_ps (z_A, z_C);
+
+ // clamp
+ x_A = _mm256_min_ps(x_A, const_65535);
+ x_A = _mm256_max_ps(x_A, const_0);
+ y_A = _mm256_min_ps(y_A, const_65535);
+ y_A = _mm256_max_ps(y_A, const_0);
+ z_A = _mm256_min_ps(z_A, const_65535);
+ z_A = _mm256_max_ps(z_A, const_0);
+
+ // round to int
+ __m256i lut_x = _mm256_cvtps_epi32(_mm256_floor_ps(x_A));
+ __m256i lut_y = _mm256_cvtps_epi32(_mm256_floor_ps(y_A));
+ __m256i lut_z = _mm256_cvtps_epi32(_mm256_floor_ps(z_A));
+
+ // out gamma LUT
+ __m256i lut_x_out = _mm256_i32gather_epi32 (lut_out_int, lut_x, 4);
+ __m256i lut_y_out = _mm256_i32gather_epi32 (lut_out_int, lut_y, 4);
+ __m256i lut_z_out = _mm256_i32gather_epi32 (lut_out_int, lut_z, 4);
+
+ //int* res = (int *) &lut_x_out;
+ //printf("result 1 = %d\n", res[0]);
+
+ // shuffle
+ lut_x_out = _mm256_permutevar8x32_epi32 (lut_x_out, shuffle);
+ lut_y_out = _mm256_permutevar8x32_epi32 (lut_y_out, shuffle);
+ lut_z_out = _mm256_permutevar8x32_epi32 (lut_z_out, shuffle);
+
+ // write to memory
+ _mm256_store_si256 (xyz_x_sse, lut_x_out);
+ _mm256_store_si256 (xyz_y_sse, lut_y_out);
+ _mm256_store_si256 (xyz_z_sse, lut_z_out);
+ xyz_x_sse++;
+ xyz_y_sse++;
+ xyz_z_sse++;
+
+#if 0
/* In gamma LUT (converting 16-bit to 12-bit) */
s.r = lut_in[*p++ >> 4];
s.g = lut_in[*p++ >> 4];
s.b = lut_in[*p++ >> 4];
+ p++;
/* RGB to XYZ, Bradford transform and DCI companding */
d.x = s.r * fast_matrix[0] + s.g * fast_matrix[1] + s.b * fast_matrix[2];
d.y = s.r * fast_matrix[3] + s.g * fast_matrix[4] + s.b * fast_matrix[5];
d.z = s.r * fast_matrix[6] + s.g * fast_matrix[7] + s.b * fast_matrix[8];
+ printf("actual x %.2f\n", d.x);
+
/* Clamp */
if (d.x < 0 || d.y < 0 || d.z < 0 || d.x > 65535 || d.y > 65535 || d.z > 65535) {
@@ -336,9 +498,13 @@ dcp::rgb_to_xyz (
d.z = min (65535.0, d.z);
/* Out gamma LUT */
+ // 4095 should be in the LUT and LUT should be integer
*xyz_x++ = lrint (lut_out[lrint(d.x)] * 4095);
*xyz_y++ = lrint (lut_out[lrint(d.y)] * 4095);
*xyz_z++ = lrint (lut_out[lrint(d.z)] * 4095);
+
+ printf("c calc %d\n", xyz_x[-1]);
+#endif
}
}
@@ -346,5 +512,16 @@ dcp::rgb_to_xyz (
note.get() (DCP_NOTE, String::compose ("%1 XYZ value(s) clamped", clamped));
}
+ delete[] lut_in_float;
+ delete[] lut_out_int;
+
+ memcpy (xyz->data(0), xyz_x, size.width * size.height * 4);
+ memcpy (xyz->data(1), xyz_y, size.width * size.height * 4);
+ memcpy (xyz->data(2), xyz_z, size.width * size.height * 4);
+
+ free (xyz_x);
+ free (xyz_y);
+ free (xyz_z);
+
return xyz;
}
diff --git a/test/rgb_xyz_test.cc b/test/rgb_xyz_test.cc
index 5dcfe673..18b7b5df 100644
--- a/test/rgb_xyz_test.cc
+++ b/test/rgb_xyz_test.cc
@@ -52,9 +52,9 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
srand (0);
dcp::Size const size (640, 480);
- scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]);
+ scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]);
for (int y = 0; y < size.height; ++y) {
- uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6);
+ uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
for (int x = 0; x < size.width; ++x) {
/* Write a 12-bit random number for each component */
for (int c = 0; c < 3; ++c) {
@@ -64,15 +64,16 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
}
}
- shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 6, dcp::ColourConversion::srgb_to_xyz ());
+ shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz (rgb.get(), size, size.width * 8, dcp::ColourConversion::srgb_to_xyz());
for (int y = 0; y < size.height; ++y) {
- uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 6);
+ uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
for (int x = 0; x < size.width; ++x) {
double cr = *p++ / 65535.0;
double cg = *p++ / 65535.0;
double cb = *p++ / 65535.0;
+ p++;
/* Input gamma */
@@ -112,9 +113,11 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
cy = pow (cy, 1 / 2.6);
cz = pow (cz, 1 / 2.6);
- BOOST_REQUIRE_CLOSE (cx * 4095, xyz->data(0)[y * size.width + x], 1);
- BOOST_REQUIRE_CLOSE (cy * 4095, xyz->data(1)[y * size.width + x], 1);
- BOOST_REQUIRE_CLOSE (cz * 4095, xyz->data(2)[y * size.width + x], 1);
+ //std::cout << "x=" << x << " y=" << y << " " << (cx * 4095) << " " << xyz->data(0)[y * size.width + x] << "\n";
+
+ BOOST_REQUIRE_CLOSE (cx * 4095, xyz->data(0)[y * size.width + x], 3);
+ BOOST_REQUIRE_CLOSE (cy * 4095, xyz->data(1)[y * size.width + x], 3);
+ BOOST_REQUIRE_CLOSE (cz * 4095, xyz->data(2)[y * size.width + x], 3);
}
}
}
diff --git a/wscript b/wscript
index 84fcfa71..746a7dce 100644
--- a/wscript
+++ b/wscript
@@ -67,7 +67,7 @@ def options(opt):
def configure(conf):
conf.load('compiler_cxx')
conf.load('clang_compilation_database', tooldir=['waf-tools'])
- conf.env.append_value('CXXFLAGS', ['-Wall', '-Wextra', '-D_FILE_OFFSET_BITS=64', '-D__STDC_FORMAT_MACROS'])
+ conf.env.append_value('CXXFLAGS', ['-Wall', '-Wextra', '-D_FILE_OFFSET_BITS=64', '-D__STDC_FORMAT_MACROS', '-mavx2'])
if conf.options.force_cpp11:
conf.env.append_value('CXXFLAGS', ['-std=c++11', '-DBOOST_NO_CXX11_SCOPED_ENUMS'])
gcc = conf.env['CC_VERSION']