summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCarl Hetherington <cth@carlh.net>2020-06-02 19:47:30 +0200
committerCarl Hetherington <cth@carlh.net>2020-06-02 19:47:30 +0200
commit6636b550a4b69bf3d77b51aa033159036d88f33b (patch)
tree740ed4b2615cfbeeeb29c0ecf458400ea8f3b43a /src
parent5f2db276b59257b05e351c1ca5665de9cc3f78f4 (diff)
Add SSE/AVX2 version of rgb_to_xyz().sse2
Diffstat (limited to 'src')
-rw-r--r--src/rgb_xyz.cc187
-rw-r--r--src/rgb_xyz.h8
2 files changed, 195 insertions, 0 deletions
diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc
index 36185324..fd398ae8 100644
--- a/src/rgb_xyz.cc
+++ b/src/rgb_xyz.cc
@@ -38,6 +38,7 @@
#include "dcp_assert.h"
#include "compose.hpp"
#include <cmath>
+#include <immintrin.h>
using std::min;
using std::max;
@@ -337,3 +338,189 @@ dcp::rgb_to_xyz (
return xyz;
}
+
+
+/** @param rgb RGB data arranged as 64 bits per pixel: 16 red, 16 green, 16 blue, 16 dummy (ignored).
+ * Each 2-byte value is little endian; this is effectively AV_PIX_FMT_RGBA64LE with the A
+ * being ignored. There must be no padding bytes between lines of the image (i.e.
+ * stride must equal width). The pointer must be aligned to a 16-byte boundary.
+ *
+ * @param size size of RGB image in pixels.
+ */
+shared_ptr<dcp::OpenJPEGImage>
+dcp::rgb_to_xyz_avx2 (
+ uint8_t const * rgb,
+ dcp::Size size,
+ ColourConversion const & conversion
+ )
+{
+ /* There must be no padding in the RGB as there *can* be no padding in the XYZ output
+ * and we have to keep the RGB and XYZ data pointers similarly aligned.
+ */
+
+ shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size));
+
+ float const * lut_in = conversion.in()->lut_float (12, false);
+ int const * lut_out = conversion.out()->lut_int (16, true, 4095);
+
+ /* This is is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */
+ double transform[9];
+ combined_rgb_to_xyz (conversion, transform);
+
+ __m256i* xyz_x = reinterpret_cast<__m256i*>(xyz->data(0));
+ DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_x) % 32));
+ __m256i* xyz_y = reinterpret_cast<__m256i*>(xyz->data(1));
+ DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_y) % 32));
+ __m256i* xyz_z = reinterpret_cast<__m256i*>(xyz->data(2));
+ DCP_ASSERT (!(reinterpret_cast<uintptr_t>(xyz_z) % 32));
+
+ __m256 transform_x = _mm256_set_ps (
+ 0, transform[2], transform[1], transform[0], 0, transform[2], transform[1], transform[0]
+ );
+
+ __m256 transform_y = _mm256_set_ps (
+ 0, transform[5], transform[4], transform[3], 0, transform[5], transform[4], transform[3]
+ );
+
+ __m256 transform_z = _mm256_set_ps (
+ 0, transform[8], transform[7], transform[6], 0, transform[8], transform[7], transform[6]
+ );
+
+ int const pixels = size.width * size.height;
+ int const fast_loops = pixels / 8;
+ int const slow_loops = pixels - fast_loops * 8;
+
+ __m128i const * p = reinterpret_cast<__m128i const *> (rgb);
+ DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16));
+
+ for (int i = 0; i < fast_loops; ++i) {
+ // 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words
+ __m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p + 0));
+ __m256i rgb_B = _mm256_cvtepu16_epi32(_mm_load_si128(p + 1));
+ __m256i rgb_C = _mm256_cvtepu16_epi32(_mm_load_si128(p + 2));
+ __m256i rgb_D = _mm256_cvtepu16_epi32(_mm_load_si128(p + 3));
+ p += 4;
+
+ // shift right to truncate 16-bit inputs to 12-bit
+ rgb_A = _mm256_srli_epi32 (rgb_A, 4);
+ rgb_B = _mm256_srli_epi32 (rgb_B, 4);
+ rgb_C = _mm256_srli_epi32 (rgb_C, 4);
+ rgb_D = _mm256_srli_epi32 (rgb_D, 4);
+
+ // input gamma LUT
+ __m256 lut_1_A = _mm256_i32gather_ps (lut_in, rgb_A, 4);
+ __m256 lut_1_B = _mm256_i32gather_ps (lut_in, rgb_B, 4);
+ __m256 lut_1_C = _mm256_i32gather_ps (lut_in, rgb_C, 4);
+ __m256 lut_1_D = _mm256_i32gather_ps (lut_in, rgb_D, 4);
+
+ // multiply for X
+ __m256 x_A = _mm256_mul_ps (lut_1_A, transform_x);
+ __m256 x_B = _mm256_mul_ps (lut_1_B, transform_x);
+ __m256 x_C = _mm256_mul_ps (lut_1_C, transform_x);
+ __m256 x_D = _mm256_mul_ps (lut_1_D, transform_x);
+
+ // accumulate
+
+ // B7 B6 B5 B4 B3 B2 B1 B0 + A7 A6 A5 A4 A3 A2 A1 A0
+ // = B67 B45 A67 A45 B23 B01 A23 A01
+ x_A = _mm256_hadd_ps (x_A, x_B);
+
+ // D7 D6 D5 D4 D3 D2 D1 D0 + C7 C6 C5 C4 C3 C2 C1 C0
+ // = D67 D45 C67 C45 D23 D01 C23 C01
+ x_C = _mm256_hadd_ps (x_C, x_D);
+
+ // D67 D45 C67 C45 D23 D01 C23 C01 + B67 B45 A67 A45 B23 B01 A23 A01
+ // = D47 C47 B47 A47 D03 C03 B03 A03
+ x_A = _mm256_hadd_ps (x_A, x_C);
+
+ // same for Y
+ __m256 y_A = _mm256_mul_ps (lut_1_A, transform_y);
+ __m256 y_B = _mm256_mul_ps (lut_1_B, transform_y);
+ __m256 y_C = _mm256_mul_ps (lut_1_C, transform_y);
+ __m256 y_D = _mm256_mul_ps (lut_1_D, transform_y);
+ y_A = _mm256_hadd_ps (y_A, y_B);
+ y_C = _mm256_hadd_ps (y_C, y_D);
+ y_A = _mm256_hadd_ps (y_A, y_C);
+
+ // and for Z
+ __m256 z_A = _mm256_mul_ps (lut_1_A, transform_z);
+ __m256 z_B = _mm256_mul_ps (lut_1_B, transform_z);
+ __m256 z_C = _mm256_mul_ps (lut_1_C, transform_z);
+ __m256 z_D = _mm256_mul_ps (lut_1_D, transform_z);
+ z_A = _mm256_hadd_ps (z_A, z_B);
+ z_C = _mm256_hadd_ps (z_C, z_D);
+ z_A = _mm256_hadd_ps (z_A, z_C);
+
+ // clamp
+ x_A = _mm256_min_ps(x_A, _mm256_set1_ps(65535.0));
+ x_A = _mm256_max_ps(x_A, _mm256_set1_ps(0.0));
+ y_A = _mm256_min_ps(y_A, _mm256_set1_ps(65535.0));
+ y_A = _mm256_max_ps(y_A, _mm256_set1_ps(0.0));
+ z_A = _mm256_min_ps(z_A, _mm256_set1_ps(65535.0));
+ z_A = _mm256_max_ps(z_A, _mm256_set1_ps(0));
+
+ // round to int
+ __m256i lut_2_x = _mm256_cvtps_epi32(_mm256_floor_ps(x_A));
+ __m256i lut_2_y = _mm256_cvtps_epi32(_mm256_floor_ps(y_A));
+ __m256i lut_2_z = _mm256_cvtps_epi32(_mm256_floor_ps(z_A));
+
+ // out gamma LUT
+ lut_2_x = _mm256_i32gather_epi32 (lut_out, lut_2_x, 4);
+ lut_2_y = _mm256_i32gather_epi32 (lut_out, lut_2_y, 4);
+ lut_2_z = _mm256_i32gather_epi32 (lut_out, lut_2_z, 4);
+
+ // shuffle
+ lut_2_x = _mm256_permutevar8x32_epi32 (lut_2_x, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+ lut_2_y = _mm256_permutevar8x32_epi32 (lut_2_y, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+ lut_2_z = _mm256_permutevar8x32_epi32 (lut_2_z, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
+
+ // write to memory
+ _mm256_store_si256 (xyz_x, lut_2_x);
+ _mm256_store_si256 (xyz_y, lut_2_y);
+ _mm256_store_si256 (xyz_z, lut_2_z);
+ xyz_x++;
+ xyz_y++;
+ xyz_z++;
+ }
+
+ struct {
+ float r, g, b;
+ } s;
+
+ struct {
+ float x, y, z;
+ } d;
+
+ uint16_t const * p_slow = reinterpret_cast<uint16_t const *> (p);
+ int* xyz_x_slow = reinterpret_cast<int*> (xyz_x);
+ int* xyz_y_slow = reinterpret_cast<int*> (xyz_y);
+ int* xyz_z_slow = reinterpret_cast<int*> (xyz_z);
+
+ for (int i = 0; i < slow_loops; ++i) {
+ /* In gamma LUT (converting 16-bit to 12-bit) */
+ s.r = lut_in[*p_slow++ >> 4];
+ s.g = lut_in[*p_slow++ >> 4];
+ s.b = lut_in[*p_slow++ >> 4];
+ p_slow++;
+
+ /* RGB to XYZ, Bradford transform and DCI companding */
+ d.x = s.r * transform[0] + s.g * transform[1] + s.b * transform[2];
+ d.y = s.r * transform[3] + s.g * transform[4] + s.b * transform[5];
+ d.z = s.r * transform[6] + s.g * transform[7] + s.b * transform[8];
+
+ /* Clamp */
+ d.x = max (0.0f, d.x);
+ d.y = max (0.0f, d.y);
+ d.z = max (0.0f, d.z);
+ d.x = min (65535.0f, d.x);
+ d.y = min (65535.0f, d.y);
+ d.z = min (65535.0f, d.z);
+
+ /* Out gamma LUT */
+ *xyz_x_slow++ = lut_out[lrint(d.x)];
+ *xyz_y_slow++ = lut_out[lrint(d.y)];
+ *xyz_z_slow++ = lut_out[lrint(d.z)];
+ }
+
+ return xyz;
+}
diff --git a/src/rgb_xyz.h b/src/rgb_xyz.h
index e414d07e..1012c424 100644
--- a/src/rgb_xyz.h
+++ b/src/rgb_xyz.h
@@ -64,6 +64,14 @@ extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz (
ColourConversion const & conversion
);
+
+extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz_avx2 (
+ uint8_t const * rgb,
+ dcp::Size size,
+ ColourConversion const & conversion
+ );
+
+
extern void combined_rgb_to_xyz (ColourConversion const & conversion, double* matrix);
}