summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCarl Hetherington <cth@carlh.net>2020-06-02 16:01:57 +0200
committerCarl Hetherington <cth@carlh.net>2020-06-02 16:01:57 +0200
commit7c9ebaab68c80a14839e400cdd509b847357a794 (patch)
treea539f1e944354c98ec4a9af35d163fe1edba5b78
parent6016e4b24205e11b1f006e43a6ed4b7f2bde732a (diff)
Hackzzz.sse
-rw-r--r--src/rgb_xyz.cc119
-rw-r--r--src/rgb_xyz.h1
-rw-r--r--test/rgb_xyz_test.cc14
3 files changed, 85 insertions, 49 deletions
diff --git a/src/rgb_xyz.cc b/src/rgb_xyz.cc
index 219dc675..b29dd637 100644
--- a/src/rgb_xyz.cc
+++ b/src/rgb_xyz.cc
@@ -284,56 +284,44 @@ dcp::rgb_to_xyz (
{
shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size));
- struct {
- double r, g, b;
- } s;
-
- struct {
- double x, y, z;
- } d;
+ RGBPixel<double> rgb_pixel;
+ XYZPixel<double> xyz_pixel;
double const * lut_in = conversion.in()->lut (12, false);
- double const * lut_out = conversion.out()->lut (16, true);
+ int const * lut_out = conversion.out()->lut_int (16, true, 4095);
/* This is the product of the RGB to XYZ matrix, the Bradford transform and the DCI companding */
double fast_matrix[9];
combined_rgb_to_xyz (conversion, fast_matrix);
- int clamped = 0;
int* xyz_x = xyz->data (0);
int* xyz_y = xyz->data (1);
int* xyz_z = xyz->data (2);
for (int y = 0; y < size.height; ++y) {
uint16_t const * p = reinterpret_cast<uint16_t const *> (rgb + y * stride);
for (int x = 0; x < size.width; ++x) {
+ /* In gamma LUT (converting 16-bit to 12-bit) */
+ rgb_pixel.r = lut_in[*rgb++ >> 4];
+ rgb_pixel.g = lut_in[*rgb++ >> 4];
+ rgb_pixel.b = lut_in[*rgb++ >> 4];
+
+ /* RGB to XYZ, Bradford transform and DCI companding */
+ xyz_pixel.x = rgb_pixel.r * fast_matrix[0] + rgb_pixel.g * fast_matrix[1] + rgb_pixel.b * fast_matrix[2];
+ xyz_pixel.y = rgb_pixel.r * fast_matrix[3] + rgb_pixel.g * fast_matrix[4] + rgb_pixel.b * fast_matrix[5];
+ xyz_pixel.z = rgb_pixel.r * fast_matrix[6] + rgb_pixel.g * fast_matrix[7] + rgb_pixel.b * fast_matrix[8];
+
+ /* Clamp */
+ xyz_pixel.x = max (0.0, xyz_pixel.x);
+ xyz_pixel.y = max (0.0, xyz_pixel.y);
+ xyz_pixel.z = max (0.0, xyz_pixel.z);
+ xyz_pixel.x = min (65535.0, xyz_pixel.x);
+ xyz_pixel.y = min (65535.0, xyz_pixel.y);
+ xyz_pixel.z = min (65535.0, xyz_pixel.z);
- /* In gamma LUT (converting 16-bit to 12-bit) */
- s.r = lut_in[*p++ >> 4];
- s.g = lut_in[*p++ >> 4];
- s.b = lut_in[*p++ >> 4];
-
- /* RGB to XYZ, Bradford transform and DCI companding */
- d.x = s.r * fast_matrix[0] + s.g * fast_matrix[1] + s.b * fast_matrix[2];
- d.y = s.r * fast_matrix[3] + s.g * fast_matrix[4] + s.b * fast_matrix[5];
- d.z = s.r * fast_matrix[6] + s.g * fast_matrix[7] + s.b * fast_matrix[8];
-
- /* Clamp */
-
- if (d.x < 0 || d.y < 0 || d.z < 0 || d.x > 65535 || d.y > 65535 || d.z > 65535) {
- ++clamped;
- }
-
- d.x = max (0.0, d.x);
- d.y = max (0.0, d.y);
- d.z = max (0.0, d.z);
- d.x = min (65535.0, d.x);
- d.y = min (65535.0, d.y);
- d.z = min (65535.0, d.z);
-
- /* Out gamma LUT */
- *xyz_x++ = lrint (lut_out[lrint(d.x)] * 4095);
- *xyz_y++ = lrint (lut_out[lrint(d.y)] * 4095);
- *xyz_z++ = lrint (lut_out[lrint(d.z)] * 4095);
+ /* Out gamma LUT */
+ *xyz_x++ = lut_out[lrint(xyz_pixel.x)];
+ *xyz_y++ = lut_out[lrint(xyz_pixel.y)];
+ *xyz_z++ = lut_out[lrint(xyz_pixel.z)];
}
}
@@ -341,24 +329,31 @@ dcp::rgb_to_xyz (
}
-/** @param rgb RGBA data; packed RGBA 16:16:16:16, 48bpp, 16R, 16G, 16B, 16A
+/** @param rgb RGBA data; packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A
* with the 2-byte value for each R/G/B/A component stored as
* little-endian; i.e. AV_PIX_FMT_RGBA64LE. A is ignored but necessary for
- * SSE/AVX efficiency.
+ * SSE/AVX efficiency. The stride must be equal to the width of the image (i.e.
+ * no padding bytes).
*
* @param size size of RGB image in pixels.
- * @param size stride of RGB data in pixels.
*/
shared_ptr<dcp::OpenJPEGImage>
dcp::rgb_to_xyz_avx2 (
uint8_t const * rgba,
dcp::Size size,
- int stride,
ColourConversion const & conversion
)
{
shared_ptr<OpenJPEGImage> xyz (new OpenJPEGImage (size));
+ struct {
+ float r, g, b;
+ } s;
+
+ struct {
+ float x, y, z;
+ } d;
+
float const * lut_in = conversion.in()->lut_float (12, false);
int const * lut_out = conversion.out()->lut_int (16, true, 4095);
@@ -385,10 +380,18 @@ dcp::rgb_to_xyz_avx2 (
0, fast_matrix[8], fast_matrix[7], fast_matrix[6], 0, fast_matrix[8], fast_matrix[7], fast_matrix[6]
);
+ int pixel_count = 0;
+
for (int y = 0; y < size.height; ++y) {
- __m128i const * p = reinterpret_cast<__m128i const *> (rgba + y * stride);
+ uint16_t const* rgba_p = reinterpret_cast<uint16_t const *>(rgba + y * size.width * 8);
+ for (int x = 0; x < pixel_count % 8; ++x) {
+ rgb_to_xyz_pixel (rgba_p, lut_in, fast_matrix, lut_out, xyz_x, xyz_y, xyz_z, rgb_pixel, xyz_pixel);
+ }
+
+ __m128i const * p = reinterpret_cast<__m128i const *> (rgba + y * size.width * 8);
+
DCP_ASSERT (!(reinterpret_cast<uintptr_t>(p) % 16));
- for (int x = 0; x < size.width / 8; ++x) {
+ for (int x = 0; x < sse_width; ++x) {
// 2 pixels in each register, extended to 32-bit since we can't do gather with 16-bit words
__m256i rgb_A = _mm256_cvtepu16_epi32(_mm_load_si128(p + 0));
@@ -478,6 +481,38 @@ dcp::rgb_to_xyz_avx2 (
xyz_y++;
xyz_z++;
}
+
+ uint16_t const * p_extra = reinterpret_cast<uint16_t const *> (p);
+ int* xyz_x_extra = reinterpret_cast<int *> (xyz_x);
+ int* xyz_y_extra = reinterpret_cast<int *> (xyz_y);
+ int* xyz_z_extra = reinterpret_cast<int *> (xyz_z);
+
+ std::cout << "doing " << extra_width << " more.\n";
+ for (int x = 0; x < extra_width; ++x) {
+
+ /* In gamma LUT (converting 16-bit to 12-bit) */
+ s.r = lut_in[*p_extra++ >> 4];
+ s.g = lut_in[*p_extra++ >> 4];
+ s.b = lut_in[*p_extra++ >> 4];
+
+ /* RGB to XYZ, Bradford transform and DCI companding */
+ d.x = s.r * fast_matrix[0] + s.g * fast_matrix[1] + s.b * fast_matrix[2];
+ d.y = s.r * fast_matrix[3] + s.g * fast_matrix[4] + s.b * fast_matrix[5];
+ d.z = s.r * fast_matrix[6] + s.g * fast_matrix[7] + s.b * fast_matrix[8];
+
+ /* Clamp */
+ d.x = max (0.0f, d.x);
+ d.y = max (0.0f, d.y);
+ d.z = max (0.0f, d.z);
+ d.x = min (65535.0f, d.x);
+ d.y = min (65535.0f, d.y);
+ d.z = min (65535.0f, d.z);
+
+ /* Out gamma LUT */
+ *xyz_x_extra++ = lut_out[lrint(d.x)];
+ *xyz_y_extra++ = lut_out[lrint(d.y)];
+ *xyz_z_extra++ = lut_out[lrint(d.z)];
+ }
}
return xyz;
diff --git a/src/rgb_xyz.h b/src/rgb_xyz.h
index 59b2400d..ef4f2ac1 100644
--- a/src/rgb_xyz.h
+++ b/src/rgb_xyz.h
@@ -69,7 +69,6 @@ extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz (
extern boost::shared_ptr<OpenJPEGImage> rgb_to_xyz_avx2 (
uint8_t const * rgba,
dcp::Size size,
- int stride,
ColourConversion const & conversion
);
diff --git a/test/rgb_xyz_test.cc b/test/rgb_xyz_test.cc
index d9c27ae2..8a5ad205 100644
--- a/test/rgb_xyz_test.cc
+++ b/test/rgb_xyz_test.cc
@@ -51,7 +51,7 @@ using boost::scoped_array;
BOOST_AUTO_TEST_CASE (rgb_xyz_test)
{
srand (0);
- dcp::Size const size (640, 480);
+ dcp::Size const size (647, 485);
scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 6]);
for (int y = 0; y < size.height; ++y) {
@@ -125,11 +125,13 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test)
BOOST_AUTO_TEST_CASE (rgb_xyz_test_avx2)
{
srand (0);
- dcp::Size const size (640, 480);
+ dcp::Size const size (647, 485);
+
+ int stride = (size.width + (16 - size.width % 16)) * 8;
- scoped_array<uint8_t> rgb (new uint8_t[size.width * size.height * 8]);
+ scoped_array<uint8_t> rgb (new uint8_t[size.height * stride]);
for (int y = 0; y < size.height; ++y) {
- uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
+ uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * stride);
for (int x = 0; x < size.width; ++x) {
/* Write a 12-bit random number for each component */
for (int c = 0; c < 3; ++c) {
@@ -139,10 +141,10 @@ BOOST_AUTO_TEST_CASE (rgb_xyz_test_avx2)
}
}
- shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, size.width * 8, dcp::ColourConversion::srgb_to_xyz());
+ shared_ptr<dcp::OpenJPEGImage> xyz = dcp::rgb_to_xyz_avx2 (rgb.get(), size, stride, dcp::ColourConversion::srgb_to_xyz());
for (int y = 0; y < size.height; ++y) {
- uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * size.width * 8);
+ uint16_t* p = reinterpret_cast<uint16_t*> (rgb.get() + y * stride);
for (int x = 0; x < size.width; ++x) {
double cr = *p++ / 65535.0;