X-Git-Url: https://git.carlh.net/gitweb/?a=blobdiff_plain;f=src%2Flib%2Fimage.cc;h=2167918f8d1055c1c7fff274021797fb767b84bc;hb=HEAD;hp=9aecac8347a40bbbd0e7e4ff5a7fd2f85e608908;hpb=61fb89205e631cdb49161bbc4b321d5a93868120;p=dcpomatic.git

diff --git a/src/lib/image.cc b/src/lib/image.cc
index 9aecac834..2167918f8 100644
--- a/src/lib/image.cc
+++ b/src/lib/image.cc
@@ -27,6 +27,7 @@
 #include "compose.hpp"
 #include "dcpomatic_assert.h"
 #include "dcpomatic_socket.h"
+#include "enum_indexed_vector.h"
 #include "exceptions.h"
 #include "image.h"
 #include "maths_util.h"
@@ -236,10 +237,10 @@ Image::crop_scale_window (
 	}
 
 	DCPOMATIC_ASSERT (yuv_to_rgb < dcp::YUVToRGB::COUNT);
-	int const lut[static_cast<int>(dcp::YUVToRGB::COUNT)] = {
-		SWS_CS_ITU601,
-		SWS_CS_ITU709
-	};
+	EnumIndexedVector<int, dcp::YUVToRGB> lut;
+	lut[dcp::YUVToRGB::REC601] = SWS_CS_ITU601;
+	lut[dcp::YUVToRGB::REC709] = SWS_CS_ITU709;
+	lut[dcp::YUVToRGB::REC2020] = SWS_CS_BT2020;
 
 	/* The 3rd parameter here is:
 	   0 -> source range MPEG (i.e. "video", 16-235)
@@ -254,8 +255,8 @@ Image::crop_scale_window (
 	*/
 
 	sws_setColorspaceDetails (
 		scale_context,
-		sws_getCoefficients (lut[static_cast<int>(yuv_to_rgb)]), video_range == VideoRange::VIDEO ? 0 : 1,
-		sws_getCoefficients (lut[static_cast<int>(yuv_to_rgb)]), out_video_range == VideoRange::VIDEO ? 0 : 1,
+		sws_getCoefficients(lut[yuv_to_rgb]), video_range == VideoRange::VIDEO ? 0 : 1,
+		sws_getCoefficients(lut[yuv_to_rgb]), out_video_range == VideoRange::VIDEO ? 0 : 1,
 		0, 1 << 16, 1 << 16
 		);
@@ -327,7 +328,7 @@ Image::convert_pixel_format (dcp::YUVToRGB yuv_to_rgb, AVPixelFormat out_format,
 /** @param out_size Size to scale to.
  *  @param yuv_to_rgb YUVToRGB transform transform to use, if required.
  *  @param out_format Output pixel format.
- *  @param out_aligment Output alignment.
+ *  @param out_alignment Output alignment.
  *  @param fast Try to be fast at the possible expense of quality; at present this means using
  *  fast bilinear rather than bicubic scaling.
  */
@@ -338,6 +339,10 @@ Image::scale (dcp::Size out_size, dcp::YUVToRGB yuv_to_rgb, AVPixelFormat out_fo
 	   the input image alignment is not PADDED.
 	*/
 	DCPOMATIC_ASSERT (alignment() == Alignment::PADDED);
+	DCPOMATIC_ASSERT(size().width > 0);
+	DCPOMATIC_ASSERT(size().height > 0);
+	DCPOMATIC_ASSERT(out_size.width > 0);
+	DCPOMATIC_ASSERT(out_size.height > 0);
 
 	auto scaled = make_shared<Image>(out_format, out_size, out_alignment);
 	auto scale_context = sws_getContext (
@@ -346,11 +351,13 @@ Image::scale (dcp::Size out_size, dcp::YUVToRGB yuv_to_rgb, AVPixelFormat out_fo
 		(fast ? SWS_FAST_BILINEAR : SWS_BICUBIC) | SWS_ACCURATE_RND, 0, 0, 0
 		);
 
+	DCPOMATIC_ASSERT(scale_context);
+
 	DCPOMATIC_ASSERT (yuv_to_rgb < dcp::YUVToRGB::COUNT);
-	int const lut[static_cast<int>(dcp::YUVToRGB::COUNT)] = {
-		SWS_CS_ITU601,
-		SWS_CS_ITU709
-	};
+	EnumIndexedVector<int, dcp::YUVToRGB> lut;
+	lut[dcp::YUVToRGB::REC601] = SWS_CS_ITU601;
+	lut[dcp::YUVToRGB::REC709] = SWS_CS_ITU709;
+	lut[dcp::YUVToRGB::REC2020] = SWS_CS_BT2020;
 
 	/* The 3rd parameter here is:
 	   0 -> source range MPEG (i.e. "video", 16-235)
@@ -365,8 +372,8 @@ Image::scale (dcp::Size out_size, dcp::YUVToRGB yuv_to_rgb, AVPixelFormat out_fo
 	*/
 
 	sws_setColorspaceDetails (
 		scale_context,
-		sws_getCoefficients (lut[static_cast<int>(yuv_to_rgb)]), 0,
-		sws_getCoefficients (lut[static_cast<int>(yuv_to_rgb)]), 0,
+		sws_getCoefficients(lut[yuv_to_rgb]), 0,
+		sws_getCoefficients(lut[yuv_to_rgb]), 0,
 		0, 1 << 16, 1 << 16
 		);
@@ -618,7 +625,7 @@ Image::make_black ()
 void
 Image::make_transparent ()
 {
-	if (_pixel_format != AV_PIX_FMT_BGRA && _pixel_format != AV_PIX_FMT_RGBA) {
+	if (_pixel_format != AV_PIX_FMT_BGRA && _pixel_format != AV_PIX_FMT_RGBA && _pixel_format != AV_PIX_FMT_RGBA64BE) {
 		throw PixelFormatError ("make_transparent()", _pixel_format);
 	}
@@ -626,16 +633,333 @@ Image::make_transparent ()
 }
 
 
+struct TargetParams
+{
+	int start_x;
+	int start_y;
+	dcp::Size size;
+	uint8_t* const* data;
+	int const* stride;
+	int bpp;
+
+	uint8_t* line_pointer(int y) const {
+		return data[0] + y * stride[0] + start_x * bpp;
+	}
+};
+
+
+/** Parameters of the other image (the one being blended onto the target) when target and other are RGB */
+struct OtherRGBParams
+{
+	int start_x;
+	int start_y;
+	dcp::Size size;
+	uint8_t* const* data;
+	int const* stride;
+	int bpp;
+
+	uint8_t* line_pointer(int y) const {
+		return data[0] + y * stride[0];
+	}
+
+	float alpha_divisor() const {
+		return pow(2, bpp * 2) - 1;
+	}
+};
+
+
+/** Parameters of the other image (the one being blended onto the target) when target and other are YUV */
+struct OtherYUVParams
+{
+	int start_x;
+	int start_y;
+	dcp::Size size;
+	uint8_t* const* data;
+	int const* stride;
+
+	uint8_t* const* alpha_data;
+	int const* alpha_stride;
+	int alpha_bpp;
+};
+
+
+template <class OtherType>
+void
+alpha_blend_onto_rgb24(TargetParams const& target, OtherRGBParams const& other, int red, int blue, std::function<float (OtherType*)> get, int value_divisor)
+{
+	/* Going onto RGB24.  First byte is red, second green, third blue */
+	auto const alpha_divisor = other.alpha_divisor();
+	for (int ty = target.start_y, oy = other.start_y; ty < target.size.height && oy < other.size.height; ++ty, ++oy) {
+		auto tp = target.line_pointer(ty);
+		auto op = reinterpret_cast<OtherType*>(other.line_pointer(oy));
+		for (int tx = target.start_x, ox = other.start_x; tx < target.size.width && ox < other.size.width; ++tx, ++ox) {
+			float const alpha = get(op + 3) / alpha_divisor;
+			tp[0] = (get(op + red) / value_divisor) * alpha + tp[0] * (1 - alpha);
+			tp[1] = (get(op + 1) / value_divisor) * alpha + tp[1] * (1 - alpha);
+			tp[2] = (get(op + blue) / value_divisor) * alpha + tp[2] * (1 - alpha);
+
+			tp += target.bpp;
+			op += other.bpp / sizeof(OtherType);
+		}
+	}
+}
+
+
+template <class OtherType>
+void
+alpha_blend_onto_bgra(TargetParams const& target, OtherRGBParams const& other, int red, int blue, std::function<float (OtherType*)> get, int value_divisor)
+{
+	auto const alpha_divisor = other.alpha_divisor();
+	for (int ty = target.start_y, oy = other.start_y; ty < target.size.height && oy < other.size.height; ++ty, ++oy) {
+		auto tp = target.line_pointer(ty);
+		auto op = reinterpret_cast<OtherType*>(other.line_pointer(oy));
+		for (int tx = target.start_x, ox = other.start_x; tx < target.size.width && ox < other.size.width; ++tx, ++ox) {
+			float const alpha = get(op + 3) / alpha_divisor;
+			tp[0] = (get(op + blue) / value_divisor) * alpha + tp[0] * (1 - alpha);
+			tp[1] = (get(op + 1) / value_divisor) * alpha + tp[1] * (1 - alpha);
+			tp[2] = (get(op + red) / value_divisor) * alpha + tp[2] * (1 - alpha);
+			tp[3] = (get(op + 3) / value_divisor) * alpha + tp[3] * (1 - alpha);
+
+			tp += target.bpp;
+			op += other.bpp / sizeof(OtherType);
+		}
+	}
+}
+
+
+template <class OtherType>
+void
+alpha_blend_onto_rgba(TargetParams const& target, OtherRGBParams const& other, int red, int blue, std::function<float (OtherType*)> get, int value_divisor)
+{
+	auto const alpha_divisor = other.alpha_divisor();
+	for (int ty = target.start_y, oy = other.start_y; ty < target.size.height && oy < other.size.height; ++ty, ++oy) {
+		auto tp = target.line_pointer(ty);
+		auto op = reinterpret_cast<OtherType*>(other.line_pointer(oy));
+		for (int tx = target.start_x, ox = other.start_x; tx < target.size.width && ox < other.size.width; ++tx, ++ox) {
+			float const alpha = get(op + 3) / alpha_divisor;
+			tp[0] = (get(op + red) / value_divisor) * alpha + tp[0] * (1 - alpha);
+			tp[1] = (get(op + 1) / value_divisor) * alpha + tp[1] * (1 - alpha);
+			tp[2] = (get(op + blue) / value_divisor) * alpha + tp[2] * (1 - alpha);
+			tp[3] = (get(op + 3) / value_divisor) * alpha + tp[3] * (1 - alpha);
+
+			tp += target.bpp;
+			op += other.bpp / sizeof(OtherType);
+		}
+	}
+}
+
+
+template <class OtherType>
+void
+alpha_blend_onto_rgb48le(TargetParams const& target, OtherRGBParams const& other, int red, int blue, std::function<float (OtherType*)> get, int value_scale)
+{
+	auto const alpha_divisor = other.alpha_divisor();
+	for (int ty = target.start_y, oy = other.start_y; ty < target.size.height && oy < other.size.height; ++ty, ++oy) {
+		auto tp = reinterpret_cast<uint16_t*>(target.line_pointer(ty));
+		auto op = reinterpret_cast<OtherType*>(other.line_pointer(oy));
+		for (int tx = target.start_x, ox = other.start_x; tx < target.size.width && ox < other.size.width; ++tx, ++ox) {
+			float const alpha = get(op + 3) / alpha_divisor;
+			tp[0] = get(op + red) * value_scale * alpha + tp[0] * (1 - alpha);
+			tp[1] = get(op + 1) * value_scale * alpha + tp[1] * (1 - alpha);
+			tp[2] = get(op + blue) * value_scale * alpha + tp[2] * (1 - alpha);
+
+			tp += target.bpp / 2;
+			op += other.bpp / sizeof(OtherType);
+		}
+	}
+}
+
+
+template <class OtherType>
+void
+alpha_blend_onto_xyz12le(TargetParams const& target, OtherRGBParams const& other, int red, int blue, std::function<float (OtherType*)> get, int value_divisor)
+{
+	auto const alpha_divisor = other.alpha_divisor();
+	auto conv = dcp::ColourConversion::srgb_to_xyz();
+	double fast_matrix[9];
+	dcp::combined_rgb_to_xyz(conv, fast_matrix);
+	auto lut_in = conv.in()->double_lut(0, 1, 8, false);
+	auto lut_out = conv.out()->int_lut(0, 1, 16, true, 65535);
+	for (int ty = target.start_y, oy = other.start_y; ty < target.size.height && oy < other.size.height; ++ty, ++oy) {
+		auto tp = reinterpret_cast<uint16_t*>(target.data[0] + ty * target.stride[0] + target.start_x * target.bpp);
+		auto op = reinterpret_cast<OtherType*>(other.data[0] + oy * other.stride[0]);
+		for (int tx = target.start_x, ox = other.start_x; tx < target.size.width && ox < other.size.width; ++tx, ++ox) {
+			float const alpha = get(op + 3) / alpha_divisor;
+
+			/* Convert sRGB to XYZ; op is BGRA.  First, input gamma LUT */
+			double const r = lut_in[get(op + red) / value_divisor];
+			double const g = lut_in[get(op + 1) / value_divisor];
+			double const b = lut_in[get(op + blue) / value_divisor];
+
+			/* RGB to XYZ, including Bradford transform and DCI companding */
+			double const x = max(0.0, min(1.0, r * fast_matrix[0] + g * fast_matrix[1] + b * fast_matrix[2]));
+			double const y = max(0.0, min(1.0, r * fast_matrix[3] + g * fast_matrix[4] + b * fast_matrix[5]));
+			double const z = max(0.0, min(1.0, r * fast_matrix[6] + g * fast_matrix[7] + b * fast_matrix[8]));
+
+			/* Out gamma LUT and blend */
+			tp[0] = lut_out[lrint(x * 65535)] * alpha + tp[0] * (1 - alpha);
+			tp[1] = lut_out[lrint(y * 65535)] * alpha + tp[1] * (1 - alpha);
+			tp[2] = lut_out[lrint(z * 65535)] * alpha + tp[2] * (1 - alpha);
+
+			tp += target.bpp / 2;
+			op += other.bpp / sizeof(OtherType);
+		}
+	}
+}
+
+
+static
+void
+alpha_blend_onto_yuv420p(TargetParams const& target, OtherYUVParams const& other, std::function<float (uint8_t*)> get_alpha)
+{
+	auto const ts = target.size;
+	auto const os = other.size;
+	for (int ty = target.start_y, oy = other.start_y; ty < ts.height && oy < os.height; ++ty, ++oy) {
+		int const hty = ty / 2;
+		int const hoy = oy / 2;
+		uint8_t* tY = target.data[0] + (ty * target.stride[0]) + target.start_x;
+		uint8_t* tU = target.data[1] + (hty * target.stride[1]) + target.start_x / 2;
+		uint8_t* tV = target.data[2] + (hty * target.stride[2]) + target.start_x / 2;
+		uint8_t* oY = other.data[0] + (oy * other.stride[0]) + other.start_x;
+		uint8_t* oU = other.data[1] + (hoy * other.stride[1]) + other.start_x / 2;
+		uint8_t* oV = other.data[2] + (hoy * other.stride[2]) + other.start_x / 2;
+		uint8_t* alpha = other.alpha_data[0] + (oy * other.alpha_stride[0]) + other.start_x * other.alpha_bpp;
+		for (int tx = target.start_x, ox = other.start_x; tx < ts.width && ox < os.width; ++tx, ++ox) {
+			float const a = get_alpha(alpha);
+			*tY = *oY * a + *tY * (1 - a);
+			*tU = *oU * a + *tU * (1 - a);
+			*tV = *oV * a + *tV * (1 - a);
+			++tY;
+			++oY;
+			if (tx % 2) {
+				++tU;
+				++tV;
+			}
+			if (ox % 2) {
+				++oU;
+				++oV;
+			}
+			alpha += other.alpha_bpp;
+		}
+	}
+}
+
+
+static
+void
+alpha_blend_onto_yuv420p10(TargetParams const& target, OtherYUVParams const& other, std::function<float (uint8_t*)> get_alpha)
+{
+	auto const ts = target.size;
+	auto const os = other.size;
+	for (int ty = target.start_y, oy = other.start_y; ty < ts.height && oy < os.height; ++ty, ++oy) {
+		int const hty = ty / 2;
+		int const hoy = oy / 2;
+		uint16_t* tY = reinterpret_cast<uint16_t*>(target.data[0] + (ty * target.stride[0])) + target.start_x;
+		uint16_t* tU = reinterpret_cast<uint16_t*>(target.data[1] + (hty * target.stride[1])) + target.start_x / 2;
+		uint16_t* tV = reinterpret_cast<uint16_t*>(target.data[2] + (hty * target.stride[2])) + target.start_x / 2;
+		uint16_t* oY = reinterpret_cast<uint16_t*>(other.data[0] + (oy * other.stride[0])) + other.start_x;
+		uint16_t* oU = reinterpret_cast<uint16_t*>(other.data[1] + (hoy * other.stride[1])) + other.start_x / 2;
+		uint16_t* oV = reinterpret_cast<uint16_t*>(other.data[2] + (hoy * other.stride[2])) + other.start_x / 2;
+		uint8_t* alpha = other.alpha_data[0] + (oy * other.alpha_stride[0]) + other.start_x * other.alpha_bpp;
+		for (int tx = target.start_x, ox = other.start_x; tx < ts.width && ox < os.width; ++tx, ++ox) {
+			float const a = get_alpha(alpha);
+			*tY = *oY * a + *tY * (1 - a);
+			*tU = *oU * a + *tU * (1 - a);
+			*tV = *oV * a + *tV * (1 - a);
+			++tY;
+			++oY;
+			if (tx % 2) {
+				++tU;
+				++tV;
+			}
+			if (ox % 2) {
+				++oU;
+				++oV;
+			}
+			alpha += other.alpha_bpp;
+		}
+	}
+}
+
+
+static
+void
+alpha_blend_onto_yuv422p9or10le(TargetParams const& target, OtherYUVParams const& other, std::function<float (uint8_t*)> get_alpha)
+{
+	auto const ts = target.size;
+	auto const os = other.size;
+	for (int ty = target.start_y, oy = other.start_y; ty < ts.height && oy < os.height; ++ty, ++oy) {
+		uint16_t* tY = reinterpret_cast<uint16_t*>(target.data[0] + (ty * target.stride[0])) + target.start_x;
+		uint16_t* tU = reinterpret_cast<uint16_t*>(target.data[1] + (ty * target.stride[1])) + target.start_x / 2;
+		uint16_t* tV = reinterpret_cast<uint16_t*>(target.data[2] + (ty * target.stride[2])) + target.start_x / 2;
+		uint16_t* oY = reinterpret_cast<uint16_t*>(other.data[0] + (oy * other.stride[0])) + other.start_x;
+		uint16_t* oU = reinterpret_cast<uint16_t*>(other.data[1] + (oy * other.stride[1])) + other.start_x / 2;
+		uint16_t* oV = reinterpret_cast<uint16_t*>(other.data[2] + (oy * other.stride[2])) + other.start_x / 2;
+		uint8_t* alpha = other.alpha_data[0] + (oy * other.alpha_stride[0]) + other.start_x * other.alpha_bpp;
+		for (int tx = target.start_x, ox = other.start_x; tx < ts.width && ox < os.width; ++tx, ++ox) {
+			float const a = get_alpha(alpha);
+			*tY = *oY * a + *tY * (1 - a);
+			*tU = *oU * a + *tU * (1 - a);
+			*tV = *oV * a + *tV * (1 - a);
+			++tY;
+			++oY;
+			if (tx % 2) {
+				++tU;
+				++tV;
+			}
+			if (ox % 2) {
+				++oU;
+				++oV;
+			}
+			alpha += other.alpha_bpp;
+		}
+	}
+}
+
+
+static
+void
+alpha_blend_onto_yuv444p9or10le(TargetParams const& target, OtherYUVParams const& other, std::function<float (uint8_t*)> get_alpha)
+{
+	auto const ts = target.size;
+	auto const os = other.size;
+	for (int ty = target.start_y, oy = other.start_y; ty < ts.height && oy < os.height; ++ty, ++oy) {
+		uint16_t* tY = reinterpret_cast<uint16_t*>(target.data[0] + (ty * target.stride[0])) + target.start_x;
+		uint16_t* tU = reinterpret_cast<uint16_t*>(target.data[1] + (ty * target.stride[1])) + target.start_x;
+		uint16_t* tV = reinterpret_cast<uint16_t*>(target.data[2] + (ty * target.stride[2])) + target.start_x;
+		uint16_t* oY = reinterpret_cast<uint16_t*>(other.data[0] + (oy * other.stride[0])) + other.start_x;
+		uint16_t* oU = reinterpret_cast<uint16_t*>(other.data[1] + (oy * other.stride[1])) + other.start_x;
+		uint16_t* oV = reinterpret_cast<uint16_t*>(other.data[2] + (oy * other.stride[2])) + other.start_x;
+		uint8_t* alpha = other.alpha_data[0] + (oy * other.alpha_stride[0]) + other.start_x * other.alpha_bpp;
+		for (int tx = target.start_x, ox = other.start_x; tx < ts.width && ox < os.width; ++tx, ++ox) {
+			float const a = get_alpha(alpha);
+			*tY = *oY * a + *tY * (1 - a);
+			*tU = *oU * a + *tU * (1 - a);
+			*tV = *oV * a + *tV * (1 - a);
+			++tY;
+			++oY;
+			++tU;
+			++tV;
+			++oU;
+			++oV;
+			alpha += other.alpha_bpp;
+		}
+	}
+}
+
+
 void
 Image::alpha_blend (shared_ptr<const Image> other, Position position)
 {
-	/* We're blending RGBA or BGRA images */
-	DCPOMATIC_ASSERT (other->pixel_format() == AV_PIX_FMT_BGRA || other->pixel_format() == AV_PIX_FMT_RGBA);
+	DCPOMATIC_ASSERT(
+		other->pixel_format() == AV_PIX_FMT_BGRA ||
+		other->pixel_format() == AV_PIX_FMT_RGBA ||
+		other->pixel_format() == AV_PIX_FMT_RGBA64BE
+		);
+
 	int const blue = other->pixel_format() == AV_PIX_FMT_BGRA ? 0 : 2;
 	int const red = other->pixel_format() == AV_PIX_FMT_BGRA ? 2 : 0;
 
-	int const other_bpp = 4;
-
 	int start_tx = position.x;
 	int start_ox = 0;
@@ -652,218 +976,147 @@ Image::alpha_blend (shared_ptr<const Image> other, Position position)
 		start_ty = 0;
 	}
 
+	TargetParams target_params = {
+		start_tx,
+		start_ty,
+		size(),
+		data(),
+		stride(),
+		0
+	};
+
+	OtherRGBParams other_rgb_params = {
+		start_ox,
+		start_oy,
+		other->size(),
+		other->data(),
+		other->stride(),
+		other->pixel_format() == AV_PIX_FMT_RGBA64BE ? 8 : 4
+	};
+
+	OtherYUVParams other_yuv_params = {
+		start_ox,
+		start_oy,
+		other->size(),
+		other->data(),
+		other->stride(),
+		nullptr,
+		nullptr,
+		other->pixel_format() == AV_PIX_FMT_RGBA64BE ? 8 : 4
+	};
+
+	auto byteswap = [](uint16_t* p) {
+		return (*p >> 8) | ((*p & 0xff) << 8);
+	};
+
+	auto pass = [](uint8_t* p) {
+		return *p;
+	};
+
+	auto get_alpha_64be = [](uint8_t* p) {
+		return ((static_cast<int>(p[6]) << 8) | p[7]) / 65535.0f;
+	};
+
+	auto get_alpha_byte = [](uint8_t* p) {
+		return p[3] / 255.0f;
+	};
+
 	switch (_pixel_format) {
 	case AV_PIX_FMT_RGB24:
-	{
-		/* Going onto RGB24.  First byte is red, second green, third blue */
-		int const this_bpp = 3;
-		for (int ty = start_ty, oy = start_oy; ty < size().height && oy < other->size().height; ++ty, ++oy) {
-			uint8_t* tp = data()[0] + ty * stride()[0] + start_tx * this_bpp;
-			uint8_t* op = other->data()[0] + oy * other->stride()[0];
-			for (int tx = start_tx, ox = start_ox; tx < size().width && ox < other->size().width; ++tx, ++ox) {
-				float const alpha = float (op[3]) / 255;
-				tp[0] = op[red] * alpha + tp[0] * (1 - alpha);
-				tp[1] = op[1] * alpha + tp[1] * (1 - alpha);
-				tp[2] = op[blue] * alpha + tp[2] * (1 - alpha);
-
-				tp += this_bpp;
-				op += other_bpp;
-			}
+		target_params.bpp = 3;
+		if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+			alpha_blend_onto_rgb24<uint16_t>(target_params, other_rgb_params, red, blue, byteswap, 256);
+		} else {
+			alpha_blend_onto_rgb24<uint8_t>(target_params, other_rgb_params, red, blue, pass, 1);
 		}
 		break;
-	}
 	case AV_PIX_FMT_BGRA:
-	{
-		int const this_bpp = 4;
-		for (int ty = start_ty, oy = start_oy; ty < size().height && oy < other->size().height; ++ty, ++oy) {
-			uint8_t* tp = data()[0] + ty * stride()[0] + start_tx * this_bpp;
-			uint8_t* op = other->data()[0] + oy * other->stride()[0];
-			for (int tx = start_tx, ox = start_ox; tx < size().width && ox < other->size().width; ++tx, ++ox) {
-				float const alpha = float (op[3]) / 255;
-				tp[0] = op[blue] * alpha + tp[0] * (1 - alpha);
-				tp[1] = op[1] * alpha + tp[1] * (1 - alpha);
-				tp[2] = op[red] * alpha + tp[2] * (1 - alpha);
-				tp[3] = op[3] * alpha + tp[3] * (1 - alpha);
-
-				tp += this_bpp;
-				op += other_bpp;
-			}
+		target_params.bpp = 4;
+		if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+			alpha_blend_onto_bgra<uint16_t>(target_params, other_rgb_params, red, blue, byteswap, 256);
+		} else {
+			alpha_blend_onto_bgra<uint8_t>(target_params, other_rgb_params, red, blue, pass, 1);
 		}
 		break;
-	}
 	case AV_PIX_FMT_RGBA:
-	{
-		int const this_bpp = 4;
-		for (int ty = start_ty, oy = start_oy; ty < size().height && oy < other->size().height; ++ty, ++oy) {
-			uint8_t* tp = data()[0] + ty * stride()[0] + start_tx * this_bpp;
-			uint8_t* op = other->data()[0] + oy * other->stride()[0];
-			for (int tx = start_tx, ox = start_ox; tx < size().width && ox < other->size().width; ++tx, ++ox) {
-				float const alpha = float (op[3]) / 255;
-				tp[0] = op[red] * alpha + tp[0] * (1 - alpha);
-				tp[1] = op[1] * alpha + tp[1] * (1 - alpha);
-				tp[2] = op[blue] * alpha + tp[2] * (1 - alpha);
-				tp[3] = op[3] * alpha + tp[3] * (1 - alpha);
-
-				tp += this_bpp;
-				op += other_bpp;
-			}
+		target_params.bpp = 4;
+		if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+			alpha_blend_onto_rgba<uint16_t>(target_params, other_rgb_params, red, blue, byteswap, 256);
+		} else {
+			alpha_blend_onto_rgba<uint8_t>(target_params, other_rgb_params, red, blue, pass, 1);
 		}
 		break;
-	}
 	case AV_PIX_FMT_RGB48LE:
-	{
-		int const this_bpp = 6;
-		for (int ty = start_ty, oy = start_oy; ty < size().height && oy < other->size().height; ++ty, ++oy) {
-			uint8_t* tp = data()[0] + ty * stride()[0] + start_tx * this_bpp;
-			uint8_t* op = other->data()[0] + oy * other->stride()[0];
-			for (int tx = start_tx, ox = start_ox; tx < size().width && ox < other->size().width; ++tx, ++ox) {
-				float const alpha = float (op[3]) / 255;
-				/* Blend high bytes */
-				tp[1] = op[red] * alpha + tp[1] * (1 - alpha);
-				tp[3] = op[1] * alpha + tp[3] * (1 - alpha);
-				tp[5] = op[blue] * alpha + tp[5] * (1 - alpha);
-
-				tp += this_bpp;
-				op += other_bpp;
-			}
+		target_params.bpp = 6;
+		if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+			alpha_blend_onto_rgb48le<uint16_t>(target_params, other_rgb_params, red, blue, byteswap, 1);
+		} else {
+			alpha_blend_onto_rgb48le<uint8_t>(target_params, other_rgb_params, red, blue, pass, 256);
 		}
 		break;
-	}
 	case AV_PIX_FMT_XYZ12LE:
-	{
-		auto conv = dcp::ColourConversion::srgb_to_xyz();
-		double fast_matrix[9];
-		dcp::combined_rgb_to_xyz (conv, fast_matrix);
-		auto lut_in = conv.in()->lut(0, 1, 8, false);
-		auto lut_out = conv.out()->lut(0, 1, 16, true);
-		int const this_bpp = 6;
-		for (int ty = start_ty, oy = start_oy; ty < size().height && oy < other->size().height; ++ty, ++oy) {
-			uint16_t* tp = reinterpret_cast<uint16_t*> (data()[0] + ty * stride()[0] + start_tx * this_bpp);
-			uint8_t* op = other->data()[0] + oy * other->stride()[0];
-			for (int tx = start_tx, ox = start_ox; tx < size().width && ox < other->size().width; ++tx, ++ox) {
-				float const alpha = float (op[3]) / 255;
-
-				/* Convert sRGB to XYZ; op is BGRA.  First, input gamma LUT */
-				double const r = lut_in[op[red]];
-				double const g = lut_in[op[1]];
-				double const b = lut_in[op[blue]];
-
-				/* RGB to XYZ, including Bradford transform and DCI companding */
-				double const x = max(0.0, min(1.0, r * fast_matrix[0] + g * fast_matrix[1] + b * fast_matrix[2]));
-				double const y = max(0.0, min(1.0, r * fast_matrix[3] + g * fast_matrix[4] + b * fast_matrix[5]));
-				double const z = max(0.0, min(1.0, r * fast_matrix[6] + g * fast_matrix[7] + b * fast_matrix[8]));
-
-				/* Out gamma LUT and blend */
-				tp[0] = lrint(lut_out[lrint(x * 65535)] * 65535) * alpha + tp[0] * (1 - alpha);
-				tp[1] = lrint(lut_out[lrint(y * 65535)] * 65535) * alpha + tp[1] * (1 - alpha);
-				tp[2] = lrint(lut_out[lrint(z * 65535)] * 65535) * alpha + tp[2] * (1 - alpha);
-
-				tp += this_bpp / 2;
-				op += other_bpp;
-			}
+		target_params.bpp = 6;
+		if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+			alpha_blend_onto_xyz12le<uint16_t>(target_params, other_rgb_params, red, blue, byteswap, 256);
+		} else {
+			alpha_blend_onto_xyz12le<uint8_t>(target_params, other_rgb_params, red, blue, pass, 1);
 		}
 		break;
-	}
 	case AV_PIX_FMT_YUV420P:
 	{
 		auto yuv = other->convert_pixel_format (dcp::YUVToRGB::REC709, _pixel_format, Alignment::COMPACT, false);
-		dcp::Size const ts = size();
-		dcp::Size const os = yuv->size();
-		for (int ty = start_ty, oy = start_oy; ty < ts.height && oy < os.height; ++ty, ++oy) {
-			int const hty = ty / 2;
-			int const hoy = oy / 2;
-			uint8_t* tY = data()[0] + (ty * stride()[0]) + start_tx;
-			uint8_t* tU = data()[1] + (hty * stride()[1]) + start_tx / 2;
-			uint8_t* tV = data()[2] + (hty * stride()[2]) + start_tx / 2;
-			uint8_t* oY = yuv->data()[0] + (oy * yuv->stride()[0]) + start_ox;
-			uint8_t* oU = yuv->data()[1] + (hoy * yuv->stride()[1]) + start_ox / 2;
-			uint8_t* oV = yuv->data()[2] + (hoy * yuv->stride()[2]) + start_ox / 2;
-			uint8_t* alpha = other->data()[0] + (oy * other->stride()[0]) + start_ox * 4;
-			for (int tx = start_tx, ox = start_ox; tx < ts.width && ox < os.width; ++tx, ++ox) {
-				float const a = float(alpha[3]) / 255;
-				*tY = *oY * a + *tY * (1 - a);
-				*tU = *oU * a + *tU * (1 - a);
-				*tV = *oV * a + *tV * (1 - a);
-				++tY;
-				++oY;
-				if (tx % 2) {
-					++tU;
-					++tV;
-				}
-				if (ox % 2) {
-					++oU;
-					++oV;
-				}
-				alpha += 4;
-			}
+		other_yuv_params.data = yuv->data();
+		other_yuv_params.stride = yuv->stride();
+		other_yuv_params.alpha_data = other->data();
+		other_yuv_params.alpha_stride = other->stride();
+		if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+			alpha_blend_onto_yuv420p(target_params, other_yuv_params, get_alpha_64be);
+		} else {
+			alpha_blend_onto_yuv420p(target_params, other_yuv_params, get_alpha_byte);
 		}
 		break;
 	}
 	case AV_PIX_FMT_YUV420P10:
 	{
 		auto yuv = other->convert_pixel_format (dcp::YUVToRGB::REC709, _pixel_format, Alignment::COMPACT, false);
-		dcp::Size const ts = size();
-		dcp::Size const os = yuv->size();
-		for (int ty = start_ty, oy = start_oy; ty < ts.height && oy < os.height; ++ty, ++oy) {
-			int const hty = ty / 2;
-			int const hoy = oy / 2;
-			uint16_t* tY = ((uint16_t *) (data()[0] + (ty * stride()[0]))) + start_tx;
-			uint16_t* tU = ((uint16_t *) (data()[1] + (hty * stride()[1]))) + start_tx / 2;
-			uint16_t* tV = ((uint16_t *) (data()[2] + (hty * stride()[2]))) + start_tx / 2;
-			uint16_t* oY = ((uint16_t *) (yuv->data()[0] + (oy * yuv->stride()[0]))) + start_ox;
-			uint16_t* oU = ((uint16_t *) (yuv->data()[1] + (hoy * yuv->stride()[1]))) + start_ox / 2;
-			uint16_t* oV = ((uint16_t *) (yuv->data()[2] + (hoy * yuv->stride()[2]))) + start_ox / 2;
-			uint8_t* alpha = other->data()[0] + (oy * other->stride()[0]) + start_ox * 4;
-			for (int tx = start_tx, ox = start_ox; tx < ts.width && ox < os.width; ++tx, ++ox) {
-				float const a = float(alpha[3]) / 255;
-				*tY = *oY * a + *tY * (1 - a);
-				*tU = *oU * a + *tU * (1 - a);
-				*tV = *oV * a + *tV * (1 - a);
-				++tY;
-				++oY;
-				if (tx % 2) {
-					++tU;
-					++tV;
-				}
-				if (ox % 2) {
-					++oU;
-					++oV;
-				}
-				alpha += 4;
-			}
+		other_yuv_params.data = yuv->data();
+		other_yuv_params.stride = yuv->stride();
+		other_yuv_params.alpha_data = other->data();
+		other_yuv_params.alpha_stride = other->stride();
+		if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+			alpha_blend_onto_yuv420p10(target_params, other_yuv_params, get_alpha_64be);
+		} else {
+			alpha_blend_onto_yuv420p10(target_params, other_yuv_params, get_alpha_byte);
 		}
 		break;
 	}
+	case AV_PIX_FMT_YUV422P9LE:
 	case AV_PIX_FMT_YUV422P10LE:
 	{
 		auto yuv = other->convert_pixel_format (dcp::YUVToRGB::REC709, _pixel_format, Alignment::COMPACT, false);
-		dcp::Size const ts = size();
-		dcp::Size const os = yuv->size();
-		for (int ty = start_ty, oy = start_oy; ty < ts.height && oy < os.height; ++ty, ++oy) {
-			uint16_t* tY = ((uint16_t *) (data()[0] + (ty * stride()[0]))) + start_tx;
-			uint16_t* tU = ((uint16_t *) (data()[1] + (ty * stride()[1]))) + start_tx / 2;
-			uint16_t* tV = ((uint16_t *) (data()[2] + (ty * stride()[2]))) + start_tx / 2;
-			uint16_t* oY = ((uint16_t *) (yuv->data()[0] + (oy * yuv->stride()[0]))) + start_ox;
-			uint16_t* oU = ((uint16_t *) (yuv->data()[1] + (oy * yuv->stride()[1]))) + start_ox / 2;
-			uint16_t* oV = ((uint16_t *) (yuv->data()[2] + (oy * yuv->stride()[2]))) + start_ox / 2;
-			uint8_t* alpha = other->data()[0] + (oy * other->stride()[0]) + start_ox * 4;
-			for (int tx = start_tx, ox = start_ox; tx < ts.width && ox < os.width; ++tx, ++ox) {
-				float const a = float(alpha[3]) / 255;
-				*tY = *oY * a + *tY * (1 - a);
-				*tU = *oU * a + *tU * (1 - a);
-				*tV = *oV * a + *tV * (1 - a);
-				++tY;
-				++oY;
-				if (tx % 2) {
-					++tU;
-					++tV;
-				}
-				if (ox % 2) {
-					++oU;
-					++oV;
-				}
-				alpha += 4;
-			}
+		other_yuv_params.data = yuv->data();
+		other_yuv_params.stride = yuv->stride();
+		other_yuv_params.alpha_data = other->data();
+		other_yuv_params.alpha_stride = other->stride();
+		if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+			alpha_blend_onto_yuv422p9or10le(target_params, other_yuv_params, get_alpha_64be);
+		} else {
+			alpha_blend_onto_yuv422p9or10le(target_params, other_yuv_params, get_alpha_byte);
+		}
+		break;
+	}
+	case AV_PIX_FMT_YUV444P9LE:
+	case AV_PIX_FMT_YUV444P10LE:
+	{
+		auto yuv = other->convert_pixel_format (dcp::YUVToRGB::REC709, _pixel_format, Alignment::COMPACT, false);
+		other_yuv_params.data = yuv->data();
+		other_yuv_params.stride = yuv->stride();
+		other_yuv_params.alpha_data = other->data();
+		other_yuv_params.alpha_stride = other->stride();
+		if (other->pixel_format() == AV_PIX_FMT_RGBA64BE) {
+			alpha_blend_onto_yuv444p9or10le(target_params, other_yuv_params, get_alpha_64be);
+		} else {
+			alpha_blend_onto_yuv444p9or10le(target_params, other_yuv_params, get_alpha_byte);
 		}
 		break;
 	}
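
Editor's note: the alpha_blend_onto_* helpers in this patch all apply the same un-premultiplied "over" blend per component, with the alpha divisor derived from the other image's bytes per pixel (255 for 4-byte RGBA/BGRA, 65535 for 8-byte RGBA64BE) and a value divisor or scale to match the other image's sample depth to the target's. The following standalone sketch, which is an illustration only and not part of the commit (blend_sample is a hypothetical name), shows that shared arithmetic:

// Illustrative sketch only -- not from the patch above.  Normalise alpha to 0..1,
// bring the source sample down to the target's 8-bit depth, then compute
// src * a + dst * (1 - a).
#include <cstdint>
#include <cmath>

// src_bytes_per_sample is 1 for 8-bit RGBA-style input and 2 for RGBA64BE-style input.
inline uint8_t blend_sample(uint16_t src, uint16_t src_alpha, uint8_t dst, int src_bytes_per_sample)
{
	float const alpha_divisor = std::pow(2, src_bytes_per_sample * 8) - 1;  // 255 or 65535
	int const value_divisor = src_bytes_per_sample == 2 ? 256 : 1;          // 16-bit sample -> 8-bit
	float const a = src_alpha / alpha_divisor;
	return static_cast<uint8_t>((src / value_divisor) * a + dst * (1 - a));
}

For example, blend_sample(0xffff, 0x8000, 0, 2) blends a full-intensity 16-bit sample at roughly 50% alpha onto a black 8-bit target and yields about 127.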