From 7ace2ca4cd301b4c69671a6983804c6a13ec72a0 Mon Sep 17 00:00:00 2001
From: Carl Hetherington <cth@carlh.net>
Date: Mon, 23 May 2022 00:41:35 +0200
Subject: fixup! wip: Add CUDA J2K frame encoder using libjpeg2k.

---
 src/lib/cuda_j2k_frame_encoder.cc | 179 ++++++++++++++++++++++----------------
 src/lib/cuda_j2k_frame_encoder.h  |  36 +++++---
 2 files changed, 128 insertions(+), 87 deletions(-)
diff --git a/src/lib/cuda_j2k_frame_encoder.cc b/src/lib/cuda_j2k_frame_encoder.cc
index b382e8d3f..675b26f57 100644
--- a/src/lib/cuda_j2k_frame_encoder.cc
+++ b/src/lib/cuda_j2k_frame_encoder.cc
@@ -27,8 +27,10 @@
 #include "player_video.h"
 #include <dcp/openjpeg_image.h>
 #include <nvjpeg2k.h>
+#include <vector>
 
 
+using std::vector;
 using boost::optional;
 
 
@@ -37,116 +39,131 @@ CUDAJ2KFrameEncoder::CUDAJ2KFrameEncoder()
 	nvjpeg2kEncoderCreateSimple(&_encoder_handle);
 	nvjpeg2kEncodeStateCreate(_encoder_handle, &_encoder_state);
 	nvjpeg2kEncodeParamsCreate(&_encoder_params);
-
-	_pixel_data_d[0] = _pixel_data_d[1] = _pixel_data_d[2] = nullptr;
 }
 
 
-void
-CUDAJ2KFrameEncoder::allocate(dcp::Size size)
+CUDAJ2KFrameEncoder::Frame::Frame(DCPVideo const& vf)
 {
-	for (int i = 0; i < 3; ++i) {
-		_image_component_info[i].component_width = size.width;
-		_image_component_info[i].component_height = size.height;
-		_image_component_info[i].precision = 12;
-		_image_component_info[i].sgn = 0;
-		_pitch_in_bytes[i] = size.width * 2;
-	}
+	auto xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2));
+
+	void* pixel_data_h[] = {
+		xyz->data(0),
+		xyz->data(1),
+		xyz->data(2)
+	};
+
+	auto const pitch = xyz->size().width * 2;
 
 	for (int i = 0; i < 3; ++i) {
-		auto error = cudaMallocPitch(
+		_pitch_in_bytes[i] = pitch;
+		auto status = cudaMallocPitch(
 			reinterpret_cast<void**>(&_pixel_data_d[i]),
 			&_pitch_in_bytes[i],
-			_image_component_info[i].component_width * 2,
-			_image_component_info[i].component_height
+			pitch,
+			xyz->size().height
 			);
-		if (error != cudaSuccess) {
-			throw CUDAError("cudaMallocPitch", error);
+
+		if (status != cudaSuccess) {
+			throw CUDAError("cudaMallocPitch", status);
 		}
-	}
 
-	_allocated_size = size;
-}
+		status = cudaMemcpy2D(
+			_pixel_data_d[i],
+			_pitch_in_bytes[i],
+			pixel_data_h[i],
+			_pitch_in_bytes[i],
+			pitch,
+			xyz->size().height,
+			cudaMemcpyHostToDevice
+			);
 
+		if (status != cudaSuccess) {
+			throw CUDAError("cudaMemcpy2D", status);
+		}
 
-CUDAJ2KFrameEncoder::~CUDAJ2KFrameEncoder()
-{
-	free();
+		cudaDeviceSynchronize();
+	}
+
+	_device_image.num_components = 3;
+	_device_image.pixel_data = reinterpret_cast<void**>(_pixel_data_d);
+	_device_image.pixel_type = NVJPEG2K_UINT16;
+	_device_image.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
 }
 
 
-void
-CUDAJ2KFrameEncoder::free()
+CUDAJ2KFrameEncoder::Frame::Frame(Frame&& other)
 {
 	for (int i = 0; i < 3; ++i) {
-		cudaFree(&_pixel_data_d[i]);
+		_pixel_data_d[i] = other._pixel_data_d[i];
+		other._pixel_data_d[i] = nullptr;
+		_pitch_in_bytes[i] = other._pitch_in_bytes[i];
 	}
 
-	_pixel_data_d[0] = _pixel_data_d[1] = _pixel_data_d[2] = nullptr;
+	_device_image.num_components = other._device_image.num_components;
+	_device_image.pixel_data = reinterpret_cast<void**>(_pixel_data_d);
+	_device_image.pixel_type = NVJPEG2K_UINT16;
+	_device_image.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
 }
 
 
-optional<dcp::ArrayData>
-CUDAJ2KFrameEncoder::encode(DCPVideo const& vf)
+CUDAJ2KFrameEncoder::Frame::~Frame()
 {
-	std::cout << "cuda encode starts.\n";
+	cudaFree(_pixel_data_d[0]);
+	cudaFree(_pixel_data_d[1]);
+	cudaFree(_pixel_data_d[2]);
+}
 
-	auto frame = vf.frame();
-	auto size = frame->out_size();
 
-	if (size != _allocated_size) {
-		free();
-		allocate(size);
-	}
+vector<dcp::ArrayData>
+CUDAJ2KFrameEncoder::encode(DCPVideo const& vf)
+{
+	int constexpr BATCH_SIZE = 128;
 
-	auto xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2));
+	auto const size = vf.frame()->out_size();
+	DCPOMATIC_ASSERT(!_size || size == *_size);
+	_size = size;
 
-	for (int i = 0; i < 3; ++i) {
-		_pixel_data_h[i] = reinterpret_cast<uint8_t*>(xyz->data(i));
+	DCPOMATIC_ASSERT(!_resolution || vf.resolution() == *_resolution);
+	_resolution = vf.resolution();
+
+	if (_batch.size() < BATCH_SIZE) {
+		_batch.push_back(Frame(vf));
+		return {};
 	}
 
-	nvjpeg2kImage_t image_h;
-	image_h.num_components = 3;
-	image_h.pixel_data = reinterpret_cast<void**>(_pixel_data_h);
-	image_h.pixel_type = NVJPEG2K_UINT16;
-	image_h.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
+	return flush();
+}
+
 
-	nvjpeg2kImage_t image_d;
-	image_d.num_components = 3;
-	image_d.pixel_data = reinterpret_cast<void**>(_pixel_data_d);
-	image_d.pixel_type = NVJPEG2K_UINT16;
-	image_d.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
+vector<dcp::ArrayData>
+CUDAJ2KFrameEncoder::flush()
+{
+	if (_batch.empty()) {
+		return {};
+	}
 
+	nvjpeg2kImageComponentInfo_t info[3];
 	for (int i = 0; i < 3; ++i) {
-		auto status = cudaMemcpy2D(
-			image_d.pixel_data[i],
-			image_d.pitch_in_bytes[i],
-			image_h.pixel_data[i],
-			image_h.pitch_in_bytes[i],
-			2 * _image_component_info[i].component_width,
-			_image_component_info[i].component_height,
-			cudaMemcpyHostToDevice
-			);
-		if (status != cudaSuccess) {
-			throw CUDAError("cudaMemcpy2D", status);
-		}
-		cudaDeviceSynchronize();
+		info[i].component_width = _size->width;
+		info[i].component_height = _size->height;
+		info[i].precision = 12;
+		info[i].sgn = 0;
 	}
 
 	nvjpeg2kEncodeConfig_t config;
 	memset(&config, 0, sizeof(config));
 	config.stream_type = NVJPEG2K_STREAM_J2K;
 	config.color_space = NVJPEG2K_COLORSPACE_SRGB;
-	config.image_width = size.width;
-	config.image_height = size.height;
+	config.image_width = _size->width;
+	config.image_height = _size->height;
 	config.num_components = 3;
-	config.image_comp_info = reinterpret_cast<nvjpeg2kImageComponentInfo_t*>(&_image_component_info);
+	config.image_comp_info = reinterpret_cast<nvjpeg2kImageComponentInfo_t*>(&info);
 	config.code_block_w = 32;
 	config.code_block_h = 32;
 	config.irreversible = 0;
 	config.mct_mode = 1;
 	config.prog_order = NVJPEG2K_CPRL;
-	config.num_resolutions = vf.resolution() == Resolution::FOUR_K ? 7 : 6;
+	config.num_resolutions = *_resolution == Resolution::FOUR_K ? 7 : 6;
 
 	auto status = nvjpeg2kEncodeParamsSetEncodeConfig(_encoder_params, &config);
 	if (status != NVJPEG2K_STATUS_SUCCESS) {
@@ -159,23 +176,31 @@ CUDAJ2KFrameEncoder::encode(DCPVideo const& vf)
 		throw CUDAError("nvjpeg2kEncodeParamsSetQuality", status);
 	}
 
-	status = nvjpeg2kEncode(_encoder_handle, _encoder_state, _encoder_params, &image_d, 0);
-	if (status != NVJPEG2K_STATUS_SUCCESS) {
-		throw CUDAError("nvjpeg2kEncode", status);
-	}
+	vector<dcp::ArrayData> output;
 
-	size_t compressed_size;
-	status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, nullptr, &compressed_size, 0);
+	for (auto const& frame: _batch) {
 
-	dcp::ArrayData output(compressed_size);
-	status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, output.data(), &compressed_size, 0);
-	cudaDeviceSynchronize();
+		auto x = frame.device_image();
 
-	if (status != NVJPEG2K_STATUS_SUCCESS) {
-		throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status);
+		status = nvjpeg2kEncode(_encoder_handle, _encoder_state, _encoder_params, frame.device_image(), 0);
+		if (status != NVJPEG2K_STATUS_SUCCESS) {
+			throw CUDAError("nvjpeg2kEncode", status);
+		}
+
+		size_t compressed_size;
+		status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, nullptr, &compressed_size, 0);
+
+		dcp::ArrayData this_output(compressed_size);
+		status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, this_output.data(), &compressed_size, 0);
+		if (status != NVJPEG2K_STATUS_SUCCESS) {
+			throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status);
+		}
+
+		output.push_back(this_output);
+		cudaStreamSynchronize(0);
 	}
 
-	std::cout << "cuda encode ends with " << status << " " << output.size() << "\n";
+	_batch.clear();
 
 	return output;
 }
diff --git a/src/lib/cuda_j2k_frame_encoder.h b/src/lib/cuda_j2k_frame_encoder.h
index 46ac64607..28e8a3c76 100644
--- a/src/lib/cuda_j2k_frame_encoder.h
+++ b/src/lib/cuda_j2k_frame_encoder.h
@@ -24,6 +24,7 @@
 
 
 #include "j2k_frame_encoder.h"
+#include "types.h"
 #include <dcp/types.h>
 #include <nvjpeg2k.h>
 #include <vector>
@@ -33,24 +34,39 @@ class CUDAJ2KFrameEncoder : public J2KFrameEncoder
 {
 public:
 	CUDAJ2KFrameEncoder();
-	~CUDAJ2KFrameEncoder();
 
-	boost::optional<dcp::ArrayData> encode(DCPVideo const &) override;
+	std::vector<dcp::ArrayData> encode(DCPVideo const &) override;
+	std::vector<dcp::ArrayData> flush() override;
 	void log_thread_start() override;
 
 private:
-	void allocate(dcp::Size size);
-	void free();
-
 	nvjpeg2kEncoder_t _encoder_handle;
 	nvjpeg2kEncodeState_t _encoder_state;
 	nvjpeg2kEncodeParams_t _encoder_params;
-	uint8_t* _pixel_data_h[3];
-	uint8_t* _pixel_data_d[3];
-	size_t _pitch_in_bytes[3];
-	nvjpeg2kImageComponentInfo_t _image_component_info[3];
 
-	dcp::Size _allocated_size;
+	class Frame
+	{
+	public:
+		Frame(DCPVideo const& vf);
+		Frame(Frame const& other) = delete;
+		Frame(Frame&& other);
+		~Frame();
+
+		Frame& operator=(Frame const& other) = delete;
+
+		nvjpeg2kImage_t const* const device_image() const {
+			return &_device_image;
+		}
+
+	private:
+		uint8_t* _pixel_data_d[3];
+		size_t _pitch_in_bytes[3];
+		nvjpeg2kImage_t _device_image;
+	};
+
+	std::vector<Frame> _batch;
+	boost::optional<dcp::Size> _size;
+	boost::optional<Resolution> _resolution;
 };
 
 
-- 
cgit v1.2.3