wip: CUDA with nvjpeg2k

author: Carl Hetherington <cth@carlh.net> 2025-09-19 15:04:25 +0200
committer: Carl Hetherington <cth@carlh.net> 2025-09-30 00:19:53 +0200
commit: 24fc7b5d990044c7e9f2c95ea82ce12f024b1bfc (patch)
tree: c9eb168fa682c0487e91d95b2ddc77147a87ff4e
parent: 19a0537345d9c39962f70420299a07293fe6a975 (diff)
7 files changed, 320 insertions, 2 deletions
diff --git a/src/lib/cuda_j2k_encoder_thread.cc b/src/lib/cuda_j2k_encoder_thread.cc
new file mode 100644
index 000000000..466b78fc1
--- /dev/null
+++ b/src/lib/cuda_j2k_encoder_thread.cc
@@ -0,0 +1,207 @@
+/*
+    Copyright (C) 2025 Carl Hetherington <cth@carlh.net>
+
+    This file is part of DCP-o-matic.
+
+    DCP-o-matic is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DCP-o-matic is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+#include "colour_conversion.h"
+#include "cuda_j2k_encoder_thread.h"
+#include "dcp_video.h"
+#include "exceptions.h"
+#include "resolution.h"
+#include <nvjpeg2k.h>
+
+
+using std::shared_ptr;
+
+
+CUDAJ2KEncoderThread::CUDAJ2KEncoderThread(J2KEncoder& encoder)
+	: J2KSyncEncoderThread(encoder)
+{
+	nvjpeg2kEncoderCreateSimple(&_encoder_handle);
+	nvjpeg2kEncodeStateCreate(_encoder_handle, &_encode_state);
+	nvjpeg2kEncodeParamsCreate(&_encode_params);
+
+	nvjpeg2kEncodeParamsSetInputFormat(_encode_params, NVJPEG2K_FORMAT_PLANAR);
+
+	cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking);
+}
+
+
+shared_ptr<dcp::ArrayData>
+CUDAJ2KEncoderThread::encode(DCPVideo const& frame)
+{
+	auto xyz = convert_to_xyz(frame.frame());
+	auto const size = xyz->size();
+
+	/* {x,y,z}lp is 4-byte int representation */
+	int* xlp = xyz->data(0);
+	int* ylp = xyz->data(1);
+	int* zlp = xyz->data(2);
+
+	uint8_t* pixel_data_h[3];
+
+	/* {x,y,z}p is 2-byte int representation */
+	_xyz_x.resize(size.width * size.height);
+	int16_t* xp = _xyz_x.data();
+	pixel_data_h[0] = reinterpret_cast<uint8_t*>(xp);
+
+	_xyz_y.resize(size.width * size.height);
+	int16_t* yp = _xyz_y.data();
+	pixel_data_h[1] = reinterpret_cast<uint8_t*>(yp);
+
+	_xyz_z.resize(size.width * size.height);
+	int16_t* zp = _xyz_z.data();
+	pixel_data_h[2] = reinterpret_cast<uint8_t*>(zp);
+
+	for (int j = 0; j < size.width * size.height; ++j) {
+		*xp++ = static_cast<int16_t>(*xlp++);
+		*yp++ = static_cast<int16_t>(*ylp++);
+		*zp++ = static_cast<int16_t>(*zlp++);
+	}
+
+	allocate_pixel_data_d(size);
+
+	/* Copy to device */
+	size_t pitch_in_bytes_h[3];
+	for (int i = 0; i < 3; ++i) {
+		pitch_in_bytes_h[i] = size.width * 2;
+		auto const status = cudaMemcpy2D(
+			_pixel_data_d[i],
+			_pitch_in_bytes_d[i],
+			pixel_data_h[i],
+			pitch_in_bytes_h[i],
+			size.width * 2,
+			size.height,
+			cudaMemcpyHostToDevice
+			);
+
+		if (status != cudaSuccess) {
+			throw CUDAError("cudaMemcpy2D", status);
+		}
+	}
+
+	nvjpeg2kImage_t device_image;
+	device_image.num_components = 3;
+	device_image.pixel_data = reinterpret_cast<void**>(_pixel_data_d);
+	device_image.pixel_type = NVJPEG2K_INT16;
+	device_image.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes_d);
+
+	nvjpeg2kImageComponentInfo_t info[3];
+	for (int i = 0; i < 3; ++i) {
+		info[i].component_width = size.width;
+		info[i].component_height = size.height;
+		info[i].precision = 12;
+		info[i].sgn = 0;
+	}
+
+	nvjpeg2kEncodeConfig_t config;
+	memset(&config, 0, sizeof(config));
+	config.stream_type = NVJPEG2K_STREAM_J2K;
+	config.color_space = NVJPEG2K_COLORSPACE_SRGB;
+	config.image_width = size.width;
+	config.image_height = size.height;
+	config.num_components = 3;
+	config.image_comp_info = reinterpret_cast<nvjpeg2kImageComponentInfo_t*>(&info);
+	config.code_block_w = 32;
+	config.code_block_h = 32;
+	config.irreversible = 0;
+	config.mct_mode = 1;
+	config.prog_order = NVJPEG2K_CPRL;
+	config.num_resolutions = frame.resolution() == Resolution::FOUR_K ? 7 : 6;
+	config.num_precincts_init = 7;
+	config.precinct_width[0] = 7;
+	config.precinct_height[0] = 7;
+	for (int i = 1; i < 6; ++i) {
+		config.precinct_width[i] = 8;
+		config.precinct_height[i] = 8;
+	}
+
+	auto status = nvjpeg2kEncodeParamsSetEncodeConfig(_encode_params, &config);
+	if (status != NVJPEG2K_STATUS_SUCCESS) {
+		throw CUDAError("nvjpeg2kEncodeParamsSetEncodeConfig", status);
+	}
+
+	// XXX: quality
+	status = nvjpeg2kEncodeParamsSetQuality(_encode_params, 50);
+	if (status != NVJPEG2K_STATUS_SUCCESS) {
+		throw CUDAError("nvjpeg2kEncodeParamsSetQuality", status);
+	}
+
+	status = nvjpeg2kEncode(_encoder_handle, _encode_state, _encode_params, &device_image, _stream);
+	if (status != NVJPEG2K_STATUS_SUCCESS) {
+		throw CUDAError("nvjpeg2kEncode", status);
+	}
+
+	size_t compressed_size;
+	status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encode_state, nullptr, &compressed_size, _stream);
+
+	auto output = std::make_shared<dcp::ArrayData>(compressed_size);
+	status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encode_state, output->data(), &compressed_size, _stream);
+	if (status != NVJPEG2K_STATUS_SUCCESS) {
+		throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status);
+	}
+
+	return output;
+}
+
+
+void
+CUDAJ2KEncoderThread::allocate_pixel_data_d(dcp::Size size)
+{
+	if (size == _pixel_data_d_size) {
+		return;
+	}
+
+	free_pixel_data_d();
+
+	for (int i = 0; i < 3; ++i) {
+		auto status = cudaMallocPitch(
+			reinterpret_cast<void**>(&_pixel_data_d[i]),
+			&_pitch_in_bytes_d[i],
+			size.width * 2,
+			size.height
+			);
+
+		if (status != cudaSuccess) {
+			throw CUDAError("cudaMallocPitch", status);
+		}
+	}
+
+	_pixel_data_d_size = size;
+}
+
+
+void
+CUDAJ2KEncoderThread::free_pixel_data_d()
+{
+	for (int i = 0; i < 3; ++i) {
+		cudaFree(_pixel_data_d[i]);
+	}
+}
+
+
+CUDAJ2KEncoderThread::~CUDAJ2KEncoderThread()
+{
+	free_pixel_data_d();
+
+	nvjpeg2kEncodeParamsDestroy(_encode_params);
+	nvjpeg2kEncodeStateDestroy(_encode_state);
+	nvjpeg2kEncoderDestroy(_encoder_handle);
+}
+
diff --git a/src/lib/cuda_j2k_encoder_thread.h b/src/lib/cuda_j2k_encoder_thread.h
new file mode 100644
index 000000000..245d34328
--- /dev/null
+++ b/src/lib/cuda_j2k_encoder_thread.h
@@ -0,0 +1,72 @@
+/*
+    Copyright (C) 2025 Carl Hetherington <cth@carlh.net>
+
+    This file is part of DCP-o-matic.
+
+    DCP-o-matic is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DCP-o-matic is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+#ifndef DCPOMATIC_CUDA_J2K_ENCODER_THREAD
+#define DCPOMATIC_CUDA_J2K_ENCODER_THREAD
+
+
+#include "j2k_sync_encoder_thread.h"
+#include <dcp/array_data.h>
+#include <dcp/types.h>
+#include <nvjpeg2k.h>
+
+
+class DCPVideo;
+
+
+/** @class CUDAJ2KEncoderThread
+ *  @brief J2K encoder thread using CUDA via nvidia's nvjpeg2k library.
+ *
+ *  Each CUDAJ2KEncoderThread sends frames for encoding to a different CUDA
+ *  stream.  The thread sends the image and blocks until the result is ready.
+ *
+ *  This is different to the Grok encoder (that also uses CUDA but via an
+ *  additional paid-for tool called Grok).
+ */
+class CUDAJ2KEncoderThread : public J2KSyncEncoderThread
+{
+public:
+	CUDAJ2KEncoderThread(J2KEncoder& encoder);
+	~CUDAJ2KEncoderThread();
+
+	std::shared_ptr<dcp::ArrayData> encode(DCPVideo const& frame) override;
+
+private:
+	void allocate_pixel_data_d(dcp::Size size);
+	void free_pixel_data_d();
+
+	nvjpeg2kEncoder_t _encoder_handle;
+	nvjpeg2kEncodeState_t _encode_state;
+	nvjpeg2kEncodeParams_t _encode_params;
+	cudaStream_t _stream;
+
+	uint8_t* _pixel_data_d[3] = { nullptr, nullptr, nullptr };
+	size_t _pitch_in_bytes_d[3];
+	dcp::Size _pixel_data_d_size;
+
+	std::vector<int16_t> _xyz_x;
+	std::vector<int16_t> _xyz_y;
+	std::vector<int16_t> _xyz_z;
+};
+
+
+#endif
+
diff --git a/src/lib/exceptions.cc b/src/lib/exceptions.cc
index e2e7fc4bc..4eca9f67e 100644
--- a/src/lib/exceptions.cc
+++ b/src/lib/exceptions.cc
@@ -190,3 +190,20 @@ SQLError::get_filename(SQLiteDatabase& db)
 
 	return {};
 }
+
+
+#ifdef DCPOMATIC_HAVE_NVJPEG2K
+CUDAError::CUDAError(string function, cudaError error)
+	: runtime_error(fmt::format("CUDA error on {} ({})", function, static_cast<int>(error)))
+{
+
+}
+
+
+CUDAError::CUDAError(string function, int error)
+	: runtime_error(fmt::format("CUDA error on {} ({})", function, error))
+{
+
+}
+#endif
+
diff --git a/src/lib/exceptions.h b/src/lib/exceptions.h
index 212d002e2..5dd68408d 100644
--- a/src/lib/exceptions.h
+++ b/src/lib/exceptions.h
@@ -31,6 +31,9 @@
 extern "C" {
 #include <libavutil/pixfmt.h>
 }
+#ifdef DCPOMATIC_HAVE_NVJPEG2K
+#include <cuda_runtime_api.h>
+#endif
 #include <fmt/format.h>
 #include <boost/filesystem.hpp>
 #include <boost/optional.hpp>
@@ -547,4 +550,13 @@ public:
 };
 
 
+#ifdef DCPOMATIC_HAVE_NVJPEG2K
+class CUDAError : public std::runtime_error
+{
+public:
+	CUDAError(std::string function, cudaError_t error);
+	CUDAError(std::string function, int);
+};
+#endif
+
 #endif
diff --git a/src/lib/wscript b/src/lib/wscript
index 2e7b0339c..621898037 100644
--- a/src/lib/wscript
+++ b/src/lib/wscript
@@ -255,7 +255,7 @@ def build(bld):
                  BOOST_FILESYSTEM BOOST_THREAD BOOST_DATETIME BOOST_SIGNALS2 BOOST_REGEX
                  SAMPLERATE POSTPROC TIFF SSH DCP CXML GLIB LZMA XML++
                  CURL ZIP BZ2 FONTCONFIG PANGOMM CAIROMM XMLSEC SUB ICU NETTLE PNG JPEG LEQM_NRT
-                 LIBZ SQLITE3
+                 LIBZ SQLITE3 CUDA
                  """
 
     if bld.env.TARGET_OSX:
@@ -269,6 +269,9 @@ def build(bld):
         if bld.env.TARGET_LINUX:
             obj.uselib += ' POLKIT'
 
+    if bld.env.HAVE_NVJPEG2K_H:
+        obj.source += ' cuda_j2k_encoder_thread.cc'
+
     if bld.env.ENABLE_GROK:
         obj.source += ' grok_j2k_encoder_thread.cc grok/util.cc'
 
diff --git a/src/tools/wscript b/src/tools/wscript
index 3128486f5..cbe397f48 100644
--- a/src/tools/wscript
+++ b/src/tools/wscript
@@ -98,7 +98,7 @@ def configure(conf):
 def build(bld):
     uselib =  'BOOST_THREAD BOOST_DATETIME DCP XMLSEC CXML XMLPP AVFORMAT AVFILTER AVCODEC '
     uselib += 'AVUTIL SWSCALE SWRESAMPLE POSTPROC CURL BOOST_FILESYSTEM SSH ZIP CAIROMM FONTCONFIG PANGOMM SUB '
-    uselib += 'SNDFILE SAMPLERATE BOOST_REGEX ICU NETTLE RTAUDIO PNG JPEG LEQM_NRT SQLITE3 BOOST_PROCESS '
+    uselib += 'SNDFILE SAMPLERATE BOOST_REGEX ICU NETTLE RTAUDIO PNG JPEG LEQM_NRT SQLITE3 BOOST_PROCESS CUDA '
 
     if bld.env.ENABLE_DISK:
         if bld.env.TARGET_LINUX:
diff --git a/wscript b/wscript
index b9ba179bf..a44f25dca 100644
--- a/wscript
+++ b/wscript
@@ -81,6 +81,7 @@ def options(opt):
     opt.add_option('--disable-more-warnings', action='store_true', default=False, help='disable some warnings raised by Xcode 15 with the 2.16 branch')
     opt.add_option('--c++17', action='store_true', default=False, help='build with C++17 and libxml++-4.0')
     opt.add_option('--variant', help="build with variant")
+    opt.add_option('--cuda-path',         help='path to directory containing include/cuda_runtime_api.h etc.', default='/usr/local/cuda')
 
 def configure(conf):
     conf.load('compiler_cxx')
@@ -193,6 +194,9 @@ def configure(conf):
         except conf.errors.ConfigurationError:
             pass
 
+    conf.env.append_value('INCLUDES', os.path.join(conf.options.cuda_path, 'include'))
+    conf.env.append_value('LIBPATH', os.path.join(conf.options.cuda_path, 'targets', 'x86_64-linux', 'lib'))
+
     #
     # Windows/Linux/macOS specific
     #
@@ -680,6 +684,9 @@ def configure(conf):
                    define_name="DCPOMATIC_HAVE_SQLITE3_PREPARE_V3",
                    mandatory=False)
 
+    if conf.check(header_name="nvjpeg2k.h", mandatory=False, define_name="DCPOMATIC_HAVE_NVJPEG2K"):
+        conf.env.LIB_CUDA = ['cudart', 'nvjpeg2k']
+
 
     # Other stuff
author	Carl Hetherington <cth@carlh.net>	2025-09-19 15:04:25 +0200
committer	Carl Hetherington <cth@carlh.net>	2025-09-30 00:19:53 +0200
commit	24fc7b5d990044c7e9f2c95ea82ce12f024b1bfc (patch)
tree	c9eb168fa682c0487e91d95b2ddc77147a87ff4e
parent	19a0537345d9c39962f70420299a07293fe6a975 (diff)