diff options
| author | Carl Hetherington <cth@carlh.net> | 2025-09-19 15:04:25 +0200 |
|---|---|---|
| committer | Carl Hetherington <cth@carlh.net> | 2025-09-30 00:19:53 +0200 |
| commit | 24fc7b5d990044c7e9f2c95ea82ce12f024b1bfc (patch) | |
| tree | c9eb168fa682c0487e91d95b2ddc77147a87ff4e | |
| parent | 19a0537345d9c39962f70420299a07293fe6a975 (diff) | |
wip: CUDA with nvjpeg2k
| -rw-r--r-- | src/lib/cuda_j2k_encoder_thread.cc | 207 | ||||
| -rw-r--r-- | src/lib/cuda_j2k_encoder_thread.h | 72 | ||||
| -rw-r--r-- | src/lib/exceptions.cc | 17 | ||||
| -rw-r--r-- | src/lib/exceptions.h | 12 | ||||
| -rw-r--r-- | src/lib/wscript | 5 | ||||
| -rw-r--r-- | src/tools/wscript | 2 | ||||
| -rw-r--r-- | wscript | 7 |
7 files changed, 320 insertions, 2 deletions
diff --git a/src/lib/cuda_j2k_encoder_thread.cc b/src/lib/cuda_j2k_encoder_thread.cc new file mode 100644 index 000000000..466b78fc1 --- /dev/null +++ b/src/lib/cuda_j2k_encoder_thread.cc @@ -0,0 +1,207 @@ +/* + Copyright (C) 2025 Carl Hetherington <cth@carlh.net> + + This file is part of DCP-o-matic. + + DCP-o-matic is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + DCP-o-matic is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with DCP-o-matic. If not, see <http://www.gnu.org/licenses/>. + +*/ + + +#include "colour_conversion.h" +#include "cuda_j2k_encoder_thread.h" +#include "dcp_video.h" +#include "exceptions.h" +#include "resolution.h" +#include <nvjpeg2k.h> + + +using std::shared_ptr; + + +CUDAJ2KEncoderThread::CUDAJ2KEncoderThread(J2KEncoder& encoder) + : J2KSyncEncoderThread(encoder) +{ + nvjpeg2kEncoderCreateSimple(&_encoder_handle); + nvjpeg2kEncodeStateCreate(_encoder_handle, &_encode_state); + nvjpeg2kEncodeParamsCreate(&_encode_params); + + nvjpeg2kEncodeParamsSetInputFormat(_encode_params, NVJPEG2K_FORMAT_PLANAR); + + cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking); +} + + +shared_ptr<dcp::ArrayData> +CUDAJ2KEncoderThread::encode(DCPVideo const& frame) +{ + auto xyz = convert_to_xyz(frame.frame()); + auto const size = xyz->size(); + + /* {x,y,z}lp is 4-byte int representation */ + int* xlp = xyz->data(0); + int* ylp = xyz->data(1); + int* zlp = xyz->data(2); + + uint8_t* pixel_data_h[3]; + + /* {x,y,z}p is 2-byte int representation */ + _xyz_x.resize(size.width * size.height); + int16_t* xp = _xyz_x.data(); + pixel_data_h[0] = reinterpret_cast<uint8_t*>(xp); + + _xyz_y.resize(size.width * size.height); + int16_t* yp = _xyz_y.data(); + pixel_data_h[1] = reinterpret_cast<uint8_t*>(yp); + + _xyz_z.resize(size.width * size.height); + int16_t* zp = _xyz_z.data(); + pixel_data_h[2] = reinterpret_cast<uint8_t*>(zp); + + for (int j = 0; j < size.width * size.height; ++j) { + *xp++ = static_cast<int16_t>(*xlp++); + *yp++ = static_cast<int16_t>(*ylp++); + *zp++ = static_cast<int16_t>(*zlp++); + } + + allocate_pixel_data_d(size); + + /* Copy to device */ + size_t pitch_in_bytes_h[3]; + for (int i = 0; i < 3; ++i) { + pitch_in_bytes_h[i] = size.width * 2; + auto const status = cudaMemcpy2D( + _pixel_data_d[i], + _pitch_in_bytes_d[i], + pixel_data_h[i], + pitch_in_bytes_h[i], + size.width * 2, + size.height, + cudaMemcpyHostToDevice + ); + + if (status != cudaSuccess) { + throw CUDAError("cudaMemcpy2D", status); + } + } + + nvjpeg2kImage_t device_image; + device_image.num_components = 3; + device_image.pixel_data = reinterpret_cast<void**>(_pixel_data_d); + device_image.pixel_type = NVJPEG2K_INT16; + device_image.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes_d); + + nvjpeg2kImageComponentInfo_t info[3]; + for (int i = 0; i < 3; ++i) { + info[i].component_width = size.width; + info[i].component_height = size.height; + info[i].precision = 12; + info[i].sgn = 0; + } + + nvjpeg2kEncodeConfig_t config; + memset(&config, 0, sizeof(config)); + config.stream_type = NVJPEG2K_STREAM_J2K; + config.color_space = NVJPEG2K_COLORSPACE_SRGB; + config.image_width = size.width; + config.image_height = size.height; + config.num_components = 3; + config.image_comp_info = reinterpret_cast<nvjpeg2kImageComponentInfo_t*>(&info); + config.code_block_w = 32; + config.code_block_h = 32; + config.irreversible = 0; + config.mct_mode = 1; + config.prog_order = NVJPEG2K_CPRL; + config.num_resolutions = frame.resolution() == Resolution::FOUR_K ? 7 : 6; + config.num_precincts_init = 7; + config.precinct_width[0] = 7; + config.precinct_height[0] = 7; + for (int i = 1; i < 6; ++i) { + config.precinct_width[i] = 8; + config.precinct_height[i] = 8; + } + + auto status = nvjpeg2kEncodeParamsSetEncodeConfig(_encode_params, &config); + if (status != NVJPEG2K_STATUS_SUCCESS) { + throw CUDAError("nvjpeg2kEncodeParamsSetEncodeConfig", status); + } + + // XXX: quality + status = nvjpeg2kEncodeParamsSetQuality(_encode_params, 50); + if (status != NVJPEG2K_STATUS_SUCCESS) { + throw CUDAError("nvjpeg2kEncodeParamsSetQuality", status); + } + + status = nvjpeg2kEncode(_encoder_handle, _encode_state, _encode_params, &device_image, _stream); + if (status != NVJPEG2K_STATUS_SUCCESS) { + throw CUDAError("nvjpeg2kEncode", status); + } + + size_t compressed_size; + status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encode_state, nullptr, &compressed_size, _stream); + + auto output = std::make_shared<dcp::ArrayData>(compressed_size); + status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encode_state, output->data(), &compressed_size, _stream); + if (status != NVJPEG2K_STATUS_SUCCESS) { + throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status); + } + + return output; +} + + +void +CUDAJ2KEncoderThread::allocate_pixel_data_d(dcp::Size size) +{ + if (size == _pixel_data_d_size) { + return; + } + + free_pixel_data_d(); + + for (int i = 0; i < 3; ++i) { + auto status = cudaMallocPitch( + reinterpret_cast<void**>(&_pixel_data_d[i]), + &_pitch_in_bytes_d[i], + size.width * 2, + size.height + ); + + if (status != cudaSuccess) { + throw CUDAError("cudaMallocPitch", status); + } + } + + _pixel_data_d_size = size; +} + + +void +CUDAJ2KEncoderThread::free_pixel_data_d() +{ + for (int i = 0; i < 3; ++i) { + cudaFree(_pixel_data_d[i]); + } +} + + +CUDAJ2KEncoderThread::~CUDAJ2KEncoderThread() +{ + free_pixel_data_d(); + + nvjpeg2kEncodeParamsDestroy(_encode_params); + nvjpeg2kEncodeStateDestroy(_encode_state); + nvjpeg2kEncoderDestroy(_encoder_handle); +} + diff --git a/src/lib/cuda_j2k_encoder_thread.h b/src/lib/cuda_j2k_encoder_thread.h new file mode 100644 index 000000000..245d34328 --- /dev/null +++ b/src/lib/cuda_j2k_encoder_thread.h @@ -0,0 +1,72 @@ +/* + Copyright (C) 2025 Carl Hetherington <cth@carlh.net> + + This file is part of DCP-o-matic. + + DCP-o-matic is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + DCP-o-matic is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with DCP-o-matic. If not, see <http://www.gnu.org/licenses/>. + +*/ + + +#ifndef DCPOMATIC_CUDA_J2K_ENCODER_THREAD +#define DCPOMATIC_CUDA_J2K_ENCODER_THREAD + + +#include "j2k_sync_encoder_thread.h" +#include <dcp/array_data.h> +#include <dcp/types.h> +#include <nvjpeg2k.h> + + +class DCPVideo; + + +/** @class CUDAJ2KEncoderThread + * @brief J2K encoder thread using CUDA via nvidia's nvjpeg2k library. + * + * Each CUDAJ2KEncoderThread sends frames for encoding to a different CUDA + * stream. The thread sends the image and blocks until the result is ready. + * + * This is different to the Grok encoder (that also uses CUDA but via an + * additional paid-for tool called Grok). + */ +class CUDAJ2KEncoderThread : public J2KSyncEncoderThread +{ +public: + CUDAJ2KEncoderThread(J2KEncoder& encoder); + ~CUDAJ2KEncoderThread(); + + std::shared_ptr<dcp::ArrayData> encode(DCPVideo const& frame) override; + +private: + void allocate_pixel_data_d(dcp::Size size); + void free_pixel_data_d(); + + nvjpeg2kEncoder_t _encoder_handle; + nvjpeg2kEncodeState_t _encode_state; + nvjpeg2kEncodeParams_t _encode_params; + cudaStream_t _stream; + + uint8_t* _pixel_data_d[3] = { nullptr, nullptr, nullptr }; + size_t _pitch_in_bytes_d[3]; + dcp::Size _pixel_data_d_size; + + std::vector<int16_t> _xyz_x; + std::vector<int16_t> _xyz_y; + std::vector<int16_t> _xyz_z; +}; + + +#endif + diff --git a/src/lib/exceptions.cc b/src/lib/exceptions.cc index e2e7fc4bc..4eca9f67e 100644 --- a/src/lib/exceptions.cc +++ b/src/lib/exceptions.cc @@ -190,3 +190,20 @@ SQLError::get_filename(SQLiteDatabase& db) return {}; } + + +#ifdef DCPOMATIC_HAVE_NVJPEG2K +CUDAError::CUDAError(string function, cudaError error) + : runtime_error(fmt::format("CUDA error on {} ({})", function, static_cast<int>(error))) +{ + +} + + +CUDAError::CUDAError(string function, int error) + : runtime_error(fmt::format("CUDA error on {} ({})", function, error)) +{ + +} +#endif + diff --git a/src/lib/exceptions.h b/src/lib/exceptions.h index 212d002e2..5dd68408d 100644 --- a/src/lib/exceptions.h +++ b/src/lib/exceptions.h @@ -31,6 +31,9 @@ extern "C" { #include <libavutil/pixfmt.h> } +#ifdef DCPOMATIC_HAVE_NVJPEG2K +#include <cuda_runtime_api.h> +#endif #include <fmt/format.h> #include <boost/filesystem.hpp> #include <boost/optional.hpp> @@ -547,4 +550,13 @@ public: }; +#ifdef DCPOMATIC_HAVE_NVJPEG2K +class CUDAError : public std::runtime_error +{ +public: + CUDAError(std::string function, cudaError_t error); + CUDAError(std::string function, int); +}; +#endif + #endif diff --git a/src/lib/wscript b/src/lib/wscript index 2e7b0339c..621898037 100644 --- a/src/lib/wscript +++ b/src/lib/wscript @@ -255,7 +255,7 @@ def build(bld): BOOST_FILESYSTEM BOOST_THREAD BOOST_DATETIME BOOST_SIGNALS2 BOOST_REGEX SAMPLERATE POSTPROC TIFF SSH DCP CXML GLIB LZMA XML++ CURL ZIP BZ2 FONTCONFIG PANGOMM CAIROMM XMLSEC SUB ICU NETTLE PNG JPEG LEQM_NRT - LIBZ SQLITE3 + LIBZ SQLITE3 CUDA """ if bld.env.TARGET_OSX: @@ -269,6 +269,9 @@ def build(bld): if bld.env.TARGET_LINUX: obj.uselib += ' POLKIT' + if bld.env.HAVE_NVJPEG2K_H: + obj.source += ' cuda_j2k_encoder_thread.cc' + if bld.env.ENABLE_GROK: obj.source += ' grok_j2k_encoder_thread.cc grok/util.cc' diff --git a/src/tools/wscript b/src/tools/wscript index 3128486f5..cbe397f48 100644 --- a/src/tools/wscript +++ b/src/tools/wscript @@ -98,7 +98,7 @@ def configure(conf): def build(bld): uselib = 'BOOST_THREAD BOOST_DATETIME DCP XMLSEC CXML XMLPP AVFORMAT AVFILTER AVCODEC ' uselib += 'AVUTIL SWSCALE SWRESAMPLE POSTPROC CURL BOOST_FILESYSTEM SSH ZIP CAIROMM FONTCONFIG PANGOMM SUB ' - uselib += 'SNDFILE SAMPLERATE BOOST_REGEX ICU NETTLE RTAUDIO PNG JPEG LEQM_NRT SQLITE3 BOOST_PROCESS ' + uselib += 'SNDFILE SAMPLERATE BOOST_REGEX ICU NETTLE RTAUDIO PNG JPEG LEQM_NRT SQLITE3 BOOST_PROCESS CUDA ' if bld.env.ENABLE_DISK: if bld.env.TARGET_LINUX: @@ -81,6 +81,7 @@ def options(opt): opt.add_option('--disable-more-warnings', action='store_true', default=False, help='disable some warnings raised by Xcode 15 with the 2.16 branch') opt.add_option('--c++17', action='store_true', default=False, help='build with C++17 and libxml++-4.0') opt.add_option('--variant', help="build with variant") + opt.add_option('--cuda-path', help='path to directory containing include/cuda_runtime_api.h etc.', default='/usr/local/cuda') def configure(conf): conf.load('compiler_cxx') @@ -193,6 +194,9 @@ def configure(conf): except conf.errors.ConfigurationError: pass + conf.env.append_value('INCLUDES', os.path.join(conf.options.cuda_path, 'include')) + conf.env.append_value('LIBPATH', os.path.join(conf.options.cuda_path, 'targets', 'x86_64-linux', 'lib')) + # # Windows/Linux/macOS specific # @@ -680,6 +684,9 @@ def configure(conf): define_name="DCPOMATIC_HAVE_SQLITE3_PREPARE_V3", mandatory=False) + if conf.check(header_name="nvjpeg2k.h", mandatory=False, define_name="DCPOMATIC_HAVE_NVJPEG2K"): + conf.env.LIB_CUDA = ['cudart', 'nvjpeg2k'] + # Other stuff |
