From 7ace2ca4cd301b4c69671a6983804c6a13ec72a0 Mon Sep 17 00:00:00 2001 From: Carl Hetherington Date: Mon, 23 May 2022 00:41:35 +0200 Subject: fixup! wip: Add CUDA J2K frame encoder using libjpeg2k. --- src/lib/cuda_j2k_frame_encoder.cc | 179 ++++++++++++++++++++++---------------- src/lib/cuda_j2k_frame_encoder.h | 36 +++++--- 2 files changed, 128 insertions(+), 87 deletions(-) diff --git a/src/lib/cuda_j2k_frame_encoder.cc b/src/lib/cuda_j2k_frame_encoder.cc index b382e8d3f..675b26f57 100644 --- a/src/lib/cuda_j2k_frame_encoder.cc +++ b/src/lib/cuda_j2k_frame_encoder.cc @@ -27,8 +27,10 @@ #include "player_video.h" #include #include +#include +using std::vector; using boost::optional; @@ -37,116 +39,131 @@ CUDAJ2KFrameEncoder::CUDAJ2KFrameEncoder() nvjpeg2kEncoderCreateSimple(&_encoder_handle); nvjpeg2kEncodeStateCreate(_encoder_handle, &_encoder_state); nvjpeg2kEncodeParamsCreate(&_encoder_params); - - _pixel_data_d[0] = _pixel_data_d[1] = _pixel_data_d[2] = nullptr; } -void -CUDAJ2KFrameEncoder::allocate(dcp::Size size) +CUDAJ2KFrameEncoder::Frame::Frame(DCPVideo const& vf) { - for (int i = 0; i < 3; ++i) { - _image_component_info[i].component_width = size.width; - _image_component_info[i].component_height = size.height; - _image_component_info[i].precision = 12; - _image_component_info[i].sgn = 0; - _pitch_in_bytes[i] = size.width * 2; - } + auto xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2)); + + void* pixel_data_h[] = { + xyz->data(0), + xyz->data(1), + xyz->data(2) + }; + + auto const pitch = xyz->size().width * 2; for (int i = 0; i < 3; ++i) { - auto error = cudaMallocPitch( + _pitch_in_bytes[i] = pitch; + auto status = cudaMallocPitch( reinterpret_cast(&_pixel_data_d[i]), &_pitch_in_bytes[i], - _image_component_info[i].component_width * 2, - _image_component_info[i].component_height + pitch, + xyz->size().height ); - if (error != cudaSuccess) { - throw CUDAError("cudaMallocPitch", error); + + if (status != cudaSuccess) { + throw CUDAError("cudaMallocPitch", status); } - } - _allocated_size = size; -} + status = cudaMemcpy2D( + _pixel_data_d[i], + _pitch_in_bytes[i], + pixel_data_h[i], + _pitch_in_bytes[i], + pitch, + xyz->size().height, + cudaMemcpyHostToDevice + ); + if (status != cudaSuccess) { + throw CUDAError("cudaMemcpy2D", status); + } -CUDAJ2KFrameEncoder::~CUDAJ2KFrameEncoder() -{ - free(); + cudaDeviceSynchronize(); + } + + _device_image.num_components = 3; + _device_image.pixel_data = reinterpret_cast(_pixel_data_d); + _device_image.pixel_type = NVJPEG2K_UINT16; + _device_image.pitch_in_bytes = reinterpret_cast(_pitch_in_bytes); } -void -CUDAJ2KFrameEncoder::free() +CUDAJ2KFrameEncoder::Frame::Frame(Frame&& other) { for (int i = 0; i < 3; ++i) { - cudaFree(&_pixel_data_d[i]); + _pixel_data_d[i] = other._pixel_data_d[i]; + other._pixel_data_d[i] = nullptr; + _pitch_in_bytes[i] = other._pitch_in_bytes[i]; } - _pixel_data_d[0] = _pixel_data_d[1] = _pixel_data_d[2] = nullptr; + _device_image.num_components = other._device_image.num_components; + _device_image.pixel_data = reinterpret_cast(_pixel_data_d); + _device_image.pixel_type = NVJPEG2K_UINT16; + _device_image.pitch_in_bytes = reinterpret_cast(_pitch_in_bytes); } -optional -CUDAJ2KFrameEncoder::encode(DCPVideo const& vf) +CUDAJ2KFrameEncoder::Frame::~Frame() { - std::cout << "cuda encode starts.\n"; + cudaFree(_pixel_data_d[0]); + cudaFree(_pixel_data_d[1]); + cudaFree(_pixel_data_d[2]); +} - auto frame = vf.frame(); - auto size = frame->out_size(); - if (size != _allocated_size) { - free(); - allocate(size); - } +vector +CUDAJ2KFrameEncoder::encode(DCPVideo const& vf) +{ + int constexpr BATCH_SIZE = 128; - auto xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2)); + auto const size = vf.frame()->out_size(); + DCPOMATIC_ASSERT(!_size || size == *_size); + _size = size; - for (int i = 0; i < 3; ++i) { - _pixel_data_h[i] = reinterpret_cast(xyz->data(i)); + DCPOMATIC_ASSERT(!_resolution || vf.resolution() == *_resolution); + _resolution = vf.resolution(); + + if (_batch.size() < BATCH_SIZE) { + _batch.push_back(Frame(vf)); + return {}; } - nvjpeg2kImage_t image_h; - image_h.num_components = 3; - image_h.pixel_data = reinterpret_cast(_pixel_data_h); - image_h.pixel_type = NVJPEG2K_UINT16; - image_h.pitch_in_bytes = reinterpret_cast(_pitch_in_bytes); + return flush(); +} + - nvjpeg2kImage_t image_d; - image_d.num_components = 3; - image_d.pixel_data = reinterpret_cast(_pixel_data_d); - image_d.pixel_type = NVJPEG2K_UINT16; - image_d.pitch_in_bytes = reinterpret_cast(_pitch_in_bytes); +vector +CUDAJ2KFrameEncoder::flush() +{ + if (_batch.empty()) { + return {}; + } + nvjpeg2kImageComponentInfo_t info[3]; for (int i = 0; i < 3; ++i) { - auto status = cudaMemcpy2D( - image_d.pixel_data[i], - image_d.pitch_in_bytes[i], - image_h.pixel_data[i], - image_h.pitch_in_bytes[i], - 2 * _image_component_info[i].component_width, - _image_component_info[i].component_height, - cudaMemcpyHostToDevice - ); - if (status != cudaSuccess) { - throw CUDAError("cudaMemcpy2D", status); - } - cudaDeviceSynchronize(); + info[i].component_width = _size->width; + info[i].component_height = _size->height; + info[i].precision = 12; + info[i].sgn = 0; } nvjpeg2kEncodeConfig_t config; memset(&config, 0, sizeof(config)); config.stream_type = NVJPEG2K_STREAM_J2K; config.color_space = NVJPEG2K_COLORSPACE_SRGB; - config.image_width = size.width; - config.image_height = size.height; + config.image_width = _size->width; + config.image_height = _size->height; config.num_components = 3; - config.image_comp_info = reinterpret_cast(&_image_component_info); + config.image_comp_info = reinterpret_cast(&info); config.code_block_w = 32; config.code_block_h = 32; config.irreversible = 0; config.mct_mode = 1; config.prog_order = NVJPEG2K_CPRL; - config.num_resolutions = vf.resolution() == Resolution::FOUR_K ? 7 : 6; + config.num_resolutions = *_resolution == Resolution::FOUR_K ? 7 : 6; auto status = nvjpeg2kEncodeParamsSetEncodeConfig(_encoder_params, &config); if (status != NVJPEG2K_STATUS_SUCCESS) { @@ -159,23 +176,31 @@ CUDAJ2KFrameEncoder::encode(DCPVideo const& vf) throw CUDAError("nvjpeg2kEncodeParamsSetQuality", status); } - status = nvjpeg2kEncode(_encoder_handle, _encoder_state, _encoder_params, &image_d, 0); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kEncode", status); - } + vector output; - size_t compressed_size; - status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, nullptr, &compressed_size, 0); + for (auto const& frame: _batch) { - dcp::ArrayData output(compressed_size); - status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, output.data(), &compressed_size, 0); - cudaDeviceSynchronize(); + auto x = frame.device_image(); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status); + status = nvjpeg2kEncode(_encoder_handle, _encoder_state, _encoder_params, frame.device_image(), 0); + if (status != NVJPEG2K_STATUS_SUCCESS) { + throw CUDAError("nvjpeg2kEncode", status); + } + + size_t compressed_size; + status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, nullptr, &compressed_size, 0); + + dcp::ArrayData this_output(compressed_size); + status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, this_output.data(), &compressed_size, 0); + if (status != NVJPEG2K_STATUS_SUCCESS) { + throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status); + } + + output.push_back(this_output); + cudaStreamSynchronize(0); } - std::cout << "cuda encode ends with " << status << " " << output.size() << "\n"; + _batch.clear(); return output; } diff --git a/src/lib/cuda_j2k_frame_encoder.h b/src/lib/cuda_j2k_frame_encoder.h index 46ac64607..28e8a3c76 100644 --- a/src/lib/cuda_j2k_frame_encoder.h +++ b/src/lib/cuda_j2k_frame_encoder.h @@ -24,6 +24,7 @@ #include "j2k_frame_encoder.h" +#include "types.h" #include #include #include @@ -33,24 +34,39 @@ class CUDAJ2KFrameEncoder : public J2KFrameEncoder { public: CUDAJ2KFrameEncoder(); - ~CUDAJ2KFrameEncoder(); - boost::optional encode(DCPVideo const &) override; + std::vector encode(DCPVideo const &) override; + std::vector flush() override; void log_thread_start() override; private: - void allocate(dcp::Size size); - void free(); - nvjpeg2kEncoder_t _encoder_handle; nvjpeg2kEncodeState_t _encoder_state; nvjpeg2kEncodeParams_t _encoder_params; - uint8_t* _pixel_data_h[3]; - uint8_t* _pixel_data_d[3]; - size_t _pitch_in_bytes[3]; - nvjpeg2kImageComponentInfo_t _image_component_info[3]; - dcp::Size _allocated_size; + class Frame + { + public: + Frame(DCPVideo const& vf); + Frame(Frame const& other) = delete; + Frame(Frame&& other); + ~Frame(); + + Frame& operator=(Frame const& other) = delete; + + nvjpeg2kImage_t const* const device_image() const { + return &_device_image; + } + + private: + uint8_t* _pixel_data_d[3]; + size_t _pitch_in_bytes[3]; + nvjpeg2kImage_t _device_image; + }; + + std::vector _batch; + boost::optional _size; + boost::optional _resolution; }; -- cgit v1.2.3