From de85e79a50944e0eb9e87412e02537bf83e15928 Mon Sep 17 00:00:00 2001 From: Carl Hetherington Date: Mon, 23 May 2022 23:03:45 +0200 Subject: [PATCH] Separate thread for handling the CUDA parts seems a little better. --- src/lib/cuda_j2k_frame_encoder.cc | 180 ++++++++++++++++++++---------- src/lib/cuda_j2k_frame_encoder.h | 32 ++++-- 2 files changed, 145 insertions(+), 67 deletions(-) diff --git a/src/lib/cuda_j2k_frame_encoder.cc b/src/lib/cuda_j2k_frame_encoder.cc index 445548558..95b742484 100644 --- a/src/lib/cuda_j2k_frame_encoder.cc +++ b/src/lib/cuda_j2k_frame_encoder.cc @@ -25,35 +25,124 @@ #include "dcp_video.h" #include "exceptions.h" #include "player_video.h" +#include #include #include +#include #include using std::make_pair; +using std::thread; using std::vector; using boost::optional; +vector CUDAJ2KFrameEncoder::_cuda_threads; +std::queue CUDAJ2KFrameEncoder::_input; +std::map, dcp::ArrayData> CUDAJ2KFrameEncoder::_output; +boost::condition CUDAJ2KFrameEncoder::_input_condition; +boost::mutex CUDAJ2KFrameEncoder::_input_mutex; +boost::condition CUDAJ2KFrameEncoder::_output_condition; +boost::mutex CUDAJ2KFrameEncoder::_output_mutex; + + CUDAJ2KFrameEncoder::CUDAJ2KFrameEncoder() { - nvjpeg2kEncoderCreateSimple(&_encoder_handle); - nvjpeg2kEncodeStateCreate(_encoder_handle, &_encoder_state); - nvjpeg2kEncodeParamsCreate(&_encoder_params); - - cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking); + if (_cuda_threads.empty()) { + for (int i = 0; i < 8; ++i) { + _cuda_threads.push_back(std::thread(&CUDAJ2KFrameEncoder::cuda_thread)); + } + } } -CUDAJ2KFrameEncoder::~CUDAJ2KFrameEncoder() +void +CUDAJ2KFrameEncoder::cuda_thread() { - cudaStreamDestroy(_stream); + nvjpeg2kEncoder_t encoder_handle; + nvjpeg2kEncodeState_t encoder_state; + nvjpeg2kEncodeParams_t encoder_params; + + nvjpeg2kEncoderCreateSimple(&encoder_handle); + nvjpeg2kEncodeStateCreate(encoder_handle, &encoder_state); + nvjpeg2kEncodeParamsCreate(&encoder_params); + + cudaStream_t stream; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + + while (true) { + boost::mutex::scoped_lock lm(_input_mutex); + while (_input.empty()) { + std::cout << "gpu starved.\n"; + _input_condition.wait(lm); + } + + auto input = std::move(_input.front()); + _input.pop(); + lm.unlock(); + + nvjpeg2kImageComponentInfo_t info[3]; + for (int i = 0; i < 3; ++i) { + info[i].component_width = input.size().width; + info[i].component_height = input.size().height; + info[i].precision = 12; + info[i].sgn = 0; + } + + nvjpeg2kEncodeConfig_t config; + memset(&config, 0, sizeof(config)); + config.stream_type = NVJPEG2K_STREAM_J2K; + config.color_space = NVJPEG2K_COLORSPACE_SRGB; + config.image_width = input.size().width; + config.image_height = input.size().height; + config.num_components = 3; + config.image_comp_info = reinterpret_cast(&info); + config.code_block_w = 32; + config.code_block_h = 32; + config.irreversible = 0; + config.mct_mode = 1; + config.prog_order = NVJPEG2K_CPRL; + config.num_resolutions = input.resolution() == Resolution::FOUR_K ? 7 : 6; + + auto status = nvjpeg2kEncodeParamsSetEncodeConfig(encoder_params, &config); + if (status != NVJPEG2K_STATUS_SUCCESS) { + throw CUDAError("nvjpeg2kEncodeParamsSetEncodeConfig", status); + } + + // XXX: quality + status = nvjpeg2kEncodeParamsSetQuality(encoder_params, 30); + if (status != NVJPEG2K_STATUS_SUCCESS) { + throw CUDAError("nvjpeg2kEncodeParamsSetQuality", status); + } + + status = nvjpeg2kEncode(encoder_handle, encoder_state, encoder_params, input.device_image(), stream); + if (status != NVJPEG2K_STATUS_SUCCESS) { + throw CUDAError("nvjpeg2kEncode", status); + } + + size_t compressed_size; + status = nvjpeg2kEncodeRetrieveBitstream(encoder_handle, encoder_state, nullptr, &compressed_size, stream); + + dcp::ArrayData output(compressed_size); + status = nvjpeg2kEncodeRetrieveBitstream(encoder_handle, encoder_state, output.data(), &compressed_size, stream); + if (status != NVJPEG2K_STATUS_SUCCESS) { + throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status); + } + + boost::mutex::scoped_lock lm2(_output_mutex); + _output[make_pair(input.index(), input.eyes())] = output; + _output_condition.notify_all(); + } + + cudaStreamDestroy(stream); } -CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf, cudaStream_t stream) +CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf) : _index(vf.index()) , _eyes(vf.eyes()) + , _resolution(vf.resolution()) { _xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2)); @@ -61,7 +150,9 @@ CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf, cudaStream_t stream) _pixel_data_h[i] = reinterpret_cast(_xyz->data(i)); } - auto const pitch = _xyz->size().width * 2; + _size = _xyz->size(); + + auto const pitch = _size.width * 2; for (int i = 0; i < 3; ++i) { _pitch_in_bytes[i] = pitch; @@ -69,22 +160,21 @@ CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf, cudaStream_t stream) reinterpret_cast(&_pixel_data_d[i]), &_pitch_in_bytes[i], pitch, - _xyz->size().height + _size.height ); if (status != cudaSuccess) { throw CUDAError("cudaMallocPitch", status); } - status = cudaMemcpy2DAsync( + status = cudaMemcpy2D( _pixel_data_d[i], _pitch_in_bytes[i], _pixel_data_h[i], _pitch_in_bytes[i], pitch, - _xyz->size().height, - cudaMemcpyHostToDevice, - stream + _size.height, + cudaMemcpyHostToDevice ); if (status != cudaSuccess) { @@ -102,6 +192,8 @@ CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf, cudaStream_t stream) CUDAJ2KFrameEncoder::Input::Input(Input&& other) : _index(other._index) , _eyes(other._eyes) + , _size(other._size) + , _resolution(other._resolution) { for (int i = 0; i < 3; ++i) { _pixel_data_d[i] = other._pixel_data_d[i]; @@ -127,7 +219,7 @@ CUDAJ2KFrameEncoder::Input::~Input() optional CUDAJ2KFrameEncoder::encode(DCPVideo const& vf) { - auto input = Input(vf, _stream); + auto input = Input(vf); auto const size = vf.frame()->out_size(); DCPOMATIC_ASSERT(!_size || size == *_size); @@ -136,58 +228,26 @@ CUDAJ2KFrameEncoder::encode(DCPVideo const& vf) DCPOMATIC_ASSERT(!_resolution || vf.resolution() == *_resolution); _resolution = vf.resolution(); - nvjpeg2kImageComponentInfo_t info[3]; - for (int i = 0; i < 3; ++i) { - info[i].component_width = _size->width; - info[i].component_height = _size->height; - info[i].precision = 12; - info[i].sgn = 0; - } - - nvjpeg2kEncodeConfig_t config; - memset(&config, 0, sizeof(config)); - config.stream_type = NVJPEG2K_STREAM_J2K; - config.color_space = NVJPEG2K_COLORSPACE_SRGB; - config.image_width = _size->width; - config.image_height = _size->height; - config.num_components = 3; - config.image_comp_info = reinterpret_cast(&info); - config.code_block_w = 32; - config.code_block_h = 32; - config.irreversible = 0; - config.mct_mode = 1; - config.prog_order = NVJPEG2K_CPRL; - config.num_resolutions = *_resolution == Resolution::FOUR_K ? 7 : 6; - - auto status = nvjpeg2kEncodeParamsSetEncodeConfig(_encoder_params, &config); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kEncodeParamsSetEncodeConfig", status); - } - - // XXX: quality - status = nvjpeg2kEncodeParamsSetQuality(_encoder_params, 30); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kEncodeParamsSetQuality", status); + { + boost::mutex::scoped_lock lm (_input_mutex); + _input.push(std::move(input)); + std::cout << "push input: " << _input.size() << "\n"; + _input_condition.notify_all(); } - status = nvjpeg2kEncode(_encoder_handle, _encoder_state, _encoder_params, input.device_image(), _stream); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kEncode", status); + boost::mutex::scoped_lock lm(_output_mutex); + while (_output.find(make_pair(vf.index(), vf.eyes())) == _output.end()) { + _output_condition.wait(lm); } - size_t compressed_size; - status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, nullptr, &compressed_size, _stream); - - dcp::ArrayData output(compressed_size); - status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, output.data(), &compressed_size, _stream); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status); - } - - return output; + auto iter = _output.find(make_pair(vf.index(), vf.eyes())); + auto data = iter->second; + _output.erase(iter); + return data; } + void CUDAJ2KFrameEncoder::log_thread_start () { diff --git a/src/lib/cuda_j2k_frame_encoder.h b/src/lib/cuda_j2k_frame_encoder.h index 36539a05e..34d8a600b 100644 --- a/src/lib/cuda_j2k_frame_encoder.h +++ b/src/lib/cuda_j2k_frame_encoder.h @@ -27,9 +27,12 @@ #include "types.h" #include #include +#include #include #include #include +#include +#include #include @@ -37,7 +40,6 @@ class CUDAJ2KFrameEncoder : public J2KFrameEncoder { public: CUDAJ2KFrameEncoder(); - ~CUDAJ2KFrameEncoder(); boost::optional encode(DCPVideo const &) override; void flush() override; @@ -46,14 +48,10 @@ public: private: void encode_queue(); - nvjpeg2kEncoder_t _encoder_handle; - nvjpeg2kEncodeState_t _encoder_state; - nvjpeg2kEncodeParams_t _encoder_params; - class Input { public: - Input(DCPVideo const& vf, cudaStream_t stream); + Input(DCPVideo const& vf); Input(Input const& other) = delete; Input(Input&& other); ~Input(); @@ -72,6 +70,14 @@ private: return _eyes; } + dcp::Size size() const { + return _size; + } + + Resolution resolution() const { + return _resolution; + } + private: std::shared_ptr _xyz; uint8_t* _pixel_data_h[3]; @@ -80,11 +86,23 @@ private: nvjpeg2kImage_t _device_image; int _index; Eyes _eyes; + dcp::Size _size; + Resolution _resolution; }; boost::optional _size; boost::optional _resolution; - cudaStream_t _stream; + + static void cuda_thread(); + + static std::vector _cuda_threads; + static std::queue _input; + static boost::condition _input_condition; + static boost::mutex _input_mutex; + + static std::map, dcp::ArrayData> _output; + static boost::condition _output_condition; + static boost::mutex _output_mutex; }; -- 2.30.2