From 322acdcd0fa514aa4612edb1cba0a1bec792cba8 Mon Sep 17 00:00:00 2001 From: Carl Hetherington Date: Tue, 24 May 2022 22:33:06 +0200 Subject: [PATCH] fixup! Playback sort of works. --- src/lib/cuda.cc | 195 ------------------------- src/lib/cuda_decoder.cc | 222 +++++++++++++++++++++++++++++ src/lib/{cuda.h => cuda_decoder.h} | 44 +++--- src/lib/j2k_image_proxy.cc | 4 +- src/lib/wscript | 2 +- 5 files changed, 251 insertions(+), 216 deletions(-) delete mode 100644 src/lib/cuda.cc create mode 100644 src/lib/cuda_decoder.cc rename src/lib/{cuda.h => cuda_decoder.h} (60%) diff --git a/src/lib/cuda.cc b/src/lib/cuda.cc deleted file mode 100644 index d12dfe34a..000000000 --- a/src/lib/cuda.cc +++ /dev/null @@ -1,195 +0,0 @@ -/* - Copyright (C) 2022 Carl Hetherington - - This file is part of DCP-o-matic. - - DCP-o-matic is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - DCP-o-matic is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with DCP-o-matic. If not, see . - -*/ - - -#include "cuda.h" -#include "dcpomatic_assert.h" -#include "exceptions.h" -#include -#include - - -using std::shared_ptr; - - -CUDA* CUDA::_instance = nullptr; - - -CUDA::CUDA() -{ - _decode_thread = std::thread(std::bind(&CUDA::decode_thread, this)); -} - - -void -CUDA::decode_thread() -{ - nvjpeg2kHandle_t handle; - auto status = nvjpeg2kCreateSimple(&handle); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kCreateSimple", status); - } - - nvjpeg2kDecodeState_t decode_state; - status = nvjpeg2kDecodeStateCreate(handle, &decode_state); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kDecodeStateCreate", status); - } - - nvjpeg2kStream_t jpeg2k_stream; - status = nvjpeg2kStreamCreate(&jpeg2k_stream); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kStreamCreate", status); - } - - while (true) { - boost::mutex::scoped_lock lm(_decode_mutex); - while (_decode_queue.empty()) { - _decode_queue_empty_condition.wait(lm); - } - - auto input = std::move(_decode_queue.front()); - _decode_queue.pop(); - lm.unlock(); - - try { - std::cout << "we got " << input.data->size() << " bytes.\n"; - auto status = nvjpeg2kStreamParse(handle, input.data->data(), input.data->size(), 0, 0, jpeg2k_stream); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kStreamParse", status); - } - - nvjpeg2kImageInfo_t image_info; - status = nvjpeg2kStreamGetImageInfo(jpeg2k_stream, &image_info); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kStreamGetImageInfo", status); - } - std::cout << image_info.num_components << " components.\n"; - - nvjpeg2kImageComponentInfo_t image_component_info[3]; - for (int i = 0; i < 3; ++i) { - status = nvjpeg2kStreamGetImageComponentInfo(jpeg2k_stream, &image_component_info[i], i); - if (status != NVJPEG2K_STATUS_SUCCESS) { - throw CUDAError("nvjpeg2kStreamGetImageComponentInfo", status); - } - } - - auto const width = image_component_info[0].component_width; - auto const height = image_component_info[0].component_height; - std::cout << width << "x" << height << " " << ((int)image_component_info[0].precision) << "\n"; - - uint16_t* decoded_d[3]; - size_t pitch[3]; - - for (int i = 0; i < 3; ++i) { - printf("cudaMallocPitch %d %d\n", width * 2, height); - auto status = cudaMallocPitch(reinterpret_cast(&decoded_d[i]), &pitch[i], width * 2, height); - if (status != cudaSuccess) { - throw CUDAError("cudaMallocPitch", status); - } - } - - nvjpeg2kImage_t output_image; - output_image.pixel_data = reinterpret_cast(decoded_d); - output_image.pixel_type = NVJPEG2K_UINT16; - output_image.pitch_in_bytes = pitch; - output_image.num_components = 3; - - status = nvjpeg2kDecode(handle, decode_state, jpeg2k_stream, &output_image, 0); - std::cout << "decode said " << status << "\n"; - if (status != NVJPEG2K_STATUS_SUCCESS) { - abort(); - throw CUDAError("nvjpeg2kDecode", status); - } - cudaDeviceSynchronize(); - - std::vector decoded_h[3]; - for (int i = 0; i < 3; ++i) { - auto size = pitch[i] * height; - decoded_h[i].resize(size / 2); - auto status = cudaMemcpy(decoded_h[i].data(), decoded_d[i], size, cudaMemcpyDeviceToHost); - if (status != cudaSuccess) { - throw CUDAError("cudaMemcpy", status); - } - } - - auto output = std::make_shared(input.pixel_format, dcp::Size(width, height), input.alignment); - int p = 0; - for (size_t y = 0; y < height; ++y) { - auto q = reinterpret_cast(output->data()[0] + y * output->stride()[0]); - for (size_t x = 0; x < width; ++x) { - *q++ = decoded_h[0][p] << 4; - *q++ = decoded_h[1][p] << 4; - *q++ = decoded_h[2][p] << 4; - ++p; - } - } - - for (int i = 0; i < 3; ++i) { - cudaFree(decoded_d[i]); - } - - lm.lock(); - _decode_output[input.id] = output; - _decode_complete_condition.notify_all(); - lm.unlock(); - } catch (CUDAError&) { - lm.lock(); - _decode_output[input.id] = {}; - _decode_complete_condition.notify_all(); - lm.unlock(); - } - } -} - - -shared_ptr -CUDA::decode(shared_ptr j2k_data, int reduce, AVPixelFormat pixel_format, Image::Alignment alignment) -{ - boost::mutex::scoped_lock lm(_decode_mutex); - auto id = _next_decode_id++; - _decode_queue.push({id, j2k_data, reduce, pixel_format, alignment}); - _decode_queue_empty_condition.notify_all(); - - while (_decode_output.find(id) == _decode_output.end()) { - _decode_complete_condition.wait(lm); - } - - auto iter = _decode_output.find(id); - if (iter ==_decode_output.end()) { - return {}; - } - - auto output = *iter; - _decode_output.erase(iter); - return output.second; -} - - -CUDA * -CUDA::instance() -{ - if (!_instance) { - _instance = new CUDA(); - } - - return _instance; -} - diff --git a/src/lib/cuda_decoder.cc b/src/lib/cuda_decoder.cc new file mode 100644 index 000000000..bc4c34232 --- /dev/null +++ b/src/lib/cuda_decoder.cc @@ -0,0 +1,222 @@ +/* + Copyright (C) 2022 Carl Hetherington + + This file is part of DCP-o-matic. + + DCP-o-matic is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + DCP-o-matic is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with DCP-o-matic. If not, see . + +*/ + + +#include "cuda_decoder.h" +#include "dcpomatic_assert.h" +#include "dcpomatic_log.h" +#include "exceptions.h" +#include "image.h" +#include "scope_guard.h" +#include +#include + + +using std::shared_ptr; +using std::string; + + +CUDADecoder* CUDADecoder::_instance = nullptr; + + +CUDADecoder::CUDADecoder() +{ + _thread = boost::thread(std::bind(&CUDADecoder::thread, this)); +} + + +CUDADecoder::~CUDADecoder() +{ + try { + _thread.interrupt(); + _thread.join(); + } catch (...) {} + + for (int i = 0; i < 3; ++i) { + cudaFree(_device[i]); + } +} + + +void +CUDADecoder::check_jpeg2k(string name, nvjpeg2kStatus_t status) +{ + if (status != NVJPEG2K_STATUS_SUCCESS) { + throw CUDAError(name, status); + } +} + + +void +CUDADecoder::thread() +try +{ + nvjpeg2kHandle_t handle; + check_jpeg2k("nvjpeg2kCreateSimple", nvjpeg2kCreateSimple(&handle)); + ScopeGuard handle_guard([handle]() { + nvjpeg2kDestroy(handle); + }); + + nvjpeg2kDecodeState_t state; + check_jpeg2k("nvjpeg2kDecodeStateCreate", nvjpeg2kDecodeStateCreate(handle, &state)); + ScopeGuard state_guard([&state]() { + nvjpeg2kDecodeStateDestroy(state); + }); + + nvjpeg2kStream_t jpeg2k_stream; + check_jpeg2k("nvjpeg2kStreamCreate", nvjpeg2kStreamCreate(&jpeg2k_stream)); + ScopeGuard jpeg2k_stream_guard([&jpeg2k_stream]() { + nvjpeg2kStreamDestroy(jpeg2k_stream); + }); + + while (true) { + boost::mutex::scoped_lock lm(_mutex); + while (_queue.empty()) { + _queue_empty_condition.wait(lm); + } + + auto input = std::move(_queue.front()); + _queue.pop(); + lm.unlock(); + + auto output = decode_one(input, handle, state, jpeg2k_stream); + + lm.lock(); + _output[input.id] = output; + _complete_condition.notify_all(); + } +} +catch (CUDAError& e) +{ + LOG_ERROR("CUDA error: %1 (aborting CUDADecoder)", e.what()); +} +catch (boost::thread_interrupted&) +{ + +} +catch (std::exception& e) +{ + LOG_ERROR("Aborting CUDADecoder thread: %1", e.what()); +} +catch (...) +{ + LOG_ERROR_NC("Aborting CUDADecoder thread: unknown error"); +} + + + +shared_ptr +CUDADecoder::decode_one(QueueItem const& input, nvjpeg2kHandle_t handle, nvjpeg2kDecodeState_t state, nvjpeg2kStream_t jpeg2k_stream) +{ + try { + check_jpeg2k("nvjpeg2kStreamParse", nvjpeg2kStreamParse(handle, input.data->data(), input.data->size(), 0, 0, jpeg2k_stream)); + + nvjpeg2kImageInfo_t image_info; + check_jpeg2k("nvjpeg2kStreamGetImageInfo", nvjpeg2kStreamGetImageInfo(jpeg2k_stream, &image_info)); + + nvjpeg2kImageComponentInfo_t image_component_info[3]; + for (int i = 0; i < 3; ++i) { + check_jpeg2k("nvjpeg2kStreamGetImageComponentInfo", nvjpeg2kStreamGetImageComponentInfo(jpeg2k_stream, &image_component_info[i], i)); + } + + dcp::Size size(image_component_info[0].component_width, image_component_info[0].component_height); + if (size != _allocation) { + for (int i = 0; i < 3; ++i) { + cudaFree(_device[i]); + _device[i] = nullptr; + auto status = cudaMallocPitch(reinterpret_cast(&_device[i]), &_pitch[i], size.width * 2, size.height); + if (status != cudaSuccess) { + throw CUDAError("cudaMallocPitch", status); + } + _host[i].resize(_pitch[i] * size.height / 2); + } + _allocation = size; + } + + nvjpeg2kImage_t output_image; + output_image.pixel_data = reinterpret_cast(_device); + output_image.pixel_type = NVJPEG2K_UINT16; + output_image.pitch_in_bytes = _pitch; + output_image.num_components = 3; + + check_jpeg2k("nvjpeg2kDecode", nvjpeg2kDecode(handle, state, jpeg2k_stream, &output_image, 0)); + cudaDeviceSynchronize(); + + for (int i = 0; i < 3; ++i) { + auto status = cudaMemcpy(_host[i].data(), _device[i], _pitch[i] * size.height, cudaMemcpyDeviceToHost); + if (status != cudaSuccess) { + throw CUDAError("cudaMemcpy", status); + } + } + + auto output = std::make_shared(input.pixel_format, size, input.alignment); + for (int y = 0; y < size.height; ++y) { + int p = y * _pitch[0] / 2; + auto q = reinterpret_cast(output->data()[0] + y * output->stride()[0]); + for (int x = 0; x < size.width; ++x) { + *q++ = _host[0][p] << 4; + *q++ = _host[1][p] << 4; + *q++ = _host[2][p] << 4; + ++p; + } + } + + return output; + + } catch (CUDAError& e) { + LOG_ERROR("CUDA error: %1", e.what()); + return {}; + } +} + + +shared_ptr +CUDADecoder::decode(shared_ptr j2k_data, int reduce, AVPixelFormat pixel_format, Image::Alignment alignment) +{ + boost::mutex::scoped_lock lm(_mutex); + auto id = _next_id++; + _queue.push({id, j2k_data, reduce, pixel_format, alignment}); + _queue_empty_condition.notify_all(); + + while (_output.find(id) == _output.end()) { + _complete_condition.wait(lm); + } + + auto iter = _output.find(id); + if (iter == _output.end()) { + return {}; + } + + auto output = *iter; + _output.erase(iter); + return output.second; +} + + +CUDADecoder * +CUDADecoder::instance() +{ + if (!_instance) { + _instance = new CUDADecoder(); + } + + return _instance; +} + diff --git a/src/lib/cuda.h b/src/lib/cuda_decoder.h similarity index 60% rename from src/lib/cuda.h rename to src/lib/cuda_decoder.h index 8d89c7df7..63983861f 100644 --- a/src/lib/cuda.h +++ b/src/lib/cuda_decoder.h @@ -24,31 +24,31 @@ extern "C" { #include } +#include +#include #include #include #include #include -#include -class CUDA +class CUDADecoder { public: - CUDA(); + CUDADecoder(); + ~CUDADecoder(); - CUDA(CUDA &) = delete; - CUDA(CUDA &&) = delete; + CUDADecoder(CUDADecoder &) = delete; + CUDADecoder(CUDADecoder &&) = delete; std::shared_ptr decode(std::shared_ptr j2k_data, int reduce, AVPixelFormat pixel_format, Image::Alignment alignment); - static CUDA* instance(); + static CUDADecoder* instance(); private: - void decode_thread(); - typedef uint64_t ID; - class DecodeQueueItem + class QueueItem { public: ID id; @@ -58,14 +58,22 @@ private: Image::Alignment alignment; }; - std::queue _decode_queue; - std::map> _decode_output; - boost::condition _decode_queue_empty_condition; - boost::condition _decode_complete_condition; - boost::mutex _decode_mutex; - std::thread _decode_thread; - ID _next_decode_id = 0; - - static CUDA* _instance; + void thread(); + std::shared_ptr decode_one(QueueItem const& input, nvjpeg2kHandle_t handle, nvjpeg2kDecodeState_t state, nvjpeg2kStream_t jpeg2k_stream); + void check_jpeg2k(std::string name, nvjpeg2kStatus_t status); + + std::queue _queue; + std::map> _output; + boost::condition _queue_empty_condition; + boost::condition _complete_condition; + boost::mutex _mutex; + boost::thread _thread; + ID _next_id = 0; + dcp::Size _allocation; + uint16_t* _device[3] = { nullptr, nullptr, nullptr }; + size_t _pitch[3]; + std::vector _host[3]; + + static CUDADecoder* _instance; }; diff --git a/src/lib/j2k_image_proxy.cc b/src/lib/j2k_image_proxy.cc index 26b2064d7..f6b3b0373 100644 --- a/src/lib/j2k_image_proxy.cc +++ b/src/lib/j2k_image_proxy.cc @@ -20,7 +20,7 @@ #ifdef HAVE_NVJPEG2K_H -#include "cuda.h" +#include "cuda_decoder.h" #endif #include "dcpomatic_assert.h" #include "dcpomatic_socket.h" @@ -171,7 +171,7 @@ J2KImageProxy::prepare (Image::Alignment alignment, optional target_s } } #endif - _image = CUDA::instance()->decode(_data, 0, _pixel_format, alignment); + _image = CUDADecoder::instance()->decode(_data, 0, _pixel_format, alignment); } catch (dcp::J2KDecompressionError& e) { _image = make_shared(_pixel_format, _size, alignment); _image->make_black (); diff --git a/src/lib/wscript b/src/lib/wscript index 9dce415bb..c673a6f0d 100644 --- a/src/lib/wscript +++ b/src/lib/wscript @@ -229,7 +229,7 @@ def build(bld): obj.uselib += ' POLKIT' if bld.env.HAVE_NVJPEG2K_H: - obj.source += ' cuda.cc cuda_j2k_frame_encoder.cc' + obj.source += ' cuda_decoder.cc cuda_j2k_frame_encoder.cc' if bld.env.TARGET_WINDOWS_64 or bld.env.TARGET_WINDOWS_32: obj.uselib += ' WINSOCK2 DBGHELP SHLWAPI MSWSOCK BOOST_LOCALE SETUPAPI OLE32 UUID' -- 2.30.2