fixup! Playback sort of works. nvidia
authorCarl Hetherington <cth@carlh.net>
Tue, 24 May 2022 20:33:06 +0000 (22:33 +0200)
committerCarl Hetherington <cth@carlh.net>
Tue, 24 May 2022 20:33:06 +0000 (22:33 +0200)
src/lib/cuda.cc [deleted file]
src/lib/cuda.h [deleted file]
src/lib/cuda_decoder.cc [new file with mode: 0644]
src/lib/cuda_decoder.h [new file with mode: 0644]
src/lib/j2k_image_proxy.cc
src/lib/wscript

diff --git a/src/lib/cuda.cc b/src/lib/cuda.cc
deleted file mode 100644 (file)
index d12dfe3..0000000
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
-    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
-
-    This file is part of DCP-o-matic.
-
-    DCP-o-matic is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    DCP-o-matic is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
-
-*/
-
-
-#include "cuda.h"
-#include "dcpomatic_assert.h"
-#include "exceptions.h"
-#include <dcp/openjpeg_image.h>
-#include <nvjpeg2k.h>
-
-
-using std::shared_ptr;
-
-
-CUDA* CUDA::_instance = nullptr;
-
-
-CUDA::CUDA()
-{
-       _decode_thread = std::thread(std::bind(&CUDA::decode_thread, this));
-}
-
-
-void
-CUDA::decode_thread()
-{
-       nvjpeg2kHandle_t handle;
-       auto status = nvjpeg2kCreateSimple(&handle);
-       if (status != NVJPEG2K_STATUS_SUCCESS) {
-               throw CUDAError("nvjpeg2kCreateSimple", status);
-       }
-
-       nvjpeg2kDecodeState_t decode_state;
-       status = nvjpeg2kDecodeStateCreate(handle, &decode_state);
-       if (status != NVJPEG2K_STATUS_SUCCESS) {
-               throw CUDAError("nvjpeg2kDecodeStateCreate", status);
-       }
-
-       nvjpeg2kStream_t jpeg2k_stream;
-       status = nvjpeg2kStreamCreate(&jpeg2k_stream);
-       if (status != NVJPEG2K_STATUS_SUCCESS) {
-               throw CUDAError("nvjpeg2kStreamCreate", status);
-       }
-
-       while (true) {
-               boost::mutex::scoped_lock lm(_decode_mutex);
-               while (_decode_queue.empty()) {
-                       _decode_queue_empty_condition.wait(lm);
-               }
-
-               auto input = std::move(_decode_queue.front());
-               _decode_queue.pop();
-               lm.unlock();
-
-               try {
-                       std::cout << "we got " << input.data->size() << " bytes.\n";
-                       auto status = nvjpeg2kStreamParse(handle, input.data->data(), input.data->size(), 0, 0, jpeg2k_stream);
-                       if (status != NVJPEG2K_STATUS_SUCCESS) {
-                               throw CUDAError("nvjpeg2kStreamParse", status);
-                       }
-
-                       nvjpeg2kImageInfo_t image_info;
-                       status = nvjpeg2kStreamGetImageInfo(jpeg2k_stream, &image_info);
-                       if (status != NVJPEG2K_STATUS_SUCCESS) {
-                               throw CUDAError("nvjpeg2kStreamGetImageInfo", status);
-                       }
-                       std::cout << image_info.num_components << " components.\n";
-
-                       nvjpeg2kImageComponentInfo_t image_component_info[3];
-                       for (int i = 0; i < 3; ++i) {
-                               status = nvjpeg2kStreamGetImageComponentInfo(jpeg2k_stream, &image_component_info[i], i);
-                               if (status != NVJPEG2K_STATUS_SUCCESS) {
-                                       throw CUDAError("nvjpeg2kStreamGetImageComponentInfo", status);
-                               }
-                       }
-
-                       auto const width = image_component_info[0].component_width;
-                       auto const height = image_component_info[0].component_height;
-                       std::cout << width << "x" << height << " " << ((int)image_component_info[0].precision) << "\n";
-
-                       uint16_t* decoded_d[3];
-                       size_t pitch[3];
-
-                       for (int i = 0; i < 3; ++i) {
-                               printf("cudaMallocPitch %d %d\n", width * 2, height);
-                               auto status = cudaMallocPitch(reinterpret_cast<void**>(&decoded_d[i]), &pitch[i], width * 2, height);
-                               if (status != cudaSuccess) {
-                                       throw CUDAError("cudaMallocPitch", status);
-                               }
-                       }
-
-                       nvjpeg2kImage_t output_image;
-                       output_image.pixel_data = reinterpret_cast<void**>(decoded_d);
-                       output_image.pixel_type = NVJPEG2K_UINT16;
-                       output_image.pitch_in_bytes = pitch;
-                       output_image.num_components = 3;
-
-                       status = nvjpeg2kDecode(handle, decode_state, jpeg2k_stream, &output_image, 0);
-                       std::cout << "decode said " << status << "\n";
-                       if (status != NVJPEG2K_STATUS_SUCCESS) {
-                               abort();
-                               throw CUDAError("nvjpeg2kDecode", status);
-                       }
-                       cudaDeviceSynchronize();
-
-                       std::vector<uint16_t> decoded_h[3];
-                       for (int i = 0; i < 3; ++i) {
-                               auto size = pitch[i] * height;
-                               decoded_h[i].resize(size / 2);
-                               auto status = cudaMemcpy(decoded_h[i].data(), decoded_d[i], size, cudaMemcpyDeviceToHost);
-                               if (status != cudaSuccess) {
-                                       throw CUDAError("cudaMemcpy", status);
-                               }
-                       }
-
-                       auto output = std::make_shared<Image>(input.pixel_format, dcp::Size(width, height), input.alignment);
-                       int p = 0;
-                       for (size_t y = 0; y < height; ++y) {
-                               auto q = reinterpret_cast<uint16_t *>(output->data()[0] + y * output->stride()[0]);
-                               for (size_t x = 0; x < width; ++x) {
-                                       *q++ = decoded_h[0][p] << 4;
-                                       *q++ = decoded_h[1][p] << 4;
-                                       *q++ = decoded_h[2][p] << 4;
-                                       ++p;
-                               }
-                       }
-
-                       for (int i = 0; i < 3; ++i) {
-                               cudaFree(decoded_d[i]);
-                       }
-
-                       lm.lock();
-                       _decode_output[input.id] = output;
-                       _decode_complete_condition.notify_all();
-                       lm.unlock();
-               } catch (CUDAError&) {
-                       lm.lock();
-                       _decode_output[input.id] = {};
-                       _decode_complete_condition.notify_all();
-                       lm.unlock();
-               }
-       }
-}
-
-
-shared_ptr<Image>
-CUDA::decode(shared_ptr<const dcp::Data> j2k_data, int reduce, AVPixelFormat pixel_format, Image::Alignment alignment)
-{
-       boost::mutex::scoped_lock lm(_decode_mutex);
-       auto id = _next_decode_id++;
-       _decode_queue.push({id, j2k_data, reduce, pixel_format, alignment});
-       _decode_queue_empty_condition.notify_all();
-
-       while (_decode_output.find(id) == _decode_output.end()) {
-               _decode_complete_condition.wait(lm);
-       }
-
-       auto iter = _decode_output.find(id);
-       if (iter ==_decode_output.end()) {
-               return {};
-       }
-
-       auto output = *iter;
-       _decode_output.erase(iter);
-       return output.second;
-}
-
-
-CUDA *
-CUDA::instance()
-{
-       if (!_instance) {
-               _instance = new CUDA();
-       }
-
-       return _instance;
-}
-
diff --git a/src/lib/cuda.h b/src/lib/cuda.h
deleted file mode 100644 (file)
index 8d89c7d..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
-    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
-
-    This file is part of DCP-o-matic.
-
-    DCP-o-matic is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    DCP-o-matic is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
-
-*/
-
-
-#include "image.h"
-#include <dcp/array_data.h>
-extern "C" {
-#include <libavutil/pixfmt.h>
-}
-#include <boost/thread/condition.hpp>
-#include <boost/thread/mutex.hpp>
-#include <memory>
-#include <queue>
-#include <thread>
-
-
-class CUDA
-{
-public:
-       CUDA();
-
-       CUDA(CUDA &) = delete;
-       CUDA(CUDA &&) = delete;
-
-       std::shared_ptr<Image> decode(std::shared_ptr<const dcp::Data> j2k_data, int reduce, AVPixelFormat pixel_format, Image::Alignment alignment);
-
-       static CUDA* instance();
-
-private:
-       void decode_thread();
-
-       typedef uint64_t ID;
-
-       class DecodeQueueItem
-       {
-       public:
-               ID id;
-               std::shared_ptr<const dcp::Data> data;
-               int reduce;
-               AVPixelFormat pixel_format;
-               Image::Alignment alignment;
-       };
-
-       std::queue<DecodeQueueItem> _decode_queue;
-       std::map<ID, std::shared_ptr<Image>> _decode_output;
-       boost::condition _decode_queue_empty_condition;
-       boost::condition _decode_complete_condition;
-       boost::mutex _decode_mutex;
-       std::thread _decode_thread;
-       ID _next_decode_id = 0;
-
-       static CUDA* _instance;
-};
-
diff --git a/src/lib/cuda_decoder.cc b/src/lib/cuda_decoder.cc
new file mode 100644 (file)
index 0000000..bc4c342
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This file is part of DCP-o-matic.
+
+    DCP-o-matic is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DCP-o-matic is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+#include "cuda_decoder.h"
+#include "dcpomatic_assert.h"
+#include "dcpomatic_log.h"
+#include "exceptions.h"
+#include "image.h"
+#include "scope_guard.h"
+#include <dcp/openjpeg_image.h>
+#include <nvjpeg2k.h>
+
+
+using std::shared_ptr;
+using std::string;
+
+
+CUDADecoder* CUDADecoder::_instance = nullptr;
+
+
+CUDADecoder::CUDADecoder()
+{
+       _thread = boost::thread(std::bind(&CUDADecoder::thread, this));
+}
+
+
+CUDADecoder::~CUDADecoder()
+{
+       try {
+               _thread.interrupt();
+               _thread.join();
+       } catch (...) {}
+
+       for (int i = 0; i < 3; ++i) {
+               cudaFree(_device[i]);
+       }
+}
+
+
+void
+CUDADecoder::check_jpeg2k(string name, nvjpeg2kStatus_t status)
+{
+       if (status != NVJPEG2K_STATUS_SUCCESS) {
+               throw CUDAError(name, status);
+       }
+}
+
+
+void
+CUDADecoder::thread()
+try
+{
+       nvjpeg2kHandle_t handle;
+       check_jpeg2k("nvjpeg2kCreateSimple", nvjpeg2kCreateSimple(&handle));
+       ScopeGuard handle_guard([handle]() {
+               nvjpeg2kDestroy(handle);
+       });
+
+       nvjpeg2kDecodeState_t state;
+       check_jpeg2k("nvjpeg2kDecodeStateCreate", nvjpeg2kDecodeStateCreate(handle, &state));
+       ScopeGuard state_guard([&state]() {
+               nvjpeg2kDecodeStateDestroy(state);
+       });
+
+       nvjpeg2kStream_t jpeg2k_stream;
+       check_jpeg2k("nvjpeg2kStreamCreate", nvjpeg2kStreamCreate(&jpeg2k_stream));
+       ScopeGuard jpeg2k_stream_guard([&jpeg2k_stream]() {
+               nvjpeg2kStreamDestroy(jpeg2k_stream);
+       });
+
+       while (true) {
+               boost::mutex::scoped_lock lm(_mutex);
+               while (_queue.empty()) {
+                       _queue_empty_condition.wait(lm);
+               }
+
+               auto input = std::move(_queue.front());
+               _queue.pop();
+               lm.unlock();
+
+               auto output = decode_one(input, handle, state, jpeg2k_stream);
+
+               lm.lock();
+               _output[input.id] = output;
+               _complete_condition.notify_all();
+       }
+}
+catch (CUDAError& e)
+{
+       LOG_ERROR("CUDA error: %1 (aborting CUDADecoder)", e.what());
+}
+catch (boost::thread_interrupted&)
+{
+
+}
+catch (std::exception& e)
+{
+       LOG_ERROR("Aborting CUDADecoder thread: %1", e.what());
+}
+catch (...)
+{
+       LOG_ERROR_NC("Aborting CUDADecoder thread: unknown error");
+}
+
+
+
+shared_ptr<Image>
+CUDADecoder::decode_one(QueueItem const& input, nvjpeg2kHandle_t handle, nvjpeg2kDecodeState_t state, nvjpeg2kStream_t jpeg2k_stream)
+{
+       try {
+               check_jpeg2k("nvjpeg2kStreamParse", nvjpeg2kStreamParse(handle, input.data->data(), input.data->size(), 0, 0, jpeg2k_stream));
+
+               nvjpeg2kImageInfo_t image_info;
+               check_jpeg2k("nvjpeg2kStreamGetImageInfo", nvjpeg2kStreamGetImageInfo(jpeg2k_stream, &image_info));
+
+               nvjpeg2kImageComponentInfo_t image_component_info[3];
+               for (int i = 0; i < 3; ++i) {
+                       check_jpeg2k("nvjpeg2kStreamGetImageComponentInfo", nvjpeg2kStreamGetImageComponentInfo(jpeg2k_stream, &image_component_info[i], i));
+               }
+
+               dcp::Size size(image_component_info[0].component_width, image_component_info[0].component_height);
+               if (size != _allocation) {
+                       for (int i = 0; i < 3; ++i) {
+                               cudaFree(_device[i]);
+                               _device[i] = nullptr;
+                               auto status = cudaMallocPitch(reinterpret_cast<void**>(&_device[i]), &_pitch[i], size.width * 2, size.height);
+                               if (status != cudaSuccess) {
+                                       throw CUDAError("cudaMallocPitch", status);
+                               }
+                               _host[i].resize(_pitch[i] * size.height / 2);
+                       }
+                       _allocation = size;
+               }
+
+               nvjpeg2kImage_t output_image;
+               output_image.pixel_data = reinterpret_cast<void**>(_device);
+               output_image.pixel_type = NVJPEG2K_UINT16;
+               output_image.pitch_in_bytes = _pitch;
+               output_image.num_components = 3;
+
+               check_jpeg2k("nvjpeg2kDecode", nvjpeg2kDecode(handle, state, jpeg2k_stream, &output_image, 0));
+               cudaDeviceSynchronize();
+
+               for (int i = 0; i < 3; ++i) {
+                       auto status = cudaMemcpy(_host[i].data(), _device[i], _pitch[i] * size.height, cudaMemcpyDeviceToHost);
+                       if (status != cudaSuccess) {
+                               throw CUDAError("cudaMemcpy", status);
+                       }
+               }
+
+               auto output = std::make_shared<Image>(input.pixel_format, size, input.alignment);
+               for (int y = 0; y < size.height; ++y) {
+                       int p = y * _pitch[0] / 2;
+                       auto q = reinterpret_cast<uint16_t *>(output->data()[0] + y * output->stride()[0]);
+                       for (int x = 0; x < size.width; ++x) {
+                               *q++ = _host[0][p] << 4;
+                               *q++ = _host[1][p] << 4;
+                               *q++ = _host[2][p] << 4;
+                               ++p;
+                       }
+               }
+
+               return output;
+
+       } catch (CUDAError& e) {
+               LOG_ERROR("CUDA error: %1", e.what());
+               return {};
+       }
+}
+
+
+shared_ptr<Image>
+CUDADecoder::decode(shared_ptr<const dcp::Data> j2k_data, int reduce, AVPixelFormat pixel_format, Image::Alignment alignment)
+{
+       boost::mutex::scoped_lock lm(_mutex);
+       auto id = _next_id++;
+       _queue.push({id, j2k_data, reduce, pixel_format, alignment});
+       _queue_empty_condition.notify_all();
+
+       while (_output.find(id) == _output.end()) {
+               _complete_condition.wait(lm);
+       }
+
+       auto iter = _output.find(id);
+       if (iter == _output.end()) {
+               return {};
+       }
+
+       auto output = *iter;
+       _output.erase(iter);
+       return output.second;
+}
+
+
+CUDADecoder *
+CUDADecoder::instance()
+{
+       if (!_instance) {
+               _instance = new CUDADecoder();
+       }
+
+       return _instance;
+}
+
diff --git a/src/lib/cuda_decoder.h b/src/lib/cuda_decoder.h
new file mode 100644 (file)
index 0000000..6398386
--- /dev/null
@@ -0,0 +1,79 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This file is part of DCP-o-matic.
+
+    DCP-o-matic is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DCP-o-matic is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+#include "image.h"
+#include <dcp/array_data.h>
+extern "C" {
+#include <libavutil/pixfmt.h>
+}
+#include <nvjpeg2k.h>
+#include <boost/thread.hpp>
+#include <boost/thread/condition.hpp>
+#include <boost/thread/mutex.hpp>
+#include <memory>
+#include <queue>
+
+
+class CUDADecoder
+{
+public:
+       CUDADecoder();
+       ~CUDADecoder();
+
+       CUDADecoder(CUDADecoder &) = delete;
+       CUDADecoder(CUDADecoder &&) = delete;
+
+       std::shared_ptr<Image> decode(std::shared_ptr<const dcp::Data> j2k_data, int reduce, AVPixelFormat pixel_format, Image::Alignment alignment);
+
+       static CUDADecoder* instance();
+
+private:
+       typedef uint64_t ID;
+
+       class QueueItem
+       {
+       public:
+               ID id;
+               std::shared_ptr<const dcp::Data> data;
+               int reduce;
+               AVPixelFormat pixel_format;
+               Image::Alignment alignment;
+       };
+
+       void thread();
+       std::shared_ptr<Image> decode_one(QueueItem const& input, nvjpeg2kHandle_t handle, nvjpeg2kDecodeState_t state, nvjpeg2kStream_t jpeg2k_stream);
+       void check_jpeg2k(std::string name, nvjpeg2kStatus_t status);
+
+       std::queue<QueueItem> _queue;
+       std::map<ID, std::shared_ptr<Image>> _output;
+       boost::condition _queue_empty_condition;
+       boost::condition _complete_condition;
+       boost::mutex _mutex;
+       boost::thread _thread;
+       ID _next_id = 0;
+       dcp::Size _allocation;
+       uint16_t* _device[3] = { nullptr, nullptr, nullptr };
+       size_t _pitch[3];
+       std::vector<uint16_t> _host[3];
+
+       static CUDADecoder* _instance;
+};
+
index 26b2064d7795cbaab3306393f870d23213c9b084..f6b3b0373707deb0f553cca8f94e0f2e2bef4bac 100644 (file)
@@ -20,7 +20,7 @@
 
 
 #ifdef HAVE_NVJPEG2K_H
-#include "cuda.h"
+#include "cuda_decoder.h"
 #endif
 #include "dcpomatic_assert.h"
 #include "dcpomatic_socket.h"
@@ -171,7 +171,7 @@ J2KImageProxy::prepare (Image::Alignment alignment, optional<dcp::Size> target_s
                        }
                }
 #endif
-               _image = CUDA::instance()->decode(_data, 0, _pixel_format, alignment);
+               _image = CUDADecoder::instance()->decode(_data, 0, _pixel_format, alignment);
        } catch (dcp::J2KDecompressionError& e) {
                _image = make_shared<Image>(_pixel_format, _size, alignment);
                _image->make_black ();
index 9dce415bb1ad0453372230843355324f02ec9e18..c673a6f0d541b8384d8a0792dd2501ba2fc5f200 100644 (file)
@@ -229,7 +229,7 @@ def build(bld):
             obj.uselib += ' POLKIT'
 
     if bld.env.HAVE_NVJPEG2K_H:
-        obj.source += ' cuda.cc cuda_j2k_frame_encoder.cc'
+        obj.source += ' cuda_decoder.cc cuda_j2k_frame_encoder.cc'
 
     if bld.env.TARGET_WINDOWS_64 or bld.env.TARGET_WINDOWS_32:
         obj.uselib += ' WINSOCK2 DBGHELP SHLWAPI MSWSOCK BOOST_LOCALE SETUPAPI OLE32 UUID'