Playback sort of works.

author: Carl Hetherington <cth@carlh.net> 2022-05-24 20:50:19 +0200
committer: Carl Hetherington <cth@carlh.net> 2022-05-24 20:50:19 +0200
commit: d39f2ce23a8d12c11b0ccafc4568761802101ba0 (patch)
tree: 2e081c8a33f9f0c037dbdfb6290f2505f97623a6
parent: 9d967309911e96d2ab752692fee7eb64c2b3a43a (diff)
5 files changed, 276 insertions, 3 deletions
diff --git a/src/lib/cuda.cc b/src/lib/cuda.cc
new file mode 100644
index 000000000..d12dfe34a
--- /dev/null
+++ b/src/lib/cuda.cc
@@ -0,0 +1,195 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This file is part of DCP-o-matic.
+
+    DCP-o-matic is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DCP-o-matic is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+#include "cuda.h"
+#include "dcpomatic_assert.h"
+#include "exceptions.h"
+#include <dcp/openjpeg_image.h>
+#include <nvjpeg2k.h>
+
+
+using std::shared_ptr;
+
+
+CUDA* CUDA::_instance = nullptr;
+
+
+CUDA::CUDA()
+{
+	_decode_thread = std::thread(std::bind(&CUDA::decode_thread, this));
+}
+
+
+void
+CUDA::decode_thread()
+{
+	nvjpeg2kHandle_t handle;
+	auto status = nvjpeg2kCreateSimple(&handle);
+	if (status != NVJPEG2K_STATUS_SUCCESS) {
+		throw CUDAError("nvjpeg2kCreateSimple", status);
+	}
+
+	nvjpeg2kDecodeState_t decode_state;
+	status = nvjpeg2kDecodeStateCreate(handle, &decode_state);
+	if (status != NVJPEG2K_STATUS_SUCCESS) {
+		throw CUDAError("nvjpeg2kDecodeStateCreate", status);
+	}
+
+	nvjpeg2kStream_t jpeg2k_stream;
+	status = nvjpeg2kStreamCreate(&jpeg2k_stream);
+	if (status != NVJPEG2K_STATUS_SUCCESS) {
+		throw CUDAError("nvjpeg2kStreamCreate", status);
+	}
+
+	while (true) {
+		boost::mutex::scoped_lock lm(_decode_mutex);
+		while (_decode_queue.empty()) {
+			_decode_queue_empty_condition.wait(lm);
+		}
+
+		auto input = std::move(_decode_queue.front());
+		_decode_queue.pop();
+		lm.unlock();
+
+		try {
+			std::cout << "we got " << input.data->size() << " bytes.\n";
+			auto status = nvjpeg2kStreamParse(handle, input.data->data(), input.data->size(), 0, 0, jpeg2k_stream);
+			if (status != NVJPEG2K_STATUS_SUCCESS) {
+				throw CUDAError("nvjpeg2kStreamParse", status);
+			}
+
+			nvjpeg2kImageInfo_t image_info;
+			status = nvjpeg2kStreamGetImageInfo(jpeg2k_stream, &image_info);
+			if (status != NVJPEG2K_STATUS_SUCCESS) {
+				throw CUDAError("nvjpeg2kStreamGetImageInfo", status);
+			}
+			std::cout << image_info.num_components << " components.\n";
+
+			nvjpeg2kImageComponentInfo_t image_component_info[3];
+			for (int i = 0; i < 3; ++i) {
+				status = nvjpeg2kStreamGetImageComponentInfo(jpeg2k_stream, &image_component_info[i], i);
+				if (status != NVJPEG2K_STATUS_SUCCESS) {
+					throw CUDAError("nvjpeg2kStreamGetImageComponentInfo", status);
+				}
+			}
+
+			auto const width = image_component_info[0].component_width;
+			auto const height = image_component_info[0].component_height;
+			std::cout << width << "x" << height << " " << ((int)image_component_info[0].precision) << "\n";
+
+			uint16_t* decoded_d[3];
+			size_t pitch[3];
+
+			for (int i = 0; i < 3; ++i) {
+				printf("cudaMallocPitch %d %d\n", width * 2, height);
+				auto status = cudaMallocPitch(reinterpret_cast<void**>(&decoded_d[i]), &pitch[i], width * 2, height);
+				if (status != cudaSuccess) {
+					throw CUDAError("cudaMallocPitch", status);
+				}
+			}
+
+			nvjpeg2kImage_t output_image;
+			output_image.pixel_data = reinterpret_cast<void**>(decoded_d);
+			output_image.pixel_type = NVJPEG2K_UINT16;
+			output_image.pitch_in_bytes = pitch;
+			output_image.num_components = 3;
+
+			status = nvjpeg2kDecode(handle, decode_state, jpeg2k_stream, &output_image, 0);
+			std::cout << "decode said " << status << "\n";
+			if (status != NVJPEG2K_STATUS_SUCCESS) {
+				abort();
+				throw CUDAError("nvjpeg2kDecode", status);
+			}
+			cudaDeviceSynchronize();
+
+			std::vector<uint16_t> decoded_h[3];
+			for (int i = 0; i < 3; ++i) {
+				auto size = pitch[i] * height;
+				decoded_h[i].resize(size / 2);
+				auto status = cudaMemcpy(decoded_h[i].data(), decoded_d[i], size, cudaMemcpyDeviceToHost);
+				if (status != cudaSuccess) {
+					throw CUDAError("cudaMemcpy", status);
+				}
+			}
+
+			auto output = std::make_shared<Image>(input.pixel_format, dcp::Size(width, height), input.alignment);
+			int p = 0;
+			for (size_t y = 0; y < height; ++y) {
+				auto q = reinterpret_cast<uint16_t *>(output->data()[0] + y * output->stride()[0]);
+				for (size_t x = 0; x < width; ++x) {
+					*q++ = decoded_h[0][p] << 4;
+					*q++ = decoded_h[1][p] << 4;
+					*q++ = decoded_h[2][p] << 4;
+					++p;
+				}
+			}
+
+			for (int i = 0; i < 3; ++i) {
+				cudaFree(decoded_d[i]);
+			}
+
+			lm.lock();
+			_decode_output[input.id] = output;
+			_decode_complete_condition.notify_all();
+			lm.unlock();
+		} catch (CUDAError&) {
+			lm.lock();
+			_decode_output[input.id] = {};
+			_decode_complete_condition.notify_all();
+			lm.unlock();
+		}
+	}
+}
+
+
+shared_ptr<Image>
+CUDA::decode(shared_ptr<const dcp::Data> j2k_data, int reduce, AVPixelFormat pixel_format, Image::Alignment alignment)
+{
+	boost::mutex::scoped_lock lm(_decode_mutex);
+	auto id = _next_decode_id++;
+	_decode_queue.push({id, j2k_data, reduce, pixel_format, alignment});
+	_decode_queue_empty_condition.notify_all();
+
+	while (_decode_output.find(id) == _decode_output.end()) {
+		_decode_complete_condition.wait(lm);
+	}
+
+	auto iter = _decode_output.find(id);
+	if (iter ==_decode_output.end()) {
+		return {};
+	}
+
+	auto output = *iter;
+	_decode_output.erase(iter);
+	return output.second;
+}
+
+
+CUDA *
+CUDA::instance()
+{
+	if (!_instance) {
+		_instance = new CUDA();
+	}
+
+	return _instance;
+}
+
diff --git a/src/lib/cuda.h b/src/lib/cuda.h
new file mode 100644
index 000000000..8d89c7df7
--- /dev/null
+++ b/src/lib/cuda.h
@@ -0,0 +1,71 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This file is part of DCP-o-matic.
+
+    DCP-o-matic is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DCP-o-matic is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+#include "image.h"
+#include <dcp/array_data.h>
+extern "C" {
+#include <libavutil/pixfmt.h>
+}
+#include <boost/thread/condition.hpp>
+#include <boost/thread/mutex.hpp>
+#include <memory>
+#include <queue>
+#include <thread>
+
+
+class CUDA
+{
+public:
+	CUDA();
+
+	CUDA(CUDA &) = delete;
+	CUDA(CUDA &&) = delete;
+
+	std::shared_ptr<Image> decode(std::shared_ptr<const dcp::Data> j2k_data, int reduce, AVPixelFormat pixel_format, Image::Alignment alignment);
+
+	static CUDA* instance();
+
+private:
+	void decode_thread();
+
+	typedef uint64_t ID;
+
+	class DecodeQueueItem
+	{
+	public:
+		ID id;
+		std::shared_ptr<const dcp::Data> data;
+		int reduce;
+		AVPixelFormat pixel_format;
+		Image::Alignment alignment;
+	};
+
+	std::queue<DecodeQueueItem> _decode_queue;
+	std::map<ID, std::shared_ptr<Image>> _decode_output;
+	boost::condition _decode_queue_empty_condition;
+	boost::condition _decode_complete_condition;
+	boost::mutex _decode_mutex;
+	std::thread _decode_thread;
+	ID _next_decode_id = 0;
+
+	static CUDA* _instance;
+};
+
diff --git a/src/lib/exceptions.cc b/src/lib/exceptions.cc
index 4a43ac32b..2c9500577 100644
--- a/src/lib/exceptions.cc
+++ b/src/lib/exceptions.cc
@@ -183,5 +183,6 @@ CUDAError::CUDAError (string function, cudaError error)
 CUDAError::CUDAError (string function, int error)
 	: runtime_error(String::compose("CUDA error on %1 (%2)", function, error))
 {
+	std::cout << "CUDA error " << function << " " << error << "\n";
 }
 
diff --git a/src/lib/j2k_image_proxy.cc b/src/lib/j2k_image_proxy.cc
index 269b01bca..26b2064d7 100644
--- a/src/lib/j2k_image_proxy.cc
+++ b/src/lib/j2k_image_proxy.cc
@@ -19,6 +19,9 @@
 */
 
 
+#ifdef HAVE_NVJPEG2K_H
+#include "cuda.h"
+#endif
 #include "dcpomatic_assert.h"
 #include "dcpomatic_socket.h"
 #include "image.h"
@@ -141,6 +144,7 @@ J2KImageProxy::prepare (Image::Alignment alignment, optional<dcp::Size> target_s
 	}
 
 	try {
+#if 0
 		/* XXX: should check that potentially trashing _data here doesn't matter */
 		auto decompressed = dcp::decompress_j2k (const_cast<uint8_t*>(_data->data()), _data->size(), reduce);
 		_image = make_shared<Image>(_pixel_format, decompressed->size(), alignment);
@@ -166,6 +170,8 @@ J2KImageProxy::prepare (Image::Alignment alignment, optional<dcp::Size> target_s
 				++p;
 			}
 		}
+#endif
+		_image = CUDA::instance()->decode(_data, 0, _pixel_format, alignment);
 	} catch (dcp::J2KDecompressionError& e) {
 		_image = make_shared<Image>(_pixel_format, _size, alignment);
 		_image->make_black ();
diff --git a/src/lib/wscript b/src/lib/wscript
index 377bf1409..9dce415bb 100644
--- a/src/lib/wscript
+++ b/src/lib/wscript
@@ -45,8 +45,6 @@ sources = """
           audio_ring_buffers.cc
           audio_stream.cc
           butler.cc
-          text_content.cc
-          text_decoder.cc
           case_insensitive_sorter.cc
           check_content_change_job.cc
           cinema.cc
@@ -174,6 +172,8 @@ sources = """
           string_text_file_decoder.cc
           subtitle_analysis.cc
           subtitle_encoder.cc
+          text_content.cc
+          text_decoder.cc
           text_ring_buffers.cc
           timer.cc
           transcode_job.cc
@@ -229,7 +229,7 @@ def build(bld):
             obj.uselib += ' POLKIT'
 
     if bld.env.HAVE_NVJPEG2K_H:
-        obj.source += ' cuda_j2k_frame_encoder.cc'
+        obj.source += ' cuda.cc cuda_j2k_frame_encoder.cc'
 
     if bld.env.TARGET_WINDOWS_64 or bld.env.TARGET_WINDOWS_32:
         obj.uselib += ' WINSOCK2 DBGHELP SHLWAPI MSWSOCK BOOST_LOCALE SETUPAPI OLE32 UUID'
author	Carl Hetherington <cth@carlh.net>	2022-05-24 20:50:19 +0200
committer	Carl Hetherington <cth@carlh.net>	2022-05-24 20:50:19 +0200
commit	d39f2ce23a8d12c11b0ccafc4568761802101ba0 (patch)
tree	2e081c8a33f9f0c037dbdfb6290f2505f97623a6
parent	9d967309911e96d2ab752692fee7eb64c2b3a43a (diff)