Try using streams.

author: Carl Hetherington <cth@carlh.net> 2022-05-23 19:53:17 +0200
committer: Carl Hetherington <cth@carlh.net> 2022-05-23 19:53:17 +0200
commit: 692ef68b721c57fdd07de83adbfccaa93a903f30 (patch)
tree: 049371c9026c9d701c9bc22bbf6358ce5b4aaad5 /src/lib
parent: c4578e1197744272e78d5a19c92bc43e0e92e4cb (diff)
2 files changed, 16 insertions, 15 deletions
diff --git a/src/lib/cuda_j2k_frame_encoder.cc b/src/lib/cuda_j2k_frame_encoder.cc
index 7fba7b58e..6aa894b87 100644
--- a/src/lib/cuda_j2k_frame_encoder.cc
+++ b/src/lib/cuda_j2k_frame_encoder.cc
@@ -45,19 +45,17 @@ CUDAJ2KFrameEncoder::CUDAJ2KFrameEncoder()
 }
 
 
-CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf)
+CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf, cudaStream_t stream)
 	: _index(vf.index())
 	, _eyes(vf.eyes())
 {
-	auto xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2));
+	_xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2));
 
-	void* pixel_data_h[] = {
-		xyz->data(0),
-		xyz->data(1),
-		xyz->data(2)
-	};
+	for (int i = 0; i < 3; ++i) {
+		_pixel_data_h[i] = reinterpret_cast<uint8_t*>(_xyz->data(i));
+	}
 
-	auto const pitch = xyz->size().width * 2;
+	auto const pitch = _xyz->size().width * 2;
 
 	for (int i = 0; i < 3; ++i) {
 		_pitch_in_bytes[i] = pitch;
@@ -65,21 +63,22 @@ CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf)
 			reinterpret_cast<void**>(&_pixel_data_d[i]),
 			&_pitch_in_bytes[i],
 			pitch,
-			xyz->size().height
+			_xyz->size().height
 			);
 
 		if (status != cudaSuccess) {
 			throw CUDAError("cudaMallocPitch", status);
 		}
 
-		status = cudaMemcpy2D(
+		status = cudaMemcpy2DAsync(
 			_pixel_data_d[i],
 			_pitch_in_bytes[i],
-			pixel_data_h[i],
+			_pixel_data_h[i],
 			_pitch_in_bytes[i],
 			pitch,
-			xyz->size().height,
-			cudaMemcpyHostToDevice
+			_xyz->size().height,
+			cudaMemcpyHostToDevice,
+			stream
 			);
 
 		if (status != cudaSuccess) {
@@ -122,7 +121,7 @@ CUDAJ2KFrameEncoder::Input::~Input()
 optional<dcp::ArrayData>
 CUDAJ2KFrameEncoder::encode(DCPVideo const& vf)
 {
-	auto input = Input(vf);
+	auto input = Input(vf, _stream);
 
 	auto const size = vf.frame()->out_size();
 	DCPOMATIC_ASSERT(!_size || size == *_size);
diff --git a/src/lib/cuda_j2k_frame_encoder.h b/src/lib/cuda_j2k_frame_encoder.h
index b17ad69d9..5d1aeef07 100644
--- a/src/lib/cuda_j2k_frame_encoder.h
+++ b/src/lib/cuda_j2k_frame_encoder.h
@@ -52,7 +52,7 @@ private:
 	class Input
 	{
 	public:
-		Input(DCPVideo const& vf);
+		Input(DCPVideo const& vf, cudaStream_t stream);
 		Input(Input const& other) = delete;
 		Input(Input&& other);
 		~Input();
@@ -72,6 +72,8 @@ private:
 		}
 
 	private:
+		std::shared_ptr<dcp::OpenJPEGImage> _xyz;
+		uint8_t* _pixel_data_h[3];
 		uint8_t* _pixel_data_d[3];
 		size_t _pitch_in_bytes[3];
 		nvjpeg2kImage_t _device_image;
author	Carl Hetherington <cth@carlh.net>	2022-05-23 19:53:17 +0200
committer	Carl Hetherington <cth@carlh.net>	2022-05-23 19:53:17 +0200
commit	692ef68b721c57fdd07de83adbfccaa93a903f30 (patch)
tree	049371c9026c9d701c9bc22bbf6358ce5b4aaad5 /src/lib
parent	c4578e1197744272e78d5a19c92bc43e0e92e4cb (diff)