Try using streams.
author Carl Hetherington <cth@carlh.net>
Mon, 23 May 2022 17:53:17 +0000 (19:53 +0200)
committer Carl Hetherington <cth@carlh.net>
Mon, 23 May 2022 17:53:17 +0000 (19:53 +0200)
src/lib/cuda_j2k_frame_encoder.cc
src/lib/cuda_j2k_frame_encoder.h

index 7fba7b58ed0a285bc770cfa4570fd9921f366f52..6aa894b8776e5b32ff1d18d26bf9993dfeef0154 100644 (file)
@@ -45,19 +45,17 @@ CUDAJ2KFrameEncoder::CUDAJ2KFrameEncoder()
 }
 
 
-CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf)
+CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf, cudaStream_t stream)
        : _index(vf.index())
        , _eyes(vf.eyes())
 {
-       auto xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2));
+       _xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2));
 
-       void* pixel_data_h[] = {
-               xyz->data(0),
-               xyz->data(1),
-               xyz->data(2)
-       };
+       for (int i = 0; i < 3; ++i) {
+               _pixel_data_h[i] = reinterpret_cast<uint8_t*>(_xyz->data(i));
+       }
 
-       auto const pitch = xyz->size().width * 2;
+       auto const pitch = _xyz->size().width * 2;
 
        for (int i = 0; i < 3; ++i) {
                _pitch_in_bytes[i] = pitch;
@@ -65,21 +63,22 @@ CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf)
                        reinterpret_cast<void**>(&_pixel_data_d[i]),
                        &_pitch_in_bytes[i],
                        pitch,
-                       xyz->size().height
+                       _xyz->size().height
                        );
 
                if (status != cudaSuccess) {
                        throw CUDAError("cudaMallocPitch", status);
                }
 
-               status = cudaMemcpy2D(
+               status = cudaMemcpy2DAsync(
                        _pixel_data_d[i],
                        _pitch_in_bytes[i],
-                       pixel_data_h[i],
+                       _pixel_data_h[i],
                        _pitch_in_bytes[i],
                        pitch,
-                       xyz->size().height,
-                       cudaMemcpyHostToDevice
+                       _xyz->size().height,
+                       cudaMemcpyHostToDevice,
+                       stream
                        );
 
                if (status != cudaSuccess) {
@@ -122,7 +121,7 @@ CUDAJ2KFrameEncoder::Input::~Input()
 optional<dcp::ArrayData>
 CUDAJ2KFrameEncoder::encode(DCPVideo const& vf)
 {
-       auto input = Input(vf);
+       auto input = Input(vf, _stream);
 
        auto const size = vf.frame()->out_size();
        DCPOMATIC_ASSERT(!_size || size == *_size);
index b17ad69d9fb2ce9b6367311e30bf864b3c06838f..5d1aeef07766f47cfa6464596db03e2e71ce9b6b 100644 (file)
@@ -52,7 +52,7 @@ private:
        class Input
        {
        public:
-               Input(DCPVideo const& vf);
+               Input(DCPVideo const& vf, cudaStream_t stream);
                Input(Input const& other) = delete;
                Input(Input&& other);
                ~Input();
@@ -72,6 +72,8 @@ private:
                }
 
        private:
+               std::shared_ptr<dcp::OpenJPEGImage> _xyz;
+               uint8_t* _pixel_data_h[3];
                uint8_t* _pixel_data_d[3];
                size_t _pitch_in_bytes[3];
                nvjpeg2kImage_t _device_image;