fixup! wip: Add CUDA J2K frame encoder using libjpeg2k.
authorCarl Hetherington <cth@carlh.net>
Sun, 22 May 2022 22:41:35 +0000 (00:41 +0200)
committerCarl Hetherington <cth@carlh.net>
Mon, 23 May 2022 14:55:28 +0000 (16:55 +0200)
src/lib/cuda_j2k_frame_encoder.cc
src/lib/cuda_j2k_frame_encoder.h

index b382e8d3fda9069bedc7b21e6c6616acc2404917..675b26f57f2421e4ad0823a19b339c000cc031a7 100644 (file)
 #include "player_video.h"
 #include <dcp/openjpeg_image.h>
 #include <nvjpeg2k.h>
+#include <vector>
 
 
+using std::vector;
 using boost::optional;
 
 
@@ -37,116 +39,131 @@ CUDAJ2KFrameEncoder::CUDAJ2KFrameEncoder()
        nvjpeg2kEncoderCreateSimple(&_encoder_handle);
        nvjpeg2kEncodeStateCreate(_encoder_handle, &_encoder_state);
        nvjpeg2kEncodeParamsCreate(&_encoder_params);
-
-       _pixel_data_d[0] = _pixel_data_d[1] = _pixel_data_d[2] = nullptr;
 }
 
 
-void
-CUDAJ2KFrameEncoder::allocate(dcp::Size size)
+CUDAJ2KFrameEncoder::Frame::Frame(DCPVideo const& vf)
 {
-       for (int i = 0; i < 3; ++i) {
-               _image_component_info[i].component_width = size.width;
-               _image_component_info[i].component_height = size.height;
-               _image_component_info[i].precision = 12;
-               _image_component_info[i].sgn = 0;
-               _pitch_in_bytes[i] = size.width * 2;
-       }
+       auto xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2));
+
+       void* pixel_data_h[] = {
+               xyz->data(0),
+               xyz->data(1),
+               xyz->data(2)
+       };
+
+       auto const pitch = xyz->size().width * 2;
 
        for (int i = 0; i < 3; ++i) {
-               auto error = cudaMallocPitch(
+               _pitch_in_bytes[i] = pitch;
+               auto status = cudaMallocPitch(
                        reinterpret_cast<void**>(&_pixel_data_d[i]),
                        &_pitch_in_bytes[i],
-                       _image_component_info[i].component_width * 2,
-                       _image_component_info[i].component_height
+                       pitch,
+                       xyz->size().height
                        );
-               if (error != cudaSuccess) {
-                       throw CUDAError("cudaMallocPitch", error);
+
+               if (status != cudaSuccess) {
+                       throw CUDAError("cudaMallocPitch", status);
                }
-       }
 
-       _allocated_size = size;
-}
+               status = cudaMemcpy2D(
+                       _pixel_data_d[i],
+                       _pitch_in_bytes[i],
+                       pixel_data_h[i],
+                       _pitch_in_bytes[i],
+                       pitch,
+                       xyz->size().height,
+                       cudaMemcpyHostToDevice
+                       );
 
+               if (status != cudaSuccess) {
+                       throw CUDAError("cudaMemcpy2D", status);
+               }
 
-CUDAJ2KFrameEncoder::~CUDAJ2KFrameEncoder()
-{
-       free();
+               cudaDeviceSynchronize();
+       }
+
+       _device_image.num_components = 3;
+       _device_image.pixel_data = reinterpret_cast<void**>(_pixel_data_d);
+       _device_image.pixel_type = NVJPEG2K_UINT16;
+       _device_image.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
 }
 
 
-void
-CUDAJ2KFrameEncoder::free()
+CUDAJ2KFrameEncoder::Frame::Frame(Frame&& other)
 {
        for (int i = 0; i < 3; ++i) {
-               cudaFree(&_pixel_data_d[i]);
+               _pixel_data_d[i] = other._pixel_data_d[i];
+               other._pixel_data_d[i] = nullptr;
+               _pitch_in_bytes[i] = other._pitch_in_bytes[i];
        }
 
-       _pixel_data_d[0] = _pixel_data_d[1] = _pixel_data_d[2] = nullptr;
+       _device_image.num_components = other._device_image.num_components;
+       _device_image.pixel_data = reinterpret_cast<void**>(_pixel_data_d);
+       _device_image.pixel_type = NVJPEG2K_UINT16;
+       _device_image.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
 }
 
 
-optional<dcp::ArrayData>
-CUDAJ2KFrameEncoder::encode(DCPVideo const& vf)
+CUDAJ2KFrameEncoder::Frame::~Frame()
 {
-       std::cout << "cuda encode starts.\n";
+       cudaFree(_pixel_data_d[0]);
+       cudaFree(_pixel_data_d[1]);
+       cudaFree(_pixel_data_d[2]);
+}
 
-       auto frame = vf.frame();
-       auto size = frame->out_size();
 
-       if (size != _allocated_size) {
-               free();
-               allocate(size);
-       }
+vector<dcp::ArrayData>
+CUDAJ2KFrameEncoder::encode(DCPVideo const& vf)
+{
+       int constexpr BATCH_SIZE = 128;
 
-       auto xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2));
+       auto const size = vf.frame()->out_size();
+       DCPOMATIC_ASSERT(!_size || size == *_size);
+       _size = size;
 
-       for (int i = 0; i < 3; ++i) {
-               _pixel_data_h[i] = reinterpret_cast<uint8_t*>(xyz->data(i));
+       DCPOMATIC_ASSERT(!_resolution || vf.resolution() == *_resolution);
+       _resolution = vf.resolution();
+
+       if (_batch.size() < BATCH_SIZE) {
+               _batch.push_back(Frame(vf));
+               return {};
        }
 
-       nvjpeg2kImage_t image_h;
-       image_h.num_components = 3;
-       image_h.pixel_data = reinterpret_cast<void**>(_pixel_data_h);
-       image_h.pixel_type = NVJPEG2K_UINT16;
-       image_h.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
+       return flush();
+}
+
 
-       nvjpeg2kImage_t image_d;
-       image_d.num_components = 3;
-       image_d.pixel_data = reinterpret_cast<void**>(_pixel_data_d);
-       image_d.pixel_type = NVJPEG2K_UINT16;
-       image_d.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
+vector<dcp::ArrayData>
+CUDAJ2KFrameEncoder::flush()
+{
+       if (_batch.empty()) {
+               return {};
+       }
 
+       nvjpeg2kImageComponentInfo_t info[3];
        for (int i = 0; i < 3; ++i) {
-               auto status = cudaMemcpy2D(
-                       image_d.pixel_data[i],
-                       image_d.pitch_in_bytes[i],
-                       image_h.pixel_data[i],
-                       image_h.pitch_in_bytes[i],
-                       2 * _image_component_info[i].component_width,
-                       _image_component_info[i].component_height,
-                       cudaMemcpyHostToDevice
-                       );
-               if (status != cudaSuccess) {
-                       throw CUDAError("cudaMemcpy2D", status);
-               }
-               cudaDeviceSynchronize();
+               info[i].component_width = _size->width;
+               info[i].component_height = _size->height;
+               info[i].precision = 12;
+               info[i].sgn = 0;
        }
 
        nvjpeg2kEncodeConfig_t config;
        memset(&config, 0, sizeof(config));
        config.stream_type = NVJPEG2K_STREAM_J2K;
        config.color_space = NVJPEG2K_COLORSPACE_SRGB;
-       config.image_width = size.width;
-       config.image_height = size.height;
+       config.image_width = _size->width;
+       config.image_height = _size->height;
        config.num_components = 3;
-       config.image_comp_info = reinterpret_cast<nvjpeg2kImageComponentInfo_t*>(&_image_component_info);
+       config.image_comp_info = reinterpret_cast<nvjpeg2kImageComponentInfo_t*>(&info);
        config.code_block_w = 32;
        config.code_block_h = 32;
        config.irreversible = 0;
        config.mct_mode = 1;
        config.prog_order = NVJPEG2K_CPRL;
-       config.num_resolutions = vf.resolution() == Resolution::FOUR_K ? 7 : 6;
+       config.num_resolutions = *_resolution == Resolution::FOUR_K ? 7 : 6;
 
        auto status = nvjpeg2kEncodeParamsSetEncodeConfig(_encoder_params, &config);
        if (status != NVJPEG2K_STATUS_SUCCESS) {
@@ -159,23 +176,31 @@ CUDAJ2KFrameEncoder::encode(DCPVideo const& vf)
                throw CUDAError("nvjpeg2kEncodeParamsSetQuality", status);
        }
 
-       status = nvjpeg2kEncode(_encoder_handle, _encoder_state, _encoder_params, &image_d, 0);
-       if (status != NVJPEG2K_STATUS_SUCCESS) {
-               throw CUDAError("nvjpeg2kEncode", status);
-       }
+       vector<dcp::ArrayData> output;
 
-       size_t compressed_size;
-       status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, nullptr, &compressed_size, 0);
+       for (auto const& frame: _batch) {
 
-       dcp::ArrayData output(compressed_size);
-       status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, output.data(), &compressed_size, 0);
-       cudaDeviceSynchronize();
+               auto x = frame.device_image();
 
-       if (status != NVJPEG2K_STATUS_SUCCESS) {
-               throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status);
+               status = nvjpeg2kEncode(_encoder_handle, _encoder_state, _encoder_params, frame.device_image(), 0);
+               if (status != NVJPEG2K_STATUS_SUCCESS) {
+                       throw CUDAError("nvjpeg2kEncode", status);
+               }
+
+               size_t compressed_size;
+               status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, nullptr, &compressed_size, 0);
+
+               dcp::ArrayData this_output(compressed_size);
+               status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, this_output.data(), &compressed_size, 0);
+               if (status != NVJPEG2K_STATUS_SUCCESS) {
+                       throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status);
+               }
+
+               output.push_back(this_output);
+               cudaStreamSynchronize(0);
        }
 
-       std::cout << "cuda encode ends with " << status << " " << output.size() << "\n";
+       _batch.clear();
 
        return output;
 }
index 46ac646073ae48602d255659bb6033c328177a29..28e8a3c764bfbc225778818c0a8a5a1bdf264b5f 100644 (file)
@@ -24,6 +24,7 @@
 
 
 #include "j2k_frame_encoder.h"
+#include "types.h"
 #include <dcp/types.h>
 #include <nvjpeg2k.h>
 #include <vector>
@@ -33,24 +34,39 @@ class CUDAJ2KFrameEncoder : public J2KFrameEncoder
 {
 public:
        CUDAJ2KFrameEncoder();
-       ~CUDAJ2KFrameEncoder();
 
-       boost::optional<dcp::ArrayData> encode(DCPVideo const &) override;
+       std::vector<dcp::ArrayData> encode(DCPVideo const &) override;
+       std::vector<dcp::ArrayData> flush() override;
        void log_thread_start() override;
 
 private:
-       void allocate(dcp::Size size);
-       void free();
-
        nvjpeg2kEncoder_t _encoder_handle;
        nvjpeg2kEncodeState_t _encoder_state;
        nvjpeg2kEncodeParams_t _encoder_params;
-       uint8_t* _pixel_data_h[3];
-       uint8_t* _pixel_data_d[3];
-       size_t _pitch_in_bytes[3];
-       nvjpeg2kImageComponentInfo_t _image_component_info[3];
 
-       dcp::Size _allocated_size;
+       class Frame
+       {
+       public:
+               Frame(DCPVideo const& vf);
+               Frame(Frame const& other) = delete;
+               Frame(Frame&& other);
+               ~Frame();
+
+               Frame& operator=(Frame const& other) = delete;
+
+               nvjpeg2kImage_t const* const device_image() const {
+                       return &_device_image;
+               }
+
+       private:
+               uint8_t* _pixel_data_d[3];
+               size_t _pitch_in_bytes[3];
+               nvjpeg2kImage_t _device_image;
+       };
+
+       std::vector<Frame> _batch;
+       boost::optional<dcp::Size> _size;
+       boost::optional<Resolution> _resolution;
 };