wip: Add CUDA J2K frame encoder using libjpeg2k.
authorCarl Hetherington <cth@carlh.net>
Fri, 20 May 2022 14:55:09 +0000 (16:55 +0200)
committerCarl Hetherington <cth@carlh.net>
Mon, 23 May 2022 14:55:28 +0000 (16:55 +0200)
src/lib/cuda_j2k_frame_encoder.cc [new file with mode: 0644]
src/lib/cuda_j2k_frame_encoder.h [new file with mode: 0644]
src/lib/exceptions.cc
src/lib/exceptions.h
src/lib/j2k_encoder.cc
src/lib/wscript
src/tools/wscript
wscript

diff --git a/src/lib/cuda_j2k_frame_encoder.cc b/src/lib/cuda_j2k_frame_encoder.cc
new file mode 100644 (file)
index 0000000..b382e8d
--- /dev/null
@@ -0,0 +1,188 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This file is part of DCP-o-matic.
+
+    DCP-o-matic is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DCP-o-matic is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+#include "cross.h"
+#include "cuda_j2k_frame_encoder.h"
+#include "dcpomatic_log.h"
+#include "dcp_video.h"
+#include "exceptions.h"
+#include "player_video.h"
+#include <dcp/openjpeg_image.h>
+#include <nvjpeg2k.h>
+
+
+using boost::optional;
+
+
+CUDAJ2KFrameEncoder::CUDAJ2KFrameEncoder()
+{
+       nvjpeg2kEncoderCreateSimple(&_encoder_handle);
+       nvjpeg2kEncodeStateCreate(_encoder_handle, &_encoder_state);
+       nvjpeg2kEncodeParamsCreate(&_encoder_params);
+
+       _pixel_data_d[0] = _pixel_data_d[1] = _pixel_data_d[2] = nullptr;
+}
+
+
+void
+CUDAJ2KFrameEncoder::allocate(dcp::Size size)
+{
+       for (int i = 0; i < 3; ++i) {
+               _image_component_info[i].component_width = size.width;
+               _image_component_info[i].component_height = size.height;
+               _image_component_info[i].precision = 12;
+               _image_component_info[i].sgn = 0;
+               _pitch_in_bytes[i] = size.width * 2;
+       }
+
+       for (int i = 0; i < 3; ++i) {
+               auto error = cudaMallocPitch(
+                       reinterpret_cast<void**>(&_pixel_data_d[i]),
+                       &_pitch_in_bytes[i],
+                       _image_component_info[i].component_width * 2,
+                       _image_component_info[i].component_height
+                       );
+               if (error != cudaSuccess) {
+                       throw CUDAError("cudaMallocPitch", error);
+               }
+       }
+
+       _allocated_size = size;
+}
+
+
+CUDAJ2KFrameEncoder::~CUDAJ2KFrameEncoder()
+{
+       free();
+}
+
+
+void
+CUDAJ2KFrameEncoder::free()
+{
+       for (int i = 0; i < 3; ++i) {
+               cudaFree(&_pixel_data_d[i]);
+       }
+
+       _pixel_data_d[0] = _pixel_data_d[1] = _pixel_data_d[2] = nullptr;
+}
+
+
+optional<dcp::ArrayData>
+CUDAJ2KFrameEncoder::encode(DCPVideo const& vf)
+{
+       std::cout << "cuda encode starts.\n";
+
+       auto frame = vf.frame();
+       auto size = frame->out_size();
+
+       if (size != _allocated_size) {
+               free();
+               allocate(size);
+       }
+
+       auto xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2));
+
+       for (int i = 0; i < 3; ++i) {
+               _pixel_data_h[i] = reinterpret_cast<uint8_t*>(xyz->data(i));
+       }
+
+       nvjpeg2kImage_t image_h;
+       image_h.num_components = 3;
+       image_h.pixel_data = reinterpret_cast<void**>(_pixel_data_h);
+       image_h.pixel_type = NVJPEG2K_UINT16;
+       image_h.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
+
+       nvjpeg2kImage_t image_d;
+       image_d.num_components = 3;
+       image_d.pixel_data = reinterpret_cast<void**>(_pixel_data_d);
+       image_d.pixel_type = NVJPEG2K_UINT16;
+       image_d.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
+
+       for (int i = 0; i < 3; ++i) {
+               auto status = cudaMemcpy2D(
+                       image_d.pixel_data[i],
+                       image_d.pitch_in_bytes[i],
+                       image_h.pixel_data[i],
+                       image_h.pitch_in_bytes[i],
+                       2 * _image_component_info[i].component_width,
+                       _image_component_info[i].component_height,
+                       cudaMemcpyHostToDevice
+                       );
+               if (status != cudaSuccess) {
+                       throw CUDAError("cudaMemcpy2D", status);
+               }
+               cudaDeviceSynchronize();
+       }
+
+       nvjpeg2kEncodeConfig_t config;
+       memset(&config, 0, sizeof(config));
+       config.stream_type = NVJPEG2K_STREAM_J2K;
+       config.color_space = NVJPEG2K_COLORSPACE_SRGB;
+       config.image_width = size.width;
+       config.image_height = size.height;
+       config.num_components = 3;
+       config.image_comp_info = reinterpret_cast<nvjpeg2kImageComponentInfo_t*>(&_image_component_info);
+       config.code_block_w = 32;
+       config.code_block_h = 32;
+       config.irreversible = 0;
+       config.mct_mode = 1;
+       config.prog_order = NVJPEG2K_CPRL;
+       config.num_resolutions = vf.resolution() == Resolution::FOUR_K ? 7 : 6;
+
+       auto status = nvjpeg2kEncodeParamsSetEncodeConfig(_encoder_params, &config);
+       if (status != NVJPEG2K_STATUS_SUCCESS) {
+               throw CUDAError("nvjpeg2kEncodeParamsSetEncodeConfig", status);
+       }
+
+       // XXX: quality
+       status = nvjpeg2kEncodeParamsSetQuality(_encoder_params, 25);
+       if (status != NVJPEG2K_STATUS_SUCCESS) {
+               throw CUDAError("nvjpeg2kEncodeParamsSetQuality", status);
+       }
+
+       status = nvjpeg2kEncode(_encoder_handle, _encoder_state, _encoder_params, &image_d, 0);
+       if (status != NVJPEG2K_STATUS_SUCCESS) {
+               throw CUDAError("nvjpeg2kEncode", status);
+       }
+
+       size_t compressed_size;
+       status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, nullptr, &compressed_size, 0);
+
+       dcp::ArrayData output(compressed_size);
+       status = nvjpeg2kEncodeRetrieveBitstream(_encoder_handle, _encoder_state, output.data(), &compressed_size, 0);
+       cudaDeviceSynchronize();
+
+       if (status != NVJPEG2K_STATUS_SUCCESS) {
+               throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status);
+       }
+
+       std::cout << "cuda encode ends with " << status << " " << output.size() << "\n";
+
+       return output;
+}
+
+
+void
+CUDAJ2KFrameEncoder::log_thread_start ()
+{
+       LOG_TIMING("start-encoder-thread thread=%1", thread_id());
+}
diff --git a/src/lib/cuda_j2k_frame_encoder.h b/src/lib/cuda_j2k_frame_encoder.h
new file mode 100644 (file)
index 0000000..46ac646
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This file is part of DCP-o-matic.
+
+    DCP-o-matic is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DCP-o-matic is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+#ifndef DCPOMATIC_CUDA_J2K_FRAME_ENCODER
+#define DCPOMATIC_CUDA_J2K_FRAME_ENCODER
+
+
+#include "j2k_frame_encoder.h"
+#include <dcp/types.h>
+#include <nvjpeg2k.h>
+#include <vector>
+
+
+class CUDAJ2KFrameEncoder : public J2KFrameEncoder
+{
+public:
+       CUDAJ2KFrameEncoder();
+       ~CUDAJ2KFrameEncoder();
+
+       boost::optional<dcp::ArrayData> encode(DCPVideo const &) override;
+       void log_thread_start() override;
+
+private:
+       void allocate(dcp::Size size);
+       void free();
+
+       nvjpeg2kEncoder_t _encoder_handle;
+       nvjpeg2kEncodeState_t _encoder_state;
+       nvjpeg2kEncodeParams_t _encoder_params;
+       uint8_t* _pixel_data_h[3];
+       uint8_t* _pixel_data_d[3];
+       size_t _pitch_in_bytes[3];
+       nvjpeg2kImageComponentInfo_t _image_component_info[3];
+
+       dcp::Size _allocated_size;
+};
+
+
+#endif
index 66db9fda71d0b13414515a697b8a8acced87810b..4a43ac32b527a7226e5e60501a4a1c25aff52db7 100644 (file)
@@ -172,3 +172,16 @@ VerifyError::VerifyError (string m, int n)
 
 }
 
+
+CUDAError::CUDAError (string function, cudaError error)
+       : runtime_error(String::compose("CUDA error on %1 (%2)", function, static_cast<int>(error)))
+{
+
+}
+
+
+CUDAError::CUDAError (string function, int error)
+       : runtime_error(String::compose("CUDA error on %1 (%2)", function, error))
+{
+}
+
index 7c9509800ed3a40bf35f2cd88ce45755c44a03f8..0712c085b511409febec3e935369f783a24a5ebe 100644 (file)
@@ -32,6 +32,9 @@
 extern "C" {
 #include <libavutil/pixfmt.h>
 }
+#ifdef HAVE_NVJPEG2K_H
+#include <cuda_runtime_api.h>
+#endif
 #include <boost/filesystem.hpp>
 #include <boost/optional.hpp>
 #include <cstring>
@@ -457,4 +460,12 @@ public:
 };
 
 
+class CUDAError : public std::runtime_error
+{
+public:
+       CUDAError (std::string function, cudaError_t error);
+       CUDAError (std::string function, int);
+};
+
+
 #endif
index f9969927fb7ae5bec6fdab6dc22b426ae64890c7..11b360b349e189ae226eb3d76a7d0a320d672cbf 100644 (file)
@@ -27,6 +27,7 @@
 #include "compose.hpp"
 #include "config.h"
 #include "cpu_j2k_frame_encoder.h"
+#include "cuda_j2k_frame_encoder.h"
 #include "cross.h"
 #include "dcp_video.h"
 #include "dcpomatic_log.h"
@@ -350,6 +351,7 @@ J2KEncoder::servers_list_changed ()
 
        /* XXX: could re-use threads */
 
+#if 0
        if (!Config::instance()->only_servers_encode ()) {
                for (int i = 0; i < Config::instance()->master_encoding_threads (); ++i) {
                        auto worker = make_shared<CPUJ2KFrameEncoder>();
@@ -375,6 +377,10 @@ J2KEncoder::servers_list_changed ()
                        _threads->create_thread(boost::bind(&J2KEncoder::encoder_thread, this, worker));
                }
        }
+#endif
+       auto worker = make_shared<CUDAJ2KFrameEncoder>();
+       _workers.push_back(worker);
+       _threads->create_thread(boost::bind(&J2KEncoder::encoder_thread, this, worker));
 
        _writer->set_encoder_threads (_threads->size());
 }
index 1e8fd6f251d2a7c8a5d9e29f22354450cba1d885..377bf140966abb61f0ad8acf0ba727d22081edb7 100644 (file)
@@ -214,6 +214,7 @@ def build(bld):
                  BOOST_FILESYSTEM BOOST_THREAD BOOST_DATETIME BOOST_SIGNALS2 BOOST_REGEX
                  SAMPLERATE POSTPROC TIFF SSH DCP CXML GLIB LZMA XML++
                  CURL ZIP BZ2 FONTCONFIG PANGOMM CAIROMM XMLSEC SUB ICU NETTLE PNG JPEG LEQM_NRT
+                 CUDA
                  """
 
     if bld.env.TARGET_OSX:
@@ -227,6 +228,9 @@ def build(bld):
         if bld.env.TARGET_LINUX:
             obj.uselib += ' POLKIT'
 
+    if bld.env.HAVE_NVJPEG2K_H:
+        obj.source += ' cuda_j2k_frame_encoder.cc'
+
     if bld.env.TARGET_WINDOWS_64 or bld.env.TARGET_WINDOWS_32:
         obj.uselib += ' WINSOCK2 DBGHELP SHLWAPI MSWSOCK BOOST_LOCALE SETUPAPI OLE32 UUID'
         obj.source += ' cross_windows.cc'
index be5297beb06e8cee0f1ac1f6acf73fc49ca905dd..204058b2ee5e3f231c8f76335c9f4c2e021a412a 100644 (file)
@@ -30,7 +30,7 @@ def configure(conf):
 def build(bld):
     uselib =  'BOOST_THREAD BOOST_DATETIME DCP XMLSEC CXML XMLPP AVFORMAT AVFILTER AVCODEC '
     uselib += 'AVUTIL SWSCALE SWRESAMPLE POSTPROC CURL BOOST_FILESYSTEM SSH ZIP CAIROMM FONTCONFIG PANGOMM SUB '
-    uselib += 'SNDFILE SAMPLERATE BOOST_REGEX ICU NETTLE RTAUDIO PNG JPEG LEQM_NRT '
+    uselib += 'SNDFILE SAMPLERATE BOOST_REGEX ICU NETTLE RTAUDIO PNG JPEG LEQM_NRT CUDA'
 
     if bld.env.ENABLE_DISK:
         if bld.env.TARGET_LINUX:
diff --git a/wscript b/wscript
index 439fc4fa597d20c9c3ffd45e57310e2dd63a91ae..139bcdc36b6f133497c0c29b305943d20d170f05 100644 (file)
--- a/wscript
+++ b/wscript
@@ -149,6 +149,7 @@ def configure(conf):
             pass
 
     conf.env.append_value('INCLUDES', os.path.join(conf.options.cuda_path, 'include'))
+    conf.env.append_value('LIBPATH', os.path.join(conf.options.cuda_path, 'targets', 'x86_64-linux', 'lib'))
 
     #
     # Windows/Linux/OS X specific
@@ -561,7 +562,9 @@ def configure(conf):
                            lib=deps,
                            uselib_store='BOOST_PROCESS')
 
-    conf.check(header_name='nvjpeg2k.h', mandatory=False)
+    if conf.check(header_name='nvjpeg2k.h', mandatory=False):
+        conf.env.LIB_CUDA = ['cudart', 'nvjpeg2k']
+
 
     # Other stuff