fc4423fcdf977f45ab6242f5f7c02d9796a07496
[dcpomatic.git] / src / lib / cuda_j2k_frame_encoder.cc
1 /*
2     Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
3
4     This file is part of DCP-o-matic.
5
6     DCP-o-matic is free software; you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation; either version 2 of the License, or
9     (at your option) any later version.
10
11     DCP-o-matic is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15
16     You should have received a copy of the GNU General Public License
17     along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
18
19 */
20
21
22 #include "cross.h"
23 #include "cuda_j2k_frame_encoder.h"
24 #include "dcpomatic_log.h"
25 #include "dcp_video.h"
26 #include "exceptions.h"
27 #include "player_video.h"
28 #include <dcp/array_data.h>
29 #include <dcp/openjpeg_image.h>
30 #include <nvjpeg2k.h>
31 #include <thread>
32 #include <vector>
33
34
35 using std::make_pair;
36 using std::thread;
37 using std::vector;
38 using boost::optional;
39
40
41 vector<thread> CUDAJ2KFrameEncoder::_cuda_threads;
42 std::queue<CUDAJ2KFrameEncoder::Input> CUDAJ2KFrameEncoder::_input;
43 std::map<std::pair<int, Eyes>, dcp::ArrayData> CUDAJ2KFrameEncoder::_output;
44 boost::condition CUDAJ2KFrameEncoder::_input_condition;
45 boost::mutex CUDAJ2KFrameEncoder::_input_mutex;
46 boost::condition CUDAJ2KFrameEncoder::_output_condition;
47 boost::mutex CUDAJ2KFrameEncoder::_output_mutex;
48
49
50 CUDAJ2KFrameEncoder::CUDAJ2KFrameEncoder()
51 {
52         if (_cuda_threads.empty()) {
53                 for (int i = 0; i < 8; ++i) {
54                         _cuda_threads.push_back(std::thread(&CUDAJ2KFrameEncoder::cuda_thread));
55                 }
56         }
57 }
58
59
60 void
61 CUDAJ2KFrameEncoder::cuda_thread()
62 {
63         nvjpeg2kEncoder_t encoder_handle;
64         nvjpeg2kEncodeState_t encoder_state;
65         nvjpeg2kEncodeParams_t encoder_params;
66
67         nvjpeg2kEncoderCreateSimple(&encoder_handle);
68         nvjpeg2kEncodeStateCreate(encoder_handle, &encoder_state);
69         nvjpeg2kEncodeParamsCreate(&encoder_params);
70
71         cudaStream_t stream;
72         cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
73
74         while (true) {
75                 boost::mutex::scoped_lock lm(_input_mutex);
76                 while (_input.empty()) {
77                         std::cout << "gpu starved.\n";
78                         _input_condition.wait(lm);
79                 }
80
81                 auto input = std::move(_input.front());
82                 _input.pop();
83                 lm.unlock();
84
85                 nvjpeg2kImageComponentInfo_t info[3];
86                 for (int i = 0; i < 3; ++i) {
87                         info[i].component_width = input.size().width;
88                         info[i].component_height = input.size().height;
89                         info[i].precision = 12;
90                         info[i].sgn = 0;
91                 }
92
93                 nvjpeg2kEncodeConfig_t config;
94                 memset(&config, 0, sizeof(config));
95                 config.stream_type = NVJPEG2K_STREAM_J2K;
96                 config.color_space = NVJPEG2K_COLORSPACE_SRGB;
97                 config.image_width = input.size().width;
98                 config.image_height = input.size().height;
99                 config.num_components = 3;
100                 config.image_comp_info = reinterpret_cast<nvjpeg2kImageComponentInfo_t*>(&info);
101                 config.code_block_w = 32;
102                 config.code_block_h = 32;
103                 config.irreversible = 0;
104                 config.mct_mode = 1;
105                 config.prog_order = NVJPEG2K_CPRL;
106                 config.num_resolutions = input.resolution() == Resolution::FOUR_K ? 7 : 6;
107                 config.enable_custom_precincts = 0;
108                 config.precint_width[0] = 7;
109                 config.precint_height[0] = 7;
110                 for (int i = 1; i < 6; ++i) {
111                         config.precint_width[i] = 8;
112                         config.precint_height[i] = 8;
113                 }
114
115                 auto status = nvjpeg2kEncodeParamsSetEncodeConfig(encoder_params, &config);
116                 if (status != NVJPEG2K_STATUS_SUCCESS) {
117                         throw CUDAError("nvjpeg2kEncodeParamsSetEncodeConfig", status);
118                 }
119
120                 // XXX: quality
121                 status = nvjpeg2kEncodeParamsSetQuality(encoder_params, 50);
122                 if (status != NVJPEG2K_STATUS_SUCCESS) {
123                         throw CUDAError("nvjpeg2kEncodeParamsSetQuality", status);
124                 }
125
126                 status = nvjpeg2kEncode(encoder_handle, encoder_state, encoder_params, input.device_image(), stream);
127                 if (status != NVJPEG2K_STATUS_SUCCESS) {
128                         throw CUDAError("nvjpeg2kEncode", status);
129                 }
130
131                 size_t compressed_size;
132                 status = nvjpeg2kEncodeRetrieveBitstream(encoder_handle, encoder_state, nullptr, &compressed_size, stream);
133
134                 dcp::ArrayData output(compressed_size);
135                 status = nvjpeg2kEncodeRetrieveBitstream(encoder_handle, encoder_state, output.data(), &compressed_size, stream);
136                 if (status != NVJPEG2K_STATUS_SUCCESS) {
137                         throw CUDAError("nvjpeg2kEncodeRetrieveBitstream", status);
138                 }
139
140                 boost::mutex::scoped_lock lm2(_output_mutex);
141                 _output[make_pair(input.index(), input.eyes())] = output;
142                 _output_condition.notify_all();
143         }
144
145         cudaStreamDestroy(stream);
146 }
147
148
149 CUDAJ2KFrameEncoder::Input::Input(DCPVideo const& vf)
150         : _index(vf.index())
151         , _eyes(vf.eyes())
152         , _resolution(vf.resolution())
153 {
154         _xyz = convert_to_xyz(vf.frame(), boost::bind(&Log::dcp_log, dcpomatic_log.get(), _1, _2));
155
156         for (int i = 0; i < 3; ++i) {
157                 _pixel_data_h[i] = reinterpret_cast<uint8_t*>(_xyz->data(i));
158         }
159
160         _size = _xyz->size();
161
162         auto const pitch = _size.width * 2;
163
164         for (int i = 0; i < 3; ++i) {
165                 _pitch_in_bytes[i] = pitch;
166                 auto status = cudaMallocPitch(
167                         reinterpret_cast<void**>(&_pixel_data_d[i]),
168                         &_pitch_in_bytes[i],
169                         pitch,
170                         _size.height
171                         );
172
173                 if (status != cudaSuccess) {
174                         throw CUDAError("cudaMallocPitch", status);
175                 }
176
177                 status = cudaMemcpy2D(
178                         _pixel_data_d[i],
179                         _pitch_in_bytes[i],
180                         _pixel_data_h[i],
181                         _pitch_in_bytes[i],
182                         pitch,
183                         _size.height,
184                         cudaMemcpyHostToDevice
185                         );
186
187                 if (status != cudaSuccess) {
188                         throw CUDAError("cudaMemcpy2D", status);
189                 }
190         }
191
192         _device_image.num_components = 3;
193         _device_image.pixel_data = reinterpret_cast<void**>(_pixel_data_d);
194         _device_image.pixel_type = NVJPEG2K_UINT16;
195         _device_image.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
196 }
197
198
199 CUDAJ2KFrameEncoder::Input::Input(Input&& other)
200         : _index(other._index)
201         , _eyes(other._eyes)
202         , _size(other._size)
203         , _resolution(other._resolution)
204 {
205         for (int i = 0; i < 3; ++i) {
206                 _pixel_data_d[i] = other._pixel_data_d[i];
207                 other._pixel_data_d[i] = nullptr;
208                 _pitch_in_bytes[i] = other._pitch_in_bytes[i];
209         }
210
211         _device_image.num_components = other._device_image.num_components;
212         _device_image.pixel_data = reinterpret_cast<void**>(_pixel_data_d);
213         _device_image.pixel_type = NVJPEG2K_UINT16;
214         _device_image.pitch_in_bytes = reinterpret_cast<size_t*>(_pitch_in_bytes);
215 }
216
217
218 CUDAJ2KFrameEncoder::Input::~Input()
219 {
220         cudaFree(_pixel_data_d[0]);
221         cudaFree(_pixel_data_d[1]);
222         cudaFree(_pixel_data_d[2]);
223 }
224
225
226 optional<dcp::ArrayData>
227 CUDAJ2KFrameEncoder::encode(DCPVideo const& vf)
228 {
229         auto input = Input(vf);
230
231         auto const size = vf.frame()->out_size();
232         DCPOMATIC_ASSERT(!_size || size == *_size);
233         _size = size;
234
235         DCPOMATIC_ASSERT(!_resolution || vf.resolution() == *_resolution);
236         _resolution = vf.resolution();
237
238         {
239                 boost::mutex::scoped_lock lm (_input_mutex);
240                 _input.push(std::move(input));
241                 std::cout << "push input: " << _input.size() << "\n";
242                 _input_condition.notify_all();
243         }
244
245         boost::mutex::scoped_lock lm(_output_mutex);
246         while (_output.find(make_pair(vf.index(), vf.eyes())) == _output.end()) {
247                 _output_condition.wait(lm);
248         }
249
250         auto iter = _output.find(make_pair(vf.index(), vf.eyes()));
251         auto data = iter->second;
252         _output.erase(iter);
253         return data;
254 }
255
256
257
258 void
259 CUDAJ2KFrameEncoder::log_thread_start ()
260 {
261        LOG_TIMING("start-encoder-thread thread=%1", thread_id());
262 }
263
264
265 void
266 CUDAJ2KFrameEncoder::flush()
267 {
268
269 }