src/lib/cuda.cc

   1 /*
   2     Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
   3
   4     This file is part of DCP-o-matic.
   5
   6     DCP-o-matic is free software; you can redistribute it and/or modify
   7     it under the terms of the GNU General Public License as published by
   8     the Free Software Foundation; either version 2 of the License, or
   9     (at your option) any later version.
  10
  11     DCP-o-matic is distributed in the hope that it will be useful,
  12     but WITHOUT ANY WARRANTY; without even the implied warranty of
  13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14     GNU General Public License for more details.
  15
  16     You should have received a copy of the GNU General Public License
  17     along with DCP-o-matic.  If not, see <http://www.gnu.org/licenses/>.
  18
  19 */
  20
  21
  22 #include "cuda.h"
  23 #include "dcpomatic_assert.h"
  24 #include "exceptions.h"
  25 #include <dcp/openjpeg_image.h>
  26 #include <nvjpeg2k.h>
  27
  28
  29 using std::shared_ptr;
  30
  31
  32 CUDA* CUDA::_instance = nullptr;
  33
  34
  35 CUDA::CUDA()
  36 {
  37         _decode_thread = std::thread(std::bind(&CUDA::decode_thread, this));
  38 }
  39
  40
  41 void
  42 CUDA::decode_thread()
  43 {
  44         nvjpeg2kHandle_t handle;
  45         auto status = nvjpeg2kCreateSimple(&handle);
  46         if (status != NVJPEG2K_STATUS_SUCCESS) {
  47                 throw CUDAError("nvjpeg2kCreateSimple", status);
  48         }
  49
  50         nvjpeg2kDecodeState_t decode_state;
  51         status = nvjpeg2kDecodeStateCreate(handle, &decode_state);
  52         if (status != NVJPEG2K_STATUS_SUCCESS) {
  53                 throw CUDAError("nvjpeg2kDecodeStateCreate", status);
  54         }
  55
  56         nvjpeg2kStream_t jpeg2k_stream;
  57         status = nvjpeg2kStreamCreate(&jpeg2k_stream);
  58         if (status != NVJPEG2K_STATUS_SUCCESS) {
  59                 throw CUDAError("nvjpeg2kStreamCreate", status);
  60         }
  61
  62         while (true) {
  63                 boost::mutex::scoped_lock lm(_decode_mutex);
  64                 while (_decode_queue.empty()) {
  65                         _decode_queue_empty_condition.wait(lm);
  66                 }
  67
  68                 auto input = std::move(_decode_queue.front());
  69                 _decode_queue.pop();
  70                 lm.unlock();
  71
  72                 try {
  73                         std::cout << "we got " << input.data->size() << " bytes.\n";
  74                         auto status = nvjpeg2kStreamParse(handle, input.data->data(), input.data->size(), 0, 0, jpeg2k_stream);
  75                         if (status != NVJPEG2K_STATUS_SUCCESS) {
  76                                 throw CUDAError("nvjpeg2kStreamParse", status);
  77                         }
  78
  79                         nvjpeg2kImageInfo_t image_info;
  80                         status = nvjpeg2kStreamGetImageInfo(jpeg2k_stream, &image_info);
  81                         if (status != NVJPEG2K_STATUS_SUCCESS) {
  82                                 throw CUDAError("nvjpeg2kStreamGetImageInfo", status);
  83                         }
  84                         std::cout << image_info.num_components << " components.\n";
  85
  86                         nvjpeg2kImageComponentInfo_t image_component_info[3];
  87                         for (int i = 0; i < 3; ++i) {
  88                                 status = nvjpeg2kStreamGetImageComponentInfo(jpeg2k_stream, &image_component_info[i], i);
  89                                 if (status != NVJPEG2K_STATUS_SUCCESS) {
  90                                         throw CUDAError("nvjpeg2kStreamGetImageComponentInfo", status);
  91                                 }
  92                         }
  93
  94                         auto const width = image_component_info[0].component_width;
  95                         auto const height = image_component_info[0].component_height;
  96                         std::cout << width << "x" << height << " " << ((int)image_component_info[0].precision) << "\n";
  97
  98                         uint16_t* decoded_d[3];
  99                         size_t pitch[3];
 100
 101                         for (int i = 0; i < 3; ++i) {
 102                                 printf("cudaMallocPitch %d %d\n", width * 2, height);
 103                                 auto status = cudaMallocPitch(reinterpret_cast<void**>(&decoded_d[i]), &pitch[i], width * 2, height);
 104                                 if (status != cudaSuccess) {
 105                                         throw CUDAError("cudaMallocPitch", status);
 106                                 }
 107                         }
 108
 109                         nvjpeg2kImage_t output_image;
 110                         output_image.pixel_data = reinterpret_cast<void**>(decoded_d);
 111                         output_image.pixel_type = NVJPEG2K_UINT16;
 112                         output_image.pitch_in_bytes = pitch;
 113                         output_image.num_components = 3;
 114
 115                         status = nvjpeg2kDecode(handle, decode_state, jpeg2k_stream, &output_image, 0);
 116                         std::cout << "decode said " << status << "\n";
 117                         if (status != NVJPEG2K_STATUS_SUCCESS) {
 118                                 abort();
 119                                 throw CUDAError("nvjpeg2kDecode", status);
 120                         }
 121                         cudaDeviceSynchronize();
 122
 123                         std::vector<uint16_t> decoded_h[3];
 124                         for (int i = 0; i < 3; ++i) {
 125                                 auto size = pitch[i] * height;
 126                                 decoded_h[i].resize(size / 2);
 127                                 auto status = cudaMemcpy(decoded_h[i].data(), decoded_d[i], size, cudaMemcpyDeviceToHost);
 128                                 if (status != cudaSuccess) {
 129                                         throw CUDAError("cudaMemcpy", status);
 130                                 }
 131                         }
 132
 133                         auto output = std::make_shared<Image>(input.pixel_format, dcp::Size(width, height), input.alignment);
 134                         int p = 0;
 135                         for (size_t y = 0; y < height; ++y) {
 136                                 auto q = reinterpret_cast<uint16_t *>(output->data()[0] + y * output->stride()[0]);
 137                                 for (size_t x = 0; x < width; ++x) {
 138                                         *q++ = decoded_h[0][p] << 4;
 139                                         *q++ = decoded_h[1][p] << 4;
 140                                         *q++ = decoded_h[2][p] << 4;
 141                                         ++p;
 142                                 }
 143                         }
 144
 145                         for (int i = 0; i < 3; ++i) {
 146                                 cudaFree(decoded_d[i]);
 147                         }
 148
 149                         lm.lock();
 150                         _decode_output[input.id] = output;
 151                         _decode_complete_condition.notify_all();
 152                         lm.unlock();
 153                 } catch (CUDAError&) {
 154                         lm.lock();
 155                         _decode_output[input.id] = {};
 156                         _decode_complete_condition.notify_all();
 157                         lm.unlock();
 158                 }
 159         }
 160 }
 161
 162
 163 shared_ptr<Image>
 164 CUDA::decode(shared_ptr<const dcp::Data> j2k_data, int reduce, AVPixelFormat pixel_format, Image::Alignment alignment)
 165 {
 166         boost::mutex::scoped_lock lm(_decode_mutex);
 167         auto id = _next_decode_id++;
 168         _decode_queue.push({id, j2k_data, reduce, pixel_format, alignment});
 169         _decode_queue_empty_condition.notify_all();
 170
 171         while (_decode_output.find(id) == _decode_output.end()) {
 172                 _decode_complete_condition.wait(lm);
 173         }
 174
 175         auto iter = _decode_output.find(id);
 176         if (iter ==_decode_output.end()) {
 177                 return {};
 178         }
 179
 180         auto output = *iter;
 181         _decode_output.erase(iter);
 182         return output.second;
 183 }
 184
 185
 186 CUDA *
 187 CUDA::instance()
 188 {
 189         if (!_instance) {
 190                 _instance = new CUDA();
 191         }
 192
 193         return _instance;
 194 }
 195