src/lib/openjp2/dwt.c

   1 /*
   2  * The copyright in this software is being made available under the 2-clauses
   3  * BSD License, included below. This software may be subject to other third
   4  * party and contributor rights, including patent rights, and no such rights
   5  * are granted under this license.
   6  *
   7  * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
   8  * Copyright (c) 2002-2014, Professor Benoit Macq
   9  * Copyright (c) 2001-2003, David Janssens
  10  * Copyright (c) 2002-2003, Yannick Verschueren
  11  * Copyright (c) 2003-2007, Francois-Olivier Devaux
  12  * Copyright (c) 2003-2014, Antonin Descampe
  13  * Copyright (c) 2005, Herve Drolon, FreeImage Team
  14  * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
  15  * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
  16  * Copyright (c) 2017, IntoPIX SA <support@intopix.com>
  17  * All rights reserved.
  18  *
  19  * Redistribution and use in source and binary forms, with or without
  20  * modification, are permitted provided that the following conditions
  21  * are met:
  22  * 1. Redistributions of source code must retain the above copyright
  23  *    notice, this list of conditions and the following disclaimer.
  24  * 2. Redistributions in binary form must reproduce the above copyright
  25  *    notice, this list of conditions and the following disclaimer in the
  26  *    documentation and/or other materials provided with the distribution.
  27  *
  28  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
  29  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  32  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  35  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  36  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  37  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38  * POSSIBILITY OF SUCH DAMAGE.
  39  */
  40
  41 #include <assert.h>
  42
  43 #define OPJ_SKIP_POISON
  44 #include "opj_includes.h"
  45
  46 #ifdef __SSE__
  47 #include <xmmintrin.h>
  48 #endif
  49 #ifdef __SSE2__
  50 #include <emmintrin.h>
  51 #endif
  52 #ifdef __SSSE3__
  53 #include <tmmintrin.h>
  54 #endif
  55 #ifdef __AVX2__
  56 #include <immintrin.h>
  57 #endif
  58
  59 #if defined(__GNUC__)
  60 #pragma GCC poison malloc calloc realloc free
  61 #endif
  62
  63 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
  64 /*@{*/
  65
  66 #define OPJ_WS(i) v->mem[(i)*2]
  67 #define OPJ_WD(i) v->mem[(1+(i)*2)]
  68
  69 #ifdef __AVX2__
  70 /** Number of int32 values in a AVX2 register */
  71 #define VREG_INT_COUNT       8
  72 #else
  73 /** Number of int32 values in a SSE2 register */
  74 #define VREG_INT_COUNT       4
  75 #endif
  76
  77 /** Number of columns that we can process in parallel in the vertical pass */
  78 #define PARALLEL_COLS_53     (2*VREG_INT_COUNT)
  79
  80 /** @name Local data structures */
  81 /*@{*/
  82
  83 typedef struct dwt_local {
  84     OPJ_INT32* mem;
  85     OPJ_INT32 dn;   /* number of elements in high pass band */
  86     OPJ_INT32 sn;   /* number of elements in low pass band */
  87     OPJ_INT32 cas;  /* 0 = start on even coord, 1 = start on odd coord */
  88 } opj_dwt_t;
  89
  90 #define NB_ELTS_V8  8
  91
  92 typedef union {
  93     OPJ_FLOAT32 f[NB_ELTS_V8];
  94 } opj_v8_t;
  95
  96 typedef struct v8dwt_local {
  97     opj_v8_t*   wavelet ;
  98     OPJ_INT32       dn ;  /* number of elements in high pass band */
  99     OPJ_INT32       sn ;  /* number of elements in low pass band */
 100     OPJ_INT32       cas ; /* 0 = start on even coord, 1 = start on odd coord */
 101     OPJ_UINT32      win_l_x0; /* start coord in low pass band */
 102     OPJ_UINT32      win_l_x1; /* end coord in low pass band */
 103     OPJ_UINT32      win_h_x0; /* start coord in high pass band */
 104     OPJ_UINT32      win_h_x1; /* end coord in high pass band */
 105 } opj_v8dwt_t ;
 106
 107 /* From table F.4 from the standard */
 108 static const OPJ_FLOAT32 opj_dwt_alpha =  -1.586134342f;
 109 static const OPJ_FLOAT32 opj_dwt_beta  =  -0.052980118f;
 110 static const OPJ_FLOAT32 opj_dwt_gamma = 0.882911075f;
 111 static const OPJ_FLOAT32 opj_dwt_delta = 0.443506852f;
 112
 113 static const OPJ_FLOAT32 opj_K      = 1.230174105f;
 114 static const OPJ_FLOAT32 opj_invK   = (OPJ_FLOAT32)(1.0 / 1.230174105);
 115
 116 /*@}*/
 117
 118 /** @name Local static functions */
 119 /*@{*/
 120
 121 /**
 122 Forward lazy transform (horizontal)
 123 */
 124 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
 125                                    OPJ_INT32 * OPJ_RESTRICT b,
 126                                    OPJ_INT32 dn,
 127                                    OPJ_INT32 sn, OPJ_INT32 cas);
 128
 129 /**
 130 Forward 9-7 wavelet transform in 1-D
 131 */
 132 static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
 133                                   OPJ_INT32 cas);
 134 /**
 135 Explicit calculation of the Quantization Stepsizes
 136 */
 137 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
 138                                     opj_stepsize_t *bandno_stepsize);
 139 /**
 140 Inverse wavelet transform in 2-D.
 141 */
 142 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
 143                                     opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i);
 144
 145 static OPJ_BOOL opj_dwt_decode_partial_tile(
 146     opj_tcd_tilecomp_t* tilec,
 147     OPJ_UINT32 numres);
 148
 149 /* Forward transform, for the vertical pass, processing cols columns */
 150 /* where cols <= NB_ELTS_V8 */
 151 /* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */
 152 typedef void (*opj_encode_and_deinterleave_v_fnptr_type)(
 153     void *array,
 154     void *tmp,
 155     OPJ_UINT32 height,
 156     OPJ_BOOL even,
 157     OPJ_UINT32 stride_width,
 158     OPJ_UINT32 cols);
 159
 160 /* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */
 161 typedef void (*opj_encode_and_deinterleave_h_one_row_fnptr_type)(
 162     void *row,
 163     void *tmp,
 164     OPJ_UINT32 width,
 165     OPJ_BOOL even);
 166
 167 static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
 168         opj_tcd_tilecomp_t * tilec,
 169         opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
 170         opj_encode_and_deinterleave_h_one_row_fnptr_type
 171         p_encode_and_deinterleave_h_one_row);
 172
 173 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
 174         OPJ_UINT32 i);
 175
 176 /* <summary>                             */
 177 /* Inverse 9-7 wavelet transform in 1-D. */
 178 /* </summary>                            */
 179
 180 /*@}*/
 181
 182 /*@}*/
 183
 184 #define OPJ_S(i) a[(i)*2]
 185 #define OPJ_D(i) a[(1+(i)*2)]
 186 #define OPJ_S_(i) ((i)<0?OPJ_S(0):((i)>=sn?OPJ_S(sn-1):OPJ_S(i)))
 187 #define OPJ_D_(i) ((i)<0?OPJ_D(0):((i)>=dn?OPJ_D(dn-1):OPJ_D(i)))
 188 /* new */
 189 #define OPJ_SS_(i) ((i)<0?OPJ_S(0):((i)>=dn?OPJ_S(dn-1):OPJ_S(i)))
 190 #define OPJ_DD_(i) ((i)<0?OPJ_D(0):((i)>=sn?OPJ_D(sn-1):OPJ_D(i)))
 191
 192 /* <summary>                                                              */
 193 /* This table contains the norms of the 5-3 wavelets for different bands. */
 194 /* </summary>                                                             */
 195 /* FIXME! the array should really be extended up to 33 resolution levels */
 196 /* See https://github.com/uclouvain/openjpeg/issues/493 */
 197 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = {
 198     {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
 199     {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
 200     {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
 201     {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
 202 };
 203
 204 /* <summary>                                                              */
 205 /* This table contains the norms of the 9-7 wavelets for different bands. */
 206 /* </summary>                                                             */
 207 /* FIXME! the array should really be extended up to 33 resolution levels */
 208 /* See https://github.com/uclouvain/openjpeg/issues/493 */
 209 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = {
 210     {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
 211     {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
 212     {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
 213     {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
 214 };
 215
 216 /*
 217 ==========================================================
 218    local functions
 219 ==========================================================
 220 */
 221
 222 /* <summary>                             */
 223 /* Forward lazy transform (horizontal).  */
 224 /* </summary>                            */
 225 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
 226                                    OPJ_INT32 * OPJ_RESTRICT b,
 227                                    OPJ_INT32 dn,
 228                                    OPJ_INT32 sn, OPJ_INT32 cas)
 229 {
 230     OPJ_INT32 i;
 231     OPJ_INT32 * OPJ_RESTRICT l_dest = b;
 232     const OPJ_INT32 * OPJ_RESTRICT l_src = a + cas;
 233
 234     for (i = 0; i < sn; ++i) {
 235         *l_dest++ = *l_src;
 236         l_src += 2;
 237     }
 238
 239     l_dest = b + sn;
 240     l_src = a + 1 - cas;
 241
 242     for (i = 0; i < dn; ++i)  {
 243         *l_dest++ = *l_src;
 244         l_src += 2;
 245     }
 246 }
 247
 248 #ifdef STANDARD_SLOW_VERSION
 249 /* <summary>                             */
 250 /* Inverse lazy transform (horizontal).  */
 251 /* </summary>                            */
 252 static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a)
 253 {
 254     const OPJ_INT32 *ai = a;
 255     OPJ_INT32 *bi = h->mem + h->cas;
 256     OPJ_INT32  i    = h->sn;
 257     while (i--) {
 258         *bi = *(ai++);
 259         bi += 2;
 260     }
 261     ai  = a + h->sn;
 262     bi  = h->mem + 1 - h->cas;
 263     i   = h->dn ;
 264     while (i--) {
 265         *bi = *(ai++);
 266         bi += 2;
 267     }
 268 }
 269
 270 /* <summary>                             */
 271 /* Inverse lazy transform (vertical).    */
 272 /* </summary>                            */
 273 static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
 274 {
 275     const OPJ_INT32 *ai = a;
 276     OPJ_INT32 *bi = v->mem + v->cas;
 277     OPJ_INT32  i = v->sn;
 278     while (i--) {
 279         *bi = *ai;
 280         bi += 2;
 281         ai += x;
 282     }
 283     ai = a + (v->sn * (OPJ_SIZE_T)x);
 284     bi = v->mem + 1 - v->cas;
 285     i = v->dn ;
 286     while (i--) {
 287         *bi = *ai;
 288         bi += 2;
 289         ai += x;
 290     }
 291 }
 292
 293 #endif /* STANDARD_SLOW_VERSION */
 294
 295 #ifdef STANDARD_SLOW_VERSION
 296 /* <summary>                            */
 297 /* Inverse 5-3 wavelet transform in 1-D. */
 298 /* </summary>                           */
 299 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
 300                               OPJ_INT32 cas)
 301 {
 302     OPJ_INT32 i;
 303
 304     if (!cas) {
 305         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
 306             for (i = 0; i < sn; i++) {
 307                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
 308             }
 309             for (i = 0; i < dn; i++) {
 310                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
 311             }
 312         }
 313     } else {
 314         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
 315             OPJ_S(0) /= 2;
 316         } else {
 317             for (i = 0; i < sn; i++) {
 318                 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
 319             }
 320             for (i = 0; i < dn; i++) {
 321                 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
 322             }
 323         }
 324     }
 325 }
 326
 327 static void opj_dwt_decode_1(const opj_dwt_t *v)
 328 {
 329     opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
 330 }
 331
 332 #endif /* STANDARD_SLOW_VERSION */
 333
 334 #if !defined(STANDARD_SLOW_VERSION)
 335 static void  opj_idwt53_h_cas0(OPJ_INT32* tmp,
 336                                const OPJ_INT32 sn,
 337                                const OPJ_INT32 len,
 338                                OPJ_INT32* tiledp)
 339 {
 340     OPJ_INT32 i, j;
 341     const OPJ_INT32* in_even = &tiledp[0];
 342     const OPJ_INT32* in_odd = &tiledp[sn];
 343
 344 #ifdef TWO_PASS_VERSION
 345     /* For documentation purpose: performs lifting in two iterations, */
 346     /* but without explicit interleaving */
 347
 348     assert(len > 1);
 349
 350     /* Even */
 351     tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1);
 352     for (i = 2, j = 0; i <= len - 2; i += 2, j++) {
 353         tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2);
 354     }
 355     if (len & 1) { /* if len is odd */
 356         tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1);
 357     }
 358
 359     /* Odd */
 360     for (i = 1, j = 0; i < len - 1; i += 2, j++) {
 361         tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1);
 362     }
 363     if (!(len & 1)) { /* if len is even */
 364         tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2];
 365     }
 366 #else
 367     OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
 368
 369     assert(len > 1);
 370
 371     /* Improved version of the TWO_PASS_VERSION: */
 372     /* Performs lifting in one single iteration. Saves memory */
 373     /* accesses and explicit interleaving. */
 374     s1n = in_even[0];
 375     d1n = in_odd[0];
 376     s0n = s1n - ((d1n + 1) >> 1);
 377
 378     for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
 379         d1c = d1n;
 380         s0c = s0n;
 381
 382         s1n = in_even[j];
 383         d1n = in_odd[j];
 384
 385         s0n = s1n - ((d1c + d1n + 2) >> 2);
 386
 387         tmp[i  ] = s0c;
 388         tmp[i + 1] = opj_int_add_no_overflow(d1c, opj_int_add_no_overflow(s0c,
 389                                              s0n) >> 1);
 390     }
 391
 392     tmp[i] = s0n;
 393
 394     if (len & 1) {
 395         tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1);
 396         tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
 397     } else {
 398         tmp[len - 1] = d1n + s0n;
 399     }
 400 #endif
 401     memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 402 }
 403
 404 static void  opj_idwt53_h_cas1(OPJ_INT32* tmp,
 405                                const OPJ_INT32 sn,
 406                                const OPJ_INT32 len,
 407                                OPJ_INT32* tiledp)
 408 {
 409     OPJ_INT32 i, j;
 410     const OPJ_INT32* in_even = &tiledp[sn];
 411     const OPJ_INT32* in_odd = &tiledp[0];
 412
 413 #ifdef TWO_PASS_VERSION
 414     /* For documentation purpose: performs lifting in two iterations, */
 415     /* but without explicit interleaving */
 416
 417     assert(len > 2);
 418
 419     /* Odd */
 420     for (i = 1, j = 0; i < len - 1; i += 2, j++) {
 421         tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2);
 422     }
 423     if (!(len & 1)) {
 424         tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1);
 425     }
 426
 427     /* Even */
 428     tmp[0] = in_even[0] + tmp[1];
 429     for (i = 2, j = 1; i < len - 1; i += 2, j++) {
 430         tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1);
 431     }
 432     if (len & 1) {
 433         tmp[len - 1] = in_even[len / 2] + tmp[len - 2];
 434     }
 435 #else
 436     OPJ_INT32 s1, s2, dc, dn;
 437
 438     assert(len > 2);
 439
 440     /* Improved version of the TWO_PASS_VERSION: */
 441     /* Performs lifting in one single iteration. Saves memory */
 442     /* accesses and explicit interleaving. */
 443
 444     s1 = in_even[1];
 445     dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
 446     tmp[0] = in_even[0] + dc;
 447
 448     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 449
 450         s2 = in_even[j + 1];
 451
 452         dn = in_odd[j] - ((s1 + s2 + 2) >> 2);
 453         tmp[i  ] = dc;
 454         tmp[i + 1] = opj_int_add_no_overflow(s1, opj_int_add_no_overflow(dn, dc) >> 1);
 455
 456         dc = dn;
 457         s1 = s2;
 458     }
 459
 460     tmp[i] = dc;
 461
 462     if (!(len & 1)) {
 463         dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1);
 464         tmp[len - 2] = s1 + ((dn + dc) >> 1);
 465         tmp[len - 1] = dn;
 466     } else {
 467         tmp[len - 1] = s1 + dc;
 468     }
 469 #endif
 470     memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 471 }
 472
 473
 474 #endif /* !defined(STANDARD_SLOW_VERSION) */
 475
 476 /* <summary>                            */
 477 /* Inverse 5-3 wavelet transform in 1-D for one row. */
 478 /* </summary>                           */
 479 /* Performs interleave, inverse wavelet transform and copy back to buffer */
 480 static void opj_idwt53_h(const opj_dwt_t *dwt,
 481                          OPJ_INT32* tiledp)
 482 {
 483 #ifdef STANDARD_SLOW_VERSION
 484     /* For documentation purpose */
 485     opj_dwt_interleave_h(dwt, tiledp);
 486     opj_dwt_decode_1(dwt);
 487     memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32));
 488 #else
 489     const OPJ_INT32 sn = dwt->sn;
 490     const OPJ_INT32 len = sn + dwt->dn;
 491     if (dwt->cas == 0) { /* Left-most sample is on even coordinate */
 492         if (len > 1) {
 493             opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp);
 494         } else {
 495             /* Unmodified value */
 496         }
 497     } else { /* Left-most sample is on odd coordinate */
 498         if (len == 1) {
 499             tiledp[0] /= 2;
 500         } else if (len == 2) {
 501             OPJ_INT32* out = dwt->mem;
 502             const OPJ_INT32* in_even = &tiledp[sn];
 503             const OPJ_INT32* in_odd = &tiledp[0];
 504             out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
 505             out[0] = in_even[0] + out[1];
 506             memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 507         } else if (len > 2) {
 508             opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp);
 509         }
 510     }
 511 #endif
 512 }
 513
 514 #if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
 515
 516 /* Conveniency macros to improve the readability of the formulas */
 517 #if __AVX2__
 518 #define VREG        __m256i
 519 #define LOAD_CST(x) _mm256_set1_epi32(x)
 520 #define LOAD(x)     _mm256_load_si256((const VREG*)(x))
 521 #define LOADU(x)    _mm256_loadu_si256((const VREG*)(x))
 522 #define STORE(x,y)  _mm256_store_si256((VREG*)(x),(y))
 523 #define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
 524 #define ADD(x,y)    _mm256_add_epi32((x),(y))
 525 #define SUB(x,y)    _mm256_sub_epi32((x),(y))
 526 #define SAR(x,y)    _mm256_srai_epi32((x),(y))
 527 #else
 528 #define VREG        __m128i
 529 #define LOAD_CST(x) _mm_set1_epi32(x)
 530 #define LOAD(x)     _mm_load_si128((const VREG*)(x))
 531 #define LOADU(x)    _mm_loadu_si128((const VREG*)(x))
 532 #define STORE(x,y)  _mm_store_si128((VREG*)(x),(y))
 533 #define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
 534 #define ADD(x,y)    _mm_add_epi32((x),(y))
 535 #define SUB(x,y)    _mm_sub_epi32((x),(y))
 536 #define SAR(x,y)    _mm_srai_epi32((x),(y))
 537 #endif
 538 #define ADD3(x,y,z) ADD(ADD(x,y),z)
 539
 540 static
 541 void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
 542                                const OPJ_INT32* tmp,
 543                                OPJ_INT32 len,
 544                                OPJ_SIZE_T stride)
 545 {
 546     OPJ_INT32 i;
 547     for (i = 0; i < len; ++i) {
 548         /* A memcpy(&tiledp_col[i * stride + 0],
 549                     &tmp[PARALLEL_COLS_53 * i + 0],
 550                     PARALLEL_COLS_53 * sizeof(OPJ_INT32))
 551            would do but would be a tiny bit slower.
 552            We can take here advantage of our knowledge of alignment */
 553         STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + 0],
 554                LOAD(&tmp[PARALLEL_COLS_53 * i + 0]));
 555         STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT],
 556                LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT]));
 557     }
 558 }
 559
 560 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
 561  * 16 in AVX2, when top-most pixel is on even coordinate */
 562 static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
 563     OPJ_INT32* tmp,
 564     const OPJ_INT32 sn,
 565     const OPJ_INT32 len,
 566     OPJ_INT32* tiledp_col,
 567     const OPJ_SIZE_T stride)
 568 {
 569     const OPJ_INT32* in_even = &tiledp_col[0];
 570     const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride];
 571
 572     OPJ_INT32 i;
 573     OPJ_SIZE_T j;
 574     VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
 575     VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
 576     const VREG two = LOAD_CST(2);
 577
 578     assert(len > 1);
 579 #if __AVX2__
 580     assert(PARALLEL_COLS_53 == 16);
 581     assert(VREG_INT_COUNT == 8);
 582 #else
 583     assert(PARALLEL_COLS_53 == 8);
 584     assert(VREG_INT_COUNT == 4);
 585 #endif
 586
 587     /* Note: loads of input even/odd values must be done in a unaligned */
 588     /* fashion. But stores in tmp can be done with aligned store, since */
 589     /* the temporary buffer is properly aligned */
 590     assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
 591
 592     s1n_0 = LOADU(in_even + 0);
 593     s1n_1 = LOADU(in_even + VREG_INT_COUNT);
 594     d1n_0 = LOADU(in_odd);
 595     d1n_1 = LOADU(in_odd + VREG_INT_COUNT);
 596
 597     /* s0n = s1n - ((d1n + 1) >> 1); <==> */
 598     /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
 599     s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
 600     s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
 601
 602     for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
 603         d1c_0 = d1n_0;
 604         s0c_0 = s0n_0;
 605         d1c_1 = d1n_1;
 606         s0c_1 = s0n_1;
 607
 608         s1n_0 = LOADU(in_even + j * stride);
 609         s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT);
 610         d1n_0 = LOADU(in_odd + j * stride);
 611         d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT);
 612
 613         /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/
 614         s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2));
 615         s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2));
 616
 617         STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0);
 618         STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1);
 619
 620         /* d1c + ((s0c + s0n) >> 1) */
 621         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
 622               ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1)));
 623         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
 624               ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1)));
 625     }
 626
 627     STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0);
 628     STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1);
 629
 630     if (len & 1) {
 631         VREG tmp_len_minus_1;
 632         s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride);
 633         /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
 634         tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
 635         STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1);
 636         /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
 637         STORE(tmp + PARALLEL_COLS_53 * (len - 2),
 638               ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1)));
 639
 640         s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT);
 641         /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
 642         tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
 643         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
 644               tmp_len_minus_1);
 645         /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
 646         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
 647               ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1)));
 648
 649
 650     } else {
 651         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0,
 652               ADD(d1n_0, s0n_0));
 653         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
 654               ADD(d1n_1, s0n_1));
 655     }
 656
 657     opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
 658 }
 659
 660
 661 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
 662  * 16 in AVX2, when top-most pixel is on odd coordinate */
 663 static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
 664     OPJ_INT32* tmp,
 665     const OPJ_INT32 sn,
 666     const OPJ_INT32 len,
 667     OPJ_INT32* tiledp_col,
 668     const OPJ_SIZE_T stride)
 669 {
 670     OPJ_INT32 i;
 671     OPJ_SIZE_T j;
 672
 673     VREG s1_0, s2_0, dc_0, dn_0;
 674     VREG s1_1, s2_1, dc_1, dn_1;
 675     const VREG two = LOAD_CST(2);
 676
 677     const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
 678     const OPJ_INT32* in_odd = &tiledp_col[0];
 679
 680     assert(len > 2);
 681 #if __AVX2__
 682     assert(PARALLEL_COLS_53 == 16);
 683     assert(VREG_INT_COUNT == 8);
 684 #else
 685     assert(PARALLEL_COLS_53 == 8);
 686     assert(VREG_INT_COUNT == 4);
 687 #endif
 688
 689     /* Note: loads of input even/odd values must be done in a unaligned */
 690     /* fashion. But stores in tmp can be done with aligned store, since */
 691     /* the temporary buffer is properly aligned */
 692     assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
 693
 694     s1_0 = LOADU(in_even + stride);
 695     /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
 696     dc_0 = SUB(LOADU(in_odd + 0),
 697                SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2));
 698     STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0));
 699
 700     s1_1 = LOADU(in_even + stride + VREG_INT_COUNT);
 701     /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
 702     dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT),
 703                SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2));
 704     STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT,
 705           ADD(LOADU(in_even + VREG_INT_COUNT), dc_1));
 706
 707     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 708
 709         s2_0 = LOADU(in_even + (j + 1) * stride);
 710         s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT);
 711
 712         /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
 713         dn_0 = SUB(LOADU(in_odd + j * stride),
 714                    SAR(ADD3(s1_0, s2_0, two), 2));
 715         dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT),
 716                    SAR(ADD3(s1_1, s2_1, two), 2));
 717
 718         STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
 719         STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
 720
 721         /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */
 722         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
 723               ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
 724         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
 725               ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
 726
 727         dc_0 = dn_0;
 728         s1_0 = s2_0;
 729         dc_1 = dn_1;
 730         s1_1 = s2_1;
 731     }
 732     STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
 733     STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
 734
 735     if (!(len & 1)) {
 736         /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
 737         dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride),
 738                    SAR(ADD3(s1_0, s1_0, two), 2));
 739         dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT),
 740                    SAR(ADD3(s1_1, s1_1, two), 2));
 741
 742         /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */
 743         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0,
 744               ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
 745         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
 746               ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
 747
 748         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0);
 749         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1);
 750     } else {
 751         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0));
 752         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
 753               ADD(s1_1, dc_1));
 754     }
 755
 756     opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
 757 }
 758
 759 #undef VREG
 760 #undef LOAD_CST
 761 #undef LOADU
 762 #undef LOAD
 763 #undef STORE
 764 #undef STOREU
 765 #undef ADD
 766 #undef ADD3
 767 #undef SUB
 768 #undef SAR
 769
 770 #endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */
 771
 772 #if !defined(STANDARD_SLOW_VERSION)
 773 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
 774  * pixel is on even coordinate */
 775 static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
 776                              const OPJ_INT32 sn,
 777                              const OPJ_INT32 len,
 778                              OPJ_INT32* tiledp_col,
 779                              const OPJ_SIZE_T stride)
 780 {
 781     OPJ_INT32 i, j;
 782     OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
 783
 784     assert(len > 1);
 785
 786     /* Performs lifting in one single iteration. Saves memory */
 787     /* accesses and explicit interleaving. */
 788
 789     s1n = tiledp_col[0];
 790     d1n = tiledp_col[(OPJ_SIZE_T)sn * stride];
 791     s0n = s1n - ((d1n + 1) >> 1);
 792
 793     for (i = 0, j = 0; i < (len - 3); i += 2, j++) {
 794         d1c = d1n;
 795         s0c = s0n;
 796
 797         s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride];
 798         d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride];
 799
 800         s0n = opj_int_sub_no_overflow(s1n,
 801                                       opj_int_add_no_overflow(opj_int_add_no_overflow(d1c, d1n), 2) >> 2);
 802
 803         tmp[i  ] = s0c;
 804         tmp[i + 1] = opj_int_add_no_overflow(d1c, opj_int_add_no_overflow(s0c,
 805                                              s0n) >> 1);
 806     }
 807
 808     tmp[i] = s0n;
 809
 810     if (len & 1) {
 811         tmp[len - 1] =
 812             tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] -
 813             ((d1n + 1) >> 1);
 814         tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
 815     } else {
 816         tmp[len - 1] = d1n + s0n;
 817     }
 818
 819     for (i = 0; i < len; ++i) {
 820         tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
 821     }
 822 }
 823
 824
 825 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
 826  * pixel is on odd coordinate */
 827 static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
 828                              const OPJ_INT32 sn,
 829                              const OPJ_INT32 len,
 830                              OPJ_INT32* tiledp_col,
 831                              const OPJ_SIZE_T stride)
 832 {
 833     OPJ_INT32 i, j;
 834     OPJ_INT32 s1, s2, dc, dn;
 835     const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
 836     const OPJ_INT32* in_odd = &tiledp_col[0];
 837
 838     assert(len > 2);
 839
 840     /* Performs lifting in one single iteration. Saves memory */
 841     /* accesses and explicit interleaving. */
 842
 843     s1 = in_even[stride];
 844     dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
 845     tmp[0] = in_even[0] + dc;
 846     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 847
 848         s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride];
 849
 850         dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2);
 851         tmp[i  ] = dc;
 852         tmp[i + 1] = s1 + ((dn + dc) >> 1);
 853
 854         dc = dn;
 855         s1 = s2;
 856     }
 857     tmp[i] = dc;
 858     if (!(len & 1)) {
 859         dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1);
 860         tmp[len - 2] = s1 + ((dn + dc) >> 1);
 861         tmp[len - 1] = dn;
 862     } else {
 863         tmp[len - 1] = s1 + dc;
 864     }
 865
 866     for (i = 0; i < len; ++i) {
 867         tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
 868     }
 869 }
 870 #endif /* !defined(STANDARD_SLOW_VERSION) */
 871
 872 /* <summary>                            */
 873 /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */
 874 /* </summary>                           */
 875 /* Performs interleave, inverse wavelet transform and copy back to buffer */
 876 static void opj_idwt53_v(const opj_dwt_t *dwt,
 877                          OPJ_INT32* tiledp_col,
 878                          OPJ_SIZE_T stride,
 879                          OPJ_INT32 nb_cols)
 880 {
 881 #ifdef STANDARD_SLOW_VERSION
 882     /* For documentation purpose */
 883     OPJ_INT32 k, c;
 884     for (c = 0; c < nb_cols; c ++) {
 885         opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
 886         opj_dwt_decode_1(dwt);
 887         for (k = 0; k < dwt->sn + dwt->dn; ++k) {
 888             tiledp_col[c + k * stride] = dwt->mem[k];
 889         }
 890     }
 891 #else
 892     const OPJ_INT32 sn = dwt->sn;
 893     const OPJ_INT32 len = sn + dwt->dn;
 894     if (dwt->cas == 0) {
 895         /* If len == 1, unmodified value */
 896
 897 #if (defined(__SSE2__) || defined(__AVX2__))
 898         if (len > 1 && nb_cols == PARALLEL_COLS_53) {
 899             /* Same as below general case, except that thanks to SSE2/AVX2 */
 900             /* we can efficiently process 8/16 columns in parallel */
 901             opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
 902             return;
 903         }
 904 #endif
 905         if (len > 1) {
 906             OPJ_INT32 c;
 907             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 908                 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
 909             }
 910             return;
 911         }
 912     } else {
 913         if (len == 1) {
 914             OPJ_INT32 c;
 915             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 916                 tiledp_col[0] /= 2;
 917             }
 918             return;
 919         }
 920
 921         if (len == 2) {
 922             OPJ_INT32 c;
 923             OPJ_INT32* out = dwt->mem;
 924             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 925                 OPJ_INT32 i;
 926                 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
 927                 const OPJ_INT32* in_odd = &tiledp_col[0];
 928
 929                 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
 930                 out[0] = in_even[0] + out[1];
 931
 932                 for (i = 0; i < len; ++i) {
 933                     tiledp_col[(OPJ_SIZE_T)i * stride] = out[i];
 934                 }
 935             }
 936
 937             return;
 938         }
 939
 940 #if (defined(__SSE2__) || defined(__AVX2__))
 941         if (len > 2 && nb_cols == PARALLEL_COLS_53) {
 942             /* Same as below general case, except that thanks to SSE2/AVX2 */
 943             /* we can efficiently process 8/16 columns in parallel */
 944             opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
 945             return;
 946         }
 947 #endif
 948         if (len > 2) {
 949             OPJ_INT32 c;
 950             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 951                 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
 952             }
 953             return;
 954         }
 955     }
 956 #endif
 957 }
 958
 959 #if 0
 960 static void opj_dwt_encode_step1(OPJ_FLOAT32* fw,
 961                                  OPJ_UINT32 end,
 962                                  const OPJ_FLOAT32 c)
 963 {
 964     OPJ_UINT32 i = 0;
 965     for (; i < end; ++i) {
 966         fw[0] *= c;
 967         fw += 2;
 968     }
 969 }
 970 #else
 971 static void opj_dwt_encode_step1_combined(OPJ_FLOAT32* fw,
 972         OPJ_UINT32 iters_c1,
 973         OPJ_UINT32 iters_c2,
 974         const OPJ_FLOAT32 c1,
 975         const OPJ_FLOAT32 c2)
 976 {
 977     OPJ_UINT32 i = 0;
 978     const OPJ_UINT32 iters_common =  opj_uint_min(iters_c1, iters_c2);
 979     assert((((OPJ_SIZE_T)fw) & 0xf) == 0);
 980     assert(opj_int_abs((OPJ_INT32)iters_c1 - (OPJ_INT32)iters_c2) <= 1);
 981     for (; i + 3 < iters_common; i += 4) {
 982 #ifdef __SSE__
 983         const __m128 vcst = _mm_set_ps(c2, c1, c2, c1);
 984         *(__m128*)fw = _mm_mul_ps(*(__m128*)fw, vcst);
 985         *(__m128*)(fw + 4) = _mm_mul_ps(*(__m128*)(fw + 4), vcst);
 986 #else
 987         fw[0] *= c1;
 988         fw[1] *= c2;
 989         fw[2] *= c1;
 990         fw[3] *= c2;
 991         fw[4] *= c1;
 992         fw[5] *= c2;
 993         fw[6] *= c1;
 994         fw[7] *= c2;
 995 #endif
 996         fw += 8;
 997     }
 998     for (; i < iters_common; i++) {
 999         fw[0] *= c1;
1000         fw[1] *= c2;
1001         fw += 2;
1002     }
1003     if (i < iters_c1) {
1004         fw[0] *= c1;
1005     } else if (i < iters_c2) {
1006         fw[1] *= c2;
1007     }
1008 }
1009
1010 #endif
1011
1012 static void opj_dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
1013                                  OPJ_UINT32 end,
1014                                  OPJ_UINT32 m,
1015                                  OPJ_FLOAT32 c)
1016 {
1017     OPJ_UINT32 i;
1018     OPJ_UINT32 imax = opj_uint_min(end, m);
1019     if (imax > 0) {
1020         fw[-1] += (fl[0] + fw[0]) * c;
1021         fw += 2;
1022         i = 1;
1023         for (; i + 3 < imax; i += 4) {
1024             fw[-1] += (fw[-2] + fw[0]) * c;
1025             fw[1] += (fw[0] + fw[2]) * c;
1026             fw[3] += (fw[2] + fw[4]) * c;
1027             fw[5] += (fw[4] + fw[6]) * c;
1028             fw += 8;
1029         }
1030         for (; i < imax; ++i) {
1031             fw[-1] += (fw[-2] + fw[0]) * c;
1032             fw += 2;
1033         }
1034     }
1035     if (m < end) {
1036         assert(m + 1 == end);
1037         fw[-1] += (2 * fw[-2]) * c;
1038     }
1039 }
1040
1041 static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn,
1042                                   OPJ_INT32 cas)
1043 {
1044     OPJ_FLOAT32* w = (OPJ_FLOAT32*)aIn;
1045     OPJ_INT32 a, b;
1046     assert(dn + sn > 1);
1047     if (cas == 0) {
1048         a = 0;
1049         b = 1;
1050     } else {
1051         a = 1;
1052         b = 0;
1053     }
1054     opj_dwt_encode_step2(w + a, w + b + 1,
1055                          (OPJ_UINT32)dn,
1056                          (OPJ_UINT32)opj_int_min(dn, sn - b),
1057                          opj_dwt_alpha);
1058     opj_dwt_encode_step2(w + b, w + a + 1,
1059                          (OPJ_UINT32)sn,
1060                          (OPJ_UINT32)opj_int_min(sn, dn - a),
1061                          opj_dwt_beta);
1062     opj_dwt_encode_step2(w + a, w + b + 1,
1063                          (OPJ_UINT32)dn,
1064                          (OPJ_UINT32)opj_int_min(dn, sn - b),
1065                          opj_dwt_gamma);
1066     opj_dwt_encode_step2(w + b, w + a + 1,
1067                          (OPJ_UINT32)sn,
1068                          (OPJ_UINT32)opj_int_min(sn, dn - a),
1069                          opj_dwt_delta);
1070 #if 0
1071     opj_dwt_encode_step1(w + b, (OPJ_UINT32)dn,
1072                          opj_K);
1073     opj_dwt_encode_step1(w + a, (OPJ_UINT32)sn,
1074                          opj_invK);
1075 #else
1076     if (a == 0) {
1077         opj_dwt_encode_step1_combined(w,
1078                                       (OPJ_UINT32)sn,
1079                                       (OPJ_UINT32)dn,
1080                                       opj_invK,
1081                                       opj_K);
1082     } else {
1083         opj_dwt_encode_step1_combined(w,
1084                                       (OPJ_UINT32)dn,
1085                                       (OPJ_UINT32)sn,
1086                                       opj_K,
1087                                       opj_invK);
1088     }
1089 #endif
1090 }
1091
1092 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
1093                                     opj_stepsize_t *bandno_stepsize)
1094 {
1095     OPJ_INT32 p, n;
1096     p = opj_int_floorlog2(stepsize) - 13;
1097     n = 11 - opj_int_floorlog2(stepsize);
1098     bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
1099     bandno_stepsize->expn = numbps - p;
1100 }
1101
1102 /*
1103 ==========================================================
1104    DWT interface
1105 ==========================================================
1106 */
1107
1108 /** Process one line for the horizontal pass of the 5x3 forward transform */
1109 static
1110 void opj_dwt_encode_and_deinterleave_h_one_row(void* rowIn,
1111         void* tmpIn,
1112         OPJ_UINT32 width,
1113         OPJ_BOOL even)
1114 {
1115     OPJ_INT32* OPJ_RESTRICT row = (OPJ_INT32*)rowIn;
1116     OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32*)tmpIn;
1117     const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1);
1118     const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn);
1119
1120     if (even) {
1121         if (width > 1) {
1122             OPJ_INT32 i;
1123             for (i = 0; i < sn - 1; i++) {
1124                 tmp[sn + i] = row[2 * i + 1] - ((row[(i) * 2] + row[(i + 1) * 2]) >> 1);
1125             }
1126             if ((width % 2) == 0) {
1127                 tmp[sn + i] = row[2 * i + 1] - row[(i) * 2];
1128             }
1129             row[0] += (tmp[sn] + tmp[sn] + 2) >> 2;
1130             for (i = 1; i < dn; i++) {
1131                 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + i] + 2) >> 2);
1132             }
1133             if ((width % 2) == 1) {
1134                 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + (i - 1)] + 2) >> 2);
1135             }
1136             memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32));
1137         }
1138     } else {
1139         if (width == 1) {
1140             row[0] *= 2;
1141         } else {
1142             OPJ_INT32 i;
1143             tmp[sn + 0] = row[0] - row[1];
1144             for (i = 1; i < sn; i++) {
1145                 tmp[sn + i] = row[2 * i] - ((row[2 * i + 1] + row[2 * (i - 1) + 1]) >> 1);
1146             }
1147             if ((width % 2) == 1) {
1148                 tmp[sn + i] = row[2 * i] - row[2 * (i - 1) + 1];
1149             }
1150
1151             for (i = 0; i < dn - 1; i++) {
1152                 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i + 1] + 2) >> 2);
1153             }
1154             if ((width % 2) == 0) {
1155                 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i] + 2) >> 2);
1156             }
1157             memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32));
1158         }
1159     }
1160 }
1161
1162 /** Process one line for the horizontal pass of the 9x7 forward transform */
1163 static
1164 void opj_dwt_encode_and_deinterleave_h_one_row_real(void* rowIn,
1165         void* tmpIn,
1166         OPJ_UINT32 width,
1167         OPJ_BOOL even)
1168 {
1169     OPJ_FLOAT32* OPJ_RESTRICT row = (OPJ_FLOAT32*)rowIn;
1170     OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32*)tmpIn;
1171     const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1);
1172     const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn);
1173     if (width == 1) {
1174         return;
1175     }
1176     memcpy(tmp, row, width * sizeof(OPJ_FLOAT32));
1177     opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1);
1178     opj_dwt_deinterleave_h((OPJ_INT32 * OPJ_RESTRICT)tmp,
1179                            (OPJ_INT32 * OPJ_RESTRICT)row,
1180                            dn, sn, even ? 0 : 1);
1181 }
1182
1183 typedef struct {
1184     opj_dwt_t h;
1185     OPJ_UINT32 rw; /* Width of the resolution to process */
1186     OPJ_UINT32 w; /* Width of tiledp */
1187     OPJ_INT32 * OPJ_RESTRICT tiledp;
1188     OPJ_UINT32 min_j;
1189     OPJ_UINT32 max_j;
1190     opj_encode_and_deinterleave_h_one_row_fnptr_type p_function;
1191 } opj_dwt_encode_h_job_t;
1192
1193 static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls)
1194 {
1195     OPJ_UINT32 j;
1196     opj_dwt_encode_h_job_t* job;
1197     (void)tls;
1198
1199     job = (opj_dwt_encode_h_job_t*)user_data;
1200     for (j = job->min_j; j < job->max_j; j++) {
1201         OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j * job->w;
1202         (*job->p_function)(aj, job->h.mem, job->rw,
1203                            job->h.cas == 0 ? OPJ_TRUE : OPJ_FALSE);
1204     }
1205
1206     opj_aligned_free(job->h.mem);
1207     opj_free(job);
1208 }
1209
1210 typedef struct {
1211     opj_dwt_t v;
1212     OPJ_UINT32 rh;
1213     OPJ_UINT32 w;
1214     OPJ_INT32 * OPJ_RESTRICT tiledp;
1215     OPJ_UINT32 min_j;
1216     OPJ_UINT32 max_j;
1217     opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v;
1218 } opj_dwt_encode_v_job_t;
1219
1220 static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
1221 {
1222     OPJ_UINT32 j;
1223     opj_dwt_encode_v_job_t* job;
1224     (void)tls;
1225
1226     job = (opj_dwt_encode_v_job_t*)user_data;
1227     for (j = job->min_j; j + NB_ELTS_V8 - 1 < job->max_j; j += NB_ELTS_V8) {
1228         (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
1229                                             job->v.mem,
1230                                             job->rh,
1231                                             job->v.cas == 0,
1232                                             job->w,
1233                                             NB_ELTS_V8);
1234     }
1235     if (j < job->max_j) {
1236         (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
1237                                             job->v.mem,
1238                                             job->rh,
1239                                             job->v.cas == 0,
1240                                             job->w,
1241                                             job->max_j - j);
1242     }
1243
1244     opj_aligned_free(job->v.mem);
1245     opj_free(job);
1246 }
1247
1248 /** Fetch up to cols <= NB_ELTS_V8 for each line, and put them in tmpOut */
1249 /* that has a NB_ELTS_V8 interleave factor. */
1250 static void opj_dwt_fetch_cols_vertical_pass(const void *arrayIn,
1251         void *tmpOut,
1252         OPJ_UINT32 height,
1253         OPJ_UINT32 stride_width,
1254         OPJ_UINT32 cols)
1255 {
1256     const OPJ_INT32* OPJ_RESTRICT array = (const OPJ_INT32 * OPJ_RESTRICT)arrayIn;
1257     OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpOut;
1258     if (cols == NB_ELTS_V8) {
1259         OPJ_UINT32 k;
1260         for (k = 0; k < height; ++k) {
1261             memcpy(tmp + NB_ELTS_V8 * k,
1262                    array + k * stride_width,
1263                    NB_ELTS_V8 * sizeof(OPJ_INT32));
1264         }
1265     } else {
1266         OPJ_UINT32 k;
1267         for (k = 0; k < height; ++k) {
1268             OPJ_UINT32 c;
1269             for (c = 0; c < cols; c++) {
1270                 tmp[NB_ELTS_V8 * k + c] = array[c + k * stride_width];
1271             }
1272             for (; c < NB_ELTS_V8; c++) {
1273                 tmp[NB_ELTS_V8 * k + c] = 0;
1274             }
1275         }
1276     }
1277 }
1278
1279 /* Deinterleave result of forward transform, where cols <= NB_ELTS_V8 */
1280 /* and src contains NB_ELTS_V8 consecutive values for up to NB_ELTS_V8 */
1281 /* columns. */
1282 static INLINE void opj_dwt_deinterleave_v_cols(
1283     const OPJ_INT32 * OPJ_RESTRICT src,
1284     OPJ_INT32 * OPJ_RESTRICT dst,
1285     OPJ_INT32 dn,
1286     OPJ_INT32 sn,
1287     OPJ_UINT32 stride_width,
1288     OPJ_INT32 cas,
1289     OPJ_UINT32 cols)
1290 {
1291     OPJ_INT32 k;
1292     OPJ_INT32 i = sn;
1293     OPJ_INT32 * OPJ_RESTRICT l_dest = dst;
1294     const OPJ_INT32 * OPJ_RESTRICT l_src = src + cas * NB_ELTS_V8;
1295     OPJ_UINT32 c;
1296
1297     for (k = 0; k < 2; k++) {
1298         while (i--) {
1299             if (cols == NB_ELTS_V8) {
1300                 memcpy(l_dest, l_src, NB_ELTS_V8 * sizeof(OPJ_INT32));
1301             } else {
1302                 c = 0;
1303                 switch (cols) {
1304                 case 7:
1305                     l_dest[c] = l_src[c];
1306                     c++; /* fallthru */
1307                 case 6:
1308                     l_dest[c] = l_src[c];
1309                     c++; /* fallthru */
1310                 case 5:
1311                     l_dest[c] = l_src[c];
1312                     c++; /* fallthru */
1313                 case 4:
1314                     l_dest[c] = l_src[c];
1315                     c++; /* fallthru */
1316                 case 3:
1317                     l_dest[c] = l_src[c];
1318                     c++; /* fallthru */
1319                 case 2:
1320                     l_dest[c] = l_src[c];
1321                     c++; /* fallthru */
1322                 default:
1323                     l_dest[c] = l_src[c];
1324                     break;
1325                 }
1326             }
1327             l_dest += stride_width;
1328             l_src += 2 * NB_ELTS_V8;
1329         }
1330
1331         l_dest = dst + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)stride_width;
1332         l_src = src + (1 - cas) * NB_ELTS_V8;
1333         i = dn;
1334     }
1335 }
1336
1337
1338 /* Forward 5-3 transform, for the vertical pass, processing cols columns */
1339 /* where cols <= NB_ELTS_V8 */
1340 static void opj_dwt_encode_and_deinterleave_v(
1341     void *arrayIn,
1342     void *tmpIn,
1343     OPJ_UINT32 height,
1344     OPJ_BOOL even,
1345     OPJ_UINT32 stride_width,
1346     OPJ_UINT32 cols)
1347 {
1348     OPJ_INT32* OPJ_RESTRICT array = (OPJ_INT32 * OPJ_RESTRICT)arrayIn;
1349     OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpIn;
1350     const OPJ_UINT32 sn = (height + (even ? 1 : 0)) >> 1;
1351     const OPJ_UINT32 dn = height - sn;
1352
1353     opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols);
1354
1355 #define OPJ_Sc(i) tmp[(i)*2* NB_ELTS_V8 + c]
1356 #define OPJ_Dc(i) tmp[((1+(i)*2))* NB_ELTS_V8 + c]
1357
1358 #ifdef __SSE2__
1359     if (height == 1) {
1360         if (!even) {
1361             OPJ_UINT32 c;
1362             for (c = 0; c < NB_ELTS_V8; c++) {
1363                 tmp[c] *= 2;
1364             }
1365         }
1366     } else if (even) {
1367         OPJ_UINT32 c;
1368         OPJ_UINT32 i;
1369         i = 0;
1370         if (i + 1 < sn) {
1371             __m128i xmm_Si_0 = *(const __m128i*)(tmp + 4 * 0);
1372             __m128i xmm_Si_1 = *(const __m128i*)(tmp + 4 * 1);
1373             for (; i + 1 < sn; i++) {
1374                 __m128i xmm_Sip1_0 = *(const __m128i*)(tmp +
1375                                                        (i + 1) * 2 * NB_ELTS_V8 + 4 * 0);
1376                 __m128i xmm_Sip1_1 = *(const __m128i*)(tmp +
1377                                                        (i + 1) * 2 * NB_ELTS_V8 + 4 * 1);
1378                 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1379                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1380                 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1381                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1382                 xmm_Di_0 = _mm_sub_epi32(xmm_Di_0,
1383                                          _mm_srai_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), 1));
1384                 xmm_Di_1 = _mm_sub_epi32(xmm_Di_1,
1385                                          _mm_srai_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), 1));
1386                 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) =  xmm_Di_0;
1387                 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) =  xmm_Di_1;
1388                 xmm_Si_0 = xmm_Sip1_0;
1389                 xmm_Si_1 = xmm_Sip1_1;
1390             }
1391         }
1392         if (((height) % 2) == 0) {
1393             for (c = 0; c < NB_ELTS_V8; c++) {
1394                 OPJ_Dc(i) -= OPJ_Sc(i);
1395             }
1396         }
1397         for (c = 0; c < NB_ELTS_V8; c++) {
1398             OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2;
1399         }
1400         i = 1;
1401         if (i < dn) {
1402             __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 +
1403                                                    (i - 1) * 2) * NB_ELTS_V8 + 4 * 0);
1404             __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 +
1405                                                    (i - 1) * 2) * NB_ELTS_V8 + 4 * 1);
1406             const __m128i xmm_two = _mm_set1_epi32(2);
1407             for (; i < dn; i++) {
1408                 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1409                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1410                 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1411                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1412                 __m128i xmm_Si_0 = *(const __m128i*)(tmp +
1413                                                      (i * 2) * NB_ELTS_V8 + 4 * 0);
1414                 __m128i xmm_Si_1 = *(const __m128i*)(tmp +
1415                                                      (i * 2) * NB_ELTS_V8 + 4 * 1);
1416                 xmm_Si_0 = _mm_add_epi32(xmm_Si_0,
1417                                          _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_0, xmm_Di_0), xmm_two), 2));
1418                 xmm_Si_1 = _mm_add_epi32(xmm_Si_1,
1419                                          _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_1, xmm_Di_1), xmm_two), 2));
1420                 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0;
1421                 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1;
1422                 xmm_Dim1_0 = xmm_Di_0;
1423                 xmm_Dim1_1 = xmm_Di_1;
1424             }
1425         }
1426         if (((height) % 2) == 1) {
1427             for (c = 0; c < NB_ELTS_V8; c++) {
1428                 OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2;
1429             }
1430         }
1431     } else {
1432         OPJ_UINT32 c;
1433         OPJ_UINT32 i;
1434         for (c = 0; c < NB_ELTS_V8; c++) {
1435             OPJ_Sc(0) -= OPJ_Dc(0);
1436         }
1437         i = 1;
1438         if (i < sn) {
1439             __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 +
1440                                                    (i - 1) * 2) * NB_ELTS_V8 + 4 * 0);
1441             __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 +
1442                                                    (i - 1) * 2) * NB_ELTS_V8 + 4 * 1);
1443             for (; i < sn; i++) {
1444                 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1445                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1446                 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1447                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1448                 __m128i xmm_Si_0 = *(const __m128i*)(tmp +
1449                                                      (i * 2) * NB_ELTS_V8 + 4 * 0);
1450                 __m128i xmm_Si_1 = *(const __m128i*)(tmp +
1451                                                      (i * 2) * NB_ELTS_V8 + 4 * 1);
1452                 xmm_Si_0 = _mm_sub_epi32(xmm_Si_0,
1453                                          _mm_srai_epi32(_mm_add_epi32(xmm_Di_0, xmm_Dim1_0), 1));
1454                 xmm_Si_1 = _mm_sub_epi32(xmm_Si_1,
1455                                          _mm_srai_epi32(_mm_add_epi32(xmm_Di_1, xmm_Dim1_1), 1));
1456                 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0;
1457                 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1;
1458                 xmm_Dim1_0 = xmm_Di_0;
1459                 xmm_Dim1_1 = xmm_Di_1;
1460             }
1461         }
1462         if (((height) % 2) == 1) {
1463             for (c = 0; c < NB_ELTS_V8; c++) {
1464                 OPJ_Sc(i) -= OPJ_Dc(i - 1);
1465             }
1466         }
1467         i = 0;
1468         if (i + 1 < dn) {
1469             __m128i xmm_Si_0 = *((const __m128i*)(tmp + 4 * 0));
1470             __m128i xmm_Si_1 = *((const __m128i*)(tmp + 4 * 1));
1471             const __m128i xmm_two = _mm_set1_epi32(2);
1472             for (; i + 1 < dn; i++) {
1473                 __m128i xmm_Sip1_0 = *(const __m128i*)(tmp +
1474                                                        (i + 1) * 2 * NB_ELTS_V8 + 4 * 0);
1475                 __m128i xmm_Sip1_1 = *(const __m128i*)(tmp +
1476                                                        (i + 1) * 2 * NB_ELTS_V8 + 4 * 1);
1477                 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1478                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1479                 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1480                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1481                 xmm_Di_0 = _mm_add_epi32(xmm_Di_0,
1482                                          _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), xmm_two), 2));
1483                 xmm_Di_1 = _mm_add_epi32(xmm_Di_1,
1484                                          _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), xmm_two), 2));
1485                 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0;
1486                 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1;
1487                 xmm_Si_0 = xmm_Sip1_0;
1488                 xmm_Si_1 = xmm_Sip1_1;
1489             }
1490         }
1491         if (((height) % 2) == 0) {
1492             for (c = 0; c < NB_ELTS_V8; c++) {
1493                 OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2;
1494             }
1495         }
1496     }
1497 #else
1498     if (even) {
1499         OPJ_UINT32 c;
1500         if (height > 1) {
1501             OPJ_UINT32 i;
1502             for (i = 0; i + 1 < sn; i++) {
1503                 for (c = 0; c < NB_ELTS_V8; c++) {
1504                     OPJ_Dc(i) -= (OPJ_Sc(i) + OPJ_Sc(i + 1)) >> 1;
1505                 }
1506             }
1507             if (((height) % 2) == 0) {
1508                 for (c = 0; c < NB_ELTS_V8; c++) {
1509                     OPJ_Dc(i) -= OPJ_Sc(i);
1510                 }
1511             }
1512             for (c = 0; c < NB_ELTS_V8; c++) {
1513                 OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2;
1514             }
1515             for (i = 1; i < dn; i++) {
1516                 for (c = 0; c < NB_ELTS_V8; c++) {
1517                     OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i) + 2) >> 2;
1518                 }
1519             }
1520             if (((height) % 2) == 1) {
1521                 for (c = 0; c < NB_ELTS_V8; c++) {
1522                     OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2;
1523                 }
1524             }
1525         }
1526     } else {
1527         OPJ_UINT32 c;
1528         if (height == 1) {
1529             for (c = 0; c < NB_ELTS_V8; c++) {
1530                 OPJ_Sc(0) *= 2;
1531             }
1532         } else {
1533             OPJ_UINT32 i;
1534             for (c = 0; c < NB_ELTS_V8; c++) {
1535                 OPJ_Sc(0) -= OPJ_Dc(0);
1536             }
1537             for (i = 1; i < sn; i++) {
1538                 for (c = 0; c < NB_ELTS_V8; c++) {
1539                     OPJ_Sc(i) -= (OPJ_Dc(i) + OPJ_Dc(i - 1)) >> 1;
1540                 }
1541             }
1542             if (((height) % 2) == 1) {
1543                 for (c = 0; c < NB_ELTS_V8; c++) {
1544                     OPJ_Sc(i) -= OPJ_Dc(i - 1);
1545                 }
1546             }
1547             for (i = 0; i + 1 < dn; i++) {
1548                 for (c = 0; c < NB_ELTS_V8; c++) {
1549                     OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i + 1) + 2) >> 2;
1550                 }
1551             }
1552             if (((height) % 2) == 0) {
1553                 for (c = 0; c < NB_ELTS_V8; c++) {
1554                     OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2;
1555                 }
1556             }
1557         }
1558     }
1559 #endif
1560
1561     if (cols == NB_ELTS_V8) {
1562         opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn,
1563                                     stride_width, even ? 0 : 1, NB_ELTS_V8);
1564     } else {
1565         opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn,
1566                                     stride_width, even ? 0 : 1, cols);
1567     }
1568 }
1569
1570 static void opj_v8dwt_encode_step1(OPJ_FLOAT32* fw,
1571                                    OPJ_UINT32 end,
1572                                    const OPJ_FLOAT32 cst)
1573 {
1574     OPJ_UINT32 i;
1575 #ifdef __SSE__
1576     __m128* vw = (__m128*) fw;
1577     const __m128 vcst = _mm_set1_ps(cst);
1578     for (i = 0; i < end; ++i) {
1579         vw[0] = _mm_mul_ps(vw[0], vcst);
1580         vw[1] = _mm_mul_ps(vw[1], vcst);
1581         vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1582     }
1583 #else
1584     OPJ_UINT32 c;
1585     for (i = 0; i < end; ++i) {
1586         for (c = 0; c < NB_ELTS_V8; c++) {
1587             fw[i * 2 * NB_ELTS_V8 + c] *= cst;
1588         }
1589     }
1590 #endif
1591 }
1592
1593 static void opj_v8dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
1594                                    OPJ_UINT32 end,
1595                                    OPJ_UINT32 m,
1596                                    OPJ_FLOAT32 cst)
1597 {
1598     OPJ_UINT32 i;
1599     OPJ_UINT32 imax = opj_uint_min(end, m);
1600 #ifdef __SSE__
1601     __m128* vw = (__m128*) fw;
1602     __m128 vcst = _mm_set1_ps(cst);
1603     if (imax > 0) {
1604         __m128* vl = (__m128*) fl;
1605         vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), vcst));
1606         vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), vcst));
1607         vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1608         i = 1;
1609
1610         for (; i < imax; ++i) {
1611             vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), vcst));
1612             vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), vcst));
1613             vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1614         }
1615     }
1616     if (m < end) {
1617         assert(m + 1 == end);
1618         vcst = _mm_add_ps(vcst, vcst);
1619         vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(vw[-4], vcst));
1620         vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(vw[-3], vcst));
1621     }
1622 #else
1623     OPJ_INT32 c;
1624     if (imax > 0) {
1625         for (c = 0; c < NB_ELTS_V8; c++) {
1626             fw[-1 * NB_ELTS_V8 + c] += (fl[0 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) *
1627                                        cst;
1628         }
1629         fw += 2 * NB_ELTS_V8;
1630         i = 1;
1631         for (; i < imax; ++i) {
1632             for (c = 0; c < NB_ELTS_V8; c++) {
1633                 fw[-1 * NB_ELTS_V8 + c] += (fw[-2 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) *
1634                                            cst;
1635             }
1636             fw += 2 * NB_ELTS_V8;
1637         }
1638     }
1639     if (m < end) {
1640         assert(m + 1 == end);
1641         for (c = 0; c < NB_ELTS_V8; c++) {
1642             fw[-1 * NB_ELTS_V8 + c] += (2 * fw[-2 * NB_ELTS_V8 + c]) * cst;
1643         }
1644     }
1645 #endif
1646 }
1647
1648 /* Forward 9-7 transform, for the vertical pass, processing cols columns */
1649 /* where cols <= NB_ELTS_V8 */
1650 static void opj_dwt_encode_and_deinterleave_v_real(
1651     void *arrayIn,
1652     void *tmpIn,
1653     OPJ_UINT32 height,
1654     OPJ_BOOL even,
1655     OPJ_UINT32 stride_width,
1656     OPJ_UINT32 cols)
1657 {
1658     OPJ_FLOAT32* OPJ_RESTRICT array = (OPJ_FLOAT32 * OPJ_RESTRICT)arrayIn;
1659     OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32 * OPJ_RESTRICT)tmpIn;
1660     const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 1 : 0)) >> 1);
1661     const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn);
1662     OPJ_INT32 a, b;
1663
1664     if (height == 1) {
1665         return;
1666     }
1667
1668     opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols);
1669
1670     if (even) {
1671         a = 0;
1672         b = 1;
1673     } else {
1674         a = 1;
1675         b = 0;
1676     }
1677     opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8,
1678                            tmp + (b + 1) * NB_ELTS_V8,
1679                            (OPJ_UINT32)dn,
1680                            (OPJ_UINT32)opj_int_min(dn, sn - b),
1681                            opj_dwt_alpha);
1682     opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8,
1683                            tmp + (a + 1) * NB_ELTS_V8,
1684                            (OPJ_UINT32)sn,
1685                            (OPJ_UINT32)opj_int_min(sn, dn - a),
1686                            opj_dwt_beta);
1687     opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8,
1688                            tmp + (b + 1) * NB_ELTS_V8,
1689                            (OPJ_UINT32)dn,
1690                            (OPJ_UINT32)opj_int_min(dn, sn - b),
1691                            opj_dwt_gamma);
1692     opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8,
1693                            tmp + (a + 1) * NB_ELTS_V8,
1694                            (OPJ_UINT32)sn,
1695                            (OPJ_UINT32)opj_int_min(sn, dn - a),
1696                            opj_dwt_delta);
1697     opj_v8dwt_encode_step1(tmp + b * NB_ELTS_V8, (OPJ_UINT32)dn,
1698                            opj_K);
1699     opj_v8dwt_encode_step1(tmp + a * NB_ELTS_V8, (OPJ_UINT32)sn,
1700                            opj_invK);
1701
1702
1703     if (cols == NB_ELTS_V8) {
1704         opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp,
1705                                     (OPJ_INT32*)array,
1706                                     (OPJ_INT32)dn, (OPJ_INT32)sn,
1707                                     stride_width, even ? 0 : 1, NB_ELTS_V8);
1708     } else {
1709         opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp,
1710                                     (OPJ_INT32*)array,
1711                                     (OPJ_INT32)dn, (OPJ_INT32)sn,
1712                                     stride_width, even ? 0 : 1, cols);
1713     }
1714 }
1715
1716
1717 /* <summary>                            */
1718 /* Forward 5-3 wavelet transform in 2-D. */
1719 /* </summary>                           */
1720 static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
1721         opj_tcd_tilecomp_t * tilec,
1722         opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
1723         opj_encode_and_deinterleave_h_one_row_fnptr_type
1724         p_encode_and_deinterleave_h_one_row)
1725 {
1726     OPJ_INT32 i;
1727     OPJ_INT32 *bj = 00;
1728     OPJ_UINT32 w;
1729     OPJ_INT32 l;
1730
1731     OPJ_SIZE_T l_data_size;
1732
1733     opj_tcd_resolution_t * l_cur_res = 0;
1734     opj_tcd_resolution_t * l_last_res = 0;
1735     const int num_threads = opj_thread_pool_get_thread_count(tp);
1736     OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
1737
1738     w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
1739     l = (OPJ_INT32)tilec->numresolutions - 1;
1740
1741     l_cur_res = tilec->resolutions + l;
1742     l_last_res = l_cur_res - 1;
1743
1744     l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions);
1745     /* overflow check */
1746     if (l_data_size > (SIZE_MAX / (NB_ELTS_V8 * sizeof(OPJ_INT32)))) {
1747         /* FIXME event manager error callback */
1748         return OPJ_FALSE;
1749     }
1750     l_data_size *= NB_ELTS_V8 * sizeof(OPJ_INT32);
1751     bj = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1752     /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
1753     /* in that case, so do not error out */
1754     if (l_data_size != 0 && ! bj) {
1755         return OPJ_FALSE;
1756     }
1757     i = l;
1758
1759     while (i--) {
1760         OPJ_UINT32 j;
1761         OPJ_UINT32 rw;           /* width of the resolution level computed   */
1762         OPJ_UINT32 rh;           /* height of the resolution level computed  */
1763         OPJ_UINT32
1764         rw1;      /* width of the resolution level once lower than computed one                                       */
1765         OPJ_UINT32
1766         rh1;      /* height of the resolution level once lower than computed one                                      */
1767         OPJ_INT32 cas_col;  /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
1768         OPJ_INT32 cas_row;  /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
1769         OPJ_INT32 dn, sn;
1770
1771         rw  = (OPJ_UINT32)(l_cur_res->x1 - l_cur_res->x0);
1772         rh  = (OPJ_UINT32)(l_cur_res->y1 - l_cur_res->y0);
1773         rw1 = (OPJ_UINT32)(l_last_res->x1 - l_last_res->x0);
1774         rh1 = (OPJ_UINT32)(l_last_res->y1 - l_last_res->y0);
1775
1776         cas_row = l_cur_res->x0 & 1;
1777         cas_col = l_cur_res->y0 & 1;
1778
1779         sn = (OPJ_INT32)rh1;
1780         dn = (OPJ_INT32)(rh - rh1);
1781
1782         /* Perform vertical pass */
1783         if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) {
1784             for (j = 0; j + NB_ELTS_V8 - 1 < rw; j += NB_ELTS_V8) {
1785                 p_encode_and_deinterleave_v(tiledp + j,
1786                                             bj,
1787                                             rh,
1788                                             cas_col == 0,
1789                                             w,
1790                                             NB_ELTS_V8);
1791             }
1792             if (j < rw) {
1793                 p_encode_and_deinterleave_v(tiledp + j,
1794                                             bj,
1795                                             rh,
1796                                             cas_col == 0,
1797                                             w,
1798                                             rw - j);
1799             }
1800         }  else {
1801             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1802             OPJ_UINT32 step_j;
1803
1804             if (rw < num_jobs) {
1805                 num_jobs = rw;
1806             }
1807             step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
1808
1809             for (j = 0; j < num_jobs; j++) {
1810                 opj_dwt_encode_v_job_t* job;
1811
1812                 job = (opj_dwt_encode_v_job_t*) opj_malloc(sizeof(opj_dwt_encode_v_job_t));
1813                 if (!job) {
1814                     opj_thread_pool_wait_completion(tp, 0);
1815                     opj_aligned_free(bj);
1816                     return OPJ_FALSE;
1817                 }
1818                 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1819                 if (!job->v.mem) {
1820                     opj_thread_pool_wait_completion(tp, 0);
1821                     opj_free(job);
1822                     opj_aligned_free(bj);
1823                     return OPJ_FALSE;
1824                 }
1825                 job->v.dn = dn;
1826                 job->v.sn = sn;
1827                 job->v.cas = cas_col;
1828                 job->rh = rh;
1829                 job->w = w;
1830                 job->tiledp = tiledp;
1831                 job->min_j = j * step_j;
1832                 job->max_j = (j + 1 == num_jobs) ? rw : (j + 1) * step_j;
1833                 job->p_encode_and_deinterleave_v = p_encode_and_deinterleave_v;
1834                 opj_thread_pool_submit_job(tp, opj_dwt_encode_v_func, job);
1835             }
1836             opj_thread_pool_wait_completion(tp, 0);
1837         }
1838
1839         sn = (OPJ_INT32)rw1;
1840         dn = (OPJ_INT32)(rw - rw1);
1841
1842         /* Perform horizontal pass */
1843         if (num_threads <= 1 || rh <= 1) {
1844             for (j = 0; j < rh; j++) {
1845                 OPJ_INT32* OPJ_RESTRICT aj = tiledp + j * w;
1846                 (*p_encode_and_deinterleave_h_one_row)(aj, bj, rw,
1847                                                        cas_row == 0 ? OPJ_TRUE : OPJ_FALSE);
1848             }
1849         }  else {
1850             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1851             OPJ_UINT32 step_j;
1852
1853             if (rh < num_jobs) {
1854                 num_jobs = rh;
1855             }
1856             step_j = (rh / num_jobs);
1857
1858             for (j = 0; j < num_jobs; j++) {
1859                 opj_dwt_encode_h_job_t* job;
1860
1861                 job = (opj_dwt_encode_h_job_t*) opj_malloc(sizeof(opj_dwt_encode_h_job_t));
1862                 if (!job) {
1863                     opj_thread_pool_wait_completion(tp, 0);
1864                     opj_aligned_free(bj);
1865                     return OPJ_FALSE;
1866                 }
1867                 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1868                 if (!job->h.mem) {
1869                     opj_thread_pool_wait_completion(tp, 0);
1870                     opj_free(job);
1871                     opj_aligned_free(bj);
1872                     return OPJ_FALSE;
1873                 }
1874                 job->h.dn = dn;
1875                 job->h.sn = sn;
1876                 job->h.cas = cas_row;
1877                 job->rw = rw;
1878                 job->w = w;
1879                 job->tiledp = tiledp;
1880                 job->min_j = j * step_j;
1881                 job->max_j = (j + 1U) * step_j; /* this can overflow */
1882                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
1883                     job->max_j = rh;
1884                 }
1885                 job->p_function = p_encode_and_deinterleave_h_one_row;
1886                 opj_thread_pool_submit_job(tp, opj_dwt_encode_h_func, job);
1887             }
1888             opj_thread_pool_wait_completion(tp, 0);
1889         }
1890
1891         l_cur_res = l_last_res;
1892
1893         --l_last_res;
1894     }
1895
1896     opj_aligned_free(bj);
1897     return OPJ_TRUE;
1898 }
1899
1900 /* Forward 5-3 wavelet transform in 2-D. */
1901 /* </summary>                           */
1902 OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd,
1903                         opj_tcd_tilecomp_t * tilec)
1904 {
1905     return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec,
1906                                     opj_dwt_encode_and_deinterleave_v,
1907                                     opj_dwt_encode_and_deinterleave_h_one_row);
1908 }
1909
1910 /* <summary>                            */
1911 /* Inverse 5-3 wavelet transform in 2-D. */
1912 /* </summary>                           */
1913 OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec,
1914                         OPJ_UINT32 numres)
1915 {
1916     if (p_tcd->whole_tile_decoding) {
1917         return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres);
1918     } else {
1919         return opj_dwt_decode_partial_tile(tilec, numres);
1920     }
1921 }
1922
1923 /* <summary>                */
1924 /* Get norm of 5-3 wavelet. */
1925 /* </summary>               */
1926 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
1927 {
1928     /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1929     /* but the array should really be extended up to 33 resolution levels */
1930     /* See https://github.com/uclouvain/openjpeg/issues/493 */
1931     if (orient == 0 && level >= 10) {
1932         level = 9;
1933     } else if (orient > 0 && level >= 9) {
1934         level = 8;
1935     }
1936     return opj_dwt_norms[orient][level];
1937 }
1938
1939 /* <summary>                             */
1940 /* Forward 9-7 wavelet transform in 2-D. */
1941 /* </summary>                            */
1942 OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd,
1943                              opj_tcd_tilecomp_t * tilec)
1944 {
1945     return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec,
1946                                     opj_dwt_encode_and_deinterleave_v_real,
1947                                     opj_dwt_encode_and_deinterleave_h_one_row_real);
1948 }
1949
1950 /* <summary>                */
1951 /* Get norm of 9-7 wavelet. */
1952 /* </summary>               */
1953 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
1954 {
1955     /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1956     /* but the array should really be extended up to 33 resolution levels */
1957     /* See https://github.com/uclouvain/openjpeg/issues/493 */
1958     if (orient == 0 && level >= 10) {
1959         level = 9;
1960     } else if (orient > 0 && level >= 9) {
1961         level = 8;
1962     }
1963     return opj_dwt_norms_real[orient][level];
1964 }
1965
1966 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
1967 {
1968     OPJ_UINT32 numbands, bandno;
1969     numbands = 3 * tccp->numresolutions - 2;
1970     for (bandno = 0; bandno < numbands; bandno++) {
1971         OPJ_FLOAT64 stepsize;
1972         OPJ_UINT32 resno, level, orient, gain;
1973
1974         resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
1975         orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
1976         level = tccp->numresolutions - 1 - resno;
1977         gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) ||
1978                                           (orient == 2)) ? 1 : 2));
1979         if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
1980             stepsize = 1.0;
1981         } else {
1982             OPJ_FLOAT64 norm = opj_dwt_getnorm_real(level, orient);
1983             stepsize = (1 << (gain)) / norm;
1984         }
1985         opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0),
1986                                 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);
1987     }
1988 }
1989
1990 /* <summary>                             */
1991 /* Determine maximum computed resolution level for inverse wavelet transform */
1992 /* </summary>                            */
1993 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
1994         OPJ_UINT32 i)
1995 {
1996     OPJ_UINT32 mr   = 0;
1997     OPJ_UINT32 w;
1998     while (--i) {
1999         ++r;
2000         if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) {
2001             mr = w ;
2002         }
2003         if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) {
2004             mr = w ;
2005         }
2006     }
2007     return mr ;
2008 }
2009
2010 typedef struct {
2011     opj_dwt_t h;
2012     OPJ_UINT32 rw;
2013     OPJ_UINT32 w;
2014     OPJ_INT32 * OPJ_RESTRICT tiledp;
2015     OPJ_UINT32 min_j;
2016     OPJ_UINT32 max_j;
2017 } opj_dwt_decode_h_job_t;
2018
2019 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
2020 {
2021     OPJ_UINT32 j;
2022     opj_dwt_decode_h_job_t* job;
2023     (void)tls;
2024
2025     job = (opj_dwt_decode_h_job_t*)user_data;
2026     for (j = job->min_j; j < job->max_j; j++) {
2027         opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
2028     }
2029
2030     opj_aligned_free(job->h.mem);
2031     opj_free(job);
2032 }
2033
2034 typedef struct {
2035     opj_dwt_t v;
2036     OPJ_UINT32 rh;
2037     OPJ_UINT32 w;
2038     OPJ_INT32 * OPJ_RESTRICT tiledp;
2039     OPJ_UINT32 min_j;
2040     OPJ_UINT32 max_j;
2041 } opj_dwt_decode_v_job_t;
2042
2043 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
2044 {
2045     OPJ_UINT32 j;
2046     opj_dwt_decode_v_job_t* job;
2047     (void)tls;
2048
2049     job = (opj_dwt_decode_v_job_t*)user_data;
2050     for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
2051             j += PARALLEL_COLS_53) {
2052         opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
2053                      PARALLEL_COLS_53);
2054     }
2055     if (j < job->max_j)
2056         opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
2057                      (OPJ_INT32)(job->max_j - j));
2058
2059     opj_aligned_free(job->v.mem);
2060     opj_free(job);
2061 }
2062
2063
2064 /* <summary>                            */
2065 /* Inverse wavelet transform in 2-D.    */
2066 /* </summary>                           */
2067 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
2068                                     opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres)
2069 {
2070     opj_dwt_t h;
2071     opj_dwt_t v;
2072
2073     opj_tcd_resolution_t* tr = tilec->resolutions;
2074
2075     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2076                                  tr->x0);  /* width of the resolution level computed */
2077     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2078                                  tr->y0);  /* height of the resolution level computed */
2079
2080     OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
2081                                                                1].x1 -
2082                                 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
2083     OPJ_SIZE_T h_mem_size;
2084     int num_threads;
2085
2086     /* Not entirely sure for the return code of w == 0 which is triggered per */
2087     /* https://github.com/uclouvain/openjpeg/issues/1505 */
2088     if (numres == 1U || w == 0) {
2089         return OPJ_TRUE;
2090     }
2091     num_threads = opj_thread_pool_get_thread_count(tp);
2092     h_mem_size = opj_dwt_max_resolution(tr, numres);
2093     /* overflow check */
2094     if (h_mem_size > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) {
2095         /* FIXME event manager error callback */
2096         return OPJ_FALSE;
2097     }
2098     /* We need PARALLEL_COLS_53 times the height of the array, */
2099     /* since for the vertical pass */
2100     /* we process PARALLEL_COLS_53 columns at a time */
2101     h_mem_size *= PARALLEL_COLS_53 * sizeof(OPJ_INT32);
2102     h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2103     if (! h.mem) {
2104         /* FIXME event manager error callback */
2105         return OPJ_FALSE;
2106     }
2107
2108     v.mem = h.mem;
2109
2110     while (--numres) {
2111         OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
2112         OPJ_UINT32 j;
2113
2114         ++tr;
2115         h.sn = (OPJ_INT32)rw;
2116         v.sn = (OPJ_INT32)rh;
2117
2118         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2119         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2120
2121         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2122         h.cas = tr->x0 % 2;
2123
2124         if (num_threads <= 1 || rh <= 1) {
2125             for (j = 0; j < rh; ++j) {
2126                 opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]);
2127             }
2128         } else {
2129             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
2130             OPJ_UINT32 step_j;
2131
2132             if (rh < num_jobs) {
2133                 num_jobs = rh;
2134             }
2135             step_j = (rh / num_jobs);
2136
2137             for (j = 0; j < num_jobs; j++) {
2138                 opj_dwt_decode_h_job_t* job;
2139
2140                 job = (opj_dwt_decode_h_job_t*) opj_malloc(sizeof(opj_dwt_decode_h_job_t));
2141                 if (!job) {
2142                     /* It would be nice to fallback to single thread case, but */
2143                     /* unfortunately some jobs may be launched and have modified */
2144                     /* tiledp, so it is not practical to recover from that error */
2145                     /* FIXME event manager error callback */
2146                     opj_thread_pool_wait_completion(tp, 0);
2147                     opj_aligned_free(h.mem);
2148                     return OPJ_FALSE;
2149                 }
2150                 job->h = h;
2151                 job->rw = rw;
2152                 job->w = w;
2153                 job->tiledp = tiledp;
2154                 job->min_j = j * step_j;
2155                 job->max_j = (j + 1U) * step_j; /* this can overflow */
2156                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
2157                     job->max_j = rh;
2158                 }
2159                 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2160                 if (!job->h.mem) {
2161                     /* FIXME event manager error callback */
2162                     opj_thread_pool_wait_completion(tp, 0);
2163                     opj_free(job);
2164                     opj_aligned_free(h.mem);
2165                     return OPJ_FALSE;
2166                 }
2167                 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
2168             }
2169             opj_thread_pool_wait_completion(tp, 0);
2170         }
2171
2172         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2173         v.cas = tr->y0 % 2;
2174
2175         if (num_threads <= 1 || rw <= 1) {
2176             for (j = 0; j + PARALLEL_COLS_53 <= rw;
2177                     j += PARALLEL_COLS_53) {
2178                 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53);
2179             }
2180             if (j < rw) {
2181                 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j));
2182             }
2183         } else {
2184             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
2185             OPJ_UINT32 step_j;
2186
2187             if (rw < num_jobs) {
2188                 num_jobs = rw;
2189             }
2190             step_j = (rw / num_jobs);
2191
2192             for (j = 0; j < num_jobs; j++) {
2193                 opj_dwt_decode_v_job_t* job;
2194
2195                 job = (opj_dwt_decode_v_job_t*) opj_malloc(sizeof(opj_dwt_decode_v_job_t));
2196                 if (!job) {
2197                     /* It would be nice to fallback to single thread case, but */
2198                     /* unfortunately some jobs may be launched and have modified */
2199                     /* tiledp, so it is not practical to recover from that error */
2200                     /* FIXME event manager error callback */
2201                     opj_thread_pool_wait_completion(tp, 0);
2202                     opj_aligned_free(v.mem);
2203                     return OPJ_FALSE;
2204                 }
2205                 job->v = v;
2206                 job->rh = rh;
2207                 job->w = w;
2208                 job->tiledp = tiledp;
2209                 job->min_j = j * step_j;
2210                 job->max_j = (j + 1U) * step_j; /* this can overflow */
2211                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
2212                     job->max_j = rw;
2213                 }
2214                 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2215                 if (!job->v.mem) {
2216                     /* FIXME event manager error callback */
2217                     opj_thread_pool_wait_completion(tp, 0);
2218                     opj_free(job);
2219                     opj_aligned_free(v.mem);
2220                     return OPJ_FALSE;
2221                 }
2222                 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
2223             }
2224             opj_thread_pool_wait_completion(tp, 0);
2225         }
2226     }
2227     opj_aligned_free(h.mem);
2228     return OPJ_TRUE;
2229 }
2230
2231 static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest,
2232         OPJ_INT32 cas,
2233         opj_sparse_array_int32_t* sa,
2234         OPJ_UINT32 sa_line,
2235         OPJ_UINT32 sn,
2236         OPJ_UINT32 win_l_x0,
2237         OPJ_UINT32 win_l_x1,
2238         OPJ_UINT32 win_h_x0,
2239         OPJ_UINT32 win_h_x1)
2240 {
2241     OPJ_BOOL ret;
2242     ret = opj_sparse_array_int32_read(sa,
2243                                       win_l_x0, sa_line,
2244                                       win_l_x1, sa_line + 1,
2245                                       dest + cas + 2 * win_l_x0,
2246                                       2, 0, OPJ_TRUE);
2247     assert(ret);
2248     ret = opj_sparse_array_int32_read(sa,
2249                                       sn + win_h_x0, sa_line,
2250                                       sn + win_h_x1, sa_line + 1,
2251                                       dest + 1 - cas + 2 * win_h_x0,
2252                                       2, 0, OPJ_TRUE);
2253     assert(ret);
2254     OPJ_UNUSED(ret);
2255 }
2256
2257
2258 static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
2259         OPJ_INT32 cas,
2260         opj_sparse_array_int32_t* sa,
2261         OPJ_UINT32 sa_col,
2262         OPJ_UINT32 nb_cols,
2263         OPJ_UINT32 sn,
2264         OPJ_UINT32 win_l_y0,
2265         OPJ_UINT32 win_l_y1,
2266         OPJ_UINT32 win_h_y0,
2267         OPJ_UINT32 win_h_y1)
2268 {
2269     OPJ_BOOL ret;
2270     ret  = opj_sparse_array_int32_read(sa,
2271                                        sa_col, win_l_y0,
2272                                        sa_col + nb_cols, win_l_y1,
2273                                        dest + cas * 4 + 2 * 4 * win_l_y0,
2274                                        1, 2 * 4, OPJ_TRUE);
2275     assert(ret);
2276     ret = opj_sparse_array_int32_read(sa,
2277                                       sa_col, sn + win_h_y0,
2278                                       sa_col + nb_cols, sn + win_h_y1,
2279                                       dest + (1 - cas) * 4 + 2 * 4 * win_h_y0,
2280                                       1, 2 * 4, OPJ_TRUE);
2281     assert(ret);
2282     OPJ_UNUSED(ret);
2283 }
2284
2285 static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
2286                                      OPJ_INT32 cas,
2287                                      OPJ_INT32 win_l_x0,
2288                                      OPJ_INT32 win_l_x1,
2289                                      OPJ_INT32 win_h_x0,
2290                                      OPJ_INT32 win_h_x1)
2291 {
2292     OPJ_INT32 i;
2293
2294     if (!cas) {
2295         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
2296
2297             /* Naive version is :
2298             for (i = win_l_x0; i < i_max; i++) {
2299                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2300             }
2301             for (i = win_h_x0; i < win_h_x1; i++) {
2302                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2303             }
2304             but the compiler doesn't manage to unroll it to avoid bound
2305             checking in OPJ_S_ and OPJ_D_ macros
2306             */
2307
2308             i = win_l_x0;
2309             if (i < win_l_x1) {
2310                 OPJ_INT32 i_max;
2311
2312                 /* Left-most case */
2313                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2314                 i ++;
2315
2316                 i_max = win_l_x1;
2317                 if (i_max > dn) {
2318                     i_max = dn;
2319                 }
2320                 for (; i < i_max; i++) {
2321                     /* No bound checking */
2322                     OPJ_S(i) -= (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2;
2323                 }
2324                 for (; i < win_l_x1; i++) {
2325                     /* Right-most case */
2326                     OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2327                 }
2328             }
2329
2330             i = win_h_x0;
2331             if (i < win_h_x1) {
2332                 OPJ_INT32 i_max = win_h_x1;
2333                 if (i_max >= sn) {
2334                     i_max = sn - 1;
2335                 }
2336                 for (; i < i_max; i++) {
2337                     /* No bound checking */
2338                     OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1)) >> 1;
2339                 }
2340                 for (; i < win_h_x1; i++) {
2341                     /* Right-most case */
2342                     OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2343                 }
2344             }
2345         }
2346     } else {
2347         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
2348             OPJ_S(0) /= 2;
2349         } else {
2350             for (i = win_l_x0; i < win_l_x1; i++) {
2351                 OPJ_D(i) = opj_int_sub_no_overflow(OPJ_D(i),
2352                                                    opj_int_add_no_overflow(opj_int_add_no_overflow(OPJ_SS_(i), OPJ_SS_(i + 1)),
2353                                                            2) >> 2);
2354             }
2355             for (i = win_h_x0; i < win_h_x1; i++) {
2356                 OPJ_S(i) = opj_int_add_no_overflow(OPJ_S(i),
2357                                                    opj_int_add_no_overflow(OPJ_DD_(i), OPJ_DD_(i - 1)) >> 1);
2358             }
2359         }
2360     }
2361 }
2362
2363 #define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off]
2364 #define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off]
2365 #define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
2366 #define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
2367 #define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
2368 #define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
2369
2370 static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
2371         OPJ_UINT32 nb_cols,
2372         OPJ_INT32 dn, OPJ_INT32 sn,
2373         OPJ_INT32 cas,
2374         OPJ_INT32 win_l_x0,
2375         OPJ_INT32 win_l_x1,
2376         OPJ_INT32 win_h_x0,
2377         OPJ_INT32 win_h_x1)
2378 {
2379     OPJ_INT32 i;
2380     OPJ_UINT32 off;
2381
2382     (void)nb_cols;
2383
2384     if (!cas) {
2385         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
2386
2387             /* Naive version is :
2388             for (i = win_l_x0; i < i_max; i++) {
2389                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2390             }
2391             for (i = win_h_x0; i < win_h_x1; i++) {
2392                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2393             }
2394             but the compiler doesn't manage to unroll it to avoid bound
2395             checking in OPJ_S_ and OPJ_D_ macros
2396             */
2397
2398             i = win_l_x0;
2399             if (i < win_l_x1) {
2400                 OPJ_INT32 i_max;
2401
2402                 /* Left-most case */
2403                 for (off = 0; off < 4; off++) {
2404                     OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
2405                 }
2406                 i ++;
2407
2408                 i_max = win_l_x1;
2409                 if (i_max > dn) {
2410                     i_max = dn;
2411                 }
2412
2413 #ifdef __SSE2__
2414                 if (i + 1 < i_max) {
2415                     const __m128i two = _mm_set1_epi32(2);
2416                     __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8));
2417                     for (; i + 1 < i_max; i += 2) {
2418                         /* No bound checking */
2419                         __m128i S = _mm_load_si128((__m128i * const)(a + i * 8));
2420                         __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
2421                         __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
2422                         __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
2423                         S = _mm_sub_epi32(S,
2424                                           _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2));
2425                         S1 = _mm_sub_epi32(S1,
2426                                            _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2));
2427                         _mm_store_si128((__m128i*)(a + i * 8), S);
2428                         _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1);
2429                         Dm1 = D1;
2430                     }
2431                 }
2432 #endif
2433
2434                 for (; i < i_max; i++) {
2435                     /* No bound checking */
2436                     for (off = 0; off < 4; off++) {
2437                         OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2;
2438                     }
2439                 }
2440                 for (; i < win_l_x1; i++) {
2441                     /* Right-most case */
2442                     for (off = 0; off < 4; off++) {
2443                         OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
2444                     }
2445                 }
2446             }
2447
2448             i = win_h_x0;
2449             if (i < win_h_x1) {
2450                 OPJ_INT32 i_max = win_h_x1;
2451                 if (i_max >= sn) {
2452                     i_max = sn - 1;
2453                 }
2454
2455 #ifdef __SSE2__
2456                 if (i + 1 < i_max) {
2457                     __m128i S =  _mm_load_si128((__m128i * const)(a + i * 8));
2458                     for (; i + 1 < i_max; i += 2) {
2459                         /* No bound checking */
2460                         __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
2461                         __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
2462                         __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
2463                         __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8));
2464                         D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1));
2465                         D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1));
2466                         _mm_store_si128((__m128i*)(a + 4 + i * 8), D);
2467                         _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1);
2468                         S = S2;
2469                     }
2470                 }
2471 #endif
2472
2473                 for (; i < i_max; i++) {
2474                     /* No bound checking */
2475                     for (off = 0; off < 4; off++) {
2476                         OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1;
2477                     }
2478                 }
2479                 for (; i < win_h_x1; i++) {
2480                     /* Right-most case */
2481                     for (off = 0; off < 4; off++) {
2482                         OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1;
2483                     }
2484                 }
2485             }
2486         }
2487     } else {
2488         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
2489             for (off = 0; off < 4; off++) {
2490                 OPJ_S_off(0, off) /= 2;
2491             }
2492         } else {
2493             for (i = win_l_x0; i < win_l_x1; i++) {
2494                 for (off = 0; off < 4; off++) {
2495                     OPJ_D_off(i, off) = opj_int_sub_no_overflow(
2496                                             OPJ_D_off(i, off),
2497                                             opj_int_add_no_overflow(
2498                                                 opj_int_add_no_overflow(OPJ_SS__off(i, off), OPJ_SS__off(i + 1, off)), 2) >> 2);
2499                 }
2500             }
2501             for (i = win_h_x0; i < win_h_x1; i++) {
2502                 for (off = 0; off < 4; off++) {
2503                     OPJ_S_off(i, off) = opj_int_add_no_overflow(
2504                                             OPJ_S_off(i, off),
2505                                             opj_int_add_no_overflow(OPJ_DD__off(i, off), OPJ_DD__off(i - 1, off)) >> 1);
2506                 }
2507             }
2508         }
2509     }
2510 }
2511
2512 static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
2513         OPJ_UINT32 resno,
2514         OPJ_UINT32 bandno,
2515         OPJ_UINT32 tcx0,
2516         OPJ_UINT32 tcy0,
2517         OPJ_UINT32 tcx1,
2518         OPJ_UINT32 tcy1,
2519         OPJ_UINT32* tbx0,
2520         OPJ_UINT32* tby0,
2521         OPJ_UINT32* tbx1,
2522         OPJ_UINT32* tby1)
2523 {
2524     /* Compute number of decomposition for this band. See table F-1 */
2525     OPJ_UINT32 nb = (resno == 0) ?
2526                     tilec->numresolutions - 1 :
2527                     tilec->numresolutions - resno;
2528     /* Map above tile-based coordinates to sub-band-based coordinates per */
2529     /* equation B-15 of the standard */
2530     OPJ_UINT32 x0b = bandno & 1;
2531     OPJ_UINT32 y0b = bandno >> 1;
2532     if (tbx0) {
2533         *tbx0 = (nb == 0) ? tcx0 :
2534                 (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 :
2535                 opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb);
2536     }
2537     if (tby0) {
2538         *tby0 = (nb == 0) ? tcy0 :
2539                 (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 :
2540                 opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb);
2541     }
2542     if (tbx1) {
2543         *tbx1 = (nb == 0) ? tcx1 :
2544                 (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 :
2545                 opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb);
2546     }
2547     if (tby1) {
2548         *tby1 = (nb == 0) ? tcy1 :
2549                 (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 :
2550                 opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb);
2551     }
2552 }
2553
2554 static void opj_dwt_segment_grow(OPJ_UINT32 filter_width,
2555                                  OPJ_UINT32 max_size,
2556                                  OPJ_UINT32* start,
2557                                  OPJ_UINT32* end)
2558 {
2559     *start = opj_uint_subs(*start, filter_width);
2560     *end = opj_uint_adds(*end, filter_width);
2561     *end = opj_uint_min(*end, max_size);
2562 }
2563
2564
2565 static opj_sparse_array_int32_t* opj_dwt_init_sparse_array(
2566     opj_tcd_tilecomp_t* tilec,
2567     OPJ_UINT32 numres)
2568 {
2569     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2570     OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0);
2571     OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0);
2572     OPJ_UINT32 resno, bandno, precno, cblkno;
2573     opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create(
2574                                        w, h, opj_uint_min(w, 64), opj_uint_min(h, 64));
2575     if (sa == NULL) {
2576         return NULL;
2577     }
2578
2579     for (resno = 0; resno < numres; ++resno) {
2580         opj_tcd_resolution_t* res = &tilec->resolutions[resno];
2581
2582         for (bandno = 0; bandno < res->numbands; ++bandno) {
2583             opj_tcd_band_t* band = &res->bands[bandno];
2584
2585             for (precno = 0; precno < res->pw * res->ph; ++precno) {
2586                 opj_tcd_precinct_t* precinct = &band->precincts[precno];
2587                 for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) {
2588                     opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
2589                     if (cblk->decoded_data != NULL) {
2590                         OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0);
2591                         OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0);
2592                         OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0);
2593                         OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0);
2594
2595                         if (band->bandno & 1) {
2596                             opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
2597                             x += (OPJ_UINT32)(pres->x1 - pres->x0);
2598                         }
2599                         if (band->bandno & 2) {
2600                             opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
2601                             y += (OPJ_UINT32)(pres->y1 - pres->y0);
2602                         }
2603
2604                         if (!opj_sparse_array_int32_write(sa, x, y,
2605                                                           x + cblk_w, y + cblk_h,
2606                                                           cblk->decoded_data,
2607                                                           1, cblk_w, OPJ_TRUE)) {
2608                             opj_sparse_array_int32_free(sa);
2609                             return NULL;
2610                         }
2611                     }
2612                 }
2613             }
2614         }
2615     }
2616
2617     return sa;
2618 }
2619
2620
2621 static OPJ_BOOL opj_dwt_decode_partial_tile(
2622     opj_tcd_tilecomp_t* tilec,
2623     OPJ_UINT32 numres)
2624 {
2625     opj_sparse_array_int32_t* sa;
2626     opj_dwt_t h;
2627     opj_dwt_t v;
2628     OPJ_UINT32 resno;
2629     /* This value matches the maximum left/right extension given in tables */
2630     /* F.2 and F.3 of the standard. */
2631     const OPJ_UINT32 filter_width = 2U;
2632
2633     opj_tcd_resolution_t* tr = tilec->resolutions;
2634     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2635
2636     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2637                                  tr->x0);  /* width of the resolution level computed */
2638     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2639                                  tr->y0);  /* height of the resolution level computed */
2640
2641     OPJ_SIZE_T h_mem_size;
2642
2643     /* Compute the intersection of the area of interest, expressed in tile coordinates */
2644     /* with the tile coordinates */
2645     OPJ_UINT32 win_tcx0 = tilec->win_x0;
2646     OPJ_UINT32 win_tcy0 = tilec->win_y0;
2647     OPJ_UINT32 win_tcx1 = tilec->win_x1;
2648     OPJ_UINT32 win_tcy1 = tilec->win_y1;
2649
2650     if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
2651         return OPJ_TRUE;
2652     }
2653
2654     sa = opj_dwt_init_sparse_array(tilec, numres);
2655     if (sa == NULL) {
2656         return OPJ_FALSE;
2657     }
2658
2659     if (numres == 1U) {
2660         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2661                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2662                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2663                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2664                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2665                        tilec->data_win,
2666                        1, tr_max->win_x1 - tr_max->win_x0,
2667                        OPJ_TRUE);
2668         assert(ret);
2669         OPJ_UNUSED(ret);
2670         opj_sparse_array_int32_free(sa);
2671         return OPJ_TRUE;
2672     }
2673     h_mem_size = opj_dwt_max_resolution(tr, numres);
2674     /* overflow check */
2675     /* in vertical pass, we process 4 columns at a time */
2676     if (h_mem_size > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) {
2677         /* FIXME event manager error callback */
2678         opj_sparse_array_int32_free(sa);
2679         return OPJ_FALSE;
2680     }
2681
2682     h_mem_size *= 4 * sizeof(OPJ_INT32);
2683     h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2684     if (! h.mem) {
2685         /* FIXME event manager error callback */
2686         opj_sparse_array_int32_free(sa);
2687         return OPJ_FALSE;
2688     }
2689
2690     v.mem = h.mem;
2691
2692     for (resno = 1; resno < numres; resno ++) {
2693         OPJ_UINT32 i, j;
2694         /* Window of interest subband-based coordinates */
2695         OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
2696         OPJ_UINT32 win_hl_x0, win_hl_x1;
2697         OPJ_UINT32 win_lh_y0, win_lh_y1;
2698         /* Window of interest tile-resolution-based coordinates */
2699         OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
2700         /* Tile-resolution subband-based coordinates */
2701         OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
2702
2703         ++tr;
2704
2705         h.sn = (OPJ_INT32)rw;
2706         v.sn = (OPJ_INT32)rh;
2707
2708         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2709         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2710
2711         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2712         h.cas = tr->x0 % 2;
2713
2714         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2715         v.cas = tr->y0 % 2;
2716
2717         /* Get the subband coordinates for the window of interest */
2718         /* LL band */
2719         opj_dwt_get_band_coordinates(tilec, resno, 0,
2720                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2721                                      &win_ll_x0, &win_ll_y0,
2722                                      &win_ll_x1, &win_ll_y1);
2723
2724         /* HL band */
2725         opj_dwt_get_band_coordinates(tilec, resno, 1,
2726                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2727                                      &win_hl_x0, NULL, &win_hl_x1, NULL);
2728
2729         /* LH band */
2730         opj_dwt_get_band_coordinates(tilec, resno, 2,
2731                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2732                                      NULL, &win_lh_y0, NULL, &win_lh_y1);
2733
2734         /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
2735         tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
2736         tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
2737         tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
2738         tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
2739
2740         /* Subtract the origin of the bands for this tile, to the subwindow */
2741         /* of interest band coordinates, so as to get them relative to the */
2742         /* tile */
2743         win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
2744         win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
2745         win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
2746         win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
2747         win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
2748         win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
2749         win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
2750         win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
2751
2752         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
2753         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
2754
2755         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
2756         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
2757
2758         /* Compute the tile-resolution-based coordinates for the window of interest */
2759         if (h.cas == 0) {
2760             win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
2761             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
2762         } else {
2763             win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
2764             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
2765         }
2766
2767         if (v.cas == 0) {
2768             win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
2769             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
2770         } else {
2771             win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
2772             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
2773         }
2774
2775         for (j = 0; j < rh; ++j) {
2776             if ((j >= win_ll_y0 && j < win_ll_y1) ||
2777                     (j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
2778
2779                 /* Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: */
2780                 /* signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' */
2781                 /* on opj_decompress -i  ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 */
2782                 /* This is less extreme than memsetting the whole buffer to 0 */
2783                 /* although we could potentially do better with better handling of edge conditions */
2784                 if (win_tr_x1 >= 1 && win_tr_x1 < rw) {
2785                     h.mem[win_tr_x1 - 1] = 0;
2786                 }
2787                 if (win_tr_x1 < rw) {
2788                     h.mem[win_tr_x1] = 0;
2789                 }
2790
2791                 opj_dwt_interleave_partial_h(h.mem,
2792                                              h.cas,
2793                                              sa,
2794                                              j,
2795                                              (OPJ_UINT32)h.sn,
2796                                              win_ll_x0,
2797                                              win_ll_x1,
2798                                              win_hl_x0,
2799                                              win_hl_x1);
2800                 opj_dwt_decode_partial_1(h.mem, h.dn, h.sn, h.cas,
2801                                          (OPJ_INT32)win_ll_x0,
2802                                          (OPJ_INT32)win_ll_x1,
2803                                          (OPJ_INT32)win_hl_x0,
2804                                          (OPJ_INT32)win_hl_x1);
2805                 if (!opj_sparse_array_int32_write(sa,
2806                                                   win_tr_x0, j,
2807                                                   win_tr_x1, j + 1,
2808                                                   h.mem + win_tr_x0,
2809                                                   1, 0, OPJ_TRUE)) {
2810                     /* FIXME event manager error callback */
2811                     opj_sparse_array_int32_free(sa);
2812                     opj_aligned_free(h.mem);
2813                     return OPJ_FALSE;
2814                 }
2815             }
2816         }
2817
2818         for (i = win_tr_x0; i < win_tr_x1;) {
2819             OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i);
2820             opj_dwt_interleave_partial_v(v.mem,
2821                                          v.cas,
2822                                          sa,
2823                                          i,
2824                                          nb_cols,
2825                                          (OPJ_UINT32)v.sn,
2826                                          win_ll_y0,
2827                                          win_ll_y1,
2828                                          win_lh_y0,
2829                                          win_lh_y1);
2830             opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas,
2831                                               (OPJ_INT32)win_ll_y0,
2832                                               (OPJ_INT32)win_ll_y1,
2833                                               (OPJ_INT32)win_lh_y0,
2834                                               (OPJ_INT32)win_lh_y1);
2835             if (!opj_sparse_array_int32_write(sa,
2836                                               i, win_tr_y0,
2837                                               i + nb_cols, win_tr_y1,
2838                                               v.mem + 4 * win_tr_y0,
2839                                               1, 4, OPJ_TRUE)) {
2840                 /* FIXME event manager error callback */
2841                 opj_sparse_array_int32_free(sa);
2842                 opj_aligned_free(h.mem);
2843                 return OPJ_FALSE;
2844             }
2845
2846             i += nb_cols;
2847         }
2848     }
2849     opj_aligned_free(h.mem);
2850
2851     {
2852         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2853                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2854                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2855                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2856                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2857                        tilec->data_win,
2858                        1, tr_max->win_x1 - tr_max->win_x0,
2859                        OPJ_TRUE);
2860         assert(ret);
2861         OPJ_UNUSED(ret);
2862     }
2863     opj_sparse_array_int32_free(sa);
2864     return OPJ_TRUE;
2865 }
2866
2867 static void opj_v8dwt_interleave_h(opj_v8dwt_t* OPJ_RESTRICT dwt,
2868                                    OPJ_FLOAT32* OPJ_RESTRICT a,
2869                                    OPJ_UINT32 width,
2870                                    OPJ_UINT32 remaining_height)
2871 {
2872     OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas);
2873     OPJ_UINT32 i, k;
2874     OPJ_UINT32 x0 = dwt->win_l_x0;
2875     OPJ_UINT32 x1 = dwt->win_l_x1;
2876
2877     for (k = 0; k < 2; ++k) {
2878         if (remaining_height >= NB_ELTS_V8 && ((OPJ_SIZE_T) a & 0x0f) == 0 &&
2879                 ((OPJ_SIZE_T) bi & 0x0f) == 0) {
2880             /* Fast code path */
2881             for (i = x0; i < x1; ++i) {
2882                 OPJ_UINT32 j = i;
2883                 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
2884                 dst[0] = a[j];
2885                 j += width;
2886                 dst[1] = a[j];
2887                 j += width;
2888                 dst[2] = a[j];
2889                 j += width;
2890                 dst[3] = a[j];
2891                 j += width;
2892                 dst[4] = a[j];
2893                 j += width;
2894                 dst[5] = a[j];
2895                 j += width;
2896                 dst[6] = a[j];
2897                 j += width;
2898                 dst[7] = a[j];
2899             }
2900         } else {
2901             /* Slow code path */
2902             for (i = x0; i < x1; ++i) {
2903                 OPJ_UINT32 j = i;
2904                 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
2905                 dst[0] = a[j];
2906                 j += width;
2907                 if (remaining_height == 1) {
2908                     continue;
2909                 }
2910                 dst[1] = a[j];
2911                 j += width;
2912                 if (remaining_height == 2) {
2913                     continue;
2914                 }
2915                 dst[2] = a[j];
2916                 j += width;
2917                 if (remaining_height == 3) {
2918                     continue;
2919                 }
2920                 dst[3] = a[j];
2921                 j += width;
2922                 if (remaining_height == 4) {
2923                     continue;
2924                 }
2925                 dst[4] = a[j];
2926                 j += width;
2927                 if (remaining_height == 5) {
2928                     continue;
2929                 }
2930                 dst[5] = a[j];
2931                 j += width;
2932                 if (remaining_height == 6) {
2933                     continue;
2934                 }
2935                 dst[6] = a[j];
2936                 j += width;
2937                 if (remaining_height == 7) {
2938                     continue;
2939                 }
2940                 dst[7] = a[j];
2941             }
2942         }
2943
2944         bi = (OPJ_FLOAT32*)(dwt->wavelet + 1 - dwt->cas);
2945         a += dwt->sn;
2946         x0 = dwt->win_h_x0;
2947         x1 = dwt->win_h_x1;
2948     }
2949 }
2950
2951 static void opj_v8dwt_interleave_partial_h(opj_v8dwt_t* dwt,
2952         opj_sparse_array_int32_t* sa,
2953         OPJ_UINT32 sa_line,
2954         OPJ_UINT32 remaining_height)
2955 {
2956     OPJ_UINT32 i;
2957     for (i = 0; i < remaining_height; i++) {
2958         OPJ_BOOL ret;
2959         ret = opj_sparse_array_int32_read(sa,
2960                                           dwt->win_l_x0, sa_line + i,
2961                                           dwt->win_l_x1, sa_line + i + 1,
2962                                           /* Nasty cast from float* to int32* */
2963                                           (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i,
2964                                           2 * NB_ELTS_V8, 0, OPJ_TRUE);
2965         assert(ret);
2966         ret = opj_sparse_array_int32_read(sa,
2967                                           (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i,
2968                                           (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1,
2969                                           /* Nasty cast from float* to int32* */
2970                                           (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i,
2971                                           2 * NB_ELTS_V8, 0, OPJ_TRUE);
2972         assert(ret);
2973         OPJ_UNUSED(ret);
2974     }
2975 }
2976
2977 static INLINE void opj_v8dwt_interleave_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
2978         OPJ_FLOAT32* OPJ_RESTRICT a,
2979         OPJ_UINT32 width,
2980         OPJ_UINT32 nb_elts_read)
2981 {
2982     opj_v8_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas;
2983     OPJ_UINT32 i;
2984
2985     for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) {
2986         memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2987                (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2988     }
2989
2990     a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width;
2991     bi = dwt->wavelet + 1 - dwt->cas;
2992
2993     for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) {
2994         memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2995                (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2996     }
2997 }
2998
2999 static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
3000         opj_sparse_array_int32_t* sa,
3001         OPJ_UINT32 sa_col,
3002         OPJ_UINT32 nb_elts_read)
3003 {
3004     OPJ_BOOL ret;
3005     ret = opj_sparse_array_int32_read(sa,
3006                                       sa_col, dwt->win_l_x0,
3007                                       sa_col + nb_elts_read, dwt->win_l_x1,
3008                                       (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0),
3009                                       1, 2 * NB_ELTS_V8, OPJ_TRUE);
3010     assert(ret);
3011     ret = opj_sparse_array_int32_read(sa,
3012                                       sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0,
3013                                       sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1,
3014                                       (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0),
3015                                       1, 2 * NB_ELTS_V8, OPJ_TRUE);
3016     assert(ret);
3017     OPJ_UNUSED(ret);
3018 }
3019
3020 #ifdef __SSE__
3021
3022 static void opj_v8dwt_decode_step1_sse(opj_v8_t* w,
3023                                        OPJ_UINT32 start,
3024                                        OPJ_UINT32 end,
3025                                        const __m128 c)
3026 {
3027     __m128* OPJ_RESTRICT vw = (__m128*) w;
3028     OPJ_UINT32 i = start;
3029     /* To be adapted if NB_ELTS_V8 changes */
3030     vw += 4 * start;
3031     /* Note: attempt at loop unrolling x2 doesn't help */
3032     for (; i < end; ++i, vw += 4) {
3033         vw[0] = _mm_mul_ps(vw[0], c);
3034         vw[1] = _mm_mul_ps(vw[1], c);
3035     }
3036 }
3037
3038 static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w,
3039                                        OPJ_UINT32 start,
3040                                        OPJ_UINT32 end,
3041                                        OPJ_UINT32 m,
3042                                        __m128 c)
3043 {
3044     __m128* OPJ_RESTRICT vl = (__m128*) l;
3045     __m128* OPJ_RESTRICT vw = (__m128*) w;
3046     /* To be adapted if NB_ELTS_V8 changes */
3047     OPJ_UINT32 i;
3048     OPJ_UINT32 imax = opj_uint_min(end, m);
3049     if (start == 0) {
3050         if (imax >= 1) {
3051             vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), c));
3052             vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), c));
3053             vw += 4;
3054             start = 1;
3055         }
3056     } else {
3057         vw += start * 4;
3058     }
3059
3060     i = start;
3061     /* Note: attempt at loop unrolling x2 doesn't help */
3062     for (; i < imax; ++i) {
3063         vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), c));
3064         vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), c));
3065         vw += 4;
3066     }
3067     if (m < end) {
3068         assert(m + 1 == end);
3069         c = _mm_add_ps(c, c);
3070         vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(c, vw[-4]));
3071         vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(c, vw[-3]));
3072     }
3073 }
3074
3075 #else
3076
3077 static void opj_v8dwt_decode_step1(opj_v8_t* w,
3078                                    OPJ_UINT32 start,
3079                                    OPJ_UINT32 end,
3080                                    const OPJ_FLOAT32 c)
3081 {
3082     OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
3083     OPJ_UINT32 i;
3084     /* To be adapted if NB_ELTS_V8 changes */
3085     for (i = start; i < end; ++i) {
3086         fw[i * 2 * 8    ] = fw[i * 2 * 8    ] * c;
3087         fw[i * 2 * 8 + 1] = fw[i * 2 * 8 + 1] * c;
3088         fw[i * 2 * 8 + 2] = fw[i * 2 * 8 + 2] * c;
3089         fw[i * 2 * 8 + 3] = fw[i * 2 * 8 + 3] * c;
3090         fw[i * 2 * 8 + 4] = fw[i * 2 * 8 + 4] * c;
3091         fw[i * 2 * 8 + 5] = fw[i * 2 * 8 + 5] * c;
3092         fw[i * 2 * 8 + 6] = fw[i * 2 * 8 + 6] * c;
3093         fw[i * 2 * 8 + 7] = fw[i * 2 * 8 + 7] * c;
3094     }
3095 }
3096
3097 static void opj_v8dwt_decode_step2(opj_v8_t* l, opj_v8_t* w,
3098                                    OPJ_UINT32 start,
3099                                    OPJ_UINT32 end,
3100                                    OPJ_UINT32 m,
3101                                    OPJ_FLOAT32 c)
3102 {
3103     OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
3104     OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
3105     OPJ_UINT32 i;
3106     OPJ_UINT32 imax = opj_uint_min(end, m);
3107     if (start > 0) {
3108         fw += 2 * NB_ELTS_V8 * start;
3109         fl = fw - 2 * NB_ELTS_V8;
3110     }
3111     /* To be adapted if NB_ELTS_V8 changes */
3112     for (i = start; i < imax; ++i) {
3113         fw[-8] = fw[-8] + ((fl[0] + fw[0]) * c);
3114         fw[-7] = fw[-7] + ((fl[1] + fw[1]) * c);
3115         fw[-6] = fw[-6] + ((fl[2] + fw[2]) * c);
3116         fw[-5] = fw[-5] + ((fl[3] + fw[3]) * c);
3117         fw[-4] = fw[-4] + ((fl[4] + fw[4]) * c);
3118         fw[-3] = fw[-3] + ((fl[5] + fw[5]) * c);
3119         fw[-2] = fw[-2] + ((fl[6] + fw[6]) * c);
3120         fw[-1] = fw[-1] + ((fl[7] + fw[7]) * c);
3121         fl = fw;
3122         fw += 2 * NB_ELTS_V8;
3123     }
3124     if (m < end) {
3125         assert(m + 1 == end);
3126         c += c;
3127         fw[-8] = fw[-8] + fl[0] * c;
3128         fw[-7] = fw[-7] + fl[1] * c;
3129         fw[-6] = fw[-6] + fl[2] * c;
3130         fw[-5] = fw[-5] + fl[3] * c;
3131         fw[-4] = fw[-4] + fl[4] * c;
3132         fw[-3] = fw[-3] + fl[5] * c;
3133         fw[-2] = fw[-2] + fl[6] * c;
3134         fw[-1] = fw[-1] + fl[7] * c;
3135     }
3136 }
3137
3138 #endif
3139
3140 /* <summary>                             */
3141 /* Inverse 9-7 wavelet transform in 1-D. */
3142 /* </summary>                            */
3143 static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt)
3144 {
3145     OPJ_INT32 a, b;
3146     /* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */
3147     /* Historic value for 2 / opj_invK */
3148     /* Normally, we should use invK, but if we do so, we have failures in the */
3149     /* conformance test, due to MSE and peak errors significantly higher than */
3150     /* accepted value */
3151     /* Due to using two_invK instead of invK, we have to compensate in tcd.c */
3152     /* the computation of the stepsize for the non LL subbands */
3153     const float two_invK = 1.625732422f;
3154     if (dwt->cas == 0) {
3155         if (!((dwt->dn > 0) || (dwt->sn > 1))) {
3156             return;
3157         }
3158         a = 0;
3159         b = 1;
3160     } else {
3161         if (!((dwt->sn > 0) || (dwt->dn > 1))) {
3162             return;
3163         }
3164         a = 1;
3165         b = 0;
3166     }
3167 #ifdef __SSE__
3168     opj_v8dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
3169                                _mm_set1_ps(opj_K));
3170     opj_v8dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
3171                                _mm_set1_ps(two_invK));
3172     opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
3173                                dwt->win_l_x0, dwt->win_l_x1,
3174                                (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3175                                _mm_set1_ps(-opj_dwt_delta));
3176     opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
3177                                dwt->win_h_x0, dwt->win_h_x1,
3178                                (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3179                                _mm_set1_ps(-opj_dwt_gamma));
3180     opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
3181                                dwt->win_l_x0, dwt->win_l_x1,
3182                                (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3183                                _mm_set1_ps(-opj_dwt_beta));
3184     opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
3185                                dwt->win_h_x0, dwt->win_h_x1,
3186                                (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3187                                _mm_set1_ps(-opj_dwt_alpha));
3188 #else
3189     opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
3190                            opj_K);
3191     opj_v8dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
3192                            two_invK);
3193     opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
3194                            dwt->win_l_x0, dwt->win_l_x1,
3195                            (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3196                            -opj_dwt_delta);
3197     opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
3198                            dwt->win_h_x0, dwt->win_h_x1,
3199                            (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3200                            -opj_dwt_gamma);
3201     opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
3202                            dwt->win_l_x0, dwt->win_l_x1,
3203                            (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3204                            -opj_dwt_beta);
3205     opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
3206                            dwt->win_h_x0, dwt->win_h_x1,
3207                            (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3208                            -opj_dwt_alpha);
3209 #endif
3210 }
3211
3212 typedef struct {
3213     opj_v8dwt_t h;
3214     OPJ_UINT32 rw;
3215     OPJ_UINT32 w;
3216     OPJ_FLOAT32 * OPJ_RESTRICT aj;
3217     OPJ_UINT32 nb_rows;
3218 } opj_dwt97_decode_h_job_t;
3219
3220 static void opj_dwt97_decode_h_func(void* user_data, opj_tls_t* tls)
3221 {
3222     OPJ_UINT32 j;
3223     opj_dwt97_decode_h_job_t* job;
3224     OPJ_FLOAT32 * OPJ_RESTRICT aj;
3225     OPJ_UINT32 w;
3226     (void)tls;
3227
3228     job = (opj_dwt97_decode_h_job_t*)user_data;
3229     w = job->w;
3230
3231     assert((job->nb_rows % NB_ELTS_V8) == 0);
3232
3233     aj = job->aj;
3234     for (j = 0; j + NB_ELTS_V8 <= job->nb_rows; j += NB_ELTS_V8) {
3235         OPJ_UINT32 k;
3236         opj_v8dwt_interleave_h(&job->h, aj, job->w, NB_ELTS_V8);
3237         opj_v8dwt_decode(&job->h);
3238
3239         /* To be adapted if NB_ELTS_V8 changes */
3240         for (k = 0; k < job->rw; k++) {
3241             aj[k      ] = job->h.wavelet[k].f[0];
3242             aj[k + (OPJ_SIZE_T)w  ] = job->h.wavelet[k].f[1];
3243             aj[k + (OPJ_SIZE_T)w * 2] = job->h.wavelet[k].f[2];
3244             aj[k + (OPJ_SIZE_T)w * 3] = job->h.wavelet[k].f[3];
3245         }
3246         for (k = 0; k < job->rw; k++) {
3247             aj[k + (OPJ_SIZE_T)w * 4] = job->h.wavelet[k].f[4];
3248             aj[k + (OPJ_SIZE_T)w * 5] = job->h.wavelet[k].f[5];
3249             aj[k + (OPJ_SIZE_T)w * 6] = job->h.wavelet[k].f[6];
3250             aj[k + (OPJ_SIZE_T)w * 7] = job->h.wavelet[k].f[7];
3251         }
3252
3253         aj += w * NB_ELTS_V8;
3254     }
3255
3256     opj_aligned_free(job->h.wavelet);
3257     opj_free(job);
3258 }
3259
3260
3261 typedef struct {
3262     opj_v8dwt_t v;
3263     OPJ_UINT32 rh;
3264     OPJ_UINT32 w;
3265     OPJ_FLOAT32 * OPJ_RESTRICT aj;
3266     OPJ_UINT32 nb_columns;
3267 } opj_dwt97_decode_v_job_t;
3268
3269 static void opj_dwt97_decode_v_func(void* user_data, opj_tls_t* tls)
3270 {
3271     OPJ_UINT32 j;
3272     opj_dwt97_decode_v_job_t* job;
3273     OPJ_FLOAT32 * OPJ_RESTRICT aj;
3274     (void)tls;
3275
3276     job = (opj_dwt97_decode_v_job_t*)user_data;
3277
3278     assert((job->nb_columns % NB_ELTS_V8) == 0);
3279
3280     aj = job->aj;
3281     for (j = 0; j + NB_ELTS_V8 <= job->nb_columns; j += NB_ELTS_V8) {
3282         OPJ_UINT32 k;
3283
3284         opj_v8dwt_interleave_v(&job->v, aj, job->w, NB_ELTS_V8);
3285         opj_v8dwt_decode(&job->v);
3286
3287         for (k = 0; k < job->rh; ++k) {
3288             memcpy(&aj[k * (OPJ_SIZE_T)job->w], &job->v.wavelet[k],
3289                    NB_ELTS_V8 * sizeof(OPJ_FLOAT32));
3290         }
3291         aj += NB_ELTS_V8;
3292     }
3293
3294     opj_aligned_free(job->v.wavelet);
3295     opj_free(job);
3296 }
3297
3298
3299 /* <summary>                             */
3300 /* Inverse 9-7 wavelet transform in 2-D. */
3301 /* </summary>                            */
3302 static
3303 OPJ_BOOL opj_dwt_decode_tile_97(opj_thread_pool_t* tp,
3304                                 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3305                                 OPJ_UINT32 numres)
3306 {
3307     opj_v8dwt_t h;
3308     opj_v8dwt_t v;
3309
3310     opj_tcd_resolution_t* res = tilec->resolutions;
3311
3312     OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 -
3313                                  res->x0);    /* width of the resolution level computed */
3314     OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 -
3315                                  res->y0);    /* height of the resolution level computed */
3316
3317     OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
3318                                                                1].x1 -
3319                                 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
3320
3321     OPJ_SIZE_T l_data_size;
3322     const int num_threads = opj_thread_pool_get_thread_count(tp);
3323
3324     if (numres == 1) {
3325         return OPJ_TRUE;
3326     }
3327
3328     l_data_size = opj_dwt_max_resolution(res, numres);
3329     /* overflow check */
3330     if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
3331         /* FIXME event manager error callback */
3332         return OPJ_FALSE;
3333     }
3334     h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3335     if (!h.wavelet) {
3336         /* FIXME event manager error callback */
3337         return OPJ_FALSE;
3338     }
3339     v.wavelet = h.wavelet;
3340
3341     while (--numres) {
3342         OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data;
3343         OPJ_UINT32 j;
3344
3345         h.sn = (OPJ_INT32)rw;
3346         v.sn = (OPJ_INT32)rh;
3347
3348         ++res;
3349
3350         rw = (OPJ_UINT32)(res->x1 -
3351                           res->x0);   /* width of the resolution level computed */
3352         rh = (OPJ_UINT32)(res->y1 -
3353                           res->y0);   /* height of the resolution level computed */
3354
3355         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
3356         h.cas = res->x0 % 2;
3357
3358         h.win_l_x0 = 0;
3359         h.win_l_x1 = (OPJ_UINT32)h.sn;
3360         h.win_h_x0 = 0;
3361         h.win_h_x1 = (OPJ_UINT32)h.dn;
3362
3363         if (num_threads <= 1 || rh < 2 * NB_ELTS_V8) {
3364             for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
3365                 OPJ_UINT32 k;
3366                 opj_v8dwt_interleave_h(&h, aj, w, NB_ELTS_V8);
3367                 opj_v8dwt_decode(&h);
3368
3369                 /* To be adapted if NB_ELTS_V8 changes */
3370                 for (k = 0; k < rw; k++) {
3371                     aj[k      ] = h.wavelet[k].f[0];
3372                     aj[k + (OPJ_SIZE_T)w  ] = h.wavelet[k].f[1];
3373                     aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2];
3374                     aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3];
3375                 }
3376                 for (k = 0; k < rw; k++) {
3377                     aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4];
3378                     aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5];
3379                     aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6];
3380                     aj[k + (OPJ_SIZE_T)w * 7] = h.wavelet[k].f[7];
3381                 }
3382
3383                 aj += w * NB_ELTS_V8;
3384             }
3385         } else {
3386             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
3387             OPJ_UINT32 step_j;
3388
3389             if ((rh / NB_ELTS_V8) < num_jobs) {
3390                 num_jobs = rh / NB_ELTS_V8;
3391             }
3392             step_j = ((rh / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
3393             for (j = 0; j < num_jobs; j++) {
3394                 opj_dwt97_decode_h_job_t* job;
3395
3396                 job = (opj_dwt97_decode_h_job_t*) opj_malloc(sizeof(opj_dwt97_decode_h_job_t));
3397                 if (!job) {
3398                     opj_thread_pool_wait_completion(tp, 0);
3399                     opj_aligned_free(h.wavelet);
3400                     return OPJ_FALSE;
3401                 }
3402                 job->h.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3403                 if (!job->h.wavelet) {
3404                     opj_thread_pool_wait_completion(tp, 0);
3405                     opj_free(job);
3406                     opj_aligned_free(h.wavelet);
3407                     return OPJ_FALSE;
3408                 }
3409                 job->h.dn = h.dn;
3410                 job->h.sn = h.sn;
3411                 job->h.cas = h.cas;
3412                 job->h.win_l_x0 = h.win_l_x0;
3413                 job->h.win_l_x1 = h.win_l_x1;
3414                 job->h.win_h_x0 = h.win_h_x0;
3415                 job->h.win_h_x1 = h.win_h_x1;
3416                 job->rw = rw;
3417                 job->w = w;
3418                 job->aj = aj;
3419                 job->nb_rows = (j + 1 == num_jobs) ? (rh & (OPJ_UINT32)~
3420                                                       (NB_ELTS_V8 - 1)) - j * step_j : step_j;
3421                 aj += w * job->nb_rows;
3422                 opj_thread_pool_submit_job(tp, opj_dwt97_decode_h_func, job);
3423             }
3424             opj_thread_pool_wait_completion(tp, 0);
3425             j = rh & (OPJ_UINT32)~(NB_ELTS_V8 - 1);
3426         }
3427
3428         if (j < rh) {
3429             OPJ_UINT32 k;
3430             opj_v8dwt_interleave_h(&h, aj, w, rh - j);
3431             opj_v8dwt_decode(&h);
3432             for (k = 0; k < rw; k++) {
3433                 OPJ_UINT32 l;
3434                 for (l = 0; l < rh - j; l++) {
3435                     aj[k + (OPJ_SIZE_T)w  * l ] = h.wavelet[k].f[l];
3436                 }
3437             }
3438         }
3439
3440         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
3441         v.cas = res->y0 % 2;
3442         v.win_l_x0 = 0;
3443         v.win_l_x1 = (OPJ_UINT32)v.sn;
3444         v.win_h_x0 = 0;
3445         v.win_h_x1 = (OPJ_UINT32)v.dn;
3446
3447         aj = (OPJ_FLOAT32*) tilec->data;
3448         if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) {
3449             for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) {
3450                 OPJ_UINT32 k;
3451
3452                 opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8);
3453                 opj_v8dwt_decode(&v);
3454
3455                 for (k = 0; k < rh; ++k) {
3456                     memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32));
3457                 }
3458                 aj += NB_ELTS_V8;
3459             }
3460         } else {
3461             /* "bench_dwt -I" shows that scaling is poor, likely due to RAM
3462                 transfer being the limiting factor. So limit the number of
3463                 threads.
3464              */
3465             OPJ_UINT32 num_jobs = opj_uint_max((OPJ_UINT32)num_threads / 2, 2U);
3466             OPJ_UINT32 step_j;
3467
3468             if ((rw / NB_ELTS_V8) < num_jobs) {
3469                 num_jobs = rw / NB_ELTS_V8;
3470             }
3471             step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
3472             for (j = 0; j < num_jobs; j++) {
3473                 opj_dwt97_decode_v_job_t* job;
3474
3475                 job = (opj_dwt97_decode_v_job_t*) opj_malloc(sizeof(opj_dwt97_decode_v_job_t));
3476                 if (!job) {
3477                     opj_thread_pool_wait_completion(tp, 0);
3478                     opj_aligned_free(h.wavelet);
3479                     return OPJ_FALSE;
3480                 }
3481                 job->v.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3482                 if (!job->v.wavelet) {
3483                     opj_thread_pool_wait_completion(tp, 0);
3484                     opj_free(job);
3485                     opj_aligned_free(h.wavelet);
3486                     return OPJ_FALSE;
3487                 }
3488                 job->v.dn = v.dn;
3489                 job->v.sn = v.sn;
3490                 job->v.cas = v.cas;
3491                 job->v.win_l_x0 = v.win_l_x0;
3492                 job->v.win_l_x1 = v.win_l_x1;
3493                 job->v.win_h_x0 = v.win_h_x0;
3494                 job->v.win_h_x1 = v.win_h_x1;
3495                 job->rh = rh;
3496                 job->w = w;
3497                 job->aj = aj;
3498                 job->nb_columns = (j + 1 == num_jobs) ? (rw & (OPJ_UINT32)~
3499                                   (NB_ELTS_V8 - 1)) - j * step_j : step_j;
3500                 aj += job->nb_columns;
3501                 opj_thread_pool_submit_job(tp, opj_dwt97_decode_v_func, job);
3502             }
3503             opj_thread_pool_wait_completion(tp, 0);
3504         }
3505
3506         if (rw & (NB_ELTS_V8 - 1)) {
3507             OPJ_UINT32 k;
3508
3509             j = rw & (NB_ELTS_V8 - 1);
3510
3511             opj_v8dwt_interleave_v(&v, aj, w, j);
3512             opj_v8dwt_decode(&v);
3513
3514             for (k = 0; k < rh; ++k) {
3515                 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k],
3516                        (OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32));
3517             }
3518         }
3519     }
3520
3521     opj_aligned_free(h.wavelet);
3522     return OPJ_TRUE;
3523 }
3524
3525 static
3526 OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3527                                    OPJ_UINT32 numres)
3528 {
3529     opj_sparse_array_int32_t* sa;
3530     opj_v8dwt_t h;
3531     opj_v8dwt_t v;
3532     OPJ_UINT32 resno;
3533     /* This value matches the maximum left/right extension given in tables */
3534     /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */
3535     /* we currently use 3. */
3536     const OPJ_UINT32 filter_width = 4U;
3537
3538     opj_tcd_resolution_t* tr = tilec->resolutions;
3539     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
3540
3541     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
3542                                  tr->x0);    /* width of the resolution level computed */
3543     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
3544                                  tr->y0);    /* height of the resolution level computed */
3545
3546     OPJ_SIZE_T l_data_size;
3547
3548     /* Compute the intersection of the area of interest, expressed in tile coordinates */
3549     /* with the tile coordinates */
3550     OPJ_UINT32 win_tcx0 = tilec->win_x0;
3551     OPJ_UINT32 win_tcy0 = tilec->win_y0;
3552     OPJ_UINT32 win_tcx1 = tilec->win_x1;
3553     OPJ_UINT32 win_tcy1 = tilec->win_y1;
3554
3555     if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
3556         return OPJ_TRUE;
3557     }
3558
3559     sa = opj_dwt_init_sparse_array(tilec, numres);
3560     if (sa == NULL) {
3561         return OPJ_FALSE;
3562     }
3563
3564     if (numres == 1U) {
3565         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
3566                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
3567                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
3568                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
3569                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
3570                        tilec->data_win,
3571                        1, tr_max->win_x1 - tr_max->win_x0,
3572                        OPJ_TRUE);
3573         assert(ret);
3574         OPJ_UNUSED(ret);
3575         opj_sparse_array_int32_free(sa);
3576         return OPJ_TRUE;
3577     }
3578
3579     l_data_size = opj_dwt_max_resolution(tr, numres);
3580     /* overflow check */
3581     if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
3582         /* FIXME event manager error callback */
3583         opj_sparse_array_int32_free(sa);
3584         return OPJ_FALSE;
3585     }
3586     h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3587     if (!h.wavelet) {
3588         /* FIXME event manager error callback */
3589         opj_sparse_array_int32_free(sa);
3590         return OPJ_FALSE;
3591     }
3592     v.wavelet = h.wavelet;
3593
3594     for (resno = 1; resno < numres; resno ++) {
3595         OPJ_UINT32 j;
3596         /* Window of interest subband-based coordinates */
3597         OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
3598         OPJ_UINT32 win_hl_x0, win_hl_x1;
3599         OPJ_UINT32 win_lh_y0, win_lh_y1;
3600         /* Window of interest tile-resolution-based coordinates */
3601         OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
3602         /* Tile-resolution subband-based coordinates */
3603         OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
3604
3605         ++tr;
3606
3607         h.sn = (OPJ_INT32)rw;
3608         v.sn = (OPJ_INT32)rh;
3609
3610         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
3611         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
3612
3613         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
3614         h.cas = tr->x0 % 2;
3615
3616         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
3617         v.cas = tr->y0 % 2;
3618
3619         /* Get the subband coordinates for the window of interest */
3620         /* LL band */
3621         opj_dwt_get_band_coordinates(tilec, resno, 0,
3622                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3623                                      &win_ll_x0, &win_ll_y0,
3624                                      &win_ll_x1, &win_ll_y1);
3625
3626         /* HL band */
3627         opj_dwt_get_band_coordinates(tilec, resno, 1,
3628                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3629                                      &win_hl_x0, NULL, &win_hl_x1, NULL);
3630
3631         /* LH band */
3632         opj_dwt_get_band_coordinates(tilec, resno, 2,
3633                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3634                                      NULL, &win_lh_y0, NULL, &win_lh_y1);
3635
3636         /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
3637         tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
3638         tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
3639         tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
3640         tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
3641
3642         /* Subtract the origin of the bands for this tile, to the subwindow */
3643         /* of interest band coordinates, so as to get them relative to the */
3644         /* tile */
3645         win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
3646         win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
3647         win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
3648         win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
3649         win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
3650         win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
3651         win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
3652         win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
3653
3654         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
3655         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
3656
3657         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
3658         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
3659
3660         /* Compute the tile-resolution-based coordinates for the window of interest */
3661         if (h.cas == 0) {
3662             win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
3663             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
3664         } else {
3665             win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
3666             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
3667         }
3668
3669         if (v.cas == 0) {
3670             win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
3671             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
3672         } else {
3673             win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
3674             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
3675         }
3676
3677         h.win_l_x0 = win_ll_x0;
3678         h.win_l_x1 = win_ll_x1;
3679         h.win_h_x0 = win_hl_x0;
3680         h.win_h_x1 = win_hl_x1;
3681         for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
3682             if ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
3683                     (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
3684                      j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
3685                 opj_v8dwt_interleave_partial_h(&h, sa, j, opj_uint_min(NB_ELTS_V8, rh - j));
3686                 opj_v8dwt_decode(&h);
3687                 if (!opj_sparse_array_int32_write(sa,
3688                                                   win_tr_x0, j,
3689                                                   win_tr_x1, j + NB_ELTS_V8,
3690                                                   (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
3691                                                   NB_ELTS_V8, 1, OPJ_TRUE)) {
3692                     /* FIXME event manager error callback */
3693                     opj_sparse_array_int32_free(sa);
3694                     opj_aligned_free(h.wavelet);
3695                     return OPJ_FALSE;
3696                 }
3697             }
3698         }
3699
3700         if (j < rh &&
3701                 ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
3702                  (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
3703                   j < win_lh_y1 + (OPJ_UINT32)v.sn))) {
3704             opj_v8dwt_interleave_partial_h(&h, sa, j, rh - j);
3705             opj_v8dwt_decode(&h);
3706             if (!opj_sparse_array_int32_write(sa,
3707                                               win_tr_x0, j,
3708                                               win_tr_x1, rh,
3709                                               (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
3710                                               NB_ELTS_V8, 1, OPJ_TRUE)) {
3711                 /* FIXME event manager error callback */
3712                 opj_sparse_array_int32_free(sa);
3713                 opj_aligned_free(h.wavelet);
3714                 return OPJ_FALSE;
3715             }
3716         }
3717
3718         v.win_l_x0 = win_ll_y0;
3719         v.win_l_x1 = win_ll_y1;
3720         v.win_h_x0 = win_lh_y0;
3721         v.win_h_x1 = win_lh_y1;
3722         for (j = win_tr_x0; j < win_tr_x1; j += NB_ELTS_V8) {
3723             OPJ_UINT32 nb_elts = opj_uint_min(NB_ELTS_V8, win_tr_x1 - j);
3724
3725             opj_v8dwt_interleave_partial_v(&v, sa, j, nb_elts);
3726             opj_v8dwt_decode(&v);
3727
3728             if (!opj_sparse_array_int32_write(sa,
3729                                               j, win_tr_y0,
3730                                               j + nb_elts, win_tr_y1,
3731                                               (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0],
3732                                               1, NB_ELTS_V8, OPJ_TRUE)) {
3733                 /* FIXME event manager error callback */
3734                 opj_sparse_array_int32_free(sa);
3735                 opj_aligned_free(h.wavelet);
3736                 return OPJ_FALSE;
3737             }
3738         }
3739     }
3740
3741     {
3742         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
3743                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
3744                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
3745                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
3746                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
3747                        tilec->data_win,
3748                        1, tr_max->win_x1 - tr_max->win_x0,
3749                        OPJ_TRUE);
3750         assert(ret);
3751         OPJ_UNUSED(ret);
3752     }
3753     opj_sparse_array_int32_free(sa);
3754
3755     opj_aligned_free(h.wavelet);
3756     return OPJ_TRUE;
3757 }
3758
3759
3760 OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd,
3761                              opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3762                              OPJ_UINT32 numres)
3763 {
3764     if (p_tcd->whole_tile_decoding) {
3765         return opj_dwt_decode_tile_97(p_tcd->thread_pool, tilec, numres);
3766     } else {
3767         return opj_dwt_decode_partial_97(tilec, numres);
3768     }
3769 }