src/lib/openjp2/dwt.c

   1 /*
   2  * The copyright in this software is being made available under the 2-clauses
   3  * BSD License, included below. This software may be subject to other third
   4  * party and contributor rights, including patent rights, and no such rights
   5  * are granted under this license.
   6  *
   7  * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
   8  * Copyright (c) 2002-2014, Professor Benoit Macq
   9  * Copyright (c) 2001-2003, David Janssens
  10  * Copyright (c) 2002-2003, Yannick Verschueren
  11  * Copyright (c) 2003-2007, Francois-Olivier Devaux
  12  * Copyright (c) 2003-2014, Antonin Descampe
  13  * Copyright (c) 2005, Herve Drolon, FreeImage Team
  14  * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
  15  * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
  16  * Copyright (c) 2017, IntoPIX SA <support@intopix.com>
  17  * All rights reserved.
  18  *
  19  * Redistribution and use in source and binary forms, with or without
  20  * modification, are permitted provided that the following conditions
  21  * are met:
  22  * 1. Redistributions of source code must retain the above copyright
  23  *    notice, this list of conditions and the following disclaimer.
  24  * 2. Redistributions in binary form must reproduce the above copyright
  25  *    notice, this list of conditions and the following disclaimer in the
  26  *    documentation and/or other materials provided with the distribution.
  27  *
  28  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
  29  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  32  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  35  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  36  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  37  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38  * POSSIBILITY OF SUCH DAMAGE.
  39  */
  40
  41 #include <assert.h>
  42
  43 #define OPJ_SKIP_POISON
  44 #include "opj_includes.h"
  45
  46 #ifdef __SSE__
  47 #include <xmmintrin.h>
  48 #endif
  49 #ifdef __SSE2__
  50 #include <emmintrin.h>
  51 #endif
  52 #ifdef __SSSE3__
  53 #include <tmmintrin.h>
  54 #endif
  55
  56 #if defined(__GNUC__)
  57 #pragma GCC poison malloc calloc realloc free
  58 #endif
  59
  60 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
  61 /*@{*/
  62
  63 #define OPJ_WS(i) v->mem[(i)*2]
  64 #define OPJ_WD(i) v->mem[(1+(i)*2)]
  65
  66 #define PARALLEL_COLS_53     8
  67
  68 /** @name Local data structures */
  69 /*@{*/
  70
  71 typedef struct dwt_local {
  72     OPJ_INT32* mem;
  73     OPJ_INT32 dn;
  74     OPJ_INT32 sn;
  75     OPJ_INT32 cas;
  76 } opj_dwt_t;
  77
  78 typedef union {
  79     OPJ_FLOAT32 f[4];
  80 } opj_v4_t;
  81
  82 typedef struct v4dwt_local {
  83     opj_v4_t*   wavelet ;
  84     OPJ_INT32       dn ;
  85     OPJ_INT32       sn ;
  86     OPJ_INT32       cas ;
  87 } opj_v4dwt_t ;
  88
  89 static const OPJ_FLOAT32 opj_dwt_alpha =  1.586134342f; /*  12994 */
  90 static const OPJ_FLOAT32 opj_dwt_beta  =  0.052980118f; /*    434 */
  91 static const OPJ_FLOAT32 opj_dwt_gamma = -0.882911075f; /*  -7233 */
  92 static const OPJ_FLOAT32 opj_dwt_delta = -0.443506852f; /*  -3633 */
  93
  94 static const OPJ_FLOAT32 opj_K      = 1.230174105f; /*  10078 */
  95 static const OPJ_FLOAT32 opj_c13318 = 1.625732422f;
  96
  97 /*@}*/
  98
  99 /**
 100 Virtual function type for wavelet transform in 1-D
 101 */
 102 typedef void (*DWT1DFN)(const opj_dwt_t* v);
 103
 104 /** @name Local static functions */
 105 /*@{*/
 106
 107 /**
 108 Forward lazy transform (horizontal)
 109 */
 110 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
 111                                    OPJ_INT32 sn, OPJ_INT32 cas);
 112 /**
 113 Forward lazy transform (vertical)
 114 */
 115 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
 116                                    OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas);
 117 /**
 118 Forward 5-3 wavelet transform in 1-D
 119 */
 120 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
 121                              OPJ_INT32 cas);
 122 /**
 123 Forward 9-7 wavelet transform in 1-D
 124 */
 125 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
 126                                   OPJ_INT32 cas);
 127 /**
 128 Explicit calculation of the Quantization Stepsizes
 129 */
 130 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
 131                                     opj_stepsize_t *bandno_stepsize);
 132 /**
 133 Inverse wavelet transform in 2-D.
 134 */
 135 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
 136                                     opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i);
 137
 138 static OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
 139         void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32));
 140
 141 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
 142         OPJ_UINT32 i);
 143
 144 /* <summary>                             */
 145 /* Inverse 9-7 wavelet transform in 1-D. */
 146 /* </summary>                            */
 147 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt);
 148
 149 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT w,
 150                                    OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 size);
 151
 152 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT v,
 153                                    OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 nb_elts_read);
 154
 155 #ifdef __SSE__
 156 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count,
 157                                        const __m128 c);
 158
 159 static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
 160                                        OPJ_INT32 m, __m128 c);
 161
 162 #else
 163 static void opj_v4dwt_decode_step1(opj_v4_t* w, OPJ_INT32 count,
 164                                    const OPJ_FLOAT32 c);
 165
 166 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
 167                                    OPJ_INT32 m, OPJ_FLOAT32 c);
 168
 169 #endif
 170
 171 /*@}*/
 172
 173 /*@}*/
 174
 175 #define OPJ_S(i) a[(i)*2]
 176 #define OPJ_D(i) a[(1+(i)*2)]
 177 #define OPJ_S_(i) ((i)<0?OPJ_S(0):((i)>=sn?OPJ_S(sn-1):OPJ_S(i)))
 178 #define OPJ_D_(i) ((i)<0?OPJ_D(0):((i)>=dn?OPJ_D(dn-1):OPJ_D(i)))
 179 /* new */
 180 #define OPJ_SS_(i) ((i)<0?OPJ_S(0):((i)>=dn?OPJ_S(dn-1):OPJ_S(i)))
 181 #define OPJ_DD_(i) ((i)<0?OPJ_D(0):((i)>=sn?OPJ_D(sn-1):OPJ_D(i)))
 182
 183 /* <summary>                                                              */
 184 /* This table contains the norms of the 5-3 wavelets for different bands. */
 185 /* </summary>                                                             */
 186 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = {
 187     {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
 188     {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
 189     {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
 190     {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
 191 };
 192
 193 /* <summary>                                                              */
 194 /* This table contains the norms of the 9-7 wavelets for different bands. */
 195 /* </summary>                                                             */
 196 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = {
 197     {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
 198     {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
 199     {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
 200     {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
 201 };
 202
 203 /*
 204 ==========================================================
 205    local functions
 206 ==========================================================
 207 */
 208
 209 /* <summary>                             */
 210 /* Forward lazy transform (horizontal).  */
 211 /* </summary>                            */
 212 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
 213                                    OPJ_INT32 sn, OPJ_INT32 cas)
 214 {
 215     OPJ_INT32 i;
 216     OPJ_INT32 * l_dest = b;
 217     OPJ_INT32 * l_src = a + cas;
 218
 219     for (i = 0; i < sn; ++i) {
 220         *l_dest++ = *l_src;
 221         l_src += 2;
 222     }
 223
 224     l_dest = b + sn;
 225     l_src = a + 1 - cas;
 226
 227     for (i = 0; i < dn; ++i)  {
 228         *l_dest++ = *l_src;
 229         l_src += 2;
 230     }
 231 }
 232
 233 /* <summary>                             */
 234 /* Forward lazy transform (vertical).    */
 235 /* </summary>                            */
 236 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
 237                                    OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas)
 238 {
 239     OPJ_INT32 i = sn;
 240     OPJ_INT32 * l_dest = b;
 241     OPJ_INT32 * l_src = a + cas;
 242
 243     while (i--) {
 244         *l_dest = *l_src;
 245         l_dest += x;
 246         l_src += 2;
 247     } /* b[i*x]=a[2*i+cas]; */
 248
 249     l_dest = b + sn * x;
 250     l_src = a + 1 - cas;
 251
 252     i = dn;
 253     while (i--) {
 254         *l_dest = *l_src;
 255         l_dest += x;
 256         l_src += 2;
 257     } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/
 258 }
 259
 260 #ifdef STANDARD_SLOW_VERSION
 261 /* <summary>                             */
 262 /* Inverse lazy transform (horizontal).  */
 263 /* </summary>                            */
 264 static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a)
 265 {
 266     OPJ_INT32 *ai = a;
 267     OPJ_INT32 *bi = h->mem + h->cas;
 268     OPJ_INT32  i    = h->sn;
 269     while (i--) {
 270         *bi = *(ai++);
 271         bi += 2;
 272     }
 273     ai  = a + h->sn;
 274     bi  = h->mem + 1 - h->cas;
 275     i   = h->dn ;
 276     while (i--) {
 277         *bi = *(ai++);
 278         bi += 2;
 279     }
 280 }
 281
 282 /* <summary>                             */
 283 /* Inverse lazy transform (vertical).    */
 284 /* </summary>                            */
 285 static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
 286 {
 287     OPJ_INT32 *ai = a;
 288     OPJ_INT32 *bi = v->mem + v->cas;
 289     OPJ_INT32  i = v->sn;
 290     while (i--) {
 291         *bi = *ai;
 292         bi += 2;
 293         ai += x;
 294     }
 295     ai = a + (v->sn * x);
 296     bi = v->mem + 1 - v->cas;
 297     i = v->dn ;
 298     while (i--) {
 299         *bi = *ai;
 300         bi += 2;
 301         ai += x;
 302     }
 303 }
 304
 305 #endif /* STANDARD_SLOW_VERSION */
 306
 307 /* <summary>                            */
 308 /* Forward 5-3 wavelet transform in 1-D. */
 309 /* </summary>                           */
 310 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
 311                              OPJ_INT32 cas)
 312 {
 313     OPJ_INT32 i;
 314
 315     if (!cas) {
 316         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
 317             for (i = 0; i < dn; i++) {
 318                 OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
 319             }
 320             for (i = 0; i < sn; i++) {
 321                 OPJ_S(i) += (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
 322             }
 323         }
 324     } else {
 325         if (!sn && dn == 1) {       /* NEW :  CASE ONE ELEMENT */
 326             OPJ_S(0) *= 2;
 327         } else {
 328             for (i = 0; i < dn; i++) {
 329                 OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
 330             }
 331             for (i = 0; i < sn; i++) {
 332                 OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
 333             }
 334         }
 335     }
 336 }
 337
 338 #ifdef STANDARD_SLOW_VERSION
 339 /* <summary>                            */
 340 /* Inverse 5-3 wavelet transform in 1-D. */
 341 /* </summary>                           */
 342 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
 343                               OPJ_INT32 cas)
 344 {
 345     OPJ_INT32 i;
 346
 347     if (!cas) {
 348         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
 349             for (i = 0; i < sn; i++) {
 350                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
 351             }
 352             for (i = 0; i < dn; i++) {
 353                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
 354             }
 355         }
 356     } else {
 357         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
 358             OPJ_S(0) /= 2;
 359         } else {
 360             for (i = 0; i < sn; i++) {
 361                 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
 362             }
 363             for (i = 0; i < dn; i++) {
 364                 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
 365             }
 366         }
 367     }
 368 }
 369
 370 static void opj_dwt_decode_1(const opj_dwt_t *v)
 371 {
 372     opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
 373 }
 374
 375 #endif /* STANDARD_SLOW_VERSION */
 376
 377 #if !defined(STANDARD_SLOW_VERSION)
 378 static void  opj_idwt53_h_cas0(OPJ_INT32* tmp,
 379                                const OPJ_INT32 sn,
 380                                const OPJ_INT32 len,
 381                                OPJ_INT32* tiledp)
 382 {
 383     OPJ_INT32 i, j;
 384     const OPJ_INT32* in_even = &tiledp[0];
 385     const OPJ_INT32* in_odd = &tiledp[sn];
 386
 387 #ifdef TWO_PASS_VERSION
 388     /* For documentation purpose: performs lifting in two iterations, */
 389     /* but withtmp explicit interleaving */
 390
 391     assert(len > 1);
 392
 393     /* Even */
 394     tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1);
 395     for (i = 2, j = 0; i <= len - 2; i += 2, j++) {
 396         tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2);
 397     }
 398     if (len & 1) { /* if len is odd */
 399         tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1);
 400     }
 401
 402     /* Odd */
 403     for (i = 1, j = 0; i < len - 1; i += 2, j++) {
 404         tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1);
 405     }
 406     if (!(len & 1)) { /* if len is even */
 407         tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2];
 408     }
 409 #else
 410     OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
 411
 412     assert(len > 1);
 413
 414     /* Improved version of the TWO_PASS_VERSION: */
 415     /* Performs lifting in one single iteration. Saves memory */
 416     /* accesses and explicit interleaving. */
 417     s1n = in_even[0];
 418     d1n = in_odd[0];
 419     s0n = s1n - ((d1n + 1) >> 1);
 420
 421     for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
 422         d1c = d1n;
 423         s0c = s0n;
 424
 425         s1n = in_even[j];
 426         d1n = in_odd[j];
 427
 428         s0n = s1n - ((d1c + d1n + 2) >> 2);
 429
 430         tmp[i  ] = s0c;
 431         tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
 432     }
 433
 434     tmp[i] = s0n;
 435
 436     if (len & 1) {
 437         tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1);
 438         tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
 439     } else {
 440         tmp[len - 1] = d1n + s0n;
 441     }
 442 #endif
 443     memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 444 }
 445
 446 static void  opj_idwt53_h_cas1(OPJ_INT32* tmp,
 447                                const OPJ_INT32 sn,
 448                                const OPJ_INT32 len,
 449                                OPJ_INT32* tiledp)
 450 {
 451     OPJ_INT32 i, j;
 452     const OPJ_INT32* in_even = &tiledp[sn];
 453     const OPJ_INT32* in_odd = &tiledp[0];
 454
 455 #ifdef TWO_PASS_VERSION
 456     /* For documentation purpose: performs lifting in two iterations, */
 457     /* but withtmp explicit interleaving */
 458
 459     assert(len > 2);
 460
 461     /* Odd */
 462     for (i = 1, j = 0; i < len - 1; i += 2, j++) {
 463         tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2);
 464     }
 465     if (!(len & 1)) {
 466         tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1);
 467     }
 468
 469     /* Even */
 470     tmp[0] = in_even[0] + tmp[1];
 471     for (i = 2, j = 1; i < len - 1; i += 2, j++) {
 472         tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1);
 473     }
 474     if (len & 1) {
 475         tmp[len - 1] = in_even[len / 2] + tmp[len - 2];
 476     }
 477 #else
 478     OPJ_INT32 s1, s2, dc, dn;
 479
 480     assert(len > 2);
 481
 482     /* Improved version of the TWO_PASS_VERSION: */
 483     /* Performs lifting in one single iteration. Saves memory */
 484     /* accesses and explicit interleaving. */
 485
 486     s1 = in_even[1];
 487     dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
 488     tmp[0] = in_even[0] + dc;
 489
 490     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 491
 492         s2 = in_even[j + 1];
 493
 494         dn = in_odd[j] - ((s1 + s2 + 2) >> 2);
 495         tmp[i  ] = dc;
 496         tmp[i + 1] = s1 + ((dn + dc) >> 1);
 497
 498         dc = dn;
 499         s1 = s2;
 500     }
 501
 502     tmp[i] = dc;
 503
 504     if (!(len & 1)) {
 505         dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1);
 506         tmp[len - 2] = s1 + ((dn + dc) >> 1);
 507         tmp[len - 1] = dn;
 508     } else {
 509         tmp[len - 1] = s1 + dc;
 510     }
 511 #endif
 512     memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 513 }
 514
 515
 516 #endif /* !defined(STANDARD_SLOW_VERSION) */
 517
 518 /* <summary>                            */
 519 /* Inverse 5-3 wavelet transform in 1-D for one row. */
 520 /* </summary>                           */
 521 /* Performs interleave, inverse wavelet transform and copy back to buffer */
 522 static void opj_idwt53_h(const opj_dwt_t *dwt,
 523                          OPJ_INT32* tiledp)
 524 {
 525 #ifdef STANDARD_SLOW_VERSION
 526     /* For documentation purpose */
 527     opj_dwt_interleave_h(dwt, tiledp);
 528     opj_dwt_decode_1(dwt);
 529     memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32));
 530 #else
 531     const OPJ_INT32 sn = dwt->sn;
 532     const OPJ_INT32 len = sn + dwt->dn;
 533     if (dwt->cas == 0) { /* Left-most sample is on even coordinate */
 534         if (len > 1) {
 535             opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp);
 536         } else {
 537             /* Unmodified value */
 538         }
 539     } else { /* Left-most sample is on odd coordinate */
 540         if (len == 1) {
 541             tiledp[0] /= 2;
 542         } else if (len == 2) {
 543             OPJ_INT32* out = dwt->mem;
 544             const OPJ_INT32* in_even = &tiledp[sn];
 545             const OPJ_INT32* in_odd = &tiledp[0];
 546             out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
 547             out[0] = in_even[0] + out[1];
 548             memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 549         } else if (len > 2) {
 550             opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp);
 551         }
 552     }
 553 #endif
 554 }
 555
 556 #if defined(__SSE2__) && !defined(STANDARD_SLOW_VERSION)
 557
 558 /* Conveniency macros to improve the readabilty of the formulas */
 559 #define LOADU(x)   _mm_loadu_si128((const __m128i*)(x))
 560 #define STORE(x,y) _mm_store_si128((__m128i*)(x),(y))
 561 #define ADD(x,y)   _mm_add_epi32((x),(y))
 562 #define ADD3(x,y,z) ADD(ADD(x,y),z)
 563 #define SUB(x,y)   _mm_sub_epi32((x),(y))
 564 #define SAR(x,y)   _mm_srai_epi32((x),(y))
 565
 566 /** Vertical inverse 5x3 wavelet transform for 8 columns, when top-most
 567  * pixel is on even coordinate */
 568 static void opj_idwt53_v_cas0_8cols_SSE2(
 569     OPJ_INT32* tmp,
 570     const OPJ_INT32 sn,
 571     const OPJ_INT32 len,
 572     OPJ_INT32* tiledp_col,
 573     const OPJ_INT32 stride)
 574 {
 575     const OPJ_INT32* in_even = &tiledp_col[0];
 576     const OPJ_INT32* in_odd = &tiledp_col[sn * stride];
 577
 578     OPJ_INT32 i, j;
 579     __m128i d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
 580     __m128i d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
 581     const __m128i two = _mm_set1_epi32(2);
 582
 583     assert(len > 1);
 584     assert(PARALLEL_COLS_53 == 8);
 585
 586     s1n_0 = LOADU(in_even + 0);
 587     s1n_1 = LOADU(in_even + 4);
 588     d1n_0 = LOADU(in_odd);
 589     d1n_1 = LOADU(in_odd + 4);
 590
 591     /* s0n = s1n - ((d1n + 1) >> 1); <==> */
 592     /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
 593     s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
 594     s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
 595
 596     for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
 597         d1c_0 = d1n_0;
 598         s0c_0 = s0n_0;
 599         d1c_1 = d1n_1;
 600         s0c_1 = s0n_1;
 601
 602         s1n_0 = LOADU(in_even + j * stride);
 603         s1n_1 = LOADU(in_even + j * stride + 4);
 604         d1n_0 = LOADU(in_odd + j * stride);
 605         d1n_1 = LOADU(in_odd + j * stride + 4);
 606
 607         /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/
 608         s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2));
 609         s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2));
 610
 611         STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0);
 612         STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 4, s0c_1);
 613
 614         /* d1c + ((s0c + s0n) >> 1) */
 615         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
 616               ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1)));
 617         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 4,
 618               ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1)));
 619     }
 620
 621     STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0);
 622     STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 4, s0n_1);
 623
 624     if (len & 1) {
 625         __m128i tmp_len_minus_1;
 626         s1n_0 = LOADU(in_even + ((len - 1) / 2) * stride);
 627         /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
 628         tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
 629         STORE(tmp + 8 * (len - 1), tmp_len_minus_1);
 630         /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
 631         STORE(tmp + 8 * (len - 2),
 632               ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1)));
 633
 634         s1n_1 = LOADU(in_even + ((len - 1) / 2) * stride + 4);
 635         /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
 636         tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
 637         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 4, tmp_len_minus_1);
 638         /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
 639         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 4,
 640               ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1)));
 641
 642
 643     } else {
 644         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(d1n_0, s0n_0));
 645         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 4, ADD(d1n_1, s0n_1));
 646     }
 647
 648     for (i = 0; i < len; ++i) {
 649         memcpy(&tiledp_col[i * stride],
 650                &tmp[PARALLEL_COLS_53 * i],
 651                PARALLEL_COLS_53 * sizeof(OPJ_INT32));
 652     }
 653 }
 654
 655
 656 /** Vertical inverse 5x3 wavelet transform for 8 columns, when top-most
 657  * pixel is on odd coordinate */
 658 static void opj_idwt53_v_cas1_8cols_SSE2(
 659     OPJ_INT32* tmp,
 660     const OPJ_INT32 sn,
 661     const OPJ_INT32 len,
 662     OPJ_INT32* tiledp_col,
 663     const OPJ_INT32 stride)
 664 {
 665     OPJ_INT32 i, j;
 666
 667     __m128i s1_0, s2_0, dc_0, dn_0;
 668     __m128i s1_1, s2_1, dc_1, dn_1;
 669     const __m128i two = _mm_set1_epi32(2);
 670
 671     const OPJ_INT32* in_even = &tiledp_col[sn * stride];
 672     const OPJ_INT32* in_odd = &tiledp_col[0];
 673
 674     assert(len > 2);
 675     assert(PARALLEL_COLS_53 == 8);
 676
 677     s1_0 = LOADU(in_even + stride);
 678     /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
 679     dc_0 = SUB(LOADU(in_odd + 0),
 680                SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2));
 681     STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0));
 682
 683     s1_1 = LOADU(in_even + stride + 4);
 684     /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
 685     dc_1 = SUB(LOADU(in_odd + 4),
 686                SAR(ADD3(LOADU(in_even + 4), s1_1, two), 2));
 687     STORE(tmp + PARALLEL_COLS_53 * 0 + 4, ADD(LOADU(in_even + 4), dc_1));
 688
 689     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 690
 691         s2_0 = LOADU(in_even + (j + 1) * stride);
 692         s2_1 = LOADU(in_even + (j + 1) * stride + 4);
 693
 694         /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
 695         dn_0 = SUB(LOADU(in_odd + j * stride),
 696                    SAR(ADD3(s1_0, s2_0, two), 2));
 697         dn_1 = SUB(LOADU(in_odd + j * stride + 4),
 698                    SAR(ADD3(s1_1, s2_1, two), 2));
 699
 700         STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
 701         STORE(tmp + PARALLEL_COLS_53 * i + 4, dc_1);
 702
 703         /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */
 704         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
 705               ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
 706         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 4,
 707               ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
 708
 709         dc_0 = dn_0;
 710         s1_0 = s2_0;
 711         dc_1 = dn_1;
 712         s1_1 = s2_1;
 713     }
 714     STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
 715     STORE(tmp + PARALLEL_COLS_53 * i + 4, dc_1);
 716
 717     if (!(len & 1)) {
 718         /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
 719         dn_0 = SUB(LOADU(in_odd + (len / 2 - 1) * stride),
 720                    SAR(ADD3(s1_0, s1_0, two), 2));
 721         dn_1 = SUB(LOADU(in_odd + (len / 2 - 1) * stride + 4),
 722                    SAR(ADD3(s1_1, s1_1, two), 2));
 723
 724         /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */
 725         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0,
 726               ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
 727         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 4,
 728               ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
 729
 730         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0);
 731         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 4, dn_1);
 732     } else {
 733         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0));
 734         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 4, ADD(s1_1, dc_1));
 735     }
 736
 737     for (i = 0; i < len; ++i) {
 738         memcpy(&tiledp_col[i * stride],
 739                &tmp[PARALLEL_COLS_53 * i],
 740                PARALLEL_COLS_53 * sizeof(OPJ_INT32));
 741     }
 742 }
 743
 744 #undef LOADU
 745 #undef STORE
 746 #undef ADD
 747 #undef ADD3
 748 #undef SUB
 749 #undef SAR
 750
 751 #endif /* defined(__SSE2__) && !defined(STANDARD_SLOW_VERSION) */
 752
 753 #if !defined(STANDARD_SLOW_VERSION)
 754 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
 755  * pixel is on even coordinate */
 756 static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
 757                              const OPJ_INT32 sn,
 758                              const OPJ_INT32 len,
 759                              OPJ_INT32* tiledp_col,
 760                              const OPJ_INT32 stride)
 761 {
 762     OPJ_INT32 i, j;
 763     OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
 764
 765     assert(len > 1);
 766
 767     /* Performs lifting in one single iteration. Saves memory */
 768     /* accesses and explicit interleaving. */
 769
 770     s1n = tiledp_col[0];
 771     d1n = tiledp_col[sn * stride];
 772     s0n = s1n - ((d1n + 1) >> 1);
 773
 774     for (i = 0, j = 0; i < (len - 3); i += 2, j++) {
 775         d1c = d1n;
 776         s0c = s0n;
 777
 778         s1n = tiledp_col[(j + 1) * stride];
 779         d1n = tiledp_col[(sn + j + 1) * stride];
 780
 781         s0n = s1n - ((d1c + d1n + 2) >> 2);
 782
 783         tmp[i  ] = s0c;
 784         tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
 785     }
 786
 787     tmp[i] = s0n;
 788
 789     if (len & 1) {
 790         tmp[len - 1] =
 791             tiledp_col[((len - 1) / 2) * stride] -
 792             ((d1n + 1) >> 1);
 793         tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
 794     } else {
 795         tmp[len - 1] = d1n + s0n;
 796     }
 797
 798     for (i = 0; i < len; ++i) {
 799         tiledp_col[i * stride] = tmp[i];
 800     }
 801 }
 802
 803
 804 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
 805  * pixel is on odd coordinate */
 806 static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
 807                              const OPJ_INT32 sn,
 808                              const OPJ_INT32 len,
 809                              OPJ_INT32* tiledp_col,
 810                              const OPJ_INT32 stride)
 811 {
 812     OPJ_INT32 i, j;
 813     OPJ_INT32 s1, s2, dc, dn;
 814     const OPJ_INT32* in_even = &tiledp_col[sn * stride];
 815     const OPJ_INT32* in_odd = &tiledp_col[0];
 816
 817     assert(len > 2);
 818
 819     /* Performs lifting in one single iteration. Saves memory */
 820     /* accesses and explicit interleaving. */
 821
 822     s1 = in_even[stride];
 823     dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
 824     tmp[0] = in_even[0] + dc;
 825     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 826
 827         s2 = in_even[(j + 1) * stride];
 828
 829         dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2);
 830         tmp[i  ] = dc;
 831         tmp[i + 1] = s1 + ((dn + dc) >> 1);
 832
 833         dc = dn;
 834         s1 = s2;
 835     }
 836     tmp[i] = dc;
 837     if (!(len & 1)) {
 838         dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1);
 839         tmp[len - 2] = s1 + ((dn + dc) >> 1);
 840         tmp[len - 1] = dn;
 841     } else {
 842         tmp[len - 1] = s1 + dc;
 843     }
 844
 845     for (i = 0; i < len; ++i) {
 846         tiledp_col[i * stride] = tmp[i];
 847     }
 848 }
 849 #endif /* !defined(STANDARD_SLOW_VERSION) */
 850
 851 /* <summary>                            */
 852 /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */
 853 /* </summary>                           */
 854 /* Performs interleave, inverse wavelet transform and copy back to buffer */
 855 static void opj_idwt53_v(const opj_dwt_t *dwt,
 856                          OPJ_INT32* tiledp_col,
 857                          OPJ_INT32 stride,
 858                          OPJ_INT32 nb_cols)
 859 {
 860 #ifdef STANDARD_SLOW_VERSION
 861     /* For documentation purpose */
 862     OPJ_INT32 k, c;
 863     for (c = 0; c < nb_cols; c ++) {
 864         opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
 865         opj_dwt_decode_1(dwt);
 866         for (k = 0; k < dwt->sn + dwt->dn; ++k) {
 867             tiledp_col[c + k * stride] = dwt->mem[k];
 868         }
 869     }
 870 #else
 871     const OPJ_INT32 sn = dwt->sn;
 872     const OPJ_INT32 len = sn + dwt->dn;
 873     if (dwt->cas == 0) {
 874         /* If len == 1, unmodified value */
 875
 876 #if __SSE2__
 877         if (len > 1 && nb_cols == PARALLEL_COLS_53) {
 878             /* Same as below general case, except that thanks to SSE2 */
 879             /* we can efficently process 8 columns in parallel */
 880             opj_idwt53_v_cas0_8cols_SSE2(dwt->mem, sn, len, tiledp_col, stride);
 881             return;
 882         }
 883 #endif
 884         if (len > 1) {
 885             OPJ_INT32 c;
 886             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 887                 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
 888             }
 889             return;
 890         }
 891     } else {
 892         if (len == 1) {
 893             OPJ_INT32 c;
 894             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 895                 tiledp_col[0] /= 2;
 896             }
 897             return;
 898         }
 899
 900         if (len == 2) {
 901             OPJ_INT32 c;
 902             OPJ_INT32* out = dwt->mem;
 903             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 904                 OPJ_INT32 i;
 905                 const OPJ_INT32* in_even = &tiledp_col[sn * stride];
 906                 const OPJ_INT32* in_odd = &tiledp_col[0];
 907
 908                 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
 909                 out[0] = in_even[0] + out[1];
 910
 911                 for (i = 0; i < len; ++i) {
 912                     tiledp_col[i * stride] = out[i];
 913                 }
 914             }
 915
 916             return;
 917         }
 918
 919 #ifdef __SSE2__
 920         if (len > 2 && nb_cols == PARALLEL_COLS_53) {
 921             /* Same as below general case, except that thanks to SSE2 */
 922             /* we can efficently process 8 columns in parallel */
 923             opj_idwt53_v_cas1_8cols_SSE2(dwt->mem, sn, len, tiledp_col, stride);
 924             return;
 925         }
 926 #endif
 927         if (len > 2) {
 928             OPJ_INT32 c;
 929             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 930                 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
 931             }
 932             return;
 933         }
 934     }
 935 #endif
 936 }
 937
 938
 939 /* <summary>                             */
 940 /* Forward 9-7 wavelet transform in 1-D. */
 941 /* </summary>                            */
 942 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
 943                                   OPJ_INT32 cas)
 944 {
 945     OPJ_INT32 i;
 946     if (!cas) {
 947         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
 948             for (i = 0; i < dn; i++) {
 949                 OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993);
 950             }
 951             for (i = 0; i < sn; i++) {
 952                 OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434);
 953             }
 954             for (i = 0; i < dn; i++) {
 955                 OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233);
 956             }
 957             for (i = 0; i < sn; i++) {
 958                 OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633);
 959             }
 960             for (i = 0; i < dn; i++) {
 961                 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038);    /*5038 */
 962             }
 963             for (i = 0; i < sn; i++) {
 964                 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659);    /*6660 */
 965             }
 966         }
 967     } else {
 968         if ((sn > 0) || (dn > 1)) { /* NEW :  CASE ONE ELEMENT */
 969             for (i = 0; i < dn; i++) {
 970                 OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993);
 971             }
 972             for (i = 0; i < sn; i++) {
 973                 OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434);
 974             }
 975             for (i = 0; i < dn; i++) {
 976                 OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233);
 977             }
 978             for (i = 0; i < sn; i++) {
 979                 OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633);
 980             }
 981             for (i = 0; i < dn; i++) {
 982                 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038);    /*5038 */
 983             }
 984             for (i = 0; i < sn; i++) {
 985                 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659);    /*6660 */
 986             }
 987         }
 988     }
 989 }
 990
 991 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
 992                                     opj_stepsize_t *bandno_stepsize)
 993 {
 994     OPJ_INT32 p, n;
 995     p = opj_int_floorlog2(stepsize) - 13;
 996     n = 11 - opj_int_floorlog2(stepsize);
 997     bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
 998     bandno_stepsize->expn = numbps - p;
 999 }
1000
1001 /*
1002 ==========================================================
1003    DWT interface
1004 ==========================================================
1005 */
1006
1007
1008 /* <summary>                            */
1009 /* Forward 5-3 wavelet transform in 2-D. */
1010 /* </summary>                           */
1011 static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
1012         void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32))
1013 {
1014     OPJ_INT32 i, j, k;
1015     OPJ_INT32 *a = 00;
1016     OPJ_INT32 *aj = 00;
1017     OPJ_INT32 *bj = 00;
1018     OPJ_INT32 w, l;
1019
1020     OPJ_INT32 rw;           /* width of the resolution level computed   */
1021     OPJ_INT32 rh;           /* height of the resolution level computed  */
1022     size_t l_data_size;
1023
1024     opj_tcd_resolution_t * l_cur_res = 0;
1025     opj_tcd_resolution_t * l_last_res = 0;
1026
1027     w = tilec->x1 - tilec->x0;
1028     l = (OPJ_INT32)tilec->numresolutions - 1;
1029     a = tilec->data;
1030
1031     l_cur_res = tilec->resolutions + l;
1032     l_last_res = l_cur_res - 1;
1033
1034     l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions);
1035     /* overflow check */
1036     if (l_data_size > (SIZE_MAX / sizeof(OPJ_INT32))) {
1037         /* FIXME event manager error callback */
1038         return OPJ_FALSE;
1039     }
1040     l_data_size *= sizeof(OPJ_INT32);
1041     bj = (OPJ_INT32*)opj_malloc(l_data_size);
1042     /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
1043     /* in that case, so do not error out */
1044     if (l_data_size != 0 && ! bj) {
1045         return OPJ_FALSE;
1046     }
1047     i = l;
1048
1049     while (i--) {
1050         OPJ_INT32 rw1;      /* width of the resolution level once lower than computed one                                       */
1051         OPJ_INT32 rh1;      /* height of the resolution level once lower than computed one                                      */
1052         OPJ_INT32 cas_col;  /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
1053         OPJ_INT32 cas_row;  /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
1054         OPJ_INT32 dn, sn;
1055
1056         rw  = l_cur_res->x1 - l_cur_res->x0;
1057         rh  = l_cur_res->y1 - l_cur_res->y0;
1058         rw1 = l_last_res->x1 - l_last_res->x0;
1059         rh1 = l_last_res->y1 - l_last_res->y0;
1060
1061         cas_row = l_cur_res->x0 & 1;
1062         cas_col = l_cur_res->y0 & 1;
1063
1064         sn = rh1;
1065         dn = rh - rh1;
1066         for (j = 0; j < rw; ++j) {
1067             aj = a + j;
1068             for (k = 0; k < rh; ++k) {
1069                 bj[k] = aj[k * w];
1070             }
1071
1072             (*p_function)(bj, dn, sn, cas_col);
1073
1074             opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
1075         }
1076
1077         sn = rw1;
1078         dn = rw - rw1;
1079
1080         for (j = 0; j < rh; j++) {
1081             aj = a + j * w;
1082             for (k = 0; k < rw; k++) {
1083                 bj[k] = aj[k];
1084             }
1085             (*p_function)(bj, dn, sn, cas_row);
1086             opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
1087         }
1088
1089         l_cur_res = l_last_res;
1090
1091         --l_last_res;
1092     }
1093
1094     opj_free(bj);
1095     return OPJ_TRUE;
1096 }
1097
1098 /* Forward 5-3 wavelet transform in 2-D. */
1099 /* </summary>                           */
1100 OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec)
1101 {
1102     return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1);
1103 }
1104
1105 /* <summary>                            */
1106 /* Inverse 5-3 wavelet transform in 2-D. */
1107 /* </summary>                           */
1108 OPJ_BOOL opj_dwt_decode(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec,
1109                         OPJ_UINT32 numres)
1110 {
1111     return opj_dwt_decode_tile(tp, tilec, numres);
1112 }
1113
1114
1115 /* <summary>                          */
1116 /* Get gain of 5-3 wavelet transform. */
1117 /* </summary>                         */
1118 OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient)
1119 {
1120     if (orient == 0) {
1121         return 0;
1122     }
1123     if (orient == 1 || orient == 2) {
1124         return 1;
1125     }
1126     return 2;
1127 }
1128
1129 /* <summary>                */
1130 /* Get norm of 5-3 wavelet. */
1131 /* </summary>               */
1132 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
1133 {
1134     return opj_dwt_norms[orient][level];
1135 }
1136
1137 /* <summary>                             */
1138 /* Forward 9-7 wavelet transform in 2-D. */
1139 /* </summary>                            */
1140 OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec)
1141 {
1142     return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real);
1143 }
1144
1145 /* <summary>                          */
1146 /* Get gain of 9-7 wavelet transform. */
1147 /* </summary>                         */
1148 OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient)
1149 {
1150     (void)orient;
1151     return 0;
1152 }
1153
1154 /* <summary>                */
1155 /* Get norm of 9-7 wavelet. */
1156 /* </summary>               */
1157 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
1158 {
1159     return opj_dwt_norms_real[orient][level];
1160 }
1161
1162 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
1163 {
1164     OPJ_UINT32 numbands, bandno;
1165     numbands = 3 * tccp->numresolutions - 2;
1166     for (bandno = 0; bandno < numbands; bandno++) {
1167         OPJ_FLOAT64 stepsize;
1168         OPJ_UINT32 resno, level, orient, gain;
1169
1170         resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
1171         orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
1172         level = tccp->numresolutions - 1 - resno;
1173         gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) ||
1174                                           (orient == 2)) ? 1 : 2));
1175         if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
1176             stepsize = 1.0;
1177         } else {
1178             OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level];
1179             stepsize = (1 << (gain)) / norm;
1180         }
1181         opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0),
1182                                 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);
1183     }
1184 }
1185
1186 /* <summary>                             */
1187 /* Determine maximum computed resolution level for inverse wavelet transform */
1188 /* </summary>                            */
1189 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
1190         OPJ_UINT32 i)
1191 {
1192     OPJ_UINT32 mr   = 0;
1193     OPJ_UINT32 w;
1194     while (--i) {
1195         ++r;
1196         if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) {
1197             mr = w ;
1198         }
1199         if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) {
1200             mr = w ;
1201         }
1202     }
1203     return mr ;
1204 }
1205
1206 typedef struct {
1207     opj_dwt_t h;
1208     OPJ_UINT32 rw;
1209     OPJ_UINT32 w;
1210     OPJ_INT32 * OPJ_RESTRICT tiledp;
1211     OPJ_UINT32 min_j;
1212     OPJ_UINT32 max_j;
1213 } opj_dwd_decode_h_job_t;
1214
1215 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
1216 {
1217     OPJ_UINT32 j;
1218     opj_dwd_decode_h_job_t* job;
1219     (void)tls;
1220
1221     job = (opj_dwd_decode_h_job_t*)user_data;
1222     for (j = job->min_j; j < job->max_j; j++) {
1223         opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
1224     }
1225
1226     opj_aligned_free(job->h.mem);
1227     opj_free(job);
1228 }
1229
1230 typedef struct {
1231     opj_dwt_t v;
1232     OPJ_UINT32 rh;
1233     OPJ_UINT32 w;
1234     OPJ_INT32 * OPJ_RESTRICT tiledp;
1235     OPJ_UINT32 min_j;
1236     OPJ_UINT32 max_j;
1237 } opj_dwd_decode_v_job_t;
1238
1239 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
1240 {
1241     OPJ_UINT32 j;
1242     opj_dwd_decode_v_job_t* job;
1243     (void)tls;
1244
1245     job = (opj_dwd_decode_v_job_t*)user_data;
1246     for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
1247             j += PARALLEL_COLS_53) {
1248         opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_INT32)job->w,
1249                      PARALLEL_COLS_53);
1250     }
1251     if (j < job->max_j)
1252         opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_INT32)job->w,
1253                      (OPJ_INT32)(job->max_j - j));
1254
1255     opj_aligned_free(job->v.mem);
1256     opj_free(job);
1257 }
1258
1259
1260 /* <summary>                            */
1261 /* Inverse wavelet transform in 2-D.    */
1262 /* </summary>                           */
1263 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
1264                                     opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres)
1265 {
1266     opj_dwt_t h;
1267     opj_dwt_t v;
1268
1269     opj_tcd_resolution_t* tr = tilec->resolutions;
1270
1271     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
1272                                  tr->x0);  /* width of the resolution level computed */
1273     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
1274                                  tr->y0);  /* height of the resolution level computed */
1275
1276     OPJ_UINT32 w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
1277     size_t h_mem_size;
1278     int num_threads;
1279
1280     if (numres == 1U) {
1281         return OPJ_TRUE;
1282     }
1283     num_threads = opj_thread_pool_get_thread_count(tp);
1284     h_mem_size = opj_dwt_max_resolution(tr, numres);
1285     /* overflow check */
1286     if (h_mem_size > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) {
1287         /* FIXME event manager error callback */
1288         return OPJ_FALSE;
1289     }
1290     /* We need PARALLEL_COLS_53 times the height of the array, */
1291     /* since for the vertical pass */
1292     /* we process PARALLEL_COLS_53 columns at a time */
1293     h_mem_size *= PARALLEL_COLS_53 * sizeof(OPJ_INT32);
1294     h.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size);
1295     if (! h.mem) {
1296         /* FIXME event manager error callback */
1297         return OPJ_FALSE;
1298     }
1299
1300     v.mem = h.mem;
1301
1302     while (--numres) {
1303         OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
1304         OPJ_UINT32 j;
1305
1306         ++tr;
1307         h.sn = (OPJ_INT32)rw;
1308         v.sn = (OPJ_INT32)rh;
1309
1310         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
1311         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
1312
1313         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
1314         h.cas = tr->x0 % 2;
1315
1316         if (num_threads <= 1 || rh <= 1) {
1317             for (j = 0; j < rh; ++j) {
1318                 opj_idwt53_h(&h, &tiledp[j * w]);
1319             }
1320         } else {
1321             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1322             OPJ_UINT32 step_j;
1323
1324             if (rh < num_jobs) {
1325                 num_jobs = rh;
1326             }
1327             step_j = (rh / num_jobs);
1328
1329             for (j = 0; j < num_jobs; j++) {
1330                 opj_dwd_decode_h_job_t* job;
1331
1332                 job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t));
1333                 if (!job) {
1334                     /* It would be nice to fallback to single thread case, but */
1335                     /* unfortunately some jobs may be launched and have modified */
1336                     /* tiledp, so it is not practical to recover from that error */
1337                     /* FIXME event manager error callback */
1338                     opj_thread_pool_wait_completion(tp, 0);
1339                     opj_aligned_free(h.mem);
1340                     return OPJ_FALSE;
1341                 }
1342                 job->h = h;
1343                 job->rw = rw;
1344                 job->w = w;
1345                 job->tiledp = tiledp;
1346                 job->min_j = j * step_j;
1347                 job->max_j = (j + 1U) * step_j; /* this can overflow */
1348                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
1349                     job->max_j = rh;
1350                 }
1351                 job->h.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size);
1352                 if (!job->h.mem) {
1353                     /* FIXME event manager error callback */
1354                     opj_thread_pool_wait_completion(tp, 0);
1355                     opj_free(job);
1356                     opj_aligned_free(h.mem);
1357                     return OPJ_FALSE;
1358                 }
1359                 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
1360             }
1361             opj_thread_pool_wait_completion(tp, 0);
1362         }
1363
1364         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
1365         v.cas = tr->y0 % 2;
1366
1367         if (num_threads <= 1 || rw <= 1) {
1368             for (j = 0; j + PARALLEL_COLS_53 <= rw;
1369                     j += PARALLEL_COLS_53) {
1370                 opj_idwt53_v(&v, &tiledp[j], (OPJ_INT32)w, PARALLEL_COLS_53);
1371             }
1372             if (j < rw) {
1373                 opj_idwt53_v(&v, &tiledp[j], (OPJ_INT32)w, (OPJ_INT32)(rw - j));
1374             }
1375         } else {
1376             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1377             OPJ_UINT32 step_j;
1378
1379             if (rw < num_jobs) {
1380                 num_jobs = rw;
1381             }
1382             step_j = (rw / num_jobs);
1383
1384             for (j = 0; j < num_jobs; j++) {
1385                 opj_dwd_decode_v_job_t* job;
1386
1387                 job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t));
1388                 if (!job) {
1389                     /* It would be nice to fallback to single thread case, but */
1390                     /* unfortunately some jobs may be launched and have modified */
1391                     /* tiledp, so it is not practical to recover from that error */
1392                     /* FIXME event manager error callback */
1393                     opj_thread_pool_wait_completion(tp, 0);
1394                     opj_aligned_free(v.mem);
1395                     return OPJ_FALSE;
1396                 }
1397                 job->v = v;
1398                 job->rh = rh;
1399                 job->w = w;
1400                 job->tiledp = tiledp;
1401                 job->min_j = j * step_j;
1402                 job->max_j = (j + 1U) * step_j; /* this can overflow */
1403                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
1404                     job->max_j = rw;
1405                 }
1406                 job->v.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size);
1407                 if (!job->v.mem) {
1408                     /* FIXME event manager error callback */
1409                     opj_thread_pool_wait_completion(tp, 0);
1410                     opj_free(job);
1411                     opj_aligned_free(v.mem);
1412                     return OPJ_FALSE;
1413                 }
1414                 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
1415             }
1416             opj_thread_pool_wait_completion(tp, 0);
1417         }
1418     }
1419     opj_aligned_free(h.mem);
1420     return OPJ_TRUE;
1421 }
1422
1423 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT w,
1424                                    OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 size)
1425 {
1426     OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(w->wavelet + w->cas);
1427     OPJ_INT32 count = w->sn;
1428     OPJ_INT32 i, k;
1429
1430     for (k = 0; k < 2; ++k) {
1431         if (count + 3 * x < size && ((size_t) a & 0x0f) == 0 &&
1432                 ((size_t) bi & 0x0f) == 0 && (x & 0x0f) == 0) {
1433             /* Fast code path */
1434             for (i = 0; i < count; ++i) {
1435                 OPJ_INT32 j = i;
1436                 bi[i * 8    ] = a[j];
1437                 j += x;
1438                 bi[i * 8 + 1] = a[j];
1439                 j += x;
1440                 bi[i * 8 + 2] = a[j];
1441                 j += x;
1442                 bi[i * 8 + 3] = a[j];
1443             }
1444         } else {
1445             /* Slow code path */
1446             for (i = 0; i < count; ++i) {
1447                 OPJ_INT32 j = i;
1448                 bi[i * 8    ] = a[j];
1449                 j += x;
1450                 if (j >= size) {
1451                     continue;
1452                 }
1453                 bi[i * 8 + 1] = a[j];
1454                 j += x;
1455                 if (j >= size) {
1456                     continue;
1457                 }
1458                 bi[i * 8 + 2] = a[j];
1459                 j += x;
1460                 if (j >= size) {
1461                     continue;
1462                 }
1463                 bi[i * 8 + 3] = a[j]; /* This one*/
1464             }
1465         }
1466
1467         bi = (OPJ_FLOAT32*)(w->wavelet + 1 - w->cas);
1468         a += w->sn;
1469         size -= w->sn;
1470         count = w->dn;
1471     }
1472 }
1473
1474 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT v,
1475                                    OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 nb_elts_read)
1476 {
1477     opj_v4_t* OPJ_RESTRICT bi = v->wavelet + v->cas;
1478     OPJ_INT32 i;
1479
1480     for (i = 0; i < v->sn; ++i) {
1481         memcpy(&bi[i * 2], &a[i * x], (size_t)nb_elts_read * sizeof(OPJ_FLOAT32));
1482     }
1483
1484     a += v->sn * x;
1485     bi = v->wavelet + 1 - v->cas;
1486
1487     for (i = 0; i < v->dn; ++i) {
1488         memcpy(&bi[i * 2], &a[i * x], (size_t)nb_elts_read * sizeof(OPJ_FLOAT32));
1489     }
1490 }
1491
1492 #ifdef __SSE__
1493
1494 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count,
1495                                        const __m128 c)
1496 {
1497     __m128* OPJ_RESTRICT vw = (__m128*) w;
1498     OPJ_INT32 i;
1499     /* 4x unrolled loop */
1500     for (i = 0; i < count >> 2; ++i) {
1501         *vw = _mm_mul_ps(*vw, c);
1502         vw += 2;
1503         *vw = _mm_mul_ps(*vw, c);
1504         vw += 2;
1505         *vw = _mm_mul_ps(*vw, c);
1506         vw += 2;
1507         *vw = _mm_mul_ps(*vw, c);
1508         vw += 2;
1509     }
1510     count &= 3;
1511     for (i = 0; i < count; ++i) {
1512         *vw = _mm_mul_ps(*vw, c);
1513         vw += 2;
1514     }
1515 }
1516
1517 void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
1518                                 OPJ_INT32 m, __m128 c)
1519 {
1520     __m128* OPJ_RESTRICT vl = (__m128*) l;
1521     __m128* OPJ_RESTRICT vw = (__m128*) w;
1522     OPJ_INT32 i;
1523     __m128 tmp1, tmp2, tmp3;
1524     tmp1 = vl[0];
1525     for (i = 0; i < m; ++i) {
1526         tmp2 = vw[-1];
1527         tmp3 = vw[ 0];
1528         vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
1529         tmp1 = tmp3;
1530         vw += 2;
1531     }
1532     vl = vw - 2;
1533     if (m >= k) {
1534         return;
1535     }
1536     c = _mm_add_ps(c, c);
1537     c = _mm_mul_ps(c, vl[0]);
1538     for (; m < k; ++m) {
1539         __m128 tmp = vw[-1];
1540         vw[-1] = _mm_add_ps(tmp, c);
1541         vw += 2;
1542     }
1543 }
1544
1545 #else
1546
1547 static void opj_v4dwt_decode_step1(opj_v4_t* w, OPJ_INT32 count,
1548                                    const OPJ_FLOAT32 c)
1549 {
1550     OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
1551     OPJ_INT32 i;
1552     for (i = 0; i < count; ++i) {
1553         OPJ_FLOAT32 tmp1 = fw[i * 8    ];
1554         OPJ_FLOAT32 tmp2 = fw[i * 8 + 1];
1555         OPJ_FLOAT32 tmp3 = fw[i * 8 + 2];
1556         OPJ_FLOAT32 tmp4 = fw[i * 8 + 3];
1557         fw[i * 8    ] = tmp1 * c;
1558         fw[i * 8 + 1] = tmp2 * c;
1559         fw[i * 8 + 2] = tmp3 * c;
1560         fw[i * 8 + 3] = tmp4 * c;
1561     }
1562 }
1563
1564 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
1565                                    OPJ_INT32 m, OPJ_FLOAT32 c)
1566 {
1567     OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
1568     OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
1569     OPJ_INT32 i;
1570     for (i = 0; i < m; ++i) {
1571         OPJ_FLOAT32 tmp1_1 = fl[0];
1572         OPJ_FLOAT32 tmp1_2 = fl[1];
1573         OPJ_FLOAT32 tmp1_3 = fl[2];
1574         OPJ_FLOAT32 tmp1_4 = fl[3];
1575         OPJ_FLOAT32 tmp2_1 = fw[-4];
1576         OPJ_FLOAT32 tmp2_2 = fw[-3];
1577         OPJ_FLOAT32 tmp2_3 = fw[-2];
1578         OPJ_FLOAT32 tmp2_4 = fw[-1];
1579         OPJ_FLOAT32 tmp3_1 = fw[0];
1580         OPJ_FLOAT32 tmp3_2 = fw[1];
1581         OPJ_FLOAT32 tmp3_3 = fw[2];
1582         OPJ_FLOAT32 tmp3_4 = fw[3];
1583         fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c);
1584         fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c);
1585         fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c);
1586         fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c);
1587         fl = fw;
1588         fw += 8;
1589     }
1590     if (m < k) {
1591         OPJ_FLOAT32 c1;
1592         OPJ_FLOAT32 c2;
1593         OPJ_FLOAT32 c3;
1594         OPJ_FLOAT32 c4;
1595         c += c;
1596         c1 = fl[0] * c;
1597         c2 = fl[1] * c;
1598         c3 = fl[2] * c;
1599         c4 = fl[3] * c;
1600         for (; m < k; ++m) {
1601             OPJ_FLOAT32 tmp1 = fw[-4];
1602             OPJ_FLOAT32 tmp2 = fw[-3];
1603             OPJ_FLOAT32 tmp3 = fw[-2];
1604             OPJ_FLOAT32 tmp4 = fw[-1];
1605             fw[-4] = tmp1 + c1;
1606             fw[-3] = tmp2 + c2;
1607             fw[-2] = tmp3 + c3;
1608             fw[-1] = tmp4 + c4;
1609             fw += 8;
1610         }
1611     }
1612 }
1613
1614 #endif
1615
1616 /* <summary>                             */
1617 /* Inverse 9-7 wavelet transform in 1-D. */
1618 /* </summary>                            */
1619 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt)
1620 {
1621     OPJ_INT32 a, b;
1622     if (dwt->cas == 0) {
1623         if (!((dwt->dn > 0) || (dwt->sn > 1))) {
1624             return;
1625         }
1626         a = 0;
1627         b = 1;
1628     } else {
1629         if (!((dwt->sn > 0) || (dwt->dn > 1))) {
1630             return;
1631         }
1632         a = 1;
1633         b = 0;
1634     }
1635 #ifdef __SSE__
1636     opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->sn, _mm_set1_ps(opj_K));
1637     opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->dn, _mm_set1_ps(opj_c13318));
1638     opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1639                                opj_int_min(dwt->sn, dwt->dn - a), _mm_set1_ps(opj_dwt_delta));
1640     opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1641                                opj_int_min(dwt->dn, dwt->sn - b), _mm_set1_ps(opj_dwt_gamma));
1642     opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1643                                opj_int_min(dwt->sn, dwt->dn - a), _mm_set1_ps(opj_dwt_beta));
1644     opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1645                                opj_int_min(dwt->dn, dwt->sn - b), _mm_set1_ps(opj_dwt_alpha));
1646 #else
1647     opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->sn, opj_K);
1648     opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->dn, opj_c13318);
1649     opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1650                            opj_int_min(dwt->sn, dwt->dn - a), opj_dwt_delta);
1651     opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1652                            opj_int_min(dwt->dn, dwt->sn - b), opj_dwt_gamma);
1653     opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1654                            opj_int_min(dwt->sn, dwt->dn - a), opj_dwt_beta);
1655     opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1656                            opj_int_min(dwt->dn, dwt->sn - b), opj_dwt_alpha);
1657 #endif
1658 }
1659
1660
1661 /* <summary>                             */
1662 /* Inverse 9-7 wavelet transform in 2-D. */
1663 /* </summary>                            */
1664 OPJ_BOOL opj_dwt_decode_real(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
1665                              OPJ_UINT32 numres)
1666 {
1667     opj_v4dwt_t h;
1668     opj_v4dwt_t v;
1669
1670     opj_tcd_resolution_t* res = tilec->resolutions;
1671
1672     OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 -
1673                                  res->x0);    /* width of the resolution level computed */
1674     OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 -
1675                                  res->y0);    /* height of the resolution level computed */
1676
1677     OPJ_UINT32 w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
1678
1679     size_t l_data_size;
1680
1681     l_data_size = opj_dwt_max_resolution(res, numres);
1682     /* overflow check */
1683     if (l_data_size > (SIZE_MAX - 5U)) {
1684         /* FIXME event manager error callback */
1685         return OPJ_FALSE;
1686     }
1687     l_data_size += 5U;
1688     /* overflow check */
1689     if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) {
1690         /* FIXME event manager error callback */
1691         return OPJ_FALSE;
1692     }
1693     h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t));
1694     if (!h.wavelet) {
1695         /* FIXME event manager error callback */
1696         return OPJ_FALSE;
1697     }
1698     v.wavelet = h.wavelet;
1699
1700     while (--numres) {
1701         OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data;
1702         OPJ_UINT32 bufsize = (OPJ_UINT32)((tilec->x1 - tilec->x0) *
1703                                           (tilec->y1 - tilec->y0));
1704         OPJ_INT32 j;
1705
1706         h.sn = (OPJ_INT32)rw;
1707         v.sn = (OPJ_INT32)rh;
1708
1709         ++res;
1710
1711         rw = (OPJ_UINT32)(res->x1 -
1712                           res->x0);   /* width of the resolution level computed */
1713         rh = (OPJ_UINT32)(res->y1 -
1714                           res->y0);   /* height of the resolution level computed */
1715
1716         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
1717         h.cas = res->x0 % 2;
1718
1719         for (j = (OPJ_INT32)rh; j > 3; j -= 4) {
1720             OPJ_INT32 k;
1721             opj_v4dwt_interleave_h(&h, aj, (OPJ_INT32)w, (OPJ_INT32)bufsize);
1722             opj_v4dwt_decode(&h);
1723
1724             for (k = (OPJ_INT32)rw; --k >= 0;) {
1725                 aj[k               ] = h.wavelet[k].f[0];
1726                 aj[k + (OPJ_INT32)w  ] = h.wavelet[k].f[1];
1727                 aj[k + (OPJ_INT32)w * 2] = h.wavelet[k].f[2];
1728                 aj[k + (OPJ_INT32)w * 3] = h.wavelet[k].f[3];
1729             }
1730
1731             aj += w * 4;
1732             bufsize -= w * 4;
1733         }
1734
1735         if (rh & 0x03) {
1736             OPJ_INT32 k;
1737             j = rh & 0x03;
1738             opj_v4dwt_interleave_h(&h, aj, (OPJ_INT32)w, (OPJ_INT32)bufsize);
1739             opj_v4dwt_decode(&h);
1740             for (k = (OPJ_INT32)rw; --k >= 0;) {
1741                 switch (j) {
1742                 case 3:
1743                     aj[k + (OPJ_INT32)w * 2] = h.wavelet[k].f[2];
1744                 /* FALLTHRU */
1745                 case 2:
1746                     aj[k + (OPJ_INT32)w  ] = h.wavelet[k].f[1];
1747                 /* FALLTHRU */
1748                 case 1:
1749                     aj[k               ] = h.wavelet[k].f[0];
1750                 }
1751             }
1752         }
1753
1754         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
1755         v.cas = res->y0 % 2;
1756
1757         aj = (OPJ_FLOAT32*) tilec->data;
1758         for (j = (OPJ_INT32)rw; j > 3; j -= 4) {
1759             OPJ_UINT32 k;
1760
1761             opj_v4dwt_interleave_v(&v, aj, (OPJ_INT32)w, 4);
1762             opj_v4dwt_decode(&v);
1763
1764             for (k = 0; k < rh; ++k) {
1765                 memcpy(&aj[k * w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32));
1766             }
1767             aj += 4;
1768         }
1769
1770         if (rw & 0x03) {
1771             OPJ_UINT32 k;
1772
1773             j = rw & 0x03;
1774
1775             opj_v4dwt_interleave_v(&v, aj, (OPJ_INT32)w, j);
1776             opj_v4dwt_decode(&v);
1777
1778             for (k = 0; k < rh; ++k) {
1779                 memcpy(&aj[k * w], &v.wavelet[k], (size_t)j * sizeof(OPJ_FLOAT32));
1780             }
1781         }
1782     }
1783
1784     opj_aligned_free(h.wavelet);
1785     return OPJ_TRUE;
1786 }