2 * The copyright in this software is being made available under the 2-clauses
3 * BSD License, included below. This software may be subject to other third
4 * party and contributor rights, including patent rights, and no such rights
5 * are granted under this license.
7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
8 * Copyright (c) 2002-2014, Professor Benoit Macq
9 * Copyright (c) 2001-2003, David Janssens
10 * Copyright (c) 2002-2003, Yannick Verschueren
11 * Copyright (c) 2003-2007, Francois-Olivier Devaux
12 * Copyright (c) 2003-2014, Antonin Descampe
13 * Copyright (c) 2005, Herve Drolon, FreeImage Team
14 * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
15 * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
16 * Copyright (c) 2017, IntoPIX SA <support@intopix.com>
17 * All rights reserved.
19 * Redistribution and use in source and binary forms, with or without
20 * modification, are permitted provided that the following conditions
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
29 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
43 #define OPJ_SKIP_POISON
44 #include "opj_includes.h"
47 #include <xmmintrin.h>
50 #include <emmintrin.h>
53 #include <tmmintrin.h>
56 #include <immintrin.h>
60 #pragma GCC poison malloc calloc realloc free
63 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
66 #define OPJ_WS(i) v->mem[(i)*2]
67 #define OPJ_WD(i) v->mem[(1+(i)*2)]
70 /** Number of int32 values in a AVX2 register */
71 #define VREG_INT_COUNT 8
73 /** Number of int32 values in a SSE2 register */
74 #define VREG_INT_COUNT 4
77 /** Number of columns that we can process in parallel in the vertical pass */
78 #define PARALLEL_COLS_53 (2*VREG_INT_COUNT)
80 /** @name Local data structures */
83 typedef struct dwt_local {
94 typedef struct v4dwt_local {
101 static const OPJ_FLOAT32 opj_dwt_alpha = 1.586134342f; /* 12994 */
102 static const OPJ_FLOAT32 opj_dwt_beta = 0.052980118f; /* 434 */
103 static const OPJ_FLOAT32 opj_dwt_gamma = -0.882911075f; /* -7233 */
104 static const OPJ_FLOAT32 opj_dwt_delta = -0.443506852f; /* -3633 */
106 static const OPJ_FLOAT32 opj_K = 1.230174105f; /* 10078 */
107 static const OPJ_FLOAT32 opj_c13318 = 1.625732422f;
112 Virtual function type for wavelet transform in 1-D
114 typedef void (*DWT1DFN)(const opj_dwt_t* v);
116 /** @name Local static functions */
120 Forward lazy transform (horizontal)
122 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
123 OPJ_INT32 sn, OPJ_INT32 cas);
125 Forward lazy transform (vertical)
127 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
128 OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas);
130 Forward 5-3 wavelet transform in 1-D
132 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
135 Forward 9-7 wavelet transform in 1-D
137 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
140 Explicit calculation of the Quantization Stepsizes
142 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
143 opj_stepsize_t *bandno_stepsize);
145 Inverse wavelet transform in 2-D.
147 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
148 opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i);
150 static OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
151 void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32));
153 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
157 /* Inverse 9-7 wavelet transform in 1-D. */
159 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt);
161 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT w,
162 OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 size);
164 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT v,
165 OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 nb_elts_read);
168 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count,
171 static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
172 OPJ_INT32 m, __m128 c);
175 static void opj_v4dwt_decode_step1(opj_v4_t* w, OPJ_INT32 count,
176 const OPJ_FLOAT32 c);
178 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
179 OPJ_INT32 m, OPJ_FLOAT32 c);
187 #define OPJ_S(i) a[(i)*2]
188 #define OPJ_D(i) a[(1+(i)*2)]
189 #define OPJ_S_(i) ((i)<0?OPJ_S(0):((i)>=sn?OPJ_S(sn-1):OPJ_S(i)))
190 #define OPJ_D_(i) ((i)<0?OPJ_D(0):((i)>=dn?OPJ_D(dn-1):OPJ_D(i)))
192 #define OPJ_SS_(i) ((i)<0?OPJ_S(0):((i)>=dn?OPJ_S(dn-1):OPJ_S(i)))
193 #define OPJ_DD_(i) ((i)<0?OPJ_D(0):((i)>=sn?OPJ_D(sn-1):OPJ_D(i)))
196 /* This table contains the norms of the 5-3 wavelets for different bands. */
198 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = {
199 {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
200 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
201 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
202 {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
206 /* This table contains the norms of the 9-7 wavelets for different bands. */
208 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = {
209 {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
210 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
211 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
212 {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
216 ==========================================================
218 ==========================================================
222 /* Forward lazy transform (horizontal). */
224 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
225 OPJ_INT32 sn, OPJ_INT32 cas)
228 OPJ_INT32 * l_dest = b;
229 OPJ_INT32 * l_src = a + cas;
231 for (i = 0; i < sn; ++i) {
239 for (i = 0; i < dn; ++i) {
246 /* Forward lazy transform (vertical). */
248 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
249 OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas)
252 OPJ_INT32 * l_dest = b;
253 OPJ_INT32 * l_src = a + cas;
259 } /* b[i*x]=a[2*i+cas]; */
269 } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/
272 #ifdef STANDARD_SLOW_VERSION
274 /* Inverse lazy transform (horizontal). */
276 static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a)
279 OPJ_INT32 *bi = h->mem + h->cas;
286 bi = h->mem + 1 - h->cas;
295 /* Inverse lazy transform (vertical). */
297 static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
300 OPJ_INT32 *bi = v->mem + v->cas;
307 ai = a + (v->sn * x);
308 bi = v->mem + 1 - v->cas;
317 #endif /* STANDARD_SLOW_VERSION */
320 /* Forward 5-3 wavelet transform in 1-D. */
322 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
328 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
329 for (i = 0; i < dn; i++) {
330 OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
332 for (i = 0; i < sn; i++) {
333 OPJ_S(i) += (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
337 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
340 for (i = 0; i < dn; i++) {
341 OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
343 for (i = 0; i < sn; i++) {
344 OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
350 #ifdef STANDARD_SLOW_VERSION
352 /* Inverse 5-3 wavelet transform in 1-D. */
354 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
360 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
361 for (i = 0; i < sn; i++) {
362 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
364 for (i = 0; i < dn; i++) {
365 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
369 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
372 for (i = 0; i < sn; i++) {
373 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
375 for (i = 0; i < dn; i++) {
376 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
382 static void opj_dwt_decode_1(const opj_dwt_t *v)
384 opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
387 #endif /* STANDARD_SLOW_VERSION */
389 #if !defined(STANDARD_SLOW_VERSION)
390 static void opj_idwt53_h_cas0(OPJ_INT32* tmp,
396 const OPJ_INT32* in_even = &tiledp[0];
397 const OPJ_INT32* in_odd = &tiledp[sn];
399 #ifdef TWO_PASS_VERSION
400 /* For documentation purpose: performs lifting in two iterations, */
401 /* but without explicit interleaving */
406 tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1);
407 for (i = 2, j = 0; i <= len - 2; i += 2, j++) {
408 tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2);
410 if (len & 1) { /* if len is odd */
411 tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1);
415 for (i = 1, j = 0; i < len - 1; i += 2, j++) {
416 tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1);
418 if (!(len & 1)) { /* if len is even */
419 tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2];
422 OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
426 /* Improved version of the TWO_PASS_VERSION: */
427 /* Performs lifting in one single iteration. Saves memory */
428 /* accesses and explicit interleaving. */
431 s0n = s1n - ((d1n + 1) >> 1);
433 for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
440 s0n = s1n - ((d1c + d1n + 2) >> 2);
443 tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
449 tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1);
450 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
452 tmp[len - 1] = d1n + s0n;
455 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
458 static void opj_idwt53_h_cas1(OPJ_INT32* tmp,
464 const OPJ_INT32* in_even = &tiledp[sn];
465 const OPJ_INT32* in_odd = &tiledp[0];
467 #ifdef TWO_PASS_VERSION
468 /* For documentation purpose: performs lifting in two iterations, */
469 /* but without explicit interleaving */
474 for (i = 1, j = 0; i < len - 1; i += 2, j++) {
475 tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2);
478 tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1);
482 tmp[0] = in_even[0] + tmp[1];
483 for (i = 2, j = 1; i < len - 1; i += 2, j++) {
484 tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1);
487 tmp[len - 1] = in_even[len / 2] + tmp[len - 2];
490 OPJ_INT32 s1, s2, dc, dn;
494 /* Improved version of the TWO_PASS_VERSION: */
495 /* Performs lifting in one single iteration. Saves memory */
496 /* accesses and explicit interleaving. */
499 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
500 tmp[0] = in_even[0] + dc;
502 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
506 dn = in_odd[j] - ((s1 + s2 + 2) >> 2);
508 tmp[i + 1] = s1 + ((dn + dc) >> 1);
517 dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1);
518 tmp[len - 2] = s1 + ((dn + dc) >> 1);
521 tmp[len - 1] = s1 + dc;
524 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
528 #endif /* !defined(STANDARD_SLOW_VERSION) */
531 /* Inverse 5-3 wavelet transform in 1-D for one row. */
533 /* Performs interleave, inverse wavelet transform and copy back to buffer */
534 static void opj_idwt53_h(const opj_dwt_t *dwt,
537 #ifdef STANDARD_SLOW_VERSION
538 /* For documentation purpose */
539 opj_dwt_interleave_h(dwt, tiledp);
540 opj_dwt_decode_1(dwt);
541 memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32));
543 const OPJ_INT32 sn = dwt->sn;
544 const OPJ_INT32 len = sn + dwt->dn;
545 if (dwt->cas == 0) { /* Left-most sample is on even coordinate */
547 opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp);
549 /* Unmodified value */
551 } else { /* Left-most sample is on odd coordinate */
554 } else if (len == 2) {
555 OPJ_INT32* out = dwt->mem;
556 const OPJ_INT32* in_even = &tiledp[sn];
557 const OPJ_INT32* in_odd = &tiledp[0];
558 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
559 out[0] = in_even[0] + out[1];
560 memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32));
561 } else if (len > 2) {
562 opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp);
568 #if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
570 /* Conveniency macros to improve the readabilty of the formulas */
573 #define LOAD_CST(x) _mm256_set1_epi32(x)
574 #define LOAD(x) _mm256_load_si256((const VREG*)(x))
575 #define LOADU(x) _mm256_loadu_si256((const VREG*)(x))
576 #define STORE(x,y) _mm256_store_si256((VREG*)(x),(y))
577 #define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
578 #define ADD(x,y) _mm256_add_epi32((x),(y))
579 #define SUB(x,y) _mm256_sub_epi32((x),(y))
580 #define SAR(x,y) _mm256_srai_epi32((x),(y))
583 #define LOAD_CST(x) _mm_set1_epi32(x)
584 #define LOAD(x) _mm_load_si128((const VREG*)(x))
585 #define LOADU(x) _mm_loadu_si128((const VREG*)(x))
586 #define STORE(x,y) _mm_store_si128((VREG*)(x),(y))
587 #define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
588 #define ADD(x,y) _mm_add_epi32((x),(y))
589 #define SUB(x,y) _mm_sub_epi32((x),(y))
590 #define SAR(x,y) _mm_srai_epi32((x),(y))
592 #define ADD3(x,y,z) ADD(ADD(x,y),z)
595 void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
596 const OPJ_INT32* tmp,
601 for (i = 0; i < len; ++i) {
602 /* A memcpy(&tiledp_col[i * stride + 0],
603 &tmp[PARALLEL_COLS_53 * i + 0],
604 PARALLEL_COLS_53 * sizeof(OPJ_INT32))
605 would do but would be a tiny bit slower.
606 We can take here advantage of our knowledge of alignment */
607 STOREU(&tiledp_col[i * stride + 0],
608 LOAD(&tmp[PARALLEL_COLS_53 * i + 0]));
609 STOREU(&tiledp_col[i * stride + VREG_INT_COUNT],
610 LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT]));
614 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
615 * 16 in AVX2, when top-most pixel is on even coordinate */
616 static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
620 OPJ_INT32* tiledp_col,
621 const OPJ_INT32 stride)
623 const OPJ_INT32* in_even = &tiledp_col[0];
624 const OPJ_INT32* in_odd = &tiledp_col[sn * stride];
627 VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
628 VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
629 const VREG two = LOAD_CST(2);
633 assert(PARALLEL_COLS_53 == 16);
634 assert(VREG_INT_COUNT == 8);
636 assert(PARALLEL_COLS_53 == 8);
637 assert(VREG_INT_COUNT == 4);
640 /* Note: loads of input even/odd values must be done in a unaligned */
641 /* fashion. But stores in tmp can be done with aligned store, since */
642 /* the temporary buffer is properly aligned */
643 assert((size_t)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
645 s1n_0 = LOADU(in_even + 0);
646 s1n_1 = LOADU(in_even + VREG_INT_COUNT);
647 d1n_0 = LOADU(in_odd);
648 d1n_1 = LOADU(in_odd + VREG_INT_COUNT);
650 /* s0n = s1n - ((d1n + 1) >> 1); <==> */
651 /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
652 s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
653 s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
655 for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
661 s1n_0 = LOADU(in_even + j * stride);
662 s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT);
663 d1n_0 = LOADU(in_odd + j * stride);
664 d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT);
666 /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/
667 s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2));
668 s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2));
670 STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0);
671 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1);
673 /* d1c + ((s0c + s0n) >> 1) */
674 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
675 ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1)));
676 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
677 ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1)));
680 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0);
681 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1);
684 VREG tmp_len_minus_1;
685 s1n_0 = LOADU(in_even + ((len - 1) / 2) * stride);
686 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
687 tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
688 STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1);
689 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
690 STORE(tmp + PARALLEL_COLS_53 * (len - 2),
691 ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1)));
693 s1n_1 = LOADU(in_even + ((len - 1) / 2) * stride + VREG_INT_COUNT);
694 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
695 tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
696 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
698 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
699 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
700 ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1)));
704 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0,
706 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
710 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
714 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
715 * 16 in AVX2, when top-most pixel is on odd coordinate */
716 static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
720 OPJ_INT32* tiledp_col,
721 const OPJ_INT32 stride)
725 VREG s1_0, s2_0, dc_0, dn_0;
726 VREG s1_1, s2_1, dc_1, dn_1;
727 const VREG two = LOAD_CST(2);
729 const OPJ_INT32* in_even = &tiledp_col[sn * stride];
730 const OPJ_INT32* in_odd = &tiledp_col[0];
734 assert(PARALLEL_COLS_53 == 16);
735 assert(VREG_INT_COUNT == 8);
737 assert(PARALLEL_COLS_53 == 8);
738 assert(VREG_INT_COUNT == 4);
741 /* Note: loads of input even/odd values must be done in a unaligned */
742 /* fashion. But stores in tmp can be done with aligned store, since */
743 /* the temporary buffer is properly aligned */
744 assert((size_t)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
746 s1_0 = LOADU(in_even + stride);
747 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
748 dc_0 = SUB(LOADU(in_odd + 0),
749 SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2));
750 STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0));
752 s1_1 = LOADU(in_even + stride + VREG_INT_COUNT);
753 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
754 dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT),
755 SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2));
756 STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT,
757 ADD(LOADU(in_even + VREG_INT_COUNT), dc_1));
759 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
761 s2_0 = LOADU(in_even + (j + 1) * stride);
762 s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT);
764 /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
765 dn_0 = SUB(LOADU(in_odd + j * stride),
766 SAR(ADD3(s1_0, s2_0, two), 2));
767 dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT),
768 SAR(ADD3(s1_1, s2_1, two), 2));
770 STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
771 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
773 /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */
774 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
775 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
776 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
777 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
784 STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
785 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
788 /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
789 dn_0 = SUB(LOADU(in_odd + (len / 2 - 1) * stride),
790 SAR(ADD3(s1_0, s1_0, two), 2));
791 dn_1 = SUB(LOADU(in_odd + (len / 2 - 1) * stride + VREG_INT_COUNT),
792 SAR(ADD3(s1_1, s1_1, two), 2));
794 /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */
795 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0,
796 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
797 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
798 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
800 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0);
801 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1);
803 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0));
804 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
808 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
822 #endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */
824 #if !defined(STANDARD_SLOW_VERSION)
825 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
826 * pixel is on even coordinate */
827 static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
830 OPJ_INT32* tiledp_col,
831 const OPJ_INT32 stride)
834 OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
838 /* Performs lifting in one single iteration. Saves memory */
839 /* accesses and explicit interleaving. */
842 d1n = tiledp_col[sn * stride];
843 s0n = s1n - ((d1n + 1) >> 1);
845 for (i = 0, j = 0; i < (len - 3); i += 2, j++) {
849 s1n = tiledp_col[(j + 1) * stride];
850 d1n = tiledp_col[(sn + j + 1) * stride];
852 s0n = s1n - ((d1c + d1n + 2) >> 2);
855 tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
862 tiledp_col[((len - 1) / 2) * stride] -
864 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
866 tmp[len - 1] = d1n + s0n;
869 for (i = 0; i < len; ++i) {
870 tiledp_col[i * stride] = tmp[i];
875 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
876 * pixel is on odd coordinate */
877 static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
880 OPJ_INT32* tiledp_col,
881 const OPJ_INT32 stride)
884 OPJ_INT32 s1, s2, dc, dn;
885 const OPJ_INT32* in_even = &tiledp_col[sn * stride];
886 const OPJ_INT32* in_odd = &tiledp_col[0];
890 /* Performs lifting in one single iteration. Saves memory */
891 /* accesses and explicit interleaving. */
893 s1 = in_even[stride];
894 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
895 tmp[0] = in_even[0] + dc;
896 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
898 s2 = in_even[(j + 1) * stride];
900 dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2);
902 tmp[i + 1] = s1 + ((dn + dc) >> 1);
909 dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1);
910 tmp[len - 2] = s1 + ((dn + dc) >> 1);
913 tmp[len - 1] = s1 + dc;
916 for (i = 0; i < len; ++i) {
917 tiledp_col[i * stride] = tmp[i];
920 #endif /* !defined(STANDARD_SLOW_VERSION) */
923 /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */
925 /* Performs interleave, inverse wavelet transform and copy back to buffer */
926 static void opj_idwt53_v(const opj_dwt_t *dwt,
927 OPJ_INT32* tiledp_col,
931 #ifdef STANDARD_SLOW_VERSION
932 /* For documentation purpose */
934 for (c = 0; c < nb_cols; c ++) {
935 opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
936 opj_dwt_decode_1(dwt);
937 for (k = 0; k < dwt->sn + dwt->dn; ++k) {
938 tiledp_col[c + k * stride] = dwt->mem[k];
942 const OPJ_INT32 sn = dwt->sn;
943 const OPJ_INT32 len = sn + dwt->dn;
945 /* If len == 1, unmodified value */
947 #if (defined(__SSE2__) || defined(__AVX2__))
948 if (len > 1 && nb_cols == PARALLEL_COLS_53) {
949 /* Same as below general case, except that thanks to SSE2/AVX2 */
950 /* we can efficently process 8/16 columns in parallel */
951 opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
957 for (c = 0; c < nb_cols; c++, tiledp_col++) {
958 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
965 for (c = 0; c < nb_cols; c++, tiledp_col++) {
973 OPJ_INT32* out = dwt->mem;
974 for (c = 0; c < nb_cols; c++, tiledp_col++) {
976 const OPJ_INT32* in_even = &tiledp_col[sn * stride];
977 const OPJ_INT32* in_odd = &tiledp_col[0];
979 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
980 out[0] = in_even[0] + out[1];
982 for (i = 0; i < len; ++i) {
983 tiledp_col[i * stride] = out[i];
990 #if (defined(__SSE2__) || defined(__AVX2__))
991 if (len > 2 && nb_cols == PARALLEL_COLS_53) {
992 /* Same as below general case, except that thanks to SSE2/AVX2 */
993 /* we can efficently process 8/16 columns in parallel */
994 opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
1000 for (c = 0; c < nb_cols; c++, tiledp_col++) {
1001 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
1011 /* Forward 9-7 wavelet transform in 1-D. */
1013 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
1018 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
1019 for (i = 0; i < dn; i++) {
1020 OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993);
1022 for (i = 0; i < sn; i++) {
1023 OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434);
1025 for (i = 0; i < dn; i++) {
1026 OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233);
1028 for (i = 0; i < sn; i++) {
1029 OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633);
1031 for (i = 0; i < dn; i++) {
1032 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */
1034 for (i = 0; i < sn; i++) {
1035 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */
1039 if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */
1040 for (i = 0; i < dn; i++) {
1041 OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993);
1043 for (i = 0; i < sn; i++) {
1044 OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434);
1046 for (i = 0; i < dn; i++) {
1047 OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233);
1049 for (i = 0; i < sn; i++) {
1050 OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633);
1052 for (i = 0; i < dn; i++) {
1053 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */
1055 for (i = 0; i < sn; i++) {
1056 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */
1062 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
1063 opj_stepsize_t *bandno_stepsize)
1066 p = opj_int_floorlog2(stepsize) - 13;
1067 n = 11 - opj_int_floorlog2(stepsize);
1068 bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
1069 bandno_stepsize->expn = numbps - p;
1073 ==========================================================
1075 ==========================================================
1080 /* Forward 5-3 wavelet transform in 2-D. */
1082 static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
1083 void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32))
1091 OPJ_INT32 rw; /* width of the resolution level computed */
1092 OPJ_INT32 rh; /* height of the resolution level computed */
1095 opj_tcd_resolution_t * l_cur_res = 0;
1096 opj_tcd_resolution_t * l_last_res = 0;
1098 w = tilec->x1 - tilec->x0;
1099 l = (OPJ_INT32)tilec->numresolutions - 1;
1102 l_cur_res = tilec->resolutions + l;
1103 l_last_res = l_cur_res - 1;
1105 l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions);
1106 /* overflow check */
1107 if (l_data_size > (SIZE_MAX / sizeof(OPJ_INT32))) {
1108 /* FIXME event manager error callback */
1111 l_data_size *= sizeof(OPJ_INT32);
1112 bj = (OPJ_INT32*)opj_malloc(l_data_size);
1113 /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
1114 /* in that case, so do not error out */
1115 if (l_data_size != 0 && ! bj) {
1121 OPJ_INT32 rw1; /* width of the resolution level once lower than computed one */
1122 OPJ_INT32 rh1; /* height of the resolution level once lower than computed one */
1123 OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
1124 OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */
1127 rw = l_cur_res->x1 - l_cur_res->x0;
1128 rh = l_cur_res->y1 - l_cur_res->y0;
1129 rw1 = l_last_res->x1 - l_last_res->x0;
1130 rh1 = l_last_res->y1 - l_last_res->y0;
1132 cas_row = l_cur_res->x0 & 1;
1133 cas_col = l_cur_res->y0 & 1;
1137 for (j = 0; j < rw; ++j) {
1139 for (k = 0; k < rh; ++k) {
1143 (*p_function)(bj, dn, sn, cas_col);
1145 opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
1151 for (j = 0; j < rh; j++) {
1153 for (k = 0; k < rw; k++) {
1156 (*p_function)(bj, dn, sn, cas_row);
1157 opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
1160 l_cur_res = l_last_res;
1169 /* Forward 5-3 wavelet transform in 2-D. */
1171 OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec)
1173 return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1);
1177 /* Inverse 5-3 wavelet transform in 2-D. */
1179 OPJ_BOOL opj_dwt_decode(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec,
1182 return opj_dwt_decode_tile(tp, tilec, numres);
1187 /* Get gain of 5-3 wavelet transform. */
1189 OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient)
1194 if (orient == 1 || orient == 2) {
1201 /* Get norm of 5-3 wavelet. */
1203 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
1205 return opj_dwt_norms[orient][level];
1209 /* Forward 9-7 wavelet transform in 2-D. */
1211 OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec)
1213 return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real);
1217 /* Get gain of 9-7 wavelet transform. */
1219 OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient)
1226 /* Get norm of 9-7 wavelet. */
1228 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
1230 return opj_dwt_norms_real[orient][level];
1233 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
1235 OPJ_UINT32 numbands, bandno;
1236 numbands = 3 * tccp->numresolutions - 2;
1237 for (bandno = 0; bandno < numbands; bandno++) {
1238 OPJ_FLOAT64 stepsize;
1239 OPJ_UINT32 resno, level, orient, gain;
1241 resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
1242 orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
1243 level = tccp->numresolutions - 1 - resno;
1244 gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) ||
1245 (orient == 2)) ? 1 : 2));
1246 if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
1249 OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level];
1250 stepsize = (1 << (gain)) / norm;
1252 opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0),
1253 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);
1258 /* Determine maximum computed resolution level for inverse wavelet transform */
1260 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
1267 if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) {
1270 if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) {
1281 OPJ_INT32 * OPJ_RESTRICT tiledp;
1284 } opj_dwd_decode_h_job_t;
1286 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
1289 opj_dwd_decode_h_job_t* job;
1292 job = (opj_dwd_decode_h_job_t*)user_data;
1293 for (j = job->min_j; j < job->max_j; j++) {
1294 opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
1297 opj_aligned_free(job->h.mem);
1305 OPJ_INT32 * OPJ_RESTRICT tiledp;
1308 } opj_dwd_decode_v_job_t;
1310 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
1313 opj_dwd_decode_v_job_t* job;
1316 job = (opj_dwd_decode_v_job_t*)user_data;
1317 for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
1318 j += PARALLEL_COLS_53) {
1319 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_INT32)job->w,
1323 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_INT32)job->w,
1324 (OPJ_INT32)(job->max_j - j));
1326 opj_aligned_free(job->v.mem);
1332 /* Inverse wavelet transform in 2-D. */
1334 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
1335 opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres)
1340 opj_tcd_resolution_t* tr = tilec->resolutions;
1342 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
1343 tr->x0); /* width of the resolution level computed */
1344 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
1345 tr->y0); /* height of the resolution level computed */
1347 OPJ_UINT32 w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
1354 num_threads = opj_thread_pool_get_thread_count(tp);
1355 h_mem_size = opj_dwt_max_resolution(tr, numres);
1356 /* overflow check */
1357 if (h_mem_size > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) {
1358 /* FIXME event manager error callback */
1361 /* We need PARALLEL_COLS_53 times the height of the array, */
1362 /* since for the vertical pass */
1363 /* we process PARALLEL_COLS_53 columns at a time */
1364 h_mem_size *= PARALLEL_COLS_53 * sizeof(OPJ_INT32);
1365 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1367 /* FIXME event manager error callback */
1374 OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
1378 h.sn = (OPJ_INT32)rw;
1379 v.sn = (OPJ_INT32)rh;
1381 rw = (OPJ_UINT32)(tr->x1 - tr->x0);
1382 rh = (OPJ_UINT32)(tr->y1 - tr->y0);
1384 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
1387 if (num_threads <= 1 || rh <= 1) {
1388 for (j = 0; j < rh; ++j) {
1389 opj_idwt53_h(&h, &tiledp[j * w]);
1392 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1395 if (rh < num_jobs) {
1398 step_j = (rh / num_jobs);
1400 for (j = 0; j < num_jobs; j++) {
1401 opj_dwd_decode_h_job_t* job;
1403 job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t));
1405 /* It would be nice to fallback to single thread case, but */
1406 /* unfortunately some jobs may be launched and have modified */
1407 /* tiledp, so it is not practical to recover from that error */
1408 /* FIXME event manager error callback */
1409 opj_thread_pool_wait_completion(tp, 0);
1410 opj_aligned_free(h.mem);
1416 job->tiledp = tiledp;
1417 job->min_j = j * step_j;
1418 job->max_j = (j + 1U) * step_j; /* this can overflow */
1419 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
1422 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1424 /* FIXME event manager error callback */
1425 opj_thread_pool_wait_completion(tp, 0);
1427 opj_aligned_free(h.mem);
1430 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
1432 opj_thread_pool_wait_completion(tp, 0);
1435 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
1438 if (num_threads <= 1 || rw <= 1) {
1439 for (j = 0; j + PARALLEL_COLS_53 <= rw;
1440 j += PARALLEL_COLS_53) {
1441 opj_idwt53_v(&v, &tiledp[j], (OPJ_INT32)w, PARALLEL_COLS_53);
1444 opj_idwt53_v(&v, &tiledp[j], (OPJ_INT32)w, (OPJ_INT32)(rw - j));
1447 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1450 if (rw < num_jobs) {
1453 step_j = (rw / num_jobs);
1455 for (j = 0; j < num_jobs; j++) {
1456 opj_dwd_decode_v_job_t* job;
1458 job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t));
1460 /* It would be nice to fallback to single thread case, but */
1461 /* unfortunately some jobs may be launched and have modified */
1462 /* tiledp, so it is not practical to recover from that error */
1463 /* FIXME event manager error callback */
1464 opj_thread_pool_wait_completion(tp, 0);
1465 opj_aligned_free(v.mem);
1471 job->tiledp = tiledp;
1472 job->min_j = j * step_j;
1473 job->max_j = (j + 1U) * step_j; /* this can overflow */
1474 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
1477 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1479 /* FIXME event manager error callback */
1480 opj_thread_pool_wait_completion(tp, 0);
1482 opj_aligned_free(v.mem);
1485 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
1487 opj_thread_pool_wait_completion(tp, 0);
1490 opj_aligned_free(h.mem);
1494 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT w,
1495 OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 size)
1497 OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(w->wavelet + w->cas);
1498 OPJ_INT32 count = w->sn;
1501 for (k = 0; k < 2; ++k) {
1502 if (count + 3 * x < size && ((size_t) a & 0x0f) == 0 &&
1503 ((size_t) bi & 0x0f) == 0 && (x & 0x0f) == 0) {
1504 /* Fast code path */
1505 for (i = 0; i < count; ++i) {
1509 bi[i * 8 + 1] = a[j];
1511 bi[i * 8 + 2] = a[j];
1513 bi[i * 8 + 3] = a[j];
1516 /* Slow code path */
1517 for (i = 0; i < count; ++i) {
1524 bi[i * 8 + 1] = a[j];
1529 bi[i * 8 + 2] = a[j];
1534 bi[i * 8 + 3] = a[j]; /* This one*/
1538 bi = (OPJ_FLOAT32*)(w->wavelet + 1 - w->cas);
1545 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT v,
1546 OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 nb_elts_read)
1548 opj_v4_t* OPJ_RESTRICT bi = v->wavelet + v->cas;
1551 for (i = 0; i < v->sn; ++i) {
1552 memcpy(&bi[i * 2], &a[i * x], (size_t)nb_elts_read * sizeof(OPJ_FLOAT32));
1556 bi = v->wavelet + 1 - v->cas;
1558 for (i = 0; i < v->dn; ++i) {
1559 memcpy(&bi[i * 2], &a[i * x], (size_t)nb_elts_read * sizeof(OPJ_FLOAT32));
1565 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count,
1568 __m128* OPJ_RESTRICT vw = (__m128*) w;
1570 /* 4x unrolled loop */
1571 for (i = 0; i < count >> 2; ++i) {
1572 *vw = _mm_mul_ps(*vw, c);
1574 *vw = _mm_mul_ps(*vw, c);
1576 *vw = _mm_mul_ps(*vw, c);
1578 *vw = _mm_mul_ps(*vw, c);
1582 for (i = 0; i < count; ++i) {
1583 *vw = _mm_mul_ps(*vw, c);
1588 void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
1589 OPJ_INT32 m, __m128 c)
1591 __m128* OPJ_RESTRICT vl = (__m128*) l;
1592 __m128* OPJ_RESTRICT vw = (__m128*) w;
1594 __m128 tmp1, tmp2, tmp3;
1596 for (i = 0; i < m; ++i) {
1599 vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
1607 c = _mm_add_ps(c, c);
1608 c = _mm_mul_ps(c, vl[0]);
1609 for (; m < k; ++m) {
1610 __m128 tmp = vw[-1];
1611 vw[-1] = _mm_add_ps(tmp, c);
1618 static void opj_v4dwt_decode_step1(opj_v4_t* w, OPJ_INT32 count,
1619 const OPJ_FLOAT32 c)
1621 OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
1623 for (i = 0; i < count; ++i) {
1624 OPJ_FLOAT32 tmp1 = fw[i * 8 ];
1625 OPJ_FLOAT32 tmp2 = fw[i * 8 + 1];
1626 OPJ_FLOAT32 tmp3 = fw[i * 8 + 2];
1627 OPJ_FLOAT32 tmp4 = fw[i * 8 + 3];
1628 fw[i * 8 ] = tmp1 * c;
1629 fw[i * 8 + 1] = tmp2 * c;
1630 fw[i * 8 + 2] = tmp3 * c;
1631 fw[i * 8 + 3] = tmp4 * c;
1635 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
1636 OPJ_INT32 m, OPJ_FLOAT32 c)
1638 OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
1639 OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
1641 for (i = 0; i < m; ++i) {
1642 OPJ_FLOAT32 tmp1_1 = fl[0];
1643 OPJ_FLOAT32 tmp1_2 = fl[1];
1644 OPJ_FLOAT32 tmp1_3 = fl[2];
1645 OPJ_FLOAT32 tmp1_4 = fl[3];
1646 OPJ_FLOAT32 tmp2_1 = fw[-4];
1647 OPJ_FLOAT32 tmp2_2 = fw[-3];
1648 OPJ_FLOAT32 tmp2_3 = fw[-2];
1649 OPJ_FLOAT32 tmp2_4 = fw[-1];
1650 OPJ_FLOAT32 tmp3_1 = fw[0];
1651 OPJ_FLOAT32 tmp3_2 = fw[1];
1652 OPJ_FLOAT32 tmp3_3 = fw[2];
1653 OPJ_FLOAT32 tmp3_4 = fw[3];
1654 fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c);
1655 fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c);
1656 fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c);
1657 fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c);
1671 for (; m < k; ++m) {
1672 OPJ_FLOAT32 tmp1 = fw[-4];
1673 OPJ_FLOAT32 tmp2 = fw[-3];
1674 OPJ_FLOAT32 tmp3 = fw[-2];
1675 OPJ_FLOAT32 tmp4 = fw[-1];
1688 /* Inverse 9-7 wavelet transform in 1-D. */
1690 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt)
1693 if (dwt->cas == 0) {
1694 if (!((dwt->dn > 0) || (dwt->sn > 1))) {
1700 if (!((dwt->sn > 0) || (dwt->dn > 1))) {
1707 opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->sn, _mm_set1_ps(opj_K));
1708 opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->dn, _mm_set1_ps(opj_c13318));
1709 opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1710 opj_int_min(dwt->sn, dwt->dn - a), _mm_set1_ps(opj_dwt_delta));
1711 opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1712 opj_int_min(dwt->dn, dwt->sn - b), _mm_set1_ps(opj_dwt_gamma));
1713 opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1714 opj_int_min(dwt->sn, dwt->dn - a), _mm_set1_ps(opj_dwt_beta));
1715 opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1716 opj_int_min(dwt->dn, dwt->sn - b), _mm_set1_ps(opj_dwt_alpha));
1718 opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->sn, opj_K);
1719 opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->dn, opj_c13318);
1720 opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1721 opj_int_min(dwt->sn, dwt->dn - a), opj_dwt_delta);
1722 opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1723 opj_int_min(dwt->dn, dwt->sn - b), opj_dwt_gamma);
1724 opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1725 opj_int_min(dwt->sn, dwt->dn - a), opj_dwt_beta);
1726 opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1727 opj_int_min(dwt->dn, dwt->sn - b), opj_dwt_alpha);
1733 /* Inverse 9-7 wavelet transform in 2-D. */
1735 OPJ_BOOL opj_dwt_decode_real(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
1741 opj_tcd_resolution_t* res = tilec->resolutions;
1743 OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 -
1744 res->x0); /* width of the resolution level computed */
1745 OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 -
1746 res->y0); /* height of the resolution level computed */
1748 OPJ_UINT32 w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
1752 l_data_size = opj_dwt_max_resolution(res, numres);
1753 /* overflow check */
1754 if (l_data_size > (SIZE_MAX - 5U)) {
1755 /* FIXME event manager error callback */
1759 /* overflow check */
1760 if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) {
1761 /* FIXME event manager error callback */
1764 h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t));
1766 /* FIXME event manager error callback */
1769 v.wavelet = h.wavelet;
1772 OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data;
1773 OPJ_UINT32 bufsize = (OPJ_UINT32)((tilec->x1 - tilec->x0) *
1774 (tilec->y1 - tilec->y0));
1777 h.sn = (OPJ_INT32)rw;
1778 v.sn = (OPJ_INT32)rh;
1782 rw = (OPJ_UINT32)(res->x1 -
1783 res->x0); /* width of the resolution level computed */
1784 rh = (OPJ_UINT32)(res->y1 -
1785 res->y0); /* height of the resolution level computed */
1787 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
1788 h.cas = res->x0 % 2;
1790 for (j = (OPJ_INT32)rh; j > 3; j -= 4) {
1792 opj_v4dwt_interleave_h(&h, aj, (OPJ_INT32)w, (OPJ_INT32)bufsize);
1793 opj_v4dwt_decode(&h);
1795 for (k = (OPJ_INT32)rw; --k >= 0;) {
1796 aj[k ] = h.wavelet[k].f[0];
1797 aj[k + (OPJ_INT32)w ] = h.wavelet[k].f[1];
1798 aj[k + (OPJ_INT32)w * 2] = h.wavelet[k].f[2];
1799 aj[k + (OPJ_INT32)w * 3] = h.wavelet[k].f[3];
1809 opj_v4dwt_interleave_h(&h, aj, (OPJ_INT32)w, (OPJ_INT32)bufsize);
1810 opj_v4dwt_decode(&h);
1811 for (k = (OPJ_INT32)rw; --k >= 0;) {
1814 aj[k + (OPJ_INT32)w * 2] = h.wavelet[k].f[2];
1817 aj[k + (OPJ_INT32)w ] = h.wavelet[k].f[1];
1820 aj[k ] = h.wavelet[k].f[0];
1825 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
1826 v.cas = res->y0 % 2;
1828 aj = (OPJ_FLOAT32*) tilec->data;
1829 for (j = (OPJ_INT32)rw; j > 3; j -= 4) {
1832 opj_v4dwt_interleave_v(&v, aj, (OPJ_INT32)w, 4);
1833 opj_v4dwt_decode(&v);
1835 for (k = 0; k < rh; ++k) {
1836 memcpy(&aj[k * w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32));
1846 opj_v4dwt_interleave_v(&v, aj, (OPJ_INT32)w, j);
1847 opj_v4dwt_decode(&v);
1849 for (k = 0; k < rh; ++k) {
1850 memcpy(&aj[k * w], &v.wavelet[k], (size_t)j * sizeof(OPJ_FLOAT32));
1855 opj_aligned_free(h.wavelet);