2 * The copyright in this software is being made available under the 2-clauses
3 * BSD License, included below. This software may be subject to other third
4 * party and contributor rights, including patent rights, and no such rights
5 * are granted under this license.
7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
8 * Copyright (c) 2002-2014, Professor Benoit Macq
9 * Copyright (c) 2001-2003, David Janssens
10 * Copyright (c) 2002-2003, Yannick Verschueren
11 * Copyright (c) 2003-2007, Francois-Olivier Devaux
12 * Copyright (c) 2003-2014, Antonin Descampe
13 * Copyright (c) 2005, Herve Drolon, FreeImage Team
14 * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
15 * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
16 * All rights reserved.
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
21 * 1. Redistributions of source code must retain the above copyright
22 * notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 * notice, this list of conditions and the following disclaimer in the
25 * documentation and/or other materials provided with the distribution.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
28 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
41 #include <xmmintrin.h>
44 #include "opj_includes.h"
46 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
49 #define OPJ_WS(i) v->mem[(i)*2]
50 #define OPJ_WD(i) v->mem[(1+(i)*2)]
52 /** @name Local data structures */
55 typedef struct dwt_local {
66 typedef struct v4dwt_local {
73 static const OPJ_FLOAT32 opj_dwt_alpha = 1.586134342f; /* 12994 */
74 static const OPJ_FLOAT32 opj_dwt_beta = 0.052980118f; /* 434 */
75 static const OPJ_FLOAT32 opj_dwt_gamma = -0.882911075f; /* -7233 */
76 static const OPJ_FLOAT32 opj_dwt_delta = -0.443506852f; /* -3633 */
78 static const OPJ_FLOAT32 opj_K = 1.230174105f; /* 10078 */
79 static const OPJ_FLOAT32 opj_c13318 = 1.625732422f;
84 Virtual function type for wavelet transform in 1-D
86 typedef void (*DWT1DFN)(opj_dwt_t* v);
88 /** @name Local static functions */
92 Forward lazy transform (horizontal)
94 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
95 OPJ_INT32 sn, OPJ_INT32 cas);
97 Forward lazy transform (vertical)
99 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
100 OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas);
102 Inverse lazy transform (horizontal)
104 static void opj_dwt_interleave_h(opj_dwt_t* h, OPJ_INT32 *a);
106 Inverse lazy transform (vertical)
108 static void opj_dwt_interleave_v(opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x);
110 Forward 5-3 wavelet transform in 1-D
112 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
115 Inverse 5-3 wavelet transform in 1-D
117 static void opj_dwt_decode_1(opj_dwt_t *v);
118 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
121 Forward 9-7 wavelet transform in 1-D
123 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
126 Explicit calculation of the Quantization Stepsizes
128 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
129 opj_stepsize_t *bandno_stepsize);
131 Inverse wavelet transform in 2-D.
133 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
134 opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i, DWT1DFN fn);
136 static OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
137 void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32));
139 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
143 /* Inverse 9-7 wavelet transform in 1-D. */
145 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt);
147 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT w,
148 OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 size);
150 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT v,
151 OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 nb_elts_read);
154 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count,
157 static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
158 OPJ_INT32 m, __m128 c);
161 static void opj_v4dwt_decode_step1(opj_v4_t* w, OPJ_INT32 count,
162 const OPJ_FLOAT32 c);
164 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
165 OPJ_INT32 m, OPJ_FLOAT32 c);
173 #define OPJ_S(i) a[(i)*2]
174 #define OPJ_D(i) a[(1+(i)*2)]
175 #define OPJ_S_(i) ((i)<0?OPJ_S(0):((i)>=sn?OPJ_S(sn-1):OPJ_S(i)))
176 #define OPJ_D_(i) ((i)<0?OPJ_D(0):((i)>=dn?OPJ_D(dn-1):OPJ_D(i)))
178 #define OPJ_SS_(i) ((i)<0?OPJ_S(0):((i)>=dn?OPJ_S(dn-1):OPJ_S(i)))
179 #define OPJ_DD_(i) ((i)<0?OPJ_D(0):((i)>=sn?OPJ_D(sn-1):OPJ_D(i)))
182 /* This table contains the norms of the 5-3 wavelets for different bands. */
184 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = {
185 {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
186 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
187 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
188 {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
192 /* This table contains the norms of the 9-7 wavelets for different bands. */
194 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = {
195 {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
196 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
197 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
198 {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
202 ==========================================================
204 ==========================================================
208 /* Forward lazy transform (horizontal). */
210 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
211 OPJ_INT32 sn, OPJ_INT32 cas)
214 OPJ_INT32 * l_dest = b;
215 OPJ_INT32 * l_src = a + cas;
217 for (i = 0; i < sn; ++i) {
225 for (i = 0; i < dn; ++i) {
232 /* Forward lazy transform (vertical). */
234 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
235 OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas)
238 OPJ_INT32 * l_dest = b;
239 OPJ_INT32 * l_src = a + cas;
245 } /* b[i*x]=a[2*i+cas]; */
255 } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/
259 /* Inverse lazy transform (horizontal). */
261 static void opj_dwt_interleave_h(opj_dwt_t* h, OPJ_INT32 *a)
264 OPJ_INT32 *bi = h->mem + h->cas;
271 bi = h->mem + 1 - h->cas;
280 /* Inverse lazy transform (vertical). */
282 static void opj_dwt_interleave_v(opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
285 OPJ_INT32 *bi = v->mem + v->cas;
292 ai = a + (v->sn * x);
293 bi = v->mem + 1 - v->cas;
304 /* Forward 5-3 wavelet transform in 1-D. */
306 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
312 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
313 for (i = 0; i < dn; i++) {
314 OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
316 for (i = 0; i < sn; i++) {
317 OPJ_S(i) += (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
321 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
324 for (i = 0; i < dn; i++) {
325 OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
327 for (i = 0; i < sn; i++) {
328 OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
335 /* Inverse 5-3 wavelet transform in 1-D. */
337 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
343 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
344 for (i = 0; i < sn; i++) {
345 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
347 for (i = 0; i < dn; i++) {
348 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
352 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
355 for (i = 0; i < sn; i++) {
356 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
358 for (i = 0; i < dn; i++) {
359 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
366 /* Inverse 5-3 wavelet transform in 1-D. */
368 static void opj_dwt_decode_1(opj_dwt_t *v)
370 opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
374 /* Forward 9-7 wavelet transform in 1-D. */
376 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
381 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
382 for (i = 0; i < dn; i++) {
383 OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993);
385 for (i = 0; i < sn; i++) {
386 OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434);
388 for (i = 0; i < dn; i++) {
389 OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233);
391 for (i = 0; i < sn; i++) {
392 OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633);
394 for (i = 0; i < dn; i++) {
395 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */
397 for (i = 0; i < sn; i++) {
398 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */
402 if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */
403 for (i = 0; i < dn; i++) {
404 OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993);
406 for (i = 0; i < sn; i++) {
407 OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434);
409 for (i = 0; i < dn; i++) {
410 OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233);
412 for (i = 0; i < sn; i++) {
413 OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633);
415 for (i = 0; i < dn; i++) {
416 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */
418 for (i = 0; i < sn; i++) {
419 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */
425 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
426 opj_stepsize_t *bandno_stepsize)
429 p = opj_int_floorlog2(stepsize) - 13;
430 n = 11 - opj_int_floorlog2(stepsize);
431 bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
432 bandno_stepsize->expn = numbps - p;
436 ==========================================================
438 ==========================================================
443 /* Forward 5-3 wavelet transform in 2-D. */
445 static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,
446 void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32))
454 OPJ_INT32 rw; /* width of the resolution level computed */
455 OPJ_INT32 rh; /* height of the resolution level computed */
458 opj_tcd_resolution_t * l_cur_res = 0;
459 opj_tcd_resolution_t * l_last_res = 0;
461 w = tilec->x1 - tilec->x0;
462 l = (OPJ_INT32)tilec->numresolutions - 1;
465 l_cur_res = tilec->resolutions + l;
466 l_last_res = l_cur_res - 1;
468 l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions);
470 if (l_data_size > (SIZE_MAX / sizeof(OPJ_INT32))) {
471 /* FIXME event manager error callback */
474 l_data_size *= sizeof(OPJ_INT32);
475 bj = (OPJ_INT32*)opj_malloc(l_data_size);
476 /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
477 /* in that case, so do not error out */
478 if (l_data_size != 0 && ! bj) {
484 OPJ_INT32 rw1; /* width of the resolution level once lower than computed one */
485 OPJ_INT32 rh1; /* height of the resolution level once lower than computed one */
486 OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
487 OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */
490 rw = l_cur_res->x1 - l_cur_res->x0;
491 rh = l_cur_res->y1 - l_cur_res->y0;
492 rw1 = l_last_res->x1 - l_last_res->x0;
493 rh1 = l_last_res->y1 - l_last_res->y0;
495 cas_row = l_cur_res->x0 & 1;
496 cas_col = l_cur_res->y0 & 1;
500 for (j = 0; j < rw; ++j) {
502 for (k = 0; k < rh; ++k) {
506 (*p_function)(bj, dn, sn, cas_col);
508 opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
514 for (j = 0; j < rh; j++) {
516 for (k = 0; k < rw; k++) {
519 (*p_function)(bj, dn, sn, cas_row);
520 opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
523 l_cur_res = l_last_res;
532 /* Forward 5-3 wavelet transform in 2-D. */
534 OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec)
536 return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1);
540 /* Inverse 5-3 wavelet transform in 2-D. */
542 OPJ_BOOL opj_dwt_decode(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec,
545 return opj_dwt_decode_tile(tp, tilec, numres, &opj_dwt_decode_1);
550 /* Get gain of 5-3 wavelet transform. */
552 OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient)
557 if (orient == 1 || orient == 2) {
564 /* Get norm of 5-3 wavelet. */
566 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
568 return opj_dwt_norms[orient][level];
572 /* Forward 9-7 wavelet transform in 2-D. */
574 OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec)
576 return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real);
580 /* Get gain of 9-7 wavelet transform. */
582 OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient)
589 /* Get norm of 9-7 wavelet. */
591 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
593 return opj_dwt_norms_real[orient][level];
596 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
598 OPJ_UINT32 numbands, bandno;
599 numbands = 3 * tccp->numresolutions - 2;
600 for (bandno = 0; bandno < numbands; bandno++) {
601 OPJ_FLOAT64 stepsize;
602 OPJ_UINT32 resno, level, orient, gain;
604 resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
605 orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
606 level = tccp->numresolutions - 1 - resno;
607 gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) ||
608 (orient == 2)) ? 1 : 2));
609 if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
612 OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level];
613 stepsize = (1 << (gain)) / norm;
615 opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0),
616 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);
621 /* Determine maximum computed resolution level for inverse wavelet transform */
623 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
630 if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) {
633 if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) {
645 OPJ_INT32 * OPJ_RESTRICT tiledp;
648 } opj_dwd_decode_h_job_t;
650 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
653 opj_dwd_decode_h_job_t* job;
656 job = (opj_dwd_decode_h_job_t*)user_data;
657 for (j = job->min_j; j < job->max_j; j++) {
658 opj_dwt_interleave_h(&job->h, &job->tiledp[j * job->w]);
659 (job->dwt_1D)(&job->h);
660 memcpy(&job->tiledp[j * job->w], job->h.mem, job->rw * sizeof(OPJ_INT32));
663 opj_aligned_free(job->h.mem);
672 OPJ_INT32 * OPJ_RESTRICT tiledp;
675 } opj_dwd_decode_v_job_t;
677 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
680 opj_dwd_decode_v_job_t* job;
683 job = (opj_dwd_decode_v_job_t*)user_data;
684 for (j = job->min_j; j < job->max_j; j++) {
686 opj_dwt_interleave_v(&job->v, &job->tiledp[j], (OPJ_INT32)job->w);
687 (job->dwt_1D)(&job->v);
688 for (k = 0; k < job->rh; ++k) {
689 job->tiledp[k * job->w + j] = job->v.mem[k];
693 opj_aligned_free(job->v.mem);
699 /* Inverse wavelet transform in 2-D. */
701 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
702 opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres, DWT1DFN dwt_1D)
707 opj_tcd_resolution_t* tr = tilec->resolutions;
709 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
710 tr->x0); /* width of the resolution level computed */
711 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
712 tr->y0); /* height of the resolution level computed */
714 OPJ_UINT32 w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
721 num_threads = opj_thread_pool_get_thread_count(tp);
722 h_mem_size = opj_dwt_max_resolution(tr, numres);
724 if (h_mem_size > (SIZE_MAX / sizeof(OPJ_INT32))) {
725 /* FIXME event manager error callback */
728 h_mem_size *= sizeof(OPJ_INT32);
729 h.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size);
731 /* FIXME event manager error callback */
738 OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
742 h.sn = (OPJ_INT32)rw;
743 v.sn = (OPJ_INT32)rh;
745 rw = (OPJ_UINT32)(tr->x1 - tr->x0);
746 rh = (OPJ_UINT32)(tr->y1 - tr->y0);
748 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
751 if (num_threads <= 1 || rh <= 1) {
752 for (j = 0; j < rh; ++j) {
753 opj_dwt_interleave_h(&h, &tiledp[j * w]);
755 memcpy(&tiledp[j * w], h.mem, rw * sizeof(OPJ_INT32));
758 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
764 step_j = (rh / num_jobs);
766 for (j = 0; j < num_jobs; j++) {
767 opj_dwd_decode_h_job_t* job;
769 job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t));
771 /* It would be nice to fallback to single thread case, but */
772 /* unfortunately some jobs may be launched and have modified */
773 /* tiledp, so it is not practical to recover from that error */
774 /* FIXME event manager error callback */
775 opj_thread_pool_wait_completion(tp, 0);
776 opj_aligned_free(h.mem);
780 job->dwt_1D = dwt_1D;
783 job->tiledp = tiledp;
784 job->min_j = j * step_j;
785 job->max_j = (j + 1U) * step_j; /* this can overflow */
786 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
789 job->h.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size);
791 /* FIXME event manager error callback */
792 opj_thread_pool_wait_completion(tp, 0);
794 opj_aligned_free(h.mem);
797 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
799 opj_thread_pool_wait_completion(tp, 0);
802 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
805 if (num_threads <= 1 || rw <= 1) {
806 for (j = 0; j < rw; ++j) {
809 opj_dwt_interleave_v(&v, &tiledp[j], (OPJ_INT32)w);
811 for (k = 0; k < rh; ++k) {
812 tiledp[k * w + j] = v.mem[k];
816 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
822 step_j = (rw / num_jobs);
824 for (j = 0; j < num_jobs; j++) {
825 opj_dwd_decode_v_job_t* job;
827 job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t));
829 /* It would be nice to fallback to single thread case, but */
830 /* unfortunately some jobs may be launched and have modified */
831 /* tiledp, so it is not practical to recover from that error */
832 /* FIXME event manager error callback */
833 opj_thread_pool_wait_completion(tp, 0);
834 opj_aligned_free(v.mem);
838 job->dwt_1D = dwt_1D;
841 job->tiledp = tiledp;
842 job->min_j = j * step_j;
843 job->max_j = (j + 1U) * step_j; /* this can overflow */
844 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
847 job->v.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size);
849 /* FIXME event manager error callback */
850 opj_thread_pool_wait_completion(tp, 0);
852 opj_aligned_free(v.mem);
855 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
857 opj_thread_pool_wait_completion(tp, 0);
860 opj_aligned_free(h.mem);
864 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT w,
865 OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 size)
867 OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(w->wavelet + w->cas);
868 OPJ_INT32 count = w->sn;
871 for (k = 0; k < 2; ++k) {
872 if (count + 3 * x < size && ((size_t) a & 0x0f) == 0 &&
873 ((size_t) bi & 0x0f) == 0 && (x & 0x0f) == 0) {
875 for (i = 0; i < count; ++i) {
879 bi[i * 8 + 1] = a[j];
881 bi[i * 8 + 2] = a[j];
883 bi[i * 8 + 3] = a[j];
887 for (i = 0; i < count; ++i) {
894 bi[i * 8 + 1] = a[j];
899 bi[i * 8 + 2] = a[j];
904 bi[i * 8 + 3] = a[j]; /* This one*/
908 bi = (OPJ_FLOAT32*)(w->wavelet + 1 - w->cas);
915 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT v,
916 OPJ_FLOAT32* OPJ_RESTRICT a, OPJ_INT32 x, OPJ_INT32 nb_elts_read)
918 opj_v4_t* OPJ_RESTRICT bi = v->wavelet + v->cas;
921 for (i = 0; i < v->sn; ++i) {
922 memcpy(&bi[i * 2], &a[i * x], (size_t)nb_elts_read * sizeof(OPJ_FLOAT32));
926 bi = v->wavelet + 1 - v->cas;
928 for (i = 0; i < v->dn; ++i) {
929 memcpy(&bi[i * 2], &a[i * x], (size_t)nb_elts_read * sizeof(OPJ_FLOAT32));
935 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count,
938 __m128* OPJ_RESTRICT vw = (__m128*) w;
940 /* 4x unrolled loop */
941 for (i = 0; i < count >> 2; ++i) {
942 *vw = _mm_mul_ps(*vw, c);
944 *vw = _mm_mul_ps(*vw, c);
946 *vw = _mm_mul_ps(*vw, c);
948 *vw = _mm_mul_ps(*vw, c);
952 for (i = 0; i < count; ++i) {
953 *vw = _mm_mul_ps(*vw, c);
958 void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
959 OPJ_INT32 m, __m128 c)
961 __m128* OPJ_RESTRICT vl = (__m128*) l;
962 __m128* OPJ_RESTRICT vw = (__m128*) w;
964 __m128 tmp1, tmp2, tmp3;
966 for (i = 0; i < m; ++i) {
969 vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
977 c = _mm_add_ps(c, c);
978 c = _mm_mul_ps(c, vl[0]);
981 vw[-1] = _mm_add_ps(tmp, c);
988 static void opj_v4dwt_decode_step1(opj_v4_t* w, OPJ_INT32 count,
991 OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
993 for (i = 0; i < count; ++i) {
994 OPJ_FLOAT32 tmp1 = fw[i * 8 ];
995 OPJ_FLOAT32 tmp2 = fw[i * 8 + 1];
996 OPJ_FLOAT32 tmp3 = fw[i * 8 + 2];
997 OPJ_FLOAT32 tmp4 = fw[i * 8 + 3];
998 fw[i * 8 ] = tmp1 * c;
999 fw[i * 8 + 1] = tmp2 * c;
1000 fw[i * 8 + 2] = tmp3 * c;
1001 fw[i * 8 + 3] = tmp4 * c;
1005 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k,
1006 OPJ_INT32 m, OPJ_FLOAT32 c)
1008 OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
1009 OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
1011 for (i = 0; i < m; ++i) {
1012 OPJ_FLOAT32 tmp1_1 = fl[0];
1013 OPJ_FLOAT32 tmp1_2 = fl[1];
1014 OPJ_FLOAT32 tmp1_3 = fl[2];
1015 OPJ_FLOAT32 tmp1_4 = fl[3];
1016 OPJ_FLOAT32 tmp2_1 = fw[-4];
1017 OPJ_FLOAT32 tmp2_2 = fw[-3];
1018 OPJ_FLOAT32 tmp2_3 = fw[-2];
1019 OPJ_FLOAT32 tmp2_4 = fw[-1];
1020 OPJ_FLOAT32 tmp3_1 = fw[0];
1021 OPJ_FLOAT32 tmp3_2 = fw[1];
1022 OPJ_FLOAT32 tmp3_3 = fw[2];
1023 OPJ_FLOAT32 tmp3_4 = fw[3];
1024 fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c);
1025 fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c);
1026 fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c);
1027 fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c);
1041 for (; m < k; ++m) {
1042 OPJ_FLOAT32 tmp1 = fw[-4];
1043 OPJ_FLOAT32 tmp2 = fw[-3];
1044 OPJ_FLOAT32 tmp3 = fw[-2];
1045 OPJ_FLOAT32 tmp4 = fw[-1];
1058 /* Inverse 9-7 wavelet transform in 1-D. */
1060 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt)
1063 if (dwt->cas == 0) {
1064 if (!((dwt->dn > 0) || (dwt->sn > 1))) {
1070 if (!((dwt->sn > 0) || (dwt->dn > 1))) {
1077 opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->sn, _mm_set1_ps(opj_K));
1078 opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->dn, _mm_set1_ps(opj_c13318));
1079 opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1080 opj_int_min(dwt->sn, dwt->dn - a), _mm_set1_ps(opj_dwt_delta));
1081 opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1082 opj_int_min(dwt->dn, dwt->sn - b), _mm_set1_ps(opj_dwt_gamma));
1083 opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1084 opj_int_min(dwt->sn, dwt->dn - a), _mm_set1_ps(opj_dwt_beta));
1085 opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1086 opj_int_min(dwt->dn, dwt->sn - b), _mm_set1_ps(opj_dwt_alpha));
1088 opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->sn, opj_K);
1089 opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->dn, opj_c13318);
1090 opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1091 opj_int_min(dwt->sn, dwt->dn - a), opj_dwt_delta);
1092 opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1093 opj_int_min(dwt->dn, dwt->sn - b), opj_dwt_gamma);
1094 opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, dwt->sn,
1095 opj_int_min(dwt->sn, dwt->dn - a), opj_dwt_beta);
1096 opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, dwt->dn,
1097 opj_int_min(dwt->dn, dwt->sn - b), opj_dwt_alpha);
1103 /* Inverse 9-7 wavelet transform in 2-D. */
1105 OPJ_BOOL opj_dwt_decode_real(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
1111 opj_tcd_resolution_t* res = tilec->resolutions;
1113 OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 -
1114 res->x0); /* width of the resolution level computed */
1115 OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 -
1116 res->y0); /* height of the resolution level computed */
1118 OPJ_UINT32 w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
1122 l_data_size = opj_dwt_max_resolution(res, numres);
1123 /* overflow check */
1124 if (l_data_size > (SIZE_MAX - 5U)) {
1125 /* FIXME event manager error callback */
1129 /* overflow check */
1130 if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) {
1131 /* FIXME event manager error callback */
1134 h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t));
1136 /* FIXME event manager error callback */
1139 v.wavelet = h.wavelet;
1142 OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data;
1143 OPJ_UINT32 bufsize = (OPJ_UINT32)((tilec->x1 - tilec->x0) *
1144 (tilec->y1 - tilec->y0));
1147 h.sn = (OPJ_INT32)rw;
1148 v.sn = (OPJ_INT32)rh;
1152 rw = (OPJ_UINT32)(res->x1 -
1153 res->x0); /* width of the resolution level computed */
1154 rh = (OPJ_UINT32)(res->y1 -
1155 res->y0); /* height of the resolution level computed */
1157 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
1158 h.cas = res->x0 % 2;
1160 for (j = (OPJ_INT32)rh; j > 3; j -= 4) {
1162 opj_v4dwt_interleave_h(&h, aj, (OPJ_INT32)w, (OPJ_INT32)bufsize);
1163 opj_v4dwt_decode(&h);
1165 for (k = (OPJ_INT32)rw; --k >= 0;) {
1166 aj[k ] = h.wavelet[k].f[0];
1167 aj[k + (OPJ_INT32)w ] = h.wavelet[k].f[1];
1168 aj[k + (OPJ_INT32)w * 2] = h.wavelet[k].f[2];
1169 aj[k + (OPJ_INT32)w * 3] = h.wavelet[k].f[3];
1179 opj_v4dwt_interleave_h(&h, aj, (OPJ_INT32)w, (OPJ_INT32)bufsize);
1180 opj_v4dwt_decode(&h);
1181 for (k = (OPJ_INT32)rw; --k >= 0;) {
1184 aj[k + (OPJ_INT32)w * 2] = h.wavelet[k].f[2];
1186 aj[k + (OPJ_INT32)w ] = h.wavelet[k].f[1];
1188 aj[k ] = h.wavelet[k].f[0];
1193 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
1194 v.cas = res->y0 % 2;
1196 aj = (OPJ_FLOAT32*) tilec->data;
1197 for (j = (OPJ_INT32)rw; j > 3; j -= 4) {
1200 opj_v4dwt_interleave_v(&v, aj, (OPJ_INT32)w, 4);
1201 opj_v4dwt_decode(&v);
1203 for (k = 0; k < rh; ++k) {
1204 memcpy(&aj[k * w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32));
1214 opj_v4dwt_interleave_v(&v, aj, (OPJ_INT32)w, j);
1215 opj_v4dwt_decode(&v);
1217 for (k = 0; k < rh; ++k) {
1218 memcpy(&aj[k * w], &v.wavelet[k], (size_t)j * sizeof(OPJ_FLOAT32));
1223 opj_aligned_free(h.wavelet);