+/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
+ * 16 in AVX2, when the top-most pixel is on an even coordinate */
+static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
+ OPJ_INT32* tmp,
+ const OPJ_INT32 sn,
+ const OPJ_INT32 len,
+ OPJ_INT32* tiledp_col,
+ const OPJ_SIZE_T stride)
+{
+ const OPJ_INT32* in_even = &tiledp_col[0];
+ const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride];
+
+ OPJ_INT32 i;
+ OPJ_SIZE_T j;
+ VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
+ VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
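+ /* Rounding constant: the "+ 2" term of the (a + b + 2) >> 2 lifting steps */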
+ const VREG two = LOAD_CST(2);
+
+ assert(len > 1);
+#ifdef __AVX2__
+ assert(PARALLEL_COLS_53 == 16);
+ assert(VREG_INT_COUNT == 8);
+#else
+ assert(PARALLEL_COLS_53 == 8);
+ assert(VREG_INT_COUNT == 4);
+#endif
+
+ /* Note: loads of input even/odd values must be done in an unaligned */
+ /* fashion. But stores in tmp can be done with aligned stores, since */
+ /* the temporary buffer is properly aligned. */
+ assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
+
+ s1n_0 = LOADU(in_even + 0);
+ s1n_1 = LOADU(in_even + VREG_INT_COUNT);
+ d1n_0 = LOADU(in_odd);
+ d1n_1 = LOADU(in_odd + VREG_INT_COUNT);
+
+ /* s0n = s1n - ((d1n + 1) >> 1); <==> */
+ /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
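+ /* The rewrite lets this boundary step reuse the same ADD3/SAR pattern */
+ /* as the interior (d1c + d1n + 2) >> 2 update in the main loop */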
+ s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
+ s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
+
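+ /* Each iteration produces two output rows (one even, one odd); the */
+ /* loop stops early so the stores below can emit the remaining rows */
+ /* (two for even len, three for odd len) */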
+ for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
+ d1c_0 = d1n_0;
+ s0c_0 = s0n_0;
+ d1c_1 = d1n_1;
+ s0c_1 = s0n_1;
+
+ s1n_0 = LOADU(in_even + j * stride);
+ s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT);
+ d1n_0 = LOADU(in_odd + j * stride);
+ d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT);
+
+ /* s0n = s1n - ((d1c + d1n + 2) >> 2); */
+ s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2));
+ s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2));
+
+ STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0);
+ STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1);
+
+ /* d1c + ((s0c + s0n) >> 1) */
+ STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
+ ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1)));
+ STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
+ ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1)));
+ }
+
+ STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0);
+ STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1);
+
+ if (len & 1) {
+ VREG tmp_len_minus_1;
+ s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride);
+ /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
+ tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
+ STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1);
+ /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
+ STORE(tmp + PARALLEL_COLS_53 * (len - 2),
+ ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1)));
+
+ s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT);
+ /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
+ tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
+ STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
+ tmp_len_minus_1);
+ /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
+ STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
+ ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1)));
+
+ } else {
+ STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0,
+ ADD(d1n_0, s0n_0));
+ STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
+ ADD(d1n_1, s0n_1));
+ }
+
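+ /* Copy the PARALLEL_COLS_53-interleaved tmp buffer back into the */
+ /* strided tile columns */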
+ opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
+}
+
+
+/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
+ * 16 in AVX2, when the top-most pixel is on an odd coordinate */
+static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
+ OPJ_INT32* tmp,
+ const OPJ_INT32 sn,
+ const OPJ_INT32 len,
+ OPJ_INT32* tiledp_col,
+ const OPJ_SIZE_T stride)
+{
+ OPJ_INT32 i;
+ OPJ_SIZE_T j;
+
+ VREG s1_0, s2_0, dc_0, dn_0;
+ VREG s1_1, s2_1, dc_1, dn_1;
+ const VREG two = LOAD_CST(2);
+
+ const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
+ const OPJ_INT32* in_odd = &tiledp_col[0];
+
+ assert(len > 2);
+#ifdef __AVX2__
+ assert(PARALLEL_COLS_53 == 16);
+ assert(VREG_INT_COUNT == 8);
+#else
+ assert(PARALLEL_COLS_53 == 8);
+ assert(VREG_INT_COUNT == 4);
+#endif
+
+ /* Note: loads of input even/odd values must be done in an unaligned */
+ /* fashion. But stores in tmp can be done with aligned stores, since */
+ /* the temporary buffer is properly aligned. */
+ assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
+
+ s1_0 = LOADU(in_even + stride);
+ /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
+ dc_0 = SUB(LOADU(in_odd + 0),
+ SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2));
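+ /* tmp[0] = in_even[0] + dc */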
+ STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0));
+
+ s1_1 = LOADU(in_even + stride + VREG_INT_COUNT);
+ /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
+ dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT),
+ SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2));
+ STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT,
+ ADD(LOADU(in_even + VREG_INT_COUNT), dc_1));
+
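+ /* The loop bound stops early enough that the stores after the loop */
+ /* can emit the remaining rows: two for odd len, three for even len */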
+ for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
+
+ s2_0 = LOADU(in_even + (j + 1) * stride);
+ s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT);
+
+ /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
+ dn_0 = SUB(LOADU(in_odd + j * stride),
+ SAR(ADD3(s1_0, s2_0, two), 2));
+ dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT),
+ SAR(ADD3(s1_1, s2_1, two), 2));
+
+ STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
+ STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
+
+ /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */
+ STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
+ ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
+ STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
+ ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
+
+ dc_0 = dn_0;
+ s1_0 = s2_0;
+ dc_1 = dn_1;
+ s1_1 = s2_1;
+ }
+ STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
+ STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
+
+ if (!(len & 1)) {
+ /* dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
+ dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride),
+ SAR(ADD3(s1_0, s1_0, two), 2));
+ dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT),
+ SAR(ADD3(s1_1, s1_1, two), 2));
+
+ /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */
+ STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0,
+ ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
+ STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
+ ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
+
+ STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0);
+ STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1);
+ } else {
+ STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0));
+ STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
+ ADD(s1_1, dc_1));
+ }
+
+ opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
+}
+
+#undef VREG
+#undef LOAD_CST
+#undef LOADU
+#undef LOAD
+#undef STORE
+#undef STOREU
+#undef ADD
+#undef ADD3
+#undef SUB
+#undef SAR
+
+#endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */
+
+#if !defined(STANDARD_SLOW_VERSION)
+/** Vertical inverse 5x3 wavelet transform for one column, when the
+ * top-most pixel is on an even coordinate */
+static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
+ const OPJ_INT32 sn,
+ const OPJ_INT32 len,
+ OPJ_INT32* tiledp_col,
+ const OPJ_SIZE_T stride)
+{
+ OPJ_INT32 i, j;
+ OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
+
+ assert(len > 1);
+
+ /* Performs lifting in a single iteration, which saves memory */
+ /* accesses and avoids an explicit interleaving pass. */
+
+ s1n = tiledp_col[0];
+ d1n = tiledp_col[(OPJ_SIZE_T)sn * stride];
+ s0n = s1n - ((d1n + 1) >> 1);
+
+ for (i = 0, j = 0; i < (len - 3); i += 2, j++) {
+ d1c = d1n;
+ s0c = s0n;
+
+ s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride];
+ d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride];
+
+ s0n = s1n - ((d1c + d1n + 2) >> 2);
+
+ tmp[i ] = s0c;
+ tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
+ }
+
+ tmp[i] = s0n;
+
+ if (len & 1) {
+ tmp[len - 1] =
+ tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] -
+ ((d1n + 1) >> 1);
+ tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
+ } else {
+ tmp[len - 1] = d1n + s0n;
+ }
+
+ for (i = 0; i < len; ++i) {
+ tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
+ }
+}
+
+
+/** Vertical inverse 5x3 wavelet transform for one column, when the
+ * top-most pixel is on an odd coordinate */
+static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
+ const OPJ_INT32 sn,
+ const OPJ_INT32 len,
+ OPJ_INT32* tiledp_col,
+ const OPJ_SIZE_T stride)
+{
+ OPJ_INT32 i, j;
+ OPJ_INT32 s1, s2, dc, dn;
+ const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
+ const OPJ_INT32* in_odd = &tiledp_col[0];
+
+ assert(len > 2);
+
+ /* Performs lifting in a single iteration, which saves memory */
+ /* accesses and avoids an explicit interleaving pass. */
+
+ s1 = in_even[stride];
+ dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
+ tmp[0] = in_even[0] + dc;
+ for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
+
+ s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride];
+
+ dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2);
+ tmp[i ] = dc;
+ tmp[i + 1] = s1 + ((dn + dc) >> 1);
+
+ dc = dn;
+ s1 = s2;
+ }
+ tmp[i] = dc;
+ if (!(len & 1)) {
+ dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1);
+ tmp[len - 2] = s1 + ((dn + dc) >> 1);
+ tmp[len - 1] = dn;
+ } else {
+ tmp[len - 1] = s1 + dc;
+ }
+
+ for (i = 0; i < len; ++i) {
+ tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
+ }
+}
+#endif /* !defined(STANDARD_SLOW_VERSION) */
+
+/* <summary> */
+/* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */
+/* </summary> */
+/* Performs interleave, inverse wavelet transform and copy back to buffer */
+static void opj_idwt53_v(const opj_dwt_t *dwt,
+ OPJ_INT32* tiledp_col,
+ OPJ_SIZE_T stride,
+ OPJ_INT32 nb_cols)
+{
+#ifdef STANDARD_SLOW_VERSION
+ /* For documentation purposes */
+ OPJ_INT32 k, c;
+ for (c = 0; c < nb_cols; c ++) {
+ opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
+ opj_dwt_decode_1(dwt);
+ for (k = 0; k < dwt->sn + dwt->dn; ++k) {
+ tiledp_col[c + k * stride] = dwt->mem[k];
+ }
+ }
+#else
+ const OPJ_INT32 sn = dwt->sn;
+ const OPJ_INT32 len = sn + dwt->dn;
+ if (dwt->cas == 0) {
+ /* If len == 1, the sample is left unmodified: none of the branches */
+ /* below fire and the function simply returns */
+
+#if (defined(__SSE2__) || defined(__AVX2__))
+ if (len > 1 && nb_cols == PARALLEL_COLS_53) {
+ /* Same as the general case below, except that thanks to SSE2/AVX2 */
+ /* we can efficiently process 8/16 columns in parallel */
+ opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
+ return;
+ }
+#endif
+ if (len > 1) {
+ OPJ_INT32 c;
+ for (c = 0; c < nb_cols; c++, tiledp_col++) {
+ opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
+ }
+ return;
+ }
+ } else {
+ if (len == 1) {
+ OPJ_INT32 c;
+ for (c = 0; c < nb_cols; c++, tiledp_col++) {
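+ /* A lone coefficient in the odd-phase (cas == 1) case is a */
+ /* detail sample that the inverse transform halves */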
+ tiledp_col[0] /= 2;
+ }
+ return;
+ }
+
+ if (len == 2) {
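+ /* Two-sample column: apply the two lifting steps directly, */
+ /* without the general loop */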
+ OPJ_INT32 c;
+ OPJ_INT32* out = dwt->mem;
+ for (c = 0; c < nb_cols; c++, tiledp_col++) {
+ OPJ_INT32 i;
+ const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
+ const OPJ_INT32* in_odd = &tiledp_col[0];
+
+ out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
+ out[0] = in_even[0] + out[1];
+
+ for (i = 0; i < len; ++i) {
+ tiledp_col[(OPJ_SIZE_T)i * stride] = out[i];
+ }
+ }
+
+ return;
+ }
+
+#if (defined(__SSE2__) || defined(__AVX2__))
+ if (len > 2 && nb_cols == PARALLEL_COLS_53) {
+ /* Same as the general case below, except that thanks to SSE2/AVX2 */
+ /* we can efficiently process 8/16 columns in parallel */
+ opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
+ return;
+ }
+#endif
+ if (len > 2) {
+ OPJ_INT32 c;
+ for (c = 0; c < nb_cols; c++, tiledp_col++) {
+ opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
+ }
+ return;
+ }
+ }
+#endif
+}
+
+