Fix 2.2.0 regression when reading codestream with repeated calls to opj_get_decoded_...
[openjpeg.git] / src / lib / openjp2 / t1.c
index 953c7ab140d1a3108251208a033f05d34c99403c..a583e692088be2834fe9628398661f698271149c 100644 (file)
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#define OPJ_SKIP_POISON
 #include "opj_includes.h"
+
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#if defined(__GNUC__)
+#pragma GCC poison malloc calloc realloc free
+#endif
+
 #include "t1_luts.h"
 
 /** @defgroup T1 T1 - Implementation of the tier-1 coding */
@@ -1439,7 +1452,7 @@ static OPJ_BOOL opj_t1_allocate_buffers(
     if (!t1->encoder) {
         OPJ_UINT32 datasize = w * h;
 
-        if (datasize > (size_t)t1->datasize) {
+        if (datasize > t1->datasize) {
             opj_aligned_free(t1->data);
             t1->data = (OPJ_INT32*) opj_aligned_malloc(datasize * sizeof(OPJ_INT32));
             if (!t1->data) {
@@ -1563,6 +1576,7 @@ void opj_t1_destroy(opj_t1_t *p_t1)
 }
 
 typedef struct {
+    OPJ_BOOL whole_tile_decoding;
     OPJ_UINT32 resno;
     opj_tcd_cblk_dec_t* cblk;
     opj_tcd_band_t* band;
@@ -1596,8 +1610,37 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls)
     OPJ_UINT32 tile_w;
 
     job = (opj_t1_cblk_decode_processing_job_t*) user_data;
-    resno = job->resno;
+
     cblk = job->cblk;
+
+    if (!job->whole_tile_decoding) {
+        cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0);
+        cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0);
+
+        cblk->decoded_data = opj_aligned_malloc(cblk_w * cblk_h * sizeof(OPJ_INT32));
+        if (cblk->decoded_data == NULL) {
+            if (job->p_manager_mutex) {
+                opj_mutex_lock(job->p_manager_mutex);
+            }
+            opj_event_msg(job->p_manager, EVT_ERROR,
+                          "Cannot allocate cblk->decoded_data\n");
+            if (job->p_manager_mutex) {
+                opj_mutex_unlock(job->p_manager_mutex);
+            }
+            *(job->pret) = OPJ_FALSE;
+            opj_free(job);
+            return;
+        }
+        /* Zero-init required */
+        memset(cblk->decoded_data, 0, cblk_w * cblk_h * sizeof(OPJ_INT32));
+    } else if (cblk->decoded_data) {
+        /* Not sure if that code path can happen, but better be */
+        /* safe than sorry */
+        opj_aligned_free(cblk->decoded_data);
+        cblk->decoded_data = NULL;
+    }
+
+    resno = job->resno;
     band = job->band;
     tilec = job->tilec;
     tccp = job->tccp;
@@ -1668,36 +1711,48 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls)
         }
     }
 
+    /* Both can be non NULL if for example decoding a full tile and then */
+    /* partially a tile. In which case partial decoding should be the */
+    /* priority */
+    assert((cblk->decoded_data != NULL) || (tilec->data != NULL));
+
     if (cblk->decoded_data) {
+        OPJ_UINT32 cblk_size = cblk_w * cblk_h;
         if (tccp->qmfbid == 1) {
-            for (j = 0; j < cblk_h; ++j) {
-                i = 0;
-                for (; i < (cblk_w & ~(OPJ_UINT32)3U); i += 4U) {
-                    OPJ_INT32 tmp0 = datap[(j * cblk_w) + i + 0U];
-                    OPJ_INT32 tmp1 = datap[(j * cblk_w) + i + 1U];
-                    OPJ_INT32 tmp2 = datap[(j * cblk_w) + i + 2U];
-                    OPJ_INT32 tmp3 = datap[(j * cblk_w) + i + 3U];
-                    datap[(j * cblk_w) + i + 0U] = tmp0 / 2;
-                    datap[(j * cblk_w) + i + 1U] = tmp1 / 2;
-                    datap[(j * cblk_w) + i + 2U] = tmp2 / 2;
-                    datap[(j * cblk_w) + i + 3U] = tmp3 / 2;
-                }
-                for (; i < cblk_w; ++i) {
-                    datap[(j * cblk_w) + i] /= 2;
-                }
+            for (i = 0; i < cblk_size; ++i) {
+                datap[i] /= 2;
             }
         } else {        /* if (tccp->qmfbid == 0) */
-            for (j = 0; j < cblk_h; ++j) {
-                for (i = 0; i < cblk_w; ++i) {
-                    OPJ_FLOAT32 tmp = ((OPJ_FLOAT32)(*datap)) * band->stepsize;
-                    memcpy(datap, &tmp, sizeof(tmp));
-                    datap++;
+            i = 0;
+#ifdef __SSE2__
+            {
+                const __m128 xmm_stepsize = _mm_set1_ps(band->stepsize);
+                for (; i < (cblk_size & ~15U); i += 16) {
+                    __m128 xmm0_data = _mm_cvtepi32_ps(_mm_load_si128((__m128i * const)(
+                                                           datap + 0)));
+                    __m128 xmm1_data = _mm_cvtepi32_ps(_mm_load_si128((__m128i * const)(
+                                                           datap + 4)));
+                    __m128 xmm2_data = _mm_cvtepi32_ps(_mm_load_si128((__m128i * const)(
+                                                           datap + 8)));
+                    __m128 xmm3_data = _mm_cvtepi32_ps(_mm_load_si128((__m128i * const)(
+                                                           datap + 12)));
+                    _mm_store_ps((float*)(datap +  0), _mm_mul_ps(xmm0_data, xmm_stepsize));
+                    _mm_store_ps((float*)(datap +  4), _mm_mul_ps(xmm1_data, xmm_stepsize));
+                    _mm_store_ps((float*)(datap +  8), _mm_mul_ps(xmm2_data, xmm_stepsize));
+                    _mm_store_ps((float*)(datap + 12), _mm_mul_ps(xmm3_data, xmm_stepsize));
+                    datap += 16;
                 }
             }
+#endif
+            for (; i < cblk_size; ++i) {
+                OPJ_FLOAT32 tmp = ((OPJ_FLOAT32)(*datap)) * band->stepsize;
+                memcpy(datap, &tmp, sizeof(tmp));
+                datap++;
+            }
         }
     } else if (tccp->qmfbid == 1) {
-        OPJ_INT32* OPJ_RESTRICT tiledp = &tilec->data[(OPJ_UINT32)y * tile_w +
-                                                       (OPJ_UINT32)x];
+        OPJ_INT32* OPJ_RESTRICT tiledp = &tilec->data[(OPJ_SIZE_T)y * tile_w +
+                                                       (OPJ_SIZE_T)x];
         for (j = 0; j < cblk_h; ++j) {
             i = 0;
             for (; i < (cblk_w & ~(OPJ_UINT32)3U); i += 4U) {
@@ -1705,19 +1760,19 @@ static void opj_t1_clbl_decode_processor(void* user_data, opj_tls_t* tls)
                 OPJ_INT32 tmp1 = datap[(j * cblk_w) + i + 1U];
                 OPJ_INT32 tmp2 = datap[(j * cblk_w) + i + 2U];
                 OPJ_INT32 tmp3 = datap[(j * cblk_w) + i + 3U];
-                ((OPJ_INT32*)tiledp)[(j * tile_w) + i + 0U] = tmp0 / 2;
-                ((OPJ_INT32*)tiledp)[(j * tile_w) + i + 1U] = tmp1 / 2;
-                ((OPJ_INT32*)tiledp)[(j * tile_w) + i + 2U] = tmp2 / 2;
-                ((OPJ_INT32*)tiledp)[(j * tile_w) + i + 3U] = tmp3 / 2;
+                ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i + 0U] = tmp0 / 2;
+                ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i + 1U] = tmp1 / 2;
+                ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i + 2U] = tmp2 / 2;
+                ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i + 3U] = tmp3 / 2;
             }
             for (; i < cblk_w; ++i) {
                 OPJ_INT32 tmp = datap[(j * cblk_w) + i];
-                ((OPJ_INT32*)tiledp)[(j * tile_w) + i] = tmp / 2;
+                ((OPJ_INT32*)tiledp)[(j * (OPJ_SIZE_T)tile_w) + i] = tmp / 2;
             }
         }
     } else {        /* if (tccp->qmfbid == 0) */
-        OPJ_FLOAT32* OPJ_RESTRICT tiledp = (OPJ_FLOAT32*) &tilec->data[(OPJ_UINT32)y *
-                                                         tile_w + (OPJ_UINT32)x];
+        OPJ_FLOAT32* OPJ_RESTRICT tiledp = (OPJ_FLOAT32*) &tilec->data[(OPJ_SIZE_T)y *
+                                                         tile_w + (OPJ_SIZE_T)x];
         for (j = 0; j < cblk_h; ++j) {
             OPJ_FLOAT32* OPJ_RESTRICT tiledp2 = tiledp;
             for (i = 0; i < cblk_w; ++i) {
@@ -1746,6 +1801,11 @@ void opj_t1_decode_cblks(opj_tcd_t* tcd,
     opj_thread_pool_t* tp = tcd->thread_pool;
     OPJ_UINT32 resno, bandno, precno, cblkno;
 
+#ifdef DEBUG_VERBOSE
+    OPJ_UINT32 codeblocks_decoded = 0;
+    printf("Enter opj_t1_decode_cblks()\n");
+#endif
+
     for (resno = 0; resno < tilec->minimum_num_resolutions; ++resno) {
         opj_tcd_resolution_t* res = &tilec->resolutions[resno];
 
@@ -1763,6 +1823,17 @@ void opj_t1_decode_cblks(opj_tcd_t* tcd,
                         (OPJ_UINT32)precinct->y0,
                         (OPJ_UINT32)precinct->x1,
                         (OPJ_UINT32)precinct->y1)) {
+                    for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) {
+                        opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
+                        if (cblk->decoded_data) {
+#ifdef DEBUG_VERBOSE
+                            printf("Discarding codeblock %d,%d at resno=%d, bandno=%d\n",
+                                   cblk->x0, cblk->y0, resno, bandno);
+#endif
+                            opj_aligned_free(cblk->decoded_data);
+                            cblk->decoded_data = NULL;
+                        }
+                    }
                     continue;
                 }
 
@@ -1770,8 +1841,6 @@ void opj_t1_decode_cblks(opj_tcd_t* tcd,
                     opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
                     opj_t1_cblk_decode_processing_job_t* job;
 
-                    assert(cblk->decoded_data == NULL);
-
                     if (!opj_tcd_is_subband_area_of_interest(tcd,
                             tilec->compno,
                             resno,
@@ -1780,29 +1849,34 @@ void opj_t1_decode_cblks(opj_tcd_t* tcd,
                             (OPJ_UINT32)cblk->y0,
                             (OPJ_UINT32)cblk->x1,
                             (OPJ_UINT32)cblk->y1)) {
+                        if (cblk->decoded_data) {
+#ifdef DEBUG_VERBOSE
+                            printf("Discarding codeblock %d,%d at resno=%d, bandno=%d\n",
+                                   cblk->x0, cblk->y0, resno, bandno);
+#endif
+                            opj_aligned_free(cblk->decoded_data);
+                            cblk->decoded_data = NULL;
+                        }
                         continue;
                     }
 
                     if (!tcd->whole_tile_decoding) {
                         OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0);
                         OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0);
-                        if (cblk_w == 0 || cblk_h == 0) {
+                        if (cblk->decoded_data != NULL) {
+#ifdef DEBUG_VERBOSE
+                            printf("Reusing codeblock %d,%d at resno=%d, bandno=%d\n",
+                                   cblk->x0, cblk->y0, resno, bandno);
+#endif
                             continue;
                         }
-                        /* Zero-init required */
-                        cblk->decoded_data = opj_calloc(1, cblk_w * cblk_h * sizeof(OPJ_INT32));
-                        if (cblk->decoded_data == NULL) {
-                            if (p_manager_mutex) {
-                                opj_mutex_lock(p_manager_mutex);
-                            }
-                            opj_event_msg(p_manager, EVT_ERROR,
-                                          "Cannot allocate cblk->decoded_data\n");
-                            if (p_manager_mutex) {
-                                opj_mutex_unlock(p_manager_mutex);
-                            }
-                            *pret = OPJ_FALSE;
-                            return;
+                        if (cblk_w == 0 || cblk_h == 0) {
+                            continue;
                         }
+#ifdef DEBUG_VERBOSE
+                        printf("Decoding codeblock %d,%d at resno=%d, bandno=%d\n",
+                               cblk->x0, cblk->y0, resno, bandno);
+#endif
                     }
 
                     job = (opj_t1_cblk_decode_processing_job_t*) opj_calloc(1,
@@ -1811,6 +1885,7 @@ void opj_t1_decode_cblks(opj_tcd_t* tcd,
                         *pret = OPJ_FALSE;
                         return;
                     }
+                    job->whole_tile_decoding = tcd->whole_tile_decoding;
                     job->resno = resno;
                     job->cblk = cblk;
                     job->band = band;
@@ -1822,6 +1897,9 @@ void opj_t1_decode_cblks(opj_tcd_t* tcd,
                     job->check_pterm = check_pterm;
                     job->mustuse_cblkdatabuffer = opj_thread_pool_get_thread_count(tp) > 1;
                     opj_thread_pool_submit_job(tp, opj_t1_clbl_decode_processor, job);
+#ifdef DEBUG_VERBOSE
+                    codeblocks_decoded ++;
+#endif
                     if (!(*pret)) {
                         return;
                     }
@@ -1830,6 +1908,9 @@ void opj_t1_decode_cblks(opj_tcd_t* tcd,
         } /* bandno */
     } /* resno */
 
+#ifdef DEBUG_VERBOSE
+    printf("Leave opj_t1_decode_cblks(). Number decoded: %d\n", codeblocks_decoded);
+#endif
     return;
 }
 
@@ -2052,7 +2133,8 @@ OPJ_BOOL opj_t1_encode_cblks(opj_t1_t *t1,
                         OPJ_INT32* OPJ_RESTRICT tiledp;
                         OPJ_UINT32 cblk_w;
                         OPJ_UINT32 cblk_h;
-                        OPJ_UINT32 i, j, tileIndex = 0, tileLineAdvance;
+                        OPJ_UINT32 i, j, tileLineAdvance;
+                        OPJ_SIZE_T tileIndex = 0;
 
                         OPJ_INT32 x = cblk->x0 - band->x0;
                         OPJ_INT32 y = cblk->y0 - band->y0;
@@ -2076,7 +2158,7 @@ OPJ_BOOL opj_t1_encode_cblks(opj_t1_t *t1,
                         cblk_h = t1->h;
                         tileLineAdvance = tile_w - cblk_w;
 
-                        tiledp = &tilec->data[(OPJ_UINT32)y * tile_w + (OPJ_UINT32)x];
+                        tiledp = &tilec->data[(OPJ_SIZE_T)y * tile_w + (OPJ_SIZE_T)x];
                         t1->data = tiledp;
                         t1->data_stride = tile_w;
                         if (tccp->qmfbid == 1) {