Sub-tile decoding: speed up vertical pass in IDWT5x3 by processing 4 cols at a time
[openjpeg.git] / src / lib / openjp2 / sparse_array.c
index b0634f67e8ecc4b9e653416dbd6db0f5778a51a7..48c4b23b0ca0c0c417bdf4259d231d00cbe98665 100644 (file)
@@ -165,10 +165,20 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
                     if (buf_col_stride == 1) {
                         OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
                                                            (x - x0) * buf_col_stride;
-                        for (j = 0; j < y_incr; j++) {
-                            memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
-                            dest_ptr += buf_line_stride;
-                            src_ptr += block_width;
+                        if (x_incr == 4) {
+                            /* Same loop as the general branch, but x_incr */
+                            /* is known to be 4, so memcpy() can be inlined */
+                            for (j = 0; j < y_incr; j++) {
+                                memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
+                                dest_ptr += buf_line_stride;
+                                src_ptr += block_width;
+                            }
+                        } else {
+                            for (j = 0; j < y_incr; j++) {
+                                memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
+                                dest_ptr += buf_line_stride;
+                                src_ptr += block_width;
+                            }
                         }
                     } else {
                         OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
@@ -179,6 +189,17 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
                                 dest_ptr += buf_line_stride;
                                 src_ptr += block_width;
                             }
+                        } else if (y_incr == 1 && buf_col_stride == 2) {
+                            OPJ_UINT32 k;
+                            for (k = 0; k < (x_incr & ~3U); k += 4) {
+                                dest_ptr[k * buf_col_stride] = src_ptr[k];
+                                dest_ptr[(k + 1) * buf_col_stride] = src_ptr[k + 1];
+                                dest_ptr[(k + 2) * buf_col_stride] = src_ptr[k + 2];
+                                dest_ptr[(k + 3) * buf_col_stride] = src_ptr[k + 3];
+                            }
+                            for (; k < x_incr; k++) {
+                                dest_ptr[k * buf_col_stride] = src_ptr[k];
+                            }
                         } else {
                             /* General case */
                             for (j = 0; j < y_incr; j++) {
@@ -207,10 +228,20 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
                                                        (size_t)block_width + block_x_offset;
                     const OPJ_INT32* OPJ_RESTRICT src_ptr = buf + (y - y0) *
                                                             (size_t)buf_line_stride + (x - x0) * buf_col_stride;
-                    for (j = 0; j < y_incr; j++) {
-                        memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
-                        dest_ptr += block_width;
-                        src_ptr += buf_line_stride;
+                    if (x_incr == 4) {
+                        /* Same loop as the general branch, but x_incr */
+                        /* is known to be 4, so memcpy() can be inlined */
+                        for (j = 0; j < y_incr; j++) {
+                            memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
+                            dest_ptr += block_width;
+                            src_ptr += buf_line_stride;
+                        }
+                    } else {
+                        for (j = 0; j < y_incr; j++) {
+                            memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
+                            dest_ptr += block_width;
+                            src_ptr += buf_line_stride;
+                        }
                     }
                 } else {
                     OPJ_INT32* OPJ_RESTRICT dest_ptr = src_block + block_y_offset *