diff options
| author | Even Rouault <even.rouault@spatialys.com> | 2020-05-19 19:45:00 +0200 |
|---|---|---|
| committer | Even Rouault <even.rouault@spatialys.com> | 2020-05-20 20:31:28 +0200 |
| commit | c6a413a42394836b956846cc037dd8297b732f44 (patch) | |
| tree | cda14068229cb4947f7df6c9172a3e18d11f1047 /src | |
| parent | fe4c15f12c562a42a6b0c4b0a0c5e42a25797235 (diff) | |
opj_mct_encode_real(): add SSE optimization
Diffstat (limited to 'src')
| -rw-r--r-- | src/lib/openjp2/mct.c | 47 |
1 files changed, 47 insertions, 0 deletions
diff --git a/src/lib/openjp2/mct.c b/src/lib/openjp2/mct.c index 9d79b50a..88c8f409 100644 --- a/src/lib/openjp2/mct.c +++ b/src/lib/openjp2/mct.c @@ -216,6 +216,53 @@ void opj_mct_encode_real( OPJ_SIZE_T n) { OPJ_SIZE_T i; +#ifdef __SSE__ + const __m128 YR = _mm_set1_ps(0.299f); + const __m128 YG = _mm_set1_ps(0.587f); + const __m128 YB = _mm_set1_ps(0.114f); + const __m128 UR = _mm_set1_ps(-0.16875f); + const __m128 UG = _mm_set1_ps(-0.331260f); + const __m128 UB = _mm_set1_ps(0.5f); + const __m128 VR = _mm_set1_ps(0.5f); + const __m128 VG = _mm_set1_ps(-0.41869f); + const __m128 VB = _mm_set1_ps(-0.08131f); + for (i = 0; i < (n >> 3); i ++) { + __m128 r, g, b, y, u, v; + + r = _mm_load_ps(c0); + g = _mm_load_ps(c1); + b = _mm_load_ps(c2); + y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)), + _mm_mul_ps(b, YB)); + u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)), + _mm_mul_ps(b, UB)); + v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)), + _mm_mul_ps(b, VB)); + _mm_store_ps(c0, y); + _mm_store_ps(c1, u); + _mm_store_ps(c2, v); + c0 += 4; + c1 += 4; + c2 += 4; + + r = _mm_load_ps(c0); + g = _mm_load_ps(c1); + b = _mm_load_ps(c2); + y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)), + _mm_mul_ps(b, YB)); + u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)), + _mm_mul_ps(b, UB)); + v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)), + _mm_mul_ps(b, VB)); + _mm_store_ps(c0, y); + _mm_store_ps(c1, u); + _mm_store_ps(c2, v); + c0 += 4; + c1 += 4; + c2 += 4; + } + n &= 7; +#endif for (i = 0; i < n; ++i) { OPJ_FLOAT32 r = c0[i]; OPJ_FLOAT32 g = c1[i]; |
