diff -urN -x '*.orig' -x '*.rej' -x '*~' -x '.*' OpenJPEG.orig/libopenjpeg/t1.c OpenJPEG.patched/libopenjpeg/t1.c
--- OpenJPEG.orig/libopenjpeg/t1.c 2007-08-23 05:53:17.000000000 -0500
+++ OpenJPEG.patched/libopenjpeg/t1.c 2007-08-23 05:56:33.000000000 -0500
@@ -45,7 +45,11 @@
static char t1_getspb(int f);
static short t1_getnmsedec_sig(int x, int bitpos);
static short t1_getnmsedec_ref(int x, int bitpos);
+#ifdef __amd64__
+static INLINE void t1_updateflags(flag_t *flagsp, int s, int stride);
+#else
static void t1_updateflags(flag_t *flagsp, int s, int stride);
+#endif
/**
Encode significant pass
*/
@@ -258,6 +262,38 @@
return lut_nmsedec_ref0[x & ((1 << T1_NMSEDEC_BITS) - 1)];
}
+#ifdef __amd64__
+
+/* On 64 bit platforms we can set three flags at a time. (SWAR) */
+/* FIXME: Assumes little endian? */
+
+#define VEC(x,y,z) (int64)(x)|((int64)(y)<<16)|((int64)(z)<<32)
+
+static void t1_updateflags(flag_t *flagsp, int s, int stride) {
+ static const int64 mod[] = {
+ VEC(T1_SIG_SE, T1_SIG_E, T1_SIG_NE),
+ VEC(T1_SIG_SE, T1_SIG_E|T1_SGN_E, T1_SIG_NE),
+ VEC(T1_SIG_S, T1_SIG, T1_SIG_N),
+ VEC(T1_SIG_S|T1_SGN_S, T1_SIG, T1_SIG_N|T1_SGN_N),
+ VEC(T1_SIG_SW, T1_SIG_W, T1_SIG_NW),
+ VEC(T1_SIG_SW, T1_SIG_W|T1_SGN_W, T1_SIG_NW)
+ };
+
+ int64 tmp1 = *(int64*)((void*)&flagsp[-1 - stride]);
+ int64 tmp2 = *(int64*)((void*)&flagsp[-1 ]);
+ int64 tmp3 = *(int64*)((void*)&flagsp[-1 + stride]);
+
+ tmp1 |= mod[s];
+ tmp2 |= mod[s+2];
+ tmp3 |= mod[s+4];
+
+ *(int64*)((void*)&flagsp[-1 - stride]) = tmp1;
+ *(int64*)((void*)&flagsp[-1 ]) = tmp2;
+ *(int64*)((void*)&flagsp[-1 + stride]) = tmp3;
+}
+
+#else
+
static void t1_updateflags(flag_t *flagsp, int s, int stride) {
static const flag_t mod[] = {
T1_SIG_E, T1_SIG_E|T1_SGN_E,
@@ -279,6 +315,8 @@
flagsp[ 1 + stride] |= T1_SIG_NW;
}
+#endif
+
static void t1_enc_sigpass_step(
opj_t1_t *t1,
flag_t *flagsp,
@@ -670,6 +708,8 @@
for (i = 0; i < t1->w; ++i) {
if (k + 3 < t1->h) {
#ifdef __amd64__
+ /* 64 bit SWAR */
+ /* FIXME: Assumes little endian? */
int64 tmp = *((int64*)&t1->flags[(k+1) + (i+1)*(t1->h+2)]);
if (cblksty & J2K_CCP_CBLKSTY_VSC) {
tmp &= ~((int64)(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S)<<48);
@@ -780,6 +820,11 @@
memset(t1->data,0,datasize * sizeof(int));
flagssize=(h+2) * (w+2);
+#ifdef __amd64__
+ /* 64 bit SIMD/SWAR in t1_updateflags requires one short of headroom
+ because three shorts = 48 bits. */
+ ++flagssize;
+#endif
if(flagssize > t1->flagssize){
opj_aligned_free(t1->flags);