diff -urN -x '*.orig' -x '*.rej' -x '*~' -x '.*' OpenJPEG.orig/libopenjpeg/t1.c OpenJPEG.patched/libopenjpeg/t1.c
--- OpenJPEG.orig/libopenjpeg/t1.c 2007-11-13 13:52:05.000000000 -0600
+++ OpenJPEG.patched/libopenjpeg/t1.c 2007-11-14 01:09:40.000000000 -0600
@@ -33,6 +33,17 @@
#include "opj_includes.h"
#include "t1_luts.h"
+/* Don't use MMX on amd64 */
+/* Note that merely including mmintrin.h, even if we don't use it, changes the code gcc */
+/* outputs on amd64, and it is measurably slower. A bug in gcc? */
+#ifdef __amd64__
+#undef __MMX__
+#endif
+
+#ifdef __MMX__
+#include <mmintrin.h>
+#endif
+
/** @defgroup T1 T1 - Implementation of the tier-1 coding */
/*@{*/
@@ -45,7 +56,7 @@
static char t1_getspb(int f);
static short t1_getnmsedec_sig(int x, int bitpos);
static short t1_getnmsedec_ref(int x, int bitpos);
-#ifdef __amd64__
+#if defined(__amd64__) || defined(__MMX__)
static INLINE void t1_updateflags(flag_t *flagsp, int s, int stride);
#else
static void t1_updateflags(flag_t *flagsp, int s, int stride);
@@ -293,6 +304,32 @@
}
#else
+#ifdef __MMX__
+
+static void t1_updateflags(flag_t *flagsp, int s, int stride) {
+ static const __v4hi mod[] = {
+ {T1_SIG_SE, T1_SIG_E, T1_SIG_NE, 0},
+ {T1_SIG_SE, T1_SIG_E|T1_SGN_E, T1_SIG_NE, 0},
+ {T1_SIG_S, T1_SIG, T1_SIG_N, 0},
+ {T1_SIG_S|T1_SGN_S, T1_SIG, T1_SIG_N|T1_SGN_N, 0},
+ {T1_SIG_SW, T1_SIG_W, T1_SIG_NW, 0},
+ {T1_SIG_SW, T1_SIG_W|T1_SGN_W, T1_SIG_NW, 0}
+ };
+
+ __m64 tmp1 = *(__m64*)((void*)&flagsp[-1 - stride]);
+ __m64 tmp2 = *(__m64*)((void*)&flagsp[-1 ]);
+ __m64 tmp3 = *(__m64*)((void*)&flagsp[-1 + stride]);
+
+ tmp1 = _mm_or_si64(tmp1, mod[s]);
+ tmp2 = _mm_or_si64(tmp2, mod[s+2]);
+ tmp3 = _mm_or_si64(tmp3, mod[s+4]);
+
+ *(__m64*)((void*)&flagsp[-1 - stride]) = tmp1;
+ *(__m64*)((void*)&flagsp[-1 ]) = tmp2;
+ *(__m64*)((void*)&flagsp[-1 + stride]) = tmp3;
+}
+
+#else
static void t1_updateflags(flag_t *flagsp, int s, int stride) {
static const flag_t mod[] = {
@@ -316,6 +353,7 @@
}
#endif
+#endif
static void t1_enc_sigpass_step(
opj_t1_t *t1,
@@ -720,18 +758,14 @@
| ((int64)(T1_SIG | T1_VISIT | T1_SIG_OTH)<<48);
agg = !tmp;
#else
+ int* flagsp = (int*)&t1->flags[(k+1) + (i+1)*(t1->h+2)];
+ agg = flagsp[1];
if (cblksty & J2K_CCP_CBLKSTY_VSC) {
- agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- || t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- || t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- || (t1->flags[(k+4) + (i+1)*(t1->h+2)]
- & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) & (T1_SIG | T1_VISIT | T1_SIG_OTH));
- } else {
- agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- || t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- || t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- || t1->flags[(k+4) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH));
+ agg &= ~((T1_SIG_S|T1_SIG_SE|T1_SIG_SW|T1_SGN_S)<<16);
}
+ agg |= flagsp[0];
+ agg &= (T1_SIG|T1_VISIT|T1_SIG_OTH)|(T1_SIG|T1_VISIT|T1_SIG_OTH)<<16;
+ agg = !agg;
#endif
} else {
agg = 0;
@@ -820,7 +854,7 @@
memset(t1->data,0,datasize * sizeof(int));
flagssize=(h+2) * (w+2);
-#ifdef __amd64__
+#if defined(__amd64__) || defined(__MMX__)
/* 64 bit SIMD/SWAR in t1_updateflags requires one short of headroom
because three shorts = 48 bits. */
++flagssize;
@@ -886,6 +920,9 @@
int correction = 3;
type = ((bpno < (cblk->numbps - 4)) && (passtype < 2) && (cblksty & J2K_CCP_CBLKSTY_LAZY)) ? T1_TYPE_RAW : T1_TYPE_MQ;
+#if !defined(__amd64__) && defined(__MMX__)
+ _mm_empty();
+#endif
switch (passtype) {
case 0:
t1_enc_sigpass(t1, bpno, orient, &nmsedec, type, cblksty);
@@ -900,6 +937,9 @@
mqc_segmark_enc(mqc);
break;
}
+#if !defined(__amd64__) && defined(__MMX__)
+ _mm_empty();
+#endif
/* fixed_quality */
cumwmsedec += t1_getwmsedec(nmsedec, compno, level, orient, bpno, qmfbid, stepsize, numcomps);
@@ -1004,6 +1044,9 @@
mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3);
mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4);
+#if !defined(__amd64__) && defined(__MMX__)
+ _mm_empty();
+#endif
for (segno = 0; segno < cblk->numsegs; ++segno) {
opj_tcd_seg_t *seg = &cblk->segs[segno];
@@ -1044,6 +1087,9 @@
}
}
}
+#if !defined(__amd64__) && defined(__MMX__)
+ _mm_empty();
+#endif
}
/* ----------------------------------------------------------------------- */