Siddhesh Poyarekar bd8a81d
commit 2506109403de69bd454de27835d42e6eb6ec3abc
Siddhesh Poyarekar bd8a81d
Author: Siddhesh Poyarekar <siddhesh@redhat.com>
Siddhesh Poyarekar bd8a81d
Date:   Wed Jun 12 10:36:48 2013 +0530
Siddhesh Poyarekar bd8a81d
Siddhesh Poyarekar bd8a81d
    Set/restore rounding mode only when needed
Siddhesh Poyarekar bd8a81d
    
Siddhesh Poyarekar bd8a81d
    The most common use case of math functions is with default rounding
Siddhesh Poyarekar bd8a81d
    mode, i.e. rounding to nearest.  Setting and restoring rounding mode
Siddhesh Poyarekar bd8a81d
    is an unnecessary overhead for this, so I've added support for a
Siddhesh Poyarekar bd8a81d
    context, which does the set/restore only if the FP status needs a
Siddhesh Poyarekar bd8a81d
    change.  The code is written such that only x86 uses these.  Other
Siddhesh Poyarekar bd8a81d
    architectures should be unaffected by it, but would definitely benefit
Siddhesh Poyarekar bd8a81d
    if the set/restore has as much overhead relative to the rest of the
Siddhesh Poyarekar bd8a81d
    code, as the x86 bits do.
Siddhesh Poyarekar bd8a81d
    
Siddhesh Poyarekar bd8a81d
    Here's a summary of the performance improvement due to these
Siddhesh Poyarekar bd8a81d
    changes; I've only mentioned functions that use the set/restore
Siddhesh Poyarekar bd8a81d
    and have benchmark inputs for x86_64:
Siddhesh Poyarekar bd8a81d
    
Siddhesh Poyarekar bd8a81d
    Before:
Siddhesh Poyarekar bd8a81d
    
Siddhesh Poyarekar bd8a81d
    cos(): ITERS:4.69335e+08: TOTAL:28884.6Mcy, MAX:4080.28cy, MIN:57.562cy, 16248.6 calls/Mcy
Siddhesh Poyarekar bd8a81d
    exp(): ITERS:4.47604e+08: TOTAL:28796.2Mcy, MAX:207.721cy, MIN:62.385cy, 15543.9 calls/Mcy
Siddhesh Poyarekar bd8a81d
    pow(): ITERS:1.63485e+08: TOTAL:28879.9Mcy, MAX:362.255cy, MIN:172.469cy, 5660.86 calls/Mcy
Siddhesh Poyarekar bd8a81d
    sin(): ITERS:3.89578e+08: TOTAL:28900Mcy, MAX:704.859cy, MIN:47.583cy, 13480.2 calls/Mcy
Siddhesh Poyarekar bd8a81d
    tan(): ITERS:7.0971e+07: TOTAL:28902.2Mcy, MAX:1357.79cy, MIN:388.58cy, 2455.55 calls/Mcy
Siddhesh Poyarekar bd8a81d
    
Siddhesh Poyarekar bd8a81d
    After:
Siddhesh Poyarekar bd8a81d
    
Siddhesh Poyarekar bd8a81d
    cos(): ITERS:6.0014e+08: TOTAL:28875.9Mcy, MAX:364.283cy, MIN:45.716cy, 20783.4 calls/Mcy
Siddhesh Poyarekar bd8a81d
    exp(): ITERS:5.48578e+08: TOTAL:28764.9Mcy, MAX:191.617cy, MIN:51.011cy, 19071.1 calls/Mcy
Siddhesh Poyarekar bd8a81d
    pow(): ITERS:1.70013e+08: TOTAL:28873.6Mcy, MAX:689.522cy, MIN:163.989cy, 5888.18 calls/Mcy
Siddhesh Poyarekar bd8a81d
    sin(): ITERS:4.64079e+08: TOTAL:28891.5Mcy, MAX:6959.3cy, MIN:36.189cy, 16062.8 calls/Mcy
Siddhesh Poyarekar bd8a81d
    tan(): ITERS:7.2354e+07: TOTAL:28898.9Mcy, MAX:1295.57cy, MIN:380.698cy, 2503.7 calls/Mcy
Siddhesh Poyarekar bd8a81d
    
Siddhesh Poyarekar bd8a81d
    So the improvements are:
Siddhesh Poyarekar bd8a81d
    
Siddhesh Poyarekar bd8a81d
    cos: 27.9089%
Siddhesh Poyarekar bd8a81d
    exp: 22.6919%
Siddhesh Poyarekar bd8a81d
    pow: 4.01564%
Siddhesh Poyarekar bd8a81d
    sin: 19.1585%
Siddhesh Poyarekar bd8a81d
    tan: 1.96086%
Siddhesh Poyarekar bd8a81d
    
Siddhesh Poyarekar bd8a81d
    The downside of the change is that it will have an adverse performance
Siddhesh Poyarekar bd8a81d
    impact on non-default rounding modes, but I think the tradeoff is
Siddhesh Poyarekar bd8a81d
    justified.
Siddhesh Poyarekar bd8a81d
Siddhesh Poyarekar bd8a81d
diff --git a/include/fenv.h b/include/fenv.h
Siddhesh Poyarekar bd8a81d
index ed6d139..9f90d17 100644
Siddhesh Poyarekar bd8a81d
--- a/include/fenv.h
Siddhesh Poyarekar bd8a81d
+++ b/include/fenv.h
Siddhesh Poyarekar bd8a81d
@@ -1,5 +1,6 @@
Siddhesh Poyarekar bd8a81d
 #ifndef _FENV_H
Siddhesh Poyarekar bd8a81d
 #include <math/fenv.h>
Siddhesh Poyarekar bd8a81d
+#include <stdbool.h>
Siddhesh Poyarekar bd8a81d
 
Siddhesh Poyarekar bd8a81d
 #ifndef _ISOMAC
Siddhesh Poyarekar bd8a81d
 /* Now define the internal interfaces.  */
Siddhesh Poyarekar bd8a81d
@@ -23,4 +24,13 @@ libm_hidden_proto (fetestexcept)
Siddhesh Poyarekar bd8a81d
 libm_hidden_proto (feclearexcept)
Siddhesh Poyarekar bd8a81d
 #endif
Siddhesh Poyarekar bd8a81d
 
Siddhesh Poyarekar bd8a81d
+/* Rounding mode context.  This allows functions to set/restore rounding mode
Siddhesh Poyarekar bd8a81d
+   only when the desired rounding mode is different from the current rounding
Siddhesh Poyarekar bd8a81d
+   mode.  */
Siddhesh Poyarekar bd8a81d
+struct rm_ctx
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  fenv_t env;
Siddhesh Poyarekar bd8a81d
+  bool updated_status;
Siddhesh Poyarekar bd8a81d
+};
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
 #endif
Siddhesh Poyarekar bd8a81d
diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h
Siddhesh Poyarekar bd8a81d
index e98360d..c0fc03d 100644
Siddhesh Poyarekar bd8a81d
--- a/sysdeps/generic/math_private.h
Siddhesh Poyarekar bd8a81d
+++ b/sysdeps/generic/math_private.h
Siddhesh Poyarekar bd8a81d
@@ -553,35 +553,62 @@ default_libc_feupdateenv_test (fenv_t *e, int ex)
Siddhesh Poyarekar bd8a81d
 # define libc_feresetround_noexl libc_fesetenvl
Siddhesh Poyarekar bd8a81d
 #endif
Siddhesh Poyarekar bd8a81d
 
Siddhesh Poyarekar bd8a81d
+#if HAVE_RM_CTX
Siddhesh Poyarekar bd8a81d
+/* Set/Restore Rounding Modes only when necessary.  If defined, these functions
Siddhesh Poyarekar bd8a81d
+   set/restore floating point state only if the state needed within the lexical
Siddhesh Poyarekar bd8a81d
+   block is different from the current state.  This saves a lot of time when
Siddhesh Poyarekar bd8a81d
+   the floating point unit is much slower than the fixed point units.  */
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+# ifndef libc_feresetround_noex_ctx
Siddhesh Poyarekar bd8a81d
+#   define libc_feresetround_noex_ctx  libc_fesetenv_ctx
Siddhesh Poyarekar bd8a81d
+# endif
Siddhesh Poyarekar bd8a81d
+# ifndef libc_feresetround_noexf_ctx
Siddhesh Poyarekar bd8a81d
+#   define libc_feresetround_noexf_ctx libc_fesetenvf_ctx
Siddhesh Poyarekar bd8a81d
+# endif
Siddhesh Poyarekar bd8a81d
+# ifndef libc_feresetround_noexl_ctx
Siddhesh Poyarekar bd8a81d
+#   define libc_feresetround_noexl_ctx libc_fesetenvl_ctx
Siddhesh Poyarekar bd8a81d
+# endif
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+# ifndef libc_feholdsetround_53bit_ctx
Siddhesh Poyarekar bd8a81d
+#   define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx
Siddhesh Poyarekar bd8a81d
+# endif
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+# ifndef libc_feresetround_53bit_ctx
Siddhesh Poyarekar bd8a81d
+#   define libc_feresetround_53bit_ctx libc_feresetround_ctx
Siddhesh Poyarekar bd8a81d
+# endif
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+# define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \
Siddhesh Poyarekar bd8a81d
+  struct rm_ctx ctx __attribute__((cleanup(CLEANUPFUNC ## _ctx)));	      \
Siddhesh Poyarekar bd8a81d
+  ROUNDFUNC ## _ctx (&ctx, (RM))
Siddhesh Poyarekar bd8a81d
+#else
Siddhesh Poyarekar bd8a81d
+# define SET_RESTORE_ROUND_GENERIC(RM, ROUNDFUNC, CLEANUPFUNC) \
Siddhesh Poyarekar bd8a81d
+  fenv_t __libc_save_rm __attribute__((cleanup(CLEANUPFUNC)));	\
Siddhesh Poyarekar bd8a81d
+  ROUNDFUNC (&__libc_save_rm, (RM))
Siddhesh Poyarekar bd8a81d
+#endif
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
 /* Save and restore the rounding mode within a lexical block.  */
Siddhesh Poyarekar bd8a81d
 
Siddhesh Poyarekar bd8a81d
 #define SET_RESTORE_ROUND(RM) \
Siddhesh Poyarekar bd8a81d
-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround)));	\
Siddhesh Poyarekar bd8a81d
-  libc_feholdsetround (&__libc_save_rm, (RM))
Siddhesh Poyarekar bd8a81d
+  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround)
Siddhesh Poyarekar bd8a81d
 #define SET_RESTORE_ROUNDF(RM) \
Siddhesh Poyarekar bd8a81d
-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetroundf)));	\
Siddhesh Poyarekar bd8a81d
-  libc_feholdsetroundf (&__libc_save_rm, (RM))
Siddhesh Poyarekar bd8a81d
+  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetroundf)
Siddhesh Poyarekar bd8a81d
 #define SET_RESTORE_ROUNDL(RM) \
Siddhesh Poyarekar bd8a81d
-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetroundl)));	\
Siddhesh Poyarekar bd8a81d
-  libc_feholdsetroundl (&__libc_save_rm, (RM))
Siddhesh Poyarekar bd8a81d
+  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetroundl)
Siddhesh Poyarekar bd8a81d
 
Siddhesh Poyarekar bd8a81d
 /* Save and restore the rounding mode within a lexical block, and also
Siddhesh Poyarekar bd8a81d
    the set of exceptions raised within the block may be discarded.  */
Siddhesh Poyarekar bd8a81d
 
Siddhesh Poyarekar bd8a81d
 #define SET_RESTORE_ROUND_NOEX(RM) \
Siddhesh Poyarekar bd8a81d
-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_noex))); \
Siddhesh Poyarekar bd8a81d
-  libc_feholdsetround (&__libc_save_rm, (RM))
Siddhesh Poyarekar bd8a81d
+  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround_noex)
Siddhesh Poyarekar bd8a81d
 #define SET_RESTORE_ROUND_NOEXF(RM) \
Siddhesh Poyarekar bd8a81d
-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_noexf))); \
Siddhesh Poyarekar bd8a81d
-  libc_feholdsetroundf (&__libc_save_rm, (RM))
Siddhesh Poyarekar bd8a81d
+  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetround_noexf)
Siddhesh Poyarekar bd8a81d
 #define SET_RESTORE_ROUND_NOEXL(RM) \
Siddhesh Poyarekar bd8a81d
-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_noexl))); \
Siddhesh Poyarekar bd8a81d
-  libc_feholdsetroundl (&__libc_save_rm, (RM))
Siddhesh Poyarekar bd8a81d
+  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetround_noexl)
Siddhesh Poyarekar bd8a81d
 
Siddhesh Poyarekar bd8a81d
 /* Like SET_RESTORE_ROUND, but also set rounding precision to 53 bits.  */
Siddhesh Poyarekar bd8a81d
 #define SET_RESTORE_ROUND_53BIT(RM) \
Siddhesh Poyarekar bd8a81d
-  fenv_t __libc_save_rm __attribute__((cleanup(libc_feresetround_53bit))); \
Siddhesh Poyarekar bd8a81d
-  libc_feholdsetround_53bit (&__libc_save_rm, (RM))
Siddhesh Poyarekar bd8a81d
+  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_53bit,	      \
Siddhesh Poyarekar bd8a81d
+			     libc_feresetround_53bit)
Siddhesh Poyarekar bd8a81d
 
Siddhesh Poyarekar bd8a81d
 #define __nan(str) \
Siddhesh Poyarekar bd8a81d
   (__builtin_constant_p (str) && str[0] == '\0' ? NAN : __nan (str))
Siddhesh Poyarekar bd8a81d
diff --git a/sysdeps/i386/fpu/fenv_private.h b/sysdeps/i386/fpu/fenv_private.h
Siddhesh Poyarekar bd8a81d
index 1f8336c..3998387 100644
Siddhesh Poyarekar bd8a81d
--- a/sysdeps/i386/fpu/fenv_private.h
Siddhesh Poyarekar bd8a81d
+++ b/sysdeps/i386/fpu/fenv_private.h
Siddhesh Poyarekar bd8a81d
@@ -322,6 +322,179 @@ libc_feresetround_387 (fenv_t *e)
Siddhesh Poyarekar bd8a81d
 # define libc_feholdsetround_53bit	libc_feholdsetround_387_53bit
Siddhesh Poyarekar bd8a81d
 #endif
Siddhesh Poyarekar bd8a81d
 
Siddhesh Poyarekar bd8a81d
+/* We have support for rounding mode context.  */
Siddhesh Poyarekar bd8a81d
+#define HAVE_RM_CTX 1
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feholdexcept_setround_sse_ctx (struct rm_ctx *ctx, int r)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  unsigned int mxcsr, new_mxcsr;
Siddhesh Poyarekar bd8a81d
+  asm (STMXCSR " %0" : "=m" (*&mxcsr));
Siddhesh Poyarekar bd8a81d
+  new_mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3);
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+  ctx->env.__mxcsr = mxcsr;
Siddhesh Poyarekar bd8a81d
+  if (__glibc_unlikely (mxcsr != new_mxcsr))
Siddhesh Poyarekar bd8a81d
+    {
Siddhesh Poyarekar bd8a81d
+      asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr));
Siddhesh Poyarekar bd8a81d
+      ctx->updated_status = true;
Siddhesh Poyarekar bd8a81d
+    }
Siddhesh Poyarekar bd8a81d
+  else
Siddhesh Poyarekar bd8a81d
+    ctx->updated_status = false;
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+/* Unconditional since we want to overwrite any exceptions that occurred in the
Siddhesh Poyarekar bd8a81d
+   context.  This is also why all fehold* functions unconditionally write into
Siddhesh Poyarekar bd8a81d
+   ctx->env.  */
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_fesetenv_sse_ctx (struct rm_ctx *ctx)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  libc_fesetenv_sse (&ctx->env);
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feupdateenv_sse_ctx (struct rm_ctx *ctx)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  if (__glibc_unlikely (ctx->updated_status))
Siddhesh Poyarekar bd8a81d
+    libc_feupdateenv_test_sse (&ctx->env, 0);
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feholdexcept_setround_387_prec_ctx (struct rm_ctx *ctx, int r)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  libc_feholdexcept_387 (&ctx->env);
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+  fpu_control_t cw = ctx->env.__control_word;
Siddhesh Poyarekar bd8a81d
+  fpu_control_t old_cw = cw;
Siddhesh Poyarekar bd8a81d
+  cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
Siddhesh Poyarekar bd8a81d
+  cw |= r | 0x3f;
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+  if (__glibc_unlikely (old_cw != cw))
Siddhesh Poyarekar bd8a81d
+    {
Siddhesh Poyarekar bd8a81d
+      _FPU_SETCW (cw);
Siddhesh Poyarekar bd8a81d
+      ctx->updated_status = true;
Siddhesh Poyarekar bd8a81d
+    }
Siddhesh Poyarekar bd8a81d
+  else
Siddhesh Poyarekar bd8a81d
+    ctx->updated_status = false;
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feholdexcept_setround_387_ctx (struct rm_ctx *ctx, int r)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_EXTENDED);
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feholdexcept_setround_387_53bit_ctx (struct rm_ctx *ctx, int r)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_DOUBLE);
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feholdsetround_387_prec_ctx (struct rm_ctx *ctx, int r)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  fpu_control_t cw, new_cw;
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+  _FPU_GETCW (cw);
Siddhesh Poyarekar bd8a81d
+  new_cw = cw;
Siddhesh Poyarekar bd8a81d
+  new_cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED);
Siddhesh Poyarekar bd8a81d
+  new_cw |= r;
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+  ctx->env.__control_word = cw;
Siddhesh Poyarekar bd8a81d
+  if (__glibc_unlikely (new_cw != cw))
Siddhesh Poyarekar bd8a81d
+    {
Siddhesh Poyarekar bd8a81d
+      _FPU_SETCW (new_cw);
Siddhesh Poyarekar bd8a81d
+      ctx->updated_status = true;
Siddhesh Poyarekar bd8a81d
+    }
Siddhesh Poyarekar bd8a81d
+  else
Siddhesh Poyarekar bd8a81d
+    ctx->updated_status = false;
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feholdsetround_387_ctx (struct rm_ctx *ctx, int r)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_EXTENDED);
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feholdsetround_387_53bit_ctx (struct rm_ctx *ctx, int r)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_DOUBLE);
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feholdsetround_sse_ctx (struct rm_ctx *ctx, int r)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  unsigned int mxcsr, new_mxcsr;
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+  asm (STMXCSR " %0" : "=m" (*&mxcsr));
Siddhesh Poyarekar bd8a81d
+  new_mxcsr = (mxcsr & ~0x6000) | (r << 3);
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+  ctx->env.__mxcsr = mxcsr;
Siddhesh Poyarekar bd8a81d
+  if (__glibc_unlikely (new_mxcsr != mxcsr))
Siddhesh Poyarekar bd8a81d
+    {
Siddhesh Poyarekar bd8a81d
+      asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr));
Siddhesh Poyarekar bd8a81d
+      ctx->updated_status = true;
Siddhesh Poyarekar bd8a81d
+    }
Siddhesh Poyarekar bd8a81d
+  else
Siddhesh Poyarekar bd8a81d
+    ctx->updated_status = false;
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feresetround_sse_ctx (struct rm_ctx *ctx)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  if (__glibc_unlikely (ctx->updated_status))
Siddhesh Poyarekar bd8a81d
+    libc_feresetround_sse (&ctx->env);
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feresetround_387_ctx (struct rm_ctx *ctx)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  if (__glibc_unlikely (ctx->updated_status))
Siddhesh Poyarekar bd8a81d
+    _FPU_SETCW (ctx->env.__control_word);
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+static __always_inline void
Siddhesh Poyarekar bd8a81d
+libc_feupdateenv_387_ctx (struct rm_ctx *ctx)
Siddhesh Poyarekar bd8a81d
+{
Siddhesh Poyarekar bd8a81d
+  if (__glibc_unlikely (ctx->updated_status))
Siddhesh Poyarekar bd8a81d
+    libc_feupdateenv_test_387 (&ctx->env, 0);
Siddhesh Poyarekar bd8a81d
+}
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+#ifdef __SSE_MATH__
Siddhesh Poyarekar bd8a81d
+# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_sse_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_fesetenvf_ctx		libc_fesetenv_sse_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feupdateenvf_ctx		libc_feupdateenv_sse_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feholdsetroundf_ctx	libc_feholdsetround_sse_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feresetroundf_ctx		libc_feresetround_sse_ctx
Siddhesh Poyarekar bd8a81d
+#else
Siddhesh Poyarekar bd8a81d
+# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_387_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feupdateenvf_ctx		libc_feupdateenv_387_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feholdsetroundf_ctx	libc_feholdsetround_387_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feresetroundf_ctx		libc_feresetround_387_ctx
Siddhesh Poyarekar bd8a81d
+#endif /* __SSE_MATH__ */
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+#ifdef __SSE2_MATH__
Siddhesh Poyarekar bd8a81d
+# define libc_feholdexcept_setround_ctx	libc_feholdexcept_setround_sse_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_fesetenv_ctx		libc_fesetenv_sse_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feupdateenv_ctx		libc_feupdateenv_sse_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feholdsetround_ctx	libc_feholdsetround_sse_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feresetround_ctx		libc_feresetround_sse_ctx
Siddhesh Poyarekar bd8a81d
+#else
Siddhesh Poyarekar bd8a81d
+# define libc_feholdexcept_setround_ctx	libc_feholdexcept_setround_387_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feupdateenv_ctx		libc_feupdateenv_387_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feresetround_ctx		libc_feresetround_387_ctx
Siddhesh Poyarekar bd8a81d
+#endif /* __SSE2_MATH__ */
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+#define libc_feholdexcept_setroundl_ctx	libc_feholdexcept_setround_387_ctx
Siddhesh Poyarekar bd8a81d
+#define libc_feupdateenvl_ctx		libc_feupdateenv_387_ctx
Siddhesh Poyarekar bd8a81d
+#define libc_feholdsetroundl_ctx	libc_feholdsetround_387_ctx
Siddhesh Poyarekar bd8a81d
+#define libc_feresetroundl_ctx		libc_feresetround_387_ctx
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
+#ifndef __SSE2_MATH__
Siddhesh Poyarekar bd8a81d
+# define libc_feholdsetround_53bit_ctx	libc_feholdsetround_387_53bit_ctx
Siddhesh Poyarekar bd8a81d
+# define libc_feresetround_53bit_ctx	libc_feresetround_387_ctx
Siddhesh Poyarekar bd8a81d
+#endif
Siddhesh Poyarekar bd8a81d
+
Siddhesh Poyarekar bd8a81d
 #undef __mxcsr
Siddhesh Poyarekar bd8a81d
 
Siddhesh Poyarekar bd8a81d
 #endif /* FENV_PRIVATE_H */