b3eda9b
commit 97d336b79e36f6c99d8b07f49ebc9b780e6df84e
b3eda9b
Author: Julian Seward <jseward@acm.org>
b3eda9b
Date:   Tue Nov 20 11:07:37 2018 +0100
b3eda9b
b3eda9b
    Add ppc host-side isel and instruction support for IROps added in previous commit.
b3eda9b
    
b3eda9b
    VEX/priv/host_ppc_defs.c, VEX/priv/host_ppc_defs.h:
b3eda9b
    
b3eda9b
    Don't emit cnttz{w,d}.  We may need them on a target which doesn't support
b3eda9b
    them.  Instead we can generate a fairly reasonable alternative sequence with
b3eda9b
    cntlz{w,d} instead.
b3eda9b
    
b3eda9b
    Add support for emitting popcnt{w,d}.
b3eda9b
    
b3eda9b
    VEX/priv/host_ppc_isel.c
b3eda9b
    
b3eda9b
    Add support for: Iop_ClzNat32 Iop_ClzNat64
b3eda9b
    
b3eda9b
    Redo support for: Iop_Ctz{32,64} and their Nat equivalents, so as to not use
b3eda9b
    cnttz{w,d}, as mentioned above.
b3eda9b
    
b3eda9b
    Add support for: Iop_PopCount64 Iop_PopCount32 Iop_Reverse8sIn32_x1
b3eda9b
b3eda9b
diff --git a/VEX/priv/host_ppc_defs.c b/VEX/priv/host_ppc_defs.c
b3eda9b
index b073c1d..f4b52e4 100644
b3eda9b
--- a/VEX/priv/host_ppc_defs.c
b3eda9b
+++ b/VEX/priv/host_ppc_defs.c
b3eda9b
@@ -501,9 +501,9 @@ const HChar* showPPCUnaryOp ( PPCUnaryOp op ) {
b3eda9b
    case Pun_NEG:   return "neg";
b3eda9b
    case Pun_CLZ32: return "cntlzw";
b3eda9b
    case Pun_CLZ64: return "cntlzd";
b3eda9b
-   case Pun_CTZ32: return "cnttzw";
b3eda9b
-   case Pun_CTZ64: return "cnttzd";
b3eda9b
    case Pun_EXTSW: return "extsw";
b3eda9b
+   case Pun_POP32: return "popcntw";
b3eda9b
+   case Pun_POP64: return "popcntd";
b3eda9b
    default: vpanic("showPPCUnaryOp");
b3eda9b
    }
b3eda9b
 }
b3eda9b
@@ -4265,20 +4265,19 @@ Int emit_PPCInstr ( /*MB_MOD*/Bool* is_profInc,
b3eda9b
          vassert(mode64);
b3eda9b
          p = mkFormX(p, 31, r_src, r_dst, 0, 58, 0, endness_host);
b3eda9b
          break;
b3eda9b
-      case Pun_CTZ32:  // cnttzw r_dst, r_src
b3eda9b
-         /* Note oder of src and dst is backwards from normal */
b3eda9b
-         p = mkFormX(p, 31, r_src, r_dst, 0, 538, 0, endness_host);
b3eda9b
-         break;
b3eda9b
-      case Pun_CTZ64:  // cnttzd r_dst, r_src
b3eda9b
-         /* Note oder of src and dst is backwards from normal */
b3eda9b
-         vassert(mode64);
b3eda9b
-         p = mkFormX(p, 31, r_src, r_dst, 0, 570, 0, endness_host);
b3eda9b
-         break;
b3eda9b
       case Pun_EXTSW:  // extsw r_dst, r_src
b3eda9b
          vassert(mode64);
b3eda9b
          p = mkFormX(p, 31, r_src, r_dst, 0, 986, 0, endness_host);
b3eda9b
          break;
b3eda9b
-      default: goto bad;
b3eda9b
+      case Pun_POP32:  // popcntw r_dst, r_src
b3eda9b
+         p = mkFormX(p, 31, r_src, r_dst, 0, 378, 0, endness_host);
b3eda9b
+         break;
b3eda9b
+      case Pun_POP64:  // popcntd r_dst, r_src
b3eda9b
+         vassert(mode64);
b3eda9b
+         p = mkFormX(p, 31, r_src, r_dst, 0, 506, 0, endness_host);
b3eda9b
+         break;
b3eda9b
+      default:
b3eda9b
+         goto bad;
b3eda9b
       }
b3eda9b
       goto done;
b3eda9b
    }
b3eda9b
diff --git a/VEX/priv/host_ppc_defs.h b/VEX/priv/host_ppc_defs.h
b3eda9b
index 17baff5..321fba9 100644
b3eda9b
--- a/VEX/priv/host_ppc_defs.h
b3eda9b
+++ b/VEX/priv/host_ppc_defs.h
b3eda9b
@@ -291,9 +291,9 @@ typedef
b3eda9b
       Pun_NOT,
b3eda9b
       Pun_CLZ32,
b3eda9b
       Pun_CLZ64,
b3eda9b
-      Pun_CTZ32,
b3eda9b
-      Pun_CTZ64,
b3eda9b
-      Pun_EXTSW
b3eda9b
+      Pun_EXTSW,
b3eda9b
+      Pun_POP32, // popcntw
b3eda9b
+      Pun_POP64  // popcntd
b3eda9b
    }
b3eda9b
    PPCUnaryOp;
b3eda9b
 
b3eda9b
diff --git a/VEX/priv/host_ppc_isel.c b/VEX/priv/host_ppc_isel.c
b3eda9b
index 6bdb5f7..5242176 100644
b3eda9b
--- a/VEX/priv/host_ppc_isel.c
b3eda9b
+++ b/VEX/priv/host_ppc_isel.c
b3eda9b
@@ -2065,12 +2065,15 @@ static HReg iselWordExpr_R_wrk ( ISelEnv* env, const IRExpr* e,
b3eda9b
             return r_dst;
b3eda9b
          }
b3eda9b
          break;
b3eda9b
-      case Iop_Clz32:
b3eda9b
-      case Iop_Clz64: {
b3eda9b
+
b3eda9b
+      case Iop_Clz32: case Iop_ClzNat32:
b3eda9b
+      case Iop_Clz64: case Iop_ClzNat64: {
b3eda9b
+         // cntlz is available even in the most basic (earliest) ppc
b3eda9b
+         // variants, so it's safe to generate it unconditionally.
b3eda9b
          HReg r_src, r_dst;
b3eda9b
-         PPCUnaryOp op_clz = (op_unop == Iop_Clz32) ? Pun_CLZ32 :
b3eda9b
-                                                      Pun_CLZ64;
b3eda9b
-         if (op_unop == Iop_Clz64 && !mode64)
b3eda9b
+         PPCUnaryOp op_clz = (op_unop == Iop_Clz32 || op_unop == Iop_ClzNat32)
b3eda9b
+                                ? Pun_CLZ32 : Pun_CLZ64;
b3eda9b
+         if ((op_unop == Iop_Clz64 || op_unop == Iop_ClzNat64) && !mode64)
b3eda9b
             goto irreducible;
b3eda9b
          /* Count leading zeroes. */
b3eda9b
          r_dst = newVRegI(env);
b3eda9b
@@ -2079,18 +2082,133 @@ static HReg iselWordExpr_R_wrk ( ISelEnv* env, const IRExpr* e,
b3eda9b
          return r_dst;
b3eda9b
       }
b3eda9b
 
b3eda9b
-      case Iop_Ctz32:
b3eda9b
-      case Iop_Ctz64: {
b3eda9b
-         HReg r_src, r_dst;
b3eda9b
-         PPCUnaryOp op_clz = (op_unop == Iop_Ctz32) ? Pun_CTZ32 :
b3eda9b
-                                                      Pun_CTZ64;
b3eda9b
-         if (op_unop == Iop_Ctz64 && !mode64)
b3eda9b
-            goto irreducible;
b3eda9b
-         /* Count trailing zeroes. */
b3eda9b
-         r_dst = newVRegI(env);
b3eda9b
-         r_src = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess);
b3eda9b
-         addInstr(env, PPCInstr_Unary(op_clz,r_dst,r_src));
b3eda9b
-         return r_dst;
b3eda9b
+      //case Iop_Ctz32:
b3eda9b
+      case Iop_CtzNat32:
b3eda9b
+      //case Iop_Ctz64:
b3eda9b
+      case Iop_CtzNat64:
b3eda9b
+      {
b3eda9b
+         // Generate code using Clz, because we can't assume the host has
b3eda9b
+         // Ctz.  In particular, part of the fix for bug 386945 involves
b3eda9b
+         // creating a Ctz in ir_opt.c from smaller fragments.
b3eda9b
+         PPCUnaryOp op_clz = Pun_CLZ64;
b3eda9b
+         Int WS = 64;
b3eda9b
+         if (op_unop == Iop_Ctz32 || op_unop == Iop_CtzNat32) {
b3eda9b
+            op_clz = Pun_CLZ32;
b3eda9b
+            WS = 32;
b3eda9b
+         }
b3eda9b
+         /* Compute ctz(arg) = wordsize - clz(~arg & (arg - 1)), thusly:
b3eda9b
+            t1 = arg - 1
b3eda9b
+            t2 = not arg
b3eda9b
+            t2 = t2 & t1
b3eda9b
+            t2 = clz t2
b3eda9b
+            t1 = WS
b3eda9b
+            t2 = t1 - t2
b3eda9b
+            // result in t2
b3eda9b
+         */
b3eda9b
+         HReg arg = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess);
b3eda9b
+         HReg t1 = newVRegI(env);
b3eda9b
+         HReg t2 = newVRegI(env);
b3eda9b
+         addInstr(env, PPCInstr_Alu(Palu_SUB, t1, arg, PPCRH_Imm(True, 1)));
b3eda9b
+         addInstr(env, PPCInstr_Unary(Pun_NOT, t2, arg));
b3eda9b
+         addInstr(env, PPCInstr_Alu(Palu_AND, t2, t2, PPCRH_Reg(t1)));
b3eda9b
+         addInstr(env, PPCInstr_Unary(op_clz, t2, t2));
b3eda9b
+         addInstr(env, PPCInstr_LI(t1, WS, False/*!64-bit imm*/));
b3eda9b
+         addInstr(env, PPCInstr_Alu(Palu_SUB, t2, t1, PPCRH_Reg(t2)));
b3eda9b
+         return t2;
b3eda9b
+      }
b3eda9b
+
b3eda9b
+      case Iop_PopCount64: {
b3eda9b
+         // popcnt{w,d} is only available in later arch revs (ISA 3.0,
b3eda9b
+         // maybe) so it's not really correct to emit it here without a caps
b3eda9b
+         // check for the host.
b3eda9b
+         if (mode64) {
b3eda9b
+            HReg r_dst = newVRegI(env);
b3eda9b
+            HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess);
b3eda9b
+            addInstr(env, PPCInstr_Unary(Pun_POP64, r_dst, r_src));
b3eda9b
+            return r_dst;
b3eda9b
+         }
b3eda9b
+         // We don't expect to be required to handle this in 32-bit mode.
b3eda9b
+         break;
b3eda9b
+      }
b3eda9b
+
b3eda9b
+      case Iop_PopCount32: {
b3eda9b
+         // Similar comment as for PopCount64 just above applies -- we really
b3eda9b
+         // should have a caps check here.
b3eda9b
+
b3eda9b
+        HReg r_dst = newVRegI(env);
b3eda9b
+        // This actually generates popcntw, which in 64 bit mode does a
b3eda9b
+        // 32-bit count individually for both low and high halves of the
b3eda9b
+        // word.  Per the comment at the top of iselIntExpr_R, in the 64
b3eda9b
+        // bit mode case, the user of this result is required to ignore
b3eda9b
+        // the upper 32 bits of the result.  In 32 bit mode this is all
b3eda9b
+        // moot.  It is however unclear from the PowerISA 3.0 docs that
b3eda9b
+        // the instruction exists in 32 bit mode; however our own front
b3eda9b
+        // end (guest_ppc_toIR.c) accepts it, so I guess it does exist.
b3eda9b
+        HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess);
b3eda9b
+        addInstr(env, PPCInstr_Unary(Pun_POP32, r_dst, r_src));
b3eda9b
+        return r_dst;
b3eda9b
+      }
b3eda9b
+
b3eda9b
+      case Iop_Reverse8sIn32_x1: {
b3eda9b
+         // A bit of a mouthful, but simply .. 32-bit byte swap.
b3eda9b
+         // This is pretty rubbish code.  We could do vastly better if
b3eda9b
+         // rotates, and better, rotate-inserts, were allowed.  Note that
b3eda9b
+         // even on a 64 bit target, the right shifts must be done as 32-bit
b3eda9b
+         // so as to introduce zero bits in the right places.  So it seems
b3eda9b
+         // simplest to do the whole sequence in 32-bit insns.
b3eda9b
+         /*
b3eda9b
+            r     = <argument>  // working temporary, initial byte order ABCD
b3eda9b
+            Mask  = 00FF00FF
b3eda9b
+            nMask = not Mask
b3eda9b
+            tHi   = and r, Mask
b3eda9b
+            tHi   = shl tHi, 8
b3eda9b
+            tLo   = and r, nMask
b3eda9b
+            tLo   = shr tLo, 8
b3eda9b
+            r     = or tHi, tLo  // now r has order BADC
b3eda9b
+            and repeat for 16 bit chunks ..
b3eda9b
+            Mask  = 0000FFFF
b3eda9b
+            nMask = not Mask
b3eda9b
+            tHi   = and r, Mask
b3eda9b
+            tHi   = shl tHi, 16
b3eda9b
+            tLo   = and r, nMask
b3eda9b
+            tLo   = shr tLo, 16
b3eda9b
+            r     = or tHi, tLo  // now r has order DCBA
b3eda9b
+         */
b3eda9b
+         HReg r_src  = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess);
b3eda9b
+         HReg rr     = newVRegI(env);
b3eda9b
+         HReg rMask  = newVRegI(env);
b3eda9b
+         HReg rnMask = newVRegI(env);
b3eda9b
+         HReg rtHi   = newVRegI(env);
b3eda9b
+         HReg rtLo   = newVRegI(env);
b3eda9b
+         // Copy r_src since we need to modify it
b3eda9b
+         addInstr(env, mk_iMOVds_RR(rr, r_src));
b3eda9b
+         // Swap within 16-bit lanes
b3eda9b
+         addInstr(env, PPCInstr_LI(rMask, 0x00FF00FFULL,
b3eda9b
+                                   False/* !64bit imm*/));
b3eda9b
+         addInstr(env, PPCInstr_Unary(Pun_NOT, rnMask, rMask));
b3eda9b
+         addInstr(env, PPCInstr_Alu(Palu_AND, rtHi, rr, PPCRH_Reg(rMask)));
b3eda9b
+         addInstr(env, PPCInstr_Shft(Pshft_SHL, True/*32 bit shift*/,
b3eda9b
+                                     rtHi, rtHi,
b3eda9b
+                                     PPCRH_Imm(False/*!signed imm*/, 8)));
b3eda9b
+         addInstr(env, PPCInstr_Alu(Palu_AND, rtLo, rr, PPCRH_Reg(rnMask)));
b3eda9b
+         addInstr(env, PPCInstr_Shft(Pshft_SHR, True/*32 bit shift*/,
b3eda9b
+                                     rtLo, rtLo,
b3eda9b
+                                     PPCRH_Imm(False/*!signed imm*/, 8)));
b3eda9b
+         addInstr(env, PPCInstr_Alu(Palu_OR, rr, rtHi, PPCRH_Reg(rtLo)));
b3eda9b
+         // And now swap the two 16-bit chunks
b3eda9b
+         addInstr(env, PPCInstr_LI(rMask, 0x0000FFFFULL,
b3eda9b
+                                   False/* !64bit imm*/));
b3eda9b
+         addInstr(env, PPCInstr_Unary(Pun_NOT, rnMask, rMask));
b3eda9b
+         addInstr(env, PPCInstr_Alu(Palu_AND, rtHi, rr, PPCRH_Reg(rMask)));
b3eda9b
+         addInstr(env, PPCInstr_Shft(Pshft_SHL, True/*32 bit shift*/,
b3eda9b
+                                     rtHi, rtHi,
b3eda9b
+                                     PPCRH_Imm(False/*!signed imm*/, 16)));
b3eda9b
+         addInstr(env, PPCInstr_Alu(Palu_AND, rtLo, rr, PPCRH_Reg(rnMask)));
b3eda9b
+         addInstr(env, PPCInstr_Shft(Pshft_SHR, True/*32 bit shift*/,
b3eda9b
+                                     rtLo, rtLo,
b3eda9b
+                                     PPCRH_Imm(False/*!signed imm*/, 16)));
b3eda9b
+         addInstr(env, PPCInstr_Alu(Palu_OR, rr, rtHi, PPCRH_Reg(rtLo)));
b3eda9b
+         return rr;
b3eda9b
       }
b3eda9b
 
b3eda9b
       case Iop_Left8: