commit e221eca26be6b2396e3fcbf4117e630fc22e79f6
Author: Julian Seward
Date:   Tue Nov 20 11:28:42 2018 +0100

    Add Memcheck support for IROps added in 42719898.

    memcheck/mc_translate.c:

    Add mkRight{32,64} as right-travelling analogues to mkLeft{32,64}.

    doCmpORD: for the cases of a signed comparison against zero, compute
    definedness of the 3 result bits (lt,gt,eq) separately, and, for the
    lt and eq bits, do it exactly accurately.

    expensiveCountTrailingZeroes: no functional change.  Re-analyse/verify
    and add comments.

    expensiveCountLeadingZeroes: add.  Very similar to
    expensiveCountTrailingZeroes.

    Add some comments to mark unary ops which are self-shadowing.

    Route Iop_Ctz{,Nat}{32,64} through expensiveCountTrailingZeroes.
    Route Iop_Clz{,Nat}{32,64} through expensiveCountLeadingZeroes.

    Add instrumentation for Iop_PopCount{32,64} and Iop_Reverse8sIn32_x1.

    memcheck/tests/vbit-test/irops.c:

    Add dummy new entries for all new IROps, just enough to make it
    compile and run.

diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 68a2ab3..c24db91 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -737,6 +737,34 @@ static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
    return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
 }
 
+/* --------- The Right-family of operations. --------- */
+
+/* Unfortunately these are a lot more expensive than their Left
+   counterparts.  Fortunately they are only very rarely used -- only for
+   count-leading-zeroes instrumentation. */
+
+static IRAtom* mkRight32 ( MCEnv* mce, IRAtom* a1 )
+{
+   for (Int i = 1; i <= 16; i *= 2) {
+      // a1 |= (a1 >>u i)
+      IRAtom* tmp
+         = assignNew('V', mce, Ity_I32, binop(Iop_Shr32, a1, mkU8(i)));
+      a1 = assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, tmp));
+   }
+   return a1;
+}
+
+static IRAtom* mkRight64 ( MCEnv* mce, IRAtom* a1 )
+{
+   for (Int i = 1; i <= 32; i *= 2) {
+      // a1 |= (a1 >>u i)
+      IRAtom* tmp
+         = assignNew('V', mce, Ity_I64, binop(Iop_Shr64, a1, mkU8(i)));
+      a1 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, tmp));
+   }
+   return a1;
+}
+
 /* --------- 'Improvement' functions for AND/OR. --------- */
 
 /* ImproveAND(data, vbits) = data OR vbits.  Defined (0) data 0s give
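
(Aside, not part of the patch: the block below is a minimal plain-C sketch, with invented names and an arbitrary example value, of the computation that the IR built by mkRight32 performs: an or-smear that copies the most significant 1 bit rightwards into every lower position, the mirror image of the Left-family's leftward smear.)

   #include <stdint.h>
   #include <stdio.h>

   /* Rightward or-smear: every bit at or below the leading 1 becomes 1.
      This mirrors the IR that mkRight32 builds (a1 |= a1 >>u i, for
      i = 1, 2, 4, 8, 16). */
   static uint32_t right_smear32 ( uint32_t x )
   {
      for (unsigned i = 1; i <= 16; i *= 2)
         x |= x >> i;
      return x;
   }

   int main ( void )
   {
      uint32_t x = 0x00480000;                     /* leading 1 at bit 22 */
      printf("x    = %08x\n", x);                  /* 00480000 */
      printf("R(x) = %08x\n", right_smear32(x));   /* 007fffff */
      return 0;
   }
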
@@ -1280,20 +1308,18 @@ static IRAtom* doCmpORD ( MCEnv* mce,
                           IRAtom* xxhash, IRAtom* yyhash,
                           IRAtom* xx, IRAtom* yy )
 {
-   Bool m64    = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
-   Bool syned  = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
-   IROp opOR   = m64 ? Iop_Or64  : Iop_Or32;
-   IROp opAND  = m64 ? Iop_And64 : Iop_And32;
-   IROp opSHL  = m64 ? Iop_Shl64 : Iop_Shl32;
-   IROp opSHR  = m64 ? Iop_Shr64 : Iop_Shr32;
-   IRType ty   = m64 ? Ity_I64 : Ity_I32;
-   Int width   = m64 ? 64 : 32;
+   Bool m64       = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
+   Bool syned     = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
+   IROp opOR      = m64 ? Iop_Or64   : Iop_Or32;
+   IROp opAND     = m64 ? Iop_And64  : Iop_And32;
+   IROp opSHL     = m64 ? Iop_Shl64  : Iop_Shl32;
+   IROp opSHR     = m64 ? Iop_Shr64  : Iop_Shr32;
+   IROp op1UtoWS  = m64 ? Iop_1Uto64 : Iop_1Uto32;
+   IRType ty      = m64 ? Ity_I64    : Ity_I32;
+   Int width      = m64 ? 64         : 32;
 
    Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
 
-   IRAtom* threeLeft1 = NULL;
-   IRAtom* sevenLeft1 = NULL;
-
    tl_assert(isShadowAtom(mce,xxhash));
    tl_assert(isShadowAtom(mce,yyhash));
    tl_assert(isOriginalAtom(mce,xx));
@@ -1312,30 +1338,55 @@ static IRAtom* doCmpORD ( MCEnv* mce,
       /* fancy interpretation */
       /* if yy is zero, then it must be fully defined (zero#). */
       tl_assert(isZero(yyhash));
-      threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
+      // This is still inaccurate, but I don't think it matters, since
+      // nobody writes code of the form
+      // "is signedly greater than zero?".
+      // We therefore simply declare "x >s 0" to be undefined if any bit in
+      // x is undefined.  That's clearly suboptimal in some cases.  Eg, if
+      // the highest order bit is a defined 1 then x is negative so it
+      // doesn't matter whether the remaining bits are defined or not.
+      IRAtom* t_0_gt_0_0
+         = assignNew(
+              'V', mce,ty,
+              binop(
+                 opAND,
+                 mkPCastTo(mce,ty, xxhash),
+                 m64 ? mkU64(1<<2) : mkU32(1<<2)
+              ));
+      // For "x
[...]
+   // improver = ~(R(atom) >>u 1)
+   //
+   // That is, improver has its upper clz(atom)+1 bits equal to one;
+   // lower bits (if any) equal to zero.  So it's exactly the right
+   // mask to use to remove the irrelevant undefined input bits.
+   /* Here are some examples:
+         atom             = 0...0 1 U...U
+         R(atom)          = 0...0 1 1...1
+         R(atom) >>u 1    = 0...0 0 1...1
+         ~(R(atom) >>u 1) = 1...1 1 0...0
+      which correctly describes which bits of |atom| actually influence
+      the result.
+      A boundary case:
+         atom             = 0...0
+         R(atom)          = 0...0
+         R(atom) >>u 1    = 0...0
+         ~(R(atom) >>u 1) = 1...1
+      also a correct mask for the input: all input bits are relevant.
+      Another boundary case:
+         atom             = 1 1..1
+         R(atom)          = 1 1..1
+         R(atom) >>u 1    = 0 1..1
+         ~(R(atom) >>u 1) = 1 0..0
+      also a correct mask: only the leftmost input bit is relevant.
+      Now with misc U bits interspersed:
+         atom             = 0...0 1 U...U 0 1 U...U
+         R(atom)          = 0...0 1 1...1 1 1 1...1
+         R(atom) >>u 1    = 0...0 0 1...1 1 1 1...1
+         ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct.
+      (Per initial implementation of 15 Nov 2018)
+   */
+   improver = mkRight(mce, atom);
+   improver = assignNew('V', mce, ty, binop(shrOp, improver, mkU8(1)));
+   improver = assignNew('V', mce, ty, unop(notOp, improver));
+
+   // improved = vatom & improver
+   //
+   // That is, treat any V bits to the right of the leftmost clz(atom)+1
+   // bits as "defined".
+   improved = assignNew('V', mce, ty,
+                        binop(andOp, vatom, improver));
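
(Aside, again not part of the patch: to make the bit-pictures above concrete, here is a small plain-C sketch, on ordinary 32-bit values rather than IR and with invented names, of how the mask ~(R(atom) >>u 1) keeps only the V bits that can influence a count-leading-zeroes result, namely the bits from the leading 1 of atom upwards.)

   #include <stdint.h>
   #include <stdio.h>

   static uint32_t right_smear32 ( uint32_t x )   /* as in the earlier sketch */
   {
      for (unsigned i = 1; i <= 16; i *= 2)
         x |= x >> i;
      return x;
   }

   int main ( void )
   {
      /* atom: the concrete input; vbits: 1 bits mark undefined input bits */
      uint32_t atom  = 0x00400000;   /* leading 1 at bit 22, so clz = 9 */
      uint32_t vbits = 0x0000ffff;   /* low 16 bits undefined */

      uint32_t improver = ~(right_smear32(atom) >> 1);  /* bits 31..22 set */
      uint32_t improved = vbits & improver;             /* V bits that matter */

      /* Per the scheme above, the clz result is treated as fully defined
         exactly when improved == 0, i.e. when no bit at or above the
         leading 1 is undefined; the undefined low bits are ignored. */
      printf("improver = %08x\n", improver);   /* ffc00000 */
      printf("improved = %08x\n", improved);   /* 00000000 */
      return 0;
   }
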
@@ -4705,6 +4866,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_RecipEst32F0x4:
          return unary32F0x4(mce, vatom);
 
+      // These are self-shadowing.
       case Iop_32UtoV128:
       case Iop_64UtoV128:
       case Iop_Dup8x16:
@@ -4745,6 +4907,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_MulI128by10Carry:
       case Iop_F16toF64x2:
       case Iop_F64toF16x2:
+         // FIXME JRS 2018-Nov-15.  This is surely not correct!
          return vatom;
 
       case Iop_I32StoF128: /* signed I32 -> F128 */
@@ -4770,7 +4933,6 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_RoundF64toF64_NegINF:
       case Iop_RoundF64toF64_PosINF:
       case Iop_RoundF64toF64_ZERO:
-      case Iop_Clz64:
       case Iop_D32toD64:
       case Iop_I32StoD64:
       case Iop_I32UtoD64:
@@ -4785,17 +4947,32 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_D64toD128:
          return mkPCastTo(mce, Ity_I128, vatom);
 
-      case Iop_Clz32:
       case Iop_TruncF64asF32:
       case Iop_NegF32:
       case Iop_AbsF32:
       case Iop_F16toF32:
          return mkPCastTo(mce, Ity_I32, vatom);
 
-      case Iop_Ctz32:
-      case Iop_Ctz64:
+      case Iop_Ctz32: case Iop_CtzNat32:
+      case Iop_Ctz64: case Iop_CtzNat64:
          return expensiveCountTrailingZeroes(mce, op, atom, vatom);
 
+      case Iop_Clz32: case Iop_ClzNat32:
+      case Iop_Clz64: case Iop_ClzNat64:
+         return expensiveCountLeadingZeroes(mce, op, atom, vatom);
+
+      // PopCount32: this is slightly pessimistic.  It is true that the
+      // result depends on all input bits, so that aspect of the PCast is
+      // correct.  However, regardless of the input, only the lowest 5 bits
+      // out of the output can ever be undefined.  So we could actually
+      // "improve" the results here by marking the top 27 bits of output as
+      // defined.  A similar comment applies for PopCount64.
+      case Iop_PopCount32:
+         return mkPCastTo(mce, Ity_I32, vatom);
+      case Iop_PopCount64:
+         return mkPCastTo(mce, Ity_I64, vatom);
+
+      // These are self-shadowing.
       case Iop_1Uto64:
       case Iop_1Sto64:
       case Iop_8Uto64:
@@ -4821,6 +4998,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_V256to64_2:
       case Iop_V256to64_3:
          return assignNew('V', mce, Ity_I64, unop(op, vatom));
+      // These are self-shadowing.
       case Iop_64to32:
       case Iop_64HIto32:
       case Iop_1Uto32:
@@ -4830,8 +5008,10 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_16Uto32:
       case Iop_16Sto32:
       case Iop_8Sto32:
       case Iop_V128to32:
+      case Iop_Reverse8sIn32_x1:
          return assignNew('V', mce, Ity_I32, unop(op, vatom));
+      // These are self-shadowing.
       case Iop_8Sto16:
       case Iop_8Uto16:
       case Iop_32to16:
@@ -4840,6 +5020,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_GetMSBs8x16:
          return assignNew('V', mce, Ity_I16, unop(op, vatom));
 
+      // These are self-shadowing.
       case Iop_1Uto8:
       case Iop_1Sto8:
       case Iop_16to8:
@@ -4868,6 +5049,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_Not16:
       case Iop_Not8:
       case Iop_Not1:
+         // FIXME JRS 2018-Nov-15.  This is surely not correct!
          return vatom;
 
       case Iop_CmpNEZ8x8:
@@ -4929,6 +5111,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
       case Iop_Ctz64x2:
          return mkPCast64x2(mce, vatom);
 
+      // This is self-shadowing.
       case Iop_PwBitMtxXpose64x2:
          return assignNew('V', mce, Ity_V128, unop(op, vatom));
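
(Aside, not part of the patch: "self-shadowing" above means that the shadow, i.e. V-bit, value of the result is obtained by applying the very same operation to the operand's shadow, hence the plain unop(op, vatom) in those cases.  Below is a minimal plain-C illustration for Iop_Reverse8sIn32_x1, with invented helper names: the op reverses the four bytes of a 32-bit lane, so reversing the V bits the same way gives exactly the right result shadow.)

   #include <stdint.h>
   #include <stdio.h>

   /* Stand-in for Iop_Reverse8sIn32_x1: reverse the four bytes of a
      32-bit value.  Each output byte is a copy of one input byte, so the
      same permutation applied to the V bits is the exact result shadow. */
   static uint32_t reverse8s_in32 ( uint32_t x )
   {
      return (x << 24) | ((x & 0x0000ff00u) << 8)
           | ((x >> 8) & 0x0000ff00u) | (x >> 24);
   }

   int main ( void )
   {
      uint32_t val   = 0x11223344;
      uint32_t vbits = 0x000000ff;   /* lowest byte undefined */
      printf("result value = %08x\n", reverse8s_in32(val));    /* 44332211 */
      printf("result vbits = %08x\n", reverse8s_in32(vbits));  /* ff000000 */
      return 0;
   }
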
diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c
index bfd82fc..e8bf67d 100644
--- a/memcheck/tests/vbit-test/irops.c
+++ b/memcheck/tests/vbit-test/irops.c
@@ -111,6 +111,12 @@ static irop_t irops[] = {
   { DEFOP(Iop_Clz32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 },
   { DEFOP(Iop_Ctz64, UNDEF_ALL), .s390x = 0, .amd64 = 1, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
   { DEFOP(Iop_Ctz32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 1, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
+  { DEFOP(Iop_ClzNat64, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 }, // ppc32 asserts
+  { DEFOP(Iop_ClzNat32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 1, .mips32 =0, .mips64 = 0 },
+  { DEFOP(Iop_CtzNat64, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
+  { DEFOP(Iop_CtzNat32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 1, .mips32 =0, .mips64 = 0 },
+  { DEFOP(Iop_PopCount64, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 },
+  { DEFOP(Iop_PopCount32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 1, .mips32 =0, .mips64 = 0 },
   { DEFOP(Iop_CmpLT32S, UNDEF_ALL), .s390x = 1, .amd64 = 1, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 },
   { DEFOP(Iop_CmpLT64S, UNDEF_ALL), .s390x = 1, .amd64 = 1, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 1 }, // ppc, mips assert
   { DEFOP(Iop_CmpLE32S, UNDEF_ALL), .s390x = 1, .amd64 = 1, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 },
@@ -336,6 +342,7 @@ static irop_t irops[] = {
   { DEFOP(Iop_Sad8Ux4, UNDEF_UNKNOWN), },
   { DEFOP(Iop_CmpNEZ16x2, UNDEF_UNKNOWN), },
   { DEFOP(Iop_CmpNEZ8x4, UNDEF_UNKNOWN), },
+  { DEFOP(Iop_Reverse8sIn32_x1, UNDEF_UNKNOWN) },
   /* ------------------ 64-bit SIMD FP ------------------------ */
   { DEFOP(Iop_I32UtoFx2, UNDEF_UNKNOWN), },
   { DEFOP(Iop_I32StoFx2, UNDEF_UNKNOWN), },
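
(Aside, not part of the patch: the doCmpORD change near the top of the diff computes the definedness of the three CmpORD result bits, lt = 8, gt = 4, eq = 2, separately when the comparison is against zero.  The sketch below only models that scheme as the commit message describes it, in plain C and with invented names; it is not the IR that mc_translate.c emits.  The gt bit stays pessimistic, undefined whenever any bit of x is undefined, while the lt bit's definedness is exactly the definedness of x's sign bit, and the eq bit is undefined only when the undefined bits of x could actually change the outcome of "x == 0".)

   #include <stdint.h>
   #include <stdio.h>

   /* Concrete CmpORD32S(x,0): 8 if x < 0, 4 if x > 0, 2 if x == 0. */
   static uint32_t cmpord32s_vs_zero ( int32_t x )
   {
      return x < 0 ? 8 : x > 0 ? 4 : 2;
   }

   /* Shadow of that result: 1 bits in |vx| mark undefined bits of x, and
      1 bits in the return value mark undefined result bits. */
   static uint32_t cmpord32s_vs_zero_shadow ( uint32_t x, uint32_t vx )
   {
      /* gt (bit 2): pessimistic; undefined if any bit of x is undefined. */
      uint32_t v_gt = vx != 0 ? (1u << 2) : 0;
      /* lt (bit 3): exact; x < 0 depends only on the sign bit, so copy
         that bit's definedness. */
      uint32_t v_lt = (vx >> 31) << 3;
      /* eq (bit 1): exact; undefined only if all defined bits of x are 0
         and at least one bit of x is undefined. */
      uint32_t v_eq = ((x & ~vx) == 0 && vx != 0) ? (1u << 1) : 0;
      return v_lt | v_gt | v_eq;
   }

   int main ( void )
   {
      /* Low byte of x undefined, but bit 16 is a defined 1: x is certainly
         nonzero and non-negative, so only the gt bit is undefined. */
      uint32_t x = 0x00010000, vx = 0x000000ff;
      printf("concrete result = %x\n", cmpord32s_vs_zero((int32_t)x));   /* 4 */
      printf("result shadow   = %x\n", cmpord32s_vs_zero_shadow(x, vx)); /* 4 */
      return 0;
   }
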