Handle the new IROps introduced to support ARM64 SIMD.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@14362
Julian Seward 2014-08-26 18:35:13 +00:00
parent 319765a595
commit 3af5a88ca6

@@ -48,6 +48,13 @@
Check the interpretation for vector narrowing and widening ops,
particularly the saturating ones. I suspect they are overly
pessimistic and/or wrong.
Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
saturating shifts): the interpretation is overly pessimistic.
See comments on the relevant cases below for details.
Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
both rounding and non-rounding variants): ditto
*/
/* This file implements the Memcheck instrumentation, and in
@@ -790,6 +797,31 @@ static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
}
if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
/* Use InterleaveHI64x2 to copy the top half of the vector into
the bottom half. Then we can UifU it with the original, throw
away the upper half of the result, and PCast-I64-to-I64
the lower half. */
// Generates vbits[127:64] : vbits[127:64]
IRAtom* hi64hi64
= assignNew('V', mce, Ity_V128,
binop(Iop_InterleaveHI64x2, vbits, vbits));
// Generates
// UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
// == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
IRAtom* lohi64
= mkUifUV128(mce, hi64hi64, vbits);
// Generates UifU(vbits[127:64],vbits[63:0])
IRAtom* lo64
= assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
// Generates
// PCast-to-I64( UifU(vbits[127:64], vbits[63:0]) )
// == PCast-to-I64( vbits[127:0] )
IRAtom* res
= assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
return res;
}
/* Else do it the slow way .. */
/* First of all, collapse vbits down to a single bit. */
tmp1 = NULL;
@@ -857,6 +889,42 @@ static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
}
}
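
The V128 -> I64 fast path added above (the InterleaveHI64x2 trick) can be summarised by a small standalone sketch, assuming that UifU of V bits is bitwise OR and that Iop_CmpwNEZ64 yields all ones for any nonzero input; the model_ name below is illustrative only, not part of the patch.

#include <stdint.h>

/* Sketch of the V128 -> I64 PCast fast path: the result is all ones
   (entirely undefined) iff any of the 128 input V bits is set,
   otherwise zero. */
static uint64_t model_PCast_V128_to_I64 ( uint64_t vhi, uint64_t vlo )
{
   uint64_t uifu = vhi | vlo;              /* UifU of the two 64-bit halves */
   return uifu == 0 ? 0 : ~(uint64_t)0;    /* CmpwNEZ64 of the low half */
}
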
/* This is a minor variant. It takes an arg of some type and returns
a value of the same type. The result consists entirely of Defined
(zero) bits except its least significant bit, which is a PCast of
the entire argument down to a single bit. */
static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
{
if (ty == Ity_V128) {
/* --- Case for V128 --- */
IRAtom* varg128 = varg;
// generates: PCast-to-I64(varg128)
IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
// Now introduce zeros (defined bits) in the top 63 places
// generates: Def--(63)--Def PCast-to-I1(varg128)
IRAtom* d63pc
= assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
// generates: Def--(64)--Def
IRAtom* d64
= definedOfType(Ity_I64);
// generates: Def--(127)--Def PCast-to-I1(varg128)
IRAtom* res
= assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
return res;
}
if (ty == Ity_I64) {
/* --- Case for I64 --- */
// PCast to 64
IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
// Zero (Def) out the top 63 bits
IRAtom* res
= assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
return res;
}
/*NOTREACHED*/
tl_assert(0);
}
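
A corresponding sketch of the I64 case of mkPCastXXtoXXlsb, under the same assumptions (the model_ name is illustrative only):

#include <stdint.h>

/* All result bits are Defined (zero) except the lsb, which is the
   PCast of the entire argument down to a single bit. */
static uint64_t model_PCast_I64_to_I64_lsb ( uint64_t vbits )
{
   return vbits == 0 ? 0 : 1;
}

The V128 case is the same idea, with 127 defined zeroes above the PCast bit.
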
/* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
/*
Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
@@ -3123,11 +3191,20 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
/* V x V shifts/rotates are done using the standard lazy scheme. */
/* For the non-rounding variants of bi-di vector x vector
shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
But note that this is overly pessimistic, because in fact only
the bottom 8 bits of each lane of the second argument are taken
into account when shifting. So really we ought to ignore
undefinedness in bits 8 and above of each lane in the
second argument. */
case Iop_Shl8x16:
case Iop_Shr8x16:
case Iop_Sar8x16:
case Iop_Sal8x16:
case Iop_Rol8x16:
case Iop_Sh8Sx16:
case Iop_Sh8Ux16:
return mkUifUV128(mce,
assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
mkPCast8x16(mce,vatom2)
@@ -3138,6 +3215,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
case Iop_Sar16x8:
case Iop_Sal16x8:
case Iop_Rol16x8:
case Iop_Sh16Sx8:
case Iop_Sh16Ux8:
return mkUifUV128(mce,
assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
mkPCast16x8(mce,vatom2)
@@ -3148,6 +3227,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
case Iop_Sar32x4:
case Iop_Sal32x4:
case Iop_Rol32x4:
case Iop_Sh32Sx4:
case Iop_Sh32Ux4:
return mkUifUV128(mce,
assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
mkPCast32x4(mce,vatom2)
@@ -3158,11 +3239,31 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
case Iop_Sar64x2:
case Iop_Sal64x2:
case Iop_Rol64x2:
case Iop_Sh64Sx2:
case Iop_Sh64Ux2:
return mkUifUV128(mce,
assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
mkPCast64x2(mce,vatom2)
);
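
Per lane, the lazy scheme used for these non-rounding V x V shifts amounts to the following sketch, assuming the first operand's V bits have already been put through the original shift op (which is what binop(op, vatom1, atom2) does) and that a per-lane PCast yields all ones for any nonzero lane:

#include <stdint.h>

/* One 64-bit lane of the Sh64Sx2/Sh64Ux2 (etc.) scheme above. */
static uint64_t model_lazy_shift_lane ( uint64_t shiftedV1, uint64_t v2lane )
{
   uint64_t pcastV2 = (v2lane == 0) ? 0 : ~(uint64_t)0;  /* mkPCast64x2, one lane */
   return shiftedV1 | pcastV2;                           /* UifU */
}

This is where the pessimism noted in the comment above comes from: any undefined bit anywhere in the shift-amount lane marks the whole result lane undefined, even though only the bottom 8 bits of that lane affect the shift.
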
/* For the rounding variants of bi-di vector x vector shifts, the
rounding adjustment can cause undefinedness to propagate through
the entire lane, in the worst case. Too complex to handle
properly .. just UifU the arguments and then PCast them.
Suboptimal but safe. */
case Iop_Rsh8Sx16:
case Iop_Rsh8Ux16:
return binary8Ix16(mce, vatom1, vatom2);
case Iop_Rsh16Sx8:
case Iop_Rsh16Ux8:
return binary16Ix8(mce, vatom1, vatom2);
case Iop_Rsh32Sx4:
case Iop_Rsh32Ux4:
return binary32Ix4(mce, vatom1, vatom2);
case Iop_Rsh64Sx2:
case Iop_Rsh64Ux2:
return binary64Ix2(mce, vatom1, vatom2);
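
For the rounding variants, the comment above says to "just UifU the arguments and then PCast them", which is what binary64Ix2 and friends do. Per lane, the effect is roughly the following sketch (names illustrative only):

#include <stdint.h>

/* One 64-bit lane of binary64Ix2: the result lane is entirely
   undefined iff either operand lane has any undefined bit. */
static uint64_t model_binary64_lane ( uint64_t v1lane, uint64_t v2lane )
{
   uint64_t uifu = v1lane | v2lane;
   return uifu == 0 ? 0 : ~(uint64_t)0;
}
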
case Iop_F32ToFixed32Ux4_RZ:
case Iop_F32ToFixed32Sx4_RZ:
case Iop_Fixed32UToF32x4_RN:
@@ -3191,6 +3292,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
case Iop_Avg8Sx16:
case Iop_QAdd8Ux16:
case Iop_QAdd8Sx16:
case Iop_QAddExtUSsatSS8x16:
case Iop_QAddExtSUsatUU8x16:
case Iop_QSal8x16:
case Iop_QShl8x16:
case Iop_Add8x16:
@@ -3216,6 +3319,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
case Iop_Avg16Sx8:
case Iop_QAdd16Ux8:
case Iop_QAdd16Sx8:
case Iop_QAddExtUSsatSS16x8:
case Iop_QAddExtSUsatUU16x8:
case Iop_QSal16x8:
case Iop_QShl16x8:
case Iop_Add16x8:
@@ -3232,6 +3337,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
case Iop_QAdd32Ux4:
case Iop_QSub32Sx4:
case Iop_QSub32Ux4:
case Iop_QAddExtUSsatSS32x4:
case Iop_QAddExtSUsatUU32x4:
case Iop_QSal32x4:
case Iop_QShl32x4:
case Iop_Avg32Ux4:
@@ -3262,6 +3369,8 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
case Iop_QAdd64Sx2:
case Iop_QSub64Ux2:
case Iop_QSub64Sx2:
case Iop_QAddExtUSsatSS64x2:
case Iop_QAddExtSUsatUU64x2:
case Iop_PolynomialMulAdd64x2:
case Iop_CipherV128:
case Iop_CipherLV128:
@@ -3359,6 +3468,80 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
complainIfUndefined(mce, atom2, NULL);
return mkPCast32x4(mce, vatom1);
/* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
To make this simpler, do the following:
* complain if the shift amount (the I8) is undefined
* pcast each lane at the wide width
* truncate each lane to half width
* pcast the resulting 64-bit value to a single bit and use
that as the least significant bit of the upper half of the
result. */
case Iop_QandQShrNnarrow64Uto32Ux2:
case Iop_QandQSarNnarrow64Sto32Sx2:
case Iop_QandQSarNnarrow64Sto32Ux2:
case Iop_QandQRShrNnarrow64Uto32Ux2:
case Iop_QandQRSarNnarrow64Sto32Sx2:
case Iop_QandQRSarNnarrow64Sto32Ux2:
case Iop_QandQShrNnarrow32Uto16Ux4:
case Iop_QandQSarNnarrow32Sto16Sx4:
case Iop_QandQSarNnarrow32Sto16Ux4:
case Iop_QandQRShrNnarrow32Uto16Ux4:
case Iop_QandQRSarNnarrow32Sto16Sx4:
case Iop_QandQRSarNnarrow32Sto16Ux4:
case Iop_QandQShrNnarrow16Uto8Ux8:
case Iop_QandQSarNnarrow16Sto8Sx8:
case Iop_QandQSarNnarrow16Sto8Ux8:
case Iop_QandQRShrNnarrow16Uto8Ux8:
case Iop_QandQRSarNnarrow16Sto8Sx8:
case Iop_QandQRSarNnarrow16Sto8Ux8:
{
IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
IROp opNarrow = Iop_INVALID;
switch (op) {
case Iop_QandQShrNnarrow64Uto32Ux2:
case Iop_QandQSarNnarrow64Sto32Sx2:
case Iop_QandQSarNnarrow64Sto32Ux2:
case Iop_QandQRShrNnarrow64Uto32Ux2:
case Iop_QandQRSarNnarrow64Sto32Sx2:
case Iop_QandQRSarNnarrow64Sto32Ux2:
fnPessim = mkPCast64x2;
opNarrow = Iop_NarrowUn64to32x2;
break;
case Iop_QandQShrNnarrow32Uto16Ux4:
case Iop_QandQSarNnarrow32Sto16Sx4:
case Iop_QandQSarNnarrow32Sto16Ux4:
case Iop_QandQRShrNnarrow32Uto16Ux4:
case Iop_QandQRSarNnarrow32Sto16Sx4:
case Iop_QandQRSarNnarrow32Sto16Ux4:
fnPessim = mkPCast32x4;
opNarrow = Iop_NarrowUn32to16x4;
break;
case Iop_QandQShrNnarrow16Uto8Ux8:
case Iop_QandQSarNnarrow16Sto8Sx8:
case Iop_QandQSarNnarrow16Sto8Ux8:
case Iop_QandQRShrNnarrow16Uto8Ux8:
case Iop_QandQRSarNnarrow16Sto8Sx8:
case Iop_QandQRSarNnarrow16Sto8Ux8:
fnPessim = mkPCast16x8;
opNarrow = Iop_NarrowUn16to8x8;
break;
default:
tl_assert(0);
}
complainIfUndefined(mce, atom2, NULL);
// Pessimised shift result
IRAtom* shV
= fnPessim(mce, vatom1);
// Narrowed, pessimised shift result
IRAtom* shVnarrowed
= assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
// Generates: Def--(63)--Def PCast-to-I1(narrowed)
IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
// and assemble the result
return assignNew('V', mce, Ity_V128,
binop(Iop_64HLtoV128, qV, shVnarrowed));
}
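
Putting the pieces together, the V bits of the V128 result assemble as in this sketch, where vNarrowedShift stands for the narrowed, pessimised shift V bits computed above (the ModelV128 type and model_ name are illustrative only):

#include <stdint.h>

typedef struct { uint64_t hi, lo; } ModelV128;   /* hi = bits 127:64 */

static ModelV128 model_QandQShiftNarrow_result ( uint64_t vNarrowedShift )
{
   ModelV128 r;
   /* Lower half: the narrowed, pessimised shift V bits. */
   r.lo = vNarrowedShift;
   /* Upper half: 63 defined zeroes, then one bit summarising whether
      the narrowed shift V bits contain any undefinedness; this covers
      the saturation ("Q") part of the result. */
   r.hi = (vNarrowedShift == 0) ? 0 : 1;
   return r;
}
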
case Iop_Mull32Sx2:
case Iop_Mull32Ux2:
case Iop_QDMull32Sx2:
@@ -3977,6 +4160,66 @@ IRAtom* expr2vbits_Binop ( MCEnv* mce,
mkPCast32x8(mce, vatom2)
);
/* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
Handle the shifted results in the same way that other
binary Q ops are handled, e.g. QSub: UifU the two args,
then pessimise -- which is binaryNIxM. But for the upper
V128, we need to generate just one bit: the pessimised shift
result PCast down to a single bit, with 127 defined zeroes above it.
Note that this is overly pessimistic in that in fact only the
bottom 8 bits of each lane of the second arg determine the shift
amount. Really we ought to ignore any undefinedness in bits 8
and above of each lane of the second arg. */
case Iop_QandSQsh64x2: case Iop_QandUQsh64x2:
case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
case Iop_QandSQsh32x4: case Iop_QandUQsh32x4:
case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
case Iop_QandSQsh16x8: case Iop_QandUQsh16x8:
case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
case Iop_QandSQsh8x16: case Iop_QandUQsh8x16:
case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
{
// The function to generate the pessimised shift result
IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
switch (op) {
case Iop_QandSQsh64x2:
case Iop_QandUQsh64x2:
case Iop_QandSQRsh64x2:
case Iop_QandUQRsh64x2:
binaryNIxM = binary64Ix2;
break;
case Iop_QandSQsh32x4:
case Iop_QandUQsh32x4:
case Iop_QandSQRsh32x4:
case Iop_QandUQRsh32x4:
binaryNIxM = binary32Ix4;
break;
case Iop_QandSQsh16x8:
case Iop_QandUQsh16x8:
case Iop_QandSQRsh16x8:
case Iop_QandUQRsh16x8:
binaryNIxM = binary16Ix8;
break;
case Iop_QandSQsh8x16:
case Iop_QandUQsh8x16:
case Iop_QandSQRsh8x16:
case Iop_QandUQRsh8x16:
binaryNIxM = binary8Ix16;
break;
default:
tl_assert(0);
}
tl_assert(binaryNIxM);
// Pessimised shift result, shV[127:0]
IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
// Generates: Def--(127)--Def PCast-to-I1(shV)
IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
// and assemble the result
return assignNew('V', mce, Ity_V256,
binop(Iop_V128HLtoV256, qV, shV));
}
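
The V256 result assembles analogously: the lower V128 carries the lane-wise pessimised shift V bits, and the upper V128 is all defined except for its least significant bit, which is the PCast of the whole shift result. A sketch, with shV split into two 64-bit halves (types and names illustrative only):

#include <stdint.h>

typedef struct { uint64_t w3, w2, w1, w0; } ModelV256;   /* w3 = bits 255:192 */

static ModelV256 model_QandQsh_result ( uint64_t shVhi, uint64_t shVlo )
{
   ModelV256 r;
   r.w1 = shVhi;                            /* lower V128: pessimised shift V bits */
   r.w0 = shVlo;
   r.w3 = 0;                                /* upper V128: all defined ...          */
   r.w2 = ((shVhi | shVlo) == 0) ? 0 : 1;   /* ... except the lsb (the "Q" part)    */
   return r;
}
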
default:
ppIROp(op);
VG_(tool_panic)("memcheck:expr2vbits_Binop");
@@ -4047,6 +4290,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
case Iop_Dup8x16:
case Iop_Dup16x8:
case Iop_Dup32x4:
case Iop_Reverse1sIn8_x16:
case Iop_Reverse8sIn16_x8:
case Iop_Reverse8sIn32_x4:
case Iop_Reverse16sIn32_x4:
@@ -4232,6 +4476,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
case Iop_FtoI32Ux4_RZ:
case Iop_FtoI32Sx4_RZ:
case Iop_Abs32x4:
case Iop_RSqrtEst32Ux4:
return mkPCast32x4(mce, vatom);
case Iop_CmpwNEZ32: