/*---------------------------------------------------------------*/
/*---                                                         ---*/
/*--- This file (host-generic/h_generic_simd64.c) is          ---*/
/*--- Copyright (c) 2005 OpenWorks LLP.  All rights reserved. ---*/
/*---                                                         ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of LibVEX, a library for dynamic binary
   instrumentation and translation.

   Copyright (C) 2004-2005 OpenWorks, LLP.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; Version 2 dated June 1991 of the
   license.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or liability
   for damages.  See the GNU General Public License for more details.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
   USA.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host-generic/h_generic_simd64.h"

/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & (UInt)(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & (UInt)w64;
}

/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = (UInt)(w64 >> 32);
   return 0xFFFF & (UShort)(hi32 >> 16);
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = (UInt)(w64 >> 32);
   return 0xFFFF & (UShort)hi32;
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return 0xFFFF & (UShort)(lo32 >> 16);
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return 0xFFFF & (UShort)lo32;
}

/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8) | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = (UInt)(w64 >> 32);
   return 0xFF & (UChar)(hi32 >> 24);
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = (UInt)(w64 >> 32);
   return 0xFF & (UChar)(hi32 >> 16);
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = (UInt)(w64 >> 32);
   return 0xFF & (UChar)(hi32 >> 8);
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = (UInt)(w64 >> 32);
   return 0xFF & (UChar)(hi32 >> 0);
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return 0xFF & (UChar)(lo32 >> 24);
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return 0xFF & (UChar)(lo32 >> 16);
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return 0xFF & (UChar)(lo32 >> 8);
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return 0xFF & (UChar)(lo32 >> 0);
}

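/* Illustrative sketch (added for clarity, not part of the original
   file): the mkNxM/selNxM helpers follow the convention that lane k
   of an 8x8 vector occupies bits [8k+7 : 8k], so selecting every
   lane and re-packing them is the identity.  The check below is kept
   out of the build; it assumes a hosted <assert.h> if ever enabled. */
#if 0
#include <assert.h>
static void check_mk_sel_roundtrip ( void )
{
   ULong v = 0x0123456789ABCDEFULL;
   assert( sel8x8_7(v) == 0x01 && sel8x8_0(v) == 0xEF );
   assert( mk8x8( sel8x8_7(v), sel8x8_6(v), sel8x8_5(v), sel8x8_4(v),
                  sel8x8_3(v), sel8x8_2(v), sel8x8_1(v), sel8x8_0(v) )
           == v );
   assert( mk16x4( sel16x4_3(v), sel16x4_2(v),
                   sel16x4_1(v), sel16x4_0(v) )
           == v );
}
#endif
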
/* Scalar helpers. */

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

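/* Worked example (added for clarity, not part of the original file):
   the q-prefixed helpers clamp to the lane's representable range
   rather than wrapping, e.g.
      qadd16S(30000, 10000) == 32767   (signed 16-bit saturates high)
      qadd8U (200,   100)   == 255     (unsigned 8-bit saturates high)
      qsub8U (10,    20)    == 0       (unsigned 8-bit saturates low)
*/
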
static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

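/* Worked example (added for clarity, not part of the original file):
   mul16 keeps the low 16 bits of the 32-bit product, while the
   mulhi16 variants keep the high 16 bits, e.g.
      mul16   (0x0102, 0x0304) == 0x0A08   (low  half of 0x00030A08)
      mulhi16U(0xFFFF, 0xFFFF) == 0xFFFE   (high half of 0xFFFE0001)
      mulhi16S(-2,     3)      == -1       (high half of -6, shifted
                                            arithmetically)
*/
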
static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return xx==yy ? 0xFFFF : 0;
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return xx==yy ? 0xFF : 0;
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return xx>yy ? 0xFFFF : 0;
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return xx>yy ? 0xFF : 0;
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return xx==0 ? 0 : 0xFFFF;
}

static inline UChar cmpnez8 ( UChar xx )
{
   return xx==0 ? 0 : 0xFF;
}

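/* Worked example (added for clarity, not part of the original file):
   the comparison helpers return an all-ones lane on success and an
   all-zeroes lane on failure, in the MMX-style mask convention, e.g.
      cmpeq16(7, 7)    == 0xFFFF
      cmpgt8S(-1, -2)  == 0xFF     (signed compare: -1 > -2)
      cmpgt8S(-1, 1)   == 0x00
*/
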
static inline Short qnarrow32Sto16 ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8 ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Uto8 ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

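/* Worked example (added for clarity, not part of the original file):
   all three narrowing helpers interpret their input as *signed* and
   then clamp it to the destination range; in particular
   qnarrow16Uto8 maps negative inputs to 0 rather than wrapping, e.g.
      qnarrow32Sto16(0x00012345) ==  0x7FFF   (clamped high)
      qnarrow16Sto8 (0xFF80)     == -128      (in range, unchanged)
      qnarrow16Uto8 (0xFFFF)     ==  0        (-1, clamped low)
      qnarrow16Uto8 (0x0123)     ==  0xFF     (291, clamped high)
*/
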
/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UShort shl16 ( UShort v, UInt n )
{
   return v << n;
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return (((UShort)v) >> n);
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return ((Short)v) >> n;
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

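/* Worked example (added for clarity, not part of the original file):
   the sar variants cast to the signed type first so the shift
   replicates the sign bit, whereas shr shifts in zeroes, e.g.
      shr16(0x8000, 1) == 0x4000
      sar16(0x8000, 1) == 0xC000   (viewed as an unsigned 16-bit pattern)
*/
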
static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

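/* Worked example (added for clarity, not part of the original file):
   the averaging helpers add 1 before halving, so odd sums round up
   (the PAVGB/PAVGW convention), e.g.
      avg8U(1, 2)     == 2
      avg8U(255, 255) == 255   (no overflow: the sum is held in a UInt)
*/
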
static inline Short max16S ( Short xx, Short yy )
{
   return (xx > yy) ? xx : yy;
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return (xx > yy) ? xx : yy;
}

static inline Short min16S ( Short xx, Short yy )
{
   return (xx < yy) ? xx : yy;
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return (xx < yy) ? xx : yy;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             sel16x4_3(xx) + sel16x4_3(yy),
             sel16x4_2(xx) + sel16x4_2(yy),
             sel16x4_1(xx) + sel16x4_1(yy),
             sel16x4_0(xx) + sel16x4_0(yy)
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             sel8x8_7(xx) + sel8x8_7(yy),
             sel8x8_6(xx) + sel8x8_6(yy),
             sel8x8_5(xx) + sel8x8_5(yy),
             sel8x8_4(xx) + sel8x8_4(yy),
             sel8x8_3(xx) + sel8x8_3(yy),
             sel8x8_2(xx) + sel8x8_2(yy),
             sel8x8_1(xx) + sel8x8_1(yy),
             sel8x8_0(xx) + sel8x8_0(yy)
          );
}

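/* Worked example (added for clarity, not part of the original file):
   each lane is added independently, and any carry out of a lane is
   simply discarded when mk16x4 re-packs the truncated lane results,
   e.g.
      h_generic_calc_Add16x4(0x0001FFFF00030004ULL,
                             0x0001000100010001ULL)
         == 0x0002000000040005ULL   (lane 2 wraps from 0xFFFF to 0x0000)
*/
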
/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             sel16x4_3(xx) - sel16x4_3(yy),
             sel16x4_2(xx) - sel16x4_2(yy),
             sel16x4_1(xx) - sel16x4_1(yy),
             sel16x4_0(xx) - sel16x4_0(yy)
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             sel8x8_7(xx) - sel8x8_7(yy),
             sel8x8_6(xx) - sel8x8_6(yy),
             sel8x8_5(xx) - sel8x8_5(yy),
             sel8x8_4(xx) - sel8x8_4(yy),
             sel8x8_3(xx) - sel8x8_3(yy),
             sel8x8_2(xx) - sel8x8_2(yy),
             sel8x8_1(xx) - sel8x8_1(yy),
             sel8x8_0(xx) - sel8x8_0(yy)
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrow32Sx2 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16(d),
             qnarrow32Sto16(c),
             qnarrow32Sto16(b),
             qnarrow32Sto16(a)
          );
}

ULong h_generic_calc_QNarrow16Sx4 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8(h),
             qnarrow16Sto8(g),
             qnarrow16Sto8(f),
             qnarrow16Sto8(e),
             qnarrow16Sto8(d),
             qnarrow16Sto8(c),
             qnarrow16Sto8(b),
             qnarrow16Sto8(a)
          );
}

ULong h_generic_calc_QNarrow16Ux4 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Uto8(h),
             qnarrow16Uto8(g),
             qnarrow16Uto8(f),
             qnarrow16Uto8(e),
             qnarrow16Uto8(d),
             qnarrow16Uto8(c),
             qnarrow16Uto8(b),
             qnarrow16Uto8(a)
          );
}

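/* Worked example (added for clarity, not part of the original file):
   the narrowing primops place the narrowed lanes of aa in the high
   half of the result and the narrowed lanes of bb in the low half,
   e.g.
      h_generic_calc_QNarrow32Sx2(0x0000000100000002ULL,
                                  0xFFFFFFFF00012345ULL)
         == 0x00010002FFFF7FFFULL
   (the two lanes of aa, 1 and 2, are in range and pass through; bb's
    -1 narrows to 0xFFFF and bb's 0x00012345 saturates to 0x7FFF).
*/
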
/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

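/* Worked example (added for clarity, not part of the original file):
   the LO forms interleave the low halves of the two operands, with
   each lane of aa ending up just above the corresponding lane of bb,
   e.g.
      h_generic_calc_InterleaveLO8x8(0x0706050403020100ULL,
                                     0xF7F6F5F4F3F2F1F0ULL)
         == 0x03F302F201F100F0ULL
*/
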
/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked here
   so that the scalar shifts are always in range.  Given the
   semantics of these primops (ShlN16x4, etc), it is in fact an error
   if we are ever handed an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

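/* Worked example (added for clarity, not part of the original file):
   the shift count applies to every lane, and bits shifted out of a
   lane are lost rather than propagating into the neighbouring lane,
   e.g.
      h_generic_calc_ShlN16x4(0x0001800100010001ULL, 4)
         == 0x0010001000100010ULL   (lane 2's top bit is shifted out)
*/
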
/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/*---------------------------------------------------------------*/
/*--- end                  host-generic/h_generic_simd64.c    ---*/
/*---------------------------------------------------------------*/