/*--------------------------------------------------------------------*/
/*--- The JITter proper: register allocation & code improvement   ---*/
/*---                                               vg_translate.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, an x86 protected-mode emulator
   designed for debugging and profiling binaries on x86-Unixes.

   Copyright (C) 2000-2002 Julian Seward
      jseward@acm.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file LICENSE.
*/

#include "vg_include.h"


/*------------------------------------------------------------*/
/*--- Renamings of frequently-used global functions.       ---*/
/*------------------------------------------------------------*/

#define uInstr1   VG_(newUInstr1)
#define uInstr2   VG_(newUInstr2)
#define uInstr3   VG_(newUInstr3)
#define dis       VG_(disassemble)
#define nameIReg  VG_(nameOfIntReg)
#define nameISize VG_(nameOfIntSize)
#define uLiteral  VG_(setLiteralField)
#define newTemp   VG_(getNewTemp)
#define newShadow VG_(getNewShadow)


/*------------------------------------------------------------*/
/*--- Memory management for the translator.                ---*/
/*------------------------------------------------------------*/

#define N_JITBLOCKS    4
#define N_JITBLOCK_SZ  5000

static UChar jitstorage[N_JITBLOCKS][N_JITBLOCK_SZ];
static Bool  jitstorage_inuse[N_JITBLOCKS];
static Bool  jitstorage_initdone = False;

static __inline__ void jitstorage_initialise ( void )
{
   Int i;
   if (jitstorage_initdone) return;
   jitstorage_initdone = True;
   for (i = 0; i < N_JITBLOCKS; i++)
      jitstorage_inuse[i] = False;
}

void* VG_(jitmalloc) ( Int nbytes )
{
   Int i;
   jitstorage_initialise();
   if (nbytes > N_JITBLOCK_SZ) {
      /* VG_(printf)("too large: %d\n", nbytes); */
      return VG_(malloc)(VG_AR_PRIVATE, nbytes);
   }
   for (i = 0; i < N_JITBLOCKS; i++) {
      if (!jitstorage_inuse[i]) {
         jitstorage_inuse[i] = True;
         /* VG_(printf)("alloc %d -> %d\n", nbytes, i ); */
         return & jitstorage[i][0];
      }
   }
   VG_(panic)("out of slots in vg_jitmalloc\n");
   return VG_(malloc)(VG_AR_PRIVATE, nbytes);
}

void VG_(jitfree) ( void* ptr )
{
   Int i;
   jitstorage_initialise();
   for (i = 0; i < N_JITBLOCKS; i++) {
      if (ptr == & jitstorage[i][0]) {
         vg_assert(jitstorage_inuse[i]);
         jitstorage_inuse[i] = False;
         return;
      }
   }
   VG_(free)(VG_AR_PRIVATE, ptr);
}
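
/* A hedged usage sketch, not part of the original file: pairing
   VG_(jitmalloc) with VG_(jitfree).  Requests of at most
   N_JITBLOCK_SZ bytes are served from the fixed pool of N_JITBLOCKS
   static blocks (and panic if the pool is full); larger requests
   fall back to VG_(malloc), and VG_(jitfree) routes such pointers
   back to VG_(free).  Sizes below are illustrative only. */
#if 0
{
   void* small = VG_(jitmalloc)( 100 );    /* served from the pool */
   void* big   = VG_(jitmalloc)( 9000 );   /* > N_JITBLOCK_SZ, so malloc'd */
   /* ... use the storage ... */
   VG_(jitfree)(small);   /* slot handed back to the pool */
   VG_(jitfree)(big);     /* not in the pool; forwarded to VG_(free) */
}
#endif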

/*------------------------------------------------------------*/
/*--- Basics                                               ---*/
/*------------------------------------------------------------*/

UCodeBlock* VG_(allocCodeBlock) ( void )
{
   UCodeBlock* cb = VG_(malloc)(VG_AR_PRIVATE, sizeof(UCodeBlock));
   cb->used = cb->size = cb->nextTemp = 0;
   cb->instrs = NULL;
   return cb;
}


void VG_(freeCodeBlock) ( UCodeBlock* cb )
{
   if (cb->instrs) VG_(free)(VG_AR_PRIVATE, cb->instrs);
   VG_(free)(VG_AR_PRIVATE, cb);
}


/* Ensure there's enough space in a block to add one uinstr. */
static __inline__
void ensureUInstr ( UCodeBlock* cb )
{
   if (cb->used == cb->size) {
      if (cb->instrs == NULL) {
         vg_assert(cb->size == 0);
         vg_assert(cb->used == 0);
         cb->size = 8;
         cb->instrs = VG_(malloc)(VG_AR_PRIVATE, 8 * sizeof(UInstr));
      } else {
         Int i;
         UInstr* instrs2 = VG_(malloc)(VG_AR_PRIVATE,
                                       2 * sizeof(UInstr) * cb->size);
         for (i = 0; i < cb->used; i++)
            instrs2[i] = cb->instrs[i];
         cb->size *= 2;
         VG_(free)(VG_AR_PRIVATE, cb->instrs);
         cb->instrs = instrs2;
      }
   }

   vg_assert(cb->used < cb->size);
}


__inline__
void VG_(emptyUInstr) ( UInstr* u )
{
   u->val1 = u->val2 = u->val3 = 0;
   u->tag1 = u->tag2 = u->tag3 = NoValue;
   u->flags_r = u->flags_w = FlagsEmpty;
   u->jmpkind = JmpBoring;
   u->smc_check = u->signed_widen = False;
   u->lit32   = 0;
   u->opcode  = 0;
   u->size    = 0;
   u->cond    = 0;
   u->extra4b = 0;
}


/* Add an instruction to a ucode block. */
__inline__
void VG_(newUInstr3) ( UCodeBlock* cb, Opcode opcode, Int sz,
                       Tag tag1, UInt val1,
                       Tag tag2, UInt val2,
                       Tag tag3, UInt val3 )
{
   UInstr* ui;
   ensureUInstr(cb);
   ui = & cb->instrs[cb->used];
   cb->used++;
   VG_(emptyUInstr)(ui);
   ui->val1   = val1;
   ui->val2   = val2;
   ui->val3   = val3;
   ui->opcode = opcode;
   ui->tag1   = tag1;
   ui->tag2   = tag2;
   ui->tag3   = tag3;
   ui->size   = sz;
   if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
   if (tag2 == TempReg) vg_assert(val2 != INVALID_TEMPREG);
   if (tag3 == TempReg) vg_assert(val3 != INVALID_TEMPREG);
}


__inline__
void VG_(newUInstr2) ( UCodeBlock* cb, Opcode opcode, Int sz,
                       Tag tag1, UInt val1,
                       Tag tag2, UInt val2 )
{
   UInstr* ui;
   ensureUInstr(cb);
   ui = & cb->instrs[cb->used];
   cb->used++;
   VG_(emptyUInstr)(ui);
   ui->val1   = val1;
   ui->val2   = val2;
   ui->opcode = opcode;
   ui->tag1   = tag1;
   ui->tag2   = tag2;
   ui->size   = sz;
   if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
   if (tag2 == TempReg) vg_assert(val2 != INVALID_TEMPREG);
}


__inline__
void VG_(newUInstr1) ( UCodeBlock* cb, Opcode opcode, Int sz,
                       Tag tag1, UInt val1 )
{
   UInstr* ui;
   ensureUInstr(cb);
   ui = & cb->instrs[cb->used];
   cb->used++;
   VG_(emptyUInstr)(ui);
   ui->val1   = val1;
   ui->opcode = opcode;
   ui->tag1   = tag1;
   ui->size   = sz;
   if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
}


__inline__
void VG_(newUInstr0) ( UCodeBlock* cb, Opcode opcode, Int sz )
{
   UInstr* ui;
   ensureUInstr(cb);
   ui = & cb->instrs[cb->used];
   cb->used++;
   VG_(emptyUInstr)(ui);
   ui->opcode = opcode;
   ui->size   = sz;
}

/* Copy an instruction into the given codeblock. */
__inline__
void VG_(copyUInstr) ( UCodeBlock* cb, UInstr* instr )
{
   ensureUInstr(cb);
   cb->instrs[cb->used] = *instr;
   cb->used++;
}

/* Copy auxiliary info from one uinstr to another. */
static __inline__
void copyAuxInfoFromTo ( UInstr* src, UInstr* dst )
{
   dst->cond         = src->cond;
   dst->extra4b      = src->extra4b;
   dst->smc_check    = src->smc_check;
   dst->signed_widen = src->signed_widen;
   dst->jmpkind      = src->jmpkind;
   dst->flags_r      = src->flags_r;
   dst->flags_w      = src->flags_w;
}
/* Set the flag R/W sets on a uinstr. */
|
|
void VG_(setFlagRW) ( UInstr* u, FlagSet fr, FlagSet fw )
|
|
{
|
|
/* VG_(ppUInstr)(-1,u); */
|
|
vg_assert(fr == (fr & FlagsALL));
|
|
vg_assert(fw == (fw & FlagsALL));
|
|
u->flags_r = fr;
|
|
u->flags_w = fw;
|
|
}
|
|
|
|
|
|
/* Set the lit32 field of the most recent uinsn. */
|
|
void VG_(setLiteralField) ( UCodeBlock* cb, UInt lit32 )
|
|
{
|
|
LAST_UINSTR(cb).lit32 = lit32;
|
|
}
|
|
|
|
|
|
Bool VG_(anyFlagUse) ( UInstr* u )
|
|
{
|
|
return (u->flags_r != FlagsEmpty
|
|
|| u->flags_w != FlagsEmpty);
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Convert a rank in the range 0 .. VG_MAX_REALREGS-1 into an Intel
|
|
register number. This effectively defines the order in which real
|
|
registers are allocated. %ebp is excluded since it is permanently
|
|
reserved for pointing at VG_(baseBlock). %edi is a general spare
|
|
temp used for Left4 and various misc tag ops.
|
|
|
|
Important! If you change the set of allocatable registers from
|
|
%eax, %ebx, %ecx, %edx, %esi you must change the
|
|
save/restore sequences in vg_helper_smc_check4 to match!
|
|
*/
|
|
__inline__ Int VG_(rankToRealRegNo) ( Int rank )
|
|
{
|
|
switch (rank) {
|
|
# if 1
|
|
/* Probably the best allocation ordering. */
|
|
case 0: return R_EAX;
|
|
case 1: return R_EBX;
|
|
case 2: return R_ECX;
|
|
case 3: return R_EDX;
|
|
case 4: return R_ESI;
|
|
# else
|
|
/* Contrary; probably the worst. Helpful for debugging, tho. */
|
|
case 4: return R_EAX;
|
|
case 3: return R_EBX;
|
|
case 2: return R_ECX;
|
|
case 1: return R_EDX;
|
|
case 0: return R_ESI;
|
|
# endif
|
|
default: VG_(panic)("rankToRealRegNo");
|
|
}
|
|
}
|
|
|
|
|
|
/*------------------------------------------------------------*/
|
|
/*--- Sanity checking uinstrs. ---*/
|
|
/*------------------------------------------------------------*/
|
|
|
|
/* This seems as good a place as any to record some important stuff
|
|
about ucode semantics.
|
|
|
|
* TempRegs are 32 bits wide. LOADs of 8/16 bit values into a
|
|
TempReg are defined to zero-extend the loaded value to 32 bits.
|
|
This is needed to make the translation of movzbl et al work
|
|
properly.
|
|
|
|
* Similarly, GETs of a 8/16 bit ArchRegs are zero-extended.
|
|
|
|
* Arithmetic on TempRegs is at the specified size. For example,
|
|
SUBW t1, t2 has to result in a real 16 bit x86 subtraction
|
|
being emitted -- not a 32 bit one.
|
|
|
|
* On some insns we allow the cc bit to be set. If so, the
|
|
intention is that the simulated machine's %eflags register
|
|
is copied into that of the real machine before the insn,
|
|
and copied back again afterwards. This means that the
|
|
code generated for that insn must be very careful only to
|
|
update %eflags in the intended way. This is particularly
|
|
important for the routines referenced by CALL insns.
|
|
*/
|
|
|
|
/* Meaning of operand kinds is as follows:
|
|
|
|
ArchReg is a register of the simulated CPU, stored in memory,
|
|
in vg_m_state.m_eax .. m_edi. These values are stored
|
|
using the Intel register encoding.
|
|
|
|
RealReg is a register of the real CPU. There are VG_MAX_REALREGS
|
|
available for allocation. As with ArchRegs, these values
|
|
are stored using the Intel register encoding.
|
|
|
|
TempReg is a temporary register used to express the results of
|
|
disassembly. There is an unlimited supply of them --
|
|
register allocation and spilling eventually assigns them
|
|
to RealRegs.
|
|
|
|
SpillNo is a spill slot number. The number of required spill
|
|
slots is VG_MAX_PSEUDOS, in general. Only allowed
|
|
as the ArchReg operand of GET and PUT.
|
|
|
|
Lit16 is a signed 16-bit literal value.
|
|
|
|
Literal is a 32-bit literal value. Each uinstr can only hold
|
|
one of these.
|
|
|
|
The disassembled code is expressed purely in terms of ArchReg,
|
|
TempReg and Literal operands. Eventually, register allocation
|
|
removes all the TempRegs, giving a result using ArchRegs, RealRegs,
|
|
and Literals. New x86 code can easily be synthesised from this.
|
|
There are carefully designed restrictions on which insns can have
|
|
which operands, intended to make it possible to generate x86 code
|
|
from the result of register allocation on the ucode efficiently and
|
|
without need of any further RealRegs.
|
|
|
|
Restrictions on insns (as generated by the disassembler) are as
|
|
follows:
|
|
|
|
A=ArchReg S=SpillNo T=TempReg L=Literal R=RealReg
|
|
N=NoValue
|
|
|
|
GETF T N N
|
|
PUTF T N N
|
|
|
|
GET A,S T N
|
|
PUT T A,S N
|
|
LOAD T T N
|
|
STORE T T N
|
|
MOV T,L T N
|
|
CMOV T T N
|
|
WIDEN T N N
|
|
JMP T,L N N
|
|
CALLM L N N
|
|
CALLM_S N N N
|
|
CALLM_E N N N
|
|
PUSH,POP T N N
|
|
CLEAR L N N
|
|
|
|
AND, OR
|
|
T T N
|
|
|
|
ADD, ADC, XOR, SUB, SBB
|
|
A,L,T T N
|
|
|
|
SHL, SHR, SAR, ROL, ROR, RCL, RCR
|
|
L,T T N
|
|
|
|
NOT, NEG, INC, DEC, CC2VAL, BSWAP
|
|
T N N
|
|
|
|
JIFZ T L N
|
|
|
|
FPU_R L T N
|
|
FPU_W L T N
|
|
FPU L T N
|
|
|
|
LEA1 T T (const in a seperate field)
|
|
LEA2 T T T (const & shift ditto)
|
|
|
|
INCEIP L N N
|
|
|
|
and for instrumentation insns:
|
|
|
|
LOADV T T N
|
|
STOREV T,L T N
|
|
GETV A T N
|
|
PUTV T,L A N
|
|
GETVF T N N
|
|
PUTVF T N N
|
|
WIDENV T N N
|
|
TESTV A,T N N
|
|
SETV A,T N N
|
|
TAG1 T N N
|
|
TAG2 T T N
|
|
|
|
Before register allocation, S operands should not appear anywhere.
|
|
After register allocation, all T operands should have been
|
|
converted into Rs, and S operands are allowed in GET and PUT --
|
|
denoting spill saves/restores.
|
|
|
|
The size field should be 0 for insns for which it is meaningless,
|
|
ie those which do not directly move/operate on data.
|
|
*/
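
/* A hedged, illustrative sketch, not part of the original file:
   building a tiny ucode sequence which obeys the restrictions above,
   for the x86 insn `movl $0x2A, %eax'.  MOV takes a T-or-L first
   operand, with the 32-bit literal parked in the lit32 field via
   uLiteral, and PUT writes a TempReg back to an ArchReg.  The real
   disassembler (in vg_to_ucode.c) may well emit something
   different. */
#if 0
{
   UCodeBlock* cb = VG_(allocCodeBlock)();
   Int t = newTemp(cb);
   uInstr2(cb, MOV, 4, Literal, 0, TempReg, t);
   uLiteral(cb, 0x2A);                               /* t    := 0x2A */
   uInstr2(cb, PUT, 4, TempReg, t, ArchReg, R_EAX);  /* %eax := t    */
   vg_assert(VG_(saneUInstr)(True, &LAST_UINSTR(cb)));
   VG_(freeCodeBlock)(cb);
}
#endif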

Bool VG_(saneUInstr) ( Bool beforeRA, UInstr* u )
{
#  define TR1 (beforeRA ? (u->tag1 == TempReg) : (u->tag1 == RealReg))
#  define TR2 (beforeRA ? (u->tag2 == TempReg) : (u->tag2 == RealReg))
#  define TR3 (beforeRA ? (u->tag3 == TempReg) : (u->tag3 == RealReg))
#  define A1  (u->tag1 == ArchReg)
#  define A2  (u->tag2 == ArchReg)
#  define AS1 ((u->tag1 == ArchReg) || ((!beforeRA && (u->tag1 == SpillNo))))
#  define AS2 ((u->tag2 == ArchReg) || ((!beforeRA && (u->tag2 == SpillNo))))
#  define AS3 ((u->tag3 == ArchReg) || ((!beforeRA && (u->tag3 == SpillNo))))
#  define L1  (u->tag1 == Literal && u->val1 == 0)
#  define L2  (u->tag2 == Literal && u->val2 == 0)
#  define Ls1 (u->tag1 == Lit16)
#  define Ls3 (u->tag3 == Lit16)
#  define N1  (u->tag1 == NoValue)
#  define N2  (u->tag2 == NoValue)
#  define N3  (u->tag3 == NoValue)
#  define SZ4 (u->size == 4)
#  define SZ2 (u->size == 2)
#  define SZ1 (u->size == 1)
#  define SZ0 (u->size == 0)
#  define CC0 (u->flags_r == FlagsEmpty && u->flags_w == FlagsEmpty)
#  define FLG_RD (u->flags_r == FlagsALL && u->flags_w == FlagsEmpty)
#  define FLG_WR (u->flags_r == FlagsEmpty && u->flags_w == FlagsALL)
#  define FLG_RD_WR_MAYBE                                        \
      ((u->flags_r == FlagsEmpty && u->flags_w == FlagsEmpty)    \
       || (u->flags_r == FlagsEmpty && u->flags_w == FlagsZCP)   \
       || (u->flags_r == FlagsZCP && u->flags_w == FlagsEmpty))
#  define CC1 (!(CC0))
#  define SZ4_IF_TR1 ((u->tag1 == TempReg || u->tag1 == RealReg) \
                      ? (u->size == 4) : True)

   Int n_lits = 0;
   if (u->tag1 == Literal) n_lits++;
   if (u->tag2 == Literal) n_lits++;
   if (u->tag3 == Literal) n_lits++;
   if (n_lits > 1)
      return False;

   switch (u->opcode) {
      case GETF:
         return (SZ2 || SZ4) && TR1 && N2 && N3 && FLG_RD;
      case PUTF:
         return (SZ2 || SZ4) && TR1 && N2 && N3 && FLG_WR;
      case CALLM_S: case CALLM_E:
         return SZ0 && N1 && N2 && N3;
      case INCEIP:
         return SZ0 && CC0 && Ls1 && N2 && N3;
      case LEA1:
         return CC0 && TR1 && TR2 && N3 && SZ4;
      case LEA2:
         return CC0 && TR1 && TR2 && TR3 && SZ4;
      case NOP:
         return SZ0 && CC0 && N1 && N2 && N3;
      case GET:
         return CC0 && AS1 && TR2 && N3;
      case PUT:
         return CC0 && TR1 && AS2 && N3;
      case LOAD: case STORE:
         return CC0 && TR1 && TR2 && N3;
      case MOV:
         return CC0 && (TR1 || L1) && TR2 && N3 && SZ4_IF_TR1;
      case CMOV:
         return CC1 && TR1 && TR2 && N3 && SZ4;
      case JMP:
         return (u->cond==CondAlways ? CC0 : CC1)
                && (TR1 || L1) && N2 && SZ0 && N3;
      case CLEAR:
         return CC0 && Ls1 && N2 && SZ0 && N3;
      case CALLM:
         return SZ0 && Ls1 && N2 && N3;
      case PUSH: case POP:
         return CC0 && TR1 && N2 && N3;
      case AND: case OR:
         return TR1 && TR2 && N3;
      case ADD: case ADC: case XOR: case SUB: case SBB:
         return (A1 || TR1 || L1) && TR2 && N3;
      case SHL: case SHR: case SAR: case ROL: case ROR: case RCL: case RCR:
         return (TR1 || L1) && TR2 && N3;
      case NOT: case NEG: case INC: case DEC:
         return TR1 && N2 && N3;
      case BSWAP:
         return TR1 && N2 && N3 && CC0 && SZ4;
      case CC2VAL:
         return CC1 && SZ1 && TR1 && N2 && N3;
      case JIFZ:
         return CC0 && SZ4 && TR1 && L2 && N3;
      case FPU_R: case FPU_W:
         return CC0 && Ls1 && TR2 && N3;
      case FPU:
         return SZ0 && FLG_RD_WR_MAYBE && Ls1 && N2 && N3;
      case LOADV:
         return CC0 && TR1 && TR2 && N3;
      case STOREV:
         return CC0 && (TR1 || L1) && TR2 && N3;
      case GETV:
         return CC0 && A1 && TR2 && N3;
      case PUTV:
         return CC0 && (TR1 || L1) && A2 && N3;
      case GETVF:
         return CC0 && TR1 && N2 && N3 && SZ0;
      case PUTVF:
         return CC0 && TR1 && N2 && N3 && SZ0;
      case WIDEN:
         return CC0 && TR1 && N2 && N3;
      case TESTV:
         return CC0 && (A1 || TR1) && N2 && N3;
      case SETV:
         return CC0 && (A1 || TR1) && N2 && N3;
      case TAG1:
         return CC0 && TR1 && N2 && Ls3 && SZ0;
      case TAG2:
         return CC0 && TR1 && TR2 && Ls3 && SZ0;
      default:
         VG_(panic)("vg_saneUInstr: unhandled opcode");
   }
#  undef SZ4_IF_TR1
#  undef CC0
#  undef CC1
#  undef SZ4
#  undef SZ2
#  undef SZ1
#  undef SZ0
#  undef TR1
#  undef TR2
#  undef TR3
#  undef A1
#  undef A2
#  undef AS1
#  undef AS2
#  undef AS3
#  undef L1
#  undef Ls1
#  undef L2
#  undef Ls3
#  undef N1
#  undef N2
#  undef N3
#  undef FLG_RD
#  undef FLG_WR
#  undef FLG_RD_WR_MAYBE
}


/* Sanity checks to do with CALLMs in UCodeBlocks. */
Bool VG_(saneUCodeBlock) ( UCodeBlock* cb )
{
   Int  callm    = 0;
   Int  callm_s  = 0;
   Int  callm_e  = 0;
   Int  callm_ptr, calls_ptr;
   Int  i, j, t;
   Bool incall = False;

   /* Ensure the number of CALLM, CALLM_S and CALLM_E are the same. */

   for (i = 0; i < cb->used; i++) {
      switch (cb->instrs[i].opcode) {
         case CALLM:
            if (!incall) return False;
            callm++;
            break;
         case CALLM_S:
            if (incall) return False;
            incall = True;
            callm_s++;
            break;
         case CALLM_E:
            if (!incall) return False;
            incall = False;
            callm_e++;
            break;
         case PUSH: case POP: case CLEAR:
            if (!incall) return False;
            break;
         default:
            break;
      }
   }
   if (incall) return False;
   if (callm != callm_s || callm != callm_e) return False;

   /* Check the sections between CALLM_S and CALLM's.  Ensure that no
      PUSH uinsn pushes any TempReg that any other PUSH in the same
      section pushes.  Ie, check that the TempReg args to PUSHes in
      the section are unique.  If not, the instrumenter generates
      incorrect code for CALLM insns. */

   callm_ptr = 0;

  find_next_CALLM:
   /* Search for the next interval, making calls_ptr .. callm_ptr
      bracket it. */
   while (callm_ptr < cb->used
          && cb->instrs[callm_ptr].opcode != CALLM)
      callm_ptr++;
   if (callm_ptr == cb->used)
      return True;
   vg_assert(cb->instrs[callm_ptr].opcode == CALLM);

   calls_ptr = callm_ptr - 1;
   while (cb->instrs[calls_ptr].opcode != CALLM_S)
      calls_ptr--;
   vg_assert(cb->instrs[calls_ptr].opcode == CALLM_S);
   vg_assert(calls_ptr >= 0);

   /* VG_(printf)("interval from %d to %d\n", calls_ptr, callm_ptr ); */

   /* For each PUSH insn in the interval ... */
   for (i = calls_ptr + 1; i < callm_ptr; i++) {
      if (cb->instrs[i].opcode != PUSH) continue;
      t = cb->instrs[i].val1;
      /* Ensure no later PUSH insns up to callm_ptr push the same
         TempReg.  Return False if any such are found. */
      for (j = i+1; j < callm_ptr; j++) {
         if (cb->instrs[j].opcode == PUSH &&
             cb->instrs[j].val1 == t)
            return False;
      }
   }

   /* This interval is clean.  Keep going ... */
   callm_ptr++;
   goto find_next_CALLM;
}
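
/* Hedged illustration, not part of the original file: the shape of a
   CALLM section which satisfies the checks above -- one CALLM_S ..
   CALLM_E bracket, with every PUSH inside it naming a distinct
   TempReg:

      CALLM_S
      PUSH  t0
      PUSH  t2         (pushing t0 again here would fail the check)
      CALLM $helper
      POP   t4
      CLEAR $8
      CALLM_E
*/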


/*------------------------------------------------------------*/
/*--- Printing uinstrs.                                    ---*/
/*------------------------------------------------------------*/

Char* VG_(nameCondcode) ( Condcode cond )
{
   switch (cond) {
      case CondO:      return "o";
      case CondNO:     return "no";
      case CondB:      return "b";
      case CondNB:     return "nb";
      case CondZ:      return "z";
      case CondNZ:     return "nz";
      case CondBE:     return "be";
      case CondNBE:    return "nbe";
      case CondS:      return "s";
      case CondNS:     return "ns";
      case CondP:      return "p";
      case CondNP:     return "np";
      case CondL:      return "l";
      case CondNL:     return "nl";
      case CondLE:     return "le";
      case CondNLE:    return "nle";
      case CondAlways: return "MP"; /* hack! */
      default: VG_(panic)("nameCondcode");
   }
}


static void vg_ppFlagSet ( Char* prefix, FlagSet set )
{
   VG_(printf)("%s", prefix);
   if (set & FlagD) VG_(printf)("D");
   if (set & FlagO) VG_(printf)("O");
   if (set & FlagS) VG_(printf)("S");
   if (set & FlagZ) VG_(printf)("Z");
   if (set & FlagA) VG_(printf)("A");
   if (set & FlagC) VG_(printf)("C");
   if (set & FlagP) VG_(printf)("P");
}


static void ppTempReg ( Int tt )
{
   if ((tt & 1) == 0)
      VG_(printf)("t%d", tt);
   else
      VG_(printf)("q%d", tt-1);
}
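
/* A hedged gloss, not part of the original file: by the convention
   above, even-numbered temps print as value temps and odd-numbered
   ones as shadows, e.g. temp 6 prints as "t6" while temp 7 prints as
   "q6" -- presumably the shadow (tag) register paired with t6. */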


static void ppUOperand ( UInstr* u, Int operandNo, Int sz, Bool parens )
{
   UInt tag, val;
   switch (operandNo) {
      case 1: tag = u->tag1; val = u->val1; break;
      case 2: tag = u->tag2; val = u->val2; break;
      case 3: tag = u->tag3; val = u->val3; break;
      default: VG_(panic)("ppUOperand(1)");
   }
   if (tag == Literal) val = u->lit32;

   if (parens) VG_(printf)("(");
   switch (tag) {
      case TempReg: ppTempReg(val); break;
      case RealReg: VG_(printf)("%s",nameIReg(sz==0 ? 4 : sz,val)); break;
      case Literal: VG_(printf)("$0x%x", val); break;
      case Lit16:   VG_(printf)("$0x%x", val); break;
      case NoValue: VG_(printf)("NoValue"); break;
      case ArchReg: VG_(printf)("%S",nameIReg(sz,val)); break;
      case SpillNo: VG_(printf)("spill%d", val); break;
      default: VG_(panic)("ppUOperand(2)");
   }
   if (parens) VG_(printf)(")");
}


Char* VG_(nameUOpcode) ( Bool upper, Opcode opc )
{
   switch (opc) {
      case ADD:   return (upper ? "ADD"   : "add");
      case ADC:   return (upper ? "ADC"   : "adc");
      case AND:   return (upper ? "AND"   : "and");
      case OR:    return (upper ? "OR"    : "or");
      case XOR:   return (upper ? "XOR"   : "xor");
      case SUB:   return (upper ? "SUB"   : "sub");
      case SBB:   return (upper ? "SBB"   : "sbb");
      case SHL:   return (upper ? "SHL"   : "shl");
      case SHR:   return (upper ? "SHR"   : "shr");
      case SAR:   return (upper ? "SAR"   : "sar");
      case ROL:   return (upper ? "ROL"   : "rol");
      case ROR:   return (upper ? "ROR"   : "ror");
      case RCL:   return (upper ? "RCL"   : "rcl");
      case RCR:   return (upper ? "RCR"   : "rcr");
      case NOT:   return (upper ? "NOT"   : "not");
      case NEG:   return (upper ? "NEG"   : "neg");
      case INC:   return (upper ? "INC"   : "inc");
      case DEC:   return (upper ? "DEC"   : "dec");
      case BSWAP: return (upper ? "BSWAP" : "bswap");
      default: break;
   }
   if (!upper) VG_(panic)("vg_nameUOpcode: invalid !upper");
   switch (opc) {
      case GETVF:   return "GETVF";
      case PUTVF:   return "PUTVF";
      case TAG1:    return "TAG1";
      case TAG2:    return "TAG2";
      case CALLM_S: return "CALLM_S";
      case CALLM_E: return "CALLM_E";
      case INCEIP:  return "INCEIP";
      case LEA1:    return "LEA1";
      case LEA2:    return "LEA2";
      case NOP:     return "NOP";
      case GET:     return "GET";
      case PUT:     return "PUT";
      case GETF:    return "GETF";
      case PUTF:    return "PUTF";
      case LOAD:    return "LD" ;
      case STORE:   return "ST" ;
      case MOV:     return "MOV";
      case CMOV:    return "CMOV";
      case WIDEN:   return "WIDEN";
      case JMP:     return "J" ;
      case JIFZ:    return "JIFZ" ;
      case CALLM:   return "CALLM";
      case PUSH:    return "PUSH" ;
      case POP:     return "POP" ;
      case CLEAR:   return "CLEAR";
      case CC2VAL:  return "CC2VAL";
      case FPU_R:   return "FPU_R";
      case FPU_W:   return "FPU_W";
      case FPU:     return "FPU" ;
      case LOADV:   return "LOADV";
      case STOREV:  return "STOREV";
      case GETV:    return "GETV";
      case PUTV:    return "PUTV";
      case TESTV:   return "TESTV";
      case SETV:    return "SETV";
      default: VG_(panic)("nameUOpcode: unhandled case");
   }
}


void VG_(ppUInstr) ( Int instrNo, UInstr* u )
{
   VG_(printf)("\t%4d: %s", instrNo,
               VG_(nameUOpcode)(True, u->opcode));
   if (u->opcode == JMP || u->opcode == CC2VAL)
      VG_(printf)("%s", VG_(nameCondcode(u->cond)));

   switch (u->size) {
      case 0:  VG_(printf)("o"); break;
      case 1:  VG_(printf)("B"); break;
      case 2:  VG_(printf)("W"); break;
      case 4:  VG_(printf)("L"); break;
      case 8:  VG_(printf)("Q"); break;
      default: VG_(printf)("%d", (Int)u->size); break;
   }

   switch (u->opcode) {

      case TAG1:
         VG_(printf)("\t");
         ppUOperand(u, 1, 4, False);
         VG_(printf)(" = %s ( ", VG_(nameOfTagOp)( u->val3 ));
         ppUOperand(u, 1, 4, False);
         VG_(printf)(" )");
         break;

      case TAG2:
         VG_(printf)("\t");
         ppUOperand(u, 2, 4, False);
         VG_(printf)(" = %s ( ", VG_(nameOfTagOp)( u->val3 ));
         ppUOperand(u, 1, 4, False);
         VG_(printf)(", ");
         ppUOperand(u, 2, 4, False);
         VG_(printf)(" )");
         break;

      case CALLM_S: case CALLM_E:
         break;

      case INCEIP:
         VG_(printf)("\t$%d", u->val1);
         break;

      case LEA2:
         VG_(printf)("\t%d(" , u->lit32);
         ppUOperand(u, 1, 4, False);
         VG_(printf)(",");
         ppUOperand(u, 2, 4, False);
         VG_(printf)(",%d), ", (Int)u->extra4b);
         ppUOperand(u, 3, 4, False);
         break;

      case LEA1:
         VG_(printf)("\t%d" , u->lit32);
         ppUOperand(u, 1, 4, True);
         VG_(printf)(", ");
         ppUOperand(u, 2, 4, False);
         break;

      case NOP:
         break;

      case FPU_W:
         VG_(printf)("\t0x%x:0x%x, ",
                     (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
         ppUOperand(u, 2, 4, True);
         break;

      case FPU_R:
         VG_(printf)("\t");
         ppUOperand(u, 2, 4, True);
         VG_(printf)(", 0x%x:0x%x",
                     (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
         break;

      case FPU:
         VG_(printf)("\t0x%x:0x%x",
                     (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
         break;

      case STOREV: case LOADV:
      case GET: case PUT: case MOV: case LOAD: case STORE: case CMOV:
         VG_(printf)("\t");
         ppUOperand(u, 1, u->size, u->opcode==LOAD || u->opcode==LOADV);
         VG_(printf)(", ");
         ppUOperand(u, 2, u->size, u->opcode==STORE || u->opcode==STOREV);
         break;

      case GETF: case PUTF:
         VG_(printf)("\t");
         ppUOperand(u, 1, u->size, False);
         break;

      case JMP: case CC2VAL:
      case PUSH: case POP: case CLEAR: case CALLM:
         if (u->opcode == JMP) {
            switch (u->jmpkind) {
               case JmpCall:      VG_(printf)("-c"); break;
               case JmpRet:       VG_(printf)("-r"); break;
               case JmpSyscall:   VG_(printf)("-sys"); break;
               case JmpClientReq: VG_(printf)("-cli"); break;
               default: break;
            }
         }
         VG_(printf)("\t");
         ppUOperand(u, 1, u->size, False);
         break;

      case JIFZ:
         VG_(printf)("\t");
         ppUOperand(u, 1, u->size, False);
         VG_(printf)(", ");
         ppUOperand(u, 2, u->size, False);
         break;

      case PUTVF: case GETVF:
         VG_(printf)("\t");
         ppUOperand(u, 1, 0, False);
         break;

      case NOT: case NEG: case INC: case DEC: case BSWAP:
         VG_(printf)("\t");
         ppUOperand(u, 1, u->size, False);
         break;

      case ADD: case ADC: case AND: case OR:
      case XOR: case SUB: case SBB:
      case SHL: case SHR: case SAR:
      case ROL: case ROR: case RCL: case RCR:
         VG_(printf)("\t");
         ppUOperand(u, 1, u->size, False);
         VG_(printf)(", ");
         ppUOperand(u, 2, u->size, False);
         break;

      case GETV: case PUTV:
         VG_(printf)("\t");
         ppUOperand(u, 1, u->opcode==PUTV ? 4 : u->size, False);
         VG_(printf)(", ");
         ppUOperand(u, 2, u->opcode==GETV ? 4 : u->size, False);
         break;

      case WIDEN:
         VG_(printf)("_%c%c", VG_(toupper)(nameISize(u->extra4b)),
                              u->signed_widen?'s':'z');
         VG_(printf)("\t");
         ppUOperand(u, 1, u->size, False);
         break;

      case TESTV: case SETV:
         VG_(printf)("\t");
         ppUOperand(u, 1, u->size, False);
         break;

      default: VG_(panic)("ppUInstr: unhandled opcode");
   }

   if (u->flags_r != FlagsEmpty || u->flags_w != FlagsEmpty) {
      VG_(printf)(" (");
      if (u->flags_r != FlagsEmpty)
         vg_ppFlagSet("-r", u->flags_r);
      if (u->flags_w != FlagsEmpty)
         vg_ppFlagSet("-w", u->flags_w);
      VG_(printf)(")");
   }
   VG_(printf)("\n");
}


void VG_(ppUCodeBlock) ( UCodeBlock* cb, Char* title )
{
   Int i;
   VG_(printf)("\n%s\n", title);
   for (i = 0; i < cb->used; i++)
      if (0 || cb->instrs[i].opcode != NOP)
         VG_(ppUInstr) ( i, &cb->instrs[i] );
   VG_(printf)("\n");
}


/*------------------------------------------------------------*/
/*--- uinstr helpers for register allocation               ---*/
/*--- and code improvement.                                ---*/
/*------------------------------------------------------------*/

/* A structure for communicating temp uses, and for indicating
   temp->real register mappings for patchUInstr. */
typedef
   struct {
      Int   realNo;
      Int   tempNo;
      Bool  isWrite;
   }
   TempUse;


/* Get the temp uses of a uinstr, parking them in an array supplied by
   the caller, which is assumed to be big enough.  Return the number
   of entries.  Insns which read _and_ write a register wind up
   mentioning it twice.  Entries are placed in the array in program
   order, so that if a reg is read-modified-written, it appears first
   as a read and then as a write.
*/
static __inline__
Int getTempUsage ( UInstr* u, TempUse* arr )
{

#  define RD(ono)                                  \
      if (mycat(u->tag,ono) == TempReg)            \
         { arr[n].tempNo  = mycat(u->val,ono);     \
           arr[n].isWrite = False; n++; }
#  define WR(ono)                                  \
      if (mycat(u->tag,ono) == TempReg)            \
         { arr[n].tempNo  = mycat(u->val,ono);     \
           arr[n].isWrite = True; n++; }

   Int n = 0;
   switch (u->opcode) {
      case LEA1: RD(1); WR(2); break;
      case LEA2: RD(1); RD(2); WR(3); break;

      case NOP: case FPU: case INCEIP: case CALLM_S: case CALLM_E: break;
      case FPU_R: case FPU_W: RD(2); break;

      case GETF:  WR(1); break;
      case PUTF:  RD(1); break;

      case GET:   WR(2); break;
      case PUT:   RD(1); break;
      case LOAD:  RD(1); WR(2); break;
      case STORE: RD(1); RD(2); break;
      case MOV:   RD(1); WR(2); break;

      case JMP:   RD(1); break;
      case CLEAR: case CALLM: break;

      case PUSH: RD(1); break;
      case POP:  WR(1); break;

      case TAG2:
      case CMOV:
      case ADD: case ADC: case AND: case OR:
      case XOR: case SUB: case SBB:
         RD(1); RD(2); WR(2); break;

      case SHL: case SHR: case SAR:
      case ROL: case ROR: case RCL: case RCR:
         RD(1); RD(2); WR(2); break;

      case NOT: case NEG: case INC: case DEC: case TAG1: case BSWAP:
         RD(1); WR(1); break;

      case WIDEN: RD(1); WR(1); break;

      case CC2VAL: WR(1); break;
      case JIFZ: RD(1); break;

      /* These sizes are only ever consulted when the instrumentation
         code is being added, so the following can return
         manifestly-bogus sizes. */
      case LOADV:  RD(1); WR(2); break;
      case STOREV: RD(1); RD(2); break;
      case GETV:   WR(2); break;
      case PUTV:   RD(1); break;
      case TESTV:  RD(1); break;
      case SETV:   WR(1); break;
      case PUTVF:  RD(1); break;
      case GETVF:  WR(1); break;

      default: VG_(panic)("getTempUsage: unhandled opcode");
   }
   return n;

#  undef RD
#  undef WR
}
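
/* A hedged worked example, not part of the original file: for the
   uinstr `ADD t2, t4' (the ADD case above does RD(1); RD(2); WR(2)),
   getTempUsage fills the array with three entries in program order:

      { tempNo=2, isWrite=False }   read of the source
      { tempNo=4, isWrite=False }   read of the dest (read-modify-write)
      { tempNo=4, isWrite=True  }   write of the dest

   and returns 3.  The realNo fields are left unset here; they are
   only filled in later, just before patchUInstr is applied. */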


/* Change temp regs in u into real regs, as directed by tmap. */
static __inline__
void patchUInstr ( UInstr* u, TempUse* tmap, Int n_tmap )
{
   Int i;
   if (u->tag1 == TempReg) {
      for (i = 0; i < n_tmap; i++)
         if (tmap[i].tempNo == u->val1) break;
      if (i == n_tmap) VG_(panic)("patchUInstr(1)");
      u->tag1 = RealReg;
      u->val1 = tmap[i].realNo;
   }
   if (u->tag2 == TempReg) {
      for (i = 0; i < n_tmap; i++)
         if (tmap[i].tempNo == u->val2) break;
      if (i == n_tmap) VG_(panic)("patchUInstr(2)");
      u->tag2 = RealReg;
      u->val2 = tmap[i].realNo;
   }
   if (u->tag3 == TempReg) {
      for (i = 0; i < n_tmap; i++)
         if (tmap[i].tempNo == u->val3) break;
      if (i == n_tmap) VG_(panic)("patchUInstr(3)");
      u->tag3 = RealReg;
      u->val3 = tmap[i].realNo;
   }
}


/* Tedious x86-specific hack which compensates for the fact that the
   register numbers for %ah .. %dh do not correspond to those for %eax
   .. %edx.  It maps a (reg size, reg no) pair to the number of the
   containing 32-bit reg. */
static __inline__
Int containingArchRegOf ( Int sz, Int aregno )
{
   switch (sz) {
      case 4: return aregno;
      case 2: return aregno;
      case 1: return aregno >= 4 ? aregno-4 : aregno;
      default: VG_(panic)("containingArchRegOf");
   }
}
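
/* Hedged example, not part of the original file: in the Intel 1-byte
   register encoding, numbers 0..7 denote %al,%cl,%dl,%bl,%ah,%ch,
   %dh,%bh.  Hence containingArchRegOf(1, 4), ie %ah, yields 0, the
   encoding of %eax, whereas for sizes 2 and 4 the number passes
   through unchanged. */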


/* If u reads an ArchReg, return the number of the containing arch
   reg.  Otherwise return -1.  Used in redundant-PUT elimination. */
static __inline__
Int maybe_uinstrReadsArchReg ( UInstr* u )
{
   switch (u->opcode) {
      case GET:
      case ADD: case ADC: case AND: case OR:
      case XOR: case SUB: case SBB:
      case SHL: case SHR: case SAR: case ROL:
      case ROR: case RCL: case RCR:
         if (u->tag1 == ArchReg)
            return containingArchRegOf ( u->size, u->val1 );
         else
            return -1;

      case GETF: case PUTF:
      case CALLM_S: case CALLM_E:
      case INCEIP:
      case LEA1:
      case LEA2:
      case NOP:
      case PUT:
      case LOAD:
      case STORE:
      case MOV:
      case CMOV:
      case JMP:
      case CALLM: case CLEAR: case PUSH: case POP:
      case NOT: case NEG: case INC: case DEC: case BSWAP:
      case CC2VAL:
      case JIFZ:
      case FPU: case FPU_R: case FPU_W:
      case WIDEN:
         return -1;

      default:
         VG_(ppUInstr)(0,u);
         VG_(panic)("maybe_uinstrReadsArchReg: unhandled opcode");
   }
}

static __inline__
Bool uInstrMentionsTempReg ( UInstr* u, Int tempreg )
{
   Int i, k;
   TempUse tempUse[3];
   k = getTempUsage ( u, &tempUse[0] );
   for (i = 0; i < k; i++)
      if (tempUse[i].tempNo == tempreg)
         return True;
   return False;
}


/*------------------------------------------------------------*/
/*--- ucode improvement.                                   ---*/
/*------------------------------------------------------------*/

/* Improve the code in cb by doing
   -- Redundant ArchReg-fetch elimination
   -- Redundant PUT elimination
   -- Redundant cond-code restore/save elimination
   The overall effect of these is to allow target registers to be
   cached in host registers over multiple target insns.
*/
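
/* A hedged before/after sketch, not part of the original file, of
   the first improvement, redundant GET elimination.  If %eax is
   fetched twice with no intervening write-back or clobber of the
   first temp, the second fetch can go, with the second temp renamed
   to the first:

      GETL  %eax, t0                 GETL  %eax, t0
      STL   t0, (t2)         ==>     STL   t0, (t2)
      GETL  %eax, t4                 NOP
      ADDL  t4, t6                   ADDL  t0, t6

   A PUT of %eax, a sub-word GET of it, or an insn writing t0 in
   between would invalidate the binding and inhibit the deletion. */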
static void vg_improve ( UCodeBlock* cb )
{
   Int     i, j, k, m, n, ar, tr, told, actual_areg;
   Int     areg_map[8];
   Bool    annul_put[8];
   TempUse tempUse[3];
   UInstr* u;
   Bool    wr;
   Int*    last_live_before;
   FlagSet future_dead_flags;

   if (cb->nextTemp > 0)
      last_live_before = VG_(jitmalloc) ( cb->nextTemp * sizeof(Int) );
   else
      last_live_before = NULL;


   /* PASS 1: redundant GET elimination.  (Actually, more general than
      that -- eliminates redundant fetches of ArchRegs). */

   /* Find the live-range-ends for all temporaries.  Duplicates code
      in the register allocator :-( */

   for (i = 0; i < cb->nextTemp; i++) last_live_before[i] = -1;

   for (i = cb->used-1; i >= 0; i--) {
      u = &cb->instrs[i];

      k = getTempUsage(u, &tempUse[0]);

      /* For each temp usage ... bwds in program order. */
      for (j = k-1; j >= 0; j--) {
         tr = tempUse[j].tempNo;
         wr = tempUse[j].isWrite;
         if (last_live_before[tr] == -1) {
            vg_assert(tr >= 0 && tr < cb->nextTemp);
            last_live_before[tr] = wr ? (i+1) : i;
         }
      }

   }

#  define BIND_ARCH_TO_TEMP(archreg,tempreg)          \
   { Int q;                                           \
     /* Invalidate any old binding(s) to tempreg. */  \
     for (q = 0; q < 8; q++)                          \
        if (areg_map[q] == tempreg) areg_map[q] = -1; \
     /* Add the new binding. */                       \
     areg_map[archreg] = (tempreg);                   \
   }

   /* Set up the A-reg map. */
   for (i = 0; i < 8; i++) areg_map[i] = -1;

   /* Scan insns. */
   for (i = 0; i < cb->used; i++) {
      u = &cb->instrs[i];
      if (u->opcode == GET && u->size == 4) {
         /* GET; see if it can be annulled. */
         vg_assert(u->tag1 == ArchReg);
         vg_assert(u->tag2 == TempReg);
         ar   = u->val1;
         tr   = u->val2;
         told = areg_map[ar];
         if (told != -1 && last_live_before[told] <= i) {
            /* ar already has an old mapping to told, but that runs
               out here.  Annul this GET, rename tr to told for the
               rest of the block, and extend told's live range to that
               of tr. */
            u->opcode = NOP;
            u->tag1 = u->tag2 = NoValue;
            n = last_live_before[tr] + 1;
            if (n > cb->used) n = cb->used;
            last_live_before[told] = last_live_before[tr];
            last_live_before[tr] = i-1;
            if (VG_(disassemble))
               VG_(printf)(
                  "at %d: delete GET, rename t%d to t%d in (%d .. %d)\n",
                  i, tr, told,i+1, n-1);
            for (m = i+1; m < n; m++) {
               if (cb->instrs[m].tag1 == TempReg
                   && cb->instrs[m].val1 == tr)
                  cb->instrs[m].val1 = told;
               if (cb->instrs[m].tag2 == TempReg
                   && cb->instrs[m].val2 == tr)
                  cb->instrs[m].val2 = told;
            }
            BIND_ARCH_TO_TEMP(ar,told);
         }
         else
            BIND_ARCH_TO_TEMP(ar,tr);
      }
      else if (u->opcode == GET && u->size != 4) {
         /* Invalidate any mapping for this archreg. */
         actual_areg = containingArchRegOf ( u->size, u->val1 );
         areg_map[actual_areg] = -1;
      }
      else if (u->opcode == PUT && u->size == 4) {
         /* PUT; re-establish t -> a binding */
         vg_assert(u->tag1 == TempReg);
         vg_assert(u->tag2 == ArchReg);
         BIND_ARCH_TO_TEMP(u->val2, u->val1);
      }
      else if (u->opcode == PUT && u->size != 4) {
         /* Invalidate any mapping for this archreg. */
         actual_areg = containingArchRegOf ( u->size, u->val2 );
         areg_map[actual_areg] = -1;
      } else {

         /* see if insn has an archreg as a read operand; if so try to
            map it. */
         if (u->tag1 == ArchReg && u->size == 4
             && areg_map[u->val1] != -1) {
            switch (u->opcode) {
               case ADD: case SUB: case AND: case OR: case XOR:
               case ADC: case SBB:
               case SHL: case SHR: case SAR: case ROL: case ROR:
               case RCL: case RCR:
                  if (VG_(disassemble))
                     VG_(printf)(
                        "at %d: change ArchReg %S to TempReg t%d\n",
                        i, nameIReg(4,u->val1), areg_map[u->val1]);
                  u->tag1 = TempReg;
                  u->val1 = areg_map[u->val1];
                  /* Remember to extend the live range of the TempReg,
                     if necessary. */
                  if (last_live_before[u->val1] < i)
                     last_live_before[u->val1] = i;
                  break;
               default:
                  break;
            }
         }

         /* boring insn; invalidate any mappings to temps it writes */
         k = getTempUsage(u, &tempUse[0]);

         for (j = 0; j < k; j++) {
            wr = tempUse[j].isWrite;
            if (!wr) continue;
            tr = tempUse[j].tempNo;
            for (m = 0; m < 8; m++)
               if (areg_map[m] == tr) areg_map[m] = -1;
         }
      }

   }

#  undef BIND_ARCH_TO_TEMP

   /* PASS 2: redundant PUT elimination.  Don't annul (delay) puts of
      %ESP, since the memory check machinery always requires the
      in-memory value of %ESP to be up to date.  Although this isn't
      actually required by other analyses (cache simulation), it's
      simplest to be consistent for all end-uses. */
   for (j = 0; j < 8; j++)
      annul_put[j] = False;

   for (i = cb->used-1; i >= 0; i--) {
      u = &cb->instrs[i];
      if (u->opcode == NOP) continue;

      if (u->opcode == PUT && u->size == 4) {
         vg_assert(u->tag2 == ArchReg);
         actual_areg = containingArchRegOf ( 4, u->val2 );
         if (annul_put[actual_areg]) {
            vg_assert(actual_areg != R_ESP);
            u->opcode = NOP;
            u->tag1 = u->tag2 = NoValue;
            if (VG_(disassemble))
               VG_(printf)("at %d: delete PUT\n", i );
         } else {
            if (actual_areg != R_ESP)
               annul_put[actual_areg] = True;
         }
      }
      else if (u->opcode == PUT && u->size != 4) {
         actual_areg = containingArchRegOf ( u->size, u->val2 );
         annul_put[actual_areg] = False;
      }
      else if (u->opcode == JMP || u->opcode == JIFZ
               || u->opcode == CALLM) {
         for (j = 0; j < 8; j++)
            annul_put[j] = False;
      }
      else {
         /* If an instruction reads an ArchReg, the immediately
            preceding PUT cannot be annulled. */
         actual_areg = maybe_uinstrReadsArchReg ( u );
         if (actual_areg != -1)
            annul_put[actual_areg] = False;
      }
   }

   /* PASS 2a: redundant-move elimination.  Given MOV t1, t2 where t1
      is dead after this point, annul the MOV insn and rename t2 to t1.
      Further modifies the last_live_before map. */

#  if 0
   VG_(ppUCodeBlock)(cb, "Before MOV elimination" );
   for (i = 0; i < cb->nextTemp; i++)
      VG_(printf)("llb[t%d]=%d   ", i, last_live_before[i]);
   VG_(printf)("\n");
#  endif

   for (i = 0; i < cb->used-1; i++) {
      u = &cb->instrs[i];
      if (u->opcode != MOV) continue;
      if (u->tag1 == Literal) continue;
      vg_assert(u->tag1 == TempReg);
      vg_assert(u->tag2 == TempReg);
      if (last_live_before[u->val1] == i) {
         if (VG_(disassemble))
            VG_(printf)(
               "at %d: delete MOV, rename t%d to t%d in (%d .. %d)\n",
               i, u->val2, u->val1, i+1, last_live_before[u->val2] );
         for (j = i+1; j <= last_live_before[u->val2]; j++) {
            if (cb->instrs[j].tag1 == TempReg
                && cb->instrs[j].val1 == u->val2)
               cb->instrs[j].val1 = u->val1;
            if (cb->instrs[j].tag2 == TempReg
                && cb->instrs[j].val2 == u->val2)
               cb->instrs[j].val2 = u->val1;
         }
         last_live_before[u->val1] = last_live_before[u->val2];
         last_live_before[u->val2] = i-1;
         u->opcode = NOP;
         u->tag1 = u->tag2 = NoValue;
      }
   }

   /* PASS 3: redundant condition-code restore/save elimination.
      Scan backwards from the end.  future_dead_flags records the set
      of flags which are dead at this point, that is, will be written
      before they are next read.  Earlier uinsns which write flags
      already in future_dead_flags can have their writes annulled.
   */
   future_dead_flags = FlagsEmpty;

   for (i = cb->used-1; i >= 0; i--) {
      u = &cb->instrs[i];

      /* We might never make it to insns beyond this one, so be
         conservative. */
      if (u->opcode == JIFZ || u->opcode == JMP) {
         future_dead_flags = FlagsEmpty;
         continue;
      }

      /* We can annul the flags written by this insn if it writes a
         subset (or eq) of the set of flags known to be dead after
         this insn.  If not, just record the flags also written by
         this insn.*/
      if (u->flags_w != FlagsEmpty
          && VG_IS_FLAG_SUBSET(u->flags_w, future_dead_flags)) {
         if (VG_(disassemble)) {
            VG_(printf)("at %d: annul flag write ", i);
            vg_ppFlagSet("", u->flags_w);
            VG_(printf)(" due to later ");
            vg_ppFlagSet("", future_dead_flags);
            VG_(printf)("\n");
         }
         u->flags_w = FlagsEmpty;
      } else {
         future_dead_flags
            = VG_UNION_FLAG_SETS ( u->flags_w, future_dead_flags );
      }

      /* If this insn also reads flags, empty out future_dead_flags so
         as to force preceding writes not to be annulled. */
      if (u->flags_r != FlagsEmpty)
         future_dead_flags = FlagsEmpty;
   }

   if (last_live_before)
      VG_(jitfree) ( last_live_before );
}


/*------------------------------------------------------------*/
/*--- The new register allocator.                          ---*/
/*------------------------------------------------------------*/

typedef
   struct {
      /* Becomes live for the first time after this insn ... */
      Int live_after;
      /* Becomes dead for the last time after this insn ... */
      Int dead_before;
      /* The "home" spill slot, if needed.  Never changes. */
      Int spill_no;
      /* Where is it?  VG_NOTHING==in a spill slot; else in reg. */
      Int real_no;
   }
   TempInfo;


/* Take a ucode block and allocate its TempRegs to RealRegs, or put
   them in spill locations, and add spill code, if there are not
   enough real regs.  The usual register allocation deal, in short.

   Important redundancy of representation:

     real_to_temp maps real reg ranks (RRRs) to TempReg nos, or
     to VG_NOTHING if the real reg has no currently assigned TempReg.

     The .real_no field of a TempInfo gives the current RRR for
     this TempReg, or VG_NOTHING if the TempReg is currently
     in memory, in which case it is in the SpillNo denoted by
     spill_no.

   These pieces of information (a fwds-bwds mapping, really) must
   be kept consistent!

   This allocator uses the so-called Second Chance Bin Packing
   algorithm, as described in "Quality and Speed in Linear-scan
   Register Allocation" (Traub, Holloway and Smith, ACM PLDI98,
   pp142-151).  It is simple and fast and remarkably good at
   minimising the amount of spill code introduced.
*/
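
/* A hedged worked example, not part of the original file, of the
   second-chance flavour: suppose two real regs and three overlapping
   live ranges t0, t2, t4.  When t4 first needs a register, both regs
   are occupied, so the occupant whose next use lies furthest ahead
   (say t0) is evicted to its home spill slot with a `PUT r, spill_n'.
   If t0 is read again later it gets a second chance: a `GET spill_n,
   r' reloads it into whichever register is free at that point, not
   necessarily the one it previously occupied. */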

static
UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 )
{
   TempInfo*    temp_info;
   Int          real_to_temp[VG_MAX_REALREGS];
   Bool         is_spill_cand[VG_MAX_REALREGS];
   Int          ss_busy_until_before[VG_MAX_SPILLSLOTS];
   Int          i, j, k, m, r, tno, max_ss_no;
   Bool         wr, defer, isRead, spill_reqd;
   TempUse      tempUse[3];
   UCodeBlock*  c2;

   /* Used to denote ... well, "no value" in this fn. */
#  define VG_NOTHING (-2)

   /* Initialise the TempReg info. */
   if (c1->nextTemp > 0)
      temp_info = VG_(jitmalloc)(c1->nextTemp * sizeof(TempInfo) );
   else
      temp_info = NULL;

   for (i = 0; i < c1->nextTemp; i++) {
      temp_info[i].live_after  = VG_NOTHING;
      temp_info[i].dead_before = VG_NOTHING;
      temp_info[i].spill_no    = VG_NOTHING;
      /* temp_info[i].real_no is not yet relevant. */
   }

   spill_reqd = False;

   /* Scan fwds to establish live ranges. */

   for (i = 0; i < c1->used; i++) {
      k = getTempUsage(&c1->instrs[i], &tempUse[0]);
      vg_assert(k >= 0 && k <= 3);

      /* For each temp usage ... fwds in program order */
      for (j = 0; j < k; j++) {
         tno = tempUse[j].tempNo;
         wr  = tempUse[j].isWrite;
         if (wr) {
            /* Writes hold a reg live until after this insn. */
            if (temp_info[tno].live_after == VG_NOTHING)
               temp_info[tno].live_after = i;
            if (temp_info[tno].dead_before < i + 1)
               temp_info[tno].dead_before = i + 1;
         } else {
            /* First use of a tmp should be a write. */
            vg_assert(temp_info[tno].live_after != VG_NOTHING);
            /* Reads only hold it live until before this insn. */
            if (temp_info[tno].dead_before < i)
               temp_info[tno].dead_before = i;
         }
      }
   }

#  if 0
   /* Sanity check on live ranges.  Expensive but correct. */
   for (i = 0; i < c1->nextTemp; i++) {
      vg_assert( (temp_info[i].live_after == VG_NOTHING
                  && temp_info[i].dead_before == VG_NOTHING)
                 || (temp_info[i].live_after != VG_NOTHING
                     && temp_info[i].dead_before != VG_NOTHING) );
   }
#  endif

   /* Do a rank-based allocation of TempRegs to spill slot numbers.
      We put as few as possible values in spill slots, but
      nevertheless need to have an assignment to them just in case. */

   max_ss_no = -1;

   for (i = 0; i < VG_MAX_SPILLSLOTS; i++)
      ss_busy_until_before[i] = 0;

   for (i = 0; i < c1->nextTemp; i++) {

      /* True iff this temp is unused. */
      if (temp_info[i].live_after == VG_NOTHING)
         continue;

      /* Find the lowest-numbered spill slot which is available at the
         start point of this interval, and assign the interval to
         it. */
      for (j = 0; j < VG_MAX_SPILLSLOTS; j++)
         if (ss_busy_until_before[j] <= temp_info[i].live_after)
            break;
      if (j == VG_MAX_SPILLSLOTS) {
         VG_(printf)("VG_MAX_SPILLSLOTS is too low; increase and recompile.\n");
         VG_(panic)("register allocation failed -- out of spill slots");
      }
      ss_busy_until_before[j] = temp_info[i].dead_before;
      temp_info[i].spill_no = j;
      if (j > max_ss_no)
         max_ss_no = j;
   }

   VG_(total_reg_rank) += (max_ss_no+1);

   /* Show live ranges and assigned spill slot nos. */

   if (VG_(disassemble)) {
      VG_(printf)("Live Range Assignments\n");

      for (i = 0; i < c1->nextTemp; i++) {
         if (temp_info[i].live_after == VG_NOTHING)
            continue;
         VG_(printf)(
            "   LR %d is   after %d to before %d   spillno %d\n",
            i,
            temp_info[i].live_after,
            temp_info[i].dead_before,
            temp_info[i].spill_no
         );
      }
   }

   /* Now that we've established a spill slot number for each used
      temporary, we can go ahead and do the core of the "Second-chance
      binpacking" allocation algorithm. */

   /* Resulting code goes here.  We generate it all in a forwards
      pass. */
   c2 = VG_(allocCodeBlock)();

   /* At the start, no TempRegs are assigned to any real register.
      Correspondingly, all temps claim to be currently resident in
      their spill slots, as computed by the previous two passes. */
   for (i = 0; i < VG_MAX_REALREGS; i++)
      real_to_temp[i] = VG_NOTHING;
   for (i = 0; i < c1->nextTemp; i++)
      temp_info[i].real_no = VG_NOTHING;

   if (VG_(disassemble))
      VG_(printf)("\n");

   /* Process each insn in turn. */
   for (i = 0; i < c1->used; i++) {

      if (c1->instrs[i].opcode == NOP) continue;
      VG_(uinstrs_prealloc)++;

#     if 0
      /* Check map consistency.  Expensive but correct. */
      for (r = 0; r < VG_MAX_REALREGS; r++) {
         if (real_to_temp[r] != VG_NOTHING) {
            tno = real_to_temp[r];
            vg_assert(tno >= 0 && tno < c1->nextTemp);
            vg_assert(temp_info[tno].real_no == r);
         }
      }
      for (tno = 0; tno < c1->nextTemp; tno++) {
         if (temp_info[tno].real_no != VG_NOTHING) {
            r = temp_info[tno].real_no;
            vg_assert(r >= 0 && r < VG_MAX_REALREGS);
            vg_assert(real_to_temp[r] == tno);
         }
      }
#     endif

      if (VG_(disassemble))
         VG_(ppUInstr)(i, &c1->instrs[i]);

      /* First, free up enough real regs for this insn.  This may
         generate spill stores since we may have to evict some TempRegs
         currently in real regs.  Also generates spill loads. */

      k = getTempUsage(&c1->instrs[i], &tempUse[0]);
      vg_assert(k >= 0 && k <= 3);

      /* For each ***different*** temp mentioned in the insn .... */
      for (j = 0; j < k; j++) {

         /* First check if the temp is mentioned again later; if so,
            ignore this mention.  We only want to process each temp
            used by the insn once, even if it is mentioned more than
            once. */
         defer = False;
         tno = tempUse[j].tempNo;
         for (m = j+1; m < k; m++)
            if (tempUse[m].tempNo == tno)
               defer = True;
         if (defer)
            continue;

         /* Now we're trying to find a register for tempUse[j].tempNo.
            First of all, if it already has a register assigned, we
            don't need to do anything more. */
         if (temp_info[tno].real_no != VG_NOTHING)
            continue;

         /* No luck.  The next thing to do is see if there is a
            currently unassigned register available.  If so, bag it. */
         for (r = 0; r < VG_MAX_REALREGS; r++) {
            if (real_to_temp[r] == VG_NOTHING)
               break;
         }
         if (r < VG_MAX_REALREGS) {
            real_to_temp[r]        = tno;
            temp_info[tno].real_no = r;
            continue;
         }

         /* Unfortunately, that didn't pan out either.  So we'll have
            to eject some other unfortunate TempReg into a spill slot
            in order to free up a register.  Of course, we need to be
            careful not to eject some other TempReg needed by this
            insn.

            Select r in 0 .. VG_MAX_REALREGS-1 such that
            real_to_temp[r] is not mentioned in
            tempUse[0 .. k-1].tempNo, since it would be just plain
            wrong to eject some other TempReg which we need to use in
            this insn.

            It is here that it is important to make a good choice of
            register to spill. */

         /* First, mark those regs which are not spill candidates. */
         for (r = 0; r < VG_MAX_REALREGS; r++) {
            is_spill_cand[r] = True;
            for (m = 0; m < k; m++) {
               if (real_to_temp[r] == tempUse[m].tempNo) {
                  is_spill_cand[r] = False;
                  break;
               }
            }
         }

         /* We can choose any r satisfying is_spill_cand[r].  However,
            try to make a good choice.  First, try and find r such
            that the associated TempReg is already dead. */
         for (r = 0; r < VG_MAX_REALREGS; r++) {
            if (is_spill_cand[r] &&
                temp_info[real_to_temp[r]].dead_before <= i)
               goto have_spill_cand;
         }

         /* No spill cand is mapped to a dead TempReg.  Now we really
            _do_ have to generate spill code.  Choose r so that the
            next use of its associated TempReg is as far ahead as
            possible, in the hope that this will minimise the number of
            consequent reloads required.  This is a bit expensive, but
            we don't have to do it very often. */
         {
            Int furthest_r = VG_MAX_REALREGS;
            Int furthest = 0;
            for (r = 0; r < VG_MAX_REALREGS; r++) {
               if (!is_spill_cand[r]) continue;
               for (m = i+1; m < c1->used; m++)
                  if (uInstrMentionsTempReg(&c1->instrs[m],
                                            real_to_temp[r]))
                     break;
               if (m > furthest) {
                  furthest   = m;
                  furthest_r = r;
               }
            }
            r = furthest_r;
            goto have_spill_cand;
         }

        have_spill_cand:
         if (r == VG_MAX_REALREGS)
            VG_(panic)("new reg alloc: out of registers ?!");

         /* Eject r.  Important refinement: don't bother if the
            associated TempReg is now dead. */
         vg_assert(real_to_temp[r] != VG_NOTHING);
         vg_assert(real_to_temp[r] != tno);
         temp_info[real_to_temp[r]].real_no = VG_NOTHING;
         if (temp_info[real_to_temp[r]].dead_before > i) {
            uInstr2(c2, PUT, 4,
                        RealReg, VG_(rankToRealRegNo)(r),
                        SpillNo, temp_info[real_to_temp[r]].spill_no);
            VG_(uinstrs_spill)++;
            spill_reqd = True;
            if (VG_(disassemble))
               VG_(ppUInstr)(c2->used-1, &LAST_UINSTR(c2));
         }

         /* Decide if tno is read. */
         isRead = False;
         for (m = 0; m < k; m++)
            if (tempUse[m].tempNo == tno && !tempUse[m].isWrite)
               isRead = True;

         /* If so, generate a spill load. */
         if (isRead) {
            uInstr2(c2, GET, 4,
                        SpillNo, temp_info[tno].spill_no,
                        RealReg, VG_(rankToRealRegNo)(r) );
            VG_(uinstrs_spill)++;
            spill_reqd = True;
            if (VG_(disassemble))
               VG_(ppUInstr)(c2->used-1, &LAST_UINSTR(c2));
         }

         /* Update the forwards and backwards maps. */
         real_to_temp[r]        = tno;
         temp_info[tno].real_no = r;
      }

      /* By this point, all TempRegs mentioned by the insn have been
         brought into real regs.  We now copy the insn to the output
         and use patchUInstr to convert its TempRegs into
         realregs. */
      for (j = 0; j < k; j++)
         tempUse[j].realNo
            = VG_(rankToRealRegNo)(temp_info[tempUse[j].tempNo].real_no);
      VG_(copyUInstr)(c2, &c1->instrs[i]);
      patchUInstr(&LAST_UINSTR(c2), &tempUse[0], k);

      if (VG_(disassemble)) {
         VG_(ppUInstr)(c2->used-1, &LAST_UINSTR(c2));
         VG_(printf)("\n");
      }
   }

   if (temp_info != NULL)
      VG_(jitfree)(temp_info);

   VG_(freeCodeBlock)(c1);

   if (spill_reqd)
      VG_(translations_needing_spill)++;

   return c2;

#  undef VG_NOTHING

}
|
|
|
|
|
|
/*------------------------------------------------------------*/
|
|
/*--- New instrumentation machinery. ---*/
|
|
/*------------------------------------------------------------*/
|
|
|
|
static
VgTagOp get_VgT_ImproveOR_TQ ( Int sz )
{
   switch (sz) {
      case 4: return VgT_ImproveOR4_TQ;
      case 2: return VgT_ImproveOR2_TQ;
      case 1: return VgT_ImproveOR1_TQ;
      default: VG_(panic)("get_VgT_ImproveOR_TQ");
   }
}


static
VgTagOp get_VgT_ImproveAND_TQ ( Int sz )
{
   switch (sz) {
      case 4: return VgT_ImproveAND4_TQ;
      case 2: return VgT_ImproveAND2_TQ;
      case 1: return VgT_ImproveAND1_TQ;
      default: VG_(panic)("get_VgT_ImproveAND_TQ");
   }
}


static
VgTagOp get_VgT_Left ( Int sz )
{
   switch (sz) {
      case 4: return VgT_Left4;
      case 2: return VgT_Left2;
      case 1: return VgT_Left1;
      default: VG_(panic)("get_VgT_Left");
   }
}


static
VgTagOp get_VgT_UifU ( Int sz )
{
   switch (sz) {
      case 4: return VgT_UifU4;
      case 2: return VgT_UifU2;
      case 1: return VgT_UifU1;
      case 0: return VgT_UifU0;
      default: VG_(panic)("get_VgT_UifU");
   }
}


static
VgTagOp get_VgT_DifD ( Int sz )
{
   switch (sz) {
      case 4: return VgT_DifD4;
      case 2: return VgT_DifD2;
      case 1: return VgT_DifD1;
      default: VG_(panic)("get_VgT_DifD");
   }
}


static
VgTagOp get_VgT_PCast ( Int szs, Int szd )
{
   if (szs == 4 && szd == 0) return VgT_PCast40;
   if (szs == 2 && szd == 0) return VgT_PCast20;
   if (szs == 1 && szd == 0) return VgT_PCast10;
   if (szs == 0 && szd == 1) return VgT_PCast01;
   if (szs == 0 && szd == 2) return VgT_PCast02;
   if (szs == 0 && szd == 4) return VgT_PCast04;
   if (szs == 1 && szd == 4) return VgT_PCast14;
   if (szs == 1 && szd == 2) return VgT_PCast12;
   if (szs == 1 && szd == 1) return VgT_PCast11;
   VG_(printf)("get_VgT_PCast(%d,%d)\n", szs, szd);
   VG_(panic)("get_VgT_PCast");
}


static
VgTagOp get_VgT_Widen ( Bool syned, Int szs, Int szd )
{
   if (szs == 1 && szd == 2 && syned)  return VgT_SWiden12;
   if (szs == 1 && szd == 2 && !syned) return VgT_ZWiden12;

   if (szs == 1 && szd == 4 && syned)  return VgT_SWiden14;
   if (szs == 1 && szd == 4 && !syned) return VgT_ZWiden14;

   if (szs == 2 && szd == 4 && syned)  return VgT_SWiden24;
   if (szs == 2 && szd == 4 && !syned) return VgT_ZWiden24;

   VG_(printf)("get_VgT_Widen(%d,%d,%d)\n", (Int)syned, szs, szd);
   VG_(panic)("get_VgT_Widen");
}

/* Pessimally cast the spec'd shadow from one size to another. */
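/* (A pessimising cast: any undefined bit in the source shadow makes
   the whole destination undefined.  E.g. PCast40 squashes a 4-byte
   tag down to the 0-sized, eflags-like tag.) */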
static
void create_PCast ( UCodeBlock* cb, Int szs, Int szd, Int tempreg )
{
   if (szs == 0 && szd == 0)
      return;
   uInstr3(cb, TAG1, 0, TempReg, tempreg,
                        NoValue, 0,
                        Lit16,   get_VgT_PCast(szs,szd));
}


/* Create a signed or unsigned widen of the spec'd shadow from one
   size to another.  The only allowed size transitions are 1->2, 1->4
   and 2->4. */
static
void create_Widen ( UCodeBlock* cb, Bool signed_widen,
                    Int szs, Int szd, Int tempreg )
{
   if (szs == szd) return;
   uInstr3(cb, TAG1, 0, TempReg, tempreg,
                        NoValue, 0,
                        Lit16,   get_VgT_Widen(signed_widen,szs,szd));
}


/* Get the condition codes into a new shadow, at the given size. */
static
Int create_GETVF ( UCodeBlock* cb, Int sz )
{
   Int tt = newShadow(cb);
   uInstr1(cb, GETVF, 0, TempReg, tt);
   create_PCast(cb, 0, sz, tt);
   return tt;
}


/* Save the condition codes from the spec'd shadow. */
static
void create_PUTVF ( UCodeBlock* cb, Int sz, Int tempreg )
{
   if (sz == 0) {
      uInstr1(cb, PUTVF, 0, TempReg, tempreg);
   } else {
      Int tt = newShadow(cb);
      uInstr2(cb, MOV, 4, TempReg, tempreg, TempReg, tt);
      create_PCast(cb, sz, 0, tt);
      uInstr1(cb, PUTVF, 0, TempReg, tt);
   }
}


/* Do Left on the spec'd shadow. */
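/* (Left smears undefinedness towards the more-significant bits,
   modelling how a carry can propagate leftwards in add/sub.) */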
static
void create_Left ( UCodeBlock* cb, Int sz, Int tempreg )
{
   uInstr3(cb, TAG1, 0,
               TempReg, tempreg,
               NoValue, 0,
               Lit16,   get_VgT_Left(sz));
}


/* Do UifU on ts and td, putting the result in td. */
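/* (UifU: the result is undefined wherever either argument is
   undefined.) */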
static
void create_UifU ( UCodeBlock* cb, Int sz, Int ts, Int td )
{
   uInstr3(cb, TAG2, 0, TempReg, ts, TempReg, td,
               Lit16,   get_VgT_UifU(sz));
}


/* Do DifD on ts and td, putting the result in td. */
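/* (DifD: the result is defined wherever either argument is defined;
   used below to apply the AND/OR improvement terms.) */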
static
void create_DifD ( UCodeBlock* cb, Int sz, Int ts, Int td )
{
   uInstr3(cb, TAG2, 0, TempReg, ts, TempReg, td,
               Lit16,   get_VgT_DifD(sz));
}


/* Do HelpAND on value tval and tag tqqq, putting the result in
   tqqq. */
static
void create_ImproveAND_TQ ( UCodeBlock* cb, Int sz, Int tval, Int tqqq )
{
   uInstr3(cb, TAG2, 0, TempReg, tval, TempReg, tqqq,
               Lit16,   get_VgT_ImproveAND_TQ(sz));
}


/* Do HelpOR on value tval and tag tqqq, putting the result in
   tqqq. */
static
void create_ImproveOR_TQ ( UCodeBlock* cb, Int sz, Int tval, Int tqqq )
{
   uInstr3(cb, TAG2, 0, TempReg, tval, TempReg, tqqq,
               Lit16,   get_VgT_ImproveOR_TQ(sz));
}


/* Get the shadow for an operand described by (tag, val).  Emit code
   to do this and return the identity of the shadow holding the
   result.  The result tag is always copied into a new shadow, so it
   can be modified without trashing the original. */
static
Int /* TempReg */ getOperandShadow ( UCodeBlock* cb,
                                     Int sz, Int tag, Int val )
{
   Int sh;
   sh = newShadow(cb);
   if (tag == TempReg) {
      uInstr2(cb, MOV, 4, TempReg, SHADOW(val), TempReg, sh);
      return sh;
   }
   if (tag == Literal) {
      uInstr1(cb, SETV, sz, TempReg, sh);
      return sh;
   }
   if (tag == ArchReg) {
      uInstr2(cb, GETV, sz, ArchReg, val, TempReg, sh);
      return sh;
   }
   VG_(panic)("getOperandShadow");
}



/* Create and return an instrumented version of cb_in.  Free cb_in
   before returning. */
static UCodeBlock* vg_instrument ( UCodeBlock* cb_in )
{
   UCodeBlock* cb;
   Int         i, j;
   UInstr*     u_in;
   Int         qs, qd, qt, qtt;
   cb = VG_(allocCodeBlock)();
   cb->nextTemp = cb_in->nextTemp;

   for (i = 0; i < cb_in->used; i++) {
      qs = qd = qt = qtt = INVALID_TEMPREG;
      u_in = &cb_in->instrs[i];

      /* if (i > 0) uInstr1(cb, NOP, 0, NoValue, 0); */

      /* VG_(ppUInstr)(0, u_in); */
      switch (u_in->opcode) {

         case NOP:
            break;

         case INCEIP:
            VG_(copyUInstr)(cb, u_in);
            break;

         /* Loads and stores.  Test the V bits for the address.  24
            Mar 02: since the address is A-checked anyway, there's not
            really much point in doing the V-check too, unless you
            think that you might use addresses which are undefined but
            still addressable.  Hence the optionalisation of the V
            check.

            The LOADV/STOREV does an addressability check for the
            address. */

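         /* (The TESTV/SETV pairs below are the usual idiom: TESTV
            reports an error if the shadow is undefined, and the
            following SETV marks it defined so the same error is not
            reported again downstream.) */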
         case LOAD:
            if (VG_(clo_check_addrVs)) {
               uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val1));
               uInstr1(cb, SETV, 4, TempReg, SHADOW(u_in->val1));
            }
            uInstr2(cb, LOADV, u_in->size,
                        TempReg, u_in->val1,
                        TempReg, SHADOW(u_in->val2));
            VG_(copyUInstr)(cb, u_in);
            break;
         case STORE:
            if (VG_(clo_check_addrVs)) {
               uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val2));
               uInstr1(cb, SETV, 4, TempReg, SHADOW(u_in->val2));
            }
            uInstr2(cb, STOREV, u_in->size,
                        TempReg, SHADOW(u_in->val1),
                        TempReg, u_in->val2);
            VG_(copyUInstr)(cb, u_in);
            break;

         /* Moving stuff around.  Make the V bits follow accordingly,
            but don't do anything else. */

         case GET:
            uInstr2(cb, GETV, u_in->size,
                        ArchReg, u_in->val1,
                        TempReg, SHADOW(u_in->val2));
            VG_(copyUInstr)(cb, u_in);
            break;
         case PUT:
            uInstr2(cb, PUTV, u_in->size,
                        TempReg, SHADOW(u_in->val1),
                        ArchReg, u_in->val2);
            VG_(copyUInstr)(cb, u_in);
            break;

         case GETF:
            /* This is not the smartest way to do it, but should work. */
            qd = create_GETVF(cb, u_in->size);
            uInstr2(cb, MOV, 4, TempReg, qd, TempReg, SHADOW(u_in->val1));
            VG_(copyUInstr)(cb, u_in);
            break;
         case PUTF:
            create_PUTVF(cb, u_in->size, SHADOW(u_in->val1));
            VG_(copyUInstr)(cb, u_in);
            break;

         case MOV:
            switch (u_in->tag1) {
               case TempReg:
                  uInstr2(cb, MOV, 4,
                              TempReg, SHADOW(u_in->val1),
                              TempReg, SHADOW(u_in->val2));
                  break;
               case Literal:
                  uInstr1(cb, SETV, u_in->size,
                              TempReg, SHADOW(u_in->val2));
                  break;
               default:
                  VG_(panic)("vg_instrument: MOV");
            }
            VG_(copyUInstr)(cb, u_in);
            break;

         /* Special case of add, where one of the operands is a literal.
            lea1(t) = t + some literal.
            Therefore: lea1#(qa) = left(qa)
         */
         case LEA1:
            vg_assert(u_in->size == 4 && !VG_(anyFlagUse)(u_in));
            qs = SHADOW(u_in->val1);
            qd = SHADOW(u_in->val2);
            uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qd);
            create_Left(cb, u_in->size, qd);
            VG_(copyUInstr)(cb, u_in);
            break;

         /* Another form of add.
            lea2(ts,tt,shift) = ts + (tt << shift); shift is a literal
            and is 0,1,2 or 3.
            lea2#(qs,qt) = left(qs `UifU` (qt << shift)).
            Note, subtly, that the shift puts zeroes at the bottom of qt,
            meaning Valid, since the corresponding shift of tt puts
            zeroes at the bottom of the shifted value.
         */
         case LEA2: {
            Int shift;
            vg_assert(u_in->size == 4 && !VG_(anyFlagUse)(u_in));
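            /* extra4b holds the index scale (1, 2, 4 or 8); convert
               it to the equivalent shift amount. */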
            switch (u_in->extra4b) {
               case 1: shift = 0; break;
               case 2: shift = 1; break;
               case 4: shift = 2; break;
               case 8: shift = 3; break;
               default: VG_(panic)( "vg_instrument(LEA2)" );
            }
            qs = SHADOW(u_in->val1);
            qt = SHADOW(u_in->val2);
            qd = SHADOW(u_in->val3);
            uInstr2(cb, MOV, 4, TempReg, qt, TempReg, qd);
            if (shift > 0) {
               uInstr2(cb, SHL, 4, Literal, 0, TempReg, qd);
               uLiteral(cb, shift);
            }
            create_UifU(cb, 4, qs, qd);
            create_Left(cb, u_in->size, qd);
            VG_(copyUInstr)(cb, u_in);
            break;
         }

         /* inc#/dec#(qd) = 1# `UifU` left(qd) = left(qd),
            since the literal 1 is fully defined */
         case INC: case DEC:
            qd = SHADOW(u_in->val1);
            create_Left(cb, u_in->size, qd);
            if (u_in->flags_w != FlagsEmpty)
               create_PUTVF(cb, u_in->size, qd);
            VG_(copyUInstr)(cb, u_in);
            break;

         /* This is a HACK (approximation :-) */
         /* rcl#/rcr#(qs,qd)
               = let q0 = pcast-sz-0(qd) `UifU` pcast-sz-0(qs) `UifU` eflags#
                 eflags# = q0
                 qd = pcast-0-sz(q0)
            Ie, cast everything down to a single bit, then back up.
            This assumes that any bad bits infect the whole word and
            the eflags.
         */
         case RCL: case RCR:
            vg_assert(u_in->flags_r != FlagsEmpty);
            /* The following assertion looks like it makes sense, but is
               actually wrong.  Consider this:
                  rcll    %eax
                  imull   %eax, %eax
               The rcll writes O and C but so does the imull, so the O and C
               write of the rcll is annulled by the prior improvement pass.
               Noticed by Kevin Ryde <user42@zip.com.au>
            */
            /* vg_assert(u_in->flags_w != FlagsEmpty); */
            qs = getOperandShadow(cb, u_in->size, u_in->tag1, u_in->val1);
            /* We can safely modify qs; cast it to 0-size. */
            create_PCast(cb, u_in->size, 0, qs);
            qd = SHADOW(u_in->val2);
            create_PCast(cb, u_in->size, 0, qd);
            /* qs is cast-to-0(shift count#), and qd is cast-to-0(value#). */
            create_UifU(cb, 0, qs, qd);
            /* qs is now free; reuse it for the flag definedness. */
            qs = create_GETVF(cb, 0);
            create_UifU(cb, 0, qs, qd);
            create_PUTVF(cb, 0, qd);
            create_PCast(cb, 0, u_in->size, qd);
            VG_(copyUInstr)(cb, u_in);
            break;

         /* for OP in shl shr sar rol ror
            (qs is shift count#, qd is value to be OP#d)
            OP(ts,td)
            OP#(qs,qd)
               = pcast-1-sz(qs) `UifU` OP(ts,qd)
            So we apply OP to the tag bits too, and then UifU with
            the shift count# to take account of the possibility of it
            being undefined.

            A bit subtle:
               ROL/ROR rearrange the tag bits as per the value bits.
               SHL/SHR shift zeroes into the value, and corresponding
               zeroes indicating Definedness into the tag.
               SAR copies the top bit of the value downwards, and therefore
               SAR also copies the definedness of the top bit too.
            So in all five cases, we just apply the same op to the tag
            bits as is applied to the value bits.  Neat!
         */
         case SHL:
         case SHR: case SAR:
         case ROL: case ROR: {
            Int t_amount = INVALID_TEMPREG;
            vg_assert(u_in->tag1 == TempReg || u_in->tag1 == Literal);
            vg_assert(u_in->tag2 == TempReg);
            qd = SHADOW(u_in->val2);

            /* Make qs hold shift-count# and make
               t_amount be a TempReg holding the shift count. */
            if (u_in->tag1 == Literal) {
               t_amount = newTemp(cb);
               uInstr2(cb, MOV, 4, Literal, 0, TempReg, t_amount);
               uLiteral(cb, u_in->lit32);
               qs = SHADOW(t_amount);
               uInstr1(cb, SETV, 1, TempReg, qs);
            } else {
               t_amount = u_in->val1;
               qs       = SHADOW(u_in->val1);
            }

            uInstr2(cb, u_in->opcode,
                        u_in->size,
                        TempReg, t_amount,
                        TempReg, qd);
            qt = newShadow(cb);
            uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qt);
            create_PCast(cb, 1, u_in->size, qt);
            create_UifU(cb, u_in->size, qt, qd);
            VG_(copyUInstr)(cb, u_in);
            break;
         }

         /* One simple tag operation. */
         case WIDEN:
            vg_assert(u_in->tag1 == TempReg);
            create_Widen(cb, u_in->signed_widen, u_in->extra4b, u_in->size,
                         SHADOW(u_in->val1));
            VG_(copyUInstr)(cb, u_in);
            break;

         /* not#(x) = x (since bitwise independent) */
         case NOT:
            vg_assert(u_in->tag1 == TempReg);
            VG_(copyUInstr)(cb, u_in);
            break;

         /* neg#(x) = left(x) (derivable from case for SUB) */
         case NEG:
            vg_assert(u_in->tag1 == TempReg);
            create_Left(cb, u_in->size, SHADOW(u_in->val1));
            VG_(copyUInstr)(cb, u_in);
            break;

         /* bswap#(x) = bswap(x) */
         case BSWAP:
            vg_assert(u_in->tag1 == TempReg);
            vg_assert(u_in->size == 4);
            qd = SHADOW(u_in->val1);
            uInstr1(cb, BSWAP, 4, TempReg, qd);
            VG_(copyUInstr)(cb, u_in);
            break;

         /* cc2val#(qd) = pcast-0-to-size(eflags#) */
         case CC2VAL:
            vg_assert(u_in->tag1 == TempReg);
            vg_assert(u_in->flags_r != FlagsEmpty);
            qt = create_GETVF(cb, u_in->size);
            uInstr2(cb, MOV, 4, TempReg, qt, TempReg, SHADOW(u_in->val1));
            VG_(copyUInstr)(cb, u_in);
            break;

         /* cmov#(qs,qd) = cmov(qs,qd)
            That is, do the cmov of tags using the same flags as for
            the data (obviously).  However, first do a test on the
            validity of the flags.
         */
         case CMOV:
            vg_assert(u_in->size == 4);
            vg_assert(u_in->tag1 == TempReg);
            vg_assert(u_in->tag2 == TempReg);
            vg_assert(u_in->flags_r != FlagsEmpty);
            vg_assert(u_in->flags_w == FlagsEmpty);
            qs = SHADOW(u_in->val1);
            qd = SHADOW(u_in->val2);
            qt = create_GETVF(cb, 0);
            uInstr1(cb, TESTV, 0, TempReg, qt);
            /* qt should never be referred to again.  Nevertheless
               ... */
            uInstr1(cb, SETV, 0, TempReg, qt);

            uInstr2(cb, CMOV, 4, TempReg, qs, TempReg, qd);
            LAST_UINSTR(cb).cond    = u_in->cond;
            LAST_UINSTR(cb).flags_r = u_in->flags_r;

            VG_(copyUInstr)(cb, u_in);
            break;

         /* add#/sub#(qs,qd)
               = qs `UifU` qd `UifU` left(qs) `UifU` left(qd)
               = left(qs) `UifU` left(qd)
               = left(qs `UifU` qd)
            adc#/sbb#(qs,qd)
               = left(qs `UifU` qd) `UifU` pcast(eflags#)
            Second arg (dest) is TempReg.
            First arg (src) is Literal or TempReg or ArchReg.
         */
         case ADD: case SUB:
         case ADC: case SBB:
            qd = SHADOW(u_in->val2);
            qs = getOperandShadow(cb, u_in->size, u_in->tag1, u_in->val1);
            create_UifU(cb, u_in->size, qs, qd);
            create_Left(cb, u_in->size, qd);
            if (u_in->opcode == ADC || u_in->opcode == SBB) {
               vg_assert(u_in->flags_r != FlagsEmpty);
               qt = create_GETVF(cb, u_in->size);
               create_UifU(cb, u_in->size, qt, qd);
            }
            if (u_in->flags_w != FlagsEmpty) {
               create_PUTVF(cb, u_in->size, qd);
            }
            VG_(copyUInstr)(cb, u_in);
            break;

         /* xor#(qs,qd) = qs `UifU` qd */
         case XOR:
            qd = SHADOW(u_in->val2);
            qs = getOperandShadow(cb, u_in->size, u_in->tag1, u_in->val1);
            create_UifU(cb, u_in->size, qs, qd);
            if (u_in->flags_w != FlagsEmpty) {
               create_PUTVF(cb, u_in->size, qd);
            }
            VG_(copyUInstr)(cb, u_in);
            break;

         /* and#/or#(qs,qd)
               = (qs `UifU` qd) `DifD` improve(vs,qs)
                                `DifD` improve(vd,qd)
            where improve is the relevant one of
               Improve{AND,OR}_TQ
            Use the following steps, with qt as a temp:
               qt = improve(vd,qd)
               qd = qs `UifU` qd
               qd = qt `DifD` qd
               qt = improve(vs,qs)
               qd = qt `DifD` qd
         */
         case AND: case OR:
            vg_assert(u_in->tag1 == TempReg);
            vg_assert(u_in->tag2 == TempReg);
            qd = SHADOW(u_in->val2);
            qs = SHADOW(u_in->val1);
            qt = newShadow(cb);

            /* qt = improve(vd,qd) */
            uInstr2(cb, MOV, 4, TempReg, qd, TempReg, qt);
            if (u_in->opcode == AND)
               create_ImproveAND_TQ(cb, u_in->size, u_in->val2, qt);
            else
               create_ImproveOR_TQ(cb, u_in->size, u_in->val2, qt);
            /* qd = qs `UifU` qd */
            create_UifU(cb, u_in->size, qs, qd);
            /* qd = qt `DifD` qd */
            create_DifD(cb, u_in->size, qt, qd);
            /* qt = improve(vs,qs) */
            uInstr2(cb, MOV, 4, TempReg, qs, TempReg, qt);
            if (u_in->opcode == AND)
               create_ImproveAND_TQ(cb, u_in->size, u_in->val1, qt);
            else
               create_ImproveOR_TQ(cb, u_in->size, u_in->val1, qt);
            /* qd = qt `DifD` qd */
            create_DifD(cb, u_in->size, qt, qd);
            /* So, finally qd is the result tag. */
            if (u_in->flags_w != FlagsEmpty) {
               create_PUTVF(cb, u_in->size, qd);
            }
            VG_(copyUInstr)(cb, u_in);
            break;

         /* Machinery to do with supporting CALLM.  Copy the start and
            end markers only to make the result easier to read
            (debug); they generate no code and have no effect.
         */
         case CALLM_S: case CALLM_E:
            VG_(copyUInstr)(cb, u_in);
            break;

         /* Copy PUSH and POP verbatim.  Arg/result absval
            calculations are done when the associated CALL is
            processed.  CLEAR has no effect on absval calculations but
            needs to be copied.
         */
         case PUSH: case POP: case CLEAR:
            VG_(copyUInstr)(cb, u_in);
            break;

         /* In short:
               callm#(a1# ... an#) = (a1# `UifU` ... `UifU` an#)
            We have to decide on a size to do the computation at,
            although the choice doesn't affect correctness.  We will
            do a pcast to the final size anyway, so the only important
            factor is to choose a size which minimises the total
            number of casts needed.  Valgrind: just use size 0,
            regardless.  It may not be very good for performance
            but does simplify matters, mainly by reducing the number
            of different pessimising casts which have to be implemented.
         */
         case CALLM: {
            UInstr* uu;
            Bool    res_used;

            /* Now generate the code.  Get the final result absval
               into qt. */
            qt  = newShadow(cb);
            qtt = newShadow(cb);
            uInstr1(cb, SETV, 0, TempReg, qt);
            for (j = i-1; cb_in->instrs[j].opcode != CALLM_S; j--) {
               uu = & cb_in->instrs[j];
               if (uu->opcode != PUSH) continue;
               /* cast via a temporary */
               uInstr2(cb, MOV, 4, TempReg, SHADOW(uu->val1),
                                   TempReg, qtt);
               create_PCast(cb, uu->size, 0, qtt);
               create_UifU(cb, 0, qtt, qt);
            }
            /* Remembering also that flags read count as inputs. */
            if (u_in->flags_r != FlagsEmpty) {
               qtt = create_GETVF(cb, 0);
               create_UifU(cb, 0, qtt, qt);
            }

            /* qt now holds the result tag.  If any results from the
               call are used, either by fetching with POP or
               implicitly by writing the flags, we copy the result
               absval to the relevant location.  If not used, the call
               must have been for its side effects, so we test qt here
               and now.  Note that this assumes that all values
               removed by POP continue to be live.  So dead args
               *must* be removed with CLEAR, not by POPping them into
               a dummy tempreg.
            */
            res_used = False;
            for (j = i+1; cb_in->instrs[j].opcode != CALLM_E; j++) {
               uu = & cb_in->instrs[j];
               if (uu->opcode != POP) continue;
               /* Cast via a temp. */
               uInstr2(cb, MOV, 4, TempReg, qt, TempReg, qtt);
               create_PCast(cb, 0, uu->size, qtt);
               uInstr2(cb, MOV, 4, TempReg, qtt,
                                   TempReg, SHADOW(uu->val1));
               res_used = True;
            }
            if (u_in->flags_w != FlagsEmpty) {
               create_PUTVF(cb, 0, qt);
               res_used = True;
            }
            if (!res_used) {
               uInstr1(cb, TESTV, 0, TempReg, qt);
               /* qt should never be referred to again.  Nevertheless
                  ... */
               uInstr1(cb, SETV, 0, TempReg, qt);
            }
            VG_(copyUInstr)(cb, u_in);
            break;
         }
         /* Whew ... */

         case JMP:
            if (u_in->tag1 == TempReg) {
               uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val1));
               uInstr1(cb, SETV, 4, TempReg, SHADOW(u_in->val1));
            } else {
               vg_assert(u_in->tag1 == Literal);
            }
            if (u_in->cond != CondAlways) {
               vg_assert(u_in->flags_r != FlagsEmpty);
               qt = create_GETVF(cb, 0);
               uInstr1(cb, TESTV, 0, TempReg, qt);
               /* qt should never be referred to again.  Nevertheless
                  ... */
               uInstr1(cb, SETV, 0, TempReg, qt);
            }
            VG_(copyUInstr)(cb, u_in);
            break;

         case JIFZ:
            uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val1));
            uInstr1(cb, SETV, 4, TempReg, SHADOW(u_in->val1));
            VG_(copyUInstr)(cb, u_in);
            break;

         /* Emit a check on the address used.  For FPU_R, the value
            loaded into the FPU is checked at the time it is read from
            memory (see synth_fpu_mem_check_actions). */
         case FPU_R: case FPU_W:
            vg_assert(u_in->tag2 == TempReg);
            uInstr1(cb, TESTV, 4, TempReg, SHADOW(u_in->val2));
            uInstr1(cb, SETV, 4, TempReg, SHADOW(u_in->val2));
            VG_(copyUInstr)(cb, u_in);
            break;

         /* For FPU insns not referencing memory, just copy thru. */
         case FPU:
            VG_(copyUInstr)(cb, u_in);
            break;

         default:
            VG_(ppUInstr)(0, u_in);
            VG_(panic)( "vg_instrument: unhandled case");

      } /* end of switch (u_in->opcode) */

   } /* end of for loop */

   VG_(freeCodeBlock)(cb_in);
   return cb;
}

/*------------------------------------------------------------*/
/*--- Clean up mem check instrumentation.                  ---*/
/*------------------------------------------------------------*/

#define VGC_IS_SHADOW(tempreg) ((tempreg % 2) == 1)
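/* (Shadow (tag) temps live at odd temp numbers; value temps at even
   ones.) */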
#define VGC_UNDEF ((UChar)100)
#define VGC_VALUE ((UChar)101)

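/* (def[] entries in vg_propagate_definedness hold a size 0, 1, 2 or 4
   when a shadow is known fully defined; VGC_UNDEF and VGC_VALUE are
   deliberately > 4, so `def[t] <= 4' tests known-definedness.) */
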
#define NOP_no_msg(uu)                                            \
   do { uu->opcode = NOP; } while (False)

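/* N.B. the following two macros implicitly use `u' and `i' from the
   enclosing function's scope. */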
#define NOP_tag1_op(uu)                                           \
   do { uu->opcode = NOP;                                         \
        if (VG_(disassemble))                                     \
           VG_(printf)("at %d: delete %s due to defd arg\n",      \
                       i, VG_(nameOfTagOp(u->val3)));             \
   } while (False)

#define SETV_tag1_op(uu,newsz)                                    \
   do { uu->opcode = SETV;                                        \
        uu->size = newsz;                                         \
        uu->tag2 = uu->tag3 = NoValue;                            \
        if (VG_(disassemble))                                     \
           VG_(printf)("at %d: convert %s to SETV%d "             \
                       "due to defd arg\n",                       \
                       i, VG_(nameOfTagOp(u->val3)), newsz);      \
   } while (False)


/* Run backwards and delete SETVs on shadow temps for which the next
   action is a write.  Needs an env saying whether or not the next
   action is a write.  The supplied UCodeBlock is destructively
   modified.
*/
static void vg_delete_redundant_SETVs ( UCodeBlock* cb )
{
   Bool*   next_is_write;
   Int     i, j, k, n_temps;
   UInstr* u;
   TempUse tempUse[3];

   n_temps = cb->nextTemp;
   if (n_temps == 0) return;

   next_is_write = VG_(jitmalloc)(n_temps * sizeof(Bool));

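   /* At the end of the block every temp is dead, so the "next action"
      on each is vacuously a write. */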
   for (i = 0; i < n_temps; i++) next_is_write[i] = True;

   for (i = cb->used-1; i >= 0; i--) {
      u = &cb->instrs[i];

      /* If we're not checking address V bits, there will be a lot of
         GETVs, TAG1s and TAG2s calculating values which are never
         used.  These first three cases get rid of them. */

      if (u->opcode == GETV && VGC_IS_SHADOW(u->val2)
          && next_is_write[u->val2]
          && !VG_(clo_check_addrVs)) {
         u->opcode = NOP;
         u->size = 0;
         if (VG_(disassemble))
            VG_(printf)("at %d: delete GETV\n", i);
      } else

      if (u->opcode == TAG1 && VGC_IS_SHADOW(u->val1)
          && next_is_write[u->val1]
          && !VG_(clo_check_addrVs)) {
         u->opcode = NOP;
         u->size = 0;
         if (VG_(disassemble))
            VG_(printf)("at %d: delete TAG1\n", i);
      } else

      if (u->opcode == TAG2 && VGC_IS_SHADOW(u->val2)
          && next_is_write[u->val2]
          && !VG_(clo_check_addrVs)) {
         u->opcode = NOP;
         u->size = 0;
         if (VG_(disassemble))
            VG_(printf)("at %d: delete TAG2\n", i);
      } else

      /* We do the rest of these regardless of whether or not
         addresses are V-checked. */

      if (u->opcode == MOV && VGC_IS_SHADOW(u->val2)
          && next_is_write[u->val2]) {
         /* This MOV is pointless because the target is dead at this
            point.  Delete it. */
         u->opcode = NOP;
         u->size = 0;
         if (VG_(disassemble))
            VG_(printf)("at %d: delete MOV\n", i);
      } else

      if (u->opcode == SETV) {
         if (u->tag1 == TempReg) {
            vg_assert(VGC_IS_SHADOW(u->val1));
            if (next_is_write[u->val1]) {
               /* This write is pointless, so annul it. */
               u->opcode = NOP;
               u->size = 0;
               if (VG_(disassemble))
                  VG_(printf)("at %d: delete SETV\n", i);
            } else {
               /* This write has a purpose; don't annul it, but do
                  notice that we did it. */
               next_is_write[u->val1] = True;
            }

         }

      } else {
         /* Find out what this insn does to the temps. */
         k = getTempUsage(u, &tempUse[0]);
         vg_assert(k <= 3);
         for (j = k-1; j >= 0; j--) {
            next_is_write[ tempUse[j].tempNo ]
               = tempUse[j].isWrite;
         }
      }

   }

   VG_(jitfree)(next_is_write);
}


/* Run forwards, propagating and using the is-completely-defined
   property.  This removes a lot of redundant tag-munging code.
   Unfortunately it requires intimate knowledge of how each uinstr and
   tagop modifies its arguments.  This duplicates knowledge of uinstr
   tempreg uses embodied in getTempUsage(), which is unfortunate.
   The supplied UCodeBlock* is modified in-place.

   For each value temp, def[] should hold VGC_VALUE.

   For each shadow temp, def[] may hold 4,2,1 or 0 iff that shadow is
   definitely known to be fully defined at that size.  In all other
   circumstances a shadow's def[] entry is VGC_UNDEF, meaning possibly
   undefined.  In cases of doubt, VGC_UNDEF is always safe.
*/
static void vg_propagate_definedness ( UCodeBlock* cb )
{
   UChar*  def;
   Int     i, j, k, t, n_temps;
   UInstr* u;
   TempUse tempUse[3];

   n_temps = cb->nextTemp;
   if (n_temps == 0) return;

   def = VG_(jitmalloc)(n_temps * sizeof(UChar));
   for (i = 0; i < n_temps; i++)
      def[i] = VGC_IS_SHADOW(i) ? VGC_UNDEF : VGC_VALUE;

   /* Run forwards, detecting and using the all-defined property. */

   for (i = 0; i < cb->used; i++) {
      u = &cb->instrs[i];
      switch (u->opcode) {

         /* Tag-handling uinstrs. */

         /* Deal with these quickly. */
         case NOP:
         case INCEIP:
            break;

         /* Make a tag defined. */
         case SETV:
            vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
            def[u->val1] = u->size;
            break;

         /* Check definedness of a tag. */
         case TESTV:
            vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
            if (def[u->val1] <= 4) {
               vg_assert(def[u->val1] == u->size);
               NOP_no_msg(u);
               if (VG_(disassemble))
                  VG_(printf)("at %d: delete TESTV on defd arg\n", i);
            }
            break;

         /* Applies to both values and tags.  Propagate Definedness
            property through copies.  Note that this isn't optional;
            we *have* to do this to keep def[] correct. */
         case MOV:
            vg_assert(u->tag2 == TempReg);
            if (u->tag1 == TempReg) {
               if (VGC_IS_SHADOW(u->val1)) {
                  vg_assert(VGC_IS_SHADOW(u->val2));
                  def[u->val2] = def[u->val1];
               }
            }
            break;

         case PUTV:
            vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
            if (def[u->val1] <= 4) {
               vg_assert(def[u->val1] == u->size);
               u->tag1 = Literal;
               u->val1 = 0;
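               /* V bits use 0 == defined.  For sub-word sizes, the
                  unused upper bytes are marked undefined (all 1s). */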
               switch (u->size) {
                  case 4: u->lit32 = 0x00000000; break;
                  case 2: u->lit32 = 0xFFFF0000; break;
                  case 1: u->lit32 = 0xFFFFFF00; break;
                  default: VG_(panic)("vg_cleanup(PUTV)");
               }
               if (VG_(disassemble))
                  VG_(printf)(
                     "at %d: propagate definedness into PUTV\n", i);
            }
            break;

         case STOREV:
            vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
            if (def[u->val1] <= 4) {
               vg_assert(def[u->val1] == u->size);
               u->tag1 = Literal;
               u->val1 = 0;
               switch (u->size) {
                  case 4: u->lit32 = 0x00000000; break;
                  case 2: u->lit32 = 0xFFFF0000; break;
                  case 1: u->lit32 = 0xFFFFFF00; break;
                  default: VG_(panic)("vg_cleanup(STOREV)");
               }
               if (VG_(disassemble))
                  VG_(printf)(
                     "at %d: propagate definedness into STOREV\n", i);
            }
            break;

         /* Nothing interesting we can do with this, I think. */
         case PUTVF:
            break;

         /* Tag handling operations. */
         case TAG2:
            vg_assert(u->tag2 == TempReg && VGC_IS_SHADOW(u->val2));
            vg_assert(u->tag3 == Lit16);
            /* Ultra-paranoid "type" checking. */
            switch (u->val3) {
               case VgT_ImproveAND4_TQ: case VgT_ImproveAND2_TQ:
               case VgT_ImproveAND1_TQ: case VgT_ImproveOR4_TQ:
               case VgT_ImproveOR2_TQ:  case VgT_ImproveOR1_TQ:
                  vg_assert(u->tag1 == TempReg && !VGC_IS_SHADOW(u->val1));
                  break;
               default:
                  vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
                  break;
            }
            switch (u->val3) {
               Int sz;
               case VgT_UifU4:
                  sz = 4; goto do_UifU;
               case VgT_UifU2:
                  sz = 2; goto do_UifU;
               case VgT_UifU1:
                  sz = 1; goto do_UifU;
               case VgT_UifU0:
                  sz = 0; goto do_UifU;
               do_UifU:
                  vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
                  vg_assert(u->tag2 == TempReg && VGC_IS_SHADOW(u->val2));
                  if (def[u->val1] <= 4) {
                     /* UifU.  The first arg is defined, so result is
                        simply second arg.  Delete this operation. */
                     vg_assert(def[u->val1] == sz);
                     NOP_no_msg(u);
                     if (VG_(disassemble))
                        VG_(printf)(
                           "at %d: delete UifU%d due to defd arg1\n",
                           i, sz);
                  }
                  else
                  if (def[u->val2] <= 4) {
                     /* UifU.  The second arg is defined, so result is
                        simply first arg.  Copy to second. */
                     vg_assert(def[u->val2] == sz);
                     u->opcode = MOV;
                     u->size = 4;
                     u->tag3 = NoValue;
                     def[u->val2] = def[u->val1];
                     if (VG_(disassemble))
                        VG_(printf)(
                           "at %d: change UifU%d to MOV due to defd"
                           " arg2\n",
                           i, sz);
                  }
                  break;
               case VgT_ImproveAND4_TQ:
                  sz = 4; goto do_ImproveAND;
               case VgT_ImproveAND1_TQ:
                  sz = 1; goto do_ImproveAND;
               do_ImproveAND:
                  /* Implements Q = T OR Q.  So if Q is entirely defined,
                     ie all 0s, we get MOV T, Q. */
                  if (def[u->val2] <= 4) {
                     vg_assert(def[u->val2] == sz);
                     u->size = 4; /* Regardless of sz */
                     u->opcode = MOV;
                     u->tag3 = NoValue;
                     def[u->val2] = VGC_UNDEF;
                     if (VG_(disassemble))
                        VG_(printf)(
                           "at %d: change ImproveAND%d_TQ to MOV due "
                           "to defd arg2\n",
                           i, sz);
                  }
                  break;
               default:
                  goto unhandled;
            }
            break;

         case TAG1:
            vg_assert(u->tag1 == TempReg && VGC_IS_SHADOW(u->val1));
            if (def[u->val1] > 4) break;
            /* We now know that the arg to the op is entirely defined.
               If the op changes the size of the arg, we must replace
               it with a SETV at the new size.  If it doesn't change
               the size, we can delete it completely. */
            switch (u->val3) {
               /* Maintain the same size ... */
               case VgT_Left4:
                  vg_assert(def[u->val1] == 4);
                  NOP_tag1_op(u);
                  break;
               case VgT_PCast11:
                  vg_assert(def[u->val1] == 1);
                  NOP_tag1_op(u);
                  break;
               /* Change size ... */
               case VgT_PCast40:
                  vg_assert(def[u->val1] == 4);
                  SETV_tag1_op(u,0);
                  def[u->val1] = 0;
                  break;
               case VgT_PCast14:
                  vg_assert(def[u->val1] == 1);
                  SETV_tag1_op(u,4);
                  def[u->val1] = 4;
                  break;
               case VgT_PCast12:
                  vg_assert(def[u->val1] == 1);
                  SETV_tag1_op(u,2);
                  def[u->val1] = 2;
                  break;
               case VgT_PCast10:
                  vg_assert(def[u->val1] == 1);
                  SETV_tag1_op(u,0);
                  def[u->val1] = 0;
                  break;
               case VgT_PCast02:
                  vg_assert(def[u->val1] == 0);
                  SETV_tag1_op(u,2);
                  def[u->val1] = 2;
                  break;
               default:
                  goto unhandled;
            }
            if (VG_(disassemble))
               VG_(printf)(
                  "at %d: delete TAG1 %s due to defd arg\n",
                  i, VG_(nameOfTagOp(u->val3)));
            break;

         default:
         unhandled:
            /* We don't know how to handle this uinstr.  Be safe, and
               set to VGC_VALUE or VGC_UNDEF all temps written by it. */
            k = getTempUsage(u, &tempUse[0]);
            vg_assert(k <= 3);
            for (j = 0; j < k; j++) {
               t = tempUse[j].tempNo;
               vg_assert(t >= 0 && t < n_temps);
               if (!tempUse[j].isWrite) {
                  /* t is read; ignore it. */
                  if (0 && VGC_IS_SHADOW(t) && def[t] <= 4)
                     VG_(printf)("ignoring def %d at %s %s\n",
                                 def[t],
                                 VG_(nameUOpcode)(True, u->opcode),
                                 (u->opcode == TAG1 || u->opcode == TAG2)
                                    ? VG_(nameOfTagOp)(u->val3)
                                    : (Char*)"");
               } else {
                  /* t is written; better nullify it. */
                  def[t] = VGC_IS_SHADOW(t) ? VGC_UNDEF : VGC_VALUE;
               }
            }
      }
   }

   VG_(jitfree)(def);
}


/* Top level post-instrumentation cleanup function. */
static void vg_cleanup ( UCodeBlock* cb )
{
   vg_propagate_definedness ( cb );
   vg_delete_redundant_SETVs ( cb );
}


/*------------------------------------------------------------*/
/*--- Main entry point for the JITter.                     ---*/
/*------------------------------------------------------------*/

/* Translate the basic block beginning at orig_addr, placing the
   translation in a vg_malloc'd block, the address and size of which
   are returned in trans_addr and trans_size.  Length of the original
   block is also returned in orig_size.  If the latter three are NULL,
   this call is being done for debugging purposes, in which case (a)
   throw away the translation once it is made, and (b) produce a load
   of debugging output.
*/
void VG_(translate) ( ThreadState* tst,
                      /* Identity of thread needing this block */
                      Addr  orig_addr,
                      UInt* orig_size,
                      Addr* trans_addr,
                      UInt* trans_size )
{
   Int         n_disassembled_bytes, final_code_size;
   Bool        debugging_translation;
   UChar*      final_code;
   UCodeBlock* cb;

   VGP_PUSHCC(VgpTranslate);
   debugging_translation
      = orig_size == NULL || trans_addr == NULL || trans_size == NULL;

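   /* Debug-output toggle; note that the first assignment is
      immediately overridden by the second. */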
   dis = True;
   dis = debugging_translation;

   /* Check if we're being asked to jump to a silly address, and if so
      record an error message before potentially crashing the entire
      system. */
   if (VG_(clo_instrument) && !debugging_translation && !dis) {
      Addr bad_addr;
      Bool ok = VGM_(check_readable) ( orig_addr, 1, &bad_addr );
      if (!ok) {
         VG_(record_jump_error)(tst, bad_addr);
      }
   }

   /* if (VG_(overall_in_count) >= 4800) dis=True; */
   if (VG_(disassemble))
      VG_(printf)("\n");
   if (0 || dis
       || (VG_(overall_in_count) > 0 &&
           (VG_(overall_in_count) % 1000 == 0))) {
      if (0 && (VG_(clo_verbosity) > 1 || dis))
         VG_(message)(Vg_UserMsg,
                      "trans# %d, bb# %lu, in %d, out %d, addr %x",
                      VG_(overall_in_count),
                      VG_(bbs_done),
                      VG_(overall_in_osize), VG_(overall_in_tsize),
                      orig_addr );
   }
   cb = VG_(allocCodeBlock)();

   /* Disassemble this basic block into cb. */
   VGP_PUSHCC(VgpToUCode);
   n_disassembled_bytes = VG_(disBB) ( cb, orig_addr );
   VGP_POPCC;
   /* dis=True; */
   /* if (0&& VG_(translations_done) < 617) */
   /*    dis=False; */
   /* Try and improve the code a bit. */
   if (VG_(clo_optimise)) {
      VGP_PUSHCC(VgpImprove);
      vg_improve ( cb );
      if (VG_(disassemble))
         VG_(ppUCodeBlock) ( cb, "Improved code:" );
      VGP_POPCC;
   }
   /* dis=False; */
   /* Add instrumentation code. */
   if (VG_(clo_instrument)) {
      VGP_PUSHCC(VgpInstrument);
      cb = vg_instrument(cb);
      VGP_POPCC;
      if (VG_(disassemble))
         VG_(ppUCodeBlock) ( cb, "Instrumented code:" );
      if (VG_(clo_cleanup)) {
         VGP_PUSHCC(VgpCleanup);
         vg_cleanup(cb);
         VGP_POPCC;
         if (VG_(disassemble))
            VG_(ppUCodeBlock) ( cb, "Cleaned-up instrumented code:" );
      }
   }

   //VG_(disassemble) = True;

   /* Add cache simulation code. */
   if (VG_(clo_cachesim)) {
      VGP_PUSHCC(VgpCacheInstrument);
      cb = VG_(cachesim_instrument)(cb, orig_addr);
      VGP_POPCC;
      if (VG_(disassemble))
         VG_(ppUCodeBlock) ( cb, "Cachesim instrumented code:" );
   }

   //VG_(disassemble) = False;

   /* Allocate registers. */
   VGP_PUSHCC(VgpRegAlloc);
   cb = vg_do_register_allocation ( cb );
   VGP_POPCC;
   /* dis=False; */
   /*
   if (VG_(disassemble))
      VG_(ppUCodeBlock) ( cb, "After Register Allocation:");
   */

   VGP_PUSHCC(VgpFromUcode);
   /* NB final_code is allocated with VG_(jitmalloc), not VG_(malloc)
      and so must be VG_(jitfree)'d. */
   final_code = VG_(emit_code)(cb, &final_code_size );
   VGP_POPCC;
   VG_(freeCodeBlock)(cb);

   if (debugging_translation) {
      /* Only done for debugging -- throw away final result. */
      VG_(jitfree)(final_code);
   } else {
      /* Doing it for real -- return values to caller. */
      *orig_size = n_disassembled_bytes;
      *trans_addr = (Addr)final_code;
      *trans_size = final_code_size;
   }
   VGP_POPCC;
}

/*--------------------------------------------------------------------*/
/*--- end                                           vg_translate.c ---*/
/*--------------------------------------------------------------------*/