/*--------------------------------------------------------------------*/
/*--- The JITter proper: register allocation & code improvement   ---*/
/*---                                               vg_translate.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, an extensible x86 protected-mode
   emulator for monitoring program execution on x86-Unixes.

   Copyright (C) 2000-2004 Julian Seward
      jseward@acm.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "vg_include.h"

/*------------------------------------------------------------*/
/*--- Renamings of frequently-used global functions.       ---*/
/*------------------------------------------------------------*/

#define dis VG_(print_codegen)

/*------------------------------------------------------------*/
/*--- Reg-alloc stats                                      ---*/
/*------------------------------------------------------------*/

static UInt n_uinstrs_prealloc;            // # uinstrs input to reg-alloc
static UInt n_uinstrs_spill;               // # uinstrs added due to spill code
static UInt n_translations_needing_spill;  // # bbs requiring spill code
static UInt n_total_reg_rank;  // total of register ranks over all translations

void VG_(print_reg_alloc_stats)(void)
{
   VG_(message)(Vg_DebugMsg,
                "reg-alloc: %d t-req-spill, "
                "%d+%d orig+spill uis, %d total-reg-r.",
                n_translations_needing_spill,
                n_uinstrs_prealloc, n_uinstrs_spill, n_total_reg_rank );
}

/*------------------------------------------------------------*/
/*--- Basics                                               ---*/
/*------------------------------------------------------------*/

#define VG_IS_FLAG_SUBSET(set1,set2) \
   (( ((FlagSet)set1) & ((FlagSet)set2) ) == ((FlagSet)set1) )

#define VG_UNION_FLAG_SETS(set1,set2) \
   ( ((FlagSet)set1) | ((FlagSet)set2) )
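
/* A worked example of the two macros above (illustrative comment, not
   part of the original source): for rd = FlagsOC and wr = FlagsOSZACP,
   VG_IS_FLAG_SUBSET(rd, wr) holds since (rd & wr) == rd, whereas
   VG_IS_FLAG_SUBSET(wr, rd) does not; and VG_UNION_FLAG_SETS(rd, wr)
   == wr, since the union folds the smaller set into the larger. */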

/* This one is called by the core */
UCodeBlock* VG_(alloc_UCodeBlock) ( void )
{
   UCodeBlock* cb = VG_(arena_malloc)(VG_AR_CORE, sizeof(UCodeBlock));
   cb->used = cb->size = cb->nextTemp = 0;
   cb->instrs = NULL;
   return cb;
}

/* This one is called by tools */
UCodeBlock* VG_(setup_UCodeBlock) ( UCodeBlock* cb_in )
{
   UCodeBlock* cb = VG_(arena_malloc)(VG_AR_CORE, sizeof(UCodeBlock));
   cb->orig_eip = cb_in->orig_eip;
   cb->used = cb->size = 0;
   cb->nextTemp = cb_in->nextTemp;
   cb->instrs = NULL;
   return cb;
}

void VG_(free_UCodeBlock) ( UCodeBlock* cb )
{
   if (cb->instrs) VG_(arena_free)(VG_AR_CORE, cb->instrs);
   VG_(arena_free)(VG_AR_CORE, cb);
}


/* Ensure there's enough space in a block to add one uinstr. */
static
void ensureUInstr ( UCodeBlock* cb )
{
   if (cb->used == cb->size) {
      if (cb->instrs == NULL) {
         vg_assert(cb->size == 0);
         vg_assert(cb->used == 0);
         cb->size = 8;
         cb->instrs = VG_(arena_malloc)(VG_AR_CORE, 8 * sizeof(UInstr));
      } else {
         Int i;
         UInstr* instrs2 = VG_(arena_malloc)(VG_AR_CORE,
                                             2 * sizeof(UInstr) * cb->size);
         for (i = 0; i < cb->used; i++)
            instrs2[i] = cb->instrs[i];
         cb->size *= 2;
         VG_(arena_free)(VG_AR_CORE, cb->instrs);
         cb->instrs = instrs2;
      }
   }

   vg_assert(cb->used < cb->size);
}
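
/* Note on the growth policy above (added comment, not in the original
   source): capacity starts at 8 uinstrs and doubles on exhaustion, so
   building a block of n uinstrs costs O(n) amortised copies. */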

__inline__
void VG_(new_NOP) ( UInstr* u )
{
   u->val1 = u->val2 = u->val3 = 0;
   u->tag1 = u->tag2 = u->tag3 = NoValue;
   u->flags_r = u->flags_w = FlagsEmpty;
   u->jmpkind = JmpBoring;
   u->signed_widen = u->has_ret_val = False;
   u->regs_live_after = ALL_RREGS_LIVE;
   u->lit32 = 0;
   u->opcode = NOP;
   u->size = 0;
   u->cond = 0;
   u->extra4b = 0;
   u->argc = u->regparms_n = 0;
}

/* Add an instruction to a ucode block. */
__inline__
void VG_(new_UInstr3) ( UCodeBlock* cb, Opcode opcode, Int sz,
                        Tag tag1, UInt val1,
                        Tag tag2, UInt val2,
                        Tag tag3, UInt val3 )
{
   UInstr* ui;
   ensureUInstr(cb);
   ui = & cb->instrs[cb->used];
   cb->used++;
   VG_(new_NOP)(ui);
   ui->val1   = val1;
   ui->val2   = val2;
   ui->val3   = val3;
   ui->opcode = opcode;
   ui->tag1   = tag1;
   ui->tag2   = tag2;
   ui->tag3   = tag3;
   ui->size   = sz;
   if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
   if (tag2 == TempReg) vg_assert(val2 != INVALID_TEMPREG);
   if (tag3 == TempReg) vg_assert(val3 != INVALID_TEMPREG);
}


__inline__
void VG_(new_UInstr2) ( UCodeBlock* cb, Opcode opcode, Int sz,
                        Tag tag1, UInt val1,
                        Tag tag2, UInt val2 )
{
   UInstr* ui;
   ensureUInstr(cb);
   ui = & cb->instrs[cb->used];
   cb->used++;
   VG_(new_NOP)(ui);
   ui->val1   = val1;
   ui->val2   = val2;
   ui->opcode = opcode;
   ui->tag1   = tag1;
   ui->tag2   = tag2;
   ui->size   = sz;
   if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
   if (tag2 == TempReg) vg_assert(val2 != INVALID_TEMPREG);
}


__inline__
void VG_(new_UInstr1) ( UCodeBlock* cb, Opcode opcode, Int sz,
                        Tag tag1, UInt val1 )
{
   UInstr* ui;
   ensureUInstr(cb);
   ui = & cb->instrs[cb->used];
   cb->used++;
   VG_(new_NOP)(ui);
   ui->val1   = val1;
   ui->opcode = opcode;
   ui->tag1   = tag1;
   ui->size   = sz;
   if (tag1 == TempReg) vg_assert(val1 != INVALID_TEMPREG);
}


__inline__
void VG_(new_UInstr0) ( UCodeBlock* cb, Opcode opcode, Int sz )
{
   UInstr* ui;
   ensureUInstr(cb);
   ui = & cb->instrs[cb->used];
   cb->used++;
   VG_(new_NOP)(ui);
   ui->opcode = opcode;
   ui->size   = sz;
}

/* Copy an instruction into the given codeblock. */
__inline__
void VG_(copy_UInstr) ( UCodeBlock* cb, UInstr* instr )
{
   ensureUInstr(cb);
   cb->instrs[cb->used] = *instr;
   cb->used++;
}

/* Set the lit32 field of the most recent uinsn. */
void VG_(set_lit_field) ( UCodeBlock* cb, UInt lit32 )
{
   LAST_UINSTR(cb).lit32 = lit32;
}


/* Set the C call info fields of the most recent uinsn. */
void VG_(set_ccall_fields) ( UCodeBlock* cb, Addr fn, UChar argc, UChar
                             regparms_n, Bool has_ret_val )
{
   vg_assert(argc < 4);
   vg_assert(regparms_n <= argc);
   LAST_UINSTR(cb).lit32       = fn;
   LAST_UINSTR(cb).argc        = argc;
   LAST_UINSTR(cb).regparms_n  = regparms_n;
   LAST_UINSTR(cb).has_ret_val = has_ret_val;
}
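
/* Illustrative sketch (not part of the original source): how a tool
   might emit a call to a hypothetical helper 'helper_fn' taking one
   TempReg argument passed in a register, with no return value:

      VG_(new_UInstr1)(cb, CCALL, 0, TempReg, t_arg);
      VG_(set_ccall_fields)(cb, (Addr) &helper_fn, 1, 1, False);
*/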

/* For the last uinsn inserted into cb, set the read, written and
   undefined flags.  Undefined flags are counted as written, but it
   seems worthwhile to distinguish them.
*/
__inline__
void VG_(set_flag_fields) ( UCodeBlock* cb,
                            FlagSet rr, FlagSet ww, FlagSet uu )
{
   FlagSet uw = VG_UNION_FLAG_SETS(ww,uu);

   vg_assert(rr == (rr & FlagsALL));
   vg_assert(uw == (uw & FlagsALL));
   LAST_UINSTR(cb).flags_r = rr;
   LAST_UINSTR(cb).flags_w = uw;
}

void VG_(set_cond_field) ( UCodeBlock* cb, Condcode cond )
{
   LAST_UINSTR(cb).cond = cond;
}

void VG_(set_widen_fields) ( UCodeBlock* cb, UInt szs, Bool is_signed )
{
   LAST_UINSTR(cb).extra4b      = szs;
   LAST_UINSTR(cb).signed_widen = is_signed;
}


Bool VG_(any_flag_use) ( UInstr* u )
{
   return (u->flags_r != FlagsEmpty
           || u->flags_w != FlagsEmpty);
}

#if 1
#  define BEST_ALLOC_ORDER
#endif

/* Convert a rank in the range 0 .. VG_MAX_REALREGS-1 into an Intel
   register number.  This effectively defines the order in which real
   registers are allocated.  %ebp is excluded since it is permanently
   reserved for pointing at VG_(baseBlock).

   Important!  This function must correspond with the value of
   VG_MAX_REALREGS (actually, VG_MAX_REALREGS can be reduced without
   a problem, except the generated code will obviously be worse).
*/
__inline__
Int VG_(rank_to_realreg) ( Int rank )
{
   switch (rank) {
#     ifdef BEST_ALLOC_ORDER
      /* Probably the best allocation ordering. */
      case 0: return R_EAX;
      case 1: return R_EBX;
      case 2: return R_ECX;
      case 3: return R_EDX;
      case 4: return R_ESI;
      case 5: return R_EDI;
#     else
      /* Contrary; probably the worst.  Helpful for debugging, tho. */
      case 5: return R_EAX;
      case 4: return R_EBX;
      case 3: return R_ECX;
      case 2: return R_EDX;
      case 1: return R_ESI;
      case 0: return R_EDI;
#     endif
      default: VG_(core_panic)("VG_(rank_to_realreg)");
   }
}

/* Convert an Intel register number into a rank in the range 0 ..
   VG_MAX_REALREGS-1.  See related comments for rank_to_realreg()
   above. */
__inline__
Int VG_(realreg_to_rank) ( Int realReg )
{
   switch (realReg) {
#     ifdef BEST_ALLOC_ORDER
      case R_EAX: return 0;
      case R_EBX: return 1;
      case R_ECX: return 2;
      case R_EDX: return 3;
      case R_ESI: return 4;
      case R_EDI: return 5;
#     else
      case R_EAX: return 5;
      case R_EBX: return 4;
      case R_ECX: return 3;
      case R_EDX: return 2;
      case R_ESI: return 1;
      case R_EDI: return 0;
#     endif
      default: VG_(core_panic)("VG_(realreg_to_rank)");
   }
}
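
/* Sanity property of the two conversions above (illustrative comment,
   not part of the original source): whichever allocation order is
   compiled in, they are mutual inverses over 0 .. VG_MAX_REALREGS-1:

      Int r;
      for (r = 0; r < VG_MAX_REALREGS; r++)
         vg_assert(VG_(realreg_to_rank)(VG_(rank_to_realreg)(r)) == r);
*/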

/*------------------------------------------------------------*/
/*--- Sanity checking uinstrs.                             ---*/
/*------------------------------------------------------------*/

/* This seems as good a place as any to record some important stuff
   about ucode semantics.

   * TempRegs are 32 bits wide.  LOADs of 8/16 bit values into a
     TempReg are defined to zero-extend the loaded value to 32 bits.
     This is needed to make the translation of movzbl et al work
     properly.

   * Similarly, GETs of 8/16 bit ArchRegs are zero-extended.

   * Arithmetic on TempRegs is at the specified size.  For example,
     SUBW t1, t2 has to result in a real 16 bit x86 subtraction
     being emitted -- not a 32 bit one.

   * On some insns we allow the cc bit to be set.  If so, the
     intention is that the simulated machine's %eflags register
     is copied into that of the real machine before the insn,
     and copied back again afterwards.  This means that the
     code generated for that insn must be very careful only to
     update %eflags in the intended way.  This is particularly
     important for the routines referenced by CALL insns.
*/

/* Meaning of operand kinds is as follows:

   ArchReg  is a register of the simulated CPU, stored in memory,
            in vg_m_state.m_eax .. m_edi.  These values are stored
            using the Intel register encoding.

   RealReg  is a register of the real CPU.  There are VG_MAX_REALREGS
            available for allocation.  As with ArchRegs, these values
            are stored using the Intel register encoding.

   TempReg  is a temporary register used to express the results of
            disassembly.  There is an unlimited supply of them --
            register allocation and spilling eventually assigns them
            to RealRegs.

   SpillNo  is a spill slot number.  The number of required spill
            slots is VG_MAX_PSEUDOS, in general.  Only allowed
            as the ArchReg operand of GET and PUT.

   Lit16    is a signed 16-bit literal value.

   Literal  is a 32-bit literal value.  Each uinstr can only hold
            one of these.

   The disassembled code is expressed purely in terms of ArchReg,
   TempReg and Literal operands.  Eventually, register allocation
   removes all the TempRegs, giving a result using ArchRegs, RealRegs,
   and Literals.  New x86 code can easily be synthesised from this.
   There are carefully designed restrictions on which insns can have
   which operands, intended to make it possible to generate x86 code
   from the result of register allocation on the ucode efficiently and
   without need of any further RealRegs.

   Restrictions for the individual UInstrs are clear from the checks below.
   Abbreviations: A=ArchReg   S=SpillNo   T=TempReg   L=Literal
                  Ls=Lit16    R=RealReg   N=NoValue
                  As=ArchRegS

   Before register allocation, S operands should not appear anywhere.
   After register allocation, all T operands should have been
   converted into Rs, and S operands are allowed in GET and PUT --
   denoting spill saves/restores.

   Before liveness analysis, save_e[acd]x fields should all be True.
   Afterwards, they may be False.

   The size field should be 0 for insns for which it is meaningless,
   ie those which do not directly move/operate on data.
*/
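
/* Illustrative sketch (not from the original source): a plausible ucode
   rendering of the x86 insn "addl %eax, %ebx" before register
   allocation, using the operand kinds above; the temp numbers are
   hypothetical:

      GETL    %EAX, t0
      GETL    %EBX, t2
      ADDL    t0, t2        (flags written: OSZACP)
      PUTL    t2, %EBX
      INCEIPo $2
*/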
Bool VG_(saneUInstr) ( Bool beforeRA, Bool beforeLiveness, UInstr* u )
{
#  define LIT0 (u->lit32 == 0)
#  define LIT8 (((u->lit32) & 0xFFFFFF00) == 0)
#  define LIT1 (!(LIT0))
#  define LITm (u->tag1 == Literal ? True : LIT0 )
#  define SZ16 (u->size == 16)
#  define SZ8  (u->size == 8)
#  define SZ4  (u->size == 4)
#  define SZ2  (u->size == 2)
#  define SZ1  (u->size == 1)
#  define SZ0  (u->size == 0)
#  define SZ42 (u->size == 4 || u->size == 2)
#  define SZ48 (u->size == 4 || u->size == 8)
#  define SZ416 (u->size == 4 || u->size == 16)
#  define SZ816 (u->size == 8 || u->size == 16)
#  define SZsse2 (u->size == 4 || u->size == 8 || u->size == 16 || u->size == 512)
#  define SZsse3 (u->size == 4 || u->size == 8 || u->size == 16)
#  define SZi  (u->size == 4 || u->size == 2 || u->size == 1)
#  define SZf  (  u->size ==  4 || u->size ==  8 || u->size ==   2 \
               || u->size == 10 || u->size == 28 || u->size == 108)
#  define SZ4m ((u->tag1 == TempReg || u->tag1 == RealReg) \
                ? (u->size == 4) : SZi)

/* For these ones, two cases:
 *
 * 1. They are transliterations of the corresponding x86 instruction, in
 *    which case they should have its flags (except that redundant write
 *    flags can be annulled by the optimisation pass).
 *
 * 2. They are being used generally for other purposes, eg. helping with a
 *    'rep'-prefixed instruction, in which case they should have empty flags.
 */
#  define emptyR (u->flags_r == FlagsEmpty)
#  define emptyW (u->flags_w == FlagsEmpty)
#  define CC0 (emptyR && emptyW)
#  define CCr (u->flags_r == FlagsALL && emptyW)
#  define CCw (emptyR &&  u->flags_w == FlagsALL)
#  define CCa (emptyR && (u->flags_w == FlagsOSZACP || emptyW))
#  define CCc (emptyR && (u->flags_w == FlagsOC     || emptyW))
#  define CCe (emptyR && (u->flags_w == FlagsOSZAP  || emptyW))
#  define CCb ((u->flags_r==FlagC       || emptyR) && \
               (u->flags_w==FlagsOSZACP || emptyW))
#  define CCd ((u->flags_r==FlagC   || emptyR) && \
               (u->flags_w==FlagsOC || emptyW))
#  define CCf (CC0 || (emptyR && u->flags_w==FlagsZCP) \
                   || (u->flags_r==FlagsZCP && emptyW))
#  define CCg ((u->flags_r==FlagsOSZACP || emptyR) && emptyW)
#  define CCj (u->cond==CondAlways ? CC0 : CCg)

#  define TR1 (beforeRA ? (u->tag1 == TempReg) : (u->tag1 == RealReg))
#  define TR2 (beforeRA ? (u->tag2 == TempReg) : (u->tag2 == RealReg))
#  define TR3 (beforeRA ? (u->tag3 == TempReg) : (u->tag3 == RealReg))
#  define A1  (u->tag1 == ArchReg)
#  define A2  (u->tag2 == ArchReg)
#  define AS1 ((u->tag1 == ArchReg) || ((!beforeRA && (u->tag1 == SpillNo))))
#  define AS2 ((u->tag2 == ArchReg) || ((!beforeRA && (u->tag2 == SpillNo))))
#  define AS3 ((u->tag3 == ArchReg) || ((!beforeRA && (u->tag3 == SpillNo))))
#  define L1  (u->tag1 == Literal && u->val1 == 0)
#  define L2  (u->tag2 == Literal && u->val2 == 0)
#  define Ls1 (u->tag1 == Lit16)
#  define Ls2 (u->tag2 == Lit16)
#  define Ls3 (u->tag3 == Lit16)
#  define TRL1 (TR1 || L1)
#  define TRAL1 (TR1 || A1 || L1)
#  define TRA1 (TR1 || A1)
#  define TRA2 (TR2 || A2)
#  define N1  (u->tag1 == NoValue)
#  define N2  (u->tag2 == NoValue)
#  define N3  (u->tag3 == NoValue)
#  define Se1 (u->tag1 == ArchRegS)
#  define Se2 (u->tag2 == ArchRegS)

#  define COND0     (u->cond == 0)
#  define EXTRA4b0  (u->extra4b == 0)
#  define EXTRA4b12 (u->extra4b == 1 || u->extra4b == 2)
#  define SG_WD0    (u->signed_widen == 0)
#  define JMPKIND0  (u->jmpkind == 0)
#  define CCALL0    (u->argc==0 && u->regparms_n==0 && u->has_ret_val==0 && \
                     ( beforeLiveness                                       \
                     ? u->regs_live_after == ALL_RREGS_LIVE                 \
                     : True ))

#  define XCONDi  (         EXTRA4b0  && SG_WD0 && JMPKIND0 && CCALL0)
#  define XLEA2   (COND0 &&              SG_WD0 && JMPKIND0 && CCALL0)
#  define XWIDEN  (COND0 && EXTRA4b12 &&           JMPKIND0 && CCALL0)
#  define XJMP    (                      SG_WD0 &&             CCALL0)
#  define XCCALL  (COND0 && EXTRA4b0  && SG_WD0 && JMPKIND0          )
#  define XOTHER  (COND0 && EXTRA4b0  && SG_WD0 && JMPKIND0 && CCALL0)

   /* 0 or 1 Literal args per UInstr */
   Int n_lits = 0;
   if (u->tag1 == Literal) n_lits++;
   if (u->tag2 == Literal) n_lits++;
   if (u->tag3 == Literal) n_lits++;
   if (n_lits > 1)
      return False;

   /* Fields not checked: val1, val2, val3 */

   switch (u->opcode) {

   /* Fields checked:     lit32   size  flags_r/w  tag1  tag2  tag3  (rest) */
   case PUTSEG: return LIT0 && SZ2  && CC0 && TR1 && Se2 &&  N3 && XOTHER;
   case GETSEG: return LIT0 && SZ2  && CC0 && Se1 && TR2 &&  N3 && XOTHER;
   case USESEG: return LIT0 && SZ0  && CC0 && TR1 && TR2 &&  N3 && XOTHER;
   case NOP:    return LIT0 && SZ0  && CC0 &&  N1 &&  N2 &&  N3 && XOTHER;
   case LOCK:   return LIT0 && SZ0  && CC0 &&  N1 &&  N2 &&  N3 && XOTHER;
   case GETF:   return LIT0 && SZ42 && CCr && TR1 &&  N2 &&  N3 && XOTHER;
   case PUTF:   return LIT0 && SZ42 && CCw && TR1 &&  N2 &&  N3 && XOTHER;
   case GET:    return LIT0 && SZi  && CC0 && AS1 && TR2 &&  N3 && XOTHER;
   case PUT:    return LIT0 && SZi  && CC0 && TR1 && AS2 &&  N3 && XOTHER;
   case LOAD:
   case STORE:  return LIT0 && SZi  && CC0 && TR1 && TR2 &&  N3 && XOTHER;
   case MOV:    return LITm && SZ4m && CC0 && TRL1 && TR2 && N3 && XOTHER;
   case CMOV:   return LIT0 && SZ4  && CCg && TR1 && TR2 &&  N3 && XCONDi;
   case WIDEN:  return LIT0 && SZ42 && CC0 && TR1 &&  N2 &&  N3 && XWIDEN;
   case JMP:    return LITm && SZ0  && CCj && TRL1 && N2 &&  N3 && XJMP;
   case CALLM:  return LIT0 && SZ0 /*any*/ && Ls1 &&  N2 &&  N3 && XOTHER;
   case CALLM_S:
   case CALLM_E:return LIT0 && SZ0  && CC0 &&  N1 &&  N2 &&  N3 && XOTHER;
   case PUSH:
   case POP:    return LIT0 && SZi  && CC0 && TR1 &&  N2 &&  N3 && XOTHER;
   case CLEAR:  return LIT0 && SZ0  && CC0 && Ls1 &&  N2 &&  N3 && XOTHER;
   case AND:
   case OR:     return LIT0 && SZi  && CCa && TR1 && TR2 &&  N3 && XOTHER;
   case MUL:    return LIT0 && SZ42 && CCa && TRA1 && TRA2 && N3 && XOTHER;
   case ADD:
   case XOR:
   case SUB:    return LITm && SZi  && CCa && TRAL1 && TR2 && N3 && XOTHER;
   case SBB:
   case ADC:    return LITm && SZi  && CCb && TRAL1 && TR2 && N3 && XOTHER;
   case SHL:
   case SHR:
   case SAR:    return LITm && SZi  && CCa && TRL1 && TR2 &&  N3 && XOTHER;
   case ROL:
   case ROR:    return LITm && SZi  && CCc && TRL1 && TR2 &&  N3 && XOTHER;
   case RCL:
   case RCR:    return LITm && SZi  && CCd && TRL1 && TR2 &&  N3 && XOTHER;
   case NOT:    return LIT0 && SZi  && CC0 && TR1 &&  N2 &&  N3 && XOTHER;
   case NEG:    return LIT0 && SZi  && CCa && TR1 &&  N2 &&  N3 && XOTHER;
   case INC:
   case DEC:    return LIT0 && SZi  && CCe && TR1 &&  N2 &&  N3 && XOTHER;
   case CC2VAL: return LIT0 && SZ1  && CCg && TR1 &&  N2 &&  N3 && XCONDi;
   case BSWAP:  return LIT0 && SZ4  && CC0 && TR1 &&  N2 &&  N3 && XOTHER;
   case JIFZ:   return LIT1 && SZ4  && CC0 && TR1 &&  L2 &&  N3 && XOTHER;
   case FPU_R:
   case FPU_W:  return LIT0 && SZf  && CC0 && Ls1 && TR2 &&  N3 && XOTHER;
   case FPU:    return LIT0 && SZ0  && CCf && Ls1 &&  N2 &&  N3 && XOTHER;
   case LEA1:   return /*any*/ SZ4  && CC0 && TR1 && TR2 &&  N3 && XOTHER;
   case LEA2:   return /*any*/ SZ4  && CC0 && TR1 && TR2 && TR3 && XLEA2;
   case INCEIP: return LIT0 && SZ0  && CC0 && Ls1 &&  N2 &&  N3 && XOTHER;
   case CCALL:  return LIT1 && SZ0  && CC0 &&
                       (u->argc > 0                   ? TR1 : N1) &&
                       (u->argc > 1                   ? TR2 : N2) &&
                       (u->argc > 2 || u->has_ret_val ? TR3 : N3) &&
                       u->regparms_n <= u->argc && XCCALL;
   /* Fields checked:          lit32   size   flags_r/w  tag1  tag2  tag3  (rest) */
   case MMX1:
   case MMX2:         return LIT0 && SZ0  && CC0 && Ls1 &&  N2 &&  N3 && XOTHER;
   case MMX3:         return LIT0 && SZ0  && CC0 && Ls1 && Ls2 &&  N3 && XOTHER;
   case MMX2_MemRd:   return LIT0 && SZ48 && CC0 && Ls1 && TR2 &&  N3 && XOTHER;
   case MMX2_MemWr:   return LIT0 && SZ48 && CC0 && Ls1 && TR2 &&  N3 && XOTHER;
   case MMX2a1_MemRd: return LIT0 && SZ8  && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case MMX2_ERegRd:  return LIT0 && SZ4  && CC0 && Ls1 && TR2 &&  N3 && XOTHER;
   case MMX2_ERegWr:  return LIT0 && SZ4  && CC0 && Ls1 && TR2 &&  N3 && XOTHER;

   /* Fields checked:          lit32   size     flags_r/w  tag1  tag2  tag3  (rest) */
   case SSE2a_MemWr:  return LIT0 && SZsse2 && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE2a_MemRd:  return LIT0 && SZsse2 && CCa && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE2a1_MemRd: return LIT0 && SZsse3 && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE2g_RegWr:  return LIT0 && SZ4    && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE2g1_RegWr: return LIT8 && SZ4    && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE2e1_RegRd: return LIT8 && SZ2    && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE3a_MemWr:  return LIT0 && SZsse3 && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE3a_MemRd:  return LIT0 && SZsse3 && CCa && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE3e_RegRd:  return LIT0 && SZ4    && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE3e_RegWr:  return LIT0 && SZ4    && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE3a1_MemRd: return LIT8 && SZ816  && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE3g_RegWr:  return LIT0 && SZ4    && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE3g1_RegWr: return LIT8 && SZ4    && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE3e1_RegRd: return LIT8 && SZ2    && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
   case SSE3:         return LIT0 && SZ0    && CCa && Ls1 && Ls2 &&  N3 && XOTHER;
   case SSE4:         return LIT0 && SZ0    && CCa && Ls1 && Ls2 &&  N3 && XOTHER;
   case SSE5:         return LIT0 && SZ0    && CC0 && Ls1 && Ls2 && Ls3 && XOTHER;
   case SSE3ag_MemRd_RegWr:
                      return         SZ48   && CC0 && TR1 && TR2 &&  N3 && XOTHER;
   default:
      if (VG_(needs).extended_UCode)
         return SK_(sane_XUInstr)(beforeRA, beforeLiveness, u);
      else {
         VG_(printf)("unhandled opcode: %u. Perhaps "
                     "VG_(needs).extended_UCode should be set?",
                     u->opcode);
         VG_(core_panic)("VG_(saneUInstr): unhandled opcode");
      }
   }
#  undef LIT0
#  undef LIT1
#  undef LIT8
#  undef LITm
#  undef SZ16
#  undef SZ8
#  undef SZ4
#  undef SZ2
#  undef SZ1
#  undef SZ0
#  undef SZ42
#  undef SZ48
#  undef SZ416
#  undef SZ816
#  undef SZsse2
#  undef SZsse3
#  undef SZi
#  undef SZf
#  undef SZ4m
#  undef emptyR
#  undef emptyW
#  undef CC0
#  undef CCr
#  undef CCw
#  undef CCa
#  undef CCb
#  undef CCc
#  undef CCd
#  undef CCe
#  undef CCf
#  undef CCg
#  undef CCj
#  undef TR1
#  undef TR2
#  undef TR3
#  undef A1
#  undef A2
#  undef AS1
#  undef AS2
#  undef AS3
#  undef L1
#  undef L2
#  undef Ls1
#  undef Ls2
#  undef Ls3
#  undef TRL1
#  undef TRAL1
#  undef TRA1
#  undef TRA2
#  undef N1
#  undef N2
#  undef N3
#  undef Se2
#  undef Se1
#  undef COND0
#  undef EXTRA4b0
#  undef EXTRA4b12
#  undef SG_WD0
#  undef JMPKIND0
#  undef CCALL0
#  undef XCONDi
#  undef XLEA2
#  undef XWIDEN
#  undef XJMP
#  undef XCCALL
#  undef XOTHER
}

void VG_(saneUCodeBlock) ( UCodeBlock* cb )
{
   Int i;

   for (i = 0; i < cb->used; i++) {
      Bool sane = VG_(saneUInstr)(True, True, &cb->instrs[i]);
      if (!sane) {
         VG_(printf)("Instruction failed sanity check:\n");
         VG_(up_UInstr)(i, &cb->instrs[i]);
      }
      vg_assert(sane);
   }
}

/* Sanity checks to do with CALLMs in UCodeBlocks. */
Bool VG_(saneUCodeBlockCalls) ( UCodeBlock* cb )
{
   Int  callm   = 0;
   Int  callm_s = 0;
   Int  callm_e = 0;
   Int  callm_ptr, calls_ptr;
   Int  i, j, t;
   Bool incall = False;

   /* Ensure the number of CALLM, CALLM_S and CALLM_E are the same. */

   for (i = 0; i < cb->used; i++) {
      switch (cb->instrs[i].opcode) {
         case CALLM:
            if (!incall) return False;
            callm++;
            break;
         case CALLM_S:
            if (incall) return False;
            incall = True;
            callm_s++;
            break;
         case CALLM_E:
            if (!incall) return False;
            incall = False;
            callm_e++;
            break;
         case PUSH: case POP: case CLEAR:
            if (!incall) return False;
            break;
         default:
            break;
      }
   }
   if (incall) return False;
   if (callm != callm_s || callm != callm_e) return False;

   /* Check the sections between CALLM_S and CALLM's.  Ensure that no
      PUSH uinsn pushes any TempReg that any other PUSH in the same
      section pushes.  Ie, check that the TempReg args to PUSHes in
      the section are unique.  If not, the instrumenter generates
      incorrect code for CALLM insns. */

   callm_ptr = 0;

  find_next_CALLM:
   /* Search for the next interval, making calls_ptr .. callm_ptr
      bracket it. */
   while (callm_ptr < cb->used
          && cb->instrs[callm_ptr].opcode != CALLM)
      callm_ptr++;
   if (callm_ptr == cb->used)
      return True;
   vg_assert(cb->instrs[callm_ptr].opcode == CALLM);

   calls_ptr = callm_ptr - 1;
   while (cb->instrs[calls_ptr].opcode != CALLM_S)
      calls_ptr--;
   vg_assert(cb->instrs[calls_ptr].opcode == CALLM_S);
   vg_assert(calls_ptr >= 0);

   /* VG_(printf)("interval from %d to %d\n", calls_ptr, callm_ptr ); */

   /* For each PUSH insn in the interval ... */
   for (i = calls_ptr + 1; i < callm_ptr; i++) {
      if (cb->instrs[i].opcode != PUSH) continue;
      t = cb->instrs[i].val1;
      /* Ensure no later PUSH insns up to callm_ptr push the same
         TempReg.  Return False if any such are found. */
      for (j = i+1; j < callm_ptr; j++) {
         if (cb->instrs[j].opcode == PUSH &&
             cb->instrs[j].val1 == t)
            return False;
      }
   }

   /* This interval is clean.  Keep going ... */
   callm_ptr++;
   goto find_next_CALLM;
}


/*------------------------------------------------------------*/
/*--- Printing uinstrs.                                    ---*/
/*------------------------------------------------------------*/

/* Global that dictates whether to print generated code at all stages */
Bool VG_(print_codegen);

Char* VG_(name_UCondcode) ( Condcode cond )
{
   switch (cond) {
      case CondO:      return "o";
      case CondNO:     return "no";
      case CondB:      return "b";
      case CondNB:     return "nb";
      case CondZ:      return "z";
      case CondNZ:     return "nz";
      case CondBE:     return "be";
      case CondNBE:    return "nbe";
      case CondS:      return "s";
      case CondNS:     return "ns";
      case CondP:      return "p";
      case CondNP:     return "np";
      case CondL:      return "l";
      case CondNL:     return "nl";
      case CondLE:     return "le";
      case CondNLE:    return "nle";
      case CondAlways: return "MP"; /* hack! */
      default: VG_(core_panic)("name_UCondcode");
   }
}


static void vg_ppFlagSet ( Char* prefix, FlagSet set )
{
   VG_(printf)("%s", prefix);
   if (set & FlagD) VG_(printf)("D");
   if (set & FlagO) VG_(printf)("O");
   if (set & FlagS) VG_(printf)("S");
   if (set & FlagZ) VG_(printf)("Z");
   if (set & FlagA) VG_(printf)("A");
   if (set & FlagC) VG_(printf)("C");
   if (set & FlagP) VG_(printf)("P");
}


static void ppTempReg ( Int tt )
{
   if ((tt & 1) == 0)
      VG_(printf)("t%d", tt);
   else
      VG_(printf)("q%d", tt-1);
}


void VG_(pp_UOperand) ( UInstr* u, Int operandNo, Int sz, Bool parens )
{
   UInt tag, val;
   switch (operandNo) {
      case 1: tag = u->tag1; val = u->val1; break;
      case 2: tag = u->tag2; val = u->val2; break;
      case 3: tag = u->tag3; val = u->val3; break;
      default: VG_(core_panic)("VG_(pp_UOperand)(1)");
   }
   if (tag == Literal) val = u->lit32;

   if (parens) VG_(printf)("(");
   switch (tag) {
      case TempReg:  ppTempReg(val); break;
      case RealReg:  VG_(printf)("%s",nameIReg(sz==0 ? 4 : sz,val)); break;
      case Literal:  VG_(printf)("$0x%x", val); break;
      case Lit16:    VG_(printf)("$0x%x", val); break;
      case NoValue:  VG_(printf)("NoValue"); break;
      case ArchReg:  VG_(printf)("%S",nameIReg(sz,val)); break;
      case ArchRegS: VG_(printf)("%S",nameSReg(val)); break;
      case SpillNo:  VG_(printf)("spill%d", val); break;
      default: VG_(core_panic)("VG_(ppUOperand)(2)");
   }
   if (parens) VG_(printf)(")");
}


Char* VG_(name_UOpcode) ( Bool upper, Opcode opc )
{
   switch (opc) {
      case ADD:   return (upper ? "ADD"   : "add");
      case ADC:   return (upper ? "ADC"   : "adc");
      case AND:   return (upper ? "AND"   : "and");
      case OR:    return (upper ? "OR"    : "or");
      case XOR:   return (upper ? "XOR"   : "xor");
      case SUB:   return (upper ? "SUB"   : "sub");
      case SBB:   return (upper ? "SBB"   : "sbb");
      case SHL:   return (upper ? "SHL"   : "shl");
      case SHR:   return (upper ? "SHR"   : "shr");
      case SAR:   return (upper ? "SAR"   : "sar");
      case ROL:   return (upper ? "ROL"   : "rol");
      case ROR:   return (upper ? "ROR"   : "ror");
      case RCL:   return (upper ? "RCL"   : "rcl");
      case RCR:   return (upper ? "RCR"   : "rcr");
      case MUL:   return (upper ? "MUL"   : "mul");
      case NOT:   return (upper ? "NOT"   : "not");
      case NEG:   return (upper ? "NEG"   : "neg");
      case INC:   return (upper ? "INC"   : "inc");
      case DEC:   return (upper ? "DEC"   : "dec");
      case BSWAP: return (upper ? "BSWAP" : "bswap");
      default:    break;
   }
   if (!upper) VG_(core_panic)("vg_name_UOpcode: invalid !upper");
   switch (opc) {
      case CALLM_S: return "CALLM_S";
      case CALLM_E: return "CALLM_E";
      case INCEIP:  return "INCEIP";
      case LEA1:    return "LEA1";
      case LEA2:    return "LEA2";
      case NOP:     return "NOP";
      case LOCK:    return "LOCK";
      case GET:     return "GET";
      case PUT:     return "PUT";
      case GETF:    return "GETF";
      case PUTF:    return "PUTF";
      case GETSEG:  return "GETSEG";
      case PUTSEG:  return "PUTSEG";
      case USESEG:  return "USESEG";
      case LOAD:    return "LD";
      case STORE:   return "ST";
      case MOV:     return "MOV";
      case CMOV:    return "CMOV";
      case WIDEN:   return "WIDEN";
      case JMP:     return "J";
      case JIFZ:    return "JIFZ";
      case CALLM:   return "CALLM";
      case CCALL:   return "CCALL";
      case PUSH:    return "PUSH";
      case POP:     return "POP";
      case CLEAR:   return "CLEAR";
      case CC2VAL:  return "CC2VAL";
      case FPU_R:   return "FPU_R";
      case FPU_W:   return "FPU_W";
      case FPU:     return "FPU";
      case MMX1:    return "MMX1";
      case MMX2:    return "MMX2";
      case MMX3:    return "MMX3";
      case MMX2_MemRd:   return "MMX2_MRd";
      case MMX2_MemWr:   return "MMX2_MWr";
      case MMX2a1_MemRd: return "MMX2a1_MRd";
      case MMX2_ERegRd:  return "MMX2_eRRd";
      case MMX2_ERegWr:  return "MMX2_eRWr";
      case SSE2a_MemWr:  return "SSE2a_MWr";
      case SSE2a_MemRd:  return "SSE2a_MRd";
      case SSE2g_RegWr:  return "SSE2g_RWr";
      case SSE2a1_MemRd: return "SSE2a1_MRd";
      case SSE2g1_RegWr: return "SSE2g1_RWr";
      case SSE2e1_RegRd: return "SSE2e1_RRd";
      case SSE3e_RegRd:  return "SSE3e_RRd";
      case SSE3e_RegWr:  return "SSE3e_RWr";
      case SSE3g_RegWr:  return "SSE3g_RWr";
      case SSE3a1_MemRd: return "SSE3a1_MRd";
      case SSE3g1_RegWr: return "SSE3g1_RWr";
      case SSE3e1_RegRd: return "SSE3e1_RRd";
      case SSE3:         return "SSE3";
      case SSE4:         return "SSE4";
      case SSE5:         return "SSE5";
      case SSE3a_MemWr:  return "SSE3a_MWr";
      case SSE3a_MemRd:  return "SSE3a_MRd";
      case SSE3ag_MemRd_RegWr: return "SSE3ag_MemRd_RegWr";
      default:
         if (VG_(needs).extended_UCode)
            return SK_(name_XUOpcode)(opc);
         else {
            VG_(printf)("unhandled opcode: %u. Perhaps "
                        "VG_(needs).extended_UCode should be set?",
                        opc);
            VG_(core_panic)("name_UOpcode: unhandled opcode");
         }
   }
}

static
void pp_realregs_liveness ( UInstr* u )
{
#  define PRINT_RREG_LIVENESS(realReg,s)                      \
      VG_(printf)( IS_RREG_LIVE(VG_(realreg_to_rank)(realReg), \
                                u->regs_live_after)            \
                      ? s : "-");

   VG_(printf)("[");
   PRINT_RREG_LIVENESS(R_EAX, "a");
   PRINT_RREG_LIVENESS(R_EBX, "b");
   PRINT_RREG_LIVENESS(R_ECX, "c");
   PRINT_RREG_LIVENESS(R_EDX, "d");
   PRINT_RREG_LIVENESS(R_ESI, "S");
   PRINT_RREG_LIVENESS(R_EDI, "D");
   VG_(printf)("]");

#  undef PRINT_RREG_LIVENESS
}

/* Ugly-print UInstr :) */
void VG_(up_UInstr) ( Int i, UInstr* u )
{
   VG_(pp_UInstr_regs)(i, u);

   VG_(printf)("opcode: %d\n", u->opcode);
   VG_(printf)("lit32: 0x%x\n", u->lit32);
   VG_(printf)("size: %d\n", u->size);
   VG_(printf)("val1,val2,val3: %d, %d, %d\n", u->val1, u->val2, u->val3);
   VG_(printf)("tag1,tag2,tag3: %d, %d, %d\n", u->tag1, u->tag2, u->tag3);
   VG_(printf)("flags_r: 0x%x\n", u->flags_r);
   VG_(printf)("flags_w: 0x%x\n", u->flags_w);
   VG_(printf)("extra4b: 0x%x\n", u->extra4b);
   VG_(printf)("cond: 0x%x\n", u->cond);
   VG_(printf)("signed_widen: %d\n", u->signed_widen);
   VG_(printf)("jmpkind: %d\n", u->jmpkind);
   VG_(printf)("argc,regparms_n: %d, %d\n", u->argc, u->regparms_n);
   VG_(printf)("has_ret_val: %d\n", u->has_ret_val);
   VG_(printf)("regs_live_after: ");
   pp_realregs_liveness(u);
   VG_(printf)("\n");
}

static
void pp_UInstrWorker ( Int instrNo, UInstr* u, Bool ppRegsLiveness )
{
   VG_(printf)("\t%4d: %s", instrNo,
               VG_(name_UOpcode)(True, u->opcode));

   // For JMP, the condition goes before the size
   if (u->opcode == JMP)
      VG_(printf)("%s", VG_(name_UCondcode)(u->cond));

   switch (u->size) {
      case 0:  VG_(printf)("o"); break;
      case 1:  VG_(printf)("B"); break;
      case 2:  VG_(printf)("W"); break;
      case 4:  VG_(printf)("L"); break;
      case 8:  VG_(printf)("Q"); break;
      case 16: VG_(printf)("QQ"); break;
      default: VG_(printf)("%d", (Int)u->size); break;
   }

   // For CC2VAL and CMOV, the condition goes after the size
   if (u->opcode == CC2VAL || u->opcode == CMOV)
      VG_(printf)("%s", VG_(name_UCondcode)(u->cond));

   // Append extra bits
   switch (u->opcode) {
   case JMP:
      switch (u->jmpkind) {
         case JmpCall:      VG_(printf)("-c");   break;
         case JmpRet:       VG_(printf)("-r");   break;
         case JmpSyscall:   VG_(printf)("-sys"); break;
         case JmpClientReq: VG_(printf)("-cli"); break;
         case JmpYield:     VG_(printf)("-yld"); break;
         default: break;
      }
      break;

   case WIDEN:
      VG_(printf)("_%c%c", VG_(toupper)(nameISize(u->extra4b)),
                           u->signed_widen?'s':'z');
   }
   VG_(printf)(" \t");

   switch (u->opcode) {

   case CALLM_S: case CALLM_E:
      break;

   case INCEIP:
      VG_(printf)("$%d", u->val1);
      break;

   case LEA2:
      VG_(printf)("%d(" , u->lit32);
      VG_(pp_UOperand)(u, 1, 4, False);
      VG_(printf)(",");
      VG_(pp_UOperand)(u, 2, 4, False);
      VG_(printf)(",%d), ", (Int)u->extra4b);
      VG_(pp_UOperand)(u, 3, 4, False);
      break;

   case LEA1:
      VG_(printf)("%d" , u->lit32);
      VG_(pp_UOperand)(u, 1, 4, True);
      VG_(printf)(", ");
      VG_(pp_UOperand)(u, 2, 4, False);
      break;

   case NOP: case LOCK:
      break;

   case FPU_W:
      VG_(printf)("0x%x:0x%x, ",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
      VG_(pp_UOperand)(u, 2, 4, True);
      break;

   case FPU_R:
      VG_(printf)("");
      VG_(pp_UOperand)(u, 2, 4, True);
      VG_(printf)(", 0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
      break;

   case FPU:
      VG_(printf)("0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
      break;

   case MMX1:
      VG_(printf)("0x%x",
                  u->val1 & 0xFF );
      break;

   case MMX2:
      VG_(printf)("0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
      break;

   case MMX3:
      VG_(printf)("0x%x:0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, u->val2 & 0xFF );
      break;

   case MMX2_ERegWr:
   case MMX2_ERegRd:
      VG_(printf)("0x%x:0x%x, ",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
      VG_(pp_UOperand)(u, 2, 4, False);
      break;

   case MMX2_MemWr:
   case MMX2_MemRd:
      VG_(printf)("0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF );
      VG_(pp_UOperand)(u, 2, 4, True);
      break;

   case MMX2a1_MemRd:
      VG_(printf)("0x%x:0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, u->val2 & 0xFF );
      VG_(pp_UOperand)(u, 3, 4, True);
      break;

   case SSE2a_MemWr:
   case SSE2a_MemRd:
   case SSE2g_RegWr:
   case SSE2g1_RegWr:
   case SSE2e1_RegRd:
      VG_(printf)("0x%x:0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, u->val2 & 0xFF );
      VG_(pp_UOperand)(u, 3, 4, True);
      break;

   case SSE2a1_MemRd:
   case SSE3a_MemWr:
   case SSE3a_MemRd:
      VG_(printf)("0x%x:0x%x:0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                  (u->val2 >> 8) & 0xFF, u->val2 & 0xFF );
      VG_(pp_UOperand)(u, 3, 4, True);
      break;

   case SSE3e_RegWr:
   case SSE3e_RegRd:
   case SSE3g_RegWr:
      VG_(printf)("0x%x:0x%x:0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                  (u->val2 >> 8) & 0xFF, u->val2 & 0xFF );
      VG_(pp_UOperand)(u, 3, 4, True);
      break;

   case SSE3g1_RegWr:
   case SSE3e1_RegRd:
   case SSE3a1_MemRd:
      VG_(printf)("0x%x:0x%x:0x%x:0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                  (u->val2 >> 8) & 0xFF, u->val2 & 0xFF,
                  u->lit32 );
      VG_(pp_UOperand)(u, 3, 4, True);
      break;

   case SSE3:
      VG_(printf)("0x%x:0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                  u->val2 & 0xFF );
      break;

   case SSE4:
      VG_(printf)("0x%x:0x%x:0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                  (u->val2 >> 8) & 0xFF, u->val2 & 0xFF );
      break;

   case SSE5:
      VG_(printf)("0x%x:0x%x:0x%x:0x%x:0x%x",
                  (u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
                  (u->val2 >> 8) & 0xFF, u->val2 & 0xFF,
                  u->val3 & 0xFF );
      break;

   case SSE3ag_MemRd_RegWr:
      VG_(printf)("0x%x(addr=", u->lit32 );
      VG_(pp_UOperand)(u, 1, 4, False);
      VG_(printf)(", dst=");
      VG_(pp_UOperand)(u, 2, 4, False);
      VG_(printf)(")");
      break;

   case GET: case PUT: case MOV: case LOAD: case STORE: case CMOV:
   case GETSEG: case PUTSEG:
      VG_(pp_UOperand)(u, 1, u->size, u->opcode==LOAD);
      VG_(printf)(", ");
      VG_(pp_UOperand)(u, 2, u->size, u->opcode==STORE);
      break;

   case JMP:
      VG_(pp_UOperand)(u, 1, u->size, False);
      if (CondAlways == u->cond) {
         /* Print x86 instruction size if filled in */
         if (0 != u->extra4b)
            VG_(printf)(" ($%u)", u->extra4b);
      }
      break;

   case GETF: case PUTF:
   case CC2VAL: case PUSH: case POP: case CLEAR: case CALLM:
   case NOT: case NEG: case INC: case DEC: case BSWAP:
      VG_(pp_UOperand)(u, 1, u->size, False);
      break;

   /* Print a "(s)" after args passed on stack */
   case CCALL:
      if (u->has_ret_val) {
         VG_(pp_UOperand)(u, 3, 0, False);
         VG_(printf)(" = ");
      }
      VG_(printf)("%p(", u->lit32);
      if (u->argc > 0) {
         VG_(pp_UOperand)(u, 1, 0, False);
         if (u->regparms_n < 1)
            VG_(printf)("(s)");
      }
      if (u->argc > 1) {
         VG_(printf)(", ");
         VG_(pp_UOperand)(u, 2, 0, False);
         if (u->regparms_n < 2)
            VG_(printf)("(s)");
      }
      if (u->argc > 2) {
         VG_(printf)(", ");
         VG_(pp_UOperand)(u, 3, 0, False);
         if (u->regparms_n < 3)
            VG_(printf)("(s)");
      }
      VG_(printf)(") ");
      break;

   case USESEG:
   case JIFZ:
   case ADD: case ADC: case AND: case OR:
   case XOR: case SUB: case SBB:
   case SHL: case SHR: case SAR:
   case ROL: case ROR: case RCL: case RCR:
   case MUL:
      VG_(pp_UOperand)(u, 1, u->size, False);
      VG_(printf)(", ");
      VG_(pp_UOperand)(u, 2, u->size, False);
      break;

   case WIDEN:
      VG_(pp_UOperand)(u, 1, u->size, False);
      break;

   default:
      if (VG_(needs).extended_UCode)
         SK_(pp_XUInstr)(u);
      else {
         VG_(printf)("unhandled opcode: %u. Perhaps "
                     "VG_(needs).extended_UCode should be set?",
                     u->opcode);
         VG_(core_panic)("pp_UInstr: unhandled opcode");
      }
   }
   if (u->flags_r != FlagsEmpty || u->flags_w != FlagsEmpty) {
      VG_(printf)(" (");
      if (u->flags_r != FlagsEmpty)
         vg_ppFlagSet("-r", u->flags_r);
      if (u->flags_w != FlagsEmpty)
         vg_ppFlagSet("-w", u->flags_w);
      VG_(printf)(")");
   }

   if (ppRegsLiveness) {
      VG_(printf)("\t\t");
      pp_realregs_liveness ( u );
   }

   VG_(printf)("\n");
}

void VG_(pp_UInstr) ( Int instrNo, UInstr* u )
{
   pp_UInstrWorker ( instrNo, u, /*ppRegsLiveness*/False );
}

void VG_(pp_UInstr_regs) ( Int instrNo, UInstr* u )
{
   pp_UInstrWorker ( instrNo, u, /*ppRegsLiveness*/True );
}

void VG_(pp_UCodeBlock) ( UCodeBlock* cb, Char* title )
{
   Int i;
   VG_(printf)("%s\n", title);
   for (i = 0; i < cb->used; i++)
      if (cb->instrs[i].opcode != NOP)
         VG_(pp_UInstr) ( i, &cb->instrs[i] );
   VG_(printf)("\n");
}


/*------------------------------------------------------------*/
/*--- uinstr helpers for register allocation               ---*/
/*--- and code improvement.                                ---*/
/*------------------------------------------------------------*/

/* Get the temp/reg use of a uinstr, parking them in an array supplied by
   the caller (regs), which is assumed to be big enough.  Return the number
   of entries.  Written regs are indicated in parallel array isWrites.
   Insns which read _and_ write a register wind up mentioning it twice.
   Entries are placed in the array in program order, so that if a reg is
   read-modified-written, it appears first as a read and then as a write.
   'tag' indicates whether we are looking at TempRegs or RealRegs.
*/
Int VG_(get_reg_usage) ( UInstr* u, Tag tag, Int* regs, Bool* isWrites )
{
#  define RD(ono)    VG_UINSTR_READS_REG(ono, regs, isWrites)
#  define WR(ono)    VG_UINSTR_WRITES_REG(ono, regs, isWrites)

   Int n = 0;
   switch (u->opcode) {
   case LEA1: RD(1); WR(2); break;
   case LEA2: RD(1); RD(2); WR(3); break;

   case SSE3a1_MemRd:
   case SSE2a1_MemRd:
   case SSE2e1_RegRd:
   case SSE3e_RegRd:
   case SSE3a_MemWr:
   case SSE3a_MemRd:
   case SSE2a_MemWr:
   case SSE3e1_RegRd:
   case SSE2a_MemRd: RD(3); break;

   case SSE2g_RegWr:
   case SSE2g1_RegWr:
   case SSE3e_RegWr:
   case SSE3g1_RegWr:
   case SSE3g_RegWr: WR(3); break;

   case SSE3ag_MemRd_RegWr: RD(1); WR(2); break;

   case MMX2a1_MemRd: RD(3); break;
   case MMX2_ERegRd: RD(2); break;
   case MMX2_ERegWr: WR(2); break;

   case SSE4: case SSE3: case SSE5:
   case MMX1: case MMX2: case MMX3:
   case NOP: case FPU: case INCEIP: case CALLM_S: case CALLM_E:
   case CLEAR: case CALLM: case LOCK: break;

   case CCALL:
      if (u->argc > 0) RD(1);
      if (u->argc > 1) RD(2);
      if (u->argc > 2) RD(3);
      if (u->has_ret_val) WR(3);
      break;

   case MMX2_MemRd: case MMX2_MemWr:
   case FPU_R: case FPU_W: RD(2); break;

   case GETSEG: WR(2); break;
   case PUTSEG: RD(1); break;

   case GETF:  WR(1); break;
   case PUTF:  RD(1); break;

   case GET:   WR(2); break;
   case PUT:   RD(1); break;
   case LOAD:  RD(1); WR(2); break;
   case STORE: RD(1); RD(2); break;
   case MOV:   RD(1); WR(2); break;

   case JMP:   RD(1); break;

   case PUSH: RD(1); break;
   case POP:  WR(1); break;

   case USESEG:
   case CMOV:
   case ADD: case ADC: case AND: case OR:
   case XOR: case SUB: case SBB:
   case MUL:
      RD(1); RD(2); WR(2); break;

   case SHL: case SHR: case SAR:
   case ROL: case ROR: case RCL: case RCR:
      RD(1); RD(2); WR(2); break;

   case NOT: case NEG: case INC: case DEC: case BSWAP:
      RD(1); WR(1); break;

   case WIDEN: RD(1); WR(1); break;

   case CC2VAL: WR(1); break;
   case JIFZ:   RD(1); break;

   default:
      if (VG_(needs).extended_UCode)
         return SK_(get_Xreg_usage)(u, tag, regs, isWrites);
      else {
         VG_(printf)("unhandled opcode: %u. Perhaps "
                     "VG_(needs).extended_UCode should be set?",
                     u->opcode);
         VG_(core_panic)("VG_(get_reg_usage): unhandled opcode");
      }
   }
   return n;

#  undef RD
#  undef WR
}


/* Change temp regs in u into real regs, as directed by the
 * temps[i]-->reals[i] mapping. */
static
void patchUInstr ( UInstr* u, Int temps[], UInt reals[], Int n_tmap )
{
   Int i;
   if (u->tag1 == TempReg) {
      for (i = 0; i < n_tmap; i++)
         if (temps[i] == u->val1) break;
      if (i == n_tmap) VG_(core_panic)("patchUInstr(1)");
      u->tag1 = RealReg;
      u->val1 = reals[i];
   }
   if (u->tag2 == TempReg) {
      for (i = 0; i < n_tmap; i++)
         if (temps[i] == u->val2) break;
      if (i == n_tmap) VG_(core_panic)("patchUInstr(2)");
      u->tag2 = RealReg;
      u->val2 = reals[i];
   }
   if (u->tag3 == TempReg) {
      for (i = 0; i < n_tmap; i++)
         if (temps[i] == u->val3) break;
      if (i == n_tmap) VG_(core_panic)("patchUInstr(3)");
      u->tag3 = RealReg;
      u->val3 = reals[i];
   }
}


/* Tedious x86-specific hack which compensates for the fact that the
   register numbers for %ah .. %dh do not correspond to those for %eax
   .. %edx.  It maps a (reg size, reg no) pair to the number of the
   containing 32-bit reg. */
static __inline__
Int containingArchRegOf ( Int sz, Int aregno )
{
   switch (sz) {
      case 4: return aregno;
      case 2: return aregno;
      case 1: return aregno >= 4 ? aregno-4 : aregno;
      default: VG_(core_panic)("containingArchRegOf");
   }
}
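
/* Illustrative comment (not from the original source): in the Intel
   8-bit encoding AL=0, CL=1, DL=2, BL=3, AH=4, CH=5, DH=6, BH=7, so
   the sz==1 case above maps e.g. %ah(4) to %eax(0) and %bh(7) to
   %ebx(3), while 2- and 4-byte registers map to themselves. */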


/* If u reads an ArchReg, return the number of the containing arch
   reg.  Otherwise return -1.  Used in redundant-PUT elimination.
   Note that this is not required for tools extending UCode because
   this happens before instrumentation. */
static
Int maybe_uinstrReadsArchReg ( UInstr* u )
{
   switch (u->opcode) {
      case GET:
      case ADD: case ADC: case AND: case OR:
      case XOR: case SUB: case SBB:
      case SHL: case SHR: case SAR: case ROL:
      case ROR: case RCL: case RCR:
      case MUL:
         if (u->tag1 == ArchReg)
            return containingArchRegOf ( u->size, u->val1 );
         else
            return -1;

      case GETF: case PUTF:
      case CALLM_S: case CALLM_E:
      case INCEIP:
      case LEA1:
      case LEA2:
      case NOP:
      case LOCK:
      case PUT:
      case LOAD:
      case STORE:
      case MOV:
      case CMOV:
      case JMP:
      case CALLM: case CLEAR: case PUSH: case POP:
      case NOT: case NEG: case INC: case DEC: case BSWAP:
      case CC2VAL:
      case JIFZ:
      case FPU: case FPU_R: case FPU_W:
      case MMX1: case MMX2: case MMX3:
      case MMX2_MemRd: case MMX2_MemWr: case MMX2a1_MemRd:
      case MMX2_ERegRd: case MMX2_ERegWr:
      case SSE2a_MemWr: case SSE2a_MemRd: case SSE2a1_MemRd:
      case SSE2g_RegWr: case SSE2g1_RegWr: case SSE2e1_RegRd:
      case SSE3a_MemWr: case SSE3a_MemRd: case SSE3a1_MemRd:
      case SSE3e_RegRd: case SSE3g_RegWr: case SSE3e_RegWr:
      case SSE3g1_RegWr: case SSE3e1_RegRd:
      case SSE4: case SSE3: case SSE5: case SSE3ag_MemRd_RegWr:
      case WIDEN:
      /* GETSEG and USESEG are to do with ArchRegS, not ArchReg */
      case GETSEG: case PUTSEG:
      case USESEG:
         return -1;

      default:
         VG_(pp_UInstr)(0,u);
         VG_(core_panic)("maybe_uinstrReadsArchReg: unhandled opcode");
   }
}

static __inline__
Bool uInstrMentionsTempReg ( UInstr* u, Int tempreg )
{
   Int  i, k;
   Int  tempUse[VG_MAX_REGS_USED];
   Bool notUsed[VG_MAX_REGS_USED];

   k = VG_(get_reg_usage) ( u, TempReg, &tempUse[0], &notUsed[0] );
   for (i = 0; i < k; i++)
      if (tempUse[i] == tempreg)
         return True;
   return False;
}


/*------------------------------------------------------------*/
/*--- ucode improvement.                                   ---*/
/*------------------------------------------------------------*/

/* Improve the code in cb by doing
   -- Redundant ArchReg-fetch elimination
   -- Redundant PUT elimination
   -- Redundant cond-code restore/save elimination
   The overall effect of these is to allow target registers to be
   cached in host registers over multiple target insns.
*/
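
/* A hypothetical before/after for PASS 1 below (illustrative comment,
   not from the original source; made-up temp numbers):

      before:  GETL %EAX, t0 ; PUTL t0, %EBX ; GETL %EAX, t6 ; INCL t6
      after:   GETL %EAX, t0 ; PUTL t0, %EBX ; NOP           ; INCL t0

   The second GET can go because %EAX is still bound to t0 and t0's
   live range has ended; later uses of t6 are renamed to t0. */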
static void vg_improve ( UCodeBlock* cb )
{
   Int     i, j, k, m, n, ar, tr, told, actual_areg;
   Int     areg_map[8];
   Bool    annul_put[8];
   Int     tempUse[VG_MAX_REGS_USED];
   Bool    isWrites[VG_MAX_REGS_USED];
   UInstr* u;
   Bool    wr;
   Int*    last_live_before;
   FlagSet future_dead_flags;

   if (dis)
      VG_(printf) ("Improvements:\n");

   if (cb->nextTemp > 0)
      last_live_before = VG_(arena_malloc) ( VG_AR_JITTER,
                                             cb->nextTemp * sizeof(Int) );
   else
      last_live_before = NULL;


   /* PASS 1: redundant GET elimination.  (Actually, more general than
      that -- eliminates redundant fetches of ArchRegs). */

   /* Find the live-range-ends for all temporaries.  Duplicates code
      in the register allocator :-( */

   for (i = 0; i < cb->nextTemp; i++) last_live_before[i] = -1;

   for (i = cb->used-1; i >= 0; i--) {
      u = &cb->instrs[i];

      k = VG_(get_reg_usage)(u, TempReg, &tempUse[0], &isWrites[0]);

      /* For each temp usage ... bwds in program order. */
      for (j = k-1; j >= 0; j--) {
         tr = tempUse[j];
         wr = isWrites[j];
         if (last_live_before[tr] == -1) {
            vg_assert(tr >= 0 && tr < cb->nextTemp);
            last_live_before[tr] = wr ? (i+1) : i;
         }
      }

   }

#  define BIND_ARCH_TO_TEMP(archreg,tempreg)          \
   { Int q;                                           \
     /* Invalidate any old binding(s) to tempreg. */  \
     for (q = 0; q < 8; q++)                          \
        if (areg_map[q] == tempreg) areg_map[q] = -1; \
     /* Add the new binding. */                       \
     areg_map[archreg] = (tempreg);                   \
   }

   /* Set up the A-reg map. */
   for (i = 0; i < 8; i++) areg_map[i] = -1;

   /* Scan insns. */
   for (i = 0; i < cb->used; i++) {
      u = &cb->instrs[i];
      if (u->opcode == GET && u->size == 4) {
         /* GET; see if it can be annulled. */
         vg_assert(u->tag1 == ArchReg);
         vg_assert(u->tag2 == TempReg);
         ar   = u->val1;
         tr   = u->val2;
         told = areg_map[ar];
         if (told != -1 && last_live_before[told] <= i) {
            /* ar already has an old mapping to told, but that runs
               out here.  Annul this GET, rename tr to told for the
               rest of the block, and extend told's live range to that
               of tr. */
            VG_(new_NOP)(u);
            n = last_live_before[tr] + 1;
            if (n > cb->used) n = cb->used;
            last_live_before[told] = last_live_before[tr];
            last_live_before[tr] = i-1;
            if (dis)
               VG_(printf)(
                  " at %2d: delete GET, rename t%d to t%d in (%d .. %d)\n",
                  i, tr, told, i+1, n-1);
            for (m = i+1; m < n; m++) {
               if (cb->instrs[m].tag1 == TempReg
                   && cb->instrs[m].val1 == tr)
                  cb->instrs[m].val1 = told;
               if (cb->instrs[m].tag2 == TempReg
                   && cb->instrs[m].val2 == tr)
                  cb->instrs[m].val2 = told;
               if (cb->instrs[m].tag3 == TempReg
                   && cb->instrs[m].val3 == tr)
                  cb->instrs[m].val3 = told;
            }
            BIND_ARCH_TO_TEMP(ar,told);
         }
         else
            BIND_ARCH_TO_TEMP(ar,tr);
      }
      else if (u->opcode == GET && u->size != 4) {
         /* Invalidate any mapping for this archreg. */
         actual_areg = containingArchRegOf ( u->size, u->val1 );
         areg_map[actual_areg] = -1;
      }
      else if (u->opcode == PUT && u->size == 4) {
         /* PUT; re-establish t -> a binding */
         vg_assert(u->tag1 == TempReg);
         vg_assert(u->tag2 == ArchReg);
         BIND_ARCH_TO_TEMP(u->val2, u->val1);
      }
      else if (u->opcode == PUT && u->size != 4) {
         /* Invalidate any mapping for this archreg. */
         actual_areg = containingArchRegOf ( u->size, u->val2 );
         areg_map[actual_areg] = -1;
      } else {

         /* see if insn has an archreg as a read operand; if so try to
            map it. */
         if (u->tag1 == ArchReg && u->size == 4
                                && areg_map[u->val1] != -1) {
            switch (u->opcode) {
               case ADD: case SUB: case AND: case OR: case XOR:
               case ADC: case SBB:
               case SHL: case SHR: case SAR: case ROL: case ROR:
               case RCL: case RCR:
               case MUL:
                  if (dis)
                     VG_(printf)(
                        " at %2d: change ArchReg %S to TempReg t%d\n",
                        i, nameIReg(4,u->val1), areg_map[u->val1]);
                  u->tag1 = TempReg;
                  u->val1 = areg_map[u->val1];
                  /* Remember to extend the live range of the TempReg,
                     if necessary. */
                  if (last_live_before[u->val1] < i)
                     last_live_before[u->val1] = i;
                  break;
               default:
                  break;
            }
         }

         /* boring insn; invalidate any mappings to temps it writes */
         k = VG_(get_reg_usage)(u, TempReg, &tempUse[0], &isWrites[0]);

         for (j = 0; j < k; j++) {
            wr = isWrites[j];
            if (!wr) continue;
            tr = tempUse[j];
            for (m = 0; m < 8; m++)
               if (areg_map[m] == tr) areg_map[m] = -1;
         }
      }

   }

#  undef BIND_ARCH_TO_TEMP

   /* PASS 2: redundant PUT elimination.  Don't annul (delay) puts of
      %ESP, since the memory check machinery always requires the
      in-memory value of %ESP to be up to date.  Although this isn't
      actually required by other analyses (cache simulation), it's
      simplest to be consistent for all end-uses. */
   for (j = 0; j < 8; j++)
      annul_put[j] = False;

   for (i = cb->used-1; i >= 0; i--) {
      u = &cb->instrs[i];
      if (u->opcode == NOP) continue;

      if (u->opcode == PUT && u->size == 4) {
         vg_assert(u->tag2 == ArchReg);
         actual_areg = containingArchRegOf ( 4, u->val2 );
         if (annul_put[actual_areg]) {
            vg_assert(actual_areg != R_ESP);
            VG_(new_NOP)(u);
            if (dis)
               VG_(printf)(" at %2d: delete PUT\n", i );
         } else {
            if (actual_areg != R_ESP)
               annul_put[actual_areg] = True;
         }
      }
      else if (u->opcode == PUT && u->size != 4) {
         actual_areg = containingArchRegOf ( u->size, u->val2 );
         annul_put[actual_areg] = False;
      }
      else if (u->opcode == JMP || u->opcode == JIFZ
               || u->opcode == CALLM) {
         for (j = 0; j < 8; j++)
            annul_put[j] = False;
      }
      else {
         /* If an instruction reads an ArchReg, the immediately
            preceding PUT cannot be annulled. */
         actual_areg = maybe_uinstrReadsArchReg ( u );
         if (actual_areg != -1)
            annul_put[actual_areg] = False;
      }
   }

   /* PASS 2a: redundant-move elimination.  Given MOV t1, t2 and t1 is
      dead after this point, annul the MOV insn and rename t2 to t1.
      Further modifies the last_live_before map. */

#  if 0
   VG_(pp_UCodeBlock)(cb, "Before MOV elimination" );
   for (i = 0; i < cb->nextTemp; i++)
      VG_(printf)("llb[t%d]=%d ", i, last_live_before[i]);
   VG_(printf)("\n");
#  endif

   for (i = 0; i < cb->used-1; i++) {
      u = &cb->instrs[i];
      if (u->opcode != MOV) continue;
      if (u->tag1 == Literal) continue;
      vg_assert(u->tag1 == TempReg);
      vg_assert(u->tag2 == TempReg);
      if (last_live_before[u->val1] == i) {
         if (dis)
            VG_(printf)(
               " at %2d: delete MOV, rename t%d to t%d in (%d .. %d)\n",
               i, u->val2, u->val1, i+1, last_live_before[u->val2] );
         for (j = i+1; j <= last_live_before[u->val2]; j++) {
            if (cb->instrs[j].tag1 == TempReg
                && cb->instrs[j].val1 == u->val2)
               cb->instrs[j].val1 = u->val1;
            if (cb->instrs[j].tag2 == TempReg
                && cb->instrs[j].val2 == u->val2)
               cb->instrs[j].val2 = u->val1;
            if (cb->instrs[j].tag3 == TempReg
                && cb->instrs[j].val3 == u->val2)
               cb->instrs[j].val3 = u->val1;
         }
         last_live_before[u->val1] = last_live_before[u->val2];
         last_live_before[u->val2] = i-1;
         VG_(new_NOP)(u);
      }
   }

   /* PASS 3: redundant condition-code restore/save elimination.
      Scan backwards from the end.  future_dead_flags records the set
      of flags which are dead at this point, that is, will be written
      before they are next read.  Earlier uinsns which write flags
      already in future_dead_flags can have their writes annulled.
   */
   future_dead_flags = FlagsEmpty;

   for (i = cb->used-1; i >= 0; i--) {
      u = &cb->instrs[i];

      /* We might never make it to insns beyond this one, so be
         conservative. */
      if (u->opcode == JIFZ || u->opcode == JMP) {
         future_dead_flags = FlagsEmpty;
         continue;
      }

      /* PUTF modifies the %EFLAGS in essentially unpredictable ways.
         For example people try to mess with bit 21 to see if CPUID
         works.  The setting may or may not actually take hold.  So we
         play safe here. */
      if (u->opcode == PUTF) {
         future_dead_flags = FlagsEmpty;
         continue;
      }

      /* We can annul the flags written by this insn if it writes a
         subset (or eq) of the set of flags known to be dead after
         this insn.  If not, just record the flags also written by
         this insn. */
      if (u->flags_w != FlagsEmpty
          && VG_IS_FLAG_SUBSET(u->flags_w, future_dead_flags)) {
         if (dis) {
            VG_(printf)(" at %2d: annul flag write ", i);
            vg_ppFlagSet("", u->flags_w);
            VG_(printf)(" due to later ");
            vg_ppFlagSet("", future_dead_flags);
            VG_(printf)("\n");
         }
         u->flags_w = FlagsEmpty;
      } else {
         future_dead_flags
            = VG_UNION_FLAG_SETS ( u->flags_w, future_dead_flags );
      }

      /* If this insn also reads flags, empty out future_dead_flags so
         as to force preceding writes not to be annulled. */
      if (u->flags_r != FlagsEmpty)
         future_dead_flags = FlagsEmpty;
   }

   if (last_live_before)
      VG_(arena_free) ( VG_AR_JITTER, last_live_before );

   if (dis) {
      VG_(printf)("\n");
      VG_(pp_UCodeBlock) ( cb, "Improved UCode:" );
   }
}
|
|
|
|
/*------------------------------------------------------------*/
/*--- %ESP-update pass                                     ---*/
/*------------------------------------------------------------*/

/* For tools that want to know about %ESP changes, this pass adds
   in the appropriate hooks.  We have to do it after the tool's
   instrumentation, so the tool doesn't have to worry about the
   CCALLs it adds in, and we must do it before register allocation
   because spilled temps make it much harder to work out the %ESP
   deltas.  Thus we have it as an extra phase between the two.

   We look for "GETL %ESP, t_ESP", then track ADDs and SUBs of
   literal values to t_ESP, and the total delta of the ADDs/SUBs.
   Then if "PUTL t_ESP, %ESP" happens, we call the helper with the
   known delta.  We also cope with "MOVL t_ESP, tX", making tX the
   new t_ESP.  If any other instruction clobbers t_ESP, we stop
   tracking it and fall back to the delta-is-unknown case.  That
   case is also used when the delta is not one of the small sizes
   handled specially.
*/
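/* For instance, a 4-byte push typically reaches this pass as
      GETL %ESP, t_ESP
      SUBL $4, t_ESP
      PUTL t_ESP, %ESP
   and, on seeing the PUT with delta == -4, the pass emits a CCALL to
   the tool's track_new_mem_stack_4 hook (or falls back to
   VG_(unknown_esp_update)) just before the PUT is copied through. */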
static
UCodeBlock* vg_ESP_update_pass(UCodeBlock* cb_in)
{
   UCodeBlock* cb;
   UInstr*     u;
   Int         delta = 0;
   UInt        t_ESP = INVALID_TEMPREG;
   Int         i;

   cb = VG_(setup_UCodeBlock)(cb_in);

   for (i = 0; i < VG_(get_num_instrs)(cb_in); i++) {
      u = VG_(get_instr)(cb_in, i);

      if (GET == u->opcode && R_ESP == u->val1) {
         t_ESP = u->val2;
         delta = 0;

      } else if (PUT == u->opcode && R_ESP == u->val2 && 4 == u->size) {

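         /* Helper macros for the cases below: DO(kind, size) calls the
            tool's track_<kind>_mem_stack_<size> hook if the tool
            defined one, and otherwise falls back to DO_GENERIC, which
            passes the new %ESP value to VG_(unknown_esp_update). */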
#        define DO_GENERIC                                        \
            if (VG_(defined_new_mem_stack)() ||                   \
                VG_(defined_die_mem_stack)()) {                   \
               uInstr1(cb, CCALL, 0, TempReg, u->val1);           \
               uCCall(cb, (Addr) VG_(unknown_esp_update),         \
                      1, 1, False);                               \
            }

#        define DO(kind, size)                                    \
            if (VG_(defined_##kind##_mem_stack_##size)()) {       \
               uInstr1(cb, CCALL, 0, TempReg, u->val1);           \
               uCCall(cb, (Addr) VG_(tool_interface).track_##kind##_mem_stack_##size, \
                      1, 1, False);                               \
            } else                                                \
               DO_GENERIC                                         \
            break

         if (u->val1 == t_ESP) {
            /* Known delta, common cases handled specially. */
            switch (delta) {
               case   0:                      break;
               case   4: DO(die, 4);
               case  -4: DO(new, 4);
               case   8: DO(die, 8);
               case  -8: DO(new, 8);
               case  12: DO(die, 12);
               case -12: DO(new, 12);
               case  16: DO(die, 16);
               case -16: DO(new, 16);
               case  32: DO(die, 32);
               case -32: DO(new, 32);
               default:  DO_GENERIC; break;
            }
         } else {
            /* Unknown delta */
            DO_GENERIC;

            /* now we know the temp that points to %ESP */
            t_ESP = u->val1;
         }
         delta = 0;

#        undef DO
#        undef DO_GENERIC

      } else if (ADD == u->opcode && Literal == u->tag1 && t_ESP == u->val2) {
         delta += u->lit32;

      } else if (SUB == u->opcode && Literal == u->tag1 && t_ESP == u->val2) {
         delta -= u->lit32;

      } else if (MOV == u->opcode && TempReg == u->tag1 && t_ESP == u->val1
                 && TempReg == u->tag2) {
         // t_ESP is transferred
         t_ESP = u->val2;

      } else {
         // Stop tracking t_ESP if it's clobbered by this instruction.
         Int  tempUse [VG_MAX_REGS_USED];
         Bool isWrites[VG_MAX_REGS_USED];
         Int  j, n = VG_(get_reg_usage)(u, TempReg, tempUse, isWrites);

         for (j = 0; j < n; j++) {
            if (tempUse[j] == t_ESP && isWrites[j])
               t_ESP = INVALID_TEMPREG;
         }
      }
      VG_(copy_UInstr) ( cb, u );
   }

   VG_(free_UCodeBlock)(cb_in);
   return cb;
}

/*------------------------------------------------------------*/
/*--- The new register allocator.                          ---*/
/*------------------------------------------------------------*/

typedef
   struct {
      /* Becomes live for the first time after this insn ... */
      Int live_after;
      /* Becomes dead for the last time before this insn ... */
      Int dead_before;
      /* The "home" spill slot, if needed.  Never changes. */
      Int spill_no;
      /* Where is it?  VG_NOTHING == in its spill slot; else in a
         real reg. */
      Int real_no;
   }
   TempInfo;


/* Take a ucode block and allocate its TempRegs to RealRegs, or put
   them in spill locations, and add spill code, if there are not
   enough real regs.  The usual register allocation deal, in short.

   Important redundancy of representation:

     real_to_temp maps real reg ranks (RRRs) to TempReg nos, or
     to VG_NOTHING if the real reg has no currently assigned TempReg.

     The .real_no field of a TempInfo gives the current RRR for
     this TempReg, or VG_NOTHING if the TempReg is currently
     in memory, in which case it is in the SpillNo denoted by
     .spill_no.

   These pieces of information (a fwds-bwds mapping, really) must
   be kept consistent!

   This allocator uses the so-called Second Chance Bin Packing
   algorithm, as described in "Quality and Speed in Linear-scan
   Register Allocation" (Traub, Holloway and Smith, ACM PLDI98,
   pp. 142-151).  It is simple and fast and remarkably good at
   minimising the amount of spill code introduced.
*/
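/* Note: real reg "ranks" (RRRs) are a renumbering 0 .. VG_MAX_REALREGS-1
   of the allocatable x86 registers; VG_(rank_to_realreg) and
   VG_(realreg_to_rank) convert between ranks and real register
   numbers. */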
static
UCodeBlock* vg_do_register_allocation ( UCodeBlock* c1 )
{
   TempInfo*   temp_info;
   Int         real_to_temp [VG_MAX_REALREGS];
   Bool        is_spill_cand[VG_MAX_REALREGS];
   Int         ss_busy_until_before[VG_MAX_SPILLSLOTS];
   Int         i, j, k, m, r, tno, max_ss_no;
   Bool        wr, defer, isRead, spill_reqd;
   UInt        realUse [VG_MAX_REGS_USED];
   Int         tempUse [VG_MAX_REGS_USED];
   Bool        isWrites[VG_MAX_REGS_USED];
   UCodeBlock* c2;

   /* Used to denote ... well, "no value" in this fn. */
#  define VG_NOTHING (-2)

   /* Initialise the TempReg info. */
   if (c1->nextTemp > 0)
      temp_info = VG_(arena_malloc)(VG_AR_JITTER,
                                    c1->nextTemp * sizeof(TempInfo) );
   else
      temp_info = NULL;

   for (i = 0; i < c1->nextTemp; i++) {
      temp_info[i].live_after  = VG_NOTHING;
      temp_info[i].dead_before = VG_NOTHING;
      temp_info[i].spill_no    = VG_NOTHING;
      /* temp_info[i].real_no is not yet relevant. */
   }

   spill_reqd = False;

   /* Scan fwds to establish live ranges. */
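   /* After this scan, each used temp's live range is the interval
      (live_after .. dead_before): it first acquires a value after
      instruction live_after, and is no longer needed at or after
      instruction dead_before. */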
   for (i = 0; i < c1->used; i++) {
      k = VG_(get_reg_usage)(&c1->instrs[i], TempReg, &tempUse[0],
                             &isWrites[0]);
      vg_assert(k >= 0 && k <= VG_MAX_REGS_USED);

      /* For each temp usage ... fwds in program order */
      for (j = 0; j < k; j++) {
         tno = tempUse[j];
         wr  = isWrites[j];
         if (wr) {
            /* Writes hold a reg live until after this insn. */
            if (temp_info[tno].live_after == VG_NOTHING)
               temp_info[tno].live_after = i;
            if (temp_info[tno].dead_before < i + 1)
               temp_info[tno].dead_before = i + 1;
         } else {
            /* First use of a tmp should be a write. */
            if (temp_info[tno].live_after == VG_NOTHING) {
               VG_(printf)("At instr %d...\n", i);
               VG_(core_panic)("First use of tmp not a write,"
                               " probably a tool instrumentation error");
            }
            /* Reads only hold it live until before this insn. */
            if (temp_info[tno].dead_before < i)
               temp_info[tno].dead_before = i;
         }
      }
   }

#  if 0
   /* Sanity check on live ranges.  Expensive but correct. */
   for (i = 0; i < c1->nextTemp; i++) {
      vg_assert( (temp_info[i].live_after == VG_NOTHING
                  && temp_info[i].dead_before == VG_NOTHING)
                 || (temp_info[i].live_after != VG_NOTHING
                     && temp_info[i].dead_before != VG_NOTHING) );
   }
#  endif

   /* Do a rank-based allocation of TempRegs to spill slot numbers.
      We put as few values as possible in spill slots, but
      nevertheless each used temp needs an assignment, just in case. */
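   /* Slots are assigned by a simple interval scan: a slot is free for
      reuse once the live range last occupying it has ended, which
      ss_busy_until_before[] keeps track of. */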
   max_ss_no = -1;

   for (i = 0; i < VG_MAX_SPILLSLOTS; i++)
      ss_busy_until_before[i] = 0;

   for (i = 0; i < c1->nextTemp; i++) {

      /* Skip this temp if it is unused. */
      if (temp_info[i].live_after == VG_NOTHING)
         continue;

      /* Find the lowest-numbered spill slot which is available at the
         start point of this interval, and assign the interval to
         it. */
      for (j = 0; j < VG_MAX_SPILLSLOTS; j++)
         if (ss_busy_until_before[j] <= temp_info[i].live_after)
            break;
      if (j == VG_MAX_SPILLSLOTS) {
         VG_(printf)("VG_MAX_SPILLSLOTS is too low; increase and recompile.\n");
         VG_(core_panic)("register allocation failed -- out of spill slots");
      }
      ss_busy_until_before[j] = temp_info[i].dead_before;
      temp_info[i].spill_no = j;
      if (j > max_ss_no)
         max_ss_no = j;
   }

   n_total_reg_rank += (max_ss_no+1);

   /* Show live ranges and assigned spill slot nos. */

   if (dis) {
      VG_(printf)("Live range assignments:\n");

      for (i = 0; i < c1->nextTemp; i++) {
         if (temp_info[i].live_after == VG_NOTHING)
            continue;
         VG_(printf)(
            " LR %d is after %d to before %d\tspillno %d\n",
            i,
            temp_info[i].live_after,
            temp_info[i].dead_before,
            temp_info[i].spill_no
         );
      }
      VG_(printf)("\n");
   }

   /* Now that we've established a spill slot number for each used
      temporary, we can go ahead and do the core of the "Second-chance
      binpacking" allocation algorithm. */
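   /* The "second chance" is that a temp evicted to its (fixed) home
      spill slot may later be reloaded into a different real register;
      only the register binding ever changes, never the slot, which
      keeps the spill/reload code simple to generate. */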
   if (dis) VG_(printf)("Register allocated UCode:\n");


   /* Resulting code goes here.  We generate it all in a forwards
      pass. */
   c2 = VG_(alloc_UCodeBlock)();
   c2->orig_eip = c1->orig_eip;

   /* At the start, no TempRegs are assigned to any real register.
      Correspondingly, all temps claim to be currently resident in
      their spill slots, as computed by the previous two passes. */
   for (i = 0; i < VG_MAX_REALREGS; i++)
      real_to_temp[i] = VG_NOTHING;
   for (i = 0; i < c1->nextTemp; i++)
      temp_info[i].real_no = VG_NOTHING;

   /* Process each insn in turn. */
   for (i = 0; i < c1->used; i++) {

      if (c1->instrs[i].opcode == NOP) continue;
      n_uinstrs_prealloc++;

#     if 0
      /* Check map consistency.  Expensive but correct. */
      for (r = 0; r < VG_MAX_REALREGS; r++) {
         if (real_to_temp[r] != VG_NOTHING) {
            tno = real_to_temp[r];
            vg_assert(tno >= 0 && tno < c1->nextTemp);
            vg_assert(temp_info[tno].real_no == r);
         }
      }
      for (tno = 0; tno < c1->nextTemp; tno++) {
         if (temp_info[tno].real_no != VG_NOTHING) {
            r = temp_info[tno].real_no;
            vg_assert(r >= 0 && r < VG_MAX_REALREGS);
            vg_assert(real_to_temp[r] == tno);
         }
      }
#     endif

      if (dis)
         VG_(pp_UInstr)(i, &c1->instrs[i]);

      /* First, free up enough real regs for this insn.  This may
         generate spill stores since we may have to evict some TempRegs
         currently in real regs.  Also generates spill loads. */

      k = VG_(get_reg_usage)(&c1->instrs[i], TempReg, &tempUse[0],
                             &isWrites[0]);
      vg_assert(k >= 0 && k <= VG_MAX_REGS_USED);

      /* For each ***different*** temp mentioned in the insn .... */
      for (j = 0; j < k; j++) {

         /* First check if the temp is mentioned again later; if so,
            ignore this mention.  We only want to process each temp
            used by the insn once, even if it is mentioned more than
            once. */
         defer = False;
         tno = tempUse[j];
         for (m = j+1; m < k; m++)
            if (tempUse[m] == tno)
               defer = True;
         if (defer)
            continue;

         /* Now we're trying to find a register for tempUse[j].
            First of all, if it already has a register assigned, we
            don't need to do anything more. */
         if (temp_info[tno].real_no != VG_NOTHING)
            continue;

         /* No luck.  The next thing to do is see if there is a
            currently unassigned register available.  If so, bag it. */
         for (r = 0; r < VG_MAX_REALREGS; r++) {
            if (real_to_temp[r] == VG_NOTHING)
               break;
         }
         if (r < VG_MAX_REALREGS) {
            real_to_temp[r]        = tno;
            temp_info[tno].real_no = r;
            continue;
         }

         /* Unfortunately, that didn't pan out either.  So we'll have
            to eject some other unfortunate TempReg into a spill slot
            in order to free up a register.  Of course, we need to be
            careful not to eject some other TempReg needed by this
            insn.

            Select r in 0 .. VG_MAX_REALREGS-1 such that
            real_to_temp[r] is not mentioned in
            tempUse[0 .. k-1], since it would be just plain
            wrong to eject some other TempReg which we need to use in
            this insn.

            It is here that it is important to make a good choice of
            register to spill.  */

         /* First, mark those regs which are not spill candidates. */
         for (r = 0; r < VG_MAX_REALREGS; r++) {
            is_spill_cand[r] = True;
            for (m = 0; m < k; m++) {
               if (real_to_temp[r] == tempUse[m]) {
                  is_spill_cand[r] = False;
                  break;
               }
            }
         }

         /* We can choose any r satisfying is_spill_cand[r].  However,
            try to make a good choice.  First, try and find r such
            that the associated TempReg is already dead. */
         for (r = 0; r < VG_MAX_REALREGS; r++) {
            if (is_spill_cand[r]
                && temp_info[real_to_temp[r]].dead_before <= i)
               goto have_spill_cand;
         }

         /* No spill cand is mapped to a dead TempReg.  Now we really
            _do_ have to generate spill code.  Choose r so that the
            next use of its associated TempReg is as far ahead as
            possible, in the hope that this will minimise the number of
            consequent reloads required.  This is a bit expensive, but
            we don't have to do it very often. */
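         /* (This is the classic furthest-next-use, a.k.a. Belady,
            eviction heuristic.) */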
         {
            Int furthest_r = VG_MAX_REALREGS;
            Int furthest   = 0;
            for (r = 0; r < VG_MAX_REALREGS; r++) {
               if (!is_spill_cand[r]) continue;
               for (m = i+1; m < c1->used; m++)
                  if (uInstrMentionsTempReg(&c1->instrs[m],
                                            real_to_temp[r]))
                     break;
               if (m > furthest) {
                  furthest   = m;
                  furthest_r = r;
               }
            }
            r = furthest_r;
            goto have_spill_cand;
         }

        have_spill_cand:
         if (r == VG_MAX_REALREGS)
            VG_(core_panic)("new reg alloc: out of registers ?!");

         /* Eject r.  Important refinement: don't bother if the
            associated TempReg is now dead. */
         vg_assert(real_to_temp[r] != VG_NOTHING);
         vg_assert(real_to_temp[r] != tno);
         temp_info[real_to_temp[r]].real_no = VG_NOTHING;
         if (temp_info[real_to_temp[r]].dead_before > i) {
            uInstr2(c2, PUT, 4,
                    RealReg, VG_(rank_to_realreg)(r),
                    SpillNo, temp_info[real_to_temp[r]].spill_no);
            n_uinstrs_spill++;
            spill_reqd = True;
            if (dis)
               VG_(pp_UInstr)(c2->used-1, &LAST_UINSTR(c2));
         }

         /* Decide if tno is read. */
         isRead = False;
         for (m = 0; m < k; m++)
            if (tempUse[m] == tno && !isWrites[m])
               isRead = True;

         /* If so, generate a spill load. */
         if (isRead) {
            uInstr2(c2, GET, 4,
                    SpillNo, temp_info[tno].spill_no,
                    RealReg, VG_(rank_to_realreg)(r) );
            n_uinstrs_spill++;
            spill_reqd = True;
            if (dis)
               VG_(pp_UInstr)(c2->used-1, &LAST_UINSTR(c2));
         }

         /* Update the forwards and backwards maps. */
         real_to_temp[r]        = tno;
         temp_info[tno].real_no = r;
      }

      /* By this point, all TempRegs mentioned by the insn have been
         brought into real regs.  We now copy the insn to the output
         and use patchUInstr to convert its TempRegs into
         RealRegs. */
      for (j = 0; j < k; j++)
         realUse[j] = VG_(rank_to_realreg)(temp_info[tempUse[j]].real_no);
      VG_(copy_UInstr)(c2, &c1->instrs[i]);
      patchUInstr(&LAST_UINSTR(c2), &tempUse[0], &realUse[0], k);

      if (dis) {
         VG_(pp_UInstr)(c2->used-1, &LAST_UINSTR(c2));
         VG_(printf)("\n");
      }
   }

   if (temp_info != NULL)
      VG_(arena_free)(VG_AR_JITTER, temp_info);

   VG_(free_UCodeBlock)(c1);

   if (spill_reqd)
      n_translations_needing_spill++;

   return c2;

#  undef VG_NOTHING

}

/* Analysis records liveness of all general-use RealRegs in the UCode. */
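/* This is a single backwards pass: a register is live immediately
   before a UInstr which reads it, and dead immediately before one
   which only writes it.  Each UInstr records in regs_live_after the
   set of RealRegs live just after it, for use when the final code is
   emitted. */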
static void vg_realreg_liveness_analysis ( UCodeBlock* cb )
{
   Int      i, j, k;
   RRegSet  rregs_live;
   Int      regUse[VG_MAX_REGS_USED];
   Bool     isWrites[VG_MAX_REGS_USED];
   UInstr*  u;

   /* All regs are dead at the end of the block */
   rregs_live = ALL_RREGS_DEAD;

   for (i = cb->used-1; i >= 0; i--) {
      u = &cb->instrs[i];

      u->regs_live_after = rregs_live;

      k = VG_(get_reg_usage)(u, RealReg, &regUse[0], &isWrites[0]);

      /* For each reg usage ... bwds in program order.  A reg is live
         before this UInstr if it is read by this UInstr.
         Note that regUse[j] holds the Intel reg number, so we must
         convert it to our rank number. */
      for (j = k-1; j >= 0; j--) {
         SET_RREG_LIVENESS ( VG_(realreg_to_rank)(regUse[j]),
                             rregs_live,
                             !isWrites[j] );
      }
   }
}

/*------------------------------------------------------------*/
/*--- Main entry point for the JITter.                     ---*/
/*------------------------------------------------------------*/

/* Translate the basic block beginning at orig_addr, placing the
   translation in a vg_malloc'd block, the address and size of which
   are returned in trans_addr and trans_size.  Length of the original
   block is also returned in orig_size.  If the latter three are NULL,
   this call is being done for debugging purposes, in which case (a)
   throw away the translation once it is made, and (b) produce a load
   of debugging output.

   'tid' is the identity of the thread needing this block.
*/
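/* Overview of the phases below:
     1) disassemble the x86 code into UCode     (VG_(disBB))
     2) optionally improve the UCode            (vg_improve)
     3) tool instrumentation                    (SK_(instrument)),
        plus %ESP-update hooks if the tool wants them
     4) register allocation                     (vg_do_register_allocation),
        plus RealReg liveness analysis
     5) emit final x86 code                     (VG_(emit_code)). */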
void VG_(translate) ( /*IN*/  ThreadId tid,
                      /*IN*/  Addr     orig_addr,
                      /*OUT*/ UInt*    orig_size,
                      /*OUT*/ Addr*    trans_addr,
                      /*OUT*/ UInt*    trans_size,
                      /*OUT*/ UShort   jumps[VG_MAX_JUMPS])
{
   Int         n_disassembled_bytes, final_code_size;
   Bool        debugging_translation;
   UChar*      final_code;
   UCodeBlock* cb;
   Bool        notrace_until_done;
   UInt        notrace_until_limit = 0;
   Segment*    seg;
   Addr        redir;

   VGP_PUSHCC(VgpTranslate);
   debugging_translation
      = orig_size == NULL || trans_addr == NULL || trans_size == NULL;

   /* Look in the code redirect table to see if we should
      translate an alternative address for orig_addr. */
   redir = VG_(code_redirect)(orig_addr);

   if (redir != orig_addr && VG_(clo_verbosity) >= 2)
      VG_(message)(Vg_UserMsg,
                   "TRANSLATE: %p redirected to %p",
                   orig_addr,
                   redir );
   orig_addr = redir;

   /* If codegen tracing, don't start tracing until
      notrace_until_limit blocks have gone by.  This avoids printing
      huge amounts of useless junk when all we want to see is the last
      few blocks translated prior to a failure.  Set
      notrace_until_limit to be the number of translations to be made
      before --trace-codegen= style printing takes effect. */
   notrace_until_done
      = VG_(overall_in_count) >= notrace_until_limit;

   seg = VG_(find_segment)(orig_addr);

   if (!debugging_translation)
      VG_TRACK( pre_mem_read, Vg_CoreTranslate, tid, "", orig_addr, 1 );

   if (seg == NULL ||
       !VG_(seg_contains)(seg, orig_addr, 1) ||
       (seg->prot & (VKI_PROT_READ|VKI_PROT_EXEC)) == 0) {
      /* Code address is bad - deliver a signal instead */
      vg_assert(!VG_(is_addressable)(orig_addr, 1));

      if (seg != NULL && VG_(seg_contains)(seg, orig_addr, 1)) {
         vg_assert((seg->prot & VKI_PROT_EXEC) == 0);
         VG_(synth_fault_perms)(tid, orig_addr);
      } else
         VG_(synth_fault_mapping)(tid, orig_addr);

      return;
   } else
      seg->flags |= SF_CODE;       /* contains cached code */

   cb = VG_(alloc_UCodeBlock)();
   cb->orig_eip = orig_addr;

   /* If doing any code printing, print a basic block start marker */
   if (VG_(clo_trace_codegen) && notrace_until_done) {
      Char fnname[64] = "";
      VG_(get_fnname_if_entry)(orig_addr, fnname, 64);
      VG_(printf)(
         "==== BB %d %s(%p) in %dB, out %dB, BBs exec'd %llu ====\n\n",
         VG_(overall_in_count), fnname, orig_addr,
         VG_(overall_in_osize), VG_(overall_in_tsize),
         VG_(bbs_done));
   }

   /* True if a debug trans., or if bit N set in VG_(clo_trace_codegen). */
#  define DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(n)  \
      ( debugging_translation                      \
        || (notrace_until_done                     \
            && (VG_(clo_trace_codegen) & (1 << (n-1))) ))

   /* Disassemble this basic block into cb. */
   VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(1);
   VGP_PUSHCC(VgpToUCode);
   n_disassembled_bytes = VG_(disBB) ( cb, orig_addr );
   VGP_POPCC(VgpToUCode);

   /* Try and improve the code a bit. */
   if (VG_(clo_optimise)) {
      VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(2);
      VGP_PUSHCC(VgpImprove);
      vg_improve ( cb );
      VGP_POPCC(VgpImprove);
   }

   /* Tool's instrumentation.  (Nb: must set VG_(print_codegen) in case
      SK_(instrument) looks at it.) */
   VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(3);
   VGP_PUSHCC(VgpInstrument);
   cb = SK_(instrument) ( cb, orig_addr );
   if (VG_(print_codegen))
      VG_(pp_UCodeBlock) ( cb, "Instrumented UCode:" );
   VG_(saneUCodeBlock)( cb );
   VGP_POPCC(VgpInstrument);

   /* Add %ESP-update hooks if the tool requires them */
   /* Nb: We don't print out this phase, because it doesn't do much */
   if (VG_(need_to_handle_esp_assignment)()) {
      VGP_PUSHCC(VgpESPUpdate);
      cb = vg_ESP_update_pass ( cb );
      VGP_POPCC(VgpESPUpdate);
   }

   /* Allocate registers. */
   VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(4);
   VGP_PUSHCC(VgpRegAlloc);
   cb = vg_do_register_allocation ( cb );
   VGP_POPCC(VgpRegAlloc);

   /* Do post reg-alloc %e[acd]x liveness analysis (too boring to print
      anything; results can be seen when emitting final code). */
   VGP_PUSHCC(VgpLiveness);
   vg_realreg_liveness_analysis ( cb );
   VGP_POPCC(VgpLiveness);

   /* Emit final code */
   VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(5);

   VGP_PUSHCC(VgpFromUcode);
   final_code = VG_(emit_code)(cb, &final_code_size, jumps );
   VGP_POPCC(VgpFromUcode);
   VG_(free_UCodeBlock)(cb);

#  undef DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE

   if (debugging_translation) {
      /* Only done for debugging -- throw away final result. */
      VG_(arena_free)(VG_AR_JITTER, final_code);
   } else {
      /* Doing it for real -- return values to caller. */
      *orig_size  = n_disassembled_bytes;
      *trans_addr = (Addr)final_code;
      *trans_size = final_code_size;
   }
   VGP_POPCC(VgpTranslate);
}

/*--------------------------------------------------------------------*/
/*--- end                                           vg_translate.c ---*/
/*--------------------------------------------------------------------*/