Lots of spadework for getting x86 floating point to work.

git-svn-id: svn://svn.valgrind.org/vex/trunk@165
2026-02-15 07:07:01 +00:00 · 2004-08-12 20:46:53 +00:00
parent 0046704677
commit b00968b577
13 changed files with 583 additions and 78 deletions
--- a/VEX/hacked104/vg_from_ucode.c
+++ b/VEX/hacked104/vg_from_ucode.c
@@ -810,22 +810,26 @@ static void emit_movzwl_regmem_reg ( Int reg1, Int reg2 )

 static void emit_get_fpu_state ( void )
 {
+#if 0
   Int off = 4 * VGOFF_(m_fpustate);
   newEmit();
   emitB ( 0xDD ); emitB ( 0xA5 ); /* frstor d32(%ebp) */
   emitL ( off );
   if (dis)
      VG_(printf)("\n\t\tfrstor\t%d(%%ebp)\n", off );
+#endif
 }

 static void emit_put_fpu_state ( void )
 {
+#if 0
   Int off = 4 * VGOFF_(m_fpustate);
   newEmit();
   emitB ( 0xDD ); emitB ( 0xB5 ); /* fnsave d32(%ebp) */
   emitL ( off );
   if (dis)
      VG_(printf)("\n\t\tfnsave\t%d(%%ebp)\n", off );
+#endif
 }

 static void emit_fpu_no_mem ( UChar first_byte, 
--- a/VEX/hacked104/vg_helpers.S
+++ b/VEX/hacked104/vg_helpers.S
@@ -206,9 +206,10 @@ cpuid__99:
 VG_(helper_fstsw_AX):
 	pushl	%eax
 	pushl	%esi
-	movl	VGOFF_(m_fpustate), %esi
-	frstor	(%ebp, %esi, 4)
-	fstsw	%ax
+#	movl	VGOFF_(m_fpustate), %esi
+#	frstor	(%ebp, %esi, 4)
+#	fstsw	%ax
+	movw	$0, %ax
 	popl	%esi
 	movw	%ax, 8(%esp)
 	popl	%eax
--- a/VEX/hacked104/vg_include.h
+++ b/VEX/hacked104/vg_include.h
@@ -554,10 +554,11 @@ extern void VG_(__libc_freeres_wrapper)( void );
   which need to go here to avoid ugly circularities.
   ------------------------------------------------------------------ */

-/* How big is the saved FPU state? */
-#define VG_SIZE_OF_FPUSTATE 108
 /* ... and in words ... */
-#define VG_SIZE_OF_FPUSTATE_W ((VG_SIZE_OF_FPUSTATE+3)/4)
+#define VG_SIZE_OF_FPUSTATE_W (8*2 + 1)
+
+#define VG_SIZE_OF_FPUSTATE (4 * VG_SIZE_OF_FPUSTATE_W)
+


 /* ---------------------------------------------------------------------
@@ -737,7 +738,9 @@ typedef
      UInt m_cc_dflag;

      UInt m_eip;
-      UInt m_fpu[VG_SIZE_OF_FPUSTATE_W];
+
+      ULong m_f0, m_f1, m_f2, m_f3, m_f4, m_f5, m_f6, m_f7;
+      UInt  m_ftop;

      UInt sh_eax;
      UInt sh_ebx;
@@ -1533,7 +1536,7 @@ extern void* VG_(client_realloc)  ( ThreadState* tst,
 extern UInt VG_(m_state_static) [8 /* int regs, in Intel order */ 
                                 + 1 /* %eflags */ 
                                 + 1 /* %eip */
-                                 + VG_SIZE_OF_FPUSTATE_W /* FPU state */
+                                 + (108/4) /* real FPU state */
                                ];

 /* Handy fns for doing the copy back and forth. */
@@ -1969,9 +1972,18 @@ extern Int VGOFF_(m_cc_src);
 extern Int VGOFF_(m_cc_dst);
 extern Int VGOFF_(m_cc_dflag);

-extern Int VGOFF_(m_fpustate);
 extern Int VGOFF_(m_eip);

+extern Int VGOFF_(m_f0);
+extern Int VGOFF_(m_f1);
+extern Int VGOFF_(m_f2);
+extern Int VGOFF_(m_f3);
+extern Int VGOFF_(m_f4);
+extern Int VGOFF_(m_f5);
+extern Int VGOFF_(m_f6);
+extern Int VGOFF_(m_f7);
+extern Int VGOFF_(m_ftop);
+

 /* Reg-alloc spill area (VG_MAX_SPILLSLOTS words long). */
 extern Int VGOFF_(spillslots);
--- a/VEX/hacked104/vg_main.c
+++ b/VEX/hacked104/vg_main.c
@@ -55,8 +55,18 @@ Int VGOFF_(m_cc_src) = INVALID_OFFSET;
 Int VGOFF_(m_cc_dst) = INVALID_OFFSET;
 Int VGOFF_(m_cc_dflag) = INVALID_OFFSET;

-Int VGOFF_(m_fpustate) = INVALID_OFFSET;
 Int VGOFF_(m_eip) = INVALID_OFFSET;
+
+Int VGOFF_(m_f0) = INVALID_OFFSET;
+Int VGOFF_(m_f1) = INVALID_OFFSET;
+Int VGOFF_(m_f2) = INVALID_OFFSET;
+Int VGOFF_(m_f3) = INVALID_OFFSET;
+Int VGOFF_(m_f4) = INVALID_OFFSET;
+Int VGOFF_(m_f5) = INVALID_OFFSET;
+Int VGOFF_(m_f6) = INVALID_OFFSET;
+Int VGOFF_(m_f7) = INVALID_OFFSET;
+Int VGOFF_(m_ftop) = INVALID_OFFSET;
+
 Int VGOFF_(spillslots) = INVALID_OFFSET;
 Int VGOFF_(sh_eax) = INVALID_OFFSET;
 Int VGOFF_(sh_ecx) = INVALID_OFFSET;
@@ -165,12 +175,24 @@ static void vg_init_baseBlock ( void )
   /* 6   */ VGOFF_(m_esi)     = alloc_BaB(1);
   /* 7   */ VGOFF_(m_edi)     = alloc_BaB(1);

-   /* 8   */ VGOFF_(m_cc_op)  = alloc_BaB(1);
+   /* 8   */ VGOFF_(m_cc_op)   = alloc_BaB(1);
   /* 9   */ VGOFF_(m_cc_src)  = alloc_BaB(1);
   /* 10  */ VGOFF_(m_cc_dst)  = alloc_BaB(1);
-   /* 11  */ VGOFF_(m_cc_dflag)  = alloc_BaB(1);
-   /* 12  */  VGOFF_(m_eip) = alloc_BaB(1);
+   /* 11  */ VGOFF_(m_cc_dflag)= alloc_BaB(1);

+   /* 12  */ VGOFF_(m_eip)     = alloc_BaB(1);
+
+   /* 13 */ VGOFF_(m_f0) = alloc_BaB(2);
+   /* 15 */ VGOFF_(m_f1) = alloc_BaB(2);
+   /* 17 */ VGOFF_(m_f2) = alloc_BaB(2);
+   /* 19 */ VGOFF_(m_f3) = alloc_BaB(2);
+   /* 21 */ VGOFF_(m_f4) = alloc_BaB(2);
+   /* 23 */ VGOFF_(m_f5) = alloc_BaB(2);
+   /* 25 */ VGOFF_(m_f6) = alloc_BaB(2);
+   /* 27 */ VGOFF_(m_f7) = alloc_BaB(2);
+   /* 29 */ VGOFF_(m_ftop) = alloc_BaB(1);
+
+   /* stated offsets are wrong after here */
   /* 13  */ VGOFF_(sh_eax)    = alloc_BaB(1);
   /* 14  */ VGOFF_(sh_ecx)    = alloc_BaB(1);
   /* 15  */ VGOFF_(sh_edx)    = alloc_BaB(1);
@@ -255,8 +277,6 @@ static void vg_init_baseBlock ( void )
   /* I gave up counting at this point.  Since they're way above the
      short-amode-boundary, there's no point. */

-   VGOFF_(m_fpustate) = alloc_BaB(VG_SIZE_OF_FPUSTATE_W);
-
   VGOFF_(helper_idiv_64_32)
      = alloc_BaB_1_set( (Addr) & VG_(helper_idiv_64_32) );
   VGOFF_(helper_div_64_32)
@@ -999,7 +1019,7 @@ static void process_cmd_line_options ( void )
 UInt VG_(m_state_static) [8 /* int regs, in Intel order */ 
                          + 1 /* %eflags */ 
                          + 1 /* %eip */
-                          + VG_SIZE_OF_FPUSTATE_W /* FPU state */
+                          + (108/4) /* real FPU state */
                         ];

 void VG_(copy_baseBlock_to_m_state_static) ( void )
@@ -1030,15 +1050,15 @@ void VG_(copy_baseBlock_to_m_state_static) ( void )

   VG_(m_state_static)[36/4] = VG_(baseBlock)[VGOFF_(m_eip)];

-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
+   /* Hack */
+   for (i = 0; i < (108/4); i++)
      VG_(m_state_static)[40/4 + i] 
-         = VG_(baseBlock)[VGOFF_(m_fpustate) + i];
+         = 0;
 }


 void VG_(copy_m_state_static_to_baseBlock) ( void )
 {
-   Int i;
   VG_(baseBlock)[VGOFF_(m_eax)] = VG_(m_state_static)[ 0/4];
   VG_(baseBlock)[VGOFF_(m_ecx)] = VG_(m_state_static)[ 4/4];
   VG_(baseBlock)[VGOFF_(m_edx)] = VG_(m_state_static)[ 8/4];
@@ -1055,9 +1075,20 @@ void VG_(copy_m_state_static_to_baseBlock) ( void )

   VG_(baseBlock)[VGOFF_(m_eip)] = VG_(m_state_static)[36/4];

-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
-      VG_(baseBlock)[VGOFF_(m_fpustate) + i]
-         = VG_(m_state_static)[40/4 + i];
+   /* Make the FPU register stack appear to be empty. */
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f0)]) = 0;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f1)]) = 0;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f2)]) = 0;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f3)]) = 0;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f4)]) = 0;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f5)]) = 0;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f6)]) = 0;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f7)]) = 0;
+   /* stack grows down, towards lower numbered registers, and ftop is
+      decremented prior to use when pushing.  Hence the initial value
+      should be zero, as the decrement then changes it to 7 so we end
+      up first writing %f7. */
+   VG_(baseBlock)[VGOFF_(m_ftop)] = 0;
 }


--- a/VEX/hacked104/vg_scheduler.c
+++ b/VEX/hacked104/vg_scheduler.c
@@ -373,7 +373,6 @@ ThreadId VG_(get_current_tid) ( void )
 __inline__
 void VG_(load_thread_state) ( ThreadId tid )
 {
-   Int i;
   vg_assert(vg_tid_currently_in_baseBlock == VG_INVALID_THREADID);

   VG_(baseBlock)[VGOFF_(m_eax)] = VG_(threads)[tid].m_eax;
@@ -392,8 +391,15 @@ void VG_(load_thread_state) ( ThreadId tid )

   VG_(baseBlock)[VGOFF_(m_eip)] = VG_(threads)[tid].m_eip;

-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
-      VG_(baseBlock)[VGOFF_(m_fpustate) + i] = VG_(threads)[tid].m_fpu[i];
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f0)]) = VG_(threads)[tid].m_f0;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f1)]) = VG_(threads)[tid].m_f1;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f2)]) = VG_(threads)[tid].m_f2;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f3)]) = VG_(threads)[tid].m_f3;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f4)]) = VG_(threads)[tid].m_f4;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f5)]) = VG_(threads)[tid].m_f5;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f6)]) = VG_(threads)[tid].m_f6;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f7)]) = VG_(threads)[tid].m_f7;
+   VG_(baseBlock)[VGOFF_(m_ftop)] = VG_(threads)[tid].m_ftop;

   VG_(baseBlock)[VGOFF_(sh_eax)] = VG_(threads)[tid].sh_eax;
   VG_(baseBlock)[VGOFF_(sh_ebx)] = VG_(threads)[tid].sh_ebx;
@@ -418,8 +424,8 @@ void VG_(load_thread_state) ( ThreadId tid )
 __inline__
 void VG_(save_thread_state) ( ThreadId tid )
 {
-   Int i;
-   const UInt junk = 0xDEADBEEF;
+   const UInt  junk   = 0xDEADBEEF;
+   const ULong junk64 = 0xDEADBEEFDEADBEEFLL;

   vg_assert(vg_tid_currently_in_baseBlock != VG_INVALID_THREADID);

@@ -439,8 +445,15 @@ void VG_(save_thread_state) ( ThreadId tid )

   VG_(threads)[tid].m_eip = VG_(baseBlock)[VGOFF_(m_eip)];

-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
-      VG_(threads)[tid].m_fpu[i] = VG_(baseBlock)[VGOFF_(m_fpustate) + i];
+   VG_(threads)[tid].m_f0   = *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f0)]);
+   VG_(threads)[tid].m_f1   = *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f1)]);
+   VG_(threads)[tid].m_f2   = *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f2)]);
+   VG_(threads)[tid].m_f3   = *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f3)]);
+   VG_(threads)[tid].m_f4   = *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f4)]);
+   VG_(threads)[tid].m_f5   = *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f5)]);
+   VG_(threads)[tid].m_f6   = *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f6)]);
+   VG_(threads)[tid].m_f7   = *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f7)]);
+   VG_(threads)[tid].m_ftop = VG_(baseBlock)[VGOFF_(m_ftop)];

   VG_(threads)[tid].sh_eax = VG_(baseBlock)[VGOFF_(sh_eax)];
   VG_(threads)[tid].sh_ebx = VG_(baseBlock)[VGOFF_(sh_ebx)];
@@ -467,8 +480,15 @@ void VG_(save_thread_state) ( ThreadId tid )
   VG_(baseBlock)[VGOFF_(m_cc_dflag)] = junk;
   VG_(baseBlock)[VGOFF_(m_eip)] = junk;

-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
-      VG_(baseBlock)[VGOFF_(m_fpustate) + i] = junk;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f0)]) = junk64;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f1)]) = junk64;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f2)]) = junk64;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f3)]) = junk64;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f4)]) = junk64;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f5)]) = junk64;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f6)]) = junk64;
+   *(ULong*)(&VG_(baseBlock)[VGOFF_(m_f7)]) = junk64;
+   VG_(baseBlock)[VGOFF_(m_ftop)] = junk;

   vg_tid_currently_in_baseBlock = VG_INVALID_THREADID;
 }
--- a/VEX/hacked104/vg_signals.c
+++ b/VEX/hacked104/vg_signals.c
@@ -899,7 +899,6 @@ typedef
      /* Safely-saved version of sigNo, as described above. */
      Int  sigNo_private;
      /* Saved processor state. */
-      UInt fpustate[VG_SIZE_OF_FPUSTATE_W];
      UInt eax;
      UInt ecx;
      UInt edx;
@@ -913,6 +912,8 @@ typedef
      UInt cc_src;
      UInt cc_dst;
      UInt cc_dflag;
+      ULong f0, f1, f2, f3, f4, f5, f6, f7;
+      UInt ftop;
      /* Scheduler-private stuff: what was the thread's status prior to
         delivering this signal? */
      ThreadStatus status;
@@ -930,7 +931,6 @@ typedef
 static
 void vg_push_signal_frame ( ThreadId tid, int sigNo )
 {
-   Int          i;
   Addr         esp, esp_top_of_frame;
   VgSigFrame*  frame;
   ThreadState* tst;
@@ -971,8 +971,15 @@ void vg_push_signal_frame ( ThreadId tid, int sigNo )
   frame->puContext  = (Addr)NULL;
   frame->magicPI    = 0x31415927;

-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
-      frame->fpustate[i] = tst->m_fpu[i];
+   frame->f0         = tst->m_f0;
+   frame->f1         = tst->m_f1;
+   frame->f2         = tst->m_f2;
+   frame->f3         = tst->m_f3;
+   frame->f4         = tst->m_f4;
+   frame->f5         = tst->m_f5;
+   frame->f6         = tst->m_f6;
+   frame->f7         = tst->m_f7;
+   frame->ftop       = tst->m_ftop;

   frame->eax        = tst->m_eax;
   frame->ecx        = tst->m_ecx;
@@ -1022,7 +1029,7 @@ static
 Int vg_pop_signal_frame ( ThreadId tid )
 {
   Addr          esp;
-   Int           sigNo, i;
+   Int           sigNo;
   VgSigFrame*   frame;
   ThreadState*  tst;

@@ -1042,8 +1049,15 @@ Int vg_pop_signal_frame ( ThreadId tid )
         "vg_pop_signal_frame (thread %d): valid magic", tid);

   /* restore machine state */
-   for (i = 0; i < VG_SIZE_OF_FPUSTATE_W; i++)
-      tst->m_fpu[i] = frame->fpustate[i];
+   tst->m_f0      = frame->f0;
+   tst->m_f1      = frame->f1;
+   tst->m_f2      = frame->f2;
+   tst->m_f3      = frame->f3;
+   tst->m_f4      = frame->f4;
+   tst->m_f5      = frame->f5;
+   tst->m_f6      = frame->f6;
+   tst->m_f7      = frame->f7;
+   tst->m_ftop    = frame->ftop;

   /* Mark the frame structure as nonaccessible. */
   if (VG_(clo_instrument))
--- a/VEX/hacked104/vg_translate.c
+++ b/VEX/hacked104/vg_translate.c
@@ -3121,8 +3121,8 @@ void VG_(translate) ( ThreadState* tst,
   UChar* final;
   Bool debugging_translation;

-   static Int v0thresh = 87000;
-   static Int v2thresh = 87000;
+   static Int v0thresh = 940;
+   static Int v2thresh = 940;

   TranslateResult tres;
   static Bool vex_init_done = False;
--- a/VEX/priv/guest-x86/gdefs.h
+++ b/VEX/priv/guest-x86/gdefs.h
@@ -130,8 +130,46 @@ enum {
 /* EIP */
 #define OFFB_EIP     (12*4)

+/* FPU.  For now, just simulate 8 64-bit registers and the reg-stack
+   top pointer, of which only the least significant three bits are
+   relevant.

-#define SIZEOF_X86H_STATE OFFB_EIP
+   The model is:
+     F0 .. F7 are the 8 registers.  ftop[2:0] contains the 
+     index of the current 'stack top' -- pretty meaningless, but
+     still.  
+
+     When a value is pushed onto the stack, ftop is first replaced by 
+     (ftop-1) & 7, and then F[ftop] is assigned the value.
+
+     When a value is popped off the stack, the value is read from
+     F[ftop], and then ftop is replaced by (ftop+1) & 7.
+
+     In general, a reference to a register ST(i) actually references
+     F[ (ftop+i) & 7 ].
+
+   There should be an array of 8 booleans corresponding to F0 .. F7,
+   indicating whether the corresponding F reg contains a value or not.
+
+   A read of an F reg marked empty, for any reason, elicits a stack
+   underflow fault.
+
+   A load from memory into an F reg marked full elicits a stack overflow
+   fault.  This appears to be the only way a stack overflow fault can
+   happen.
+*/
+#define OFFB_F0      (13*4)
+#define OFFB_F1      (15*4)
+#define OFFB_F2      (17*4)
+#define OFFB_F3      (19*4)
+#define OFFB_F4      (21*4)
+#define OFFB_F5      (23*4)
+#define OFFB_F6      (25*4)
+#define OFFB_F7      (27*4)
+#define OFFB_FTOP    (29*4)
+
+/* Don't forget to keep this up to date. */
+#define SIZEOF_X86H_STATE  OFFB_FTOP



--- a/VEX/priv/guest-x86/toIR.c
+++ b/VEX/priv/guest-x86/toIR.c
@@ -3076,6 +3076,189 @@ UInt dis_imul_I_E_G ( UChar       sorb,
 }   


+/*------------------------------------------------------------*/
+/*--- x87 floating point insns.                            ---*/
+/*------------------------------------------------------------*/
+
+/* Get/set the top-of-stack pointer. */
+
+static IRExpr* get_ftop ( void )
+{
+   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
+}
+
+static IRStmt* put_ftop ( IRExpr* e )
+{
+   return IRStmt_Put( OFFB_FTOP, e );
+}
+
+/* Given i, generate an expression which is the offset in the guest
+   state of ST(i), considering the current value of FTOP. */
+
+static IRExpr* off_ST ( Int i )
+{
+  vassert(i >= 0 && i <= 7);
+  return 
+     binop(Iop_Add32,
+           binop(Iop_Mul32,
+                 binop(Iop_And32, 
+                       binop(Iop_Add32, get_ftop(), mkU32(i)),
+                       mkU32(7)),
+                 mkU32(8)),
+           mkU32(OFFB_F0)
+     );
+}
+
+/* Given i, and some expression e, generate 'ST(i) = e'. */
+
+static IRStmt* put_ST ( Int i, IRExpr* value )
+{
+   return
+      IRStmt_PutI( off_ST(i), value, OFFB_F0, OFFB_F7+8-1 );
+}
+
+/* Given i, generate an expression yielding 'ST(i)'. */
+
+static IRExpr* get_ST ( Int i )
+{
+   return
+      IRExpr_GetI( off_ST(i), Ity_F64, OFFB_F0, OFFB_F7+8-1 );
+}
+
+/* Adjust FTOP downwards by one register. */
+
+static IRStmt* do_push ( void )
+{
+   return
+      put_ftop(
+         binop(Iop_And32,
+               binop(Iop_Sub32, get_ftop(), mkU32(1)),
+	       mkU32(7))
+      );
+}
+
+
+/* Decode one x87 insn whose primary opcode is 0xD8 .. 0xDF.  On entry,
+   delta points at the second byte of the insn (the modrm byte).  Sets
+   *decode_ok and returns delta; on failure delta may already have been
+   advanced past the amode, so the caller must restore its own copy. */
+static
+UInt dis_FPU ( Bool* decode_ok, UChar sorb, UInt delta )
+{
+   Int   len;
+   Char  dis_buf[32];
+   UChar first_opcode = getIByte(delta-1);
+   UChar modrm        = getIByte(delta+0);
+   UInt  opc_aux      = gregOfRM(modrm);
+
+   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
+   if (first_opcode == 0xD8) {
+      goto decode_fail;
+   }
+
+   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
+   else
+   if (first_opcode == 0xD9) {
+      goto decode_fail;
+   }
+
+   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
+   else
+   if (first_opcode == 0xDA) {
+      goto decode_fail;
+   }
+
+   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
+   else
+   if (first_opcode == 0xDB) {
+      goto decode_fail;
+   }
+
+   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
+   else
+   if (first_opcode == 0xDC) {
+      goto decode_fail;
+   }
+
+   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
+   else
+   if (first_opcode == 0xDD) {
+
+      if (modrm < 0xC0) {
+
+         /* bits 5,4,3 are an opcode extension (held in opc_aux), and
+            the modRM also specifies an address. */
+         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+         delta += len;
+
+         switch (opc_aux) {
+
+            case 0: /* FLD double-real */
+               DIP("fldD %s\n", dis_buf);
+               stmt( do_push() );
+               stmt( put_ST(0, IRExpr_LDle(Ity_F64, mkexpr(addr))) );
+               break;
+
+#if 0
+            case 2: /* FST double-real */
+               IFDB( if (dis) printf("\tfstD\t%s\n",t_addr); )
+               if (!fp_is_empty_tag(fp_get_tag_ST(0))) {
+                  vd_addr = fp_get_reg_ST(0);
+               } else {
+                  vd_addr = NAN;
+                  fp_set_stack_underflow();
+               }
+               setDMem(a_addr,vd_addr);
+               break;
+
+            case 3: /* FSTP double-real */
+               IFDB( if (dis) printf("\tfstpD\t%s\n",t_addr); )
+               if (!fp_is_empty_tag(fp_get_tag_ST(0))) {
+                  vd_addr = fp_pop();
+               } else {
+                  vd_addr = fp_pop(); /* then throw away result */
+                  vd_addr = NAN;
+                  fp_set_stack_underflow();
+               }
+               setDMem(a_addr,vd_addr);
+               break;
+#endif
+
+            default:
+               vex_printf("unhandled opc_aux = 0x%2x\n", opc_aux);
+               vex_printf("first_opcode == 0xDD\n");
+               goto decode_fail;
+         }
+      } else {
+         goto decode_fail;
+      }
+   }
+
+   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
+   else
+   if (first_opcode == 0xDE) {
+      goto decode_fail;
+   }
+
+   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
+   else
+   if (first_opcode == 0xDF) {
+      goto decode_fail;
+   }
+
+   else
+   vpanic("dis_FPU(x86): invalid primary opcode");
+
+  decode_success:
+   *decode_ok = True;
+   return delta;
+
+  decode_fail:
+   *decode_ok = False;
+   return delta;
+}
+
+
 //-- /* Handle FPU insns which read/write memory.  On entry, eip points to
 //--    the second byte of the insn (the one following D8 .. DF). */
 //-- static 
@@ -6249,16 +6432,23 @@ static UInt disInstr ( UInt delta, Bool* isEnd )
 //--       DIP("fwait\n");
 //--       break;
 //-- 
-//--    case 0xD8:
-//--    case 0xD9:
-//--    case 0xDA:
-//--    case 0xDB:
-//--    case 0xDC:
-//--    case 0xDD:
-//--    case 0xDE:
-//--    case 0xDF:
-//--       eip = dis_fpu ( cb, sorb, opc, eip );
-//--       break;
+   case 0xD8:
+   case 0xD9:
+   case 0xDA:
+   case 0xDB:
+   case 0xDC:
+   case 0xDD:
+   case 0xDE:
+   case 0xDF: {
+      UInt delta0    = delta;
+      Bool decode_OK = False;
+      delta = dis_FPU ( &decode_OK, sorb, delta );
+      if (!decode_OK) {
+         delta = delta0;
+         goto decode_failure;
+      }
+      break;
+   }

   /* ------------------------ INC & DEC ------------------ */

--- a/VEX/priv/host-x86/hdefs.c
+++ b/VEX/priv/host-x86/hdefs.c
@@ -35,7 +35,7 @@ void ppHRegX86 ( HReg reg )
         return;
      case HRcFloat:
         r = hregNumber(reg);
-         vassert(r >= 0 && r < 6);
+         vassert(r >= 0 && r < 4);
         vex_printf("%%fake%d", r);
         return;
      case HRcVector:
@@ -54,9 +54,14 @@ HReg hregX86_EBP ( void ) { return mkHReg(5, HRcInt, False); }
 HReg hregX86_ESI ( void ) { return mkHReg(6, HRcInt, False); }
 HReg hregX86_EDI ( void ) { return mkHReg(7, HRcInt, False); }

+HReg hregX86_FAKE0 ( void ) { return mkHReg(0, HRcFloat, False); }
+HReg hregX86_FAKE1 ( void ) { return mkHReg(1, HRcFloat, False); }
+HReg hregX86_FAKE2 ( void ) { return mkHReg(2, HRcFloat, False); }
+HReg hregX86_FAKE3 ( void ) { return mkHReg(3, HRcFloat, False); }
+
 void getAllocableRegs_X86 ( Int* nregs, HReg** arr )
 {
-   *nregs = 6;
+   *nregs = 10;
   *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
   (*arr)[0] = hregX86_EAX();
   (*arr)[1] = hregX86_EBX();
@@ -64,6 +69,10 @@ void getAllocableRegs_X86 ( Int* nregs, HReg** arr )
   (*arr)[3] = hregX86_EDX();
   (*arr)[4] = hregX86_ESI();
   (*arr)[5] = hregX86_EDI();
+   (*arr)[6] = hregX86_FAKE0();
+   (*arr)[7] = hregX86_FAKE1();
+   (*arr)[8] = hregX86_FAKE2();
+   (*arr)[9] = hregX86_FAKE3();
 }


@@ -399,6 +408,18 @@ Char* showX86ShiftOp ( X86ShiftOp op ) {
   }
 }

+Char* showX86FpOp ( X86FpOp op ) {
+   switch (op) {
+      case Xfp_Add:    return "add";
+      case Xfp_Sub:    return "sub";
+      case Xfp_Mul:    return "mul";
+      case Xfp_Div:    return "div";
+      case Xfp_Sqrt:   return "sqrt";
+      case Xfp_Negate: return "chs";
+      default: vpanic("ppX86FpOp");
+   }
+}
+
 X86Instr* X86Instr_Alu32R ( X86AluOp op, X86RMI* src, HReg dst ) {
   X86Instr* i       = LibVEX_Alloc(sizeof(X86Instr));
   i->tag            = Xin_Alu32R;
@@ -506,7 +527,7 @@ X86Instr* X86Instr_LoadEX ( UChar szSmall, Bool syned,
   vassert(szSmall == 1 || szSmall == 2);
   return i;
 }
-X86Instr* X86Instr_Store  ( UChar sz, HReg src, X86AMode* dst ) {
+X86Instr* X86Instr_Store ( UChar sz, HReg src, X86AMode* dst ) {
   X86Instr* i      = LibVEX_Alloc(sizeof(X86Instr));
   i->tag           = Xin_Store;
   i->Xin.Store.sz  = sz;
@@ -515,6 +536,42 @@ X86Instr* X86Instr_Store  ( UChar sz, HReg src, X86AMode* dst ) {
   vassert(sz == 1 || sz == 2);
   return i;
 }
+X86Instr* X86Instr_FpUnary ( X86FpOp op, HReg src, HReg dst ) {
+   X86Instr* i        = LibVEX_Alloc(sizeof(X86Instr));
+   i->tag             = Xin_FpUnary;
+   i->Xin.FpUnary.op  = op;
+   i->Xin.FpUnary.src = src;
+   i->Xin.FpUnary.dst = dst;
+   return i;
+}
+X86Instr* X86Instr_FpBinary ( X86FpOp op, HReg srcL, HReg srcR, HReg dst ) {
+   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
+   i->tag               = Xin_FpBinary;
+   i->Xin.FpBinary.op   = op;
+   i->Xin.FpBinary.srcL = srcL;
+   i->Xin.FpBinary.srcR = srcR;
+   i->Xin.FpBinary.dst  = dst;
+   return i;
+}
+X86Instr* X86Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, X86AMode* addr ) {
+   X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
+   i->tag               = Xin_FpLdSt;
+   i->Xin.FpLdSt.isLoad = isLoad;
+   i->Xin.FpLdSt.sz     = sz;
+   i->Xin.FpLdSt.reg    = reg;
+   i->Xin.FpLdSt.addr   = addr;
+   vassert(sz == 4 || sz == 8);
+   return i;
+}
+X86Instr* X86Instr_FpI64 ( Bool toInt, HReg freg, HReg iregHi, HReg iregLo ) {
+   X86Instr* i         = LibVEX_Alloc(sizeof(X86Instr));
+   i->tag              = Xin_FpI64;
+   i->Xin.FpI64.toInt  = toInt;
+   i->Xin.FpI64.freg   = freg;
+   i->Xin.FpI64.iregHi = iregHi;
+   i->Xin.FpI64.iregLo = iregLo;
+   return i;
+}


 void ppX86Instr ( X86Instr* i ) {
@@ -617,6 +674,19 @@ void ppX86Instr ( X86Instr* i ) {
         vex_printf(",");
         ppX86AMode(i->Xin.Store.dst);
         return;
+      case Xin_FpLdSt:
+         if (i->Xin.FpLdSt.isLoad) {
+            vex_printf("gld%c" , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
+            ppX86AMode(i->Xin.FpLdSt.addr);
+            vex_printf(", ");
+            ppHRegX86(i->Xin.FpLdSt.reg);
+         } else {
+            vex_printf("gst%c" , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
+            ppHRegX86(i->Xin.FpLdSt.reg);
+            vex_printf(", ");
+            ppX86AMode(i->Xin.FpLdSt.addr);
+         }
+         return;
      default:
         vpanic("ppX86Instr");
   }
--- a/VEX/priv/host-x86/hdefs.h
+++ b/VEX/priv/host-x86/hdefs.h
@@ -242,6 +242,17 @@ typedef
 extern Char* showX86ShiftOp ( X86ShiftOp );


+/* --------- */
+typedef
+   enum {
+      Xfp_Add, Xfp_Sub, Xfp_Mul, Xfp_Div, 
+      Xfp_Sqrt, Xfp_Negate
+   }
+   X86FpOp;
+
+extern Char* showX86FpOp ( X86FpOp );
+
+
 /* --------- */
 typedef
   enum {
@@ -258,11 +269,15 @@ typedef
      Xin_Goto,      /* conditional/unconditional jmp to dst */
      Xin_CMov32,    /* conditional move */
      Xin_LoadEX,    /* mov{s,z}{b,w}l from mem to reg */
-      Xin_Store      /* store 16/8 bit value in memory */
+      Xin_Store,     /* store 16/8 bit value in memory */
+      Xin_FpUnary,   /* FP fake unary op */
+      Xin_FpBinary,  /* FP fake binary op */
+      Xin_FpLdSt,    /* FP fake load/store */
+      Xin_FpI64      /* FP fake to/from 64-bit signed int */
   }
   X86InstrTag;

-/* Destinations are on the RIGHT (second operand). */
+/* Destinations are on the RIGHT (second operand) */

 typedef
   struct {
@@ -348,25 +363,53 @@ typedef
            HReg      src;
            X86AMode* dst;
         } Store;
-      } Xin;
+         /* X86 Floating point (fake 3-operand, "flat reg file" insns) */
+         struct {
+            X86FpOp op;
+            HReg    src;
+            HReg    dst;
+         } FpUnary;
+         struct {
+            X86FpOp op;
+            HReg    srcL;
+            HReg    srcR;
+            HReg    dst;
+         } FpBinary;
+         struct {
+            Bool      isLoad;
+            UChar     sz; /* only 4 (IEEE single) or 8 (IEEE double) */
+            HReg      reg;
+            X86AMode* addr;
+         } FpLdSt;
+         struct {
+            Bool toInt; /* True: F64->I64; False: I64->F64 */
+            HReg freg;
+            HReg iregHi;
+            HReg iregLo;
+         } FpI64;
+     } Xin;
   }
   X86Instr;

-extern X86Instr* X86Instr_Alu32R  ( X86AluOp, X86RMI*, HReg );
-extern X86Instr* X86Instr_Alu32M  ( X86AluOp, X86RI*,  X86AMode* );
-extern X86Instr* X86Instr_Unary32 ( X86UnaryOp op, X86RM* dst );
-extern X86Instr* X86Instr_Sh32    ( X86ShiftOp, UInt, X86RM* );
-extern X86Instr* X86Instr_Test32  ( X86RI* src, X86RM* dst );
-extern X86Instr* X86Instr_MulL    ( Bool syned, X86ScalarSz, X86RM* );
-extern X86Instr* X86Instr_Div     ( Bool syned, X86ScalarSz, X86RM* );
-extern X86Instr* X86Instr_Sh3232  ( X86ShiftOp, UInt amt, HReg src, HReg dst );
-extern X86Instr* X86Instr_Push    ( X86RMI* );
-extern X86Instr* X86Instr_Call    ( HReg );
-extern X86Instr* X86Instr_Goto    ( IRJumpKind, X86CondCode cond, X86RI* dst );
-extern X86Instr* X86Instr_CMov32  ( X86CondCode, X86RM* src, HReg dst );
-extern X86Instr* X86Instr_LoadEX  ( UChar szSmall, Bool syned,
-                                    X86AMode* src, HReg dst );
-extern X86Instr* X86Instr_Store   ( UChar sz, HReg src, X86AMode* dst );
+extern X86Instr* X86Instr_Alu32R   ( X86AluOp, X86RMI*, HReg );
+extern X86Instr* X86Instr_Alu32M   ( X86AluOp, X86RI*,  X86AMode* );
+extern X86Instr* X86Instr_Unary32  ( X86UnaryOp op, X86RM* dst );
+extern X86Instr* X86Instr_Sh32     ( X86ShiftOp, UInt, X86RM* );
+extern X86Instr* X86Instr_Test32   ( X86RI* src, X86RM* dst );
+extern X86Instr* X86Instr_MulL     ( Bool syned, X86ScalarSz, X86RM* );
+extern X86Instr* X86Instr_Div      ( Bool syned, X86ScalarSz, X86RM* );
+extern X86Instr* X86Instr_Sh3232   ( X86ShiftOp, UInt amt, HReg src, HReg dst );
+extern X86Instr* X86Instr_Push     ( X86RMI* );
+extern X86Instr* X86Instr_Call     ( HReg );
+extern X86Instr* X86Instr_Goto     ( IRJumpKind, X86CondCode cond, X86RI* dst );
+extern X86Instr* X86Instr_CMov32   ( X86CondCode, X86RM* src, HReg dst );
+extern X86Instr* X86Instr_LoadEX   ( UChar szSmall, Bool syned,
+                                     X86AMode* src, HReg dst );
+extern X86Instr* X86Instr_Store    ( UChar sz, HReg src, X86AMode* dst );
+extern X86Instr* X86Instr_FpUnary  ( X86FpOp op, HReg src, HReg dst );
+extern X86Instr* X86Instr_FpBinary ( X86FpOp op, HReg srcL, HReg srcR, HReg dst );
+extern X86Instr* X86Instr_FpLdSt   ( Bool isLoad, UChar sz, HReg reg, X86AMode* );
+extern X86Instr* X86Instr_FpI64    ( Bool toInt, HReg freg, HReg iregHi, HReg iregLo );

 extern void ppX86Instr ( X86Instr* );

--- a/VEX/priv/ir/irdefs.c
+++ b/VEX/priv/ir/irdefs.c
@@ -26,6 +26,8 @@ void ppIRType ( IRType ty )
    case Ity_I16:     vex_printf( "I16"); break;
    case Ity_I32:     vex_printf( "I32"); break;
    case Ity_I64:     vex_printf( "I64"); break;
+    case Ity_F32:     vex_printf( "F32"); break;
+    case Ity_F64:     vex_printf( "F64"); break;
    default: vex_printf("ty = 0x%x\n", (Int)ty);
             vpanic("ppIRType");
  }
@@ -196,6 +198,12 @@ void ppIRStmt ( IRStmt* s )
      vex_printf( "PUT(%d) = ", s->Ist.Put.offset);
      ppIRExpr(s->Ist.Put.expr);
      break;
+    case Ist_PutI:
+      vex_printf( "PUTI[%d,%d](", s->Ist.PutI.minoff, s->Ist.PutI.maxoff);
+      ppIRExpr(s->Ist.PutI.offset);
+      vex_printf( ") = " );
+      ppIRExpr(s->Ist.PutI.expr);
+      break;
    case Ist_Tmp:
      ppIRTemp(s->Ist.Tmp.tmp);
      vex_printf( " = " );
@@ -320,6 +328,16 @@ IRExpr* IRExpr_Get ( Int off, IRType ty ) {
   e->Iex.Get.ty     = ty;
   return e;
 }
+IRExpr* IRExpr_GetI ( IRExpr* off, IRType ty, 
+                      UShort minoff, UShort maxoff ) {
+   IRExpr* e          = LibVEX_Alloc(sizeof(IRExpr));
+   e->tag             = Iex_GetI;
+   e->Iex.GetI.offset = off;
+   e->Iex.GetI.ty     = ty;
+   e->Iex.GetI.minoff = minoff;
+   e->Iex.GetI.maxoff = maxoff;
+   return e;
+}
 IRExpr* IRExpr_Tmp ( IRTemp tmp ) {
   IRExpr* e      = LibVEX_Alloc(sizeof(IRExpr));
   e->tag         = Iex_Tmp;
@@ -382,6 +400,17 @@ IRStmt* IRStmt_Put ( Int off, IRExpr* value ) {
   s->Ist.Put.expr   = value;
   return s;
 }
+IRStmt* IRStmt_PutI ( IRExpr* off, IRExpr* value, 
+                      UShort minoff, UShort maxoff ) {
+   IRStmt* s          = LibVEX_Alloc(sizeof(IRStmt));
+   s->tag             = Ist_PutI;
+   s->link            = NULL;
+   s->Ist.PutI.offset = off;
+   s->Ist.PutI.expr   = value;
+   s->Ist.PutI.minoff = minoff;
+   s->Ist.PutI.maxoff = maxoff;
+   return s;
+}
 IRStmt* IRStmt_Tmp ( IRTemp tmp, IRExpr* expr ) {
   IRStmt* s       = LibVEX_Alloc(sizeof(IRStmt));
   s->tag          = Ist_Tmp;
@@ -702,6 +731,10 @@ void useBeforeDef_Stmt ( IRBB* bb, IRStmt* stmt, Int* def_counts )
      case Ist_Put:
         useBeforeDef_Expr(bb,stmt,stmt->Ist.Put.expr,def_counts);
         break;
+      case Ist_PutI:
+         useBeforeDef_Expr(bb,stmt,stmt->Ist.PutI.offset,def_counts);
+         useBeforeDef_Expr(bb,stmt,stmt->Ist.PutI.expr,def_counts);
+         break;
      case Ist_Tmp:
         useBeforeDef_Expr(bb,stmt,stmt->Ist.Tmp.expr,def_counts);
         break;
@@ -807,6 +840,14 @@ void tcStmt ( IRBB* bb, IRStmt* stmt, IRType gWordTy )
          if (typeOfIRExpr(tyenv,stmt->Ist.Put.expr) == Ity_Bit)
             sanityCheckFail(bb,stmt,"IRStmt.Put.expr: cannot Put :: Ity_Bit");
          break;
+      case Ist_PutI:
+          tcExpr( bb, stmt, stmt->Ist.PutI.expr, gWordTy );
+          tcExpr( bb, stmt, stmt->Ist.PutI.offset, gWordTy );
+          if (typeOfIRExpr(tyenv,stmt->Ist.PutI.expr) == Ity_Bit)
+             sanityCheckFail(bb,stmt,"IRStmt.PutI.expr: cannot PutI :: Ity_Bit");
+          if (typeOfIRExpr(tyenv,stmt->Ist.PutI.offset) != Ity_I32)
+             sanityCheckFail(bb,stmt,"IRStmt.PutI.offset: not :: Ity_I32");
+          break;
      case Ist_Tmp:
         tcExpr( bb, stmt, stmt->Ist.Tmp.expr, gWordTy );
         if (lookupIRTypeEnv(tyenv, stmt->Ist.Tmp.tmp)
--- a/VEX/pub/libvex_ir.h
+++ b/VEX/pub/libvex_ir.h
@@ -21,7 +21,9 @@
 typedef 
   enum { Ity_INVALID=0x10FFF,
          Ity_Bit=0x11000, 
-          Ity_I8, Ity_I16, Ity_I32, Ity_I64 }
+          Ity_I8, Ity_I16, Ity_I32, Ity_I64,
+          Ity_F32, Ity_F64
+   }
   IRType;

 extern void ppIRType ( IRType );
@@ -113,7 +115,11 @@ typedef
      Iop_32HLto64,   // :: (I32,I32) -> I64
      /* 1-bit stuff */
      Iop_32to1, /* :: Ity_I32 -> Ity_Bit, just select bit[0] */
-      Iop_1Uto8  /* :: Ity_Bit -> Ity_I8, unsigned widen */
+      Iop_1Uto8, /* :: Ity_Bit -> Ity_I8, unsigned widen */
+      /* FP stuff */
+      Iop_AddF64, Iop_SubF64, Iop_MulF64, Iop_DivF64,
+      Iop_SqrtF64,
+      Iop_I64toF64, Iop_F64toI64
   }
   IROp;

@@ -123,17 +129,35 @@ extern void ppIROp ( IROp );
 /* ------------------ Expressions ------------------ */
 /*
 data Expr
-   = GET   Int Int         -- offset, size
+   = GET   Int Type        -- offset, type
+   | GETI  Expr Type Int Int -- offset, type, minoff, maxoff
   | TMP   Temp            -- value of temporary
   | BINOP Op Expr Expr    -- binary op
   | UNOP  Op Expr         -- unary op
   | LDle  Type Expr       -- load of the given type, Expr:: 32 or 64
   | CONST Const           -- 8/16/32/64-bit int constant
+
+Re GETI.  It carries two ints, which give the lowest and highest
+possible byte offsets that the GetI can possibly reference.
+For example, if the type is Ity_I32, and the Expr may have
+a value of M, M+4 or M+8, where M is a translation-time known
+constant, then the low and high limits are M and M+11 respectively.
+
+PUTI carries similar limit values.
+
+These can be used by IR optimisers to establish aliasing/non-aliasing
+between separate GETI and PUTI terms, which could be used to do
+reordering of them, or suchlike things.  Clearly it's critical to give
+the correct limit values -- this is something that can't be
+automatically checked (in general), and so the front-end writers must
+be very careful to tell the truth, since not doing so could lead to
+obscure IR optimisation bugs.
 */
+
 typedef
     enum { Iex_Binder, /* Used only in pattern matching.  
                           Not an expression. */
-          Iex_Get, Iex_Tmp, Iex_Binop, Iex_Unop, Iex_LDle, 
+          Iex_Get, Iex_GetI, Iex_Tmp, Iex_Binop, Iex_Unop, Iex_LDle, 
          Iex_Const, Iex_CCall, Iex_Mux0X }
   IRExprTag;

@@ -148,6 +172,12 @@ typedef
            Int    offset;
            IRType ty;
         } Get;
+         struct {
+            struct _IRExpr* offset;
+            IRType  ty;
+            UShort  minoff;
+            UShort  maxoff;
+         } GetI;
         struct {
            IRTemp tmp;
         } Tmp;
@@ -183,6 +213,8 @@ typedef

 extern IRExpr* IRExpr_Binder ( Int binder );
 extern IRExpr* IRExpr_Get    ( Int off, IRType ty );
+extern IRExpr* IRExpr_GetI   ( IRExpr* off, IRType ty,  
+                               UShort minoff, UShort maxoff );
 extern IRExpr* IRExpr_Tmp    ( IRTemp tmp );
 extern IRExpr* IRExpr_Binop  ( IROp op, IRExpr* arg1, IRExpr* arg2 );
 extern IRExpr* IRExpr_Unop   ( IROp op, IRExpr* arg );
@@ -194,7 +226,8 @@ extern IRExpr* IRExpr_Mux0X  ( IRExpr* cond, IRExpr* expr0, IRExpr* exprX );
 extern void ppIRExpr ( IRExpr* );

 /* CCall info.  The name is the C helper function; the backends
-   will look it up in a table of known helpers, to get the address.
+   will hand the name to the front ends to get the address of a 
+   host-code helper function to be called.

   The args are a NULL-terminated array of arguments.  The stated
   return IRType, and the implied argument types, must match that
@@ -220,7 +253,7 @@ data Stmt
                              -- Const is destination guest addr
 */
 typedef 
-   enum { Ist_Put, Ist_Tmp, Ist_STle, Ist_Exit } 
+   enum { Ist_Put, Ist_PutI, Ist_Tmp, Ist_STle, Ist_Exit } 
   IRStmtTag;

 typedef
@@ -231,6 +264,12 @@ typedef
            Int     offset;
            IRExpr* expr;
         } Put;
+         struct {
+            IRExpr* offset;
+            IRExpr* expr;
+            UShort  minoff;
+            UShort  maxoff;
+         } PutI;
         struct {
            IRTemp  tmp;
            IRExpr* expr;
@@ -249,6 +288,8 @@ typedef
   IRStmt;

 extern IRStmt* IRStmt_Put  ( Int off, IRExpr* value );
+extern IRStmt* IRStmt_PutI ( IRExpr* off, IRExpr* value, 
+                             UShort minoff, UShort maxoff );
 extern IRStmt* IRStmt_Tmp  ( IRTemp tmp, IRExpr* expr );
 extern IRStmt* IRStmt_STle ( IRExpr* addr, IRExpr* value );
 extern IRStmt* IRStmt_Exit ( IRExpr* cond, IRConst* dst );