From 0facd982d21d5a8217f0835dddb3df21d6ec8403 Mon Sep 17 00:00:00 2001 From: Julian Seward Date: Mon, 26 May 2003 08:47:27 +0000 Subject: [PATCH] Implement enough SSE/SSE2 insns so that all the GL demos in qt-3.1.0 work, when running on a P4 with an NVidia Vanta card and using NVidia-supplied libGL.so.1.0.3123. Surprisingly this seems to require only a minimal set of instructions. So far this is only with --skin=none. git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1652 --- coregrind/vg_from_ucode.c | 32 +++++ coregrind/vg_to_ucode.c | 240 +++++++++++++++++++++++++++++--------- coregrind/vg_translate.c | 31 ++++- 3 files changed, 244 insertions(+), 59 deletions(-) diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c index 9b8fb84d0..613f663e6 100644 --- a/coregrind/vg_from_ucode.c +++ b/coregrind/vg_from_ucode.c @@ -1499,6 +1499,22 @@ static void emit_SSE4 ( FlagSet uses_sflags, (UInt)third_byte, (UInt)fourth_byte ); } +static void emit_SSE3 ( FlagSet uses_sflags, + FlagSet sets_sflags, + UChar first_byte, + UChar second_byte, + UChar third_byte ) +{ + VG_(new_emit)(True, uses_sflags, sets_sflags); + VG_(emitB) ( first_byte ); + VG_(emitB) ( second_byte ); + VG_(emitB) ( third_byte ); + if (dis) + VG_(printf)("\n\t\tsse-0x%x:0x%x:0x%x\n", + (UInt)first_byte, (UInt)second_byte, + (UInt)third_byte ); +} + static void emit_MMX2_reg_to_mmxreg ( FlagSet uses_sflags, FlagSet sets_sflags, UChar first_byte, @@ -3745,6 +3761,22 @@ static void emitUInstr ( UCodeBlock* cb, Int i, u->val2 & 0xFF ); break; + case SSE3: + vg_assert(u->size == 0); + vg_assert(u->tag1 == Lit16); + vg_assert(u->tag2 == Lit16); + vg_assert(u->tag3 == NoValue); + vg_assert(!anyFlagUse(u)); + if (!(*sselive)) { + emit_get_sse_state(); + *sselive = True; + } + emit_SSE3 ( u->flags_r, u->flags_w, + (u->val1 >> 8) & 0xFF, + u->val1 & 0xFF, + u->val2 & 0xFF ); + break; + default: if (VG_(needs).extended_UCode) { if (*sselive) { diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c 
index d4548a942..c612d2de8 100644 --- a/coregrind/vg_to_ucode.c +++ b/coregrind/vg_to_ucode.c @@ -3282,8 +3282,9 @@ Addr dis_SSE3_reg_or_mem ( UCodeBlock* cb, Lit16, (((UShort)opc3) << 8) | (UShort)modrm ); if (dis) VG_(printf)("%s %s, %s\n", name, - nameXMMReg(eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)) ); - eip++; + nameXMMReg(eregOfRM(modrm)), + nameXMMReg(gregOfRM(modrm)) ); + eip++; } else { VG_(core_panic)("dis_SSE3_reg_or_mem: mem"); } @@ -3291,6 +3292,79 @@ Addr dis_SSE3_reg_or_mem ( UCodeBlock* cb, } +/* Simple SSE operations, either + op (src)xmmreg, (dst)xmmreg + or + op (src)address, (dst)xmmreg + It is assumed that there are 2 opcode bytes preceding the start + of the address mode. eip points to the first opcode byte. +*/ +static +Addr dis_SSE2_reg_or_mem ( UCodeBlock* cb, + UChar sorb, + Char* name, + Int sz, + Addr eip ) +{ + UChar opc1 = getUChar(eip); + UChar opc2 = getUChar(eip+1); + UChar modrm = getUChar(eip+2); + eip += 2; + if (epartIsReg(modrm)) { + /* Completely internal SSE insn. */ + uInstr2(cb, SSE3, 0, /* ignore sz for internal ops */ + Lit16, (((UShort)opc1) << 8) | (UShort)opc2, + Lit16, (UShort)modrm ); + if (dis) + VG_(printf)("%s %s, %s\n", name, + nameXMMReg(eregOfRM(modrm)), + nameXMMReg(gregOfRM(modrm)) ); + eip++; + } else { + VG_(core_panic)("dis_SSE2_reg_or_mem: mem"); + } + return eip; +} + + +/* Simple SSE operations, either + op (src)xmmreg, (dst)xmmreg + or + op (src)address, (dst)xmmreg + It is assumed that there are 2 opcode bytes preceding the start of + the address mode. Also there is an 8-bit immediate following the + address mode. eip points to the first opcode byte. +*/ +static +Addr dis_SSE2_reg_or_mem_Imm8 ( UCodeBlock* cb, + UChar sorb, + Char* name, + Int sz, + Addr eip ) +{ + UChar opc1 = getUChar(eip); + UChar opc2 = getUChar(eip+1); + UChar modrm = getUChar(eip+2); + UChar imm8; + eip += 2; + if (epartIsReg(modrm)) { + /* Completely internal SSE insn. 
*/ + eip++; + imm8 = getUChar(eip); + uInstr2(cb, SSE4, 0, /* ignore sz for internal ops */ + Lit16, (((UShort)opc1) << 8) | (UShort)opc2, + Lit16, (((UShort)modrm) << 8) | (UShort)imm8 ); + if (dis) + VG_(printf)("%s %s, %s\n", name, + nameXMMReg(eregOfRM(modrm)), + nameXMMReg(gregOfRM(modrm)) ); + eip++; + } else { + VG_(core_panic)("dis_SSE2_reg_or_mem_Imm8: mem"); + } + return eip; +} + /*------------------------------------------------------------*/ /*--- Disassembling entire basic blocks ---*/ @@ -3308,6 +3382,10 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) UChar dis_buf[50]; Int am_sz, d_sz; + /* Holds eip at the start of the insn, so that we can print + consistent error messages for unimplemented insns. */ + UChar* eip_start = (UChar*)eip; + /* sz denotes the nominal data-op size of the insn; we change it to 2 if an 0x66 prefix is seen */ Int sz = 4; @@ -3393,10 +3471,11 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) if (VG_(have_ssestate)) { UChar* insn = (UChar*)eip; - /* STMXCSR/LDMXCSR m32 */ + /* STMXCSR/LDMXCSR m32 -- load/store the MXCSR register. */ if (insn[0] == 0x0F && insn[1] == 0xAE && (gregOfRM(insn[2]) == 3 || gregOfRM(insn[2]) == 2) ) { Bool store = gregOfRM(insn[2]) == 3; + vg_assert(sz == 4); pair = disAMode ( cb, sorb, eip+2, dis?dis_buf:NULL ); t1 = LOW24(pair); eip += 2+HI8(pair); @@ -3406,11 +3485,12 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) TempReg, t1 ); if (dis) VG_(printf)("%smxcsr %s\n", store ? "st" : "ld", dis_buf ); - goto sse_done; + goto decode_success; } - /* CVTSI2SS */ + /* CVTSI2SS -- convert int reg to low 4 bytes of XMM reg. 
*/ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) { + vg_assert(sz == 4); modrm = insn[3]; t1 = newTemp(cb); if (epartIsReg(modrm)) { @@ -3436,17 +3516,41 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) VG_(printf)("cvtsi2ss %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm))); } - goto sse_done; + goto decode_success; } - /* DIVSS */ + /* DIVSS -- divide low 4 bytes of XMM reg. */ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) { + vg_assert(sz == 4); eip = dis_SSE3_reg_or_mem ( cb, sorb, "divss", 4, eip ); - goto sse_done; + goto decode_success; } - /* MOVSS */ - if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) { + /* SHUFPS */ + if (insn[0] == 0x0F && insn[1] == 0xC6) { + vg_assert(sz == 4); + eip = dis_SSE2_reg_or_mem_Imm8 ( cb, sorb, "shufps", 16, eip ); + goto decode_success; + } + + /* MULPS */ + if (insn[0] == 0x0F && insn[1] == 0x59) { + vg_assert(sz == 4); + eip = dis_SSE2_reg_or_mem ( cb, sorb, "mulps", 16, eip ); + goto decode_success; + } + + /* ADDPS */ + if (insn[0] == 0x0F && insn[1] == 0x58) { + vg_assert(sz == 4); + eip = dis_SSE2_reg_or_mem ( cb, sorb, "addps", 16, eip ); + goto decode_success; + } + + /* MOVSS -- move 4 bytes of XMM reg to/from XMM reg or mem. 
*/ + if (insn[0] == 0xF3 && insn[1] == 0x0F && (insn[2] == 0x11 + || insn[2] == 0x10)) { + vg_assert(sz == 4); + if (epartIsReg(insn[3])) { + /* MOVSS xmm, xmm */ + VG_(core_panic)("MOVSS reg"); @@ -3467,7 +3571,40 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) VG_(printf)("movss %s, %s\n", dis_buf, nameXMMReg(gregOfRM(insn[3])) ); } - goto sse_done; + goto decode_success; + } + + /* MOVAPS (28,29) -- aligned load/store of XMM reg, or x-x reg move */ + /* MOVUPS (10,11) -- unaligned load/store of XMM reg, or x-x reg move */ + if (insn[0] == 0x0F && (insn[1] == 0x28 + || insn[1] == 0x29 + || insn[1] == 0x10 + || insn[1] == 0x11)) { + vg_assert(sz == 4); + modrm = insn[2]; + if (epartIsReg(modrm)) { + VG_(core_panic)("MOVAPS - reg"); + } else { + Bool store = insn[1] == 0x29 || insn[1] == 0x11; + pair = disAMode ( cb, sorb, eip+2, dis?dis_buf:NULL ); + t1 = LOW24(pair); + eip += 2+HI8(pair); + uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 16, + Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1], + Lit16, (UShort)insn[2], + TempReg, t1 ); + if (dis) { + UChar* name = (insn[1] == 0x10 || insn[1] == 0x11) + ? "movups" : "movaps"; + if (store) + VG_(printf)("%s %s, %s\n", + name, nameXMMReg(gregOfRM(modrm)), dis_buf ); + else + VG_(printf)("%s %s, %s\n", + name, dis_buf, nameXMMReg(gregOfRM(modrm)) ); + } + } + goto decode_success; + } /* Fall through into the non-SSE decoder. 
*/ @@ -4422,8 +4559,7 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) if (dis) VG_(printf)("repne scas%c\n", nameISize(sz)); } else { - VG_(printf)("REPNE then 0x%x\n", (UInt)abyte); - VG_(core_panic)("Unhandled REPNE case"); + goto decode_failure; } break; } @@ -4459,9 +4595,7 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) if (dis) VG_(printf)("repe nop (P4 pause)\n"); /* do nothing; apparently a hint to the P4 re spin-wait loop */ } else { - VG_(printf)("Insn bytes: 0xF3 0x%x 0x%x\n", - (UInt)abyte, (UInt)getUChar(eip)); - VG_(core_panic)("Unhandled REPE case"); + goto decode_failure; } break; } @@ -4914,10 +5048,10 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) vg_assert(sz == 4); modrm = getUChar(eip); if (epartIsReg(modrm)) { - goto unimp2; + goto decode_failure; } if (gregOfRM(modrm) > 3) { - goto unimp2; + goto decode_failure; } eip += lengthAMode(eip); if (dis) { @@ -4927,7 +5061,7 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) case 1: hintstr = "t0"; break; case 2: hintstr = "t1"; break; case 3: hintstr = "t2"; break; - default: goto unimp2; + default: goto decode_failure; } VG_(printf)("prefetch%s ...\n", hintstr); } @@ -4948,7 +5082,7 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) /* ok */ } else { eip -= 2; - goto unimp2; + goto decode_failure; } uInstr2(cb, MMX3, 0, Lit16, (((UShort)byte1) << 8) | ((UShort)byte2), @@ -5066,7 +5200,7 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) vg_assert(sz == 4); modrm = getUChar(eip); if (epartIsReg(modrm)) { - goto unimp2; + goto decode_failure; } else { Int tmpa; pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL ); @@ -5214,42 +5348,41 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */ default: - unimp2: - VG_(printf)("disInstr: unhandled 2-byte opcode: " - "0x%x 0x%x 0x%x\n", - (Int)getUChar(eip-1), - (Int)getUChar(eip+0), - 
(Int)getUChar(eip+1) ); - - VG_(printf)("This _might_ be the result of executing a " - "SSE, SSE2 or 3DNow!\n" ); - VG_(printf)("instruction. Valgrind does not currently " - "support such instructions. Sorry.\n" ); - uInstr0(cb, CALLM_S, 0); - uInstr1(cb, CALLM, 0, Lit16, - VGOFF_(helper_undefined_instruction)); - uInstr0(cb, CALLM_E, 0); - - /* just because everything else insists the last instruction - of a BB is a jmp */ - uInstr1(cb, JMP, 0, Literal, 0); - uCond(cb, CondAlways); - uLiteral(cb, eip); - *isEnd = True; - } - - break; - } + goto decode_failure; + } /* switch (opc) for the 2-byte opcodes */ + goto decode_success; + } /* case 0x0F: of primary opcode */ /* ------------------------ ??? ------------------------ */ + + default: + decode_failure: + /* All decode failures end up here. */ + VG_(printf)("disInstr: unhandled instruction bytes: " + "0x%x 0x%x 0x%x 0x%x\n", + (Int)eip_start[0], + (Int)eip_start[1], + (Int)eip_start[2], + (Int)eip_start[3] ); - default: - VG_(printf)("disInstr: unhandled opcode 0x%x then 0x%x\n", - (UInt)opc, (UInt)getUChar(eip)); - VG_(core_panic)("unhandled x86 opcode"); - } + uInstr0(cb, CALLM_S, 0); + uInstr1(cb, CALLM, 0, Lit16, + VGOFF_(helper_undefined_instruction)); + uInstr0(cb, CALLM_E, 0); - sse_done: + /* just because everything else insists the last instruction of + a BB is a jmp */ + uInstr1(cb, JMP, 0, Literal, 0); + uCond(cb, CondAlways); + uLiteral(cb, eip); + *isEnd = True; + break; + return eip; + + } /* switch (opc) for the main (primary) opcode switch. */ + + decode_success: + /* All decode successes end up here. 
*/ if (dis) VG_(printf)("\n"); for (; first_uinstr < cb->used; first_uinstr++) { @@ -5260,7 +5393,6 @@ static Addr disInstr ( UCodeBlock* cb, Addr eip, Bool* isEnd ) VG_(up_UInstr)(-1, &cb->instrs[first_uinstr]); vg_assert(sane); } - return eip; } diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c index 9c2445f1e..c16356989 100644 --- a/coregrind/vg_translate.c +++ b/coregrind/vg_translate.c @@ -562,6 +562,7 @@ Bool VG_(saneUInstr) ( Bool beforeRA, Bool beforeLiveness, UInstr* u ) case SSE3a_MemWr: return LIT0 && SZ416 && CC0 && Ls1 && Ls2 && TR3 && XOTHER; case SSE3a_MemRd: return LIT0 && SZ416 && CC0 && Ls1 && Ls2 && TR3 && XOTHER; case SSE3g_RegRd: return LIT0 && SZ4 && CC0 && Ls1 && Ls2 && TR3 && XOTHER; + case SSE3: return LIT0 && SZ0 && CC0 && Ls1 && Ls2 && N3 && XOTHER; case SSE4: return LIT0 && SZ0 && CC0 && Ls1 && Ls2 && N3 && XOTHER; default: if (VG_(needs).extended_UCode) @@ -876,6 +877,7 @@ Char* VG_(name_UOpcode) ( Bool upper, Opcode opc ) case SSE2a_MemWr: return "SSE2a_MWr"; case SSE2a_MemRd: return "SSE2a_MRd"; case SSE3g_RegRd: return "SSE3g_RRd"; + case SSE3: return "SSE3"; case SSE4: return "SSE4"; case SSE3a_MemWr: return "SSE3a_MWr"; case SSE3a_MemRd: return "SSE3a_MRd"; @@ -1051,6 +1053,12 @@ void pp_UInstrWorker ( Int instrNo, UInstr* u, Bool ppRegsLiveness ) VG_(pp_UOperand)(u, 3, 4, True); break; + case SSE3: + VG_(printf)("0x%x:0x%x:0x%x", + (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, + u->val2 & 0xFF ); + break; + case SSE4: VG_(printf)("0x%x:0x%x:0x%x:0x%x", (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, @@ -1211,7 +1219,7 @@ Int VG_(get_reg_usage) ( UInstr* u, Tag tag, Int* regs, Bool* isWrites ) case MMX2_RegRd: RD(2); break; case MMX2_RegWr: WR(2); break; - case SSE4: + case SSE4: case SSE3: case MMX1: case MMX2: case MMX3: case NOP: case FPU: case INCEIP: case CALLM_S: case CALLM_E: case CLEAR: case CALLM: case LOCK: break; @@ -1366,7 +1374,7 @@ Int maybe_uinstrReadsArchReg ( UInstr* u ) case SSE2a_MemWr: case SSE2a_MemRd: case 
SSE3a_MemWr: case SSE3a_MemRd: case SSE3g_RegRd: - case SSE4: + case SSE4: case SSE3: case WIDEN: /* GETSEG and USESEG are to do with ArchRegS, not ArchReg */ case GETSEG: case PUTSEG: @@ -2233,11 +2241,22 @@ void VG_(translate) ( /*IN*/ ThreadState* tst, Bool debugging_translation; UChar* final_code; UCodeBlock* cb; + Bool notrace_until_done; + Int notrace_until_limit = 0; VGP_PUSHCC(VgpTranslate); debugging_translation = orig_size == NULL || trans_addr == NULL || trans_size == NULL; + /* If codegen tracing, don't start tracing until + notrace_until_limit blocks have gone by. This avoids printing + huge amounts of useless junk when all we want to see is the last + few blocks translated prior to a failure. Set + notrace_until_limit to be the number of translations to be made + before --trace-codegen= style printing takes effect. */ + notrace_until_done + = VG_(overall_in_count) > notrace_until_limit; + if (!debugging_translation) VG_TRACK( pre_mem_read, Vg_CoreTranslate, tst, "", orig_addr, 1 ); @@ -2245,7 +2264,7 @@ void VG_(translate) ( /*IN*/ ThreadState* tst, cb->orig_eip = orig_addr; /* If doing any code printing, print a basic block start marker */ - if (VG_(clo_trace_codegen)) { + if (VG_(clo_trace_codegen) && notrace_until_done) { Char fnname[64] = ""; VG_(get_fnname_if_entry)(orig_addr, fnname, 64); VG_(printf)( @@ -2256,8 +2275,10 @@ void VG_(translate) ( /*IN*/ ThreadState* tst, } /* True if a debug trans., or if bit N set in VG_(clo_trace_codegen). */ -# define DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(n) \ - ( debugging_translation || (VG_(clo_trace_codegen) & (1 << (n-1))) ) +# define DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(n) \ + ( debugging_translation \ + || (notrace_until_done \ + && (VG_(clo_trace_codegen) & (1 << (n-1))) )) /* Disassemble this basic block into cb. */ VG_(print_codegen) = DECIDE_IF_PRINTING_CODEGEN_FOR_PHASE(1);