[PATCH v3] x86emul: further correct 64-bit mode zero count repeated string insn handling
In an entirely different context I came across Linux commit 428e3d08574b
("KVM: x86: Fix zero iterations REP-string"), which points out that we're
still doing things wrong: For one, there's no zero-extension at all on
AMD. And then, while RCX is zero-extended from 32 bits uniformly for all
string instructions on newer hardware, RSI/RDI are zero-extended only
for MOVS and STOS on the systems I have access to. (On an old family 0xf
system I've further found that for REP LODS even RCX is not
zero-extended.)

Fixes: 79e996a89f69 ("x86emul: correct 64-bit mode repeated string insn handling with zero count")
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
Partly RFC, since none of this is documented anywhere (and it is partly
model specific); inquiry pending.

If I had been asked, I would have claimed to have tested all string
insns and for both vendors back at the time. But pretty clearly I
didn't; instead I derived uniform behavior from just the MOVS and STOS
observations on just Intel hardware. I'm sorry for that.
---
v3: Re-base.
v2: Re-base over re-ordering against previously 2nd patch in a series.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -489,7 +489,7 @@ static inline void put_loop_count(
     regs->r(cx) = ad_bytes == 4 ? (uint32_t)count : count;
 }
 
-#define get_rep_prefix(using_si, using_di) ({                          \
+#define get_rep_prefix(extend_si, extend_di) ({                        \
     unsigned long max_reps = 1;                                        \
     if ( rep_prefix() )                                                \
         max_reps = get_loop_count(&_regs, ad_bytes);                   \
@@ -497,14 +497,14 @@ static inline void put_loop_count(
     {                                                                  \
         /*                                                             \
          * Skip the instruction if no repetitions are required, but    \
-         * zero extend involved registers first when using 32-bit      \
+         * zero extend relevant registers first when using 32-bit      \
          * addressing in 64-bit mode.                                  \
          */                                                            \
-        if ( mode_64bit() && ad_bytes == 4 )                           \
+        if ( !amd_like(ctxt) && mode_64bit() && ad_bytes == 4 )        \
         {                                                              \
             _regs.r(cx) = 0;                                           \
-            if ( using_si ) _regs.r(si) = (uint32_t)_regs.r(si);       \
-            if ( using_di ) _regs.r(di) = (uint32_t)_regs.r(di);       \
+            if ( extend_si ) _regs.r(si) = (uint32_t)_regs.r(si);      \
+            if ( extend_di ) _regs.r(di) = (uint32_t)_regs.r(di);      \
         }                                                              \
         goto complete_insn;                                            \
     }                                                                  \
@@ -1775,7 +1775,7 @@ x86_emulate(
         dst.bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes;
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        nr_reps = get_rep_prefix(false, true);
+        nr_reps = get_rep_prefix(false, false);
         dst.mem.off = truncate_ea_and_reps(_regs.r(di), nr_reps, dst.bytes);
         dst.mem.seg = x86_seg_es;
         /* Try the presumably most efficient approach first. */
@@ -1817,7 +1817,7 @@ x86_emulate(
         dst.bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes;
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        nr_reps = get_rep_prefix(true, false);
+        nr_reps = get_rep_prefix(false, false);
         ea.mem.off = truncate_ea_and_reps(_regs.r(si), nr_reps, dst.bytes);
         /* Try the presumably most efficient approach first. */
         if ( !ops->rep_outs )
@@ -2154,7 +2154,7 @@ x86_emulate(
     case 0xa6 ... 0xa7: /* cmps */ {
         unsigned long next_eip = _regs.r(ip);
 
-        get_rep_prefix(true, true);
+        get_rep_prefix(false, false);
         src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes;
         if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.r(si)),
                               &dst.val, dst.bytes, ctxt, ops)) ||
@@ -2196,7 +2196,7 @@ x86_emulate(
     }
 
     case 0xac ... 0xad: /* lods */
-        get_rep_prefix(true, false);
+        get_rep_prefix(false, false);
         if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.r(si)),
                               &dst.val, dst.bytes, ctxt, ops)) != 0 )
             goto done;
@@ -2207,7 +2207,7 @@ x86_emulate(
     case 0xae ... 0xaf: /* scas */ {
         unsigned long next_eip = _regs.r(ip);
 
-        get_rep_prefix(false, true);
+        get_rep_prefix(false, false);
        if ( (rc = read_ulong(x86_seg_es, truncate_ea(_regs.r(di)),
                              &dst.val, src.bytes, ctxt, ops)) != 0 )
            goto done;
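
For reference, the model-specific behavior described above can be probed
from user space with a small test along these lines (purely illustrative
and not part of the patch; the register values, the choice of REP LODSB,
and the reporting are my own, not anything the patch relies on):

/*
 * Illustrative probe: execute a zero-count "rep lodsb" with a 0x67
 * address size override in 64-bit mode, then report which of RCX/RSI
 * the CPU zero-extended from 32 bits.  With ECX zero the instruction
 * performs no iterations, so the bogus RSI is never dereferenced.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Garbage in the upper halves; the low half of RCX gives count 0. */
    uint64_t rcx = 0xdeadbeef00000000ULL;
    uint64_t rsi = 0xfeedface00000000ULL;
    uint64_t rax = 0;

    /* 0x67 0xf3 0xac == addr32 rep lodsb (encoded directly). */
    asm volatile ( ".byte 0x67, 0xf3, 0xac"
                   : "+c" (rcx), "+S" (rsi), "+a" (rax) );

    printf("RCX %szero-extended\n", (rcx >> 32) ? "not " : "");
    printf("RSI %szero-extended\n", (rsi >> 32) ? "not " : "");

    return 0;
}

Per the observations above one would expect, on newer Intel hardware,
RCX to come back zero-extended but RSI not (LODS being neither MOVS nor
STOS); on AMD, and for RCX also on at least one old family 0xf system,
no zero-extension at all. This is what get_rep_prefix() now emulates.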