[Xen-devel] [PATCH v2 15/16] x86/PV: use generic emulator for privileged instruction handling
There's a new emulator return code being added to allow bypassing certain
operations (see the code comment). Its handling in the epilogue code involves
moving the raising of the single step trap until after registers have been
updated. This should probably have been that way from the beginning, to allow
the inject_hw_exception() hook to see updated register state (in case it
cares) - it's a trap, after all.

The other small tweak to the emulator is to the single-iteration handling of
INS and OUTS: Since we don't want to handle any other memory access
instructions, we want these to be handled by the rep_ins() / rep_outs() hooks
here too. The read() / write() hook pointers get checked for that purpose.

And finally handling of exceptions gets changed for REP INS / REP OUTS: If
the hook returns X86EMUL_EXCEPTION, register state will still get updated if
some iterations have been performed (but the rIP update will get suppressed
if not all of them were handled). While on the HVM side the VA -> LA -> PA
translation process clips the number of repetitions, doing so would unduly
complicate the PV side code being added here.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
---
One thing to be considered is that despite avoiding the handling of memory
reads and writes (other than for INS and OUTS) the set of insns now getting
potentially handled by the emulator is much larger than before. A possible
solution to this would be a new hook to be called between the decode and
execution stages, allowing further restrictions to be enforced. Of course
this could easily be a follow-up patch, as the one here is quite big already.

Another thing to consider is extending the X86EMUL_EXCEPTION handling change
mentioned above to other string instructions. In that case this should
probably be broken out into a prereq patch.

Two standalone sketches illustrating the new X86EMUL_DONE and rep-hook
contracts follow after the patch.

--- a/tools/tests/x86_emulator/x86_emulate.c
+++ b/tools/tests/x86_emulator/x86_emulate.c
@@ -20,6 +20,9 @@ typedef bool bool_t;
 #define cpu_has_amd_erratum(nr) 0
 #define mark_regs_dirty(r) ((void)(r))
 
+#define likely(x)   __builtin_expect(!!(x), true)
+#define unlikely(x) __builtin_expect(!!(x), false)
+
 #define __packed __attribute__((packed))
 
 /* For generic assembly code: use macros to define operation/operand sizes. */
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -459,6 +459,7 @@ static int hvmemul_linear_to_phys(
     {
         if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
             return X86EMUL_RETRY;
+        *reps = 0;
         hvm_inject_page_fault(pfec, addr);
         return X86EMUL_EXCEPTION;
     }
@@ -478,6 +479,7 @@ static int hvmemul_linear_to_phys(
             if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
                 return X86EMUL_RETRY;
             done /= bytes_per_rep;
+            *reps = done;
             if ( done == 0 )
             {
                 ASSERT(!reverse);
@@ -486,7 +488,6 @@ static int hvmemul_linear_to_phys(
                 hvm_inject_page_fault(pfec, addr & PAGE_MASK);
                 return X86EMUL_EXCEPTION;
             }
-            *reps = done;
             break;
         }
 
@@ -568,6 +569,7 @@ static int hvmemul_virtual_to_linear(
         return X86EMUL_UNHANDLEABLE;
 
     /* This is a singleton operation: fail it with an exception. */
+    *reps = 0;
     hvmemul_ctxt->exn_pending = 1;
     hvmemul_ctxt->trap.vector = (seg == x86_seg_ss) ?
                                 TRAP_stack_error : TRAP_gp_fault;
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -659,16 +659,13 @@ static void do_guest_trap(unsigned int t
               trapstr(trapnr), trapnr, regs->error_code);
 }
 
-static void instruction_done(
-    struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
+static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
 {
     regs->eip = eip;
     regs->eflags &= ~X86_EFLAGS_RF;
-    if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
+    if ( regs->eflags & X86_EFLAGS_TF )
     {
-        current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
-        if ( regs->eflags & X86_EFLAGS_TF )
-            current->arch.debugreg[6] |= DR_STEP;
+        current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
         do_guest_trap(TRAP_debug, regs);
     }
 }
@@ -1292,7 +1289,7 @@ static int emulate_invalid_rdtscp(struct
         return 0;
     eip += sizeof(opcode);
     pv_soft_rdtsc(v, regs, 1);
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
     return EXCRET_fault_fixed;
 }
 
@@ -1325,7 +1322,7 @@ static int emulate_forced_invalid_op(str
 
     pv_cpuid(regs);
 
-    instruction_done(regs, eip, 0);
+    instruction_done(regs, eip);
 
     trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
 
@@ -2009,6 +2006,154 @@ static int read_gate_descriptor(unsigned
     return 1;
 }
 
+struct priv_op_ctxt {
+    struct x86_emulate_ctxt ctxt;
+    struct {
+        unsigned long base, limit;
+    } cs;
+    char *io_emul_stub;
+    unsigned int bpmatch;
+    unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static bool priv_op_to_linear(unsigned long base, unsigned long offset,
+                              unsigned int bytes, unsigned long limit,
+                              enum x86_segment seg,
+                              const struct x86_emulate_ctxt *ctxt,
+                              unsigned long *addr)
+{
+    *addr = base + offset;
+
+    if ( ctxt->addr_size < 8 )
+    {
+        if ( unlikely(limit < bytes - 1) ||
+             unlikely(offset > limit - bytes + 1) )
+        {
+            do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                          ctxt->regs);
+            return false;
+        }
+
+        *addr = (uint32_t)*addr;
+    }
+    else if ( unlikely(!__addr_ok(*addr)) )
+    {
+        do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                      ctxt->regs);
+        return false;
+    }
+
+    return true;
+}
+
+static int priv_op_insn_fetch(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    const struct priv_op_ctxt *poc =
+        container_of(ctxt, struct priv_op_ctxt, ctxt);
+    unsigned int rc;
+    unsigned long addr = poc->cs.base + offset;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* We don't mean to emulate any branches. */
+    if ( !bytes )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( !priv_op_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+                            x86_seg_cs, ctxt, &addr) )
+        return X86EMUL_EXCEPTION;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    {
+        propagate_page_fault(addr + bytes - rc,
+                             cpu_has_nx ? PFEC_insn_fetch : 0);
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+                                struct segment_register *reg,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    if ( ctxt->addr_size < 8 )
+    {
+        unsigned long limit;
+        unsigned int sel, ar;
+
+        switch ( seg )
+        {
+        case x86_seg_cs: sel = ctxt->regs->cs; break;
+        case x86_seg_ds: sel = read_sreg(ds);  break;
+        case x86_seg_es: sel = read_sreg(es);  break;
+        case x86_seg_fs: sel = read_sreg(fs);  break;
+        case x86_seg_gs: sel = read_sreg(gs);  break;
+        case x86_seg_ss: sel = ctxt->regs->ss; break;
+        case x86_seg_tr:
+            /* Check if this is an attempt to access the I/O bitmap. */
+            if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
+                return X86EMUL_DONE;
+            /* fall through */
+        default:
+            return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+            return X86EMUL_UNHANDLEABLE;
+
+        reg->limit = limit;
+        reg->attr.bytes = ar >> 8;
+    }
+    else
+    {
+        switch ( seg )
+        {
+        default:
+            reg->base = 0;
+            break;
+        case x86_seg_fs:
+            reg->base = rdfsbase();
+            break;
+        case x86_seg_gs:
+            reg->base = rdgsbase();
+            break;
+        }
+
+        reg->limit = ~0U;
+
+        reg->attr.bytes = 0;
+        reg->attr.fields.type = _SEGMENT_WR >> 8;
+        if ( seg == x86_seg_cs )
+            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+        reg->attr.fields.s = 1;
+        reg->attr.fields.dpl = 3;
+        reg->attr.fields.p = 1;
+        reg->attr.fields.l = 1;
+        reg->attr.fields.db = 1;
+        reg->attr.fields.g = 1;
+    }
+
+    /*
+     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+     * Also do this for consistency for non-conforming code segments.
+     */
+    if ( (seg == x86_seg_ss ||
+          (seg == x86_seg_cs &&
+           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+         guest_kernel_mode(current, ctxt->regs) )
+        reg->attr.fields.dpl = 0;
+
+    return X86EMUL_OKAY;
+}
+
 /* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
 static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
 {
@@ -2255,6 +2400,234 @@ unsigned long guest_to_host_gpr_switch(u
 
 void (*pv_post_outb_hook)(unsigned int port, u8 value);
 
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+                                          unsigned int port, unsigned int bytes)
+{
+    if ( !ctxt->io_emul_stub )
+        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+                             (this_cpu(stubs.addr) & ~PAGE_MASK) +
+                             STUB_BUF_SIZE / 2;
+
+    /* movq $host_to_guest_gpr_switch,%rcx */
+    ctxt->io_emul_stub[0] = 0x48;
+    ctxt->io_emul_stub[1] = 0xb9;
+    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+    /* callq *%rcx */
+    ctxt->io_emul_stub[10] = 0xff;
+    ctxt->io_emul_stub[11] = 0xd1;
+    /* data16 or nop */
+    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+    /* <io-access opcode> */
+    ctxt->io_emul_stub[13] = opcode;
+    /* imm8 or nop */
+    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    ctxt->io_emul_stub[15] = 0xc3;
+    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+    if ( ioemul_handle_quirk )
+        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+    /* Handy function-typed pointer to the stub. */
+    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* INS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        return X86EMUL_DONE;
+    }
+
+    *val = guest_io_read(port, bytes, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+                            unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* OUTS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        if ( (bytes == 1) && pv_post_outb_hook )
+            pv_post_outb_hook(port, val);
+        return X86EMUL_DONE;
+    }
+
+    guest_io_write(port, bytes, val, currd);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+                           enum x86_segment seg, unsigned long offset,
+                           unsigned int bytes_per_rep, unsigned long *reps,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    ASSERT(seg == x86_seg_es);
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+    {
+        do_guest_trap(TRAP_gp_fault, ctxt->regs);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                x86_seg_es, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+        {
+            propagate_page_fault(addr + bytes_per_rep - rc, PFEC_write_access);
+            return X86EMUL_EXCEPTION;
+        }
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+                            uint16_t port,
+                            unsigned int bytes_per_rep, unsigned long *reps,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;
+    struct segment_register sreg;
+    int rc;
+
+    *reps = 0;
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(seg, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+    {
+        do_guest_trap(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error,
+                      ctxt->regs);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = 0;
+        unsigned long addr;
+
+        if ( !priv_op_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit,
+                                seg, ctxt, &addr) )
+            return X86EMUL_EXCEPTION;
+
+        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+        {
+            propagate_page_fault(addr + bytes_per_rep - rc, 0);
+            return X86EMUL_EXCEPTION;
+        }
+
+        guest_io_write(port, bytes_per_rep, data, currd);
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+
 static int priv_op_read_cr(unsigned int reg, unsigned long *val,
                            struct x86_emulate_ctxt *ctxt)
 {
@@ -2395,6 +2768,7 @@ static inline bool is_cpufreq_controller
 static int priv_op_read_msr(unsigned int reg, uint64_t *val,
                             struct x86_emulate_ctxt *ctxt)
 {
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
     const struct vcpu *curr = current;
     const struct domain *currd = curr->domain;
     bool vpmu_msr = false;
@@ -2422,6 +2796,22 @@ static int priv_op_read_msr(unsigned int
         *val = curr->arch.pv_vcpu.gs_base_user;
         return X86EMUL_OKAY;
 
+    /*
+     * In order to fully retain original behavior we defer calling
+     * pv_soft_rdtsc() until after emulation. This may want/need to be
+     * reconsidered.
+     */
+    case MSR_IA32_TSC:
+        poc->tsc |= TSC_BASE;
+        goto normal;
+
+    case MSR_TSC_AUX:
+        poc->tsc |= TSC_AUX;
+        if ( cpu_has_rdtscp )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
     case MSR_K7_FID_VID_CTL:
     case MSR_K7_FID_VID_STATUS:
     case MSR_K8_PSTATE_LIMIT:
@@ -2725,493 +3115,170 @@ static int priv_op_write_msr(unsigned in
     return X86EMUL_UNHANDLEABLE;
 }
 
-/* Instruction fetch with error handling. */
-#define insn_fetch(type, base, eip, limit)                                  \
-({  unsigned long _rc, _ptr = (base) + (eip);                               \
-    type _x;                                                                \
-    if ( ad_default < 8 )                                                   \
-        _ptr = (unsigned int)_ptr;                                          \
-    if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) )   \
-        goto fail;                                                          \
-    if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 )       \
-    {                                                                       \
-        propagate_page_fault(_ptr + sizeof(_x) - _rc, 0);                   \
-        goto skip;                                                          \
-    }                                                                       \
-    (eip) += sizeof(_x); _x; })
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
 {
-    struct vcpu *v = current;
-    struct domain *currd = v->domain;
-    unsigned long *reg, eip = regs->eip;
-    u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
-    enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
-    int rc;
-    unsigned int port, i, data_sel, ar, data, bpmatch = 0;
-    unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix = 0;
-#define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
-                    ? regs->reg \
-                    : ad_bytes == 4 \
-                      ? (u32)regs->reg \
-                      : (u16)regs->reg)
-#define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
-                         ? regs->reg = (val) \
-                         : ad_bytes == 4 \
-                           ? (*(u32 *)&regs->reg = (val)) \
-                           : (*(u16 *)&regs->reg = (val)))
-    unsigned long code_base, code_limit;
-    char *io_emul_stub = NULL;
-    void (*io_emul)(struct cpu_user_regs *);
-    uint64_t val;
-
-    if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
-        goto fail;
-    op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
-    ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
-    if ( !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-        goto fail;
-
-    /* emulating only opcodes not allowing SS to be default */
-    data_sel = read_sreg(ds);
+    /* Ignore the instruction if unprivileged. */
+    if ( !cache_flush_permitted(current->domain) )
+        /*
+         * Non-physdev domain attempted WBINVD; ignore for now since
+         * newer linux uses this in some start-of-day timing loops.
+         */
+        ;
+    else
+        wbinvd();
 
-    /* Legacy prefixes. */
-    for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
-    {
-        switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0x66: /* operand-size override */
-            opsize_prefix = 1;
-            op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
-            continue;
-        case 0x67: /* address-size override */
-            ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
-            continue;
-        case 0x2e: /* CS override */
-            data_sel = regs->cs;
-            continue;
-        case 0x3e: /* DS override */
-            data_sel = read_sreg(ds);
-            continue;
-        case 0x26: /* ES override */
-            data_sel = read_sreg(es);
-            continue;
-        case 0x64: /* FS override */
-            data_sel = read_sreg(fs);
-            lm_ovr = lm_seg_fs;
-            continue;
-        case 0x65: /* GS override */
-            data_sel = read_sreg(gs);
-            lm_ovr = lm_seg_gs;
-            continue;
-        case 0x36: /* SS override */
-            data_sel = regs->ss;
-            continue;
-        case 0xf0: /* LOCK */
-            lock = 1;
-            continue;
-        case 0xf2: /* REPNE/REPNZ */
-        case 0xf3: /* REP/REPE/REPZ */
-            rep_prefix = 1;
-            continue;
-        default:
-            if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
-            {
-                rex = opcode;
-                continue;
-            }
-            break;
-        }
-        break;
-    }
+    return X86EMUL_OKAY;
+}
 
-    /* REX prefix. */
-    if ( rex & 8 ) /* REX.W */
-        op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
-    modrm_reg = (rex & 4) << 1; /* REX.R */
-    /* REX.X does not need to be decoded. */
-    modrm_rm = (rex & 1) << 3; /* REX.B */
-
-    if ( opcode == 0x0f )
-        goto twobyte_opcode;
-
-    if ( lock )
-        goto fail;
-
-    /* Input/Output String instructions. */
-    if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
-    {
-        unsigned long data_base, data_limit;
-
-        if ( rep_prefix && (rd_ad(ecx) == 0) )
-            goto done;
-
-        if ( !(opcode & 2) )
-        {
-            data_sel = read_sreg(es);
-            lm_ovr = lm_seg_none;
-        }
-
-        if ( !(ar & _SEGMENT_L) )
-        {
-            if ( !read_descriptor(data_sel, v, &data_base, &data_limit,
-                                  &ar, 0) )
-                goto fail;
-            if ( !(ar & _SEGMENT_S) ||
-                 !(ar & _SEGMENT_P) ||
-                 (opcode & 2 ?
-                  (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
-                  (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
-                goto fail;
-        }
-        else
-        {
-            switch ( lm_ovr )
-            {
-            default:
-                data_base = 0UL;
-                break;
-            case lm_seg_fs:
-                data_base = rdfsbase();
-                break;
-            case lm_seg_gs:
-                data_base = rdgsbase();
-                break;
-            }
-            data_limit = ~0UL;
-            ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
-        }
+static int priv_op_cpuid(unsigned int *eax, unsigned int *ebx,
+                         unsigned int *ecx, unsigned int *edx,
+                         struct x86_emulate_ctxt *ctxt)
+{
+    struct cpu_user_regs regs = *ctxt->regs;
+
+    regs._eax = *eax;
+    regs._ebx = *ebx;
+    regs._ecx = *ecx;
+    regs._edx = *edx;
+
+    pv_cpuid(&regs);
+
+    *eax = regs._eax;
+    *ebx = regs._ebx;
+    *ecx = regs._ecx;
+    *edx = regs._edx;
 
-        port = (u16)regs->edx;
+    return X86EMUL_OKAY;
+}
 
-    continue_io_string:
-        switch ( opcode )
-        {
-        case 0x6c: /* INSB */
-            op_bytes = 1;
-        case 0x6d: /* INSW/INSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
-                 !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            data = guest_io_read(port, op_bytes, currd);
-            if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
-                                    &data, op_bytes)) != 0 )
-            {
-                propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc,
-                                     PFEC_write_access);
-                return EXCRET_fault_fixed;
-            }
-            wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
-                                         ? -op_bytes : op_bytes));
-            break;
+static int priv_op_hw_exception(uint8_t vector, int32_t error_code,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    do_guest_trap(vector, ctxt->regs);
 
-        case 0x6e: /* OUTSB */
-            op_bytes = 1;
-        case 0x6f: /* OUTSW/OUTSL */
-            if ( (data_limit < (op_bytes - 1)) ||
-                 (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
-                 !guest_io_okay(port, op_bytes, v, regs) )
-                goto fail;
-            if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
-                                      op_bytes)) != 0 )
-            {
-                propagate_page_fault(data_base + rd_ad(esi)
-                                     + op_bytes - rc, 0);
-                return EXCRET_fault_fixed;
-            }
-            guest_io_write(port, op_bytes, data, currd);
-            wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF) ?
-                                         -op_bytes : op_bytes));
-            break;
-        }
+    if ( error_code >= 0 )
+    {
+        struct trap_bounce *tb = &current->arch.pv_vcpu.trap_bounce;
 
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
+        tb->flags |= TBF_EXCEPTION_ERRCODE;
+        tb->error_code = error_code;
+    }
+
+    return X86EMUL_EXCEPTION;
+}
+
+static const struct x86_emulate_ops priv_op_ops = {
+    .insn_fetch          = priv_op_insn_fetch,
+    .read                = x86emul_unhandleable_rw,
+    .write               = x86emul_unhandleable_rw,
+    .cmpxchg             = x86emul_unhandleable_cx,
+    .read_io             = priv_op_read_io,
+    .write_io            = priv_op_write_io,
+    .rep_ins             = priv_op_rep_ins,
+    .rep_outs            = priv_op_rep_outs,
+    .read_segment        = priv_op_read_segment,
+    .read_cr             = priv_op_read_cr,
+    .write_cr            = priv_op_write_cr,
+    .read_dr             = priv_op_read_dr,
+    .write_dr            = priv_op_write_dr,
+    .read_msr            = priv_op_read_msr,
+    .write_msr           = priv_op_write_msr,
+    .cpuid               = priv_op_cpuid,
+    .wbinvd              = priv_op_wbinvd,
+    .inject_hw_exception = priv_op_hw_exception,
+};
 
-        if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
-        {
-            if ( !bpmatch && !hypercall_preempt_check() )
-                goto continue_io_string;
-            eip = regs->eip;
-        }
+static int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct priv_op_ctxt ctxt = { .ctxt.regs = regs };
+    int rc;
+    unsigned int eflags, ar;
 
-        goto done;
-    }
+    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 1) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return 0;
+
+    /* Mirror virtualized state into EFLAGS. */
+    ASSERT(regs->_eflags & X86_EFLAGS_IF);
+    if ( vcpu_info(curr, evtchn_upcall_mask) )
+        regs->_eflags &= ~X86_EFLAGS_IF;
+    else
+        regs->_eflags |= X86_EFLAGS_IF;
+    ASSERT(!(regs->_eflags & X86_EFLAGS_IOPL));
+    regs->_eflags |= curr->arch.pv_vcpu.iopl;
 
     /*
-     * Very likely to be an I/O instruction (IN/OUT).
-     * Build an stub to execute the instruction with full guest GPR
-     * context. This is needed for some systems which (ab)use IN/OUT
-     * to communicate with BIOS code in system-management mode.
+     * Don't have x86_emulate() inject single step traps, as we want #DB
+     * also delivered for I/O break points (see below).
      */
-    io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
-                   (this_cpu(stubs.addr) & ~PAGE_MASK) +
-                   STUB_BUF_SIZE / 2;
-    /* movq $host_to_guest_gpr_switch,%rcx */
-    io_emul_stub[0] = 0x48;
-    io_emul_stub[1] = 0xb9;
-    *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
-    /* callq *%rcx */
-    io_emul_stub[10] = 0xff;
-    io_emul_stub[11] = 0xd1;
-    /* data16 or nop */
-    io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
-    /* <io-access opcode> */
-    io_emul_stub[13] = opcode;
-    /* imm8 or nop */
-    io_emul_stub[14] = 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    io_emul_stub[15] = 0xc3;
-    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+    if ( regs->_eflags & X86_EFLAGS_TF )
+    {
+        ctxt.bpmatch = DR_STEP;
+        regs->_eflags &= ~X86_EFLAGS_TF;
+    }
+    eflags = regs->_eflags;
 
-    /* Handy function-typed pointer to the stub. */
-    io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
 
-    if ( ioemul_handle_quirk )
-        ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
+    if ( ctxt.io_emul_stub )
+        unmap_domain_page(ctxt.io_emul_stub);
 
-    /* I/O Port and Interrupt Flag instructions. */
-    switch ( opcode )
+    /* Un-mirror virtualized state from EFLAGS. */
+    if ( (regs->_eflags ^ eflags) & X86_EFLAGS_IF )
     {
-    case 0xe4: /* IN imm8,%al */
-        op_bytes = 1;
-    case 0xe5: /* IN imm8,%eax */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_in:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);
-        }
-        else
-        {
-            if ( op_bytes == 4 )
-                regs->eax = 0;
-            else
-                regs->eax &= ~((1 << (op_bytes * 8)) - 1);
-            regs->eax |= guest_io_read(port, op_bytes, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xec: /* IN %dx,%al */
-        op_bytes = 1;
-    case 0xed: /* IN %dx,%eax */
-        port = (u16)regs->edx;
-        goto exec_in;
-
-    case 0xe6: /* OUT %al,imm8 */
-        op_bytes = 1;
-    case 0xe7: /* OUT %eax,imm8 */
-        port = insn_fetch(u8, code_base, eip, code_limit);
-        io_emul_stub[14] = port; /* imm8 */
-    exec_out:
-        if ( !guest_io_okay(port, op_bytes, v, regs) )
-            goto fail;
-        if ( admin_io_okay(port, op_bytes, currd) )
-        {
-            mark_regs_dirty(regs);
-            io_emul(regs);
-            if ( (op_bytes == 1) && pv_post_outb_hook )
-                pv_post_outb_hook(port, regs->eax);
-        }
-        else
-        {
-            guest_io_write(port, op_bytes, regs->eax, currd);
-        }
-        bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-        goto done;
-
-    case 0xee: /* OUT %al,%dx */
-        op_bytes = 1;
-    case 0xef: /* OUT %eax,%dx */
-        port = (u16)regs->edx;
-        goto exec_out;
-
-    case 0xfa: /* CLI */
-    case 0xfb: /* STI */
-        if ( !iopl_ok(v, regs) )
-            goto fail;
+        /* The only allowed insns altering EFLAGS.IF are CLI/STI. */
+        ASSERT((ctxt.ctxt.opcode & ~1) == 0xfa);
         /*
          * This is just too dangerous to allow, in my opinion. Consider if the
          * caller then tries to reenable interrupts using POPF: we can't trap
          * that and we'll end up with hard-to-debug lockups. Fast & loose will
          * do for us. :-)
+         vcpu_info(curr, evtchn_upcall_mask) = (opcode == 0xfa);
          */
-        /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
-        goto done;
     }
-
-    /* No decode of this single-byte opcode. */
-    goto fail;
-
- twobyte_opcode:
-    /*
-     * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9),
-     * and CPUID (0xa2), are executable only from guest kernel mode
-     * (virtual ring 0).
-     */
-    opcode = insn_fetch(u8, code_base, eip, code_limit);
-    if ( !guest_kernel_mode(v, regs) &&
-         (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) )
-        goto fail;
-
-    if ( lock && (opcode & ~3) != 0x20 )
-        goto fail;
-
-    switch ( opcode )
-    {
-    case 0x1: /* RDTSCP and XSETBV */
-        switch ( insn_fetch(u8, code_base, eip, code_limit) )
-        {
-        case 0xf9: /* RDTSCP */
-            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-                 !guest_kernel_mode(v, regs) )
-                goto fail;
-            pv_soft_rdtsc(v, regs, 1);
-            break;
-        case 0xd1: /* XSETBV */
-        {
-            u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
-
-            if ( lock || rep_prefix || opsize_prefix
-                 || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
+    regs->_eflags |= X86_EFLAGS_IF;
+    /* Nothing we allow to be emulated can change IOPL or TF. */
+    ASSERT(!((regs->_eflags ^ eflags) & (X86_EFLAGS_IOPL | X86_EFLAGS_TF)));
+    regs->_eflags &= ~X86_EFLAGS_IOPL;
+    if ( ctxt.bpmatch & DR_STEP )
+        regs->_eflags |= X86_EFLAGS_TF;
+
+    switch ( rc )
+    {
+    case X86EMUL_OKAY:
+        if ( ctxt.tsc & TSC_BASE )
+        {
+            if ( ctxt.tsc & TSC_AUX )
+                pv_soft_rdtsc(curr, regs, 1);
+            else if ( currd->arch.vtsc )
+                pv_soft_rdtsc(curr, regs, 0);
+            else
             {
-                do_guest_trap(TRAP_invalid_op, regs);
-                goto skip;
-            }
-
-            if ( !guest_kernel_mode(v, regs) )
-                goto fail;
-
-            if ( handle_xsetbv(regs->ecx, new_xfeature) )
-                goto fail;
-
-            break;
-        }
-        default:
-            goto fail;
-        }
-        break;
+                uint64_t val = rdtsc();
 
-    case 0x06: /* CLTS */
-        (void)do_fpu_taskswitch(0);
-        break;
-
-    case 0x09: /* WBINVD */
-        /* Ignore the instruction if unprivileged. */
-        if ( !cache_flush_permitted(currd) )
-            /* Non-physdev domain attempted WBINVD; ignore for now since
-               newer linux uses this in some start-of-day timing loops */
-            ;
-        else
-            wbinvd();
-        break;
-
-    case 0x20: /* MOV CR?,<reg> */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x21: /* MOV DR?,<reg> */ {
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
-                             NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-    }
-
-    case 0x22: /* MOV <reg>,CR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
-        {
-        case X86EMUL_OKAY:
-            break;
-        case X86EMUL_RETRY: /* retry after preemption */
-            goto skip;
-        default:
-            goto fail;
             {
+                regs->eax = (uint32_t)val;
+                regs->edx = (uint32_t)(val >> 32);
+            }
         }
-        break;
-
-    case 0x23: /* MOV <reg>,DR? */
-        opcode = insn_fetch(u8, code_base, eip, code_limit);
-        if ( opcode < 0xc0 )
-            goto fail;
-        modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
-        modrm_rm  |= (opcode >> 0) & 7;
-        reg = decode_register(modrm_rm, regs, 0);
-        if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
 
-    case 0x30: /* WRMSR */
-        if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
-                               NULL) != X86EMUL_OKAY )
-            goto fail;
-        break;
-
-    case 0x31: /* RDTSC */
-        if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
-             !guest_kernel_mode(v, regs) )
-            goto fail;
-        if ( currd->arch.vtsc )
-            pv_soft_rdtsc(v, regs, 0);
-        else
+        if ( ctxt.bpmatch )
         {
-            val = rdtsc();
-            goto rdmsr_writeback;
+            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+                do_guest_trap(TRAP_debug, regs);
        }
-        break;
-
-    case 0x32: /* RDMSR */
-        if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
-            goto fail;
- rdmsr_writeback:
-        regs->eax = (uint32_t)val;
-        regs->edx = (uint32_t)(val >> 32);
-        break;
-
-    case 0xa2: /* CPUID */
-        pv_cpuid(regs);
-        break;
-
-    default:
-        goto fail;
+        /* fall through */
+    case X86EMUL_RETRY:
+    case X86EMUL_EXCEPTION:
+        return EXCRET_fault_fixed;
     }
-#undef wr_ad
-#undef rd_ad
-
- done:
-    instruction_done(regs, eip, bpmatch);
- skip:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
-    return EXCRET_fault_fixed;
-
- fail:
-    if ( io_emul_stub )
-        unmap_domain_page(io_emul_stub);
     return 0;
 }
 
@@ -3541,7 +3609,7 @@ static void emulate_gate_op(struct cpu_u
     sel |= (regs->cs & 3);
 
     regs->cs = sel;
-    instruction_done(regs, off, 0);
+    instruction_done(regs, off);
 }
 
 void do_general_protection(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -857,7 +857,11 @@ static void __put_rep_prefix(
 
 #define put_rep_prefix(reps_completed) ({                                 \
     if ( rep_prefix() )                                                   \
+    {                                                                     \
         __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed);  \
+        if ( unlikely(rc == X86EMUL_EXCEPTION) )                          \
+            goto no_writeback;                                            \
+    }                                                                     \
 })
 
 /* Clip maximum repetitions so that the index register at most just wraps. */
@@ -1075,7 +1079,7 @@ static int ioport_access_check(
     fail_if(ops->read_segment == NULL);
     if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
-        return rc;
+        return rc != X86EMUL_DONE ? rc : X86EMUL_OKAY;
 
     /* Ensure that the TSS is valid and has an io-bitmap-offset field. */
     if ( !tr.attr.fields.p ||
@@ -1610,6 +1614,17 @@ int x86emul_unhandleable_rw(
     return X86EMUL_UNHANDLEABLE;
 }
 
+int x86emul_unhandleable_cx(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return X86EMUL_UNHANDLEABLE;
+}
+
 struct x86_emulate_state {
     unsigned int op_bytes, ad_bytes;
 
@@ -2280,6 +2295,7 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d;
+    bool tf = ctxt->regs->eflags & EFLG_TF;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
     enum x86_swint_type swint_type;
@@ -2731,14 +2747,10 @@ x86_emulate(
         dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_ins != NULL) &&
+        if ( ((nr_reps == 1) && (ops->write != x86emul_unhandleable_rw)) ||
+             !ops->rep_ins ||
             ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
-                                &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             fail_if(ops->read_io == NULL);
             if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
@@ -2750,6 +2762,8 @@ x86_emulate(
                                _regs.edi,
                                nr_reps * ((_regs.eflags & EFLG_DF) ?
                                           -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -2760,14 +2774,10 @@ x86_emulate(
         ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
         if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
             goto done;
-        if ( (nr_reps > 1) && (ops->rep_outs != NULL) &&
+        if ( ((nr_reps == 1) && (ops->read != x86emul_unhandleable_rw)) ||
+             !ops->rep_outs ||
             ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
-                                 &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) )
-        {
-            if ( rc != 0 )
-                goto done;
-        }
-        else
+                                 &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
         {
             if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
                                   &dst.val, dst.bytes, ctxt, ops)) != 0 )
@@ -2781,6 +2791,8 @@ x86_emulate(
                                _regs.esi,
                                nr_reps * ((_regs.eflags & EFLG_DF) ?
                                           -dst.bytes : dst.bytes));
         put_rep_prefix(nr_reps);
+        if ( rc != X86EMUL_OKAY )
+            goto done;
         break;
     }
 
@@ -3038,6 +3050,7 @@ x86_emulate(
             dst.val = _regs.eax;
             dst.type = OP_MEM;
             nr_reps = 1;
+            rc = X86EMUL_OKAY;
         }
         else if ( rc != X86EMUL_OKAY )
             goto done;
@@ -3846,7 +3859,11 @@ x86_emulate(
             rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
         }
         if ( rc != 0 )
+        {
+            if ( rc == X86EMUL_DONE )
+                goto no_writeback;
             goto done;
+        }
         break;
     }
 
@@ -5198,11 +5215,6 @@ x86_emulate(
     }
 
  no_writeback:
-    /* Inject #DB if single-step tracing was enabled at instruction start. */
-    if ( (ctxt->regs->eflags & EFLG_TF) && (rc == X86EMUL_OKAY) &&
-         (ops->inject_hw_exception != NULL) )
-        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
-
     /* Commit shadow register state. */
     _regs.eflags &= ~EFLG_RF;
 
@@ -5210,7 +5222,18 @@ x86_emulate(
     if ( !mode_64bit() )
         _regs.eip = (uint32_t)_regs.eip;
 
-    *ctxt->regs = _regs;
+    if ( rc != X86EMUL_DONE )
+        *ctxt->regs = _regs;
+    else
+    {
+        ctxt->regs->eip = _regs.eip;
+        ctxt->regs->eflags = _regs.eflags;
+        rc = X86EMUL_OKAY;
+    }
+
+    /* Inject #DB if single-step tracing was enabled at instruction start. */
+    if ( tf && (rc == X86EMUL_OKAY) && ops->inject_hw_exception )
+        rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION;
 
  done:
     _put_fpu();
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -111,6 +111,13 @@ struct __packed segment_register {
 #define X86EMUL_RETRY          3
 /* (cmpxchg accessor): CMPXCHG failed. Maps to X86EMUL_RETRY in caller. */
 #define X86EMUL_CMPXCHG_FAILED 3
+/*
+ * Operation fully done by one of the hooks:
+ * - read_segment(x86_seg_tr, ...): bypass I/O bitmap access
+ * - read_io() / write_io(): bypass GPR update (non-string insns only)
+ * Undefined behavior when used anywhere else.
+ */
+#define X86EMUL_DONE           4
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -531,6 +538,15 @@ x86emul_unhandleable_rw(
     void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt);
+/* Unhandleable cmpxchg */
+int
+x86emul_unhandleable_cx(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_old,
+    void *p_new,
+    unsigned int bytes,
+    struct x86_emulate_ctxt *ctxt);
 
 #ifdef __XEN__
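
A note for reviewers, not part of the patch: the X86EMUL_DONE flow is easier
to see in isolation. The sketch below is a minimal standalone model of the
epilogue contract - on X86EMUL_DONE only rIP/rFLAGS get committed and the
return code degrades to X86EMUL_OKAY, with the single-step #DB (latched at
instruction start) injected only after that commit. All names here
(emulate_one(), read_io_done(), inject_db(), struct regs) are invented for
the illustration; they are not the hypervisor's.

/* Standalone model of the new epilogue ordering - assumptions, not Xen code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { X86EMUL_OKAY, X86EMUL_EXCEPTION, X86EMUL_DONE };

struct regs { uint64_t rip, rflags, rax; };
#define EFLG_TF (1ULL << 8)

/* Hypothetical hook: performed the whole access itself (cf. the I/O stub). */
static int read_io_done(struct regs *r) { (void)r; return X86EMUL_DONE; }

/* Hypothetical #DB injector standing in for ops->inject_hw_exception(). */
static int inject_db(struct regs *r) { (void)r; return X86EMUL_OKAY; }

static int emulate_one(struct regs *guest, struct regs shadow)
{
    bool tf = guest->rflags & EFLG_TF;   /* latched at instruction start */
    int rc = read_io_done(guest);

    if ( rc != X86EMUL_DONE )
        *guest = shadow;                 /* normal path: full commit */
    else
    {
        guest->rip = shadow.rip;         /* DONE path: rIP/rFLAGS only, */
        guest->rflags = shadow.rflags;   /* GPR write-back is bypassed  */
        rc = X86EMUL_OKAY;
    }

    /* Raise #DB only after registers were updated - it's a trap. */
    if ( tf && rc == X86EMUL_OKAY )
    {
        int irc = inject_db(guest);
        rc = irc ? irc : X86EMUL_EXCEPTION;  /* mirrors the "? :" in the patch */
    }

    return rc;
}

int main(void)
{
    struct regs guest = { .rip = 0x1000, .rflags = EFLG_TF, .rax = 0 };
    struct regs shadow = { .rip = 0x1002, .rflags = EFLG_TF, .rax = 0x42 };
    int rc = emulate_one(&guest, shadow);

    /* rax stays 0: the hook already updated guest state via its own path. */
    printf("rc=%d rip=%#llx rax=%#llx\n", rc,
           (unsigned long long)guest.rip, (unsigned long long)guest.rax);
    return 0;
}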
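Likewise, the partial-completion contract the new rep_ins() / rep_outs()
hooks follow (and which the put_rep_prefix() change consumes) can be shown
standalone: *reps is in/out - the requested count on input, the completed
iterations on output - and returning X86EMUL_EXCEPTION with a non-zero *reps
lets the caller commit register state for the work already done while
suppressing the rIP update. copy_one() below is a made-up stand-in for the
real port access plus guest memory copy, failing on the third iteration to
model a page fault mid-string:

/* Schematic sketch of the rep-hook *reps contract - not the Xen hooks. */
#include <stdint.h>
#include <stdio.h>

enum { X86EMUL_OKAY = 0, X86EMUL_EXCEPTION = 2 };

/*
 * Hypothetical per-iteration body standing in for guest_io_read() /
 * guest_io_write() plus __copy_to_user() / __copy_from_user().
 */
static int copy_one(uint16_t port, unsigned long offset, unsigned long done)
{
    (void)port; (void)offset;
    return done < 2 ? X86EMUL_OKAY : X86EMUL_EXCEPTION;
}

static int rep_hook(uint16_t port, unsigned long offset,
                    unsigned int bytes_per_rep, unsigned long *reps)
{
    unsigned long goal = *reps;

    *reps = 0;                     /* progress gets reported even on failure */

    while ( *reps < goal )
    {
        int rc = copy_one(port, offset, *reps);

        if ( rc != X86EMUL_OKAY )  /* e.g. a page fault mid-string */
            return rc;             /* *reps iterations did complete */

        ++*reps;                   /* count only fully performed iterations */
        offset += bytes_per_rep;   /* with EFLAGS.DF set this would subtract */
    }

    return X86EMUL_OKAY;
}

int main(void)
{
    unsigned long reps = 5;
    int rc = rep_hook(0x60, 0, 1, &reps);

    /* Caller: commit rCX/rSI/rDI for the 2 done iterations, no rIP update. */
    printf("rc=%d completed=%lu of 5\n", rc, reps);
    return 0;
}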
x86-PV-priv-op-generic-emul.patch _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx https://lists.xen.org/xen-devel