[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] Merge latest xen-unstable into xen-ia64-unstable
# HG changeset patch # User djm@xxxxxxxxxxxxxxx # Node ID 06d84bf87159174ef040a67f4ce90fcb72469b14 # Parent 97dbd9524a7e918f2ffb2d5155a7e32c48f7f355 # Parent 2f83ff9f6bd2a7987c297b84bfce1f8e22409cae Merge latest xen-unstable into xen-ia64-unstable diff -r 97dbd9524a7e -r 06d84bf87159 .hgignore --- a/.hgignore Thu Sep 22 17:34:14 2005 +++ b/.hgignore Thu Sep 22 17:42:01 2005 @@ -86,6 +86,9 @@ ^tools/check/\..*$ ^tools/console/xenconsoled$ ^tools/console/xenconsole$ +^tools/debugger/gdb/gdb-6\.2\.1\.tar\.bz2$ +^tools/debugger/gdb/gdb-6\.2\.1/.*$ +^tools/debugger/gdb/gdb-6\.2\.1-linux-i386-xen/.*$ ^tools/debugger/pdb/pdb$ ^tools/debugger/pdb/linux-[0-9.]*-module/.*\.ko$ ^tools/debugger/pdb/linux-[0-9.]*-module/.*\.mod.c$ @@ -136,9 +139,10 @@ ^tools/vnet/vnet-module/\..*\.cmd$ ^tools/vnet/vnet-module/\.tmp_versions/.*$ ^tools/vnet/vnet-module/vnet_module\.mod\..*$ -^tools/vtpm/vtpm* -^tools/vtpm/tpm_emulator-* -^tools/vtpm_manager/manager/vtpm_managerd +^tools/vtpm/tpm_emulator/.*$ +^tools/vtpm/tpm_emulator-.*\.tar\.gz$ +^tools/vtpm/vtpm/.*$ +^tools/vtpm_manager/manager/vtpm_managerd$ ^tools/xcutils/xc_restore$ ^tools/xcutils/xc_save$ ^tools/xenstat/xentop/xentop$ @@ -156,6 +160,7 @@ ^tools/xenstore/xs_stress$ ^tools/xenstore/xs_test$ ^tools/xenstore/xs_watch_stress$ +^tools/xentrace/xenctx$ ^tools/xentrace/xentrace$ ^xen/BLOG$ ^xen/TAGS$ diff -r 97dbd9524a7e -r 06d84bf87159 Makefile --- a/Makefile Thu Sep 22 17:34:14 2005 +++ b/Makefile Thu Sep 22 17:42:01 2005 @@ -98,11 +98,14 @@ $(MAKE) -C tools clean $(MAKE) -C docs clean -# clean, but blow away kernel build tree plus tar balls -mrproper: clean +# clean, but blow away kernel build tree plus tarballs +distclean: clean rm -rf dist patches/tmp for i in $(ALLKERNELS) ; do $(MAKE) $$i-delete ; done for i in $(ALLSPARSETREES) ; do $(MAKE) $$i-mrproper ; done + +# Linux name for GNU distclean +mrproper: distclean install-logging: LOGGING=logging-0.4.9.2 install-logging: @@ -142,7 +145,7 @@ @echo 'Cleaning targets:' @echo 
' clean - clean the Xen, tools and docs (but not' @echo ' guest kernel) trees' - @echo ' mrproper - clean plus delete kernel tarballs and kernel' + @echo ' distclean - clean plus delete kernel tarballs and kernel' @echo ' build trees' @echo ' kdelete - delete guest kernel build trees' @echo ' kclean - clean guest kernel build trees' @@ -163,27 +166,25 @@ uninstall: [ -d $(D)/etc/xen ] && mv -f $(D)/etc/xen $(D)/etc/xen.old-`date +%s` rm -rf $(D)/etc/init.d/xend* - rm -rf $(D)/usr/$(LIBDIR)/libxc* $(D)/usr/$(LIBDIR)/libxutil* - rm -rf $(D)/usr/$(LIBDIR)/python/xen $(D)/usr/include/xen - rm -rf $(D)/usr/$(LIBDIR)/share/xen $(D)/usr/$(LIBDIR)/libxenstore* + rm -rf $(D)/etc/hotplug/xen-backend.agent rm -rf $(D)/var/run/xen* $(D)/var/lib/xen* - rm -rf $(D)/usr/include/xcs_proto.h $(D)/usr/include/xc.h - rm -rf $(D)/usr/include/xs_lib.h $(D)/usr/include/xs.h - rm -rf $(D)/usr/sbin/xcs $(D)/usr/sbin/xcsdump $(D)/usr/sbin/xen* - rm -rf $(D)/usr/sbin/netfix - rm -rf $(D)/usr/sbin/xfrd $(D)/usr/sbin/xm - rm -rf $(D)/usr/share/doc/xen $(D)/usr/man/man*/xentrace* - rm -rf $(D)/usr/bin/xen* $(D)/usr/bin/miniterm rm -rf $(D)/boot/*xen* rm -rf $(D)/lib/modules/*xen* + rm -rf $(D)/usr/bin/xen* $(D)/usr/bin/lomount rm -rf $(D)/usr/bin/cpuperf-perfcntr $(D)/usr/bin/cpuperf-xen rm -rf $(D)/usr/bin/xc_shadow - rm -rf $(D)/usr/share/xen $(D)/usr/libexec/xen + rm -rf $(D)/usr/include/xenctrl.h + rm -rf $(D)/usr/include/xs_lib.h $(D)/usr/include/xs.h + rm -rf $(D)/usr/include/xen + rm -rf $(D)/usr/$(LIBDIR)/libxenctrl* $(D)/usr/$(LIBDIR)/libxenguest* + rm -rf $(D)/usr/$(LIBDIR)/libxenstore* + rm -rf $(D)/usr/$(LIBDIR)/python/xen $(D)/usr/$(LIBDIR)/xen + rm -rf $(D)/usr/libexec/xen + rm -rf $(D)/usr/sbin/xen* $(D)/usr/sbin/netfix $(D)/usr/sbin/xm + rm -rf $(D)/usr/share/doc/xen + rm -rf $(D)/usr/share/xen rm -rf $(D)/usr/share/man/man1/xen* rm -rf $(D)/usr/share/man/man8/xen* - rm -rf $(D)/usr/lib/xen - rm -rf $(D)/etc/hotplug.d/xen-backend - rm -rf $(D)/etc/hotplug/xen-backend.agent # 
Legacy targets for compatibility linux24: diff -r 97dbd9524a7e -r 06d84bf87159 docs/Makefile --- a/docs/Makefile Thu Sep 22 17:34:14 2005 +++ b/docs/Makefile Thu Sep 22 17:42:01 2005 @@ -12,7 +12,7 @@ pkgdocdir := /usr/share/doc/xen -DOC_TEX := $(wildcard src/*.tex) +DOC_TEX := src/user.tex src/interface.tex DOC_PS := $(patsubst src/%.tex,ps/%.ps,$(DOC_TEX)) DOC_PDF := $(patsubst src/%.tex,pdf/%.pdf,$(DOC_TEX)) DOC_HTML := $(patsubst src/%.tex,html/%/index.html,$(DOC_TEX)) @@ -36,11 +36,12 @@ $(MAKE) $(DOC_HTML); fi python-dev-docs: - mkdir -p api/tools/python + @mkdir -v -p api/tools/python @if which $(DOXYGEN) 1>/dev/null 2>/dev/null; then \ echo "Running doxygen to generate Python tools APIs ... "; \ $(DOXYGEN) Doxyfile; \ - $(MAKE) -C api/tools/python/latex ; fi + $(MAKE) -C api/tools/python/latex ; else \ + echo "Doxygen not installed; skipping python-dev-docs."; fi clean: rm -rf .word_count *.aux *.dvi *.bbl *.blg *.glo *.idx *~ diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/interface.tex --- a/docs/src/interface.tex Thu Sep 22 17:34:14 2005 +++ b/docs/src/interface.tex Thu Sep 22 17:42:01 2005 @@ -87,1084 +87,23 @@ mechanism and policy within the system. +%% chapter Virtual Architecture moved to architecture.tex +\include{src/interface/architecture} -\chapter{Virtual Architecture} +%% chapter Memory moved to memory.tex +\include{src/interface/memory} -On a Xen-based system, the hypervisor itself runs in {\it ring 0}. It -has full access to the physical memory available in the system and is -responsible for allocating portions of it to the domains. Guest -operating systems run in and use {\it rings 1}, {\it 2} and {\it 3} as -they see fit. Segmentation is used to prevent the guest OS from -accessing the portion of the address space that is reserved for -Xen. We expect most guest operating systems will use ring 1 for their -own operation and place applications in ring 3. 
+%% chapter Devices moved to devices.tex +\include{src/interface/devices} -In this chapter we consider the basic virtual architecture provided -by Xen: the basic CPU state, exception and interrupt handling, and -time. Other aspects such as memory and device access are discussed -in later chapters. - -\section{CPU state} - -All privileged state must be handled by Xen. The guest OS has no -direct access to CR3 and is not permitted to update privileged bits in -EFLAGS. Guest OSes use \emph{hypercalls} to invoke operations in Xen; -these are analogous to system calls but occur from ring 1 to ring 0. - -A list of all hypercalls is given in Appendix~\ref{a:hypercalls}. - - - -\section{Exceptions} - -A virtual IDT is provided --- a domain can submit a table of trap -handlers to Xen via the {\tt set\_trap\_table()} hypercall. Most trap -handlers are identical to native x86 handlers, although the page-fault -handler is somewhat different. - - -\section{Interrupts and events} - -Interrupts are virtualized by mapping them to \emph{events}, which are -delivered asynchronously to the target domain using a callback -supplied via the {\tt set\_callbacks()} hypercall. A guest OS can map -these events onto its standard interrupt dispatch mechanisms. Xen is -responsible for determining the target domain that will handle each -physical interrupt source. For more details on the binding of event -sources to events, see Chapter~\ref{c:devices}. - - - -\section{Time} - -Guest operating systems need to be aware of the passage of both real -(or wallclock) time and their own `virtual time' (the time for -which they have been executing). Furthermore, Xen has a notion of -time which is used for scheduling. The following notions of -time are provided: - -\begin{description} -\item[Cycle counter time.] - -This provides a fine-grained time reference. The cycle counter time is -used to accurately extrapolate the other time references. 
On SMP machines -it is currently assumed that the cycle counter time is synchronized between -CPUs. The current x86-based implementation achieves this within inter-CPU -communication latencies. - -\item[System time.] - -This is a 64-bit counter which holds the number of nanoseconds that -have elapsed since system boot. - - -\item[Wall clock time.] - -This is the time of day in a Unix-style {\tt struct timeval} (seconds -and microseconds since 1 January 1970, adjusted by leap seconds). An -NTP client hosted by {\it domain 0} can keep this value accurate. - - -\item[Domain virtual time.] - -This progresses at the same pace as system time, but only while a -domain is executing --- it stops while a domain is de-scheduled. -Therefore the share of the CPU that a domain receives is indicated by -the rate at which its virtual time increases. - -\end{description} - - -Xen exports timestamps for system time and wall-clock time to guest -operating systems through a shared page of memory. Xen also provides -the cycle counter time at the instant the timestamps were calculated, -and the CPU frequency in Hertz. This allows the guest to extrapolate -system and wall-clock times accurately based on the current cycle -counter time. - -Since all time stamps need to be updated and read \emph{atomically} -two version numbers are also stored in the shared info page. The -first is incremented prior to an update, while the second is only -incremented afterwards. Thus a guest can be sure that it read a consistent -state by checking the two version numbers are equal. - -Xen includes a periodic ticker which sends a timer event to the -currently executing domain every 10ms. The Xen scheduler also sends a -timer event whenever a domain is scheduled; this allows the guest OS -to adjust for the time that has passed while it has been inactive. 
In -addition, Xen allows each domain to request that they receive a timer -event sent at a specified system time by using the {\tt -set\_timer\_op()} hypercall. Guest OSes may use this timer to -implement timeout values when they block. - - - -%% % akw: demoting this to a section -- not sure if there is any point -%% % though, maybe just remove it. - -\section{Xen CPU Scheduling} - -Xen offers a uniform API for CPU schedulers. It is possible to choose -from a number of schedulers at boot and it should be easy to add more. -The BVT, Atropos and Round Robin schedulers are part of the normal -Xen distribution. BVT provides proportional fair shares of the CPU to -the running domains. Atropos can be used to reserve absolute shares -of the CPU for each domain. Round-robin is provided as an example of -Xen's internal scheduler API. - -\paragraph*{Note: SMP host support} -Xen has always supported SMP host systems. Domains are statically assigned to -CPUs, either at creation time or when manually pinning to a particular CPU. -The current schedulers then run locally on each CPU to decide which of the -assigned domains should be run there. The user-level control software -can be used to perform coarse-grain load-balancing between CPUs. - - -%% More information on the characteristics and use of these schedulers is -%% available in {\tt Sched-HOWTO.txt}. - - -\section{Privileged operations} - -Xen exports an extended interface to privileged domains (viz.\ {\it - Domain 0}). This allows such domains to build and boot other domains -on the server, and provides control interfaces for managing -scheduling, memory, networking, and block devices. - - -\chapter{Memory} -\label{c:memory} - -Xen is responsible for managing the allocation of physical memory to -domains, and for ensuring safe use of the paging and segmentation -hardware. 
- - -\section{Memory Allocation} - - -Xen resides within a small fixed portion of physical memory; it also -reserves the top 64MB of every virtual address space. The remaining -physical memory is available for allocation to domains at a page -granularity. Xen tracks the ownership and use of each page, which -allows it to enforce secure partitioning between domains. - -Each domain has a maximum and current physical memory allocation. -A guest OS may run a `balloon driver' to dynamically adjust its -current memory allocation up to its limit. - - -%% XXX SMH: I use machine and physical in the next section (which -%% is kinda required for consistency with code); wonder if this -%% section should use same terms? -%% -%% Probably. -%% -%% Merging this and below section at some point prob makes sense. - -\section{Pseudo-Physical Memory} - -Since physical memory is allocated and freed on a page granularity, -there is no guarantee that a domain will receive a contiguous stretch -of physical memory. However most operating systems do not have good -support for operating in a fragmented physical address space. To aid -porting such operating systems to run on top of Xen, we make a -distinction between \emph{machine memory} and \emph{pseudo-physical -memory}. - -Put simply, machine memory refers to the entire amount of memory -installed in the machine, including that reserved by Xen, in use by -various domains, or currently unallocated. We consider machine memory -to comprise a set of 4K \emph{machine page frames} numbered -consecutively starting from 0. Machine frame numbers mean the same -within Xen or any domain. - -Pseudo-physical memory, on the other hand, is a per-domain -abstraction. It allows a guest operating system to consider its memory -allocation to consist of a contiguous range of physical page frames -starting at physical frame 0, despite the fact that the underlying -machine page frames may be sparsely allocated and in any order. 
- -To achieve this, Xen maintains a globally readable {\it -machine-to-physical} table which records the mapping from machine page -frames to pseudo-physical ones. In addition, each domain is supplied -with a {\it physical-to-machine} table which performs the inverse -mapping. Clearly the machine-to-physical table has size proportional -to the amount of RAM installed in the machine, while each -physical-to-machine table has size proportional to the memory -allocation of the given domain. - -Architecture dependent code in guest operating systems can then use -the two tables to provide the abstraction of pseudo-physical -memory. In general, only certain specialized parts of the operating -system (such as page table management) needs to understand the -difference between machine and pseudo-physical addresses. - -\section{Page Table Updates} - -In the default mode of operation, Xen enforces read-only access to -page tables and requires guest operating systems to explicitly request -any modifications. Xen validates all such requests and only applies -updates that it deems safe. This is necessary to prevent domains from -adding arbitrary mappings to their page tables. - -To aid validation, Xen associates a type and reference count with each -memory page. A page has one of the following -mutually-exclusive types at any point in time: page directory ({\sf -PD}), page table ({\sf PT}), local descriptor table ({\sf LDT}), -global descriptor table ({\sf GDT}), or writable ({\sf RW}). Note that -a guest OS may always create readable mappings of its own memory -regardless of its current type. -%%% XXX: possibly explain more about ref count 'lifecyle' here? -This mechanism is used to -maintain the invariants required for safety; for example, a domain -cannot have a writable mapping to any part of a page table as this -would require the page concerned to simultaneously be of types {\sf - PT} and {\sf RW}. 
- - -%\section{Writable Page Tables} - -Xen also provides an alternative mode of operation in which guests be -have the illusion that their page tables are directly writable. Of -course this is not really the case, since Xen must still validate -modifications to ensure secure partitioning. To this end, Xen traps -any write attempt to a memory page of type {\sf PT} (i.e., that is -currently part of a page table). If such an access occurs, Xen -temporarily allows write access to that page while at the same time -{\em disconnecting} it from the page table that is currently in -use. This allows the guest to safely make updates to the page because -the newly-updated entries cannot be used by the MMU until Xen -revalidates and reconnects the page. -Reconnection occurs automatically in a number of situations: for -example, when the guest modifies a different page-table page, when the -domain is preempted, or whenever the guest uses Xen's explicit -page-table update interfaces. - -Finally, Xen also supports a form of \emph{shadow page tables} in -which the guest OS uses a independent copy of page tables which are -unknown to the hardware (i.e.\ which are never pointed to by {\tt -cr3}). Instead Xen propagates changes made to the guest's tables to the -real ones, and vice versa. This is useful for logging page writes -(e.g.\ for live migration or checkpoint). A full version of the shadow -page tables also allows guest OS porting with less effort. - -\section{Segment Descriptor Tables} - -On boot a guest is supplied with a default GDT, which does not reside -within its own memory allocation. If the guest wishes to use other -than the default `flat' ring-1 and ring-3 segments that this GDT -provides, it must register a custom GDT and/or LDT with Xen, -allocated from its own memory. Note that a number of GDT -entries are reserved by Xen -- any custom GDT must also include -sufficient space for these entries. 
- -For example, the following hypercall is used to specify a new GDT: - -\begin{quote} -int {\bf set\_gdt}(unsigned long *{\em frame\_list}, int {\em entries}) - -{\em frame\_list}: An array of up to 16 machine page frames within -which the GDT resides. Any frame registered as a GDT frame may only -be mapped read-only within the guest's address space (e.g., no -writable mappings, no use as a page-table page, and so on). - -{\em entries}: The number of descriptor-entry slots in the GDT. Note -that the table must be large enough to contain Xen's reserved entries; -thus we must have `{\em entries $>$ LAST\_RESERVED\_GDT\_ENTRY}\ '. -Note also that, after registering the GDT, slots {\em FIRST\_} through -{\em LAST\_RESERVED\_GDT\_ENTRY} are no longer usable by the guest and -may be overwritten by Xen. -\end{quote} - -The LDT is updated via the generic MMU update mechanism (i.e., via -the {\tt mmu\_update()} hypercall. - -\section{Start of Day} - -The start-of-day environment for guest operating systems is rather -different to that provided by the underlying hardware. In particular, -the processor is already executing in protected mode with paging -enabled. - -{\it Domain 0} is created and booted by Xen itself. For all subsequent -domains, the analogue of the boot-loader is the {\it domain builder}, -user-space software running in {\it domain 0}. The domain builder -is responsible for building the initial page tables for a domain -and loading its kernel image at the appropriate virtual address. - - - -\chapter{Devices} -\label{c:devices} - -Devices such as network and disk are exported to guests using a -split device driver. The device driver domain, which accesses the -physical device directly also runs a {\em backend} driver, serving -requests to that device from guests. Each guest will use a simple -{\em frontend} driver, to access the backend. 
Communication between these -domains is composed of two parts: First, data is placed onto a shared -memory page between the domains. Second, an event channel between the -two domains is used to pass notification that data is outstanding. -This separation of notification from data transfer allows message -batching, and results in very efficient device access. - -Event channels are used extensively in device virtualization; each -domain has a number of end-points or \emph{ports} each of which -may be bound to one of the following \emph{event sources}: -\begin{itemize} - \item a physical interrupt from a real device, - \item a virtual interrupt (callback) from Xen, or - \item a signal from another domain -\end{itemize} - -Events are lightweight and do not carry much information beyond -the source of the notification. Hence when performing bulk data -transfer, events are typically used as synchronization primitives -over a shared memory transport. Event channels are managed via -the {\tt event\_channel\_op()} hypercall; for more details see -Section~\ref{s:idc}. - -This chapter focuses on some individual device interfaces -available to Xen guests. - -\section{Network I/O} - -Virtual network device services are provided by shared memory -communication with a backend domain. From the point of view of -other domains, the backend may be viewed as a virtual ethernet switch -element with each domain having one or more virtual network interfaces -connected to it. - -\subsection{Backend Packet Handling} - -The backend driver is responsible for a variety of actions relating to -the transmission and reception of packets from the physical device. -With regard to transmission, the backend performs these key actions: - -\begin{itemize} -\item {\bf Validation:} To ensure that domains do not attempt to - generate invalid (e.g. spoofed) traffic, the backend driver may - validate headers ensuring that source MAC and IP addresses match the - interface that they have been sent from. 
- - Validation functions can be configured using standard firewall rules - ({\small{\tt iptables}} in the case of Linux). - -\item {\bf Scheduling:} Since a number of domains can share a single - physical network interface, the backend must mediate access when - several domains each have packets queued for transmission. This - general scheduling function subsumes basic shaping or rate-limiting - schemes. - -\item {\bf Logging and Accounting:} The backend domain can be - configured with classifier rules that control how packets are - accounted or logged. For example, log messages might be generated - whenever a domain attempts to send a TCP packet containing a SYN. -\end{itemize} - -On receipt of incoming packets, the backend acts as a simple -demultiplexer: Packets are passed to the appropriate virtual -interface after any necessary logging and accounting have been carried -out. - -\subsection{Data Transfer} - -Each virtual interface uses two ``descriptor rings'', one for transmit, -the other for receive. Each descriptor identifies a block of contiguous -physical memory allocated to the domain. - -The transmit ring carries packets to transmit from the guest to the -backend domain. The return path of the transmit ring carries messages -indicating that the contents have been physically transmitted and the -backend no longer requires the associated pages of memory. - -To receive packets, the guest places descriptors of unused pages on -the receive ring. The backend will return received packets by -exchanging these pages in the domain's memory with new pages -containing the received data, and passing back descriptors regarding -the new packets on the ring. This zero-copy approach allows the -backend to maintain a pool of free pages to receive packets into, and -then deliver them to appropriate domains after examining their -headers. 
- -% -%Real physical addresses are used throughout, with the domain performing -%translation from pseudo-physical addresses if that is necessary. - -If a domain does not keep its receive ring stocked with empty buffers then -packets destined to it may be dropped. This provides some defence against -receive livelock problems because an overload domain will cease to receive -further data. Similarly, on the transmit path, it provides the application -with feedback on the rate at which packets are able to leave the system. - - -Flow control on rings is achieved by including a pair of producer -indexes on the shared ring page. Each side will maintain a private -consumer index indicating the next outstanding message. In this -manner, the domains cooperate to divide the ring into two message -lists, one in each direction. Notification is decoupled from the -immediate placement of new messages on the ring; the event channel -will be used to generate notification when {\em either} a certain -number of outstanding messages are queued, {\em or} a specified number -of nanoseconds have elapsed since the oldest message was placed on the -ring. - -% Not sure if my version is any better -- here is what was here before: -%% Synchronization between the backend domain and the guest is achieved using -%% counters held in shared memory that is accessible to both. Each ring has -%% associated producer and consumer indices indicating the area in the ring -%% that holds descriptors that contain data. After receiving {\it n} packets -%% or {\t nanoseconds} after receiving the first packet, the hypervisor sends -%% an event to the domain. - -\section{Block I/O} - -All guest OS disk access goes through the virtual block device VBD -interface. This interface allows domains access to portions of block -storage devices visible to the the block backend device. The VBD -interface is a split driver, similar to the network interface -described above. 
A single shared memory ring is used between the -frontend and backend drivers, across which read and write messages are -sent. - -Any block device accessible to the backend domain, including -network-based block (iSCSI, *NBD, etc), loopback and LVM/MD devices, -can be exported as a VBD. Each VBD is mapped to a device node in the -guest, specified in the guest's startup configuration. - -Old (Xen 1.2) virtual disks are not supported under Xen 2.0, since -similar functionality can be achieved using the more complete LVM -system, which is already in widespread use. - -\subsection{Data Transfer} - -The single ring between the guest and the block backend supports three -messages: - -\begin{description} -\item [{\small {\tt PROBE}}:] Return a list of the VBDs available to this guest - from the backend. The request includes a descriptor of a free page - into which the reply will be written by the backend. - -\item [{\small {\tt READ}}:] Read data from the specified block device. The - front end identifies the device and location to read from and - attaches pages for the data to be copied to (typically via DMA from - the device). The backend acknowledges completed read requests as - they finish. - -\item [{\small {\tt WRITE}}:] Write data to the specified block device. This - functions essentially as {\small {\tt READ}}, except that the data moves to - the device instead of from it. -\end{description} - -% um... some old text -%% In overview, the same style of descriptor-ring that is used for -%% network packets is used here. Each domain has one ring that carries -%% operation requests to the hypervisor and carries the results back -%% again. - -%% Rather than copying data, the backend simply maps the domain's buffers -%% in order to enable direct DMA to them. 
The act of mapping the buffers -%% also increases the reference counts of the underlying pages, so that -%% the unprivileged domain cannot try to return them to the hypervisor, -%% install them as page tables, or any other unsafe behaviour. -%% %block API here - - -\chapter{Further Information} - - -If you have questions that are not answered by this manual, the -sources of information listed below may be of interest to you. Note -that bug reports, suggestions and contributions related to the -software (or the documentation) should be sent to the Xen developers' -mailing list (address below). - -\section{Other documentation} - -If you are mainly interested in using (rather than developing for) -Xen, the {\em Xen Users' Manual} is distributed in the {\tt docs/} -directory of the Xen source distribution. - -% Various HOWTOs are also available in {\tt docs/HOWTOS}. - -\section{Online references} - -The official Xen web site is found at: -\begin{quote} -{\tt http://www.cl.cam.ac.uk/Research/SRG/netos/xen/} -\end{quote} - -This contains links to the latest versions of all on-line -documentation. - -\section{Mailing lists} - -There are currently four official Xen mailing lists: - -\begin{description} -\item[xen-devel@xxxxxxxxxxxxxxxxxxx] Used for development -discussions and bug reports. Subscribe at: \\ -{\small {\tt http://lists.xensource.com/xen-devel}} -\item[xen-users@xxxxxxxxxxxxxxxxxxx] Used for installation and usage -discussions and requests for help. Subscribe at: \\ -{\small {\tt http://lists.xensource.com/xen-users}} -\item[xen-announce@xxxxxxxxxxxxxxxxxxx] Used for announcements only. -Subscribe at: \\ -{\small {\tt http://lists.xensource.com/xen-announce}} -\item[xen-changelog@xxxxxxxxxxxxxxxxxxx] Changelog feed -from the unstable and 2.0 trees - developer oriented. Subscribe at: \\ -{\small {\tt http://lists.xensource.com/xen-changelog}} -\end{description} - -Of these, xen-devel is the most active. 
- - +%% chapter Further Information moved to further_info.tex +\include{src/interface/further_info} \appendix -%\newcommand{\hypercall}[1]{\vspace{5mm}{\large\sf #1}} - - - - - -\newcommand{\hypercall}[1]{\vspace{2mm}{\sf #1}} - - - - - - -\chapter{Xen Hypercalls} -\label{a:hypercalls} - -Hypercalls represent the procedural interface to Xen; this appendix -categorizes and describes the current set of hypercalls. - -\section{Invoking Hypercalls} - -Hypercalls are invoked in a manner analogous to system calls in a -conventional operating system; a software interrupt is issued which -vectors to an entry point within Xen. On x86\_32 machines the -instruction required is {\tt int \$82}; the (real) IDT is setup so -that this may only be issued from within ring 1. The particular -hypercall to be invoked is contained in {\tt EAX} --- a list -mapping these values to symbolic hypercall names can be found -in {\tt xen/include/public/xen.h}. - -On some occasions a set of hypercalls will be required to carry -out a higher-level function; a good example is when a guest -operating wishes to context switch to a new process which -requires updating various privileged CPU state. As an optimization -for these cases, there is a generic mechanism to issue a set of -hypercalls as a batch: - -\begin{quote} -\hypercall{multicall(void *call\_list, int nr\_calls)} - -Execute a series of hypervisor calls; {\tt nr\_calls} is the length of -the array of {\tt multicall\_entry\_t} structures pointed to be {\tt -call\_list}. Each entry contains the hypercall operation code followed -by up to 7 word-sized arguments. -\end{quote} - -Note that multicalls are provided purely as an optimization; there is -no requirement to use them when first porting a guest operating -system. - - -\section{Virtual CPU Setup} - -At start of day, a guest operating system needs to setup the virtual -CPU it is executing on. 
This includes installing vectors for the -virtual IDT so that the guest OS can handle interrupts, page faults, -etc. However the very first thing a guest OS must setup is a pair -of hypervisor callbacks: these are the entry points which Xen will -use when it wishes to notify the guest OS of an occurrence. - -\begin{quote} -\hypercall{set\_callbacks(unsigned long event\_selector, unsigned long - event\_address, unsigned long failsafe\_selector, unsigned long - failsafe\_address) } - -Register the normal (``event'') and failsafe callbacks for -event processing. In each case the code segment selector and -address within that segment are provided. The selectors must -have RPL 1; in XenLinux we simply use the kernel's CS for both -{\tt event\_selector} and {\tt failsafe\_selector}. - -The value {\tt event\_address} specifies the address of the guest OSes -event handling and dispatch routine; the {\tt failsafe\_address} -specifies a separate entry point which is used only if a fault occurs -when Xen attempts to use the normal callback. -\end{quote} - - -After installing the hypervisor callbacks, the guest OS can -install a `virtual IDT' by using the following hypercall: - -\begin{quote} -\hypercall{set\_trap\_table(trap\_info\_t *table)} - -Install one or more entries into the per-domain -trap handler table (essentially a software version of the IDT). -Each entry in the array pointed to by {\tt table} includes the -exception vector number with the corresponding segment selector -and entry point. Most guest OSes can use the same handlers on -Xen as when running on the real hardware; an exception is the -page fault handler (exception vector 14) where a modified -stack-frame layout is used. - - -\end{quote} - - - -\section{Scheduling and Timer} - -Domains are preemptively scheduled by Xen according to the -parameters installed by domain 0 (see Section~\ref{s:dom0ops}). 
-In addition, however, a domain may choose to explicitly -control certain behavior with the following hypercall: - -\begin{quote} -\hypercall{sched\_op(unsigned long op)} - -Request scheduling operation from hypervisor. The options are: {\it -yield}, {\it block}, and {\it shutdown}. {\it yield} keeps the -calling domain runnable but may cause a reschedule if other domains -are runnable. {\it block} removes the calling domain from the run -queue and cause is to sleeps until an event is delivered to it. {\it -shutdown} is used to end the domain's execution; the caller can -additionally specify whether the domain should reboot, halt or -suspend. -\end{quote} - -To aid the implementation of a process scheduler within a guest OS, -Xen provides a virtual programmable timer: - -\begin{quote} -\hypercall{set\_timer\_op(uint64\_t timeout)} - -Request a timer event to be sent at the specified system time (time -in nanoseconds since system boot). The hypercall actually passes the -64-bit timeout value as a pair of 32-bit values. - -\end{quote} - -Note that calling {\tt set\_timer\_op()} prior to {\tt sched\_op} -allows block-with-timeout semantics. - - -\section{Page Table Management} - -Since guest operating systems have read-only access to their page -tables, Xen must be involved when making any changes. The following -multi-purpose hypercall can be used to modify page-table entries, -update the machine-to-physical mapping table, flush the TLB, install -a new page-table base pointer, and more. - -\begin{quote} -\hypercall{mmu\_update(mmu\_update\_t *req, int count, int *success\_count)} - -Update the page table for the domain; a set of {\tt count} updates are -submitted for processing in a batch, with {\tt success\_count} being -updated to report the number of successful updates. 
- -Each element of {\tt req[]} contains a pointer (address) and value; -the least significant 2-bits of the pointer are used to distinguish -the type of update requested as follows: -\begin{description} - -\item[\it MMU\_NORMAL\_PT\_UPDATE:] update a page directory entry or -page table entry to the associated value; Xen will check that the -update is safe, as described in Chapter~\ref{c:memory}. - -\item[\it MMU\_MACHPHYS\_UPDATE:] update an entry in the - machine-to-physical table. The calling domain must own the machine - page in question (or be privileged). - -\item[\it MMU\_EXTENDED\_COMMAND:] perform additional MMU operations. -The set of additional MMU operations is considerable, and includes -updating {\tt cr3} (or just re-installing it for a TLB flush), -flushing the cache, installing a new LDT, or pinning \& unpinning -page-table pages (to ensure their reference count doesn't drop to zero -which would require a revalidation of all entries). - -Further extended commands are used to deal with granting and -acquiring page ownership; see Section~\ref{s:idc}. - - -\end{description} - -More details on the precise format of all commands can be -found in {\tt xen/include/public/xen.h}. - - -\end{quote} - -Explicitly updating batches of page table entries is extremely -efficient, but can require a number of alterations to the guest -OS. Using the writable page table mode (Chapter~\ref{c:memory}) is -recommended for new OS ports. - -Regardless of which page table update mode is being used, however, -there are some occasions (notably handling a demand page fault) where -a guest OS will wish to modify exactly one PTE rather than a -batch. This is catered for by the following: - -\begin{quote} -\hypercall{update\_va\_mapping(unsigned long page\_nr, unsigned long -val, \\ unsigned long flags)} - -Update the currently installed PTE for the page {\tt page\_nr} to -{\tt val}. As with {\tt mmu\_update()}, Xen checks the modification -is safe before applying it. 
The {\tt flags} determine which kind -of TLB flush, if any, should follow the update. - -\end{quote} - -Finally, sufficiently privileged domains may occasionally wish to manipulate -the pages of others: -\begin{quote} - -\hypercall{update\_va\_mapping\_otherdomain(unsigned long page\_nr, -unsigned long val, unsigned long flags, uint16\_t domid)} - -Identical to {\tt update\_va\_mapping()} save that the pages being -mapped must belong to the domain {\tt domid}. - -\end{quote} - -This privileged operation is currently used by backend virtual device -drivers to safely map pages containing I/O data. - - - -\section{Segmentation Support} - -Xen allows guest OSes to install a custom GDT if they require it; -this is context switched transparently whenever a domain is -[de]scheduled. The following hypercall is effectively a -`safe' version of {\tt lgdt}: - -\begin{quote} -\hypercall{set\_gdt(unsigned long *frame\_list, int entries)} - -Install a global descriptor table for a domain; {\tt frame\_list} is -an array of up to 16 machine page frames within which the GDT resides, -with {\tt entries} being the actual number of descriptor-entry -slots. All page frames must be mapped read-only within the guest's -address space, and the table must be large enough to contain Xen's -reserved entries (see {\tt xen/include/public/arch-x86\_32.h}). - -\end{quote} - -Many guest OSes will also wish to install LDTs; this is achieved by -using {\tt mmu\_update()} with an extended command, passing the -linear address of the LDT base along with the number of entries. No -special safety checks are required; Xen needs to perform this task -simply since {\tt lldt} requires CPL 0. 
- - -Xen also allows guest operating systems to update just an -individual segment descriptor in the GDT or LDT: - -\begin{quote} -\hypercall{update\_descriptor(unsigned long ma, unsigned long word1, -unsigned long word2)} - -Update the GDT/LDT entry at machine address {\tt ma}; the new -8-byte descriptor is stored in {\tt word1} and {\tt word2}. -Xen performs a number of checks to ensure the descriptor is -valid. - -\end{quote} - -Guest OSes can use the above in place of context switching entire -LDTs (or the GDT) when the number of changing descriptors is small. - -\section{Context Switching} - -When a guest OS wishes to context switch between two processes, -it can use the page table and segmentation hypercalls described -above to perform the the bulk of the privileged work. In addition, -however, it will need to invoke Xen to switch the kernel (ring 1) -stack pointer: - -\begin{quote} -\hypercall{stack\_switch(unsigned long ss, unsigned long esp)} - -Request kernel stack switch from hypervisor; {\tt ss} is the new -stack segment, which {\tt esp} is the new stack pointer. - -\end{quote} - -A final useful hypercall for context switching allows ``lazy'' -save and restore of floating point state: - -\begin{quote} -\hypercall{fpu\_taskswitch(void)} - -This call instructs Xen to set the {\tt TS} bit in the {\tt cr0} -control register; this means that the next attempt to use floating -point will cause a trap which the guest OS can trap. Typically it will -then save/restore the FP state, and clear the {\tt TS} bit. -\end{quote} - -This is provided as an optimization only; guest OSes can also choose -to save and restore FP state on all context switches for simplicity. - - -\section{Physical Memory Management} - -As mentioned previously, each domain has a maximum and current -memory allocation. The maximum allocation, set at domain creation -time, cannot be modified. 
However a domain can choose to reduce -and subsequently grow its current allocation by using the -following call: - -\begin{quote} -\hypercall{dom\_mem\_op(unsigned int op, unsigned long *extent\_list, - unsigned long nr\_extents, unsigned int extent\_order)} - -Increase or decrease current memory allocation (as determined by -the value of {\tt op}). Each invocation provides a list of -extents each of which is $2^s$ pages in size, -where $s$ is the value of {\tt extent\_order}. - -\end{quote} - -In addition to simply reducing or increasing the current memory -allocation via a `balloon driver', this call is also useful for -obtaining contiguous regions of machine memory when required (e.g. -for certain PCI devices, or if using superpages). - - -\section{Inter-Domain Communication} -\label{s:idc} - -Xen provides a simple asynchronous notification mechanism via -\emph{event channels}. Each domain has a set of end-points (or -\emph{ports}) which may be bound to an event source (e.g. a physical -IRQ, a virtual IRQ, or an port in another domain). When a pair of -end-points in two different domains are bound together, then a `send' -operation on one will cause an event to be received by the destination -domain. - -The control and use of event channels involves the following hypercall: - -\begin{quote} -\hypercall{event\_channel\_op(evtchn\_op\_t *op)} - -Inter-domain event-channel management; {\tt op} is a discriminated -union which allows the following 7 operations: - -\begin{description} - -\item[\it alloc\_unbound:] allocate a free (unbound) local - port and prepare for connection from a specified domain. -\item[\it bind\_virq:] bind a local port to a virtual -IRQ; any particular VIRQ can be bound to at most one port per domain. -\item[\it bind\_pirq:] bind a local port to a physical IRQ; -once more, a given pIRQ can be bound to at most one port per -domain. Furthermore the calling domain must be sufficiently -privileged. 
-\item[\it bind\_interdomain:] construct an interdomain event -channel; in general, the target domain must have previously allocated -an unbound port for this channel, although this can be bypassed by -privileged domains during domain setup. -\item[\it close:] close an interdomain event channel. -\item[\it send:] send an event to the remote end of a -interdomain event channel. -\item[\it status:] determine the current status of a local port. -\end{description} - -For more details see -{\tt xen/include/public/event\_channel.h}. - -\end{quote} - -Event channels are the fundamental communication primitive between -Xen domains and seamlessly support SMP. However they provide little -bandwidth for communication {\sl per se}, and hence are typically -married with a piece of shared memory to produce effective and -high-performance inter-domain communication. - -Safe sharing of memory pages between guest OSes is carried out by -granting access on a per page basis to individual domains. This is -achieved by using the {\tt grant\_table\_op()} hypercall. - -\begin{quote} -\hypercall{grant\_table\_op(unsigned int cmd, void *uop, unsigned int count)} - -Grant or remove access to a particular page to a particular domain. - -\end{quote} - -This is not currently widely in use by guest operating systems, but -we intend to integrate support more fully in the near future. - -\section{PCI Configuration} - -Domains with physical device access (i.e.\ driver domains) receive -limited access to certain PCI devices (bus address space and -interrupts). However many guest operating systems attempt to -determine the PCI configuration by directly access the PCI BIOS, -which cannot be allowed for safety. - -Instead, Xen provides the following hypercall: - -\begin{quote} -\hypercall{physdev\_op(void *physdev\_op)} - -Perform a PCI configuration option; depending on the value -of {\tt physdev\_op} this can be a PCI config read, a PCI config -write, or a small number of other queries. 
- -\end{quote} - - -For examples of using {\tt physdev\_op()}, see the -Xen-specific PCI code in the linux sparse tree. - -\section{Administrative Operations} -\label{s:dom0ops} - -A large number of control operations are available to a sufficiently -privileged domain (typically domain 0). These allow the creation and -management of new domains, for example. A complete list is given -below: for more details on any or all of these, please see -{\tt xen/include/public/dom0\_ops.h} - - -\begin{quote} -\hypercall{dom0\_op(dom0\_op\_t *op)} - -Administrative domain operations for domain management. The options are: - -\begin{description} -\item [\it DOM0\_CREATEDOMAIN:] create a new domain - -\item [\it DOM0\_PAUSEDOMAIN:] remove a domain from the scheduler run -queue. - -\item [\it DOM0\_UNPAUSEDOMAIN:] mark a paused domain as schedulable - once again. - -\item [\it DOM0\_DESTROYDOMAIN:] deallocate all resources associated -with a domain - -\item [\it DOM0\_GETMEMLIST:] get list of pages used by the domain - -\item [\it DOM0\_SCHEDCTL:] - -\item [\it DOM0\_ADJUSTDOM:] adjust scheduling priorities for domain - -\item [\it DOM0\_BUILDDOMAIN:] do final guest OS setup for domain - -\item [\it DOM0\_GETDOMAINFO:] get statistics about the domain - -\item [\it DOM0\_GETPAGEFRAMEINFO:] - -\item [\it DOM0\_GETPAGEFRAMEINFO2:] - -\item [\it DOM0\_IOPL:] set I/O privilege level - -\item [\it DOM0\_MSR:] read or write model specific registers - -\item [\it DOM0\_DEBUG:] interactively invoke the debugger - -\item [\it DOM0\_SETTIME:] set system time - -\item [\it DOM0\_READCONSOLE:] read console content from hypervisor buffer ring - -\item [\it DOM0\_PINCPUDOMAIN:] pin domain to a particular CPU - -\item [\it DOM0\_GETTBUFS:] get information about the size and location of - the trace buffers (only on trace-buffer enabled builds) - -\item [\it DOM0\_PHYSINFO:] get information about the host machine - -\item [\it DOM0\_PCIDEV\_ACCESS:] modify PCI device access permissions - -\item 
[\it DOM0\_SCHED\_ID:] get the ID of the current Xen scheduler - -\item [\it DOM0\_SHADOW\_CONTROL:] switch between shadow page-table modes - -\item [\it DOM0\_SETDOMAININITIALMEM:] set initial memory allocation of a domain - -\item [\it DOM0\_SETDOMAINMAXMEM:] set maximum memory allocation of a domain - -\item [\it DOM0\_SETDOMAINVMASSIST:] set domain VM assist options -\end{description} -\end{quote} - -Most of the above are best understood by looking at the code -implementing them (in {\tt xen/common/dom0\_ops.c}) and in -the user-space tools that use them (mostly in {\tt tools/libxc}). - -\section{Debugging Hypercalls} - -A few additional hypercalls are mainly useful for debugging: - -\begin{quote} -\hypercall{console\_io(int cmd, int count, char *str)} - -Use Xen to interact with the console; operations are: - -{\it CONSOLEIO\_write}: Output count characters from buffer str. - -{\it CONSOLEIO\_read}: Input at most count characters into buffer str. -\end{quote} - -A pair of hypercalls allows access to the underlying debug registers: -\begin{quote} -\hypercall{set\_debugreg(int reg, unsigned long value)} - -Set debug register {\tt reg} to {\tt value} - -\hypercall{get\_debugreg(int reg)} - -Return the contents of the debug register {\tt reg} -\end{quote} - -And finally: -\begin{quote} -\hypercall{xen\_version(int cmd)} - -Request Xen version number. -\end{quote} - -This is useful to ensure that user-space tools are in sync -with the underlying hypervisor. - -\section{Deprecated Hypercalls} - -Xen is under constant development and refinement; as such there -are plans to improve the way in which various pieces of functionality -are exposed to guest OSes. - -\begin{quote} -\hypercall{vm\_assist(unsigned int cmd, unsigned int type)} - -Toggle various memory management modes (in particular wrritable page -tables and superpage support). 
- -\end{quote} - -This is likely to be replaced with mode values in the shared -information page since this is more resilient for resumption -after migration or checkpoint. - - - - - - +%% chapter hypercalls moved to hypercalls.tex +\include{src/interface/hypercalls} %% @@ -1173,279 +112,9 @@ %% new scheduler... not clear how many of them there are... %% -\begin{comment} - -\chapter{Scheduling API} - -The scheduling API is used by both the schedulers described above and should -also be used by any new schedulers. It provides a generic interface and also -implements much of the ``boilerplate'' code. - -Schedulers conforming to this API are described by the following -structure: - -\begin{verbatim} -struct scheduler -{ - char *name; /* full name for this scheduler */ - char *opt_name; /* option name for this scheduler */ - unsigned int sched_id; /* ID for this scheduler */ - - int (*init_scheduler) (); - int (*alloc_task) (struct task_struct *); - void (*add_task) (struct task_struct *); - void (*free_task) (struct task_struct *); - void (*rem_task) (struct task_struct *); - void (*wake_up) (struct task_struct *); - void (*do_block) (struct task_struct *); - task_slice_t (*do_schedule) (s_time_t); - int (*control) (struct sched_ctl_cmd *); - int (*adjdom) (struct task_struct *, - struct sched_adjdom_cmd *); - s32 (*reschedule) (struct task_struct *); - void (*dump_settings) (void); - void (*dump_cpu_state) (int); - void (*dump_runq_el) (struct task_struct *); -}; -\end{verbatim} - -The only method that {\em must} be implemented is -{\tt do\_schedule()}. However, if there is not some implementation for the -{\tt wake\_up()} method then waking tasks will not get put on the runqueue! - -The fields of the above structure are described in more detail below. - -\subsubsection{name} - -The name field should point to a descriptive ASCII string. - -\subsubsection{opt\_name} - -This field is the value of the {\tt sched=} boot-time option that will select -this scheduler. 
- -\subsubsection{sched\_id} - -This is an integer that uniquely identifies this scheduler. There should be a -macro corrsponding to this scheduler ID in {\tt <xen/sched-if.h>}. - -\subsubsection{init\_scheduler} - -\paragraph*{Purpose} - -This is a function for performing any scheduler-specific initialisation. For -instance, it might allocate memory for per-CPU scheduler data and initialise it -appropriately. - -\paragraph*{Call environment} - -This function is called after the initialisation performed by the generic -layer. The function is called exactly once, for the scheduler that has been -selected. - -\paragraph*{Return values} - -This should return negative on failure --- this will cause an -immediate panic and the system will fail to boot. - -\subsubsection{alloc\_task} - -\paragraph*{Purpose} -Called when a {\tt task\_struct} is allocated by the generic scheduler -layer. A particular scheduler implementation may use this method to -allocate per-task data for this task. It may use the {\tt -sched\_priv} pointer in the {\tt task\_struct} to point to this data. - -\paragraph*{Call environment} -The generic layer guarantees that the {\tt sched\_priv} field will -remain intact from the time this method is called until the task is -deallocated (so long as the scheduler implementation does not change -it explicitly!). - -\paragraph*{Return values} -Negative on failure. - -\subsubsection{add\_task} - -\paragraph*{Purpose} - -Called when a task is initially added by the generic layer. - -\paragraph*{Call environment} - -The fields in the {\tt task\_struct} are now filled out and available for use. -Schedulers should implement appropriate initialisation of any per-task private -information in this method. - -\subsubsection{free\_task} - -\paragraph*{Purpose} - -Schedulers should free the space used by any associated private data -structures. - -\paragraph*{Call environment} - -This is called when a {\tt task\_struct} is about to be deallocated. 
-The generic layer will have done generic task removal operations and -(if implemented) called the scheduler's {\tt rem\_task} method before -this method is called. - -\subsubsection{rem\_task} - -\paragraph*{Purpose} - -This is called when a task is being removed from scheduling (but is -not yet being freed). - -\subsubsection{wake\_up} - -\paragraph*{Purpose} - -Called when a task is woken up, this method should put the task on the runqueue -(or do the scheduler-specific equivalent action). - -\paragraph*{Call environment} - -The task is already set to state RUNNING. - -\subsubsection{do\_block} - -\paragraph*{Purpose} - -This function is called when a task is blocked. This function should -not remove the task from the runqueue. - -\paragraph*{Call environment} - -The EVENTS\_MASTER\_ENABLE\_BIT is already set and the task state changed to -TASK\_INTERRUPTIBLE on entry to this method. A call to the {\tt - do\_schedule} method will be made after this method returns, in -order to select the next task to run. - -\subsubsection{do\_schedule} - -This method must be implemented. - -\paragraph*{Purpose} - -The method is called each time a new task must be chosen for scheduling on the -current CPU. The current time as passed as the single argument (the current -task can be found using the {\tt current} macro). - -This method should select the next task to run on this CPU and set it's minimum -time to run as well as returning the data described below. - -This method should also take the appropriate action if the previous -task has blocked, e.g. removing it from the runqueue. - -\paragraph*{Call environment} - -The other fields in the {\tt task\_struct} are updated by the generic layer, -which also performs all Xen-specific tasks and performs the actual task switch -(unless the previous task has been chosen again). - -This method is called with the {\tt schedule\_lock} held for the current CPU -and local interrupts disabled. 
- -\paragraph*{Return values} - -Must return a {\tt struct task\_slice} describing what task to run and how long -for (at maximum). - -\subsubsection{control} - -\paragraph*{Purpose} - -This method is called for global scheduler control operations. It takes a -pointer to a {\tt struct sched\_ctl\_cmd}, which it should either -source data from or populate with data, depending on the value of the -{\tt direction} field. - -\paragraph*{Call environment} - -The generic layer guarantees that when this method is called, the -caller selected the correct scheduler ID, hence the scheduler's -implementation does not need to sanity-check these parts of the call. - -\paragraph*{Return values} - -This function should return the value to be passed back to user space, hence it -should either be 0 or an appropriate errno value. - -\subsubsection{sched\_adjdom} - -\paragraph*{Purpose} - -This method is called to adjust the scheduling parameters of a particular -domain, or to query their current values. The function should check -the {\tt direction} field of the {\tt sched\_adjdom\_cmd} it receives in -order to determine which of these operations is being performed. - -\paragraph*{Call environment} - -The generic layer guarantees that the caller has specified the correct -control interface version and scheduler ID and that the supplied {\tt -task\_struct} will not be deallocated during the call (hence it is not -necessary to {\tt get\_task\_struct}). - -\paragraph*{Return values} - -This function should return the value to be passed back to user space, hence it -should either be 0 or an appropriate errno value. - -\subsubsection{reschedule} - -\paragraph*{Purpose} - -This method is called to determine if a reschedule is required as a result of a -particular task. - -\paragraph*{Call environment} -The generic layer will cause a reschedule if the current domain is the idle -task or it has exceeded its minimum time slice before a reschedule. 
The -generic layer guarantees that the task passed is not currently running but is -on the runqueue. - -\paragraph*{Return values} - -Should return a mask of CPUs to cause a reschedule on. - -\subsubsection{dump\_settings} - -\paragraph*{Purpose} - -If implemented, this should dump any private global settings for this -scheduler to the console. - -\paragraph*{Call environment} - -This function is called with interrupts enabled. - -\subsubsection{dump\_cpu\_state} - -\paragraph*{Purpose} - -This method should dump any private settings for the specified CPU. - -\paragraph*{Call environment} - -This function is called with interrupts disabled and the {\tt schedule\_lock} -for the specified CPU held. - -\subsubsection{dump\_runq\_el} - -\paragraph*{Purpose} - -This method should dump any private settings for the specified task. - -\paragraph*{Call environment} - -This function is called with interrupts disabled and the {\tt schedule\_lock} -for the task's CPU held. - -\end{comment} - +%% \include{src/interface/scheduling} +%% scheduling information moved to scheduling.tex +%% still commented out @@ -1457,74 +126,9 @@ %% (and/or kip's stuff?) and write about that instead? %% -\begin{comment} - -\chapter{Debugging} - -Xen provides tools for debugging both Xen and guest OSes. Currently, the -Pervasive Debugger provides a GDB stub, which provides facilities for symbolic -debugging of Xen itself and of OS kernels running on top of Xen. The Trace -Buffer provides a lightweight means to log data about Xen's internal state and -behaviour at runtime, for later analysis. - -\section{Pervasive Debugger} - -Information on using the pervasive debugger is available in pdb.txt. - - -\section{Trace Buffer} - -The trace buffer provides a means to observe Xen's operation from domain 0. -Trace events, inserted at key points in Xen's code, record data that can be -read by the {\tt xentrace} tool. 
Recording these events has a low overhead -and hence the trace buffer may be useful for debugging timing-sensitive -behaviours. - -\subsection{Internal API} - -To use the trace buffer functionality from within Xen, you must {\tt \#include -<xen/trace.h>}, which contains definitions related to the trace buffer. Trace -events are inserted into the buffer using the {\tt TRACE\_xD} ({\tt x} = 0, 1, -2, 3, 4 or 5) macros. These all take an event number, plus {\tt x} additional -(32-bit) data as their arguments. For trace buffer-enabled builds of Xen these -will insert the event ID and data into the trace buffer, along with the current -value of the CPU cycle-counter. For builds without the trace buffer enabled, -the macros expand to no-ops and thus can be left in place without incurring -overheads. - -\subsection{Trace-enabled builds} - -By default, the trace buffer is enabled only in debug builds (i.e. {\tt NDEBUG} -is not defined). It can be enabled separately by defining {\tt TRACE\_BUFFER}, -either in {\tt <xen/config.h>} or on the gcc command line. - -The size (in pages) of the per-CPU trace buffers can be specified using the -{\tt tbuf\_size=n } boot parameter to Xen. If the size is set to 0, the trace -buffers will be disabled. - -\subsection{Dumping trace data} - -When running a trace buffer build of Xen, trace data are written continuously -into the buffer data areas, with newer data overwriting older data. This data -can be captured using the {\tt xentrace} program in domain 0. - -The {\tt xentrace} tool uses {\tt /dev/mem} in domain 0 to map the trace -buffers into its address space. It then periodically polls all the buffers for -new data, dumping out any new records from each buffer in turn. As a result, -for machines with multiple (logical) CPUs, the trace buffer output will not be -in overall chronological order. 
- -The output from {\tt xentrace} can be post-processed using {\tt -xentrace\_cpusplit} (used to split trace data out into per-cpu log files) and -{\tt xentrace\_format} (used to pretty-print trace data). For the predefined -trace points, there is an example format file in {\tt tools/xentrace/formats }. - -For more information, see the manual pages for {\tt xentrace}, {\tt -xentrace\_format} and {\tt xentrace\_cpusplit}. - -\end{comment} - - +%% \include{src/interface/debugging} +%% debugging information moved to debugging.tex +%% still commented out \end{document} diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user.tex --- a/docs/src/user.tex Thu Sep 22 17:34:14 2005 +++ b/docs/src/user.tex Thu Sep 22 17:42:01 2005 @@ -59,1803 +59,36 @@ \renewcommand{\floatpagefraction}{.8} \setstretch{1.1} + \part{Introduction and Tutorial} -\chapter{Introduction} - -Xen is a {\em paravirtualising} virtual machine monitor (VMM), or -`hypervisor', for the x86 processor architecture. Xen can securely -execute multiple virtual machines on a single physical system with -close-to-native performance. The virtual machine technology -facilitates enterprise-grade functionality, including: - -\begin{itemize} -\item Virtual machines with performance close to native - hardware. -\item Live migration of running virtual machines between physical hosts. -\item Excellent hardware support (supports most Linux device drivers). -\item Sandboxed, restartable device drivers. -\end{itemize} - -Paravirtualisation permits very high performance virtualisation, -even on architectures like x86 that are traditionally -very hard to virtualise. -The drawback of this approach is that it requires operating systems to -be {\em ported} to run on Xen. Porting an OS to run on Xen is similar -to supporting a new hardware platform, however the process -is simplified because the paravirtual machine architecture is very -similar to the underlying native hardware. 
Even though operating system -kernels must explicitly support Xen, a key feature is that user space -applications and libraries {\em do not} require modification. - -Xen support is available for increasingly many operating systems: -right now, Linux 2.4, Linux 2.6 and NetBSD are available for Xen 2.0. -A FreeBSD port is undergoing testing and will be incorporated into the -release soon. Other OS ports, including Plan 9, are in progress. We -hope that that arch-xen patches will be incorporated into the -mainstream releases of these operating systems in due course (as has -already happened for NetBSD). - -Possible usage scenarios for Xen include: -\begin{description} -\item [Kernel development.] Test and debug kernel modifications in a - sandboxed virtual machine --- no need for a separate test - machine. -\item [Multiple OS configurations.] Run multiple operating systems - simultaneously, for instance for compatibility or QA purposes. -\item [Server consolidation.] Move multiple servers onto a single - physical host with performance and fault isolation provided at - virtual machine boundaries. -\item [Cluster computing.] Management at VM granularity provides more - flexibility than separately managing each physical host, but - better control and isolation than single-system image solutions, - particularly by using live migration for load balancing. -\item [Hardware support for custom OSes.] Allow development of new OSes - while benefiting from the wide-ranging hardware support of - existing OSes such as Linux. -\end{description} - -\section{Structure of a Xen-Based System} - -A Xen system has multiple layers, the lowest and most privileged of -which is Xen itself. -Xen in turn may host multiple {\em guest} operating systems, each of -which is executed within a secure virtual machine (in Xen terminology, -a {\em domain}). Domains are scheduled by Xen to make effective use of -the available physical CPUs. 
Each guest OS manages its own -applications, which includes responsibility for scheduling each -application within the time allotted to the VM by Xen. - -The first domain, {\em domain 0}, is created automatically when the -system boots and has special management privileges. Domain 0 builds -other domains and manages their virtual devices. It also performs -administrative tasks such as suspending, resuming and migrating other -virtual machines. - -Within domain 0, a process called \emph{xend} runs to manage the system. -\Xend is responsible for managing virtual machines and providing access -to their consoles. Commands are issued to \xend over an HTTP -interface, either from a command-line tool or from a web browser. - -\section{Hardware Support} - -Xen currently runs only on the x86 architecture, requiring a `P6' or -newer processor (e.g. Pentium Pro, Celeron, Pentium II, Pentium III, -Pentium IV, Xeon, AMD Athlon, AMD Duron). Multiprocessor machines are -supported, and we also have basic support for HyperThreading (SMT), -although this remains a topic for ongoing research. A port -specifically for x86/64 is in progress, although Xen already runs on -such systems in 32-bit legacy mode. In addition a port to the IA64 -architecture is approaching completion. We hope to add other -architectures such as PPC and ARM in due course. - - -Xen can currently use up to 4GB of memory. It is possible for x86 -machines to address up to 64GB of physical memory but there are no -current plans to support these systems: The x86/64 port is the -planned route to supporting larger memory sizes. - -Xen offloads most of the hardware support issues to the guest OS -running in Domain~0. Xen itself contains only the code required to -detect and start secondary processors, set up interrupt routing, and -perform PCI bus enumeration. Device drivers run within a privileged -guest OS rather than within Xen itself. 
This approach provides -compatibility with the majority of device hardware supported by Linux. -The default XenLinux build contains support for relatively modern -server-class network and disk hardware, but you can add support for -other hardware by configuring your XenLinux kernel in the normal way. - -\section{History} - -Xen was originally developed by the Systems Research Group at the -University of Cambridge Computer Laboratory as part of the XenoServers -project, funded by the UK-EPSRC. -XenoServers aim to provide a `public infrastructure for -global distributed computing', and Xen plays a key part in that, -allowing us to efficiently partition a single machine to enable -multiple independent clients to run their operating systems and -applications in an environment providing protection, resource -isolation and accounting. The project web page contains further -information along with pointers to papers and technical reports: -\path{http://www.cl.cam.ac.uk/xeno} - -Xen has since grown into a fully-fledged project in its own right, -enabling us to investigate interesting research issues regarding the -best techniques for virtualising resources such as the CPU, memory, -disk and network. The project has been bolstered by support from -Intel Research Cambridge, and HP Labs, who are now working closely -with us. - -Xen was first described in a paper presented at SOSP in -2003\footnote{\tt -http://www.cl.cam.ac.uk/netos/papers/2003-xensosp.pdf}, and the first -public release (1.0) was made that October. Since then, Xen has -significantly matured and is now used in production scenarios on -many sites. - -Xen 2.0 features greatly enhanced hardware support, configuration -flexibility, usability and a larger complement of supported operating -systems. This latest release takes Xen a step closer to becoming the -definitive open source solution for virtualisation. 
- -\chapter{Installation} - -The Xen distribution includes three main components: Xen itself, ports -of Linux 2.4 and 2.6 and NetBSD to run on Xen, and the user-space -tools required to manage a Xen-based system. This chapter describes -how to install the Xen 2.0 distribution from source. Alternatively, -there may be pre-built packages available as part of your operating -system distribution. - -\section{Prerequisites} -\label{sec:prerequisites} - -The following is a full list of prerequisites. Items marked `$\dag$' -are required by the \xend control tools, and hence required if you -want to run more than one virtual machine; items marked `$*$' are only -required if you wish to build from source. -\begin{itemize} -\item A working Linux distribution using the GRUB bootloader and -running on a P6-class (or newer) CPU. -\item [$\dag$] The \path{iproute2} package. -\item [$\dag$] The Linux bridge-utils\footnote{Available from -{\tt http://bridge.sourceforge.net}} (e.g., \path{/sbin/brctl}) -\item [$\dag$] An installation of Twisted v1.3 or -above\footnote{Available from {\tt -http://www.twistedmatrix.com}}. There may be a binary package -available for your distribution; alternatively it can be installed by -running `{\sl make install-twisted}' in the root of the Xen source -tree. -\item [$*$] Build tools (gcc v3.2.x or v3.3.x, binutils, GNU make). -\item [$*$] Development installation of libcurl (e.g., libcurl-devel) -\item [$*$] Development installation of zlib (e.g., zlib-dev). -\item [$*$] Development installation of Python v2.2 or later (e.g., python-dev). -\item [$*$] \LaTeX and transfig are required to build the documentation. -\end{itemize} - -Once you have satisfied the relevant prerequisites, you can -now install either a binary or source distribution of Xen. 
- -\section{Installing from Binary Tarball} - -Pre-built tarballs are available for download from the Xen -download page -\begin{quote} -{\tt http://xen.sf.net} -\end{quote} - -Once you've downloaded the tarball, simply unpack and install: -\begin{verbatim} -# tar zxvf xen-2.0-install.tgz -# cd xen-2.0-install -# sh ./install.sh -\end{verbatim} - -Once you've installed the binaries you need to configure -your system as described in Section~\ref{s:configure}. - -\section{Installing from Source} - -This section describes how to obtain, build, and install -Xen from source. - -\subsection{Obtaining the Source} - -The Xen source tree is available as either a compressed source tar -ball or as a clone of our master BitKeeper repository. - -\begin{description} -\item[Obtaining the Source Tarball]\mbox{} \\ -Stable versions (and daily snapshots) of the Xen source tree are -available as compressed tarballs from the Xen download page -\begin{quote} -{\tt http://xen.sf.net} -\end{quote} - -\item[Using BitKeeper]\mbox{} \\ -If you wish to install Xen from a clone of our latest BitKeeper -repository then you will need to install the BitKeeper tools. -Download instructions for BitKeeper can be obtained by filling out the -form at: - -\begin{quote} -{\tt http://www.bitmover.com/cgi-bin/download.cgi} -\end{quote} -The public master BK repository for the 2.0 release lives at: -\begin{quote} -{\tt bk://xen.bkbits.net/xen-2.0.bk} -\end{quote} -You can use BitKeeper to -download it and keep it updated with the latest features and fixes. - -Change to the directory in which you want to put the source code, then -run: -\begin{verbatim} -# bk clone bk://xen.bkbits.net/xen-2.0.bk -\end{verbatim} - -Under your current directory, a new directory named \path{xen-2.0.bk} -has been created, which contains all the source code for Xen, the OS -ports, and the control tools. 
You can update your repository with the -latest changes at any time by running: -\begin{verbatim} -# cd xen-2.0.bk # to change into the local repository -# bk pull # to update the repository -\end{verbatim} -\end{description} - -%\section{The distribution} -% -%The Xen source code repository is structured as follows: -% -%\begin{description} -%\item[\path{tools/}] Xen node controller daemon (Xend), command line tools, -% control libraries -%\item[\path{xen/}] The Xen VMM. -%\item[\path{linux-*-xen-sparse/}] Xen support for Linux. -%\item[\path{linux-*-patches/}] Experimental patches for Linux. -%\item[\path{netbsd-*-xen-sparse/}] Xen support for NetBSD. -%\item[\path{docs/}] Various documentation files for users and developers. -%\item[\path{extras/}] Bonus extras. -%\end{description} - -\subsection{Building from Source} - -The top-level Xen Makefile includes a target `world' that will do the -following: - -\begin{itemize} -\item Build Xen -\item Build the control tools, including \xend -\item Download (if necessary) and unpack the Linux 2.6 source code, - and patch it for use with Xen -\item Build a Linux kernel to use in domain 0 and a smaller - unprivileged kernel, which can optionally be used for - unprivileged virtual machines. -\end{itemize} - - -After the build has completed you should have a top-level -directory called \path{dist/} in which all resulting targets -will be placed; of particular interest are the two kernels -XenLinux kernel images, one with a `-xen0' extension -which contains hardware device drivers and drivers for Xen's virtual -devices, and one with a `-xenU' extension that just contains the -virtual ones. These are found in \path{dist/install/boot/} along -with the image for Xen itself and the configuration files used -during the build. - -The NetBSD port can be built using: -\begin{quote} -\begin{verbatim} -# make netbsd20 -\end{verbatim} -\end{quote} -NetBSD port is built using a snapshot of the netbsd-2-0 cvs branch. 
-The snapshot is downloaded as part of the build process, if it is not -yet present in the \path{NETBSD\_SRC\_PATH} search path. The build -process also downloads a toolchain which includes all the tools -necessary to build the NetBSD kernel under Linux. - -To customize further the set of kernels built you need to edit -the top-level Makefile. Look for the line: - -\begin{quote} -\begin{verbatim} -KERNELS ?= mk.linux-2.6-xen0 mk.linux-2.6-xenU -\end{verbatim} -\end{quote} - -You can edit this line to include any set of operating system kernels -which have configurations in the top-level \path{buildconfigs/} -directory, for example \path{mk.linux-2.4-xenU} to build a Linux 2.4 -kernel containing only virtual device drivers. - -%% Inspect the Makefile if you want to see what goes on during a build. -%% Building Xen and the tools is straightforward, but XenLinux is more -%% complicated. The makefile needs a `pristine' Linux kernel tree to which -%% it will then add the Xen architecture files. You can tell the -%% makefile the location of the appropriate Linux compressed tar file by -%% setting the LINUX\_SRC environment variable, e.g. \\ -%% \verb!# LINUX_SRC=/tmp/linux-2.6.11.tar.bz2 make world! \\ or by -%% placing the tar file somewhere in the search path of {\tt -%% LINUX\_SRC\_PATH} which defaults to `{\tt .:..}'. If the makefile -%% can't find a suitable kernel tar file it attempts to download it from -%% kernel.org (this won't work if you're behind a firewall). - -%% After untaring the pristine kernel tree, the makefile uses the {\tt -%% mkbuildtree} script to add the Xen patches to the kernel. - - -%% The procedure is similar to build the Linux 2.4 port: \\ -%% \verb!# LINUX_SRC=/path/to/linux2.4/source make linux24! - - -%% \framebox{\parbox{5in}{ -%% {\bf Distro specific:} \\ -%% {\it Gentoo} --- if not using udev (most installations, currently), you'll need -%% to enable devfs and devfs mount at boot time in the xen0 config. 
-%% }} - -\subsection{Custom XenLinux Builds} - -% If you have an SMP machine you may wish to give the {\tt '-j4'} -% argument to make to get a parallel build. - -If you wish to build a customized XenLinux kernel (e.g. to support -additional devices or enable distribution-required features), you can -use the standard Linux configuration mechanisms, specifying that the -architecture being built for is \path{xen}, e.g: -\begin{quote} -\begin{verbatim} -# cd linux-2.6.11-xen0 -# make ARCH=xen xconfig -# cd .. -# make -\end{verbatim} -\end{quote} - -You can also copy an existing Linux configuration (\path{.config}) -into \path{linux-2.6.11-xen0} and execute: -\begin{quote} -\begin{verbatim} -# make ARCH=xen oldconfig -\end{verbatim} -\end{quote} - -You may be prompted with some Xen-specific options; we -advise accepting the defaults for these options. - -Note that the only difference between the two types of Linux kernel -that are built is the configuration file used for each. The "U" -suffixed (unprivileged) versions don't contain any of the physical -hardware device drivers, leading to a 30\% reduction in size; hence -you may prefer these for your non-privileged domains. The `0' -suffixed privileged versions can be used to boot the system, as well -as in driver domains and unprivileged domains. - - -\subsection{Installing the Binaries} - - -The files produced by the build process are stored under the -\path{dist/install/} directory. To install them in their default -locations, do: -\begin{quote} -\begin{verbatim} -# make install -\end{verbatim} -\end{quote} - - -Alternatively, users with special installation requirements may wish -to install them manually by copying the files to their appropriate -destinations. 
- -%% Files in \path{install/boot/} include: -%% \begin{itemize} -%% \item \path{install/boot/xen-2.0.gz} Link to the Xen 'kernel' -%% \item \path{install/boot/vmlinuz-2.6-xen0} Link to domain 0 XenLinux kernel -%% \item \path{install/boot/vmlinuz-2.6-xenU} Link to unprivileged XenLinux kernel -%% \end{itemize} - -The \path{dist/install/boot} directory will also contain the config files -used for building the XenLinux kernels, and also versions of Xen and -XenLinux kernels that contain debug symbols (\path{xen-syms-2.0.6} and -\path{vmlinux-syms-2.6.11.11-xen0}) which are essential for interpreting crash -dumps. Retain these files as the developers may wish to see them if -you post on the mailing list. - - - - - -\section{Configuration} -\label{s:configure} -Once you have built and installed the Xen distribution, it is -simple to prepare the machine for booting and running Xen. - -\subsection{GRUB Configuration} - -An entry should be added to \path{grub.conf} (often found under -\path{/boot/} or \path{/boot/grub/}) to allow Xen / XenLinux to boot. -This file is sometimes called \path{menu.lst}, depending on your -distribution. The entry should look something like the following: - -{\small -\begin{verbatim} -title Xen 2.0 / XenLinux 2.6 - kernel /boot/xen-2.0.gz dom0_mem=131072 - module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro console=tty0 -\end{verbatim} -} - -The kernel line tells GRUB where to find Xen itself and what boot -parameters should be passed to it (in this case, setting domain 0's -memory allocation in kilobytes and the settings for the serial port). For more -details on the various Xen boot parameters see Section~\ref{s:xboot}. 
- -The module line of the configuration describes the location of the -XenLinux kernel that Xen should start and the parameters that should -be passed to it (these are standard Linux parameters, identifying the -root device and specifying it be initially mounted read only and -instructing that console output be sent to the screen). Some -distributions such as SuSE do not require the \path{ro} parameter. - -%% \framebox{\parbox{5in}{ -%% {\bf Distro specific:} \\ -%% {\it SuSE} --- Omit the {\tt ro} option from the XenLinux kernel -%% command line, since the partition won't be remounted rw during boot. -%% }} - - -If you want to use an initrd, just add another \path{module} line to -the configuration, as usual: -{\small -\begin{verbatim} - module /boot/my_initrd.gz -\end{verbatim} -} - -As always when installing a new kernel, it is recommended that you do -not delete existing menu options from \path{menu.lst} --- you may want -to boot your old Linux kernel in future, particularly if you -have problems. - - -\subsection{Serial Console (optional)} - -%% kernel /boot/xen-2.0.gz dom0_mem=131072 com1=115200,8n1 -%% module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro - - -In order to configure Xen serial console output, it is necessary to add -a boot option to your GRUB config; e.g. replace the above kernel line -with: -\begin{quote} -{\small -\begin{verbatim} - kernel /boot/xen.gz dom0_mem=131072 com1=115200,8n1 -\end{verbatim}} -\end{quote} - -This configures Xen to output on COM1 at 115,200 baud, 8 data bits, -1 stop bit and no parity. Modify these parameters for your set up. - -One can also configure XenLinux to share the serial console; to -achieve this append ``\path{console=ttyS0}'' to your -module line. - - -If you wish to be able to log in over the XenLinux serial console it -is necessary to add a line into \path{/etc/inittab}, just as per -regular Linux. 
Simply add the line: -\begin{quote} -{\small -{\tt c:2345:respawn:/sbin/mingetty ttyS0} -} -\end{quote} - -and you should be able to log in. Note that to successfully log in -as root over the serial line will require adding \path{ttyS0} to -\path{/etc/securetty} in most modern distributions. - -\subsection{TLS Libraries} - -Users of the XenLinux 2.6 kernel should disable Thread Local Storage -(e.g.\ by doing a \path{mv /lib/tls /lib/tls.disabled}) before -attempting to run with a XenLinux kernel\footnote{If you boot without first -disabling TLS, you will get a warning message during the boot -process. In this case, simply perform the rename after the machine is -up and then run \texttt{/sbin/ldconfig} to make it take effect.}. You can -always reenable it by restoring the directory to its original location -(i.e.\ \path{mv /lib/tls.disabled /lib/tls}). - -The reason for this is that the current TLS implementation uses -segmentation in a way that is not permissible under Xen. If TLS is -not disabled, an emulation mode is used within Xen which reduces -performance substantially. - -We hope that this issue can be resolved by working with Linux -distribution vendors to implement a minor backward-compatible change -to the TLS library. - -\section{Booting Xen} - -It should now be possible to restart the system and use Xen. Reboot -as usual but choose the new Xen option when the Grub screen appears. - -What follows should look much like a conventional Linux boot. The -first portion of the output comes from Xen itself, supplying low level -information about itself and the machine it is running on. The -following portion of the output comes from XenLinux. - -You may see some errors during the XenLinux boot. These are not -necessarily anything to worry about --- they may result from kernel -configuration differences between your XenLinux kernel and the one you -usually use. - -When the boot completes, you should be able to log into your system as -usual. 
If you are unable to log in to your system running Xen, you -should still be able to reboot with your normal Linux kernel. - - -\chapter{Starting Additional Domains} - -The first step in creating a new domain is to prepare a root -filesystem for it to boot off. Typically, this might be stored in a -normal partition, an LVM or other volume manager partition, a disk -file or on an NFS server. A simple way to do this is simply to boot -from your standard OS install CD and install the distribution into -another partition on your hard drive. - -To start the \xend control daemon, type -\begin{quote} -\verb!# xend start! -\end{quote} -If you -wish the daemon to start automatically, see the instructions in -Section~\ref{s:xend}. Once the daemon is running, you can use the -\path{xm} tool to monitor and maintain the domains running on your -system. This chapter provides only a brief tutorial: we provide full -details of the \path{xm} tool in the next chapter. - -%\section{From the web interface} -% -%Boot the Xen machine and start Xensv (see Chapter~\ref{cha:xensv} for -%more details) using the command: \\ -%\verb_# xensv start_ \\ -%This will also start Xend (see Chapter~\ref{cha:xend} for more information). -% -%The domain management interface will then be available at {\tt -%http://your\_machine:8080/}. This provides a user friendly wizard for -%starting domains and functions for managing running domains. -% -%\section{From the command line} - - -\section{Creating a Domain Configuration File} - -Before you can start an additional domain, you must create a -configuration file. We provide two example files which you -can use as a starting point: -\begin{itemize} - \item \path{/etc/xen/xmexample1} is a simple template configuration file - for describing a single VM. - - \item \path{/etc/xen/xmexample2} file is a template description that - is intended to be reused for multiple virtual machines. 
Setting - the value of the \path{vmid} variable on the \path{xm} command line - fills in parts of this template. -\end{itemize} - -Copy one of these files and edit it as appropriate. -Typical values you may wish to edit include: - -\begin{quote} -\begin{description} -\item[kernel] Set this to the path of the kernel you compiled for use - with Xen (e.g.\ \path{kernel = '/boot/vmlinuz-2.6-xenU'}) -\item[memory] Set this to the size of the domain's memory in -megabytes (e.g.\ \path{memory = 64}) -\item[disk] Set the first entry in this list to calculate the offset -of the domain's root partition, based on the domain ID. Set the -second to the location of \path{/usr} if you are sharing it between -domains (e.g.\ \path{disk = ['phy:your\_hard\_drive\%d,sda1,w' \% -(base\_partition\_number + vmid), 'phy:your\_usr\_partition,sda6,r' ]} -\item[dhcp] Uncomment the dhcp variable, so that the domain will -receive its IP address from a DHCP server (e.g.\ \path{dhcp='dhcp'}) -\end{description} -\end{quote} - -You may also want to edit the {\bf vif} variable in order to choose -the MAC address of the virtual ethernet interface yourself. For -example: -\begin{quote} -\verb_vif = ['mac=00:06:AA:F6:BB:B3']_ -\end{quote} -If you do not set this variable, \xend will automatically generate a -random MAC address from an unused range. - - -\section{Booting the Domain} - -The \path{xm} tool provides a variety of commands for managing domains. -Use the \path{create} command to start new domains. Assuming you've -created a configuration file \path{myvmconf} based around -\path{/etc/xen/xmexample2}, to start a domain with virtual -machine ID~1 you should type: - -\begin{quote} -\begin{verbatim} -# xm create -c myvmconf vmid=1 -\end{verbatim} -\end{quote} - - -The \path{-c} switch causes \path{xm} to turn into the domain's -console after creation. The \path{vmid=1} sets the \path{vmid} -variable used in the \path{myvmconf} file. 
- - -You should see the console boot messages from the new domain -appearing in the terminal in which you typed the command, -culminating in a login prompt. - - -\section{Example: ttylinux} - -Ttylinux is a very small Linux distribution, designed to require very -few resources. We will use it as a concrete example of how to start a -Xen domain. Most users will probably want to install a full-featured -distribution once they have mastered the basics\footnote{ttylinux is -maintained by Pascal Schmidt. You can download source packages from -the distribution's home page: {\tt http://www.minimalinux.org/ttylinux/}}. - -\begin{enumerate} -\item Download and extract the ttylinux disk image from the Files -section of the project's SourceForge site (see -\path{http://sf.net/projects/xen/}). -\item Create a configuration file like the following: -\begin{verbatim} -kernel = "/boot/vmlinuz-2.6-xenU" -memory = 64 -name = "ttylinux" -nics = 1 -ip = "1.2.3.4" -disk = ['file:/path/to/ttylinux/rootfs,sda1,w'] -root = "/dev/sda1 ro" -\end{verbatim} -\item Now start the domain and connect to its console: -\begin{verbatim} -xm create configfile -c -\end{verbatim} -\item Login as root, password root. -\end{enumerate} - - -\section{Starting / Stopping Domains Automatically} - -It is possible to have certain domains start automatically at boot -time and to have dom0 wait for all running domains to shutdown before -it shuts down the system. - -To specify that a domain is to start at boot-time, place its -configuration file (or a link to it) under \path{/etc/xen/auto/}. - -A Sys-V style init script for RedHat and LSB-compliant systems is -provided and will be automatically copied to \path{/etc/init.d/} -during install. You can then enable it in the appropriate way for -your distribution. - -For instance, on RedHat: - -\begin{quote} -\verb_# chkconfig --add xendomains_ -\end{quote} - -By default, this will start the boot-time domains in runlevels 3, 4 -and 5. 
- -You can also use the \path{service} command to run this script -manually, e.g: - -\begin{quote} -\verb_# service xendomains start_ - -Starts all the domains with config files under /etc/xen/auto/. -\end{quote} - - -\begin{quote} -\verb_# service xendomains stop_ - -Shuts down ALL running Xen domains. -\end{quote} - -\chapter{Domain Management Tools} - -The previous chapter described a simple example of how to configure -and start a domain. This chapter summarises the tools available to -manage running domains. - -\section{Command-line Management} - -Command line management tasks are also performed using the \path{xm} -tool. For online help for the commands available, type: -\begin{quote} -\verb_# xm help_ -\end{quote} - -You can also type \path{xm help $<$command$>$} for more information -on a given command. - -\subsection{Basic Management Commands} - -The most important \path{xm} commands are: -\begin{quote} -\verb_# xm list_: Lists all domains running.\\ -\verb_# xm consoles_ : Gives information about the domain consoles.\\ -\verb_# xm console_: Opens a console to a domain (e.g.\ - \verb_# xm console myVM_ -\end{quote} - -\subsection{\tt xm list} - -The output of \path{xm list} is in rows of the following format: -\begin{center} -{\tt name domid memory cpu state cputime console} -\end{center} - -\begin{quote} -\begin{description} -\item[name] The descriptive name of the virtual machine. -\item[domid] The number of the domain ID this virtual machine is running in. -\item[memory] Memory size in megabytes. -\item[cpu] The CPU this domain is running on. -\item[state] Domain state consists of 5 fields: - \begin{description} - \item[r] running - \item[b] blocked - \item[p] paused - \item[s] shutdown - \item[c] crashed - \end{description} -\item[cputime] How much CPU time (in seconds) the domain has used so far. -\item[console] TCP port accepting connections to the domain's console. 
- -\end{description} -\end{quote} - -The \path{xm list} command also supports a long output format when the -\path{-l} switch is used. This outputs the full details of the -running domains in \xend's SXP configuration format. - -For example, suppose the system is running the ttylinux domain as -described earlier. The list command should produce output somewhat -like the following: -\begin{verbatim} -# xm list -Name Id Mem(MB) CPU State Time(s) Console -Domain-0 0 251 0 r---- 172.2 -ttylinux 5 63 0 -b--- 3.0 9605 -\end{verbatim} - -Here we can see the details for the ttylinux domain, as well as for -domain 0 (which, of course, is always running). Note that the console -port for the ttylinux domain is 9605. This can be connected to by TCP -using a terminal program (e.g. \path{telnet} or, better, -\path{xencons}). The simplest way to connect is to use the \path{xm console} -command, specifying the domain name or ID. To connect to the console -of the ttylinux domain, we could use any of the following: -\begin{verbatim} -# xm console ttylinux -# xm console 5 -# xencons localhost 9605 -\end{verbatim} - -\section{Domain Save and Restore} - -The administrator of a Xen system may suspend a virtual machine's -current state into a disk file in domain 0, allowing it to be resumed -at a later time. - -The ttylinux domain described earlier can be suspended to disk using -the command: -\begin{verbatim} -# xm save ttylinux ttylinux.xen -\end{verbatim} - -This will stop the domain named `ttylinux' and save its current state -into a file called \path{ttylinux.xen}. - -To resume execution of this domain, use the \path{xm restore} command: -\begin{verbatim} -# xm restore ttylinux.xen -\end{verbatim} - -This will restore the state of the domain and restart it. The domain -will carry on as before and the console may be reconnected using the -\path{xm console} command, as above. 
- -\section{Live Migration} - -Live migration is used to transfer a domain between physical hosts -whilst that domain continues to perform its usual activities --- from -the user's perspective, the migration should be imperceptible. - -To perform a live migration, both hosts must be running Xen / \xend and -the destination host must have sufficient resources (e.g. memory -capacity) to accommodate the domain after the move. Furthermore we -currently require both source and destination machines to be on the -same L2 subnet. - -Currently, there is no support for providing automatic remote access -to filesystems stored on local disk when a domain is migrated. -Administrators should choose an appropriate storage solution -(i.e. SAN, NAS, etc.) to ensure that domain filesystems are also -available on their destination node. GNBD is a good method for -exporting a volume from one machine to another. iSCSI can do a similar -job, but is more complex to set up. - -When a domain migrates, its MAC and IP address move with it, thus it -is only possible to migrate VMs within the same layer-2 network and IP -subnet. If the destination node is on a different subnet, the -administrator would need to manually configure a suitable etherip or -IP tunnel in the domain 0 of the remote node. - -A domain may be migrated using the \path{xm migrate} command. To -live migrate a domain to another machine, we would use -the command: - -\begin{verbatim} -# xm migrate --live mydomain destination.ournetwork.com -\end{verbatim} - -Without the \path{--live} flag, \xend simply stops the domain and -copies the memory image over to the new node and restarts it. Since -domains can have large allocations this can be quite time consuming, -even on a Gigabit network. With the \path{--live} flag \xend attempts -to keep the domain running while the migration is in progress, -resulting in typical `downtimes' of just 60--300ms. 
- -For now it will be necessary to reconnect to the domain's console on -the new machine using the \path{xm console} command. If a migrated -domain has any open network connections then they will be preserved, -so SSH connections do not have this limitation. - -\section{Managing Domain Memory} - -XenLinux domains have the ability to relinquish / reclaim machine -memory at the request of the administrator or the user of the domain. - -\subsection{Setting memory footprints from dom0} - -The machine administrator can request that a domain alter its memory -footprint using the \path{xm set-mem} command. For instance, we can -request that our example ttylinux domain reduce its memory footprint -to 32 megabytes. - -\begin{verbatim} -# xm set-mem ttylinux 32 -\end{verbatim} - -We can now see the result of this in the output of \path{xm list}: - -\begin{verbatim} -# xm list -Name Id Mem(MB) CPU State Time(s) Console -Domain-0 0 251 0 r---- 172.2 -ttylinux 5 31 0 -b--- 4.3 9605 -\end{verbatim} - -The domain has responded to the request by returning memory to Xen. We -can restore the domain to its original size using the command line: - -\begin{verbatim} -# xm set-mem ttylinux 64 -\end{verbatim} - -\subsection{Setting memory footprints from within a domain} - -The virtual file \path{/proc/xen/balloon} allows the owner of a -domain to adjust their own memory footprint. Reading the file -(e.g. \path{cat /proc/xen/balloon}) prints out the current -memory footprint of the domain. Writing the file -(e.g. \path{echo new\_target > /proc/xen/balloon}) requests -that the kernel adjust the domain's memory footprint to a new value. - -\subsection{Setting memory limits} - -Xen associates a memory size limit with each domain. By default, this -is the amount of memory the domain is originally started with, -preventing the domain from ever growing beyond this size. 
To permit a -domain to grow beyond its original allocation or to prevent a domain -you've shrunk from reclaiming the memory it relinquished, use the -\path{xm maxmem} command. - -\chapter{Domain Filesystem Storage} - -It is possible to directly export any Linux block device in dom0 to -another domain, or to export filesystems / devices to virtual machines -using standard network protocols (e.g. NBD, iSCSI, NFS, etc). This -chapter covers some of the possibilities. - - -\section{Exporting Physical Devices as VBDs} -\label{s:exporting-physical-devices-as-vbds} - -One of the simplest configurations is to directly export -individual partitions from domain 0 to other domains. To -achieve this use the \path{phy:} specifier in your domain -configuration file. For example a line like -\begin{quote} -\verb_disk = ['phy:hda3,sda1,w']_ -\end{quote} -specifies that the partition \path{/dev/hda3} in domain 0 -should be exported read-write to the new domain as \path{/dev/sda1}; -one could equally well export it as \path{/dev/hda} or -\path{/dev/sdb5} should one wish. - -In addition to local disks and partitions, it is possible to export -any device that Linux considers to be ``a disk'' in the same manner. -For example, if you have iSCSI disks or GNBD volumes imported into -domain 0 you can export these to other domains using the \path{phy:} -disk syntax. E.g.: -\begin{quote} -\verb_disk = ['phy:vg/lvm1,sda2,w']_ -\end{quote} - - - -\begin{center} -\framebox{\bf Warning: Block device sharing} -\end{center} -\begin{quote} -Block devices should typically only be shared between domains in a -read-only fashion otherwise the Linux kernel's file systems will get -very confused as the file system structure may change underneath them -(having the same ext3 partition mounted rw twice is a sure fire way to -cause irreparable damage)! 
\Xend will attempt to prevent you from -doing this by checking that the device is not mounted read-write in -domain 0, and hasn't already been exported read-write to another -domain. -If you want read-write sharing, export the directory to other domains -via NFS from domain0 (or use a cluster file system such as GFS or -ocfs2). - -\end{quote} - - -\section{Using File-backed VBDs} - -It is also possible to use a file in Domain 0 as the primary storage -for a virtual machine. As well as being convenient, this also has the -advantage that the virtual block device will be {\em sparse} --- space -will only really be allocated as parts of the file are used. So if a -virtual machine uses only half of its disk space then the file really -takes up half of the size allocated. - -For example, to create a 2GB sparse file-backed virtual block device -(actually only consumes 1KB of disk): -\begin{quote} -\verb_# dd if=/dev/zero of=vm1disk bs=1k seek=2048k count=1_ -\end{quote} - -Make a file system in the disk file: -\begin{quote} -\verb_# mkfs -t ext3 vm1disk_ -\end{quote} - -(when the tool asks for confirmation, answer `y') - -Populate the file system e.g. by copying from the current root: -\begin{quote} -\begin{verbatim} -# mount -o loop vm1disk /mnt -# cp -ax /{root,dev,var,etc,usr,bin,sbin,lib} /mnt -# mkdir /mnt/{proc,sys,home,tmp} -\end{verbatim} -\end{quote} - -Tailor the file system by editing \path{/etc/fstab}, -\path{/etc/hostname}, etc (don't forget to edit the files in the -mounted file system, instead of your domain 0 filesystem, e.g. you -would edit \path{/mnt/etc/fstab} instead of \path{/etc/fstab} ). For -this example put \path{/dev/sda1} to root in fstab. 
- -Now unmount (this is important!): -\begin{quote} -\verb_# umount /mnt_ -\end{quote} - -In the configuration file set: -\begin{quote} -\verb_disk = ['file:/full/path/to/vm1disk,sda1,w']_ -\end{quote} - -As the virtual machine writes to its `disk', the sparse file will be -filled in and consume more space up to the original 2GB. - -{\bf Note that file-backed VBDs may not be appropriate for backing -I/O-intensive domains.} File-backed VBDs are known to experience -substantial slowdowns under heavy I/O workloads, due to the I/O handling -by the loopback block device used to support file-backed VBDs in dom0. -Better I/O performance can be achieved by using either LVM-backed VBDs -(Section~\ref{s:using-lvm-backed-vbds}) or physical devices as VBDs -(Section~\ref{s:exporting-physical-devices-as-vbds}). - -Linux supports a maximum of eight file-backed VBDs across all domains by -default. This limit can be statically increased by using the {\em -max\_loop} module parameter if CONFIG\_BLK\_DEV\_LOOP is compiled as a -module in the dom0 kernel, or by using the {\em max\_loop=n} boot option -if CONFIG\_BLK\_DEV\_LOOP is compiled directly into the dom0 kernel. - - -\section{Using LVM-backed VBDs} -\label{s:using-lvm-backed-vbds} - -A particularly appealing solution is to use LVM volumes -as backing for domain file-systems since this allows dynamic -growing/shrinking of volumes as well as snapshot and other -features. 
- -To initialise a partition to support LVM volumes: -\begin{quote} -\begin{verbatim} -# pvcreate /dev/sda10 -\end{verbatim} -\end{quote} - -Create a volume group named `vg' on the physical partition: -\begin{quote} -\begin{verbatim} -# vgcreate vg /dev/sda10 -\end{verbatim} -\end{quote} - -Create a logical volume of size 4GB named `myvmdisk1': -\begin{quote} -\begin{verbatim} -# lvcreate -L4096M -n myvmdisk1 vg -\end{verbatim} -\end{quote} - -You should now see that you have a \path{/dev/vg/myvmdisk1} -Make a filesystem, mount it and populate it, e.g.: -\begin{quote} -\begin{verbatim} -# mkfs -t ext3 /dev/vg/myvmdisk1 -# mount /dev/vg/myvmdisk1 /mnt -# cp -ax / /mnt -# umount /mnt -\end{verbatim} -\end{quote} - -Now configure your VM with the following disk configuration: -\begin{quote} -\begin{verbatim} - disk = [ 'phy:vg/myvmdisk1,sda1,w' ] -\end{verbatim} -\end{quote} - -LVM enables you to grow the size of logical volumes, but you'll need -to resize the corresponding file system to make use of the new -space. Some file systems (e.g. ext3) now support on-line resize. See -the LVM manuals for more details. - -You can also use LVM for creating copy-on-write clones of LVM -volumes (known as writable persistent snapshots in LVM -terminology). This facility is new in Linux 2.6.8, so isn't as -stable as one might hope. In particular, using lots of CoW LVM -disks consumes a lot of dom0 memory, and error conditions such as -running out of disk space are not handled well. Hopefully this -will improve in future. - -To create two copy-on-write clone of the above file system you -would use the following commands: - -\begin{quote} -\begin{verbatim} -# lvcreate -s -L1024M -n myclonedisk1 /dev/vg/myvmdisk1 -# lvcreate -s -L1024M -n myclonedisk2 /dev/vg/myvmdisk1 -\end{verbatim} -\end{quote} - -Each of these can grow to have 1GB of differences from the master -volume. 
You can grow the amount of space for storing the -differences using the lvextend command, e.g.: -\begin{quote} -\begin{verbatim} -# lvextend +100M /dev/vg/myclonedisk1 -\end{verbatim} -\end{quote} - -Don't let the `differences volume' ever fill up otherwise LVM gets -rather confused. It may be possible to automate the growing -process by using \path{dmsetup wait} to spot the volume getting full -and then issue an \path{lvextend}. - -In principle, it is possible to continue writing to the volume -that has been cloned (the changes will not be visible to the -clones), but we wouldn't recommend this: have the cloned volume -as a `pristine' file system install that isn't mounted directly -by any of the virtual machines. - - -\section{Using NFS Root} - -First, populate a root filesystem in a directory on the server -machine. This can be on a distinct physical machine, or simply -run within a virtual machine on the same node. - -Now configure the NFS server to export this filesystem over the -network by adding a line to \path{/etc/exports}, for instance: - -\begin{quote} -\begin{small} -\begin{verbatim} -/export/vm1root 1.2.3.4/24 (rw,sync,no_root_squash) -\end{verbatim} -\end{small} -\end{quote} - -Finally, configure the domain to use NFS root. In addition to the -normal variables, you should make sure to set the following values in -the domain's configuration file: - -\begin{quote} -\begin{small} -\begin{verbatim} -root = '/dev/nfs' -nfs_server = '2.3.4.5' # substitute IP address of server -nfs_root = '/path/to/root' # path to root FS on the server -\end{verbatim} -\end{small} -\end{quote} - -The domain will need network access at boot time, so either statically -configure an IP address (Using the config variables \path{ip}, -\path{netmask}, \path{gateway}, \path{hostname}) or enable DHCP ( -\path{dhcp='dhcp'}). 
- -Note that the Linux NFS root implementation is known to have stability -problems under high load (this is not a Xen-specific problem), so this -configuration may not be appropriate for critical servers. + +%% Chapter Introduction moved to introduction.tex +\include{src/user/introduction} + +%% Chapter Installation moved to installation.tex +\include{src/user/installation} + +%% Chapter Starting Additional Domains moved to start_addl_dom.tex +\include{src/user/start_addl_dom} + +%% Chapter Domain Management Tools moved to domain_mgmt.tex +\include{src/user/domain_mgmt} + +%% Chapter Domain Filesystem Storage moved to domain_filesystem.tex +\include{src/user/domain_filesystem} + \part{User Reference Documentation} -\chapter{Control Software} - -The Xen control software includes the \xend node control daemon (which -must be running), the xm command line tools, and the prototype -xensv web interface. - -\section{\Xend (node control daemon)} -\label{s:xend} - -The Xen Daemon (\Xend) performs system management functions related to -virtual machines. It forms a central point of control for a machine -and can be controlled using an HTTP-based protocol. \Xend must be -running in order to start and manage virtual machines. - -\Xend must be run as root because it needs access to privileged system -management functions. A small set of commands may be issued on the -\xend command line: - -\begin{tabular}{ll} -\verb!# xend start! & start \xend, if not already running \\ -\verb!# xend stop! & stop \xend if already running \\ -\verb!# xend restart! & restart \xend if running, otherwise start it \\ -% \verb!# xend trace_start! & start \xend, with very detailed debug logging \\ -\verb!# xend status! & indicates \xend status by its return code -\end{tabular} - -A SysV init script called {\tt xend} is provided to start \xend at boot -time. {\tt make install} installs this script in {\path{/etc/init.d}. 
-To enable it, you have to make symbolic links in the appropriate -runlevel directories or use the {\tt chkconfig} tool, where available. - -Once \xend is running, more sophisticated administration can be done -using the xm tool (see Section~\ref{s:xm}) and the experimental -Xensv web interface (see Section~\ref{s:xensv}). - -As \xend runs, events will be logged to \path{/var/log/xend.log} and, -if the migration assistant daemon (\path{xfrd}) has been started, -\path{/var/log/xfrd.log}. These may be of use for troubleshooting -problems. - -\section{Xm (command line interface)} -\label{s:xm} - -The xm tool is the primary tool for managing Xen from the console. -The general format of an xm command line is: - -\begin{verbatim} -# xm command [switches] [arguments] [variables] -\end{verbatim} - -The available {\em switches} and {\em arguments} are dependent on the -{\em command} chosen. The {\em variables} may be set using -declarations of the form {\tt variable=value} and command line -declarations override any of the values in the configuration file -being used, including the standard variables described above and any -custom variables (for instance, the \path{xmdefconfig} file uses a -{\tt vmid} variable). - -The available commands are as follows: - -\begin{description} -\item[set-mem] Request a domain to adjust its memory footprint. -\item[create] Create a new domain. -\item[destroy] Kill a domain immediately. -\item[list] List running domains. -\item[shutdown] Ask a domain to shutdown. -\item[dmesg] Fetch the Xen (not Linux!) boot output. -\item[consoles] Lists the available consoles. -\item[console] Connect to the console for a domain. -\item[help] Get help on xm commands. -\item[save] Suspend a domain to disk. -\item[restore] Restore a domain from disk. -\item[pause] Pause a domain's execution. -\item[unpause] Unpause a domain. -\item[pincpu] Pin a domain to a CPU. -\item[bvt] Set BVT scheduler parameters for a domain. 
-\item[bvt\_ctxallow] Set the BVT context switching allowance for the system. -\item[atropos] Set the atropos parameters for a domain. -\item[rrobin] Set the round robin time slice for the system. -\item[info] Get information about the Xen host. -\item[call] Call a \xend HTTP API function directly. -\end{description} - -For a detailed overview of switches, arguments and variables to each command -try -\begin{quote} -\begin{verbatim} -# xm help command -\end{verbatim} -\end{quote} - -\section{Xensv (web control interface)} -\label{s:xensv} - -Xensv is the experimental web control interface for managing a Xen -machine. It can be used to perform some (but not yet all) of the -management tasks that can be done using the xm tool. - -It can be started using: -\begin{quote} -\verb_# xensv start_ -\end{quote} -and stopped using: -\begin{quote} -\verb_# xensv stop_ -\end{quote} - -By default, Xensv will serve out the web interface on port 8080. This -can be changed by editing -\path{/usr/lib/python2.3/site-packages/xen/sv/params.py}. - -Once Xensv is running, the web interface can be used to create and -manage running domains. - - - - -\chapter{Domain Configuration} -\label{cha:config} - -The following contains the syntax of the domain configuration -files and description of how to further specify networking, -driver domain and general scheduling behaviour. - -\section{Configuration Files} -\label{s:cfiles} - -Xen configuration files contain the following standard variables. -Unless otherwise stated, configuration items should be enclosed in -quotes: see \path{/etc/xen/xmexample1} and \path{/etc/xen/xmexample2} -for concrete examples of the syntax. - -\begin{description} -\item[kernel] Path to the kernel image -\item[ramdisk] Path to a ramdisk image (optional). -% \item[builder] The name of the domain build function (e.g. {\tt'linux'} or {\tt'netbsd'}. -\item[memory] Memory size in megabytes. -\item[cpu] CPU to run this domain on, or {\tt -1} for - auto-allocation. 
-\item[console] Port to export the domain console on (default 9600 + domain ID). -\item[nics] Number of virtual network interfaces. -\item[vif] List of MAC addresses (random addresses are assigned if not - given) and bridges to use for the domain's network interfaces, e.g. -\begin{verbatim} -vif = [ 'mac=aa:00:00:00:00:11, bridge=xen-br0', - 'bridge=xen-br1' ] -\end{verbatim} - to assign a MAC address and bridge to the first interface and assign - a different bridge to the second interface, leaving \xend to choose - the MAC address. -\item[disk] List of block devices to export to the domain, e.g. \\ - \verb_disk = [ 'phy:hda1,sda1,r' ]_ \\ - exports physical device \path{/dev/hda1} to the domain - as \path{/dev/sda1} with read-only access. Exporting a disk read-write - which is currently mounted is dangerous -- if you are \emph{certain} - you wish to do this, you can specify \path{w!} as the mode. -\item[dhcp] Set to {\tt 'dhcp'} if you want to use DHCP to configure - networking. -\item[netmask] Manually configured IP netmask. -\item[gateway] Manually configured IP gateway. -\item[hostname] Set the hostname for the virtual machine. -\item[root] Specify the root device parameter on the kernel command - line. -\item[nfs\_server] IP address for the NFS server (if any). -\item[nfs\_root] Path of the root filesystem on the NFS server (if any). -\item[extra] Extra string to append to the kernel command line (if - any) -\item[restart] Three possible options: - \begin{description} - \item[always] Always restart the domain, no matter what - its exit code is. - \item[never] Never restart the domain. - \item[onreboot] Restart the domain iff it requests reboot. - \end{description} -\end{description} - -For additional flexibility, it is also possible to include Python -scripting commands in configuration files. An example of this is the -\path{xmexample2} file, which uses Python code to handle the -\path{vmid} variable. 
- - -%\part{Advanced Topics} - -\section{Network Configuration} - -For many users, the default installation should work `out of the box'. -More complicated network setups, for instance with multiple ethernet -interfaces and/or existing bridging setups will require some -special configuration. - -The purpose of this section is to describe the mechanisms provided by -\xend to allow a flexible configuration for Xen's virtual networking. - -\subsection{Xen virtual network topology} - -Each domain network interface is connected to a virtual network -interface in dom0 by a point to point link (effectively a `virtual -crossover cable'). These devices are named {\tt -vif$<$domid$>$.$<$vifid$>$} (e.g. {\tt vif1.0} for the first interface -in domain 1, {\tt vif3.1} for the second interface in domain 3). - -Traffic on these virtual interfaces is handled in domain 0 using -standard Linux mechanisms for bridging, routing, rate limiting, etc. -Xend calls on two shell scripts to perform initial configuration of -the network and configuration of new virtual interfaces. By default, -these scripts configure a single bridge for all the virtual -interfaces. Arbitrary routing / bridging configurations can be -configured by customising the scripts, as described in the following -section. - -\subsection{Xen networking scripts} - -Xen's virtual networking is configured by two shell scripts (by -default \path{network} and \path{vif-bridge}). These are -called automatically by \xend when certain events occur, with -arguments to the scripts providing further contextual information. -These scripts are found by default in \path{/etc/xen/scripts}. The -names and locations of the scripts can be configured in -\path{/etc/xen/xend-config.sxp}. - -\begin{description} - -\item[network:] This script is called whenever \xend is started or -stopped to respectively initialise or tear down the Xen virtual -network. 
In the default configuration initialisation creates the -bridge `xen-br0' and moves eth0 onto that bridge, modifying the -routing accordingly. When \xend exits, it deletes the Xen bridge and -removes eth0, restoring the normal IP and routing configuration. - -%% In configurations where the bridge already exists, this script could -%% be replaced with a link to \path{/bin/true} (for instance). - -\item[vif-bridge:] This script is called for every domain virtual -interface and can configure firewalling rules and add the vif -to the appropriate bridge. By default, this adds and removes -VIFs on the default Xen bridge. - -\end{description} - -For more complex network setups (e.g. where routing is required or -integrate with existing bridges) these scripts may be replaced with -customised variants for your site's preferred configuration. - -%% There are two possible types of privileges: IO privileges and -%% administration privileges. - -\section{Driver Domain Configuration} - -I/O privileges can be assigned to allow a domain to directly access -PCI devices itself. This is used to support driver domains. - -Setting backend privileges is currently only supported in SXP format -config files. To allow a domain to function as a backend for others, -somewhere within the {\tt vm} element of its configuration file must -be a {\tt backend} element of the form {\tt (backend ({\em type}))} -where {\tt \em type} may be either {\tt netif} or {\tt blkif}, -according to the type of virtual device this domain will service. -%% After this domain has been built, \xend will connect all new and -%% existing {\em virtual} devices (of the appropriate type) to that -%% backend. - -Note that a block backend cannot currently import virtual block -devices from other domains, and a network backend cannot import -virtual network devices from other domains. 
Thus (particularly in the -case of block backends, which cannot import a virtual block device as -their root filesystem), you may need to boot a backend domain from a -ramdisk or a network device. - -Access to PCI devices may be configured on a per-device basis. Xen -will assign the minimal set of hardware privileges to a domain that -are required to control its devices. This can be configured in either -format of configuration file: - -\begin{itemize} -\item SXP Format: Include device elements of the form: \\ -\centerline{ {\tt (device (pci (bus {\em x}) (dev {\em y}) (func {\em z})))}} \\ - inside the top-level {\tt vm} element. Each one specifies the address - of a device this domain is allowed to access --- - the numbers {\em x},{\em y} and {\em z} may be in either decimal or - hexadecimal format. -\item Flat Format: Include a list of PCI device addresses of the - format: \\ -\centerline{{\tt pci = ['x,y,z', ...]}} \\ -where each element in the - list is a string specifying the components of the PCI device - address, separated by commas. The components ({\tt \em x}, {\tt \em - y} and {\tt \em z}) of the list may be formatted as either decimal - or hexadecimal. -\end{itemize} - -%% \section{Administration Domains} - -%% Administration privileges allow a domain to use the `dom0 -%% operations' (so called because they are usually available only to -%% domain 0). A privileged domain can build other domains, set scheduling -%% parameters, etc. - -% Support for other administrative domains is not yet available... perhaps -% we should plumb it in some time - - - - - -\section{Scheduler Configuration} -\label{s:sched} - - -Xen offers a boot time choice between multiple schedulers. To select -a scheduler, pass the boot parameter {\em sched=sched\_name} to Xen, -substituting the appropriate scheduler name. Details of the schedulers -and their parameters are included below; future versions of the tools -will provide a higher-level interface to these tools. 
- -It is expected that system administrators configure their system to -use the scheduler most appropriate to their needs. Currently, the BVT -scheduler is the recommended choice. - -\subsection{Borrowed Virtual Time} - -{\tt sched=bvt} (the default) \\ - -BVT provides proportional fair shares of the CPU time. It has been -observed to penalise domains that block frequently (e.g. I/O intensive -domains), but this can be compensated for by using warping. - -\subsubsection{Global Parameters} - -\begin{description} -\item[ctx\_allow] - the context switch allowance is similar to the `quantum' - in traditional schedulers. It is the minimum time that - a scheduled domain will be allowed to run before being - pre-empted. -\end{description} - -\subsubsection{Per-domain parameters} - -\begin{description} -\item[mcuadv] - the MCU (Minimum Charging Unit) advance determines the - proportional share of the CPU that a domain receives. It - is set inversely proportionally to a domain's sharing weight. -\item[warp] - the amount of `virtual time' the domain is allowed to warp - backwards -\item[warpl] - the warp limit is the maximum time a domain can run warped for -\item[warpu] - the unwarp requirement is the minimum time a domain must - run unwarped for before it can warp again -\end{description} - -\subsection{Atropos} - -{\tt sched=atropos} \\ - -Atropos is a soft real time scheduler. It provides guarantees about -absolute shares of the CPU, with a facility for sharing -slack CPU time on a best-effort basis. It can provide timeliness -guarantees for latency-sensitive domains. - -Every domain has an associated period and slice. The domain should -receive `slice' nanoseconds every `period' nanoseconds. This allows -the administrator to configure both the absolute share of the CPU a -domain receives and the frequency with which it is scheduled. 
- -%% When -%% domains unblock, their period is reduced to the value of the latency -%% hint (the slice is scaled accordingly so that they still get the same -%% proportion of the CPU). For each subsequent period, the slice and -%% period times are doubled until they reach their original values. - -Note: don't overcommit the CPU when using Atropos (i.e. don't reserve -more CPU than is available --- the utilisation should be kept to -slightly less than 100\% in order to ensure predictable behaviour). - -\subsubsection{Per-domain parameters} - -\begin{description} -\item[period] The regular time interval during which a domain is - guaranteed to receive its allocation of CPU time. -\item[slice] - The length of time per period that a domain is guaranteed to run - for (in the absence of voluntary yielding of the CPU). -\item[latency] - The latency hint is used to control how soon after - waking up a domain it should be scheduled. -\item[xtratime] This is a boolean flag that specifies whether a domain - should be allowed a share of the system slack time. -\end{description} - -\subsection{Round Robin} - -{\tt sched=rrobin} \\ - -The round robin scheduler is included as a simple demonstration of -Xen's internal scheduler API. It is not intended for production use. - -\subsubsection{Global Parameters} - -\begin{description} -\item[rr\_slice] - The maximum time each domain runs before the next - scheduling decision is made. -\end{description} - - - - - - - - - - - - -\chapter{Build, Boot and Debug options} - -This chapter describes the build- and boot-time options -which may be used to tailor your Xen system. - -\section{Xen Build Options} - -Xen provides a number of build-time options which should be -set as environment variables or passed on make's command-line. - -\begin{description} -\item[verbose=y] Enable debugging messages when Xen detects an unexpected condition. -Also enables console output from all domains. -\item[debug=y] -Enable debug assertions. 
Implies {\bf verbose=y}. -(Primarily useful for tracing bugs in Xen). -\item[debugger=y] -Enable the in-Xen debugger. This can be used to debug -Xen, guest OSes, and applications. -\item[perfc=y] -Enable performance counters for significant events -within Xen. The counts can be reset or displayed -on Xen's console via console control keys. -\item[trace=y] -Enable per-cpu trace buffers which log a range of -events within Xen for collection by control -software. -\end{description} - -\section{Xen Boot Options} -\label{s:xboot} - -These options are used to configure Xen's behaviour at runtime. They -should be appended to Xen's command line, either manually or by -editing \path{grub.conf}. - -\begin{description} -\item [noreboot ] - Don't reboot the machine automatically on errors. This is - useful to catch debug output if you aren't catching console messages - via the serial line. - -\item [nosmp ] - Disable SMP support. - This option is implied by `ignorebiostables'. - -\item [watchdog ] - Enable NMI watchdog which can report certain failures. - -\item [noirqbalance ] - Disable software IRQ balancing and affinity. This can be used on - systems such as Dell 1850/2850 that have workarounds in hardware for - IRQ-routing issues. - -\item [badpage=$<$page number$>$,$<$page number$>$, \ldots ] - Specify a list of pages not to be allocated for use - because they contain bad bytes. For example, if your - memory tester says that byte 0x12345678 is bad, you would - place `badpage=0x12345' on Xen's command line. - -\item [com1=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ - com2=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ ] \mbox{}\\ - Xen supports up to two 16550-compatible serial ports. - For example: `com1=9600, 8n1, 0x408, 5' maps COM1 to a - 9600-baud port, 8 data bits, no parity, 1 stop bit, - I/O port base 0x408, IRQ 5. - If some configuration options are standard (e.g., I/O base and IRQ), - then only a prefix of the full configuration string need be - specified. 
If the baud rate is pre-configured (e.g., by the - bootloader) then you can specify `auto' in place of a numeric baud - rate. - -\item [console=$<$specifier list$>$ ] - Specify the destination for Xen console I/O. - This is a comma-separated list of, for example: -\begin{description} - \item[vga] use VGA console and allow keyboard input - \item[com1] use serial port com1 - \item[com2H] use serial port com2. Transmitted chars will - have the MSB set. Received chars must have - MSB set. - \item[com2L] use serial port com2. Transmitted chars will - have the MSB cleared. Received chars must - have MSB cleared. -\end{description} - The latter two examples allow a single port to be - shared by two subsystems (e.g. console and - debugger). Sharing is controlled by MSB of each - transmitted/received character. - [NB. Default for this option is `com1,vga'] - -\item [sync\_console ] - Force synchronous console output. This is useful if you system fails - unexpectedly before it has sent all available output to the - console. In most cases Xen will automatically enter synchronous mode - when an exceptional event occurs, but this option provides a manual - fallback. - -\item [conswitch=$<$switch-char$><$auto-switch-char$>$ ] - Specify how to switch serial-console input between - Xen and DOM0. The required sequence is CTRL-$<$switch-char$>$ - pressed three times. Specifying the backtick character - disables switching. - The $<$auto-switch-char$>$ specifies whether Xen should - auto-switch input to DOM0 when it boots --- if it is `x' - then auto-switching is disabled. Any other value, or - omitting the character, enables auto-switching. - [NB. default switch-char is `a'] - -\item [nmi=xxx ] - Specify what to do with an NMI parity or I/O error. \\ - `nmi=fatal': Xen prints a diagnostic and then hangs. \\ - `nmi=dom0': Inform DOM0 of the NMI. \\ - `nmi=ignore': Ignore the NMI. - -\item [mem=xxx ] - Set the physical RAM address limit. 
Any RAM appearing beyond this - physical address in the memory map will be ignored. This parameter - may be specified with a B, K, M or G suffix, representing bytes, - kilobytes, megabytes and gigabytes respectively. The - default unit, if no suffix is specified, is kilobytes. - -\item [dom0\_mem=xxx ] - Set the amount of memory to be allocated to domain0. In Xen 3.x the parameter - may be specified with a B, K, M or G suffix, representing bytes, - kilobytes, megabytes and gigabytes respectively; if no suffix is specified, - the parameter defaults to kilobytes. In previous versions of Xen, suffixes - were not supported and the value is always interpreted as kilobytes. - -\item [tbuf\_size=xxx ] - Set the size of the per-cpu trace buffers, in pages - (default 1). Note that the trace buffers are only - enabled in debug builds. Most users can ignore - this feature completely. - -\item [sched=xxx ] - Select the CPU scheduler Xen should use. The current - possibilities are `bvt' (default), `atropos' and `rrobin'. - For more information see Section~\ref{s:sched}. - -\item [apic\_verbosity=debug,verbose ] - Print more detailed information about local APIC and IOAPIC configuration. - -\item [lapic ] - Force use of local APIC even when left disabled by uniprocessor BIOS. - -\item [nolapic ] - Ignore local APIC in a uniprocessor system, even if enabled by the BIOS. - -\item [apic=bigsmp,default,es7000,summit ] - Specify NUMA platform. This can usually be probed automatically. - -\end{description} - -In addition, the following options may be specified on the Xen command -line. Since domain 0 shares responsibility for booting the platform, -Xen will automatically propagate these options to its command -line. These options are taken from Linux's command-line syntax with -unchanged semantics. - -\begin{description} -\item [acpi=off,force,strict,ht,noirq,\ldots ] - Modify how Xen (and domain 0) parses the BIOS ACPI tables. 
- -\item [acpi\_skip\_timer\_override ] - Instruct Xen (and domain 0) to ignore timer-interrupt override - instructions specified by the BIOS ACPI tables. - -\item [noapic ] - Instruct Xen (and domain 0) to ignore any IOAPICs that are present in - the system, and instead continue to use the legacy PIC. - -\end{description} - -\section{XenLinux Boot Options} - -In addition to the standard Linux kernel boot options, we support: -\begin{description} -\item[xencons=xxx ] Specify the device node to which the Xen virtual -console driver is attached. The following options are supported: -\begin{center} -\begin{tabular}{l} -`xencons=off': disable virtual console \\ -`xencons=tty': attach console to /dev/tty1 (tty0 at boot-time) \\ -`xencons=ttyS': attach console to /dev/ttyS0 -\end{tabular} -\end{center} -The default is ttyS for dom0 and tty for all other domains. -\end{description} - - - -\section{Debugging} -\label{s:keys} - -Xen has a set of debugging features that can be useful to try and -figure out what's going on. Hit 'h' on the serial line (if you -specified a baud rate on the Xen command line) or ScrollLock-h on the -keyboard to get a list of supported commands. - -If you have a crash you'll likely get a crash dump containing an EIP -(PC) which, along with an \path{objdump -d image}, can be useful in -figuring out what's happened. Debug a Xenlinux image just as you -would any other Linux kernel. - -%% We supply a handy debug terminal program which you can find in -%% \path{/usr/local/src/xen-2.0.bk/tools/misc/miniterm/} -%% This should be built and executed on another machine that is connected -%% via a null modem cable. Documentation is included. -%% Alternatively, if the Xen machine is connected to a serial-port server -%% then we supply a dumb TCP terminal client, {\tt xencons}. 
- - +%% Chapter Control Software moved to control_software.tex +\include{src/user/control_software} + +%% Chapter Domain Configuration moved to domain_configuration.tex +\include{src/user/domain_configuration} + +%% Chapter Build, Boot and Debug Options moved to build.tex +\include{src/user/build} \chapter{Further Support} @@ -1875,6 +108,7 @@ %Various HOWTOs are available in \path{docs/HOWTOS} but this content is %being integrated into this manual. + \section{Online References} The official Xen web site is found at: @@ -1884,6 +118,7 @@ This contains links to the latest versions of all on-line documentation (including the lateset version of the FAQ). + \section{Mailing Lists} @@ -1905,326 +140,18 @@ \end{description} + \appendix - -\chapter{Installing Xen / XenLinux on Debian} - -The Debian project provides a tool called \path{debootstrap} which -allows a base Debian system to be installed into a filesystem without -requiring the host system to have any Debian-specific software (such -as \path{apt}. - -Here's some info how to install Debian 3.1 (Sarge) for an unprivileged -Xen domain: - -\begin{enumerate} -\item Set up Xen 2.0 and test that it's working, as described earlier in - this manual. - -\item Create disk images for root-fs and swap (alternatively, you - might create dedicated partitions, LVM logical volumes, etc. if - that suits your setup). -\begin{small}\begin{verbatim} -dd if=/dev/zero of=/path/diskimage bs=1024k count=size_in_mbytes -dd if=/dev/zero of=/path/swapimage bs=1024k count=size_in_mbytes -\end{verbatim}\end{small} - If you're going to use this filesystem / disk image only as a - `template' for other vm disk images, something like 300 MB should - be enough.. 
(of course it depends what kind of packages you are - planning to install to the template) - -\item Create the filesystem and initialise the swap image -\begin{small}\begin{verbatim} -mkfs.ext3 /path/diskimage -mkswap /path/swapimage -\end{verbatim}\end{small} - -\item Mount the disk image for installation -\begin{small}\begin{verbatim} -mount -o loop /path/diskimage /mnt/disk -\end{verbatim}\end{small} - -\item Install \path{debootstrap} - -Make sure you have debootstrap installed on the host. If you are -running Debian sarge (3.1 / testing) or unstable you can install it by -running \path{apt-get install debootstrap}. Otherwise, it can be -downloaded from the Debian project website. - -\item Install Debian base to the disk image: -\begin{small}\begin{verbatim} -debootstrap --arch i386 sarge /mnt/disk \ - http://ftp.<countrycode>.debian.org/debian -\end{verbatim}\end{small} - -You can use any other Debian http/ftp mirror you want. - -\item When debootstrap completes successfully, modify settings: -\begin{small}\begin{verbatim} -chroot /mnt/disk /bin/bash -\end{verbatim}\end{small} - -Edit the following files using vi or nano and make needed changes: -\begin{small}\begin{verbatim} -/etc/hostname -/etc/hosts -/etc/resolv.conf -/etc/network/interfaces -/etc/networks -\end{verbatim}\end{small} - -Set up access to the services, edit: -\begin{small}\begin{verbatim} -/etc/hosts.deny -/etc/hosts.allow -/etc/inetd.conf -\end{verbatim}\end{small} - -Add Debian mirror to: -\begin{small}\begin{verbatim} -/etc/apt/sources.list -\end{verbatim}\end{small} - -Create fstab like this: -\begin{small}\begin{verbatim} -/dev/sda1 / ext3 errors=remount-ro 0 1 -/dev/sda2 none swap sw 0 0 -proc /proc proc defaults 0 0 -\end{verbatim}\end{small} - -Logout - -\item Unmount the disk image -\begin{small}\begin{verbatim} -umount /mnt/disk -\end{verbatim}\end{small} - -\item Create Xen 2.0 configuration file for the new domain. 
You can - use the example-configurations coming with Xen as a template. - - Make sure you have the following set up: -\begin{small}\begin{verbatim} -disk = [ 'file:/path/diskimage,sda1,w', 'file:/path/swapimage,sda2,w' ] -root = "/dev/sda1 ro" -\end{verbatim}\end{small} - -\item Start the new domain -\begin{small}\begin{verbatim} -xm create -f domain_config_file -\end{verbatim}\end{small} - -Check that the new domain is running: -\begin{small}\begin{verbatim} -xm list -\end{verbatim}\end{small} - -\item Attach to the console of the new domain. - You should see something like this when starting the new domain: - -\begin{small}\begin{verbatim} -Started domain testdomain2, console on port 9626 -\end{verbatim}\end{small} - - There you can see the ID of the console: 26. You can also list - the consoles with \path{xm consoles} (ID is the last two - digits of the port number.) - - Attach to the console: - -\begin{small}\begin{verbatim} -xm console 26 -\end{verbatim}\end{small} - - or by telnetting to the port 9626 of localhost (the xm console - program works better). - -\item Log in and run base-config - - As a default there's no password for the root. - - Check that everything looks OK, and the system started without - errors. Check that the swap is active, and the network settings are - correct. - - Run \path{/usr/sbin/base-config} to set up the Debian settings. - - Set up the password for root using passwd. - -\item Done. You can exit the console by pressing \path{Ctrl + ]} - -\end{enumerate} - -If you need to create new domains, you can just copy the contents of -the `template'-image to the new disk images, either by mounting the -template and the new image, and using \path{cp -a} or \path{tar} or by -simply copying the image file. Once this is done, modify the -image-specific settings (hostname, network settings, etc). 
- -\chapter{Installing Xen / XenLinux on Redhat or Fedora Core} - -When using Xen / XenLinux on a standard Linux distribution there are -a couple of things to watch out for: - -Note that, because domains>0 don't have any privileged access at all, -certain commands in the default boot sequence will fail e.g. attempts -to update the hwclock, change the console font, update the keytable -map, start apmd (power management), or gpm (mouse cursor). Either -ignore the errors (they should be harmless), or remove them from the -startup scripts. Deleting the following links are a good start: -{\path{S24pcmcia}}, {\path{S09isdn}}, -{\path{S17keytable}}, {\path{S26apmd}}, -{\path{S85gpm}}. - -If you want to use a single root file system that works cleanly for -both domain 0 and unprivileged domains, a useful trick is to use -different 'init' run levels. For example, use -run level 3 for domain 0, and run level 4 for other domains. This -enables different startup scripts to be run in depending on the run -level number passed on the kernel command line. - -If using NFS root files systems mounted either from an -external server or from domain0 there are a couple of other gotchas. -The default {\path{/etc/sysconfig/iptables}} rules block NFS, so part -way through the boot sequence things will suddenly go dead. - -If you're planning on having a separate NFS {\path{/usr}} partition, the -RH9 boot scripts don't make life easy - they attempt to mount NFS file -systems way to late in the boot process. 
The easiest way I found to do -this was to have a {\path{/linuxrc}} script run ahead of -{\path{/sbin/init}} that mounts {\path{/usr}}: - -\begin{quote} -\begin{small}\begin{verbatim} - #!/bin/bash - /sbin/ipconfig lo 127.0.0.1 - /sbin/portmap - /bin/mount /usr - exec /sbin/init "$@" <>/dev/console 2>&1 -\end{verbatim}\end{small} -\end{quote} - -%$ XXX SMH: font lock fix :-) - -The one slight complication with the above is that -{\path{/sbin/portmap}} is dynamically linked against -{\path{/usr/lib/libwrap.so.0}} Since this is in -{\path{/usr}}, it won't work. This can be solved by copying the -file (and link) below the /usr mount point, and just let the file be -'covered' when the mount happens. - -In some installations, where a shared read-only {\path{/usr}} is -being used, it may be desirable to move other large directories over -into the read-only {\path{/usr}}. For example, you might replace -{\path{/bin}}, {\path{/lib}} and {\path{/sbin}} with -links into {\path{/usr/root/bin}}, {\path{/usr/root/lib}} -and {\path{/usr/root/sbin}} respectively. This creates other -problems for running the {\path{/linuxrc}} script, requiring -bash, portmap, mount, ifconfig, and a handful of other shared -libraries to be copied below the mount point --- a simple -statically-linked C program would solve this problem. - - - - -\chapter{Glossary of Terms} - -\begin{description} -\item[Atropos] One of the CPU schedulers provided by Xen. - Atropos provides domains with absolute shares - of the CPU, with timeliness guarantees and a - mechanism for sharing out `slack time'. - -\item[BVT] The BVT scheduler is used to give proportional - fair shares of the CPU to domains. - -\item[Exokernel] A minimal piece of privileged code, similar to - a {\bf microkernel} but providing a more - `hardware-like' interface to the tasks it - manages. 
This is similar to a paravirtualising - VMM like {\bf Xen} but was designed as a new - operating system structure, rather than - specifically to run multiple conventional OSs. - -\item[Domain] A domain is the execution context that - contains a running {\bf virtual machine}. - The relationship between virtual machines - and domains on Xen is similar to that between - programs and processes in an operating - system: a virtual machine is a persistent - entity that resides on disk (somewhat like - a program). When it is loaded for execution, - it runs in a domain. Each domain has a - {\bf domain ID}. - -\item[Domain 0] The first domain to be started on a Xen - machine. Domain 0 is responsible for managing - the system. - -\item[Domain ID] A unique identifier for a {\bf domain}, - analogous to a process ID in an operating - system. - -\item[Full virtualisation] An approach to virtualisation which - requires no modifications to the hosted - operating system, providing the illusion of - a complete system of real hardware devices. - -\item[Hypervisor] An alternative term for {\bf VMM}, used - because it means `beyond supervisor', - since it is responsible for managing multiple - `supervisor' kernels. - -\item[Live migration] A technique for moving a running virtual - machine to another physical host, without - stopping it or the services running on it. - -\item[Microkernel] A small base of code running at the highest - hardware privilege level. A microkernel is - responsible for sharing CPU and memory (and - sometimes other devices) between less - privileged tasks running on the system. - This is similar to a VMM, particularly a - {\bf paravirtualising} VMM but typically - addressing a different problem space and - providing different kind of interface. - -\item[NetBSD/Xen] A port of NetBSD to the Xen architecture. - -\item[Paravirtualisation] An approach to virtualisation which requires - modifications to the operating system in - order to run in a virtual machine. 
Xen - uses paravirtualisation but preserves - binary compatibility for user space - applications. - -\item[Shadow pagetables] A technique for hiding the layout of machine - memory from a virtual machine's operating - system. Used in some {\bf VMMs} to provide - the illusion of contiguous physical memory, - in Xen this is used during - {\bf live migration}. - -\item[Virtual Machine] The environment in which a hosted operating - system runs, providing the abstraction of a - dedicated machine. A virtual machine may - be identical to the underlying hardware (as - in {\bf full virtualisation}, or it may - differ, as in {\bf paravirtualisation}. - -\item[VMM] Virtual Machine Monitor - the software that - allows multiple virtual machines to be - multiplexed on a single physical machine. - -\item[Xen] Xen is a paravirtualising virtual machine - monitor, developed primarily by the - Systems Research Group at the University - of Cambridge Computer Laboratory. - -\item[XenLinux] Official name for the port of the Linux kernel - that runs on Xen. 
- -\end{description} +%% Chapter Installing Xen / XenLinux on Debian moved to debian.tex +\include{src/user/debian} + +%% Chapter Installing Xen on Red Hat moved to redhat.tex +\include{src/user/redhat} + + +%% Chapter Glossary of Terms moved to glossary.tex +\include{src/user/glossary} \end{document} diff -r 97dbd9524a7e -r 06d84bf87159 extras/mini-os/xenbus/xenbus_xs.c --- a/extras/mini-os/xenbus/xenbus_xs.c Thu Sep 22 17:34:14 2005 +++ b/extras/mini-os/xenbus/xenbus_xs.c Thu Sep 22 17:42:01 2005 @@ -127,7 +127,7 @@ return ERR_PTR(err); for (i = 0; i < num_vecs; i++) { - err = xb_write(iovec[i].iov_base, iovec[i].iov_len);; + err = xb_write(iovec[i].iov_base, iovec[i].iov_len); if (err) return ERR_PTR(err); } diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/Kconfig --- a/linux-2.6-xen-sparse/arch/xen/Kconfig Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/arch/xen/Kconfig Thu Sep 22 17:42:01 2005 @@ -73,6 +73,8 @@ config XEN_TPMDEV_FRONTEND bool "TPM-device frontend driver" default n + select TCG_TPM + select TCG_XEN help The TPM-device frontend driver. @@ -108,13 +110,6 @@ network interfaces within another guest OS. Unless you are building a dedicated device-driver domain, or your master control domain (domain 0), then you almost certainly want to say Y here. - -config XEN_NETDEV_GRANT - bool "Grant table substrate for network drivers (DANGEROUS)" - default n - help - This introduces the use of grant tables as a data exhange mechanism - between the frontend and backend network drivers. 
config XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER bool "Pipelined transmitter (DANGEROUS)" diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 Thu Sep 22 17:42:01 2005 @@ -19,7 +19,6 @@ # CONFIG_XEN_TPMDEV_BACKEND is not set CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y -CONFIG_XEN_NETDEV_GRANT=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 Thu Sep 22 17:42:01 2005 @@ -19,7 +19,6 @@ # CONFIG_XEN_TPMDEV_BACKEND is not set CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y -CONFIG_XEN_NETDEV_GRANT=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 --- a/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 Thu Sep 22 17:42:01 2005 @@ -16,7 +16,6 @@ # CONFIG_XEN_TPMDEV_BACKEND is not set CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y -CONFIG_XEN_NETDEV_GRANT=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 --- a/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 Thu Sep 22 17:34:14 2005 +++ 
b/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 Thu Sep 22 17:42:01 2005 @@ -16,7 +16,6 @@ # CONFIG_XEN_TPMDEV_BACKEND is not set CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y -CONFIG_XEN_NETDEV_GRANT=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_32 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_32 Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_32 Thu Sep 22 17:42:01 2005 @@ -19,7 +19,6 @@ # CONFIG_XEN_TPMDEV_BACKEND is not set CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y -CONFIG_XEN_NETDEV_GRANT=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set @@ -372,7 +371,7 @@ # CONFIG_ISAPNP=y # CONFIG_PNPBIOS is not set -CONFIG_PNPACPI=y +# CONFIG_PNPACPI is not set # # Block devices diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_64 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_64 Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_64 Thu Sep 22 17:42:01 2005 @@ -19,7 +19,6 @@ # CONFIG_XEN_TPMDEV_BACKEND is not set CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y -CONFIG_XEN_NETDEV_GRANT=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c Thu Sep 22 17:42:01 2005 @@ -45,12 +45,12 @@ return 0; } -int direct_remap_pfn_range(struct mm_struct *mm, - unsigned long address, - unsigned long mfn, - unsigned long size, - pgprot_t 
prot, - domid_t domid) +static int __direct_remap_pfn_range(struct mm_struct *mm, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid) { int i; unsigned long start_address; @@ -98,6 +98,20 @@ return 0; } +int direct_remap_pfn_range(struct vm_area_struct *vma, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + /* Same as remap_pfn_range(). */ + vma->vm_flags |= VM_IO | VM_RESERVED; + + return __direct_remap_pfn_range( + vma->vm_mm, address, mfn, size, prot, domid); +} + EXPORT_SYMBOL(direct_remap_pfn_range); @@ -221,8 +235,9 @@ #ifdef __x86_64__ flags |= _PAGE_USER; #endif - if (direct_remap_pfn_range(&init_mm, (unsigned long) addr, phys_addr>>PAGE_SHIFT, - size, __pgprot(flags), domid)) { + if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr, + phys_addr>>PAGE_SHIFT, + size, __pgprot(flags), domid)) { vunmap((void __force *) addr); return NULL; } diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/i386/pci/i386.c --- a/linux-2.6-xen-sparse/arch/xen/i386/pci/i386.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/pci/i386.c Thu Sep 22 17:42:01 2005 @@ -295,7 +295,7 @@ /* Write-combine setting is ignored, it is changed via the mtrr * interfaces on this platform. 
*/ - if (direct_remap_pfn_range(vma->vm_mm, vma->vm_start, vma->vm_pgoff, + if (direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, vma->vm_page_prot, DOMID_IO)) return -EAGAIN; diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/kernel/devmem.c --- a/linux-2.6-xen-sparse/arch/xen/kernel/devmem.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/devmem.c Thu Sep 22 17:42:01 2005 @@ -90,22 +90,10 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma) { - int uncached; - - uncached = uncached_access(file); - if (uncached) + if (uncached_access(file)) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - /* Don't try to swap out physical pages.. */ - vma->vm_flags |= VM_RESERVED; - - /* - * Don't dump addresses that are not real memory to a core file. - */ - if (uncached) - vma->vm_flags |= VM_IO; - - if (direct_remap_pfn_range(vma->vm_mm, vma->vm_start, vma->vm_pgoff, + if (direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, vma->vm_page_prot, DOMID_IO)) return -EAGAIN; diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/kernel/gnttab.c --- a/linux-2.6-xen-sparse/arch/xen/kernel/gnttab.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/gnttab.c Thu Sep 22 17:42:01 2005 @@ -182,14 +182,14 @@ } int -gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) +gnttab_grant_foreign_transfer(domid_t domid) { int ref; if ( unlikely((ref = get_free_entry()) == -1) ) return -ENOSPC; - shared[ref].frame = pfn; + shared[ref].frame = 0; shared[ref].domid = domid; wmb(); shared[ref].flags = GTF_accept_transfer; @@ -198,10 +198,9 @@ } void -gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, - unsigned long pfn) -{ - shared[ref].frame = pfn; +gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid) +{ + shared[ref].frame = 0; shared[ref].domid = domid; wmb(); shared[ref].flags = 
GTF_accept_transfer; diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/arch/xen/kernel/reboot.c --- a/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c Thu Sep 22 17:42:01 2005 @@ -334,7 +334,7 @@ return; } - xenbus_write("control", "shutdown", "", O_CREAT); + xenbus_write("control", "shutdown", ""); err = xenbus_transaction_end(0); if (err == -ETIMEDOUT) { diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/Makefile --- a/linux-2.6-xen-sparse/drivers/xen/Makefile Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/Makefile Thu Sep 22 17:42:01 2005 @@ -1,4 +1,5 @@ +obj-y += util.o obj-y += console/ obj-y += evtchn/ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c --- a/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c Thu Sep 22 17:42:01 2005 @@ -362,7 +362,10 @@ return; } - set_new_target(new_target >> PAGE_SHIFT); + /* The given memory/target value is in KiB, so it needs converting to + pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. 
+ */ + set_new_target(new_target >> (PAGE_SHIFT - 10)); } diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Thu Sep 22 17:42:01 2005 @@ -28,12 +28,12 @@ #define BATCH_PER_DOMAIN 16 static unsigned long mmap_vstart; -#define MMAP_PAGES \ - (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) -#define MMAP_VADDR(_req,_seg) \ - (mmap_vstart + \ - ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ - ((_seg) * PAGE_SIZE)) +#define MMAP_PAGES \ + (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_req,_seg) \ + (mmap_vstart + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) /* * Each outstanding request that we've passed to the lower device layers has a @@ -42,12 +42,12 @@ * response queued for it, with the saved 'id' passed back. */ typedef struct { - blkif_t *blkif; - unsigned long id; - int nr_pages; - atomic_t pendcnt; - unsigned short operation; - int status; + blkif_t *blkif; + unsigned long id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; } pending_req_t; /* @@ -68,14 +68,13 @@ static request_queue_t *plugged_queue; static inline void flush_plugged_queue(void) { - request_queue_t *q = plugged_queue; - if ( q != NULL ) - { - if ( q->unplug_fn != NULL ) - q->unplug_fn(q); - blk_put_queue(q); - plugged_queue = NULL; - } + request_queue_t *q = plugged_queue; + if (q != NULL) { + if ( q->unplug_fn != NULL ) + q->unplug_fn(q); + blk_put_queue(q); + plugged_queue = NULL; + } } /* When using grant tables to map a frame for device access then the @@ -106,24 +105,23 @@ static void fast_flush_area(int idx, int nr_pages) { - struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - unsigned int i, invcount = 0; - u16 handle; - - for ( i = 0; i < nr_pages; i++ ) - { - if ( BLKBACK_INVALID_HANDLE 
!= ( handle = pending_handle(idx, i) ) ) - { - unmap[i].host_addr = MMAP_VADDR(idx, i); - unmap[i].dev_bus_addr = 0; - unmap[i].handle = handle; - pending_handle(idx, i) = BLKBACK_INVALID_HANDLE; - invcount++; - } - } - if ( unlikely(HYPERVISOR_grant_table_op( - GNTTABOP_unmap_grant_ref, unmap, invcount))) - BUG(); + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int i, invcount = 0; + u16 handle; + + for (i = 0; i < nr_pages; i++) { + handle = pending_handle(idx, i); + if (handle == BLKBACK_INVALID_HANDLE) + continue; + unmap[i].host_addr = MMAP_VADDR(idx, i); + unmap[i].dev_bus_addr = 0; + unmap[i].handle = handle; + pending_handle(idx, i) = BLKBACK_INVALID_HANDLE; + invcount++; + } + + BUG_ON(HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount)); } @@ -136,34 +134,38 @@ static int __on_blkdev_list(blkif_t *blkif) { - return blkif->blkdev_list.next != NULL; + return blkif->blkdev_list.next != NULL; } static void remove_from_blkdev_list(blkif_t *blkif) { - unsigned long flags; - if ( !__on_blkdev_list(blkif) ) return; - spin_lock_irqsave(&blkio_schedule_list_lock, flags); - if ( __on_blkdev_list(blkif) ) - { - list_del(&blkif->blkdev_list); - blkif->blkdev_list.next = NULL; - blkif_put(blkif); - } - spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); + unsigned long flags; + + if (!__on_blkdev_list(blkif)) + return; + + spin_lock_irqsave(&blkio_schedule_list_lock, flags); + if (__on_blkdev_list(blkif)) { + list_del(&blkif->blkdev_list); + blkif->blkdev_list.next = NULL; + blkif_put(blkif); + } + spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); } static void add_to_blkdev_list_tail(blkif_t *blkif) { - unsigned long flags; - if ( __on_blkdev_list(blkif) ) return; - spin_lock_irqsave(&blkio_schedule_list_lock, flags); - if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) ) - { - list_add_tail(&blkif->blkdev_list, &blkio_schedule_list); - blkif_get(blkif); - } - 
spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); + unsigned long flags; + + if (__on_blkdev_list(blkif)) + return; + + spin_lock_irqsave(&blkio_schedule_list_lock, flags); + if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) { + list_add_tail(&blkif->blkdev_list, &blkio_schedule_list); + blkif_get(blkif); + } + spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); } @@ -175,54 +177,53 @@ static int blkio_schedule(void *arg) { - DECLARE_WAITQUEUE(wq, current); - - blkif_t *blkif; - struct list_head *ent; - - daemonize("xenblkd"); - - for ( ; ; ) - { - /* Wait for work to do. */ - add_wait_queue(&blkio_schedule_wait, &wq); - set_current_state(TASK_INTERRUPTIBLE); - if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || - list_empty(&blkio_schedule_list) ) - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&blkio_schedule_wait, &wq); - - /* Queue up a batch of requests. */ - while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && - !list_empty(&blkio_schedule_list) ) - { - ent = blkio_schedule_list.next; - blkif = list_entry(ent, blkif_t, blkdev_list); - blkif_get(blkif); - remove_from_blkdev_list(blkif); - if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) ) - add_to_blkdev_list_tail(blkif); - blkif_put(blkif); - } - - /* Push the batch through to disc. */ - flush_plugged_queue(); - } + DECLARE_WAITQUEUE(wq, current); + + blkif_t *blkif; + struct list_head *ent; + + daemonize("xenblkd"); + + for (;;) { + /* Wait for work to do. */ + add_wait_queue(&blkio_schedule_wait, &wq); + set_current_state(TASK_INTERRUPTIBLE); + if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || + list_empty(&blkio_schedule_list) ) + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&blkio_schedule_wait, &wq); + + /* Queue up a batch of requests. 
*/ + while ((NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&blkio_schedule_list)) { + ent = blkio_schedule_list.next; + blkif = list_entry(ent, blkif_t, blkdev_list); + blkif_get(blkif); + remove_from_blkdev_list(blkif); + if (do_block_io_op(blkif, BATCH_PER_DOMAIN)) + add_to_blkdev_list_tail(blkif); + blkif_put(blkif); + } + + /* Push the batch through to disc. */ + flush_plugged_queue(); + } } static void maybe_trigger_blkio_schedule(void) { - /* - * Needed so that two processes, who together make the following predicate - * true, don't both read stale values and evaluate the predicate - * incorrectly. Incredibly unlikely to stall the scheduler on x86, but... - */ - smp_mb(); - - if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && - !list_empty(&blkio_schedule_list) ) - wake_up(&blkio_schedule_wait); + /* + * Needed so that two processes, which together make the following + * predicate true, don't both read stale values and evaluate the + * predicate incorrectly. Incredibly unlikely to stall the scheduler + * on x86, but... + */ + smp_mb(); + + if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&blkio_schedule_list)) + wake_up(&blkio_schedule_wait); } @@ -233,36 +234,34 @@ static void __end_block_io_op(pending_req_t *pending_req, int uptodate) { - unsigned long flags; - - /* An error fails the entire request. 
*/ - if ( !uptodate ) - { - DPRINTK("Buffer not up-to-date at end of operation\n"); - pending_req->status = BLKIF_RSP_ERROR; - } - - if ( atomic_dec_and_test(&pending_req->pendcnt) ) - { - int pending_idx = pending_req - pending_reqs; - fast_flush_area(pending_idx, pending_req->nr_pages); - make_response(pending_req->blkif, pending_req->id, - pending_req->operation, pending_req->status); - blkif_put(pending_req->blkif); - spin_lock_irqsave(&pend_prod_lock, flags); - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - spin_unlock_irqrestore(&pend_prod_lock, flags); - maybe_trigger_blkio_schedule(); - } + unsigned long flags; + + /* An error fails the entire request. */ + if (!uptodate) { + DPRINTK("Buffer not up-to-date at end of operation\n"); + pending_req->status = BLKIF_RSP_ERROR; + } + + if (atomic_dec_and_test(&pending_req->pendcnt)) { + int pending_idx = pending_req - pending_reqs; + fast_flush_area(pending_idx, pending_req->nr_pages); + make_response(pending_req->blkif, pending_req->id, + pending_req->operation, pending_req->status); + blkif_put(pending_req->blkif); + spin_lock_irqsave(&pend_prod_lock, flags); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + spin_unlock_irqrestore(&pend_prod_lock, flags); + maybe_trigger_blkio_schedule(); + } } static int end_block_io_op(struct bio *bio, unsigned int done, int error) { - if ( bio->bi_size != 0 ) - return 1; - __end_block_io_op(bio->bi_private, !error); - bio_put(bio); - return error; + if (bio->bi_size != 0) + return 1; + __end_block_io_op(bio->bi_private, !error); + bio_put(bio); + return error; } @@ -272,10 +271,10 @@ irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) { - blkif_t *blkif = dev_id; - add_to_blkdev_list_tail(blkif); - maybe_trigger_blkio_schedule(); - return IRQ_HANDLED; + blkif_t *blkif = dev_id; + add_to_blkdev_list_tail(blkif); + maybe_trigger_blkio_schedule(); + return IRQ_HANDLED; } @@ -286,183 +285,174 @@ static int do_block_io_op(blkif_t 
*blkif, int max_to_do) { - blkif_back_ring_t *blk_ring = &blkif->blk_ring; - blkif_request_t *req; - RING_IDX i, rp; - int more_to_do = 0; - - rp = blk_ring->sring->req_prod; - rmb(); /* Ensure we see queued requests up to 'rp'. */ - - for ( i = blk_ring->req_cons; - (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i); - i++ ) - { - if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) ) - { - more_to_do = 1; - break; - } + blkif_back_ring_t *blk_ring = &blkif->blk_ring; + blkif_request_t *req; + RING_IDX i, rp; + int more_to_do = 0; + + rp = blk_ring->sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. */ + + for (i = blk_ring->req_cons; + (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i); + i++) { + if ((max_to_do-- == 0) || + (NR_PENDING_REQS == MAX_PENDING_REQS)) { + more_to_do = 1; + break; + } - req = RING_GET_REQUEST(blk_ring, i); - switch ( req->operation ) - { - case BLKIF_OP_READ: - case BLKIF_OP_WRITE: - dispatch_rw_block_io(blkif, req); - break; - - default: - DPRINTK("error: unknown block io operation [%d]\n", - req->operation); - make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); - break; - } - } - - blk_ring->req_cons = i; - return more_to_do; + req = RING_GET_REQUEST(blk_ring, i); + switch (req->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + dispatch_rw_block_io(blkif, req); + break; + + default: + DPRINTK("error: unknown block io operation [%d]\n", + req->operation); + make_response(blkif, req->id, req->operation, + BLKIF_RSP_ERROR); + break; + } + } + + blk_ring->req_cons = i; + return more_to_do; } static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) { - extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); - int operation = (req->operation == BLKIF_OP_WRITE) ? 
WRITE : READ; - unsigned long fas = 0; - int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; - pending_req_t *pending_req; - struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - struct phys_req preq; - struct { - unsigned long buf; unsigned int nsec; - } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - unsigned int nseg; - struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - int nbio = 0; - request_queue_t *q; - - /* Check that number of segments is sane. */ - nseg = req->nr_segments; - if ( unlikely(nseg == 0) || - unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) - { - DPRINTK("Bad number of segments in request (%d)\n", nseg); - goto bad_descriptor; - } - - preq.dev = req->handle; - preq.sector_number = req->sector_number; - preq.nr_sects = 0; - - for ( i = 0; i < nseg; i++ ) - { - fas = req->frame_and_sects[i]; - seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; - - if ( seg[i].nsec <= 0 ) - goto bad_descriptor; - preq.nr_sects += seg[i].nsec; - - map[i].host_addr = MMAP_VADDR(pending_idx, i); - map[i].dom = blkif->domid; - map[i].ref = blkif_gref_from_fas(fas); - map[i].flags = GNTMAP_host_map; - if ( operation == WRITE ) - map[i].flags |= GNTMAP_readonly; - } - - if ( unlikely(HYPERVISOR_grant_table_op( - GNTTABOP_map_grant_ref, map, nseg))) - BUG(); - - for ( i = 0; i < nseg; i++ ) - { - if ( unlikely(map[i].handle < 0) ) - { - DPRINTK("invalid buffer -- could not remap it\n"); - fast_flush_area(pending_idx, nseg); - goto bad_descriptor; - } - - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = - FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT); - - pending_handle(pending_idx, i) = map[i].handle; - } - - for ( i = 0; i < nseg; i++ ) - { - fas = req->frame_and_sects[i]; - seg[i].buf = map[i].dev_bus_addr | (blkif_first_sect(fas) << 9); - } - - if ( vbd_translate(&preq, blkif, operation) != 0 ) - { - DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", - operation == READ ? 
"read" : "write", preq.sector_number, - preq.sector_number + preq.nr_sects, preq.dev); - goto bad_descriptor; - } - - pending_req = &pending_reqs[pending_idx]; - pending_req->blkif = blkif; - pending_req->id = req->id; - pending_req->operation = operation; - pending_req->status = BLKIF_RSP_OKAY; - pending_req->nr_pages = nseg; - - for ( i = 0; i < nseg; i++ ) - { - if ( ((int)preq.sector_number|(int)seg[i].nsec) & - ((bdev_hardsect_size(preq.bdev) >> 9) - 1) ) - { - DPRINTK("Misaligned I/O request from domain %d", blkif->domid); - goto cleanup_and_fail; - } - - while ( (bio == NULL) || - (bio_add_page(bio, - virt_to_page(MMAP_VADDR(pending_idx, i)), - seg[i].nsec << 9, - seg[i].buf & ~PAGE_MASK) == 0) ) - { - bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i); - if ( unlikely(bio == NULL) ) - { - cleanup_and_fail: - for ( i = 0; i < (nbio-1); i++ ) - bio_put(biolist[i]); - fast_flush_area(pending_idx, nseg); - goto bad_descriptor; - } + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); + int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; + unsigned long fas = 0; + int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + pending_req_t *pending_req; + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct phys_req preq; + struct { + unsigned long buf; unsigned int nsec; + } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int nseg; + struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int nbio = 0; + request_queue_t *q; + + /* Check that number of segments is sane. 
*/ + nseg = req->nr_segments; + if (unlikely(nseg == 0) || + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + DPRINTK("Bad number of segments in request (%d)\n", nseg); + goto bad_descriptor; + } + + preq.dev = req->handle; + preq.sector_number = req->sector_number; + preq.nr_sects = 0; + + for (i = 0; i < nseg; i++) { + fas = req->frame_and_sects[i]; + seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; + + if (seg[i].nsec <= 0) + goto bad_descriptor; + preq.nr_sects += seg[i].nsec; + + map[i].host_addr = MMAP_VADDR(pending_idx, i); + map[i].dom = blkif->domid; + map[i].ref = blkif_gref_from_fas(fas); + map[i].flags = GNTMAP_host_map; + if ( operation == WRITE ) + map[i].flags |= GNTMAP_readonly; + } + + BUG_ON(HYPERVISOR_grant_table_op( + GNTTABOP_map_grant_ref, map, nseg)); + + for (i = 0; i < nseg; i++) { + if (unlikely(map[i].handle < 0)) { + DPRINTK("invalid buffer -- could not remap it\n"); + fast_flush_area(pending_idx, nseg); + goto bad_descriptor; + } + + phys_to_machine_mapping[__pa(MMAP_VADDR( + pending_idx, i)) >> PAGE_SHIFT] = + FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT); + + pending_handle(pending_idx, i) = map[i].handle; + } + + for (i = 0; i < nseg; i++) { + fas = req->frame_and_sects[i]; + seg[i].buf = map[i].dev_bus_addr | + (blkif_first_sect(fas) << 9); + } + + if (vbd_translate(&preq, blkif, operation) != 0) { + DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", + operation == READ ? 
"read" : "write", + preq.sector_number, + preq.sector_number + preq.nr_sects, preq.dev); + goto bad_descriptor; + } + + pending_req = &pending_reqs[pending_idx]; + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + + for (i = 0; i < nseg; i++) { + if (((int)preq.sector_number|(int)seg[i].nsec) & + ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) { + DPRINTK("Misaligned I/O request from domain %d", + blkif->domid); + goto cleanup_and_fail; + } + + while ((bio == NULL) || + (bio_add_page(bio, + virt_to_page(MMAP_VADDR(pending_idx, i)), + seg[i].nsec << 9, + seg[i].buf & ~PAGE_MASK) == 0)) { + bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i); + if (unlikely(bio == NULL)) { + cleanup_and_fail: + for (i = 0; i < (nbio-1); i++) + bio_put(biolist[i]); + fast_flush_area(pending_idx, nseg); + goto bad_descriptor; + } - bio->bi_bdev = preq.bdev; - bio->bi_private = pending_req; - bio->bi_end_io = end_block_io_op; - bio->bi_sector = preq.sector_number; - } - - preq.sector_number += seg[i].nsec; - } - - if ( (q = bdev_get_queue(bio->bi_bdev)) != plugged_queue ) - { - flush_plugged_queue(); - blk_get_queue(q); - plugged_queue = q; - } - - atomic_set(&pending_req->pendcnt, nbio); - pending_cons++; - blkif_get(blkif); - - for ( i = 0; i < nbio; i++ ) - submit_bio(operation, biolist[i]); - - return; + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + bio->bi_sector = preq.sector_number; + } + + preq.sector_number += seg[i].nsec; + } + + if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) { + flush_plugged_queue(); + blk_get_queue(q); + plugged_queue = q; + } + + atomic_set(&pending_req->pendcnt, nbio); + pending_cons++; + blkif_get(blkif); + + for (i = 0; i < nbio; i++) + submit_bio(operation, biolist[i]); + + return; bad_descriptor: - make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + 
make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); } @@ -475,66 +465,71 @@ static void make_response(blkif_t *blkif, unsigned long id, unsigned short op, int st) { - blkif_response_t *resp; - unsigned long flags; - blkif_back_ring_t *blk_ring = &blkif->blk_ring; - - /* Place on the response ring for the relevant domain. */ - spin_lock_irqsave(&blkif->blk_ring_lock, flags); - resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); - resp->id = id; - resp->operation = op; - resp->status = st; - wmb(); /* Ensure other side can see the response fields. */ - blk_ring->rsp_prod_pvt++; - RING_PUSH_RESPONSES(blk_ring); - spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); - - /* Kick the relevant domain. */ - notify_via_evtchn(blkif->evtchn); + blkif_response_t *resp; + unsigned long flags; + blkif_back_ring_t *blk_ring = &blkif->blk_ring; + + /* Place on the response ring for the relevant domain. */ + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); + resp->id = id; + resp->operation = op; + resp->status = st; + wmb(); /* Ensure other side can see the response fields. */ + blk_ring->rsp_prod_pvt++; + RING_PUSH_RESPONSES(blk_ring); + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + + /* Kick the relevant domain. 
*/ + notify_via_evtchn(blkif->evtchn); } void blkif_deschedule(blkif_t *blkif) { - remove_from_blkdev_list(blkif); + remove_from_blkdev_list(blkif); } static int __init blkif_init(void) { - int i; - struct page *page; - - if ( !(xen_start_info->flags & SIF_INITDOMAIN) && - !(xen_start_info->flags & SIF_BLK_BE_DOMAIN) ) - return 0; - - blkif_interface_init(); - - page = balloon_alloc_empty_page_range(MMAP_PAGES); - BUG_ON(page == NULL); - mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); - - pending_cons = 0; - pending_prod = MAX_PENDING_REQS; - memset(pending_reqs, 0, sizeof(pending_reqs)); - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - pending_ring[i] = i; + int i; + struct page *page; + + if (!(xen_start_info->flags & SIF_INITDOMAIN) && + !(xen_start_info->flags & SIF_BLK_BE_DOMAIN)) + return 0; + + blkif_interface_init(); + + page = balloon_alloc_empty_page_range(MMAP_PAGES); + BUG_ON(page == NULL); + mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + memset(pending_reqs, 0, sizeof(pending_reqs)); + for (i = 0; i < MAX_PENDING_REQS; i++) + pending_ring[i] = i; - spin_lock_init(&blkio_schedule_list_lock); - INIT_LIST_HEAD(&blkio_schedule_list); - - if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 ) - BUG(); - - blkif_xenbus_init(); - - memset( pending_grant_handles, BLKBACK_INVALID_HANDLE, MMAP_PAGES ); - -#ifdef CONFIG_XEN_BLKDEV_TAP_BE - printk(KERN_ALERT "NOTE: Blkif backend is running with tap support on!\n"); -#endif - - return 0; + spin_lock_init(&blkio_schedule_list_lock); + INIT_LIST_HEAD(&blkio_schedule_list); + + BUG_ON(kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0); + + blkif_xenbus_init(); + + memset(pending_grant_handles, BLKBACK_INVALID_HANDLE, MMAP_PAGES); + + return 0; } __initcall(blkif_init); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * 
End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blkback/common.h --- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Thu Sep 22 17:42:01 2005 @@ -17,6 +17,7 @@ #include <asm-xen/xen-public/io/blkif.h> #include <asm-xen/xen-public/io/ring.h> #include <asm-xen/gnttab.h> +#include <asm-xen/driver_util.h> #if 0 #define ASSERT(_p) \ @@ -30,39 +31,39 @@ #endif struct vbd { - blkif_vdev_t handle; /* what the domain refers to this vbd as */ - unsigned char readonly; /* Non-zero -> read-only */ - unsigned char type; /* VDISK_xxx */ - u32 pdevice; /* phys device that this vbd maps to */ - struct block_device *bdev; + blkif_vdev_t handle; /* what the domain refers to this vbd as */ + unsigned char readonly; /* Non-zero -> read-only */ + unsigned char type; /* VDISK_xxx */ + u32 pdevice; /* phys device that this vbd maps to */ + struct block_device *bdev; }; typedef struct blkif_st { - /* Unique identifier for this interface. */ - domid_t domid; - unsigned int handle; - /* Physical parameters of the comms window. */ - unsigned long shmem_frame; - unsigned int evtchn; - unsigned int remote_evtchn; - /* Comms information. */ - blkif_back_ring_t blk_ring; - /* VBDs attached to this interface. */ - struct vbd vbd; - /* Private fields. */ - enum { DISCONNECTED, CONNECTED } status; + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned int evtchn; + unsigned int remote_evtchn; + /* Comms information. */ + blkif_back_ring_t blk_ring; + struct vm_struct *blk_ring_area; + /* VBDs attached to this interface. */ + struct vbd vbd; + /* Private fields. 
*/ + enum { DISCONNECTED, CONNECTED } status; #ifdef CONFIG_XEN_BLKDEV_TAP_BE - /* Is this a blktap frontend */ - unsigned int is_blktap; + /* Is this a blktap frontend */ + unsigned int is_blktap; #endif - struct list_head blkdev_list; - spinlock_t blk_ring_lock; - atomic_t refcnt; + struct list_head blkdev_list; + spinlock_t blk_ring_lock; + atomic_t refcnt; - struct work_struct free_work; - u16 shmem_handle; - unsigned long shmem_vaddr; - grant_ref_t shmem_ref; + struct work_struct free_work; + + u16 shmem_handle; + grant_ref_t shmem_ref; } blkif_t; blkif_t *alloc_blkif(domid_t domid); @@ -70,11 +71,11 @@ int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn); #define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) -#define blkif_put(_b) \ - do { \ - if ( atomic_dec_and_test(&(_b)->refcnt) ) \ - free_blkif_callback(_b); \ - } while (0) +#define blkif_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + free_blkif_callback(_b); \ + } while (0) /* Create a vbd. 
*/ int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, u32 pdevice, @@ -86,10 +87,10 @@ unsigned long vbd_secsize(struct vbd *vbd); struct phys_req { - unsigned short dev; - unsigned short nr_sects; - struct block_device *bdev; - blkif_sector_t sector_number; + unsigned short dev; + unsigned short nr_sects; + struct block_device *bdev; + blkif_sector_t sector_number; }; int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); @@ -103,3 +104,13 @@ irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); #endif /* __BLKIF__BACKEND__COMMON_H__ */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blkback/interface.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Thu Sep 22 17:42:01 2005 @@ -13,131 +13,144 @@ blkif_t *alloc_blkif(domid_t domid) { - blkif_t *blkif; + blkif_t *blkif; - blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); - if (!blkif) - return ERR_PTR(-ENOMEM); + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); + if (!blkif) + return ERR_PTR(-ENOMEM); - memset(blkif, 0, sizeof(*blkif)); - blkif->domid = domid; - blkif->status = DISCONNECTED; - spin_lock_init(&blkif->blk_ring_lock); - atomic_set(&blkif->refcnt, 1); + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + blkif->status = DISCONNECTED; + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 1); - return blkif; + return blkif; } -static int map_frontend_page(blkif_t *blkif, unsigned long localaddr, - unsigned long shared_page) +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page) { - struct gnttab_map_grant_ref op; - op.host_addr = localaddr; - op.flags = GNTMAP_host_map; - op.ref = shared_page; - op.dom = blkif->domid; + struct gnttab_map_grant_ref op; - 
BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); + op.host_addr = (unsigned long)blkif->blk_ring_area->addr; + op.flags = GNTMAP_host_map; + op.ref = shared_page; + op.dom = blkif->domid; - if (op.handle < 0) { - DPRINTK(" Grant table operation failure !\n"); - return op.handle; - } + lock_vm_area(blkif->blk_ring_area); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)); + unlock_vm_area(blkif->blk_ring_area); - blkif->shmem_ref = shared_page; - blkif->shmem_handle = op.handle; - blkif->shmem_vaddr = localaddr; - return 0; + if (op.handle < 0) { + DPRINTK(" Grant table operation failure !\n"); + return op.handle; + } + + blkif->shmem_ref = shared_page; + blkif->shmem_handle = op.handle; + + return 0; } static void unmap_frontend_page(blkif_t *blkif) { - struct gnttab_unmap_grant_ref op; + struct gnttab_unmap_grant_ref op; - op.host_addr = blkif->shmem_vaddr; - op.handle = blkif->shmem_handle; - op.dev_bus_addr = 0; - BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); + op.host_addr = (unsigned long)blkif->blk_ring_area->addr; + op.handle = blkif->shmem_handle; + op.dev_bus_addr = 0; + + lock_vm_area(blkif->blk_ring_area); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); + unlock_vm_area(blkif->blk_ring_area); } int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) { - struct vm_struct *vma; - blkif_sring_t *sring; - evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; - int err; + blkif_sring_t *sring; + evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; + int err; - BUG_ON(blkif->remote_evtchn); + BUG_ON(blkif->remote_evtchn); - if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) - return -ENOMEM; + if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL ) + return -ENOMEM; - err = map_frontend_page(blkif, (unsigned long)vma->addr, shared_page); - if (err) { - vfree(vma->addr); - return err; - } + err = map_frontend_page(blkif, shared_page); + if 
(err) { + free_vm_area(blkif->blk_ring_area); + return err; + } - op.u.bind_interdomain.dom1 = DOMID_SELF; - op.u.bind_interdomain.dom2 = blkif->domid; - op.u.bind_interdomain.port1 = 0; - op.u.bind_interdomain.port2 = evtchn; - err = HYPERVISOR_event_channel_op(&op); - if (err) { - unmap_frontend_page(blkif); - vfree(vma->addr); - return err; - } + op.u.bind_interdomain.dom1 = DOMID_SELF; + op.u.bind_interdomain.dom2 = blkif->domid; + op.u.bind_interdomain.port1 = 0; + op.u.bind_interdomain.port2 = evtchn; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + return err; + } - blkif->evtchn = op.u.bind_interdomain.port1; - blkif->remote_evtchn = evtchn; + blkif->evtchn = op.u.bind_interdomain.port1; + blkif->remote_evtchn = evtchn; - sring = (blkif_sring_t *)vma->addr; - SHARED_RING_INIT(sring); - BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE); + sring = (blkif_sring_t *)blkif->blk_ring_area->addr; + SHARED_RING_INIT(sring); + BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE); - bind_evtchn_to_irqhandler(blkif->evtchn, blkif_be_int, 0, "blkif-backend", - blkif); - blkif->status = CONNECTED; - blkif->shmem_frame = shared_page; + bind_evtchn_to_irqhandler( + blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif); + blkif->status = CONNECTED; - return 0; + return 0; } static void free_blkif(void *arg) { - evtchn_op_t op = { .cmd = EVTCHNOP_close }; - blkif_t *blkif = (blkif_t *)arg; + evtchn_op_t op = { .cmd = EVTCHNOP_close }; + blkif_t *blkif = (blkif_t *)arg; - op.u.close.port = blkif->evtchn; - op.u.close.dom = DOMID_SELF; - HYPERVISOR_event_channel_op(&op); - op.u.close.port = blkif->remote_evtchn; - op.u.close.dom = blkif->domid; - HYPERVISOR_event_channel_op(&op); + op.u.close.port = blkif->evtchn; + op.u.close.dom = DOMID_SELF; + HYPERVISOR_event_channel_op(&op); + op.u.close.port = blkif->remote_evtchn; + op.u.close.dom = blkif->domid; + HYPERVISOR_event_channel_op(&op); - 
vbd_free(&blkif->vbd); + vbd_free(&blkif->vbd); - if (blkif->evtchn) - unbind_evtchn_from_irqhandler(blkif->evtchn, blkif); + if (blkif->evtchn) + unbind_evtchn_from_irqhandler(blkif->evtchn, blkif); - if (blkif->blk_ring.sring) { - unmap_frontend_page(blkif); - vfree(blkif->blk_ring.sring); - blkif->blk_ring.sring = NULL; - } + if (blkif->blk_ring.sring) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_ring.sring = NULL; + } - kmem_cache_free(blkif_cachep, blkif); + kmem_cache_free(blkif_cachep, blkif); } void free_blkif_callback(blkif_t *blkif) { - INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif); - schedule_work(&blkif->free_work); + INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif); + schedule_work(&blkif->free_work); } void __init blkif_interface_init(void) { - blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), - 0, 0, NULL, NULL); + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), + 0, 0, NULL, NULL); } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c Thu Sep 22 17:42:01 2005 @@ -11,10 +11,10 @@ static inline dev_t vbd_map_devnum(u32 cookie) { - return MKDEV(BLKIF_MAJOR(cookie), BLKIF_MINOR(cookie)); + return MKDEV(BLKIF_MAJOR(cookie), BLKIF_MINOR(cookie)); } -#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ - (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity) +#define vbd_sz(_v) ((_v)->bdev->bd_part ? 
\ + (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity) #define bdev_put(_b) blkdev_put(_b) unsigned long vbd_size(struct vbd *vbd) @@ -35,63 +35,73 @@ int vbd_create(blkif_t *blkif, blkif_vdev_t handle, u32 pdevice, int readonly) { - struct vbd *vbd; + struct vbd *vbd; - vbd = &blkif->vbd; - vbd->handle = handle; - vbd->readonly = readonly; - vbd->type = 0; + vbd = &blkif->vbd; + vbd->handle = handle; + vbd->readonly = readonly; + vbd->type = 0; - vbd->pdevice = pdevice; + vbd->pdevice = pdevice; - vbd->bdev = open_by_devnum( - vbd_map_devnum(vbd->pdevice), - vbd->readonly ? FMODE_READ : FMODE_WRITE); - if ( IS_ERR(vbd->bdev) ) - { - DPRINTK("vbd_creat: device %08x doesn't exist.\n", vbd->pdevice); - return -ENOENT; - } + vbd->bdev = open_by_devnum( + vbd_map_devnum(vbd->pdevice), + vbd->readonly ? FMODE_READ : FMODE_WRITE); + if (IS_ERR(vbd->bdev)) { + DPRINTK("vbd_creat: device %08x doesn't exist.\n", + vbd->pdevice); + return -ENOENT; + } - if ( (vbd->bdev->bd_disk == NULL) ) - { - DPRINTK("vbd_creat: device %08x doesn't exist.\n", vbd->pdevice); - vbd_free(vbd); - return -ENOENT; - } + if (vbd->bdev->bd_disk == NULL) { + DPRINTK("vbd_creat: device %08x doesn't exist.\n", + vbd->pdevice); + vbd_free(vbd); + return -ENOENT; + } - if ( vbd->bdev->bd_disk->flags & GENHD_FL_CD ) - vbd->type |= VDISK_CDROM; - if ( vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE ) - vbd->type |= VDISK_REMOVABLE; + if (vbd->bdev->bd_disk->flags & GENHD_FL_CD) + vbd->type |= VDISK_CDROM; + if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) + vbd->type |= VDISK_REMOVABLE; - DPRINTK("Successful creation of handle=%04x (dom=%u)\n", - handle, blkif->domid); - return 0; + DPRINTK("Successful creation of handle=%04x (dom=%u)\n", + handle, blkif->domid); + return 0; } void vbd_free(struct vbd *vbd) { - if (vbd->bdev) - bdev_put(vbd->bdev); - vbd->bdev = NULL; + if (vbd->bdev) + bdev_put(vbd->bdev); + vbd->bdev = NULL; } int vbd_translate(struct phys_req *req, blkif_t *blkif, int 
operation) { - struct vbd *vbd = &blkif->vbd; - int rc = -EACCES; + struct vbd *vbd = &blkif->vbd; + int rc = -EACCES; - if ((operation == WRITE) && vbd->readonly) - goto out; + if ((operation == WRITE) && vbd->readonly) + goto out; - if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd))) - goto out; + if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd))) + goto out; - req->dev = vbd->pdevice; - req->bdev = vbd->bdev; - rc = 0; + req->dev = vbd->pdevice; + req->bdev = vbd->bdev; + rc = 0; out: - return rc; + return rc; } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Thu Sep 22 17:42:01 2005 @@ -124,7 +124,7 @@ return; -abort: + abort: xenbus_transaction_end(1); } @@ -228,6 +228,7 @@ be->dev = dev; be->backend_watch.node = dev->nodename; be->backend_watch.callback = backend_changed; + /* Will implicitly call backend_changed once. 
*/ err = register_xenbus_watch(&be->backend_watch); if (err) { be->backend_watch.node = NULL; @@ -249,8 +250,6 @@ } dev->data = be; - - backend_changed(&be->backend_watch, dev->nodename); return 0; free_be: @@ -279,3 +278,13 @@ { xenbus_register_backend(&blkback); } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blkfront/block.h --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h Thu Sep 22 17:42:01 2005 @@ -146,4 +146,15 @@ int xlvbd_add(blkif_sector_t capacity, int device, u16 vdisk_info, u16 sector_size, struct blkfront_info *info); void xlvbd_del(struct blkfront_info *info); + #endif /* __XEN_DRIVERS_BLOCK_H__ */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c Thu Sep 22 17:42:01 2005 @@ -65,7 +65,7 @@ }; static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS + - NUM_VBD_MAJORS]; + NUM_VBD_MAJORS]; #define XLBD_MAJOR_IDE_START 0 #define XLBD_MAJOR_SCSI_START (NUM_IDE_MAJORS) @@ -309,3 +309,13 @@ bdput(bd); } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Thu Sep 22 17:42:01 2005 @@ -4,7 +4,6 @@ * This is a modified version of the block backend driver 
that remaps requests * to a user-space memory region. It is intended to be used to write * application-level servers that provide block interfaces to client VMs. - * */ #include <linux/kernel.h> @@ -67,20 +66,19 @@ static inline int BLKTAP_MODE_VALID(unsigned long arg) { - return ( - ( arg == BLKTAP_MODE_PASSTHROUGH ) || - ( arg == BLKTAP_MODE_INTERCEPT_FE ) || - ( arg == BLKTAP_MODE_INTERPOSE ) ); + return ((arg == BLKTAP_MODE_PASSTHROUGH ) || + (arg == BLKTAP_MODE_INTERCEPT_FE) || + (arg == BLKTAP_MODE_INTERPOSE )); /* - return ( - ( arg == BLKTAP_MODE_PASSTHROUGH ) || - ( arg == BLKTAP_MODE_INTERCEPT_FE ) || - ( arg == BLKTAP_MODE_INTERCEPT_BE ) || - ( arg == BLKTAP_MODE_INTERPOSE ) || - ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) || - ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) || - ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH ) - ); + return ( + ( arg == BLKTAP_MODE_PASSTHROUGH ) || + ( arg == BLKTAP_MODE_INTERCEPT_FE ) || + ( arg == BLKTAP_MODE_INTERCEPT_BE ) || + ( arg == BLKTAP_MODE_INTERPOSE ) || + ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) || + ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) || + ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH ) + ); */ } @@ -110,14 +108,12 @@ unsigned long rings_vstart; /* start of mmaped vma */ unsigned long user_vstart; /* start of user mappings */ -#define MMAP_PAGES \ - (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) -#define MMAP_VADDR(_start, _req,_seg) \ - (_start + \ - ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ - ((_seg) * PAGE_SIZE)) - - +#define MMAP_PAGES \ + (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_start, _req,_seg) \ + (_start + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) /* * Each outstanding request that we've passed to the lower device layers has a @@ -126,12 +122,12 @@ * response queued for it, with the saved 
'id' passed back. */ typedef struct { - blkif_t *blkif; - unsigned long id; - int nr_pages; - atomic_t pendcnt; - unsigned short operation; - int status; + blkif_t *blkif; + unsigned long id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; } pending_req_t; /* @@ -156,17 +152,17 @@ static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx) { - return ( (fe_dom << 16) | MASK_PEND_IDX(idx) ); + return ((fe_dom << 16) | MASK_PEND_IDX(idx)); } extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id) { - return (PEND_RING_IDX)( id & 0x0000ffff ); + return (PEND_RING_IDX)(id & 0x0000ffff); } extern inline domid_t ID_TO_DOM(unsigned long id) { - return (domid_t)(id >> 16); + return (domid_t)(id >> 16); } @@ -181,8 +177,8 @@ */ struct grant_handle_pair { - u16 kernel; - u16 user; + u16 kernel; + u16 user; }; static struct grant_handle_pair pending_grant_handles[MMAP_PAGES]; #define pending_handle(_idx, _i) \ @@ -199,21 +195,20 @@ */ static struct page *blktap_nopage(struct vm_area_struct *vma, - unsigned long address, - int *type) -{ - /* - * if the page has not been mapped in by the driver then generate - * a SIGBUS to the domain. - */ - - force_sig(SIGBUS, current); - - return 0; + unsigned long address, + int *type) +{ + /* + * if the page has not been mapped in by the driver then generate + * a SIGBUS to the domain. + */ + force_sig(SIGBUS, current); + + return 0; } struct vm_operations_struct blktap_vm_ops = { - nopage: blktap_nopage, + nopage: blktap_nopage, }; /****************************************************************** @@ -222,44 +217,45 @@ static int blktap_open(struct inode *inode, struct file *filp) { - blkif_sring_t *sring; + blkif_sring_t *sring; + + if (test_and_set_bit(0, &blktap_dev_inuse)) + return -EBUSY; - if ( test_and_set_bit(0, &blktap_dev_inuse) ) - return -EBUSY; + /* Allocate the fe ring. 
*/ + sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); + if (sring == NULL) + goto fail_nomem; + + SetPageReserved(virt_to_page(sring)); - /* Allocate the fe ring. */ - sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); - if (sring == NULL) - goto fail_nomem; - - SetPageReserved(virt_to_page(sring)); - - SHARED_RING_INIT(sring); - FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE); - - return 0; + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE); + + return 0; fail_nomem: - return -ENOMEM; + return -ENOMEM; } static int blktap_release(struct inode *inode, struct file *filp) { - blktap_dev_inuse = 0; - blktap_ring_ok = 0; - - /* Free the ring page. */ - ClearPageReserved(virt_to_page(blktap_ufe_ring.sring)); - free_page((unsigned long) blktap_ufe_ring.sring); - - /* Clear any active mappings and free foreign map table */ - if (blktap_vma != NULL) { - zap_page_range(blktap_vma, blktap_vma->vm_start, - blktap_vma->vm_end - blktap_vma->vm_start, NULL); - blktap_vma = NULL; - } - - return 0; + blktap_dev_inuse = 0; + blktap_ring_ok = 0; + + /* Free the ring page. 
*/ + ClearPageReserved(virt_to_page(blktap_ufe_ring.sring)); + free_page((unsigned long) blktap_ufe_ring.sring); + + /* Clear any active mappings and free foreign map table */ + if (blktap_vma != NULL) { + zap_page_range( + blktap_vma, blktap_vma->vm_start, + blktap_vma->vm_end - blktap_vma->vm_start, NULL); + blktap_vma = NULL; + } + + return 0; } @@ -283,128 +279,124 @@ */ static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) { - int size; - struct page **map; - int i; - - DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n", - vma->vm_start, vma->vm_end); - - vma->vm_flags |= VM_RESERVED; - vma->vm_ops = &blktap_vm_ops; - - size = vma->vm_end - vma->vm_start; - if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) { - printk(KERN_INFO - "blktap: you _must_ map exactly %d pages!\n", - MMAP_PAGES + RING_PAGES); - return -EAGAIN; - } - - size >>= PAGE_SHIFT; - DPRINTK(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1); + int size; + struct page **map; + int i; + + DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n", + vma->vm_start, vma->vm_end); + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &blktap_vm_ops; + + size = vma->vm_end - vma->vm_start; + if (size != ((MMAP_PAGES + RING_PAGES) << PAGE_SHIFT)) { + printk(KERN_INFO + "blktap: you _must_ map exactly %d pages!\n", + MMAP_PAGES + RING_PAGES); + return -EAGAIN; + } + + size >>= PAGE_SHIFT; + DPRINTK(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1); - rings_vstart = vma->vm_start; - user_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT); + rings_vstart = vma->vm_start; + user_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT); - /* Map the ring pages to the start of the region and reserve it. */ - - /* not sure if I really need to do this... 
*/ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - if (remap_pfn_range(vma, vma->vm_start, - __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) - { - WPRINTK("Mapping user ring failed!\n"); - goto fail; - } - - /* Mark this VM as containing foreign pages, and set up mappings. */ - map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - * sizeof(struct page_struct*), - GFP_KERNEL); - if (map == NULL) - { - WPRINTK("Couldn't alloc VM_FOREIGH map.\n"); - goto fail; - } - - for (i=0; i<((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++) - map[i] = NULL; + /* Map the ring pages to the start of the region and reserve it. */ + + /* not sure if I really need to do this... */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (remap_pfn_range(vma, vma->vm_start, + __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot)) { + WPRINTK("Mapping user ring failed!\n"); + goto fail; + } + + /* Mark this VM as containing foreign pages, and set up mappings. */ + map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + * sizeof(struct page_struct*), + GFP_KERNEL); + if (map == NULL) { + WPRINTK("Couldn't alloc VM_FOREIGH map.\n"); + goto fail; + } + + for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++) + map[i] = NULL; - vma->vm_private_data = map; - vma->vm_flags |= VM_FOREIGN; - - blktap_vma = vma; - blktap_ring_ok = 1; - - return 0; + vma->vm_private_data = map; + vma->vm_flags |= VM_FOREIGN; + + blktap_vma = vma; + blktap_ring_ok = 1; + + return 0; fail: - /* Clear any active mappings. */ - zap_page_range(vma, vma->vm_start, - vma->vm_end - vma->vm_start, NULL); - - return -ENOMEM; + /* Clear any active mappings. 
*/ + zap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); + + return -ENOMEM; } static int blktap_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { - switch(cmd) { - case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */ - return blktap_read_ufe_ring(); - - case BLKTAP_IOCTL_SETMODE: - if (BLKTAP_MODE_VALID(arg)) { - blktap_mode = arg; - /* XXX: may need to flush rings here. */ - printk(KERN_INFO "blktap: set mode to %lx\n", arg); - return 0; - } - case BLKTAP_IOCTL_PRINT_IDXS: + switch(cmd) { + case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */ + return blktap_read_ufe_ring(); + + case BLKTAP_IOCTL_SETMODE: + if (BLKTAP_MODE_VALID(arg)) { + blktap_mode = arg; + /* XXX: may need to flush rings here. */ + printk(KERN_INFO "blktap: set mode to %lx\n", arg); + return 0; + } + case BLKTAP_IOCTL_PRINT_IDXS: { - //print_fe_ring_idxs(); - WPRINTK("User Rings: \n-----------\n"); - WPRINTK("UF: rsp_cons: %2d, req_prod_prv: %2d " - "| req_prod: %2d, rsp_prod: %2d\n", - blktap_ufe_ring.rsp_cons, - blktap_ufe_ring.req_prod_pvt, - blktap_ufe_ring.sring->req_prod, - blktap_ufe_ring.sring->rsp_prod); + //print_fe_ring_idxs(); + WPRINTK("User Rings: \n-----------\n"); + WPRINTK("UF: rsp_cons: %2d, req_prod_prv: %2d " + "| req_prod: %2d, rsp_prod: %2d\n", + blktap_ufe_ring.rsp_cons, + blktap_ufe_ring.req_prod_pvt, + blktap_ufe_ring.sring->req_prod, + blktap_ufe_ring.sring->rsp_prod); } - } - return -ENOIOCTLCMD; + } + return -ENOIOCTLCMD; } static unsigned int blktap_poll(struct file *file, poll_table *wait) { - poll_wait(file, &blktap_wait, wait); - if ( RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring) ) - { - flush_tlb_all(); - - RING_PUSH_REQUESTS(&blktap_ufe_ring); - return POLLIN | POLLRDNORM; - } - - return 0; + poll_wait(file, &blktap_wait, wait); + if (RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring)) { + flush_tlb_all(); + RING_PUSH_REQUESTS(&blktap_ufe_ring); + return POLLIN | POLLRDNORM; + } + + 
return 0; } void blktap_kick_user(void) { - /* blktap_ring->req_prod = blktap_req_prod; */ - wake_up_interruptible(&blktap_wait); + /* blktap_ring->req_prod = blktap_req_prod; */ + wake_up_interruptible(&blktap_wait); } static struct file_operations blktap_fops = { - owner: THIS_MODULE, - poll: blktap_poll, - ioctl: blktap_ioctl, - open: blktap_open, - release: blktap_release, - mmap: blktap_mmap, + owner: THIS_MODULE, + poll: blktap_poll, + ioctl: blktap_ioctl, + open: blktap_open, + release: blktap_release, + mmap: blktap_mmap, }; @@ -417,44 +409,44 @@ static void fast_flush_area(int idx, int nr_pages) { - struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; - unsigned int i, op = 0; - struct grant_handle_pair *handle; - unsigned long ptep; - - for (i=0; i<nr_pages; i++) - { - handle = &pending_handle(idx, i); - if (!BLKTAP_INVALID_HANDLE(handle)) - { - - unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i); - unmap[op].dev_bus_addr = 0; - unmap[op].handle = handle->kernel; - op++; - - if (create_lookup_pte_addr(blktap_vma->vm_mm, - MMAP_VADDR(user_vstart, idx, i), - &ptep) !=0) { - DPRINTK("Couldn't get a pte addr!\n"); - return; - } - unmap[op].host_addr = ptep; - unmap[op].dev_bus_addr = 0; - unmap[op].handle = handle->user; - op++; + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; + unsigned int i, op = 0; + struct grant_handle_pair *handle; + unsigned long ptep; + + for ( i = 0; i < nr_pages; i++) + { + handle = &pending_handle(idx, i); + if (BLKTAP_INVALID_HANDLE(handle)) + continue; + + unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i); + unmap[op].dev_bus_addr = 0; + unmap[op].handle = handle->kernel; + op++; + + if (create_lookup_pte_addr( + blktap_vma->vm_mm, + MMAP_VADDR(user_vstart, idx, i), + &ptep) !=0) { + DPRINTK("Couldn't get a pte addr!\n"); + return; + } + unmap[op].host_addr = ptep; + unmap[op].dev_bus_addr = 0; + unmap[op].handle = handle->user; + op++; - BLKTAP_INVALIDATE_HANDLE(handle); - } - } 
- if ( unlikely(HYPERVISOR_grant_table_op( - GNTTABOP_unmap_grant_ref, unmap, op))) - BUG(); - - if (blktap_vma != NULL) - zap_page_range(blktap_vma, - MMAP_VADDR(user_vstart, idx, 0), - nr_pages << PAGE_SHIFT, NULL); + BLKTAP_INVALIDATE_HANDLE(handle); + } + + BUG_ON(HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, op)); + + if (blktap_vma != NULL) + zap_page_range(blktap_vma, + MMAP_VADDR(user_vstart, idx, 0), + nr_pages << PAGE_SHIFT, NULL); } /****************************************************************** @@ -466,34 +458,38 @@ static int __on_blkdev_list(blkif_t *blkif) { - return blkif->blkdev_list.next != NULL; + return blkif->blkdev_list.next != NULL; } static void remove_from_blkdev_list(blkif_t *blkif) { - unsigned long flags; - if ( !__on_blkdev_list(blkif) ) return; - spin_lock_irqsave(&blkio_schedule_list_lock, flags); - if ( __on_blkdev_list(blkif) ) - { - list_del(&blkif->blkdev_list); - blkif->blkdev_list.next = NULL; - blkif_put(blkif); - } - spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); + unsigned long flags; + + if (!__on_blkdev_list(blkif)) + return; + + spin_lock_irqsave(&blkio_schedule_list_lock, flags); + if (__on_blkdev_list(blkif)) { + list_del(&blkif->blkdev_list); + blkif->blkdev_list.next = NULL; + blkif_put(blkif); + } + spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); } static void add_to_blkdev_list_tail(blkif_t *blkif) { - unsigned long flags; - if ( __on_blkdev_list(blkif) ) return; - spin_lock_irqsave(&blkio_schedule_list_lock, flags); - if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) ) - { - list_add_tail(&blkif->blkdev_list, &blkio_schedule_list); - blkif_get(blkif); - } - spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); + unsigned long flags; + + if (__on_blkdev_list(blkif)) + return; + + spin_lock_irqsave(&blkio_schedule_list_lock, flags); + if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) { + list_add_tail(&blkif->blkdev_list, &blkio_schedule_list); + 
blkif_get(blkif); + } + spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); } @@ -505,51 +501,50 @@ static int blkio_schedule(void *arg) { - DECLARE_WAITQUEUE(wq, current); - - blkif_t *blkif; - struct list_head *ent; - - daemonize("xenblkd"); - - for ( ; ; ) - { - /* Wait for work to do. */ - add_wait_queue(&blkio_schedule_wait, &wq); - set_current_state(TASK_INTERRUPTIBLE); - if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || - list_empty(&blkio_schedule_list) ) - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&blkio_schedule_wait, &wq); - - /* Queue up a batch of requests. */ - while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && - !list_empty(&blkio_schedule_list) ) - { - ent = blkio_schedule_list.next; - blkif = list_entry(ent, blkif_t, blkdev_list); - blkif_get(blkif); - remove_from_blkdev_list(blkif); - if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) ) - add_to_blkdev_list_tail(blkif); - blkif_put(blkif); - } - } + DECLARE_WAITQUEUE(wq, current); + + blkif_t *blkif; + struct list_head *ent; + + daemonize("xenblkd"); + + for (;;) { + /* Wait for work to do. */ + add_wait_queue(&blkio_schedule_wait, &wq); + set_current_state(TASK_INTERRUPTIBLE); + if ((NR_PENDING_REQS == MAX_PENDING_REQS) || + list_empty(&blkio_schedule_list)) + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&blkio_schedule_wait, &wq); + + /* Queue up a batch of requests. */ + while ((NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&blkio_schedule_list)) { + ent = blkio_schedule_list.next; + blkif = list_entry(ent, blkif_t, blkdev_list); + blkif_get(blkif); + remove_from_blkdev_list(blkif); + if (do_block_io_op(blkif, BATCH_PER_DOMAIN)) + add_to_blkdev_list_tail(blkif); + blkif_put(blkif); + } + } } static void maybe_trigger_blkio_schedule(void) { - /* - * Needed so that two processes, who together make the following predicate - * true, don't both read stale values and evaluate the predicate - * incorrectly. 
Incredibly unlikely to stall the scheduler on x86, but... - */ - smp_mb(); - - if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && - !list_empty(&blkio_schedule_list) ) - wake_up(&blkio_schedule_wait); + /* + * Needed so that two processes, who together make the following + * predicate true, don't both read stale values and evaluate the + * predicate incorrectly. Incredibly unlikely to stall the scheduler + * on the x86, but... + */ + smp_mb(); + + if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&blkio_schedule_list)) + wake_up(&blkio_schedule_wait); } @@ -561,54 +556,53 @@ static int blktap_read_ufe_ring(void) { - /* This is called to read responses from the UFE ring. */ - - RING_IDX i, j, rp; - blkif_response_t *resp; - blkif_t *blkif; - int pending_idx; - pending_req_t *pending_req; - unsigned long flags; - - /* if we are forwarding from UFERring to FERing */ - if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) { - - /* for each outstanding message on the UFEring */ - rp = blktap_ufe_ring.sring->rsp_prod; - rmb(); + /* This is called to read responses from the UFE ring. 
*/ + + RING_IDX i, j, rp; + blkif_response_t *resp; + blkif_t *blkif; + int pending_idx; + pending_req_t *pending_req; + unsigned long flags; + + /* if we are forwarding from UFERring to FERing */ + if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) { + + /* for each outstanding message on the UFEring */ + rp = blktap_ufe_ring.sring->rsp_prod; + rmb(); - for ( i = blktap_ufe_ring.rsp_cons; i != rp; i++ ) - { - resp = RING_GET_RESPONSE(&blktap_ufe_ring, i); - pending_idx = MASK_PEND_IDX(ID_TO_IDX(resp->id)); - pending_req = &pending_reqs[pending_idx]; + for (i = blktap_ufe_ring.rsp_cons; i != rp; i++) { + resp = RING_GET_RESPONSE(&blktap_ufe_ring, i); + pending_idx = MASK_PEND_IDX(ID_TO_IDX(resp->id)); + pending_req = &pending_reqs[pending_idx]; - blkif = pending_req->blkif; - for (j = 0; j < pending_req->nr_pages; j++) { - unsigned long vaddr; - struct page **map = blktap_vma->vm_private_data; - int offset; - - vaddr = MMAP_VADDR(user_vstart, pending_idx, j); - offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT; - - //ClearPageReserved(virt_to_page(vaddr)); - ClearPageReserved((struct page *)map[offset]); - map[offset] = NULL; - } - - fast_flush_area(pending_idx, pending_req->nr_pages); - make_response(blkif, pending_req->id, resp->operation, - resp->status); - blkif_put(pending_req->blkif); - spin_lock_irqsave(&pend_prod_lock, flags); - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - spin_unlock_irqrestore(&pend_prod_lock, flags); - } - blktap_ufe_ring.rsp_cons = i; - maybe_trigger_blkio_schedule(); - } - return 0; + blkif = pending_req->blkif; + for (j = 0; j < pending_req->nr_pages; j++) { + unsigned long vaddr; + struct page **map = blktap_vma->vm_private_data; + int offset; + + vaddr = MMAP_VADDR(user_vstart, pending_idx, j); + offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT; + + //ClearPageReserved(virt_to_page(vaddr)); + ClearPageReserved((struct page *)map[offset]); + map[offset] = NULL; + } + + fast_flush_area(pending_idx, 
pending_req->nr_pages); + make_response(blkif, pending_req->id, resp->operation, + resp->status); + blkif_put(pending_req->blkif); + spin_lock_irqsave(&pend_prod_lock, flags); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + spin_unlock_irqrestore(&pend_prod_lock, flags); + } + blktap_ufe_ring.rsp_cons = i; + maybe_trigger_blkio_schedule(); + } + return 0; } @@ -618,10 +612,10 @@ irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) { - blkif_t *blkif = dev_id; - add_to_blkdev_list_tail(blkif); - maybe_trigger_blkio_schedule(); - return IRQ_HANDLED; + blkif_t *blkif = dev_id; + add_to_blkdev_list_tail(blkif); + maybe_trigger_blkio_schedule(); + return IRQ_HANDLED; } @@ -632,199 +626,194 @@ static int do_block_io_op(blkif_t *blkif, int max_to_do) { - blkif_back_ring_t *blk_ring = &blkif->blk_ring; - blkif_request_t *req; - RING_IDX i, rp; - int more_to_do = 0; + blkif_back_ring_t *blk_ring = &blkif->blk_ring; + blkif_request_t *req; + RING_IDX i, rp; + int more_to_do = 0; - rp = blk_ring->sring->req_prod; - rmb(); /* Ensure we see queued requests up to 'rp'. */ - - for ( i = blk_ring->req_cons; - (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i); - i++ ) - { - if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) ) - { - more_to_do = 1; - break; - } + rp = blk_ring->sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. 
*/ + + for (i = blk_ring->req_cons; + (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i); + i++ ) { + if ((max_to_do-- == 0) || + (NR_PENDING_REQS == MAX_PENDING_REQS)) { + more_to_do = 1; + break; + } - req = RING_GET_REQUEST(blk_ring, i); - switch ( req->operation ) - { - case BLKIF_OP_READ: - case BLKIF_OP_WRITE: - dispatch_rw_block_io(blkif, req); - break; - - default: - DPRINTK("error: unknown block io operation [%d]\n", - req->operation); - make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); - break; - } - } - - blk_ring->req_cons = i; - blktap_kick_user(); - - return more_to_do; + req = RING_GET_REQUEST(blk_ring, i); + switch (req->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + dispatch_rw_block_io(blkif, req); + break; + + default: + DPRINTK("error: unknown block io operation [%d]\n", + req->operation); + make_response(blkif, req->id, req->operation, + BLKIF_RSP_ERROR); + break; + } + } + + blk_ring->req_cons = i; + blktap_kick_user(); + + return more_to_do; } static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) { - blkif_request_t *target; - int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; - pending_req_t *pending_req; - struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; - int op, ret; - unsigned int nseg; - - /* Check that number of segments is sane. */ - nseg = req->nr_segments; - if ( unlikely(nseg == 0) || - unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) - { - DPRINTK("Bad number of segments in request (%d)\n", nseg); - goto bad_descriptor; - } - - /* Make sure userspace is ready. */ - if (!blktap_ring_ok) { - DPRINTK("blktap: ring not ready for requests!\n"); - goto bad_descriptor; - } + blkif_request_t *target; + int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + pending_req_t *pending_req; + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; + int op, ret; + unsigned int nseg; + + /* Check that number of segments is sane. 
*/ + nseg = req->nr_segments; + if (unlikely(nseg == 0) || + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + DPRINTK("Bad number of segments in request (%d)\n", nseg); + goto bad_descriptor; + } + + /* Make sure userspace is ready. */ + if (!blktap_ring_ok) { + DPRINTK("blktap: ring not ready for requests!\n"); + goto bad_descriptor; + } - if ( RING_FULL(&blktap_ufe_ring) ) { - WPRINTK("blktap: fe_ring is full, can't add (very broken!).\n"); - goto bad_descriptor; - } - - flush_cache_all(); /* a noop on intel... */ - - /* Map the foreign pages directly in to the application */ - op = 0; - for (i=0; i<req->nr_segments; i++) { - - unsigned long uvaddr; - unsigned long kvaddr; - unsigned long ptep; - - uvaddr = MMAP_VADDR(user_vstart, pending_idx, i); - kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i); - - /* Map the remote page to kernel. */ - map[op].host_addr = kvaddr; - map[op].dom = blkif->domid; - map[op].ref = blkif_gref_from_fas(req->frame_and_sects[i]); - map[op].flags = GNTMAP_host_map; - /* This needs a bit more thought in terms of interposition: - * If we want to be able to modify pages during write using - * grant table mappings, the guest will either need to allow - * it, or we'll need to incur a copy. Bit of an fbufs moment. ;) */ - if (req->operation == BLKIF_OP_WRITE) - map[op].flags |= GNTMAP_readonly; - op++; - - /* Now map it to user. */ - ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep); - if (ret) - { - DPRINTK("Couldn't get a pte addr!\n"); - fast_flush_area(pending_idx, req->nr_segments); - goto bad_descriptor; - } - - map[op].host_addr = ptep; - map[op].dom = blkif->domid; - map[op].ref = blkif_gref_from_fas(req->frame_and_sects[i]); - map[op].flags = GNTMAP_host_map | GNTMAP_application_map - | GNTMAP_contains_pte; - /* Above interposition comment applies here as well. 
*/ - if (req->operation == BLKIF_OP_WRITE) - map[op].flags |= GNTMAP_readonly; - op++; - } - - if ( unlikely(HYPERVISOR_grant_table_op( - GNTTABOP_map_grant_ref, map, op))) - BUG(); - - op = 0; - for (i=0; i<(req->nr_segments*2); i+=2) { - unsigned long uvaddr; - unsigned long kvaddr; - unsigned long offset; - int cancel = 0; - - uvaddr = MMAP_VADDR(user_vstart, pending_idx, i/2); - kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i/2); - - if ( unlikely(map[i].handle < 0) ) - { - DPRINTK("Error on kernel grant mapping (%d)\n", map[i].handle); - ret = map[i].handle; - cancel = 1; - } - - if ( unlikely(map[i+1].handle < 0) ) - { - DPRINTK("Error on user grant mapping (%d)\n", map[i+1].handle); - ret = map[i+1].handle; - cancel = 1; - } - - if (cancel) - { - fast_flush_area(pending_idx, req->nr_segments); - goto bad_descriptor; - } - - /* Set the necessary mappings in p2m and in the VM_FOREIGN - * vm_area_struct to allow user vaddr -> struct page lookups - * to work. This is needed for direct IO to foreign pages. */ - phys_to_machine_mapping[__pa(kvaddr) >> PAGE_SHIFT] = - FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT); - - offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT; - ((struct page **)blktap_vma->vm_private_data)[offset] = - pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); - - /* Save handles for unmapping later. 
*/ - pending_handle(pending_idx, i/2).kernel = map[i].handle; - pending_handle(pending_idx, i/2).user = map[i+1].handle; - } - - /* Mark mapped pages as reserved: */ - for ( i = 0; i < req->nr_segments; i++ ) - { - unsigned long kvaddr; - - kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i); - SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT)); - } - - pending_req = &pending_reqs[pending_idx]; - pending_req->blkif = blkif; - pending_req->id = req->id; - pending_req->operation = req->operation; - pending_req->status = BLKIF_RSP_OKAY; - pending_req->nr_pages = nseg; - req->id = MAKE_ID(blkif->domid, pending_idx); - //atomic_set(&pending_req->pendcnt, nbio); - pending_cons++; - blkif_get(blkif); - - /* Finally, write the request message to the user ring. */ - target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt); - memcpy(target, req, sizeof(*req)); - blktap_ufe_ring.req_prod_pvt++; - return; + if (RING_FULL(&blktap_ufe_ring)) { + WPRINTK("blktap: fe_ring is full, can't add " + "(very broken!).\n"); + goto bad_descriptor; + } + + flush_cache_all(); /* a noop on intel... */ + + /* Map the foreign pages directly in to the application */ + op = 0; + for (i = 0; i < req->nr_segments; i++) { + + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long ptep; + + uvaddr = MMAP_VADDR(user_vstart, pending_idx, i); + kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i); + + /* Map the remote page to kernel. */ + map[op].host_addr = kvaddr; + map[op].dom = blkif->domid; + map[op].ref = blkif_gref_from_fas(req->frame_and_sects[i]); + map[op].flags = GNTMAP_host_map; + /* This needs a bit more thought in terms of interposition: + * If we want to be able to modify pages during write using + * grant table mappings, the guest will either need to allow + * it, or we'll need to incur a copy. Bit of an fbufs moment. ;) */ + if (req->operation == BLKIF_OP_WRITE) + map[op].flags |= GNTMAP_readonly; + op++; + + /* Now map it to user. 
*/ + ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep); + if (ret) { + DPRINTK("Couldn't get a pte addr!\n"); + fast_flush_area(pending_idx, req->nr_segments); + goto bad_descriptor; + } + + map[op].host_addr = ptep; + map[op].dom = blkif->domid; + map[op].ref = blkif_gref_from_fas(req->frame_and_sects[i]); + map[op].flags = GNTMAP_host_map | GNTMAP_application_map + | GNTMAP_contains_pte; + /* Above interposition comment applies here as well. */ + if (req->operation == BLKIF_OP_WRITE) + map[op].flags |= GNTMAP_readonly; + op++; + } + + BUG_ON(HYPERVISOR_grant_table_op( + GNTTABOP_map_grant_ref, map, op)); + + op = 0; + for (i = 0; i < (req->nr_segments*2); i += 2) { + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long offset; + int cancel = 0; + + uvaddr = MMAP_VADDR(user_vstart, pending_idx, i/2); + kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i/2); + + if (unlikely(map[i].handle < 0)) { + DPRINTK("Error on kernel grant mapping (%d)\n", + map[i].handle); + ret = map[i].handle; + cancel = 1; + } + + if (unlikely(map[i+1].handle < 0)) { + DPRINTK("Error on user grant mapping (%d)\n", + map[i+1].handle); + ret = map[i+1].handle; + cancel = 1; + } + + if (cancel) { + fast_flush_area(pending_idx, req->nr_segments); + goto bad_descriptor; + } + + /* Set the necessary mappings in p2m and in the VM_FOREIGN + * vm_area_struct to allow user vaddr -> struct page lookups + * to work. This is needed for direct IO to foreign pages. */ + phys_to_machine_mapping[__pa(kvaddr) >> PAGE_SHIFT] = + FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT); + + offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT; + ((struct page **)blktap_vma->vm_private_data)[offset] = + pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + + /* Save handles for unmapping later. 
*/ + pending_handle(pending_idx, i/2).kernel = map[i].handle; + pending_handle(pending_idx, i/2).user = map[i+1].handle; + } + + /* Mark mapped pages as reserved: */ + for (i = 0; i < req->nr_segments; i++) { + unsigned long kvaddr; + kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i); + SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT)); + } + + pending_req = &pending_reqs[pending_idx]; + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = req->operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + req->id = MAKE_ID(blkif->domid, pending_idx); + //atomic_set(&pending_req->pendcnt, nbio); + pending_cons++; + blkif_get(blkif); + + /* Finally, write the request message to the user ring. */ + target = RING_GET_REQUEST(&blktap_ufe_ring, + blktap_ufe_ring.req_prod_pvt); + memcpy(target, req, sizeof(*req)); + blktap_ufe_ring.req_prod_pvt++; + return; bad_descriptor: - make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); } @@ -837,80 +826,89 @@ static void make_response(blkif_t *blkif, unsigned long id, unsigned short op, int st) { - blkif_response_t *resp; - unsigned long flags; - blkif_back_ring_t *blk_ring = &blkif->blk_ring; - - /* Place on the response ring for the relevant domain. */ - spin_lock_irqsave(&blkif->blk_ring_lock, flags); - resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); - resp->id = id; - resp->operation = op; - resp->status = st; - wmb(); /* Ensure other side can see the response fields. */ - blk_ring->rsp_prod_pvt++; - RING_PUSH_RESPONSES(blk_ring); - spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); - - /* Kick the relevant domain. */ - notify_via_evtchn(blkif->evtchn); + blkif_response_t *resp; + unsigned long flags; + blkif_back_ring_t *blk_ring = &blkif->blk_ring; + + /* Place on the response ring for the relevant domain. 
*/ + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); + resp->id = id; + resp->operation = op; + resp->status = st; + wmb(); /* Ensure other side can see the response fields. */ + blk_ring->rsp_prod_pvt++; + RING_PUSH_RESPONSES(blk_ring); + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + + /* Kick the relevant domain. */ + notify_via_evtchn(blkif->evtchn); } static struct miscdevice blktap_miscdev = { - .minor = BLKTAP_MINOR, - .name = "blktap", - .fops = &blktap_fops, - .devfs_name = "misc/blktap", + .minor = BLKTAP_MINOR, + .name = "blktap", + .fops = &blktap_fops, + .devfs_name = "misc/blktap", }; void blkif_deschedule(blkif_t *blkif) { - remove_from_blkdev_list(blkif); + remove_from_blkdev_list(blkif); } static int __init blkif_init(void) { - int i, j, err; - struct page *page; + int i, j, err; + struct page *page; /* - if ( !(xen_start_info->flags & SIF_INITDOMAIN) && - !(xen_start_info->flags & SIF_BLK_BE_DOMAIN) ) - return 0; + if ( !(xen_start_info->flags & SIF_INITDOMAIN) && + !(xen_start_info->flags & SIF_BLK_BE_DOMAIN) ) + return 0; */ - blkif_interface_init(); - - page = balloon_alloc_empty_page_range(MMAP_PAGES); - BUG_ON(page == NULL); - mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); - - pending_cons = 0; - pending_prod = MAX_PENDING_REQS; - memset(pending_reqs, 0, sizeof(pending_reqs)); - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - pending_ring[i] = i; + blkif_interface_init(); + + page = balloon_alloc_empty_page_range(MMAP_PAGES); + BUG_ON(page == NULL); + mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + memset(pending_reqs, 0, sizeof(pending_reqs)); + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + pending_ring[i] = i; - spin_lock_init(&blkio_schedule_list_lock); - INIT_LIST_HEAD(&blkio_schedule_list); - - if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 ) - BUG(); - - 
blkif_xenbus_init(); - - for (i=0; i<MAX_PENDING_REQS ; i++) - for (j=0; j<BLKIF_MAX_SEGMENTS_PER_REQUEST; j++) - BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j)); - - err = misc_register(&blktap_miscdev); - if ( err != 0 ) - { - printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err); - return err; - } - - init_waitqueue_head(&blktap_wait); - - return 0; + spin_lock_init(&blkio_schedule_list_lock); + INIT_LIST_HEAD(&blkio_schedule_list); + + BUG_ON(kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0); + + blkif_xenbus_init(); + + for (i = 0; i < MAX_PENDING_REQS ; i++) + for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++) + BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j)); + + err = misc_register(&blktap_miscdev); + if (err != 0) { + printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", + err); + return err; + } + + init_waitqueue_head(&blktap_wait); + + return 0; } __initcall(blkif_init); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blktap/common.h --- a/linux-2.6-xen-sparse/drivers/xen/blktap/common.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/common.h Thu Sep 22 17:42:01 2005 @@ -17,6 +17,7 @@ #include <asm-xen/xen-public/io/blkif.h> #include <asm-xen/xen-public/io/ring.h> #include <asm-xen/gnttab.h> +#include <asm-xen/driver_util.h> #if 0 #define ASSERT(_p) \ @@ -32,39 +33,39 @@ #define WPRINTK(fmt, args...) 
printk(KERN_WARNING "blk_tap: " fmt, ##args) struct vbd { - blkif_vdev_t handle; /* what the domain refers to this vbd as */ - unsigned char readonly; /* Non-zero -> read-only */ - unsigned char type; /* VDISK_xxx */ - u32 pdevice; /* phys device that this vbd maps to */ - struct block_device *bdev; + blkif_vdev_t handle; /* what the domain refers to this vbd as */ + unsigned char readonly; /* Non-zero -> read-only */ + unsigned char type; /* VDISK_xxx */ + u32 pdevice; /* phys device that this vbd maps to */ + struct block_device *bdev; }; typedef struct blkif_st { - /* Unique identifier for this interface. */ - domid_t domid; - unsigned int handle; - /* Physical parameters of the comms window. */ - unsigned long shmem_frame; - unsigned int evtchn; - unsigned int remote_evtchn; - /* Comms information. */ - blkif_back_ring_t blk_ring; - /* VBDs attached to this interface. */ - struct vbd vbd; - /* Private fields. */ - enum { DISCONNECTED, CONNECTED } status; + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned int evtchn; + unsigned int remote_evtchn; + /* Comms information. */ + blkif_back_ring_t blk_ring; + struct vm_struct *blk_ring_area; + /* VBDs attached to this interface. */ + struct vbd vbd; + /* Private fields. 
*/ + enum { DISCONNECTED, CONNECTED } status; #ifdef CONFIG_XEN_BLKDEV_TAP_BE - /* Is this a blktap frontend */ - unsigned int is_blktap; + /* Is this a blktap frontend */ + unsigned int is_blktap; #endif - struct list_head blkdev_list; - spinlock_t blk_ring_lock; - atomic_t refcnt; + struct list_head blkdev_list; + spinlock_t blk_ring_lock; + atomic_t refcnt; - struct work_struct free_work; - u16 shmem_handle; - unsigned long shmem_vaddr; - grant_ref_t shmem_ref; + struct work_struct free_work; + + u16 shmem_handle; + grant_ref_t shmem_ref; } blkif_t; blkif_t *alloc_blkif(domid_t domid); @@ -88,10 +89,10 @@ unsigned long vbd_secsize(struct vbd *vbd); struct phys_req { - unsigned short dev; - unsigned short nr_sects; - struct block_device *bdev; - blkif_sector_t sector_number; + unsigned short dev; + unsigned short nr_sects; + struct block_device *bdev; + blkif_sector_t sector_number; }; int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); @@ -105,3 +106,13 @@ irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); #endif /* __BLKIF__BACKEND__COMMON_H__ */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blktap/interface.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/interface.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/interface.c Thu Sep 22 17:42:01 2005 @@ -13,129 +13,143 @@ blkif_t *alloc_blkif(domid_t domid) { - blkif_t *blkif; + blkif_t *blkif; - blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); - if (!blkif) - return ERR_PTR(-ENOMEM); + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); + if (!blkif) + return ERR_PTR(-ENOMEM); - memset(blkif, 0, sizeof(*blkif)); - blkif->domid = domid; - blkif->status = DISCONNECTED; - spin_lock_init(&blkif->blk_ring_lock); - atomic_set(&blkif->refcnt, 1); + memset(blkif, 0, 
sizeof(*blkif)); + blkif->domid = domid; + blkif->status = DISCONNECTED; + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 1); - return blkif; + return blkif; } -static int map_frontend_page(blkif_t *blkif, unsigned long localaddr, - unsigned long shared_page) +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page) { - struct gnttab_map_grant_ref op; - op.host_addr = localaddr; - op.flags = GNTMAP_host_map; - op.ref = shared_page; - op.dom = blkif->domid; + struct gnttab_map_grant_ref op; - BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); + op.host_addr = (unsigned long)blkif->blk_ring_area->addr; + op.flags = GNTMAP_host_map; + op.ref = shared_page; + op.dom = blkif->domid; - if (op.handle < 0) { - DPRINTK(" Grant table operation failure !\n"); - return op.handle; - } + lock_vm_area(blkif->blk_ring_area); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)); + unlock_vm_area(blkif->blk_ring_area); - blkif->shmem_ref = shared_page; - blkif->shmem_handle = op.handle; - blkif->shmem_vaddr = localaddr; - return 0; + if (op.handle < 0) { + DPRINTK(" Grant table operation failure !\n"); + return op.handle; + } + + blkif->shmem_ref = shared_page; + blkif->shmem_handle = op.handle; + + return 0; } static void unmap_frontend_page(blkif_t *blkif) { - struct gnttab_unmap_grant_ref op; + struct gnttab_unmap_grant_ref op; - op.host_addr = blkif->shmem_vaddr; - op.handle = blkif->shmem_handle; - op.dev_bus_addr = 0; - BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); + op.host_addr = (unsigned long)blkif->blk_ring_area->addr; + op.handle = blkif->shmem_handle; + op.dev_bus_addr = 0; + + lock_vm_area(blkif->blk_ring_area); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); + unlock_vm_area(blkif->blk_ring_area); } int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) { - struct vm_struct *vma; - blkif_sring_t *sring; - evtchn_op_t op = { .cmd = 
EVTCHNOP_bind_interdomain }; - int err; + blkif_sring_t *sring; + evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; + int err; - BUG_ON(blkif->remote_evtchn); + BUG_ON(blkif->remote_evtchn); - if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) - return -ENOMEM; + if ((blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL) + return -ENOMEM; - err = map_frontend_page(blkif, (unsigned long)vma->addr, shared_page); - if (err) { - vfree(vma->addr); - return err; - } + err = map_frontend_page(blkif, shared_page); + if (err) { + free_vm_area(blkif->blk_ring_area); + return err; + } - op.u.bind_interdomain.dom1 = DOMID_SELF; - op.u.bind_interdomain.dom2 = blkif->domid; - op.u.bind_interdomain.port1 = 0; - op.u.bind_interdomain.port2 = evtchn; - err = HYPERVISOR_event_channel_op(&op); - if (err) { - unmap_frontend_page(blkif); - vfree(vma->addr); - return err; - } + op.u.bind_interdomain.dom1 = DOMID_SELF; + op.u.bind_interdomain.dom2 = blkif->domid; + op.u.bind_interdomain.port1 = 0; + op.u.bind_interdomain.port2 = evtchn; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + return err; + } - blkif->evtchn = op.u.bind_interdomain.port1; - blkif->remote_evtchn = evtchn; - sring = (blkif_sring_t *)vma->addr; - SHARED_RING_INIT(sring); - BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE); + blkif->evtchn = op.u.bind_interdomain.port1; + blkif->remote_evtchn = evtchn; - bind_evtchn_to_irqhandler(blkif->evtchn, blkif_be_int, 0, "blkif-backend", - blkif); - blkif->status = CONNECTED; - blkif->shmem_frame = shared_page; + sring = (blkif_sring_t *)blkif->blk_ring_area->addr; + SHARED_RING_INIT(sring); + BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE); - return 0; + bind_evtchn_to_irqhandler( + blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif); + blkif->status = CONNECTED; + + return 0; } static void free_blkif(void *arg) { - evtchn_op_t op = { .cmd = EVTCHNOP_close }; - blkif_t *blkif = 
(blkif_t *)arg; + evtchn_op_t op = { .cmd = EVTCHNOP_close }; + blkif_t *blkif = (blkif_t *)arg; - op.u.close.port = blkif->evtchn; - op.u.close.dom = DOMID_SELF; - HYPERVISOR_event_channel_op(&op); - op.u.close.port = blkif->remote_evtchn; - op.u.close.dom = blkif->domid; - HYPERVISOR_event_channel_op(&op); + op.u.close.port = blkif->evtchn; + op.u.close.dom = DOMID_SELF; + HYPERVISOR_event_channel_op(&op); + op.u.close.port = blkif->remote_evtchn; + op.u.close.dom = blkif->domid; + HYPERVISOR_event_channel_op(&op); - if (blkif->evtchn) - unbind_evtchn_from_irqhandler(blkif->evtchn, blkif); + if (blkif->evtchn) + unbind_evtchn_from_irqhandler(blkif->evtchn, blkif); - if (blkif->blk_ring.sring) { - unmap_frontend_page(blkif); - vfree(blkif->blk_ring.sring); - blkif->blk_ring.sring = NULL; - } + if (blkif->blk_ring.sring) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_ring.sring = NULL; + } - kmem_cache_free(blkif_cachep, blkif); + kmem_cache_free(blkif_cachep, blkif); } void free_blkif_callback(blkif_t *blkif) { - INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif); - schedule_work(&blkif->free_work); + INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif); + schedule_work(&blkif->free_work); } void __init blkif_interface_init(void) { - blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), - 0, 0, NULL, NULL); + blkif_cachep = kmem_cache_create( + "blkif_cache", sizeof(blkif_t), 0, 0, NULL, NULL); } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c Thu Sep 22 17:42:01 2005 @@ -172,6 +172,7 @@ be->dev = dev; be->backend_watch.node = dev->nodename; be->backend_watch.callback = backend_changed; + /* 
Registration implicitly fires backend_changed once */ err = register_xenbus_watch(&be->backend_watch); if (err) { be->backend_watch.node = NULL; @@ -193,8 +194,6 @@ } dev->data = be; - - backend_changed(&be->backend_watch, dev->nodename); return 0; free_be: @@ -223,3 +222,13 @@ { xenbus_register_backend(&blkback); } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/console/console.c --- a/linux-2.6-xen-sparse/drivers/xen/console/console.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/console/console.c Thu Sep 22 17:42:01 2005 @@ -75,31 +75,33 @@ static int __init xencons_setup(char *str) { - char *q; - int n; - - if ( !strncmp(str, "ttyS", 4) ) - xc_mode = XC_SERIAL; - else if ( !strncmp(str, "tty", 3) ) - xc_mode = XC_TTY; - else if ( !strncmp(str, "off", 3) ) - xc_mode = XC_OFF; - - switch ( xc_mode ) - { - case XC_SERIAL: - n = simple_strtol( str+4, &q, 10 ); - if ( q > (str + 4) ) xc_num = n; - break; - case XC_TTY: - n = simple_strtol( str+3, &q, 10 ); - if ( q > (str + 3) ) xc_num = n; - break; - default: - break; - } - - return 1; + char *q; + int n; + + if (!strncmp(str, "ttyS", 4)) + xc_mode = XC_SERIAL; + else if (!strncmp(str, "tty", 3)) + xc_mode = XC_TTY; + else if (!strncmp(str, "off", 3)) + xc_mode = XC_OFF; + + switch ( xc_mode ) + { + case XC_SERIAL: + n = simple_strtol(str+4, &q, 10); + if (q > (str + 4)) + xc_num = n; + break; + case XC_TTY: + n = simple_strtol(str+3, &q, 10); + if (q > (str + 3)) + xc_num = n; + break; + default: + break; + } + + return 1; } __setup("xencons=", xencons_setup); @@ -111,11 +113,11 @@ static int __init xencons_bufsz_setup(char *str) { - unsigned int goal; - goal = simple_strtoul(str, NULL, 0); - while ( wbuf_size < goal ) - wbuf_size <<= 1; - return 1; + unsigned int goal; + goal = simple_strtoul(str, NULL, 0); + while 
(wbuf_size < goal) + wbuf_size <<= 1; + return 1; } __setup("xencons_bufsz=", xencons_bufsz_setup); @@ -135,57 +137,55 @@ /******************** Kernel console driver ********************************/ static void kcons_write( - struct console *c, const char *s, unsigned int count) -{ - int i; - unsigned long flags; - - spin_lock_irqsave(&xencons_lock, flags); + struct console *c, const char *s, unsigned int count) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); - for ( i = 0; i < count; i++ ) - { - if ( (wp - wc) >= (wbuf_size - 1) ) - break; - if ( (wbuf[WBUF_MASK(wp++)] = s[i]) == '\n' ) - wbuf[WBUF_MASK(wp++)] = '\r'; - } - - __xencons_tx_flush(); - - spin_unlock_irqrestore(&xencons_lock, flags); + for (i = 0; i < count; i++) { + if ((wp - wc) >= (wbuf_size - 1)) + break; + if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n') + wbuf[WBUF_MASK(wp++)] = '\r'; + } + + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); } static void kcons_write_dom0( - struct console *c, const char *s, unsigned int count) -{ - int rc; - - while ( (count > 0) && - ((rc = HYPERVISOR_console_io( - CONSOLEIO_write, count, (char *)s)) > 0) ) - { - count -= rc; - s += rc; - } + struct console *c, const char *s, unsigned int count) +{ + int rc; + + while ((count > 0) && + ((rc = HYPERVISOR_console_io( + CONSOLEIO_write, count, (char *)s)) > 0)) { + count -= rc; + s += rc; + } } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) static struct tty_driver *kcons_device(struct console *c, int *index) { - *index = c->index; - return xencons_driver; + *index = c->index; + return xencons_driver; } #else static kdev_t kcons_device(struct console *c) { - return MKDEV(TTY_MAJOR, (xc_mode == XC_SERIAL) ? 64 : 1); + return MKDEV(TTY_MAJOR, (xc_mode == XC_SERIAL) ? 
64 : 1); } #endif static struct console kcons_info = { - .device = kcons_device, - .flags = CON_PRINTBUFFER, - .index = -1, + .device = kcons_device, + .flags = CON_PRINTBUFFER, + .index = -1, }; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) @@ -196,44 +196,42 @@ void xen_console_init(void) #endif { - if ( xen_start_info->flags & SIF_INITDOMAIN ) - { - if ( xc_mode == XC_DEFAULT ) - xc_mode = XC_SERIAL; - kcons_info.write = kcons_write_dom0; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - if ( xc_mode == XC_SERIAL ) - kcons_info.flags |= CON_ENABLED; -#endif - } - else - { - if ( xc_mode == XC_DEFAULT ) - xc_mode = XC_TTY; - kcons_info.write = kcons_write; - } - - switch ( xc_mode ) - { - case XC_SERIAL: - strcpy(kcons_info.name, "ttyS"); - if ( xc_num == -1 ) xc_num = 0; - break; - - case XC_TTY: - strcpy(kcons_info.name, "tty"); - if ( xc_num == -1 ) xc_num = 1; - break; - - default: - return __RETCODE; - } - - wbuf = alloc_bootmem(wbuf_size); - - register_console(&kcons_info); - - return __RETCODE; + if (xen_start_info->flags & SIF_INITDOMAIN) { + if (xc_mode == XC_DEFAULT) + xc_mode = XC_SERIAL; + kcons_info.write = kcons_write_dom0; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + if (xc_mode == XC_SERIAL) + kcons_info.flags |= CON_ENABLED; +#endif + } else { + if (xc_mode == XC_DEFAULT) + xc_mode = XC_TTY; + kcons_info.write = kcons_write; + } + + switch (xc_mode) { + case XC_SERIAL: + strcpy(kcons_info.name, "ttyS"); + if (xc_num == -1) + xc_num = 0; + break; + + case XC_TTY: + strcpy(kcons_info.name, "tty"); + if (xc_num == -1) + xc_num = 1; + break; + + default: + return __RETCODE; + } + + wbuf = alloc_bootmem(wbuf_size); + + register_console(&kcons_info); + + return __RETCODE; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) console_initcall(xen_console_init); @@ -246,41 +244,40 @@ asmlinkage int xprintk(const char *fmt, ...) 
#endif { - va_list args; - int printk_len; - static char printk_buf[1024]; + va_list args; + int printk_len; + static char printk_buf[1024]; - /* Emit the output into the temporary buffer */ - va_start(args, fmt); - printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args); - va_end(args); - - /* Send the processed output directly to Xen. */ - kcons_write_dom0(NULL, printk_buf, printk_len); - - return 0; + /* Emit the output into the temporary buffer */ + va_start(args, fmt); + printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args); + va_end(args); + + /* Send the processed output directly to Xen. */ + kcons_write_dom0(NULL, printk_buf, printk_len); + + return 0; } /*** Forcibly flush console data before dying. ***/ void xencons_force_flush(void) { - int sz; - - /* Emergency console is synchronous, so there's nothing to flush. */ - if ( xen_start_info->flags & SIF_INITDOMAIN ) - return; - - - /* Spin until console data is flushed through to the domain controller. */ - while ( (wc != wp) ) - { - int sent = 0; - if ( (sz = wp - wc) == 0 ) - continue; - sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); - if (sent > 0) - wc += sent; - } + int sz; + + /* Emergency console is synchronous, so there's nothing to flush. */ + if (xen_start_info->flags & SIF_INITDOMAIN) + return; + + + /* Spin until console data is flushed through to the daemon. */ + while (wc != wp) { + int sent = 0; + if ((sz = wp - wc) == 0) + continue; + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); + if (sent > 0) + wc += sent; + } } @@ -305,362 +302,358 @@ /* Non-privileged receive callback. 
*/ static void xencons_rx(char *buf, unsigned len, struct pt_regs *regs) { - int i; - unsigned long flags; - - spin_lock_irqsave(&xencons_lock, flags); - if ( xencons_tty != NULL ) - { - for ( i = 0; i < len; i++ ) { + int i; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + if (xencons_tty == NULL) + goto out; + + for (i = 0; i < len; i++) { #ifdef CONFIG_MAGIC_SYSRQ - if (sysrq_enabled) { - if (buf[i] == '\x0f') { /* ^O */ - sysrq_requested = jiffies; - continue; /* don't print the sysrq key */ - } else if (sysrq_requested) { - unsigned long sysrq_timeout = sysrq_requested + HZ*2; - sysrq_requested = 0; - /* if it's been less than a timeout, do the sysrq */ - if (time_before(jiffies, sysrq_timeout)) { - spin_unlock_irqrestore(&xencons_lock, flags); - handle_sysrq(buf[i], regs, xencons_tty); - spin_lock_irqsave(&xencons_lock, flags); - continue; - } - } - } -#endif - tty_insert_flip_char(xencons_tty, buf[i], 0); - } - tty_flip_buffer_push(xencons_tty); - } - spin_unlock_irqrestore(&xencons_lock, flags); - + if (sysrq_enabled) { + if (buf[i] == '\x0f') { /* ^O */ + sysrq_requested = jiffies; + continue; /* don't print the sysrq key */ + } else if (sysrq_requested) { + unsigned long sysrq_timeout = + sysrq_requested + HZ*2; + sysrq_requested = 0; + if (time_before(jiffies, sysrq_timeout)) { + spin_unlock_irqrestore( + &xencons_lock, flags); + handle_sysrq( + buf[i], regs, xencons_tty); + spin_lock_irqsave( + &xencons_lock, flags); + continue; + } + } + } +#endif + tty_insert_flip_char(xencons_tty, buf[i], 0); + } + tty_flip_buffer_push(xencons_tty); + + out: + spin_unlock_irqrestore(&xencons_lock, flags); } /* Privileged and non-privileged transmit worker. 
*/ static void __xencons_tx_flush(void) { - int sz, work_done = 0; - - if ( xen_start_info->flags & SIF_INITDOMAIN ) - { - if ( x_char ) - { - kcons_write_dom0(NULL, &x_char, 1); - x_char = 0; - work_done = 1; - } - - while ( wc != wp ) - { - sz = wp - wc; - if ( sz > (wbuf_size - WBUF_MASK(wc)) ) - sz = wbuf_size - WBUF_MASK(wc); - kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz); - wc += sz; - work_done = 1; - } - } - else - { - while ( x_char ) - { - if (xencons_ring_send(&x_char, 1) == 1) { - x_char = 0; - work_done = 1; - } - } - - while ( wc != wp ) - { - int sent; - sz = wp - wc; - if ( sz > (wbuf_size - WBUF_MASK(wc)) ) - sz = wbuf_size - WBUF_MASK(wc); - sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); - if ( sent > 0 ) { - wc += sent; - work_done = 1; - } - } - } - - if ( work_done && (xencons_tty != NULL) ) - { - wake_up_interruptible(&xencons_tty->write_wait); - if ( (xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && - (xencons_tty->ldisc.write_wakeup != NULL) ) - (xencons_tty->ldisc.write_wakeup)(xencons_tty); - } + int sz, work_done = 0; + + if (xen_start_info->flags & SIF_INITDOMAIN) { + if (x_char) { + kcons_write_dom0(NULL, &x_char, 1); + x_char = 0; + work_done = 1; + } + + while (wc != wp) { + sz = wp - wc; + if (sz > (wbuf_size - WBUF_MASK(wc))) + sz = wbuf_size - WBUF_MASK(wc); + kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz); + wc += sz; + work_done = 1; + } + } else { + while (x_char) { + if (xencons_ring_send(&x_char, 1) == 1) { + x_char = 0; + work_done = 1; + } + } + + while (wc != wp) { + int sent; + sz = wp - wc; + if (sz > (wbuf_size - WBUF_MASK(wc))) + sz = wbuf_size - WBUF_MASK(wc); + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); + if (sent > 0) { + wc += sent; + work_done = 1; + } + } + } + + if (work_done && (xencons_tty != NULL)) + { + wake_up_interruptible(&xencons_tty->write_wait); + if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && + (xencons_tty->ldisc.write_wakeup != NULL)) + 
(xencons_tty->ldisc.write_wakeup)(xencons_tty); + } } /* Privileged receive callback and transmit kicker. */ static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - static char rbuf[16]; - int i, l; - unsigned long flags; - - spin_lock_irqsave(&xencons_lock, flags); - - if ( xencons_tty != NULL ) - { - /* Receive work. */ - while ( (l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0 ) - for ( i = 0; i < l; i++ ) - tty_insert_flip_char(xencons_tty, rbuf[i], 0); - if ( xencons_tty->flip.count != 0 ) - tty_flip_buffer_push(xencons_tty); - } - - /* Transmit work. */ - __xencons_tx_flush(); - - spin_unlock_irqrestore(&xencons_lock, flags); - - return IRQ_HANDLED; + static char rbuf[16]; + int i, l; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + + if (xencons_tty != NULL) + { + /* Receive work. */ + while ((l = HYPERVISOR_console_io( + CONSOLEIO_read, 16, rbuf)) > 0) + for (i = 0; i < l; i++) + tty_insert_flip_char(xencons_tty, rbuf[i], 0); + if (xencons_tty->flip.count != 0) + tty_flip_buffer_push(xencons_tty); + } + + /* Transmit work. 
*/ + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); + + return IRQ_HANDLED; } static int xencons_write_room(struct tty_struct *tty) { - return wbuf_size - (wp - wc); + return wbuf_size - (wp - wc); } static int xencons_chars_in_buffer(struct tty_struct *tty) { - return wp - wc; + return wp - wc; } static void xencons_send_xchar(struct tty_struct *tty, char ch) { - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return; - - spin_lock_irqsave(&xencons_lock, flags); - x_char = ch; - __xencons_tx_flush(); - spin_unlock_irqrestore(&xencons_lock, flags); + unsigned long flags; + + if (TTY_INDEX(tty) != 0) + return; + + spin_lock_irqsave(&xencons_lock, flags); + x_char = ch; + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); } static void xencons_throttle(struct tty_struct *tty) { - if ( TTY_INDEX(tty) != 0 ) - return; - - if ( I_IXOFF(tty) ) - xencons_send_xchar(tty, STOP_CHAR(tty)); + if (TTY_INDEX(tty) != 0) + return; + + if (I_IXOFF(tty)) + xencons_send_xchar(tty, STOP_CHAR(tty)); } static void xencons_unthrottle(struct tty_struct *tty) { - if ( TTY_INDEX(tty) != 0 ) - return; - - if ( I_IXOFF(tty) ) - { - if ( x_char != 0 ) - x_char = 0; - else - xencons_send_xchar(tty, START_CHAR(tty)); - } + if (TTY_INDEX(tty) != 0) + return; + + if (I_IXOFF(tty)) { + if (x_char != 0) + x_char = 0; + else + xencons_send_xchar(tty, START_CHAR(tty)); + } } static void xencons_flush_buffer(struct tty_struct *tty) { - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return; - - spin_lock_irqsave(&xencons_lock, flags); - wc = wp = 0; - spin_unlock_irqrestore(&xencons_lock, flags); + unsigned long flags; + + if (TTY_INDEX(tty) != 0) + return; + + spin_lock_irqsave(&xencons_lock, flags); + wc = wp = 0; + spin_unlock_irqrestore(&xencons_lock, flags); } static inline int __xencons_put_char(int ch) { - char _ch = (char)ch; - if ( (wp - wc) == wbuf_size ) - return 0; - wbuf[WBUF_MASK(wp++)] = _ch; - return 1; + char _ch = (char)ch; + if 
((wp - wc) == wbuf_size) + return 0; + wbuf[WBUF_MASK(wp++)] = _ch; + return 1; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) static int xencons_write( - struct tty_struct *tty, - const unsigned char *buf, - int count) -{ - int i; - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return count; - - spin_lock_irqsave(&xencons_lock, flags); - - for ( i = 0; i < count; i++ ) - if ( !__xencons_put_char(buf[i]) ) - break; - - if ( i != 0 ) - __xencons_tx_flush(); - - spin_unlock_irqrestore(&xencons_lock, flags); - - return i; + struct tty_struct *tty, + const unsigned char *buf, + int count) +{ + int i; + unsigned long flags; + + if (TTY_INDEX(tty) != 0) + return count; + + spin_lock_irqsave(&xencons_lock, flags); + + for (i = 0; i < count; i++) + if (!__xencons_put_char(buf[i])) + break; + + if (i != 0) + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); + + return i; } #else static int xencons_write( - struct tty_struct *tty, - int from_user, - const u_char *buf, - int count) -{ - int i; - unsigned long flags; - - if ( from_user && verify_area(VERIFY_READ, buf, count) ) - return -EINVAL; - - if ( TTY_INDEX(tty) != 0 ) - return count; - - spin_lock_irqsave(&xencons_lock, flags); - - for ( i = 0; i < count; i++ ) - { - char ch; - if ( from_user ) - __get_user(ch, buf + i); - else - ch = buf[i]; - if ( !__xencons_put_char(ch) ) - break; - } - - if ( i != 0 ) - __xencons_tx_flush(); - - spin_unlock_irqrestore(&xencons_lock, flags); - - return i; + struct tty_struct *tty, + int from_user, + const u_char *buf, + int count) +{ + int i; + unsigned long flags; + + if (from_user && verify_area(VERIFY_READ, buf, count)) + return -EINVAL; + + if (TTY_INDEX(tty) != 0) + return count; + + spin_lock_irqsave(&xencons_lock, flags); + + for (i = 0; i < count; i++) { + char ch; + if (from_user) + __get_user(ch, buf + i); + else + ch = buf[i]; + if (!__xencons_put_char(ch)) + break; + } + + if (i != 0) + __xencons_tx_flush(); + + 
spin_unlock_irqrestore(&xencons_lock, flags); + + return i; } #endif static void xencons_put_char(struct tty_struct *tty, u_char ch) { - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return; - - spin_lock_irqsave(&xencons_lock, flags); - (void)__xencons_put_char(ch); - spin_unlock_irqrestore(&xencons_lock, flags); + unsigned long flags; + + if (TTY_INDEX(tty) != 0) + return; + + spin_lock_irqsave(&xencons_lock, flags); + (void)__xencons_put_char(ch); + spin_unlock_irqrestore(&xencons_lock, flags); } static void xencons_flush_chars(struct tty_struct *tty) { - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return; - - spin_lock_irqsave(&xencons_lock, flags); - __xencons_tx_flush(); - spin_unlock_irqrestore(&xencons_lock, flags); + unsigned long flags; + + if (TTY_INDEX(tty) != 0) + return; + + spin_lock_irqsave(&xencons_lock, flags); + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); } static void xencons_wait_until_sent(struct tty_struct *tty, int timeout) { - unsigned long orig_jiffies = jiffies; - - if ( TTY_INDEX(tty) != 0 ) - return; - - while ( DRV(tty->driver)->chars_in_buffer(tty) ) - { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); - if ( signal_pending(current) ) - break; - if ( (timeout != 0) && time_after(jiffies, orig_jiffies + timeout) ) - break; - } + unsigned long orig_jiffies = jiffies; + + if (TTY_INDEX(tty) != 0) + return; + + while (DRV(tty->driver)->chars_in_buffer(tty)) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + if (signal_pending(current)) + break; + if ( (timeout != 0) && + time_after(jiffies, orig_jiffies + timeout) ) + break; + } - set_current_state(TASK_RUNNING); + set_current_state(TASK_RUNNING); } static int xencons_open(struct tty_struct *tty, struct file *filp) { - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return 0; - - spin_lock_irqsave(&xencons_lock, flags); - tty->driver_data = NULL; - if ( xencons_tty == NULL ) - xencons_tty = tty; - 
__xencons_tx_flush(); - spin_unlock_irqrestore(&xencons_lock, flags); - - return 0; + unsigned long flags; + + if (TTY_INDEX(tty) != 0) + return 0; + + spin_lock_irqsave(&xencons_lock, flags); + tty->driver_data = NULL; + if (xencons_tty == NULL) + xencons_tty = tty; + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); + + return 0; } static void xencons_close(struct tty_struct *tty, struct file *filp) { - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return; - - if ( tty->count == 1 ) - { - tty->closing = 1; - tty_wait_until_sent(tty, 0); - if ( DRV(tty->driver)->flush_buffer != NULL ) - DRV(tty->driver)->flush_buffer(tty); - if ( tty->ldisc.flush_buffer != NULL ) - tty->ldisc.flush_buffer(tty); - tty->closing = 0; - spin_lock_irqsave(&xencons_lock, flags); - xencons_tty = NULL; - spin_unlock_irqrestore(&xencons_lock, flags); - } + unsigned long flags; + + if (TTY_INDEX(tty) != 0) + return; + + if (tty->count == 1) { + tty->closing = 1; + tty_wait_until_sent(tty, 0); + if (DRV(tty->driver)->flush_buffer != NULL) + DRV(tty->driver)->flush_buffer(tty); + if (tty->ldisc.flush_buffer != NULL) + tty->ldisc.flush_buffer(tty); + tty->closing = 0; + spin_lock_irqsave(&xencons_lock, flags); + xencons_tty = NULL; + spin_unlock_irqrestore(&xencons_lock, flags); + } } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) static struct tty_operations xencons_ops = { - .open = xencons_open, - .close = xencons_close, - .write = xencons_write, - .write_room = xencons_write_room, - .put_char = xencons_put_char, - .flush_chars = xencons_flush_chars, - .chars_in_buffer = xencons_chars_in_buffer, - .send_xchar = xencons_send_xchar, - .flush_buffer = xencons_flush_buffer, - .throttle = xencons_throttle, - .unthrottle = xencons_unthrottle, - .wait_until_sent = xencons_wait_until_sent, + .open = xencons_open, + .close = xencons_close, + .write = xencons_write, + .write_room = xencons_write_room, + .put_char = xencons_put_char, + .flush_chars = xencons_flush_chars, + 
.chars_in_buffer = xencons_chars_in_buffer, + .send_xchar = xencons_send_xchar, + .flush_buffer = xencons_flush_buffer, + .throttle = xencons_throttle, + .unthrottle = xencons_unthrottle, + .wait_until_sent = xencons_wait_until_sent, }; #ifdef CONFIG_XEN_PRIVILEGED_GUEST static const char *xennullcon_startup(void) { - return NULL; + return NULL; } static int xennullcon_dummy(void) { - return 0; + return 0; } #define DUMMY (void *)xennullcon_dummy @@ -672,122 +665,128 @@ */ const struct consw xennull_con = { - .owner = THIS_MODULE, - .con_startup = xennullcon_startup, - .con_init = DUMMY, - .con_deinit = DUMMY, - .con_clear = DUMMY, - .con_putc = DUMMY, - .con_putcs = DUMMY, - .con_cursor = DUMMY, - .con_scroll = DUMMY, - .con_bmove = DUMMY, - .con_switch = DUMMY, - .con_blank = DUMMY, - .con_font_set = DUMMY, - .con_font_get = DUMMY, - .con_font_default = DUMMY, - .con_font_copy = DUMMY, - .con_set_palette = DUMMY, - .con_scrolldelta = DUMMY, + .owner = THIS_MODULE, + .con_startup = xennullcon_startup, + .con_init = DUMMY, + .con_deinit = DUMMY, + .con_clear = DUMMY, + .con_putc = DUMMY, + .con_putcs = DUMMY, + .con_cursor = DUMMY, + .con_scroll = DUMMY, + .con_bmove = DUMMY, + .con_switch = DUMMY, + .con_blank = DUMMY, + .con_font_set = DUMMY, + .con_font_get = DUMMY, + .con_font_default = DUMMY, + .con_font_copy = DUMMY, + .con_set_palette = DUMMY, + .con_scrolldelta = DUMMY, }; #endif #endif static int __init xencons_init(void) { - int rc; - - if ( xc_mode == XC_OFF ) - return 0; - - xencons_ring_init(); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - xencons_driver = alloc_tty_driver((xc_mode == XC_SERIAL) ? - 1 : MAX_NR_CONSOLES); - if ( xencons_driver == NULL ) - return -ENOMEM; + int rc; + + if (xc_mode == XC_OFF) + return 0; + + xencons_ring_init(); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + xencons_driver = alloc_tty_driver((xc_mode == XC_SERIAL) ? 
+ 1 : MAX_NR_CONSOLES); + if (xencons_driver == NULL) + return -ENOMEM; #else - memset(&xencons_driver, 0, sizeof(struct tty_driver)); - xencons_driver.magic = TTY_DRIVER_MAGIC; - xencons_driver.refcount = &xencons_refcount; - xencons_driver.table = xencons_table; - xencons_driver.num = (xc_mode == XC_SERIAL) ? 1 : MAX_NR_CONSOLES; -#endif - - DRV(xencons_driver)->major = TTY_MAJOR; - DRV(xencons_driver)->type = TTY_DRIVER_TYPE_SERIAL; - DRV(xencons_driver)->subtype = SERIAL_TYPE_NORMAL; - DRV(xencons_driver)->init_termios = tty_std_termios; - DRV(xencons_driver)->flags = - TTY_DRIVER_REAL_RAW | TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_NO_DEVFS; - DRV(xencons_driver)->termios = xencons_termios; - DRV(xencons_driver)->termios_locked = xencons_termios_locked; - - if ( xc_mode == XC_SERIAL ) - { - DRV(xencons_driver)->name = "ttyS"; - DRV(xencons_driver)->minor_start = 64 + xc_num; - DRV(xencons_driver)->name_base = 0 + xc_num; - } - else - { - DRV(xencons_driver)->name = "tty"; - DRV(xencons_driver)->minor_start = xc_num; - DRV(xencons_driver)->name_base = xc_num; - } - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - tty_set_operations(xencons_driver, &xencons_ops); + memset(&xencons_driver, 0, sizeof(struct tty_driver)); + xencons_driver.magic = TTY_DRIVER_MAGIC; + xencons_driver.refcount = &xencons_refcount; + xencons_driver.table = xencons_table; + xencons_driver.num = + (xc_mode == XC_SERIAL) ? 
1 : MAX_NR_CONSOLES; +#endif + + DRV(xencons_driver)->major = TTY_MAJOR; + DRV(xencons_driver)->type = TTY_DRIVER_TYPE_SERIAL; + DRV(xencons_driver)->subtype = SERIAL_TYPE_NORMAL; + DRV(xencons_driver)->init_termios = tty_std_termios; + DRV(xencons_driver)->flags = + TTY_DRIVER_REAL_RAW | + TTY_DRIVER_RESET_TERMIOS | + TTY_DRIVER_NO_DEVFS; + DRV(xencons_driver)->termios = xencons_termios; + DRV(xencons_driver)->termios_locked = xencons_termios_locked; + + if (xc_mode == XC_SERIAL) + { + DRV(xencons_driver)->name = "ttyS"; + DRV(xencons_driver)->minor_start = 64 + xc_num; + DRV(xencons_driver)->name_base = 0 + xc_num; + } else { + DRV(xencons_driver)->name = "tty"; + DRV(xencons_driver)->minor_start = xc_num; + DRV(xencons_driver)->name_base = xc_num; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + tty_set_operations(xencons_driver, &xencons_ops); #else - xencons_driver.open = xencons_open; - xencons_driver.close = xencons_close; - xencons_driver.write = xencons_write; - xencons_driver.write_room = xencons_write_room; - xencons_driver.put_char = xencons_put_char; - xencons_driver.flush_chars = xencons_flush_chars; - xencons_driver.chars_in_buffer = xencons_chars_in_buffer; - xencons_driver.send_xchar = xencons_send_xchar; - xencons_driver.flush_buffer = xencons_flush_buffer; - xencons_driver.throttle = xencons_throttle; - xencons_driver.unthrottle = xencons_unthrottle; - xencons_driver.wait_until_sent = xencons_wait_until_sent; -#endif - - if ( (rc = tty_register_driver(DRV(xencons_driver))) != 0 ) - { - printk("WARNING: Failed to register Xen virtual " - "console driver as '%s%d'\n", - DRV(xencons_driver)->name, DRV(xencons_driver)->name_base); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - put_tty_driver(xencons_driver); - xencons_driver = NULL; -#endif - return rc; - } - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - tty_register_device(xencons_driver, 0, NULL); -#endif - - if ( xen_start_info->flags & SIF_INITDOMAIN ) - { - xencons_priv_irq = 
bind_virq_to_irq(VIRQ_CONSOLE); - (void)request_irq(xencons_priv_irq, - xencons_priv_interrupt, 0, "console", NULL); - } - else - { - - xencons_ring_register_receiver(xencons_rx); - } - - printk("Xen virtual console successfully installed as %s%d\n", - DRV(xencons_driver)->name, - DRV(xencons_driver)->name_base ); + xencons_driver.open = xencons_open; + xencons_driver.close = xencons_close; + xencons_driver.write = xencons_write; + xencons_driver.write_room = xencons_write_room; + xencons_driver.put_char = xencons_put_char; + xencons_driver.flush_chars = xencons_flush_chars; + xencons_driver.chars_in_buffer = xencons_chars_in_buffer; + xencons_driver.send_xchar = xencons_send_xchar; + xencons_driver.flush_buffer = xencons_flush_buffer; + xencons_driver.throttle = xencons_throttle; + xencons_driver.unthrottle = xencons_unthrottle; + xencons_driver.wait_until_sent = xencons_wait_until_sent; +#endif + + if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) { + printk("WARNING: Failed to register Xen virtual " + "console driver as '%s%d'\n", + DRV(xencons_driver)->name, DRV(xencons_driver)->name_base); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + put_tty_driver(xencons_driver); + xencons_driver = NULL; +#endif + return rc; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + tty_register_device(xencons_driver, 0, NULL); +#endif + + if (xen_start_info->flags & SIF_INITDOMAIN) { + xencons_priv_irq = bind_virq_to_irq(VIRQ_CONSOLE); + (void)request_irq(xencons_priv_irq, + xencons_priv_interrupt, 0, "console", NULL); + } else { + xencons_ring_register_receiver(xencons_rx); + } + + printk("Xen virtual console successfully installed as %s%d\n", + DRV(xencons_driver)->name, + DRV(xencons_driver)->name_base ); - return 0; + return 0; } module_init(xencons_init); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 
linux-2.6-xen-sparse/drivers/xen/console/xencons_ring.c --- a/linux-2.6-xen-sparse/drivers/xen/console/xencons_ring.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/console/xencons_ring.c Thu Sep 22 17:42:01 2005 @@ -36,13 +36,12 @@ static inline struct ring_head *outring(void) { - return machine_to_virt(xen_start_info->console_mfn << PAGE_SHIFT); + return mfn_to_virt(xen_start_info->console_mfn); } static inline struct ring_head *inring(void) { - return machine_to_virt(xen_start_info->console_mfn << PAGE_SHIFT) - + PAGE_SIZE/2; + return mfn_to_virt(xen_start_info->console_mfn) + PAGE_SIZE/2; } @@ -126,3 +125,13 @@ (void)xencons_ring_init(); } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/console/xencons_ring.h --- a/linux-2.6-xen-sparse/drivers/xen/console/xencons_ring.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/console/xencons_ring.h Thu Sep 22 17:42:01 2005 @@ -3,12 +3,21 @@ asmlinkage int xprintk(const char *fmt, ...); - int xencons_ring_init(void); int xencons_ring_send(const char *data, unsigned len); -typedef void (xencons_receiver_func)(char *buf, unsigned len, - struct pt_regs *regs); +typedef void (xencons_receiver_func)( + char *buf, unsigned len, struct pt_regs *regs); void xencons_ring_register_receiver(xencons_receiver_func *f); #endif /* _XENCONS_RING_H */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/evtchn/evtchn.c --- a/linux-2.6-xen-sparse/drivers/xen/evtchn/evtchn.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/evtchn/evtchn.c Thu Sep 22 17:42:01 2005 @@ -1,9 +1,9 @@ 
/****************************************************************************** * evtchn.c * - * Xenolinux driver for receiving and demuxing event-channel signals. - * - * Copyright (c) 2004, K A Fraser + * Driver for receiving and demuxing event-channel signals. + * + * Copyright (c) 2004-2005, K A Fraser * Multi-process extensions Copyright (c) 2004, Steven Smith * * This file may be distributed separately from the Linux kernel, or @@ -46,29 +46,18 @@ #include <linux/init.h> #define XEN_EVTCHN_MASK_OPS #include <asm-xen/evtchn.h> - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -#include <linux/devfs_fs_kernel.h> -#define OLD_DEVFS -#else #include <linux/gfp.h> -#endif - -#ifdef OLD_DEVFS -/* NB. This must be shared amongst drivers if more things go in /dev/xen */ -static devfs_handle_t xen_dev_dir; -#endif struct per_user_data { - /* Notification ring, accessed via /dev/xen/evtchn. */ -# define EVTCHN_RING_SIZE 2048 /* 2048 16-bit entries */ -# define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) - u16 *ring; - unsigned int ring_cons, ring_prod, ring_overflow; - - /* Processes wait on this queue when ring is empty. */ - wait_queue_head_t evtchn_wait; - struct fasync_struct *evtchn_async_queue; + /* Notification ring, accessed via /dev/xen/evtchn. */ +#define EVTCHN_RING_SIZE 2048 /* 2048 16-bit entries */ +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) + u16 *ring; + unsigned int ring_cons, ring_prod, ring_overflow; + + /* Processes wait on this queue when ring is empty. */ + wait_queue_head_t evtchn_wait; + struct fasync_struct *evtchn_async_queue; }; /* Who's bound to each port? 
*/ @@ -77,356 +66,310 @@ void evtchn_device_upcall(int port) { - struct per_user_data *u; - - spin_lock(&port_user_lock); - - mask_evtchn(port); - clear_evtchn(port); - - if ( (u = port_user[port]) != NULL ) - { - if ( (u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE ) - { - u->ring[EVTCHN_RING_MASK(u->ring_prod)] = (u16)port; - if ( u->ring_cons == u->ring_prod++ ) - { - wake_up_interruptible(&u->evtchn_wait); - kill_fasync(&u->evtchn_async_queue, SIGIO, POLL_IN); - } - } - else - { - u->ring_overflow = 1; - } - } - - spin_unlock(&port_user_lock); + struct per_user_data *u; + + spin_lock(&port_user_lock); + + mask_evtchn(port); + clear_evtchn(port); + + if ((u = port_user[port]) != NULL) { + if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = (u16)port; + if (u->ring_cons == u->ring_prod++) { + wake_up_interruptible(&u->evtchn_wait); + kill_fasync(&u->evtchn_async_queue, SIGIO, POLL_IN); + } + } else { + u->ring_overflow = 1; + } + } + + spin_unlock(&port_user_lock); } static ssize_t evtchn_read(struct file *file, char *buf, size_t count, loff_t *ppos) { - int rc; - unsigned int c, p, bytes1 = 0, bytes2 = 0; - DECLARE_WAITQUEUE(wait, current); - struct per_user_data *u = file->private_data; - - add_wait_queue(&u->evtchn_wait, &wait); - - count &= ~1; /* even number of bytes */ - - if ( count == 0 ) - { - rc = 0; - goto out; - } - - if ( count > PAGE_SIZE ) - count = PAGE_SIZE; - - for ( ; ; ) - { - set_current_state(TASK_INTERRUPTIBLE); - - if ( (c = u->ring_cons) != (p = u->ring_prod) ) - break; - - if ( u->ring_overflow ) - { - rc = -EFBIG; - goto out; - } - - if ( file->f_flags & O_NONBLOCK ) - { - rc = -EAGAIN; - goto out; - } - - if ( signal_pending(current) ) - { - rc = -ERESTARTSYS; - goto out; - } - - schedule(); - } - - /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. 
*/ - if ( ((c ^ p) & EVTCHN_RING_SIZE) != 0 ) - { - bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * sizeof(u16); - bytes2 = EVTCHN_RING_MASK(p) * sizeof(u16); - } - else - { - bytes1 = (p - c) * sizeof(u16); - bytes2 = 0; - } - - /* Truncate chunks according to caller's maximum byte count. */ - if ( bytes1 > count ) - { - bytes1 = count; - bytes2 = 0; - } - else if ( (bytes1 + bytes2) > count ) - { - bytes2 = count - bytes1; - } - - if ( copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) || - ((bytes2 != 0) && copy_to_user(&buf[bytes1], &u->ring[0], bytes2)) ) - { - rc = -EFAULT; - goto out; - } - - u->ring_cons += (bytes1 + bytes2) / sizeof(u16); - - rc = bytes1 + bytes2; + int rc; + unsigned int c, p, bytes1 = 0, bytes2 = 0; + DECLARE_WAITQUEUE(wait, current); + struct per_user_data *u = file->private_data; + + add_wait_queue(&u->evtchn_wait, &wait); + + count &= ~1; /* even number of bytes */ + + if (count == 0) { + rc = 0; + goto out; + } + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if ((c = u->ring_cons) != (p = u->ring_prod)) + break; + + if (u->ring_overflow) { + rc = -EFBIG; + goto out; + } + + if (file->f_flags & O_NONBLOCK) { + rc = -EAGAIN; + goto out; + } + + if (signal_pending(current)) { + rc = -ERESTARTSYS; + goto out; + } + + schedule(); + } + + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ + if (((c ^ p) & EVTCHN_RING_SIZE) != 0) { + bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * + sizeof(u16); + bytes2 = EVTCHN_RING_MASK(p) * sizeof(u16); + } else { + bytes1 = (p - c) * sizeof(u16); + bytes2 = 0; + } + + /* Truncate chunks according to caller's maximum byte count. 
*/ + if (bytes1 > count) { + bytes1 = count; + bytes2 = 0; + } else if ((bytes1 + bytes2) > count) { + bytes2 = count - bytes1; + } + + if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) || + ((bytes2 != 0) && + copy_to_user(&buf[bytes1], &u->ring[0], bytes2))) { + rc = -EFAULT; + goto out; + } + + u->ring_cons += (bytes1 + bytes2) / sizeof(u16); + + rc = bytes1 + bytes2; out: - __set_current_state(TASK_RUNNING); - remove_wait_queue(&u->evtchn_wait, &wait); - return rc; + __set_current_state(TASK_RUNNING); + remove_wait_queue(&u->evtchn_wait, &wait); + return rc; } static ssize_t evtchn_write(struct file *file, const char *buf, size_t count, loff_t *ppos) { - int rc, i; - u16 *kbuf = (u16 *)__get_free_page(GFP_KERNEL); - struct per_user_data *u = file->private_data; - - if ( kbuf == NULL ) - return -ENOMEM; - - count &= ~1; /* even number of bytes */ - - if ( count == 0 ) - { - rc = 0; - goto out; - } - - if ( count > PAGE_SIZE ) - count = PAGE_SIZE; - - if ( copy_from_user(kbuf, buf, count) != 0 ) - { - rc = -EFAULT; - goto out; - } - - spin_lock_irq(&port_user_lock); - for ( i = 0; i < (count/2); i++ ) - if ( (kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u) ) - unmask_evtchn(kbuf[i]); - spin_unlock_irq(&port_user_lock); - - rc = count; + int rc, i; + u16 *kbuf = (u16 *)__get_free_page(GFP_KERNEL); + struct per_user_data *u = file->private_data; + + if (kbuf == NULL) + return -ENOMEM; + + count &= ~1; /* even number of bytes */ + + if (count == 0) { + rc = 0; + goto out; + } + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + if (copy_from_user(kbuf, buf, count) != 0) { + rc = -EFAULT; + goto out; + } + + spin_lock_irq(&port_user_lock); + for (i = 0; i < (count/2); i++) + if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u)) + unmask_evtchn(kbuf[i]); + spin_unlock_irq(&port_user_lock); + + rc = count; out: - free_page((unsigned long)kbuf); - return rc; + free_page((unsigned long)kbuf); + return rc; } static int evtchn_ioctl(struct 
inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { - int rc = 0; - struct per_user_data *u = file->private_data; - - spin_lock_irq(&port_user_lock); + int rc = 0; + struct per_user_data *u = file->private_data; + + spin_lock_irq(&port_user_lock); - switch ( cmd ) - { - case EVTCHN_RESET: - /* Initialise the ring to empty. Clear errors. */ - u->ring_cons = u->ring_prod = u->ring_overflow = 0; - break; - - case EVTCHN_BIND: - if ( arg >= NR_EVENT_CHANNELS ) - { - rc = -EINVAL; - } - else if ( port_user[arg] != NULL ) - { - rc = -EISCONN; - } - else - { - port_user[arg] = u; - unmask_evtchn(arg); - } - break; - - case EVTCHN_UNBIND: - if ( arg >= NR_EVENT_CHANNELS ) - { - rc = -EINVAL; - } - else if ( port_user[arg] != u ) - { - rc = -ENOTCONN; - } - else - { - port_user[arg] = NULL; - mask_evtchn(arg); - } - break; - - default: - rc = -ENOSYS; - break; - } - - spin_unlock_irq(&port_user_lock); - - return rc; + switch (cmd) { + case EVTCHN_RESET: + /* Initialise the ring to empty. Clear errors. 
*/ + u->ring_cons = u->ring_prod = u->ring_overflow = 0; + break; + + case EVTCHN_BIND: + if (arg >= NR_EVENT_CHANNELS) { + rc = -EINVAL; + } else if (port_user[arg] != NULL) { + rc = -EISCONN; + } else { + port_user[arg] = u; + unmask_evtchn(arg); + } + break; + + case EVTCHN_UNBIND: + if (arg >= NR_EVENT_CHANNELS) { + rc = -EINVAL; + } else if (port_user[arg] != u) { + rc = -ENOTCONN; + } else { + port_user[arg] = NULL; + mask_evtchn(arg); + } + break; + + default: + rc = -ENOSYS; + break; + } + + spin_unlock_irq(&port_user_lock); + + return rc; } static unsigned int evtchn_poll(struct file *file, poll_table *wait) { - unsigned int mask = POLLOUT | POLLWRNORM; - struct per_user_data *u = file->private_data; - - poll_wait(file, &u->evtchn_wait, wait); - if ( u->ring_cons != u->ring_prod ) - mask |= POLLIN | POLLRDNORM; - if ( u->ring_overflow ) - mask = POLLERR; - return mask; + unsigned int mask = POLLOUT | POLLWRNORM; + struct per_user_data *u = file->private_data; + + poll_wait(file, &u->evtchn_wait, wait); + if (u->ring_cons != u->ring_prod) + mask |= POLLIN | POLLRDNORM; + if (u->ring_overflow) + mask = POLLERR; + return mask; } static int evtchn_fasync(int fd, struct file *filp, int on) { - struct per_user_data *u = filp->private_data; - return fasync_helper(fd, filp, on, &u->evtchn_async_queue); + struct per_user_data *u = filp->private_data; + return fasync_helper(fd, filp, on, &u->evtchn_async_queue); } static int evtchn_open(struct inode *inode, struct file *filp) { - struct per_user_data *u; - - if ( (u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL ) - return -ENOMEM; - - memset(u, 0, sizeof(*u)); - init_waitqueue_head(&u->evtchn_wait); - - if ( (u->ring = (u16 *)__get_free_page(GFP_KERNEL)) == NULL ) - { - kfree(u); - return -ENOMEM; - } - - filp->private_data = u; - - return 0; + struct per_user_data *u; + + if ((u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL) + return -ENOMEM; + + memset(u, 0, sizeof(*u)); + init_waitqueue_head(&u->evtchn_wait); + + if 
((u->ring = (u16 *)__get_free_page(GFP_KERNEL)) == NULL) + { + kfree(u); + return -ENOMEM; + } + + filp->private_data = u; + + return 0; } static int evtchn_release(struct inode *inode, struct file *filp) { - int i; - struct per_user_data *u = filp->private_data; - - spin_lock_irq(&port_user_lock); - - free_page((unsigned long)u->ring); - - for ( i = 0; i < NR_EVENT_CHANNELS; i++ ) - { - if ( port_user[i] == u ) - { - port_user[i] = NULL; - mask_evtchn(i); - } - } - - spin_unlock_irq(&port_user_lock); - - kfree(u); - - return 0; + int i; + struct per_user_data *u = filp->private_data; + + spin_lock_irq(&port_user_lock); + + free_page((unsigned long)u->ring); + + for (i = 0; i < NR_EVENT_CHANNELS; i++) + { + if (port_user[i] == u) + { + port_user[i] = NULL; + mask_evtchn(i); + } + } + + spin_unlock_irq(&port_user_lock); + + kfree(u); + + return 0; } static struct file_operations evtchn_fops = { - .owner = THIS_MODULE, - .read = evtchn_read, - .write = evtchn_write, - .ioctl = evtchn_ioctl, - .poll = evtchn_poll, - .fasync = evtchn_fasync, - .open = evtchn_open, - .release = evtchn_release, + .owner = THIS_MODULE, + .read = evtchn_read, + .write = evtchn_write, + .ioctl = evtchn_ioctl, + .poll = evtchn_poll, + .fasync = evtchn_fasync, + .open = evtchn_open, + .release = evtchn_release, }; static struct miscdevice evtchn_miscdev = { - .minor = EVTCHN_MINOR, - .name = "evtchn", - .fops = &evtchn_fops, + .minor = EVTCHN_MINOR, + .name = "evtchn", + .fops = &evtchn_fops, #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - .devfs_name = "misc/evtchn", + .devfs_name = "misc/evtchn", #endif }; static int __init evtchn_init(void) { -#ifdef OLD_DEVFS - devfs_handle_t symlink_handle; - int pos; - char link_dest[64]; -#endif - int err; - - spin_lock_init(&port_user_lock); - memset(port_user, 0, sizeof(port_user)); - - /* (DEVFS) create '/dev/misc/evtchn'. 
*/ - err = misc_register(&evtchn_miscdev); - if ( err != 0 ) - { - printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); - return err; - } - -#ifdef OLD_DEVFS - /* (DEVFS) create directory '/dev/xen'. */ - xen_dev_dir = devfs_mk_dir(NULL, "xen", NULL); - - /* (DEVFS) &link_dest[pos] == '../misc/evtchn'. */ - pos = devfs_generate_path(evtchn_miscdev.devfs_handle, - &link_dest[3], - sizeof(link_dest) - 3); - if ( pos >= 0 ) - strncpy(&link_dest[pos], "../", 3); - - /* (DEVFS) symlink '/dev/xen/evtchn' -> '../misc/evtchn'. */ - (void)devfs_mk_symlink(xen_dev_dir, - "evtchn", - DEVFS_FL_DEFAULT, - &link_dest[pos], - &symlink_handle, - NULL); - - /* (DEVFS) automatically destroy the symlink with its destination. */ - devfs_auto_unregister(evtchn_miscdev.devfs_handle, symlink_handle); -#endif - - printk("Event-channel device installed.\n"); - - return 0; + int err; + + spin_lock_init(&port_user_lock); + memset(port_user, 0, sizeof(port_user)); + + /* (DEVFS) create '/dev/misc/evtchn'. */ + err = misc_register(&evtchn_miscdev); + if (err != 0) + { + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + return err; + } + + printk("Event-channel device installed.\n"); + + return 0; } static void evtchn_cleanup(void) { - misc_deregister(&evtchn_miscdev); + misc_deregister(&evtchn_miscdev); } module_init(evtchn_init); module_exit(evtchn_cleanup); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/netback/common.h --- a/linux-2.6-xen-sparse/drivers/xen/netback/common.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/common.h Thu Sep 22 17:42:01 2005 @@ -18,16 +18,11 @@ #include <asm-xen/xen-public/io/netif.h> #include <asm/io.h> #include <asm/pgalloc.h> - -#ifdef CONFIG_XEN_NETDEV_GRANT #include <asm-xen/xen-public/grant_table.h> #include <asm-xen/gnttab.h> +#include 
<asm-xen/driver_util.h> #define GRANT_INVALID_REF (0xFFFF) - -#endif - - #if 0 #define ASSERT(_p) \ @@ -44,74 +39,64 @@ #define WPRINTK(fmt, args...) \ printk(KERN_WARNING "xen_net: " fmt, ##args) +typedef struct netif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; -typedef struct netif_st { - /* Unique identifier for this interface. */ - domid_t domid; - unsigned int handle; + u8 fe_dev_addr[6]; - u8 fe_dev_addr[6]; + /* Physical parameters of the comms window. */ + u16 tx_shmem_handle; + grant_ref_t tx_shmem_ref; + u16 rx_shmem_handle; + grant_ref_t rx_shmem_ref; + unsigned int evtchn; + unsigned int remote_evtchn; - /* Physical parameters of the comms window. */ - unsigned long tx_shmem_frame; -#ifdef CONFIG_XEN_NETDEV_GRANT - u16 tx_shmem_handle; - unsigned long tx_shmem_vaddr; - grant_ref_t tx_shmem_ref; -#endif - unsigned long rx_shmem_frame; -#ifdef CONFIG_XEN_NETDEV_GRANT - u16 rx_shmem_handle; - unsigned long rx_shmem_vaddr; - grant_ref_t rx_shmem_ref; -#endif - unsigned int evtchn; - unsigned int remote_evtchn; + /* The shared rings and indexes. */ + netif_tx_interface_t *tx; + netif_rx_interface_t *rx; + struct vm_struct *comms_area; - /* The shared rings and indexes. */ - netif_tx_interface_t *tx; - netif_rx_interface_t *rx; + /* Private indexes into shared ring. */ + NETIF_RING_IDX rx_req_cons; + NETIF_RING_IDX rx_resp_prod; /* private version of shared variable */ + NETIF_RING_IDX rx_resp_prod_copy; + NETIF_RING_IDX tx_req_cons; + NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */ - /* Private indexes into shared ring. 
*/ - NETIF_RING_IDX rx_req_cons; - NETIF_RING_IDX rx_resp_prod; /* private version of shared variable */ -#ifdef CONFIG_XEN_NETDEV_GRANT - NETIF_RING_IDX rx_resp_prod_copy; /* private version of shared variable */ -#endif - NETIF_RING_IDX tx_req_cons; - NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */ + /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ + unsigned long credit_bytes; + unsigned long credit_usec; + unsigned long remaining_credit; + struct timer_list credit_timeout; - /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ - unsigned long credit_bytes; - unsigned long credit_usec; - unsigned long remaining_credit; - struct timer_list credit_timeout; + /* Miscellaneous private stuff. */ + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + int active; + struct list_head list; /* scheduling list */ + atomic_t refcnt; + struct net_device *dev; + struct net_device_stats stats; - /* Miscellaneous private stuff. */ - enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; - int active; - struct list_head list; /* scheduling list */ - atomic_t refcnt; - struct net_device *dev; - struct net_device_stats stats; - - struct work_struct free_work; + struct work_struct free_work; } netif_t; void netif_creditlimit(netif_t *netif); int netif_disconnect(netif_t *netif); netif_t *alloc_netif(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]); -void free_netif_callback(netif_t *netif); +void free_netif(netif_t *netif); int netif_map(netif_t *netif, unsigned long tx_ring_ref, unsigned long rx_ring_ref, unsigned int evtchn); #define netif_get(_b) (atomic_inc(&(_b)->refcnt)) -#define netif_put(_b) \ - do { \ - if ( atomic_dec_and_test(&(_b)->refcnt) ) \ - free_netif_callback(_b); \ - } while (0) +#define netif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + free_netif(_b); \ + } while (0) void netif_xenbus_init(void); @@ -123,3 +108,13 @@ irqreturn_t netif_be_int(int irq, void *dev_id, struct 
pt_regs *regs); #endif /* __NETIF__BACKEND__COMMON_H__ */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/netback/interface.c --- a/linux-2.6-xen-sparse/drivers/xen/netback/interface.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/interface.c Thu Sep 22 17:42:01 2005 @@ -11,296 +11,293 @@ static void __netif_up(netif_t *netif) { - struct net_device *dev = netif->dev; - spin_lock_bh(&dev->xmit_lock); - netif->active = 1; - spin_unlock_bh(&dev->xmit_lock); - (void)bind_evtchn_to_irqhandler( - netif->evtchn, netif_be_int, 0, dev->name, netif); - netif_schedule_work(netif); + struct net_device *dev = netif->dev; + spin_lock_bh(&dev->xmit_lock); + netif->active = 1; + spin_unlock_bh(&dev->xmit_lock); + (void)bind_evtchn_to_irqhandler( + netif->evtchn, netif_be_int, 0, dev->name, netif); + netif_schedule_work(netif); } static void __netif_down(netif_t *netif) { - struct net_device *dev = netif->dev; - spin_lock_bh(&dev->xmit_lock); - netif->active = 0; - spin_unlock_bh(&dev->xmit_lock); - unbind_evtchn_from_irqhandler(netif->evtchn, netif); - netif_deschedule_work(netif); + struct net_device *dev = netif->dev; + spin_lock_bh(&dev->xmit_lock); + netif->active = 0; + spin_unlock_bh(&dev->xmit_lock); + unbind_evtchn_from_irqhandler(netif->evtchn, netif); + netif_deschedule_work(netif); } static int net_open(struct net_device *dev) { - netif_t *netif = netdev_priv(dev); - if (netif->status == CONNECTED) - __netif_up(netif); - netif_start_queue(dev); - return 0; + netif_t *netif = netdev_priv(dev); + if (netif->status == CONNECTED) + __netif_up(netif); + netif_start_queue(dev); + return 0; } static int net_close(struct net_device *dev) { - netif_t *netif = netdev_priv(dev); - netif_stop_queue(dev); - if (netif->status == CONNECTED) - __netif_down(netif); - return 0; + netif_t 
*netif = netdev_priv(dev); + netif_stop_queue(dev); + if (netif->status == CONNECTED) + __netif_down(netif); + return 0; } netif_t *alloc_netif(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]) { - int err = 0, i; - struct net_device *dev; - netif_t *netif; - char name[IFNAMSIZ] = {}; - - snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle); - dev = alloc_netdev(sizeof(netif_t), name, ether_setup); - if (dev == NULL) { - DPRINTK("Could not create netif: out of memory\n"); - return NULL; - } - - netif = netdev_priv(dev); - memset(netif, 0, sizeof(*netif)); - netif->domid = domid; - netif->handle = handle; - netif->status = DISCONNECTED; - atomic_set(&netif->refcnt, 0); - netif->dev = dev; - - netif->credit_bytes = netif->remaining_credit = ~0UL; - netif->credit_usec = 0UL; - init_timer(&netif->credit_timeout); - - dev->hard_start_xmit = netif_be_start_xmit; - dev->get_stats = netif_be_get_stats; - dev->open = net_open; - dev->stop = net_close; - dev->features = NETIF_F_NO_CSUM; - - /* Disable queuing. */ - dev->tx_queue_len = 0; - - for (i = 0; i < ETH_ALEN; i++) - if (be_mac[i] != 0) - break; - if (i == ETH_ALEN) { - /* - * Initialise a dummy MAC address. We choose the numerically largest - * non-broadcast address to prevent the address getting stolen by an - * Ethernet bridge for STP purposes. 
(FE:FF:FF:FF:FF:FF) - */ - memset(dev->dev_addr, 0xFF, ETH_ALEN); - dev->dev_addr[0] &= ~0x01; - } else - memcpy(dev->dev_addr, be_mac, ETH_ALEN); - - rtnl_lock(); - err = register_netdevice(dev); - rtnl_unlock(); - if (err) { - DPRINTK("Could not register new net device %s: err=%d\n", - dev->name, err); - free_netdev(dev); - return NULL; - } - - DPRINTK("Successfully created netif\n"); - return netif; -} - -static int map_frontend_pages(netif_t *netif, unsigned long localaddr, - unsigned long tx_ring_ref, - unsigned long rx_ring_ref) -{ -#ifdef CONFIG_XEN_NETDEV_GRANT - struct gnttab_map_grant_ref op; - - /* Map: Use the Grant table reference */ - op.host_addr = localaddr; - op.flags = GNTMAP_host_map; - op.ref = tx_ring_ref; - op.dom = netif->domid; + int err = 0, i; + struct net_device *dev; + netif_t *netif; + char name[IFNAMSIZ] = {}; + + snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle); + dev = alloc_netdev(sizeof(netif_t), name, ether_setup); + if (dev == NULL) { + DPRINTK("Could not create netif: out of memory\n"); + return NULL; + } + + netif = netdev_priv(dev); + memset(netif, 0, sizeof(*netif)); + netif->domid = domid; + netif->handle = handle; + netif->status = DISCONNECTED; + atomic_set(&netif->refcnt, 0); + netif->dev = dev; + + netif->credit_bytes = netif->remaining_credit = ~0UL; + netif->credit_usec = 0UL; + init_timer(&netif->credit_timeout); + + dev->hard_start_xmit = netif_be_start_xmit; + dev->get_stats = netif_be_get_stats; + dev->open = net_open; + dev->stop = net_close; + dev->features = NETIF_F_NO_CSUM; + + /* Disable queuing. */ + dev->tx_queue_len = 0; + + for (i = 0; i < ETH_ALEN; i++) + if (be_mac[i] != 0) + break; + if (i == ETH_ALEN) { + /* + * Initialise a dummy MAC address. We choose the numerically + * largest non-broadcast address to prevent the address getting + * stolen by an Ethernet bridge for STP purposes. 
+ * (FE:FF:FF:FF:FF:FF) + */ + memset(dev->dev_addr, 0xFF, ETH_ALEN); + dev->dev_addr[0] &= ~0x01; + } else + memcpy(dev->dev_addr, be_mac, ETH_ALEN); + + rtnl_lock(); + err = register_netdevice(dev); + rtnl_unlock(); + if (err) { + DPRINTK("Could not register new net device %s: err=%d\n", + dev->name, err); + free_netdev(dev); + return NULL; + } + + DPRINTK("Successfully created netif\n"); + return netif; +} + +static int map_frontend_pages( + netif_t *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref) +{ + struct gnttab_map_grant_ref op; + + op.host_addr = (unsigned long)netif->comms_area->addr; + op.flags = GNTMAP_host_map; + op.ref = tx_ring_ref; + op.dom = netif->domid; - BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); - if (op.handle < 0) { - DPRINTK(" Grant table operation failure mapping tx_ring_ref!\n"); - return op.handle; - } - - netif->tx_shmem_ref = tx_ring_ref; - netif->tx_shmem_handle = op.handle; - netif->tx_shmem_vaddr = localaddr; - - /* Map: Use the Grant table reference */ - op.host_addr = localaddr + PAGE_SIZE; - op.flags = GNTMAP_host_map; - op.ref = rx_ring_ref; - op.dom = netif->domid; - - BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); - if (op.handle < 0) { - DPRINTK(" Grant table operation failure mapping rx_ring_ref!\n"); - return op.handle; - } - - netif->rx_shmem_ref = rx_ring_ref; - netif->rx_shmem_handle = op.handle; - netif->rx_shmem_vaddr = localaddr + PAGE_SIZE; - -#else - pgprot_t prot = __pgprot(_KERNPG_TABLE); - int err; - - err = direct_remap_pfn_range(&init_mm, localaddr, - tx_ring_ref, PAGE_SIZE, - prot, netif->domid); - - err |= direct_remap_pfn_range(&init_mm, localaddr + PAGE_SIZE, - rx_ring_ref, PAGE_SIZE, - prot, netif->domid); - - if (err) - return err; -#endif - - return 0; + lock_vm_area(netif->comms_area); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)); + unlock_vm_area(netif->comms_area); + + if (op.handle < 0) { + DPRINTK(" Gnttab failure mapping 
tx_ring_ref!\n"); + return op.handle; + } + + netif->tx_shmem_ref = tx_ring_ref; + netif->tx_shmem_handle = op.handle; + + op.host_addr = (unsigned long)netif->comms_area->addr + PAGE_SIZE; + op.flags = GNTMAP_host_map; + op.ref = rx_ring_ref; + op.dom = netif->domid; + + lock_vm_area(netif->comms_area); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)); + unlock_vm_area(netif->comms_area); + + if (op.handle < 0) { + DPRINTK(" Gnttab failure mapping rx_ring_ref!\n"); + return op.handle; + } + + netif->rx_shmem_ref = rx_ring_ref; + netif->rx_shmem_handle = op.handle; + + return 0; } static void unmap_frontend_pages(netif_t *netif) { -#ifdef CONFIG_XEN_NETDEV_GRANT - struct gnttab_unmap_grant_ref op; - - op.host_addr = netif->tx_shmem_vaddr; - op.handle = netif->tx_shmem_handle; - op.dev_bus_addr = 0; - BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); - - op.host_addr = netif->rx_shmem_vaddr; - op.handle = netif->rx_shmem_handle; - op.dev_bus_addr = 0; - BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); -#endif - - return; + struct gnttab_unmap_grant_ref op; + + op.host_addr = (unsigned long)netif->comms_area->addr; + op.handle = netif->tx_shmem_handle; + op.dev_bus_addr = 0; + + lock_vm_area(netif->comms_area); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); + unlock_vm_area(netif->comms_area); + + op.host_addr = (unsigned long)netif->comms_area->addr + PAGE_SIZE; + op.handle = netif->rx_shmem_handle; + op.dev_bus_addr = 0; + + lock_vm_area(netif->comms_area); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); + unlock_vm_area(netif->comms_area); } int netif_map(netif_t *netif, unsigned long tx_ring_ref, unsigned long rx_ring_ref, unsigned int evtchn) { - struct vm_struct *vma; - evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; - int err; - - vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP); - if (vma == NULL) - return -ENOMEM; - - err = map_frontend_pages(netif, 
(unsigned long)vma->addr, tx_ring_ref, - rx_ring_ref); - if (err) { - vfree(vma->addr); - return err; - } - - op.u.bind_interdomain.dom1 = DOMID_SELF; - op.u.bind_interdomain.dom2 = netif->domid; - op.u.bind_interdomain.port1 = 0; - op.u.bind_interdomain.port2 = evtchn; - err = HYPERVISOR_event_channel_op(&op); - if (err) { - unmap_frontend_pages(netif); - vfree(vma->addr); - return err; - } - - netif->evtchn = op.u.bind_interdomain.port1; - netif->remote_evtchn = evtchn; - - netif->tx = (netif_tx_interface_t *)vma->addr; - netif->rx = (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE); - netif->tx->resp_prod = netif->rx->resp_prod = 0; - netif_get(netif); - wmb(); /* Other CPUs see new state before interface is started. */ - - rtnl_lock(); - netif->status = CONNECTED; - wmb(); - if (netif_running(netif->dev)) - __netif_up(netif); - rtnl_unlock(); - - return 0; -} - -static void free_netif(void *arg) -{ - evtchn_op_t op = { .cmd = EVTCHNOP_close }; - netif_t *netif = (netif_t *)arg; - - /* - * These can't be done in netif_disconnect() because at that point there - * may be outstanding requests in the network stack whose asynchronous - * responses must still be notified to the remote driver. - */ - - op.u.close.port = netif->evtchn; - op.u.close.dom = DOMID_SELF; - HYPERVISOR_event_channel_op(&op); - op.u.close.port = netif->remote_evtchn; - op.u.close.dom = netif->domid; - HYPERVISOR_event_channel_op(&op); - - unregister_netdev(netif->dev); - - if (netif->tx) { - unmap_frontend_pages(netif); - vfree(netif->tx); /* Frees netif->rx as well. 
*/ - } - - free_netdev(netif->dev); -} - -void free_netif_callback(netif_t *netif) -{ - INIT_WORK(&netif->free_work, free_netif, (void *)netif); - schedule_work(&netif->free_work); + evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; + int err; + + netif->comms_area = alloc_vm_area(2*PAGE_SIZE); + if (netif->comms_area == NULL) + return -ENOMEM; + + err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref); + if (err) { + free_vm_area(netif->comms_area); + return err; + } + + op.u.bind_interdomain.dom1 = DOMID_SELF; + op.u.bind_interdomain.dom2 = netif->domid; + op.u.bind_interdomain.port1 = 0; + op.u.bind_interdomain.port2 = evtchn; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + unmap_frontend_pages(netif); + free_vm_area(netif->comms_area); + return err; + } + + netif->evtchn = op.u.bind_interdomain.port1; + netif->remote_evtchn = evtchn; + + netif->tx = (netif_tx_interface_t *)netif->comms_area->addr; + netif->rx = (netif_rx_interface_t *) + ((char *)netif->comms_area->addr + PAGE_SIZE); + netif->tx->resp_prod = netif->rx->resp_prod = 0; + netif_get(netif); + wmb(); /* Other CPUs see new state before interface is started. */ + + rtnl_lock(); + netif->status = CONNECTED; + wmb(); + if (netif_running(netif->dev)) + __netif_up(netif); + rtnl_unlock(); + + return 0; +} + +static void free_netif_callback(void *arg) +{ + evtchn_op_t op = { .cmd = EVTCHNOP_close }; + netif_t *netif = (netif_t *)arg; + + /* + * These can't be done in netif_disconnect() because at that point + * there may be outstanding requests in the network stack whose + * asynchronous responses must still be notified to the remote driver. 
+ */ + + op.u.close.port = netif->evtchn; + op.u.close.dom = DOMID_SELF; + HYPERVISOR_event_channel_op(&op); + op.u.close.port = netif->remote_evtchn; + op.u.close.dom = netif->domid; + HYPERVISOR_event_channel_op(&op); + + unregister_netdev(netif->dev); + + if (netif->tx) { + unmap_frontend_pages(netif); + free_vm_area(netif->comms_area); + } + + free_netdev(netif->dev); +} + +void free_netif(netif_t *netif) +{ + INIT_WORK(&netif->free_work, free_netif_callback, (void *)netif); + schedule_work(&netif->free_work); } void netif_creditlimit(netif_t *netif) { #if 0 - /* Set the credit limit (reset remaining credit to new limit). */ - netif->credit_bytes = netif->remaining_credit = creditlimit->credit_bytes; - netif->credit_usec = creditlimit->period_usec; - - if (netif->status == CONNECTED) { - /* - * Schedule work so that any packets waiting under previous credit - * limit are dealt with (acts like a replenishment point). - */ - netif->credit_timeout.expires = jiffies; - netif_schedule_work(netif); - } + /* Set the credit limit (reset remaining credit to new limit). */ + netif->credit_bytes = creditlimit->credit_bytes; + netif->remaining_credit = creditlimit->credit_bytes; + netif->credit_usec = creditlimit->period_usec; + + if (netif->status == CONNECTED) { + /* + * Schedule work so that any packets waiting under previous + * credit limit are dealt with (acts as a replenishment point). + */ + netif->credit_timeout.expires = jiffies; + netif_schedule_work(netif); + } #endif } int netif_disconnect(netif_t *netif) { - if (netif->status == CONNECTED) { - rtnl_lock(); - netif->status = DISCONNECTING; - wmb(); - if (netif_running(netif->dev)) - __netif_down(netif); - rtnl_unlock(); - netif_put(netif); - return 0; /* Caller should not send response message. 
*/ - } - - return 1; -} + if (netif->status == CONNECTED) { + rtnl_lock(); + netif->status = DISCONNECTING; + wmb(); + if (netif_running(netif->dev)) + __netif_down(netif); + rtnl_unlock(); + netif_put(netif); + return 0; /* Caller should not send response message. */ + } + + return 1; +} + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/netback/netback.c --- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c Thu Sep 22 17:42:01 2005 @@ -23,7 +23,7 @@ static int make_rx_response(netif_t *netif, u16 id, s8 st, - unsigned long addr, + u16 offset, u16 size, u16 csum_valid); @@ -41,11 +41,7 @@ static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2+1]; static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE]; -#ifdef CONFIG_XEN_NETDEV_GRANT -static gnttab_donate_t grant_rx_op[MAX_PENDING_REQS]; -#else -static struct mmuext_op rx_mmuext[NETIF_RX_RING_SIZE]; -#endif +static gnttab_transfer_t grant_rx_op[MAX_PENDING_REQS]; static unsigned char rx_notify[NR_EVENT_CHANNELS]; /* Don't currently gate addition of an interface to the tx scheduling list. 
*/ @@ -57,8 +53,8 @@ #define PKT_PROT_LEN 64 static struct { - netif_tx_request_t req; - netif_t *netif; + netif_tx_request_t req; + netif_t *netif; } pending_tx_info[MAX_PENDING_REQS]; static u16 pending_ring[MAX_PENDING_REQS]; typedef unsigned int PEND_RING_IDX; @@ -72,14 +68,9 @@ static struct sk_buff_head tx_queue; -#ifdef CONFIG_XEN_NETDEV_GRANT static u16 grant_tx_ref[MAX_PENDING_REQS]; static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS]; static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS]; - -#else -static multicall_entry_t tx_mcl[MAX_PENDING_REQS]; -#endif static struct list_head net_schedule_list; static spinlock_t net_schedule_list_lock; @@ -91,49 +82,49 @@ static unsigned long alloc_mfn(void) { - unsigned long mfn = 0, flags; - struct xen_memory_reservation reservation = { - .extent_start = mfn_list, - .nr_extents = MAX_MFN_ALLOC, - .extent_order = 0, - .domid = DOMID_SELF - }; - spin_lock_irqsave(&mfn_lock, flags); - if ( unlikely(alloc_index == 0) ) - alloc_index = HYPERVISOR_memory_op( - XENMEM_increase_reservation, &reservation); - if ( alloc_index != 0 ) - mfn = mfn_list[--alloc_index]; - spin_unlock_irqrestore(&mfn_lock, flags); - return mfn; -} - -#ifndef CONFIG_XEN_NETDEV_GRANT + unsigned long mfn = 0, flags; + struct xen_memory_reservation reservation = { + .extent_start = mfn_list, + .nr_extents = MAX_MFN_ALLOC, + .extent_order = 0, + .domid = DOMID_SELF + }; + spin_lock_irqsave(&mfn_lock, flags); + if ( unlikely(alloc_index == 0) ) + alloc_index = HYPERVISOR_memory_op( + XENMEM_increase_reservation, &reservation); + if ( alloc_index != 0 ) + mfn = mfn_list[--alloc_index]; + spin_unlock_irqrestore(&mfn_lock, flags); + return mfn; +} + +#if 0 static void free_mfn(unsigned long mfn) { - unsigned long flags; - struct xen_memory_reservation reservation = { - .extent_start = &mfn, - .nr_extents = 1, - .extent_order = 0, - .domid = DOMID_SELF - }; - spin_lock_irqsave(&mfn_lock, flags); - if ( alloc_index != MAX_MFN_ALLOC ) - 
mfn_list[alloc_index++] = mfn; - else if ( HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) - != 1 ) - BUG(); - spin_unlock_irqrestore(&mfn_lock, flags); + unsigned long flags; + struct xen_memory_reservation reservation = { + .extent_start = &mfn, + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + spin_lock_irqsave(&mfn_lock, flags); + if ( alloc_index != MAX_MFN_ALLOC ) + mfn_list[alloc_index++] = mfn; + else + BUG_ON(HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation) != 1); + spin_unlock_irqrestore(&mfn_lock, flags); } #endif static inline void maybe_schedule_tx_action(void) { - smp_mb(); - if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && - !list_empty(&net_schedule_list) ) - tasklet_schedule(&net_tx_tasklet); + smp_mb(); + if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&net_schedule_list)) + tasklet_schedule(&net_tx_tasklet); } /* @@ -142,77 +133,66 @@ */ static inline int is_xen_skb(struct sk_buff *skb) { - extern kmem_cache_t *skbuff_cachep; - kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next; - return (cp == skbuff_cachep); + extern kmem_cache_t *skbuff_cachep; + kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next; + return (cp == skbuff_cachep); } int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) { - netif_t *netif = netdev_priv(dev); - - ASSERT(skb->dev == dev); - - /* Drop the packet if the target domain has no receive buffers. */ - if ( !netif->active || - (netif->rx_req_cons == netif->rx->req_prod) || - ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) ) - goto drop; - - /* - * We do not copy the packet unless: - * 1. The data is shared; or - * 2. The data is not allocated from our special cache. - * NB. We also couldn't cope with fragmented packets, but we won't get - * any because we not advertise the NETIF_F_SG feature. 
- */ - if ( skb_shared(skb) || skb_cloned(skb) || !is_xen_skb(skb) ) - { - int hlen = skb->data - skb->head; - struct sk_buff *nskb = dev_alloc_skb(hlen + skb->len); - if ( unlikely(nskb == NULL) ) - goto drop; - skb_reserve(nskb, hlen); - __skb_put(nskb, skb->len); - if (skb_copy_bits(skb, -hlen, nskb->data - hlen, skb->len + hlen)) - BUG(); - nskb->dev = skb->dev; - nskb->proto_csum_valid = skb->proto_csum_valid; - dev_kfree_skb(skb); - skb = nskb; - } -#ifdef CONFIG_XEN_NETDEV_GRANT -#ifdef DEBUG_GRANT - printk(KERN_ALERT "#### be_xmit: req_prod=%d req_cons=%d id=%04x gr=%04x\n", - netif->rx->req_prod, - netif->rx_req_cons, - netif->rx->ring[ - MASK_NETIF_RX_IDX(netif->rx_req_cons)].req.id, - netif->rx->ring[ - MASK_NETIF_RX_IDX(netif->rx_req_cons)].req.gref); -#endif -#endif - netif->rx_req_cons++; - netif_get(netif); - - skb_queue_tail(&rx_queue, skb); - tasklet_schedule(&net_rx_tasklet); - - return 0; + netif_t *netif = netdev_priv(dev); + + ASSERT(skb->dev == dev); + + /* Drop the packet if the target domain has no receive buffers. */ + if (!netif->active || + (netif->rx_req_cons == netif->rx->req_prod) || + ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE)) + goto drop; + + /* + * We do not copy the packet unless: + * 1. The data is shared; or + * 2. The data is not allocated from our special cache. + * NB. We also couldn't cope with fragmented packets, but we won't get + * any because we not advertise the NETIF_F_SG feature. 
+ */ + if (skb_shared(skb) || skb_cloned(skb) || !is_xen_skb(skb)) { + int hlen = skb->data - skb->head; + struct sk_buff *nskb = dev_alloc_skb(hlen + skb->len); + if ( unlikely(nskb == NULL) ) + goto drop; + skb_reserve(nskb, hlen); + __skb_put(nskb, skb->len); + BUG_ON(skb_copy_bits(skb, -hlen, nskb->data - hlen, + skb->len + hlen)); + nskb->dev = skb->dev; + nskb->proto_csum_valid = skb->proto_csum_valid; + dev_kfree_skb(skb); + skb = nskb; + } + + netif->rx_req_cons++; + netif_get(netif); + + skb_queue_tail(&rx_queue, skb); + tasklet_schedule(&net_rx_tasklet); + + return 0; drop: - netif->stats.tx_dropped++; - dev_kfree_skb(skb); - return 0; + netif->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; } #if 0 static void xen_network_done_notify(void) { - static struct net_device *eth0_dev = NULL; - if ( unlikely(eth0_dev == NULL) ) - eth0_dev = __dev_get_by_name("eth0"); - netif_rx_schedule(eth0_dev); + static struct net_device *eth0_dev = NULL; + if (unlikely(eth0_dev == NULL)) + eth0_dev = __dev_get_by_name("eth0"); + netif_rx_schedule(eth0_dev); } /* * Add following to poll() function in NAPI driver (Tigon3 is example): @@ -221,776 +201,644 @@ */ int xen_network_done(void) { - return skb_queue_empty(&rx_queue); + return skb_queue_empty(&rx_queue); } #endif static void net_rx_action(unsigned long unused) { - netif_t *netif = NULL; - s8 status; - u16 size, id, evtchn; - multicall_entry_t *mcl; - mmu_update_t *mmu; -#ifdef CONFIG_XEN_NETDEV_GRANT - gnttab_donate_t *gop; -#else - struct mmuext_op *mmuext; + netif_t *netif = NULL; + s8 status; + u16 size, id, evtchn; + multicall_entry_t *mcl; + mmu_update_t *mmu; + gnttab_transfer_t *gop; + unsigned long vdata, old_mfn, new_mfn; + struct sk_buff_head rxq; + struct sk_buff *skb; + u16 notify_list[NETIF_RX_RING_SIZE]; + int notify_nr = 0; + + skb_queue_head_init(&rxq); + + mcl = rx_mcl; + mmu = rx_mmu; + gop = grant_rx_op; + + while ((skb = skb_dequeue(&rx_queue)) != NULL) { + netif = netdev_priv(skb->dev); + 
vdata = (unsigned long)skb->data; + old_mfn = virt_to_mfn(vdata); + + /* Memory squeeze? Back off for an arbitrary while. */ + if ((new_mfn = alloc_mfn()) == 0) { + if ( net_ratelimit() ) + WPRINTK("Memory squeeze in netback driver.\n"); + mod_timer(&net_timer, jiffies + HZ); + skb_queue_head(&rx_queue, skb); + break; + } + /* + * Set the new P2M table entry before reassigning the old data + * page. Heed the comment in pgtable-2level.h:pte_page(). :-) + */ + phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = + new_mfn; + + MULTI_update_va_mapping(mcl, vdata, + pfn_pte_ma(new_mfn, PAGE_KERNEL), 0); + mcl++; + + gop->mfn = old_mfn; + gop->domid = netif->domid; + gop->ref = netif->rx->ring[ + MASK_NETIF_RX_IDX(netif->rx_resp_prod_copy)].req.gref; + netif->rx_resp_prod_copy++; + gop++; + + mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) | + MMU_MACHPHYS_UPDATE; + mmu->val = __pa(vdata) >> PAGE_SHIFT; + mmu++; + + __skb_queue_tail(&rxq, skb); + + /* Filled the batch queue? */ + if ((mcl - rx_mcl) == ARRAY_SIZE(rx_mcl)) + break; + } + + if (mcl == rx_mcl) + return; + + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)rx_mmu; + mcl->args[1] = mmu - rx_mmu; + mcl->args[2] = 0; + mcl->args[3] = DOMID_SELF; + mcl++; + + mcl[-2].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; + BUG_ON(HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl) != 0); + + mcl = rx_mcl; + if( HYPERVISOR_grant_table_op(GNTTABOP_transfer, grant_rx_op, + gop - grant_rx_op)) { + /* + * The other side has given us a bad grant ref, or has no + * headroom, or has gone away. Unfortunately the current grant + * table code doesn't inform us which is the case, so not much + * we can do. + */ + DPRINTK("net_rx: transfer to DOM%u failed; dropping (up to) " + "%d packets.\n", + grant_rx_op[0].domid, gop - grant_rx_op); + } + gop = grant_rx_op; + + while ((skb = __skb_dequeue(&rxq)) != NULL) { + netif = netdev_priv(skb->dev); + size = skb->tail - skb->data; + + /* Rederive the machine addresses. 
*/ + new_mfn = mcl[0].args[1] >> PAGE_SHIFT; + old_mfn = 0; /* XXX Fix this so we can free_mfn() on error! */ + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; + + netif->stats.tx_bytes += size; + netif->stats.tx_packets++; + + /* The update_va_mapping() must not fail. */ + BUG_ON(mcl[0].result != 0); + + /* Check the reassignment error code. */ + status = NETIF_RSP_OKAY; + if(gop->status != 0) { + DPRINTK("Bad status %d from grant transfer to DOM%u\n", + gop->status, netif->domid); + /* XXX SMH: should free 'old_mfn' here */ + status = NETIF_RSP_ERROR; + } + evtchn = netif->evtchn; + id = netif->rx->ring[ + MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id; + if (make_rx_response(netif, id, status, + (unsigned long)skb->data & ~PAGE_MASK, + size, skb->proto_csum_valid) && + (rx_notify[evtchn] == 0)) { + rx_notify[evtchn] = 1; + notify_list[notify_nr++] = evtchn; + } + + netif_put(netif); + dev_kfree_skb(skb); + mcl++; + gop++; + } + + while (notify_nr != 0) { + evtchn = notify_list[--notify_nr]; + rx_notify[evtchn] = 0; + notify_via_evtchn(evtchn); + } + + /* More work to do? */ + if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer)) + tasklet_schedule(&net_rx_tasklet); +#if 0 + else + xen_network_done_notify(); #endif - unsigned long vdata, old_mfn, new_mfn; - struct sk_buff_head rxq; - struct sk_buff *skb; - u16 notify_list[NETIF_RX_RING_SIZE]; - int notify_nr = 0; - - skb_queue_head_init(&rxq); - - mcl = rx_mcl; - mmu = rx_mmu; -#ifdef CONFIG_XEN_NETDEV_GRANT - gop = grant_rx_op; -#else - mmuext = rx_mmuext; -#endif - - while ( (skb = skb_dequeue(&rx_queue)) != NULL ) - { - netif = netdev_priv(skb->dev); - vdata = (unsigned long)skb->data; - old_mfn = virt_to_mfn(vdata); - - /* Memory squeeze? Back off for an arbitrary while. 
*/ - if ( (new_mfn = alloc_mfn()) == 0 ) - { - if ( net_ratelimit() ) - WPRINTK("Memory squeeze in netback driver.\n"); - mod_timer(&net_timer, jiffies + HZ); - skb_queue_head(&rx_queue, skb); - break; - } - /* - * Set the new P2M table entry before reassigning the old data page. - * Heed the comment in pgtable-2level.h:pte_page(). :-) - */ - phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn; - - MULTI_update_va_mapping(mcl, vdata, - pfn_pte_ma(new_mfn, PAGE_KERNEL), 0); - mcl++; - -#ifdef CONFIG_XEN_NETDEV_GRANT - gop->mfn = old_mfn; - gop->domid = netif->domid; - gop->handle = netif->rx->ring[ - MASK_NETIF_RX_IDX(netif->rx_resp_prod_copy)].req.gref; - netif->rx_resp_prod_copy++; - gop++; -#else - mcl->op = __HYPERVISOR_mmuext_op; - mcl->args[0] = (unsigned long)mmuext; - mcl->args[1] = 1; - mcl->args[2] = 0; - mcl->args[3] = netif->domid; - mcl++; - - mmuext->cmd = MMUEXT_REASSIGN_PAGE; - mmuext->arg1.mfn = old_mfn; - mmuext++; -#endif - mmu->ptr = ((unsigned long long)new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; - mmu->val = __pa(vdata) >> PAGE_SHIFT; - mmu++; - - __skb_queue_tail(&rxq, skb); - -#ifdef DEBUG_GRANT - dump_packet('a', old_mfn, vdata); -#endif - /* Filled the batch queue? */ - if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) ) - break; - } - - if ( mcl == rx_mcl ) - return; - - mcl->op = __HYPERVISOR_mmu_update; - mcl->args[0] = (unsigned long)rx_mmu; - mcl->args[1] = mmu - rx_mmu; - mcl->args[2] = 0; - mcl->args[3] = DOMID_SELF; - mcl++; - -#ifdef CONFIG_XEN_NETDEV_GRANT - mcl[-2].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; -#else - mcl[-3].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; -#endif - if ( unlikely(HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl) != 0) ) - BUG(); - - mcl = rx_mcl; -#ifdef CONFIG_XEN_NETDEV_GRANT - if(HYPERVISOR_grant_table_op(GNTTABOP_donate, grant_rx_op, - gop - grant_rx_op)) { - /* - ** The other side has given us a bad grant ref, or has no headroom, - ** or has gone away. 
Unfortunately the current grant table code - ** doesn't inform us which is the case, so not much we can do. - */ - DPRINTK("net_rx: donate to DOM%u failed; dropping (up to) %d " - "packets.\n", grant_rx_op[0].domid, gop - grant_rx_op); - } - gop = grant_rx_op; -#else - mmuext = rx_mmuext; -#endif - while ( (skb = __skb_dequeue(&rxq)) != NULL ) - { - netif = netdev_priv(skb->dev); - size = skb->tail - skb->data; - - /* Rederive the machine addresses. */ - new_mfn = mcl[0].args[1] >> PAGE_SHIFT; -#ifdef CONFIG_XEN_NETDEV_GRANT - old_mfn = 0; /* XXX Fix this so we can free_mfn() on error! */ -#else - old_mfn = mmuext[0].arg1.mfn; -#endif - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->frag_list = NULL; - - netif->stats.tx_bytes += size; - netif->stats.tx_packets++; - - /* The update_va_mapping() must not fail. */ - BUG_ON(mcl[0].result != 0); - - /* Check the reassignment error code. */ - status = NETIF_RSP_OKAY; -#ifdef CONFIG_XEN_NETDEV_GRANT - if(gop->status != 0) { - DPRINTK("Bad status %d from grant donate to DOM%u\n", - gop->status, netif->domid); - /* XXX SMH: should free 'old_mfn' here */ - status = NETIF_RSP_ERROR; - } -#else - if ( unlikely(mcl[1].result != 0) ) - { - DPRINTK("Failed MMU update transferring to DOM%u\n", netif->domid); - free_mfn(old_mfn); - status = NETIF_RSP_ERROR; - } -#endif - evtchn = netif->evtchn; - id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id; - if ( make_rx_response(netif, id, status, - (old_mfn << PAGE_SHIFT) | /* XXX */ - ((unsigned long)skb->data & ~PAGE_MASK), - size, skb->proto_csum_valid) && - (rx_notify[evtchn] == 0) ) - { - rx_notify[evtchn] = 1; - notify_list[notify_nr++] = evtchn; - } - - netif_put(netif); - dev_kfree_skb(skb); -#ifdef CONFIG_XEN_NETDEV_GRANT - mcl++; - gop++; -#else - mcl += 2; - mmuext += 1; -#endif - } - - while ( notify_nr != 0 ) - { - evtchn = notify_list[--notify_nr]; - rx_notify[evtchn] = 0; - notify_via_evtchn(evtchn); - } - - 
out: - /* More work to do? */ - if ( !skb_queue_empty(&rx_queue) && !timer_pending(&net_timer) ) - tasklet_schedule(&net_rx_tasklet); -#if 0 - else - xen_network_done_notify(); -#endif } static void net_alarm(unsigned long unused) { - tasklet_schedule(&net_rx_tasklet); + tasklet_schedule(&net_rx_tasklet); } struct net_device_stats *netif_be_get_stats(struct net_device *dev) { - netif_t *netif = netdev_priv(dev); - return &netif->stats; + netif_t *netif = netdev_priv(dev); + return &netif->stats; } static int __on_net_schedule_list(netif_t *netif) { - return netif->list.next != NULL; + return netif->list.next != NULL; } static void remove_from_net_schedule_list(netif_t *netif) { - spin_lock_irq(&net_schedule_list_lock); - if ( likely(__on_net_schedule_list(netif)) ) - { - list_del(&netif->list); - netif->list.next = NULL; - netif_put(netif); - } - spin_unlock_irq(&net_schedule_list_lock); + spin_lock_irq(&net_schedule_list_lock); + if (likely(__on_net_schedule_list(netif))) { + list_del(&netif->list); + netif->list.next = NULL; + netif_put(netif); + } + spin_unlock_irq(&net_schedule_list_lock); } static void add_to_net_schedule_list_tail(netif_t *netif) { - if ( __on_net_schedule_list(netif) ) - return; - - spin_lock_irq(&net_schedule_list_lock); - if ( !__on_net_schedule_list(netif) && netif->active ) - { - list_add_tail(&netif->list, &net_schedule_list); - netif_get(netif); - } - spin_unlock_irq(&net_schedule_list_lock); + if (__on_net_schedule_list(netif)) + return; + + spin_lock_irq(&net_schedule_list_lock); + if (!__on_net_schedule_list(netif) && netif->active) { + list_add_tail(&netif->list, &net_schedule_list); + netif_get(netif); + } + spin_unlock_irq(&net_schedule_list_lock); } void netif_schedule_work(netif_t *netif) { - if ( (netif->tx_req_cons != netif->tx->req_prod) && - ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) - { - add_to_net_schedule_list_tail(netif); - maybe_schedule_tx_action(); - } + if ((netif->tx_req_cons != 
netif->tx->req_prod) && + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE)) { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } } void netif_deschedule_work(netif_t *netif) { - remove_from_net_schedule_list(netif); + remove_from_net_schedule_list(netif); } static void tx_credit_callback(unsigned long data) { - netif_t *netif = (netif_t *)data; - netif->remaining_credit = netif->credit_bytes; - netif_schedule_work(netif); + netif_t *netif = (netif_t *)data; + netif->remaining_credit = netif->credit_bytes; + netif_schedule_work(netif); } inline static void net_tx_action_dealloc(void) { -#ifdef CONFIG_XEN_NETDEV_GRANT - gnttab_unmap_grant_ref_t *gop; -#else - multicall_entry_t *mcl; -#endif - u16 pending_idx; - PEND_RING_IDX dc, dp; - netif_t *netif; - - dc = dealloc_cons; - dp = dealloc_prod; - -#ifdef CONFIG_XEN_NETDEV_GRANT - /* - * Free up any grants we have finished using - */ - gop = tx_unmap_ops; - while ( dc != dp ) - { - pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; - gop->host_addr = MMAP_VADDR(pending_idx); - gop->dev_bus_addr = 0; - gop->handle = grant_tx_ref[pending_idx]; - grant_tx_ref[pending_idx] = GRANT_INVALID_REF; - gop++; - } - BUG_ON(HYPERVISOR_grant_table_op( - GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops)); -#else - mcl = tx_mcl; - while ( dc != dp ) - { - pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; - MULTI_update_va_mapping(mcl, MMAP_VADDR(pending_idx), - __pte(0), 0); - mcl++; - } - - mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; - if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) ) - BUG(); - - mcl = tx_mcl; -#endif - while ( dealloc_cons != dp ) - { -#ifndef CONFIG_XEN_NETDEV_GRANT - /* The update_va_mapping() must not fail. 
*/ - BUG_ON(mcl[0].result != 0); -#endif - - pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; - - netif = pending_tx_info[pending_idx].netif; - - make_tx_response(netif, pending_tx_info[pending_idx].req.id, - NETIF_RSP_OKAY); + gnttab_unmap_grant_ref_t *gop; + u16 pending_idx; + PEND_RING_IDX dc, dp; + netif_t *netif; + + dc = dealloc_cons; + dp = dealloc_prod; + + /* + * Free up any grants we have finished using + */ + gop = tx_unmap_ops; + while (dc != dp) { + pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; + gop->host_addr = MMAP_VADDR(pending_idx); + gop->dev_bus_addr = 0; + gop->handle = grant_tx_ref[pending_idx]; + grant_tx_ref[pending_idx] = GRANT_INVALID_REF; + gop++; + } + BUG_ON(HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops)); + + while (dealloc_cons != dp) { + pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; + + netif = pending_tx_info[pending_idx].netif; + + make_tx_response(netif, pending_tx_info[pending_idx].req.id, + NETIF_RSP_OKAY); - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - - /* - * Scheduling checks must happen after the above response is posted. - * This avoids a possible race with a guest OS on another CPU if that - * guest is testing against 'resp_prod' when deciding whether to notify - * us when it queues additional packets. - */ - mb(); - if ( (netif->tx_req_cons != netif->tx->req_prod) && - ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) - add_to_net_schedule_list_tail(netif); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + + /* + * Scheduling checks must happen after the above response is + * posted. This avoids a possible race with a guest OS on + * another CPU if that guest is testing against 'resp_prod' + * when deciding whether to notify us when it queues additional + * packets. 
+ */ + mb(); + if ((netif->tx_req_cons != netif->tx->req_prod) && + ((netif->tx_req_cons-netif->tx_resp_prod) != + NETIF_TX_RING_SIZE)) + add_to_net_schedule_list_tail(netif); - netif_put(netif); - -#ifndef CONFIG_XEN_NETDEV_GRANT - mcl++; -#endif - } - + netif_put(netif); + } } /* Called after netfront has transmitted */ static void net_tx_action(unsigned long unused) { - struct list_head *ent; - struct sk_buff *skb; - netif_t *netif; - netif_tx_request_t txreq; - u16 pending_idx; - NETIF_RING_IDX i; -#ifdef CONFIG_XEN_NETDEV_GRANT - gnttab_map_grant_ref_t *mop; -#else - multicall_entry_t *mcl; -#endif - unsigned int data_len; - - if ( dealloc_cons != dealloc_prod ) - net_tx_action_dealloc(); - -#ifdef CONFIG_XEN_NETDEV_GRANT - mop = tx_map_ops; -#else - mcl = tx_mcl; -#endif - while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && - !list_empty(&net_schedule_list) ) - { - /* Get a netif from the list with work to do. */ - ent = net_schedule_list.next; - netif = list_entry(ent, netif_t, list); - netif_get(netif); - remove_from_net_schedule_list(netif); - - /* Work to do? */ - i = netif->tx_req_cons; - if ( (i == netif->tx->req_prod) || - ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) ) - { - netif_put(netif); - continue; - } - - rmb(); /* Ensure that we see the request before we copy it. */ - memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, - sizeof(txreq)); - /* Credit-based scheduling. */ - if ( txreq.size > netif->remaining_credit ) - { - unsigned long now = jiffies; - unsigned long next_credit = - netif->credit_timeout.expires + - msecs_to_jiffies(netif->credit_usec / 1000); - - /* Timer could already be pending in some rare cases. */ - if ( timer_pending(&netif->credit_timeout) ) - break; - - /* Already passed the point at which we can replenish credit? */ - if ( time_after_eq(now, next_credit) ) - { - netif->credit_timeout.expires = now; - netif->remaining_credit = netif->credit_bytes; - } - - /* Still too big to send right now? 
Then set a timer callback. */ - if ( txreq.size > netif->remaining_credit ) - { - netif->remaining_credit = 0; - netif->credit_timeout.expires = next_credit; - netif->credit_timeout.data = (unsigned long)netif; - netif->credit_timeout.function = tx_credit_callback; - add_timer_on(&netif->credit_timeout, smp_processor_id()); - break; - } - } - netif->remaining_credit -= txreq.size; - - /* - * Why the barrier? It ensures that the frontend sees updated req_cons - * before we check for more work to schedule. - */ - netif->tx->req_cons = ++netif->tx_req_cons; - mb(); - - netif_schedule_work(netif); - - if ( unlikely(txreq.size < ETH_HLEN) || - unlikely(txreq.size > ETH_FRAME_LEN) ) - { - DPRINTK("Bad packet size: %d\n", txreq.size); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - continue; - } - - /* No crossing a page boundary as the payload mustn't fragment. */ - if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) - { - DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", - txreq.addr, txreq.size, - (txreq.addr &~PAGE_MASK) + txreq.size); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - continue; - } - - pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; - - data_len = (txreq.size > PKT_PROT_LEN) ? PKT_PROT_LEN : txreq.size; - - if ( unlikely((skb = alloc_skb(data_len+16, GFP_ATOMIC)) == NULL) ) - { - DPRINTK("Can't allocate a skb in start_xmit.\n"); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - break; - } - - /* Packets passed to netif_rx() must have some headroom. 
*/ - skb_reserve(skb, 16); -#ifdef CONFIG_XEN_NETDEV_GRANT - mop->host_addr = MMAP_VADDR(pending_idx); - mop->dom = netif->domid; - mop->ref = txreq.addr >> PAGE_SHIFT; - mop->flags = GNTMAP_host_map | GNTMAP_readonly; - mop++; -#else - MULTI_update_va_mapping_otherdomain( - mcl, MMAP_VADDR(pending_idx), - pfn_pte_ma(txreq.addr >> PAGE_SHIFT, PAGE_KERNEL), - 0, netif->domid); - - mcl++; -#endif - - memcpy(&pending_tx_info[pending_idx].req, &txreq, sizeof(txreq)); - pending_tx_info[pending_idx].netif = netif; - *((u16 *)skb->data) = pending_idx; - - __skb_queue_tail(&tx_queue, skb); - - pending_cons++; - -#ifdef CONFIG_XEN_NETDEV_GRANT - if ( (mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops) ) - break; -#else - /* Filled the batch queue? */ - if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) ) - break; -#endif - } - -#ifdef CONFIG_XEN_NETDEV_GRANT - if ( mop == tx_map_ops ) - return; - - BUG_ON(HYPERVISOR_grant_table_op( - GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops)); - - mop = tx_map_ops; -#else - if ( mcl == tx_mcl ) - return; - - BUG_ON(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0); - - mcl = tx_mcl; -#endif - while ( (skb = __skb_dequeue(&tx_queue)) != NULL ) - { - pending_idx = *((u16 *)skb->data); - netif = pending_tx_info[pending_idx].netif; - memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq)); - - /* Check the remap error code. */ -#ifdef CONFIG_XEN_NETDEV_GRANT - /* - XXX SMH: error returns from grant operations are pretty poorly - specified/thought out, but the below at least conforms with - what the rest of the code uses. 
- */ - if ( unlikely(mop->handle < 0) ) - { - printk(KERN_ALERT "#### netback grant fails\n"); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - kfree_skb(skb); - mop++; - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - continue; - } - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] = - FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT); - grant_tx_ref[pending_idx] = mop->handle; -#else - if ( unlikely(mcl[0].result != 0) ) - { - DPRINTK("Bad page frame\n"); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - kfree_skb(skb); - mcl++; - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - continue; - } - - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] = - FOREIGN_FRAME(txreq.addr >> PAGE_SHIFT); -#endif - - data_len = (txreq.size > PKT_PROT_LEN) ? PKT_PROT_LEN : txreq.size; - - __skb_put(skb, data_len); - memcpy(skb->data, - (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)), - data_len); - if ( data_len < txreq.size ) - { - /* Append the packet payload as a fragment. */ - skb_shinfo(skb)->frags[0].page = - virt_to_page(MMAP_VADDR(pending_idx)); - skb_shinfo(skb)->frags[0].size = txreq.size - data_len; - skb_shinfo(skb)->frags[0].page_offset = - (txreq.addr + data_len) & ~PAGE_MASK; - skb_shinfo(skb)->nr_frags = 1; - } - else - { - /* Schedule a response immediately. */ - netif_idx_release(pending_idx); - } - - skb->data_len = txreq.size - data_len; - skb->len += skb->data_len; - - skb->dev = netif->dev; - skb->protocol = eth_type_trans(skb, skb->dev); - - /* No checking needed on localhost, but remember the field is blank. 
*/ - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->proto_csum_valid = 1; - skb->proto_csum_blank = txreq.csum_blank; - - netif->stats.rx_bytes += txreq.size; - netif->stats.rx_packets++; - - netif_rx(skb); - netif->dev->last_rx = jiffies; - -#ifdef CONFIG_XEN_NETDEV_GRANT - mop++; -#else - mcl++; -#endif - } + struct list_head *ent; + struct sk_buff *skb; + netif_t *netif; + netif_tx_request_t txreq; + u16 pending_idx; + NETIF_RING_IDX i; + gnttab_map_grant_ref_t *mop; + unsigned int data_len; + + if (dealloc_cons != dealloc_prod) + net_tx_action_dealloc(); + + mop = tx_map_ops; + while ((NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&net_schedule_list)) { + /* Get a netif from the list with work to do. */ + ent = net_schedule_list.next; + netif = list_entry(ent, netif_t, list); + netif_get(netif); + remove_from_net_schedule_list(netif); + + /* Work to do? */ + i = netif->tx_req_cons; + if ((i == netif->tx->req_prod) || + ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE)) { + netif_put(netif); + continue; + } + + rmb(); /* Ensure that we see the request before we copy it. */ + memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, + sizeof(txreq)); + /* Credit-based scheduling. */ + if (txreq.size > netif->remaining_credit) { + unsigned long now = jiffies; + unsigned long next_credit = + netif->credit_timeout.expires + + msecs_to_jiffies(netif->credit_usec / 1000); + + /* Timer could already be pending in rare cases. */ + if (timer_pending(&netif->credit_timeout)) + break; + + /* Passed the point where we can replenish credit? */ + if (time_after_eq(now, next_credit)) { + netif->credit_timeout.expires = now; + netif->remaining_credit = netif->credit_bytes; + } + + /* Still too big to send right now? Set a callback. 
*/ + if (txreq.size > netif->remaining_credit) { + netif->remaining_credit = 0; + netif->credit_timeout.expires = + next_credit; + netif->credit_timeout.data = + (unsigned long)netif; + netif->credit_timeout.function = + tx_credit_callback; + add_timer_on(&netif->credit_timeout, + smp_processor_id()); + break; + } + } + netif->remaining_credit -= txreq.size; + + /* + * Why the barrier? It ensures that the frontend sees updated + * req_cons before we check for more work to schedule. + */ + netif->tx->req_cons = ++netif->tx_req_cons; + mb(); + + netif_schedule_work(netif); + + if (unlikely(txreq.size < ETH_HLEN) || + unlikely(txreq.size > ETH_FRAME_LEN)) { + DPRINTK("Bad packet size: %d\n", txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + /* No crossing a page as the payload mustn't fragment. */ + if (unlikely((txreq.offset + txreq.size) >= PAGE_SIZE)) { + DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", + txreq.addr, txreq.size, + (txreq.addr &~PAGE_MASK) + txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + data_len = (txreq.size > PKT_PROT_LEN) ? + PKT_PROT_LEN : txreq.size; + + skb = alloc_skb(data_len+16, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + DPRINTK("Can't allocate a skb in start_xmit.\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + break; + } + + /* Packets passed to netif_rx() must have some headroom. 
*/ + skb_reserve(skb, 16); + + mop->host_addr = MMAP_VADDR(pending_idx); + mop->dom = netif->domid; + mop->ref = txreq.gref; + mop->flags = GNTMAP_host_map | GNTMAP_readonly; + mop++; + + memcpy(&pending_tx_info[pending_idx].req, + &txreq, sizeof(txreq)); + pending_tx_info[pending_idx].netif = netif; + *((u16 *)skb->data) = pending_idx; + + __skb_queue_tail(&tx_queue, skb); + + pending_cons++; + + if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops)) + break; + } + + if (mop == tx_map_ops) + return; + + BUG_ON(HYPERVISOR_grant_table_op( + GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops)); + + mop = tx_map_ops; + while ((skb = __skb_dequeue(&tx_queue)) != NULL) { + pending_idx = *((u16 *)skb->data); + netif = pending_tx_info[pending_idx].netif; + memcpy(&txreq, &pending_tx_info[pending_idx].req, + sizeof(txreq)); + + /* Check the remap error code. */ + if (unlikely(mop->handle < 0)) { + printk(KERN_ALERT "#### netback grant fails\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + kfree_skb(skb); + mop++; + pending_ring[MASK_PEND_IDX(pending_prod++)] = + pending_idx; + continue; + } + phys_to_machine_mapping[ + __pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] = + FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT); + grant_tx_ref[pending_idx] = mop->handle; + + data_len = (txreq.size > PKT_PROT_LEN) ? + PKT_PROT_LEN : txreq.size; + + __skb_put(skb, data_len); + memcpy(skb->data, + (void *)(MMAP_VADDR(pending_idx)|txreq.offset), + data_len); + if (data_len < txreq.size) { + /* Append the packet payload as a fragment. */ + skb_shinfo(skb)->frags[0].page = + virt_to_page(MMAP_VADDR(pending_idx)); + skb_shinfo(skb)->frags[0].size = + txreq.size - data_len; + skb_shinfo(skb)->frags[0].page_offset = + txreq.offset + data_len; + skb_shinfo(skb)->nr_frags = 1; + } else { + /* Schedule a response immediately. 
*/ + netif_idx_release(pending_idx); + } + + skb->data_len = txreq.size - data_len; + skb->len += skb->data_len; + + skb->dev = netif->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + + /* + * No checking needed on localhost, but remember the field is + * blank. + */ + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->proto_csum_valid = 1; + skb->proto_csum_blank = txreq.csum_blank; + + netif->stats.rx_bytes += txreq.size; + netif->stats.rx_packets++; + + netif_rx(skb); + netif->dev->last_rx = jiffies; + + mop++; + } } static void netif_idx_release(u16 pending_idx) { - static spinlock_t _lock = SPIN_LOCK_UNLOCKED; - unsigned long flags; - - spin_lock_irqsave(&_lock, flags); - dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx; - spin_unlock_irqrestore(&_lock, flags); - - tasklet_schedule(&net_tx_tasklet); + static spinlock_t _lock = SPIN_LOCK_UNLOCKED; + unsigned long flags; + + spin_lock_irqsave(&_lock, flags); + dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx; + spin_unlock_irqrestore(&_lock, flags); + + tasklet_schedule(&net_tx_tasklet); } static void netif_page_release(struct page *page) { - u16 pending_idx = page - virt_to_page(mmap_vstart); - - /* Ready for next use. */ - set_page_count(page, 1); - - netif_idx_release(pending_idx); + u16 pending_idx = page - virt_to_page(mmap_vstart); + + /* Ready for next use. 
*/ + set_page_count(page, 1); + + netif_idx_release(pending_idx); } irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) { - netif_t *netif = dev_id; - if ( tx_work_exists(netif) ) - { - add_to_net_schedule_list_tail(netif); - maybe_schedule_tx_action(); - } - return IRQ_HANDLED; + netif_t *netif = dev_id; + if (tx_work_exists(netif)) { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } + return IRQ_HANDLED; } static void make_tx_response(netif_t *netif, u16 id, s8 st) { - NETIF_RING_IDX i = netif->tx_resp_prod; - netif_tx_response_t *resp; - - resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp; - resp->id = id; - resp->status = st; - wmb(); - netif->tx->resp_prod = netif->tx_resp_prod = ++i; - - mb(); /* Update producer before checking event threshold. */ - if ( i == netif->tx->event ) - notify_via_evtchn(netif->evtchn); + NETIF_RING_IDX i = netif->tx_resp_prod; + netif_tx_response_t *resp; + + resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp; + resp->id = id; + resp->status = st; + wmb(); + netif->tx->resp_prod = netif->tx_resp_prod = ++i; + + mb(); /* Update producer before checking event threshold. */ + if (i == netif->tx->event) + notify_via_evtchn(netif->evtchn); } static int make_rx_response(netif_t *netif, u16 id, s8 st, - unsigned long addr, + u16 offset, u16 size, u16 csum_valid) { - NETIF_RING_IDX i = netif->rx_resp_prod; - netif_rx_response_t *resp; - - resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp; - resp->addr = addr; - resp->csum_valid = csum_valid; - resp->id = id; - resp->status = (s16)size; - if ( st < 0 ) - resp->status = (s16)st; - wmb(); - netif->rx->resp_prod = netif->rx_resp_prod = ++i; - - mb(); /* Update producer before checking event threshold. 
*/ - return (i == netif->rx->event); + NETIF_RING_IDX i = netif->rx_resp_prod; + netif_rx_response_t *resp; + + resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp; + resp->offset = offset; + resp->csum_valid = csum_valid; + resp->id = id; + resp->status = (s16)size; + if (st < 0) + resp->status = (s16)st; + wmb(); + netif->rx->resp_prod = netif->rx_resp_prod = ++i; + + mb(); /* Update producer before checking event threshold. */ + return (i == netif->rx->event); } static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) { - struct list_head *ent; - netif_t *netif; - int i = 0; - - printk(KERN_ALERT "netif_schedule_list:\n"); - spin_lock_irq(&net_schedule_list_lock); - - list_for_each ( ent, &net_schedule_list ) - { - netif = list_entry(ent, netif_t, list); - printk(KERN_ALERT " %d: private(rx_req_cons=%08x rx_resp_prod=%08x\n", - i, netif->rx_req_cons, netif->rx_resp_prod); - printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n", - netif->tx_req_cons, netif->tx_resp_prod); - printk(KERN_ALERT " shared(rx_req_prod=%08x rx_resp_prod=%08x\n", - netif->rx->req_prod, netif->rx->resp_prod); - printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", - netif->rx->event, netif->tx->req_prod); - printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", - netif->tx->resp_prod, netif->tx->event); - i++; - } - - spin_unlock_irq(&net_schedule_list_lock); - printk(KERN_ALERT " ** End of netif_schedule_list **\n"); - - return IRQ_HANDLED; + struct list_head *ent; + netif_t *netif; + int i = 0; + + printk(KERN_ALERT "netif_schedule_list:\n"); + spin_lock_irq(&net_schedule_list_lock); + + list_for_each (ent, &net_schedule_list) { + netif = list_entry(ent, netif_t, list); + printk(KERN_ALERT " %d: private(rx_req_cons=%08x " + "rx_resp_prod=%08x\n", + i, netif->rx_req_cons, netif->rx_resp_prod); + printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n", + netif->tx_req_cons, netif->tx_resp_prod); + printk(KERN_ALERT " shared(rx_req_prod=%08x " + 
"rx_resp_prod=%08x\n", + netif->rx->req_prod, netif->rx->resp_prod); + printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", + netif->rx->event, netif->tx->req_prod); + printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", + netif->tx->resp_prod, netif->tx->event); + i++; + } + + spin_unlock_irq(&net_schedule_list_lock); + printk(KERN_ALERT " ** End of netif_schedule_list **\n"); + + return IRQ_HANDLED; } static int __init netback_init(void) { - int i; - struct page *page; - - if ( !(xen_start_info->flags & SIF_NET_BE_DOMAIN) && - !(xen_start_info->flags & SIF_INITDOMAIN) ) - return 0; - - IPRINTK("Initialising Xen netif backend.\n"); -#ifdef CONFIG_XEN_NETDEV_GRANT - IPRINTK("Using grant tables.\n"); -#endif - - /* We can increase reservation by this much in net_rx_action(). */ - balloon_update_driver_allowance(NETIF_RX_RING_SIZE); - - skb_queue_head_init(&rx_queue); - skb_queue_head_init(&tx_queue); - - init_timer(&net_timer); - net_timer.data = 0; - net_timer.function = net_alarm; + int i; + struct page *page; + + if (!(xen_start_info->flags & SIF_NET_BE_DOMAIN) && + !(xen_start_info->flags & SIF_INITDOMAIN)) + return 0; + + IPRINTK("Initialising Xen netif backend.\n"); + + /* We can increase reservation by this much in net_rx_action(). 
*/ + balloon_update_driver_allowance(NETIF_RX_RING_SIZE); + + skb_queue_head_init(&rx_queue); + skb_queue_head_init(&tx_queue); + + init_timer(&net_timer); + net_timer.data = 0; + net_timer.function = net_alarm; - page = balloon_alloc_empty_page_range(MAX_PENDING_REQS); - BUG_ON(page == NULL); - mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); - - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - { - page = virt_to_page(MMAP_VADDR(i)); - set_page_count(page, 1); - SetPageForeign(page, netif_page_release); - } - - pending_cons = 0; - pending_prod = MAX_PENDING_REQS; - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - pending_ring[i] = i; - - spin_lock_init(&net_schedule_list_lock); - INIT_LIST_HEAD(&net_schedule_list); - - netif_xenbus_init(); - - (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG), - netif_be_dbg, SA_SHIRQ, - "net-be-dbg", &netif_be_dbg); - - return 0; + page = balloon_alloc_empty_page_range(MAX_PENDING_REQS); + BUG_ON(page == NULL); + mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + + for (i = 0; i < MAX_PENDING_REQS; i++) { + page = virt_to_page(MMAP_VADDR(i)); + set_page_count(page, 1); + SetPageForeign(page, netif_page_release); + } + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + for (i = 0; i < MAX_PENDING_REQS; i++) + pending_ring[i] = i; + + spin_lock_init(&net_schedule_list_lock); + INIT_LIST_HEAD(&net_schedule_list); + + netif_xenbus_init(); + + (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG), + netif_be_dbg, SA_SHIRQ, + "net-be-dbg", &netif_be_dbg); + + return 0; } static void netback_cleanup(void) { - BUG(); + BUG(); } module_init(netback_init); module_exit(netback_cleanup); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c --- a/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Thu Sep 22 17:34:14 2005 +++ 
b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Thu Sep 22 17:42:01 2005 @@ -242,6 +242,7 @@ be->dev = dev; be->backend_watch.node = dev->nodename; be->backend_watch.callback = backend_changed; + /* Registration implicitly calls backend_changed. */ err = register_xenbus_watch(&be->backend_watch); if (err) { be->backend_watch.node = NULL; @@ -263,8 +264,6 @@ } dev->data = be; - - backend_changed(&be->backend_watch, dev->nodename); return 0; free_be: @@ -294,3 +293,13 @@ { xenbus_register_backend(&netback); } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Thu Sep 22 17:42:01 2005 @@ -54,43 +54,10 @@ #include <asm-xen/balloon.h> #include <asm/page.h> #include <asm/uaccess.h> - -#ifdef CONFIG_XEN_NETDEV_GRANT #include <asm-xen/xen-public/grant_table.h> #include <asm-xen/gnttab.h> -static grant_ref_t gref_tx_head; -static grant_ref_t grant_tx_ref[NETIF_TX_RING_SIZE + 1]; - -static grant_ref_t gref_rx_head; -static grant_ref_t grant_rx_ref[NETIF_RX_RING_SIZE + 1]; - #define GRANT_INVALID_REF (0xFFFF) - -#ifdef GRANT_DEBUG -static void -dump_packet(int tag, void *addr, u32 ap) -{ - unsigned char *p = (unsigned char *)ap; - int i; - - printk(KERN_ALERT "#### rx_poll %c %08x ", tag & 0xff, addr); - for (i = 0; i < 20; i++) { - printk("%02x", p[i]); - } - printk("\n"); -} - -#define GDPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ - __FILE__ , __LINE__ , ## _a ) -#else -#define dump_packet(x,y,z) ((void)0) -#define GDPRINTK(_f, _a...) 
((void)0) -#endif - -#endif - - #ifndef __GFP_NOWARN #define __GFP_NOWARN 0 @@ -124,7 +91,6 @@ #define NETIF_STATE_DISCONNECTED 0 #define NETIF_STATE_CONNECTED 1 - static unsigned int netif_state = NETIF_STATE_DISCONNECTED; static void network_tx_buf_gc(struct net_device *dev); @@ -147,45 +113,50 @@ #define netfront_info net_private struct net_private { - struct list_head list; - struct net_device *netdev; - - struct net_device_stats stats; - NETIF_RING_IDX rx_resp_cons, tx_resp_cons; - unsigned int tx_full; + struct list_head list; + struct net_device *netdev; + + struct net_device_stats stats; + NETIF_RING_IDX rx_resp_cons, tx_resp_cons; + unsigned int tx_full; - netif_tx_interface_t *tx; - netif_rx_interface_t *rx; - - spinlock_t tx_lock; - spinlock_t rx_lock; - - unsigned int handle; - unsigned int evtchn; - - /* What is the status of our connection to the remote backend? */ + netif_tx_interface_t *tx; + netif_rx_interface_t *rx; + + spinlock_t tx_lock; + spinlock_t rx_lock; + + unsigned int handle; + unsigned int evtchn; + + /* What is the status of our connection to the remote backend? */ #define BEST_CLOSED 0 #define BEST_DISCONNECTED 1 #define BEST_CONNECTED 2 - unsigned int backend_state; - - /* Is this interface open or closed (down or up)? */ + unsigned int backend_state; + + /* Is this interface open or closed (down or up)? */ #define UST_CLOSED 0 #define UST_OPEN 1 - unsigned int user_state; - - /* Receive-ring batched refills. */ + unsigned int user_state; + + /* Receive-ring batched refills. */ #define RX_MIN_TARGET 8 #define RX_MAX_TARGET NETIF_RX_RING_SIZE - int rx_min_target, rx_max_target, rx_target; - struct sk_buff_head rx_batch; - - /* - * {tx,rx}_skbs store outstanding skbuffs. The first entry in each - * array is an index into a chain of free entries. 
- */ - struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1]; - struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1]; + int rx_min_target, rx_max_target, rx_target; + struct sk_buff_head rx_batch; + + /* + * {tx,rx}_skbs store outstanding skbuffs. The first entry in each + * array is an index into a chain of free entries. + */ + struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1]; + struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1]; + + grant_ref_t gref_tx_head; + grant_ref_t grant_tx_ref[NETIF_TX_RING_SIZE + 1]; + grant_ref_t gref_rx_head; + grant_ref_t grant_rx_ref[NETIF_TX_RING_SIZE + 1]; struct xenbus_device *xbdev; char *backend; @@ -197,32 +168,32 @@ }; /* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */ -#define ADD_ID_TO_FREELIST(_list, _id) \ - (_list)[(_id)] = (_list)[0]; \ - (_list)[0] = (void *)(unsigned long)(_id); -#define GET_ID_FROM_FREELIST(_list) \ - ({ unsigned long _id = (unsigned long)(_list)[0]; \ - (_list)[0] = (_list)[_id]; \ - (unsigned short)_id; }) +#define ADD_ID_TO_FREELIST(_list, _id) \ + (_list)[(_id)] = (_list)[0]; \ + (_list)[0] = (void *)(unsigned long)(_id); +#define GET_ID_FROM_FREELIST(_list) \ + ({ unsigned long _id = (unsigned long)(_list)[0]; \ + (_list)[0] = (_list)[_id]; \ + (unsigned short)_id; }) #ifdef DEBUG static char *be_state_name[] = { - [BEST_CLOSED] = "closed", - [BEST_DISCONNECTED] = "disconnected", - [BEST_CONNECTED] = "connected", + [BEST_CLOSED] = "closed", + [BEST_DISCONNECTED] = "disconnected", + [BEST_CONNECTED] = "connected", }; #endif #ifdef DEBUG #define DPRINTK(fmt, args...) \ - printk(KERN_ALERT "xen_net (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args) + printk(KERN_ALERT "xen_net (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args) #else #define DPRINTK(fmt, args...) ((void)0) #endif #define IPRINTK(fmt, args...) \ - printk(KERN_INFO "xen_net: " fmt, ##args) + printk(KERN_INFO "xen_net: " fmt, ##args) #define WPRINTK(fmt, args...) 
\ - printk(KERN_WARNING "xen_net: " fmt, ##args) + printk(KERN_WARNING "xen_net: " fmt, ##args) /** Send a packet on a net device to encourage switches to learn the * MAC. We send a fake ARP request. @@ -232,628 +203,582 @@ */ static int send_fake_arp(struct net_device *dev) { - struct sk_buff *skb; - u32 src_ip, dst_ip; - - dst_ip = INADDR_BROADCAST; - src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK); - - /* No IP? Then nothing to do. */ - if (src_ip == 0) - return 0; - - skb = arp_create(ARPOP_REPLY, ETH_P_ARP, - dst_ip, dev, src_ip, - /*dst_hw*/ NULL, /*src_hw*/ NULL, - /*target_hw*/ dev->dev_addr); - if (skb == NULL) - return -ENOMEM; - - return dev_queue_xmit(skb); + struct sk_buff *skb; + u32 src_ip, dst_ip; + + dst_ip = INADDR_BROADCAST; + src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK); + + /* No IP? Then nothing to do. */ + if (src_ip == 0) + return 0; + + skb = arp_create(ARPOP_REPLY, ETH_P_ARP, + dst_ip, dev, src_ip, + /*dst_hw*/ NULL, /*src_hw*/ NULL, + /*target_hw*/ dev->dev_addr); + if (skb == NULL) + return -ENOMEM; + + return dev_queue_xmit(skb); } static int network_open(struct net_device *dev) { - struct net_private *np = netdev_priv(dev); - - memset(&np->stats, 0, sizeof(np->stats)); - - np->user_state = UST_OPEN; - - network_alloc_rx_buffers(dev); - np->rx->event = np->rx_resp_cons + 1; - - netif_start_queue(dev); - - return 0; + struct net_private *np = netdev_priv(dev); + + memset(&np->stats, 0, sizeof(np->stats)); + + np->user_state = UST_OPEN; + + network_alloc_rx_buffers(dev); + np->rx->event = np->rx_resp_cons + 1; + + netif_start_queue(dev); + + return 0; } static void network_tx_buf_gc(struct net_device *dev) { - NETIF_RING_IDX i, prod; - unsigned short id; - struct net_private *np = netdev_priv(dev); - struct sk_buff *skb; - - if (np->backend_state != BEST_CONNECTED) - return; - - do { - prod = np->tx->resp_prod; - rmb(); /* Ensure we see responses up to 'rp'. 
*/ - - for (i = np->tx_resp_cons; i != prod; i++) { - id = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id; - skb = np->tx_skbs[id]; -#ifdef CONFIG_XEN_NETDEV_GRANT - if (unlikely(gnttab_query_foreign_access(grant_tx_ref[id]) != 0)) { - /* other domain is still using this grant - shouldn't happen - but if it does, we'll try to reclaim the grant later */ - printk(KERN_ALERT "network_tx_buf_gc: warning -- grant " - "still in use by backend domain.\n"); - goto out; - } - gnttab_end_foreign_access_ref(grant_tx_ref[id], GNTMAP_readonly); - gnttab_release_grant_reference(&gref_tx_head, grant_tx_ref[id]); - grant_tx_ref[id] = GRANT_INVALID_REF; -#endif - ADD_ID_TO_FREELIST(np->tx_skbs, id); - dev_kfree_skb_irq(skb); - } + NETIF_RING_IDX i, prod; + unsigned short id; + struct net_private *np = netdev_priv(dev); + struct sk_buff *skb; + + if (np->backend_state != BEST_CONNECTED) + return; + + do { + prod = np->tx->resp_prod; + rmb(); /* Ensure we see responses up to 'rp'. */ + + for (i = np->tx_resp_cons; i != prod; i++) { + id = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id; + skb = np->tx_skbs[id]; + if (unlikely(gnttab_query_foreign_access( + np->grant_tx_ref[id]) != 0)) { + printk(KERN_ALERT "network_tx_buf_gc: warning " + "-- grant still in use by backend " + "domain.\n"); + goto out; + } + gnttab_end_foreign_access_ref( + np->grant_tx_ref[id], GNTMAP_readonly); + gnttab_release_grant_reference( + &np->gref_tx_head, np->grant_tx_ref[id]); + np->grant_tx_ref[id] = GRANT_INVALID_REF; + ADD_ID_TO_FREELIST(np->tx_skbs, id); + dev_kfree_skb_irq(skb); + } - np->tx_resp_cons = prod; + np->tx_resp_cons = prod; - /* - * Set a new event, then check for race with update of tx_cons. Note - * that it is essential to schedule a callback, no matter how few - * buffers are pending. Even if there is space in the transmit ring, - * higher layers may be blocked because too much data is outstanding: - * in such cases notification from Xen is likely to be the only kick - * that we'll get. 
- */ - np->tx->event = - prod + ((np->tx->req_prod - prod) >> 1) + 1; - mb(); - } while (prod != np->tx->resp_prod); - -#ifdef CONFIG_XEN_NETDEV_GRANT - out: -#endif - - if (np->tx_full && ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE)) { - np->tx_full = 0; - if (np->user_state == UST_OPEN) - netif_wake_queue(dev); - } + /* + * Set a new event, then check for race with update of tx_cons. + * Note that it is essential to schedule a callback, no matter + * how few buffers are pending. Even if there is space in the + * transmit ring, higher layers may be blocked because too much + * data is outstanding: in such cases notification from Xen is + * likely to be the only kick that we'll get. + */ + np->tx->event = prod + ((np->tx->req_prod - prod) >> 1) + 1; + mb(); + } while (prod != np->tx->resp_prod); + + out: + if (np->tx_full && ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE)) { + np->tx_full = 0; + if (np->user_state == UST_OPEN) + netif_wake_queue(dev); + } } static void network_alloc_rx_buffers(struct net_device *dev) { - unsigned short id; - struct net_private *np = netdev_priv(dev); - struct sk_buff *skb; - int i, batch_target; - NETIF_RING_IDX req_prod = np->rx->req_prod; - struct xen_memory_reservation reservation; -#ifdef CONFIG_XEN_NETDEV_GRANT - grant_ref_t ref; -#endif - - if (unlikely(np->backend_state != BEST_CONNECTED)) - return; - - /* - * Allocate skbuffs greedily, even though we batch updates to the - * receive ring. This creates a less bursty demand on the memory allocator, - * so should reduce the chance of failed allocation requests both for - * ourself and for other kernel subsystems. - */ - batch_target = np->rx_target - (req_prod - np->rx_resp_cons); - for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) { - if (unlikely((skb = alloc_xen_skb(dev->mtu + RX_HEADROOM)) == NULL)) - break; - __skb_queue_tail(&np->rx_batch, skb); - } - - /* Is the batch large enough to be worthwhile? 
*/ - if (i < (np->rx_target/2)) - return; - - for (i = 0; ; i++) { - if ((skb = __skb_dequeue(&np->rx_batch)) == NULL) - break; - - skb->dev = dev; - - id = GET_ID_FROM_FREELIST(np->rx_skbs); - - np->rx_skbs[id] = skb; + unsigned short id; + struct net_private *np = netdev_priv(dev); + struct sk_buff *skb; + int i, batch_target; + NETIF_RING_IDX req_prod = np->rx->req_prod; + struct xen_memory_reservation reservation; + grant_ref_t ref; + + if (unlikely(np->backend_state != BEST_CONNECTED)) + return; + + /* + * Allocate skbuffs greedily, even though we batch updates to the + * receive ring. This creates a less bursty demand on the memory + * allocator, so should reduce the chance of failed allocation requests + * both for ourself and for other kernel subsystems. + */ + batch_target = np->rx_target - (req_prod - np->rx_resp_cons); + for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) { + skb = alloc_xen_skb(dev->mtu + RX_HEADROOM); + if (skb == NULL) + break; + __skb_queue_tail(&np->rx_batch, skb); + } + + /* Is the batch large enough to be worthwhile? */ + if (i < (np->rx_target/2)) + return; + + for (i = 0; ; i++) { + if ((skb = __skb_dequeue(&np->rx_batch)) == NULL) + break; + + skb->dev = dev; + + id = GET_ID_FROM_FREELIST(np->rx_skbs); + + np->rx_skbs[id] = skb; - np->rx->ring[MASK_NETIF_RX_IDX(req_prod + i)].req.id = id; -#ifdef CONFIG_XEN_NETDEV_GRANT - ref = gnttab_claim_grant_reference(&gref_rx_head); - if (unlikely((signed short)ref < 0)) { - printk(KERN_ALERT "#### netfront can't claim rx reference\n"); - BUG(); - } - grant_rx_ref[id] = ref; - gnttab_grant_foreign_transfer_ref(ref, np->backend_id, - virt_to_mfn(skb->head)); - np->rx->ring[MASK_NETIF_RX_IDX(req_prod + i)].req.gref = ref; -#endif - rx_pfn_array[i] = virt_to_mfn(skb->head); - - /* Remove this page from pseudo phys map before passing back to Xen. 
*/ - phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] - = INVALID_P2M_ENTRY; - - MULTI_update_va_mapping(rx_mcl+i, (unsigned long)skb->head, - __pte(0), 0); - } - - /* After all PTEs have been zapped we blow away stale TLB entries. */ - rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; - - /* Give away a batch of pages. */ - rx_mcl[i].op = __HYPERVISOR_memory_op; - rx_mcl[i].args[0] = XENMEM_decrease_reservation; - rx_mcl[i].args[1] = (unsigned long)&reservation; - - reservation.extent_start = rx_pfn_array; - reservation.nr_extents = i; - reservation.extent_order = 0; - reservation.address_bits = 0; - reservation.domid = DOMID_SELF; - - /* Tell the ballon driver what is going on. */ - balloon_update_driver_allowance(i); - - /* Zap PTEs and give away pages in one big multicall. */ - (void)HYPERVISOR_multicall(rx_mcl, i+1); - - /* Check return status of HYPERVISOR_memory_op(). */ - if (unlikely(rx_mcl[i].result != i)) - panic("Unable to reduce memory reservation\n"); - - /* Above is a suitable barrier to ensure backend will see requests. */ - np->rx->req_prod = req_prod + i; - - /* Adjust our floating fill target if we risked running out of buffers. */ - if (((req_prod - np->rx->resp_prod) < (np->rx_target / 4)) && - ((np->rx_target *= 2) > np->rx_max_target)) - np->rx_target = np->rx_max_target; + np->rx->ring[MASK_NETIF_RX_IDX(req_prod + i)].req.id = id; + ref = gnttab_claim_grant_reference(&np->gref_rx_head); + BUG_ON((signed short)ref < 0); + np->grant_rx_ref[id] = ref; + gnttab_grant_foreign_transfer_ref(ref, np->backend_id); + np->rx->ring[MASK_NETIF_RX_IDX(req_prod + i)].req.gref = ref; + rx_pfn_array[i] = virt_to_mfn(skb->head); + + /* Remove this page from map before passing back to Xen. */ + phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] + = INVALID_P2M_ENTRY; + + MULTI_update_va_mapping(rx_mcl+i, (unsigned long)skb->head, + __pte(0), 0); + } + + /* After all PTEs have been zapped we blow away stale TLB entries. 
*/ + rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; + + /* Give away a batch of pages. */ + rx_mcl[i].op = __HYPERVISOR_memory_op; + rx_mcl[i].args[0] = XENMEM_decrease_reservation; + rx_mcl[i].args[1] = (unsigned long)&reservation; + + reservation.extent_start = rx_pfn_array; + reservation.nr_extents = i; + reservation.extent_order = 0; + reservation.address_bits = 0; + reservation.domid = DOMID_SELF; + + /* Tell the ballon driver what is going on. */ + balloon_update_driver_allowance(i); + + /* Zap PTEs and give away pages in one big multicall. */ + (void)HYPERVISOR_multicall(rx_mcl, i+1); + + /* Check return status of HYPERVISOR_memory_op(). */ + if (unlikely(rx_mcl[i].result != i)) + panic("Unable to reduce memory reservation\n"); + + /* Above is a suitable barrier to ensure backend will see requests. */ + np->rx->req_prod = req_prod + i; + + /* Adjust our fill target if we risked running out of buffers. */ + if (((req_prod - np->rx->resp_prod) < (np->rx_target / 4)) && + ((np->rx_target *= 2) > np->rx_max_target)) + np->rx_target = np->rx_max_target; } static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) { - unsigned short id; - struct net_private *np = netdev_priv(dev); - netif_tx_request_t *tx; - NETIF_RING_IDX i; -#ifdef CONFIG_XEN_NETDEV_GRANT - grant_ref_t ref; - unsigned long mfn; -#endif - - if (unlikely(np->tx_full)) { - printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name); - netif_stop_queue(dev); - goto drop; - } - - if (unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >= - PAGE_SIZE)) { - struct sk_buff *nskb; - if (unlikely((nskb = alloc_xen_skb(skb->len)) == NULL)) - goto drop; - skb_put(nskb, skb->len); - memcpy(nskb->data, skb->data, skb->len); - nskb->dev = skb->dev; - dev_kfree_skb(skb); - skb = nskb; - } + unsigned short id; + struct net_private *np = netdev_priv(dev); + netif_tx_request_t *tx; + NETIF_RING_IDX i; + grant_ref_t ref; + unsigned long mfn; + + if 
(unlikely(np->tx_full)) { + printk(KERN_ALERT "%s: full queue wasn't stopped!\n", + dev->name); + netif_stop_queue(dev); + goto drop; + } + + if (unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >= + PAGE_SIZE)) { + struct sk_buff *nskb; + if (unlikely((nskb = alloc_xen_skb(skb->len)) == NULL)) + goto drop; + skb_put(nskb, skb->len); + memcpy(nskb->data, skb->data, skb->len); + nskb->dev = skb->dev; + dev_kfree_skb(skb); + skb = nskb; + } - spin_lock_irq(&np->tx_lock); - - if (np->backend_state != BEST_CONNECTED) { - spin_unlock_irq(&np->tx_lock); - goto drop; - } - - i = np->tx->req_prod; - - id = GET_ID_FROM_FREELIST(np->tx_skbs); - np->tx_skbs[id] = skb; - - tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req; - - tx->id = id; -#ifdef CONFIG_XEN_NETDEV_GRANT - ref = gnttab_claim_grant_reference(&gref_tx_head); - if (unlikely((signed short)ref < 0)) { - printk(KERN_ALERT "#### netfront can't claim tx grant reference\n"); - BUG(); - } - mfn = virt_to_mfn(skb->data); - gnttab_grant_foreign_access_ref(ref, np->backend_id, mfn, GNTMAP_readonly); - tx->addr = ref << PAGE_SHIFT; - grant_tx_ref[id] = ref; -#else - tx->addr = virt_to_mfn(skb->data) << PAGE_SHIFT; -#endif - tx->addr |= (unsigned long)skb->data & ~PAGE_MASK; - tx->size = skb->len; - tx->csum_blank = (skb->ip_summed == CHECKSUM_HW); - - wmb(); /* Ensure that backend will see the request. */ - np->tx->req_prod = i + 1; - - network_tx_buf_gc(dev); - - if ((i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1)) { - np->tx_full = 1; - netif_stop_queue(dev); - } - - spin_unlock_irq(&np->tx_lock); - - np->stats.tx_bytes += skb->len; - np->stats.tx_packets++; - - /* Only notify Xen if we really have to. 
*/ - mb(); - if (np->tx->TX_TEST_IDX == i) - notify_via_evtchn(np->evtchn); - - return 0; + spin_lock_irq(&np->tx_lock); + + if (np->backend_state != BEST_CONNECTED) { + spin_unlock_irq(&np->tx_lock); + goto drop; + } + + i = np->tx->req_prod; + + id = GET_ID_FROM_FREELIST(np->tx_skbs); + np->tx_skbs[id] = skb; + + tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req; + + tx->id = id; + ref = gnttab_claim_grant_reference(&np->gref_tx_head); + BUG_ON((signed short)ref < 0); + mfn = virt_to_mfn(skb->data); + gnttab_grant_foreign_access_ref( + ref, np->backend_id, mfn, GNTMAP_readonly); + tx->gref = np->grant_tx_ref[id] = ref; + tx->offset = (unsigned long)skb->data & ~PAGE_MASK; + tx->size = skb->len; + tx->csum_blank = (skb->ip_summed == CHECKSUM_HW); + + wmb(); /* Ensure that backend will see the request. */ + np->tx->req_prod = i + 1; + + network_tx_buf_gc(dev); + + if ((i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1)) { + np->tx_full = 1; + netif_stop_queue(dev); + } + + spin_unlock_irq(&np->tx_lock); + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + + /* Only notify Xen if we really have to. 
*/ + mb(); + if (np->tx->TX_TEST_IDX == i) + notify_via_evtchn(np->evtchn); + + return 0; drop: - np->stats.tx_dropped++; - dev_kfree_skb(skb); - return 0; + np->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; } static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs) { - struct net_device *dev = dev_id; - struct net_private *np = netdev_priv(dev); - unsigned long flags; - - spin_lock_irqsave(&np->tx_lock, flags); - network_tx_buf_gc(dev); - spin_unlock_irqrestore(&np->tx_lock, flags); - - if((np->rx_resp_cons != np->rx->resp_prod) && (np->user_state == UST_OPEN)) - netif_rx_schedule(dev); - - return IRQ_HANDLED; + struct net_device *dev = dev_id; + struct net_private *np = netdev_priv(dev); + unsigned long flags; + + spin_lock_irqsave(&np->tx_lock, flags); + network_tx_buf_gc(dev); + spin_unlock_irqrestore(&np->tx_lock, flags); + + if ((np->rx_resp_cons != np->rx->resp_prod) && + (np->user_state == UST_OPEN)) + netif_rx_schedule(dev); + + return IRQ_HANDLED; } static int netif_poll(struct net_device *dev, int *pbudget) { - struct net_private *np = netdev_priv(dev); - struct sk_buff *skb, *nskb; - netif_rx_response_t *rx; - NETIF_RING_IDX i, rp; - mmu_update_t *mmu = rx_mmu; - multicall_entry_t *mcl = rx_mcl; - int work_done, budget, more_to_do = 1; - struct sk_buff_head rxq; - unsigned long flags; -#ifdef CONFIG_XEN_NETDEV_GRANT - unsigned long mfn; - grant_ref_t ref; -#endif - - spin_lock(&np->rx_lock); - - if (np->backend_state != BEST_CONNECTED) { - spin_unlock(&np->rx_lock); - return 0; - } - - skb_queue_head_init(&rxq); - - if ((budget = *pbudget) > dev->quota) - budget = dev->quota; - rp = np->rx->resp_prod; - rmb(); /* Ensure we see queued responses up to 'rp'. */ - - for (i = np->rx_resp_cons, work_done = 0; - (i != rp) && (work_done < budget); - i++, work_done++) { - rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp; - /* - * An error here is very odd. 
Usually indicates a backend bug, - * low-memory condition, or that we didn't have reservation headroom. - */ - if (unlikely(rx->status <= 0)) { - if (net_ratelimit()) - printk(KERN_WARNING "Bad rx buffer (memory squeeze?).\n"); - np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id; - wmb(); - np->rx->req_prod++; - work_done--; - continue; - } - -#ifdef CONFIG_XEN_NETDEV_GRANT - ref = grant_rx_ref[rx->id]; - - if(ref == GRANT_INVALID_REF) { - printk(KERN_WARNING "Bad rx grant reference %d from dom %d.\n", - ref, np->backend_id); - np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id; - wmb(); - np->rx->req_prod++; - work_done--; - continue; - } - - grant_rx_ref[rx->id] = GRANT_INVALID_REF; - mfn = gnttab_end_foreign_transfer_ref(ref); - gnttab_release_grant_reference(&gref_rx_head, ref); -#endif - - skb = np->rx_skbs[rx->id]; - ADD_ID_TO_FREELIST(np->rx_skbs, rx->id); - - /* NB. We handle skb overflow later. */ -#ifdef CONFIG_XEN_NETDEV_GRANT - skb->data = skb->head + rx->addr; -#else - skb->data = skb->head + (rx->addr & ~PAGE_MASK); -#endif - skb->len = rx->status; - skb->tail = skb->data + skb->len; - - if ( rx->csum_valid ) - skb->ip_summed = CHECKSUM_UNNECESSARY; - - np->stats.rx_packets++; - np->stats.rx_bytes += rx->status; - - /* Remap the page. 
*/ -#ifdef CONFIG_XEN_NETDEV_GRANT - mmu->ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; -#else - mmu->ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE; -#endif - mmu->val = __pa(skb->head) >> PAGE_SHIFT; - mmu++; -#ifdef CONFIG_XEN_NETDEV_GRANT - MULTI_update_va_mapping(mcl, (unsigned long)skb->head, - pfn_pte_ma(mfn, PAGE_KERNEL), 0); -#else - MULTI_update_va_mapping(mcl, (unsigned long)skb->head, - pfn_pte_ma(rx->addr >> PAGE_SHIFT, - PAGE_KERNEL), 0); -#endif - mcl++; - -#ifdef CONFIG_XEN_NETDEV_GRANT - phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = mfn; - GDPRINTK("#### rx_poll enqueue vdata=%p mfn=%lu ref=%x\n", - skb->data, mfn, ref); -#else - phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = - rx->addr >> PAGE_SHIFT; -#endif - - - __skb_queue_tail(&rxq, skb); - } - - - /* Some pages are no longer absent... */ - balloon_update_driver_allowance(-work_done); - - /* Do all the remapping work, and M->P updates, in one big hypercall. */ - if (likely((mcl - rx_mcl) != 0)) { - mcl->op = __HYPERVISOR_mmu_update; - mcl->args[0] = (unsigned long)rx_mmu; - mcl->args[1] = mmu - rx_mmu; - mcl->args[2] = 0; - mcl->args[3] = DOMID_SELF; - mcl++; - (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); - } - - while ((skb = __skb_dequeue(&rxq)) != NULL) { -#ifdef CONFIG_XEN_NETDEV_GRANT - GDPRINTK("#### rx_poll dequeue vdata=%p mfn=%lu\n", - skb->data, virt_to_mfn(skb->data)); - dump_packet('d', skb->data, (unsigned long)skb->data); -#endif - /* - * Enough room in skbuff for the data we were passed? Also, Linux - * expects at least 16 bytes headroom in each receive buffer. - */ - if (unlikely(skb->tail > skb->end) || - unlikely((skb->data - skb->head) < 16)) { - nskb = NULL; - - - /* Only copy the packet if it fits in the current MTU. 
*/ - if (skb->len <= (dev->mtu + ETH_HLEN)) { - if ((skb->tail > skb->end) && net_ratelimit()) - printk(KERN_INFO "Received packet needs %zd bytes more " - "headroom.\n", skb->tail - skb->end); - - if ((nskb = alloc_xen_skb(skb->len + 2)) != NULL) { - skb_reserve(nskb, 2); - skb_put(nskb, skb->len); - memcpy(nskb->data, skb->data, skb->len); - nskb->dev = skb->dev; - } - } - else if (net_ratelimit()) - printk(KERN_INFO "Received packet too big for MTU " - "(%d > %d)\n", skb->len - ETH_HLEN, dev->mtu); - - /* Reinitialise and then destroy the old skbuff. */ - skb->len = 0; - skb->tail = skb->data; - init_skb_shinfo(skb); - dev_kfree_skb(skb); - - /* Switch old for new, if we copied the buffer. */ - if ((skb = nskb) == NULL) - continue; - } + struct net_private *np = netdev_priv(dev); + struct sk_buff *skb, *nskb; + netif_rx_response_t *rx; + NETIF_RING_IDX i, rp; + mmu_update_t *mmu = rx_mmu; + multicall_entry_t *mcl = rx_mcl; + int work_done, budget, more_to_do = 1; + struct sk_buff_head rxq; + unsigned long flags; + unsigned long mfn; + grant_ref_t ref; + + spin_lock(&np->rx_lock); + + if (np->backend_state != BEST_CONNECTED) { + spin_unlock(&np->rx_lock); + return 0; + } + + skb_queue_head_init(&rxq); + + if ((budget = *pbudget) > dev->quota) + budget = dev->quota; + rp = np->rx->resp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. */ + + for (i = np->rx_resp_cons, work_done = 0; + (i != rp) && (work_done < budget); + i++, work_done++) { + rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp; + /* + * An error here is very odd. Usually indicates a backend bug, + * low-mem condition, or we didn't have reservation headroom. + */ + if (unlikely(rx->status <= 0)) { + if (net_ratelimit()) + printk(KERN_WARNING "Bad rx buffer " + "(memory squeeze?).\n"); + np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)]. 
+ req.id = rx->id; + wmb(); + np->rx->req_prod++; + work_done--; + continue; + } + + ref = np->grant_rx_ref[rx->id]; + + if(ref == GRANT_INVALID_REF) { + printk(KERN_WARNING "Bad rx grant reference %d " + "from dom %d.\n", + ref, np->backend_id); + np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)]. + req.id = rx->id; + wmb(); + np->rx->req_prod++; + work_done--; + continue; + } + + np->grant_rx_ref[rx->id] = GRANT_INVALID_REF; + mfn = gnttab_end_foreign_transfer_ref(ref); + gnttab_release_grant_reference(&np->gref_rx_head, ref); + + skb = np->rx_skbs[rx->id]; + ADD_ID_TO_FREELIST(np->rx_skbs, rx->id); + + /* NB. We handle skb overflow later. */ + skb->data = skb->head + rx->offset; + skb->len = rx->status; + skb->tail = skb->data + skb->len; + + if ( rx->csum_valid ) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + np->stats.rx_packets++; + np->stats.rx_bytes += rx->status; + + /* Remap the page. */ + mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + mmu->val = __pa(skb->head) >> PAGE_SHIFT; + mmu++; + MULTI_update_va_mapping(mcl, (unsigned long)skb->head, + pfn_pte_ma(mfn, PAGE_KERNEL), 0); + mcl++; + + phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = mfn; + + __skb_queue_tail(&rxq, skb); + } + + /* Some pages are no longer absent... */ + balloon_update_driver_allowance(-work_done); + + /* Do all the remapping work, and M2P updates, in one big hypercall. */ + if (likely((mcl - rx_mcl) != 0)) { + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)rx_mmu; + mcl->args[1] = mmu - rx_mmu; + mcl->args[2] = 0; + mcl->args[3] = DOMID_SELF; + mcl++; + (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); + } + + while ((skb = __skb_dequeue(&rxq)) != NULL) { + /* + * Enough room in skbuff for the data we were passed? Also, + * Linux expects at least 16 bytes headroom in each rx buffer. + */ + if (unlikely(skb->tail > skb->end) || + unlikely((skb->data - skb->head) < 16)) { + nskb = NULL; + + /* Only copy the packet if it fits in the MTU. 
*/ + if (skb->len <= (dev->mtu + ETH_HLEN)) { + if ((skb->tail > skb->end) && net_ratelimit()) + printk(KERN_INFO "Received packet " + "needs %zd bytes more " + "headroom.\n", + skb->tail - skb->end); + + nskb = alloc_xen_skb(skb->len + 2); + if (nskb != NULL) { + skb_reserve(nskb, 2); + skb_put(nskb, skb->len); + memcpy(nskb->data, + skb->data, + skb->len); + nskb->dev = skb->dev; + } + } + else if (net_ratelimit()) + printk(KERN_INFO "Received packet too big for " + "MTU (%d > %d)\n", + skb->len - ETH_HLEN, dev->mtu); + + /* Reinitialise and then destroy the old skbuff. */ + skb->len = 0; + skb->tail = skb->data; + init_skb_shinfo(skb); + dev_kfree_skb(skb); + + /* Switch old for new, if we copied the buffer. */ + if ((skb = nskb) == NULL) + continue; + } - /* Set the shared-info area, which is hidden behind the real data. */ - init_skb_shinfo(skb); - /* Ethernet-specific work. Delayed to here as it peeks the header. */ - skb->protocol = eth_type_trans(skb, dev); - - /* Pass it up. */ - netif_receive_skb(skb); - dev->last_rx = jiffies; - } - - np->rx_resp_cons = i; - - /* If we get a callback with very few responses, reduce fill target. */ - /* NB. Note exponential increase, linear decrease. */ - if (((np->rx->req_prod - np->rx->resp_prod) > ((3*np->rx_target) / 4)) && - (--np->rx_target < np->rx_min_target)) - np->rx_target = np->rx_min_target; - - network_alloc_rx_buffers(dev); - - *pbudget -= work_done; - dev->quota -= work_done; - - if (work_done < budget) { - local_irq_save(flags); - - np->rx->event = i + 1; + /* Set the shinfo area, which is hidden behind the data. */ + init_skb_shinfo(skb); + /* Ethernet work: Delayed to here as it peeks the header. */ + skb->protocol = eth_type_trans(skb, dev); + + /* Pass it up. */ + netif_receive_skb(skb); + dev->last_rx = jiffies; + } + + np->rx_resp_cons = i; + + /* If we get a callback with very few responses, reduce fill target. */ + /* NB. Note exponential increase, linear decrease. 
*/ + if (((np->rx->req_prod - np->rx->resp_prod) > + ((3*np->rx_target) / 4)) && + (--np->rx_target < np->rx_min_target)) + np->rx_target = np->rx_min_target; + + network_alloc_rx_buffers(dev); + + *pbudget -= work_done; + dev->quota -= work_done; + + if (work_done < budget) { + local_irq_save(flags); + + np->rx->event = i + 1; - /* Deal with hypervisor racing our resetting of rx_event. */ - mb(); - if (np->rx->resp_prod == i) { - __netif_rx_complete(dev); - more_to_do = 0; - } - - local_irq_restore(flags); - } - - spin_unlock(&np->rx_lock); - - return more_to_do; + /* Deal with hypervisor racing our resetting of rx_event. */ + mb(); + if (np->rx->resp_prod == i) { + __netif_rx_complete(dev); + more_to_do = 0; + } + + local_irq_restore(flags); + } + + spin_unlock(&np->rx_lock); + + return more_to_do; } static int network_close(struct net_device *dev) { - struct net_private *np = netdev_priv(dev); - np->user_state = UST_CLOSED; - netif_stop_queue(np->netdev); - return 0; + struct net_private *np = netdev_priv(dev); + np->user_state = UST_CLOSED; + netif_stop_queue(np->netdev); + return 0; } static struct net_device_stats *network_get_stats(struct net_device *dev) { - struct net_private *np = netdev_priv(dev); - return &np->stats; + struct net_private *np = netdev_priv(dev); + return &np->stats; } static void network_connect(struct net_device *dev) { - struct net_private *np; - int i, requeue_idx; - netif_tx_request_t *tx; - - np = netdev_priv(dev); - spin_lock_irq(&np->tx_lock); - spin_lock(&np->rx_lock); - - /* Recovery procedure: */ - - /* Step 1: Reinitialise variables. */ - np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; - np->rx->event = np->tx->event = 1; - - /* Step 2: Rebuild the RX and TX ring contents. - * NB. We could just free the queued TX packets now but we hope - * that sending them out might do some good. We have to rebuild - * the RX ring because some of our pages are currently flipped out - * so we can't just free the RX skbs. - * NB2. 
Freelist index entries are always going to be less than - * __PAGE_OFFSET, whereas pointers to skbs will always be equal or - * greater than __PAGE_OFFSET: we use this property to distinguish - * them. - */ - - /* Rebuild the TX buffer freelist and the TX ring itself. - * NB. This reorders packets. We could keep more private state - * to avoid this but maybe it doesn't matter so much given the - * interface has been down. - */ - for (requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++) { - if ((unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET) { - struct sk_buff *skb = np->tx_skbs[i]; - - tx = &np->tx->ring[requeue_idx++].req; - - tx->id = i; -#ifdef CONFIG_XEN_NETDEV_GRANT - gnttab_grant_foreign_access_ref(grant_tx_ref[i], np->backend_id, - virt_to_mfn(np->tx_skbs[i]->data), - GNTMAP_readonly); - tx->addr = grant_tx_ref[i] << PAGE_SHIFT; -#else - tx->addr = virt_to_mfn(skb->data) << PAGE_SHIFT; -#endif - tx->addr |= (unsigned long)skb->data & ~PAGE_MASK; - tx->size = skb->len; - - np->stats.tx_bytes += skb->len; - np->stats.tx_packets++; - } - } - wmb(); - np->tx->req_prod = requeue_idx; - - /* Rebuild the RX buffer freelist and the RX ring itself. */ - for (requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++) { - if ((unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET) { -#ifdef CONFIG_XEN_NETDEV_GRANT - /* Reinstate the grant ref so backend can 'donate' mfn to us. */ - gnttab_grant_foreign_transfer_ref(grant_rx_ref[i], np->backend_id, - virt_to_mfn(np->rx_skbs[i]->head) - ); - np->rx->ring[requeue_idx].req.gref = grant_rx_ref[i]; -#endif - np->rx->ring[requeue_idx].req.id = i; - requeue_idx++; - } - } - - wmb(); - np->rx->req_prod = requeue_idx; - - /* Step 3: All public and private state should now be sane. Get - * ready to start sending and receiving packets and give the driver - * domain a kick because we've probably just requeued some - * packets. 
- */ - np->backend_state = BEST_CONNECTED; - wmb(); - notify_via_evtchn(np->evtchn); - network_tx_buf_gc(dev); - - if (np->user_state == UST_OPEN) - netif_start_queue(dev); - - spin_unlock(&np->rx_lock); - spin_unlock_irq(&np->tx_lock); + struct net_private *np; + int i, requeue_idx; + netif_tx_request_t *tx; + + np = netdev_priv(dev); + spin_lock_irq(&np->tx_lock); + spin_lock(&np->rx_lock); + + /* Recovery procedure: */ + + /* Step 1: Reinitialise variables. */ + np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; + np->rx->event = np->tx->event = 1; + + /* Step 2: Rebuild the RX and TX ring contents. + * NB. We could just free the queued TX packets now but we hope + * that sending them out might do some good. We have to rebuild + * the RX ring because some of our pages are currently flipped out + * so we can't just free the RX skbs. + * NB2. Freelist index entries are always going to be less than + * __PAGE_OFFSET, whereas pointers to skbs will always be equal or + * greater than __PAGE_OFFSET: we use this property to distinguish + * them. + */ + + /* Rebuild the TX buffer freelist and the TX ring itself. + * NB. This reorders packets. We could keep more private state + * to avoid this but maybe it doesn't matter so much given the + * interface has been down. + */ + for (requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++) { + if ((unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET) { + struct sk_buff *skb = np->tx_skbs[i]; + + tx = &np->tx->ring[requeue_idx++].req; + + tx->id = i; + gnttab_grant_foreign_access_ref( + np->grant_tx_ref[i], np->backend_id, + virt_to_mfn(np->tx_skbs[i]->data), + GNTMAP_readonly); + tx->gref = np->grant_tx_ref[i]; + tx->offset = (unsigned long)skb->data & ~PAGE_MASK; + tx->size = skb->len; + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + } + } + wmb(); + np->tx->req_prod = requeue_idx; + + /* Rebuild the RX buffer freelist and the RX ring itself. 
*/ + for (requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++) { + if ((unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET) { + gnttab_grant_foreign_transfer_ref( + np->grant_rx_ref[i], np->backend_id); + np->rx->ring[requeue_idx].req.gref = + np->grant_rx_ref[i]; + np->rx->ring[requeue_idx].req.id = i; + requeue_idx++; + } + } + + wmb(); + np->rx->req_prod = requeue_idx; + + /* Step 3: All public and private state should now be sane. Get + * ready to start sending and receiving packets and give the driver + * domain a kick because we've probably just requeued some + * packets. + */ + np->backend_state = BEST_CONNECTED; + wmb(); + notify_via_evtchn(np->evtchn); + network_tx_buf_gc(dev); + + if (np->user_state == UST_OPEN) + netif_start_queue(dev); + + spin_unlock(&np->rx_lock); + spin_unlock_irq(&np->tx_lock); } static void show_device(struct net_private *np) @@ -890,6 +815,13 @@ show_device(np); } +static void netif_uninit(struct net_device *dev) +{ + struct net_private *np = netdev_priv(dev); + gnttab_free_grant_references(np->gref_tx_head); + gnttab_free_grant_references(np->gref_rx_head); +} + static struct ethtool_ops network_ethtool_ops = { .get_tx_csum = ethtool_op_get_tx_csum, @@ -904,84 +836,99 @@ static int create_netdev(int handle, struct xenbus_device *dev, struct net_device **val) { - int i, err = 0; - struct net_device *netdev = NULL; - struct net_private *np = NULL; - - if ((netdev = alloc_etherdev(sizeof(struct net_private))) == NULL) { - printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__); - err = -ENOMEM; - goto exit; - } - - np = netdev_priv(netdev); - np->backend_state = BEST_CLOSED; - np->user_state = UST_CLOSED; - np->handle = handle; - np->xbdev = dev; + int i, err = 0; + struct net_device *netdev = NULL; + struct net_private *np = NULL; + + if ((netdev = alloc_etherdev(sizeof(struct net_private))) == NULL) { + printk(KERN_WARNING "%s> alloc_etherdev failed.\n", + __FUNCTION__); + err = -ENOMEM; + goto exit; + } + + np = 
netdev_priv(netdev); + np->backend_state = BEST_CLOSED; + np->user_state = UST_CLOSED; + np->handle = handle; + np->xbdev = dev; - spin_lock_init(&np->tx_lock); - spin_lock_init(&np->rx_lock); - - skb_queue_head_init(&np->rx_batch); - np->rx_target = RX_MIN_TARGET; - np->rx_min_target = RX_MIN_TARGET; - np->rx_max_target = RX_MAX_TARGET; - - /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */ - for (i = 0; i <= NETIF_TX_RING_SIZE; i++) { - np->tx_skbs[i] = (void *)((unsigned long) i+1); -#ifdef CONFIG_XEN_NETDEV_GRANT - grant_tx_ref[i] = GRANT_INVALID_REF; -#endif - } - - for (i = 0; i <= NETIF_RX_RING_SIZE; i++) { - np->rx_skbs[i] = (void *)((unsigned long) i+1); -#ifdef CONFIG_XEN_NETDEV_GRANT - grant_rx_ref[i] = GRANT_INVALID_REF; -#endif - } - - netdev->open = network_open; - netdev->hard_start_xmit = network_start_xmit; - netdev->stop = network_close; - netdev->get_stats = network_get_stats; - netdev->poll = netif_poll; - netdev->weight = 64; - netdev->features = NETIF_F_IP_CSUM; - - SET_ETHTOOL_OPS(netdev, &network_ethtool_ops); - - if ((err = register_netdev(netdev)) != 0) { - printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err); - goto exit; - } - - if ((err = xennet_proc_addif(netdev)) != 0) { - unregister_netdev(netdev); - goto exit; - } - - np->netdev = netdev; - - exit: - if ((err != 0) && (netdev != NULL)) - kfree(netdev); - else if (val != NULL) - *val = netdev; - return err; + spin_lock_init(&np->tx_lock); + spin_lock_init(&np->rx_lock); + + skb_queue_head_init(&np->rx_batch); + np->rx_target = RX_MIN_TARGET; + np->rx_min_target = RX_MIN_TARGET; + np->rx_max_target = RX_MAX_TARGET; + + /* Initialise {tx,rx}_skbs as a free chain containing every entry. 
*/ + for (i = 0; i <= NETIF_TX_RING_SIZE; i++) { + np->tx_skbs[i] = (void *)((unsigned long) i+1); + np->grant_tx_ref[i] = GRANT_INVALID_REF; + } + + for (i = 0; i <= NETIF_RX_RING_SIZE; i++) { + np->rx_skbs[i] = (void *)((unsigned long) i+1); + np->grant_rx_ref[i] = GRANT_INVALID_REF; + } + + /* A grant for every tx ring slot */ + if (gnttab_alloc_grant_references(NETIF_TX_RING_SIZE, + &np->gref_tx_head) < 0) { + printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n"); + goto exit; + } + /* A grant for every rx ring slot */ + if (gnttab_alloc_grant_references(NETIF_RX_RING_SIZE, + &np->gref_rx_head) < 0) { + printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n"); + gnttab_free_grant_references(np->gref_tx_head); + goto exit; + } + + netdev->open = network_open; + netdev->hard_start_xmit = network_start_xmit; + netdev->stop = network_close; + netdev->get_stats = network_get_stats; + netdev->poll = netif_poll; + netdev->uninit = netif_uninit; + netdev->weight = 64; + netdev->features = NETIF_F_IP_CSUM; + + SET_ETHTOOL_OPS(netdev, &network_ethtool_ops); + + if ((err = register_netdev(netdev)) != 0) { + printk(KERN_WARNING "%s> register_netdev err=%d\n", + __FUNCTION__, err); + goto exit_free_grefs; + } + + if ((err = xennet_proc_addif(netdev)) != 0) { + unregister_netdev(netdev); + goto exit_free_grefs; + } + + np->netdev = netdev; + + exit: + if ((err != 0) && (netdev != NULL)) + kfree(netdev); + else if (val != NULL) + *val = netdev; + return err; + + exit_free_grefs: + gnttab_free_grant_references(np->gref_tx_head); + gnttab_free_grant_references(np->gref_rx_head); + goto exit; } static int destroy_netdev(struct net_device *netdev) { - #ifdef CONFIG_PROC_FS xennet_proc_delif(netdev); #endif - unregister_netdev(netdev); - return 0; } @@ -992,20 +939,20 @@ static int inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr) { - struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; - struct net_device *dev = ifa->ifa_dev->dev; - - /* UP 
event and is it one of our devices? */ - if (event == NETDEV_UP && dev->open == network_open) - (void)send_fake_arp(dev); + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net_device *dev = ifa->ifa_dev->dev; + + /* UP event and is it one of our devices? */ + if (event == NETDEV_UP && dev->open == network_open) + (void)send_fake_arp(dev); - return NOTIFY_DONE; + return NOTIFY_DONE; } static struct notifier_block notifier_inetdev = { - .notifier_call = inetdev_notify, - .next = NULL, - .priority = 0 + .notifier_call = inetdev_notify, + .next = NULL, + .priority = 0 }; static struct xenbus_device_id netfront_ids[] = { @@ -1022,10 +969,8 @@ evtchn_op_t op = { .cmd = EVTCHNOP_alloc_unbound }; int err; -#ifdef CONFIG_XEN_NETDEV_GRANT info->tx_ring_ref = GRANT_INVALID_REF; info->rx_ring_ref = GRANT_INVALID_REF; -#endif info->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); if (info->tx == 0) { @@ -1043,7 +988,6 @@ memset(info->rx, 0, PAGE_SIZE); info->backend_state = BEST_DISCONNECTED; -#ifdef CONFIG_XEN_NETDEV_GRANT err = gnttab_grant_foreign_access(info->backend_id, virt_to_mfn(info->tx), 0); if (err < 0) { @@ -1059,11 +1003,6 @@ goto out; } info->rx_ring_ref = err; - -#else - info->tx_ring_ref = virt_to_mfn(info->tx); - info->rx_ring_ref = virt_to_mfn(info->rx); -#endif op.u.alloc_unbound.dom = info->backend_id; err = HYPERVISOR_event_channel_op(&op); @@ -1082,7 +1021,6 @@ free_page((unsigned long)info->rx); info->rx = 0; -#ifdef CONFIG_XEN_NETDEV_GRANT if (info->tx_ring_ref != GRANT_INVALID_REF) gnttab_end_foreign_access(info->tx_ring_ref, 0); info->tx_ring_ref = GRANT_INVALID_REF; @@ -1090,7 +1028,6 @@ if (info->rx_ring_ref != GRANT_INVALID_REF) gnttab_end_foreign_access(info->rx_ring_ref, 0); info->rx_ring_ref = GRANT_INVALID_REF; -#endif return err; } @@ -1104,7 +1041,6 @@ free_page((unsigned long)info->rx); info->rx = 0; -#ifdef CONFIG_XEN_NETDEV_GRANT if (info->tx_ring_ref != GRANT_INVALID_REF) gnttab_end_foreign_access(info->tx_ring_ref, 0); 
info->tx_ring_ref = GRANT_INVALID_REF; @@ -1112,7 +1048,6 @@ if (info->rx_ring_ref != GRANT_INVALID_REF) gnttab_end_foreign_access(info->rx_ring_ref, 0); info->rx_ring_ref = GRANT_INVALID_REF; -#endif unbind_evtchn_from_irqhandler(info->evtchn, info->netdev); info->evtchn = 0; @@ -1282,10 +1217,6 @@ return err; } - - /* Call once in case entries already there. */ - watch_for_status(&info->watch, info->watch.node); - return 0; } @@ -1344,72 +1275,50 @@ static int wait_for_netif(void) { - int err = 0; - int i; - - /* - * We should figure out how many and which devices we need to - * proceed and only wait for those. For now, continue once the - * first device is around. - */ - for ( i=0; netif_state != NETIF_STATE_CONNECTED && (i < 10*HZ); i++ ) - { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); - } - - if (netif_state != NETIF_STATE_CONNECTED) { - WPRINTK("Timeout connecting to device!\n"); - err = -ENOSYS; - } - return err; + int err = 0; + int i; + + /* + * We should figure out how many and which devices we need to + * proceed and only wait for those. For now, continue once the + * first device is around. 
+ */ + for ( i=0; netif_state != NETIF_STATE_CONNECTED && (i < 10*HZ); i++ ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + if (netif_state != NETIF_STATE_CONNECTED) { + WPRINTK("Timeout connecting to device!\n"); + err = -ENOSYS; + } + return err; } static int __init netif_init(void) { - int err = 0; - - if (xen_start_info->flags & SIF_INITDOMAIN) - return 0; - - if ((err = xennet_proc_init()) != 0) - return err; - - IPRINTK("Initialising virtual ethernet driver.\n"); - -#ifdef CONFIG_XEN_NETDEV_GRANT - IPRINTK("Using grant tables.\n"); - - /* A grant for every tx ring slot */ - if (gnttab_alloc_grant_references(NETIF_TX_RING_SIZE, - &gref_tx_head) < 0) { - printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n"); - return 1; - } - /* A grant for every rx ring slot */ - if (gnttab_alloc_grant_references(NETIF_RX_RING_SIZE, - &gref_rx_head) < 0) { - printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n"); - return 1; - } -#endif - - - (void)register_inetaddr_notifier(¬ifier_inetdev); - - init_net_xenbus(); - - wait_for_netif(); - - return err; + int err = 0; + + if (xen_start_info->flags & SIF_INITDOMAIN) + return 0; + + if ((err = xennet_proc_init()) != 0) + return err; + + IPRINTK("Initialising virtual ethernet driver.\n"); + + (void)register_inetaddr_notifier(¬ifier_inetdev); + + init_net_xenbus(); + + wait_for_netif(); + + return err; } static void netif_exit(void) { -#ifdef CONFIG_XEN_NETDEV_GRANT - gnttab_free_grant_references(gref_tx_head); - gnttab_free_grant_references(gref_rx_head); -#endif } #ifdef CONFIG_PROC_FS @@ -1419,147 +1328,159 @@ #define TARGET_CUR 2UL static int xennet_proc_read( - char *page, char **start, off_t off, int count, int *eof, void *data) -{ - struct net_device *dev = (struct net_device *)((unsigned long)data & ~3UL); - struct net_private *np = netdev_priv(dev); - int len = 0, which_target = (long)data & 3; + char *page, char **start, off_t off, int count, int *eof, void *data) +{ + struct 
net_device *dev = + (struct net_device *)((unsigned long)data & ~3UL); + struct net_private *np = netdev_priv(dev); + int len = 0, which_target = (long)data & 3; - switch (which_target) - { - case TARGET_MIN: - len = sprintf(page, "%d\n", np->rx_min_target); - break; - case TARGET_MAX: - len = sprintf(page, "%d\n", np->rx_max_target); - break; - case TARGET_CUR: - len = sprintf(page, "%d\n", np->rx_target); - break; - } - - *eof = 1; - return len; + switch (which_target) + { + case TARGET_MIN: + len = sprintf(page, "%d\n", np->rx_min_target); + break; + case TARGET_MAX: + len = sprintf(page, "%d\n", np->rx_max_target); + break; + case TARGET_CUR: + len = sprintf(page, "%d\n", np->rx_target); + break; + } + + *eof = 1; + return len; } static int xennet_proc_write( - struct file *file, const char __user *buffer, - unsigned long count, void *data) -{ - struct net_device *dev = (struct net_device *)((unsigned long)data & ~3UL); - struct net_private *np = netdev_priv(dev); - int which_target = (long)data & 3; - char string[64]; - long target; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (count <= 1) - return -EBADMSG; /* runt */ - if (count > sizeof(string)) - return -EFBIG; /* too long */ - - if (copy_from_user(string, buffer, count)) - return -EFAULT; - string[sizeof(string)-1] = '\0'; - - target = simple_strtol(string, NULL, 10); - if (target < RX_MIN_TARGET) - target = RX_MIN_TARGET; - if (target > RX_MAX_TARGET) - target = RX_MAX_TARGET; - - spin_lock(&np->rx_lock); - - switch (which_target) - { - case TARGET_MIN: - if (target > np->rx_max_target) - np->rx_max_target = target; - np->rx_min_target = target; - if (target > np->rx_target) - np->rx_target = target; - break; - case TARGET_MAX: - if (target < np->rx_min_target) - np->rx_min_target = target; - np->rx_max_target = target; - if (target < np->rx_target) - np->rx_target = target; - break; - case TARGET_CUR: - break; - } - - network_alloc_rx_buffers(dev); - - spin_unlock(&np->rx_lock); - - return 
count; + struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + struct net_device *dev = + (struct net_device *)((unsigned long)data & ~3UL); + struct net_private *np = netdev_priv(dev); + int which_target = (long)data & 3; + char string[64]; + long target; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (count <= 1) + return -EBADMSG; /* runt */ + if (count > sizeof(string)) + return -EFBIG; /* too long */ + + if (copy_from_user(string, buffer, count)) + return -EFAULT; + string[sizeof(string)-1] = '\0'; + + target = simple_strtol(string, NULL, 10); + if (target < RX_MIN_TARGET) + target = RX_MIN_TARGET; + if (target > RX_MAX_TARGET) + target = RX_MAX_TARGET; + + spin_lock(&np->rx_lock); + + switch (which_target) + { + case TARGET_MIN: + if (target > np->rx_max_target) + np->rx_max_target = target; + np->rx_min_target = target; + if (target > np->rx_target) + np->rx_target = target; + break; + case TARGET_MAX: + if (target < np->rx_min_target) + np->rx_min_target = target; + np->rx_max_target = target; + if (target < np->rx_target) + np->rx_target = target; + break; + case TARGET_CUR: + break; + } + + network_alloc_rx_buffers(dev); + + spin_unlock(&np->rx_lock); + + return count; } static int xennet_proc_init(void) { - if (proc_mkdir("xen/net", NULL) == NULL) - return -ENOMEM; - return 0; + if (proc_mkdir("xen/net", NULL) == NULL) + return -ENOMEM; + return 0; } static int xennet_proc_addif(struct net_device *dev) { - struct proc_dir_entry *dir, *min, *max, *cur; - char name[30]; - - sprintf(name, "xen/net/%s", dev->name); - - dir = proc_mkdir(name, NULL); - if (!dir) - goto nomem; - - min = create_proc_entry("rxbuf_min", 0644, dir); - max = create_proc_entry("rxbuf_max", 0644, dir); - cur = create_proc_entry("rxbuf_cur", 0444, dir); - if (!min || !max || !cur) - goto nomem; - - min->read_proc = xennet_proc_read; - min->write_proc = xennet_proc_write; - min->data = (void *)((unsigned long)dev | TARGET_MIN); - - 
max->read_proc = xennet_proc_read; - max->write_proc = xennet_proc_write; - max->data = (void *)((unsigned long)dev | TARGET_MAX); - - cur->read_proc = xennet_proc_read; - cur->write_proc = xennet_proc_write; - cur->data = (void *)((unsigned long)dev | TARGET_CUR); - - return 0; + struct proc_dir_entry *dir, *min, *max, *cur; + char name[30]; + + sprintf(name, "xen/net/%s", dev->name); + + dir = proc_mkdir(name, NULL); + if (!dir) + goto nomem; + + min = create_proc_entry("rxbuf_min", 0644, dir); + max = create_proc_entry("rxbuf_max", 0644, dir); + cur = create_proc_entry("rxbuf_cur", 0444, dir); + if (!min || !max || !cur) + goto nomem; + + min->read_proc = xennet_proc_read; + min->write_proc = xennet_proc_write; + min->data = (void *)((unsigned long)dev | TARGET_MIN); + + max->read_proc = xennet_proc_read; + max->write_proc = xennet_proc_write; + max->data = (void *)((unsigned long)dev | TARGET_MAX); + + cur->read_proc = xennet_proc_read; + cur->write_proc = xennet_proc_write; + cur->data = (void *)((unsigned long)dev | TARGET_CUR); + + return 0; nomem: - xennet_proc_delif(dev); - return -ENOMEM; + xennet_proc_delif(dev); + return -ENOMEM; } static void xennet_proc_delif(struct net_device *dev) { - char name[30]; - - sprintf(name, "xen/net/%s/rxbuf_min", dev->name); - remove_proc_entry(name, NULL); - - sprintf(name, "xen/net/%s/rxbuf_max", dev->name); - remove_proc_entry(name, NULL); - - sprintf(name, "xen/net/%s/rxbuf_cur", dev->name); - remove_proc_entry(name, NULL); - - sprintf(name, "xen/net/%s", dev->name); - remove_proc_entry(name, NULL); + char name[30]; + + sprintf(name, "xen/net/%s/rxbuf_min", dev->name); + remove_proc_entry(name, NULL); + + sprintf(name, "xen/net/%s/rxbuf_max", dev->name); + remove_proc_entry(name, NULL); + + sprintf(name, "xen/net/%s/rxbuf_cur", dev->name); + remove_proc_entry(name, NULL); + + sprintf(name, "xen/net/%s", dev->name); + remove_proc_entry(name, NULL); } #endif module_init(netif_init); module_exit(netif_exit); + +/* + * 
Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c --- a/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c Thu Sep 22 17:42:01 2005 @@ -41,232 +41,253 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long data) { - int ret = -ENOSYS; - - switch ( cmd ) - { - case IOCTL_PRIVCMD_HYPERCALL: - { - privcmd_hypercall_t hypercall; + int ret = -ENOSYS; + + switch (cmd) { + case IOCTL_PRIVCMD_HYPERCALL: { + privcmd_hypercall_t hypercall; - if ( copy_from_user(&hypercall, (void *)data, sizeof(hypercall)) ) - return -EFAULT; + if (copy_from_user(&hypercall, (void *)data, + sizeof(hypercall))) + return -EFAULT; #if defined(__i386__) - __asm__ __volatile__ ( - "pushl %%ebx; pushl %%ecx; pushl %%edx; pushl %%esi; pushl %%edi; " - "movl 4(%%eax),%%ebx ;" - "movl 8(%%eax),%%ecx ;" - "movl 12(%%eax),%%edx ;" - "movl 16(%%eax),%%esi ;" - "movl 20(%%eax),%%edi ;" - "movl (%%eax),%%eax ;" - TRAP_INSTR "; " - "popl %%edi; popl %%esi; popl %%edx; popl %%ecx; popl %%ebx" - : "=a" (ret) : "0" (&hypercall) : "memory" ); + __asm__ __volatile__ ( + "pushl %%ebx; pushl %%ecx; pushl %%edx; " + "pushl %%esi; pushl %%edi; " + "movl 4(%%eax),%%ebx ;" + "movl 8(%%eax),%%ecx ;" + "movl 12(%%eax),%%edx ;" + "movl 16(%%eax),%%esi ;" + "movl 20(%%eax),%%edi ;" + "movl (%%eax),%%eax ;" + TRAP_INSTR "; " + "popl %%edi; popl %%esi; popl %%edx; " + "popl %%ecx; popl %%ebx" + : "=a" (ret) : "0" (&hypercall) : "memory" ); #elif defined (__x86_64__) - { - long ign1, ign2, ign3; - __asm__ __volatile__ ( - "movq %8,%%r10; movq %9,%%r8;" TRAP_INSTR - : "=a" (ret), "=D" (ign1), "=S" (ign2), "=d" (ign3) - : "0" ((unsigned long)hypercall.op), - "1" ((unsigned long)hypercall.arg[0]), - "2" ((unsigned 
long)hypercall.arg[1]), - "3" ((unsigned long)hypercall.arg[2]), - "g" ((unsigned long)hypercall.arg[3]), - "g" ((unsigned long)hypercall.arg[4]) - : "r11","rcx","r8","r10","memory"); - } + { + long ign1, ign2, ign3; + __asm__ __volatile__ ( + "movq %8,%%r10; movq %9,%%r8;" TRAP_INSTR + : "=a" (ret), "=D" (ign1), + "=S" (ign2), "=d" (ign3) + : "0" ((unsigned long)hypercall.op), + "1" ((unsigned long)hypercall.arg[0]), + "2" ((unsigned long)hypercall.arg[1]), + "3" ((unsigned long)hypercall.arg[2]), + "g" ((unsigned long)hypercall.arg[3]), + "g" ((unsigned long)hypercall.arg[4]) + : "r11","rcx","r8","r10","memory"); + } #elif defined (__ia64__) - __asm__ __volatile__ ( - ";; mov r14=%2; mov r15=%3; mov r16=%4; mov r17=%5; mov r18=%6; mov -r2=%1; break 0x1000;; mov %0=r8 ;;" - : "=r" (ret) - : "r" (hypercall.op), - "r" (hypercall.arg[0]), - "r" (hypercall.arg[1]), - "r" (hypercall.arg[2]), - "r" (hypercall.arg[3]), - "r" (hypercall.arg[4]) - : "r14","r15","r16","r17","r18","r2","r8","memory"); + __asm__ __volatile__ ( + ";; mov r14=%2; mov r15=%3; " + "mov r16=%4; mov r17=%5; mov r18=%6;" + "mov r2=%1; break 0x1000;; mov %0=r8 ;;" + : "=r" (ret) + : "r" (hypercall.op), + "r" (hypercall.arg[0]), + "r" (hypercall.arg[1]), + "r" (hypercall.arg[2]), + "r" (hypercall.arg[3]), + "r" (hypercall.arg[4]) + : "r14","r15","r16","r17","r18","r2","r8","memory"); #endif - } - break; + } + break; #if defined(CONFIG_XEN_PRIVILEGED_GUEST) - case IOCTL_PRIVCMD_MMAP: - { + case IOCTL_PRIVCMD_MMAP: { #define PRIVCMD_MMAP_SZ 32 - privcmd_mmap_t mmapcmd; - privcmd_mmap_entry_t msg[PRIVCMD_MMAP_SZ], *p; - int i, rc; - - if ( copy_from_user(&mmapcmd, (void *)data, sizeof(mmapcmd)) ) - return -EFAULT; - - p = mmapcmd.entry; - - for (i=0; i<mmapcmd.num; i+=PRIVCMD_MMAP_SZ, p+=PRIVCMD_MMAP_SZ) - { - int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)? 
- PRIVCMD_MMAP_SZ:(mmapcmd.num-i); - - - if ( copy_from_user(&msg, p, n*sizeof(privcmd_mmap_entry_t)) ) - return -EFAULT; + privcmd_mmap_t mmapcmd; + privcmd_mmap_entry_t msg[PRIVCMD_MMAP_SZ], *p; + int i, rc; + + if (copy_from_user(&mmapcmd, (void *)data, sizeof(mmapcmd))) + return -EFAULT; + + p = mmapcmd.entry; + + for (i = 0; i < mmapcmd.num; + i += PRIVCMD_MMAP_SZ, p += PRIVCMD_MMAP_SZ) { + int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)? + PRIVCMD_MMAP_SZ:(mmapcmd.num-i); + + if (copy_from_user(&msg, p, + n*sizeof(privcmd_mmap_entry_t))) + return -EFAULT; - for ( j = 0; j < n; j++ ) - { - struct vm_area_struct *vma = - find_vma( current->mm, msg[j].va ); - - if ( !vma ) - return -EINVAL; - - if ( msg[j].va > PAGE_OFFSET ) - return -EINVAL; - - if ( (msg[j].va + (msg[j].npages<<PAGE_SHIFT)) > vma->vm_end ) - return -EINVAL; - - if ( (rc = direct_remap_pfn_range(vma->vm_mm, - msg[j].va&PAGE_MASK, - msg[j].mfn, - msg[j].npages<<PAGE_SHIFT, - vma->vm_page_prot, - mmapcmd.dom)) < 0 ) - return rc; - } - } - ret = 0; - } - break; - - case IOCTL_PRIVCMD_MMAPBATCH: - { - mmu_update_t u; - privcmd_mmapbatch_t m; - struct vm_area_struct *vma = NULL; - unsigned long *p, addr; - unsigned long mfn, ptep; - int i; - - if ( copy_from_user(&m, (void *)data, sizeof(m)) ) - { ret = -EFAULT; goto batch_err; } - - vma = find_vma( current->mm, m.addr ); - - if ( !vma ) - { ret = -EINVAL; goto batch_err; } - - if ( m.addr > PAGE_OFFSET ) - { ret = -EFAULT; goto batch_err; } - - if ( (m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end ) - { ret = -EFAULT; goto batch_err; } - - p = m.arr; - addr = m.addr; - for ( i = 0; i < m.num; i++, addr += PAGE_SIZE, p++ ) - { - if ( get_user(mfn, p) ) - return -EFAULT; - - ret = create_lookup_pte_addr(vma->vm_mm, addr, &ptep); - if (ret) - goto batch_err; - - u.val = pte_val_ma(pfn_pte_ma(mfn, vma->vm_page_prot)); - u.ptr = ptep; - - if ( unlikely(HYPERVISOR_mmu_update(&u, 1, NULL, m.dom) < 0) ) - put_user(0xF0000000 | mfn, p); - } - - ret = 0; - break; - 
- batch_err: - printk("batch_err ret=%d vma=%p addr=%lx num=%d arr=%p %lx-%lx\n", - ret, vma, m.addr, m.num, m.arr, - vma ? vma->vm_start : 0, vma ? vma->vm_end : 0); - break; - } - break; + for (j = 0; j < n; j++) { + struct vm_area_struct *vma = + find_vma( current->mm, msg[j].va ); + + if (!vma) + return -EINVAL; + + if (msg[j].va > PAGE_OFFSET) + return -EINVAL; + + if ((msg[j].va + (msg[j].npages << PAGE_SHIFT)) + > vma->vm_end ) + return -EINVAL; + + if ((rc = direct_remap_pfn_range( + vma, + msg[j].va&PAGE_MASK, + msg[j].mfn, + msg[j].npages<<PAGE_SHIFT, + vma->vm_page_prot, + mmapcmd.dom)) < 0) + return rc; + } + } + ret = 0; + } + break; + + case IOCTL_PRIVCMD_MMAPBATCH: { + mmu_update_t u; + privcmd_mmapbatch_t m; + struct vm_area_struct *vma = NULL; + unsigned long *p, addr; + unsigned long mfn, ptep; + int i; + + if (copy_from_user(&m, (void *)data, sizeof(m))) { + ret = -EFAULT; + goto batch_err; + } + + vma = find_vma( current->mm, m.addr ); + if (!vma) { + ret = -EINVAL; + goto batch_err; + } + + if (m.addr > PAGE_OFFSET) { + ret = -EFAULT; + goto batch_err; + } + + if ((m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end) { + ret = -EFAULT; + goto batch_err; + } + + p = m.arr; + addr = m.addr; + for (i = 0; i < m.num; i++, addr += PAGE_SIZE, p++) { + if (get_user(mfn, p)) + return -EFAULT; + + ret = create_lookup_pte_addr(vma->vm_mm, addr, &ptep); + if (ret) + goto batch_err; + + u.val = pte_val_ma(pfn_pte_ma(mfn, vma->vm_page_prot)); + u.ptr = ptep; + + if (HYPERVISOR_mmu_update(&u, 1, NULL, m.dom) < 0) + put_user(0xF0000000 | mfn, p); + } + + ret = 0; + break; + + batch_err: + printk("batch_err ret=%d vma=%p addr=%lx " + "num=%d arr=%p %lx-%lx\n", + ret, vma, m.addr, m.num, m.arr, + vma ? vma->vm_start : 0, vma ? 
vma->vm_end : 0); + break; + } + break; #endif - case IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN: - { - unsigned long m2pv = (unsigned long)machine_to_phys_mapping; - pgd_t *pgd = pgd_offset_k(m2pv); - pud_t *pud = pud_offset(pgd, m2pv); - pmd_t *pmd = pmd_offset(pud, m2pv); - unsigned long m2p_start_mfn = (*(unsigned long *)pmd) >> PAGE_SHIFT; - ret = put_user(m2p_start_mfn, (unsigned long *)data) ? -EFAULT: 0; - } - break; - - case IOCTL_PRIVCMD_INITDOMAIN_STORE: - { - extern int do_xenbus_probe(void*); - unsigned long page; - - if (xen_start_info->store_evtchn != 0) { - ret = xen_start_info->store_mfn; - break; - } - - /* Allocate page. */ - page = get_zeroed_page(GFP_KERNEL); - if (!page) { - ret = -ENOMEM; - break; - } - - /* We don't refcnt properly, so set reserved on page. - * (this allocation is permanent) */ - SetPageReserved(virt_to_page(page)); - - /* Initial connect. Setup channel and page. */ - xen_start_info->store_evtchn = data; - xen_start_info->store_mfn = pfn_to_mfn(virt_to_phys((void *)page) >> - PAGE_SHIFT); - ret = xen_start_info->store_mfn; - - /* We'll return then this will wait for daemon to answer */ - kthread_run(do_xenbus_probe, NULL, "xenbus_probe"); - } - break; - - default: - ret = -EINVAL; - break; - } - return ret; + case IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN: { + unsigned long m2pv = (unsigned long)machine_to_phys_mapping; + pgd_t *pgd = pgd_offset_k(m2pv); + pud_t *pud = pud_offset(pgd, m2pv); + pmd_t *pmd = pmd_offset(pud, m2pv); + unsigned long m2p_start_mfn = + (*(unsigned long *)pmd) >> PAGE_SHIFT; + ret = put_user(m2p_start_mfn, (unsigned long *)data) ? + -EFAULT: 0; + } + break; + + case IOCTL_PRIVCMD_INITDOMAIN_STORE: { + extern int do_xenbus_probe(void*); + unsigned long page; + + if (xen_start_info->store_evtchn != 0) { + ret = xen_start_info->store_mfn; + break; + } + + /* Allocate page. 
*/ + page = get_zeroed_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + break; + } + + /* We don't refcnt properly, so set reserved on page. + * (this allocation is permanent) */ + SetPageReserved(virt_to_page(page)); + + /* Initial connect. Setup channel and page. */ + xen_start_info->store_evtchn = data; + xen_start_info->store_mfn = + pfn_to_mfn(virt_to_phys((void *)page) >> + PAGE_SHIFT); + ret = xen_start_info->store_mfn; + + /* We'll return then this will wait for daemon to answer */ + kthread_run(do_xenbus_probe, NULL, "xenbus_probe"); + } + break; + + default: + ret = -EINVAL; + break; + } + + return ret; } static int privcmd_mmap(struct file * file, struct vm_area_struct * vma) { - /* DONTCOPY is essential for Xen as copy_page_range is broken. */ - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; - - return 0; + /* DONTCOPY is essential for Xen as copy_page_range is broken. */ + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; + + return 0; } static struct file_operations privcmd_file_ops = { - .ioctl = privcmd_ioctl, - .mmap = privcmd_mmap, + .ioctl = privcmd_ioctl, + .mmap = privcmd_mmap, }; static int __init privcmd_init(void) { - privcmd_intf = create_xen_proc_entry("privcmd", 0400); - if ( privcmd_intf != NULL ) - privcmd_intf->proc_fops = &privcmd_file_ops; - - return 0; + privcmd_intf = create_xen_proc_entry("privcmd", 0400); + if (privcmd_intf != NULL) + privcmd_intf->proc_fops = &privcmd_file_ops; + + return 0; } __initcall(privcmd_init); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/tpmback/common.h --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h Thu Sep 22 17:42:01 2005 @@ -11,10 +11,10 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <asm-xen/evtchn.h> 
+#include <asm-xen/driver_util.h> #include <asm-xen/xen-public/io/tpmif.h> #include <asm/io.h> #include <asm/pgalloc.h> -#include <asm-xen/xen-public/io/domain_controller.h> #if 0 #define ASSERT(_p) \ @@ -34,12 +34,12 @@ unsigned int handle; /* Physical parameters of the comms window. */ - unsigned long tx_shmem_frame; unsigned int evtchn; unsigned int remote_evtchn; /* The shared rings and indexes. */ tpmif_tx_interface_t *tx; + struct vm_struct *tx_area; /* Miscellaneous private stuff. */ enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; @@ -55,9 +55,7 @@ struct work_struct work; u16 shmem_handle; - unsigned long shmem_vaddr; grant_ref_t shmem_ref; - } tpmif_t; void tpmif_disconnect_complete(tpmif_t * tpmif); @@ -86,3 +84,13 @@ #define MMAP_VADDR(t,_req) ((t)->mmap_vstart + ((_req) * PAGE_SIZE)) #endif /* __TPMIF__BACKEND__COMMON_H__ */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c Thu Sep 22 17:42:01 2005 @@ -1,4 +1,4 @@ -/****************************************************************************** + /***************************************************************************** * drivers/xen/tpmback/interface.c * * Vritual TPM interface management. 
@@ -14,187 +14,192 @@ #include "common.h" #include <asm-xen/balloon.h> -#define VMALLOC_VMADDR(x) ((unsigned long)(x)) - #define TPMIF_HASHSZ (2 << 5) #define TPMIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(TPMIF_HASHSZ-1)) static kmem_cache_t *tpmif_cachep; int num_frontends = 0; + LIST_HEAD(tpmif_list); - -tpmif_t *alloc_tpmif(domid_t domid, long int instance) -{ - struct page *page; - tpmif_t *tpmif; - - tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL); - if (!tpmif) - return ERR_PTR(-ENOMEM); - - memset(tpmif, 0, sizeof(*tpmif)); - tpmif->domid = domid; - tpmif->status = DISCONNECTED; - tpmif->tpm_instance = instance; - atomic_set(&tpmif->refcnt, 1); - - page = balloon_alloc_empty_page_range(TPMIF_TX_RING_SIZE); - BUG_ON(page == NULL); - tpmif->mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); - - list_add(&tpmif->tpmif_list, &tpmif_list); - num_frontends++; - - return tpmif; -} - - -void free_tpmif(tpmif_t *tpmif) -{ - num_frontends--; - list_del(&tpmif->tpmif_list); - kmem_cache_free(tpmif_cachep, tpmif); -} - - -tpmif_t *tpmif_find(domid_t domid, long int instance) -{ - tpmif_t *tpmif; - - list_for_each_entry(tpmif, &tpmif_list, tpmif_list) { - if (tpmif->tpm_instance == instance) { - if (tpmif->domid == domid) { - tpmif_get(tpmif); - return tpmif; - } else { - return NULL; - } - } - } - - return alloc_tpmif(domid, instance); -} - - -static int map_frontend_page(tpmif_t *tpmif, unsigned long localaddr, - unsigned long shared_page) -{ - struct gnttab_map_grant_ref op = { - .host_addr = localaddr, - .flags = GNTMAP_host_map, - .ref = shared_page, - .dom = tpmif->domid, - }; - - BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); - - if (op.handle < 0) { - DPRINTK(" Grant table operation failure !\n"); - return op.handle; - } - - tpmif->shmem_ref = shared_page; - tpmif->shmem_handle = op.handle; - tpmif->shmem_vaddr = localaddr; - return 0; -} - - -static void unmap_frontend_page(tpmif_t *tpmif) -{ - struct gnttab_unmap_grant_ref op; - 
- op.host_addr = tpmif->shmem_vaddr; - op.handle = tpmif->shmem_handle; - op.dev_bus_addr = 0; - - BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); -} - - -int tpmif_map(tpmif_t *tpmif, - unsigned long shared_page, unsigned int evtchn) -{ - struct vm_struct *vma; - evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; - int err; - - BUG_ON(tpmif->remote_evtchn); - - if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) - return -ENOMEM; - - err = map_frontend_page(tpmif, - VMALLOC_VMADDR(vma->addr), - shared_page); - if (err) { - vfree(vma->addr); - return err; - } - - op.u.bind_interdomain.dom1 = DOMID_SELF; - op.u.bind_interdomain.dom2 = tpmif->domid; - op.u.bind_interdomain.port1 = 0; - op.u.bind_interdomain.port2 = evtchn; - err = HYPERVISOR_event_channel_op(&op); - if (err) { - unmap_frontend_page(tpmif); - vfree(vma->addr); - return err; - } - - tpmif->evtchn = op.u.bind_interdomain.port1; - tpmif->remote_evtchn = evtchn; - - tpmif->tx = (tpmif_tx_interface_t *) vma->addr; - - bind_evtchn_to_irqhandler(tpmif->evtchn, - tpmif_be_int, - 0, - "tpmif-backend", - tpmif); - tpmif->status = CONNECTED; - tpmif->shmem_ref = shared_page; - tpmif->active = 1; - - return 0; -} - - -static void __tpmif_disconnect_complete(void *arg) -{ - evtchn_op_t op = { .cmd = EVTCHNOP_close }; - tpmif_t *tpmif = (tpmif_t *) arg; - - op.u.close.port = tpmif->evtchn; - op.u.close.dom = DOMID_SELF; - HYPERVISOR_event_channel_op(&op); - op.u.close.port = tpmif->remote_evtchn; - op.u.close.dom = tpmif->domid; - HYPERVISOR_event_channel_op(&op); - - if (tpmif->evtchn) - unbind_evtchn_from_irqhandler(tpmif->evtchn, tpmif); - - if (tpmif->tx) { - unmap_frontend_page(tpmif); - vfree(tpmif->tx); - } - - free_tpmif(tpmif); -} - - -void tpmif_disconnect_complete(tpmif_t * tpmif) -{ - INIT_WORK(&tpmif->work, __tpmif_disconnect_complete, (void *)tpmif); - schedule_work(&tpmif->work); -} - - -void __init tpmif_interface_init(void) -{ - tpmif_cachep = 
kmem_cache_create("tpmif_cache", sizeof(tpmif_t), - 0, 0, NULL, NULL); -} +tpmif_t * +alloc_tpmif(domid_t domid, long int instance) +{ + struct page *page; + tpmif_t *tpmif; + + tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL); + if (!tpmif) + return ERR_PTR(-ENOMEM); + + memset(tpmif, 0, sizeof (*tpmif)); + tpmif->domid = domid; + tpmif->status = DISCONNECTED; + tpmif->tpm_instance = instance; + atomic_set(&tpmif->refcnt, 1); + + page = balloon_alloc_empty_page_range(TPMIF_TX_RING_SIZE); + BUG_ON(page == NULL); + tpmif->mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + + list_add(&tpmif->tpmif_list, &tpmif_list); + num_frontends++; + + return tpmif; +} + +void +free_tpmif(tpmif_t * tpmif) +{ + num_frontends--; + list_del(&tpmif->tpmif_list); + kmem_cache_free(tpmif_cachep, tpmif); +} + +tpmif_t * +tpmif_find(domid_t domid, long int instance) +{ + tpmif_t *tpmif; + + list_for_each_entry(tpmif, &tpmif_list, tpmif_list) { + if (tpmif->tpm_instance == instance) { + if (tpmif->domid == domid) { + tpmif_get(tpmif); + return tpmif; + } else { + return NULL; + } + } + } + + return alloc_tpmif(domid, instance); +} + +static int +map_frontend_page(tpmif_t *tpmif, unsigned long shared_page) +{ + struct gnttab_map_grant_ref op = { + .host_addr = (unsigned long)tpmif->tx_area->addr, + .flags = GNTMAP_host_map, + .ref = shared_page, + .dom = tpmif->domid, + }; + + lock_vm_area(tpmif->tx_area); + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)); + unlock_vm_area(tpmif->tx_area); + + if (op.handle < 0) { + DPRINTK(" Grant table operation failure !\n"); + return op.handle; + } + + tpmif->shmem_ref = shared_page; + tpmif->shmem_handle = op.handle; + + return 0; +} + +static void +unmap_frontend_page(tpmif_t *tpmif) +{ + struct gnttab_unmap_grant_ref op; + + op.host_addr = (unsigned long)tpmif->tx_area->addr; + op.handle = tpmif->shmem_handle; + op.dev_bus_addr = 0; + + lock_vm_area(tpmif->tx_area); + 
BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); + unlock_vm_area(tpmif->tx_area); +} + +int +tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn) +{ + evtchn_op_t op = {.cmd = EVTCHNOP_bind_interdomain }; + int err; + + BUG_ON(tpmif->remote_evtchn); + + if ((tpmif->tx_area = alloc_vm_area(PAGE_SIZE)) == NULL) + return -ENOMEM; + + err = map_frontend_page(tpmif, shared_page); + if (err) { + free_vm_area(tpmif->tx_area); + return err; + } + + op.u.bind_interdomain.dom1 = DOMID_SELF; + op.u.bind_interdomain.dom2 = tpmif->domid; + op.u.bind_interdomain.port1 = 0; + op.u.bind_interdomain.port2 = evtchn; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + unmap_frontend_page(tpmif); + free_vm_area(tpmif->tx_area); + return err; + } + + tpmif->evtchn = op.u.bind_interdomain.port1; + tpmif->remote_evtchn = evtchn; + + tpmif->tx = (tpmif_tx_interface_t *)tpmif->tx_area->addr; + + bind_evtchn_to_irqhandler(tpmif->evtchn, + tpmif_be_int, 0, "tpmif-backend", tpmif); + tpmif->status = CONNECTED; + tpmif->shmem_ref = shared_page; + tpmif->active = 1; + + return 0; +} + +static void +__tpmif_disconnect_complete(void *arg) +{ + evtchn_op_t op = {.cmd = EVTCHNOP_close }; + tpmif_t *tpmif = (tpmif_t *) arg; + + op.u.close.port = tpmif->evtchn; + op.u.close.dom = DOMID_SELF; + HYPERVISOR_event_channel_op(&op); + op.u.close.port = tpmif->remote_evtchn; + op.u.close.dom = tpmif->domid; + HYPERVISOR_event_channel_op(&op); + + if (tpmif->evtchn) + unbind_evtchn_from_irqhandler(tpmif->evtchn, tpmif); + + if (tpmif->tx) { + unmap_frontend_page(tpmif); + free_vm_area(tpmif->tx_area); + } + + free_tpmif(tpmif); +} + +void +tpmif_disconnect_complete(tpmif_t * tpmif) +{ + INIT_WORK(&tpmif->work, __tpmif_disconnect_complete, (void *)tpmif); + schedule_work(&tpmif->work); +} + +void __init +tpmif_interface_init(void) +{ + tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t), + 0, 0, NULL, NULL); +} + +/* + * Local variables: + * 
c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c Thu Sep 22 17:42:01 2005 @@ -566,7 +566,7 @@ * the more time we give the TPM to process the request. */ mod_timer(&pak->processing_timer, - jiffies + (num_frontends * 10 * HZ)); + jiffies + (num_frontends * 60 * HZ)); dataex.copied_so_far = 0; } } @@ -850,7 +850,7 @@ write_lock_irqsave(&dataex.pak_lock, flags); list_add_tail(&pak->next, &dataex.pending_pak); /* give the TPM some time to pick up the request */ - mod_timer(&pak->processing_timer, jiffies + (10 * HZ)); + mod_timer(&pak->processing_timer, jiffies + (30 * HZ)); write_unlock_irqrestore(&dataex.pak_lock, flags); @@ -1075,3 +1075,13 @@ } __initcall(tpmback_init); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c Thu Sep 22 17:42:01 2005 @@ -213,6 +213,7 @@ be->dev = dev; be->backend_watch.node = dev->nodename; + /* Implicitly calls backend_changed() once. 
*/ be->backend_watch.callback = backend_changed; be->instance = -1; err = register_xenbus_watch(&be->backend_watch); @@ -236,8 +237,6 @@ } dev->data = be; - - backend_changed(&be->backend_watch, dev->nodename); return err; free_be: @@ -269,3 +268,13 @@ { xenbus_register_backend(&tpmback); } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c --- a/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c Thu Sep 22 17:42:01 2005 @@ -46,7 +46,6 @@ #include <asm-xen/xen-public/io/tpmif.h> #include <asm/uaccess.h> #include <asm-xen/xenbus.h> -#include <asm-xen/xen-public/io/domain_controller.h> #include <asm-xen/xen-public/grant_table.h> #include "tpmfront.h" @@ -258,18 +257,24 @@ tpm_allocate_buffers(tp); - info->ring_ref = gnttab_claim_grant_reference(&gref_head); - ASSERT(info->ring_ref != -ENOSPC); - gnttab_grant_foreign_access_ref(info->ring_ref, - backend_id, - (virt_to_machine(tp->tx) >> PAGE_SHIFT), - 0); + err = gnttab_grant_foreign_access(backend_id, + (virt_to_machine(tp->tx) >> PAGE_SHIFT), + 0); + + if (err == -ENOSPC) { + free_page((unsigned long)sring); + tp->tx = NULL; + xenbus_dev_error(dev, err, "allocating grant reference"); + return err; + } + info->ring_ref = err; op.u.alloc_unbound.dom = backend_id; err = HYPERVISOR_event_channel_op(&op); if (err) { + gnttab_end_foreign_access(info->ring_ref, 0); free_page((unsigned long)sring); - tp->tx = 0; + tp->tx = NULL; xenbus_dev_error(dev, err, "allocating event channel"); return err; } @@ -283,6 +288,7 @@ tpmif_set_connected_state(tp,0); if ( tp->tx != NULL ) { + gnttab_end_foreign_access(info->ring_ref, 0); free_page((unsigned long)tp->tx); tp->tx = NULL; } @@ -412,7 +418,6 @@ return err; } - watch_for_status(&info->watch, 
info->watch.node); return 0; } @@ -736,3 +741,13 @@ } __initcall(tpmif_init); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.h --- a/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.h Thu Sep 22 17:42:01 2005 @@ -2,7 +2,8 @@ #define TPM_FRONT_H -struct tpm_private { +struct tpm_private +{ tpmif_tx_interface_t *tx; unsigned int evtchn; int connected; @@ -29,10 +30,21 @@ }; -struct tx_buffer { +struct tx_buffer +{ unsigned int size; // available space in data unsigned int len; // used space in data unsigned char *data; // pointer to a page }; #endif + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c Thu Sep 22 17:42:01 2005 @@ -231,3 +231,13 @@ unbind_evtchn_from_irqhandler(xen_start_info->store_evtchn, &xb_waitq); } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.h --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.h Thu Sep 22 17:42:01 2005 @@ -39,3 +39,13 @@ extern wait_queue_head_t xb_waitq; #endif /* _XENBUS_COMMS_H */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff 
-r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c Thu Sep 22 17:42:01 2005 @@ -186,3 +186,13 @@ } __initcall(xenbus_dev_init); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Thu Sep 22 17:42:01 2005 @@ -687,3 +687,13 @@ } postcore_initcall(xenbus_probe_init); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c Thu Sep 22 17:42:01 2005 @@ -253,31 +253,19 @@ EXPORT_SYMBOL(xenbus_read); /* Write the value of a single file. - * Returns -err on failure. createflags can be 0, O_CREAT, or O_CREAT|O_EXCL. + * Returns -err on failure. */ -int xenbus_write(const char *dir, const char *node, - const char *string, int createflags) -{ - const char *flags, *path; - struct kvec iovec[3]; +int xenbus_write(const char *dir, const char *node, const char *string) +{ + const char *path; + struct kvec iovec[2]; path = join(dir, node); - /* Format: Flags (as string), path, data. 
*/ - if (createflags == 0) - flags = XS_WRITE_NONE; - else if (createflags == O_CREAT) - flags = XS_WRITE_CREATE; - else if (createflags == (O_CREAT|O_EXCL)) - flags = XS_WRITE_CREATE_EXCL; - else - return -EINVAL; iovec[0].iov_base = (void *)path; iovec[0].iov_len = strlen(path) + 1; - iovec[1].iov_base = (void *)flags; - iovec[1].iov_len = strlen(flags) + 1; - iovec[2].iov_base = (void *)string; - iovec[2].iov_len = strlen(string); + iovec[1].iov_base = (void *)string; + iovec[1].iov_len = strlen(string); return xs_error(xs_talkv(XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL)); } @@ -357,7 +345,7 @@ va_end(ap); BUG_ON(ret > sizeof(printf_buffer)-1); - return xenbus_write(dir, node, printf_buffer, O_CREAT); + return xenbus_write(dir, node, printf_buffer); } EXPORT_SYMBOL(xenbus_printf); @@ -377,7 +365,7 @@ BUG_ON(len + ret > sizeof(printf_buffer)-1); dev->has_error = 1; - if (xenbus_write(dev->nodename, "error", printf_buffer, O_CREAT) != 0) + if (xenbus_write(dev->nodename, "error", printf_buffer) != 0) printk("xenbus: failed to write error node for %s (%s)\n", dev->nodename, printf_buffer); } @@ -578,3 +566,13 @@ return PTR_ERR(watcher); return 0; } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/include/asm-xen/asm-i386/page.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/page.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/page.h Thu Sep 22 17:42:01 2005 @@ -261,7 +261,6 @@ /* VIRT <-> MACHINE conversion */ #define virt_to_machine(v) (phys_to_machine(__pa(v))) -#define machine_to_virt(m) (__va(machine_to_phys(m))) #define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h --- 
a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h Thu Sep 22 17:42:01 2005 @@ -460,7 +460,7 @@ #define kern_addr_valid(addr) (1) #endif /* !CONFIG_DISCONTIGMEM */ -int direct_remap_pfn_range(struct mm_struct *mm, +int direct_remap_pfn_range(struct vm_area_struct *vma, unsigned long address, unsigned long mfn, unsigned long size, @@ -474,10 +474,10 @@ unsigned long size); #define io_remap_page_range(vma,from,phys,size,prot) \ -direct_remap_pfn_range(vma->vm_mm,from,phys>>PAGE_SHIFT,size,prot,DOMID_IO) +direct_remap_pfn_range(vma,from,(phys)>>PAGE_SHIFT,size,prot,DOMID_IO) #define io_remap_pfn_range(vma,from,pfn,size,prot) \ -direct_remap_pfn_range(vma->vm_mm,from,pfn,size,prot,DOMID_IO) +direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO) #define MK_IOSPACE_PFN(space, pfn) (pfn) #define GET_IOSPACE(pfn) 0 diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/include/asm-xen/asm-i386/system.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/system.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/system.h Thu Sep 22 17:42:01 2005 @@ -497,11 +497,22 @@ * includes these barriers, for example. */ +/* + * Don't use smp_processor_id() in preemptible code: debug builds will barf. + * It's okay in these cases as we only read the upcall mask in preemptible + * regions, which is always safe. 
+ */ +#ifdef CONFIG_SMP +#define __this_cpu() __smp_processor_id() +#else +#define __this_cpu() 0 +#endif + #define __cli() \ do { \ vcpu_info_t *_vcpu; \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_data[__this_cpu()]; \ _vcpu->evtchn_upcall_mask = 1; \ preempt_enable_no_resched(); \ barrier(); \ @@ -512,7 +523,7 @@ vcpu_info_t *_vcpu; \ barrier(); \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_data[__this_cpu()]; \ _vcpu->evtchn_upcall_mask = 0; \ barrier(); /* unmask then check (avoid races) */ \ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ @@ -523,7 +534,7 @@ #define __save_flags(x) \ do { \ vcpu_info_t *_vcpu; \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_data[__this_cpu()]; \ (x) = _vcpu->evtchn_upcall_mask; \ } while (0) @@ -532,7 +543,7 @@ vcpu_info_t *_vcpu; \ barrier(); \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_data[__this_cpu()]; \ if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ barrier(); /* unmask then check (avoid races) */ \ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ @@ -548,7 +559,7 @@ do { \ vcpu_info_t *_vcpu; \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_data[__this_cpu()]; \ (x) = _vcpu->evtchn_upcall_mask; \ _vcpu->evtchn_upcall_mask = 1; \ preempt_enable_no_resched(); \ @@ -561,14 +572,8 @@ #define local_irq_disable() __cli() #define local_irq_enable() __sti() -/* Don't use smp_processor_id: this is called in debug versions of that fn. 
*/ -#ifdef CONFIG_SMP -#define irqs_disabled() \ - HYPERVISOR_shared_info->vcpu_data[__smp_processor_id()].evtchn_upcall_mask -#else -#define irqs_disabled() \ - HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask -#endif +#define irqs_disabled() \ + HYPERVISOR_shared_info->vcpu_data[__this_cpu()].evtchn_upcall_mask /* * disable hlt during certain critical i/o operations diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h Thu Sep 22 17:42:01 2005 @@ -35,7 +35,7 @@ * of cr3/ldt (i.e., not in __switch_to). */ __asm__ __volatile__ ( - "movl %%es,%0 ; movl %%ds,%1 ; movl %%fs,%2 ; movl %%gs,%3" + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3" : "=m" (current->thread.es), "=m" (current->thread.ds), "=m" (current->thread.fsindex), diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/page.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/page.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/page.h Thu Sep 22 17:42:01 2005 @@ -239,7 +239,6 @@ /* VIRT <-> MACHINE conversion */ #define virt_to_machine(v) (phys_to_machine(__pa(v))) -#define machine_to_virt(m) (__va(machine_to_phys(m))) #define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h Thu Sep 22 17:42:01 2005 @@ -526,7 +526,7 @@ #define DOMID_LOCAL (0xFFFFU) -int direct_remap_pfn_range(struct mm_struct *mm, +int direct_remap_pfn_range(struct vm_area_struct *vma, unsigned long address, unsigned long mfn, unsigned long size, @@ 
-542,10 +542,10 @@ unsigned long size); #define io_remap_page_range(vma, vaddr, paddr, size, prot) \ - direct_remap_pfn_range((vma)->vm_mm,vaddr,paddr>>PAGE_SHIFT,size,prot,DOMID_IO) + direct_remap_pfn_range(vma,vaddr,(paddr)>>PAGE_SHIFT,size,prot,DOMID_IO) #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ - direct_remap_pfn_range((vma)->vm_mm,vaddr,pfn,size,prot,DOMID_IO) + direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO) #define MK_IOSPACE_PFN(space, pfn) (pfn) #define GET_IOSPACE(pfn) 0 diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/system.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/system.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/system.h Thu Sep 22 17:42:01 2005 @@ -321,11 +321,22 @@ * includes these barriers, for example. */ +/* + * Don't use smp_processor_id() in preemptible code: debug builds will barf. + * It's okay in these cases as we only read the upcall mask in preemptible + * regions, which is always safe. 
+ */ +#ifdef CONFIG_SMP +#define __this_cpu() __smp_processor_id() +#else +#define __this_cpu() 0 +#endif + #define __cli() \ do { \ vcpu_info_t *_vcpu; \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_data[__this_cpu()]; \ _vcpu->evtchn_upcall_mask = 1; \ preempt_enable_no_resched(); \ barrier(); \ @@ -336,7 +347,7 @@ vcpu_info_t *_vcpu; \ barrier(); \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_data[__this_cpu()]; \ _vcpu->evtchn_upcall_mask = 0; \ barrier(); /* unmask then check (avoid races) */ \ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ @@ -347,7 +358,7 @@ #define __save_flags(x) \ do { \ vcpu_info_t *_vcpu; \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_data[__this_cpu()]; \ (x) = _vcpu->evtchn_upcall_mask; \ } while (0) @@ -356,7 +367,7 @@ vcpu_info_t *_vcpu; \ barrier(); \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_data[__this_cpu()]; \ if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ barrier(); /* unmask then check (avoid races) */ \ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ @@ -372,7 +383,7 @@ do { \ vcpu_info_t *_vcpu; \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_data[__this_cpu()]; \ (x) = _vcpu->evtchn_upcall_mask; \ _vcpu->evtchn_upcall_mask = 1; \ preempt_enable_no_resched(); \ @@ -387,14 +398,8 @@ #define local_irq_disable() __cli() #define local_irq_enable() __sti() -/* Don't use smp_processor_id: this is called in debug versions of that fn. 
*/ -#ifdef CONFIG_SMP -#define irqs_disabled() \ - HYPERVISOR_shared_info->vcpu_data[__smp_processor_id()].evtchn_upcall_mask -#else -#define irqs_disabled() \ - HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask -#endif +#define irqs_disabled() \ + HYPERVISOR_shared_info->vcpu_data[__this_cpu()].evtchn_upcall_mask /* * disable hlt during certain critical i/o operations diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/include/asm-xen/gnttab.h --- a/linux-2.6-xen-sparse/include/asm-xen/gnttab.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/gnttab.h Thu Sep 22 17:42:01 2005 @@ -37,7 +37,7 @@ void gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); void gnttab_end_foreign_access(grant_ref_t ref, int readonly); -int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); +int gnttab_grant_foreign_transfer(domid_t domid); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); @@ -64,8 +64,7 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int readonly); -void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, - unsigned long pfn); +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid); #ifdef __ia64__ #define gnttab_map_vaddr(map) __va(map.dev_bus_addr) diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/include/asm-xen/xenbus.h --- a/linux-2.6-xen-sparse/include/asm-xen/xenbus.h Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/xenbus.h Thu Sep 22 17:42:01 2005 @@ -83,8 +83,7 @@ char **xenbus_directory(const char *dir, const char *node, unsigned int *num); void *xenbus_read(const char *dir, const char *node, unsigned int *len); -int xenbus_write(const char *dir, const char *node, - const char *string, int createflags); +int xenbus_write(const char *dir, const char *node, const char *string); int xenbus_mkdir(const char *dir, const char *node); int 
xenbus_exists(const char *dir, const char *node); int xenbus_rm(const char *dir, const char *node); diff -r 97dbd9524a7e -r 06d84bf87159 tools/blktap/xenbus.c --- a/tools/blktap/xenbus.c Thu Sep 22 17:34:14 2005 +++ b/tools/blktap/xenbus.c Thu Sep 22 17:42:01 2005 @@ -92,7 +92,7 @@ if ((path == NULL) || (buf == NULL)) return 0; - ret = xs_write(h, path, buf, strlen(buf)+1, O_CREAT); + ret = xs_write(h, path, buf, strlen(buf)+1); free(buf); free(path); diff -r 97dbd9524a7e -r 06d84bf87159 tools/console/daemon/io.c --- a/tools/console/daemon/io.c Thu Sep 22 17:34:14 2005 +++ b/tools/console/daemon/io.c Thu Sep 22 17:42:01 2005 @@ -165,7 +165,7 @@ success = asprintf(&path, "%s/tty", dom->conspath) != -1; if (!success) goto out; - success = xs_write(xs, path, slave, strlen(slave), O_CREAT); + success = xs_write(xs, path, slave, strlen(slave)); free(path); if (!success) goto out; diff -r 97dbd9524a7e -r 06d84bf87159 tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/configure --- a/tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/configure Thu Sep 22 17:34:14 2005 +++ b/tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/configure Thu Sep 22 17:42:01 2005 @@ -3475,7 +3475,7 @@ GDBSERVER_DEPFILES="$srv_regobj $srv_tgtobj $srv_thread_depfiles" -GDBSERVER_LIBS="$srv_libs -L../../../../../libxc/ -lxc" +GDBSERVER_LIBS="$srv_libs -L../../../../../libxc/ -lxenctrl" diff -r 97dbd9524a7e -r 06d84bf87159 tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/configure.in --- a/tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/configure.in Thu Sep 22 17:34:14 2005 +++ b/tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/configure.in Thu Sep 22 17:42:01 2005 @@ -107,7 +107,7 @@ GDBSERVER_DEPFILES="$srv_regobj $srv_tgtobj $srv_thread_depfiles" -GDBSERVER_LIBS="$srv_libs -L../../../../../libxc/ -lxc" +GDBSERVER_LIBS="$srv_libs -L../../../../../libxc/ -lxenctrl" AC_SUBST(GDBSERVER_DEPFILES) AC_SUBST(GDBSERVER_LIBS) diff -r 97dbd9524a7e -r 06d84bf87159 
tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/linux-xen-low.c --- a/tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/linux-xen-low.c Thu Sep 22 17:34:14 2005 +++ b/tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/linux-xen-low.c Thu Sep 22 17:42:01 2005 @@ -37,9 +37,10 @@ #include <errno.h> #include <xenctrl.h> #define TRACE_ENTER /* printf("enter %s\n", __FUNCTION__) */ -long (*myptrace)(enum __ptrace_request, pid_t, long, long); -int (*myxcwait)(int domain, int *status, int options) ; - + +long (*myptrace)(int xc_handle, enum __ptrace_request, u32, long, long); +int (*myxcwait)(int xc_handle, int domain, int *status, int options) ; +static int xc_handle; #define DOMFLAGS_DYING (1<<0) /* Domain is scheduled to die. */ #define DOMFLAGS_SHUTDOWN (1<<2) /* The guest OS has shut down. */ @@ -47,11 +48,7 @@ #define DOMFLAGS_BLOCKED (1<<4) /* Currently blocked pending an event. */ #define DOMFLAGS_RUNNING (1<<5) /* Domain is currently running. */ - - struct inferior_list all_processes; - - static int current_domain; static int expect_signal = 0; static int signal_to_send = 0; @@ -150,7 +147,7 @@ { struct process_info *new_process; current_domain = domain; - if (myptrace (PTRACE_ATTACH, domain, 0, 0) != 0) { + if (myptrace (xc_handle, PTRACE_ATTACH, domain, 0, 0) != 0) { fprintf (stderr, "Cannot attach to domain %d: %s (%d)\n", domain, strerror (errno), errno); fflush (stderr); @@ -173,8 +170,7 @@ { struct thread_info *thread = (struct thread_info *) entry; struct process_info *process = get_thread_process (thread); - myptrace (PTRACE_KILL, pid_of (process), 0, 0); - + myptrace (xc_handle, PTRACE_KILL, pid_of (process), 0, 0); } static void @@ -190,7 +186,7 @@ struct thread_info *thread = (struct thread_info *) entry; struct process_info *process = get_thread_process (thread); - myptrace (PTRACE_DETACH, pid_of (process), 0, 0); + myptrace (xc_handle, PTRACE_DETACH, pid_of (process), 0, 0); } @@ -216,7 +212,7 @@ linux_wait (char *status) { int w; - if 
(myxcwait(current_domain, &w, 0)) + if (myxcwait(xc_handle, current_domain, &w, 0)) return -1; if (w & (DOMFLAGS_SHUTDOWN|DOMFLAGS_DYING)) { @@ -241,7 +237,7 @@ expect_signal = resume_info->sig; for_each_inferior(&all_threads, regcache_invalidate_one); - myptrace (step ? PTRACE_SINGLESTEP : PTRACE_CONT, current_domain, 0, 0); + myptrace (xc_handle, step ? PTRACE_SINGLESTEP : PTRACE_CONT, current_domain, 0, 0); } @@ -265,7 +261,7 @@ } buf = malloc (regset->size); - res = myptrace (regset->get_request, inferior_pid, 0, (PTRACE_XFER_TYPE)buf); + res = myptrace (xc_handle, regset->get_request, inferior_pid, 0, (PTRACE_XFER_TYPE)buf); if (res < 0) { if (errno == EIO) @@ -317,7 +313,7 @@ buf = malloc (regset->size); regset->fill_function (buf); - res = myptrace (regset->set_request, inferior_pid, 0, (PTRACE_XFER_TYPE)buf); + res = myptrace (xc_handle, regset->set_request, inferior_pid, 0, (PTRACE_XFER_TYPE)buf); if (res < 0) { if (errno == EIO) @@ -395,7 +391,7 @@ for (i = 0; i < count; i++, addr += sizeof (PTRACE_XFER_TYPE)) { errno = 0; - buffer[i] = myptrace (PTRACE_PEEKTEXT, inferior_pid, (PTRACE_ARG3_TYPE) addr, 0); + buffer[i] = myptrace (xc_handle, PTRACE_PEEKTEXT, inferior_pid, (PTRACE_ARG3_TYPE) addr, 0); if (errno) return errno; } @@ -428,13 +424,13 @@ /* Fill start and end extra bytes of buffer with existing memory data. 
*/ - buffer[0] = myptrace (PTRACE_PEEKTEXT, inferior_pid, + buffer[0] = myptrace (xc_handle, PTRACE_PEEKTEXT, inferior_pid, (PTRACE_ARG3_TYPE) addr, 0); if (count > 1) { buffer[count - 1] - = myptrace (PTRACE_PEEKTEXT, inferior_pid, + = myptrace (xc_handle, PTRACE_PEEKTEXT, inferior_pid, (PTRACE_ARG3_TYPE) (addr + (count - 1) * sizeof (PTRACE_XFER_TYPE)), 0); @@ -448,7 +444,7 @@ for (i = 0; i < count; i++, addr += sizeof (PTRACE_XFER_TYPE)) { errno = 0; - myptrace (PTRACE_POKETEXT, inferior_pid, (PTRACE_ARG3_TYPE) addr, buffer[i]); + myptrace (xc_handle, PTRACE_POKETEXT, inferior_pid, (PTRACE_ARG3_TYPE) addr, buffer[i]); if (errno) return errno; } @@ -539,7 +535,7 @@ void initialize_low (void) { - + xc_handle = xc_interface_open(); set_target_ops (&linux_xen_target_ops); set_breakpoint_data (the_low_target.breakpoint, the_low_target.breakpoint_len); diff -r 97dbd9524a7e -r 06d84bf87159 tools/debugger/gdb/gdbbuild --- a/tools/debugger/gdb/gdbbuild Thu Sep 22 17:34:14 2005 +++ b/tools/debugger/gdb/gdbbuild Thu Sep 22 17:42:01 2005 @@ -1,20 +1,17 @@ #!/bin/sh -XENROOT=`hg root` -export XENROOT - -cd $XENROOT/tools/debugger/gdb -rm -rf gdb-6.2.1 gdb-6.2.1-linux-i386-xen -# FIXME:cw this should be smarter -wget -c ftp://ftp.gnu.org/gnu/gdb/gdb-6.2.1.tar.bz2 +rm -rf gdb-6.2.1 gdb-6.2.1-linux-i386-xen +[ -a gdb-6.2.1.tar.bz2 ] || wget -c ftp://ftp.gnu.org/gnu/gdb/gdb-6.2.1.tar.bz2 tar xjf gdb-6.2.1.tar.bz2 -cd $XENROOT/tools/debugger/gdb/gdb-6.2.1-xen-sparse +cd gdb-6.2.1-xen-sparse ./mkbuildtree ../gdb-6.2.1 -mkdir $XENROOT/tools/debugger/gdb/gdb-6.2.1-linux-i386-xen -cd $XENROOT/tools/debugger/gdb/gdb-6.2.1-linux-i386-xen +cd .. 
+mkdir gdb-6.2.1-linux-i386-xen +cd gdb-6.2.1-linux-i386-xen ../gdb-6.2.1/configure + # some people don't have gmake if which gmake ; then gmake -j4 diff -r 97dbd9524a7e -r 06d84bf87159 tools/examples/network-bridge --- a/tools/examples/network-bridge Thu Sep 22 17:34:14 2005 +++ b/tools/examples/network-bridge Thu Sep 22 17:42:01 2005 @@ -1,4 +1,4 @@ -#!/bin/sh -x +#!/bin/sh #============================================================================ # Default Xen network start/stop script. # Xend calls a network script when it starts. diff -r 97dbd9524a7e -r 06d84bf87159 tools/examples/xend-config.sxp --- a/tools/examples/xend-config.sxp Thu Sep 22 17:34:14 2005 +++ b/tools/examples/xend-config.sxp Thu Sep 22 17:42:01 2005 @@ -49,6 +49,6 @@ # If dom0-min-mem=0, dom0 will never balloon out. (dom0-min-mem 0) -# In SMP system, dom0 will use only CPUs in range [1,dom0-cpus] +# In SMP system, dom0 will use dom0-cpus # of CPUS # If dom0-cpus = 0, dom0 will take all cpus available (dom0-cpus 0) diff -r 97dbd9524a7e -r 06d84bf87159 tools/examples/xmexample.vmx --- a/tools/examples/xmexample.vmx Thu Sep 22 17:34:14 2005 +++ b/tools/examples/xmexample.vmx Thu Sep 22 17:42:01 2005 @@ -25,6 +25,10 @@ # A name for your domain. All domains must have different names. name = "ExampleVMXDomain" + +#----------------------------------------------------------------------------- +# the number of cpus guest platform has, default=1 +vcpus=1 # Which CPU to start domain on? #cpu = -1 # leave to Xen to pick diff -r 97dbd9524a7e -r 06d84bf87159 tools/firmware/acpi/acpi_madt.c --- a/tools/firmware/acpi/acpi_madt.c Thu Sep 22 17:34:14 2005 +++ b/tools/firmware/acpi/acpi_madt.c Thu Sep 22 17:42:01 2005 @@ -37,44 +37,7 @@ ACPI_LOCAL_APIC_ADDRESS, ACPI_MULTIPLE_APIC_FLAGS, }, - // - // LOCAL APIC Entries for 4 processors. 
- // - { - { - ACPI_PROCESSOR_LOCAL_APIC, - sizeof (ACPI_LOCAL_APIC_STRUCTURE), - 0x00, - 0x00, - 0x00000001, - }, - - { - ACPI_PROCESSOR_LOCAL_APIC, - sizeof (ACPI_LOCAL_APIC_STRUCTURE), - 0x01, - 0x00, - 0x00000000 - }, - - { - ACPI_PROCESSOR_LOCAL_APIC, - sizeof (ACPI_LOCAL_APIC_STRUCTURE), - 0x02, - 0x00, - 0x00000000 - }, - - { - ACPI_PROCESSOR_LOCAL_APIC, - sizeof (ACPI_LOCAL_APIC_STRUCTURE), - 0x03, - 0x00, - 0x00000000 - } - } - , - + // // IO APIC // @@ -87,5 +50,19 @@ ACPI_IO_APIC_ADDRESS_1, 0x0000 } + }, + + // + // LOCAL APIC Entries for up to 32 processors. + // + { + { + ACPI_PROCESSOR_LOCAL_APIC, + sizeof (ACPI_LOCAL_APIC_STRUCTURE), + 0x00, + 0x00, + 0x00000001, + } + } }; diff -r 97dbd9524a7e -r 06d84bf87159 tools/firmware/acpi/acpi_madt.h --- a/tools/firmware/acpi/acpi_madt.h Thu Sep 22 17:34:14 2005 +++ b/tools/firmware/acpi/acpi_madt.h Thu Sep 22 17:42:01 2005 @@ -35,9 +35,9 @@ // #pragma pack (1) typedef struct { - ACPI_2_0_MADT Header; - ACPI_LOCAL_APIC_STRUCTURE LocalApic[4]; - ACPI_IO_APIC_STRUCTURE IoApic[1]; + ACPI_2_0_MADT Header; + ACPI_IO_APIC_STRUCTURE IoApic[1]; + ACPI_LOCAL_APIC_STRUCTURE LocalApic[32]; } ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE; #pragma pack () diff -r 97dbd9524a7e -r 06d84bf87159 tools/firmware/vmxassist/Makefile --- a/tools/firmware/vmxassist/Makefile Thu Sep 22 17:34:14 2005 +++ b/tools/firmware/vmxassist/Makefile Thu Sep 22 17:42:01 2005 @@ -41,9 +41,9 @@ all: vmxloader -vmxloader: roms.h vmxloader.c acpi.h - ${CC} ${CFLAGS} ${DEFINES} -c vmxloader.c - $(CC) -o vmxloader.tmp -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,0x100000 vmxloader.o +vmxloader: roms.h vmxloader.c acpi.h acpi_madt.c + ${CC} ${CFLAGS} ${DEFINES} -c vmxloader.c -c acpi_madt.c + $(CC) -o vmxloader.tmp -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,0x100000 vmxloader.o acpi_madt.o objcopy --change-addresses=0xC0000000 vmxloader.tmp vmxloader rm -f vmxloader.tmp diff -r 97dbd9524a7e -r 06d84bf87159 tools/firmware/vmxassist/vmxloader.c --- 
a/tools/firmware/vmxassist/vmxloader.c Thu Sep 22 17:34:14 2005 +++ b/tools/firmware/vmxassist/vmxloader.c Thu Sep 22 17:42:01 2005 @@ -27,6 +27,7 @@ #ifdef _ACPI_ #include "acpi.h" #include "../acpi/acpi2_0.h" // for ACPI_PHYSICAL_ADDRESS +int acpi_madt_update(unsigned char* acpi_start); #endif @@ -110,7 +111,10 @@ } #ifdef _ACPI_ puts("Loading ACPI ...\n"); - if (ACPI_PHYSICAL_ADDRESS+sizeof(acpi) <= 0xF0000 ){ + + acpi_madt_update(acpi); + + if (ACPI_PHYSICAL_ADDRESS+sizeof(acpi) <= 0xF0000) { /* make sure acpi table does not overlap rombios * currently acpi less than 8K will be OK. */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/ioemu/vl.c --- a/tools/ioemu/vl.c Thu Sep 22 17:34:14 2005 +++ b/tools/ioemu/vl.c Thu Sep 22 17:42:01 2005 @@ -126,6 +126,7 @@ int vm_running; int audio_enabled = 0; int nic_pcnet = 1; +int vcpus = 1; int sb16_enabled = 1; int adlib_enabled = 1; int gus_enabled = 1; @@ -2105,6 +2106,7 @@ "-snapshot write to temporary files instead of disk image files\n" "-m megs set virtual RAM size to megs MB [default=%d]\n" "-nographic disable graphical output and redirect serial I/Os to console\n" + "-vcpus set CPU number of guest platform\n" #ifdef CONFIG_VNC "-vnc port use vnc instead of sdl\n" "-vncport port use a different port\n" @@ -2235,6 +2237,7 @@ QEMU_OPTION_hdachs, QEMU_OPTION_L, QEMU_OPTION_no_code_copy, + QEMU_OPTION_vcpus, QEMU_OPTION_pci, QEMU_OPTION_nic_pcnet, QEMU_OPTION_isa, @@ -2307,6 +2310,7 @@ { "hdachs", HAS_ARG, QEMU_OPTION_hdachs }, { "L", HAS_ARG, QEMU_OPTION_L }, { "no-code-copy", 0, QEMU_OPTION_no_code_copy }, + { "vcpus", 1, QEMU_OPTION_vcpus }, #ifdef TARGET_PPC { "prep", 0, QEMU_OPTION_prep }, { "g", 1, QEMU_OPTION_g }, @@ -2646,6 +2650,9 @@ case QEMU_OPTION_S: start_emulation = 0; break; + case QEMU_OPTION_vcpus: + vcpus = atoi(optarg); + fprintf(logfile, "qemu: the number of cpus is %d\n", vcpus); case QEMU_OPTION_pci: pci_enabled = 1; break; diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/Makefile --- 
a/tools/libxc/Makefile Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/Makefile Thu Sep 22 17:42:01 2005 @@ -26,19 +26,21 @@ BUILD_SRCS += xc_linux_build.c BUILD_SRCS += xc_load_bin.c BUILD_SRCS += xc_load_elf.c -BUILD_SRCS += xg_private.c ifeq ($(XEN_TARGET_ARCH),ia64) BUILD_SRCS += xc_ia64_stubs.c else +ifeq ($(XEN_TARGET_ARCH),x86_32) SRCS += xc_ptrace.c SRCS += xc_ptrace_core.c - -BUILD_SRCS := xc_load_aout9.c +endif +BUILD_SRCS += xc_load_aout9.c BUILD_SRCS += xc_linux_restore.c BUILD_SRCS += xc_linux_save.c BUILD_SRCS += xc_vmx_build.c endif + +BUILD_SRCS += xg_private.c CFLAGS += -Wall CFLAGS += -Werror diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_core.c --- a/tools/libxc/xc_core.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_core.c Thu Sep 22 17:42:01 2005 @@ -11,10 +11,10 @@ static int copy_from_domain_page(int xc_handle, - u32 domid, - unsigned long *page_array, - unsigned long src_pfn, - void *dst_page) + u32 domid, + unsigned long *page_array, + unsigned long src_pfn, + void *dst_page) { void *vaddr = xc_map_foreign_range( xc_handle, domid, PAGE_SIZE, PROT_READ, page_array[src_pfn]); @@ -27,90 +27,100 @@ int xc_domain_dumpcore(int xc_handle, - u32 domid, - const char *corename) + u32 domid, + const char *corename) { - unsigned long nr_pages; - unsigned long *page_array; - xc_dominfo_t info; - int i, j, vcpu_map_size, dump_fd; - char *dump_mem, *dump_mem_start = NULL; - struct xc_core_header header; - vcpu_guest_context_t ctxt[MAX_VIRT_CPUS]; + unsigned long nr_pages; + unsigned long *page_array; + xc_dominfo_t info; + int i, j, vcpu_map_size, dump_fd; + char *dump_mem, *dump_mem_start = NULL; + struct xc_core_header header; + vcpu_guest_context_t ctxt[MAX_VIRT_CPUS]; - - if ((dump_fd = open(corename, O_CREAT|O_RDWR, S_IWUSR|S_IRUSR)) < 0) { - PERROR("Could not open corefile %s: %s", corename, strerror(errno)); - goto error_out; - } - - if ((dump_mem_start = malloc(DUMP_INCREMENT*PAGE_SIZE)) == NULL) { - PERROR("Could not allocate dump_mem"); - goto 
error_out; - } - - if (xc_domain_getinfo(xc_handle, domid, 1, &info) != 1) { - PERROR("Could not get info for domain"); - goto error_out; - } - - vcpu_map_size = sizeof(info.vcpu_to_cpu) / sizeof(info.vcpu_to_cpu[0]); + + if ((dump_fd = open(corename, O_CREAT|O_RDWR, S_IWUSR|S_IRUSR)) < 0) { + PERROR("Could not open corefile %s: %s", corename, strerror(errno)); + goto error_out; + } + + if ((dump_mem_start = malloc(DUMP_INCREMENT*PAGE_SIZE)) == NULL) { + PERROR("Could not allocate dump_mem"); + goto error_out; + } + + if (xc_domain_getinfo(xc_handle, domid, 1, &info) != 1) { + PERROR("Could not get info for domain"); + goto error_out; + } + + vcpu_map_size = sizeof(info.vcpu_to_cpu) / sizeof(info.vcpu_to_cpu[0]); - for (i = 0, j = 0; i < vcpu_map_size; i++) { - if (info.vcpu_to_cpu[i] == -1) { - continue; - } - if (xc_domain_get_vcpu_context(xc_handle, domid, i, &ctxt[j])) { - PERROR("Could not get all vcpu contexts for domain"); - goto error_out; - } - j++; - } - - nr_pages = info.nr_pages; + for (i = 0, j = 0; i < vcpu_map_size; i++) { + if (info.vcpu_to_cpu[i] == -1) { + continue; + } + if (xc_domain_get_vcpu_context(xc_handle, domid, i, &ctxt[j])) { + PERROR("Could not get all vcpu contexts for domain"); + goto error_out; + } + j++; + } + + nr_pages = info.nr_pages; - header.xch_magic = 0xF00FEBED; - header.xch_nr_vcpus = info.vcpus; - header.xch_nr_pages = nr_pages; - header.xch_ctxt_offset = sizeof(struct xc_core_header); - header.xch_index_offset = sizeof(struct xc_core_header) + - sizeof(vcpu_guest_context_t)*info.vcpus; - header.xch_pages_offset = round_pgup(sizeof(struct xc_core_header) + - (sizeof(vcpu_guest_context_t) * info.vcpus) + - (nr_pages * sizeof(unsigned long))); + header.xch_magic = 0xF00FEBED; + header.xch_nr_vcpus = info.vcpus; + header.xch_nr_pages = nr_pages; + header.xch_ctxt_offset = sizeof(struct xc_core_header); + header.xch_index_offset = sizeof(struct xc_core_header) + + sizeof(vcpu_guest_context_t)*info.vcpus; + 
header.xch_pages_offset = round_pgup(sizeof(struct xc_core_header) + + (sizeof(vcpu_guest_context_t) * info.vcpus) + + (nr_pages * sizeof(unsigned long))); - write(dump_fd, &header, sizeof(struct xc_core_header)); - write(dump_fd, &ctxt, sizeof(ctxt[0]) * info.vcpus); + write(dump_fd, &header, sizeof(struct xc_core_header)); + write(dump_fd, &ctxt, sizeof(ctxt[0]) * info.vcpus); - if ((page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL) { - printf("Could not allocate memory\n"); - goto error_out; - } - if (xc_get_pfn_list(xc_handle, domid, page_array, nr_pages) != nr_pages) { - printf("Could not get the page frame list\n"); - goto error_out; - } - write(dump_fd, page_array, nr_pages * sizeof(unsigned long)); - lseek(dump_fd, header.xch_pages_offset, SEEK_SET); - for (dump_mem = dump_mem_start, i = 0; i < nr_pages; i++) { - copy_from_domain_page(xc_handle, domid, page_array, i, dump_mem); - dump_mem += PAGE_SIZE; - if (((i + 1) % DUMP_INCREMENT == 0) || (i + 1) == nr_pages) { - if (write(dump_fd, dump_mem_start, dump_mem - dump_mem_start) < - dump_mem - dump_mem_start) { - PERROR("Partial write, file system full?"); - goto error_out; - } - dump_mem = dump_mem_start; - } - } + if ((page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL) { + printf("Could not allocate memory\n"); + goto error_out; + } + if (xc_get_pfn_list(xc_handle, domid, page_array, nr_pages) != nr_pages) { + printf("Could not get the page frame list\n"); + goto error_out; + } + write(dump_fd, page_array, nr_pages * sizeof(unsigned long)); + lseek(dump_fd, header.xch_pages_offset, SEEK_SET); + for (dump_mem = dump_mem_start, i = 0; i < nr_pages; i++) { + copy_from_domain_page(xc_handle, domid, page_array, i, dump_mem); + dump_mem += PAGE_SIZE; + if (((i + 1) % DUMP_INCREMENT == 0) || (i + 1) == nr_pages) { + if (write(dump_fd, dump_mem_start, dump_mem - dump_mem_start) < + dump_mem - dump_mem_start) { + PERROR("Partial write, file system full?"); + goto error_out; + } + dump_mem 
= dump_mem_start; + } + } - close(dump_fd); - free(dump_mem_start); - return 0; + close(dump_fd); + free(dump_mem_start); + return 0; error_out: - if (dump_fd != -1) - close(dump_fd); - free(dump_mem_start); - return -1; + if (dump_fd != -1) + close(dump_fd); + free(dump_mem_start); + return -1; } + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_domain.c Thu Sep 22 17:42:01 2005 @@ -265,7 +265,7 @@ unsigned long nr_extents, unsigned int extent_order, unsigned int address_bits, - unsigned long *extent_start) + unsigned long *extent_start) { int err; struct xen_memory_reservation reservation = { @@ -296,7 +296,7 @@ u32 domid, unsigned long nr_extents, unsigned int extent_order, - unsigned long *extent_start) + unsigned long *extent_start) { int err; struct xen_memory_reservation reservation = { @@ -328,3 +328,13 @@ return err; } + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_ia64_stubs.c --- a/tools/libxc/xc_ia64_stubs.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_ia64_stubs.c Thu Sep 22 17:42:01 2005 @@ -9,8 +9,8 @@ } int xc_linux_restore(int xc_handle, int io_fd, u32 dom, unsigned long nr_pfns, - unsigned int store_evtchn, unsigned long *store_mfn, - unsigned int console_evtchn, unsigned long *console_mfn) + unsigned int store_evtchn, unsigned long *store_mfn, + unsigned int console_evtchn, unsigned long *console_mfn) { PERROR("xc_linux_restore not implemented\n"); return -1; @@ -44,3 +44,12 @@ return -1; } +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 
tools/libxc/xc_linux_build.c --- a/tools/libxc/xc_linux_build.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_linux_build.c Thu Sep 22 17:42:01 2005 @@ -12,7 +12,6 @@ #if defined(__x86_64__) || defined(__ia64__) #define ELFSIZE 64 #endif - #include "xc_elf.h" #include "xc_aout9.h" @@ -33,6 +32,13 @@ #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) #endif +#ifdef __ia64__ +#define already_built(ctxt) (0) +#define get_tot_pages xc_get_max_pages +#else +#define already_built(ctxt) ((ctxt)->ctrlreg[3] != 0) +#define get_tot_pages xc_get_tot_pages +#endif #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) #define round_pgdown(_p) ((_p)&PAGE_MASK) @@ -47,7 +53,7 @@ { if ( probe_elf(image, image_size, load_funcs) && probe_bin(image, image_size, load_funcs) && - probe_aout9(image, image_size, load_funcs) ) + probe_aout9(image, image_size, load_funcs) ) { ERROR( "Unrecognized image format" ); return -EINVAL; @@ -56,27 +62,27 @@ return 0; } -#define alloc_pt(ltab, vltab) \ - ltab = (unsigned long long)(page_array[ppt_alloc++]) << PAGE_SHIFT; \ - if (vltab != NULL) { \ - munmap(vltab, PAGE_SIZE); \ - } \ - if ((vltab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, \ - PROT_READ|PROT_WRITE, \ - ltab >> PAGE_SHIFT)) == NULL) { \ - goto error_out; \ - } \ - memset(vltab, 0, PAGE_SIZE); +#define alloc_pt(ltab, vltab) \ +do { \ + ltab = (u64)page_array[ppt_alloc++] << PAGE_SHIFT; \ + if ( vltab != NULL ) \ + munmap(vltab, PAGE_SIZE); \ + if ( (vltab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, \ + PROT_READ|PROT_WRITE, \ + ltab >> PAGE_SHIFT)) == NULL ) \ + goto error_out; \ + memset(vltab, 0, PAGE_SIZE); \ +} while ( 0 ) #if defined(__i386__) static int setup_pg_tables(int xc_handle, u32 dom, - vcpu_guest_context_t *ctxt, - unsigned long dsi_v_start, - unsigned long v_end, - unsigned long *page_array, - unsigned long vpt_start, - unsigned long vpt_end) + vcpu_guest_context_t *ctxt, + unsigned long dsi_v_start, + unsigned long v_end, + 
unsigned long *page_array, + unsigned long vpt_start, + unsigned long vpt_end) { l1_pgentry_t *vl1tab=NULL, *vl1e=NULL; l2_pgentry_t *vl2tab=NULL, *vl2e=NULL; @@ -90,11 +96,11 @@ vl2e = &vl2tab[l2_table_offset(dsi_v_start)]; ctxt->ctrlreg[3] = l2tab; - for ( count = 0; count < ((v_end-dsi_v_start)>>PAGE_SHIFT); count++ ) + for ( count = 0; count < ((v_end - dsi_v_start) >> PAGE_SHIFT); count++ ) { if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 ) { - alloc_pt(l1tab, vl1tab); + alloc_pt(l1tab, vl1tab); vl1e = &vl1tab[l1_table_offset(dsi_v_start + (count<<PAGE_SHIFT))]; *vl2e++ = l1tab | L2_PROT; } @@ -111,79 +117,67 @@ error_out: if (vl1tab) - munmap(vl1tab, PAGE_SIZE); + munmap(vl1tab, PAGE_SIZE); if (vl2tab) - munmap(vl2tab, PAGE_SIZE); + munmap(vl2tab, PAGE_SIZE); return -1; } static int setup_pg_tables_pae(int xc_handle, u32 dom, - vcpu_guest_context_t *ctxt, - unsigned long dsi_v_start, - unsigned long v_end, - unsigned long *page_array, - unsigned long vpt_start, - unsigned long vpt_end) + vcpu_guest_context_t *ctxt, + unsigned long dsi_v_start, + unsigned long v_end, + unsigned long *page_array, + unsigned long vpt_start, + unsigned long vpt_end) { - l1_pgentry_64_t *vl1tab=NULL, *vl1e=NULL; - l2_pgentry_64_t *vl2tab=NULL, *vl2e=NULL; - l3_pgentry_64_t *vl3tab=NULL, *vl3e=NULL; - unsigned long long l1tab = 0; - unsigned long long l2tab = 0; - unsigned long long l3tab = 0; - unsigned long ppt_alloc; - unsigned long count; + l1_pgentry_64_t *vl1tab = NULL, *vl1e = NULL; + l2_pgentry_64_t *vl2tab = NULL, *vl2e = NULL; + l3_pgentry_64_t *vl3tab = NULL, *vl3e = NULL; + u64 l1tab, l2tab, l3tab; + unsigned long ppt_alloc, count, nmfn; /* First allocate page for page dir. 
*/ ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT; if ( page_array[ppt_alloc] > 0xfffff ) { - unsigned long nmfn; - nmfn = xc_make_page_below_4G( xc_handle, dom, page_array[ppt_alloc] ); - if ( nmfn == 0 ) - { - fprintf(stderr, "Couldn't get a page below 4GB :-(\n"); - goto error_out; - } - page_array[ppt_alloc] = nmfn; + nmfn = xc_make_page_below_4G(xc_handle, dom, page_array[ppt_alloc]); + if ( nmfn == 0 ) + { + fprintf(stderr, "Couldn't get a page below 4GB :-(\n"); + goto error_out; + } + page_array[ppt_alloc] = nmfn; } alloc_pt(l3tab, vl3tab); vl3e = &vl3tab[l3_table_offset_pae(dsi_v_start)]; ctxt->ctrlreg[3] = l3tab; - if(l3tab>0xfffff000ULL) - { - fprintf(stderr,"L3TAB = %llx above 4GB!\n",l3tab); - goto error_out; - } - - for ( count = 0; count < ((v_end-dsi_v_start)>>PAGE_SHIFT); count++) + for ( count = 0; count < ((v_end - dsi_v_start) >> PAGE_SHIFT); count++) { if ( !((unsigned long)vl1e & (PAGE_SIZE-1)) ) { + if ( !((unsigned long)vl2e & (PAGE_SIZE-1)) ) + { + alloc_pt(l2tab, vl2tab); + vl2e = &vl2tab[l2_table_offset_pae( + dsi_v_start + (count << PAGE_SHIFT))]; + *vl3e++ = l2tab | L3_PROT; + } + alloc_pt(l1tab, vl1tab); - - if ( !((unsigned long)vl2e & (PAGE_SIZE-1)) ) - { - alloc_pt(l2tab, vl2tab); - vl2e = &vl2tab[l2_table_offset_pae(dsi_v_start + (count<<PAGE_SHIFT))]; - *vl3e = l2tab | L3_PROT; - vl3e++; - } - vl1e = &vl1tab[l1_table_offset_pae(dsi_v_start + (count<<PAGE_SHIFT))]; - *vl2e = l1tab | L2_PROT; - vl2e++; + vl1e = &vl1tab[l1_table_offset_pae( + dsi_v_start + (count << PAGE_SHIFT))]; + *vl2e++ = l1tab | L2_PROT; } - *vl1e = (page_array[count] << PAGE_SHIFT) | L1_PROT; + *vl1e = ((u64)page_array[count] << PAGE_SHIFT) | L1_PROT; if ( (count >= ((vpt_start-dsi_v_start)>>PAGE_SHIFT)) && - (count < ((vpt_end -dsi_v_start)>>PAGE_SHIFT)) ) - { - *vl1e &= ~_PAGE_RW; - } - vl1e++; + (count < ((vpt_end -dsi_v_start)>>PAGE_SHIFT)) ) + *vl1e &= ~_PAGE_RW; + vl1e++; } munmap(vl1tab, PAGE_SIZE); @@ -193,11 +187,11 @@ error_out: if (vl1tab) - 
munmap(vl1tab, PAGE_SIZE); + munmap(vl1tab, PAGE_SIZE); if (vl2tab) - munmap(vl2tab, PAGE_SIZE); + munmap(vl2tab, PAGE_SIZE); if (vl3tab) - munmap(vl3tab, PAGE_SIZE); + munmap(vl3tab, PAGE_SIZE); return -1; } @@ -206,12 +200,12 @@ #if defined(__x86_64__) static int setup_pg_tables_64(int xc_handle, u32 dom, - vcpu_guest_context_t *ctxt, - unsigned long dsi_v_start, - unsigned long v_end, - unsigned long *page_array, - unsigned long vpt_start, - unsigned long vpt_end) + vcpu_guest_context_t *ctxt, + unsigned long dsi_v_start, + unsigned long v_end, + unsigned long *page_array, + unsigned long vpt_start, + unsigned long vpt_end) { l1_pgentry_t *vl1tab=NULL, *vl1e=NULL; l2_pgentry_t *vl2tab=NULL, *vl2e=NULL; @@ -236,20 +230,20 @@ { alloc_pt(l1tab, vl1tab); - if ( !((unsigned long)vl2e & (PAGE_SIZE-1)) ) + if ( !((unsigned long)vl2e & (PAGE_SIZE-1)) ) + { + alloc_pt(l2tab, vl2tab); + if ( !((unsigned long)vl3e & (PAGE_SIZE-1)) ) { - alloc_pt(l2tab, vl2tab); - if ( !((unsigned long)vl3e & (PAGE_SIZE-1)) ) - { - alloc_pt(l3tab, vl3tab); - vl3e = &vl3tab[l3_table_offset(dsi_v_start + (count<<PAGE_SHIFT))]; - *vl4e = l3tab | L4_PROT; - vl4e++; - } - vl2e = &vl2tab[l2_table_offset(dsi_v_start + (count<<PAGE_SHIFT))]; - *vl3e = l2tab | L3_PROT; - vl3e++; + alloc_pt(l3tab, vl3tab); + vl3e = &vl3tab[l3_table_offset(dsi_v_start + (count<<PAGE_SHIFT))]; + *vl4e = l3tab | L4_PROT; + vl4e++; } + vl2e = &vl2tab[l2_table_offset(dsi_v_start + (count<<PAGE_SHIFT))]; + *vl3e = l2tab | L3_PROT; + vl3e++; + } vl1e = &vl1tab[l1_table_offset(dsi_v_start + (count<<PAGE_SHIFT))]; *vl2e = l1tab | L2_PROT; vl2e++; @@ -257,11 +251,11 @@ *vl1e = (page_array[count] << PAGE_SHIFT) | L1_PROT; if ( (count >= ((vpt_start-dsi_v_start)>>PAGE_SHIFT)) && - (count < ((vpt_end -dsi_v_start)>>PAGE_SHIFT)) ) - { - *vl1e &= ~_PAGE_RW; - } - vl1e++; + (count < ((vpt_end -dsi_v_start)>>PAGE_SHIFT)) ) + { + *vl1e &= ~_PAGE_RW; + } + vl1e++; } munmap(vl1tab, PAGE_SIZE); @@ -272,13 +266,13 @@ error_out: if 
(vl1tab) - munmap(vl1tab, PAGE_SIZE); + munmap(vl1tab, PAGE_SIZE); if (vl2tab) - munmap(vl2tab, PAGE_SIZE); + munmap(vl2tab, PAGE_SIZE); if (vl3tab) - munmap(vl3tab, PAGE_SIZE); + munmap(vl3tab, PAGE_SIZE); if (vl4tab) - munmap(vl4tab, PAGE_SIZE); + munmap(vl4tab, PAGE_SIZE); return -1; } #endif @@ -286,18 +280,18 @@ #ifdef __ia64__ #include <asm/fpu.h> /* for FPSR_DEFAULT */ static int setup_guest(int xc_handle, - u32 dom, - char *image, unsigned long image_size, - gzFile initrd_gfd, unsigned long initrd_len, - unsigned long nr_pages, - unsigned long *pvsi, unsigned long *pvke, - unsigned long *pvss, vcpu_guest_context_t *ctxt, - const char *cmdline, - unsigned long shared_info_frame, - unsigned long flags, - unsigned int vcpus, - unsigned int store_evtchn, unsigned long *store_mfn, - unsigned int console_evtchn, unsigned long *console_mfn) + u32 dom, + char *image, unsigned long image_size, + gzFile initrd_gfd, unsigned long initrd_len, + unsigned long nr_pages, + unsigned long *pvsi, unsigned long *pvke, + unsigned long *pvss, vcpu_guest_context_t *ctxt, + const char *cmdline, + unsigned long shared_info_frame, + unsigned long flags, + unsigned int vcpus, + unsigned int store_evtchn, unsigned long *store_mfn, + unsigned int console_evtchn, unsigned long *console_mfn) { unsigned long *page_array = NULL; struct load_funcs load_funcs; @@ -339,19 +333,20 @@ *pvke = dsi.v_kernentry; /* Now need to retrieve machine pfn for system pages: - * start_info/store/console + * start_info/store/console */ pgnr = 3; - if ( xc_ia64_get_pfn_list(xc_handle, dom, page_array, nr_pages - 3, pgnr) != pgnr) - { - PERROR("Could not get page frame for xenstore"); - goto error_out; + if ( xc_ia64_get_pfn_list(xc_handle, dom, page_array, + nr_pages - 3, pgnr) != pgnr ) + { + PERROR("Could not get page frame for xenstore"); + goto error_out; } *store_mfn = page_array[1]; *console_mfn = page_array[2]; printf("store_mfn: 0x%lx, console_mfn: 0x%lx\n", - (u64)store_mfn, (u64)console_mfn); + 
(u64)store_mfn, (u64)console_mfn); start_info = xc_map_foreign_range( xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, page_array[0]); @@ -382,8 +377,8 @@ unsigned long shared_info_frame, unsigned long flags, unsigned int vcpus, - unsigned int store_evtchn, unsigned long *store_mfn, - unsigned int console_evtchn, unsigned long *console_mfn) + unsigned int store_evtchn, unsigned long *store_mfn, + unsigned int console_evtchn, unsigned long *console_mfn) { unsigned long *page_array = NULL; unsigned long count, i; @@ -458,26 +453,26 @@ if ( (v_end - vstack_end) < (512UL << 10) ) v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */ #if defined(__i386__) - if (dsi.pae_kernel) { - /* FIXME: assumes one L2 pgtable @ 0xc0000000 */ - if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT_PAE)-1)) >> - L2_PAGETABLE_SHIFT_PAE) + 2) <= nr_pt_pages ) - break; - } else { - if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >> - L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages ) - break; - } + if (dsi.pae_kernel) { + /* FIXME: assumes one L2 pgtable @ 0xc0000000 */ + if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT_PAE)-1)) >> + L2_PAGETABLE_SHIFT_PAE) + 2) <= nr_pt_pages ) + break; + } else { + if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >> + L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages ) + break; + } #endif #if defined(__x86_64__) #define NR(_l,_h,_s) \ (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \ ((_l) & ~((1UL<<(_s))-1))) >> (_s)) - if ( (1 + /* # L4 */ - NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */ - NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */ - NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */ - <= nr_pt_pages ) + if ( (1 + /* # L4 */ + NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */ + NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */ + NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */ + <= nr_pt_pages ) break; #endif } @@ -541,7 +536,7 @@ goto error_out; } xc_copy_to_domain_page(xc_handle, dom, 
- page_array[i>>PAGE_SHIFT], page); + page_array[i>>PAGE_SHIFT], page); } } @@ -551,22 +546,22 @@ /* setup page tables */ #if defined(__i386__) if (dsi.pae_kernel) - rc = setup_pg_tables_pae(xc_handle, dom, ctxt, - dsi.v_start, v_end, - page_array, vpt_start, vpt_end); + rc = setup_pg_tables_pae(xc_handle, dom, ctxt, + dsi.v_start, v_end, + page_array, vpt_start, vpt_end); else { - rc = setup_pg_tables(xc_handle, dom, ctxt, - dsi.v_start, v_end, - page_array, vpt_start, vpt_end); + rc = setup_pg_tables(xc_handle, dom, ctxt, + dsi.v_start, v_end, + page_array, vpt_start, vpt_end); } #endif #if defined(__x86_64__) rc = setup_pg_tables_64(xc_handle, dom, ctxt, - dsi.v_start, v_end, - page_array, vpt_start, vpt_end); + dsi.v_start, v_end, + page_array, vpt_start, vpt_end); #endif if (0 != rc) - goto error_out; + goto error_out; /* Write the phys->machine and machine->phys table entries. */ physmap_pfn = (vphysmap_start - dsi.v_start) >> PAGE_SHIFT; @@ -576,11 +571,13 @@ for ( count = 0; count < nr_pages; count++ ) { - if ( xc_add_mmu_update(xc_handle, mmu, - ((unsigned long long)page_array[count] << PAGE_SHIFT) | - MMU_MACHPHYS_UPDATE, count) ) - { - fprintf(stderr,"m2p update failure p=%lx m=%lx\n",count,page_array[count] ); + if ( xc_add_mmu_update( + xc_handle, mmu, + ((u64)page_array[count] << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, + count) ) + { + fprintf(stderr,"m2p update failure p=%lx m=%lx\n", + count, page_array[count]); munmap(physmap, PAGE_SIZE); goto error_out; } @@ -601,13 +598,13 @@ * correct protection for the page */ if (dsi.pae_kernel) { - if ( pin_table(xc_handle, MMUEXT_PIN_L3_TABLE, - ctxt->ctrlreg[3] >> PAGE_SHIFT, dom) ) - goto error_out; + if ( pin_table(xc_handle, MMUEXT_PIN_L3_TABLE, + ctxt->ctrlreg[3] >> PAGE_SHIFT, dom) ) + goto error_out; } else { - if ( pin_table(xc_handle, MMUEXT_PIN_L2_TABLE, - ctxt->ctrlreg[3] >> PAGE_SHIFT, dom) ) - goto error_out; + if ( pin_table(xc_handle, MMUEXT_PIN_L2_TABLE, + ctxt->ctrlreg[3] >> PAGE_SHIFT, dom) ) + 
goto error_out; } #endif @@ -616,8 +613,8 @@ * Pin down l4tab addr as page dir page - causes hypervisor to provide * correct protection for the page */ - if ( pin_table(xc_handle, MMUEXT_PIN_L4_TABLE, - ctxt->ctrlreg[3] >> PAGE_SHIFT, dom) ) + if ( pin_table(xc_handle, MMUEXT_PIN_L4_TABLE, + ctxt->ctrlreg[3] >> PAGE_SHIFT, dom) ) goto error_out; #endif @@ -703,12 +700,7 @@ unsigned long image_size, initrd_size=0; unsigned long vstartinfo_start, vkern_entry, vstack_start; -#ifdef __ia64__ - /* Current xen/ia64 allocates domU pages on demand */ - if ( (nr_pages = xc_get_max_pages(xc_handle, domid)) < 0 ) -#else - if ( (nr_pages = xc_get_tot_pages(xc_handle, domid)) < 0 ) -#endif + if ( (nr_pages = get_tot_pages(xc_handle, domid)) < 0 ) { PERROR("Could not find total pages for domain"); goto error_out; @@ -755,12 +747,7 @@ goto error_out; } - if ( !(op.u.getdomaininfo.flags & DOMFLAGS_PAUSED) || -#ifdef __ia64__ - 0 ) -#else - (ctxt->ctrlreg[3] != 0) ) -#endif + if ( !(op.u.getdomaininfo.flags & DOMFLAGS_PAUSED) || already_built(ctxt) ) { ERROR("Domain is already constructed"); goto error_out; @@ -773,7 +760,7 @@ op.u.getdomaininfo.shared_info_frame, flags, vcpus, store_evtchn, store_mfn, - console_evtchn, console_mfn) < 0 ) + console_evtchn, console_mfn) < 0 ) { ERROR("Error constructing guest OS"); goto error_out; @@ -789,12 +776,13 @@ /* based on new_thread in xen/arch/ia64/domain.c */ ctxt->flags = 0; ctxt->shared.flags = flags; - ctxt->shared.start_info_pfn = nr_pages - 3; // metaphysical + ctxt->shared.start_info_pfn = nr_pages - 3; /* metaphysical */ ctxt->regs.cr_ipsr = 0; /* all necessary bits filled by hypervisor */ ctxt->regs.cr_iip = vkern_entry; ctxt->regs.cr_ifs = 1UL << 63; ctxt->regs.ar_fpsr = FPSR_DEFAULT; - /* ctxt->regs.r28 = dom_fw_setup(); currently done by hypervisor, should move here */ + /* currently done by hypervisor, should move here */ + /* ctxt->regs.r28 = dom_fw_setup(); */ ctxt->vcpu.privregs = 0; ctxt->sys_pgnr = nr_pages - 3; i = 0; /* 
silence unused variable warning */ @@ -875,3 +863,13 @@ return -1; } + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_linux_save.c --- a/tools/libxc/xc_linux_save.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_linux_save.c Thu Sep 22 17:42:01 2005 @@ -17,7 +17,6 @@ #define BATCH_SIZE 1024 /* 1024 pages (4MB) at a time */ #define MAX_MBIT_RATE 500 - /* ** Default values for important tuning parameters. Can override by passing @@ -29,12 +28,9 @@ #define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */ #define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */ - - /* Flags to control behaviour of xc_linux_save */ #define XCFLAGS_LIVE 1 #define XCFLAGS_DEBUG 2 - #define DEBUG 0 @@ -115,8 +111,8 @@ int i, count = 0; unsigned long *p = (unsigned long *)addr; /* We know that the array is padded to unsigned long. */ - for(i=0;i<nr/(sizeof(unsigned long)*8);i++,p++) - count += hweight32( *p ); + for( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ ) + count += hweight32(*p); return count; } @@ -201,42 +197,50 @@ struct timespec delay; long long delta; - if (START_MBIT_RATE == 0) - return write(io_fd, buf, n); + if ( START_MBIT_RATE == 0 ) + return write(io_fd, buf, n); budget -= n; - if (budget < 0) { - if (MBIT_RATE != ombit_rate) { - BURST_TIME_US = RATE_TO_BTU / MBIT_RATE; - ombit_rate = MBIT_RATE; - DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n", - MBIT_RATE, BURST_BUDGET, BURST_TIME_US); - } - if (last_put.tv_sec == 0) { - budget += BURST_BUDGET; - gettimeofday(&last_put, NULL); - } else { - while (budget < 0) { - gettimeofday(&now, NULL); - delta = tv_delta(&now, &last_put); - while (delta > BURST_TIME_US) { - budget += BURST_BUDGET; - last_put.tv_usec += BURST_TIME_US; - if (last_put.tv_usec > 1000000) { - last_put.tv_usec -= 1000000; - last_put.tv_sec++; - } - delta -= BURST_TIME_US; - } - if 
(budget > 0) - break; - delay.tv_sec = 0; - delay.tv_nsec = 1000 * (BURST_TIME_US - delta); - while (delay.tv_nsec > 0) - if (nanosleep(&delay, &delay) == 0) - break; - } - } + if ( budget < 0 ) + { + if ( MBIT_RATE != ombit_rate ) + { + BURST_TIME_US = RATE_TO_BTU / MBIT_RATE; + ombit_rate = MBIT_RATE; + DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n", + MBIT_RATE, BURST_BUDGET, BURST_TIME_US); + } + if ( last_put.tv_sec == 0 ) + { + budget += BURST_BUDGET; + gettimeofday(&last_put, NULL); + } + else + { + while ( budget < 0 ) + { + gettimeofday(&now, NULL); + delta = tv_delta(&now, &last_put); + while ( delta > BURST_TIME_US ) + { + budget += BURST_BUDGET; + last_put.tv_usec += BURST_TIME_US; + if ( last_put.tv_usec > 1000000 ) + { + last_put.tv_usec -= 1000000; + last_put.tv_sec++; + } + delta -= BURST_TIME_US; + } + if ( budget > 0 ) + break; + delay.tv_sec = 0; + delay.tv_nsec = 1000 * (BURST_TIME_US - delta); + while ( delay.tv_nsec > 0 ) + if ( nanosleep(&delay, &delay) == 0 ) + break; + } + } } return write(io_fd, buf, n); } @@ -271,20 +275,21 @@ if ( print ) fprintf(stderr, - "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, " - "dirtied %dMb/s %" PRId32 " pages\n", - wall_delta, - (int)((d0_cpu_delta*100)/wall_delta), - (int)((d1_cpu_delta*100)/wall_delta), - (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))), - (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))), - stats->dirty_count); - - if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) { - mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) - + 50; - if (mbit_rate > MAX_MBIT_RATE) - mbit_rate = MAX_MBIT_RATE; + "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, " + "dirtied %dMb/s %" PRId32 " pages\n", + wall_delta, + (int)((d0_cpu_delta*100)/wall_delta), + (int)((d1_cpu_delta*100)/wall_delta), + (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))), + (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))), + stats->dirty_count); 
+ + if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate ) + { + mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) + + 50; + if (mbit_rate > MAX_MBIT_RATE) + mbit_rate = MAX_MBIT_RATE; } d0_cpu_last = d0_cpu_now; @@ -303,7 +308,7 @@ start = llgettimeofday(); - for (j = 0; j < runs; j++) + for ( j = 0; j < runs; j++ ) { int i; @@ -320,10 +325,10 @@ NULL, 0, &stats); fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32 - " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n", - ((now-start)+500)/1000, - stats.fault_count, stats.dirty_count, - stats.dirty_net_count, stats.dirty_block_count); + " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n", + ((now-start)+500)/1000, + stats.fault_count, stats.dirty_count, + stats.dirty_net_count, stats.dirty_block_count); } } @@ -331,7 +336,7 @@ } -static int suspend_and_state(int xc_handle, int io_fd, int dom, +static int suspend_and_state(int xc_handle, int io_fd, int dom, xc_dominfo_t *info, vcpu_guest_context_t *ctxt) { @@ -340,51 +345,53 @@ printf("suspend\n"); fflush(stdout); - if (fgets(ans, sizeof(ans), stdin) == NULL) { + if ( fgets(ans, sizeof(ans), stdin) == NULL ) + { ERR("failed reading suspend reply"); return -1; } - if (strncmp(ans, "done\n", 5)) { + if ( strncmp(ans, "done\n", 5) ) + { ERR("suspend reply incorrect: %s", ans); return -1; } -retry: + retry: if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1) { - ERR("Could not get domain info"); - return -1; + ERR("Could not get domain info"); + return -1; } if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */, - ctxt) ) + ctxt) ) { ERR("Could not get vcpu context"); } if ( info->shutdown && info->shutdown_reason == SHUTDOWN_suspend ) { - return 0; // success + return 0; // success } if ( info->paused ) { - // try unpausing domain, wait, and retest - xc_domain_unpause( xc_handle, dom ); - - ERR("Domain was paused. 
Wait and re-test."); - usleep(10000); // 10ms - - goto retry; + // try unpausing domain, wait, and retest + xc_domain_unpause( xc_handle, dom ); + + ERR("Domain was paused. Wait and re-test."); + usleep(10000); // 10ms + + goto retry; } if( ++i < 100 ) { - ERR("Retry suspend domain."); - usleep(10000); // 10ms - goto retry; + ERR("Retry suspend domain."); + usleep(10000); // 10ms + goto retry; } ERR("Unable to suspend domain."); @@ -454,26 +461,26 @@ /* If no explicit control parameters given, use defaults */ - if(!max_iters) + if( !max_iters ) max_iters = DEF_MAX_ITERS; - if(!max_factor) + if( !max_factor ) max_factor = DEF_MAX_FACTOR; DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live?"true":"false"); - if (mlock(&ctxt, sizeof(ctxt))) { + if ( mlock(&ctxt, sizeof(ctxt)) ) + { ERR("Unable to mlock ctxt"); return 1; } - if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) + if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 ) { ERR("Could not get domain info"); goto out; } - if ( xc_domain_get_vcpu_context( xc_handle, dom, /* FIXME */ 0, - &ctxt) ) + if ( xc_domain_get_vcpu_context(xc_handle, dom, /* FIXME */ 0, &ctxt) ) { ERR("Could not get vcpu context"); goto out; @@ -481,7 +488,8 @@ shared_info_frame = info.shared_info_frame; /* A cheesy test to see whether the domain contains valid state. 
*/ - if ( ctxt.ctrlreg[3] == 0 ){ + if ( ctxt.ctrlreg[3] == 0 ) + { ERR("Domain is not in a valid Linux guest OS state"); goto out; } @@ -496,18 +504,17 @@ } /* Map the shared info frame */ - live_shinfo = xc_map_foreign_range(xc_handle, dom, - PAGE_SIZE, PROT_READ, - shared_info_frame); - - if (!live_shinfo){ + live_shinfo = xc_map_foreign_range( + xc_handle, dom, PAGE_SIZE, PROT_READ, shared_info_frame); + if ( !live_shinfo ) + { ERR("Couldn't map live_shinfo"); goto out; } - live_pfn_to_mfn_frame_list_list = xc_map_foreign_range(xc_handle, dom, - PAGE_SIZE, PROT_READ, - live_shinfo->arch.pfn_to_mfn_frame_list_list); + live_pfn_to_mfn_frame_list_list = xc_map_foreign_range( + xc_handle, dom, + PAGE_SIZE, PROT_READ, live_shinfo->arch.pfn_to_mfn_frame_list_list); if (!live_pfn_to_mfn_frame_list_list){ ERR("Couldn't map pfn_to_mfn_frame_list_list"); @@ -515,12 +522,13 @@ } live_pfn_to_mfn_frame_list = - xc_map_foreign_batch(xc_handle, dom, - PROT_READ, - live_pfn_to_mfn_frame_list_list, - (nr_pfns+(1024*1024)-1)/(1024*1024) ); - - if (!live_pfn_to_mfn_frame_list){ + xc_map_foreign_batch(xc_handle, dom, + PROT_READ, + live_pfn_to_mfn_frame_list_list, + (nr_pfns+(1024*1024)-1)/(1024*1024) ); + + if ( !live_pfn_to_mfn_frame_list) + { ERR("Couldn't map pfn_to_mfn_frame_list"); goto out; } @@ -535,7 +543,8 @@ PROT_READ, live_pfn_to_mfn_frame_list, (nr_pfns+1023)/1024 ); - if( !live_pfn_to_mfn_table ){ + if ( !live_pfn_to_mfn_table ) + { ERR("Couldn't map pfn_to_mfn table"); goto out; } @@ -544,15 +553,17 @@ mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle ); live_mfn_to_pfn_table = - xc_map_foreign_range(xc_handle, DOMID_XEN, - PAGE_SIZE*1024, PROT_READ, - mfn_to_pfn_table_start_mfn ); + xc_map_foreign_range(xc_handle, DOMID_XEN, + PAGE_SIZE*1024, PROT_READ, + mfn_to_pfn_table_start_mfn ); /* Canonicalise the pfn-to-mfn table frame-number list. 
*/ memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE ); - for ( i = 0; i < nr_pfns; i += 1024 ){ - if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){ + for ( i = 0; i < nr_pfns; i += 1024 ) + { + if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ) + { ERR("Frame# in pfn-to-mfn frame list is not in pseudophys"); goto out; } @@ -561,40 +572,44 @@ /* Domain is still running at this point */ - if( live ) + if ( live ) { if ( xc_shadow_control( xc_handle, dom, DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY, - NULL, 0, NULL ) < 0 ) { + NULL, 0, NULL ) < 0 ) + { ERR("Couldn't enable shadow mode"); goto out; } last_iter = 0; - } else{ - /* This is a non-live suspend. Issue the call back to get the - domain suspended */ + } + else + { + /* This is a non-live suspend. Issue the call back to get the + domain suspended */ last_iter = 1; - if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) ) - { - ERR("Domain appears not to have suspended"); - goto out; - } + if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) ) + { + ERR("Domain appears not to have suspended"); + goto out; + } } sent_last_iter = 1<<20; /* 4GB of pages */ /* calculate the power of 2 order of nr_pfns, e.g. 15->4 16->4 17->5 */ - for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ ); + for ( i = nr_pfns-1, order_nr = 0; i ; i >>= 1, order_nr++ ) + continue; /* Setup to_send bitmap */ { - /* size these for a maximal 4GB domain, to make interaction - with balloon driver easier. It's only user space memory, - ater all... (3x 128KB) */ + /* size these for a maximal 4GB domain, to make interaction + with balloon driver easier. It's only user space memory, + ater all... 
(3x 128KB) */ int sz = ( 1<<20 ) / 8; @@ -602,21 +617,24 @@ to_fix = calloc( 1, sz ); to_skip = malloc( sz ); - if (!to_send || !to_fix || !to_skip){ + if ( !to_send || !to_fix || !to_skip ) + { ERR("Couldn't allocate to_send array"); goto out; } - memset( to_send, 0xff, sz ); - - if ( mlock( to_send, sz ) ){ + memset(to_send, 0xff, sz); + + if ( mlock(to_send, sz) ) + { ERR("Unable to mlock to_send"); return 1; } /* (to fix is local only) */ - if ( mlock( to_skip, sz ) ){ + if ( mlock(to_skip, sz) ) + { ERR("Unable to mlock to_skip"); return 1; } @@ -629,12 +647,14 @@ pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long)); pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long)); - if ( (pfn_type == NULL) || (pfn_batch == NULL) ){ + if ( (pfn_type == NULL) || (pfn_batch == NULL) ) + { errno = ENOMEM; goto out; } - if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ){ + if ( mlock(pfn_type, BATCH_SIZE * sizeof(unsigned long)) ) + { ERR("Unable to mlock"); goto out; } @@ -645,31 +665,34 @@ */ #if DEBUG { - int err=0; - for ( i = 0; i < nr_pfns; i++ ) - { - mfn = live_pfn_to_mfn_table[i]; - - if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) ) - { - fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n", - i,mfn,live_mfn_to_pfn_table[mfn]); - err++; - } - } - fprintf(stderr, "Had %d unexplained entries in p2m table\n",err); + int err=0; + for ( i = 0; i < nr_pfns; i++ ) + { + mfn = live_pfn_to_mfn_table[i]; + + if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) ) + { + fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n", + i,mfn,live_mfn_to_pfn_table[mfn]); + err++; + } + } + fprintf(stderr, "Had %d unexplained entries in p2m table\n",err); } #endif /* Start writing out the saved-domain record. 
*/ - if (write(io_fd, &nr_pfns, sizeof(unsigned long)) != - sizeof(unsigned long)) { - ERR("write: nr_pfns"); - goto out; - } - if (write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) { + if ( write(io_fd, &nr_pfns, sizeof(unsigned long)) != + sizeof(unsigned long) ) + { + ERR("write: nr_pfns"); + goto out; + } + + if ( write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE ) + { ERR("write: pfn_to_mfn_frame_list"); goto out; } @@ -678,7 +701,8 @@ /* Now write out each data page, canonicalising page tables as we go... */ - while(1){ + for ( ; ; ) + { unsigned int prev_pc, sent_this_iter, N, batch; iter++; @@ -689,10 +713,12 @@ DPRINTF("Saving memory pages: iter %d 0%%", iter); - while( N < nr_pfns ){ + while ( N < nr_pfns ) + { unsigned int this_pc = (N * 100) / nr_pfns; - if ( (this_pc - prev_pc) >= 5 ){ + if ( (this_pc - prev_pc) >= 5 ) + { DPRINTF("\b\b\b\b%3d%%", this_pc); prev_pc = this_pc; } @@ -701,10 +727,10 @@ but this is fast enough for the moment. */ if ( !last_iter && - xc_shadow_control(xc_handle, dom, + xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK, to_skip, nr_pfns, NULL) != nr_pfns ) - { + { ERR("Error peeking shadow bitmap"); goto out; } @@ -748,7 +774,7 @@ pfn_type[batch] = live_pfn_to_mfn_table[n]; if( ! is_mapped(pfn_type[batch]) ) - { + { /* not currently in pusedo-physical map -- set bit in to_fix that we must send this page in last_iter unless its sent sooner anyhow */ @@ -756,7 +782,7 @@ set_bit( n, to_fix ); if( iter>1 ) DPRINTF("netbuf race: iter %d, pfn %x. 
mfn %lx\n", - iter,n,pfn_type[batch]); + iter,n,pfn_type[batch]); continue; } @@ -790,8 +816,10 @@ goto out; } - for ( j = 0; j < batch; j++ ){ - if ( (pfn_type[j] & LTAB_MASK) == XTAB ){ + for ( j = 0; j < batch; j++ ) + { + if ( (pfn_type[j] & LTAB_MASK) == XTAB ) + { DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]); continue; } @@ -809,21 +837,25 @@ pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j]; } - if (write(io_fd, &batch, sizeof(int)) != sizeof(int)) { + if ( write(io_fd, &batch, sizeof(int)) != sizeof(int) ) + { ERR("Error when writing to state file (2)"); goto out; } - if (write(io_fd, pfn_type, sizeof(unsigned long)*j) != - sizeof(unsigned long)*j) { + if ( write(io_fd, pfn_type, sizeof(unsigned long)*j) != + (sizeof(unsigned long) * j) ) + { ERR("Error when writing to state file (3)"); goto out; } /* entering this loop, pfn_type is now in pfns (Not mfns) */ - for( j = 0; j < batch; j++ ){ + for ( j = 0; j < batch; j++ ) + { /* write out pages in batch */ - if( (pfn_type[j] & LTAB_MASK) == XTAB){ + if ( (pfn_type[j] & LTAB_MASK) == XTAB ) + { DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]); continue; } @@ -836,7 +868,8 @@ k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ? (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024); - k++ ){ + k++ ) + { unsigned long pfn; if ( !(page[k] & _PAGE_PRESENT) ) @@ -849,13 +882,13 @@ { /* I don't think this should ever happen */ fprintf(stderr, "FNI %d : [%08lx,%d] pte=%08lx, " - "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n", - j, pfn_type[j], k, - page[k], mfn, live_mfn_to_pfn_table[mfn], - (live_mfn_to_pfn_table[mfn]<nr_pfns)? - live_pfn_to_mfn_table[ - live_mfn_to_pfn_table[mfn]] : - 0xdeadbeef); + "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n", + j, pfn_type[j], k, + page[k], mfn, live_mfn_to_pfn_table[mfn], + (live_mfn_to_pfn_table[mfn]<nr_pfns)? 
+ live_pfn_to_mfn_table[ + live_mfn_to_pfn_table[mfn]] : + 0xdeadbeef); pfn = 0; /* be suspicious */ } @@ -865,12 +898,12 @@ #if 0 fprintf(stderr, - "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx " - "xpfn=%d\n", - pfn_type[j]>>28, - j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT); + "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx " + "xpfn=%d\n", + pfn_type[j]>>28, + j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT); #endif - + } /* end of page table rewrite for loop */ if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) { @@ -880,8 +913,9 @@ } /* end of it's a PT page */ else { /* normal page */ - if (ratewrite(io_fd, region_base + (PAGE_SIZE*j), - PAGE_SIZE) != PAGE_SIZE) { + if ( ratewrite(io_fd, region_base + (PAGE_SIZE*j), + PAGE_SIZE) != PAGE_SIZE ) + { ERR("Error when writing to state file (5)"); goto out; } @@ -899,13 +933,13 @@ total_sent += sent_this_iter; DPRINTF("\r %d: sent %d, skipped %d, ", - iter, sent_this_iter, skip_this_iter ); + iter, sent_this_iter, skip_this_iter ); if ( last_iter ) { print_stats( xc_handle, dom, sent_this_iter, &stats, 1); DPRINTF("Total pages sent= %d (%.2fx)\n", - total_sent, ((float)total_sent)/nr_pfns ); + total_sent, ((float)total_sent)/nr_pfns ); DPRINTF("(of which %d were fixups)\n", needed_to_fix ); } @@ -930,7 +964,7 @@ { if ( ( ( sent_this_iter > sent_last_iter ) && - (mbit_rate == MAX_MBIT_RATE ) ) || + (mbit_rate == MAX_MBIT_RATE ) ) || (iter >= max_iters) || (sent_this_iter+skip_this_iter < 50) || (total_sent > nr_pfns*max_factor) ) @@ -938,15 +972,15 @@ DPRINTF("Start last iteration\n"); last_iter = 1; - if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) ) - { - ERR("Domain appears not to have suspended"); - goto out; - } - - DPRINTF("SUSPEND shinfo %08lx eip %08u esi %08u\n", - info.shared_info_frame, - ctxt.user_regs.eip, ctxt.user_regs.esi); + if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) ) + { + ERR("Domain appears not to have suspended"); + goto out; + } + + DPRINTF("SUSPEND shinfo %08lx eip %08u esi %08u\n", + 
info.shared_info_frame, + ctxt.user_regs.eip, ctxt.user_regs.esi); } if ( xc_shadow_control( xc_handle, dom, @@ -972,86 +1006,92 @@ rc = 0; /* Zero terminate */ - if (write(io_fd, &rc, sizeof(int)) != sizeof(int)) { + if ( write(io_fd, &rc, sizeof(int)) != sizeof(int) ) + { ERR("Error when writing to state file (6)"); goto out; } /* Send through a list of all the PFNs that were not in map at the close */ { - unsigned int i,j; - unsigned int pfntab[1024]; - - for ( i = 0, j = 0; i < nr_pfns; i++ ) - { - if ( ! is_mapped(live_pfn_to_mfn_table[i]) ) - j++; - } - - if (write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int)) { - ERR("Error when writing to state file (6a)"); - goto out; - } - - for ( i = 0, j = 0; i < nr_pfns; ) - { - if ( ! is_mapped(live_pfn_to_mfn_table[i]) ) - { - pfntab[j++] = i; - } - i++; - if ( j == 1024 || i == nr_pfns ) - { - if (write(io_fd, &pfntab, sizeof(unsigned long)*j) != - sizeof(unsigned long)*j) { - ERR("Error when writing to state file (6b)"); - goto out; - } - j = 0; - } - } + unsigned int i,j; + unsigned int pfntab[1024]; + + for ( i = 0, j = 0; i < nr_pfns; i++ ) + if ( !is_mapped(live_pfn_to_mfn_table[i]) ) + j++; + + if ( write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int) ) + { + ERR("Error when writing to state file (6a)"); + goto out; + } + + for ( i = 0, j = 0; i < nr_pfns; ) + { + if ( !is_mapped(live_pfn_to_mfn_table[i]) ) + { + pfntab[j++] = i; + } + i++; + if ( j == 1024 || i == nr_pfns ) + { + if ( write(io_fd, &pfntab, sizeof(unsigned long)*j) != + (sizeof(unsigned long) * j) ) + { + ERR("Error when writing to state file (6b)"); + goto out; + } + j = 0; + } + } } /* Canonicalise the suspend-record frame number. */ - if ( !translate_mfn_to_pfn(&ctxt.user_regs.esi) ){ + if ( !translate_mfn_to_pfn(&ctxt.user_regs.esi) ) + { ERR("Suspend record is not in range of pseudophys map"); goto out; } /* Canonicalise each GDT frame number. 
*/ - for ( i = 0; i < ctxt.gdt_ents; i += 512 ) { - if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) { + for ( i = 0; i < ctxt.gdt_ents; i += 512 ) + { + if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) + { ERR("GDT frame is not in range of pseudophys map"); goto out; } } /* Canonicalise the page table base pointer. */ - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) { + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) + { ERR("PT base is not in range of pseudophys map"); goto out; } ctxt.ctrlreg[3] = live_mfn_to_pfn_table[ctxt.ctrlreg[3] >> PAGE_SHIFT] << PAGE_SHIFT; - if (write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) || - write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE) { + if ( write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) || + write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE) + { ERR("Error when writing to state file (1)"); goto out; } out: - if(live_shinfo) + if ( live_shinfo ) munmap(live_shinfo, PAGE_SIZE); - if(live_pfn_to_mfn_frame_list) + if ( live_pfn_to_mfn_frame_list ) munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE); - if(live_pfn_to_mfn_table) + if ( live_pfn_to_mfn_table ) munmap(live_pfn_to_mfn_table, nr_pfns*4); - if(live_mfn_to_pfn_table) + if ( live_mfn_to_pfn_table ) munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024); free(pfn_type); @@ -1063,3 +1103,13 @@ DPRINTF("Save exit rc=%d\n",rc); return !!rc; } + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_load_aout9.c --- a/tools/libxc/xc_load_aout9.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_load_aout9.c Thu Sep 22 17:42:01 2005 @@ -64,11 +64,11 @@ dstart = round_pgup(start + ehdr.text); end = dstart + ehdr.data + ehdr.bss; - dsi->v_start = KZERO; - dsi->v_kernstart = start; - dsi->v_kernend = end; - dsi->v_kernentry = ehdr.entry; - dsi->v_end = end; + dsi->v_start = KZERO; + dsi->v_kernstart = start; + 
dsi->v_kernend = end; + dsi->v_kernentry = ehdr.entry; + dsi->v_end = end; /* XXX load symbols */ @@ -168,3 +168,12 @@ return ehdr; } +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_load_bin.c --- a/tools/libxc/xc_load_bin.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_load_bin.c Thu Sep 22 17:42:01 2005 @@ -109,8 +109,8 @@ unsigned long *parray, struct domain_setup_info *dsi); int probe_bin(char *image, - unsigned long image_size, - struct load_funcs *load_funcs) + unsigned long image_size, + struct load_funcs *load_funcs) { if ( NULL == findtable(image, image_size) ) { @@ -297,3 +297,13 @@ return 0; } + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_load_elf.c --- a/tools/libxc/xc_load_elf.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_load_elf.c Thu Sep 22 17:42:01 2005 @@ -30,8 +30,8 @@ struct domain_setup_info *dsi); int probe_elf(char *image, - unsigned long image_size, - struct load_funcs *load_funcs) + unsigned long image_size, + struct load_funcs *load_funcs) { Elf_Ehdr *ehdr = (Elf_Ehdr *)image; @@ -116,7 +116,7 @@ return -EINVAL; } if ( (strstr(guestinfo, "PAE=yes") != NULL) ) - dsi->pae_kernel = 1; + dsi->pae_kernel = 1; break; } @@ -313,3 +313,13 @@ return 0; } + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_misc.c --- a/tools/libxc/xc_misc.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_misc.c Thu Sep 22 17:42:01 2005 @@ -133,5 +133,15 @@ long xc_init_store(int xc_handle, int remote_port) { - return ioctl(xc_handle, IOCTL_PRIVCMD_INITDOMAIN_STORE, remote_port); + return ioctl(xc_handle, IOCTL_PRIVCMD_INITDOMAIN_STORE, 
remote_port); } + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_private.c --- a/tools/libxc/xc_private.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_private.c Thu Sep 22 17:42:01 2005 @@ -15,7 +15,7 @@ void *addr; addr = mmap(NULL, num*PAGE_SIZE, prot, MAP_SHARED, xc_handle, 0); if ( addr == MAP_FAILED ) - return NULL; + return NULL; ioctlx.num=num; ioctlx.dom=dom; @@ -24,10 +24,10 @@ if ( ioctl( xc_handle, IOCTL_PRIVCMD_MMAPBATCH, &ioctlx ) < 0 ) { int saved_errno = errno; - perror("XXXXXXXX"); - (void)munmap(addr, num*PAGE_SIZE); + perror("XXXXXXXX"); + (void)munmap(addr, num*PAGE_SIZE); errno = saved_errno; - return NULL; + return NULL; } return addr; @@ -36,15 +36,15 @@ /*******************/ void *xc_map_foreign_range(int xc_handle, u32 dom, - int size, int prot, - unsigned long mfn ) + int size, int prot, + unsigned long mfn ) { privcmd_mmap_t ioctlx; privcmd_mmap_entry_t entry; void *addr; addr = mmap(NULL, size, prot, MAP_SHARED, xc_handle, 0); if ( addr == MAP_FAILED ) - return NULL; + return NULL; ioctlx.num=1; ioctlx.dom=dom; @@ -55,9 +55,9 @@ if ( ioctl( xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx ) < 0 ) { int saved_errno = errno; - (void)munmap(addr, size); + (void)munmap(addr, size); errno = saved_errno; - return NULL; + return NULL; } return addr; } @@ -66,7 +66,7 @@ /* NB: arr must be mlock'ed */ int xc_get_pfn_type_batch(int xc_handle, - u32 dom, int num, unsigned long *arr) + u32 dom, int num, unsigned long *arr) { dom0_op_t op; op.cmd = DOM0_GETPAGEFRAMEINFO2; @@ -116,8 +116,8 @@ if ( (ret = do_xen_hypercall(xc_handle, &hypercall)) < 0 ) { - fprintf(stderr, "Dom_mmuext operation failed (rc=%ld errno=%d)-- need to" - " rebuild the user-space tool set?\n",ret,errno); + fprintf(stderr, "Dom_mmuext operation failed (rc=%ld errno=%d)-- need to" + " rebuild the user-space tool set?\n",ret,errno); } 
safe_munlock(op, nr_ops*sizeof(*op)); @@ -172,7 +172,7 @@ } int xc_add_mmu_update(int xc_handle, xc_mmu_t *mmu, - unsigned long long ptr, unsigned long long val) + unsigned long long ptr, unsigned long long val) { mmu->updates[mmu->idx].ptr = ptr; mmu->updates[mmu->idx].val = val; @@ -229,7 +229,7 @@ if ( (ret = do_xen_hypercall(xc_handle, &hypercall)) < 0 ) { - fprintf(stderr, "hypercall failed (rc=%ld errno=%d)-- need to" + fprintf(stderr, "hypercall failed (rc=%ld errno=%d)-- need to" " rebuild the user-space tool set?\n",ret,errno); } @@ -275,16 +275,16 @@ if ( ioctl( xc_handle, IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN, &mfn ) < 0 ) { - perror("xc_get_m2p_start_mfn:"); - return 0; + perror("xc_get_m2p_start_mfn:"); + return 0; } return mfn; } int xc_get_pfn_list(int xc_handle, - u32 domid, - unsigned long *pfn_buf, - unsigned long max_pfns) + u32 domid, + unsigned long *pfn_buf, + unsigned long max_pfns) { dom0_op_t op; int ret; @@ -306,16 +306,16 @@ #if 0 #ifdef DEBUG - DPRINTF(("Ret for xc_get_pfn_list is %d\n", ret)); - if (ret >= 0) { - int i, j; - for (i = 0; i < op.u.getmemlist.num_pfns; i += 16) { - fprintf(stderr, "0x%x: ", i); - for (j = 0; j < 16; j++) - fprintf(stderr, "0x%lx ", pfn_buf[i + j]); - fprintf(stderr, "\n"); - } - } + DPRINTF(("Ret for xc_get_pfn_list is %d\n", ret)); + if (ret >= 0) { + int i, j; + for (i = 0; i < op.u.getmemlist.num_pfns; i += 16) { + fprintf(stderr, "0x%x: ", i); + for (j = 0; j < 16; j++) + fprintf(stderr, "0x%lx ", pfn_buf[i + j]); + fprintf(stderr, "\n"); + } + } #endif #endif @@ -324,10 +324,10 @@ #ifdef __ia64__ int xc_ia64_get_pfn_list(int xc_handle, - u32 domid, - unsigned long *pfn_buf, - unsigned int start_page, - unsigned int nr_pages) + u32 domid, + unsigned long *pfn_buf, + unsigned int start_page, + unsigned int nr_pages) { dom0_op_t op; int ret; @@ -372,9 +372,9 @@ } int xc_copy_to_domain_page(int xc_handle, - u32 domid, - unsigned long dst_pfn, - void *src_page) + u32 domid, + unsigned long dst_pfn, + void 
*src_page) { void *vaddr = xc_map_foreign_range( xc_handle, domid, PAGE_SIZE, PROT_WRITE, dst_pfn); @@ -465,18 +465,28 @@ unsigned long new_mfn; if ( xc_domain_memory_decrease_reservation( - xc_handle, domid, 1, 0, &mfn) != 0 ) - { - fprintf(stderr,"xc_make_page_below_4G decrease failed. mfn=%lx\n",mfn); - return 0; + xc_handle, domid, 1, 0, &mfn) != 0 ) + { + fprintf(stderr,"xc_make_page_below_4G decrease failed. mfn=%lx\n",mfn); + return 0; } if ( xc_domain_memory_increase_reservation( xc_handle, domid, 1, 0, 32, &new_mfn) != 0 ) { - fprintf(stderr,"xc_make_page_below_4G increase failed. mfn=%lx\n",mfn); - return 0; + fprintf(stderr,"xc_make_page_below_4G increase failed. mfn=%lx\n",mfn); + return 0; } return new_mfn; } + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_ptrace.c --- a/tools/libxc/xc_ptrace.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_ptrace.c Thu Sep 22 17:42:01 2005 @@ -1,25 +1,15 @@ #include <sys/ptrace.h> #include <sys/wait.h> #include "xc_private.h" +#include "xg_private.h" #include <time.h> #define X86_CR0_PE 0x00000001 /* Enable Protected Mode (RW) */ #define X86_CR0_PG 0x80000000 /* Paging (RW) */ - -#define BSD_PAGE_MASK (PAGE_SIZE-1) -#define PG_FRAME (~((unsigned long)BSD_PAGE_MASK) +#define BSD_PAGE_MASK (PAGE_SIZE-1) #define PDRSHIFT 22 -#define PSL_T 0x00000100 /* trace enable bit */ - +#define PSL_T 0x00000100 /* trace enable bit */ #define VCPU 0 /* XXX */ - -/* - * long - * ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); - */ - - -int waitdomain(int domain, int *status, int options); char * ptrace_names[] = { "PTRACE_TRACEME", @@ -69,67 +59,64 @@ int xss; /* 64 */ }; -#define FETCH_REGS(cpu) \ - if (!regs_valid[cpu]) \ - { \ - int retval = xc_domain_get_vcpu_context(xc_handle, domid, cpu, &ctxt[cpu]); \ - if (retval) \ - goto error_out; \ - cr3[cpu] = 
ctxt[cpu].ctrlreg[3]; /* physical address */ \ - regs_valid[cpu] = 1; \ - } \ +#define FETCH_REGS(cpu) \ + if (!regs_valid[cpu]) \ + { \ + int retval = xc_domain_get_vcpu_context( \ + xc_handle, domid, cpu, &ctxt[cpu]); \ + if (retval) \ + goto error_out; \ + cr3[cpu] = ctxt[cpu].ctrlreg[3]; /* physical address */ \ + regs_valid[cpu] = 1; \ + } #define printval(x) printf("%s = %lx\n", #x, (long)x); -#define SET_PT_REGS(pt, xc) \ -{ \ - pt.ebx = xc.ebx; \ - pt.ecx = xc.ecx; \ - pt.edx = xc.edx; \ - pt.esi = xc.esi; \ - pt.edi = xc.edi; \ - pt.ebp = xc.ebp; \ - pt.eax = xc.eax; \ - pt.eip = xc.eip; \ - pt.xcs = xc.cs; \ - pt.eflags = xc.eflags; \ - pt.esp = xc.esp; \ - pt.xss = xc.ss; \ - pt.xes = xc.es; \ - pt.xds = xc.ds; \ - pt.xfs = xc.fs; \ - pt.xgs = xc.gs; \ -} - -#define SET_XC_REGS(pt, xc) \ -{ \ - xc.ebx = pt->ebx; \ - xc.ecx = pt->ecx; \ - xc.edx = pt->edx; \ - xc.esi = pt->esi; \ - xc.edi = pt->edi; \ - xc.ebp = pt->ebp; \ - xc.eax = pt->eax; \ - xc.eip = pt->eip; \ - xc.cs = pt->xcs; \ - xc.eflags = pt->eflags; \ - xc.esp = pt->esp; \ - xc.ss = pt->xss; \ - xc.es = pt->xes; \ - xc.ds = pt->xds; \ - xc.fs = pt->xfs; \ - xc.gs = pt->xgs; \ -} - +#define SET_PT_REGS(pt, xc) \ +{ \ + pt.ebx = xc.ebx; \ + pt.ecx = xc.ecx; \ + pt.edx = xc.edx; \ + pt.esi = xc.esi; \ + pt.edi = xc.edi; \ + pt.ebp = xc.ebp; \ + pt.eax = xc.eax; \ + pt.eip = xc.eip; \ + pt.xcs = xc.cs; \ + pt.eflags = xc.eflags; \ + pt.esp = xc.esp; \ + pt.xss = xc.ss; \ + pt.xes = xc.es; \ + pt.xds = xc.ds; \ + pt.xfs = xc.fs; \ + pt.xgs = xc.gs; \ +} + +#define SET_XC_REGS(pt, xc) \ +{ \ + xc.ebx = pt->ebx; \ + xc.ecx = pt->ecx; \ + xc.edx = pt->edx; \ + xc.esi = pt->esi; \ + xc.edi = pt->edi; \ + xc.ebp = pt->ebp; \ + xc.eax = pt->eax; \ + xc.eip = pt->eip; \ + xc.cs = pt->xcs; \ + xc.eflags = pt->eflags; \ + xc.esp = pt->esp; \ + xc.ss = pt->xss; \ + xc.es = pt->xes; \ + xc.ds = pt->xds; \ + xc.fs = pt->xfs; \ + xc.gs = pt->xgs; \ +} #define vtopdi(va) ((va) >> PDRSHIFT) #define vtopti(va) 
(((va) >> PAGE_SHIFT) & 0x3ff) /* XXX application state */ - - -static int xc_handle; -static long nr_pages = 0; -unsigned long *page_array = NULL; +static long nr_pages = 0; +unsigned long *page_array = NULL; static int regs_valid[MAX_VIRT_CPUS]; static unsigned long cr3[MAX_VIRT_CPUS]; static vcpu_guest_context_t ctxt[MAX_VIRT_CPUS]; @@ -137,14 +124,60 @@ static inline int paging_enabled(vcpu_guest_context_t *v) { unsigned long cr0 = v->ctrlreg[0]; - return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG); } /* --------------------- */ static void * -map_domain_va(unsigned long domid, int cpu, void * guest_va, int perm) +map_domain_va_pae( + int xc_handle, + unsigned long domid, + int cpu, + void *guest_va, + int perm) +{ + unsigned long l2p, l1p, p, va = (unsigned long)guest_va; + u64 *l3, *l2, *l1; + static void *v; + + FETCH_REGS(cpu); + + l3 = xc_map_foreign_range( + xc_handle, domid, PAGE_SIZE, PROT_READ, cr3[cpu] >> PAGE_SHIFT); + if ( l3 == NULL ) + goto error_out; + + l2p = l3[l3_table_offset_pae(va)] >> PAGE_SHIFT; + l2 = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, PROT_READ, l2p); + if ( l2 == NULL ) + goto error_out; + + l1p = l2[l2_table_offset_pae(va)] >> PAGE_SHIFT; + l1 = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, perm, l1p); + if ( l1 == NULL ) + goto error_out; + + p = l1[l1_table_offset_pae(va)] >> PAGE_SHIFT; + if ( v != NULL ) + munmap(v, PAGE_SIZE); + v = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, perm, p); + if ( v == NULL ) + goto error_out; + + return (void *)((unsigned long)v | (va & (PAGE_SIZE - 1))); + + error_out: + return NULL; +} + +static void * +map_domain_va( + int xc_handle, + unsigned long domid, + int cpu, + void *guest_va, + int perm) { unsigned long pde, page; unsigned long va = (unsigned long)guest_va; @@ -155,69 +188,88 @@ static unsigned long pde_phys[MAX_VIRT_CPUS]; static unsigned long *pde_virt[MAX_VIRT_CPUS]; static unsigned long page_phys[MAX_VIRT_CPUS]; - static unsigned long *page_virt[MAX_VIRT_CPUS]; - 
+ static unsigned long *page_virt[MAX_VIRT_CPUS]; static int prev_perm[MAX_VIRT_CPUS]; - - if (nr_pages != npgs) { - if (nr_pages > 0) - free(page_array); - nr_pages = npgs; - if ((page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL) { - printf("Could not allocate memory\n"); - goto error_out; - } - - if (xc_get_pfn_list(xc_handle, domid, page_array, nr_pages) != nr_pages) { - printf("Could not get the page frame list\n"); - goto error_out; - } + static enum { MODE_UNKNOWN, MODE_32, MODE_PAE } mode; + + if ( mode == MODE_UNKNOWN ) + { + xen_capabilities_info_t caps; + (void)xc_version(xc_handle, XENVER_capabilities, caps); + mode = MODE_32; + if ( strstr(caps, "_x86_32p") ) + mode = MODE_PAE; + } + + if ( mode == MODE_PAE ) + return map_domain_va_pae(xc_handle, domid, cpu, guest_va, perm); + + if ( nr_pages != npgs ) + { + if ( nr_pages > 0 ) + free(page_array); + nr_pages = npgs; + if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL ) + { + printf("Could not allocate memory\n"); + goto error_out; + } + if ( xc_get_pfn_list(xc_handle, domid, + page_array, nr_pages) != nr_pages ) + { + printf("Could not get the page frame list\n"); + goto error_out; + } } FETCH_REGS(cpu); - if (cr3[cpu] != cr3_phys[cpu]) - { - cr3_phys[cpu] = cr3[cpu]; - if (cr3_virt[cpu]) - munmap(cr3_virt[cpu], PAGE_SIZE); - if ((cr3_virt[cpu] = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, - PROT_READ, - cr3_phys[cpu] >> PAGE_SHIFT)) == NULL) - goto error_out; + if ( cr3[cpu] != cr3_phys[cpu] ) + { + cr3_phys[cpu] = cr3[cpu]; + if ( cr3_virt[cpu] ) + munmap(cr3_virt[cpu], PAGE_SIZE); + cr3_virt[cpu] = xc_map_foreign_range( + xc_handle, domid, PAGE_SIZE, PROT_READ, + cr3_phys[cpu] >> PAGE_SHIFT); + if ( cr3_virt[cpu] == NULL ) + goto error_out; + } + if ( (pde = cr3_virt[cpu][vtopdi(va)]) == 0 ) + goto error_out; + if ( (ctxt[cpu].flags & VGCF_VMX_GUEST) && paging_enabled(&ctxt[cpu]) ) + pde = page_array[pde >> PAGE_SHIFT] << PAGE_SHIFT; + if ( pde != pde_phys[cpu] 
) + { + pde_phys[cpu] = pde; + if ( pde_virt[cpu] ) + munmap(pde_virt[cpu], PAGE_SIZE); + pde_virt[cpu] = xc_map_foreign_range( + xc_handle, domid, PAGE_SIZE, PROT_READ, + pde_phys[cpu] >> PAGE_SHIFT); + if ( pde_virt[cpu] == NULL ) + goto error_out; + } + if ( (page = pde_virt[cpu][vtopti(va)]) == 0 ) + goto error_out; + if ( (ctxt[cpu].flags & VGCF_VMX_GUEST) && paging_enabled(&ctxt[cpu]) ) + page = page_array[page >> PAGE_SHIFT] << PAGE_SHIFT; + if ( (page != page_phys[cpu]) || (perm != prev_perm[cpu]) ) + { + page_phys[cpu] = page; + if ( page_virt[cpu] ) + munmap(page_virt[cpu], PAGE_SIZE); + page_virt[cpu] = xc_map_foreign_range( + xc_handle, domid, PAGE_SIZE, perm, + page_phys[cpu] >> PAGE_SHIFT); + if ( page_virt[cpu] == NULL ) + { + page_phys[cpu] = 0; + goto error_out; + } + prev_perm[cpu] = perm; } - if ((pde = cr3_virt[cpu][vtopdi(va)]) == 0) /* logical address */ - goto error_out; - if ((ctxt[cpu].flags & VGCF_VMX_GUEST) && paging_enabled(&ctxt[cpu])) - pde = page_array[pde >> PAGE_SHIFT] << PAGE_SHIFT; - if (pde != pde_phys[cpu]) - { - pde_phys[cpu] = pde; - if (pde_virt[cpu]) - munmap(pde_virt[cpu], PAGE_SIZE); - if ((pde_virt[cpu] = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, - PROT_READ, - pde_phys[cpu] >> PAGE_SHIFT)) == NULL) - goto error_out; - } - if ((page = pde_virt[cpu][vtopti(va)]) == 0) /* logical address */ - goto error_out; - if (ctxt[cpu].flags & VGCF_VMX_GUEST && paging_enabled(&ctxt[cpu])) - page = page_array[page >> PAGE_SHIFT] << PAGE_SHIFT; - if (page != page_phys[cpu] || perm != prev_perm[cpu]) - { - page_phys[cpu] = page; - if (page_virt[cpu]) - munmap(page_virt[cpu], PAGE_SIZE); - if ((page_virt[cpu] = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, - perm, - page_phys[cpu] >> PAGE_SHIFT)) == NULL) { - printf("cr3 %lx pde %lx page %lx pti %lx\n", cr3[cpu], pde, page, vtopti(va)); - page_phys[cpu] = 0; - goto error_out; - } - prev_perm[cpu] = perm; - } + return (void *)(((unsigned long)page_virt[cpu]) | (va & 
BSD_PAGE_MASK)); error_out: @@ -225,7 +277,11 @@ } int -xc_waitdomain(int domain, int *status, int options) +xc_waitdomain( + int xc_handle, + int domain, + int *status, + int options) { dom0_op_t op; int retval; @@ -233,38 +289,39 @@ ts.tv_sec = 0; ts.tv_nsec = 10*1000*1000; - if (!xc_handle) - if ((xc_handle = xc_interface_open()) < 0) - { - printf("xc_interface_open failed\n"); - return -1; - } op.cmd = DOM0_GETDOMAININFO; op.u.getdomaininfo.domain = domain; + retry: - retval = do_dom0_op(xc_handle, &op); - if (retval || op.u.getdomaininfo.domain != domain) { - printf("getdomaininfo failed\n"); - goto done; + if ( retval || (op.u.getdomaininfo.domain != domain) ) + { + printf("getdomaininfo failed\n"); + goto done; } *status = op.u.getdomaininfo.flags; - if (options & WNOHANG) - goto done; - - - if (!(op.u.getdomaininfo.flags & DOMFLAGS_PAUSED)) { - nanosleep(&ts,NULL); - goto retry; - } + if ( options & WNOHANG ) + goto done; + + if ( !(op.u.getdomaininfo.flags & DOMFLAGS_PAUSED) ) + { + nanosleep(&ts,NULL); + goto retry; + } + done: return retval; } long -xc_ptrace(enum __ptrace_request request, u32 domid, long eaddr, long edata) +xc_ptrace( + int xc_handle, + enum __ptrace_request request, + u32 domid, + long eaddr, + long edata) { dom0_op_t op; int status = 0; @@ -277,108 +334,124 @@ op.interface_version = DOM0_INTERFACE_VERSION; - if (!xc_handle) - if ((xc_handle = xc_interface_open()) < 0) - return -1; -#if 0 - printf("%20s %d, %p, %p \n", ptrace_names[request], domid, addr, data); -#endif - switch (request) { + switch ( request ) + { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - if ((guest_va = (unsigned long *)map_domain_va(domid, cpu, addr, PROT_READ)) == NULL) { - status = EFAULT; - goto error_out; - } - - retval = *guest_va; - break; + guest_va = (unsigned long *)map_domain_va( + xc_handle, domid, cpu, addr, PROT_READ); + if ( guest_va == NULL ) + { + status = EFAULT; + goto error_out; + } + retval = *guest_va; + break; + case PTRACE_POKETEXT: case 
PTRACE_POKEDATA: - if ((guest_va = (unsigned long *)map_domain_va(domid, cpu, addr, PROT_READ|PROT_WRITE)) == NULL) { - status = EFAULT; - goto error_out; - } - - *guest_va = (unsigned long)data; - break; + guest_va = (unsigned long *)map_domain_va( + xc_handle, domid, cpu, addr, PROT_READ|PROT_WRITE); + if ( guest_va == NULL ) + { + status = EFAULT; + goto error_out; + } + *guest_va = (unsigned long)data; + break; + case PTRACE_GETREGS: case PTRACE_GETFPREGS: case PTRACE_GETFPXREGS: - FETCH_REGS(cpu); - - if (request == PTRACE_GETREGS) { - SET_PT_REGS(pt, ctxt[cpu].user_regs); - memcpy(data, &pt, sizeof(struct gdb_regs)); - } else if (request == PTRACE_GETFPREGS) - memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt)); - else /*if (request == PTRACE_GETFPXREGS)*/ - memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt)); - break; + FETCH_REGS(cpu); + if ( request == PTRACE_GETREGS ) + { + SET_PT_REGS(pt, ctxt[cpu].user_regs); + memcpy(data, &pt, sizeof(struct gdb_regs)); + } + else if (request == PTRACE_GETFPREGS) + { + memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt)); + } + else /*if (request == PTRACE_GETFPXREGS)*/ + { + memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt)); + } + break; + case PTRACE_SETREGS: - op.cmd = DOM0_SETDOMAININFO; - SET_XC_REGS(((struct gdb_regs *)data), ctxt[VCPU].user_regs); - op.u.setdomaininfo.domain = domid; - /* XXX need to understand multiple vcpus */ - op.u.setdomaininfo.vcpu = cpu; - op.u.setdomaininfo.ctxt = &ctxt[cpu]; - retval = do_dom0_op(xc_handle, &op); - if (retval) - goto error_out; - - break; + op.cmd = DOM0_SETDOMAININFO; + SET_XC_REGS(((struct gdb_regs *)data), ctxt[VCPU].user_regs); + op.u.setdomaininfo.domain = domid; + /* XXX need to understand multiple vcpus */ + op.u.setdomaininfo.vcpu = cpu; + op.u.setdomaininfo.ctxt = &ctxt[cpu]; + retval = do_dom0_op(xc_handle, &op); + if (retval) + goto error_out; + break; + case PTRACE_ATTACH: - op.cmd = DOM0_GETDOMAININFO; - 
op.u.getdomaininfo.domain = domid; - retval = do_dom0_op(xc_handle, &op); - if (retval || op.u.getdomaininfo.domain != domid) { - perror("dom0 op failed"); - goto error_out; - } - if (op.u.getdomaininfo.flags & DOMFLAGS_PAUSED) { - printf("domain currently paused\n"); - goto error_out; - } - printf("domain not currently paused\n"); - op.cmd = DOM0_PAUSEDOMAIN; - op.u.pausedomain.domain = domid; - retval = do_dom0_op(xc_handle, &op); - break; + op.cmd = DOM0_GETDOMAININFO; + op.u.getdomaininfo.domain = domid; + retval = do_dom0_op(xc_handle, &op); + if ( retval || (op.u.getdomaininfo.domain != domid) ) + { + perror("dom0 op failed"); + goto error_out; + } + if ( op.u.getdomaininfo.flags & DOMFLAGS_PAUSED ) + { + printf("domain currently paused\n"); + goto error_out; + } + printf("domain not currently paused\n"); + op.cmd = DOM0_PAUSEDOMAIN; + op.u.pausedomain.domain = domid; + retval = do_dom0_op(xc_handle, &op); + break; + case PTRACE_SINGLESTEP: - ctxt[VCPU].user_regs.eflags |= PSL_T; - op.cmd = DOM0_SETDOMAININFO; - op.u.setdomaininfo.domain = domid; - op.u.setdomaininfo.vcpu = 0; - op.u.setdomaininfo.ctxt = &ctxt[cpu]; - retval = do_dom0_op(xc_handle, &op); - if (retval) { - perror("dom0 op failed"); - goto error_out; - } - /* FALLTHROUGH */ + ctxt[VCPU].user_regs.eflags |= PSL_T; + op.cmd = DOM0_SETDOMAININFO; + op.u.setdomaininfo.domain = domid; + op.u.setdomaininfo.vcpu = 0; + op.u.setdomaininfo.ctxt = &ctxt[cpu]; + retval = do_dom0_op(xc_handle, &op); + if ( retval ) + { + perror("dom0 op failed"); + goto error_out; + } + /* FALLTHROUGH */ + case PTRACE_CONT: case PTRACE_DETACH: - if (request != PTRACE_SINGLESTEP) { - FETCH_REGS(cpu); - /* Clear trace flag */ - if (ctxt[cpu].user_regs.eflags & PSL_T) { - ctxt[cpu].user_regs.eflags &= ~PSL_T; - op.cmd = DOM0_SETDOMAININFO; - op.u.setdomaininfo.domain = domid; - op.u.setdomaininfo.vcpu = cpu; - op.u.setdomaininfo.ctxt = &ctxt[cpu]; - retval = do_dom0_op(xc_handle, &op); - if (retval) { - perror("dom0 op 
failed"); - goto error_out; - } - } - } - regs_valid[cpu] = 0; - op.cmd = DOM0_UNPAUSEDOMAIN; - op.u.unpausedomain.domain = domid > 0 ? domid : -domid; - retval = do_dom0_op(xc_handle, &op); - break; + if ( request != PTRACE_SINGLESTEP ) + { + FETCH_REGS(cpu); + /* Clear trace flag */ + if ( ctxt[cpu].user_regs.eflags & PSL_T ) + { + ctxt[cpu].user_regs.eflags &= ~PSL_T; + op.cmd = DOM0_SETDOMAININFO; + op.u.setdomaininfo.domain = domid; + op.u.setdomaininfo.vcpu = cpu; + op.u.setdomaininfo.ctxt = &ctxt[cpu]; + retval = do_dom0_op(xc_handle, &op); + if ( retval ) + { + perror("dom0 op failed"); + goto error_out; + } + } + } + regs_valid[cpu] = 0; + op.cmd = DOM0_UNPAUSEDOMAIN; + op.u.unpausedomain.domain = domid > 0 ? domid : -domid; + retval = do_dom0_op(xc_handle, &op); + break; + case PTRACE_SETFPREGS: case PTRACE_SETFPXREGS: case PTRACE_PEEKUSER: @@ -386,20 +459,33 @@ case PTRACE_SYSCALL: case PTRACE_KILL: #ifdef DEBUG - printf("unsupported xc_ptrace request %s\n", ptrace_names[request]); + printf("unsupported xc_ptrace request %s\n", ptrace_names[request]); #endif - /* XXX not yet supported */ - status = ENOSYS; - break; + /* XXX not yet supported */ + status = ENOSYS; + break; + case PTRACE_TRACEME: - printf("PTRACE_TRACEME is an invalid request under Xen\n"); - status = EINVAL; + printf("PTRACE_TRACEME is an invalid request under Xen\n"); + status = EINVAL; } - if (status) { - errno = status; - retval = -1; - } + if ( status ) + { + errno = status; + retval = -1; + } + error_out: return retval; } + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_ptrace_core.c --- a/tools/libxc/xc_ptrace_core.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_ptrace_core.c Thu Sep 22 17:42:01 2005 @@ -3,19 +3,14 @@ #include "xc_private.h" #include <time.h> - -#define BSD_PAGE_MASK (PAGE_SIZE-1) -#define PG_FRAME (~((unsigned 
long)BSD_PAGE_MASK) +#define BSD_PAGE_MASK (PAGE_SIZE-1) #define PDRSHIFT 22 -#define PSL_T 0x00000100 /* trace enable bit */ - #define VCPU 0 /* XXX */ /* * long * ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); */ - struct gdb_regs { long ebx; /* 0 */ @@ -38,44 +33,44 @@ }; #define printval(x) printf("%s = %lx\n", #x, (long)x); -#define SET_PT_REGS(pt, xc) \ -{ \ - pt.ebx = xc.ebx; \ - pt.ecx = xc.ecx; \ - pt.edx = xc.edx; \ - pt.esi = xc.esi; \ - pt.edi = xc.edi; \ - pt.ebp = xc.ebp; \ - pt.eax = xc.eax; \ - pt.eip = xc.eip; \ - pt.xcs = xc.cs; \ - pt.eflags = xc.eflags; \ - pt.esp = xc.esp; \ - pt.xss = xc.ss; \ - pt.xes = xc.es; \ - pt.xds = xc.ds; \ - pt.xfs = xc.fs; \ - pt.xgs = xc.gs; \ -} - -#define SET_XC_REGS(pt, xc) \ -{ \ - xc.ebx = pt->ebx; \ - xc.ecx = pt->ecx; \ - xc.edx = pt->edx; \ - xc.esi = pt->esi; \ - xc.edi = pt->edi; \ - xc.ebp = pt->ebp; \ - xc.eax = pt->eax; \ - xc.eip = pt->eip; \ - xc.cs = pt->xcs; \ - xc.eflags = pt->eflags; \ - xc.esp = pt->esp; \ - xc.ss = pt->xss; \ - xc.es = pt->xes; \ - xc.ds = pt->xds; \ - xc.fs = pt->xfs; \ - xc.gs = pt->xgs; \ +#define SET_PT_REGS(pt, xc) \ +{ \ + pt.ebx = xc.ebx; \ + pt.ecx = xc.ecx; \ + pt.edx = xc.edx; \ + pt.esi = xc.esi; \ + pt.edi = xc.edi; \ + pt.ebp = xc.ebp; \ + pt.eax = xc.eax; \ + pt.eip = xc.eip; \ + pt.xcs = xc.cs; \ + pt.eflags = xc.eflags; \ + pt.esp = xc.esp; \ + pt.xss = xc.ss; \ + pt.xes = xc.es; \ + pt.xds = xc.ds; \ + pt.xfs = xc.fs; \ + pt.xgs = xc.gs; \ +} + +#define SET_XC_REGS(pt, xc) \ +{ \ + xc.ebx = pt->ebx; \ + xc.ecx = pt->ecx; \ + xc.edx = pt->edx; \ + xc.esi = pt->esi; \ + xc.edi = pt->edi; \ + xc.ebp = pt->ebp; \ + xc.eax = pt->eax; \ + xc.eip = pt->eip; \ + xc.cs = pt->xcs; \ + xc.eflags = pt->eflags; \ + xc.esp = pt->esp; \ + xc.ss = pt->xss; \ + xc.es = pt->xes; \ + xc.ds = pt->xds; \ + xc.fs = pt->xfs; \ + xc.gs = pt->xgs; \ } @@ -84,10 +79,9 @@ /* XXX application state */ - -static long nr_pages = 0; -static unsigned long *p2m_array 
= NULL; -static unsigned long *m2p_array = NULL; +static long nr_pages = 0; +static unsigned long *p2m_array = NULL; +static unsigned long *m2p_array = NULL; static unsigned long pages_offset; static unsigned long cr3[MAX_VIRT_CPUS]; static vcpu_guest_context_t ctxt[MAX_VIRT_CPUS]; @@ -117,54 +111,54 @@ if (cr3[cpu] != cr3_phys[cpu]) { - cr3_phys[cpu] = cr3[cpu]; - if (cr3_virt[cpu]) - munmap(cr3_virt[cpu], PAGE_SIZE); - v = mmap( + cr3_phys[cpu] = cr3[cpu]; + if (cr3_virt[cpu]) + munmap(cr3_virt[cpu], PAGE_SIZE); + v = mmap( NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE, domfd, map_mtop_offset(cr3_phys[cpu])); if (v == MAP_FAILED) - { - perror("mmap failed"); - goto error_out; - } + { + perror("mmap failed"); + goto error_out; + } cr3_virt[cpu] = v; } if ((pde = cr3_virt[cpu][vtopdi(va)]) == 0) /* logical address */ - goto error_out; + goto error_out; if (ctxt[cpu].flags & VGCF_VMX_GUEST) - pde = p2m_array[pde >> PAGE_SHIFT] << PAGE_SHIFT; + pde = p2m_array[pde >> PAGE_SHIFT] << PAGE_SHIFT; if (pde != pde_phys[cpu]) { - pde_phys[cpu] = pde; - if (pde_virt[cpu]) - munmap(pde_virt[cpu], PAGE_SIZE); - v = mmap( + pde_phys[cpu] = pde; + if (pde_virt[cpu]) + munmap(pde_virt[cpu], PAGE_SIZE); + v = mmap( NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE, domfd, map_mtop_offset(pde_phys[cpu])); if (v == MAP_FAILED) - goto error_out; + goto error_out; pde_virt[cpu] = v; } if ((page = pde_virt[cpu][vtopti(va)]) == 0) /* logical address */ - goto error_out; + goto error_out; if (ctxt[cpu].flags & VGCF_VMX_GUEST) - page = p2m_array[page >> PAGE_SHIFT] << PAGE_SHIFT; + page = p2m_array[page >> PAGE_SHIFT] << PAGE_SHIFT; if (page != page_phys[cpu]) { - page_phys[cpu] = page; - if (page_virt[cpu]) - munmap(page_virt[cpu], PAGE_SIZE); - v = mmap( + page_phys[cpu] = page; + if (page_virt[cpu]) + munmap(page_virt[cpu], PAGE_SIZE); + v = mmap( NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE, domfd, map_mtop_offset(page_phys[cpu])); if (v == MAP_FAILED) { - printf("cr3 %lx pde %lx page %lx pti %lx\n", 
cr3[cpu], pde, page, vtopti(va)); - page_phys[cpu] = 0; - goto error_out; - } + printf("cr3 %lx pde %lx page %lx pti %lx\n", cr3[cpu], pde, page, vtopti(va)); + page_phys[cpu] = 0; + goto error_out; + } page_virt[cpu] = v; - } + } return (void *)(((unsigned long)page_virt[cpu]) | (va & BSD_PAGE_MASK)); error_out: @@ -172,7 +166,11 @@ } int -xc_waitdomain_core(int domfd, int *status, int options) +xc_waitdomain_core( + int xc_handle, + int domfd, + int *status, + int options) { int retval = -1; int nr_vcpus; @@ -181,37 +179,37 @@ if (nr_pages == 0) { - if (read(domfd, &header, sizeof(header)) != sizeof(header)) - return -1; - - nr_pages = header.xch_nr_pages; - nr_vcpus = header.xch_nr_vcpus; - pages_offset = header.xch_pages_offset; - - if (read(domfd, ctxt, sizeof(vcpu_guest_context_t)*nr_vcpus) != - sizeof(vcpu_guest_context_t)*nr_vcpus) - return -1; - - for (i = 0; i < nr_vcpus; i++) { - cr3[i] = ctxt[i].ctrlreg[3]; - } - if ((p2m_array = malloc(nr_pages * sizeof(unsigned long))) == NULL) { - printf("Could not allocate p2m_array\n"); - goto error_out; - } - if (read(domfd, p2m_array, sizeof(unsigned long)*nr_pages) != - sizeof(unsigned long)*nr_pages) - return -1; - - if ((m2p_array = malloc((1<<20) * sizeof(unsigned long))) == NULL) { - printf("Could not allocate m2p array\n"); - goto error_out; - } - bzero(m2p_array, sizeof(unsigned long)* 1 << 20); - - for (i = 0; i < nr_pages; i++) { - m2p_array[p2m_array[i]] = i; - } + if (read(domfd, &header, sizeof(header)) != sizeof(header)) + return -1; + + nr_pages = header.xch_nr_pages; + nr_vcpus = header.xch_nr_vcpus; + pages_offset = header.xch_pages_offset; + + if (read(domfd, ctxt, sizeof(vcpu_guest_context_t)*nr_vcpus) != + sizeof(vcpu_guest_context_t)*nr_vcpus) + return -1; + + for (i = 0; i < nr_vcpus; i++) { + cr3[i] = ctxt[i].ctrlreg[3]; + } + if ((p2m_array = malloc(nr_pages * sizeof(unsigned long))) == NULL) { + printf("Could not allocate p2m_array\n"); + goto error_out; + } + if (read(domfd, p2m_array, 
sizeof(unsigned long)*nr_pages) != + sizeof(unsigned long)*nr_pages) + return -1; + + if ((m2p_array = malloc((1<<20) * sizeof(unsigned long))) == NULL) { + printf("Could not allocate m2p array\n"); + goto error_out; + } + bzero(m2p_array, sizeof(unsigned long)* 1 << 20); + + for (i = 0; i < nr_pages; i++) { + m2p_array[p2m_array[i]] = i; + } } retval = 0; @@ -221,7 +219,12 @@ } long -xc_ptrace_core(enum __ptrace_request request, u32 domfd, long eaddr, long edata) +xc_ptrace_core( + int xc_handle, + enum __ptrace_request request, + u32 domfd, + long eaddr, + long edata) { int status = 0; struct gdb_regs pt; @@ -234,38 +237,38 @@ #if 0 printf("%20s %d, %p, %p \n", ptrace_names[request], domid, addr, data); #endif - switch (request) { + switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - if ((guest_va = (unsigned long *)map_domain_va(domfd, cpu, addr)) == NULL) { - status = EFAULT; - goto error_out; - } - - retval = *guest_va; - break; + if ((guest_va = (unsigned long *)map_domain_va(domfd, cpu, addr)) == NULL) { + status = EFAULT; + goto error_out; + } + + retval = *guest_va; + break; case PTRACE_POKETEXT: case PTRACE_POKEDATA: - if ((guest_va = (unsigned long *)map_domain_va(domfd, cpu, addr)) == NULL) { - status = EFAULT; - goto error_out; - } - *guest_va = (unsigned long)data; - break; + if ((guest_va = (unsigned long *)map_domain_va(domfd, cpu, addr)) == NULL) { + status = EFAULT; + goto error_out; + } + *guest_va = (unsigned long)data; + break; case PTRACE_GETREGS: case PTRACE_GETFPREGS: case PTRACE_GETFPXREGS: - if (request == PTRACE_GETREGS) { - SET_PT_REGS(pt, ctxt[cpu].user_regs); - memcpy(data, &pt, sizeof(struct gdb_regs)); - } else if (request == PTRACE_GETFPREGS) - memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt)); - else /*if (request == PTRACE_GETFPXREGS)*/ - memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt)); - break; + if (request == PTRACE_GETREGS) { + SET_PT_REGS(pt, ctxt[cpu].user_regs); + memcpy(data, &pt, 
sizeof(struct gdb_regs)); + } else if (request == PTRACE_GETFPREGS) + memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt)); + else /*if (request == PTRACE_GETFPXREGS)*/ + memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt)); + break; case PTRACE_ATTACH: - retval = 0; - break; + retval = 0; + break; case PTRACE_SETREGS: case PTRACE_SINGLESTEP: case PTRACE_CONT: @@ -277,19 +280,29 @@ case PTRACE_SYSCALL: case PTRACE_KILL: #ifdef DEBUG - printf("unsupported xc_ptrace request %s\n", ptrace_names[request]); + printf("unsupported xc_ptrace request %s\n", ptrace_names[request]); #endif - status = ENOSYS; - break; + status = ENOSYS; + break; case PTRACE_TRACEME: - printf("PTRACE_TRACEME is an invalid request under Xen\n"); - status = EINVAL; + printf("PTRACE_TRACEME is an invalid request under Xen\n"); + status = EINVAL; } if (status) { - errno = status; - retval = -1; + errno = status; + retval = -1; } error_out: return retval; } + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xc_vmx_build.c --- a/tools/libxc/xc_vmx_build.c Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xc_vmx_build.c Thu Sep 22 17:42:01 2005 @@ -107,11 +107,38 @@ mem_mapp->nr_map = nr_map; } +/* + * Use E820 reserved memory 0x9F800 to pass number of vcpus to vmxloader + * vmxloader will use it to config ACPI MADT table + */ +#define VCPU_MAGIC 0x76637075 /* "vcpu" */ +static int +set_nr_vcpus(int xc_handle, u32 dom, unsigned long *pfn_list, + struct domain_setup_info *dsi, unsigned long vcpus) +{ + char *va_map; + unsigned long *va_vcpus; + + va_map = xc_map_foreign_range( + xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, + pfn_list[(0x9F000 - dsi->v_start) >> PAGE_SHIFT]); + if ( va_map == NULL ) + return -1; + + va_vcpus = (unsigned long *)(va_map + 0x800); + *va_vcpus++ = VCPU_MAGIC; + *va_vcpus++ = vcpus; + + munmap(va_map, PAGE_SIZE); + + return 
0; +} + #ifdef __i386__ static int zap_mmio_range(int xc_handle, u32 dom, - l2_pgentry_32_t *vl2tab, - unsigned long mmio_range_start, - unsigned long mmio_range_size) + l2_pgentry_32_t *vl2tab, + unsigned long mmio_range_start, + unsigned long mmio_range_size) { unsigned long mmio_addr; unsigned long mmio_range_end = mmio_range_start + mmio_range_size; @@ -123,12 +150,14 @@ vl2e = vl2tab[l2_table_offset(mmio_addr)]; if (vl2e == 0) continue; - vl1tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, - PROT_READ|PROT_WRITE, vl2e >> PAGE_SHIFT); - if (vl1tab == 0) { - PERROR("Failed zap MMIO range"); - return -1; - } + vl1tab = xc_map_foreign_range( + xc_handle, dom, PAGE_SIZE, + PROT_READ|PROT_WRITE, vl2e >> PAGE_SHIFT); + if ( vl1tab == 0 ) + { + PERROR("Failed zap MMIO range"); + return -1; + } vl1tab[l1_table_offset(mmio_addr)] = 0; munmap(vl1tab, PAGE_SIZE); } @@ -136,114 +165,118 @@ } static int zap_mmio_ranges(int xc_handle, u32 dom, - unsigned long l2tab, - struct mem_map *mem_mapp) + unsigned long l2tab, + struct mem_map *mem_mapp) { int i; l2_pgentry_32_t *vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, - PROT_READ|PROT_WRITE, - l2tab >> PAGE_SHIFT); - if (vl2tab == 0) - return -1; - for (i = 0; i < mem_mapp->nr_map; i++) { - if ((mem_mapp->map[i].type == E820_IO) - && (mem_mapp->map[i].caching_attr == MEMMAP_UC)) - if (zap_mmio_range(xc_handle, dom, vl2tab, - mem_mapp->map[i].addr, mem_mapp->map[i].size) == -1) - return -1; - } + PROT_READ|PROT_WRITE, + l2tab >> PAGE_SHIFT); + if ( vl2tab == 0 ) + return -1; + + for ( i = 0; i < mem_mapp->nr_map; i++ ) + { + if ( (mem_mapp->map[i].type == E820_IO) && + (mem_mapp->map[i].caching_attr == MEMMAP_UC) && + (zap_mmio_range(xc_handle, dom, vl2tab, + mem_mapp->map[i].addr, + mem_mapp->map[i].size) == -1) ) + return -1; + } + munmap(vl2tab, PAGE_SIZE); return 0; } #else static int zap_mmio_range(int xc_handle, u32 dom, - l3_pgentry_t *vl3tab, - unsigned long mmio_range_start, - unsigned long 
mmio_range_size) -{ - unsigned long mmio_addr; - unsigned long mmio_range_end = mmio_range_start + mmio_range_size; - unsigned long vl2e = 0; - unsigned long vl3e; - l1_pgentry_t *vl1tab; - l2_pgentry_t *vl2tab; + l3_pgentry_t *vl3tab, + unsigned long mmio_range_start, + unsigned long mmio_range_size) +{ + unsigned long mmio_addr; + unsigned long mmio_range_end = mmio_range_start + mmio_range_size; + unsigned long vl2e = 0; + unsigned long vl3e; + l1_pgentry_t *vl1tab; + l2_pgentry_t *vl2tab; - mmio_addr = mmio_range_start & PAGE_MASK; - for ( ; mmio_addr < mmio_range_end; mmio_addr += PAGE_SIZE ) - { - vl3e = vl3tab[l3_table_offset(mmio_addr)]; - if ( vl3e == 0 ) - continue; - - vl2tab = xc_map_foreign_range( - xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, vl3e>>PAGE_SHIFT); - if ( vl2tab == NULL ) - { - PERROR("Failed zap MMIO range"); - return -1; - } - - vl2e = vl2tab[l2_table_offset(mmio_addr)]; - if ( vl2e == 0 ) - { - munmap(vl2tab, PAGE_SIZE); - continue; - } - - vl1tab = xc_map_foreign_range( - xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, vl2e>>PAGE_SHIFT); - if ( vl1tab == NULL ) - { - PERROR("Failed zap MMIO range"); - munmap(vl2tab, PAGE_SIZE); - return -1; - } - - vl1tab[l1_table_offset(mmio_addr)] = 0; - munmap(vl2tab, PAGE_SIZE); - munmap(vl1tab, PAGE_SIZE); - } - return 0; + mmio_addr = mmio_range_start & PAGE_MASK; + for ( ; mmio_addr < mmio_range_end; mmio_addr += PAGE_SIZE ) + { + vl3e = vl3tab[l3_table_offset(mmio_addr)]; + if ( vl3e == 0 ) + continue; + + vl2tab = xc_map_foreign_range( + xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, vl3e>>PAGE_SHIFT); + if ( vl2tab == NULL ) + { + PERROR("Failed zap MMIO range"); + return -1; + } + + vl2e = vl2tab[l2_table_offset(mmio_addr)]; + if ( vl2e == 0 ) + { + munmap(vl2tab, PAGE_SIZE); + continue; + } + + vl1tab = xc_map_foreign_range( + xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, vl2e>>PAGE_SHIFT); + if ( vl1tab == NULL ) + { + PERROR("Failed zap MMIO range"); + munmap(vl2tab, 
PAGE_SIZE); + return -1; + } + + vl1tab[l1_table_offset(mmio_addr)] = 0; + munmap(vl2tab, PAGE_SIZE); + munmap(vl1tab, PAGE_SIZE); + } + return 0; } static int zap_mmio_ranges(int xc_handle, u32 dom, unsigned long l3tab, struct mem_map *mem_mapp) { - int i; - l3_pgentry_t *vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, - PROT_READ|PROT_WRITE, - l3tab >> PAGE_SHIFT); - if (vl3tab == 0) - return -1; - for (i = 0; i < mem_mapp->nr_map; i++) { - if ((mem_mapp->map[i].type == E820_IO) - && (mem_mapp->map[i].caching_attr == MEMMAP_UC)) - if (zap_mmio_range(xc_handle, dom, vl3tab, - mem_mapp->map[i].addr, mem_mapp->map[i].size) == -1) - return -1; - } - munmap(vl3tab, PAGE_SIZE); - return 0; + int i; + l3_pgentry_t *vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, + PROT_READ|PROT_WRITE, + l3tab >> PAGE_SHIFT); + if (vl3tab == 0) + return -1; + for (i = 0; i < mem_mapp->nr_map; i++) { + if ((mem_mapp->map[i].type == E820_IO) + && (mem_mapp->map[i].caching_attr == MEMMAP_UC)) + if (zap_mmio_range(xc_handle, dom, vl3tab, + mem_mapp->map[i].addr, mem_mapp->map[i].size) == -1) + return -1; + } + munmap(vl3tab, PAGE_SIZE); + return 0; } #endif static int setup_guest(int xc_handle, - u32 dom, int memsize, - char *image, unsigned long image_size, - gzFile initrd_gfd, unsigned long initrd_len, - unsigned long nr_pages, - vcpu_guest_context_t *ctxt, - const char *cmdline, - unsigned long shared_info_frame, - unsigned int control_evtchn, - unsigned long flags, - unsigned int vcpus, - unsigned int store_evtchn, - unsigned long *store_mfn, - struct mem_map *mem_mapp - ) + u32 dom, int memsize, + char *image, unsigned long image_size, + gzFile initrd_gfd, unsigned long initrd_len, + unsigned long nr_pages, + vcpu_guest_context_t *ctxt, + const char *cmdline, + unsigned long shared_info_frame, + unsigned int control_evtchn, + unsigned long flags, + unsigned int vcpus, + unsigned int store_evtchn, + unsigned long *store_mfn, + struct mem_map *mem_mapp + ) { 
l1_pgentry_t *vl1tab=NULL, *vl1e=NULL; l2_pgentry_t *vl2tab=NULL, *vl2e=NULL; @@ -303,7 +336,8 @@ /* memsize is in megabytes */ v_end = memsize << 20; - vinitrd_end = v_end - PAGE_SIZE; /* leaving the top 4k untouched for IO requests page use */ + /* leaving the top 4k untouched for IO requests page use */ + vinitrd_end = v_end - PAGE_SIZE; vinitrd_start = vinitrd_end - initrd_len; vinitrd_start = vinitrd_start & (~(PAGE_SIZE - 1)); @@ -369,16 +403,28 @@ goto error_out; } xc_copy_to_domain_page(xc_handle, dom, - page_array[i>>PAGE_SHIFT], page); + page_array[i>>PAGE_SHIFT], page); } } if ( (mmu = xc_init_mmu_updates(xc_handle, dom)) == NULL ) goto error_out; + /* First allocate page for page dir or pdpt */ + ppt_alloc = (vpt_start - dsi.v_start) >> PAGE_SHIFT; + if ( page_array[ppt_alloc] > 0xfffff ) + { + unsigned long nmfn; + nmfn = xc_make_page_below_4G( xc_handle, dom, page_array[ppt_alloc] ); + if ( nmfn == 0 ) + { + fprintf(stderr, "Couldn't get a page below 4GB :-(\n"); + goto error_out; + } + page_array[ppt_alloc] = nmfn; + } + #ifdef __i386__ - /* First allocate page for page dir. 
*/ - ppt_alloc = (vpt_start - dsi.v_start) >> PAGE_SHIFT; l2tab = page_array[ppt_alloc++] << PAGE_SHIFT; ctxt->ctrlreg[3] = l2tab; @@ -414,8 +460,6 @@ munmap(vl1tab, PAGE_SIZE); munmap(vl2tab, PAGE_SIZE); #else - /* First allocate pdpt */ - ppt_alloc = (vpt_start - dsi.v_start) >> PAGE_SHIFT; /* here l3tab means pdpt, only 4 entry is used */ l3tab = page_array[ppt_alloc++] << PAGE_SHIFT; ctxt->ctrlreg[3] = l3tab; @@ -438,8 +482,8 @@ munmap(vl2tab, PAGE_SIZE); if ( (vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, - PROT_READ|PROT_WRITE, - l2tab >> PAGE_SHIFT)) == NULL ) + PROT_READ|PROT_WRITE, + l2tab >> PAGE_SHIFT)) == NULL ) goto error_out; memset(vl2tab, 0, PAGE_SIZE); @@ -452,8 +496,8 @@ if ( vl1tab != NULL ) munmap(vl1tab, PAGE_SIZE); if ( (vl1tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, - PROT_READ|PROT_WRITE, - l1tab >> PAGE_SHIFT)) == NULL ) + PROT_READ|PROT_WRITE, + l1tab >> PAGE_SHIFT)) == NULL ) { munmap(vl2tab, PAGE_SIZE); goto error_out; @@ -475,15 +519,16 @@ for ( count = 0; count < nr_pages; count++ ) { if ( xc_add_mmu_update(xc_handle, mmu, - (page_array[count] << PAGE_SHIFT) | - MMU_MACHPHYS_UPDATE, count) ) - goto error_out; - } - + (page_array[count] << PAGE_SHIFT) | + MMU_MACHPHYS_UPDATE, count) ) + goto error_out; + } + + set_nr_vcpus(xc_handle, dom, page_array, &dsi, vcpus); if ((boot_paramsp = xc_map_foreign_range( - xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, - page_array[(vboot_params_start-dsi.v_start)>>PAGE_SHIFT])) == 0) + xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, + page_array[(vboot_params_start-dsi.v_start)>>PAGE_SHIFT])) == 0) goto error_out; memset(boot_paramsp, 0, sizeof(*boot_paramsp)); @@ -548,9 +593,9 @@ #if defined (__i386__) if (zap_mmio_ranges(xc_handle, dom, l2tab, mem_mapp) == -1) #else - if (zap_mmio_ranges(xc_handle, dom, l3tab, mem_mapp) == -1) + if (zap_mmio_ranges(xc_handle, dom, l3tab, mem_mapp) == -1) #endif - goto error_out; + goto error_out; boot_paramsp->e820_map_nr = mem_mapp->nr_map; 
for (i=0; i<mem_mapp->nr_map; i++) { boot_paramsp->e820_map[i].addr = mem_mapp->map[i].addr; @@ -562,9 +607,9 @@ munmap(boot_paramsp, PAGE_SIZE); if ((boot_gdtp = xc_map_foreign_range( - xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, - page_array[(vboot_gdt_start-dsi.v_start)>>PAGE_SHIFT])) == 0) - goto error_out; + xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, + page_array[(vboot_gdt_start-dsi.v_start)>>PAGE_SHIFT])) == 0) + goto error_out; memset(boot_gdtp, 0, PAGE_SIZE); boot_gdtp[12*4 + 0] = boot_gdtp[13*4 + 0] = 0xffff; /* limit */ boot_gdtp[12*4 + 1] = boot_gdtp[13*4 + 1] = 0x0000; /* base */ @@ -574,20 +619,24 @@ /* shared_info page starts its life empty. */ if ((shared_info = xc_map_foreign_range( - xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, - shared_info_frame)) == 0) - goto error_out; + xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, + shared_info_frame)) == 0) + goto error_out; memset(shared_info, 0, sizeof(shared_info_t)); /* Mask all upcalls... */ for ( i = 0; i < MAX_VIRT_CPUS; i++ ) shared_info->vcpu_data[i].evtchn_upcall_mask = 1; + + shared_info->n_vcpu = vcpus; + printf(" VCPUS: %d\n", shared_info->n_vcpu); + munmap(shared_info, PAGE_SIZE); /* Populate the event channel port in the shared page */ if ((sp = (shared_iopage_t *) xc_map_foreign_range( - xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, - page_array[shared_page_frame])) == 0) - goto error_out; + xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, + page_array[shared_page_frame])) == 0) + goto error_out; memset(sp, 0, PAGE_SIZE); sp->sp_global.eport = control_evtchn; munmap(sp, PAGE_SIZE); @@ -612,7 +661,7 @@ ctxt->user_regs.edx = vboot_gdt_start; ctxt->user_regs.eax = 0x800; ctxt->user_regs.esp = vboot_gdt_end; - ctxt->user_regs.ebx = 0; /* startup_32 expects this to be 0 to signal boot cpu */ + ctxt->user_regs.ebx = 0; /* startup_32 expects this to be 0 to signal boot cpu */ ctxt->user_regs.ecx = mem_mapp->nr_map; ctxt->user_regs.esi = vboot_params_start; ctxt->user_regs.edi = 
vboot_params_start + 0x2d0; @@ -636,9 +685,9 @@ #ifdef __i386__ __asm__ __volatile__ ("pushl %%ebx; cpuid; popl %%ebx" - : "=a" (eax), "=c" (ecx) - : "0" (1) - : "dx"); + : "=a" (eax), "=c" (ecx) + : "0" (1) + : "dx"); #elif defined __x86_64__ __asm__ __volatile__ ("pushq %%rbx; cpuid; popq %%rbx" : "=a" (eax), "=c" (ecx) @@ -653,17 +702,17 @@ } int xc_vmx_build(int xc_handle, - u32 domid, - int memsize, - const char *image_name, - struct mem_map *mem_mapp, - const char *ramdisk_name, - const char *cmdline, - unsigned int control_evtchn, - unsigned long flags, - unsigned int vcpus, - unsigned int store_evtchn, - unsigned long *store_mfn) + u32 domid, + int memsize, + const char *image_name, + struct mem_map *mem_mapp, + const char *ramdisk_name, + const char *cmdline, + unsigned int control_evtchn, + unsigned long flags, + unsigned int vcpus, + unsigned int store_evtchn, + unsigned long *store_mfn) { dom0_op_t launch_op, op; int initrd_fd = -1; @@ -735,11 +784,11 @@ } if ( setup_guest(xc_handle, domid, memsize, image, image_size, - initrd_gfd, initrd_size, nr_pages, - ctxt, cmdline, - op.u.getdomaininfo.shared_info_frame, - control_evtchn, flags, vcpus, store_evtchn, store_mfn, - mem_mapp) < 0 ) + initrd_gfd, initrd_size, nr_pages, + ctxt, cmdline, + op.u.getdomaininfo.shared_info_frame, + control_evtchn, flags, vcpus, store_evtchn, store_mfn, + mem_mapp) < 0 ) { ERROR("Error constructing guest OS"); goto error_out; @@ -770,8 +819,8 @@ /* Ring 1 stack is the initial stack. */ /* - ctxt->kernel_ss = FLAT_KERNEL_DS; - ctxt->kernel_sp = vstartinfo_start; + ctxt->kernel_ss = FLAT_KERNEL_DS; + ctxt->kernel_sp = vstartinfo_start; */ /* No debugging. 
*/ memset(ctxt->debugreg, 0, sizeof(ctxt->debugreg)); @@ -851,7 +900,7 @@ return -EINVAL; } shdr = (Elf32_Shdr *)(elfbase + ehdr->e_shoff + - (ehdr->e_shstrndx*ehdr->e_shentsize)); + (ehdr->e_shstrndx*ehdr->e_shentsize)); shstrtab = elfbase + shdr->sh_offset; for ( h = 0; h < ehdr->e_phnum; h++ ) @@ -906,9 +955,9 @@ { pa = (phdr->p_paddr + done) - dsi->v_start - LINUX_PAGE_OFFSET; if ((va = xc_map_foreign_range( - xch, dom, PAGE_SIZE, PROT_WRITE, - parray[pa>>PAGE_SHIFT])) == 0) - return -1; + xch, dom, PAGE_SIZE, PROT_WRITE, + parray[pa>>PAGE_SHIFT])) == 0) + return -1; chunksz = phdr->p_filesz - done; if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) ) chunksz = PAGE_SIZE - (pa & (PAGE_SIZE-1)); @@ -921,9 +970,9 @@ { pa = (phdr->p_paddr + done) - dsi->v_start - LINUX_PAGE_OFFSET; if ((va = xc_map_foreign_range( - xch, dom, PAGE_SIZE, PROT_WRITE, - parray[pa>>PAGE_SHIFT])) == 0) - return -1; + xch, dom, PAGE_SIZE, PROT_WRITE, + parray[pa>>PAGE_SHIFT])) == 0) + return -1; chunksz = phdr->p_memsz - done; if ( chunksz > (PAGE_SIZE - (pa & (PAGE_SIZE-1))) ) chunksz = PAGE_SIZE - (pa & (PAGE_SIZE-1)); @@ -934,3 +983,13 @@ return 0; } + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Thu Sep 22 17:34:14 2005 +++ b/tools/libxc/xenctrl.h Thu Sep 22 17:42:01 2005 @@ -101,23 +101,31 @@ } xc_core_header_t; -long xc_ptrace(enum __ptrace_request request, - u32 domid, - long addr, - long data); - -long xc_ptrace_core(enum __ptrace_request request, - u32 domid, - long addr, - long data); - -int xc_waitdomain(int domain, - int *status, - int options); - -int xc_waitdomain_core(int domain, - int *status, - int options); +long xc_ptrace( + int xc_handle, + enum __ptrace_request request, + u32 domid, + long addr, + long data); + +long xc_ptrace_core( + int xc_handle, + enum __ptrace_request request, + u32 
domid, + long addr, + long data); + +int xc_waitdomain( + int xc_handle, + int domain, + int *status, + int options); + +int xc_waitdomain_core( + int xc_handle, + int domain, + int *status, + int options); /* * DOMAIN MANAGEMENT FUNCTIONS diff -r 97dbd9524a7e -r 06d84bf87159 tools/misc/xend --- a/tools/misc/xend Thu Sep 22 17:34:14 2005 +++ b/tools/misc/xend Thu Sep 22 17:42:01 2005 @@ -86,9 +86,6 @@ daemon = SrvDaemon.instance() if not sys.argv[1:]: print 'usage: %s {start|stop|restart}' % sys.argv[0] - elif os.fork(): - pid, status = os.wait() - return status >> 8 elif sys.argv[1] == 'start': start_xenstored() start_consoled() diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/pylintrc --- a/tools/python/pylintrc Thu Sep 22 17:34:14 2005 +++ b/tools/python/pylintrc Thu Sep 22 17:42:01 2005 @@ -74,7 +74,7 @@ init-import=no # List of variable names used for dummy variables (i.e. not used). -dummy-variables=_,dummy +dummy-variables=_,_1,_2,_3,_4,_5,dummy @@ -131,7 +131,7 @@ bad-names=foo,bar,baz,toto,tutu,tata # List of builtins function names that should not be used, separated by a comma -bad-functions=map,filter,apply,input +bad-functions=apply,input diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/lowlevel/xc/xc.c --- a/tools/python/xen/lowlevel/xc/xc.c Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/lowlevel/xc/xc.c Thu Sep 22 17:42:01 2005 @@ -220,7 +220,13 @@ return PyErr_NoMemory(); nr_doms = xc_domain_getinfo(xc->xc_handle, first_dom, max_doms, info); - + + if (nr_doms < 0) + { + free(info); + return PyErr_SetFromErrno(xc_error); + } + list = PyList_New(nr_doms); for ( i = 0 ; i < nr_doms; i++ ) { @@ -844,7 +850,7 @@ XcObject *xc = (XcObject *)self; u32 dom; - unsigned long maxmem_kb; + unsigned int maxmem_kb; static char *kwd_list[] = { "dom", "maxmem_kb", NULL }; @@ -1175,7 +1181,7 @@ METH_VARARGS | METH_KEYWORDS, "\n" "Set a domain's memory limit\n" " dom [int]: Identifier of domain.\n" - " maxmem_kb [long]: .\n" + " maxmem_kb [int]: .\n" "Returns: 
[int] 0 on success; -1 on error.\n" }, { "domain_memory_increase_reservation", diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/lowlevel/xs/xs.c --- a/tools/python/xen/lowlevel/xs/xs.c Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/lowlevel/xs/xs.c Thu Sep 22 17:42:01 2005 @@ -116,8 +116,6 @@ "Write data to a path.\n" \ " path [string] : xenstore path to write to\n." \ " data [string] : data to write.\n" \ - " create [int] : create flag, default 0.\n" \ - " excl [int] : exclusive flag, default 0.\n" \ "\n" \ "Returns None on success.\n" \ "Raises RuntimeError on error.\n" \ @@ -125,30 +123,23 @@ static PyObject *xspy_write(PyObject *self, PyObject *args, PyObject *kwds) { - static char *kwd_spec[] = { "path", "data", "create", "excl", NULL }; - static char *arg_spec = "ss#|ii"; + static char *kwd_spec[] = { "path", "data", NULL }; + static char *arg_spec = "ss#"; char *path = NULL; char *data = NULL; int data_n = 0; - int create = 0; - int excl = 0; - - struct xs_handle *xh = xshandle(self); - PyObject *val = NULL; - int flags = 0; + + struct xs_handle *xh = xshandle(self); + PyObject *val = NULL; int xsval = 0; if (!xh) goto exit; if (!PyArg_ParseTupleAndKeywords(args, kwds, arg_spec, kwd_spec, - &path, &data, &data_n, &create, &excl)) - goto exit; - if (create) - flags |= O_CREAT; - if (excl) - flags |= O_EXCL; - Py_BEGIN_ALLOW_THREADS - xsval = xs_write(xh, path, data, data_n, flags); + &path, &data, &data_n)) + goto exit; + Py_BEGIN_ALLOW_THREADS + xsval = xs_write(xh, path, data, data_n); Py_END_ALLOW_THREADS if (!xsval) { PyErr_SetFromErrno(PyExc_RuntimeError); @@ -808,6 +799,48 @@ } Py_INCREF(Py_None); val = Py_None; + exit: + return val; +} + +#define xspy_get_domain_path_doc "\n" \ + "Return store path of domain.\n" \ + " domid [int]: domain id\n" \ + "\n" \ + "Returns: [string] domain store path.\n" \ + " None if domid doesn't exist.\n" \ + "Raises RuntimeError on error.\n" \ + "\n" + +static PyObject *xspy_get_domain_path(PyObject *self, PyObject 
*args, + PyObject *kwds) +{ + static char *kwd_spec[] = { "domid", NULL }; + static char *arg_spec = "i"; + int domid = 0; + + struct xs_handle *xh = xshandle(self); + char *xsval = NULL; + PyObject *val = NULL; + + if (!xh) + goto exit; + if (!PyArg_ParseTupleAndKeywords(args, kwds, arg_spec, kwd_spec, + &domid)) + goto exit; + Py_BEGIN_ALLOW_THREADS + xsval = xs_get_domain_path(xh, domid); + Py_END_ALLOW_THREADS + if (!xsval) { + if (errno == ENOENT) { + Py_INCREF(Py_None); + val = Py_None; + } else + PyErr_SetFromErrno(PyExc_RuntimeError); + goto exit; + } + val = PyString_FromString(xsval); + free(xsval); exit: return val; } @@ -858,6 +891,7 @@ XSPY_METH(release_domain), XSPY_METH(close), XSPY_METH(shutdown), + XSPY_METH(get_domain_path), XSPY_METH(fileno), { /* Terminator. */ }, }; diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/sv/Main.py --- a/tools/python/xen/sv/Main.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/sv/Main.py Thu Sep 22 17:42:01 2005 @@ -1,5 +1,4 @@ -from xen.sv.HTMLBase import HTMLBase from xen.sv.NodeInfo import NodeInfo from xen.sv.DomInfo import DomInfo from xen.sv.CreateDomain import CreateDomain @@ -33,15 +32,8 @@ result.append( (key, self.fieldStorage.getlist( key ) ) ) return result -class TwistedAdapter: - def __init__( self, req ): - self.args = Args( req ) - self.uri = req.unparsed_uri - self.url = req.uri - self.write = req.write - # This is the Main class -# It peices together all the modules +# It pieces together all the modules class Main: def __init__( self ): @@ -61,7 +53,7 @@ self.init_modules( request ) self.init_done = True - for moduleName, module in self.modules.iteritems(): + for _, module in self.modules.iteritems(): module.write_MENU( request ) request.write( "\n" ) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/sv/Wizard.py --- a/tools/python/xen/sv/Wizard.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/sv/Wizard.py Thu Sep 22 17:42:01 2005 @@ -47,7 +47,7 @@ def __init__( self, urlWriter, title, 
location ): HTMLBase.__init__( self ) self.urlWriter = urlWriter - self.feilds = [] + self.fields = [] self.title = title self.location = location self.passback = None @@ -86,9 +86,9 @@ request.write( "<table width='100%' cellpadding='0' cellspacing='1' border='0'>" ) - for (feild, control) in self.feilds: - control.write_Control( request, previous_values.get( feild ) ) - if previous_values.get( feild ) is not None and not control.validate( previous_values.get( feild ) ): + for (field, control) in self.fields: + control.write_Control( request, previous_values.get( field ) ) + if previous_values.get( field ) is not None and not control.validate( previous_values.get( field ) ): control.write_Help( request ) request.write( "</table>" ) @@ -97,7 +97,7 @@ #request.write( "<input type='hidden' name='visited-sheet%s' value='True'></p>" % self.location ) def addControl( self, control ): - self.feilds.append( [ control.getName(), control ] ) + self.fields.append( [ control.getName(), control ] ) def validate( self, request ): @@ -108,10 +108,10 @@ previous_values = ssxp2hash( string2sxp( self.passback ) ) #get the map for quick reference if DEBUG: print previous_values - for (feild, control) in self.feilds: - if not control.validate( previous_values.get( feild ) ): + for (field, control) in self.fields: + if not control.validate( previous_values.get( field ) ): check = False - if DEBUG: print "> %s = %s" % (feild, previous_values.get( feild )) + if DEBUG: print "> %s = %s" % (field, previous_values.get( field )) return check @@ -143,7 +143,7 @@ class InputControl( SheetControl ): - def __init__( self, name, defaultValue, humanText, reg_exp = ".*", help_text = "You must enter the appropriate details in this feild." ): + def __init__( self, name, defaultValue, humanText, reg_exp = ".*", help_text = "You must enter the appropriate details in this field." 
): SheetControl.__init__( self, reg_exp ) self.setName( name ) @@ -206,7 +206,7 @@ class FileControl( InputControl ): - def __init__( self, name, defaultValue, humanText, reg_exp = ".*", help_text = "You must enter the appropriate details in this feild." ): + def __init__( self, name, defaultValue, humanText, reg_exp = ".*", help_text = "You must enter the appropriate details in this field." ): InputControl.__init__( self, name, defaultValue, humanText ) def validate( self, persistedValue ): diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/util/process.py --- a/tools/python/xen/util/process.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/util/process.py Thu Sep 22 17:42:01 2005 @@ -24,6 +24,8 @@ r = p.poll() for (fd, event) in r: if event == select.POLLHUP: + cout.close() + cerr.close() return stdout if fd == cout.fileno(): stdout = stdout + cout.readline() diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/web/httpserver.py --- a/tools/python/xen/web/httpserver.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/web/httpserver.py Thu Sep 22 17:42:01 2005 @@ -273,6 +273,9 @@ self.interface = interface self.port = port self.root = root + # ready indicates when we are ready to begin accept connections + # it should be set after a successful bind + self.ready = False def getRoot(self): return self.root @@ -283,6 +286,7 @@ def run(self): self.bind() self.listen() + self.ready = True self.requestLoop() def stop(self): diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/web/tcp.py --- a/tools/python/xen/web/tcp.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/web/tcp.py Thu Sep 22 17:42:01 2005 @@ -18,6 +18,8 @@ import sys import socket import types +import time +import errno from connection import * from protocol import * @@ -35,9 +37,20 @@ def createSocket(self): sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - addr = (self.interface, self.port) - sock.bind(addr) - return sock + + # 
SO_REUSEADDR does not always ensure that we do not get an address + # in use error when restarted quickly + # we implement a timeout to try and avoid failing unnecessarily + timeout = time.time() + 30 + while True: + try: + sock.bind((self.interface, self.port)) + return sock + except socket.error, (_errno, strerrno): + if _errno == errno.EADDRINUSE and time.time() < timeout: + time.sleep(0.5) + else: + raise def acceptConnection(self, sock, protocol, addr): return TCPServerConnection(sock, protocol, addr, self) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/Args.py --- a/tools/python/xen/xend/Args.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/Args.py Thu Sep 22 17:42:01 2005 @@ -32,12 +32,12 @@ self.arg_dict = {} self.key_ord = [] self.key_dict = {} - for (name, type) in paramspec: + for (name, typ) in paramspec: self.arg_ord.append(name) - self.arg_dict[name] = type - for (name, type) in keyspec: + self.arg_dict[name] = typ + for (name, typ) in keyspec: self.key_ord.append(name) - self.key_dict[name] = type + self.key_dict[name] = typ def get_args(self, d, xargs=None): args = {} @@ -56,12 +56,12 @@ def split_args(self, d, args, keys): for (k, v) in d.items(): if k in self.arg_dict: - type = self.arg_dict[k] - val = self.coerce(type, v) + typ = self.arg_dict[k] + val = self.coerce(typ, v) args[k] = val elif k in self.key_dict: - type = self.key_dict[k] - val = self.coerce(type, v) + typ = self.key_dict[k] + val = self.coerce(typ, v) keys[k] = val else: raise ArgError('Invalid parameter: %s' % k) @@ -85,20 +85,20 @@ d[k] = val return self.get_args(d, xargs=xargs) - def coerce(self, type, v): + def coerce(self, typ, v): try: - if type == 'int': + if typ == 'int': val = int(v) - elif type == 'long': + elif typ == 'long': val = long(v) - elif type == 'str': + elif typ == 'str': val = str(v) - elif type == 'sxpr': + elif typ == 'sxpr': val = self.sxpr(v) - elif type == 'bool': + elif typ == 'bool': val = self.bool(v) else: - raise 
ArgError('invalid type:' + str(type)) + raise ArgError('invalid type:' + str(typ)) return val except ArgError: raise @@ -142,7 +142,9 @@ Used on the client. """ - def __init__(self, fn, paramspec, keyspec={}): + def __init__(self, fn, paramspec, keyspec = None): + if keyspec == None: + keyspec = {} Args.__init__(self, paramspec, keyspec) self.fn = fn @@ -154,7 +156,9 @@ Used in the HTTP server. """ - def __init__(self, fn, paramspec, keyspec={}): + def __init__(self, fn, paramspec, keyspec = None): + if keyspec == None: + keyspec = {} Args.__init__(self, paramspec, keyspec) self.fn = fn diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/EventServer.py --- a/tools/python/xen/xend/EventServer.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/EventServer.py Thu Sep 22 17:42:01 2005 @@ -145,7 +145,7 @@ self.lock.release() if async: - scheduler.now(self.call_handlers, [event, val]) + scheduler.now(self.call_handlers, event, val) else: self.call_handlers(event, val) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/PrettyPrint.py --- a/tools/python/xen/xend/PrettyPrint.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/PrettyPrint.py Thu Sep 22 17:42:01 2005 @@ -13,6 +13,7 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #============================================================================ # Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +# Copyright (C) 2005 XenSource Ltd #============================================================================ """General pretty-printer, including support for SXP. 
@@ -34,11 +35,11 @@ def get_width(self): return self.width - def output(self, out): + def output(self, _): print '***PrettyItem>output>', self pass - def prettyprint(self, out, width): + def prettyprint(self, _, width): print '***PrettyItem>prettyprint>', self return width @@ -51,7 +52,7 @@ def output(self, out): out.write(self.value) - def prettyprint(self, line): + def prettyprint(self, line, _): line.output(self) def show(self, out): @@ -62,7 +63,7 @@ def output(self, out): out.write(' ' * self.width) - def prettyprint(self, line): + def prettyprint(self, line, _): line.output(self) def show(self, out): @@ -79,7 +80,7 @@ def output(self, out): out.write(' ' * self.width) - def prettyprint(self, line): + def prettyprint(self, line, _): if line.breaks(self.space): self.active = 1 line.newline(self.indent) @@ -88,26 +89,20 @@ def show(self, out): print >> out, ("(break (width %d) (indent %d) (space %d) (active %d))" - % (self.width, self.indent, self.space, self.lspace, self.active)) + % (self.width, self.indent, self.space, self.active)) class PrettyNewline(PrettySpace): - - def __init__(self, indent): - PrettySpace.__init__(self, indent) def insert(self, block): block.newline() block.addtoline(self) - def output(self, out): - out.write(' ' * self.width) - - def prettyprint(self, line): + def prettyprint(self, line, _): line.newline(0) line.output(self) def show(self, out): - print >> out, ("(nl (indent %d))" % self.indent) + print >> out, ("(nl (width %d))" % self.width) class PrettyLine(PrettyItem): def __init__(self): @@ -132,7 +127,7 @@ lastbreak.space = (width - lastwidth) self.width = width - def prettyprint(self, line): + def prettyprint(self, line, _): for x in self.content: x.prettyprint(line) @@ -145,7 +140,8 @@ class PrettyBlock(PrettyItem): def __init__(self, all=0, parent=None): - self.width = 0 + PrettyItem.__init__(self, 0) + self.lines = [] self.parent = parent self.indent = 0 @@ -163,7 +159,7 @@ if self.width < l.width: self.width = l.width - def 
breaks(self, n): + def breaks(self, _): return self.all and self.broken def newline(self): @@ -172,7 +168,7 @@ def addtoline(self, x): self.lines[-1].write(x) - def prettyprint(self, line): + def prettyprint(self, line, _): self.indent = line.used line.block = self if not line.fits(self.width): @@ -191,6 +187,7 @@ class Line: def __init__(self, out, width): + self.block = None self.out = out self.width = width self.used = 0 @@ -255,8 +252,7 @@ self.block = self.block.parent def prettyprint(self, out=sys.stdout): - line = Line(out, self.width) - self.top.prettyprint(line) + self.top.prettyprint(Line(out, self.width)) class SXPPrettyPrinter(PrettyPrinter): """An SXP prettyprinter. diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/Vifctl.py --- a/tools/python/xen/xend/Vifctl.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/Vifctl.py Thu Sep 22 17:42:01 2005 @@ -13,13 +13,13 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #============================================================================ # Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +# Copyright (C) 2005 XenSource Ltd #============================================================================ """Xend interface to networking control scripts. """ import os import os.path -import sys import xen.util.process from xen.xend import XendRoot @@ -71,7 +71,7 @@ vif = vif_old return vif -def vifctl(op, vif=None, script=None, domain=None, mac=None, bridge=None, ipaddr=[]): +def vifctl(op, vif=None, script=None, domain=None, mac=None, bridge=None, ipaddr=None): """Call a vif control script. Xend calls this when bringing vifs up or down. diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/XendBootloader.py --- a/tools/python/xen/xend/XendBootloader.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/XendBootloader.py Thu Sep 22 17:42:01 2005 @@ -12,7 +12,7 @@ # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
# -import os, sys, select, errno +import os, select, errno import sxp from XendLogging import log @@ -72,7 +72,7 @@ if len(s) == 0: break - (pid, status) = os.waitpid(child, 0) + os.waitpid(child, 0) os.close(r) os.unlink(BL_FIFO) @@ -89,6 +89,4 @@ if vcpus and sxp.child_value(config_image, "vcpus") is None: config_image.append(['vcpus', vcpus]) - config = ['image', config_image] - return config - + return config_image diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/XendCheckpoint.py --- a/tools/python/xen/xend/XendCheckpoint.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/XendCheckpoint.py Thu Sep 22 17:42:01 2005 @@ -4,7 +4,6 @@ # Public License. See the file "COPYING" in the main directory of # this archive for more details. -import errno import os import re import select @@ -12,7 +11,7 @@ from string import join from struct import pack, unpack, calcsize from xen.util.xpopen import xPopen3 -import xen.lowlevel.xc; xc = xen.lowlevel.xc.new() +import xen.lowlevel.xc from xen.xend.xenstore.xsutil import IntroduceDomain from XendError import XendError @@ -24,6 +23,10 @@ sizeof_int = calcsize("i") sizeof_unsigned_long = calcsize("L") + + +xc = xen.lowlevel.xc.new() + def write_exact(fd, buf, errmsg): if os.write(fd, buf) != len(buf): @@ -83,7 +86,7 @@ if child.wait() != 0: raise XendError("xc_save failed: %s" % lasterr) - dominfo.setStoreChannel(None) + dominfo.closeStoreChannel() xd.domain_destroy(dominfo.domid) return None diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/XendClient.py --- a/tools/python/xen/xend/XendClient.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/XendClient.py Thu Sep 22 17:42:01 2005 @@ -33,8 +33,6 @@ UnixXendClientProtocol, \ XendError -DEBUG = 0 - def fileof(val): """Converter for passing configs or other 'large' data. Handles lists, files directly. 
@@ -385,7 +383,6 @@ python XendClient.py domain 0 (domain (id 0) (name Domain-0) (memory 128)) """ - global DEBUG from getopt import getopt short_options = 'x:au:d' long_options = ['xend=', 'unix=', 'debug'] @@ -397,8 +394,6 @@ srv = v elif k in ['-u', '--unix']: unix = int(v) - elif k in ['-d', '--debug']: - DEBUG = 1 if len(args): fn = args[0] args = args[1:] diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/XendDmesg.py --- a/tools/python/xen/xend/XendDmesg.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/XendDmesg.py Thu Sep 22 17:42:01 2005 @@ -18,7 +18,6 @@ """Get dmesg output for this node. """ -import os import xen.lowlevel.xc class XendDmesg: diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/XendDomain.py --- a/tools/python/xen/xend/XendDomain.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/XendDomain.py Thu Sep 22 17:42:01 2005 @@ -14,40 +14,52 @@ #============================================================================ # Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> # Copyright (C) 2005 Christian Limpach <Christian.Limpach@xxxxxxxxxxxx> +# Copyright (C) 2005 XenSource Ltd #============================================================================ """Handler for domain operations. Nothing here is persistent (across reboots). Needs to be persistent for one uptime. 
""" -import errno import os -import sys -import time -import traceback - -import xen.lowlevel.xc; xc = xen.lowlevel.xc.new() + +import xen.lowlevel.xc from xen.xend import sxp -from xen.xend import XendRoot; xroot = XendRoot.instance() +from xen.xend import XendRoot from xen.xend import XendCheckpoint from xen.xend.XendDomainInfo import XendDomainInfo, shutdown_reason -from xen.xend import EventServer; eserver = EventServer.instance() +from xen.xend import EventServer from xen.xend.XendError import XendError from xen.xend.XendLogging import log from xen.xend import scheduler from xen.xend.server import relocate from xen.xend.uuid import getUuid from xen.xend.xenstore import XenNode, DBMap +from xen.xend.xenstore.xstransact import xstransact +from xen.xend.xenstore.xsutil import GetDomainPath + + +xc = xen.lowlevel.xc.new() +xroot = XendRoot.instance() +eserver = EventServer.instance() + __all__ = [ "XendDomain" ] SHUTDOWN_TIMEOUT = 30 +PRIV_DOMAIN = 0 + +def is_dead(dom): + return dom['crashed'] or dom['shutdown'] or ( + dom['dying'] and not(dom['running'] or dom['paused'] or + dom['blocked'])) + class XendDomainDict(dict): def get_by_name(self, name): try: - return filter(lambda d: d.name == name, self.values())[0] + return filter(lambda d: d.getName() == name, self.values())[0] except IndexError, err: return None @@ -65,9 +77,12 @@ # So we stuff the XendDomain instance (self) into xroot's components. xroot.add_component("xen.xend.XendDomain", self) self.domains = XendDomainDict() - self.dbmap = DBMap(db=XenNode("/domain")) + self.domroot = "/domain" + self.vmroot = "/domain" + self.dbmap = DBMap(db=XenNode(self.vmroot)) self.watchReleaseDomain() self.initial_refresh() + self.dom0_setup() def list(self): """Get list of domain objects. 
@@ -83,7 +98,7 @@ @return: domain objects """ doms = self.list() - doms.sort(lambda x, y: cmp(x.name, y.name)) + doms.sort(lambda x, y: cmp(x.getName(), y.getName())) return doms def list_names(self): @@ -92,10 +107,12 @@ @return: domain names """ doms = self.list_sorted() - return map(lambda x: x.name, doms) + return map(lambda x: x.getName(), doms) def onReleaseDomain(self): - self.refresh(cleanup=True) + self.reap() + self.refresh() + self.domain_restarts() def watchReleaseDomain(self): from xen.xend.xenstore.xswatch import xswatch @@ -123,70 +140,58 @@ else: dominfo = dominfo[0] return dominfo - + def initial_refresh(self): """Refresh initial domain info from db. """ doms = self.xen_domains() - self.dbmap.readDB() - for domdb in self.dbmap.values(): - if not domdb.has_key("xend"): - continue - db = domdb.addChild("xend") + self.dbmap.readDB() # XXX only needed for "xend" + for dom in doms.values(): + domid = dom['dom'] + dompath = GetDomainPath(domid) + if not dompath: + continue + vmpath = xstransact.Read(dompath, "vm") + if not vmpath: + continue + uuid = xstransact.Read(vmpath, "uuid") + if not uuid: + continue + log.info("recreating domain %d, uuid %s" % (domid, uuid)) + dompath = "/".join(dompath.split("/")[0:-1]) try: - domid = int(domdb["domid"].getData()) - except: - domid = None - # XXX if domid in self.domains, then something went wrong - if (domid is None) or (domid in self.domains): - domdb.delete() - elif domid in doms: - try: - self._new_domain(domdb["uuid"].getData(), domid, db, - doms[domid]) - except Exception, ex: - log.exception("Error recreating domain info: id=%d", domid) - self._delete_domain(domid) - else: - self._delete_domain(domid) - self.refresh(cleanup=True) - - dom0 = self.domain_lookup(0) + dominfo = XendDomainInfo.recreate(uuid, dompath, domid, dom) + except Exception, ex: + log.exception("Error recreating domain info: id=%d", domid) + continue + self._add_domain(dominfo) + self.reap() + self.refresh() + self.domain_restarts() + + 
def dom0_setup(self): + dom0 = self.domain_lookup(PRIV_DOMAIN) if not dom0: - dom0 = self.domain_unknown(0) + dom0 = self.dom0_unknown() dom0.dom0_init_store() + dom0.dom0_enforce_vcpus() def close(self): pass - def _new_domain(self, uuid, domid, db, info): - """Create a domain entry from saved info. - - @param db: saved info from the db - @param info: domain info from xen - @return: domain - """ - dominfo = XendDomainInfo.recreate(uuid, domid, db, info) - self.domains[dominfo.domid] = dominfo - return dominfo - def _add_domain(self, info, notify=True): """Add a domain entry to the tables. @param info: domain info object @param notify: send a domain created event if true """ - # Remove entries under the wrong id. - for i, d in self.domains.items(): - if i != d.domid: - del self.domains[i] - self.dbmap.delete(d.uuid) - if info.domid in self.domains: + if info.getDomid() in self.domains: notify = False - self.domains[info.domid] = info - info.exportToDB(save=True) + self.domains[info.getDomid()] = info + info.exportToDB() if notify: - eserver.inject('xend.domain.create', [info.name, info.domid]) + eserver.inject('xend.domain.create', [info.getName(), + info.getDomid()]) def _delete_domain(self, id, notify=True): """Remove a domain from the tables. @@ -194,18 +199,14 @@ @param id: domain id @param notify: send a domain died event if true """ - try: - if self.xen_domain(id): - return - except: - pass info = self.domains.get(id) if info: del self.domains[id] info.cleanup() info.delete() if notify: - eserver.inject('xend.domain.died', [info.name, info.domid]) + eserver.inject('xend.domain.died', [info.getName(), + info.getDomid()]) # XXX this should not be needed for domdb in self.dbmap.values(): if not domdb.has_key("xend"): @@ -222,61 +223,40 @@ """Look for domains that have crashed or stopped. Tidy them up. 
""" - casualties = [] doms = self.xen_domains() for d in doms.values(): - dead = 0 - dead = dead or (d['crashed'] or d['shutdown']) - dead = dead or (d['dying'] and - not(d['running'] or d['paused'] or d['blocked'])) - if dead: - casualties.append(d) - for d in casualties: - id = d['dom'] - dominfo = self.domains.get(id) - name = (dominfo and dominfo.name) or '??' - if dominfo and dominfo.is_terminated(): - continue - log.debug('XendDomain>reap> domain died name=%s id=%d', name, id) + if not is_dead(d): + continue + domid = d['dom'] + dominfo = self.domains.get(domid) + if not dominfo or dominfo.is_terminated(): + continue + log.debug('domain died name=%s domid=%d', dominfo.getName(), domid) + if d['crashed'] and xroot.get_enable_dump(): + self.domain_dumpcore(domid) if d['shutdown']: reason = shutdown_reason(d['shutdown_reason']) - log.debug('XendDomain>reap> shutdown name=%s id=%d reason=%s', name, id, reason) - if reason in ['suspend']: - if dominfo and dominfo.is_terminated(): - log.debug('XendDomain>reap> Suspended domain died id=%d', id) - else: - eserver.inject('xend.domain.suspended', [name, id]) - if dominfo: - dominfo.state_set("suspended") - continue + log.debug('shutdown name=%s id=%d reason=%s', + dominfo.getName(), domid, reason) + if reason == 'suspend': + dominfo.state_set("suspended") + continue if reason in ['poweroff', 'reboot']: - eserver.inject('xend.domain.exit', [name, id, reason]) - self.domain_restart_schedule(id, reason) - else: - if xroot.get_enable_dump(): - self.domain_dumpcore(id) - eserver.inject('xend.domain.exit', [name, id, 'crash']) - self.final_domain_destroy(id) - - def refresh(self, cleanup=False): + self.domain_restart_schedule(domid, reason) + dominfo.destroy() + + def refresh(self): """Refresh domain list from Xen. """ - if cleanup: - self.reap() doms = self.xen_domains() # Remove entries for domains that no longer exist. # Update entries for existing domains. 
- do_domain_restarts = False for d in self.domains.values(): - info = doms.get(d.domid) + info = doms.get(d.getDomid()) if info: d.update(info) - elif d.restart_pending(): - do_domain_restarts = True - else: - self._delete_domain(d.domid) - if cleanup and do_domain_restarts: - scheduler.now(self.domain_restarts) + elif not d.restart_pending(): + self._delete_domain(d.getDomid()) def update_domain(self, id): """Update information for a single domain. @@ -297,7 +277,8 @@ @param config: configuration @return: domain """ - dominfo = XendDomainInfo.create(self.dbmap, config) + dominfo = XendDomainInfo.create(self.dbmap.getPath(), config) + self._add_domain(dominfo) return dominfo def domain_restart(self, dominfo): @@ -305,31 +286,39 @@ @param dominfo: domain object """ - log.info("Restarting domain: name=%s id=%s", dominfo.name, dominfo.domid) + log.info("Restarting domain: name=%s id=%s", dominfo.getName(), + dominfo.getDomid()) eserver.inject("xend.domain.restart", - [dominfo.name, dominfo.domid, "begin"]) + [dominfo.getName(), dominfo.getDomid(), "begin"]) try: dominfo.restart() - log.info('Restarted domain name=%s id=%s', dominfo.name, dominfo.domid) + log.info('Restarted domain name=%s id=%s', dominfo.getName(), + dominfo.getDomid()) eserver.inject("xend.domain.restart", - [dominfo.name, dominfo.domid, "success"]) - self.domain_unpause(dominfo.domid) + [dominfo.getName(), dominfo.getDomid(), + "success"]) + self.domain_unpause(dominfo.getDomid()) except Exception, ex: log.exception("Exception restarting domain: name=%s id=%s", - dominfo.name, dominfo.domid) + dominfo.getName(), dominfo.getDomid()) eserver.inject("xend.domain.restart", - [dominfo.name, dominfo.domid, "fail"]) + [dominfo.getName(), dominfo.getDomid(), "fail"]) return dominfo - def domain_configure(self, vmconfig): + def domain_configure(self, config): """Configure an existing domain. This is intended for internal use by domain restore and migrate. 
@param vmconfig: vm configuration """ - config = sxp.child_value(vmconfig, 'config') - dominfo = XendDomainInfo.restore(self.dbmap, config) - return dominfo + # We accept our configuration specified as ['config' [...]], which + # some tools or configuration files may be using. For save-restore, + # we use the value of XendDomainInfo.sxpr() directly, which has no + # such item. + nested = sxp.child_value(config, 'config') + if nested: + config = nested + return XendDomainInfo.restore(self.dbmap.getPath(), config) def domain_restore(self, src, progress=False): """Restore a domain from file. @@ -340,7 +329,9 @@ try: fd = os.open(src, os.O_RDONLY) - return XendCheckpoint.restore(self, fd) + dominfo = XendCheckpoint.restore(self, fd) + self._add_domain(dominfo) + return dominfo except OSError, ex: raise XendError("can't read guest state file %s: %s" % (src, ex[1])) @@ -354,22 +345,32 @@ self.update_domain(id) return self.domains.get(id) - def domain_unknown(self, id): - try: - info = self.xen_domain(id) - if info: - uuid = getUuid() - log.info( - "Creating entry for unknown domain: id=%d uuid=%s", - id, uuid) - db = self.dbmap.addChild("%s/xend" % uuid) - dominfo = XendDomainInfo.recreate(uuid, id, db, info) - self._add_domain(dominfo) - return dominfo - except Exception, ex: - raise - log.exception("Error creating domain info: id=%d", id) - return None + def dom0_unknown(self): + dom0 = PRIV_DOMAIN + uuid = None + info = self.xen_domain(dom0) + dompath = GetDomainPath(dom0) + if dompath: + vmpath = xstransact.Read(dompath, "vm") + if vmpath: + uuid = xstransact.Read(vmpath, "uuid") + if not uuid: + uuid = dompath.split("/")[-1] + dompath = "/".join(dompath.split("/")[0:-1]) + if not uuid: + uuid = getUuid() + dompath = self.domroot + log.info("Creating entry for unknown xend domain: id=%d uuid=%s", + dom0, uuid) + try: + dominfo = XendDomainInfo.recreate(uuid, dompath, dom0, info) + self._add_domain(dominfo) + return dominfo + except Exception, exn: + 
log.exception(exn) + raise XendError("Error recreating xend domain info: id=%d: %s" % + (dom0, str(exn))) + def domain_lookup(self, id): return self.domains.get(id) @@ -390,9 +391,10 @@ @param id: domain id """ dominfo = self.domain_lookup(id) - eserver.inject('xend.domain.unpause', [dominfo.name, dominfo.domid]) - try: - return xc.domain_unpause(dom=dominfo.domid) + eserver.inject('xend.domain.unpause', [dominfo.getName(), + dominfo.getDomid()]) + try: + return xc.domain_unpause(dom=dominfo.getDomid()) except Exception, ex: raise XendError(str(ex)) @@ -402,9 +404,10 @@ @param id: domain id """ dominfo = self.domain_lookup(id) - eserver.inject('xend.domain.pause', [dominfo.name, dominfo.domid]) - try: - return xc.domain_pause(dom=dominfo.domid) + eserver.inject('xend.domain.pause', [dominfo.getName(), + dominfo.getDomid()]) + try: + return xc.domain_pause(dom=dominfo.getDomid()) except Exception, ex: raise XendError(str(ex)) @@ -420,8 +423,9 @@ @param reason: shutdown type: poweroff, reboot, suspend, halt """ dominfo = self.domain_lookup(id) - self.domain_restart_schedule(dominfo.domid, reason, force=True) - eserver.inject('xend.domain.shutdown', [dominfo.name, dominfo.domid, reason]) + self.domain_restart_schedule(dominfo.getDomid(), reason, force=True) + eserver.inject('xend.domain.shutdown', [dominfo.getName(), + dominfo.getDomid(), reason]) if reason == 'halt': reason = 'poweroff' val = dominfo.shutdown(reason) @@ -445,13 +449,13 @@ if not dominfo.shutdown_pending: # domain doesn't need shutdown continue - id = dominfo.domid + id = dominfo.getDomid() left = dominfo.shutdown_time_left(SHUTDOWN_TIMEOUT) if left <= 0: # Shutdown expired - destroy domain. 
try: log.info("Domain shutdown timeout expired: name=%s id=%s", - dominfo.name, id) + dominfo.getName(), id) self.domain_destroy(id, reason= dominfo.shutdown_pending['reason']) except Exception: @@ -476,15 +480,16 @@ restart = (force and reason == 'reboot') or dominfo.restart_needed(reason) if restart: log.info('Scheduling restart for domain: name=%s id=%s', - dominfo.name, dominfo.domid) + dominfo.getName(), dominfo.getDomid()) eserver.inject("xend.domain.restart", - [dominfo.name, dominfo.domid, "schedule"]) + [dominfo.getName(), dominfo.getDomid(), + "schedule"]) dominfo.restarting() else: log.info('Cancelling restart for domain: name=%s id=%s', - dominfo.name, dominfo.domid) + dominfo.getName(), dominfo.getDomid()) eserver.inject("xend.domain.restart", - [dominfo.name, dominfo.domid, "cancel"]) + [dominfo.getName(), dominfo.getDomid(), "cancel"]) dominfo.restart_cancel() def domain_restarts(self): @@ -494,45 +499,36 @@ for dominfo in self.domains.values(): if not dominfo.restart_pending(): continue - print 'domain_restarts>', dominfo.name, dominfo.domid - info = doms.get(dominfo.domid) + info = doms.get(dominfo.getDomid()) if info: # Don't execute restart for domains still running. - print 'domain_restarts> still runnning: ', dominfo.name continue # Remove it from the restarts. - print 'domain_restarts> restarting: ', dominfo.name + log.info('restarting: %s' % dominfo.getName()) self.domain_restart(dominfo) - def final_domain_destroy(self, id): - """Final destruction of a domain.. - - @param id: domain id - """ - try: - dominfo = self.domain_lookup(id) - log.info('Destroying domain: name=%s', dominfo.name) - eserver.inject('xend.domain.destroy', [dominfo.name, dominfo.domid]) + def domain_destroy(self, domid, reason='halt'): + """Terminate domain immediately. 
+ - halt: cancel any restart for the domain + - reboot schedule a restart for the domain + + @param domid: domain id + """ + + if domid == PRIV_DOMAIN: + raise XendError("Cannot destroy privileged domain %i" % domid) + + self.domain_restart_schedule(domid, reason, force=True) + dominfo = self.domain_lookup(domid) + if dominfo: val = dominfo.destroy() - except: - #todo + else: try: - val = xc.domain_destroy(dom=id) + val = xc.domain_destroy(dom=domid) except Exception, ex: raise XendError(str(ex)) return val - def domain_destroy(self, id, reason='halt'): - """Terminate domain immediately. - - halt: cancel any restart for the domain - - reboot schedule a restart for the domain - - @param id: domain id - """ - self.domain_restart_schedule(id, reason, force=True) - val = self.final_domain_destroy(id) - return val - def domain_migrate(self, id, dst, live=False, resource=0): """Start domain migration. @@ -547,13 +543,14 @@ # temporarily rename domain for localhost migration if dst == "localhost": - dominfo.name = "tmp-" + dominfo.name + dominfo.setName("tmp-" + dominfo.getName()) try: XendCheckpoint.save(self, sock.fileno(), dominfo, live) except: if dst == "localhost": - dominfo.name = string.replace(dominfo.name, "tmp-", "", 1) + dominfo.setName( + string.replace(dominfo.getName(), "tmp-", "", 1)) raise return None @@ -587,7 +584,7 @@ """ dominfo = self.domain_lookup(id) try: - return xc.domain_pincpu(dominfo.domid, vcpu, cpumap) + return xc.domain_pincpu(dominfo.getDomid(), vcpu, cpumap) except Exception, ex: raise XendError(str(ex)) @@ -596,8 +593,10 @@ """ dominfo = self.domain_lookup(id) try: - return xc.bvtsched_domain_set(dom=dominfo.domid, mcuadv=mcuadv, - warpback=warpback, warpvalue=warpvalue, + return xc.bvtsched_domain_set(dom=dominfo.getDomid(), + mcuadv=mcuadv, + warpback=warpback, + warpvalue=warpvalue, warpl=warpl, warpu=warpu) except Exception, ex: raise XendError(str(ex)) @@ -607,7 +606,7 @@ """ dominfo = self.domain_lookup(id) try: - return 
xc.bvtsched_domain_get(dominfo.domid) + return xc.bvtsched_domain_get(dominfo.getDomid()) except Exception, ex: raise XendError(str(ex)) @@ -617,7 +616,8 @@ """ dominfo = self.domain_lookup(id) try: - return xc.sedf_domain_set(dominfo.domid, period, slice, latency, extratime, weight) + return xc.sedf_domain_set(dominfo.getDomid(), period, slice, + latency, extratime, weight) except Exception, ex: raise XendError(str(ex)) @@ -626,7 +626,7 @@ """ dominfo = self.domain_lookup(id) try: - return xc.sedf_domain_get(dominfo.domid) + return xc.sedf_domain_get(dominfo.getDomid()) except Exception, ex: raise XendError(str(ex)) @@ -674,9 +674,8 @@ @param type: device type """ dominfo = self.domain_lookup(id) - val = dominfo.device_delete(type, devid) - dominfo.exportToDB() - return val + return dominfo.destroyDevice(type, devid) + def domain_devtype_ls(self, id, type): """Get list of device sxprs for a domain. @@ -716,7 +715,7 @@ """ dominfo = self.domain_lookup(id) try: - return xc.shadow_control(dominfo.domid, op) + return xc.shadow_control(dominfo.getDomid(), op) except Exception, ex: raise XendError(str(ex)) @@ -730,7 +729,8 @@ dominfo = self.domain_lookup(id) maxmem = int(mem) * 1024 try: - return xc.domain_setmaxmem(dominfo.domid, maxmem_kb = maxmem) + return xc.domain_setmaxmem(dominfo.getDomid(), + maxmem_kb = maxmem) except Exception, ex: raise XendError(str(ex)) @@ -742,7 +742,7 @@ @return: 0 on success, -1 on error """ dominfo = self.domain_lookup(id) - return dominfo.setMemoryTarget(mem * (1 << 20)) + return dominfo.setMemoryTarget(mem << 10) def domain_vcpu_hotplug(self, id, vcpu, state): """Enable or disable VCPU vcpu in DOM id @@ -762,12 +762,13 @@ @param id: domain """ dominfo = self.domain_lookup(id) - corefile = "/var/xen/dump/%s.%s.core"% (dominfo.name, dominfo.domid) - try: - xc.domain_dumpcore(dom=dominfo.domid, corefile=corefile) + corefile = "/var/xen/dump/%s.%s.core" % (dominfo.getName(), + dominfo.getDomid()) + try: + 
xc.domain_dumpcore(dom=dominfo.getDomid(), corefile=corefile) except Exception, ex: log.warning("Dumpcore failed, id=%s name=%s: %s", - dominfo.domid, dominfo.name, ex) + dominfo.getDomid(), dominfo.getName(), ex) def instance(): """Singleton constructor. Use this instead of the class constructor. diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/XendDomainInfo.py Thu Sep 22 17:42:01 2005 @@ -1,4 +1,4 @@ -#============================================================================ +#=========================================================================== # This library is free software; you can redistribute it and/or # modify it under the terms of version 2.1 of the GNU Lesser General Public # License as published by the Free Software Foundation. @@ -13,6 +13,7 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #============================================================================ # Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +# Copyright (C) 2005 XenSource Ltd #============================================================================ """Representation of a single domain. 
@@ -23,31 +24,23 @@ """ -import string, re -import os +import string import time import threading import errno -import xen.lowlevel.xc; xc = xen.lowlevel.xc.new() -from xen.util.ip import check_subnet, get_current_ipgw +import xen.lowlevel.xc from xen.util.blkif import blkdev_uname_to_file -from xen.xend.server import controller -from xen.xend.server import SrvDaemon; xend = SrvDaemon.instance() from xen.xend.server.channel import EventChannel -from xen.util.blkif import blkdev_name_to_number, expand_dev_name from xen.xend import sxp -from xen.xend import Blkctl -from xen.xend.PrettyPrint import prettyprintstring from xen.xend.XendBootloader import bootloader from xen.xend.XendLogging import log from xen.xend.XendError import XendError, VmError from xen.xend.XendRoot import get_component from xen.xend.uuid import getUuid -from xen.xend.xenstore import DBVar, XenNode, DBMap from xen.xend.xenstore.xstransact import xstransact from xen.xend.xenstore.xsutil import IntroduceDomain @@ -88,6 +81,18 @@ STATE_VM_TERMINATED = "terminated" STATE_VM_SUSPENDED = "suspended" +"""Flag for a block device backend domain.""" +SIF_BLK_BE_DOMAIN = (1<<4) + +"""Flag for a net device backend domain.""" +SIF_NET_BE_DOMAIN = (1<<5) + +"""Flag for a TPM device backend domain.""" +SIF_TPM_BE_DOMAIN = (1<<7) + + +xc = xen.lowlevel.xc.new() + def domain_exists(name): # See comment in XendDomain constructor. @@ -110,9 +115,13 @@ @param dom: domain id @return: info or None """ - domlist = xc.domain_getinfo(dom, 1) - if domlist and dom == domlist[0]['dom']: - return domlist[0] + try: + domlist = xc.domain_getinfo(dom, 1) + if domlist and dom == domlist[0]['dom']: + return domlist[0] + except Exception, err: + # ignore missing domain + log.exception("domain_getinfo(%d) failed, ignoring", dom) return None class XendDomainInfo: @@ -122,149 +131,288 @@ """ MINIMUM_RESTART_TIME = 20 - def create(cls, parentdb, config): + + def create(cls, dompath, config): """Create a VM from a configuration. 
- @param parentdb: parent db + @param dompath: The path to all domain information @param config configuration @raise: VmError for invalid configuration """ - uuid = getUuid() - db = parentdb.addChild("%s/xend" % uuid) - path = parentdb.getPath() - vm = cls(uuid, path, db) - vm.construct(config) - vm.saveToDB(sync=True) - + + log.debug("XendDomainInfo.create(%s, ...)", dompath) + + vm = cls(getUuid(), dompath, cls.parseConfig(config)) + vm.construct() return vm create = classmethod(create) - def recreate(cls, uuid, domid, db, info): + + def recreate(cls, uuid, dompath, domid, info): """Create the VM object for an existing domain. - @param db: domain db + @param dompath: The path to all domain information @param info: domain info from xc """ - path = "/".join(db.getPath().split("/")[0:-2]) - vm = cls(uuid, path, db) - vm.setDomid(domid) - vm.name, vm.start_time = vm.gatherVm(("name", str), - ("start-time", float)) - try: - db.readDB() - except: pass - vm.importFromDB() - config = vm.config - log.debug('info=' + str(info)) - log.debug('config=' + prettyprintstring(config)) - - vm.memory = info['mem_kb'] / 1024 - vm.target = info['mem_kb'] * 1024 - - if config: - try: - vm.recreate = True - vm.construct(config) - finally: - vm.recreate = False - else: - vm.setName("Domain-%d" % domid) - - vm.exportToDB(save=True) - return vm + + log.debug("XendDomainInfo.recreate(%s, %s, %s, %s)", uuid, dompath, + domid, info) + + return cls(uuid, dompath, info, domid, True) recreate = classmethod(recreate) - def restore(cls, parentdb, config, uuid=None): + + def restore(cls, dompath, config, uuid = None): """Create a domain and a VM object to do a restore. 
- @param parentdb: parent db + @param dompath: The path to all domain information @param config: domain configuration @param uuid: uuid to use """ + + log.debug("XendDomainInfo.restore(%s, %s, %s)", dompath, config, uuid) + if not uuid: uuid = getUuid() - db = parentdb.addChild("%s/xend" % uuid) - path = parentdb.getPath() - vm = cls(uuid, path, db) - ssidref = int(sxp.child_value(config, 'ssidref')) - log.debug('restoring with ssidref='+str(ssidref)) - id = xc.domain_create(ssidref = ssidref) - vm.setDomid(id) + + try: + ssidref = int(sxp.child_value(config, 'ssidref')) + except TypeError, exn: + raise VmError('Invalid ssidref in config: %s' % exn) + + log.debug('restoring with ssidref = %d' % ssidref) + + vm = cls(uuid, dompath, cls.parseConfig(config), + xc.domain_create(ssidref = ssidref)) vm.clear_shutdown() + vm.create_channel() + vm.configure() + vm.exportToDB() + return vm + + restore = classmethod(restore) + + + def parseConfig(cls, config): + def get_cfg(name, conv = None): + val = sxp.child_value(config, name) + + if conv and not val is None: + try: + return conv(val) + except TypeError, exn: + raise VmError( + 'Invalid setting %s = %s in configuration: %s' % + (name, val, str(exn))) + else: + return val + + + log.debug("parseConfig: config is %s" % str(config)) + + result = {} + imagecfg = "()" + + result['name'] = get_cfg('name') + result['ssidref'] = get_cfg('ssidref', int) + result['memory'] = get_cfg('memory', int) + result['mem_kb'] = get_cfg('mem_kb', int) + result['maxmem'] = get_cfg('maxmem', int) + result['maxmem_kb'] = get_cfg('maxmem_kb', int) + result['cpu'] = get_cfg('cpu', int) + result['cpu_weight'] = get_cfg('cpu_weight', float) + result['bootloader'] = get_cfg('bootloader') + result['restart_mode'] = get_cfg('restart') + try: - vm.restore = True - vm.construct(config) - finally: - vm.restore = False - vm.exportToDB(save=True, sync=True) - return vm - - restore = classmethod(restore) - - __exports__ = [ - DBVar('config', ty='sxpr'), - 
DBVar('state', ty='str'), - DBVar('restart_mode', ty='str'), - DBVar('restart_state', ty='str'), - DBVar('restart_time', ty='float'), - DBVar('restart_count', ty='int'), - DBVar('device_model_pid', ty='int'), - ] + imagecfg = get_cfg('image') + + if imagecfg: + result['image'] = imagecfg + result['vcpus'] = int(sxp.child_value(imagecfg, 'vcpus', + 1)) + else: + result['vcpus'] = 1 + except TypeError, exn: + raise VmError( + 'Invalid configuration setting: vcpus = %s: %s' % + (sxp.child_value(imagecfg, 'vcpus', 1), + str(exn))) + + result['backend'] = [] + for c in sxp.children(config, 'backend'): + result['backend'].append(sxp.name(sxp.child0(c))) + + result['device'] = [] + for d in sxp.children(config, 'device'): + c = sxp.child0(d) + result['device'].append((sxp.name(c), c)) + + log.debug("parseConfig: result is %s" % str(result)) + return result + + + parseConfig = classmethod(parseConfig) + - def __init__(self, uuid, path, db): + def __init__(self, uuid, parentpath, info, domid = None, augment = False): + self.uuid = uuid - self.path = path + "/" + uuid - - self.db = db - - self.recreate = 0 - self.restore = 0 - - self.config = None - self.domid = None - self.cpu_weight = 1 - self.start_time = None - self.name = None - self.memory = None - self.ssidref = None + self.info = info + + self.path = parentpath + "/" + uuid + + if domid: + self.domid = domid + elif 'dom' in info: + self.domid = int(info['dom']) + else: + self.domid = None + + if augment: + self.augmentInfo() + + self.validateInfo() + self.image = None - - self.target = None self.store_channel = None self.store_mfn = None self.console_channel = None self.console_mfn = None - self.controllers = {} - - self.info = None - self.blkif_backend = False - self.netif_backend = False - self.netif_idx = 0 - self.tpmif_backend = False #todo: state: running, suspended self.state = STATE_VM_OK self.state_updated = threading.Condition() self.shutdown_pending = None - #todo: set to migrate info if migrating - 
self.migrate = None - - self.restart_mode = RESTART_ONREBOOT self.restart_state = None self.restart_time = None self.restart_count = 0 - self.vcpus = 1 - self.bootloader = None - self.device_model_pid = 0 - self.writeVm("uuid", self.uuid) self.storeDom("vm", self.path) + + def augmentInfo(self): + def useIfNeeded(name, val): + if not self.infoIsSet(name) and val is not None: + self.info[name] = val + + params = (("name", str), + ("start-time", float)) + + from_store = self.gatherVm(*params) + + map(lambda x, y: useIfNeeded(x[0], y), params, from_store) + + + def validateInfo(self): + """Validate and normalise the info block. This has either been parsed + by parseConfig, or received from xc through recreate. + """ + def defaultInfo(name, val): + if not self.infoIsSet(name): + self.info[name] = val() + + try: + defaultInfo('name', lambda: "Domain-%d" % self.domid) + defaultInfo('restart_mode', lambda: RESTART_ONREBOOT) + defaultInfo('cpu_weight', lambda: 1.0) + defaultInfo('bootloader', lambda: None) + defaultInfo('backend', lambda: []) + defaultInfo('device', lambda: []) + + self.check_name(self.info['name']) + + # Internally, we keep only maxmem_KiB, and not maxmem or maxmem_kb + # (which come from outside, and are in MiB and KiB respectively). + # This means that any maxmem or maxmem_kb settings here have come + # from outside, and maxmem_KiB must be updated to reflect them. + # If we have both maxmem and maxmem_kb and these are not + # consistent, then this is an error, as we've no way to tell which + # one takes precedence. + + # Exactly the same thing applies to memory_KiB, memory, and + # mem_kb. 
+ + def discard_negatives(name): + if self.infoIsSet(name) and self.info[name] <= 0: + del self.info[name] + + def valid_KiB_(mb_name, kb_name): + discard_negatives(kb_name) + discard_negatives(mb_name) + + if self.infoIsSet(kb_name): + if self.infoIsSet(mb_name): + mb = self.info[mb_name] + kb = self.info[kb_name] + if mb * 1024 == kb: + return kb + else: + raise VmError( + 'Inconsistent %s / %s settings: %s / %s' % + (mb_name, kb_name, mb, kb)) + else: + return self.info[kb_name] + elif self.infoIsSet(mb_name): + return self.info[mb_name] * 1024 + else: + return None + + def valid_KiB(mb_name, kb_name): + result = valid_KiB_(mb_name, kb_name) + if result <= 0: + raise VmError('Invalid %s / %s: %s' % + (mb_name, kb_name, result)) + else: + return result + + def delIf(name): + if name in self.info: + del self.info[name] + + self.info['memory_KiB'] = valid_KiB('memory', 'mem_kb') + delIf('memory') + delIf('mem_kb') + self.info['maxmem_KiB'] = valid_KiB_('maxmem', 'maxmem_kb') + delIf('maxmem') + delIf('maxmem_kb') + + if not self.info['maxmem_KiB']: + self.info['maxmem_KiB'] = 1 << 30 + + if self.info['maxmem_KiB'] > self.info['memory_KiB']: + self.info['maxmem_KiB'] = self.info['memory_KiB'] + + # Validate the given backend names. 
+ for s in self.info['backend']: + if s not in backendFlags: + raise VmError('Invalid backend type: %s' % s) + + for (n, c) in self.info['device']: + if not n or not c or n not in controllerClasses: + raise VmError('invalid device (%s, %s)' % + (str(n), str(c))) + + if self.info['restart_mode'] not in restart_modes: + raise VmError('invalid restart mode: ' + + str(self.info['restart_mode'])) + + if 'cpumap' not in self.info: + if [self.info['vcpus'] == 1]: + self.info['cpumap'] = [1]; + else: + raise VmError('Cannot create CPU map') + + except KeyError, exn: + log.exception(exn) + raise VmError('Unspecified domain detail: %s' % str(exn)) + + def readVm(self, *args): return xstransact.Read(self.path, *args) @@ -295,20 +443,28 @@ def storeDom(self, *args): return xstransact.Store(self.path, *args) - def setDB(self, db): - self.db = db - - def saveToDB(self, save=False, sync=False): - self.db.saveDB(save=save, sync=sync) - - def exportToDB(self, save=False, sync=False): - if self.image: - self.image.exportToDB(save=save, sync=sync) - self.db.exportToDB(self, fields=self.__exports__, save=save, sync=sync) - - def importFromDB(self): - self.db.importFromDB(self, fields=self.__exports__) - self.store_channel = self.eventChannel("store/port") + + def exportToDB(self): + to_store = { + 'domid': str(self.domid), + 'uuid': self.uuid, + + 'restart_time': str(self.restart_time), + + 'xend/state': self.state, + 'xend/restart_count': str(self.restart_count), + 'xend/restart_mode': str(self.info['restart_mode']), + + 'memory/target': str(self.info['memory_KiB']) + } + + for (k, v) in self.info.items(): + to_store[k] = str(v) + + log.debug("Storing %s" % str(to_store)) + + self.writeVm(to_store) + def setDomid(self, domid): """Set the domain id. 
@@ -318,40 +474,87 @@ self.domid = domid self.storeDom("domid", self.domid) - def getDomain(self): + def getDomid(self): return self.domid def setName(self, name): - self.name = name + self.check_name(name) + self.info['name'] = name self.storeVm("name", name) def getName(self): - return self.name + return self.info['name'] + + def getPath(self): + return self.path + + def getUuid(self): + return self.uuid + + def getVCpuCount(self): + return self.info['vcpus'] + + def getSsidref(self): + return self.info['ssidref'] + + def getMemoryTarget(self): + """Get this domain's target memory size, in KiB.""" + return self.info['memory_KiB'] def setStoreRef(self, ref): self.store_mfn = ref self.storeDom("store/ring-ref", ref) - def setStoreChannel(self, channel): - if self.store_channel and self.store_channel != channel: - self.store_channel.close() - self.store_channel = channel - self.storeDom("store/port", channel.port1) + + def getBackendFlags(self): + return reduce(lambda x, y: x | backendFlags[y], + self.info['backend'], 0) + + + def closeStoreChannel(self): + """Close the store channel, if any. Nothrow guarantee.""" + + try: + if self.store_channel: + try: + self.store_channel.close() + self.removeDom("store/port") + finally: + self.store_channel = None + except Exception, exn: + log.exception(exn) + def setConsoleRef(self, ref): self.console_mfn = ref self.storeDom("console/ring-ref", ref) + def setMemoryTarget(self, target): - self.memory_target = target + """Set the memory target of this domain. + @param target In KiB. + """ + self.info['memory_KiB'] = target self.storeDom("memory/target", target) - def update(self, info=None): - """Update with info from xc.domain_getinfo(). - """ - self.info = info or dom_get(self.domid) - self.memory = self.info['mem_kb'] / 1024 - self.ssidref = self.info['ssidref'] + + def update(self, info = None): + """Update with info from xc.domain_getinfo(). 
+ """ + + log.debug("XendDomainInfo.update(%s) on domain %d", info, self.domid) + + if not info: + info = dom_get(self.domid) + if not info: + return + + self.info.update(info) + self.validateInfo() + + log.debug("XendDomainInfo.update done on domain %d: %s", self.domid, + self.info) + def state_set(self, state): self.state_updated.acquire() @@ -359,7 +562,7 @@ self.state = state self.state_updated.notifyAll() self.state_updated.release() - self.saveToDB() + self.exportToDB() def state_wait(self, state): self.state_updated.acquire() @@ -370,190 +573,83 @@ def __str__(self): s = "<domain" s += " id=" + str(self.domid) - s += " name=" + self.name - s += " memory=" + str(self.memory) - s += " ssidref=" + str(self.ssidref) + s += " name=" + self.info['name'] + s += " memory=" + str(self.info['memory_KiB'] / 1024) + s += " ssidref=" + str(self.info['ssidref']) s += ">" return s __repr__ = __str__ - def getDeviceController(self, type, error=True): - ctrl = self.controllers.get(type) - if not ctrl and error: - raise XendError("invalid device type:" + type) - return ctrl - - def findDeviceController(self, type): - return (self.getDeviceController(type, error=False) - or self.createDeviceController(type)) - - def createDeviceController(self, type): - ctrl = controller.createDevController(type, self, recreate=self.recreate) - self.controllers[type] = ctrl - return ctrl - - def createDevice(self, type, devconfig, change=False): - if self.recreate: - return - if type == 'vbd': - typedev = sxp.child_value(devconfig, 'dev') - if re.match('^ioemu:', typedev): - return; - - backdom = domain_exists(sxp.child_value(devconfig, 'backend', '0')) - - devnum = blkdev_name_to_number(sxp.child_value(devconfig, 'dev')) - - backpath = "%s/backend/%s/%s/%d" % (backdom.path, type, - self.uuid, devnum) - frontpath = "%s/device/%s/%d" % (self.path, type, devnum) - - front = { 'backend' : backpath, - 'backend-id' : "%i" % backdom.domid, - 'virtual-device' : "%i" % devnum } - 
xstransact.Write(frontpath, front) - - (type, params) = string.split(sxp.child_value(devconfig, - 'uname'), ':', 1) - back = { 'type' : type, - 'params' : params, - 'frontend' : frontpath, - 'frontend-id' : "%i" % self.domid } - xstransact.Write(backpath, back) - - return - - if type == 'vif': - from xen.xend import XendRoot - xroot = XendRoot.instance() - - def _get_config_ipaddr(config): - val = [] - for ipaddr in sxp.children(config, elt='ip'): - val.append(sxp.child0(ipaddr)) - return val - - backdom = domain_exists(sxp.child_value(devconfig, 'backend', '0')) - - devnum = self.netif_idx - self.netif_idx += 1 - - script = sxp.child_value(devconfig, 'script', - xroot.get_vif_script()) - script = os.path.join(xroot.network_script_dir, script) - bridge = sxp.child_value(devconfig, 'bridge', - xroot.get_vif_bridge()) - mac = sxp.child_value(devconfig, 'mac') - ipaddr = _get_config_ipaddr(devconfig) - - backpath = "%s/backend/%s/%s/%d" % (backdom.path, type, - self.uuid, devnum) - frontpath = "%s/device/%s/%d" % (self.path, type, devnum) - - front = { 'backend' : backpath, - 'backend-id' : "%i" % backdom.domid, - 'handle' : "%i" % devnum, - 'mac' : mac } - xstransact.Write(frontpath, front) - - back = { 'script' : script, - 'domain' : self.name, - 'mac' : mac, - 'bridge' : bridge, - 'frontend' : frontpath, - 'frontend-id' : "%i" % self.domid, - 'handle' : "%i" % devnum } - if ipaddr: - back['ip'] = ' '.join(ipaddr) - xstransact.Write(backpath, back) - - return - - if type == 'vtpm': - backdom = domain_exists(sxp.child_value(devconfig, 'backend', '0')) - - devnum = int(sxp.child_value(devconfig, 'instance', '0')) - log.error("The domain has a TPM with instance %d." 
% devnum) - - backpath = "%s/backend/%s/%s/%d" % (backdom.path, type, - self.uuid, devnum) - frontpath = "%s/device/%s/%d" % (self.path, type, devnum) - - front = { 'backend' : backpath, - 'backend-id' : "%i" % backdom.domid, - 'handle' : "%i" % devnum } - xstransact.Write(frontpath, front) - - back = { 'instance' : "%i" % devnum, - 'frontend' : frontpath, - 'frontend-id' : "%i" % self.domid } - xstransact.Write(backpath, back) - - return - - ctrl = self.findDeviceController(type) - return ctrl.createDevice(devconfig, recreate=self.recreate, - change=change) - - def configureDevice(self, type, id, devconfig): - ctrl = self.getDeviceController(type) - return ctrl.configureDevice(id, devconfig) - - def destroyDevice(self, type, id, change=False, reboot=False): - ctrl = self.getDeviceController(type) - return ctrl.destroyDevice(id, change=change, reboot=reboot) - - def deleteDevice(self, type, id): - ctrl = self.getDeviceController(type) - return ctrl.deleteDevice(id) - - def getDevice(self, type, id, error=True): - ctrl = self.getDeviceController(type) - return ctrl.getDevice(id, error=error) - - def getDeviceIds(self, type): - ctrl = self.getDeviceController(type) - return ctrl.getDeviceIds() - - def getDeviceSxprs(self, type): - ctrl = self.getDeviceController(type) - return ctrl.getDeviceSxprs() + + def getDeviceController(self, name): + if name not in controllerClasses: + raise XendError("unknown device type: " + str(name)) + + return controllerClasses[name](self) + + + def createDevice(self, deviceClass, devconfig): + return self.getDeviceController(deviceClass).createDevice(devconfig) + + + def configureDevice(self, deviceClass, devid, devconfig): + return self.getDeviceController(deviceClass).configureDevice( + devid, devconfig) + + + def destroyDevice(self, deviceClass, devid): + return self.getDeviceController(deviceClass).destroyDevice(devid) + def sxpr(self): sxpr = ['domain', ['domid', self.domid], - ['name', self.name], - ['memory', self.memory], - 
['ssidref', self.ssidref], - ['target', self.target] ] + ['name', self.info['name']], + ['memory', self.info['memory_KiB'] / 1024], + ['ssidref', self.info['ssidref']]] if self.uuid: sxpr.append(['uuid', self.uuid]) if self.info: - sxpr.append(['maxmem', self.info['maxmem_kb']/1024 ]) - run = (self.info['running'] and 'r') or '-' - block = (self.info['blocked'] and 'b') or '-' - pause = (self.info['paused'] and 'p') or '-' - shut = (self.info['shutdown'] and 's') or '-' - crash = (self.info['crashed'] and 'c') or '-' - state = run + block + pause + shut + crash + sxpr.append(['maxmem', self.info['maxmem_KiB'] / 1024]) + + if self.infoIsSet('device'): + for (n, c) in self.info['device']: + sxpr.append(['device', c]) + + def stateChar(name): + if name in self.info: + if self.info[name]: + return name[0] + else: + return '-' + else: + return '?' + + state = reduce( + lambda x, y: x + y, + map(stateChar, + ['running', 'blocked', 'paused', 'shutdown', 'crashed'])) + sxpr.append(['state', state]) - if self.info['shutdown']: + if self.infoIsSet('shutdown'): reason = shutdown_reason(self.info['shutdown_reason']) sxpr.append(['shutdown_reason', reason]) - sxpr.append(['cpu', self.info['vcpu_to_cpu'][0]]) - sxpr.append(['cpu_time', self.info['cpu_time']/1e9]) + if self.infoIsSet('cpu_time'): + sxpr.append(['cpu_time', self.info['cpu_time']/1e9]) sxpr.append(['vcpus', self.info['vcpus']]) sxpr.append(['cpumap', self.info['cpumap']]) - # build a string, using '|' to seperate items, show only up - # to number of vcpus in domain, and trim the trailing '|' - sxpr.append(['vcpu_to_cpu', ''.join(map(lambda x: str(x)+'|', - self.info['vcpu_to_cpu'][0:self.info['vcpus']]))[:-1]]) + if self.infoIsSet('vcpu_to_cpu'): + sxpr.append(['cpu', self.info['vcpu_to_cpu'][0]]) + # build a string, using '|' to separate items, show only up + # to number of vcpus in domain, and trim the trailing '|' + sxpr.append(['vcpu_to_cpu', ''.join(map(lambda x: str(x)+'|', + 
self.info['vcpu_to_cpu'][0:self.info['vcpus']]))[:-1]]) - if self.start_time: - up_time = time.time() - self.start_time + if self.infoIsSet('start_time'): + up_time = time.time() - self.info['start_time'] sxpr.append(['up_time', str(up_time) ]) - sxpr.append(['start_time', str(self.start_time) ]) + sxpr.append(['start_time', str(self.info['start_time']) ]) if self.store_channel: sxpr.append(self.store_channel.sxpr()) @@ -563,36 +659,12 @@ sxpr.append(['console_channel', self.console_channel.sxpr()]) if self.console_mfn: sxpr.append(['console_mfn', self.console_mfn]) -# already in (devices) -# console = self.getConsole() -# if console: -# sxpr.append(console.sxpr()) - if self.restart_count: sxpr.append(['restart_count', self.restart_count]) if self.restart_state: sxpr.append(['restart_state', self.restart_state]) if self.restart_time: sxpr.append(['restart_time', str(self.restart_time)]) - - devs = self.sxpr_devices() - if devs: - sxpr.append(devs) - if self.config: - sxpr.append(['config', self.config]) - if self.device_model_pid: - sxpr.append(['device_model_pid',self.device_model_pid]) - return sxpr - - def sxpr_devices(self): - sxpr = [] - for ty in self.controllers.keys(): - devs = self.getDeviceSxprs(ty) - sxpr += devs - if sxpr: - sxpr.insert(0, 'devices') - else: - sxpr = None return sxpr def check_name(self, name): @@ -601,9 +673,8 @@ The same name cannot be used for more than one vm at the same time. 
@param name: name - @raise: VMerror if invalid - """ - if self.recreate: return + @raise: VmError if invalid + """ if name is None or name == '': raise VmError('missing vm name') for c in name: @@ -619,33 +690,35 @@ return if dominfo.is_terminated(): return - if not self.domid or (dominfo.domid != self.domid): - raise VmError('vm name clash: ' + name) - - def construct(self, config): + if self.domid is None: + raise VmError("VM name '%s' already in use by domain %d" % + (name, dominfo.domid)) + if dominfo.domid != self.domid: + raise VmError("VM name '%s' is used in both domains %d and %d" % + (name, self.domid, dominfo.domid)) + + + def construct(self): """Construct the vm instance from its configuration. @param config: configuration @raise: VmError on error """ # todo - add support for scheduling params? - self.config = config try: - # Initial domain create. - self.setName(sxp.child_value(config, 'name')) - self.check_name(self.name) - self.init_image() - self.configure_cpus(config) - self.init_domain() - self.register_domain() - self.configure_bootloader() + if 'image' not in self.info: + raise VmError('Missing image in configuration') + + self.image = ImageHandler.create(self, + self.info['image'], + self.info['device']) + + self.initDomain() # Create domain devices. - self.configure_backends() - self.configure_restart() self.construct_image() self.configure() - self.exportToDB(save=True) + self.exportToDB() except Exception, ex: # Catch errors, cleanup and re-raise. 
print 'Domain construction error:', ex @@ -654,45 +727,43 @@ self.destroy() raise - def register_domain(self): - xd = get_component('xen.xend.XendDomain') - xd._add_domain(self) - self.exportToDB(save=True) - - def configure_cpus(self, config): - try: - self.cpu_weight = float(sxp.child_value(config, 'cpu_weight', '1')) - except: - raise VmError('invalid cpu weight') - self.memory = int(sxp.child_value(config, 'memory')) - if self.memory is None: - raise VmError('missing memory size') - self.setMemoryTarget(self.memory * (1 << 20)) - self.ssidref = int(sxp.child_value(config, 'ssidref')) - cpu = sxp.child_value(config, 'cpu') - if self.recreate and self.domid and cpu is not None and int(cpu) >= 0: - xc.domain_pincpu(self.domid, 0, 1<<int(cpu)) - try: - image = sxp.child_value(self.config, 'image') - vcpus = sxp.child_value(image, 'vcpus') - if vcpus: - self.vcpus = int(vcpus) - except: - raise VmError('invalid vcpus value') + + def initDomain(self): + log.debug('XendDomainInfo.initDomain: %s %s %s %s)', + str(self.domid), + str(self.info['memory_KiB']), + str(self.info['ssidref']), + str(self.info['cpu_weight'])) + + self.domid = xc.domain_create(dom = self.domid or 0, + ssidref = self.info['ssidref']) + if self.domid <= 0: + raise VmError('Creating domain failed: name=%s' % + self.info['name']) + + if self.info['bootloader']: + self.image.handleBootloading() + + xc.domain_setcpuweight(self.domid, self.info['cpu_weight']) + m = self.image.getDomainMemory(self.info['memory_KiB']) + xc.domain_setmaxmem(self.domid, m) + xc.domain_memory_increase_reservation(self.domid, m, 0, 0) + + cpu = self.info['cpu'] + if cpu is not None and cpu != -1: + xc.domain_pincpu(self.domid, 0, 1 << cpu) + + self.info['start_time'] = time.time() + + log.debug('init_domain> Created domain=%d name=%s memory=%d', + self.domid, self.info['name'], self.info['memory_KiB']) + def configure_vcpus(self, vcpus): d = {} for v in range(0, vcpus): d["cpu/%d/availability" % v] = "online" self.writeVm(d) 
- - def init_image(self): - """Create boot image handler for the domain. - """ - image = sxp.child_value(self.config, 'image') - if image is None: - raise VmError('missing image') - self.image = ImageHandler.create(self, image) def construct_image(self): """Construct the boot image for the domain. @@ -704,23 +775,17 @@ IntroduceDomain(self.domid, self.store_mfn, self.store_channel.port1, self.path) # get the configured value of vcpus and update store - self.configure_vcpus(self.vcpus) + self.configure_vcpus(self.info['vcpus']) + def delete(self): """Delete the vm's db. """ - if dom_get(self.domid): - return - self.domid = None - self.saveToDB(sync=True) try: - # Todo: eventually will have to wait for devices to signal - # destruction before can delete the db. - if self.db: - self.db.delete() + xstransact.Remove(self.path, 'domid') except Exception, ex: log.warning("error in domain db delete: %s", ex) - pass + def destroy_domain(self): """Destroy the vm's domain. @@ -732,17 +797,16 @@ try: xc.domain_destroy(dom=self.domid) except Exception, err: - log.exception("Domain destroy failed: %s", self.name) + log.exception("Domain destroy failed: %s", self.info['name']) def cleanup(self): """Cleanup vm resources: release devices. """ self.state = STATE_VM_TERMINATED self.release_devices() - if self.store_channel: - self.setStoreChannel(None) + self.closeStoreChannel() if self.console_channel: - # notify processes using this cosole? + # notify processes using this console? try: self.console_channel.close() self.console_channel = None @@ -750,18 +814,20 @@ pass if self.image: try: - self.device_model_pid = 0 self.image.destroy() self.image = None except: pass def destroy(self): - """Clenup vm and destroy domain. - """ + """Cleanup vm and destroy domain. 
+ """ + + log.debug("XendDomainInfo.destroy") + self.destroy_domain() self.cleanup() - self.saveToDB() + self.exportToDB() return 0 def is_terminated(self): @@ -772,43 +838,21 @@ def release_devices(self): """Release all vm devices. """ - reboot = self.restart_pending() - for ctrl in self.controllers.values(): - if ctrl.isDestroyed(): continue - ctrl.destroyController(reboot=reboot) + t = xstransact("%s/device" % self.path) - for d in t.list("vbd"): - t.remove(d) - for d in t.list("vif"): - t.remove(d) - for d in t.list("vtpm"): - t.remove(d) + + for n in controllerClasses.keys(): + for d in t.list(n): + try: + t.remove(d) + except ex: + # Log and swallow any exceptions in removal -- there's + # nothing more we can do. + log.exception( + "Device release failed: %s; %s; %s; %s" % + (self.info['name'], n, d, str(ex))) t.commit() - def show(self): - """Print virtual machine info. - """ - print "[VM dom=%d name=%s memory=%d ssidref=%d" % (self.domid, self.name, self.memory, self.ssidref) - print "image:" - sxp.show(self.image) - print "]" - - def init_domain(self): - """Initialize the domain memory. - """ - if self.recreate: - return - if self.start_time is None: - self.start_time = time.time() - self.storeVm(("start-time", self.start_time)) - try: - cpu = int(sxp.child_value(self.config, 'cpu', '-1')) - except: - raise VmError('invalid cpu') - id = self.image.initDomain(self.domid, self.memory, self.ssidref, cpu, self.cpu_weight) - log.debug('init_domain> Created domain=%d name=%s memory=%d', - id, self.name, self.memory) - self.setDomid(id) def eventChannel(self, path=None): """Create an event channel to the domain. 
@@ -833,17 +877,8 @@ self.console_channel = self.eventChannel("console/port") def create_configured_devices(self): - devices = sxp.children(self.config, 'device') - for d in devices: - dev_config = sxp.child0(d) - if dev_config is None: - raise VmError('invalid device') - dev_type = sxp.name(dev_config) - - if not controller.isDevControllerClass(dev_type): - raise VmError('unknown device type: ' + dev_type) - - self.createDevice(dev_type, dev_config) + for (n, c) in self.info['device']: + self.createDevice(n, c) def create_devices(self): @@ -851,13 +886,10 @@ @raise: VmError for invalid devices """ - if self.rebooting(): - for ctrl in self.controllers.values(): - ctrl.initController(reboot=True) - else: + if not self.rebooting(): self.create_configured_devices() - if not self.device_model_pid: - self.device_model_pid = self.image.createDeviceModel() + if self.image: + self.image.createDeviceModel() def device_create(self, dev_config): """Create a new device. @@ -865,60 +897,19 @@ @param dev_config: device configuration """ dev_type = sxp.name(dev_config) - dev = self.createDevice(dev_type, dev_config, change=True) - self.config.append(['device', dev.getConfig()]) - return dev.sxpr() - - def device_configure(self, dev_config, id): + devid = self.createDevice(dev_type, dev_config) +# self.config.append(['device', dev.getConfig()]) + return self.getDeviceController(dev_type).sxpr(devid) + + + def device_configure(self, dev_config, devid): """Configure an existing device. - @param dev_config: device configuration - @param id: device id - """ - type = sxp.name(dev_config) - dev = self.getDevice(type, id) - old_config = dev.getConfig() - new_config = dev.configure(dev_config, change=True) - # Patch new config into vm config. 
- new_full_config = ['device', new_config] - old_full_config = ['device', old_config] - old_index = self.config.index(old_full_config) - self.config[old_index] = new_full_config - return new_config - - def device_refresh(self, type, id): - """Refresh a device. - - @param type: device type - @param id: device id - """ - dev = self.getDevice(type, id) - dev.refresh() - - def device_delete(self, type, id): - """Destroy and remove a device. - - @param type: device type - @param id: device id - """ - dev = self.getDevice(type, id) - dev_config = dev.getConfig() - if dev_config: - self.config.remove(['device', dev_config]) - self.deleteDevice(type, dev.getId()) - - def configure_bootloader(self): - """Configure boot loader. - """ - self.bootloader = sxp.child_value(self.config, "bootloader") - - def configure_restart(self): - """Configure the vm restart mode. - """ - r = sxp.child_value(self.config, 'restart', RESTART_ONREBOOT) - if r not in restart_modes: - raise VmError('invalid restart mode: ' + str(r)) - self.restart_mode = r; + @param devid: device id + """ + deviceClass = sxp.name(dev_config) + self.configureDevice(deviceClass, devid, dev_config) + def restart_needed(self, reason): """Determine if the vm needs to be restarted when shutdown @@ -927,11 +918,11 @@ @param reason: shutdown reason @return True if needs restart, False otherwise """ - if self.restart_mode == RESTART_NEVER: + if self.info['restart_mode'] == RESTART_NEVER: return False - if self.restart_mode == RESTART_ALWAYS: + if self.info['restart_mode'] == RESTART_ALWAYS: return True - if self.restart_mode == RESTART_ONREBOOT: + if self.info['restart_mode'] == RESTART_ONREBOOT: return reason == 'reboot' return False @@ -963,7 +954,7 @@ tdelta = tnow - self.restart_time if tdelta < self.MINIMUM_RESTART_TIME: self.restart_cancel() - msg = 'VM %s restarting too fast' % self.name + msg = 'VM %s restarting too fast' % self.info['name'] log.error(msg) raise VmError(msg) self.restart_time = tnow @@ -981,14 
+972,15 @@ self.restart_check() self.exportToDB() self.restart_state = STATE_RESTART_BOOTING - if self.bootloader: - self.config = self.bootloader_config() - self.construct(self.config) - self.saveToDB() + self.configure_bootloader() + self.construct() + self.exportToDB() finally: self.restart_state = None - def bootloader_config(self): + def configure_bootloader(self): + if not self.info['bootloader']: + return # if we're restarting with a bootloader, we need to run it # FIXME: this assumes the disk is the first device and # that we're booting from the first disk @@ -998,72 +990,30 @@ if dev: disk = sxp.child_value(dev, "uname") fn = blkdev_uname_to_file(disk) - blcfg = bootloader(self.bootloader, fn, 1, self.vcpus) + blcfg = bootloader(self.info['bootloader'], fn, 1, self.info['vcpus']) if blcfg is None: msg = "Had a bootloader specified, but can't find disk" log.error(msg) raise VmError(msg) - config = sxp.merge(['vm', blcfg ], self.config) - return config - - def configure_backends(self): - """Set configuration flags if the vm is a backend for netif or blkif. - Configure the backends to use for vbd and vif if specified. - """ - for c in sxp.children(self.config, 'backend'): - v = sxp.child0(c) - name = sxp.name(v) - if name == 'blkif': - self.blkif_backend = True - elif name == 'netif': - self.netif_backend = True - elif name == 'usbif': - self.usbif_backend = True - elif name == 'tpmif': - self.tpmif_backend = True - else: - raise VmError('invalid backend type:' + str(name)) + self.config = sxp.merge(['vm', ['image', blcfg]], self.config) + def configure(self): """Configure a vm. """ - self.configure_fields() + self.configure_maxmem() self.create_devices() - self.create_blkif() - - def create_blkif(self): - """Create the block device interface (blkif) for the vm. - The vm needs a blkif even if it doesn't have any disks - at creation time, for example when it uses NFS root. 
- - """ - return - blkif = self.getDeviceController("vbd", error=False) - if not blkif: - blkif = self.createDeviceController("vbd") - backend = blkif.getBackend(0) - backend.connect(recreate=self.recreate) - - def configure_fields(self): - """Process the vm configuration fields using the registered handlers. - """ - index = {} - for field in sxp.children(self.config): - field_name = sxp.name(field) - field_index = index.get(field_name, 0) - field_handler = config_handlers.get(field_name) - # Ignore unknown fields. Warn? - if field_handler: - v = field_handler(self, self.config, field, field_index) - else: - log.warning("Unknown config field %s", field_name) - index[field_name] = field_index + 1 + + + def configure_maxmem(self): + xc.domain_setmaxmem(self.domid, maxmem_kb = self.info['maxmem_KiB']) + def vcpu_hotplug(self, vcpu, state): """Disable or enable VCPU in domain. """ - if vcpu > self.vcpus: + if vcpu > self.info['vcpus']: log.error("Invalid VCPU %d" % vcpu) return if int(state) == 0: @@ -1109,26 +1059,29 @@ # get run-time value of vcpus and update store self.configure_vcpus(dom_get(self.domid)['vcpus']) - -def vm_field_ignore(_, _1, _2, _3): - """Dummy config field handler used for fields with built-in handling. - Matches the signature required by config_handlers. - """ - pass - - -def vm_field_maxmem(vm, _1, val, _2): - """Config field handler to configure vm memory limit. Matches the - signature required by config_handlers. 
- """ - maxmem = sxp.child0(val) - if maxmem is None: - maxmem = vm.memory - try: - maxmem = int(maxmem) - except: - raise VmError("invalid maxmem: " + str(maxmem)) - xc.domain_setmaxmem(vm.domid, maxmem_kb = maxmem * 1024) + def dom0_enforce_vcpus(self): + dom = 0 + # get max number of vcpus to use for dom0 from config + from xen.xend import XendRoot + xroot = XendRoot.instance() + target = int(xroot.get_dom0_vcpus()) + log.debug("number of vcpus to use is %d" % (target)) + + # target = 0 means use all processors + if target > 0: + # count the number of online vcpus (cpu values in v2c map >= 0) + vcpu_to_cpu = dom_get(dom)['vcpu_to_cpu'] + vcpus_online = len(filter(lambda x: x >= 0, vcpu_to_cpu)) + log.debug("found %d vcpus online" % (vcpus_online)) + + # disable any extra vcpus that are online over the requested target + for vcpu in range(target, vcpus_online): + log.info("enforcement is disabling DOM%d VCPU%d" % (dom, vcpu)) + self.vcpu_hotplug(vcpu, 0) + + + def infoIsSet(self, name): + return name in self.info and self.info[name] is not None #============================================================================ @@ -1144,37 +1097,32 @@ addImageHandlerClass(VmxImageHandler) -"""Table of handlers for field configuration. - -field_name[String]: fn(vm, config, field, index) -> value(ignored) -""" -config_handlers = { - - # Ignore the fields we already handle. - - 'name': vm_field_ignore, - 'memory': vm_field_ignore, - 'ssidref': vm_field_ignore, - 'cpu': vm_field_ignore, - 'cpu_weight': vm_field_ignore, - 'restart': vm_field_ignore, - 'image': vm_field_ignore, - 'device': vm_field_ignore, - 'backend': vm_field_ignore, - 'vcpus': vm_field_ignore, - 'bootloader': vm_field_ignore, - - # Register other config handlers. - 'maxmem': vm_field_maxmem - } - - #============================================================================ # Register device controllers and their device config types. 
+"""A map from device-class names to the subclass of DevController that +implements the device control specific to that device-class.""" +controllerClasses = {} + + +"""A map of backend names and the corresponding flag.""" +backendFlags = {} + + +def addControllerClass(device_class, backend_name, backend_flag, cls): + """Register a subclass of DevController to handle the named device-class. + + @param backend_flag One of the SIF_XYZ_BE_DOMAIN constants, or None if + no flag is to be set. + """ + cls.deviceClass = device_class + backendFlags[backend_name] = backend_flag + controllerClasses[device_class] = cls + + from xen.xend.server import blkif, netif, tpmif, pciif, usbif -controller.addDevControllerClass("vbd", blkif.BlkifController) -controller.addDevControllerClass("vif", netif.NetifController) -controller.addDevControllerClass("vtpm", tpmif.TPMifController) -controller.addDevControllerClass("pci", pciif.PciController) -controller.addDevControllerClass("usb", usbif.UsbifController) +addControllerClass('vbd', 'blkif', SIF_BLK_BE_DOMAIN, blkif.BlkifController) +addControllerClass('vif', 'netif', SIF_NET_BE_DOMAIN, netif.NetifController) +addControllerClass('vtpm', 'tpmif', SIF_TPM_BE_DOMAIN, tpmif.TPMifController) +addControllerClass('pci', 'pciif', None, pciif.PciController) +addControllerClass('usb', 'usbif', None, usbif.UsbifController) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/XendLogging.py --- a/tools/python/xen/xend/XendLogging.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/XendLogging.py Thu Sep 22 17:42:01 2005 @@ -50,9 +50,6 @@ self.getLogger().setLevel(level) self.level = level - def getLevel(self, level): - return logging.getLevelName(self.level) - def getLogger(self): return logging.getLogger("xend") @@ -65,8 +62,7 @@ backupCount=self.backupCount) self.logfilename = filename self.logfile.setFormatter(Formatter(self.logFileFormat, self.dateFormat)) - log = self.getLogger() - log.addHandler(self.logfile) + 
self.getLogger().addHandler(self.logfile) def getLogFile(self): return self.logfile diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/XendNode.py --- a/tools/python/xen/xend/XendNode.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/XendNode.py Thu Sep 22 17:42:01 2005 @@ -36,7 +36,7 @@ def reboot(self): return 0 - def notify(self, uri): + def notify(self, _): return 0 def cpu_bvt_slice_set(self, ctx_allow): diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/XendProtocol.py --- a/tools/python/xen/xend/XendProtocol.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/XendProtocol.py Thu Sep 22 17:42:01 2005 @@ -22,7 +22,7 @@ from encode import * import sxp -from xen.xend import XendRoot; xroot = XendRoot.instance() +from xen.xend import XendRoot DEBUG = 0 @@ -30,6 +30,10 @@ HTTP_CREATED = 201 HTTP_ACCEPTED = 202 HTTP_NO_CONTENT = 204 + + +xroot = XendRoot.instance() + class XendError(RuntimeError): """Error class for 'expected errors' when talking to xend. diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/XendRoot.py --- a/tools/python/xen/xend/XendRoot.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/XendRoot.py Thu Sep 22 17:42:01 2005 @@ -87,7 +87,7 @@ dom0_min_mem_default = '0' - dom0_cpus_default = '0' + dom0_vcpus_default = '0' components = {} @@ -332,8 +332,8 @@ def get_dom0_min_mem(self): return self.get_config_int('dom0-min-mem', self.dom0_min_mem_default) - def get_dom0_cpus(self): - return self.get_config_int('dom0-cpus', self.dom0_cpus_default) + def get_dom0_vcpus(self): + return self.get_config_int('dom0-cpus', self.dom0_vcpus_default) def instance(): """Get an instance of XendRoot. 
diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/encode.py --- a/tools/python/xen/xend/encode.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/encode.py Thu Sep 22 17:42:01 2005 @@ -26,7 +26,6 @@ from StringIO import StringIO import urllib -import httplib import random import md5 @@ -104,7 +103,7 @@ val = ({}, None) if d is None: return val multipart = 0 - for (k, v) in data_values(d): + for (_, v) in data_values(d): if encode_isfile(v): multipart = 1 break @@ -156,7 +155,7 @@ def mime_boundary(): random.seed() m = md5.new() - for i in range(0, 10): + for _ in range(0, 10): c = chr(random.randint(1, 255)) m.update(c) b = m.hexdigest() diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/image.py --- a/tools/python/xen/xend/image.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/image.py Thu Sep 22 17:42:01 2005 @@ -13,34 +13,29 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #============================================================================ # Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +# Copyright (C) 2005 XenSource Ltd #============================================================================ + import os, string import re -import xen.lowlevel.xc; xc = xen.lowlevel.xc.new() +import xen.lowlevel.xc from xen.xend import sxp from xen.xend.XendError import VmError from xen.xend.XendLogging import log -from xen.xend.xenstore import DBVar -from xen.xend.xenstore.xstransact import xstransact from xen.xend.server import channel -"""Flag for a block device backend domain.""" -SIF_BLK_BE_DOMAIN = (1<<4) - -"""Flag for a net device backend domain.""" -SIF_NET_BE_DOMAIN = (1<<5) - -"""Flag for a TPM device backend domain.""" -SIF_TPM_BE_DOMAIN = (1<<7) + +xc = xen.lowlevel.xc.new() + + +MAX_GUEST_CMDLINE = 1024 class ImageHandler: """Abstract base class for image handlers. - initDomain() is called to initialise the domain memory. 
- createImage() is called to configure and build the domain from its kernel image and ramdisk etc. @@ -88,49 +83,57 @@ findImageHandlerClass = classmethod(findImageHandlerClass) - def create(cls, vm, image): + def create(cls, vm, imageConfig, deviceConfig): """Create an image handler for a vm. - @param vm vm - @param image image config @return ImageHandler instance """ - imageClass = cls.findImageHandlerClass(image) - return imageClass(vm, image) + imageClass = cls.findImageHandlerClass(imageConfig) + return imageClass(vm, imageConfig, deviceConfig) create = classmethod(create) #====================================================================== # Instance vars and methods. - db = None ostype = None - config = None kernel = None ramdisk = None cmdline = None + flags = 0 - __exports__ = [ - DBVar('ostype', ty='str'), - DBVar('config', ty='sxpr'), - DBVar('kernel', ty='str'), - DBVar('ramdisk', ty='str'), - DBVar('cmdline', ty='str'), - DBVar('flags', ty='int'), - ] - - def __init__(self, vm, config): + def __init__(self, vm, imageConfig, deviceConfig): self.vm = vm - self.db = vm.db.addChild('/image') - self.config = config - - def exportToDB(self, save=False, sync=False): - self.db.exportToDB(self, fields=self.__exports__, save=save, sync=sync) - - def importFromDB(self): - self.db.importFromDB(self, fields=self.__exports__) + self.configure(imageConfig, deviceConfig) + + def configure(self, imageConfig, _): + """Config actions common to all unix-like domains.""" + + self.kernel = sxp.child_value(imageConfig, "kernel") + self.cmdline = "" + ip = sxp.child_value(imageConfig, "ip", None) + if ip: + self.cmdline += " ip=" + ip + root = sxp.child_value(imageConfig, "root") + if root: + self.cmdline += " root=" + root + args = sxp.child_value(imageConfig, "args") + if args: + self.cmdline += " " + args + self.ramdisk = sxp.child_value(imageConfig, "ramdisk", '') + + self.vm.storeVm(("image/ostype", self.ostype), + ("image/kernel", self.kernel), + ("image/cmdline", 
self.cmdline), + ("image/ramdisk", self.ramdisk)) + + + def handleBootloading(): + self.unlink(self.kernel) + self.unlink(self.ramdisk) + def unlink(self, f): if not f: return @@ -139,94 +142,39 @@ except OSError, ex: log.warning("error removing bootloader file '%s': %s", f, ex) - def initDomain(self, dom, memory, ssidref, cpu, cpu_weight): - """Initial domain create. - - @return domain id - """ - - mem_kb = self.getDomainMemory(memory) - if not self.vm.restore: - dom = xc.domain_create(dom = dom or 0, ssidref = ssidref) - # if bootloader, unlink here. But should go after buildDomain() ? - if self.vm.bootloader: - self.unlink(self.kernel) - self.unlink(self.ramdisk) - if dom <= 0: - raise VmError('Creating domain failed: name=%s' % self.vm.name) - log.debug("initDomain: cpu=%d mem_kb=%d ssidref=%d dom=%d", cpu, mem_kb, ssidref, dom) - xc.domain_setcpuweight(dom, cpu_weight) - xc.domain_setmaxmem(dom, mem_kb) - - try: - # Give the domain some memory below 4GB - lmem_kb = 0 - if lmem_kb > 0: - xc.domain_memory_increase_reservation(dom, min(lmem_kb,mem_kb), 0, 32) - if mem_kb > lmem_kb: - xc.domain_memory_increase_reservation(dom, mem_kb-lmem_kb, 0, 0) - except: - xc.domain_destroy(dom) - raise - - if cpu != -1: - xc.domain_pincpu(dom, 0, 1<<int(cpu)) - return dom def createImage(self): """Entry point to create domain memory image. Override in subclass if needed. """ - self.configure() self.createDomain() - def configure(self): - """Config actions common to all unix-like domains.""" - self.kernel = sxp.child_value(self.config, "kernel") - self.cmdline = "" - ip = sxp.child_value(self.config, "ip", None) - if ip: - self.cmdline += " ip=" + ip - root = sxp.child_value(self.config, "root") - if root: - self.cmdline += " root=" + root - args = sxp.child_value(self.config, "args") - if args: - self.cmdline += " " + args - self.ramdisk = sxp.child_value(self.config, "ramdisk", '') - def createDomain(self): """Build the domain boot image. 
""" # Set params and call buildDomain(). - self.flags = 0 - if self.vm.netif_backend: self.flags |= SIF_NET_BE_DOMAIN - if self.vm.blkif_backend: self.flags |= SIF_BLK_BE_DOMAIN - if self.vm.tpmif_backend: self.flags |= SIF_TPM_BE_DOMAIN - - if self.vm.recreate or self.vm.restore: - return + self.flags = self.vm.getBackendFlags() + if not os.path.isfile(self.kernel): raise VmError('Kernel image does not exist: %s' % self.kernel) if self.ramdisk and not os.path.isfile(self.ramdisk): raise VmError('Kernel ramdisk does not exist: %s' % self.ramdisk) - if len(self.cmdline) >= 256: - log.warning('kernel cmdline too long, domain %d', self.vm.getDomain()) + if len(self.cmdline) >= MAX_GUEST_CMDLINE: + log.warning('kernel cmdline too long, domain %d', + self.vm.getDomid()) log.info("buildDomain os=%s dom=%d vcpus=%d", self.ostype, - self.vm.getDomain(), self.vm.vcpus) + self.vm.getDomid(), self.vm.getVCpuCount()) err = self.buildDomain() if err != 0: raise VmError('Building domain failed: ostype=%s dom=%d err=%d' - % (self.ostype, self.vm.getDomain(), err)) - - def getDomainMemory(self, mem_mb): - """Memory (in KB) the domain will need for mem_mb (in MB).""" - if os.uname()[4] == 'ia64': - """Append extra system pages, like xenstore and console""" - return (mem_mb * 1024 + 3 * 16) - else: - return mem_mb * 1024 + % (self.ostype, self.vm.getDomid(), err)) + + def getDomainMemory(self, mem): + """@return The memory required, in KiB, by the domain to store the + given amount, also in KiB. This is normally just mem, but VMX domains + have overheads to account for.""" + return mem def buildDomain(self): """Build the domain. 
Define in subclass.""" @@ -262,23 +210,23 @@ else: console_evtchn = 0 - log.debug("dom = %d", self.vm.getDomain()) + log.debug("dom = %d", self.vm.getDomid()) log.debug("image = %s", self.kernel) log.debug("store_evtchn = %d", store_evtchn) log.debug("console_evtchn = %d", console_evtchn) log.debug("cmdline = %s", self.cmdline) log.debug("ramdisk = %s", self.ramdisk) log.debug("flags = %d", self.flags) - log.debug("vcpus = %d", self.vm.vcpus) - - ret = xc.linux_build(dom = self.vm.getDomain(), + log.debug("vcpus = %d", self.vm.getVCpuCount()) + + ret = xc.linux_build(dom = self.vm.getDomid(), image = self.kernel, store_evtchn = store_evtchn, console_evtchn = console_evtchn, cmdline = self.cmdline, ramdisk = self.ramdisk, flags = self.flags, - vcpus = self.vm.vcpus) + vcpus = self.vm.getVCpuCount()) if isinstance(ret, dict): self.set_vminfo(ret) return 0 @@ -286,49 +234,72 @@ class VmxImageHandler(ImageHandler): - __exports__ = ImageHandler.__exports__ + [ - DBVar('memmap', ty='str'), - DBVar('memmap_value', ty='sxpr'), - # device channel? - ] - ostype = "vmx" - memmap = None - memmap_value = [] - device_channel = None - pid = 0 + + def configure(self, imageConfig, deviceConfig): + ImageHandler.configure(self, imageConfig, deviceConfig) + + self.memmap = sxp.child_value(imageConfig, 'memmap') + self.dmargs = self.parseDeviceModelArgs(imageConfig, deviceConfig) + self.device_model = sxp.child_value(imageConfig, 'device_model') + if not self.device_model: + raise VmError("vmx: missing device model") + self.display = sxp.child_value(imageConfig, 'display') + + self.vm.storeVm(("image/memmap", self.memmap), + ("image/dmargs", " ".join(self.dmargs)), + ("image/device-model", self.device_model), + ("image/display", self.display)) + + self.device_channel = None + self.pid = 0 + self.memmap_value = [] + + self.dmargs += self.configVNC(imageConfig) + + def createImage(self): """Create a VM for the VMX environment. 
""" - self.configure() self.parseMemmap() self.createDomain() def buildDomain(self): # Create an event channel - self.device_channel = channel.eventChannel(0, self.vm.getDomain()) + self.device_channel = channel.eventChannel(0, self.vm.getDomid()) log.info("VMX device model port: %d", self.device_channel.port2) if self.vm.store_channel: store_evtchn = self.vm.store_channel.port2 else: store_evtchn = 0 - ret = xc.vmx_build(dom = self.vm.getDomain(), - image = self.kernel, - control_evtchn = self.device_channel.port2, - store_evtchn = store_evtchn, - memsize = self.vm.memory, - memmap = self.memmap_value, - cmdline = self.cmdline, - ramdisk = self.ramdisk, - flags = self.flags, - vcpus = self.vm.vcpus) + + log.debug("dom = %d", self.vm.getDomid()) + log.debug("image = %s", self.kernel) + log.debug("control_evtchn = %d", self.device_channel.port2) + log.debug("store_evtchn = %d", store_evtchn) + log.debug("memsize = %d", self.vm.getMemoryTarget() / 1024) + log.debug("memmap = %s", self.memmap_value) + log.debug("cmdline = %s", self.cmdline) + log.debug("ramdisk = %s", self.ramdisk) + log.debug("flags = %d", self.flags) + log.debug("vcpus = %d", self.vm.getVCpuCount()) + + ret = xc.vmx_build(dom = self.vm.getDomid(), + image = self.kernel, + control_evtchn = self.device_channel.port2, + store_evtchn = store_evtchn, + memsize = self.vm.getMemoryTarget() / 1024, + memmap = self.memmap_value, + cmdline = self.cmdline, + ramdisk = self.ramdisk, + flags = self.flags, + vcpus = self.vm.getVCpuCount()) if isinstance(ret, dict): self.set_vminfo(ret) return 0 return ret def parseMemmap(self): - self.memmap = sxp.child_value(self.vm.config, "memmap") if self.memmap is None: return memmap = sxp.parse(open(self.memmap))[0] @@ -337,12 +308,12 @@ # Return a list of cmd line args to the device models based on the # xm config file - def parseDeviceModelArgs(self): - dmargs = [ 'cdrom', 'boot', 'fda', 'fdb', - 'localtime', 'serial', 'stdvga', 'isa' ] - ret = [] - for a in dmargs: - v = 
sxp.child_value(self.vm.config, a) + def parseDeviceModelArgs(self, imageConfig, deviceConfig): + dmargs = [ 'cdrom', 'boot', 'fda', 'fdb', + 'localtime', 'serial', 'stdvga', 'isa', 'vcpus' ] + ret = [] + for a in dmargs: + v = sxp.child_value(imageConfig, a) # python doesn't allow '-' in variable names if a == 'stdvga': a = 'std-vga' @@ -351,20 +322,17 @@ if a in ['localtime', 'std-vga', 'isa']: if v != None: v = int(v) - log.debug("args: %s, val: %s" % (a,v)) - if v: - ret.append("-%s" % a) - ret.append("%s" % v) + log.debug("args: %s, val: %s" % (a,v)) + if v: + ret.append("-%s" % a) + ret.append("%s" % v) # Handle disk/network related options - devices = sxp.children(self.vm.config, 'device') - for device in devices: - name = sxp.name(sxp.child0(device)) + for (name, info) in deviceConfig: if name == 'vbd': - vbdinfo = sxp.child(device, 'vbd') - uname = sxp.child_value(vbdinfo, 'uname') - typedev = sxp.child_value(vbdinfo, 'dev') - (vbdtype, vbdparam) = string.split(uname, ':', 1) + uname = sxp.child_value(info, 'uname') + typedev = sxp.child_value(info, 'dev') + (_, vbdparam) = string.split(uname, ':', 1) if re.match('^ioemu:', typedev): (emtype, vbddev) = string.split(typedev, ':', 1) else: @@ -378,61 +346,59 @@ ret.append("-%s" % vbddev) ret.append("%s" % vbdparam) if name == 'vif': - vifinfo = sxp.child(device, 'vif') - mac = sxp.child_value(vifinfo, 'mac') + mac = sxp.child_value(info, 'mac') ret.append("-macaddr") ret.append("%s" % mac) if name == 'vtpm': - vtpminfo = sxp.child(device, 'vtpm') - instance = sxp.child_value(vtpminfo, 'instance') + instance = sxp.child_value(info, 'instance') ret.append("-instance") ret.append("%s" % instance) - - # Handle graphics library related options - vnc = sxp.child_value(self.vm.config, 'vnc') - sdl = sxp.child_value(self.vm.config, 'sdl') - nographic = sxp.child_value(self.vm.config, 'nographic') - if nographic: - ret.append('-nographic') - return ret - - if vnc and sdl: - ret = ret + ['-vnc-and-sdl', '-k', 'en-us'] 
- elif vnc: - ret = ret + ['-vnc', '-k', 'en-us'] - if vnc: - vncport = int(self.vm.getDomain()) + 5900 - ret = ret + ['-vncport', '%d' % vncport] - return ret - + return ret + + def configVNC(self, config): + # Handle graphics library related options + vnc = sxp.child_value(config, 'vnc') + sdl = sxp.child_value(config, 'sdl') + ret = [] + nographic = sxp.child_value(config, 'nographic') + if nographic: + ret.append('-nographic') + return ret + + if vnc and sdl: + ret = ret + ['-vnc-and-sdl', '-k', 'en-us'] + elif vnc: + ret = ret + ['-vnc', '-k', 'en-us'] + if vnc: + vncport = int(self.vm.getDomid()) + 5900 + ret = ret + ['-vncport', '%d' % vncport] + return ret + def createDeviceModel(self): - device_model = sxp.child_value(self.vm.config, 'device_model') - if not device_model: - raise VmError("vmx: missing device model") + if self.pid: + return # Execute device model. #todo: Error handling # XXX RN: note that the order of args matter! - args = [device_model] + args = [self.device_model] vnc = self.vncParams() if len(vnc): args = args + vnc - args = args + ([ "-d", "%d" % self.vm.getDomain(), + args = args + ([ "-d", "%d" % self.vm.getDomid(), "-p", "%d" % self.device_channel.port1, - "-m", "%s" % self.vm.memory ]) - args = args + self.parseDeviceModelArgs() + "-m", "%s" % (self.vm.getMemoryTarget() / 1024)]) + args = args + self.dmargs env = dict(os.environ) - env['DISPLAY'] = sxp.child_value(self.vm.config, 'display') - log.info("spawning device models: %s %s", device_model, args) - self.pid = os.spawnve(os.P_NOWAIT, device_model, args, env) + env['DISPLAY'] = self.display + log.info("spawning device models: %s %s", self.device_model, args) + self.pid = os.spawnve(os.P_NOWAIT, self.device_model, args, env) log.info("device model pid: %d", self.pid) - return self.pid def vncParams(self): # see if a vncviewer was specified # XXX RN: bit of a hack. 
should unify this, maybe stick in config space vncconnect=[] - image = self.config - args = sxp.child_value(image, "args") + args = self.cmdline if args: arg_list = string.split(args) for arg in arg_list: @@ -446,15 +412,16 @@ channel.eventChannelClose(self.device_channel) import signal if not self.pid: - self.pid = self.vm.device_model_pid + return os.kill(self.pid, signal.SIGKILL) - (pid, status) = os.waitpid(self.pid, 0) + os.waitpid(self.pid, 0) self.pid = 0 - def getDomainMemory(self, mem_mb): + def getDomainMemory(self, mem): + """@see ImageHandler.getDomainMemory""" # for ioreq_t and xenstore static_pages = 2 - return (mem_mb * 1024) + self.getPageTableSize(mem_mb) + 4 * static_pages + return mem + self.getPageTableSize(mem / 1024) + 4 * static_pages def getPageTableSize(self, mem_mb): """Return the size of memory needed for 1:1 page tables for physical @@ -466,8 +433,9 @@ # 1 page for the PGD + 1 pte page for 4MB of memory (rounded) if os.uname()[4] == 'x86_64': return (5 + ((mem_mb + 1) >> 1)) * 4 - elif os.uname()[4] == 'ia64': - # XEN/IA64 has p2m table allocated on demand, so only return guest firmware size here. - return 16 * 1024 + elif os.uname()[4] == 'ia64': + # XEN/IA64 has p2m table allocated on demand, so only return + # guest firmware size here. 
+ return 16 * 1024 else: return (1 + ((mem_mb + 3) >> 2)) * 4 diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/scheduler.py --- a/tools/python/xen/xend/scheduler.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/scheduler.py Thu Sep 22 17:42:01 2005 @@ -13,11 +13,12 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #============================================================================ # Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +# Copyright (C) 2005 XenSource Ltd #============================================================================ import threading -def later(delay, fn, args=(), kwargs={}): +def later(delay, fn, *args, **kwargs): """Schedule a function to be called later. @param delay: delay in seconds @@ -29,7 +30,7 @@ timer.start() return timer -def now(fn, args=(), kwargs={}): +def now(fn, *args, **kwargs): """Schedule a function to be called now. @param fn: function diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/SrvDaemon.py --- a/tools/python/xen/xend/server/SrvDaemon.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/server/SrvDaemon.py Thu Sep 22 17:42:01 2005 @@ -25,7 +25,6 @@ from xen.xend.XendLogging import log from xen.xend import XendRoot; xroot = XendRoot.instance() -import controller import event import relocate from params import * @@ -137,13 +136,6 @@ else: return 0 - def onSIGCHLD(self, signum, frame): - if self.child > 0: - try: - pid, sts = os.waitpid(self.child, os.WNOHANG) - except os.error, ex: - pass - def fork_pid(self, pidfile): """Fork and write the pid of the child to 'pidfile'. @@ -200,15 +192,29 @@ # Trying to run an already-running service is a success. 
return 0 - signal.signal(signal.SIGCHLD, self.onSIGCHLD) + ret = 0 + + # we use a pipe to communicate between the parent and the child process + # this way we know when the child has actually initialized itself so + # we can avoid a race condition during startup + + r,w = os.pipe() if self.fork_pid(XEND_PID_FILE): - #Parent. Sleep to give child time to start. - time.sleep(1) + os.close(w) + r = os.fdopen(r, 'r') + s = r.read() + r.close() + if not len(s): + ret = 1 + else: + ret = int(s) else: + os.close(r) # Child self.tracing(trace) - self.run() - return 0 + self.run(os.fdopen(w, 'w')) + + return ret def tracing(self, traceon): """Turn tracing on or off. @@ -290,20 +296,21 @@ def stop(self): return self.cleanup(kill=True) - def run(self): - _enforce_dom0_cpus() + def run(self, status): try: log.info("Xend Daemon started") event.listenEvent(self) relocate.listenRelocation() servers = SrvServer.create() self.daemonize() - servers.start() + servers.start(status) except Exception, ex: print >>sys.stderr, 'Exception starting xend:', ex if XEND_DEBUG: traceback.print_exc() log.exception("Exception starting xend (%s)" % ex) + status.write('1') + status.close() self.exit(1) def exit(self, rc=0): @@ -314,32 +321,6 @@ #sys.exit(rc) os._exit(rc) -def _enforce_dom0_cpus(): - dn = xroot.get_dom0_cpus() - - for d in glob.glob("/sys/devices/system/cpu/cpu*"): - cpu = int(os.path.basename(d)[3:]) - if (dn == 0) or (cpu < dn): - v = "1" - else: - v = "0" - try: - f = open("%s/online" %d, "r+") - c = f.read(1) - if (c != v): - if v == "0": - log.info("dom0 is trying to give back cpu %d", cpu) - else: - log.info("dom0 is trying to take cpu %d", cpu) - f.seek(0) - f.write(v) - f.close() - log.info("dom0 successfully enforced cpu %d", cpu) - else: - f.close() - except: - pass - def instance(): global inst try: diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/SrvDomainDir.py --- a/tools/python/xen/xend/server/SrvDomainDir.py Thu Sep 22 17:34:14 2005 +++ 
b/tools/python/xen/xend/server/SrvDomainDir.py Thu Sep 22 17:42:01 2005 @@ -85,7 +85,7 @@ def _op_create_cb(self, dominfo, configstring, req): """Callback to handle domain creation. """ - dom = dominfo.name + dom = dominfo.getName() domurl = "%s/%s" % (req.prePathURL(), dom) req.setResponseCode(http.CREATED, "created") req.setHeader("Location", domurl) @@ -112,7 +112,7 @@ fn = FormFn(self.xd.domain_restore, [['file', 'str']]) dominfo = fn(req.args) - dom = dominfo.name + dom = dominfo.getName() domurl = "%s/%s" % (req.prePathURL(), dom) req.setResponseCode(http.CREATED) req.setHeader("Location", domurl) @@ -152,12 +152,12 @@ domains = self.xd.list_sorted() req.write('<ul>') for d in domains: - req.write('<li><a href="%s%s"> Domain %s</a>' - % (url, d.name, d.name)) - req.write('id=%s' % d.domid) - req.write('memory=%d'% d.memory) - req.write('ssidref=%d'% d.ssidref) - req.write('</li>') + req.write('<li><a href="%s%s"> Domain %s</a>' + % (url, d.getName(), d.getName())) + req.write('id=%s' % d.getDomain()) + req.write('memory=%d'% d.getMemoryTarget()) + req.write('ssidref=%d'% d.getSsidref()) + req.write('</li>') req.write('</ul>') def form(self, req): diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/SrvNode.py --- a/tools/python/xen/xend/server/SrvNode.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/server/SrvNode.py Thu Sep 22 17:42:01 2005 @@ -15,7 +15,6 @@ # Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> #============================================================================ -import os from xen.web.SrvDir import SrvDir from xen.xend import sxp @@ -32,15 +31,15 @@ self.add('dmesg', 'SrvDmesg') self.add('log', 'SrvXendLog') - def op_shutdown(self, op, req): + def op_shutdown(self, _1, _2): val = self.xn.shutdown() return val - def op_reboot(self, op, req): + def op_reboot(self, _1, _2): val = self.xn.reboot() return val - def op_cpu_bvt_slice_set(self, op, req): + def op_cpu_bvt_slice_set(self, _, req): fn = 
FormFn(self.xn.cpu_bvt_slice_set, [['ctx_allow', 'int']]) val = fn(req.args, {}) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/SrvServer.py --- a/tools/python/xen/xend/server/SrvServer.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/server/SrvServer.py Thu Sep 22 17:42:01 2005 @@ -44,12 +44,17 @@ from xen.web.httpserver import HttpServer, UnixHttpServer -from xen.xend import XendRoot; xroot = XendRoot.instance() +from xen.xend import XendRoot from xen.xend import Vifctl from xen.xend.XendLogging import log from xen.web.SrvDir import SrvDir +import time from SrvRoot import SrvRoot + + +xroot = XendRoot.instance() + class XendServers: @@ -59,13 +64,32 @@ def add(self, server): self.servers.append(server) - def start(self): + def start(self, status): Vifctl.network('start') threads = [] for server in self.servers: thread = Thread(target=server.run) thread.start() threads.append(thread) + + + # check for when all threads have initialized themselves and then + # close the status pipe + + threads_left = True + while threads_left: + threads_left = False + + for server in self.servers: + if not server.ready: + threads_left = True + break + + if threads_left: + time.sleep(.5) + + status.write('0') + status.close() for t in threads: t.join() diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/blkif.py --- a/tools/python/xen/xend/server/blkif.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/server/blkif.py Thu Sep 22 17:42:01 2005 @@ -13,322 +13,47 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #============================================================================ # Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +# Copyright (C) 2005 XenSource Ltd #============================================================================ -"""Support for virtual block devices. 
-""" + +import re import string from xen.util import blkif -from xen.xend.XendError import XendError, VmError -from xen.xend.XendRoot import get_component -from xen.xend.XendLogging import log from xen.xend import sxp -from xen.xend import Blkctl -from xen.xend.xenstore import DBVar -from xen.xend.server.controller import Dev, DevController +from xen.xend.server.DevController import DevController -class BlkifBackend: - """ Handler for the 'back-end' channel to a block device driver domain - on behalf of a front-end domain. - Must be connected using connect() before it can be used. - """ - def __init__(self, controller, id, dom, recreate=False): - self.controller = controller - self.id = id - self.frontendDomain = self.controller.getDomain() - self.backendDomain = dom - self.destroyed = False - self.connected = False - self.status = None - - def init(self, recreate=False, reboot=False): - self.destroyed = False - self.status = BLKIF_INTERFACE_STATUS_DISCONNECTED - self.frontendDomain = self.controller.getDomain() - - def __str__(self): - return ('<BlkifBackend frontend=%d backend=%d id=%d>' - % (self.frontendDomain, - self.backendDomain, - self.id)) - - def getId(self): - return self.id - - def connect(self, recreate=False): - """Connect to the blkif control interface. - - @param recreate: true if after xend restart - """ - log.debug("Connecting blkif %s", str(self)) - if recreate or self.connected: - self.connected = True - pass - - def destroy(self, change=False, reboot=False): - """Disconnect from the blkif control interface and destroy it. - """ - self.destroyed = True - # For change true need to notify front-end, or back-end will do it? - - def connectInterface(self, val): - self.status = BLKIF_INTERFACE_STATUS_CONNECTED - - def interfaceDisconnected(self): - self.status = BLKIF_INTERFACE_STATUS_DISCONNECTED - -class BlkDev(Dev): - """Info record for a block device. 
- """ - - __exports__ = Dev.__exports__ + [ - DBVar('dev', ty='str'), - DBVar('vdev', ty='int'), - DBVar('mode', ty='str'), - DBVar('viftype', ty='str'), - DBVar('params', ty='str'), - DBVar('node', ty='str'), - DBVar('device', ty='long'), - DBVar('dev_handle', ty='long'), - DBVar('start_sector', ty='long'), - DBVar('nr_sectors', ty='long'), - ] - - def __init__(self, controller, id, config, recreate=False): - Dev.__init__(self, controller, id, config, recreate=recreate) - self.dev = None - self.uname = None - self.vdev = None - self.mode = None - self.type = None - self.params = None - self.node = None - self.device = None - self.dev_handle = 0 - self.start_sector = None - self.nr_sectors = None - - self.frontendDomain = self.getDomain() - self.backendDomain = None - self.backendId = 0 - self.configure(self.config, recreate=recreate) - - def exportToDB(self, save=False): - Dev.exportToDB(self, save=save) - backend = self.getBackend() - - def init(self, recreate=False, reboot=False): - self.frontendDomain = self.getDomain() - backend = self.getBackend() - self.backendId = backend.domid - - def configure(self, config, change=False, recreate=False): - if change: - raise XendError("cannot reconfigure vbd") - self.config = config - self.uname = sxp.child_value(config, 'uname') - if not self.uname: - raise VmError('vbd: Missing uname') - # Split into type and type-specific params (which are passed to the - # type-specific control script). 
- (self.type, self.params) = string.split(self.uname, ':', 1) - self.dev = sxp.child_value(config, 'dev') - if not self.dev: - raise VmError('vbd: Missing dev') - self.mode = sxp.child_value(config, 'mode', 'r') - - self.vdev = blkif.blkdev_name_to_number(self.dev) - if not self.vdev: - raise VmError('vbd: Device not found: %s' % self.dev) - - try: - xd = get_component('xen.xend.XendDomain') - self.backendDomain = xd.domain_lookup_by_name(sxp.child_value(config, 'backend', '0')).domid - except: - raise XendError('invalid backend domain') - - return self.config - - def attach(self, recreate=False, change=False): - if recreate: - pass - else: - node = Blkctl.block('bind', self.type, self.params) - self.setNode(node) - self.attachBackend() - if change: - self.interfaceChanged() - - def unbind(self): - if self.node is None: return - log.debug("Unbinding vbd (type %s) from %s" - % (self.type, self.node)) - Blkctl.block('unbind', self.type, self.node) - - def setNode(self, node): - - # NOTE: - # This clause is testing code for storage system experiments. - # Add a new disk type that will just pass an opaque id in the - # dev_handle and use an experimental device type. - # Please contact andrew.warfield@xxxxxxxxxxxx with any concerns. - if self.type == 'parallax': - self.node = node - self.device = 61440 # (240,0) - self.dev_handle = long(self.params) - self.nr_sectors = long(0) - return - # done. - - mounted_mode = self.check_mounted(node) - if not '!' 
in self.mode and mounted_mode: - if mounted_mode == "w": - raise VmError("vbd: Segment %s is in writable use" % - self.uname) - elif 'w' in self.mode: - raise VmError("vbd: Segment %s is in read-only use" % - self.uname) - - segment = blkif.blkdev_segment(node) - if not segment: - raise VmError("vbd: Segment not found: uname=%s" % self.uname) - self.node = node - self.device = segment['device'] - self.start_sector = segment['start_sector'] - self.nr_sectors = segment['nr_sectors'] - - def check_mounted(self, name): - mode = blkif.mount_mode(name) - xd = get_component('xen.xend.XendDomain') - for vm in xd.list(): - ctrl = vm.getDeviceController(self.getType(), error=False) - if (not ctrl): continue - for dev in ctrl.getDevices(): - if dev is self: continue - if dev.type == 'phy' and name == blkif.expand_dev_name(dev.params): - mode = dev.mode - if 'w' in mode: - return 'w' - if mode and 'r' in mode: - return 'r' - return None - - def readonly(self): - return 'w' not in self.mode - - def sxpr(self): - val = ['vbd', - ['id', self.id], - ['vdev', self.vdev], - ['device', self.device], - ['mode', self.mode]] - if self.dev: - val.append(['dev', self.dev]) - if self.uname: - val.append(['uname', self.uname]) - if self.node: - val.append(['node', self.node]) - return val - - def getBackend(self): - return self.controller.getBackend(self.backendDomain) - - def refresh(self): - log.debug("Refreshing vbd domain=%d id=%s", self.frontendDomain, - self.id) - self.interfaceChanged() - - def destroy(self, change=False, reboot=False): - """Destroy the device. If 'change' is true notify the front-end interface. - - @param change: change flag - """ - self.destroyed = True - log.debug("Destroying vbd domain=%d id=%s", self.frontendDomain, - self.id) - if change: - self.interfaceChanged() - self.unbind() - - def interfaceChanged(self): - """Tell the back-end to notify the front-end that a device has been - added or removed. 
- """ - self.getBackend().interfaceChanged() - - def attachBackend(self): - """Attach the device to its controller. - - """ - self.getBackend().connect() - class BlkifController(DevController): """Block device interface controller. Handles all block devices for a domain. """ - def __init__(self, vm, recreate=False): + def __init__(self, vm): """Create a block device controller. """ - DevController.__init__(self, vm, recreate=recreate) - self.backends = {} - self.backendId = 0 + DevController.__init__(self, vm) - def initController(self, recreate=False, reboot=False): - self.destroyed = False - if reboot: - self.rebootBackends() - self.rebootDevices() - def sxpr(self): - val = ['blkif', ['dom', self.getDomain()]] - return val + def getDeviceDetails(self, config): + """@see DevController.getDeviceDetails""" + + typedev = sxp.child_value(config, 'dev') + if re.match('^ioemu:', typedev): + return (0,{},{}) - def rebootBackends(self): - for backend in self.backends.values(): - backend.init(reboot=True) + devid = blkif.blkdev_name_to_number(sxp.child_value(config, 'dev')) - def getBackendById(self, id): - return self.backends.get(id) + (typ, params) = string.split(sxp.child_value(config, 'uname'), ':', 1) + back = { 'type' : typ, + 'params' : params + } - def getBackendByDomain(self, dom): - for backend in self.backends.values(): - if backend.backendDomain == dom: - return backend - return None + if 'r' == sxp.child_value(config, 'mode', 'r'): + back['read-only'] = "" # existence indicates read-only - def getBackend(self, dom): - backend = self.getBackendByDomain(dom) - if backend: return backend - backend = BlkifBackend(self, self.backendId, dom) - self.backendId += 1 - self.backends[backend.getId()] = backend - backend.init() - return backend + front = { 'virtual-device' : "%i" % devid } - def newDevice(self, id, config, recreate=False): - """Create a device.. 
- - @param id: device id - @param config: device configuration - @param recreate: if true it's being recreated (after xend restart) - @type recreate: bool - @return: device - @rtype: BlkDev - """ - return BlkDev(self, id, config, recreate=recreate) - - def destroyController(self, reboot=False): - """Destroy the controller and all devices. - """ - self.destroyed = True - log.debug("Destroying blkif domain=%d", self.getDomain()) - self.destroyDevices(reboot=reboot) - self.destroyBackends(reboot=reboot) - - def destroyBackends(self, reboot=False): - for backend in self.backends.values(): - backend.destroy(reboot=reboot) + return (devid, back, front) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/channel.py --- a/tools/python/xen/xend/server/channel.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/server/channel.py Thu Sep 22 17:42:01 2005 @@ -43,33 +43,6 @@ interdomain = classmethod(interdomain) - def restoreFromDB(cls, db, dom1, dom2, port1=0, port2=0): - """Create an event channel using db info if available. - Inverse to saveToDB(). - - @param db db - @param dom1 - @param dom2 - @param port1 - @param port2 - """ - try: - dom1 = int(db['dom1'].getData()) - except: pass - try: - dom2 = int(db['dom2'].getData()) - except: pass - try: - port1 = int(db['port1'].getData()) - except: pass - try: - port2 = int(db['port2'].getData()) - except: pass - evtchn = cls.interdomain(dom1, dom2, port1=port1, port2=port2) - return evtchn - - restoreFromDB = classmethod(restoreFromDB) - def __init__(self, dom1, dom2, d): d['dom1'] = dom1 d['dom2'] = dom2 @@ -92,18 +65,6 @@ print 'EventChannel>close>', self evtchn_close(self.dom1, self.port1) evtchn_close(self.dom2, self.port2) - - def saveToDB(self, db, save=False): - """Save the event channel to the db so it can be restored later, - using restoreFromDB() on the class. 
- - @param db db - """ - db['dom1'] = str(self.dom1) - db['dom2'] = str(self.dom2) - db['port1'] = str(self.port1) - db['port2'] = str(self.port2) - db.saveDB(save=save) def sxpr(self): return ['event-channel', diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/event.py --- a/tools/python/xen/xend/server/event.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/server/event.py Thu Sep 22 17:42:01 2005 @@ -174,11 +174,6 @@ else: logging.removeLogStderr() - def op_debug_controller(self, name, v): - mode = v[1] - import controller - controller.DEBUG = (mode == 'on') - def op_domain_ls(self, name, v): xd = xroot.get_component("xen.xend.XendDomain") return xd.list_names() diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/netif.py --- a/tools/python/xen/xend/server/netif.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/server/netif.py Thu Sep 22 17:42:01 2005 @@ -13,396 +13,58 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #============================================================================ # Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +# Copyright (C) 2005 XenSource Ltd #============================================================================ + """Support for virtual network interfaces. """ -import random - -from xen.util.mac import macFromString, macToString +import os from xen.xend import sxp -from xen.xend import Vifctl -from xen.xend.XendError import XendError, VmError -from xen.xend.XendLogging import log -from xen.xend import XendVnet -from xen.xend.XendRoot import get_component -from xen.xend.xenstore import DBVar -from xen.xend.server.controller import Dev, DevController +from xen.xend.server.DevController import DevController -class NetDev(Dev): - """A network device. - """ - # State: - # inherited + - # ./config - # ./mac - # ./be_mac - # ./bridge - # ./script - # ./ipaddr ? - # - # ./credit - # ./period - # - # ./vifctl: up/down? 
- # ./vifname - # - # - # Poss should have no backend state here - except for ref to backend's own tree - # for the device? And a status - the one we want. - # ./back/dom - # ./back/devid - id for back-end (netif_handle) - same as front/devid - # ./back/id - backend id (if more than one b/e per domain) - # ./back/status - # ./back/tx_shmem_frame - actually these belong in back-end state - # ./back/rx_shmem_frame - # - # ./front/dom - # ./front/devid - # ./front/status - need 2: one for requested, one for actual? Or drive from dev status - # and this is front status only. - # ./front/tx_shmem_frame - # ./front/rx_shmem_frame - # - # ./evtchn/front - here or in front/back? - # ./evtchn/back - # ./evtchn/status ? - # At present created by dev: but should be created unbound by front/back - # separately and then bound (by back)? - - __exports__ = Dev.__exports__ + [ - DBVar('config', ty='sxpr'), - DBVar('mac', ty='mac'), - DBVar('be_mac', ty='mac'), - DBVar('bridge', ty='str'), - DBVar('script', ty='str'), - DBVar('credit', ty='int'), - DBVar('period', ty='int'), - DBVar('vifname', ty='str'), - ] - - def __init__(self, controller, id, config, recreate=False): - Dev.__init__(self, controller, id, config, recreate=recreate) - self.vif = int(self.id) - self.status = None - self.frontendDomain = self.getDomain() - self.backendDomain = None - self.credit = None - self.period = None - self.mac = None - self.be_mac = None - self.bridge = None - self.script = None - self.ipaddr = None - self.mtu = None - self.vifname = None - self.configure(self.config, recreate=recreate) - - def exportToDB(self, save=False): - Dev.exportToDB(self, save=save) - - def init(self, recreate=False, reboot=False): - self.destroyed = False - self.status = NETIF_INTERFACE_STATUS_DISCONNECTED - self.frontendDomain = self.getDomain() - - def _get_config_mac(self, config): - vmac = sxp.child_value(config, 'mac') - if not vmac: return None - try: - mac = macFromString(vmac) - except: - raise 
XendError("invalid mac: %s" % vmac) - return mac - - def _get_config_be_mac(self, config): - vmac = sxp.child_value(config, 'be_mac') - if not vmac: return None - try: - mac = macFromString(vmac) - except: - raise XendError("invalid backend mac: %s" % vmac) - return mac - - def _get_config_ipaddr(self, config): - ips = sxp.children(config, elt='ip') - if ips: - val = [] - for ipaddr in ips: - val.append(sxp.child0(ipaddr)) - else: - val = None - return val - - def _get_config_mtu(self, config): - mtu = sxp.child_value(config, 'mtu') - if not mtu: return None - try: - mtu = int(mtu) - except: - raise XendError("invalid mtu: %s" & mtu) - return mtu - - def configure(self, config, change=False, recreate=False): - if change: - return self.reconfigure(config) - self.config = config - self.mac = None - self.be_mac = None - self.bridge = None - self.script = None - self.ipaddr = [] - self.vifname = None - - self.vifname = sxp.child_value(config, 'vifname') - if self.vifname is None: - self.vifname = self.default_vifname() - if len(self.vifname) > 15: - raise XendError('invalid vifname: too long: ' + self.vifname) - mac = self._get_config_mac(config) - if mac is None: - raise XendError("invalid mac") - self.mac = mac - self.be_mac = self._get_config_be_mac(config) - self.bridge = sxp.child_value(config, 'bridge') - self.script = sxp.child_value(config, 'script') - self.ipaddr = self._get_config_ipaddr(config) or [] - self.mtu = self._get_config_mtu(config) - self._config_credit_limit(config) - - try: - if recreate: - self.backendDomain = int(sxp.child_value(config, 'backend', '0')) - else: - #todo: Code below will fail on xend restart when backend is not domain 0. - xd = get_component('xen.xend.XendDomain') - self.backendDomain = xd.domain_lookup_by_name(sxp.child_value(config, 'backend', '0')).domid - except: - raise XendError('invalid backend domain') - return self.config - - def reconfigure(self, config): - """Reconfigure the interface with new values. 
- Not all configuration parameters can be changed: - bridge, script and ip addresses can, - backend and mac cannot. - - To leave a parameter unchanged, omit it from the changes. - - @param config configuration changes - @return updated interface configuration - @raise XendError on errors - """ - changes = {} - mac = self._get_config_mac(config) - be_mac = self._get_config_be_mac(config) - bridge = sxp.child_value(config, 'bridge') - script = sxp.child_value(config, 'script') - ipaddr = self._get_config_ipaddr(config) - mtu = self._get_config_mtu(config) - - xd = get_component('xen.xend.XendDomain') - backendDomain = xd.domain_lookup_by_name(sxp.child_value(config, 'backend', '0')).domid - - if (mac is not None) and (mac != self.mac): - raise XendError("cannot change mac") - if (be_mac is not None) and (be_mac != self.be_mac): - raise XendError("cannot change backend mac") - if (backendDomain is not None) and (backendDomain != self.backendDomain): - raise XendError("cannot change backend") - if (bridge is not None) and (bridge != self.bridge): - changes['bridge'] = bridge - if (script is not None) and (script != self.script): - changes['script'] = script - if (ipaddr is not None) and (ipaddr != self.ipaddr): - changes['ipaddr'] = ipaddr - if (mtu is not None) and (mtu != self.mtu): - changes['mtu'] = mtu - - if changes: - self.vifctl("down") - for (k, v) in changes.items(): - setattr(self, k, v) - self.config = sxp.merge(config, self.config) - self.vifctl("up") - - self._config_credit_limit(config, change=True) - return self.config - - def _config_credit_limit(self, config, change=False): - period = sxp.child_value(config, 'period') - credit = sxp.child_value(config, 'credit') - if period and credit: - try: - period = int(period) - credit = int(credit) - except ex: - raise XendError('vif: invalid credit limit') - if change: - self.setCreditLimit(credit, period) - self.config = sxp.merge([sxp.name(self.config), - ['credit', credit], - ['period', period]], - 
self.config) - else: - self.period = period - self.credit = credit - elif period or credit: - raise XendError('vif: invalid credit limit') - - def sxpr(self): - vif = str(self.vif) - mac = self.get_mac() - val = ['vif', - ['id', self.id], - ['vif', vif], - ['mac', mac], - ['vifname', self.vifname], - ] - - if self.be_mac: - val.append(['be_mac', self.get_be_mac()]) - if self.bridge: - val.append(['bridge', self.bridge]) - if self.script: - val.append(['script', self.script]) - for ip in self.ipaddr: - val.append(['ip', ip]) - if self.credit: - val.append(['credit', self.credit]) - if self.period: - val.append(['period', self.period]) - return val - - def get_vifname(self): - """Get the virtual interface device name. - """ - return self.vifname - - def default_vifname(self): - return "vif%d.%d" % (self.frontendDomain, self.vif) - - def get_mac(self): - """Get the MAC address as a string. - """ - return macToString(self.mac) - - def get_be_mac(self): - """Get the backend MAC address as a string. - """ - return macToString(self.be_mac) - - def vifctl_params(self, vmname=None): - """Get the parameters to pass to vifctl. - """ - dom = self.frontendDomain - if vmname is None: - xd = get_component('xen.xend.XendDomain') - try: - vm = xd.domain_lookup(dom) - vmname = vm.name - except: - vmname = 'Domain-%d' % dom - return { 'domain': vmname, - 'vif' : self.get_vifname(), - 'mac' : self.get_mac(), - 'bridge': self.bridge, - 'script': self.script, - 'ipaddr': self.ipaddr, } - - def vifctl(self, op, vmname=None): - """Bring the device up or down. - The vmname is needed when bringing a device up for a new domain because - the domain is not yet in the table so we can't look its name up. 
- - @param op: operation name (up, down) - @param vmname: vmname - """ - if op == 'up': - Vifctl.set_vif_name(self.default_vifname(), self.vifname) - Vifctl.vifctl(op, **self.vifctl_params(vmname=vmname)) - vnet = XendVnet.instance().vnet_of_bridge(self.bridge) - if vnet: - vnet.vifctl(op, self.get_vifname(), self.get_mac()) - - def attach(self, recreate=False, change=False): - if recreate: - pass - else: - if self.credit and self.period: - #self.send_be_creditlimit(self.credit, self.period) - pass - self.vifctl('up', vmname=self.getDomainName()) - - def destroy(self, change=False, reboot=False): - """Destroy the device's resources and disconnect from the back-end - device controller. If 'change' is true notify the front-end interface. - - @param change: change flag - """ - self.destroyed = True - self.status = NETIF_INTERFACE_STATUS_CLOSED - log.debug("Destroying vif domain=%d vif=%d", self.frontendDomain, self.vif) - self.vifctl('down') - if change: - self.reportStatus() - - def setCreditLimit(self, credit, period): - #todo: these params should be in sxpr and vif config. - self.credit = credit - self.period = period - - def getCredit(self): - return self.credit - - def getPeriod(self): - return self.period - - def interfaceChanged(self): - """Notify the front-end that a device has been added or removed. - """ - pass - class NetifController(DevController): """Network interface controller. Handles all network devices for a domain. """ - def __init__(self, vm, recreate=False): - DevController.__init__(self, vm, recreate=recreate) + def __init__(self, vm): + DevController.__init__(self, vm) - def initController(self, recreate=False, reboot=False): - self.destroyed = False - if reboot: - self.rebootDevices() - def destroyController(self, reboot=False): - """Destroy the controller and all devices. 
- """ - self.destroyed = True - log.debug("Destroying netif domain=%d", self.getDomain()) - self.destroyDevices(reboot=reboot) + def getDeviceDetails(self, config): + """@see DevController.getDeviceDetails""" - def sxpr(self): - val = ['netif', ['dom', self.getDomain()]] - return val - - def newDevice(self, id, config, recreate=False): - """Create a network device. + from xen.xend import XendRoot + xroot = XendRoot.instance() - @param id: interface id - @param config: device configuration - @param recreate: recreate flag (true after xend restart) - """ - return NetDev(self, id, config, recreate=recreate) + def _get_config_ipaddr(config): + val = [] + for ipaddr in sxp.children(config, elt='ip'): + val.append(sxp.child0(ipaddr)) + return val - def limitDevice(self, vif, credit, period): - if vif not in self.devices: - raise XendError('device does not exist for credit limit: vif' - + str(self.getDomain()) + '.' + str(vif)) - - dev = self.devices[vif] - return dev.setCreditLimit(credit, period) + script = os.path.join(xroot.network_script_dir, + sxp.child_value(config, 'script', + xroot.get_vif_script())) + bridge = sxp.child_value(config, 'bridge', + xroot.get_vif_bridge()) + mac = sxp.child_value(config, 'mac') + ipaddr = _get_config_ipaddr(config) + + devid = self.allocateDeviceID() + + back = { 'script' : script, + 'mac' : mac, + 'bridge' : bridge, + 'handle' : "%i" % devid } + if ipaddr: + back['ip'] = ' '.join(ipaddr) + + front = { 'handle' : "%i" % devid, + 'mac' : mac } + + return (devid, back, front) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/pciif.py --- a/tools/python/xen/xend/server/pciif.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/server/pciif.py Thu Sep 22 17:42:01 2005 @@ -13,16 +13,22 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #============================================================================ # Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +# Copyright (C) 2005 
XenSource Ltd #============================================================================ + import types -import xen.lowlevel.xc; xc = xen.lowlevel.xc.new() +import xen.lowlevel.xc; from xen.xend import sxp from xen.xend.XendError import VmError -from controller import Dev, DevController +from xen.xend.server.DevController import DevController + + +xc = xen.lowlevel.xc.new() + def parse_pci(val): """Parse a pci field. @@ -36,41 +42,41 @@ v = val return v -class PciDev(Dev): - def __init__(self, controller, id, config, recreate=False): - Dev.__init__(self, controller, id, config, recreate=recreate) - bus = sxp.child_value(self.config, 'bus') - if not bus: - raise VmError('pci: Missing bus') - dev = sxp.child_value(self.config, 'dev') - if not dev: - raise VmError('pci: Missing dev') - func = sxp.child_value(self.config, 'func') - if not func: - raise VmError('pci: Missing func') - try: - bus = parse_pci(bus) - dev = parse_pci(dev) - func = parse_pci(func) - except: - raise VmError('pci: invalid parameter') +class PciController(DevController): - def attach(self, recreate=False, change=False): - rc = xc.physdev_pci_access_modify(dom = self.getDomain(), + def __init__(self, vm): + DevController.__init__(self, vm) + + + def getDeviceDetails(self, config): + """@see DevController.getDeviceDetails""" + + def get_param(field): + try: + val = sxp.child_value(config, field) + + if not val: + raise VmError('pci: Missing %s config setting' % field) + + return parse_pci(val) + except: + raise VmError('pci: Invalid config setting %s: %s' % + (field, val)) + + bus = get_param('bus') + dev = get_param('dev') + func = get_param('func') + + rc = xc.physdev_pci_access_modify(dom = self.getDomid(), bus = bus, dev = dev, func = func, enable = True) if rc < 0: #todo non-fatal - raise VmError('pci: Failed to configure device: bus=%s dev=%s func=%s' % - (bus, dev, func)) + raise VmError( + 'pci: Failed to configure device: bus=%s dev=%s func=%s' % + (bus, dev, func)) - def destroy(self, 
change=False, reboot=False): - pass - -class PciController(DevController): - - def newDevice(self, id, config, recreate=False): - return PciDev(self, id, config, recreate=recreate) + return (dev, {}, {}) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/tpmif.py --- a/tools/python/xen/xend/server/tpmif.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/server/tpmif.py Thu Sep 22 17:42:01 2005 @@ -1,45 +1,47 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> # Copyright (C) 2005 IBM Corporation -# Authort: Stefan Berger, stefanb@xxxxxxxxxx -# Derived from netif.py: -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +# Author: Stefan Berger, stefanb@xxxxxxxxxx +# Copyright (C) 2005 XenSource Ltd +#============================================================================ + """Support for virtual TPM interfaces. 
""" -import random +from xen.xend import sxp +from xen.xend.XendLogging import log -from xen.xend import sxp -from xen.xend.XendError import XendError, VmError -from xen.xend.XendLogging import log -from xen.xend.XendRoot import get_component -from xen.xend.xenstore import DBVar +from xen.xend.server.DevController import DevController -from xen.xend.server.controller import Dev, DevController class TPMifController(DevController): """TPM interface controller. Handles all TPM devices for a domain. """ - def __init__(self, vm, recreate=False): - DevController.__init__(self, vm, recreate=recreate) + def __init__(self, vm): + DevController.__init__(self, vm) - def initController(self, recreate=False, reboot=False): - self.destroyed = False - def destroyController(self, reboot=False): - """Destroy the controller and all devices. - """ - self.destroyed = True - self.destroyDevices(reboot=reboot) + def getDeviceDetails(self, config): + """@see DevController.getDeviceDetails""" + + devid = int(sxp.child_value(config, 'instance', '0')) + log.debug("The domain has a TPM with instance %d." % devid) - def sxpr(self): - val = ['tpmif', ['dom', self.getDomain()]] - return val + back = { 'instance' : "%i" % devid } + front = { 'handle' : "%i" % devid } - def newDevice(self, id, config, recreate=False): - """Create a TPM device. 
- - @param id: interface id - @param config: device configuration - @param recreate: recreate flag (true after xend restart) - """ - return None + return (devid, back, front) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/usbif.py --- a/tools/python/xen/xend/server/usbif.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/server/usbif.py Thu Sep 22 17:42:01 2005 @@ -1,185 +1,42 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ # Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> # Copyright (C) 2004 Intel Research Cambridge # Copyright (C) 2004 Mark Williamson <mark.williamson@xxxxxxxxxxxx> +# Copyright (C) 2005 XenSource Ltd +#============================================================================ + + """Support for virtual USB hubs. """ -from xen.xend import sxp -from xen.xend.XendLogging import log -from xen.xend.XendError import XendError -from xen.xend.xenstore import DBVar +from xen.xend.server.DevController import DevController -from xen.xend.server.controller import Dev, DevController - -class UsbBackend: - """Handler for the 'back-end' channel to a USB device driver domain - on behalf of a front-end domain. 
- """ - def __init__(self, controller, id, dom): - self.controller = controller - self.id = id - self.destroyed = False - self.connected = False - self.connecting = False - self.frontendDomain = self.controller.getDomain() - self.backendDomain = dom - - def init(self, recreate=False, reboot=False): - pass - - def __str__(self): - return ('<UsbifBackend frontend=%d backend=%d id=%d>' - % (self.frontendDomain, - self.backendDomain, - self.id)) - - def connect(self, recreate=False): - """Connect the controller to the usbif control interface. - - @param recreate: true if after xend restart - """ - log.debug("Connecting usbif %s", str(self)) - if recreate or self.connected or self.connecting: - pass - - def destroy(self, reboot=False): - """Disconnect from the usbif control interface and destroy it. - """ - self.destroyed = True - - def interfaceChanged(self): - pass - - -class UsbDev(Dev): - - __exports__ = Dev.__exports__ + [ - DBVar('port', ty='int'), - DBVar('path', ty='str'), - ] - - def __init__(self, controller, id, config, recreate=False): - Dev.__init__(self, controller, id, config, recreate=recreate) - self.port = id - self.path = None - self.frontendDomain = self.getDomain() - self.backendDomain = 0 - self.configure(self.config, recreate=recreate) - - def init(self, recreate=False, reboot=False): - self.destroyed = False - self.frontendDomain = self.getDomain() - - def configure(self, config, change=False, recreate=False): - if change: - raise XendError("cannot reconfigure usb") - #todo: FIXME: Use sxp access methods to get this value. - # Must not use direct indexing. - self.path = config[1][1] - - #todo: FIXME: Support configuring the backend domain. 
-## try: -## self.backendDomain = int(sxp.child_value(config, 'backend', '0')) -## except: -## raise XendError('invalid backend domain') - - def attach(self, recreate=False, change=False): - if recreate: - pass - else: - self.attachBackend() - if change: - self.interfaceChanged() - - def sxpr(self): - val = ['usb', - ['id', self.id], - ['port', self.port], - ['path', self.path], - ] - return val - - def getBackend(self): - return self.controller.getBackend(self.backendDomain) - - def destroy(self, change=False, reboot=False): - """Destroy the device. If 'change' is true notify the front-end interface. - - @param change: change flag - """ - self.destroyed = True - log.debug("Destroying usb domain=%d id=%s", self.frontendDomain, self.id) - if change: - self.interfaceChanged() - - def interfaceChanged(self): - """Tell the back-end to notify the front-end that a device has been - added or removed. - """ - self.getBackend().interfaceChanged() - - def attachBackend(self): - """Attach the device to its controller. - - """ - self.getBackend().connect() class UsbifController(DevController): """USB device interface controller. Handles all USB devices for a domain. """ - def __init__(self, vm, recreate=False): + def __init__(self, vm): """Create a USB device controller. """ - DevController.__init__(self, vm, recreate=recreate) - self.backends = {} - self.backendId = 0 + DevController.__init__(self, vm) - def init(self, recreate=False, reboot=False): - self.destroyed = False - if reboot: - self.rebootBackends() - self.rebootDevices() - def sxpr(self): - val = ['usbif', - ['dom', self.getDomain()]] - return val + def getDeviceDetails(self, _): + """@see DevController.getDeviceDetails""" - def newDevice(self, id, config, recreate=False): - return UsbDev(self, id, config, recreate=recreate) - - def destroyController(self, reboot=False): - """Destroy the controller and all devices. 
- """ - self.destroyed = True - log.debug("Destroying blkif domain=%d", self.getDomain()) - self.destroyDevices(reboot=reboot) - self.destroyBackends(reboot=reboot) - - def rebootBackends(self): - for backend in self.backends.values(): - backend.init(reboot=True) - - def getBackendById(self, id): - return self.backends.get(id) - - def getBackendByDomain(self, dom): - for backend in self.backends.values(): - if backend.backendDomain == dom: - return backend - return None - - def getBackend(self, dom): - backend = self.getBackendByDomain(dom) - if backend: return backend - backend = UsbBackend(self, self.backendId, dom) - self.backendId += 1 - self.backends[backend.getId()] = backend - backend.init() - return backend - - def destroyBackends(self, reboot=False): - for backend in self.backends.values(): - backend.destroy(reboot=reboot) + return (self.allocateDeviceID(), {}, {}) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/xenstore/xsnode.py --- a/tools/python/xen/xend/xenstore/xsnode.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/xenstore/xsnode.py Thu Sep 22 17:42:01 2005 @@ -244,7 +244,9 @@ if ex.args[0] == errno.ENOENT: return False else: - raise + raise RuntimeError(ex.args[0], + ex.args[1] + + (', in exists(%s)' % (str(path)))) def mkdirs(self, path): if self.exists(path): @@ -255,7 +257,7 @@ if x == "": continue p = os.path.join(p, x) if not self.exists(p): - self.getxs().write(p, "", create=True) + self.getxs().write(p, "") def read(self, path): try: @@ -266,15 +268,17 @@ else: raise - def create(self, path, excl=False): - self.write(path, "", create=True, excl=excl) - - def write(self, path, data, create=True, excl=False): - self.mkdirs(path) - try: - self.getxs().write(path, data, create=create, excl=excl) - except Exception, ex: - raise + def create(self, path): + self.write(path, "") + + def write(self, path, data): + try: + self.getxs().write(path, data) + except RuntimeError, ex: + raise RuntimeError(ex.args[0], + ex.args[1] + + (', 
while writing %s : %s' % (str(path), + str(data)))) def begin(self, path): self.getxs().transaction_start(path) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/xenstore/xsobj.py --- a/tools/python/xen/xend/xenstore/xsobj.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/xenstore/xsobj.py Thu Sep 22 17:42:01 2005 @@ -469,9 +469,6 @@ n = n._addChild(x) return n - def getDB(self): - return self.__db__ - def setDB(self, db): if (db is not None) and not isinstance(db, XenNode): raise ValueError("invalid db") diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/xenstore/xstransact.py --- a/tools/python/xen/xend/xenstore/xstransact.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/xenstore/xstransact.py Thu Sep 22 17:42:01 2005 @@ -41,7 +41,11 @@ def _read(self, key): path = "%s/%s" % (self.path, key) - return xshandle().read(path) + try: + return xshandle().read(path) + except RuntimeError, ex: + raise RuntimeError(ex.args[0], + '%s, while reading %s' % (ex.args[1], path)) def read(self, *args): if len(args) == 0: @@ -53,13 +57,16 @@ ret.append(self._read(key)) return ret - def _write(self, key, data, create=True, excl=False): - path = "%s/%s" % (self.path, key) - xshandle().write(path, data, create=create, excl=excl) + def _write(self, key, data): + path = "%s/%s" % (self.path, key) + try: + xshandle().write(path, data) + except RuntimeError, ex: + raise RuntimeError(ex.args[0], + ('%s, while writing %s : %s' % + (ex.args[1], path, str(data)))) def write(self, *args, **opts): - create = opts.get('create') or True - excl = opts.get('excl') or False if len(args) == 0: raise TypeError if isinstance(args[0], dict): @@ -67,15 +74,19 @@ if not isinstance(d, dict): raise TypeError for key in d.keys(): - self._write(key, d[key], create, excl) + try: + self._write(key, d[key]) + except TypeError, msg: + raise TypeError('Writing %s: %s: %s' % + (key, str(d[key]), msg)) elif isinstance(args[0], list): for l in args: if not len(l) == 2: raise TypeError 
- self._write(l[0], l[1], create, excl) + self._write(l[0], l[1]) elif len(args) % 2 == 0: for i in range(len(args) / 2): - self._write(args[i * 2], args[i * 2 + 1], create, excl) + self._write(args[i * 2], args[i * 2 + 1]) else: raise TypeError @@ -84,10 +95,15 @@ return xshandle().rm(path) def remove(self, *args): - if len(args) == 0: - raise TypeError - for key in args: - self._remove(key) + """If no arguments are given, remove this transaction's path. + Otherwise, treat each argument as a subpath to this transaction's + path, and remove each of those instead. + """ + if len(args) == 0: + xshandle().rm(self.path) + else: + for key in args: + self._remove(key) def _list(self, key): path = "%s/%s" % (self.path, key) @@ -114,10 +130,20 @@ defval = None else: (key, fn, defval) = tup - try: - val = fn(self._read(key)) - except TypeError: + + val = self._read(key) + # If fn is str, then this will successfully convert None to + # 'None'. If it is int, then it will throw TypeError on None, or + # on any other non-integer value. We have to, therefore, both + # check explicitly for None, and catch TypeError. Either failure + # will result in defval being used instead. + if val is None: val = defval + else: + try: + val = fn(val) + except TypeError: + val = defval ret.append(val) if len(ret) == 1: return ret[0] @@ -146,8 +172,8 @@ def Read(cls, path, *args): while True: - try: - t = cls(path) + t = cls(path) + try: v = t.read(*args) t.commit() return v @@ -165,8 +191,8 @@ def Write(cls, path, *args, **opts): while True: - try: - t = cls(path) + t = cls(path) + try: t.write(*args, **opts) t.commit() return @@ -183,9 +209,13 @@ Write = classmethod(Write) def Remove(cls, path, *args): - while True: - try: - t = cls(path) + """If only one argument is given (path), remove it. Otherwise, treat + each further argument as a subpath to the given path, and remove each + of those instead. This operation is performed inside a transaction. 
+ """ + while True: + t = cls(path) + try: t.remove(*args) t.commit() return @@ -203,8 +233,8 @@ def List(cls, path, *args): while True: - try: - t = cls(path) + t = cls(path) + try: v = t.list(*args) t.commit() return v @@ -222,8 +252,8 @@ def Gather(cls, path, *args): while True: - try: - t = cls(path) + t = cls(path) + try: v = t.gather(*args) t.commit() return v @@ -241,8 +271,8 @@ def Store(cls, path, *args): while True: - try: - t = cls(path) + t = cls(path) + try: v = t.store(*args) t.commit() return v diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/xenstore/xsutil.py --- a/tools/python/xen/xend/xenstore/xsutil.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/xenstore/xsutil.py Thu Sep 22 17:42:01 2005 @@ -18,3 +18,6 @@ def IntroduceDomain(domid, page, port, path): return xshandle().introduce_domain(domid, page, port, path) + +def GetDomainPath(domid): + return xshandle().get_domain_path(domid) diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/xenstore/xswatch.py --- a/tools/python/xen/xend/xenstore/xswatch.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/xenstore/xswatch.py Thu Sep 22 17:42:01 2005 @@ -1,4 +1,5 @@ # Copyright (C) 2005 Christian Limpach <Christian.Limpach@xxxxxxxxxxxx> +# Copyright (C) 2005 XenSource Ltd # This file is subject to the terms and conditions of the GNU General # Public License. 
See the file "COPYING" in the main directory of @@ -15,7 +16,7 @@ xs = None xslock = threading.Lock() - def __init__(self, path, fn, args=(), kwargs={}): + def __init__(self, path, fn, *args, **kwargs): self.fn = fn self.args = args self.kwargs = kwargs @@ -46,11 +47,11 @@ cls.threadcond.release() while True: try: - (ord, owr, oer) = select.select([ cls.xs ], [], []) + (fd, _1, _2) = select.select([ cls.xs ], [], []) cls.xslock.acquire() # reconfirm ready to read with lock - (ord, owr, oer) = select.select([ cls.xs ], [], [], 0.001) - if not cls.xs in ord: + (fd, _1, _2) = select.select([ cls.xs ], [], [], 0.001) + if not cls.xs in fd: cls.xslock.release() continue we = cls.xs.read_watch() diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xm/create.py --- a/tools/python/xen/xm/create.py Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xm/create.py Thu Sep 22 17:42:01 2005 @@ -109,7 +109,7 @@ The address of the vncviewer is passed to the domain on the kernel command line using 'VNC_SERVER=<host>:<port>'. The port used by vnc is 5500 + DISPLAY. A display value with a free port is chosen if possible. - Only valid when vnc=1. + Only valid when vnc=1. """) gopts.var('name', val='NAME', @@ -141,7 +141,7 @@ use="Domain memory in MB.") gopts.var('ssidref', val='SSIDREF', - fn=set_u32, default=-1, + fn=set_u32, default=0, use="Security Identifier.") gopts.var('maxmem', val='MEMORY', @@ -342,7 +342,7 @@ else: return s -def configure_image(opts, config, vals): +def configure_image(opts, vals): """Create the image config. """ config_image = [ vals.builder ] @@ -359,8 +359,7 @@ config_image.append(['args', vals.extra]) if vals.vcpus: config_image.append(['vcpus', vals.vcpus]) - config.append(['image', config_image ]) - + return config_image def configure_disks(opts, config_devs, vals): """Create the config for disks (virtual block devices). 
@@ -494,17 +493,17 @@ config_vfr.append(['vif', ['id', idx], ['ip', ip]]) config.append(config_vfr) -def configure_vmx(opts, config_devs, vals): +def configure_vmx(opts, config_image, vals): """Create the config for VMX devices. """ - args = [ 'memmap', 'device_model', 'cdrom', - 'boot', 'fda', 'fdb', 'localtime', 'serial', 'macaddr', 'stdvga', - 'isa', 'nographic', 'vnc', 'vncviewer', 'sdl', 'display'] + args = [ 'memmap', 'device_model', 'vcpus', 'cdrom', + 'boot', 'fda', 'fdb', 'localtime', 'serial', 'macaddr', 'stdvga', + 'isa', 'nographic', 'vnc', 'vncviewer', 'sdl', 'display'] for a in args: - if (vals.__dict__[a]): - config_devs.append([a, vals.__dict__[a]]) - -def run_bootloader(opts, config, vals): + if (vals.__dict__[a]): + config_image.append([a, vals.__dict__[a]]) + +def run_bootloader(opts, vals): if not os.access(vals.bootloader, os.X_OK): opts.err("Bootloader isn't executable") if len(vals.disk) < 1: @@ -512,11 +511,8 @@ (uname, dev, mode, backend) = vals.disk[0] file = blkif.blkdev_uname_to_file(uname) - blcfg = bootloader(vals.bootloader, file, not vals.console_autoconnect, - vals.vcpus, vals.blentry) - - config.append(['bootloader', vals.bootloader]) - config.append(blcfg) + return bootloader(vals.bootloader, file, not vals.console_autoconnect, + vals.vcpus, vals.blentry) def make_config(opts, vals): """Create the domain configuration. 
@@ -542,16 +538,19 @@ config.append(['restart', vals.restart]) if vals.bootloader: - run_bootloader(opts, config, vals) + config.append(['bootloader', vals.bootloader]) + config_image = run_bootloader(opts, vals) else: - configure_image(opts, config, vals) + config_image = configure_image(opts, vals) + configure_vmx(opts, config_image, vals) + config.append(['image', config_image ]) + config_devs = [] configure_disks(opts, config_devs, vals) configure_pci(opts, config_devs, vals) configure_vifs(opts, config_devs, vals) configure_usb(opts, config_devs, vals) configure_vtpm(opts, config_devs, vals) - configure_vmx(opts, config_devs, vals) config += config_devs return config @@ -673,7 +672,7 @@ # Local port is field 3. y = x.split()[3] # Field is addr:port, split off the port. - y = y.split(':')[1] + y = y.split(':')[-1] r.append(int(y)) return r diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm/Makefile --- a/tools/vtpm/Makefile Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm/Makefile Thu Sep 22 17:42:01 2005 @@ -4,7 +4,7 @@ include $(XEN_ROOT)/tools/vtpm/Rules.mk # Dir name for emulator (as dom0 tpm driver) -TPM_EMULATOR_DIR = tpm_emulator-0.2 +TPM_EMULATOR_DIR = tpm_emulator # Dir name for vtpm instance VTPM_DIR = vtpm @@ -13,7 +13,7 @@ all: build -build: $(TPM_EMULATOR_TARFILE) extract patch build_sub +build: $(TPM_EMULATOR_DIR) $(VTPM_DIR) build_sub install: build $(MAKE) -C $(TPM_EMULATOR_DIR) $@ @@ -26,36 +26,32 @@ if [ -d $(VTPM_DIR) ]; \ then $(MAKE) -C $(VTPM_DIR) clean; \ fi + +mrproper: + rm -f $(TPM_EMULATOR_TARFILE) rm -rf $(TPM_EMULATOR_DIR) rm -rf $(VTPM_DIR) - -mrproper: clean - rm -f $(TPM_EMULATOR_TARFILE) # Download Swiss emulator $(TPM_EMULATOR_TARFILE): wget http://download.berlios.de/tpm-emulator/$(TPM_EMULATOR_TARFILE) # Create vtpm and TPM emulator dirs -extract: $(TPM_EMULATOR_DIR)/README $(VTPM_DIR)/README - -$(TPM_EMULATOR_DIR)/README: - -rm -rf $(TPM_EMULATOR_DIR) - tar -xzf $(TPM_EMULATOR_TARFILE) - -$(VTPM_DIR)/README: - -rm -rf $(VTPM_DIR) - cp 
-r --preserve $(TPM_EMULATOR_DIR) $(VTPM_DIR) - # apply patches for 1) used as dom0 tpm driver 2) used as vtpm device instance -patch: $(TPM_EMULATOR_DIR)/Makefile $(VTPM_DIR)/Makefile - -$(TPM_EMULATOR_DIR)/Makefile: tpm_emulator.patch +$(TPM_EMULATOR_DIR): $(TPM_EMULATOR_TARFILE) + tar -xzf $(TPM_EMULATOR_TARFILE); + mv tpm_emulator-0.2 $(TPM_EMULATOR_DIR); + -cd $(TPM_EMULATOR_DIR); \ + patch -p1 < ../tpm_emulator-0.2b-x86_64.patch; \ patch -p1 <../tpm_emulator.patch -$(VTPM_DIR)/Makefile: vtpm.patch +$(VTPM_DIR): $(TPM_EMULATOR_TARFILE) + tar -xzf $(TPM_EMULATOR_TARFILE); + mv tpm_emulator-0.2 $(VTPM_DIR); + -cd $(VTPM_DIR); \ + patch -p1 < ../tpm_emulator-0.2b-x86_64.patch; \ patch -p1 <../vtpm.patch build_sub: diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm/README --- a/tools/vtpm/README Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm/README Thu Sep 22 17:42:01 2005 @@ -23,6 +23,7 @@ - xen-unstable - IBM frontend/backend vtpm driver patch - vtpm_managerd +- GNU MP Big number library (GMP) vtpmd Flow (for vtpm_manager. 
vtpmd never run by default) ============================ diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm/tpm_emulator.patch --- a/tools/vtpm/tpm_emulator.patch Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm/tpm_emulator.patch Thu Sep 22 17:42:01 2005 @@ -1,12 +1,12 @@ -diff -uprN orig/tpm_emulator-0.2/AUTHORS tpm_emulator-0.2/AUTHORS ---- orig/tpm_emulator-0.2/AUTHORS 2005-08-17 10:58:36.000000000 -0700 -+++ tpm_emulator-0.2/AUTHORS 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/AUTHORS tpm_emulator/AUTHORS +--- orig/tpm_emulator-0.2-x86_64/AUTHORS 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator/AUTHORS 2005-09-14 20:27:22.000000000 -0700 @@ -1 +1,2 @@ Mario Strasser <mast@xxxxxxx> +INTEL Corp <> -diff -uprN orig/tpm_emulator-0.2/ChangeLog tpm_emulator-0.2/ChangeLog ---- orig/tpm_emulator-0.2/ChangeLog 2005-08-17 10:58:36.000000000 -0700 -+++ tpm_emulator-0.2/ChangeLog 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/ChangeLog tpm_emulator/ChangeLog +--- orig/tpm_emulator-0.2-x86_64/ChangeLog 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator/ChangeLog 2005-09-14 20:27:22.000000000 -0700 @@ -1,3 +1,7 @@ +2005-08-16: INTEL Corp + * Set default permissions to PCRs @@ -15,10 +15,29 @@ 2005-08-15 Mario Strasser <mast@xxxxxxx> * all: some typos corrected * tpm_integrity.c: bug in TPM_Extend fixed -diff -uprN orig/tpm_emulator-0.2/Makefile tpm_emulator-0.2/Makefile ---- orig/tpm_emulator-0.2/Makefile 2005-08-17 10:58:36.000000000 -0700 -+++ tpm_emulator-0.2/Makefile 2005-08-17 10:55:52.000000000 -0700 -@@ -1,15 +1,19 @@ +diff -uprN orig/tpm_emulator-0.2-x86_64/linux_module.h tpm_emulator/linux_module.h +--- orig/tpm_emulator-0.2-x86_64/linux_module.h 2005-09-15 19:21:14.844078720 -0700 ++++ tpm_emulator/linux_module.h 2005-09-14 20:27:22.000000000 -0700 +@@ -1,5 +1,6 @@ + /* Software-Based Trusted Platform Module (TPM) Emulator for Linux + * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, ++ * Copyright (C) 
2005 INTEL Corp. + * + * This module is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published +@@ -35,7 +36,7 @@ + #include "tpm_version.h" + + #define TPM_DEVICE_MINOR 224 +-#define TPM_DEVICE_NAME "tpm" ++#define TPM_DEVICE_NAME "tpm0" + #define TPM_MODULE_NAME "tpm_emulator" + + /* debug and log output functions */ +diff -uprN orig/tpm_emulator-0.2-x86_64/Makefile tpm_emulator/Makefile +--- orig/tpm_emulator-0.2-x86_64/Makefile 2005-09-15 19:21:14.845078568 -0700 ++++ tpm_emulator/Makefile 2005-09-14 20:27:22.000000000 -0700 +@@ -1,16 +1,20 @@ # Software-Based Trusted Platform Module (TPM) Emulator for Linux # Copyright (C) 2004 Mario Strasser <mast@xxxxxxx> +# Copyright (C) 2005 INTEL Corp. @@ -33,6 +52,7 @@ -KERNEL_BUILD := /lib/modules/$(KERNEL_RELEASE)/build +KERNEL_BUILD := $(XEN_ROOT)/linux-2.6.12-xen0 MOD_SUBDIR := misc + COMPILE_ARCH ?= $(shell uname -m | sed -e s/i.86/x86_32/) # module settings -MODULE_NAME := tpm_emulator @@ -40,7 +60,7 @@ VERSION_MAJOR := 0 VERSION_MINOR := 2 VERSION_BUILD := $(shell date +"%s") -@@ -27,11 +30,9 @@ DIRS := . crypto tpm +@@ -34,11 +38,9 @@ DIRS := . 
crypto tpm SRCS := $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.c)) OBJS := $(patsubst %.c, %.o, $(SRCS)) SRCS += $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.h)) @@ -54,7 +74,7 @@ EXTRA_CFLAGS += -I$(src) -I$(src)/crypto -I$(src)/tpm -@@ -42,23 +43,17 @@ all: $(src)/crypto/gmp.h $(src)/crypto/l +@@ -49,23 +51,17 @@ all: $(src)/crypto/gmp.h $(src)/crypto/l @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) modules install: @@ -84,9 +104,9 @@ $(src)/crypto/libgmp.a: test -f $(src)/crypto/libgmp.a || ln -s $(GMP_LIB) $(src)/crypto/libgmp.a -diff -uprN orig/tpm_emulator-0.2/README tpm_emulator-0.2/README ---- orig/tpm_emulator-0.2/README 2005-08-17 10:58:36.000000000 -0700 -+++ tpm_emulator-0.2/README 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/README tpm_emulator/README +--- orig/tpm_emulator-0.2-x86_64/README 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator/README 2005-09-14 20:27:22.000000000 -0700 @@ -13,7 +13,8 @@ $Id: README 8 2005-01-25 21:11:45Z jmoli Copyright -------------------------------------------------------------------------- @@ -97,28 +117,9 @@ This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -diff -uprN orig/tpm_emulator-0.2/linux_module.h tpm_emulator-0.2/linux_module.h ---- orig/tpm_emulator-0.2/linux_module.h 2005-08-17 10:58:36.000000000 -0700 -+++ tpm_emulator-0.2/linux_module.h 2005-08-17 10:55:52.000000000 -0700 -@@ -1,5 +1,6 @@ - /* Software-Based Trusted Platform Module (TPM) Emulator for Linux - * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, -+ * Copyright (C) 2005 INTEL Corp. 
- * - * This module is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published -@@ -33,7 +34,7 @@ - #include "tpm_version.h" - - #define TPM_DEVICE_MINOR 224 --#define TPM_DEVICE_NAME "tpm" -+#define TPM_DEVICE_NAME "tpm0" - #define TPM_MODULE_NAME "tpm_emulator" - - /* debug and log output functions */ -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_data.c tpm_emulator-0.2/tpm/tpm_data.c ---- orig/tpm_emulator-0.2/tpm/tpm_data.c 2005-08-17 10:58:36.000000000 -0700 -+++ tpm_emulator-0.2/tpm/tpm_data.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_data.c tpm_emulator/tpm/tpm_data.c +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_data.c 2005-09-15 19:21:14.847078264 -0700 ++++ tpm_emulator/tpm/tpm_data.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -139,13 +140,3 @@ tpmData.permanent.data.pcrAttrib[i].pcrReset = TRUE; } /* set tick type */ -diff -uprN orig/tpm_emulator-0.2/tpm_version.h tpm_emulator-0.2/tpm_version.h ---- orig/tpm_emulator-0.2/tpm_version.h 2005-08-17 10:58:36.000000000 -0700 -+++ tpm_emulator-0.2/tpm_version.h 2005-08-17 10:55:53.000000000 -0700 -@@ -2,5 +2,5 @@ - #define _TPM_VERSION_H_ - #define VERSION_MAJOR 0 - #define VERSION_MINOR 2 --#define VERSION_BUILD 1123950310 -+#define VERSION_BUILD 1124301353 - #endif /* _TPM_VERSION_H_ */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm/vtpm.patch --- a/tools/vtpm/vtpm.patch Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm/vtpm.patch Thu Sep 22 17:42:01 2005 @@ -1,12 +1,12 @@ -diff -uprN orig/tpm_emulator-0.2/AUTHORS vtpm/AUTHORS ---- orig/tpm_emulator-0.2/AUTHORS 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/AUTHORS 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/AUTHORS vtpm/AUTHORS +--- orig/tpm_emulator-0.2-x86_64/AUTHORS 2005-08-15 00:58:57.000000000 -0700 
++++ vtpm/AUTHORS 2005-09-14 20:27:22.000000000 -0700 @@ -1 +1,2 @@ Mario Strasser <mast@xxxxxxx> +INTEL Corp <> -diff -uprN orig/tpm_emulator-0.2/ChangeLog vtpm/ChangeLog ---- orig/tpm_emulator-0.2/ChangeLog 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/ChangeLog 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/ChangeLog vtpm/ChangeLog +--- orig/tpm_emulator-0.2-x86_64/ChangeLog 2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/ChangeLog 2005-09-14 20:27:22.000000000 -0700 @@ -1,3 +1,7 @@ +2005-08-16 Intel Corp + Moved module out of kernel to run as a ring 3 app @@ -15,115 +15,9 @@ 2005-08-15 Mario Strasser <mast@xxxxxxx> * all: some typos corrected * tpm_integrity.c: bug in TPM_Extend fixed -diff -uprN orig/tpm_emulator-0.2/Makefile vtpm/Makefile ---- orig/tpm_emulator-0.2/Makefile 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/Makefile 2005-08-17 10:55:52.000000000 -0700 -@@ -1,21 +1,29 @@ - # Software-Based Trusted Platform Module (TPM) Emulator for Linux - # Copyright (C) 2004 Mario Strasser <mast@xxxxxxx> -+# Copyright (C) 2005 INTEL Corp. - # - # $Id: Makefile 10 2005-04-26 20:59:50Z mast $ - --# kernel settings --KERNEL_RELEASE := $(shell uname -r) --KERNEL_BUILD := /lib/modules/$(KERNEL_RELEASE)/build --MOD_SUBDIR := misc -- - # module settings --MODULE_NAME := tpm_emulator -+BIN := vtpmd - VERSION_MAJOR := 0 - VERSION_MINOR := 2 - VERSION_BUILD := $(shell date +"%s") - --# enable/disable DEBUG messages --EXTRA_CFLAGS += -DDEBUG -g -+# Installation program and options -+INSTALL = install -+INSTALL_PROG = $(INSTALL) -m0755 -+INSTALL_DIR = $(INSTALL) -d -m0755 -+ -+# Xen tools installation directory -+TOOLS_INSTALL_DIR = $(DESTDIR)/usr/bin -+ -+CC := gcc -+CFLAGS += -g -Wall $(INCLUDE) -DDEBUG -+CFLAGS += -I. -Itpm -+ -+# Is the simulator running in it's own vm? -+#CFLAGS += -DVTPM_MULTI_VM - - # GNU MP configuration - GMP_LIB := /usr/lib/libgmp.a -@@ -27,38 +35,31 @@ DIRS := . 
crypto tpm - SRCS := $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.c)) - OBJS := $(patsubst %.c, %.o, $(SRCS)) - SRCS += $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.h)) --DISTSRC := ./README ./AUTHORS ./ChangeLog ./Makefile $(SRCS) --DISTDIR := tpm_emulator-$(VERSION_MAJOR).$(VERSION_MINOR) - --obj-m := $(MODULE_NAME).o --$(MODULE_NAME)-objs := $(patsubst $(src)/%.o, %.o, $(OBJS)) crypto/libgmp.a -+obj-m := $(BIN) -+$(BIN)-objs := $(patsubst $(src)/%.o, %.o, $(OBJS)) crypto/libgmp.a - - EXTRA_CFLAGS += -I$(src) -I$(src)/crypto -I$(src)/tpm - - # do not print "Entering directory ..." - MAKEFLAGS += --no-print-directory - --all: $(src)/crypto/gmp.h $(src)/crypto/libgmp.a version -- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) modules -+all: $(BIN) -+ -+$(BIN): $(src)/crypto/gmp.h $(src)/crypto/libgmp.a version $(SRCS) $(OBJS) -+ $(CC) $(CFLAGS) $(OBJS) $(src)/crypto/libgmp.a -o $(BIN) -+ -+%.o: %.c -+ $(CC) $(CFLAGS) -c $< -o $@ - - install: -- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) modules_install -- test -d /var/tpm || mkdir /var/tpm -- test -c /dev/tpm || mknod /dev/tpm c 10 224 -- chmod 666 /dev/tpm -- depmod -a -+ $(INSTALL_PROG) $(BIN) $(TOOLS_INSTALL_DIR) - - clean: -- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) clean -- rm -f $(src)/crypto/gmp.h $(src)/crypto/libgmp.a -+ rm -f $(src)/crypto/gmp.h $(src)/crypto/libgmp.a $(OBJS) - --dist: $(DISTSRC) -- rm -rf $(DISTDIR) -- mkdir $(DISTDIR) -- cp --parents $(DISTSRC) $(DISTDIR)/ -- rm -f $(DISTDIR)/crypto/gmp.h -- tar -chzf $(DISTDIR).tar.gz $(DISTDIR) -- rm -rf $(DISTDIR) -+mrproper: clean -+ rm -f $(BIN) - - $(src)/crypto/libgmp.a: - test -f $(src)/crypto/libgmp.a || ln -s $(GMP_LIB) $(src)/crypto/libgmp.a -diff -uprN orig/tpm_emulator-0.2/README vtpm/README ---- orig/tpm_emulator-0.2/README 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/README 2005-08-17 10:55:52.000000000 -0700 -@@ -13,7 +13,8 @@ $Id: README 8 2005-01-25 21:11:45Z jmoli - Copyright - 
-------------------------------------------------------------------------- - Copyright (C) 2004 Mario Strasser <mast@xxxxxxx> and Swiss Federal --Institute of Technology (ETH) Zurich. -+ Institute of Technology (ETH) Zurich. -+Copyright (C) 2005 INTEL Corp - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by -diff -uprN orig/tpm_emulator-0.2/crypto/gmp_kernel_wrapper.c vtpm/crypto/gmp_kernel_wrapper.c ---- orig/tpm_emulator-0.2/crypto/gmp_kernel_wrapper.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/crypto/gmp_kernel_wrapper.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/crypto/gmp_kernel_wrapper.c vtpm/crypto/gmp_kernel_wrapper.c +--- orig/tpm_emulator-0.2-x86_64/crypto/gmp_kernel_wrapper.c 2005-09-15 19:21:42.508873032 -0700 ++++ vtpm/crypto/gmp_kernel_wrapper.c 2005-09-15 19:25:37.319176440 -0700 @@ -1,5 +1,6 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -154,9 +48,9 @@ { - void *ret = (void*)kmalloc(size, GFP_KERNEL); - if (!ret) panic(KERN_CRIT TPM_MODULE_NAME -- "GMP: cannot allocate memory (size=%u)\n", size); +- "GMP: cannot allocate memory (size=%Zu)\n", size); + void *ret = (void*)malloc(size); -+ if (!ret) error("GMP: cannot allocate memory (size=%u)\n", size); ++ if (!ret) error("GMP: cannot allocate memory (size=%Zu)\n", size); return ret; } @@ -165,9 +59,10 @@ { - void *ret = (void*)kmalloc(new_size, GFP_KERNEL); - if (!ret) panic(KERN_CRIT TPM_MODULE_NAME "GMP: Cannot reallocate memory " +- "(old_size=%Zu new_size=%Zu)\n", old_size, new_size); + void *ret = (void*)malloc(new_size); + if (!ret) error("GMP: Cannot reallocate memory " - "(old_size=%u new_size=%u)\n", old_size, new_size); ++ "(old_size=%Zu new_size=%Zu)\n", old_size, new_size); memcpy(ret, oldptr, old_size); - kfree(oldptr); + free(oldptr); @@ -183,9 +78,9 @@ } } -diff -uprN 
orig/tpm_emulator-0.2/crypto/rsa.c vtpm/crypto/rsa.c ---- orig/tpm_emulator-0.2/crypto/rsa.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/crypto/rsa.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/crypto/rsa.c vtpm/crypto/rsa.c +--- orig/tpm_emulator-0.2-x86_64/crypto/rsa.c 2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/crypto/rsa.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,5 +1,6 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -211,8 +106,8 @@ sha1_final(&ctx, &msg[1]); if (memcmp(&msg[1], &msg[1 + SHA1_DIGEST_LENGTH], SHA1_DIGEST_LENGTH) != 0) return -1; -diff -uprN orig/tpm_emulator-0.2/linux_module.c vtpm/linux_module.c ---- orig/tpm_emulator-0.2/linux_module.c 2005-08-17 10:58:36.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/linux_module.c vtpm/linux_module.c +--- orig/tpm_emulator-0.2-x86_64/linux_module.c 2005-09-15 19:22:40.343080896 -0700 +++ vtpm/linux_module.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,163 +0,0 @@ -/* Software-Based Trusted Platform Module (TPM) Emulator for Linux @@ -283,7 +178,7 @@ - -static ssize_t tpm_read(struct file *file, char *buf, size_t count, loff_t *ppos) -{ -- debug("%s(%d)", __FUNCTION__, count); +- debug("%s(%Zu)", __FUNCTION__, count); - down(&tpm_mutex); - if (tpm_response.data != NULL) { - count = min(count, (size_t)tpm_response.size - (size_t)*ppos); @@ -298,7 +193,7 @@ - -static ssize_t tpm_write(struct file *file, const char *buf, size_t count, loff_t *ppos) -{ -- debug("%s(%d)", __FUNCTION__, count); +- debug("%s(%Zu)", __FUNCTION__, count); - down(&tpm_mutex); - *ppos = 0; - if (tpm_response.data != NULL) kfree(tpm_response.data); @@ -378,9 +273,9 @@ - return (ticks > 0) ? 
ticks : 1; -} - -diff -uprN orig/tpm_emulator-0.2/linux_module.h vtpm/linux_module.h ---- orig/tpm_emulator-0.2/linux_module.h 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/linux_module.h 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/linux_module.h vtpm/linux_module.h +--- orig/tpm_emulator-0.2-x86_64/linux_module.h 2005-09-15 19:21:14.844078720 -0700 ++++ vtpm/linux_module.h 2005-09-14 20:27:22.000000000 -0700 @@ -1,5 +1,6 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -416,17 +311,20 @@ +/* module settings */ +#define min(A,B) ((A)<(B)?(A):(B)) + #ifndef STR #define STR(s) __STR__(s) #define __STR__(s) #s - #include "tpm_version.h" -@@ -39,32 +45,35 @@ +@@ -39,34 +45,38 @@ + #define TPM_MODULE_NAME "tpm_emulator" + /* debug and log output functions */ ++extern int dmi_id; #ifdef DEBUG -#define debug(fmt, ...) printk(KERN_DEBUG "%s %s:%d: Debug: " fmt "\n", \ - TPM_MODULE_NAME, __FILE__, __LINE__, ## __VA_ARGS__) -+#define debug(fmt, ...) printf("%s:%d: Debug: " fmt "\n", \ -+ __FILE__, __LINE__, ## __VA_ARGS__) ++#define debug(fmt, ...) printf("TPMD[%d]: %s:%d: Debug: " fmt "\n", \ ++ dmi_id, __FILE__, __LINE__, ## __VA_ARGS__) #else #define debug(fmt, ...) #endif @@ -436,12 +334,12 @@ - TPM_MODULE_NAME, __FILE__, __LINE__, ## __VA_ARGS__) -#define alert(fmt, ...) printk(KERN_ALERT "%s %s:%d: Alert: " fmt "\n", \ - TPM_MODULE_NAME, __FILE__, __LINE__, ## __VA_ARGS__) -+#define info(fmt, ...) printf("%s:%d: Info: " fmt "\n", \ -+ __FILE__, __LINE__, ## __VA_ARGS__) -+#define error(fmt, ...) printf("%s:%d: Error: " fmt "\n", \ -+ __FILE__, __LINE__, ## __VA_ARGS__) -+#define alert(fmt, ...) printf("%s:%d: Alert: " fmt "\n", \ -+ __FILE__, __LINE__, ## __VA_ARGS__) ++#define info(fmt, ...) printf("TPMD[%d]: %s:%d: Info: " fmt "\n", \ ++ dmi_id, __FILE__, __LINE__, ## __VA_ARGS__) ++#define error(fmt, ...) 
printf("TPMD[%d]: %s:%d: Error: " fmt "\n", \ ++ dmi_id, __FILE__, __LINE__, ## __VA_ARGS__) ++#define alert(fmt, ...) printf("TPMD[%d]: %s:%d: Alert: " fmt "\n", \ ++ dmi_id, __FILE__, __LINE__, ## __VA_ARGS__) /* memory allocation */ @@ -465,7 +363,7 @@ static inline void tpm_get_random_bytes(void *buf, int nbytes) { get_random_bytes(buf, nbytes); -@@ -84,9 +93,9 @@ uint64_t tpm_get_ticks(void); +@@ -86,9 +96,9 @@ uint64_t tpm_get_ticks(void); #define CPU_TO_LE16(x) __cpu_to_le16(x) #define BE64_TO_CPU(x) __be64_to_cpu(x) @@ -477,9 +375,116 @@ #define BE16_TO_CPU(x) __be16_to_cpu(x) #define LE16_TO_CPU(x) __le16_to_cpu(x) -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_audit.c vtpm/tpm/tpm_audit.c ---- orig/tpm_emulator-0.2/tpm/tpm_audit.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_audit.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/Makefile vtpm/Makefile +--- orig/tpm_emulator-0.2-x86_64/Makefile 2005-09-15 19:21:14.845078568 -0700 ++++ vtpm/Makefile 2005-09-14 20:27:22.000000000 -0700 +@@ -1,22 +1,31 @@ + # Software-Based Trusted Platform Module (TPM) Emulator for Linux + # Copyright (C) 2004 Mario Strasser <mast@xxxxxxx> ++# Copyright (C) 2005 INTEL Corp. + # + # $Id: Makefile 10 2005-04-26 20:59:50Z mast $ + +-# kernel settings +-KERNEL_RELEASE := $(shell uname -r) +-KERNEL_BUILD := /lib/modules/$(KERNEL_RELEASE)/build +-MOD_SUBDIR := misc + COMPILE_ARCH ?= $(shell uname -m | sed -e s/i.86/x86_32/) + + # module settings +-MODULE_NAME := tpm_emulator ++BIN := vtpmd + VERSION_MAJOR := 0 + VERSION_MINOR := 2 + VERSION_BUILD := $(shell date +"%s") + +-# enable/disable DEBUG messages +-EXTRA_CFLAGS += -DDEBUG -g ++# Installation program and options ++INSTALL = install ++INSTALL_PROG = $(INSTALL) -m0755 ++INSTALL_DIR = $(INSTALL) -d -m0755 ++ ++# Xen tools installation directory ++TOOLS_INSTALL_DIR = $(DESTDIR)/usr/bin ++ ++CC := gcc ++CFLAGS += -g -Wall $(INCLUDE) -DDEBUG ++CFLAGS += -I. 
-Itpm ++ ++# Is the simulator running in it's own vm? ++#CFLAGS += -DVTPM_MULTI_VM + + ifeq ($(COMPILE_ARCH),x86_64) + LIBDIR = lib64 +@@ -34,38 +43,31 @@ DIRS := . crypto tpm + SRCS := $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.c)) + OBJS := $(patsubst %.c, %.o, $(SRCS)) + SRCS += $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.h)) +-DISTSRC := ./README ./AUTHORS ./ChangeLog ./Makefile $(SRCS) +-DISTDIR := tpm_emulator-$(VERSION_MAJOR).$(VERSION_MINOR) + +-obj-m := $(MODULE_NAME).o +-$(MODULE_NAME)-objs := $(patsubst $(src)/%.o, %.o, $(OBJS)) crypto/libgmp.a ++obj-m := $(BIN) ++$(BIN)-objs := $(patsubst $(src)/%.o, %.o, $(OBJS)) crypto/libgmp.a + + EXTRA_CFLAGS += -I$(src) -I$(src)/crypto -I$(src)/tpm + + # do not print "Entering directory ..." + MAKEFLAGS += --no-print-directory + +-all: $(src)/crypto/gmp.h $(src)/crypto/libgmp.a version +- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) modules ++all: $(BIN) ++ ++$(BIN): $(src)/crypto/gmp.h $(src)/crypto/libgmp.a version $(SRCS) $(OBJS) ++ $(CC) $(CFLAGS) $(OBJS) $(src)/crypto/libgmp.a -o $(BIN) ++ ++%.o: %.c ++ $(CC) $(CFLAGS) -c $< -o $@ + + install: +- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) modules_install +- test -d /var/tpm || mkdir /var/tpm +- test -c /dev/tpm || mknod /dev/tpm c 10 224 +- chmod 666 /dev/tpm +- depmod -a ++ $(INSTALL_PROG) $(BIN) $(TOOLS_INSTALL_DIR) + + clean: +- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) clean +- rm -f $(src)/crypto/gmp.h $(src)/crypto/libgmp.a ++ rm -f $(src)/crypto/gmp.h $(src)/crypto/libgmp.a $(OBJS) + +-dist: $(DISTSRC) +- rm -rf $(DISTDIR) +- mkdir $(DISTDIR) +- cp --parents $(DISTSRC) $(DISTDIR)/ +- rm -f $(DISTDIR)/crypto/gmp.h +- tar -chzf $(DISTDIR).tar.gz $(DISTDIR) +- rm -rf $(DISTDIR) ++mrproper: clean ++ rm -f $(BIN) tpm_version.h + + $(src)/crypto/libgmp.a: + test -f $(src)/crypto/libgmp.a || ln -s $(GMP_LIB) $(src)/crypto/libgmp.a +diff -uprN orig/tpm_emulator-0.2-x86_64/README vtpm/README +--- orig/tpm_emulator-0.2-x86_64/README 2005-08-15 
00:58:57.000000000 -0700 ++++ vtpm/README 2005-09-14 20:27:22.000000000 -0700 +@@ -13,7 +13,8 @@ $Id: README 8 2005-01-25 21:11:45Z jmoli + Copyright + -------------------------------------------------------------------------- + Copyright (C) 2004 Mario Strasser <mast@xxxxxxx> and Swiss Federal +-Institute of Technology (ETH) Zurich. ++ Institute of Technology (ETH) Zurich. ++Copyright (C) 2005 INTEL Corp + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_audit.c vtpm/tpm/tpm_audit.c +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_audit.c 2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/tpm/tpm_audit.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -542,9 +547,9 @@ return TPM_SUCCESS; } - -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_authorization.c vtpm/tpm/tpm_authorization.c ---- orig/tpm_emulator-0.2/tpm/tpm_authorization.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_authorization.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_authorization.c vtpm/tpm/tpm_authorization.c +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_authorization.c 2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/tpm/tpm_authorization.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -568,9 +573,9 @@ } - - -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_capability.c vtpm/tpm/tpm_capability.c ---- orig/tpm_emulator-0.2/tpm/tpm_capability.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_capability.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_capability.c vtpm/tpm/tpm_capability.c +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_capability.c 
2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/tpm/tpm_capability.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -593,9 +598,9 @@ } } - -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_cmd_handler.c vtpm/tpm/tpm_cmd_handler.c ---- orig/tpm_emulator-0.2/tpm/tpm_cmd_handler.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_cmd_handler.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_cmd_handler.c vtpm/tpm/tpm_cmd_handler.c +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_cmd_handler.c 2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/tpm/tpm_cmd_handler.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -658,9 +663,9 @@ return 0; } - -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_crypto.c vtpm/tpm/tpm_crypto.c ---- orig/tpm_emulator-0.2/tpm/tpm_crypto.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_crypto.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_crypto.c vtpm/tpm/tpm_crypto.c +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_crypto.c 2005-09-15 19:21:14.846078416 -0700 ++++ vtpm/tpm/tpm_crypto.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -678,14 +683,14 @@ memcpy(&buf[30], areaToSign, areaToSignSize); if (rsa_sign(&key->key, RSA_SSA_PKCS1_SHA1, buf, areaToSignSize + 30, *sig)) { -@@ -379,4 +380,3 @@ TPM_RESULT TPM_CertifyKey2(TPM_KEY_HANDL +@@ -383,4 +384,3 @@ TPM_RESULT TPM_CertifyKey2(TPM_KEY_HANDL } return TPM_SUCCESS; } - -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_data.c vtpm/tpm/tpm_data.c ---- orig/tpm_emulator-0.2/tpm/tpm_data.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_data.c 2005-08-17 10:55:52.000000000 
-0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_data.c vtpm/tpm/tpm_data.c +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_data.c 2005-09-15 19:21:14.847078264 -0700 ++++ vtpm/tpm/tpm_data.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -1005,7 +1010,7 @@ } #else -@@ -231,7 +431,6 @@ int tpm_restore_permanent_data(void) +@@ -232,7 +432,6 @@ int tpm_restore_permanent_data(void) int tpm_erase_permanent_data(void) { @@ -1014,9 +1019,9 @@ return res; } - -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_deprecated.c vtpm/tpm/tpm_deprecated.c ---- orig/tpm_emulator-0.2/tpm/tpm_deprecated.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_deprecated.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_deprecated.c vtpm/tpm/tpm_deprecated.c +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_deprecated.c 2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/tpm/tpm_deprecated.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -1043,9 +1048,9 @@ authContextSize, &contextBlob); if (res != TPM_SUCCESS) return res; len = *authContextSize; -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_emulator.h vtpm/tpm/tpm_emulator.h ---- orig/tpm_emulator-0.2/tpm/tpm_emulator.h 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_emulator.h 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_emulator.h vtpm/tpm/tpm_emulator.h +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_emulator.h 2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/tpm/tpm_emulator.h 2005-09-14 20:27:22.000000000 -0700 @@ -1,5 +1,6 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -1063,9 +1068,9 @@ /** * tpm_emulator_init - initialises and starts the TPM emulator 
-diff -uprN orig/tpm_emulator-0.2/tpm/tpm_integrity.c vtpm/tpm/tpm_integrity.c ---- orig/tpm_emulator-0.2/tpm/tpm_integrity.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_integrity.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_integrity.c vtpm/tpm/tpm_integrity.c +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_integrity.c 2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/tpm/tpm_integrity.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -1079,9 +1084,9 @@ return TPM_SUCCESS; } - -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_structures.h vtpm/tpm/tpm_structures.h ---- orig/tpm_emulator-0.2/tpm/tpm_structures.h 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_structures.h 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_structures.h vtpm/tpm/tpm_structures.h +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_structures.h 2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/tpm/tpm_structures.h 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -1099,9 +1104,9 @@ #include "crypto/rsa.h" /* -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_testing.c vtpm/tpm/tpm_testing.c ---- orig/tpm_emulator-0.2/tpm/tpm_testing.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_testing.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_testing.c vtpm/tpm/tpm_testing.c +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_testing.c 2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/tpm/tpm_testing.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -1217,9 +1222,9 @@ rsa_private_key_t priv_key; rsa_public_key_t pub_key; -diff -uprN 
orig/tpm_emulator-0.2/tpm/tpm_ticks.c vtpm/tpm/tpm_ticks.c ---- orig/tpm_emulator-0.2/tpm/tpm_ticks.c 2005-08-17 10:58:36.000000000 -0700 -+++ vtpm/tpm/tpm_ticks.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_ticks.c vtpm/tpm/tpm_ticks.c +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_ticks.c 2005-08-15 00:58:57.000000000 -0700 ++++ vtpm/tpm/tpm_ticks.c 2005-09-14 20:27:22.000000000 -0700 @@ -1,6 +1,7 @@ /* Software-Based Trusted Platform Module (TPM) Emulator for Linux * Copyright (C) 2004 Mario Strasser <mast@xxxxxxx>, @@ -1302,9 +1307,9 @@ } -diff -uprN orig/tpm_emulator-0.2/tpm/vtpm_manager.h vtpm/tpm/vtpm_manager.h ---- orig/tpm_emulator-0.2/tpm/vtpm_manager.h 1969-12-31 16:00:00.000000000 -0800 -+++ vtpm/tpm/vtpm_manager.h 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/vtpm_manager.h vtpm/tpm/vtpm_manager.h +--- orig/tpm_emulator-0.2-x86_64/tpm/vtpm_manager.h 1969-12-31 16:00:00.000000000 -0800 ++++ vtpm/tpm/vtpm_manager.h 2005-09-14 20:27:22.000000000 -0700 @@ -0,0 +1,126 @@ +// =================================================================== +// @@ -1432,9 +1437,9 @@ +*********************************************************************/ + +#endif //_VTPM_MANAGER_H_ -diff -uprN orig/tpm_emulator-0.2/tpmd.c vtpm/tpmd.c ---- orig/tpm_emulator-0.2/tpmd.c 1969-12-31 16:00:00.000000000 -0800 -+++ vtpm/tpmd.c 2005-08-17 10:55:52.000000000 -0700 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpmd.c vtpm/tpmd.c +--- orig/tpm_emulator-0.2-x86_64/tpmd.c 1969-12-31 16:00:00.000000000 -0800 ++++ vtpm/tpmd.c 2005-09-15 19:28:55.783005352 -0700 @@ -0,0 +1,207 @@ +/* Software-Based Trusted Platform Module (TPM) Emulator for Linux + * Copyright (C) 2005 INTEL Corp @@ -1468,9 +1473,9 @@ +#else + #define GUEST_RX_FIFO_D "/var/vtpm/fifos/guest-to-%d.fifo" + #define GUEST_TX_FIFO "/var/vtpm/fifos/guest-from-all.fifo" ++#endif + + int dmi_id; -+#endif + +#define BUFFER_SIZE 2048 + @@ -1506,7 +1511,7 @@ +{ + 
uint8_t in[BUFFER_SIZE], *out, *addressed_out; + uint32_t out_size; -+ int in_size, written ; ++ int in_size, written; + int i, guest_id=-1; + + int vtpm_tx_fh=-1, vtpm_rx_fh=-1; @@ -1602,7 +1607,7 @@ + written = write(vtpm_tx_fh, ctrl_msg, sizeof(ctrl_msg)); + + if (written != sizeof(ctrl_msg)) { -+ printf("ERROR: Part of response not written %d/%d.\n", written, sizeof(ctrl_msg)); ++ printf("ERROR: Part of response not written %d/%Zu.\n", written, sizeof(ctrl_msg)); + } else { + printf("Send Ctrl Message confermation\n"); + } @@ -1623,7 +1628,7 @@ + printf("%x ", addressed_out[i]); + printf("\n"); + } else { -+ printf("Sent[%d]: ", out_size + sizeof(uint32_t)); ++ printf("Sent[%Zu]: ", out_size + sizeof(uint32_t)); + for (i=0; i< out_size+ sizeof(uint32_t); i++) + printf("%x ", addressed_out[i]); + printf("\n"); diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/README --- a/tools/vtpm_manager/README Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/README Thu Sep 22 17:42:01 2005 @@ -51,14 +51,24 @@ DUMMY_BACKEND -> vtpm_manager listens on /tmp/in.fifo and /tmp/out.fifo rather than backend -MANUAL_DM_LAUNCH -> User must manually launch & kill VTPMs +MANUAL_DM_LAUNCH -> Must manually launch & kill VTPMs -USE_FIXED_SRK_AUTH -> Do not randomly generate a random SRK & Owner auth +WELL_KNOWN_SRK_AUTH -> Rather than randomly generating the password for the SRK, + use a well known value. This is necessary for sharing use + of the SRK across applications. Such as VTPM and Dom0 + measurement software. + +WELL_KNOWN_OWNER_AUTH -> Rather than randomly generating the password for the owner, + use a well known value. This is useful for debugging and for + poor bios which do not support clearing TPM if OwnerAuth is + lost. 
However this has no protection from malicious app + issuing a TPM_OwnerClear to wipe the TPM Requirements ============ - xen-unstable -- IBM frontend/backend vtpm driver patch +- vtpm frontend/backend driver patch +- OpenSSL Library Single-VM Flow ============================ diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/Rules.mk --- a/tools/vtpm_manager/Rules.mk Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/Rules.mk Thu Sep 22 17:42:01 2005 @@ -57,7 +57,8 @@ #CFLAGS += -DMANUAL_DM_LAUNCH # Fixed SRK -CFLAGS += -DUSE_FIXED_SRK_AUTH +CFLAGS += -DWELL_KNOWN_SRK_AUTH +#CFLAGS += -DWELL_KNOWN_OWNER_AUTH # TPM Hardware Device or TPM Simulator #CFLAGS += -DTPM_HWDEV diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/crypto/Makefile --- a/tools/vtpm_manager/crypto/Makefile Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/crypto/Makefile Thu Sep 22 17:42:01 2005 @@ -13,6 +13,7 @@ rm -f *.a *.so *.o *.rpm $(DEP_FILES) mrproper: clean + rm -f *~ $(BIN): $(OBJS) $(AR) rcs $(BIN) $(OBJS) diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/manager/Makefile --- a/tools/vtpm_manager/manager/Makefile Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/manager/Makefile Thu Sep 22 17:42:01 2005 @@ -17,7 +17,7 @@ rm -f *.a *.so *.o *.rpm $(DEP_FILES) mrproper: clean - rm -f $(BIN) + rm -f $(BIN) *~ $(BIN): $(OBJS) $(CC) $(LDFLAGS) $^ $(LIBS) -o $@ diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/manager/dmictl.c --- a/tools/vtpm_manager/manager/dmictl.c Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/manager/dmictl.c Thu Sep 22 17:42:01 2005 @@ -1,339 +1,344 @@ -// =================================================================== -// -// Copyright (c) 2005, Intel Corp. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Intel Corporation nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -// OF THE POSSIBILITY OF SUCH DAMAGE. 
-// =================================================================== -// -// dmictl.c -// -// Functions for creating and destroying DMIs -// -// ================================================================== - -#include <stdio.h> -#include <unistd.h> -#include <string.h> - -#ifndef VTPM_MUTLI_VM - #include <sys/types.h> - #include <sys/stat.h> - #include <fcntl.h> - #include <signal.h> - #include <wait.h> -#endif - -#include "vtpmpriv.h" -#include "bsg.h" -#include "buffer.h" -#include "log.h" -#include "hashtable.h" -#include "hashtable_itr.h" - -#define TPM_EMULATOR_PATH "/usr/bin/vtpmd" - -TPM_RESULT close_dmi( VTPM_DMI_RESOURCE *dmi_res) { - TPM_RESULT status = TPM_FAIL; - - if (dmi_res == NULL) - return TPM_SUCCESS; - - status = TCS_CloseContext(dmi_res->TCSContext); - free ( dmi_res->NVMLocation ); - dmi_res->connected = FALSE; - -#ifndef VTPM_MULTI_VM - free(dmi_res->guest_tx_fname); - free(dmi_res->vtpm_tx_fname); - - close(dmi_res->guest_tx_fh); dmi_res->guest_tx_fh = -1; - close(dmi_res->vtpm_tx_fh); dmi_res->vtpm_tx_fh = -1; - - - #ifndef MANUAL_DM_LAUNCH - if (dmi_res->dmi_id != VTPM_CTL_DM) { - if (dmi_res->dmi_pid != 0) { - vtpmloginfo(VTPM_LOG_VTPM, "Killing dmi on pid %d.\n", dmi_res->dmi_pid); - if ((kill(dmi_res->dmi_pid, SIGKILL) !=0) || - (waitpid(dmi_res->dmi_pid, NULL, 0) != dmi_res->dmi_pid)){ - vtpmlogerror(VTPM_LOG_VTPM, "Could not kill dmi on pid %d.\n", dmi_res->dmi_pid); - status = TPM_FAIL; - } - } else - vtpmlogerror(VTPM_LOG_VTPM, "Could not kill dmi because it's pid was 0.\n"); - } - #endif -#endif - - return status; -} - -TPM_RESULT VTPM_Handle_New_DMI( const buffer_t *param_buf) { - - VTPM_DMI_RESOURCE *new_dmi=NULL; - TPM_RESULT status=TPM_FAIL; - BYTE type; - UINT32 dmi_id, domain_id, *dmi_id_key; - int fh; - -#ifndef VTPM_MUTLI_VM - char dmi_id_str[11]; // UINT32s are up to 10 digits + NULL - struct stat file_info; -#endif - - if (param_buf == NULL) { // Assume creation of Dom 0 control - type = 0; - domain_id = 
VTPM_CTL_DM; - dmi_id = VTPM_CTL_DM; - } else if (buffer_len(param_buf) != sizeof(BYTE) + sizeof(UINT32) *2) { - vtpmloginfo(VTPM_LOG_VTPM, "New DMI command wrong length: %d.\n", buffer_len(param_buf)); - status = TPM_BAD_PARAMETER; - goto abort_egress; - } else { - BSG_UnpackList( param_buf->bytes, 3, - BSG_TYPE_BYTE, &type, - BSG_TYPE_UINT32, &domain_id, - BSG_TYPE_UINT32, &dmi_id); - } - - new_dmi = (VTPM_DMI_RESOURCE *) hashtable_search(vtpm_globals->dmi_map, &dmi_id); - if (new_dmi == NULL) { - vtpmloginfo(VTPM_LOG_VTPM, "Creating new DMI instance %d attached on domain %d.\n", dmi_id, domain_id); - // Brand New DMI. Initialize the persistent pieces - if ((new_dmi = (VTPM_DMI_RESOURCE *) malloc (sizeof(VTPM_DMI_RESOURCE))) == NULL) { - status = TPM_RESOURCES; - goto abort_egress; - } - memset(new_dmi, 0, sizeof(VTPM_DMI_RESOURCE)); - new_dmi->dmi_id = dmi_id; - new_dmi->connected = FALSE; - - if ((dmi_id_key = (UINT32 *) malloc (sizeof(UINT32))) == NULL) { - status = TPM_RESOURCES; - goto abort_egress; - } - *dmi_id_key = new_dmi->dmi_id; - - // install into map - if (!hashtable_insert(vtpm_globals->dmi_map, dmi_id_key, new_dmi)){ - free(new_dmi); - free(dmi_id_key); - status = TPM_FAIL; - goto egress; - } - - } else - vtpmloginfo(VTPM_LOG_VTPM, "Re-attaching DMI instance %d on domain %d .\n", dmi_id, domain_id); - - if (new_dmi->connected) { - vtpmlogerror(VTPM_LOG_VTPM, "Attempt to re-attach, currently attached instance %d. Ignoring\n", dmi_id); - status = TPM_BAD_PARAMETER; - goto egress; - } - - // Initialize the Non-persistent pieces - new_dmi->dmi_domain_id = domain_id; - new_dmi->NVMLocation = NULL; - - new_dmi->TCSContext = 0; - TPMTRYRETURN( TCS_OpenContext(&new_dmi->TCSContext) ); - - new_dmi->NVMLocation = (char *) malloc(11 + strlen(DMI_NVM_FILE)); - sprintf(new_dmi->NVMLocation, DMI_NVM_FILE, (uint32_t) new_dmi->dmi_id); - - // Measure DMI - // FIXME: This will measure DMI. 
Until then use a fixed DMI_Measurement value - /* - fh = open(TPM_EMULATOR_PATH, O_RDONLY); - stat_ret = fstat(fh, &file_stat); - if (stat_ret == 0) - dmi_size = file_stat.st_size; - else { - vtpmlogerror(VTPM_LOG_VTPM, "Could not open tpm_emulator!!\n"); - status = TPM_IOERROR; - goto abort_egress; - } - dmi_buffer - */ - memset(&new_dmi->DMI_measurement, 0xcc, sizeof(TPM_DIGEST)); - -#ifndef VTPM_MULTI_VM - if (dmi_id != VTPM_CTL_DM) { - // Create a pair of fifo pipes - if( (new_dmi->guest_tx_fname = (char *) malloc(11 + strlen(GUEST_TX_FIFO))) == NULL){ - status = TPM_RESOURCES; - goto abort_egress; - } - sprintf(new_dmi->guest_tx_fname, GUEST_TX_FIFO, (uint32_t) dmi_id); - - if ((new_dmi->vtpm_tx_fname = (char *) malloc(11 + strlen(VTPM_TX_FIFO))) == NULL) { - status = TPM_RESOURCES; - goto abort_egress; - } - sprintf(new_dmi->vtpm_tx_fname, VTPM_TX_FIFO, (uint32_t) dmi_id); - - new_dmi->guest_tx_fh = -1; - new_dmi->vtpm_tx_fh= -1; - - if ( stat(new_dmi->guest_tx_fname, &file_info) == -1) { - if ( mkfifo(new_dmi->guest_tx_fname, S_IWUSR | S_IRUSR ) ){ - status = TPM_FAIL; - goto abort_egress; - } - } - - if ( (fh = open(new_dmi->vtpm_tx_fname, O_RDWR)) == -1) { - if ( mkfifo(new_dmi->vtpm_tx_fname, S_IWUSR | S_IRUSR ) ) { - status = TPM_FAIL; - goto abort_egress; - } - } - - // Launch DMI - sprintf(dmi_id_str, "%d", (int) dmi_id); -#ifdef MANUAL_DM_LAUNCH - vtpmlogerror(VTPM_LOG_VTPM, "FAKING starting vtpm with dmi=%s\n", dmi_id_str); - new_dmi->dmi_pid = 0; -#else - pid_t pid = fork(); - - if (pid == -1) { - vtpmlogerror(VTPM_LOG_VTPM, "Could not fork to launch vtpm\n"); - status = TPM_RESOURCES; - goto abort_egress; - } else if (pid == 0) { - if ( stat(new_dmi->NVMLocation, &file_info) == -1) - execl (TPM_EMULATOR_PATH, "vtmpd", "clear", dmi_id_str, NULL); - else - execl (TPM_EMULATOR_PATH, "vtpmd", "save", dmi_id_str, NULL); - - // Returning from these at all is an error. 
- vtpmlogerror(VTPM_LOG_VTPM, "Could not exec to launch vtpm\n"); - } else { - new_dmi->dmi_pid = pid; - vtpmloginfo(VTPM_LOG_VTPM, "Launching DMI on PID = %d\n", pid); - } -#endif // MANUAL_DM_LAUNCH - } -#else // VTPM_MUTLI_VM - // FIXME: Measure DMI through call to Measurement agent in platform. -#endif - - vtpm_globals->DMI_table_dirty = TRUE; - new_dmi->connected = TRUE; - status=TPM_SUCCESS; - goto egress; - - abort_egress: - close_dmi( new_dmi ); - - egress: - return status; -} - -TPM_RESULT VTPM_Handle_Close_DMI( const buffer_t *param_buf) { - - TPM_RESULT status=TPM_FAIL; - VTPM_DMI_RESOURCE *dmi_res=NULL; - UINT32 dmi_id; - - if ((param_buf == NULL) || (buffer_len(param_buf) != sizeof(UINT32)) ) { - vtpmlogerror(VTPM_LOG_VTPM, "Closing DMI has bad size."); - status = TPM_BAD_PARAMETER; - goto abort_egress; - } - - BSG_UnpackList( param_buf->bytes, 1, - BSG_TYPE_UINT32, &dmi_id); - - vtpmloginfo(VTPM_LOG_VTPM, "Closing DMI %d.\n", dmi_id); - - dmi_res = (VTPM_DMI_RESOURCE *) hashtable_search(vtpm_globals->dmi_map, &dmi_id); - if (dmi_res == NULL ) { - vtpmlogerror(VTPM_LOG_VTPM, "Trying to close nonexistent DMI.\n"); - status = TPM_BAD_PARAMETER; - goto abort_egress; - } - - if (!dmi_res->connected) { - vtpmlogerror(VTPM_LOG_VTPM, "Closing non-connected DMI.\n"); - status = TPM_BAD_PARAMETER; - goto abort_egress; - } - - // Close Dmi - TPMTRYRETURN(close_dmi( dmi_res )); - - status=TPM_SUCCESS; - goto egress; - - abort_egress: - egress: - - return status; -} - -TPM_RESULT VTPM_Handle_Delete_DMI( const buffer_t *param_buf) { - - TPM_RESULT status=TPM_FAIL; - VTPM_DMI_RESOURCE *dmi_res=NULL; - UINT32 dmi_id; - - if ((param_buf == NULL) || (buffer_len(param_buf) != sizeof(UINT32)) ) { - vtpmlogerror(VTPM_LOG_VTPM, "Closing DMI has bad size.\n"); - status = TPM_BAD_PARAMETER; - goto abort_egress; - } - - BSG_UnpackList( param_buf->bytes, 1, - BSG_TYPE_UINT32, &dmi_id); - - vtpmloginfo(VTPM_LOG_VTPM, "Deleting DMI %d.\n", dmi_id); - - dmi_res = 
(VTPM_DMI_RESOURCE *) hashtable_remove(vtpm_globals->dmi_map, &dmi_id); - if (dmi_res == NULL) { - vtpmlogerror(VTPM_LOG_VTPM, "Closing non-existent DMI.\n"); - status = TPM_BAD_PARAMETER; - goto abort_egress; - } - - //TODO: Automatically delete file dmi_res->NVMLocation - - // Close DMI first - TPMTRYRETURN(close_dmi( dmi_res )); - free ( dmi_res ); - - status=TPM_SUCCESS; - goto egress; - - abort_egress: - egress: - - return status; -} +// =================================================================== +// +// Copyright (c) 2005, Intel Corp. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +// OF THE POSSIBILITY OF SUCH DAMAGE. +// =================================================================== +// +// dmictl.c +// +// Functions for creating and destroying DMIs +// +// ================================================================== + +#include <stdio.h> +#include <unistd.h> +#include <string.h> + +#ifndef VTPM_MUTLI_VM + #include <sys/types.h> + #include <sys/stat.h> + #include <fcntl.h> + #include <signal.h> + #include <wait.h> +#endif + +#include "vtpmpriv.h" +#include "bsg.h" +#include "buffer.h" +#include "log.h" +#include "hashtable.h" +#include "hashtable_itr.h" + +#define TPM_EMULATOR_PATH "/usr/bin/vtpmd" + +TPM_RESULT close_dmi( VTPM_DMI_RESOURCE *dmi_res) { + TPM_RESULT status = TPM_FAIL; + + if (dmi_res == NULL) + return TPM_SUCCESS; + + status = TCS_CloseContext(dmi_res->TCSContext); + free ( dmi_res->NVMLocation ); + dmi_res->connected = FALSE; + +#ifndef VTPM_MULTI_VM + free(dmi_res->guest_tx_fname); + free(dmi_res->vtpm_tx_fname); + + close(dmi_res->guest_tx_fh); dmi_res->guest_tx_fh = -1; + close(dmi_res->vtpm_tx_fh); dmi_res->vtpm_tx_fh = -1; + + #ifndef MANUAL_DM_LAUNCH + if (dmi_res->dmi_id != VTPM_CTL_DM) { + if (dmi_res->dmi_pid != 0) { + vtpmloginfo(VTPM_LOG_VTPM, "Killing dmi on pid %d.\n", dmi_res->dmi_pid); + if (kill(dmi_res->dmi_pid, SIGKILL) !=0) { + vtpmloginfo(VTPM_LOG_VTPM, "DMI on pid %d is already dead.\n", dmi_res->dmi_pid); + } else if (waitpid(dmi_res->dmi_pid, NULL, 0) != dmi_res->dmi_pid) { + 
vtpmlogerror(VTPM_LOG_VTPM, "DMI on pid %d failed to stop.\n", dmi_res->dmi_pid); + status = TPM_FAIL; + } + } else { + vtpmlogerror(VTPM_LOG_VTPM, "Could not kill dmi because it's pid was 0.\n"); + status = TPM_FAIL; + } + } + #endif +#endif + + return status; +} + +TPM_RESULT VTPM_Handle_New_DMI( const buffer_t *param_buf) { + + VTPM_DMI_RESOURCE *new_dmi=NULL; + TPM_RESULT status=TPM_FAIL; + BYTE type; + UINT32 dmi_id, domain_id, *dmi_id_key; + +#ifndef VTPM_MULTI_VM + int fh; + char dmi_id_str[11]; // UINT32s are up to 10 digits + NULL + struct stat file_info; +#endif + + if (param_buf == NULL) { // Assume creation of Dom 0 control + type = 0; + domain_id = VTPM_CTL_DM; + dmi_id = VTPM_CTL_DM; + } else if (buffer_len(param_buf) != sizeof(BYTE) + sizeof(UINT32) *2) { + vtpmloginfo(VTPM_LOG_VTPM, "New DMI command wrong length: %d.\n", buffer_len(param_buf)); + status = TPM_BAD_PARAMETER; + goto abort_egress; + } else { + BSG_UnpackList( param_buf->bytes, 3, + BSG_TYPE_BYTE, &type, + BSG_TYPE_UINT32, &domain_id, + BSG_TYPE_UINT32, &dmi_id); + } + + new_dmi = (VTPM_DMI_RESOURCE *) hashtable_search(vtpm_globals->dmi_map, &dmi_id); + if (new_dmi == NULL) { + vtpmloginfo(VTPM_LOG_VTPM, "Creating new DMI instance %d attached on domain %d.\n", dmi_id, domain_id); + // Brand New DMI. 
Initialize the persistent pieces + if ((new_dmi = (VTPM_DMI_RESOURCE *) malloc (sizeof(VTPM_DMI_RESOURCE))) == NULL) { + status = TPM_RESOURCES; + goto abort_egress; + } + memset(new_dmi, 0, sizeof(VTPM_DMI_RESOURCE)); + new_dmi->dmi_id = dmi_id; + new_dmi->connected = FALSE; + + if ((dmi_id_key = (UINT32 *) malloc (sizeof(UINT32))) == NULL) { + status = TPM_RESOURCES; + goto abort_egress; + } + *dmi_id_key = new_dmi->dmi_id; + + // install into map + if (!hashtable_insert(vtpm_globals->dmi_map, dmi_id_key, new_dmi)){ + free(new_dmi); + free(dmi_id_key); + status = TPM_FAIL; + goto egress; + } + + } else + vtpmloginfo(VTPM_LOG_VTPM, "Re-attaching DMI instance %d on domain %d .\n", dmi_id, domain_id); + + if (new_dmi->connected) { + vtpmlogerror(VTPM_LOG_VTPM, "Attempt to re-attach, currently attached instance %d. Ignoring\n", dmi_id); + status = TPM_BAD_PARAMETER; + goto egress; + } + + // Initialize the Non-persistent pieces + new_dmi->dmi_domain_id = domain_id; + new_dmi->NVMLocation = NULL; + + new_dmi->TCSContext = 0; + TPMTRYRETURN( TCS_OpenContext(&new_dmi->TCSContext) ); + + new_dmi->NVMLocation = (char *) malloc(11 + strlen(DMI_NVM_FILE)); + sprintf(new_dmi->NVMLocation, DMI_NVM_FILE, (uint32_t) new_dmi->dmi_id); + + // Measure DMI + // FIXME: This will measure DMI. 
Until then use a fixed DMI_Measurement value + /* + fh = open(TPM_EMULATOR_PATH, O_RDONLY); + stat_ret = fstat(fh, &file_stat); + if (stat_ret == 0) + dmi_size = file_stat.st_size; + else { + vtpmlogerror(VTPM_LOG_VTPM, "Could not open tpm_emulator!!\n"); + status = TPM_IOERROR; + goto abort_egress; + } + dmi_buffer + */ + memset(&new_dmi->DMI_measurement, 0xcc, sizeof(TPM_DIGEST)); + +#ifndef VTPM_MULTI_VM + if (dmi_id != VTPM_CTL_DM) { + // Create a pair of fifo pipes + if( (new_dmi->guest_tx_fname = (char *) malloc(11 + strlen(GUEST_TX_FIFO))) == NULL){ + status = TPM_RESOURCES; + goto abort_egress; + } + sprintf(new_dmi->guest_tx_fname, GUEST_TX_FIFO, (uint32_t) dmi_id); + + if ((new_dmi->vtpm_tx_fname = (char *) malloc(11 + strlen(VTPM_TX_FIFO))) == NULL) { + status = TPM_RESOURCES; + goto abort_egress; + } + sprintf(new_dmi->vtpm_tx_fname, VTPM_TX_FIFO, (uint32_t) dmi_id); + + new_dmi->guest_tx_fh = -1; + new_dmi->vtpm_tx_fh= -1; + + if ( stat(new_dmi->guest_tx_fname, &file_info) == -1) { + if ( mkfifo(new_dmi->guest_tx_fname, S_IWUSR | S_IRUSR ) ){ + vtpmlogerror(VTPM_LOG_VTPM, "Failed to create dmi fifo.\n"); + status = TPM_IOERROR; + goto abort_egress; + } + } + + if ( (fh = open(new_dmi->vtpm_tx_fname, O_RDWR)) == -1) { + if ( mkfifo(new_dmi->vtpm_tx_fname, S_IWUSR | S_IRUSR ) ) { + vtpmlogerror(VTPM_LOG_VTPM, "Failed to create dmi fifo.\n"); + status = TPM_IOERROR; + goto abort_egress; + } + } + + // Launch DMI + sprintf(dmi_id_str, "%d", (int) dmi_id); +#ifdef MANUAL_DM_LAUNCH + vtpmlogerror(VTPM_LOG_VTPM, "FAKING starting vtpm with dmi=%s\n", dmi_id_str); + new_dmi->dmi_pid = 0; +#else + pid_t pid = fork(); + + if (pid == -1) { + vtpmlogerror(VTPM_LOG_VTPM, "Could not fork to launch vtpm\n"); + status = TPM_RESOURCES; + goto abort_egress; + } else if (pid == 0) { + if ( stat(new_dmi->NVMLocation, &file_info) == -1) + execl (TPM_EMULATOR_PATH, "vtmpd", "clear", dmi_id_str, NULL); + else + execl (TPM_EMULATOR_PATH, "vtpmd", "save", dmi_id_str, NULL); + + 
// Returning from these at all is an error. + vtpmlogerror(VTPM_LOG_VTPM, "Could not exec to launch vtpm\n"); + } else { + new_dmi->dmi_pid = pid; + vtpmloginfo(VTPM_LOG_VTPM, "Launching DMI on PID = %d\n", pid); + } +#endif // MANUAL_DM_LAUNCH + } +#else // VTPM_MUTLI_VM + // FIXME: Measure DMI through call to Measurement agent in platform. +#endif + + vtpm_globals->DMI_table_dirty = TRUE; + new_dmi->connected = TRUE; + status=TPM_SUCCESS; + goto egress; + + abort_egress: + vtpmlogerror(VTPM_LOG_VTPM, "Failed to create DMI id=%d due to status=%s. Cleaning.\n", dmi_id, tpm_get_error_name(status)); + close_dmi( new_dmi ); + + egress: + return status; +} + +TPM_RESULT VTPM_Handle_Close_DMI( const buffer_t *param_buf) { + + TPM_RESULT status=TPM_FAIL; + VTPM_DMI_RESOURCE *dmi_res=NULL; + UINT32 dmi_id; + + if ((param_buf == NULL) || (buffer_len(param_buf) != sizeof(UINT32)) ) { + vtpmlogerror(VTPM_LOG_VTPM, "Closing DMI has bad size."); + status = TPM_BAD_PARAMETER; + goto abort_egress; + } + + BSG_UnpackList( param_buf->bytes, 1, + BSG_TYPE_UINT32, &dmi_id); + + vtpmloginfo(VTPM_LOG_VTPM, "Closing DMI %d.\n", dmi_id); + + dmi_res = (VTPM_DMI_RESOURCE *) hashtable_search(vtpm_globals->dmi_map, &dmi_id); + if (dmi_res == NULL ) { + vtpmlogerror(VTPM_LOG_VTPM, "Trying to close nonexistent DMI.\n"); + status = TPM_BAD_PARAMETER; + goto abort_egress; + } + + if (!dmi_res->connected) { + vtpmlogerror(VTPM_LOG_VTPM, "Closing non-connected DMI.\n"); + status = TPM_BAD_PARAMETER; + goto abort_egress; + } + + // Close Dmi + TPMTRYRETURN(close_dmi( dmi_res )); + + status=TPM_SUCCESS; + goto egress; + + abort_egress: + egress: + + return status; +} + +TPM_RESULT VTPM_Handle_Delete_DMI( const buffer_t *param_buf) { + + TPM_RESULT status=TPM_FAIL; + VTPM_DMI_RESOURCE *dmi_res=NULL; + UINT32 dmi_id; + + if ((param_buf == NULL) || (buffer_len(param_buf) != sizeof(UINT32)) ) { + vtpmlogerror(VTPM_LOG_VTPM, "Closing DMI has bad size.\n"); + status = TPM_BAD_PARAMETER; + goto 
abort_egress; + } + + BSG_UnpackList( param_buf->bytes, 1, + BSG_TYPE_UINT32, &dmi_id); + + vtpmloginfo(VTPM_LOG_VTPM, "Deleting DMI %d.\n", dmi_id); + + dmi_res = (VTPM_DMI_RESOURCE *) hashtable_remove(vtpm_globals->dmi_map, &dmi_id); + if (dmi_res == NULL) { + vtpmlogerror(VTPM_LOG_VTPM, "Closing non-existent DMI.\n"); + status = TPM_BAD_PARAMETER; + goto abort_egress; + } + + //TODO: Automatically delete file dmi_res->NVMLocation + + // Close DMI first + TPMTRYRETURN(close_dmi( dmi_res )); + free ( dmi_res ); + + status=TPM_SUCCESS; + goto egress; + + abort_egress: + egress: + + return status; +} diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/manager/securestorage.c --- a/tools/vtpm_manager/manager/securestorage.c Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/manager/securestorage.c Thu Sep 22 17:42:01 2005 @@ -1,401 +1,401 @@ -// =================================================================== -// -// Copyright (c) 2005, Intel Corp. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Intel Corporation nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE -// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -// OF THE POSSIBILITY OF SUCH DAMAGE. -// =================================================================== -// -// securestorage.c -// -// Functions regarding securely storing DMI secrets. -// -// ================================================================== - -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include <string.h> - -#include "tcg.h" -#include "vtpm_manager.h" -#include "vtpmpriv.h" -#include "vtsp.h" -#include "bsg.h" -#include "crypto.h" -#include "hashtable.h" -#include "hashtable_itr.h" -#include "buffer.h" -#include "log.h" - -TPM_RESULT VTPM_Handle_Save_NVM(VTPM_DMI_RESOURCE *myDMI, - const buffer_t *inbuf, - buffer_t *outbuf) { - - TPM_RESULT status = TPM_SUCCESS; - symkey_t symkey; - buffer_t state_cipher = NULL_BUF, - symkey_cipher = NULL_BUF; - int fh; - long bytes_written; - BYTE *sealed_NVM=NULL; - UINT32 sealed_NVM_size, i; - struct pack_constbuf_t symkey_cipher32, state_cipher32; - - vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Save_NVMing[%d]: 0x", buffer_len(inbuf)); - for (i=0; i< buffer_len(inbuf); i++) - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", inbuf->bytes[i]); - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); - - // Generate a sym key and encrypt state with it - TPMTRY(TPM_ENCRYPT_ERROR, Crypto_symcrypto_genkey (&symkey) ); - TPMTRY(TPM_ENCRYPT_ERROR, Crypto_symcrypto_encrypt (&symkey, inbuf, &state_cipher) ); - - // Encrypt symmetric key - TPMTRYRETURN( VTSP_Bind( &vtpm_globals->storageKey, - 
&symkey.key, - &symkey_cipher) ); - - // Create output blob: symkey_size + symkey_cipher + state_cipher_size + state_cipher - - symkey_cipher32.size = buffer_len(&symkey_cipher); - symkey_cipher32.data = symkey_cipher.bytes; - - state_cipher32.size = buffer_len(&state_cipher); - state_cipher32.data = state_cipher.bytes; - - sealed_NVM = (BYTE *) malloc( 2 * sizeof(UINT32) + symkey_cipher32.size + state_cipher32.size); - - sealed_NVM_size = BSG_PackList(sealed_NVM, 2, - BSG_TPM_SIZE32_DATA, &symkey_cipher32, - BSG_TPM_SIZE32_DATA, &state_cipher32); - - // Mark DMI Table so new save state info will get pushed to disk on return. - vtpm_globals->DMI_table_dirty = TRUE; - - // Write sealed blob off disk from NVMLocation - // TODO: How to properly return from these. Do we care if we return failure - // after writing the file? We can't get the old one back. - // TODO: Backup old file and try and recover that way. - fh = open(myDMI->NVMLocation, O_WRONLY | O_CREAT, S_IREAD | S_IWRITE); - if ( (bytes_written = write(fh, sealed_NVM, sealed_NVM_size) ) != (long) sealed_NVM_size) { - vtpmlogerror(VTPM_LOG_VTPM, "We just overwrote a DMI_NVM and failed to finish. 
%ld/%ld bytes.\n", bytes_written, (long)sealed_NVM_size); - status = TPM_IOERROR; - goto abort_egress; - } - close(fh); - - Crypto_SHA1Full (sealed_NVM, sealed_NVM_size, (BYTE *) &myDMI->NVM_measurement); - - vtpmloginfo(VTPM_LOG_VTPM, "Saved %d bytes of E(symkey) + %d bytes of E(NVM)\n", buffer_len(&symkey_cipher), buffer_len(&state_cipher)); - goto egress; - - abort_egress: - vtpmlogerror(VTPM_LOG_VTPM, "Failed to load NVM\n."); - - egress: - - buffer_free ( &state_cipher); - buffer_free ( &symkey_cipher); - free(sealed_NVM); - Crypto_symcrypto_freekey (&symkey); - - return status; -} - - -/* inbuf = null outbuf = sealed blob size, sealed blob.*/ -TPM_RESULT VTPM_Handle_Load_NVM(VTPM_DMI_RESOURCE *myDMI, - const buffer_t *inbuf, - buffer_t *outbuf) { - - TPM_RESULT status = TPM_SUCCESS; - symkey_t symkey; - buffer_t state_cipher = NULL_BUF, - symkey_clear = NULL_BUF, - symkey_cipher = NULL_BUF; - struct pack_buf_t symkey_cipher32, state_cipher32; - - UINT32 sealed_NVM_size; - BYTE *sealed_NVM = NULL; - long fh_size; - int fh, stat_ret, i; - struct stat file_stat; - TPM_DIGEST sealedNVMHash; - - memset(&symkey, 0, sizeof(symkey_t)); - - if (myDMI->NVMLocation == NULL) { - vtpmlogerror(VTPM_LOG_VTPM, "Unable to load NVM because the file name NULL.\n"); - status = TPM_AUTHFAIL; - goto abort_egress; - } - - //Read sealed blob off disk from NVMLocation - fh = open(myDMI->NVMLocation, O_RDONLY); - stat_ret = fstat(fh, &file_stat); - if (stat_ret == 0) - fh_size = file_stat.st_size; - else { - status = TPM_IOERROR; - goto abort_egress; - } - - sealed_NVM = (BYTE *) malloc(fh_size); - if (read(fh, sealed_NVM, fh_size) != fh_size) { - status = TPM_IOERROR; - goto abort_egress; - } - close(fh); - - vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Load_NVMing[%ld]: 0x", fh_size); - for (i=0; i< fh_size; i++) - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", sealed_NVM[i]); - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); - - sealed_NVM_size = BSG_UnpackList(sealed_NVM, 2, - BSG_TPM_SIZE32_DATA, 
&symkey_cipher32, - BSG_TPM_SIZE32_DATA, &state_cipher32); - - TPMTRYRETURN( buffer_init_convert (&symkey_cipher, - symkey_cipher32.size, - symkey_cipher32.data) ); - - TPMTRYRETURN( buffer_init_convert (&state_cipher, - state_cipher32.size, - state_cipher32.data) ); - - Crypto_SHA1Full(sealed_NVM, sealed_NVM_size, (BYTE *) &sealedNVMHash); - - // Verify measurement of sealed blob. - if (memcmp(&sealedNVMHash, &myDMI->NVM_measurement, sizeof(TPM_DIGEST)) ) { - vtpmlogerror(VTPM_LOG_VTPM, "VTPM LoadNVM NVM measurement check failed.\n"); - vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Correct hash: "); - for (i=0; i< sizeof(TPM_DIGEST); i++) - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", ((BYTE*)&myDMI->NVM_measurement)[i]); - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); - - vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Measured hash: "); - for (i=0; i< sizeof(TPM_DIGEST); i++) - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", ((BYTE*)&sealedNVMHash)[i]); - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); - - status = TPM_AUTHFAIL; - goto abort_egress; - } - - // Decrypt Symmetric Key - TPMTRYRETURN( VTSP_Unbind( myDMI->TCSContext, - vtpm_globals->storageKeyHandle, - &symkey_cipher, - (const TPM_AUTHDATA*)&vtpm_globals->storage_key_usage_auth, - &symkey_clear, - &(vtpm_globals->keyAuth) ) ); - - // create symmetric key using saved bits - Crypto_symcrypto_initkey (&symkey, &symkey_clear); - - // Decrypt State - TPMTRY(TPM_DECRYPT_ERROR, Crypto_symcrypto_decrypt (&symkey, &state_cipher, outbuf) ); - - goto egress; - - abort_egress: - vtpmlogerror(VTPM_LOG_VTPM, "Failed to load NVM\n."); - - egress: - - buffer_free ( &state_cipher); - buffer_free ( &symkey_clear); - buffer_free ( &symkey_cipher); - free( sealed_NVM ); - Crypto_symcrypto_freekey (&symkey); - - return status; -} - -TPM_RESULT VTPM_SaveService(void) { - TPM_RESULT status=TPM_SUCCESS; - int fh, dmis=-1; - - BYTE *flat_global; - int flat_global_size, bytes_written; - UINT32 storageKeySize = buffer_len(&vtpm_globals->storageKeyWrap); - struct 
pack_buf_t storage_key_pack = {storageKeySize, vtpm_globals->storageKeyWrap.bytes}; - - struct hashtable_itr *dmi_itr; - VTPM_DMI_RESOURCE *dmi_res; - - UINT32 flat_global_full_size; - - // Global Values needing to be saved - flat_global_full_size = 3*sizeof(TPM_DIGEST) + // Auths - sizeof(UINT32) + // storagekeysize - storageKeySize + // storage key - hashtable_count(vtpm_globals->dmi_map) * // num DMIS - (sizeof(UINT32) + 2*sizeof(TPM_DIGEST)); // Per DMI info - - - flat_global = (BYTE *) malloc( flat_global_full_size); - - flat_global_size = BSG_PackList(flat_global, 4, - BSG_TPM_AUTHDATA, &vtpm_globals->owner_usage_auth, - BSG_TPM_AUTHDATA, &vtpm_globals->srk_usage_auth, - BSG_TPM_SECRET, &vtpm_globals->storage_key_usage_auth, - BSG_TPM_SIZE32_DATA, &storage_key_pack); - - // Per DMI values to be saved - if (hashtable_count(vtpm_globals->dmi_map) > 0) { - - dmi_itr = hashtable_iterator(vtpm_globals->dmi_map); - do { - dmi_res = (VTPM_DMI_RESOURCE *) hashtable_iterator_value(dmi_itr); - dmis++; - - // No need to save dmi0. - if (dmi_res->dmi_id == 0) - continue; - - - flat_global_size += BSG_PackList( flat_global + flat_global_size, 3, - BSG_TYPE_UINT32, &dmi_res->dmi_id, - BSG_TPM_DIGEST, &dmi_res->NVM_measurement, - BSG_TPM_DIGEST, &dmi_res->DMI_measurement); - - } while (hashtable_iterator_advance(dmi_itr)); - } - - //FIXME: Once we have a way to protect a TPM key, we should use it to - // encrypt this blob. BUT, unless there is a way to ensure the key is - // not used by other apps, this encryption is useless. - fh = open(STATE_FILE, O_WRONLY | O_CREAT, S_IREAD | S_IWRITE); - if (fh == -1) { - vtpmlogerror(VTPM_LOG_VTPM, "Unable to open %s file for write.\n", STATE_FILE); - status = TPM_IOERROR; - goto abort_egress; - } - - if ( (bytes_written = write(fh, flat_global, flat_global_size)) != flat_global_size ) { - vtpmlogerror(VTPM_LOG_VTPM, "Failed to save service data. 
%d/%d bytes written.\n", bytes_written, flat_global_size); - status = TPM_IOERROR; - goto abort_egress; - } - vtpm_globals->DMI_table_dirty = FALSE; - - goto egress; - - abort_egress: - egress: - - free(flat_global); - close(fh); - - vtpmloginfo(VTPM_LOG_VTPM, "Saved VTPM Service state (status = %d, dmis = %d)\n", (int) status, dmis); - return status; -} - -TPM_RESULT VTPM_LoadService(void) { - - TPM_RESULT status=TPM_SUCCESS; - int fh, stat_ret, dmis=0; - long fh_size = 0, step_size; - BYTE *flat_global=NULL; - struct pack_buf_t storage_key_pack; - UINT32 *dmi_id_key; - - VTPM_DMI_RESOURCE *dmi_res; - struct stat file_stat; - - fh = open(STATE_FILE, O_RDONLY ); - stat_ret = fstat(fh, &file_stat); - if (stat_ret == 0) - fh_size = file_stat.st_size; - else { - status = TPM_IOERROR; - goto abort_egress; - } - - flat_global = (BYTE *) malloc(fh_size); - - if ((long) read(fh, flat_global, fh_size) != fh_size ) { - status = TPM_IOERROR; - goto abort_egress; - } - - // Global Values needing to be saved - step_size = BSG_UnpackList( flat_global, 4, - BSG_TPM_AUTHDATA, &vtpm_globals->owner_usage_auth, - BSG_TPM_AUTHDATA, &vtpm_globals->srk_usage_auth, - BSG_TPM_SECRET, &vtpm_globals->storage_key_usage_auth, - BSG_TPM_SIZE32_DATA, &storage_key_pack); - - TPMTRYRETURN(buffer_init(&vtpm_globals->storageKeyWrap, 0, 0) ); - TPMTRYRETURN(buffer_append_raw(&vtpm_globals->storageKeyWrap, storage_key_pack.size, storage_key_pack.data) ); - - // Per DMI values to be saved - while ( step_size < fh_size ){ - if (fh_size - step_size < (long) (sizeof(UINT32) + 2*sizeof(TPM_DIGEST))) { - vtpmlogerror(VTPM_LOG_VTPM, "Encountered %ld extra bytes at end of manager state.\n", fh_size-step_size); - step_size = fh_size; - } else { - dmi_res = (VTPM_DMI_RESOURCE *) malloc(sizeof(VTPM_DMI_RESOURCE)); - dmis++; - - dmi_res->connected = FALSE; - - step_size += BSG_UnpackList(flat_global + step_size, 3, - BSG_TYPE_UINT32, &dmi_res->dmi_id, - BSG_TPM_DIGEST, &dmi_res->NVM_measurement, - 
BSG_TPM_DIGEST, &dmi_res->DMI_measurement); - - // install into map - dmi_id_key = (UINT32 *) malloc (sizeof(UINT32)); - *dmi_id_key = dmi_res->dmi_id; - if (!hashtable_insert(vtpm_globals->dmi_map, dmi_id_key, dmi_res)) { - status = TPM_FAIL; - goto abort_egress; - } - - } - - } - - goto egress; - - abort_egress: - vtpmlogerror(VTPM_LOG_VTPM, "Failed to save service data\n"); - egress: - - if (flat_global) - free(flat_global); - close(fh); - - vtpmloginfo(VTPM_LOG_VTPM, "Previously saved state reloaded (status = %d, dmis = %d).\n", (int) status, dmis); - return status; -} +// =================================================================== +// +// Copyright (c) 2005, Intel Corp. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +// OF THE POSSIBILITY OF SUCH DAMAGE. +// =================================================================== +// +// securestorage.c +// +// Functions regarding securely storing DMI secrets. +// +// ================================================================== + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <string.h> + +#include "tcg.h" +#include "vtpm_manager.h" +#include "vtpmpriv.h" +#include "vtsp.h" +#include "bsg.h" +#include "crypto.h" +#include "hashtable.h" +#include "hashtable_itr.h" +#include "buffer.h" +#include "log.h" + +TPM_RESULT VTPM_Handle_Save_NVM(VTPM_DMI_RESOURCE *myDMI, + const buffer_t *inbuf, + buffer_t *outbuf) { + + TPM_RESULT status = TPM_SUCCESS; + symkey_t symkey; + buffer_t state_cipher = NULL_BUF, + symkey_cipher = NULL_BUF; + int fh; + long bytes_written; + BYTE *sealed_NVM=NULL; + UINT32 sealed_NVM_size, i; + struct pack_constbuf_t symkey_cipher32, state_cipher32; + + vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Save_NVMing[%d]: 0x", buffer_len(inbuf)); + for (i=0; i< buffer_len(inbuf); i++) + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", inbuf->bytes[i]); + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + + // Generate a sym key and encrypt state with it + TPMTRY(TPM_ENCRYPT_ERROR, Crypto_symcrypto_genkey (&symkey) ); + TPMTRY(TPM_ENCRYPT_ERROR, Crypto_symcrypto_encrypt (&symkey, inbuf, &state_cipher) ); + + // Encrypt symmetric key + TPMTRYRETURN( VTSP_Bind( &vtpm_globals->storageKey, + 
&symkey.key, + &symkey_cipher) ); + + // Create output blob: symkey_size + symkey_cipher + state_cipher_size + state_cipher + + symkey_cipher32.size = buffer_len(&symkey_cipher); + symkey_cipher32.data = symkey_cipher.bytes; + + state_cipher32.size = buffer_len(&state_cipher); + state_cipher32.data = state_cipher.bytes; + + sealed_NVM = (BYTE *) malloc( 2 * sizeof(UINT32) + symkey_cipher32.size + state_cipher32.size); + + sealed_NVM_size = BSG_PackList(sealed_NVM, 2, + BSG_TPM_SIZE32_DATA, &symkey_cipher32, + BSG_TPM_SIZE32_DATA, &state_cipher32); + + // Mark DMI Table so new save state info will get pushed to disk on return. + vtpm_globals->DMI_table_dirty = TRUE; + + // Write sealed blob off disk from NVMLocation + // TODO: How to properly return from these. Do we care if we return failure + // after writing the file? We can't get the old one back. + // TODO: Backup old file and try and recover that way. + fh = open(myDMI->NVMLocation, O_WRONLY | O_CREAT, S_IREAD | S_IWRITE); + if ( (bytes_written = write(fh, sealed_NVM, sealed_NVM_size) ) != (long) sealed_NVM_size) { + vtpmlogerror(VTPM_LOG_VTPM, "We just overwrote a DMI_NVM and failed to finish. 
%ld/%ld bytes.\n", bytes_written, (long)sealed_NVM_size); + status = TPM_IOERROR; + goto abort_egress; + } + close(fh); + + Crypto_SHA1Full (sealed_NVM, sealed_NVM_size, (BYTE *) &myDMI->NVM_measurement); + + vtpmloginfo(VTPM_LOG_VTPM, "Saved %d bytes of E(symkey) + %d bytes of E(NVM)\n", buffer_len(&symkey_cipher), buffer_len(&state_cipher)); + goto egress; + + abort_egress: + vtpmlogerror(VTPM_LOG_VTPM, "Failed to load NVM\n."); + + egress: + + buffer_free ( &state_cipher); + buffer_free ( &symkey_cipher); + free(sealed_NVM); + Crypto_symcrypto_freekey (&symkey); + + return status; +} + + +/* inbuf = null outbuf = sealed blob size, sealed blob.*/ +TPM_RESULT VTPM_Handle_Load_NVM(VTPM_DMI_RESOURCE *myDMI, + const buffer_t *inbuf, + buffer_t *outbuf) { + + TPM_RESULT status = TPM_SUCCESS; + symkey_t symkey; + buffer_t state_cipher = NULL_BUF, + symkey_clear = NULL_BUF, + symkey_cipher = NULL_BUF; + struct pack_buf_t symkey_cipher32, state_cipher32; + + UINT32 sealed_NVM_size; + BYTE *sealed_NVM = NULL; + long fh_size; + int fh, stat_ret, i; + struct stat file_stat; + TPM_DIGEST sealedNVMHash; + + memset(&symkey, 0, sizeof(symkey_t)); + + if (myDMI->NVMLocation == NULL) { + vtpmlogerror(VTPM_LOG_VTPM, "Unable to load NVM because the file name NULL.\n"); + status = TPM_AUTHFAIL; + goto abort_egress; + } + + //Read sealed blob off disk from NVMLocation + fh = open(myDMI->NVMLocation, O_RDONLY); + stat_ret = fstat(fh, &file_stat); + if (stat_ret == 0) + fh_size = file_stat.st_size; + else { + status = TPM_IOERROR; + goto abort_egress; + } + + sealed_NVM = (BYTE *) malloc(fh_size); + if (read(fh, sealed_NVM, fh_size) != fh_size) { + status = TPM_IOERROR; + goto abort_egress; + } + close(fh); + + vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Load_NVMing[%ld]: 0x", fh_size); + for (i=0; i< fh_size; i++) + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", sealed_NVM[i]); + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + + sealed_NVM_size = BSG_UnpackList(sealed_NVM, 2, + BSG_TPM_SIZE32_DATA, 
&symkey_cipher32, + BSG_TPM_SIZE32_DATA, &state_cipher32); + + TPMTRYRETURN( buffer_init_convert (&symkey_cipher, + symkey_cipher32.size, + symkey_cipher32.data) ); + + TPMTRYRETURN( buffer_init_convert (&state_cipher, + state_cipher32.size, + state_cipher32.data) ); + + Crypto_SHA1Full(sealed_NVM, sealed_NVM_size, (BYTE *) &sealedNVMHash); + + // Verify measurement of sealed blob. + if (memcmp(&sealedNVMHash, &myDMI->NVM_measurement, sizeof(TPM_DIGEST)) ) { + vtpmlogerror(VTPM_LOG_VTPM, "VTPM LoadNVM NVM measurement check failed.\n"); + vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Correct hash: "); + for (i=0; i< sizeof(TPM_DIGEST); i++) + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", ((BYTE*)&myDMI->NVM_measurement)[i]); + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + + vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Measured hash: "); + for (i=0; i< sizeof(TPM_DIGEST); i++) + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", ((BYTE*)&sealedNVMHash)[i]); + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + + status = TPM_AUTHFAIL; + goto abort_egress; + } + + // Decrypt Symmetric Key + TPMTRYRETURN( VTSP_Unbind( myDMI->TCSContext, + vtpm_globals->storageKeyHandle, + &symkey_cipher, + (const TPM_AUTHDATA*)&vtpm_globals->storage_key_usage_auth, + &symkey_clear, + &(vtpm_globals->keyAuth) ) ); + + // create symmetric key using saved bits + Crypto_symcrypto_initkey (&symkey, &symkey_clear); + + // Decrypt State + TPMTRY(TPM_DECRYPT_ERROR, Crypto_symcrypto_decrypt (&symkey, &state_cipher, outbuf) ); + + goto egress; + + abort_egress: + vtpmlogerror(VTPM_LOG_VTPM, "Failed to load NVM\n."); + + egress: + + buffer_free ( &state_cipher); + buffer_free ( &symkey_clear); + buffer_free ( &symkey_cipher); + free( sealed_NVM ); + Crypto_symcrypto_freekey (&symkey); + + return status; +} + +TPM_RESULT VTPM_SaveService(void) { + TPM_RESULT status=TPM_SUCCESS; + int fh, dmis=-1; + + BYTE *flat_global; + int flat_global_size, bytes_written; + UINT32 storageKeySize = buffer_len(&vtpm_globals->storageKeyWrap); + struct 
pack_buf_t storage_key_pack = {storageKeySize, vtpm_globals->storageKeyWrap.bytes}; + + struct hashtable_itr *dmi_itr; + VTPM_DMI_RESOURCE *dmi_res; + + UINT32 flat_global_full_size; + + // Global Values needing to be saved + flat_global_full_size = 3*sizeof(TPM_DIGEST) + // Auths + sizeof(UINT32) + // storagekeysize + storageKeySize + // storage key + hashtable_count(vtpm_globals->dmi_map) * // num DMIS + (sizeof(UINT32) + 2*sizeof(TPM_DIGEST)); // Per DMI info + + + flat_global = (BYTE *) malloc( flat_global_full_size); + + flat_global_size = BSG_PackList(flat_global, 4, + BSG_TPM_AUTHDATA, &vtpm_globals->owner_usage_auth, + BSG_TPM_AUTHDATA, &vtpm_globals->srk_usage_auth, + BSG_TPM_SECRET, &vtpm_globals->storage_key_usage_auth, + BSG_TPM_SIZE32_DATA, &storage_key_pack); + + // Per DMI values to be saved + if (hashtable_count(vtpm_globals->dmi_map) > 0) { + + dmi_itr = hashtable_iterator(vtpm_globals->dmi_map); + do { + dmi_res = (VTPM_DMI_RESOURCE *) hashtable_iterator_value(dmi_itr); + dmis++; + + // No need to save dmi0. + if (dmi_res->dmi_id == 0) + continue; + + + flat_global_size += BSG_PackList( flat_global + flat_global_size, 3, + BSG_TYPE_UINT32, &dmi_res->dmi_id, + BSG_TPM_DIGEST, &dmi_res->NVM_measurement, + BSG_TPM_DIGEST, &dmi_res->DMI_measurement); + + } while (hashtable_iterator_advance(dmi_itr)); + } + + //FIXME: Once we have a way to protect a TPM key, we should use it to + // encrypt this blob. BUT, unless there is a way to ensure the key is + // not used by other apps, this encryption is useless. + fh = open(STATE_FILE, O_WRONLY | O_CREAT, S_IREAD | S_IWRITE); + if (fh == -1) { + vtpmlogerror(VTPM_LOG_VTPM, "Unable to open %s file for write.\n", STATE_FILE); + status = TPM_IOERROR; + goto abort_egress; + } + + if ( (bytes_written = write(fh, flat_global, flat_global_size)) != flat_global_size ) { + vtpmlogerror(VTPM_LOG_VTPM, "Failed to save service data. 
%d/%d bytes written.\n", bytes_written, flat_global_size); + status = TPM_IOERROR; + goto abort_egress; + } + vtpm_globals->DMI_table_dirty = FALSE; + + goto egress; + + abort_egress: + egress: + + free(flat_global); + close(fh); + + vtpmloginfo(VTPM_LOG_VTPM, "Saved VTPM Service state (status = %d, dmis = %d)\n", (int) status, dmis); + return status; +} + +TPM_RESULT VTPM_LoadService(void) { + + TPM_RESULT status=TPM_SUCCESS; + int fh, stat_ret, dmis=0; + long fh_size = 0, step_size; + BYTE *flat_global=NULL; + struct pack_buf_t storage_key_pack; + UINT32 *dmi_id_key; + + VTPM_DMI_RESOURCE *dmi_res; + struct stat file_stat; + + fh = open(STATE_FILE, O_RDONLY ); + stat_ret = fstat(fh, &file_stat); + if (stat_ret == 0) + fh_size = file_stat.st_size; + else { + status = TPM_IOERROR; + goto abort_egress; + } + + flat_global = (BYTE *) malloc(fh_size); + + if ((long) read(fh, flat_global, fh_size) != fh_size ) { + status = TPM_IOERROR; + goto abort_egress; + } + + // Global Values needing to be saved + step_size = BSG_UnpackList( flat_global, 4, + BSG_TPM_AUTHDATA, &vtpm_globals->owner_usage_auth, + BSG_TPM_AUTHDATA, &vtpm_globals->srk_usage_auth, + BSG_TPM_SECRET, &vtpm_globals->storage_key_usage_auth, + BSG_TPM_SIZE32_DATA, &storage_key_pack); + + TPMTRYRETURN(buffer_init(&vtpm_globals->storageKeyWrap, 0, 0) ); + TPMTRYRETURN(buffer_append_raw(&vtpm_globals->storageKeyWrap, storage_key_pack.size, storage_key_pack.data) ); + + // Per DMI values to be saved + while ( step_size < fh_size ){ + if (fh_size - step_size < (long) (sizeof(UINT32) + 2*sizeof(TPM_DIGEST))) { + vtpmlogerror(VTPM_LOG_VTPM, "Encountered %ld extra bytes at end of manager state.\n", fh_size-step_size); + step_size = fh_size; + } else { + dmi_res = (VTPM_DMI_RESOURCE *) malloc(sizeof(VTPM_DMI_RESOURCE)); + dmis++; + + dmi_res->connected = FALSE; + + step_size += BSG_UnpackList(flat_global + step_size, 3, + BSG_TYPE_UINT32, &dmi_res->dmi_id, + BSG_TPM_DIGEST, &dmi_res->NVM_measurement, + 
BSG_TPM_DIGEST, &dmi_res->DMI_measurement); + + // install into map + dmi_id_key = (UINT32 *) malloc (sizeof(UINT32)); + *dmi_id_key = dmi_res->dmi_id; + if (!hashtable_insert(vtpm_globals->dmi_map, dmi_id_key, dmi_res)) { + status = TPM_FAIL; + goto abort_egress; + } + + } + + } + + vtpmloginfo(VTPM_LOG_VTPM, "Loaded saved state (dmis = %d).\n", dmis); + goto egress; + + abort_egress: + vtpmlogerror(VTPM_LOG_VTPM, "Failed to load service data with error = %s\n", tpm_get_error_name(status)); + egress: + + if (flat_global) + free(flat_global); + close(fh); + + return status; +} diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/manager/vtpm_manager.c --- a/tools/vtpm_manager/manager/vtpm_manager.c Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/manager/vtpm_manager.c Thu Sep 22 17:42:01 2005 @@ -1,735 +1,811 @@ -// =================================================================== -// -// Copyright (c) 2005, Intel Corp. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Intel Corporation nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE -// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -// OF THE POSSIBILITY OF SUCH DAMAGE. -// =================================================================== -// -// vtpm_manager.c -// -// This file will house the main logic of the VTPM Manager -// -// ================================================================== - -#include <stdio.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <string.h> - -#ifndef VTPM_MULTI_VM -#include <pthread.h> -#include <errno.h> -#include <aio.h> -#include <time.h> -#endif - -#include "vtpm_manager.h" -#include "vtpmpriv.h" -#include "vtsp.h" -#include "bsg.h" -#include "hashtable.h" -#include "hashtable_itr.h" - -#include "log.h" -#include "buffer.h" - -VTPM_GLOBALS *vtpm_globals=NULL; - -#ifdef VTPM_MULTI_VM - #define vtpmhandlerloginfo(module,fmt,args...) vtpmloginfo (module, fmt, ##args ); - #define vtpmhandlerloginfomore(module,fmt,args...) vtpmloginfomore (module, fmt, ##args ); - #define vtpmhandlerlogerror(module,fmt,args...) vtpmlogerror (module, fmt, ##args ); -#else - #define vtpmhandlerloginfo(module,fmt,args...) vtpmloginfo (module, "[%d]: " fmt, threadType, ##args ); - #define vtpmhandlerloginfomore(module,fmt,args...) vtpmloginfomore (module, fmt, ##args ); - #define vtpmhandlerlogerror(module,fmt,args...) 
vtpmlogerror (module, "[%d]: " fmt, threadType, ##args ); -#endif - -// --------------------------- Static Auths -------------------------- -#ifdef USE_FIXED_SRK_AUTH - -static BYTE FIXED_SRK_AUTH[20] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - -static BYTE FIXED_EK_AUTH[20] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - -#endif - -// -------------------------- Hash table functions -------------------- - -static unsigned int hashfunc32(void *ky) { - return (* (UINT32 *) ky); -} - -static int equals32(void *k1, void *k2) { - return (*(UINT32 *) k1 == *(UINT32 *) k2); -} - -// --------------------------- Functions ------------------------------ - -TPM_RESULT VTPM_Create_Service(){ - - TPM_RESULT status = TPM_SUCCESS; - - // Generate Auth's for SRK & Owner -#ifdef USE_FIXED_SRK_AUTH - memcpy(vtpm_globals->owner_usage_auth, FIXED_SRK_AUTH, sizeof(TPM_AUTHDATA)); - memcpy(vtpm_globals->srk_usage_auth, FIXED_EK_AUTH, sizeof(TPM_AUTHDATA)); -#else - Crypto_GetRandom(vtpm_globals->owner_usage_auth, sizeof(TPM_AUTHDATA) ); - Crypto_GetRandom(vtpm_globals->srk_usage_auth, sizeof(TPM_AUTHDATA) ); -#endif - - // Take Owership of TPM - CRYPTO_INFO ek_cryptoInfo; - - vtpmloginfo(VTPM_LOG_VTPM, "Attempting Pubek Read. NOTE: Failure is ok.\n"); - status = VTSP_ReadPubek(vtpm_globals->manager_tcs_handle, &ek_cryptoInfo); - - // If we can read PubEK then there is no owner and we should take it. 
- if (status == TPM_SUCCESS) { - TPMTRYRETURN(VTSP_TakeOwnership(vtpm_globals->manager_tcs_handle, - (const TPM_AUTHDATA*)&vtpm_globals->owner_usage_auth, - (const TPM_AUTHDATA*)&vtpm_globals->srk_usage_auth, - &ek_cryptoInfo, - &vtpm_globals->keyAuth)); - - TPMTRYRETURN(VTSP_DisablePubekRead(vtpm_globals->manager_tcs_handle, - (const TPM_AUTHDATA*)&vtpm_globals->owner_usage_auth, - &vtpm_globals->keyAuth)); - } - - // Generate storage key's auth - Crypto_GetRandom( &vtpm_globals->storage_key_usage_auth, - sizeof(TPM_AUTHDATA) ); - - TCS_AUTH osap; - TPM_AUTHDATA sharedsecret; - - TPMTRYRETURN( VTSP_OSAP(vtpm_globals->manager_tcs_handle, - TPM_ET_SRK, - 0, - (const TPM_AUTHDATA*)&vtpm_globals->srk_usage_auth, - &sharedsecret, - &osap) ); - - TPMTRYRETURN( VTSP_CreateWrapKey( vtpm_globals->manager_tcs_handle, - TPM_KEY_BIND, - (const TPM_AUTHDATA*)&vtpm_globals->storage_key_usage_auth, - TPM_SRK_KEYHANDLE, - (const TPM_AUTHDATA*)&sharedsecret, - &vtpm_globals->storageKeyWrap, - &osap) ); - - vtpm_globals->keyAuth.fContinueAuthSession = TRUE; - - goto egress; - - abort_egress: - exit(1); - - egress: - vtpmloginfo(VTPM_LOG_VTPM, "New VTPM Service initialized (Status = %d).\n", status); - return status; - -} - - -////////////////////////////////////////////////////////////////////////////// -#ifdef VTPM_MULTI_VM -int VTPM_Service_Handler(){ -#else -void *VTPM_Service_Handler(void *threadTypePtr){ -#endif - TPM_RESULT status = TPM_FAIL; // Should never return - UINT32 dmi, in_param_size, cmd_size, out_param_size, out_message_size, out_message_size_full, dmi_cmd_size; - BYTE *cmd_header, *in_param, *out_message, *dmi_cmd; - buffer_t *command_buf=NULL, *result_buf=NULL; - TPM_TAG tag; - TPM_COMMAND_CODE ord; - VTPM_DMI_RESOURCE *dmi_res; - int size_read, size_write, i; - -#ifndef VTPM_MULTI_VM - int threadType = *(int *) threadTypePtr; - - // async io structures - struct aiocb dmi_aio; - struct aiocb *dmi_aio_a[1]; - dmi_aio_a[0] = &dmi_aio; -#endif - -#ifdef 
DUMMY_BACKEND - int dummy_rx; -#endif - - // TODO: Reinsert ifdefs to enable support for MULTI-VM - - cmd_header = (BYTE *) malloc(VTPM_COMMAND_HEADER_SIZE_SRV); - command_buf = (buffer_t *) malloc(sizeof(buffer_t)); - result_buf = (buffer_t *) malloc(sizeof(buffer_t)); - -#ifndef VTPM_MULTI_VM - TPM_RESULT *ret_value = (TPM_RESULT *) malloc(sizeof(TPM_RESULT)); -#endif - - int *tx_fh, *rx_fh; - -#ifdef VTPM_MULTI_VM - rx_fh = &vtpm_globals->be_fh; -#else - if (threadType == BE_LISTENER_THREAD) { -#ifdef DUMMY_BACKEND - dummy_rx = -1; - rx_fh = &dummy_rx; -#else - rx_fh = &vtpm_globals->be_fh; -#endif - } else { // DMI_LISTENER_THREAD - rx_fh = &vtpm_globals->vtpm_rx_fh; - } -#endif - -#ifndef VTPM_MULTI_VM - int fh; - if (threadType == BE_LISTENER_THREAD) { - tx_fh = &vtpm_globals->be_fh; - if ( (fh = open(GUEST_RX_FIFO, O_RDWR)) == -1) { - if ( mkfifo(GUEST_RX_FIFO, S_IWUSR | S_IRUSR ) ){ - *ret_value = TPM_FAIL; - pthread_exit(ret_value); - } - } else - close(fh); - - } else { // else DMI_LISTENER_THREAD - // tx_fh will be set once the DMI is identified - // But we need to make sure the read pip is created. 
- if ( (fh = open(VTPM_RX_FIFO, O_RDWR)) == -1) { - if ( mkfifo(VTPM_RX_FIFO, S_IWUSR | S_IRUSR ) ){ - *ret_value = TPM_FAIL; - pthread_exit(ret_value); - } - } else - close(fh); - - } -#endif - - while(1) { - - if (threadType == BE_LISTENER_THREAD) { - vtpmhandlerloginfo(VTPM_LOG_VTPM, "Waiting for Guest requests & ctrl messages.\n"); - } else - vtpmhandlerloginfo(VTPM_LOG_VTPM, "Waiting for DMI messages.\n"); - - - if (*rx_fh < 0) { - if (threadType == BE_LISTENER_THREAD) -#ifdef DUMMY_BACKEND - *rx_fh = open("/tmp/in.fifo", O_RDWR); -#else - *rx_fh = open(VTPM_BE_DEV, O_RDWR); -#endif - else // DMI Listener - *rx_fh = open(VTPM_RX_FIFO, O_RDWR); - - } - - if (*rx_fh < 0) { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Can't open inbound fh.\n"); -#ifdef VTPM_MULTI_VM - return TPM_IOERROR; -#else - *ret_value = TPM_IOERROR; - pthread_exit(ret_value); -#endif - } - - size_read = read(*rx_fh, cmd_header, VTPM_COMMAND_HEADER_SIZE_SRV); - if (size_read > 0) { - vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "RECV[%d}: 0x", size_read); - for (i=0; i<size_read; i++) - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", cmd_header[i]); - } else { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Can't read from BE. Aborting... \n"); - close(*rx_fh); - *rx_fh = -1; - goto abort_command; - } - - if (size_read < (int) VTPM_COMMAND_HEADER_SIZE_SRV) { - vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "\n"); - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Command shorter than normal header (%d bytes). 
Aborting...\n", size_read); - goto abort_command; - } - - BSG_UnpackList(cmd_header, 4, - BSG_TYPE_UINT32, &dmi, - BSG_TPM_TAG, &tag, - BSG_TYPE_UINT32, &in_param_size, - BSG_TPM_COMMAND_CODE, &ord ); - - // Note that in_param_size is in the client's context - cmd_size = in_param_size - VTPM_COMMAND_HEADER_SIZE_CLT; - if (cmd_size > 0) { - in_param = (BYTE *) malloc(cmd_size); - size_read = read( *rx_fh, in_param, cmd_size); - if (size_read > 0) { - for (i=0; i<size_read; i++) - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", in_param[i]); - - } else { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error reading from BE. Aborting... \n"); - close(*rx_fh); - *rx_fh = -1; - goto abort_command; - } - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); - - if (size_read < (int) cmd_size) { - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Command read(%d) is shorter than header indicates(%d). Aborting...\n", size_read, cmd_size); - goto abort_command; - } - } else { - in_param = NULL; - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); - } - - if ((threadType != BE_LISTENER_THREAD) && (dmi == 0)) { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Attempt to access dom0 commands from DMI interface. Aborting...\n"); - goto abort_command; - } - - dmi_res = (VTPM_DMI_RESOURCE *) hashtable_search(vtpm_globals->dmi_map, &dmi); - if (dmi_res == NULL) { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Attempted access to non-existent DMI in domain: %d. Aborting...\n", dmi); - goto abort_command; - } - if (!dmi_res->connected) { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Attempted access to disconnected DMI in domain: %d. Aborting...\n", dmi); - goto abort_command; - } - - if (threadType != BE_LISTENER_THREAD) - tx_fh = &dmi_res->vtpm_tx_fh; - // else we set this before the while loop since it doesn't change. 
- - if ( (buffer_init_convert(command_buf, cmd_size, in_param) != TPM_SUCCESS) || - (buffer_init(result_buf, 0, 0) != TPM_SUCCESS) ) { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Failed to setup buffers. Aborting...\n"); - goto abort_command; - } - - // Dispatch it as either control or user request. - if (tag == VTPM_TAG_REQ) { - if (dmi_res->dmi_id == VTPM_CTL_DM){ - switch (ord) { - case VTPM_ORD_OPEN: - status = VTPM_Handle_New_DMI(command_buf); - break; - - case VTPM_ORD_CLOSE: - status = VTPM_Handle_Close_DMI(command_buf); - break; - - case VTPM_ORD_DELETE: - status = VTPM_Handle_Delete_DMI(command_buf); - break; - default: - status = TPM_BAD_ORDINAL; - } // switch - } else { - - switch (ord) { - case VTPM_ORD_SAVENVM: - status= VTPM_Handle_Save_NVM(dmi_res, - command_buf, - result_buf); - break; - case VTPM_ORD_LOADNVM: - status= VTPM_Handle_Load_NVM(dmi_res, - command_buf, - result_buf); - break; - - case VTPM_ORD_TPMCOMMAND: - status= VTPM_Handle_TPM_Command(dmi_res, - command_buf, - result_buf); - break; - - default: - status = TPM_BAD_ORDINAL; - } // switch - } - } else { // This is not a VTPM Command at all - - if (threadType == BE_LISTENER_THREAD) { - if (dmi == 0) { - // This usually indicates a FE/BE driver. 
- vtpmhandlerlogerror(VTPM_LOG_VTPM, "Illegal use of TPM command from dom0\n"); - status = TPM_FAIL; - } else { - vtpmhandlerloginfo(VTPM_LOG_VTPM, "Forwarding command to DMI.\n"); - - if (dmi_res->guest_tx_fh < 0) - dmi_res->guest_tx_fh = open(dmi_res->guest_tx_fname, O_WRONLY | O_NONBLOCK); - - if (dmi_res->guest_tx_fh < 0){ - vtpmhandlerlogerror(VTPM_LOG_VTPM, "VTPM ERROR: Can't open outbound fh to dmi.\n"); - status = TPM_IOERROR; - goto abort_with_error; - } - - //Note: Send message + dmi_id - if (cmd_size) { - dmi_cmd = (BYTE *) malloc(VTPM_COMMAND_HEADER_SIZE_SRV + cmd_size); - dmi_cmd_size = VTPM_COMMAND_HEADER_SIZE_SRV + cmd_size; - memcpy(dmi_cmd, cmd_header, VTPM_COMMAND_HEADER_SIZE_SRV); - memcpy(dmi_cmd + VTPM_COMMAND_HEADER_SIZE_SRV, in_param, cmd_size); - size_write = write(dmi_res->guest_tx_fh, dmi_cmd, dmi_cmd_size); - - if (size_write > 0) { - vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "SENT (DMI): 0x"); - for (i=0; i<VTPM_COMMAND_HEADER_SIZE_SRV + cmd_size; i++) { - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", dmi_cmd[i]); - } - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); - } else { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error writing to DMI. Aborting... \n"); - close(dmi_res->guest_tx_fh); - dmi_res->guest_tx_fh = -1; - status = TPM_IOERROR; - goto abort_with_error; - } - free(dmi_cmd); - } else { - dmi_cmd_size = VTPM_COMMAND_HEADER_SIZE_SRV; - size_write = write(dmi_res->guest_tx_fh, cmd_header, VTPM_COMMAND_HEADER_SIZE_SRV ); - if (size_write > 0) { - for (i=0; i<VTPM_COMMAND_HEADER_SIZE_SRV; i++) - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", cmd_header[i]); - - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); - } else { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error writing to DMI. Aborting... 
\n"); - close(dmi_res->guest_tx_fh); - dmi_res->guest_tx_fh = -1; - status = TPM_IOERROR; - goto abort_with_error; - } - } - - if (size_write != (int) dmi_cmd_size) - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Could not write entire command to DMI (%d/%d)\n", size_write, dmi_cmd_size); - buffer_free(command_buf); - - if (vtpm_globals->guest_rx_fh < 0) - vtpm_globals->guest_rx_fh = open(GUEST_RX_FIFO, O_RDONLY); - - if (vtpm_globals->guest_rx_fh < 0){ - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Can't open inbound fh to dmi.\n"); - status = TPM_IOERROR; - goto abort_with_error; - } - - size_read = read( vtpm_globals->guest_rx_fh, cmd_header, VTPM_COMMAND_HEADER_SIZE_SRV); - if (size_read > 0) { - vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "RECV (DMI): 0x"); - for (i=0; i<size_read; i++) - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", cmd_header[i]); - - } else { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error reading from DMI. Aborting... \n"); - close(vtpm_globals->guest_rx_fh); - vtpm_globals->guest_rx_fh = -1; - status = TPM_IOERROR; - goto abort_with_error; - } - - if (size_read < (int) VTPM_COMMAND_HEADER_SIZE_SRV) { - //vtpmdeepsublog("\n"); - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Command from DMI shorter than normal header. Aborting...\n"); - status = TPM_IOERROR; - goto abort_with_error; - } - - BSG_UnpackList(cmd_header, 4, - BSG_TYPE_UINT32, &dmi, - BSG_TPM_TAG, &tag, - BSG_TYPE_UINT32, &in_param_size, - BSG_TPM_COMMAND_CODE, &status ); - - // Note that in_param_size is in the client's context - cmd_size = in_param_size - VTPM_COMMAND_HEADER_SIZE_CLT; - if (cmd_size > 0) { - in_param = (BYTE *) malloc(cmd_size); - size_read = read( vtpm_globals->guest_rx_fh, in_param, cmd_size); - if (size_read > 0) { - for (i=0; i<size_read; i++) - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", in_param[i]); - - } else { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error reading from BE. Aborting... 
\n"); - close(vtpm_globals->guest_rx_fh); - vtpm_globals->guest_rx_fh = -1; - status = TPM_IOERROR; - goto abort_with_error; - } - vtpmhandlerloginfomore(VTPM_LOG_VTPM, "\n"); - - if (size_read < (int)cmd_size) { - vtpmhandlerloginfomore(VTPM_LOG_VTPM, "\n"); - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Command read(%d) from DMI is shorter than header indicates(%d). Aborting...\n", size_read, cmd_size); - status = TPM_IOERROR; - goto abort_with_error; - } - } else { - in_param = NULL; - vtpmhandlerloginfomore(VTPM_LOG_VTPM, "\n"); - } - - if (buffer_init_convert(result_buf, cmd_size, in_param) != TPM_SUCCESS) { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Failed to setup buffers. Aborting...\n"); - status = TPM_FAIL; - goto abort_with_error; - } - - vtpmhandlerloginfo(VTPM_LOG_VTPM, "Sending DMI's response to guest.\n"); - } // end else for if (dmi==0) - - } else { // This is a DMI lister thread. Thus this is from a DMI -#ifdef VTPM_MULTI_VM - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Attempt to use unsupported direct access to TPM.\n"); - vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "Bad Command. 
dmi:%d, tag:%d, size:%d, ord:%d, Params: ", dmi, tag, in_param_size, ord); - for (UINT32 q=0; q<cmd_size; q++) - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", in_param[q]); - - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); - - status = TPM_FAIL; -#else - -#endif - } // end else for if BE Listener - } // end else for is VTPM Command - - // Send response to Backend - if (*tx_fh < 0) { - if (threadType == BE_LISTENER_THREAD) -#ifdef DUMMY_BACKEND - *tx_fh = open("/tmp/out.fifo", O_RDWR); -#else - *tx_fh = open(VTPM_BE_DEV, O_RDWR); -#endif - else // DMI Listener - *tx_fh = open(dmi_res->vtpm_tx_fname, O_WRONLY); - } - - if (*tx_fh < 0) { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "VTPM ERROR: Can't open outbound fh.\n"); -#ifdef VTPM_MULTI_VM - return TPM_IOERROR; -#else - *ret_value = TPM_IOERROR; - pthread_exit(ret_value); -#endif - } - - abort_with_error: - // Prepend VTPM header with destination DM stamped - out_param_size = buffer_len(result_buf); - out_message_size = VTPM_COMMAND_HEADER_SIZE_CLT + out_param_size; - out_message_size_full = VTPM_COMMAND_HEADER_SIZE_SRV + out_param_size; - out_message = (BYTE *) malloc (out_message_size_full); - - BSG_PackList(out_message, 4, - BSG_TYPE_UINT32, (BYTE *) &dmi, - BSG_TPM_TAG, (BYTE *) &tag, - BSG_TYPE_UINT32, (BYTE *) &out_message_size, - BSG_TPM_RESULT, (BYTE *) &status); - - if (buffer_len(result_buf) > 0) - memcpy(out_message + VTPM_COMMAND_HEADER_SIZE_SRV, result_buf->bytes, out_param_size); - - - //Note: Send message + dmi_id - size_write = write(*tx_fh, out_message, out_message_size_full ); - if (size_write > 0) { - vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "SENT: 0x"); - for (i=0; i < out_message_size_full; i++) - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", out_message[i]); - - vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); - } else { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error writing to BE. Aborting... 
\n"); - close(*tx_fh); - *tx_fh = -1; - goto abort_command; - } - free(out_message); - - if (size_write < (int)out_message_size_full) { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Unable to write full command to BE (%d/%d)\n", size_write, out_message_size_full); - goto abort_command; - } - - abort_command: - //free buffers - bzero(cmd_header, VTPM_COMMAND_HEADER_SIZE_SRV); - //free(in_param); // This was converted to command_buf. No need to free - if (command_buf != result_buf) - buffer_free(result_buf); - - buffer_free(command_buf); - -#ifndef VTPM_MULTI_VM - if (threadType != BE_LISTENER_THREAD) { -#endif - if ( (vtpm_globals->DMI_table_dirty) && - (VTPM_SaveService() != TPM_SUCCESS) ) { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "ERROR: Unable to save manager data.\n"); - } -#ifndef VTPM_MULTI_VM - } -#endif - - } // End while(1) - -} - - -/////////////////////////////////////////////////////////////////////////////// -TPM_RESULT VTPM_Init_Service() { - TPM_RESULT status = TPM_FAIL; - BYTE *randomsead; - UINT32 randomsize; - - if ((vtpm_globals = (VTPM_GLOBALS *) malloc(sizeof(VTPM_GLOBALS))) == NULL){ - status = TPM_FAIL; - goto abort_egress; - } - memset(vtpm_globals, 0, sizeof(VTPM_GLOBALS)); - vtpm_globals->be_fh = -1; - -#ifndef VTPM_MULTI_VM - vtpm_globals->vtpm_rx_fh = -1; - vtpm_globals->guest_rx_fh = -1; -#endif - if ((vtpm_globals->dmi_map = create_hashtable(10, hashfunc32, equals32)) == NULL){ - status = TPM_FAIL; - goto abort_egress; - } - - vtpm_globals->DMI_table_dirty = FALSE; - - // Create new TCS Object - vtpm_globals->manager_tcs_handle = 0; - - TPMTRYRETURN(TCS_create()); - - // Create TCS Context for service - TPMTRYRETURN( TCS_OpenContext(&vtpm_globals->manager_tcs_handle ) ); - - TPMTRYRETURN( TCSP_GetRandom(vtpm_globals->manager_tcs_handle, - &randomsize, - &randomsead)); - - Crypto_Init(randomsead, randomsize); - TPMTRYRETURN( TCS_FreeMemory (vtpm_globals->manager_tcs_handle, randomsead)); - - // Create OIAP session for service's authorized commands 
- TPMTRYRETURN( VTSP_OIAP( vtpm_globals->manager_tcs_handle, - &vtpm_globals->keyAuth) ); - vtpm_globals->keyAuth.fContinueAuthSession = TRUE; - - // If failed, create new Service. - if (VTPM_LoadService() != TPM_SUCCESS) - TPMTRYRETURN( VTPM_Create_Service() ); - - - //Load Storage Key - TPMTRYRETURN( VTSP_LoadKey( vtpm_globals->manager_tcs_handle, - TPM_SRK_KEYHANDLE, - &vtpm_globals->storageKeyWrap, - (const TPM_AUTHDATA*)&vtpm_globals->srk_usage_auth, - &vtpm_globals->storageKeyHandle, - &vtpm_globals->keyAuth, - &vtpm_globals->storageKey) ); - - // Create entry for Dom0 for control messages - TPMTRYRETURN( VTPM_Handle_New_DMI(NULL) ); - - // --------------------- Command handlers --------------------------- - - goto egress; - - abort_egress: - egress: - - return(status); -} - -void VTPM_Stop_Service() { - VTPM_DMI_RESOURCE *dmi_res; - struct hashtable_itr *dmi_itr; - - // Close all the TCS contexts. TCS should evict keys based on this - if (hashtable_count(vtpm_globals->dmi_map) > 0) { - dmi_itr = hashtable_iterator(vtpm_globals->dmi_map); - do { - dmi_res = (VTPM_DMI_RESOURCE *) hashtable_iterator_value(dmi_itr); - if (dmi_res->connected) - if (close_dmi( dmi_res ) != TPM_SUCCESS) - vtpmlogerror(VTPM_LOG_VTPM, "Failed to close dmi %d properly.\n", dmi_res->dmi_id); - - } while (hashtable_iterator_advance(dmi_itr)); - free (dmi_itr); - } - - - TCS_CloseContext(vtpm_globals->manager_tcs_handle); - - if ( (vtpm_globals->DMI_table_dirty) && - (VTPM_SaveService() != TPM_SUCCESS) ) - vtpmlogerror(VTPM_LOG_VTPM, "Unable to save manager data.\n"); - - hashtable_destroy(vtpm_globals->dmi_map, 1); - free(vtpm_globals); - - close(vtpm_globals->be_fh); - Crypto_Exit(); - - vtpmloginfo(VTPM_LOG_VTPM, "VTPM Manager stopped.\n"); -} +// =================================================================== +// +// Copyright (c) 2005, Intel Corp. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +// OF THE POSSIBILITY OF SUCH DAMAGE. 
+// =================================================================== +// +// vtpm_manager.c +// +// This file will house the main logic of the VTPM Manager +// +// ================================================================== + +#include <stdio.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <string.h> + +#ifndef VTPM_MULTI_VM +#include <pthread.h> +#include <errno.h> +#include <aio.h> +#include <time.h> +#endif + +#include "vtpm_manager.h" +#include "vtpmpriv.h" +#include "vtsp.h" +#include "bsg.h" +#include "hashtable.h" +#include "hashtable_itr.h" + +#include "log.h" +#include "buffer.h" + +VTPM_GLOBALS *vtpm_globals=NULL; + +#ifdef VTPM_MULTI_VM + #define vtpmhandlerloginfo(module,fmt,args...) vtpmloginfo (module, fmt, ##args ); + #define vtpmhandlerloginfomore(module,fmt,args...) vtpmloginfomore (module, fmt, ##args ); + #define vtpmhandlerlogerror(module,fmt,args...) vtpmlogerror (module, fmt, ##args ); +#else + #define vtpmhandlerloginfo(module,fmt,args...) vtpmloginfo (module, "[%d]: " fmt, threadType, ##args ); + #define vtpmhandlerloginfomore(module,fmt,args...) vtpmloginfomore (module, fmt, ##args ); + #define vtpmhandlerlogerror(module,fmt,args...) 
vtpmlogerror (module, "[%d]: " fmt, threadType, ##args ); +#endif + +// --------------------------- Well Known Auths -------------------------- +#ifdef WELL_KNOWN_SRK_AUTH +static BYTE FIXED_SRK_AUTH[20] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; +#endif + +#ifdef WELL_KNOWN_OWNER_AUTH +static BYTE FIXED_OWNER_AUTH[20] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; +#endif + +// -------------------------- Hash table functions -------------------- + +static unsigned int hashfunc32(void *ky) { + return (* (UINT32 *) ky); +} + +static int equals32(void *k1, void *k2) { + return (*(UINT32 *) k1 == *(UINT32 *) k2); +} + +// --------------------------- Functions ------------------------------ + +TPM_RESULT VTPM_Create_Service(){ + + TPM_RESULT status = TPM_SUCCESS; + + // Generate Auth's for SRK & Owner +#ifdef WELL_KNOWN_SRK_AUTH + memcpy(vtpm_globals->srk_usage_auth, FIXED_SRK_AUTH, sizeof(TPM_AUTHDATA)); +#else + Crypto_GetRandom(vtpm_globals->srk_usage_auth, sizeof(TPM_AUTHDATA) ); +#endif + +#ifdef WELL_KNOWN_OWNER_AUTH + memcpy(vtpm_globals->owner_usage_auth, FIXED_OWNER_AUTH, sizeof(TPM_AUTHDATA)); +#else + Crypto_GetRandom(vtpm_globals->owner_usage_auth, sizeof(TPM_AUTHDATA) ); +#endif + + // Take Owership of TPM + CRYPTO_INFO ek_cryptoInfo; + + vtpmloginfo(VTPM_LOG_VTPM, "Attempting Pubek Read. NOTE: Failure is ok.\n"); + status = VTSP_ReadPubek(vtpm_globals->manager_tcs_handle, &ek_cryptoInfo); + + // If we can read PubEK then there is no owner and we should take it. 
+ if (status == TPM_SUCCESS) { + TPMTRYRETURN(VTSP_TakeOwnership(vtpm_globals->manager_tcs_handle, + (const TPM_AUTHDATA*)&vtpm_globals->owner_usage_auth, + (const TPM_AUTHDATA*)&vtpm_globals->srk_usage_auth, + &ek_cryptoInfo, + &vtpm_globals->keyAuth)); + + TPMTRYRETURN(VTSP_DisablePubekRead(vtpm_globals->manager_tcs_handle, + (const TPM_AUTHDATA*)&vtpm_globals->owner_usage_auth, + &vtpm_globals->keyAuth)); + } + + // Generate storage key's auth + Crypto_GetRandom( &vtpm_globals->storage_key_usage_auth, + sizeof(TPM_AUTHDATA) ); + + TCS_AUTH osap; + TPM_AUTHDATA sharedsecret; + + TPMTRYRETURN( VTSP_OSAP(vtpm_globals->manager_tcs_handle, + TPM_ET_SRK, + 0, + (const TPM_AUTHDATA*)&vtpm_globals->srk_usage_auth, + &sharedsecret, + &osap) ); + + TPMTRYRETURN( VTSP_CreateWrapKey( vtpm_globals->manager_tcs_handle, + TPM_KEY_BIND, + (const TPM_AUTHDATA*)&vtpm_globals->storage_key_usage_auth, + TPM_SRK_KEYHANDLE, + (const TPM_AUTHDATA*)&sharedsecret, + &vtpm_globals->storageKeyWrap, + &osap) ); + + vtpm_globals->keyAuth.fContinueAuthSession = TRUE; + + goto egress; + + abort_egress: + exit(1); + + egress: + vtpmloginfo(VTPM_LOG_VTPM, "Finished initialized new VTPM service (Status = %d).\n", status); + return status; + +} + + +////////////////////////////////////////////////////////////////////////////// +#ifdef VTPM_MULTI_VM +int VTPM_Service_Handler(){ +#else +void *VTPM_Service_Handler(void *threadTypePtr){ +#endif + TPM_RESULT status = TPM_FAIL; // Should never return + UINT32 dmi, in_param_size, cmd_size, out_param_size, out_message_size, out_message_size_full; + BYTE *cmd_header, *in_param, *out_message; + buffer_t *command_buf=NULL, *result_buf=NULL; + TPM_TAG tag; + TPM_COMMAND_CODE ord; + VTPM_DMI_RESOURCE *dmi_res; + int size_read, size_write, i; + +#ifndef VTPM_MULTI_VM + UINT32 dmi_cmd_size; + BYTE *dmi_cmd; + int threadType = *(int *) threadTypePtr; + + // async io structures + struct aiocb dmi_aio; + struct aiocb *dmi_aio_a[1]; + dmi_aio_a[0] = &dmi_aio; 
+#endif + +#ifdef DUMMY_BACKEND + int dummy_rx; +#endif + + cmd_header = (BYTE *) malloc(VTPM_COMMAND_HEADER_SIZE_SRV); + command_buf = (buffer_t *) malloc(sizeof(buffer_t)); + result_buf = (buffer_t *) malloc(sizeof(buffer_t)); + +#ifndef VTPM_MULTI_VM + TPM_RESULT *ret_value = (TPM_RESULT *) malloc(sizeof(TPM_RESULT)); +#endif + + int *tx_fh, // Pointer to the filehandle this function will write to + *rx_fh; // Pointer to the filehandle this function will read from + // For a multi VM VTPM system, this function tx/rx with the BE + // via vtpm_globals->be_fh. + // For a single VM system, the BE_LISTENER_THREAD tx/rx with theBE + // via vtpm_globals->be_fh, and the DMI_LISTENER_THREAD rx from + // vtpm_globals->vtpm_rx_fh and tx to dmi_res->vtpm_tx_fh + + // Set rx_fh to point to the correct fh based on this mode. +#ifdef VTPM_MULTI_VM + rx_fh = &vtpm_globals->be_fh; +#else + if (threadType == BE_LISTENER_THREAD) { + #ifdef DUMMY_BACKEND + dummy_rx = -1; + rx_fh = &dummy_rx; + #else + rx_fh = &vtpm_globals->be_fh; + #endif + } else { // DMI_LISTENER_THREAD + rx_fh = &vtpm_globals->vtpm_rx_fh; + } +#endif + + // Set tx_fh to point to the correct fh based on this mode (If static) + // Create any fifos that these fh will use. +#ifndef VTPM_MULTI_VM + int fh; + if (threadType == BE_LISTENER_THREAD) { + tx_fh = &vtpm_globals->be_fh; + if ( (fh = open(GUEST_RX_FIFO, O_RDWR)) == -1) { + if ( mkfifo(GUEST_RX_FIFO, S_IWUSR | S_IRUSR ) ){ + vtpmlogerror(VTPM_LOG_VTPM, "Unable to create FIFO: %s.\n", GUEST_RX_FIFO); + *ret_value = TPM_FAIL; + pthread_exit(ret_value); + } + } else + close(fh); + + } else { // else DMI_LISTENER_THREAD + // tx_fh will be set once the DMI is identified + // But we need to make sure the read pip is created. 
+ if ( (fh = open(VTPM_RX_FIFO, O_RDWR)) == -1) { + if ( mkfifo(VTPM_RX_FIFO, S_IWUSR | S_IRUSR ) ){ + vtpmlogerror(VTPM_LOG_VTPM, "Unable to create FIFO: %s.\n", VTPM_RX_FIFO); + *ret_value = TPM_FAIL; + pthread_exit(ret_value); + } + } else + close(fh); + + } +#else + tx_fh = &vtpm_globals->be_fh; +#endif + + ////////////////////////// Main Loop ////////////////////////////////// + while(1) { + +#ifdef VTPM_MULTI_VM + vtpmhandlerloginfo(VTPM_LOG_VTPM, "Waiting for DMI messages.\n"); +#else + if (threadType == BE_LISTENER_THREAD) { + vtpmhandlerloginfo(VTPM_LOG_VTPM, "Waiting for Guest requests & ctrl messages.\n"); + } else + vtpmhandlerloginfo(VTPM_LOG_VTPM, "Waiting for DMI messages.\n"); +#endif + + // Check status of rx_fh. If necessary attempt to re-open it. + if (*rx_fh < 0) { +#ifdef VTPM_MULTI_VM + *rx_fh = open(VTPM_BE_DEV, O_RDWR); +#else + if (threadType == BE_LISTENER_THREAD) + #ifdef DUMMY_BACKEND + *rx_fh = open("/tmp/in.fifo", O_RDWR); + #else + *rx_fh = open(VTPM_BE_DEV, O_RDWR); + #endif + else // DMI Listener + *rx_fh = open(VTPM_RX_FIFO, O_RDWR); +#endif + } + + // Respond to failures to open rx_fh + if (*rx_fh < 0) { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Can't open inbound fh.\n"); +#ifdef VTPM_MULTI_VM + return TPM_IOERROR; +#else + *ret_value = TPM_IOERROR; + pthread_exit(ret_value); +#endif + } + + // Read command header from rx_fh + size_read = read(*rx_fh, cmd_header, VTPM_COMMAND_HEADER_SIZE_SRV); + if (size_read > 0) { + vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "RECV[%d}: 0x", size_read); + for (i=0; i<size_read; i++) + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", cmd_header[i]); + } else { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Can't read from BE. Aborting... \n"); + close(*rx_fh); + *rx_fh = -1; + goto abort_command; + } + + if (size_read < (int) VTPM_COMMAND_HEADER_SIZE_SRV) { + vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "\n"); + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Command shorter than normal header (%d bytes). 
Aborting...\n", size_read); + goto abort_command; + } + + // Unpack header + BSG_UnpackList(cmd_header, 4, + BSG_TYPE_UINT32, &dmi, + BSG_TPM_TAG, &tag, + BSG_TYPE_UINT32, &in_param_size, + BSG_TPM_COMMAND_CODE, &ord ); + + // Using the header info, read from rx_fh the parameters of the command + // Note that in_param_size is in the client's context + cmd_size = in_param_size - VTPM_COMMAND_HEADER_SIZE_CLT; + if (cmd_size > 0) { + in_param = (BYTE *) malloc(cmd_size); + size_read = read( *rx_fh, in_param, cmd_size); + if (size_read > 0) { + for (i=0; i<size_read; i++) + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", in_param[i]); + + } else { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error reading from cmd. Aborting... \n"); + close(*rx_fh); + *rx_fh = -1; + goto abort_command; + } + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + + if (size_read < (int) cmd_size) { + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Command read(%d) is shorter than header indicates(%d). Aborting...\n", size_read, cmd_size); + goto abort_command; + } + } else { + in_param = NULL; + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + } + +#ifndef VTPM_MULTI_VM + // It's illegal to receive a Dom0 command from a DMI. + if ((threadType != BE_LISTENER_THREAD) && (dmi == 0)) { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Attempt to access dom0 commands from DMI interface. Aborting...\n"); + goto abort_command; + } +#endif + + // Fetch infomation about the DMI issuing the request. + dmi_res = (VTPM_DMI_RESOURCE *) hashtable_search(vtpm_globals->dmi_map, &dmi); + if (dmi_res == NULL) { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Attempted access to non-existent DMI in domain: %d. Aborting...\n", dmi); + goto abort_command; + } + if (!dmi_res->connected) { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Attempted access to disconnected DMI in domain: %d. 
Aborting...\n", dmi); + goto abort_command; + } + +#ifndef VTPM_MULTI_VM + // Now that we know which DMI this is, we can set the tx_fh handle. + if (threadType != BE_LISTENER_THREAD) + tx_fh = &dmi_res->vtpm_tx_fh; + // else we set this before the while loop since it doesn't change. +#endif + + // Init the buffers used to handle the command and the response + if ( (buffer_init_convert(command_buf, cmd_size, in_param) != TPM_SUCCESS) || + (buffer_init(result_buf, 0, 0) != TPM_SUCCESS) ) { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Failed to setup buffers. Aborting...\n"); + goto abort_command; + } + + // Dispatch it as either control or user request. + if (tag == VTPM_TAG_REQ) { + if (dmi_res->dmi_id == VTPM_CTL_DM){ + switch (ord) { + case VTPM_ORD_OPEN: + status = VTPM_Handle_New_DMI(command_buf); + break; + + case VTPM_ORD_CLOSE: + status = VTPM_Handle_Close_DMI(command_buf); + break; + + case VTPM_ORD_DELETE: + status = VTPM_Handle_Delete_DMI(command_buf); + break; + default: + status = TPM_BAD_ORDINAL; + } // switch + } else { + + switch (ord) { + case VTPM_ORD_SAVENVM: + status= VTPM_Handle_Save_NVM(dmi_res, + command_buf, + result_buf); + break; + case VTPM_ORD_LOADNVM: + status= VTPM_Handle_Load_NVM(dmi_res, + command_buf, + result_buf); + break; + + case VTPM_ORD_TPMCOMMAND: + status= VTPM_Handle_TPM_Command(dmi_res, + command_buf, + result_buf); + break; + + default: + status = TPM_BAD_ORDINAL; + } // switch + } + } else { // This is not a VTPM Command at all. + // This happens in two cases. + // MULTI_VM = A DMI illegally sent a raw TPM command to the manager + // Single VM: + // BE_LISTENER_THREAD: Guest issued a TPM command. + // Send this to DMI and wait for response + // DMI_LISTENER_THREAD: A DMI illegally sent a raw TPM command. + +#ifdef VTPM_MULTI_VM + // Raw TPM commands are not supported from the DMI + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Attempt to use unsupported direct access to TPM.\n"); + vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "Bad Command. 
dmi:%d, tag:%d, size:%d, ord:%d, Params: ", dmi, tag, in_param_size, ord); + for (i=0; i<cmd_size; i++) + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", in_param[i]); + + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + status = TPM_FAIL; + +#else + // If BE_LISTENER_THREAD then this is a TPM command from a guest + if (threadType == BE_LISTENER_THREAD) { + // Dom0 can't talk to the BE, so this must be a broken FE/BE or badness + if (dmi == 0) { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Illegal use of TPM command from dom0\n"); + status = TPM_FAIL; + } else { + vtpmhandlerloginfo(VTPM_LOG_VTPM, "Forwarding command to DMI.\n"); + + // open the dmi_res->guest_tx_fh to send command to DMI + if (dmi_res->guest_tx_fh < 0) + dmi_res->guest_tx_fh = open(dmi_res->guest_tx_fname, O_WRONLY | O_NONBLOCK); + + // handle failed opens dmi_res->guest_tx_fh + if (dmi_res->guest_tx_fh < 0){ + vtpmhandlerlogerror(VTPM_LOG_VTPM, "VTPM ERROR: Can't open outbound fh to dmi.\n"); + status = TPM_IOERROR; + goto abort_with_error; + } + + //Forward TPM CMD stamped with dmi_id to DMI for handling + if (cmd_size) { + dmi_cmd = (BYTE *) malloc(VTPM_COMMAND_HEADER_SIZE_SRV + cmd_size); + dmi_cmd_size = VTPM_COMMAND_HEADER_SIZE_SRV + cmd_size; + memcpy(dmi_cmd, cmd_header, VTPM_COMMAND_HEADER_SIZE_SRV); + memcpy(dmi_cmd + VTPM_COMMAND_HEADER_SIZE_SRV, in_param, cmd_size); + size_write = write(dmi_res->guest_tx_fh, dmi_cmd, dmi_cmd_size); + + if (size_write > 0) { + vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "SENT (DMI): 0x"); + for (i=0; i<VTPM_COMMAND_HEADER_SIZE_SRV + cmd_size; i++) { + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", dmi_cmd[i]); + } + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + } else { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error writing to DMI. Aborting... 
\n"); + close(dmi_res->guest_tx_fh); + dmi_res->guest_tx_fh = -1; + status = TPM_IOERROR; + goto abort_with_error; + } + free(dmi_cmd); + } else { + dmi_cmd_size = VTPM_COMMAND_HEADER_SIZE_SRV; + size_write = write(dmi_res->guest_tx_fh, cmd_header, VTPM_COMMAND_HEADER_SIZE_SRV ); + if (size_write > 0) { + for (i=0; i<VTPM_COMMAND_HEADER_SIZE_SRV; i++) + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", cmd_header[i]); + + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + } else { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error writing to DMI. Aborting... \n"); + close(dmi_res->guest_tx_fh); + dmi_res->guest_tx_fh = -1; + status = TPM_IOERROR; + goto abort_with_error; + } + } + + if (size_write != (int) dmi_cmd_size) + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Could not write entire command to DMI (%d/%d)\n", size_write, dmi_cmd_size); + buffer_free(command_buf); + + // Open vtpm_globals->guest_rx_fh to receive DMI response + if (vtpm_globals->guest_rx_fh < 0) + vtpm_globals->guest_rx_fh = open(GUEST_RX_FIFO, O_RDONLY); + + // Handle open failures + if (vtpm_globals->guest_rx_fh < 0){ + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Can't open inbound fh to dmi.\n"); + status = TPM_IOERROR; + goto abort_with_error; + } + + // Read header for response to TPM command from DMI + size_read = read( vtpm_globals->guest_rx_fh, cmd_header, VTPM_COMMAND_HEADER_SIZE_SRV); + if (size_read > 0) { + vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "RECV (DMI): 0x"); + for (i=0; i<size_read; i++) + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", cmd_header[i]); + + } else { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error reading from DMI. Aborting... \n"); + close(vtpm_globals->guest_rx_fh); + vtpm_globals->guest_rx_fh = -1; + status = TPM_IOERROR; + goto abort_with_error; + } + + if (size_read < (int) VTPM_COMMAND_HEADER_SIZE_SRV) { + //vtpmdeepsublog("\n"); + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Command from DMI shorter than normal header. 
Aborting...\n"); + status = TPM_IOERROR; + goto abort_with_error; + } + + // Unpack response from DMI for TPM command + BSG_UnpackList(cmd_header, 4, + BSG_TYPE_UINT32, &dmi, + BSG_TPM_TAG, &tag, + BSG_TYPE_UINT32, &in_param_size, + BSG_TPM_COMMAND_CODE, &status ); + + // If response has parameters, read them. + // Note that in_param_size is in the client's context + cmd_size = in_param_size - VTPM_COMMAND_HEADER_SIZE_CLT; + if (cmd_size > 0) { + in_param = (BYTE *) malloc(cmd_size); + size_read = read( vtpm_globals->guest_rx_fh, in_param, cmd_size); + if (size_read > 0) { + for (i=0; i<size_read; i++) + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", in_param[i]); + + } else { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error reading from BE. Aborting... \n"); + close(vtpm_globals->guest_rx_fh); + vtpm_globals->guest_rx_fh = -1; + status = TPM_IOERROR; + goto abort_with_error; + } + vtpmhandlerloginfomore(VTPM_LOG_VTPM, "\n"); + + if (size_read < (int)cmd_size) { + vtpmhandlerloginfomore(VTPM_LOG_VTPM, "\n"); + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Command read(%d) from DMI is shorter than header indicates(%d). Aborting...\n", size_read, cmd_size); + status = TPM_IOERROR; + goto abort_with_error; + } + } else { + in_param = NULL; + vtpmhandlerloginfomore(VTPM_LOG_VTPM, "\n"); + } + + if (buffer_init_convert(result_buf, cmd_size, in_param) != TPM_SUCCESS) { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Failed to setup buffers. Aborting...\n"); + status = TPM_FAIL; + goto abort_with_error; + } + + vtpmhandlerloginfo(VTPM_LOG_VTPM, "Sending DMI's response to guest.\n"); + } // end else for if (dmi==0) + + } else { // This is a DMI lister thread. Thus this is from a DMI + // Raw TPM commands are not supported from the DMI + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Attempt to use unsupported direct access to TPM.\n"); + vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "Bad Command. 
dmi:%d, tag:%d, size:%d, ord:%d, Params: ", dmi, tag, in_param_size, ord); + for (i=0; i<cmd_size; i++) + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", in_param[i]); + + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + + status = TPM_FAIL; + } // end else for if BE Listener +#endif + + } // end else for is VTPM Command + + // This marks the beginning of preparing response to be sent out. + // Errors while handling responses jump here to reply with error messages + // NOTE: Currently there are no recoverable errors in multi-VM mode. If one + // is added to the code, this ifdef should be removed. + // Also note this is NOT referring to errors in commands, but rather + // this is about I/O errors and such. +#ifndef VTPM_MULTI_VM + abort_with_error: +#endif + + // Open tx_fh in preperation to send reponse back + if (*tx_fh < 0) { +#ifdef VTPM_MULTI_VM + *tx_fh = open(VTPM_BE_DEV, O_RDWR); +#else + if (threadType == BE_LISTENER_THREAD) + #ifdef DUMMY_BACKEND + *tx_fh = open("/tmp/out.fifo", O_RDWR); + #else + *tx_fh = open(VTPM_BE_DEV, O_RDWR); + #endif + else // DMI Listener + *tx_fh = open(dmi_res->vtpm_tx_fname, O_WRONLY); +#endif + } + + + // Handle failed open + if (*tx_fh < 0) { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "VTPM ERROR: Can't open outbound fh.\n"); +#ifdef VTPM_MULTI_VM + return TPM_IOERROR; +#else + *ret_value = TPM_IOERROR; + pthread_exit(ret_value); +#endif + } + + // Prepend VTPM header with destination DM stamped + out_param_size = buffer_len(result_buf); + out_message_size = VTPM_COMMAND_HEADER_SIZE_CLT + out_param_size; + out_message_size_full = VTPM_COMMAND_HEADER_SIZE_SRV + out_param_size; + out_message = (BYTE *) malloc (out_message_size_full); + + BSG_PackList(out_message, 4, + BSG_TYPE_UINT32, (BYTE *) &dmi, + BSG_TPM_TAG, (BYTE *) &tag, + BSG_TYPE_UINT32, (BYTE *) &out_message_size, + BSG_TPM_RESULT, (BYTE *) &status); + + if (buffer_len(result_buf) > 0) + memcpy(out_message + VTPM_COMMAND_HEADER_SIZE_SRV, result_buf->bytes, 
out_param_size); + + + //Note: Send message + dmi_id + size_write = write(*tx_fh, out_message, out_message_size_full ); + if (size_write > 0) { + vtpmhandlerloginfo(VTPM_LOG_VTPM_DEEP, "SENT: 0x"); + for (i=0; i < out_message_size_full; i++) + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", out_message[i]); + + vtpmhandlerloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + } else { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Error writing to BE. Aborting... \n"); + close(*tx_fh); + *tx_fh = -1; + goto abort_command; + } + free(out_message); + + if (size_write < (int)out_message_size_full) { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Unable to write full command to BE (%d/%d)\n", size_write, out_message_size_full); + goto abort_command; + } + + // On certain failures an error message cannot be sent. + // This marks the beginning of cleanup in preperation for the next command. + abort_command: + //free buffers + bzero(cmd_header, VTPM_COMMAND_HEADER_SIZE_SRV); + //free(in_param); // This was converted to command_buf. 
No need to free + if (command_buf != result_buf) + buffer_free(result_buf); + + buffer_free(command_buf); + +#ifndef VTPM_MULTI_VM + if (threadType != BE_LISTENER_THREAD) { +#endif + if ( (vtpm_globals->DMI_table_dirty) && + (VTPM_SaveService() != TPM_SUCCESS) ) { + vtpmhandlerlogerror(VTPM_LOG_VTPM, "ERROR: Unable to save manager data.\n"); + } +#ifndef VTPM_MULTI_VM + } +#endif + + } // End while(1) + +} + + +/////////////////////////////////////////////////////////////////////////////// +TPM_RESULT VTPM_Init_Service() { + TPM_RESULT status = TPM_FAIL; + BYTE *randomsead; + UINT32 randomsize; + + if ((vtpm_globals = (VTPM_GLOBALS *) malloc(sizeof(VTPM_GLOBALS))) == NULL){ + status = TPM_FAIL; + goto abort_egress; + } + memset(vtpm_globals, 0, sizeof(VTPM_GLOBALS)); + vtpm_globals->be_fh = -1; + +#ifndef VTPM_MULTI_VM + vtpm_globals->vtpm_rx_fh = -1; + vtpm_globals->guest_rx_fh = -1; +#endif + if ((vtpm_globals->dmi_map = create_hashtable(10, hashfunc32, equals32)) == NULL){ + status = TPM_FAIL; + goto abort_egress; + } + + vtpm_globals->DMI_table_dirty = FALSE; + + // Create new TCS Object + vtpm_globals->manager_tcs_handle = 0; + + TPMTRYRETURN(TCS_create()); + + // Create TCS Context for service + TPMTRYRETURN( TCS_OpenContext(&vtpm_globals->manager_tcs_handle ) ); + + TPMTRYRETURN( TCSP_GetRandom(vtpm_globals->manager_tcs_handle, + &randomsize, + &randomsead)); + + Crypto_Init(randomsead, randomsize); + TPMTRYRETURN( TCS_FreeMemory (vtpm_globals->manager_tcs_handle, randomsead)); + + // Create OIAP session for service's authorized commands + TPMTRYRETURN( VTSP_OIAP( vtpm_globals->manager_tcs_handle, + &vtpm_globals->keyAuth) ); + vtpm_globals->keyAuth.fContinueAuthSession = TRUE; + + // If failed, create new Service. 
+ if (VTPM_LoadService() != TPM_SUCCESS) + TPMTRYRETURN( VTPM_Create_Service() ); + + //Load Storage Key + TPMTRYRETURN( VTSP_LoadKey( vtpm_globals->manager_tcs_handle, + TPM_SRK_KEYHANDLE, + &vtpm_globals->storageKeyWrap, + (const TPM_AUTHDATA*)&vtpm_globals->srk_usage_auth, + &vtpm_globals->storageKeyHandle, + &vtpm_globals->keyAuth, + &vtpm_globals->storageKey) ); + + // Create entry for Dom0 for control messages + TPMTRYRETURN( VTPM_Handle_New_DMI(NULL) ); + + // --------------------- Command handlers --------------------------- + + goto egress; + + abort_egress: + egress: + + return(status); +} + +void VTPM_Stop_Service() { + VTPM_DMI_RESOURCE *dmi_res; + struct hashtable_itr *dmi_itr; + + // Close all the TCS contexts. TCS should evict keys based on this + if (hashtable_count(vtpm_globals->dmi_map) > 0) { + dmi_itr = hashtable_iterator(vtpm_globals->dmi_map); + do { + dmi_res = (VTPM_DMI_RESOURCE *) hashtable_iterator_value(dmi_itr); + if (dmi_res->connected) + close_dmi( dmi_res ); // Not really interested in return code + + } while (hashtable_iterator_advance(dmi_itr)); + free (dmi_itr); + } + + + TCS_CloseContext(vtpm_globals->manager_tcs_handle); + + if ( (vtpm_globals->DMI_table_dirty) && + (VTPM_SaveService() != TPM_SUCCESS) ) + vtpmlogerror(VTPM_LOG_VTPM, "Unable to save manager data.\n"); + + hashtable_destroy(vtpm_globals->dmi_map, 1); + free(vtpm_globals); + + close(vtpm_globals->be_fh); + Crypto_Exit(); + + vtpmloginfo(VTPM_LOG_VTPM, "VTPM Manager stopped.\n"); +} diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/manager/vtpmpriv.h --- a/tools/vtpm_manager/manager/vtpmpriv.h Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/manager/vtpmpriv.h Thu Sep 22 17:42:01 2005 @@ -47,8 +47,8 @@ #define STATE_FILE "/var/vtpm/VTPM" #define DMI_NVM_FILE "/var/vtpm/vtpm_dm_%d.data" -#define VTPM_BE_DEV "/dev/vtpm" -#define VTPM_CTL_DM 0 +#define VTPM_BE_DEV "/dev/vtpm0" +#define VTPM_CTL_DM 0 #ifndef VTPM_MUTLI_VM #include <sys/types.h> diff -r 
97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/tcs/Makefile --- a/tools/vtpm_manager/tcs/Makefile Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/tcs/Makefile Thu Sep 22 17:42:01 2005 @@ -13,6 +13,7 @@ rm -f *.a *.so *.o *.rpm $(DEP_FILES) mrproper: clean + rm -f *~ $(BIN): $(OBJS) $(AR) rcs $(BIN) $(OBJS) diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/tcs/contextmgr.c --- a/tools/vtpm_manager/tcs/contextmgr.c Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/tcs/contextmgr.c Thu Sep 22 17:42:01 2005 @@ -43,6 +43,7 @@ #include "tcs.h" #include "contextmgr.h" #include "log.h" +#include "hashtable.h" BYTE* AddMemBlock(CONTEXT_HANDLE* pContextHandle, // in int BlockSize) { // in @@ -131,12 +132,14 @@ return bFound; } -BOOL AddHandleToList(CONTEXT_HANDLE* pContextHandle, // in +BOOL AddHandleToList(TCS_CONTEXT_HANDLE hContext, // in TPM_RESOURCE_TYPE type, // in TPM_HANDLE handle) { // in HANDLE_LIST* pNewHandle = NULL; - + vtpmloginfo(VTPM_LOG_TCS_DEEP, "Adding Handle to list\n"); + CONTEXT_HANDLE* pContextHandle = LookupContext(hContext); + if (pContextHandle == NULL) return 0; @@ -154,11 +157,13 @@ return 1; } -BOOL DeleteHandleFromList( CONTEXT_HANDLE* pContextHandle, // in +BOOL DeleteHandleFromList( TCS_CONTEXT_HANDLE hContext, // in TPM_HANDLE handle) { // in + CONTEXT_HANDLE* pContextHandle = LookupContext(hContext); + HANDLE_LIST *pCurrentHandle = pContextHandle->pHandleList, - *pLastHandle = pCurrentHandle; + *pLastHandle = pCurrentHandle; vtpmloginfo(VTPM_LOG_TCS_DEEP, "Deleting Handle from list\n"); @@ -202,10 +207,10 @@ switch (pCurrentHandle->type) { case TPM_RT_KEY: - returncode = returncode && !TCSP_EvictKey((TCS_CONTEXT_HANDLE) pContextHandle, pCurrentHandle->handle); + returncode = returncode && !TCSP_EvictKey(pContextHandle->handle, pCurrentHandle->handle); break; case TPM_RT_AUTH: - returncode = returncode && !TCSP_TerminateHandle((TCS_CONTEXT_HANDLE) pContextHandle, pCurrentHandle->handle); + returncode = returncode && 
!TCSP_TerminateHandle(pContextHandle->handle, pCurrentHandle->handle); break; default: returncode = FALSE; diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/tcs/contextmgr.h --- a/tools/vtpm_manager/tcs/contextmgr.h Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/tcs/contextmgr.h Thu Sep 22 17:42:01 2005 @@ -57,6 +57,7 @@ } HANDLE_LIST; typedef struct context_handle { + TCS_CONTEXT_HANDLE handle; int nBlockCount; BLOCK* pTopBlock; HANDLE_LIST* pHandleList; @@ -69,11 +70,11 @@ BYTE* pTCPA_BYTEs); // in -BOOL AddHandleToList( CONTEXT_HANDLE* pContextHandle, // in +BOOL AddHandleToList( TCS_CONTEXT_HANDLE hContext, // in TPM_RESOURCE_TYPE type, // in TPM_HANDLE handle); // in -BOOL DeleteHandleFromList( CONTEXT_HANDLE* pContextHandle, // in +BOOL DeleteHandleFromList( TCS_CONTEXT_HANDLE hContext, // in TPM_HANDLE handle); // in BOOL FreeHandleList( CONTEXT_HANDLE* pContextHandle); // in diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/tcs/tcs.c --- a/tools/vtpm_manager/tcs/tcs.c Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/tcs/tcs.c Thu Sep 22 17:42:01 2005 @@ -47,9 +47,10 @@ #include "contextmgr.h" #include "tpmddl.h" #include "log.h" +#include "hashtable.h" +#include "hashtable_itr.h" // Static Global Vars for the TCS -static BOOL TCS_m_bConnected; static int TCS_m_nCount = 0; #define TCPA_MAX_BUFFER_LENGTH 0x2000 @@ -57,6 +58,21 @@ static BYTE InBuf [TCPA_MAX_BUFFER_LENGTH]; static BYTE OutBuf[TCPA_MAX_BUFFER_LENGTH]; +struct hashtable *context_ht; + +// -------------------------- Hash table functions -------------------- + +static unsigned int hashfunc32(void *ky) { + return (* (UINT32 *) ky); +} + +static int equals32(void *k1, void *k2) { + return (*(UINT32 *) k1 == *(UINT32 *) k2); +} + +CONTEXT_HANDLE *LookupContext( TCS_CONTEXT_HANDLE hContext) { + return( (CONTEXT_HANDLE *) hashtable_search(context_ht, &hContext) ); +} // --------------------------------------------------------------------------------- // Initialization/Uninitialization 
SubComponent API @@ -64,34 +80,50 @@ TPM_RESULT TCS_create() { TDDL_RESULT hRes = TDDL_E_FAIL; TPM_RESULT result = TPM_FAIL; - TCS_m_bConnected = FALSE; if (TCS_m_nCount == 0) { vtpmloginfo(VTPM_LOG_TCS, "Constructing new TCS:\n"); hRes = TDDL_Open(); - - if (hRes == TDDL_SUCCESS) { - TCS_m_bConnected = TRUE; + + context_ht = create_hashtable(10, hashfunc32, equals32); + + if ((hRes == TDDL_SUCCESS) && (context_ht != NULL)) { result = TPM_SUCCESS; + TCS_m_nCount++; + } else { + result = TPM_IOERROR; + hashtable_destroy(context_ht, 1); } } else - TCS_m_bConnected = TRUE; - - TCS_m_nCount++; - + TCS_m_nCount++; + return(result); } void TCS_destroy() { - // FIXME: Should iterate through all open contexts and close them. TCS_m_nCount--; - if (TCS_m_bConnected == TRUE && TCS_m_nCount == 0) { + if (TCS_m_nCount == 0) { vtpmloginfo(VTPM_LOG_TCS, "Destructing TCS:\n"); TDDL_Close(); - TCS_m_bConnected = FALSE; + + struct hashtable_itr *context_itr; + TCS_CONTEXT_HANDLE *hContext; + + // Close all the TCS contexts. 
TCS should evict keys based on this + if (hashtable_count(context_ht) > 0) { + context_itr = hashtable_iterator(context_ht); + do { + hContext = (TCS_CONTEXT_HANDLE *) hashtable_iterator_key(context_itr); + if (TCS_CloseContext(*hContext) != TPM_SUCCESS) + vtpmlogerror(VTPM_LOG_TCS, "Failed to close context %d properly.\n", *hContext); + + } while (hashtable_iterator_advance(context_itr)); + free(context_itr); + } + hashtable_destroy(context_ht, 1); } } @@ -101,7 +133,7 @@ BYTE** ppMemPtr) {// out TPM_RESULT returnCode = TPM_FAIL; - CONTEXT_HANDLE* pContextHandle = (CONTEXT_HANDLE*)hContext; + CONTEXT_HANDLE* pContextHandle = LookupContext(hContext); if (pContextHandle != NULL && ppMemPtr != NULL) { *ppMemPtr = (BYTE *)AddMemBlock(pContextHandle, MemSize); @@ -114,7 +146,7 @@ TPM_RESULT TCS_FreeMemory( TCS_CONTEXT_HANDLE hContext, // in BYTE* pMemory) { // in TPM_RESULT returnCode = TPM_FAIL; - CONTEXT_HANDLE* pContextHandle = (CONTEXT_HANDLE*)hContext; + CONTEXT_HANDLE* pContextHandle = LookupContext(hContext); if ( (pContextHandle != NULL && pMemory != NULL) && (DeleteMemBlock(pContextHandle, pMemory) == TRUE) ) @@ -126,15 +158,15 @@ TPM_RESULT TCS_OpenContext(TCS_CONTEXT_HANDLE* hContext) { // out TPM_RESULT returnCode = TPM_FAIL; + TCS_CONTEXT_HANDLE *newContext; vtpmloginfo(VTPM_LOG_TCS, "Calling TCS_OpenContext:\n"); // hContext must point to a null memory context handle if(*hContext == HANDLE_NULL) { - CONTEXT_HANDLE* pContextHandle = (CONTEXT_HANDLE *)malloc(sizeof(CONTEXT_HANDLE)); + CONTEXT_HANDLE* pContextHandle = (CONTEXT_HANDLE *) malloc(sizeof(CONTEXT_HANDLE)); if (pContextHandle == NULL) return TPM_SIZE; - // initialize to 0 pContextHandle->nBlockCount = 0; @@ -144,19 +176,32 @@ // Create New Block AddMemBlock(pContextHandle, BLOCK_SIZE); - *hContext = (TCS_CONTEXT_HANDLE)pContextHandle; - returnCode = TPM_SUCCESS; + newContext = (TCS_CONTEXT_HANDLE *) malloc(sizeof(TCS_CONTEXT_HANDLE)); + *newContext = (TCS_CONTEXT_HANDLE) (((uintptr_t) 
pContextHandle >> 2) & 0xffffffff); + + if (hashtable_search(context_ht, &newContext) !=NULL) + *newContext += 1; + + pContextHandle->handle = *newContext; + if (!hashtable_insert(context_ht, newContext, pContextHandle)) { + free(newContext); + free(pContextHandle); + returnCode = TPM_FAIL; + } else { + *hContext = *newContext; + returnCode = TPM_SUCCESS; + } } return(returnCode); } TPM_RESULT TCS_CloseContext(TCS_CONTEXT_HANDLE hContext) {// in - //FIXME: TCS SHOULD Track track failed auths and make sure + //FIXME: TCS SHOULD Track failed auths and make sure //we don't try and re-free them here. TPM_RESULT returnCode = TPM_FAIL; - CONTEXT_HANDLE* pContextHandle = (CONTEXT_HANDLE*)hContext; + CONTEXT_HANDLE* pContextHandle = LookupContext(hContext); if(pContextHandle != NULL) { // Print test info @@ -171,6 +216,9 @@ vtpmlogerror(VTPM_LOG_TCS, "Not all handles evicted from TPM.\n"); // Release the TPM's resources + if (hashtable_remove(context_ht, &hContext) == NULL) + vtpmlogerror(VTPM_LOG_TCS, "Not all handles evicted from TPM.\n"); + free(pContextHandle); returnCode = TPM_SUCCESS; } @@ -255,7 +303,7 @@ BSG_TYPE_UINT32, authHandle, BSG_TPM_NONCE, nonce0); - if (!AddHandleToList((CONTEXT_HANDLE *)hContext, TPM_RT_AUTH, *authHandle)) + if (!AddHandleToList(hContext, TPM_RT_AUTH, *authHandle)) vtpmlogerror(VTPM_LOG_TCS, "New AuthHandle not recorded\n"); vtpmloginfo(VTPM_LOG_TCS_DEEP, "Received paramSize : %d\n", paramSize); @@ -321,7 +369,7 @@ BSG_TPM_NONCE, nonceEven, BSG_TPM_NONCE, nonceEvenOSAP); - if (!AddHandleToList((CONTEXT_HANDLE *)hContext, TPM_RT_AUTH, *authHandle)) { + if (!AddHandleToList(hContext, TPM_RT_AUTH, *authHandle)) { vtpmlogerror(VTPM_LOG_TCS, "New AuthHandle not recorded\n"); } @@ -498,7 +546,7 @@ BSG_TYPE_UINT32, ¶mSize, BSG_TPM_COMMAND_CODE, &returnCode); - if (!DeleteHandleFromList((CONTEXT_HANDLE *)hContext, handle)) + if (!DeleteHandleFromList(hContext, handle)) vtpmlogerror(VTPM_LOG_TCS, "KeyHandle not removed from list\n"); @@ -897,7 
+945,7 @@ phKeyTCSI); unpackAuth(pAuth, OutBuf+i); - if (!AddHandleToList((CONTEXT_HANDLE *)hContext, TPM_RT_KEY, *phKeyTCSI)) { + if (!AddHandleToList(hContext, TPM_RT_KEY, *phKeyTCSI)) { vtpmlogerror(VTPM_LOG_TCS, "New KeyHandle not recorded\n"); } @@ -942,7 +990,7 @@ BSG_TYPE_UINT32, ¶mSize, BSG_TPM_COMMAND_CODE, &returnCode); - if (!DeleteHandleFromList((CONTEXT_HANDLE *)hContext, hKey)) { + if (!DeleteHandleFromList(hContext, hKey)) { vtpmlogerror(VTPM_LOG_TCS, "KeyHandle not removed from list\n"); } diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/tcs/tcs.h --- a/tools/vtpm_manager/tcs/tcs.h Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/tcs/tcs.h Thu Sep 22 17:42:01 2005 @@ -41,6 +41,7 @@ #define __TCS_H__ #include "tcg.h" +#include "contextmgr.h" #include "buffer.h" #define HANDLE_NULL 0 @@ -235,4 +236,7 @@ UINT32 *outDataSize,// in/out BYTE *outData); // out +///////////// Private Functions //////////////////// +CONTEXT_HANDLE* LookupContext( TCS_CONTEXT_HANDLE hContext); + #endif //TCS_H diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/tcs/transmit.c --- a/tools/vtpm_manager/tcs/transmit.c Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/tcs/transmit.c Thu Sep 22 17:42:01 2005 @@ -69,7 +69,7 @@ ERRORDIE (TPM_IOERROR); } else if ((TDDL_UINT32) size < insize) { - vtpmlogerror(VTPM_LOG_TXDATA, "Wrote %d instead of %d bytes!\n", size, insize); + vtpmlogerror(VTPM_LOG_TXDATA, "Wrote %d instead of %d bytes!\n", (int) size, insize); // ... ? 
} diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/util/Makefile --- a/tools/vtpm_manager/util/Makefile Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/util/Makefile Thu Sep 22 17:42:01 2005 @@ -13,6 +13,7 @@ rm -f *.a *.so *.o *.rpm $(DEP_FILES) mrproper: clean + rm -f *~ $(BIN): $(OBJS) $(AR) rcs $(BIN) $(OBJS) diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm_manager/util/tcg.h --- a/tools/vtpm_manager/util/tcg.h Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm_manager/util/tcg.h Thu Sep 22 17:42:01 2005 @@ -453,14 +453,14 @@ // DEPENDS: local var 'status' of type TPM_RESULT // DEPENDS: label 'abort_egress' which cleans up and returns the status #define ERRORDIE(s) do { status = s; \ - fprintf (stderr, "*** ERRORDIE in %s, line %i\n", __func__, __LINE__); \ + fprintf (stderr, "*** ERRORDIE in %s at %s: %i\n", __func__, __FILE__, __LINE__); \ goto abort_egress; } \ while (0) // ASSUME: the return value used after the abort_egress label has been set // already (eg. the 'status' local var) #define STATUSCHECK(s) if (s != TPM_SUCCESS) { \ - fprintf (stderr, "*** ERR in %s, line %i\n", __func__, __LINE__); \ + fprintf (stderr, "*** ERR in %s at %s:%i\n", __func__, __FILE__, __LINE__); \ goto abort_egress; \ } @@ -475,7 +475,7 @@ // Try command c. If it fails, print error message, set status to actual return code. Goto shame #define TPMTRYRETURN(c) do { status = c; \ if (status != TPM_SUCCESS) { \ - printf("ERROR in %s:%i code: %s.\n", __func__, __LINE__, tpm_get_error_name(status)); \ + printf("ERROR in %s at %s:%i code: %s.\n", __func__, __FILE__, __LINE__, tpm_get_error_name(status)); \ goto abort_egress; \ } \ } while(0) diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/01simple.test --- a/tools/xenstore/testsuite/01simple.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/01simple.test Thu Sep 22 17:42:01 2005 @@ -1,4 +1,4 @@ # Create an entry, read it. 
-write /test create contents +write /test contents expect contents read /test diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/02directory.test --- a/tools/xenstore/testsuite/02directory.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/02directory.test Thu Sep 22 17:42:01 2005 @@ -3,7 +3,7 @@ dir / # Create a file. -write /test create contents +write /test contents # Directory shows it. expect test @@ -21,16 +21,14 @@ dir /dir # Create a file, check it exists. -write /dir/test2 create contents2 +write /dir/test2 contents2 expect test2 dir /dir expect contents2 read /dir/test2 -# Creating dir over the top should fail. -expect mkdir failed: File exists +# Creating dir over the top should succeed. mkdir /dir -expect mkdir failed: File exists mkdir /dir/test2 # Mkdir implicitly creates directories. diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/03write.test --- a/tools/xenstore/testsuite/03write.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/03write.test Thu Sep 22 17:42:01 2005 @@ -1,31 +1,20 @@ -# Write without create fails. -expect write failed: No such file or directory -write /test none contents - -# Exclusive write succeeds -write /test excl contents +# Write succeeds +write /test contents expect contents read /test -# Exclusive write fails to overwrite. -expect write failed: File exists -write /test excl contents - -# Non-exclusive overwrite succeeds. -write /test none contents2 +# Overwrite succeeds. 
+write /test contents2 expect contents2 -read /test -write /test create contents3 -expect contents3 read /test # Write should implicitly create directories -write /dir/test create contents +write /dir/test contents expect test dir /dir expect contents read /dir/test -write /dir/1/2/3/4 excl contents4 +write /dir/1/2/3/4 contents4 expect test expect 1 dir /dir diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/04rm.test --- a/tools/xenstore/testsuite/04rm.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/04rm.test Thu Sep 22 17:42:01 2005 @@ -1,11 +1,10 @@ -# Remove non-existant fails. -expect rm failed: No such file or directory +# Remove non-existant is OK, as long as parent exists rm /test expect rm failed: No such file or directory rm /dir/test # Create file and remove it -write /test excl contents +write /test contents rm /test # Create directory and remove it. @@ -14,5 +13,5 @@ # Create directory, create file, remove all. mkdir /dir -write /dir/test excl contents +write /dir/test contents rm /dir diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/05filepermissions.test --- a/tools/xenstore/testsuite/05filepermissions.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/05filepermissions.test Thu Sep 22 17:42:01 2005 @@ -5,7 +5,7 @@ getperm /dir/test # Create file: inherits from root (0 READ) -write /test excl contents +write /test contents expect 0 READ getperm /test setid 1 @@ -14,7 +14,7 @@ expect contents read /test expect write failed: Permission denied -write /test none contents +write /test contents # Take away read access to file. setid 0 @@ -25,7 +25,7 @@ expect read failed: Permission denied read /test expect write failed: Permission denied -write /test none contents +write /test contents # Grant everyone write access to file. 
setid 0 @@ -35,7 +35,7 @@ getperm /test expect read failed: Permission denied read /test -write /test none contents2 +write /test contents2 setid 0 expect contents2 read /test @@ -47,7 +47,7 @@ getperm /test expect contents2 read /test -write /test none contents3 +write /test contents3 expect contents3 read /test @@ -59,7 +59,7 @@ getperm /test expect contents3 read /test -write /test none contents4 +write /test contents4 # User 2 can do nothing. setid 2 @@ -70,7 +70,7 @@ expect read failed: Permission denied read /test expect write failed: Permission denied -write /test none contents4 +write /test contents4 # Tools can always access things. setid 0 @@ -78,4 +78,4 @@ getperm /test expect contents4 read /test -write /test none contents5 +write /test contents5 diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/06dirpermissions.test --- a/tools/xenstore/testsuite/06dirpermissions.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/06dirpermissions.test Thu Sep 22 17:42:01 2005 @@ -11,7 +11,7 @@ getperm /dir dir /dir expect write failed: Permission denied -write /dir/test create contents2 +write /dir/test contents2 # Remove everyone's read access to directoy. setid 0 @@ -22,7 +22,7 @@ expect read failed: Permission denied read /dir/test create contents2 expect write failed: Permission denied -write /dir/test create contents2 +write /dir/test contents2 # Grant everyone write access to directory. setid 0 @@ -32,7 +32,7 @@ getperm /dir expect dir failed: Permission denied dir /dir -write /dir/test create contents +write /dir/test contents setid 0 expect 1 WRITE getperm /dir/test @@ -47,7 +47,7 @@ getperm /dir expect test dir /dir -write /dir/test2 create contents +write /dir/test2 contents expect contents read /dir/test2 setperm /dir/test2 1 NONE @@ -60,7 +60,7 @@ expect test expect test2 dir /dir -write /dir/test3 create contents +write /dir/test3 contents # User 2 can do nothing. Can't even tell if file exists. 
setid 2 @@ -79,17 +79,9 @@ expect read failed: Permission denied read /dir/test4 expect write failed: Permission denied -write /dir/test none contents +write /dir/test contents expect write failed: Permission denied -write /dir/test create contents -expect write failed: Permission denied -write /dir/test excl contents -expect write failed: Permission denied -write /dir/test4 none contents -expect write failed: Permission denied -write /dir/test4 create contents -expect write failed: Permission denied -write /dir/test4 excl contents +write /dir/test4 contents # Tools can always access things. setid 0 @@ -99,13 +91,13 @@ expect test2 expect test3 dir /dir -write /dir/test4 create contents +write /dir/test4 contents # Inherited by child. mkdir /dir/subdir expect 1 NONE getperm /dir/subdir -write /dir/subfile excl contents +write /dir/subfile contents expect 1 NONE getperm /dir/subfile @@ -114,12 +106,12 @@ expect 2 READ/WRITE getperm /dir/subdir setid 3 -write /dir/subdir/subfile excl contents +write /dir/subdir/subfile contents expect 3 READ/WRITE getperm /dir/subdir/subfile # Inheritence works through multiple directories, too. -write /dir/subdir/1/2/3/4 excl contents +write /dir/subdir/1/2/3/4 contents expect 3 READ/WRITE getperm /dir/subdir/1/2/3/4 mkdir /dir/subdir/a/b/c/d diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/07watch.test --- a/tools/xenstore/testsuite/07watch.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/07watch.test Thu Sep 22 17:42:01 2005 @@ -1,8 +1,8 @@ # Watch something, write to it, check watch has fired. -write /test create contents +write /test contents 1 watch /test token -2 write /test create contents2 +2 write /test contents2 expect 1:/test:token 1 waitwatch 1 ackwatch token @@ -44,7 +44,7 @@ # ignore watches while doing commands, should work. 
watch /dir token -1 write /dir/test create contents +1 write /dir/test contents expect contents read /dir/test expect /dir/test:token @@ -56,7 +56,7 @@ 1 watch /dir token1 3 watch /dir token3 2 watch /dir token2 -write /dir/test create contents +write /dir/test contents expect 3:/dir/test:token3 3 waitwatch 3 ackwatch token3 @@ -73,7 +73,7 @@ # If one dies (without acking), the other should still get ack. 1 watch /dir token1 2 watch /dir token2 -write /dir/test create contents +write /dir/test contents expect 2:/dir/test:token2 2 waitwatch 2 close @@ -85,7 +85,7 @@ # If one dies (without reading at all), the other should still get ack. 1 watch /dir token1 2 watch /dir token2 -write /dir/test create contents +write /dir/test contents 2 close expect 1:/dir/test:token1 1 waitwatch @@ -97,7 +97,7 @@ 1 watch /dir token1 1 unwatch /dir token1 1 watch /dir token2 -2 write /dir/test2 create contents +2 write /dir/test2 contents expect 1:/dir/test2:token2 1 waitwatch 1 unwatch /dir token2 @@ -107,7 +107,7 @@ # unwatch while watch pending. Other watcher still gets the event. 1 watch /dir token1 2 watch /dir token2 -write /dir/test create contents +write /dir/test contents 2 unwatch /dir token2 expect 1:/dir/test:token1 1 waitwatch @@ -117,17 +117,17 @@ # unwatch while watch pending. Should clear this so we get next event. 1 watch /dir token1 -write /dir/test create contents +write /dir/test contents 1 unwatch /dir token1 1 watch /dir/test token2 -write /dir/test none contents2 +write /dir/test contents2 expect 1:/dir/test:token2 1 waitwatch 1 ackwatch token2 # check we only get notified once. 1 watch /test token -2 write /test create contents2 +2 write /test contents2 expect 1:/test:token 1 waitwatch 1 ackwatch token @@ -137,9 +137,9 @@ # watches are queued in order. 
1 watch / token -2 write /test1 create contents -2 write /test2 create contents -2 write /test3 create contents +2 write /test1 contents +2 write /test2 contents +2 write /test3 contents expect 1:/test1:token 1 waitwatch 1 ackwatch token @@ -153,8 +153,8 @@ # Creation of subpaths should be covered correctly. 1 watch / token -2 write /test/subnode create contents2 -2 write /test/subnode/subnode create contents2 +2 write /test/subnode contents2 +2 write /test/subnode/subnode contents2 expect 1:/test/subnode:token 1 waitwatch 1 ackwatch token @@ -167,11 +167,13 @@ # Watch event must have happened before we registered interest. 1 watch / token -2 write /test/subnode create contents2 -1 watch / token2 0 +2 write /test/subnode contents2 +1 watchnoack / token2 0 expect 1:/test/subnode:token 1 waitwatch 1 ackwatch token +expect 1:/:token2 +1 waitwatch expect 1: waitwatch failed: Connection timed out 1 waitwatch 1 close @@ -185,7 +187,7 @@ # Watch should not double-send after we ack, even if we did something in between. 1 watch /test2 token -2 write /test2/foo create contents2 +2 write /test2/foo contents2 expect 1:/test2/foo:token 1 waitwatch expect 1:contents2 diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/08transaction.slowtest --- a/tools/xenstore/testsuite/08transaction.slowtest Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/08transaction.slowtest Thu Sep 22 17:42:01 2005 @@ -1,7 +1,7 @@ # Test transaction timeouts. Take a second each. mkdir /test -write /test/entry1 create contents +write /test/entry1 contents # Transactions can take as long as the want... start /test diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/08transaction.test --- a/tools/xenstore/testsuite/08transaction.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/08transaction.test Thu Sep 22 17:42:01 2005 @@ -4,7 +4,7 @@ # Simple transaction: create a file inside transaction. 
1 start /test -1 write /test/entry1 create contents +1 write /test/entry1 contents 2 dir /test expect 1:entry1 1 dir /test @@ -16,14 +16,14 @@ # Create a file and abort transaction. 1 start /test -1 write /test/entry1 create contents +1 write /test/entry1 contents 2 dir /test expect 1:entry1 1 dir /test 1 abort 2 dir /test -write /test/entry1 create contents +write /test/entry1 contents # Delete in transaction, commit 1 start /test 1 rm /test/entry1 @@ -34,7 +34,7 @@ 2 dir /test # Delete in transaction, abort. -write /test/entry1 create contents +write /test/entry1 contents 1 start /test 1 rm /test/entry1 expect 2:entry1 @@ -84,8 +84,8 @@ # Multiple events from single transaction don't trigger assert 1 watch /test token 2 start /test -2 write /test/1 create contents -2 write /test/2 create contents +2 write /test/1 contents +2 write /test/2 contents 2 commit expect 1:/test/1:token 1 waitwatch diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/09domain.test --- a/tools/xenstore/testsuite/09domain.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/09domain.test Thu Sep 22 17:42:01 2005 @@ -3,7 +3,7 @@ # Create a domain, write an entry. expect handle is 1 introduce 1 100 7 /my/home -1 write /entry1 create contents +1 write /entry1 contents expect entry1 expect tool dir / diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/10domain-homedir.test --- a/tools/xenstore/testsuite/10domain-homedir.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/10domain-homedir.test Thu Sep 22 17:42:01 2005 @@ -4,7 +4,7 @@ mkdir /home expect handle is 1 introduce 1 100 7 /home -1 write entry1 create contents +1 write entry1 contents expect contents read /home/entry1 expect entry1 @@ -13,7 +13,7 @@ # Place a watch using a relative path: expect relative answer. 
1 mkdir foo 1 watch foo token -write /home/foo/bar create contents +write /home/foo/bar contents expect 1:foo/bar:token 1 waitwatch 1 ackwatch token diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/11domain-watch.test --- a/tools/xenstore/testsuite/11domain-watch.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/11domain-watch.test Thu Sep 22 17:42:01 2005 @@ -1,13 +1,13 @@ # Test watching from a domain. # Watch something, write to it, check watch has fired. -write /test create contents +write /test contents mkdir /dir expect handle is 1 introduce 1 100 7 /my/home 1 watch /test token -write /test create contents2 +write /test contents2 expect 1:/test:token 1 waitwatch 1 ackwatch token @@ -19,10 +19,10 @@ expect handle is 1 introduce 1 100 7 /my/home 1 watch /dir token -write /dir/test create contents -1 write /dir/test2 create contents2 -1 write /dir/test3 create contents3 -1 write /dir/test4 create contents4 +write /dir/test contents +1 write /dir/test2 contents2 +1 write /dir/test3 contents3 +1 write /dir/test4 contents4 expect 1:/dir/test:token 1 waitwatch 1 ackwatch token @@ -35,7 +35,7 @@ 1 watch /dir token1 1 unwatch /dir token1 1 watch /dir token2 -write /dir/test2 create contents +write /dir/test2 contents expect 1:/dir/test2:token2 1 waitwatch 1 unwatch /dir token2 @@ -46,7 +46,7 @@ expect handle is 1 introduce 1 100 7 /my/home 1 watch /dir token1 -write /dir/test2 create contents +write /dir/test2 contents 1 unwatch /dir token1 release 1 1 close diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/12readonly.test --- a/tools/xenstore/testsuite/12readonly.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/12readonly.test Thu Sep 22 17:42:01 2005 @@ -1,6 +1,6 @@ # Test that read only connection can't alter store. 
-write /test create contents +write /test contents readonly expect test @@ -20,9 +20,9 @@ # These don't work expect write failed: Read-only file system -write /test2 create contents +write /test2 contents expect write failed: Read-only file system -write /test create contents +write /test contents expect setperm failed: Read-only file system setperm /test 100 NONE expect setperm failed: Read-only file system @@ -35,7 +35,7 @@ # Check that watches work like normal. watch / token 1 readwrite -1 write /test create contents +1 write /test contents expect /test:token waitwatch ackwatch token diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/13watch-ack.test --- a/tools/xenstore/testsuite/13watch-ack.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/13watch-ack.test Thu Sep 22 17:42:01 2005 @@ -13,10 +13,10 @@ 1 watch /test/1 token1 1 watch /test/2 token2 1 watch /test/3 token3 -2 write /test/2 create contents2 +2 write /test/2 contents2 expect 1:/test/2:token2 1 waitwatch -3 write /test/1 create contents1 -4 write /test/3 create contents3 +3 write /test/1 contents1 +4 write /test/3 contents3 1 ackwatch token2 1 close diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/14complexperms.test --- a/tools/xenstore/testsuite/14complexperms.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/14complexperms.test Thu Sep 22 17:42:01 2005 @@ -12,13 +12,7 @@ expect *Permission denied read /dir/file expect *Permission denied -write /dir/file none value -expect *Permission denied -write /dir/file create value -expect *Permission denied -write /dir/file excl value -expect write failed: Invalid argument -write /dir/file crap value +write /dir/file value expect *Permission denied mkdir /dir/file expect *Permission denied @@ -29,8 +23,9 @@ getperm /dir/file expect *Permission denied setperm /dir/file 0 NONE -watch /dir/file token -1 write /dir/file create contents +# We get no watch event when there's no permission. It's a corner case. 
+watchnoack /dir/file token +1 write /dir/file contents 1 rm /dir/file expect waitwatch failed: Connection timed out waitwatch @@ -50,7 +45,7 @@ # Now it exists setid 0 -write /dir/file create contents +write /dir/file contents setid 1 expect *Permission denied @@ -58,13 +53,7 @@ expect *Permission denied read /dir/file expect *Permission denied -write /dir/file none value -expect *Permission denied -write /dir/file create value -expect *Permission denied -write /dir/file excl value -expect write failed: Invalid argument -write /dir/file crap value +write /dir/file value expect *Permission denied mkdir /dir/file expect *Permission denied @@ -75,8 +64,8 @@ getperm /dir/file expect *Permission denied setperm /dir/file 0 NONE -watch /dir/file token -1 write /dir/file create contents +watchnoack /dir/file token +1 write /dir/file contents 1 rm /dir/file expect waitwatch failed: Connection timed out waitwatch diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/15nowait.test --- a/tools/xenstore/testsuite/15nowait.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/15nowait.test Thu Sep 22 17:42:01 2005 @@ -1,10 +1,10 @@ # If we don't wait for an ack, we can crash daemon as it never expects to be # sending out two replies on top of each other. 
-noackwrite /1 create 1 -noackwrite /2 create 2 -noackwrite /3 create 3 -noackwrite /4 create 4 -noackwrite /5 create 5 +noackwrite /1 1 +noackwrite /2 2 +noackwrite /3 3 +noackwrite /4 4 +noackwrite /5 5 readack readack readack @@ -13,11 +13,11 @@ expect handle is 1 introduce 1 100 7 /my/home -1 noackwrite /1 create 1 -1 noackwrite /2 create 2 -1 noackwrite /3 create 3 -1 noackwrite /4 create 4 -1 noackwrite /5 create 5 +1 noackwrite /1 1 +1 noackwrite /2 2 +1 noackwrite /3 3 +1 noackwrite /4 4 +1 noackwrite /5 5 1 readack 1 readack 1 readack diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/testsuite/16block-watch-crash.test --- a/tools/xenstore/testsuite/16block-watch-crash.test Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/testsuite/16block-watch-crash.test Thu Sep 22 17:42:01 2005 @@ -4,8 +4,8 @@ watch /test token 1 start /test # This will block on above -noackwrite /test/entry create contents -1 write /test/entry2 create contents +noackwrite /test/entry contents +1 write /test/entry2 contents 1 commit readack expect /test/entry2:token diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/xenstore_client.c --- a/tools/xenstore/xenstore_client.c Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/xenstore_client.c Thu Sep 22 17:42:01 2005 @@ -102,7 +102,7 @@ optind++; #elif defined(CLIENT_write) success = xs_write(xsh, argv[optind], argv[optind + 1], - strlen(argv[optind + 1]), O_CREAT); + strlen(argv[optind + 1])); if (!success) { warnx("could not write path %s", argv[optind]); ret = 1; diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/xenstored_core.c --- a/tools/xenstore/xenstored_core.c Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/xenstored_core.c Thu Sep 22 17:42:01 2005 @@ -961,14 +961,19 @@ return dir; } -/* path, flags, data... */ +static bool node_exists(struct connection *conn, const char *node) +{ + struct stat st; + + return lstat(node_dir(conn->transaction, node), &st) == 0; +} + +/* path, data... 
*/ static void do_write(struct connection *conn, struct buffered_data *in) { unsigned int offset, datalen; - char *vec[2]; + char *vec[1] = { NULL }; /* gcc4 + -W + -Werror fucks code. */ char *node, *tmppath; - enum xs_perm_type mode; - struct stat st; /* Extra "strings" can be created by binary data. */ if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec)) { @@ -985,37 +990,20 @@ if (transaction_block(conn, node)) return; - offset = strlen(vec[0]) + strlen(vec[1]) + 2; + offset = strlen(vec[0]) + 1; datalen = in->used - offset; - if (streq(vec[1], XS_WRITE_NONE)) - mode = XS_PERM_WRITE; - else if (streq(vec[1], XS_WRITE_CREATE)) - mode = XS_PERM_WRITE|XS_PERM_ENOENT_OK; - else if (streq(vec[1], XS_WRITE_CREATE_EXCL)) - mode = XS_PERM_WRITE|XS_PERM_ENOENT_OK; - else { - send_error(conn, EINVAL); - return; - } - - if (!check_node_perms(conn, node, mode)) { + if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_ENOENT_OK)) { send_error(conn, errno); return; } - if (lstat(node_dir(conn->transaction, node), &st) != 0) { + if (!node_exists(conn, node)) { char *dir; /* Does not exist... */ if (errno != ENOENT) { send_error(conn, errno); - return; - } - - /* Not going to create it? */ - if (streq(vec[1], XS_WRITE_NONE)) { - send_error(conn, ENOENT); return; } @@ -1027,11 +1015,6 @@ } else { /* Exists... */ - if (streq(vec[1], XS_WRITE_CREATE_EXCL)) { - send_error(conn, EEXIST); - return; - } - tmppath = tempfile(node_datafile(conn->transaction, node), in->buffer + offset, datalen); if (!tmppath) { @@ -1050,7 +1033,6 @@ static void do_mkdir(struct connection *conn, const char *node) { char *dir; - struct stat st; node = canonicalize(conn, node); if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_ENOENT_OK)) { @@ -1066,9 +1048,9 @@ if (transaction_block(conn, node)) return; - /* Must not already exist. */ - if (lstat(node_dir(conn->transaction, node), &st) == 0) { - send_error(conn, EEXIST); + /* If it already exists, fine. 
*/ + if (node_exists(conn, node)) { + send_ack(conn, XS_MKDIR); return; } @@ -1089,6 +1071,15 @@ node = canonicalize(conn, node); if (!check_node_perms(conn, node, XS_PERM_WRITE)) { + /* Didn't exist already? Fine, if parent exists. */ + if (errno == ENOENT) { + if (node_exists(conn, get_parent(node))) { + send_ack(conn, XS_RM); + return; + } + /* Restore errno, just in case. */ + errno = ENOENT; + } send_error(conn, errno); return; } diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/xenstored_watch.c --- a/tools/xenstore/xenstored_watch.c Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/xenstored_watch.c Thu Sep 22 17:42:01 2005 @@ -236,6 +236,9 @@ trace_create(watch, "watch"); talloc_set_destructor(watch, destroy_watch); send_ack(conn, XS_WATCH); + + /* We fire once up front: simplifies clients and restart. */ + add_event(conn, watch, watch->node); } void do_watch_ack(struct connection *conn, const char *token) diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/xs.c --- a/tools/xenstore/xs.c Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/xs.c Thu Sep 22 17:42:01 2005 @@ -326,38 +326,23 @@ } /* Write the value of a single file. - * Returns false on failure. createflags can be 0, O_CREAT, or O_CREAT|O_EXCL. + * Returns false on failure. */ bool xs_write(struct xs_handle *h, const char *path, - const void *data, unsigned int len, int createflags) -{ - const char *flags; - struct iovec iovec[3]; - - /* Format: Flags (as string), path, data. 
*/ - if (createflags == 0) - flags = XS_WRITE_NONE; - else if (createflags == O_CREAT) - flags = XS_WRITE_CREATE; - else if (createflags == (O_CREAT|O_EXCL)) - flags = XS_WRITE_CREATE_EXCL; - else { - errno = EINVAL; - return false; - } + const void *data, unsigned int len) +{ + struct iovec iovec[2]; iovec[0].iov_base = (void *)path; iovec[0].iov_len = strlen(path) + 1; - iovec[1].iov_base = (void *)flags; - iovec[1].iov_len = strlen(flags) + 1; - iovec[2].iov_base = (void *)data; - iovec[2].iov_len = len; + iovec[1].iov_base = (void *)data; + iovec[1].iov_len = len; return xs_bool(xs_talkv(h, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL)); } /* Create a new directory. - * Returns false on failure. + * Returns false on failure, or success if it already exists. */ bool xs_mkdir(struct xs_handle *h, const char *path) { @@ -365,7 +350,7 @@ } /* Destroy a file or directory (directories must be empty). - * Returns false on failure. + * Returns false on failure, or success if it doesn't exist. */ bool xs_rm(struct xs_handle *h, const char *path) { diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/xs.h --- a/tools/xenstore/xs.h Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/xs.h Thu Sep 22 17:42:01 2005 @@ -53,18 +53,18 @@ void *xs_read(struct xs_handle *h, const char *path, unsigned int *len); /* Write the value of a single file. - * Returns false on failure. createflags can be 0, O_CREAT, or O_CREAT|O_EXCL. + * Returns false on failure. */ bool xs_write(struct xs_handle *h, const char *path, const void *data, - unsigned int len, int createflags); + unsigned int len); /* Create a new directory. - * Returns false on failure. + * Returns false on failure, or success if it already exists. */ bool xs_mkdir(struct xs_handle *h, const char *path); /* Destroy a file or directory (and children). - * Returns false on failure. + * Returns false on failure, or success if it doesn't exist. 
*/ bool xs_rm(struct xs_handle *h, const char *path); diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/xs_crashme.c --- a/tools/xenstore/xs_crashme.c Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/xs_crashme.c Thu Sep 22 17:42:01 2005 @@ -267,17 +267,12 @@ free(xs_read(h, name, &num)); break; case 2: { - int flags = random_flags(&state); char *contents = talloc_asprintf(NULL, "%i", get_randomness(&state)); unsigned int len = get_randomness(&state)%(strlen(contents)+1); if (verbose) - printf("WRITE %s %s %.*s\n", name, - flags == O_CREAT ? "O_CREAT" - : flags == (O_CREAT|O_EXCL) ? "O_CREAT|O_EXCL" - : flags == 0 ? "0" : "CRAPFLAGS", - len, contents); - xs_write(h, name, contents, len, flags); + printf("WRITE %s %.*s\n", name, len, contents); + xs_write(h, name, contents, len); break; } case 3: diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/xs_random.c --- a/tools/xenstore/xs_random.c Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/xs_random.c Thu Sep 22 17:42:01 2005 @@ -26,7 +26,7 @@ void *(*read)(void *h, const char *path, unsigned int *len); bool (*write)(void *h, const char *path, const void *data, - unsigned int len, int createflags); + unsigned int len); bool (*mkdir)(void *h, const char *path); @@ -74,9 +74,9 @@ static void maybe_convert_to_directory(const char *filename) { struct stat st; - char *dirname = talloc_asprintf(filename, "%.*s", - strrchr(filename, '/') - filename, - filename); + char *dirname = talloc_asprintf( + filename, "%.*s", + (int)(strrchr(filename, '/') - filename), filename); if (lstat(dirname, &st) == 0 && S_ISREG(st.st_mode)) convert_to_dir(dirname); } @@ -249,7 +249,7 @@ /* Copy permissions from parent */ command = talloc_asprintf(filename, "cp %.*s/.perms %s", - strrchr(filename, '/') - filename, + (int)(strrchr(filename, '/') - filename), filename, permfile); do_command(command); } @@ -308,7 +308,7 @@ char *slash = strrchr(name + 1, '/'); if (!slash) return talloc_strdup(name, "/"); - return talloc_asprintf(name, "%.*s", 
slash-name, name); + return talloc_asprintf(name, "%.*s", (int)(slash-name), name); } static void make_dirs(const char *filename) @@ -333,40 +333,18 @@ static bool file_write(struct file_ops_info *info, const char *path, const void *data, - unsigned int len, int createflags) + unsigned int len) { char *filename = filename_to_data(path_to_name(info, path)); int fd; - /* Kernel isn't strict, but library is. */ - if (createflags & ~(O_CREAT|O_EXCL)) { - errno = EINVAL; - return false; - } - if (!write_ok(info, path)) return false; - /* We regard it as existing if dir exists. */ - if (strends(filename, ".DATA")) { - if (!createflags) - createflags = O_CREAT; - if (createflags & O_EXCL) { - errno = EEXIST; - return false; - } - } - - if (createflags & O_CREAT) - make_dirs(parent_filename(filename)); - - fd = open(filename, createflags|O_TRUNC|O_WRONLY, 0600); - if (fd < 0) { - /* FIXME: Another hack. */ - if (!(createflags & O_CREAT) && errno == EISDIR) - errno = EEXIST; + make_dirs(parent_filename(filename)); + fd = open(filename, O_CREAT|O_TRUNC|O_WRONLY, 0600); + if (fd < 0) return false; - } if (write(fd, data, len) != (int)len) barf_perror("Bad write to %s", filename); @@ -385,7 +363,7 @@ make_dirs(parent_filename(dirname)); if (mkdir(dirname, 0700) != 0) - return false; + return (errno == EEXIST); init_perms(dirname); return true; @@ -401,8 +379,11 @@ return false; } - if (lstat(filename, &st) != 0) - return false; + if (lstat(filename, &st) != 0) { + if (lstat(parent_filename(filename), &st) != 0) + return false; + return true; + } if (!write_ok(info, path)) return false; @@ -843,20 +824,6 @@ return ret; } -static int random_flags(int *state) -{ - switch (get_randomness(state) % 4) { - case 0: - return 0; - case 1: - return O_CREAT; - case 2: - return O_CREAT|O_EXCL; - default: - return get_randomness(state); - } -} - /* Do the next operation, return the results. 
*/ static char *do_next_op(struct ops *ops, void *h, int state, bool verbose) { @@ -880,18 +847,12 @@ ret = linearize_read(ops->read(h, name, &num), &num); break; case 2: { - int flags = random_flags(&state); char *contents = talloc_asprintf(NULL, "%i", get_randomness(&state)); unsigned int len = get_randomness(&state)%(strlen(contents)+1); if (verbose) - printf("WRITE %s %s %.*s\n", name, - flags == O_CREAT ? "O_CREAT" - : flags == (O_CREAT|O_EXCL) ? "O_CREAT|O_EXCL" - : flags == 0 ? "0" : "CRAPFLAGS", - len, contents); - ret = bool_to_errstring(ops->write(h, name, contents, len, - flags)); + printf("WRITE %s %.*s\n", name, len, contents); + ret = bool_to_errstring(ops->write(h, name, contents, len)); talloc_steal(ret, contents); break; } @@ -1102,7 +1063,8 @@ ret = do_next_op(data->ops, h, i + data->seed, verbose); if (verbose) - printf("-> %.*s\n", strchr(ret, '\n') - ret, ret); + printf("-> %.*s\n", + (int)(strchr(ret, '\n') - ret), ret); if (streq(ret, "FAILED:Bad file descriptor")) goto out; if (kill(daemon_pid, 0) != 0) @@ -1373,13 +1335,14 @@ file = do_next_op(&file_ops, fileh, i+data->seed, verbose); if (verbose) - printf("-> %.*s\n", strchr(file, '/') - file, file); + printf("-> %.*s\n", + (int)(strchr(file, '/') - file), file); if (verbose) printf("XS: "); xs = do_next_op(&xs_ops, xsh, i+data->seed, verbose); if (verbose) - printf("-> %.*s\n", strchr(xs, '/') - xs, xs); + printf("-> %.*s\n", (int)(strchr(xs, '/') - xs), xs); if (!streq(file, xs)) goto out; @@ -1547,7 +1510,8 @@ aborted++; if (verbose) - printf("-> %.*s\n", strchr(ret, '\n') - ret, ret); + printf("-> %.*s\n", + (int)(strchr(ret, '\n') - ret), ret); talloc_free(ret); diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/xs_stress.c --- a/tools/xenstore/xs_stress.c Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/xs_stress.c Thu Sep 22 17:42:01 2005 @@ -61,7 +61,7 @@ barf_perror("%i: can't read %s iter %i", childnum, file, i); sprintf(tmp, "%i", atoi(contents) + 1); - if (!xs_write(h, file, tmp, 
strlen(tmp)+1, 0)) + if (!xs_write(h, file, tmp, strlen(tmp)+1)) barf_perror("%i: can't write %s iter %i", childnum, file, i); @@ -91,7 +91,7 @@ if (togo == 0) { sprintf(filename, "%s/count", base); - if (!xs_write(h, filename, "0", 2, O_EXCL|O_CREAT)) + if (!xs_write(h, filename, "0", 1)) barf_perror("Writing to %s", filename); return; } diff -r 97dbd9524a7e -r 06d84bf87159 tools/xenstore/xs_test.c --- a/tools/xenstore/xs_test.c Thu Sep 22 17:34:14 2005 +++ b/tools/xenstore/xs_test.c Thu Sep 22 17:42:01 2005 @@ -192,7 +192,7 @@ "Reads commands from stdin, one per line:" " dir <path>\n" " read <path>\n" - " write <path> <flags> <value>...\n" + " write <path> <value>...\n" " setid <id>\n" " mkdir <path>\n" " rm <path>\n" @@ -200,6 +200,7 @@ " setperm <path> <id> <flags> ...\n" " shutdown\n" " watch <path> <token>\n" + " watchnoack <path> <token>\n" " waitwatch\n" " ackwatch <token>\n" " unwatch <path> <token>\n" @@ -213,7 +214,7 @@ " notimeout\n" " readonly\n" " readwrite\n" - " noackwrite <path> <flags> <value>...\n" + " noackwrite <path> <value>...\n" " readack\n" " dump\n"); } @@ -348,47 +349,22 @@ output("%.*s\n", len, value); } -static void do_write(unsigned int handle, char *path, char *flags, char *data) -{ - int f; - - if (streq(flags, "none")) - f = 0; - else if (streq(flags, "create")) - f = O_CREAT; - else if (streq(flags, "excl")) - f = O_CREAT | O_EXCL; - else if (streq(flags, "crap")) - f = 100; - else - barf("write flags 'none', 'create' or 'excl' only"); - - if (!xs_write(handles[handle], path, data, strlen(data), f)) +static void do_write(unsigned int handle, char *path, char *data) +{ + if (!xs_write(handles[handle], path, data, strlen(data))) failed(handle); } static void do_noackwrite(unsigned int handle, - char *path, const char *flags, char *data) + char *path, char *data) { struct xsd_sockmsg msg; - /* Format: Flags (as string), path, data. 
*/ - if (streq(flags, "none")) - flags = XS_WRITE_NONE; - else if (streq(flags, "create")) - flags = XS_WRITE_CREATE; - else if (streq(flags, "excl")) - flags = XS_WRITE_CREATE_EXCL; - else - barf("noackwrite flags 'none', 'create' or 'excl' only"); - - msg.len = strlen(path) + 1 + strlen(flags) + 1 + strlen(data); + msg.len = strlen(path) + 1 + strlen(data); msg.type = XS_WRITE; if (!write_all_choice(handles[handle]->fd, &msg, sizeof(msg))) failed(handle); if (!write_all_choice(handles[handle]->fd, path, strlen(path) + 1)) - failed(handle); - if (!write_all_choice(handles[handle]->fd, flags, strlen(flags) + 1)) failed(handle); if (!write_all_choice(handles[handle]->fd, data, strlen(data))) failed(handle); @@ -505,10 +481,20 @@ failed(handle); } -static void do_watch(unsigned int handle, const char *node, const char *token) +static void do_watch(unsigned int handle, const char *node, const char *token, + bool swallow_event) { if (!xs_watch(handles[handle], node, token)) failed(handle); + + /* Convenient for testing... 
*/ + if (swallow_event) { + char **vec = xs_read_watch(handles[handle]); + if (!vec || !streq(vec[0], node) || !streq(vec[1], token)) + failed(handle); + if (!xs_acknowledge_watch(handles[handle], token)) + failed(handle); + } } static void set_timeout(void) @@ -778,8 +764,7 @@ else if (streq(command, "read")) do_read(handle, arg(line, 1)); else if (streq(command, "write")) - do_write(handle, - arg(line, 1), arg(line, 2), arg(line, 3)); + do_write(handle, arg(line, 1), arg(line, 2)); else if (streq(command, "setid")) do_setid(handle, arg(line, 1)); else if (streq(command, "mkdir")) @@ -793,7 +778,9 @@ else if (streq(command, "shutdown")) do_shutdown(handle); else if (streq(command, "watch")) - do_watch(handle, arg(line, 1), arg(line, 2)); + do_watch(handle, arg(line, 1), arg(line, 2), true); + else if (streq(command, "watchnoack")) + do_watch(handle, arg(line, 1), arg(line, 2), false); else if (streq(command, "waitwatch")) do_waitwatch(handle); else if (streq(command, "ackwatch")) @@ -832,7 +819,7 @@ xs_daemon_close(handles[handle]); handles[handle] = NULL; } else if (streq(command, "noackwrite")) - do_noackwrite(handle, arg(line,1), arg(line,2), arg(line,3)); + do_noackwrite(handle, arg(line,1), arg(line,2)); else if (streq(command, "readack")) do_readack(handle); else diff -r 97dbd9524a7e -r 06d84bf87159 xen/arch/ia64/xen/grant_table.c --- a/xen/arch/ia64/xen/grant_table.c Thu Sep 22 17:34:14 2005 +++ b/xen/arch/ia64/xen/grant_table.c Thu Sep 22 17:42:01 2005 @@ -850,7 +850,7 @@ #endif static long -gnttab_donate(gnttab_donate_t *uop, unsigned int count) +gnttab_transfer(gnttab_transfer_t *uop, unsigned int count) { struct domain *d = current->domain; struct domain *e; @@ -864,27 +864,27 @@ return GNTST_general_error; #else for (i = 0; i < count; i++) { - gnttab_donate_t *gop = &uop[i]; + gnttab_transfer_t *gop = &uop[i]; #if GRANT_DEBUG - printk("gnttab_donate: i=%d mfn=%lx domid=%d gref=%08x\n", + printk("gnttab_transfer: i=%d mfn=%lx domid=%d gref=%08x\n", i, 
gop->mfn, gop->domid, gop->handle); #endif page = &frame_table[gop->mfn]; if (unlikely(IS_XEN_HEAP_FRAME(page))) { - printk("gnttab_donate: xen heap frame mfn=%lx\n", + printk("gnttab_transfer: xen heap frame mfn=%lx\n", (unsigned long) gop->mfn); gop->status = GNTST_bad_virt_addr; continue; } if (unlikely(!pfn_valid(page_to_pfn(page)))) { - printk("gnttab_donate: invalid pfn for mfn=%lx\n", + printk("gnttab_transfer: invalid pfn for mfn=%lx\n", (unsigned long) gop->mfn); gop->status = GNTST_bad_virt_addr; continue; } if (unlikely((e = find_domain_by_id(gop->domid)) == NULL)) { - printk("gnttab_donate: can't find domain %d\n", gop->domid); + printk("gnttab_transfer: can't find domain %d\n", gop->domid); gop->status = GNTST_bad_domain; continue; } @@ -904,7 +904,7 @@ x = y; if (unlikely((x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated)) || unlikely(_nd != _d)) { - printk("gnttab_donate: Bad page values %p: ed=%p(%u), sd=%p," + printk("gnttab_transfer: Bad page values %p: ed=%p(%u), sd=%p," " caf=%08x, taf=%" PRtype_info "\n", (void *) page_to_pfn(page), d, d->domain_id, unpickle_domptr(_nd), x, @@ -947,14 +947,14 @@ break; } if (unlikely(test_bit(DOMFLAGS_DYING, &e->domain_flags))) { - printk("gnttab_donate: target domain is dying\n"); + printk("gnttab_transfer: target domain is dying\n"); spin_unlock(&e->page_alloc_lock); put_domain(e); result = GNTST_general_error; break; } - if (unlikely(!gnttab_prepare_for_transfer(e, d, gop->handle))) { - printk("gnttab_donate: gnttab_prepare_for_transfer fails\n"); + if (unlikely(!gnttab_prepare_for_transfer(e, d, gop->ref))) { + printk("gnttab_transfer: gnttab_prepare_for_transfer fails\n"); spin_unlock(&e->page_alloc_lock); put_domain(e); result = GNTST_general_error; @@ -964,10 +964,10 @@ ASSERT(e->tot_pages <= e->max_pages); if (unlikely(test_bit(DOMFLAGS_DYING, &e->domain_flags)) || unlikely(e->tot_pages == e->max_pages) || - unlikely(!gnttab_prepare_for_transfer(e, d, gop->handle))) { - printk("gnttab_donate: 
Transferee has no reservation headroom (%d," + unlikely(!gnttab_prepare_for_transfer(e, d, gop->ref))) { + printk("gnttab_transfer: Transferee has no reservation headroom (%d," "%d) or provided a bad grant ref (%08x) or is dying (%p)\n", - e->tot_pages, e->max_pages, gop->handle, e->d_flags); + e->tot_pages, e->max_pages, gop->ref, e->d_flags); spin_unlock(&e->page_alloc_lock); put_domain(e); result = GNTST_general_error; @@ -987,7 +987,7 @@ * Transfer is all done: tell the guest about its new page * frame. */ - gnttab_notify_transfer(e, d, gop->handle, gop->mfn); + gnttab_notify_transfer(e, d, gop->ref, gop->mfn); put_domain(e); @@ -1037,11 +1037,11 @@ rc = gnttab_dump_table((gnttab_dump_table_t *)uop); break; #endif - case GNTTABOP_donate: + case GNTTABOP_transfer: if (unlikely(!array_access_ok(uop, count, - sizeof(gnttab_donate_t)))) + sizeof(gnttab_transfer_t)))) goto out; - rc = gnttab_donate(uop, count); + rc = gnttab_transfer(uop, count); break; default: rc = -ENOSYS; diff -r 97dbd9524a7e -r 06d84bf87159 xen/arch/x86/shadow.c --- a/xen/arch/x86/shadow.c Thu Sep 22 17:34:14 2005 +++ b/xen/arch/x86/shadow.c Thu Sep 22 17:42:01 2005 @@ -697,6 +697,8 @@ } } + __shadow_get_l2e(v, va, &sl2e); + if ( shadow_mode_refcounts(d) ) { l1_pgentry_t old_spte; diff -r 97dbd9524a7e -r 06d84bf87159 xen/arch/x86/shadow32.c --- a/xen/arch/x86/shadow32.c Thu Sep 22 17:34:14 2005 +++ b/xen/arch/x86/shadow32.c Thu Sep 22 17:42:01 2005 @@ -399,22 +399,26 @@ perfc_decr(shadow_l1_pages); shadow_demote(d, gpfn, gmfn); free_shadow_l1_table(d, smfn); + d->arch.shadow_page_count--; break; case PGT_l2_shadow: perfc_decr(shadow_l2_pages); shadow_demote(d, gpfn, gmfn); free_shadow_l2_table(d, smfn, page->u.inuse.type_info); + d->arch.shadow_page_count--; break; case PGT_hl2_shadow: perfc_decr(hl2_table_pages); shadow_demote(d, gpfn, gmfn); free_shadow_hl2_table(d, smfn); + d->arch.hl2_page_count--; break; case PGT_snapshot: perfc_decr(snapshot_pages); + d->arch.snapshot_page_count--; break; 
default: @@ -422,8 +426,6 @@ page_to_pfn(page), page->u.inuse.type_info); break; } - - d->arch.shadow_page_count--; // No TLB flushes are needed the next time this page gets allocated. // diff -r 97dbd9524a7e -r 06d84bf87159 xen/arch/x86/shadow_public.c --- a/xen/arch/x86/shadow_public.c Thu Sep 22 17:34:14 2005 +++ b/xen/arch/x86/shadow_public.c Thu Sep 22 17:42:01 2005 @@ -595,18 +595,21 @@ perfc_decr(shadow_l1_pages); shadow_demote(d, gpfn, gmfn); free_shadow_l1_table(d, smfn); + d->arch.shadow_page_count--; break; #if defined (__i386__) case PGT_l2_shadow: perfc_decr(shadow_l2_pages); shadow_demote(d, gpfn, gmfn); free_shadow_l2_table(d, smfn, page->u.inuse.type_info); + d->arch.shadow_page_count--; break; case PGT_hl2_shadow: perfc_decr(hl2_table_pages); shadow_demote(d, gpfn, gmfn); free_shadow_hl2_table(d, smfn); + d->arch.hl2_page_count--; break; #else case PGT_l2_shadow: @@ -614,12 +617,13 @@ case PGT_l4_shadow: shadow_demote(d, gpfn, gmfn); free_shadow_tables(d, smfn, shadow_type_to_level(type)); + d->arch.shadow_page_count--; break; case PGT_fl1_shadow: free_shadow_fl1_table(d, smfn); + d->arch.shadow_page_count--; break; - #endif case PGT_snapshot: @@ -631,8 +635,6 @@ page_to_pfn(page), page->u.inuse.type_info); break; } - - d->arch.shadow_page_count--; // No TLB flushes are needed the next time this page gets allocated. 
// diff -r 97dbd9524a7e -r 06d84bf87159 xen/arch/x86/vmx.c --- a/xen/arch/x86/vmx.c Thu Sep 22 17:34:14 2005 +++ b/xen/arch/x86/vmx.c Thu Sep 22 17:42:01 2005 @@ -377,12 +377,13 @@ static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs) { - unsigned long eip; unsigned long gpa; /* FIXME: PAE */ int result; -#if VMX_DEBUG +#if 0 /* keep for debugging */ { + unsigned long eip; + __vmread(GUEST_RIP, &eip); VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx", @@ -429,9 +430,9 @@ clts(); setup_fpu(current); - __vmread(CR0_READ_SHADOW, &cr0); + __vmread_vcpu(CR0_READ_SHADOW, &cr0); if (!(cr0 & X86_CR0_TS)) { - __vmread(GUEST_CR0, &cr0); + __vmread_vcpu(GUEST_CR0, &cr0); cr0 &= ~X86_CR0_TS; __vmwrite(GUEST_CR0, cr0); } @@ -470,6 +471,8 @@ } #endif + /* Unsupportable for virtualised CPUs. */ + clear_bit(X86_FEATURE_MWAIT & 31, &ecx); } regs->eax = (unsigned long) eax; @@ -1100,6 +1103,11 @@ d->arch.arch_vmx.cpu_cr3, mfn); } + if(!((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled) + if(d->arch.arch_vmx.cpu_cr3) + put_page(pfn_to_page(get_mfn_from_pfn( + d->arch.arch_vmx.cpu_cr3 >> PAGE_SHIFT))); + /* * VMX does not implement real-mode virtualization. 
We emulate * real-mode by performing a world switch to VMXAssist whenever @@ -1124,9 +1132,7 @@ __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value); } } - __vmread(GUEST_RIP, &eip); - VMX_DBG_LOG(DBG_LEVEL_1, - "Disabling CR0.PE at %%eip 0x%lx\n", eip); + if (vmx_assist(d, VMX_ASSIST_INVOKE)) { set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &d->arch.arch_vmx.cpu_state); __vmread(GUEST_RIP, &eip); @@ -1365,17 +1371,17 @@ clts(); setup_fpu(current); - __vmread(GUEST_CR0, &value); + __vmread_vcpu(GUEST_CR0, &value); value &= ~X86_CR0_TS; /* clear TS */ __vmwrite(GUEST_CR0, value); - __vmread(CR0_READ_SHADOW, &value); + __vmread_vcpu(CR0_READ_SHADOW, &value); value &= ~X86_CR0_TS; /* clear TS */ __vmwrite(CR0_READ_SHADOW, value); break; case TYPE_LMSW: TRACE_VMEXIT(1,TYPE_LMSW); - __vmread(CR0_READ_SHADOW, &value); + __vmread_vcpu(CR0_READ_SHADOW, &value); value = (value & ~0xF) | (((exit_qualification & LMSW_SOURCE_DATA) >> 16) & 0xF); return vmx_set_cr0(value); @@ -1451,17 +1457,13 @@ (unsigned long)regs->edx); } +volatile unsigned long do_hlt_count; /* * Need to use this exit to reschedule */ -static inline void vmx_vmexit_do_hlt(void) -{ -#if VMX_DEBUG - unsigned long eip; - __vmread(GUEST_RIP, &eip); -#endif - VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_hlt:eip=%lx", eip); - raise_softirq(SCHEDULE_SOFTIRQ); +void vmx_vmexit_do_hlt(void) +{ + do_block(); } static inline void vmx_vmexit_do_extint(struct cpu_user_regs *regs) @@ -1511,16 +1513,6 @@ } } -static inline void vmx_vmexit_do_mwait(void) -{ -#if VMX_DEBUG - unsigned long eip; - __vmread(GUEST_RIP, &eip); -#endif - VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_mwait:eip=%lx", eip); - raise_softirq(SCHEDULE_SOFTIRQ); -} - #define BUF_SIZ 256 #define MAX_LINE 80 char print_buf[BUF_SIZ]; @@ -1626,9 +1618,13 @@ return; } - __vmread(GUEST_RIP, &eip); - TRACE_3D(TRC_VMX_VMEXIT, v->domain->domain_id, eip, exit_reason); - TRACE_VMEXIT(0,exit_reason); +#ifdef TRACE_BUFFER + { + __vmread(GUEST_RIP, &eip); + TRACE_3D(TRC_VMX_VMEXIT, 
v->domain->domain_id, eip, exit_reason); + TRACE_VMEXIT(0,exit_reason); + } +#endif switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: @@ -1798,9 +1794,7 @@ __update_guest_eip(inst_len); break; case EXIT_REASON_MWAIT_INSTRUCTION: - __get_instruction_length(inst_len); - __update_guest_eip(inst_len); - vmx_vmexit_do_mwait(); + __vmx_bug(®s); break; default: __vmx_bug(®s); /* should not happen */ diff -r 97dbd9524a7e -r 06d84bf87159 xen/arch/x86/vmx_intercept.c --- a/xen/arch/x86/vmx_intercept.c Thu Sep 22 17:34:14 2005 +++ b/xen/arch/x86/vmx_intercept.c Thu Sep 22 17:42:01 2005 @@ -28,6 +28,7 @@ #include <xen/sched.h> #include <asm/current.h> #include <io_ports.h> +#include <xen/event.h> #ifdef CONFIG_VMX @@ -205,6 +206,7 @@ /* Set the pending intr bit, and send evtchn notification to myself. */ if (test_and_set_bit(vpit->vector, vpit->intr_bitmap)) vpit->pending_intr_nr++; /* already set, then count the pending intr */ + evtchn_set_pending(vpit->v, iopacket_port(vpit->v->domain)); /* pick up missed timer tick */ if ( missed_ticks > 0 ) { @@ -281,6 +283,7 @@ } vpit->intr_bitmap = intr; + vpit->v = d; vpit->scheduled = NOW() + vpit->period; set_ac_timer(&vpit->pit_timer, vpit->scheduled); diff -r 97dbd9524a7e -r 06d84bf87159 xen/arch/x86/vmx_io.c --- a/xen/arch/x86/vmx_io.c Thu Sep 22 17:34:14 2005 +++ b/xen/arch/x86/vmx_io.c Thu Sep 22 17:42:01 2005 @@ -891,7 +891,7 @@ struct vcpu *v = current; highest_vector = find_highest_pending_irq(v, &intr_type); - __vmread(CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control); + __vmread_vcpu(CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control); if (highest_vector == -1) { disable_irq_window(cpu_exec_control); @@ -948,14 +948,6 @@ void vmx_do_resume(struct vcpu *d) { vmx_stts(); - if ( vmx_paging_enabled(d) ) - __vmwrite(GUEST_CR3, pagetable_get_paddr(d->arch.shadow_table)); - else - // paging is not enabled in the guest - __vmwrite(GUEST_CR3, pagetable_get_paddr(d->domain->arch.phys_table)); - - __vmwrite(HOST_CR3, 
pagetable_get_paddr(d->arch.monitor_table)); - __vmwrite(HOST_RSP, (unsigned long)get_stack_bottom()); if (event_pending(d)) { vmx_check_events(d); diff -r 97dbd9524a7e -r 06d84bf87159 xen/arch/x86/vmx_platform.c --- a/xen/arch/x86/vmx_platform.c Thu Sep 22 17:34:14 2005 +++ b/xen/arch/x86/vmx_platform.c Thu Sep 22 17:42:01 2005 @@ -671,13 +671,13 @@ if (inst->operand[0] & REGISTER) { /* dest is memory */ index = operand_index(inst->operand[0]); value = get_reg_value(size, index, 0, regs); - send_mmio_req(type, gpa, 1, size, value, IOREQ_WRITE, 0); + send_mmio_req(type, gpa, 1, inst->op_size, value, IOREQ_WRITE, 0); } else if (inst->operand[0] & IMMEDIATE) { /* dest is memory */ value = inst->immediate; - send_mmio_req(type, gpa, 1, size, value, IOREQ_WRITE, 0); + send_mmio_req(type, gpa, 1, inst->op_size, value, IOREQ_WRITE, 0); } else if (inst->operand[0] & MEMORY) { /* dest is register */ /* send the request and wait for the value */ - send_mmio_req(type, gpa, 1, size, 0, IOREQ_READ, 0); + send_mmio_req(type, gpa, 1, inst->op_size, 0, IOREQ_READ, 0); } else { printf("mmio_operands: invalid operand\n"); domain_crash_synchronous(); diff -r 97dbd9524a7e -r 06d84bf87159 xen/arch/x86/vmx_vmcs.c --- a/xen/arch/x86/vmx_vmcs.c Thu Sep 22 17:34:14 2005 +++ b/xen/arch/x86/vmx_vmcs.c Thu Sep 22 17:42:01 2005 @@ -67,9 +67,6 @@ error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL, MONITOR_PIN_BASED_EXEC_CONTROLS); - - error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL, - MONITOR_CPU_BASED_EXEC_CONTROLS); error |= __vmwrite(VM_EXIT_CONTROLS, MONITOR_VM_EXIT_CONTROLS); @@ -117,12 +114,6 @@ unsigned long fs_base; unsigned long gs_base; #endif - - /* control registers */ - unsigned long cr3; - unsigned long cr0; - unsigned long cr4; - unsigned long dr7; }; #define round_pgdown(_p) ((_p)&PAGE_MASK) /* coped from domain.c */ @@ -217,8 +208,32 @@ /* Update CR3, GDT, LDT, TR */ unsigned int error = 0; unsigned long pfn = 0; + unsigned long cr0, cr4; struct pfn_info *page; struct cpu_user_regs 
*regs = guest_cpu_user_regs(); + + __asm__ __volatile__ ("mov %%cr0,%0" : "=r" (cr0) : ); + + error |= __vmwrite(GUEST_CR0, cr0); + cr0 &= ~X86_CR0_PG; + error |= __vmwrite(CR0_READ_SHADOW, cr0); + error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL, + MONITOR_CPU_BASED_EXEC_CONTROLS); + + __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (cr4) : ); + +#ifdef __x86_64__ + error |= __vmwrite(GUEST_CR4, cr4 & ~X86_CR4_PSE); +#else + error |= __vmwrite(GUEST_CR4, cr4); +#endif + +#ifdef __x86_64__ + cr4 &= ~(X86_CR4_PGE | X86_CR4_VMXE | X86_CR4_PAE); +#else + cr4 &= ~(X86_CR4_PGE | X86_CR4_VMXE); +#endif + error |= __vmwrite(CR4_READ_SHADOW, cr4); vmx_stts(); @@ -254,7 +269,7 @@ int error = 0; union vmcs_arbytes arbytes; unsigned long dr7; - unsigned long eflags, shadow_cr; + unsigned long eflags; /* MSR */ error |= __vmwrite(VM_EXIT_MSR_LOAD_ADDR, 0); @@ -326,27 +341,7 @@ arbytes.fields.seg_type = 0xb; /* 32-bit TSS (busy) */ error |= __vmwrite(GUEST_TR_AR_BYTES, arbytes.bytes); - - error |= __vmwrite(GUEST_CR0, host_env->cr0); /* same CR0 */ - - /* Initally PG, PE are not set*/ - shadow_cr = host_env->cr0; - shadow_cr &= ~X86_CR0_PG; - error |= __vmwrite(CR0_READ_SHADOW, shadow_cr); /* CR3 is set in vmx_final_setup_guest */ -#ifdef __x86_64__ - error |= __vmwrite(GUEST_CR4, host_env->cr4 & ~X86_CR4_PSE); -#else - error |= __vmwrite(GUEST_CR4, host_env->cr4); -#endif - shadow_cr = host_env->cr4; - -#ifdef __x86_64__ - shadow_cr &= ~(X86_CR4_PGE | X86_CR4_VMXE | X86_CR4_PAE); -#else - shadow_cr &= ~(X86_CR4_PGE | X86_CR4_VMXE); -#endif - error |= __vmwrite(CR4_READ_SHADOW, shadow_cr); error |= __vmwrite(GUEST_ES_BASE, host_env->ds_base); error |= __vmwrite(GUEST_CS_BASE, host_env->cs_base); @@ -403,12 +398,10 @@ host_env->cs_base = 0; __asm__ __volatile__ ("mov %%cr0,%0" : "=r" (crn) : ); - host_env->cr0 = crn; error |= __vmwrite(HOST_CR0, crn); /* same CR0 */ /* CR3 is set in vmx_final_setup_hostos */ __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) : ); - host_env->cr4 = crn; 
error |= __vmwrite(HOST_CR4, crn); error |= __vmwrite(HOST_RIP, (unsigned long) vmx_asm_vmexit_handler); diff -r 97dbd9524a7e -r 06d84bf87159 xen/common/grant_table.c --- a/xen/common/grant_table.c Thu Sep 22 17:34:14 2005 +++ b/xen/common/grant_table.c Thu Sep 22 17:42:01 2005 @@ -797,7 +797,7 @@ #endif static long -gnttab_donate(gnttab_donate_t *uop, unsigned int count) +gnttab_transfer(gnttab_transfer_t *uop, unsigned int count) { struct domain *d = current->domain; struct domain *e; @@ -805,19 +805,20 @@ u32 _d, _nd, x, y; int i; int result = GNTST_okay; + grant_entry_t *sha; for ( i = 0; i < count; i++ ) { - gnttab_donate_t *gop = &uop[i]; + gnttab_transfer_t *gop = &uop[i]; #if GRANT_DEBUG - printk("gnttab_donate: i=%d mfn=%lx domid=%d gref=%08x\n", + printk("gnttab_transfer: i=%d mfn=%lx domid=%d gref=%08x\n", i, gop->mfn, gop->domid, gop->handle); #endif page = &frame_table[gop->mfn]; if ( unlikely(IS_XEN_HEAP_FRAME(page))) { - printk("gnttab_donate: xen heap frame mfn=%lx\n", + printk("gnttab_transfer: xen heap frame mfn=%lx\n", (unsigned long) gop->mfn); gop->status = GNTST_bad_virt_addr; continue; @@ -825,7 +826,7 @@ if ( unlikely(!pfn_valid(page_to_pfn(page))) ) { - printk("gnttab_donate: invalid pfn for mfn=%lx\n", + printk("gnttab_transfer: invalid pfn for mfn=%lx\n", (unsigned long) gop->mfn); gop->status = GNTST_bad_virt_addr; continue; @@ -833,7 +834,7 @@ if ( unlikely((e = find_domain_by_id(gop->domid)) == NULL) ) { - printk("gnttab_donate: can't find domain %d\n", gop->domid); + printk("gnttab_transfer: can't find domain %d\n", gop->domid); gop->status = GNTST_bad_domain; continue; } @@ -853,7 +854,7 @@ x = y; if (unlikely((x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated)) || unlikely(_nd != _d)) { - printk("gnttab_donate: Bad page values %p: ed=%p(%u), sd=%p," + printk("gnttab_transfer: Bad page values %p: ed=%p(%u), sd=%p," " caf=%08x, taf=%" PRtype_info "\n", (void *) page_to_pfn(page), d, d->domain_id, unpickle_domptr(_nd), x, @@ 
-888,12 +889,12 @@ */ if ( unlikely(test_bit(DOMFLAGS_DYING, &e->domain_flags)) || unlikely(e->tot_pages >= e->max_pages) || - unlikely(!gnttab_prepare_for_transfer(e, d, gop->handle)) ) - { - DPRINTK("gnttab_donate: Transferee has no reservation headroom " + unlikely(!gnttab_prepare_for_transfer(e, d, gop->ref)) ) + { + DPRINTK("gnttab_transfer: Transferee has no reservation headroom " "(%d,%d) or provided a bad grant ref (%08x) or " "is dying (%lx)\n", - e->tot_pages, e->max_pages, gop->handle, e->domain_flags); + e->tot_pages, e->max_pages, gop->ref, e->domain_flags); spin_unlock(&e->page_alloc_lock); put_domain(e); gop->status = result = GNTST_general_error; @@ -908,11 +909,11 @@ spin_unlock(&e->page_alloc_lock); - /* - * Transfer is all done: tell the guest about its new page - * frame. - */ - gnttab_notify_transfer(e, d, gop->handle, gop->mfn); + /* Tell the guest about its new page frame. */ + sha = &e->grant_table->shared[gop->ref]; + sha->frame = gop->mfn; + wmb(); + sha->flags |= GTF_transfer_completed; put_domain(e); @@ -960,11 +961,11 @@ rc = gnttab_dump_table((gnttab_dump_table_t *)uop); break; #endif - case GNTTABOP_donate: + case GNTTABOP_transfer: if (unlikely(!array_access_ok( - uop, count, sizeof(gnttab_donate_t)))) + uop, count, sizeof(gnttab_transfer_t)))) goto out; - rc = gnttab_donate(uop, count); + rc = gnttab_transfer(uop, count); break; default: rc = -ENOSYS; @@ -1171,46 +1172,6 @@ return 0; } -void -gnttab_notify_transfer( - struct domain *rd, struct domain *ld, grant_ref_t ref, unsigned long frame) -{ - grant_entry_t *sha; - unsigned long pfn; - -#if GRANT_DEBUG_VERBOSE - DPRINTK("gnttab_notify_transfer rd(%hu) ld(%hu) ref(%hu).\n", - rd->domain_id, ld->domain_id, ref); -#endif - - sha = &rd->grant_table->shared[ref]; - - spin_lock(&rd->grant_table->lock); - - pfn = sha->frame; - - if ( unlikely(pfn >= max_page ) ) - DPRINTK("Bad pfn (%lx)\n", pfn); - else - { - set_pfn_from_mfn(frame, pfn); - - if ( unlikely(shadow_mode_log_dirty(ld))) - 
mark_dirty(ld, frame); - - if (shadow_mode_translate(ld)) - set_mfn_from_pfn(pfn, frame); - } - sha->frame = __mfn_to_gpfn(rd, frame); - sha->domid = rd->domain_id; - wmb(); - sha->flags = ( GTF_accept_transfer | GTF_transfer_completed ); - - spin_unlock(&rd->grant_table->lock); - - return; -} - int grant_table_create( struct domain *d) diff -r 97dbd9524a7e -r 06d84bf87159 xen/common/sched_sedf.c --- a/xen/common/sched_sedf.c Thu Sep 22 17:34:14 2005 +++ b/xen/common/sched_sedf.c Thu Sep 22 17:42:01 2005 @@ -846,7 +846,7 @@ * the domain can't finish it's workload in the period * -in addition to that the domain can be treated prioritised when * extratime is available - * -addition: experiments hve shown that this may have a HUGE impact on + * -addition: experiments have shown that this may have a HUGE impact on * performance of other domains, becaus it can lead to excessive context * switches diff -r 97dbd9524a7e -r 06d84bf87159 xen/include/asm-x86/shadow.h --- a/xen/include/asm-x86/shadow.h Thu Sep 22 17:34:14 2005 +++ b/xen/include/asm-x86/shadow.h Thu Sep 22 17:42:01 2005 @@ -1595,6 +1595,8 @@ } } + __shadow_get_l2e(v, va, &sl2e); + if ( shadow_mode_refcounts(d) ) { l1_pgentry_t old_spte = shadow_linear_pg_table[l1_linear_offset(va)]; diff -r 97dbd9524a7e -r 06d84bf87159 xen/include/asm-x86/vmx.h --- a/xen/include/asm-x86/vmx.h Thu Sep 22 17:34:14 2005 +++ b/xen/include/asm-x86/vmx.h Thu Sep 22 17:42:01 2005 @@ -314,6 +314,57 @@ return 0; } + +static always_inline void __vmwrite_vcpu(unsigned long field, unsigned long value) +{ + struct vcpu *v = current; + + switch(field) { + case CR0_READ_SHADOW: + v->arch.arch_vmx.cpu_shadow_cr0 = value; + break; + case GUEST_CR0: + v->arch.arch_vmx.cpu_cr0 = value; + break; + case CPU_BASED_VM_EXEC_CONTROL: + v->arch.arch_vmx.cpu_based_exec_control = value; + break; + default: + printk("__vmwrite_cpu: invalid field %lx\n", field); + break; + } +} + +static always_inline void __vmread_vcpu(unsigned long field, unsigned long 
*value) +{ + struct vcpu *v = current; + + switch(field) { + case CR0_READ_SHADOW: + *value = v->arch.arch_vmx.cpu_shadow_cr0; + break; + case GUEST_CR0: + *value = v->arch.arch_vmx.cpu_cr0; + break; + case CPU_BASED_VM_EXEC_CONTROL: + *value = v->arch.arch_vmx.cpu_based_exec_control; + break; + default: + printk("__vmread_cpu: invalid field %lx\n", field); + break; + } + + /* + * __vmwrite() can be used for non-current vcpu, and it's possible that + * the vcpu field is not initialized at that case. + * + */ + if (!*value) { + __vmread(field, value); + __vmwrite_vcpu(field, *value); + } +} + static inline int __vmwrite (unsigned long field, unsigned long value) { unsigned long eflags; @@ -326,6 +377,15 @@ __save_flags(eflags); if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF) return -1; + + switch(field) { + case CR0_READ_SHADOW: + case GUEST_CR0: + case CPU_BASED_VM_EXEC_CONTROL: + __vmwrite_vcpu(field, value); + break; + } + return 0; } @@ -379,11 +439,12 @@ { unsigned long cr0; - __vmread(GUEST_CR0, &cr0); - if (!(cr0 & X86_CR0_TS)) + __vmread_vcpu(GUEST_CR0, &cr0); + if (!(cr0 & X86_CR0_TS)) { __vmwrite(GUEST_CR0, cr0 | X86_CR0_TS); - - __vmread(CR0_READ_SHADOW, &cr0); + } + + __vmread_vcpu(CR0_READ_SHADOW, &cr0); if (!(cr0 & X86_CR0_TS)) __vm_set_bit(EXCEPTION_BITMAP, EXCEPTION_BITMAP_NM); } @@ -393,7 +454,7 @@ { unsigned long cr0; - __vmread(CR0_READ_SHADOW, &cr0); + __vmread_vcpu(CR0_READ_SHADOW, &cr0); return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG); } diff -r 97dbd9524a7e -r 06d84bf87159 xen/include/asm-x86/vmx_virpit.h --- a/xen/include/asm-x86/vmx_virpit.h Thu Sep 22 17:34:14 2005 +++ b/xen/include/asm-x86/vmx_virpit.h Thu Sep 22 17:42:01 2005 @@ -35,8 +35,8 @@ unsigned int count; /* the 16 bit channel count */ unsigned int init_val; /* the init value for the counter */ - -} ; + struct vcpu *v; +}; /* to hook the ioreq packet to get the PIT initializaiton info */ extern void vmx_hooks_assist(struct vcpu *d); diff -r 97dbd9524a7e -r 06d84bf87159 
xen/include/asm-x86/vmx_vmcs.h --- a/xen/include/asm-x86/vmx_vmcs.h Thu Sep 22 17:34:14 2005 +++ b/xen/include/asm-x86/vmx_vmcs.h Thu Sep 22 17:42:01 2005 @@ -74,9 +74,12 @@ struct arch_vmx_struct { struct vmcs_struct *vmcs; /* VMCS pointer in virtual */ unsigned long flags; /* VMCS flags */ + unsigned long cpu_cr0; /* copy of guest CR0 */ + unsigned long cpu_shadow_cr0; /* copy of guest read shadow CR0 */ unsigned long cpu_cr2; /* save CR2 */ unsigned long cpu_cr3; unsigned long cpu_state; + unsigned long cpu_based_exec_control; struct msr_state msr_content; void *io_bitmap_a, *io_bitmap_b; }; diff -r 97dbd9524a7e -r 06d84bf87159 xen/include/public/grant_table.h --- a/xen/include/public/grant_table.h Thu Sep 22 17:34:14 2005 +++ b/xen/include/public/grant_table.h Thu Sep 22 17:42:01 2005 @@ -215,18 +215,19 @@ } gnttab_dump_table_t; /* - * GNTTABOP_donate_grant_ref: Donate <frame> to a foreign domain. The + * GNTTABOP_transfer_grant_ref: Transfer <frame> to a foreign domain. The * foreign domain has previously registered the details of the transfer. * These can be identified from <handle>, a grant reference. */ -#define GNTTABOP_donate 4 +#define GNTTABOP_transfer 4 typedef struct { - unsigned long mfn; /* 0 */ - domid_t domid; /* 4 */ - u16 handle; /* 8 */ - s16 status; /* 10: GNTST_* */ - u32 __pad; -} gnttab_donate_t; /* 14 bytes */ + /* IN parameters. */ + unsigned long mfn; + domid_t domid; + grant_ref_t ref; + /* OUT parameters. */ + s16 status; +} gnttab_transfer_t; /* * Bitfield values for update_pin_status.flags. diff -r 97dbd9524a7e -r 06d84bf87159 xen/include/public/io/netif.h --- a/xen/include/public/io/netif.h Thu Sep 22 17:34:14 2005 +++ b/xen/include/public/io/netif.h Thu Sep 22 17:42:01 2005 @@ -10,10 +10,11 @@ #define __XEN_PUBLIC_IO_NETIF_H__ typedef struct netif_tx_request { - unsigned long addr; /* Machine address of packet. 
*/ + grant_ref_t gref; /* Reference to buffer page */ + u16 offset:15; /* Offset within buffer page */ u16 csum_blank:1; /* Proto csum field blank? */ - u16 id:15; /* Echoed in response message. */ - u16 size; /* Packet size in bytes. */ + u16 id; /* Echoed in response message. */ + u16 size; /* Packet size in bytes. */ } netif_tx_request_t; typedef struct netif_tx_response { @@ -22,21 +23,15 @@ } netif_tx_response_t; typedef struct { - u16 id; /* Echoed in response message. */ -#ifdef CONFIG_XEN_NETDEV_GRANT - grant_ref_t gref; /* 2: Reference to incoming granted frame */ -#endif + u16 id; /* Echoed in response message. */ + grant_ref_t gref; /* Reference to incoming granted frame */ } netif_rx_request_t; typedef struct { -#ifdef CONFIG_XEN_NETDEV_GRANT - u32 addr; /* 0: Offset in page of start of received packet */ -#else - unsigned long addr; /* Machine address of packet. */ -#endif - u16 csum_valid:1; /* Protocol checksum is validated? */ - u16 id:15; - s16 status; /* -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */ + u16 offset; /* Offset in page of start of received packet */ + u16 csum_valid; /* Protocol checksum is validated? */ + u16 id; + s16 status; /* -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */ } netif_rx_response_t; /* @@ -53,18 +48,8 @@ #define MASK_NETIF_RX_IDX(_i) ((_i)&(NETIF_RX_RING_SIZE-1)) #define MASK_NETIF_TX_IDX(_i) ((_i)&(NETIF_TX_RING_SIZE-1)) -#ifdef __x86_64__ -/* - * This restriction can be lifted when we move netfront/netback to use - * grant tables. This will remove memory_t fields from the above structures - * and thus relax natural alignment restrictions. - */ -#define NETIF_TX_RING_SIZE 128 -#define NETIF_RX_RING_SIZE 128 -#else #define NETIF_TX_RING_SIZE 256 #define NETIF_RX_RING_SIZE 256 -#endif /* This structure must fit in a memory page. 
*/ typedef struct netif_tx_interface { diff -r 97dbd9524a7e -r 06d84bf87159 xen/include/public/io/tpmif.h --- a/xen/include/public/io/tpmif.h Thu Sep 22 17:34:14 2005 +++ b/xen/include/public/io/tpmif.h Thu Sep 22 17:42:01 2005 @@ -20,8 +20,7 @@ unsigned long addr; /* Machine address of packet. */ int ref; /* grant table access reference */ u16 id; /* Echoed in response message. */ - u16 size:15; /* Packet size in bytes. */ - u16 mapped:1; + u16 size; /* Packet size in bytes. */ } tpmif_tx_request_t; /* @@ -30,13 +29,16 @@ */ typedef u32 TPMIF_RING_IDX; -#define TPMIF_TX_RING_SIZE 16 +#define TPMIF_TX_RING_SIZE 10 /* This structure must fit in a memory page. */ + typedef struct { - union { - tpmif_tx_request_t req; - } ring[TPMIF_TX_RING_SIZE]; + tpmif_tx_request_t req; +} tpmif_ring_t; + +typedef struct { + tpmif_ring_t ring[TPMIF_TX_RING_SIZE]; } tpmif_tx_interface_t; #endif diff -r 97dbd9524a7e -r 06d84bf87159 xen/include/xen/grant_table.h --- a/xen/include/xen/grant_table.h Thu Sep 22 17:34:14 2005 +++ b/xen/include/xen/grant_table.h Thu Sep 22 17:42:01 2005 @@ -106,12 +106,6 @@ gnttab_prepare_for_transfer( struct domain *rd, struct domain *ld, grant_ref_t ref); -/* Notify 'rd' of a completed transfer via an already-locked grant entry. */ -void -gnttab_notify_transfer( - struct domain *rd, struct domain *ld, - grant_ref_t ref, unsigned long frame); - /* Domain death release of granted device mappings of other domains.*/ void gnttab_release_dev_mappings(grant_table_t *gt); diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/interface/architecture.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/interface/architecture.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,140 @@ +\chapter{Virtual Architecture} + +On a Xen-based system, the hypervisor itself runs in {\it ring 0}. It +has full access to the physical memory available in the system and is +responsible for allocating portions of it to the domains. 
Guest +operating systems run in and use {\it rings 1}, {\it 2} and {\it 3} as +they see fit. Segmentation is used to prevent the guest OS from +accessing the portion of the address space that is reserved for Xen. +We expect most guest operating systems will use ring 1 for their own +operation and place applications in ring 3. + +In this chapter we consider the basic virtual architecture provided by +Xen: the basic CPU state, exception and interrupt handling, and time. +Other aspects such as memory and device access are discussed in later +chapters. + + +\section{CPU state} + +All privileged state must be handled by Xen. The guest OS has no +direct access to CR3 and is not permitted to update privileged bits in +EFLAGS. Guest OSes use \emph{hypercalls} to invoke operations in Xen; +these are analogous to system calls but occur from ring 1 to ring 0. + +A list of all hypercalls is given in Appendix~\ref{a:hypercalls}. + + +\section{Exceptions} + +A virtual IDT is provided --- a domain can submit a table of trap +handlers to Xen via the {\tt set\_trap\_table()} hypercall. Most trap +handlers are identical to native x86 handlers, although the page-fault +handler is somewhat different. + + +\section{Interrupts and events} + +Interrupts are virtualized by mapping them to \emph{events}, which are +delivered asynchronously to the target domain using a callback +supplied via the {\tt set\_callbacks()} hypercall. A guest OS can map +these events onto its standard interrupt dispatch mechanisms. Xen is +responsible for determining the target domain that will handle each +physical interrupt source. For more details on the binding of event +sources to events, see Chapter~\ref{c:devices}. + + +\section{Time} + +Guest operating systems need to be aware of the passage of both real +(or wallclock) time and their own `virtual time' (the time for which +they have been executing). Furthermore, Xen has a notion of time which +is used for scheduling. 
The following notions of time are provided: + +\begin{description} +\item[Cycle counter time.] + + This provides a fine-grained time reference. The cycle counter time + is used to accurately extrapolate the other time references. On SMP + machines it is currently assumed that the cycle counter time is + synchronized between CPUs. The current x86-based implementation + achieves this within inter-CPU communication latencies. + +\item[System time.] + + This is a 64-bit counter which holds the number of nanoseconds that + have elapsed since system boot. + +\item[Wall clock time.] + + This is the time of day in a Unix-style {\tt struct timeval} + (seconds and microseconds since 1 January 1970, adjusted by leap + seconds). An NTP client hosted by {\it domain 0} can keep this + value accurate. + +\item[Domain virtual time.] + + This progresses at the same pace as system time, but only while a + domain is executing --- it stops while a domain is de-scheduled. + Therefore the share of the CPU that a domain receives is indicated + by the rate at which its virtual time increases. + +\end{description} + + +Xen exports timestamps for system time and wall-clock time to guest +operating systems through a shared page of memory. Xen also provides +the cycle counter time at the instant the timestamps were calculated, +and the CPU frequency in Hertz. This allows the guest to extrapolate +system and wall-clock times accurately based on the current cycle +counter time. + +Since all time stamps need to be updated and read \emph{atomically} +two version numbers are also stored in the shared info page. The first +is incremented prior to an update, while the second is only +incremented afterwards. Thus a guest can be sure that it read a +consistent state by checking the two version numbers are equal. + +Xen includes a periodic ticker which sends a timer event to the +currently executing domain every 10ms. 
The Xen scheduler also sends a +timer event whenever a domain is scheduled; this allows the guest OS +to adjust for the time that has passed while it has been inactive. In +addition, Xen allows each domain to request that they receive a timer +event sent at a specified system time by using the {\tt + set\_timer\_op()} hypercall. Guest OSes may use this timer to +implement timeout values when they block. + + + +%% % akw: demoting this to a section -- not sure if there is any point +%% % though, maybe just remove it. + +\section{Xen CPU Scheduling} + +Xen offers a uniform API for CPU schedulers. It is possible to choose +from a number of schedulers at boot and it should be easy to add more. +The BVT, Atropos and Round Robin schedulers are part of the normal Xen +distribution. BVT provides proportional fair shares of the CPU to the +running domains. Atropos can be used to reserve absolute shares of +the CPU for each domain. Round-robin is provided as an example of +Xen's internal scheduler API. + +\paragraph*{Note: SMP host support} +Xen has always supported SMP host systems. Domains are statically +assigned to CPUs, either at creation time or when manually pinning to +a particular CPU. The current schedulers then run locally on each CPU +to decide which of the assigned domains should be run there. The +user-level control software can be used to perform coarse-grain +load-balancing between CPUs. + + +%% More information on the characteristics and use of these schedulers +%% is available in {\tt Sched-HOWTO.txt}. + + +\section{Privileged operations} + +Xen exports an extended interface to privileged domains (viz.\ {\it + Domain 0}). This allows such domains to build and boot other domains +on the server, and provides control interfaces for managing +scheduling, memory, networking, and block devices. 
diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/interface/debugging.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/interface/debugging.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,62 @@ +\chapter{Debugging} + +Xen provides tools for debugging both Xen and guest OSes. Currently, the +Pervasive Debugger provides a GDB stub, which provides facilities for symbolic +debugging of Xen itself and of OS kernels running on top of Xen. The Trace +Buffer provides a lightweight means to log data about Xen's internal state and +behaviour at runtime, for later analysis. + +\section{Pervasive Debugger} + +Information on using the pervasive debugger is available in pdb.txt. + + +\section{Trace Buffer} + +The trace buffer provides a means to observe Xen's operation from domain 0. +Trace events, inserted at key points in Xen's code, record data that can be +read by the {\tt xentrace} tool. Recording these events has a low overhead +and hence the trace buffer may be useful for debugging timing-sensitive +behaviours. + +\subsection{Internal API} + +To use the trace buffer functionality from within Xen, you must {\tt \#include +<xen/trace.h>}, which contains definitions related to the trace buffer. Trace +events are inserted into the buffer using the {\tt TRACE\_xD} ({\tt x} = 0, 1, +2, 3, 4 or 5) macros. These all take an event number, plus {\tt x} additional +(32-bit) data as their arguments. For trace buffer-enabled builds of Xen these +will insert the event ID and data into the trace buffer, along with the current +value of the CPU cycle-counter. For builds without the trace buffer enabled, +the macros expand to no-ops and thus can be left in place without incurring +overheads. + +\subsection{Trace-enabled builds} + +By default, the trace buffer is enabled only in debug builds (i.e. {\tt NDEBUG} +is not defined). It can be enabled separately by defining {\tt TRACE\_BUFFER}, +either in {\tt <xen/config.h>} or on the gcc command line. 
+ +The size (in pages) of the per-CPU trace buffers can be specified using the +{\tt tbuf\_size=n } boot parameter to Xen. If the size is set to 0, the trace +buffers will be disabled. + +\subsection{Dumping trace data} + +When running a trace buffer build of Xen, trace data are written continuously +into the buffer data areas, with newer data overwriting older data. This data +can be captured using the {\tt xentrace} program in domain 0. + +The {\tt xentrace} tool uses {\tt /dev/mem} in domain 0 to map the trace +buffers into its address space. It then periodically polls all the buffers for +new data, dumping out any new records from each buffer in turn. As a result, +for machines with multiple (logical) CPUs, the trace buffer output will not be +in overall chronological order. + +The output from {\tt xentrace} can be post-processed using {\tt +xentrace\_cpusplit} (used to split trace data out into per-cpu log files) and +{\tt xentrace\_format} (used to pretty-print trace data). For the predefined +trace points, there is an example format file in {\tt tools/xentrace/formats }. + +For more information, see the manual pages for {\tt xentrace}, {\tt +xentrace\_format} and {\tt xentrace\_cpusplit}. diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/interface/devices.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/interface/devices.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,178 @@ +\chapter{Devices} +\label{c:devices} + +Devices such as network and disk are exported to guests using a split +device driver. The device driver domain, which accesses the physical +device directly also runs a \emph{backend} driver, serving requests to +that device from guests. Each guest will use a simple \emph{frontend} +driver, to access the backend. Communication between these domains is +composed of two parts: First, data is placed onto a shared memory page +between the domains. Second, an event channel between the two domains +is used to pass notification that data is outstanding. 
This +separation of notification from data transfer allows message batching, +and results in very efficient device access. + +Event channels are used extensively in device virtualization; each +domain has a number of end-points or \emph{ports} each of which may be +bound to one of the following \emph{event sources}: +\begin{itemize} + \item a physical interrupt from a real device, + \item a virtual interrupt (callback) from Xen, or + \item a signal from another domain +\end{itemize} + +Events are lightweight and do not carry much information beyond the +source of the notification. Hence when performing bulk data transfer, +events are typically used as synchronization primitives over a shared +memory transport. Event channels are managed via the {\tt + event\_channel\_op()} hypercall; for more details see +Section~\ref{s:idc}. + +This chapter focuses on some individual device interfaces available to +Xen guests. + + +\section{Network I/O} + +Virtual network device services are provided by shared memory +communication with a backend domain. From the point of view of other +domains, the backend may be viewed as a virtual ethernet switch +element with each domain having one or more virtual network interfaces +connected to it. + +\subsection{Backend Packet Handling} + +The backend driver is responsible for a variety of actions relating to +the transmission and reception of packets from the physical device. +With regard to transmission, the backend performs these key actions: + +\begin{itemize} +\item {\bf Validation:} To ensure that domains do not attempt to + generate invalid (e.g. spoofed) traffic, the backend driver may + validate headers ensuring that source MAC and IP addresses match the + interface that they have been sent from. + + Validation functions can be configured using standard firewall rules + ({\small{\tt iptables}} in the case of Linux). 
+ +\item {\bf Scheduling:} Since a number of domains can share a single + physical network interface, the backend must mediate access when + several domains each have packets queued for transmission. This + general scheduling function subsumes basic shaping or rate-limiting + schemes. + +\item {\bf Logging and Accounting:} The backend domain can be + configured with classifier rules that control how packets are + accounted or logged. For example, log messages might be generated + whenever a domain attempts to send a TCP packet containing a SYN. +\end{itemize} + +On receipt of incoming packets, the backend acts as a simple +demultiplexer: Packets are passed to the appropriate virtual interface +after any necessary logging and accounting have been carried out. + +\subsection{Data Transfer} + +Each virtual interface uses two ``descriptor rings'', one for +transmit, the other for receive. Each descriptor identifies a block +of contiguous physical memory allocated to the domain. + +The transmit ring carries packets to transmit from the guest to the +backend domain. The return path of the transmit ring carries messages +indicating that the contents have been physically transmitted and the +backend no longer requires the associated pages of memory. + +To receive packets, the guest places descriptors of unused pages on +the receive ring. The backend will return received packets by +exchanging these pages in the domain's memory with new pages +containing the received data, and passing back descriptors regarding +the new packets on the ring. This zero-copy approach allows the +backend to maintain a pool of free pages to receive packets into, and +then deliver them to appropriate domains after examining their +headers. + +% Real physical addresses are used throughout, with the domain +% performing translation from pseudo-physical addresses if that is +% necessary. 
+ +If a domain does not keep its receive ring stocked with empty buffers +then packets destined to it may be dropped. This provides some +defence against receive livelock problems because an overload domain +will cease to receive further data. Similarly, on the transmit path, +it provides the application with feedback on the rate at which packets +are able to leave the system. + +Flow control on rings is achieved by including a pair of producer +indexes on the shared ring page. Each side will maintain a private +consumer index indicating the next outstanding message. In this +manner, the domains cooperate to divide the ring into two message +lists, one in each direction. Notification is decoupled from the +immediate placement of new messages on the ring; the event channel +will be used to generate notification when {\em either} a certain +number of outstanding messages are queued, {\em or} a specified number +of nanoseconds have elapsed since the oldest message was placed on the +ring. + +%% Not sure if my version is any better -- here is what was here +%% before: Synchronization between the backend domain and the guest is +%% achieved using counters held in shared memory that is accessible to +%% both. Each ring has associated producer and consumer indices +%% indicating the area in the ring that holds descriptors that contain +%% data. After receiving {\it n} packets or {\t nanoseconds} after +%% receiving the first packet, the hypervisor sends an event to the +%% domain. + + +\section{Block I/O} + +All guest OS disk access goes through the virtual block device VBD +interface. This interface allows domains access to portions of block +storage devices visible to the the block backend device. The VBD +interface is a split driver, similar to the network interface +described above. A single shared memory ring is used between the +frontend and backend drivers, across which read and write messages are +sent. 
+ +Any block device accessible to the backend domain, including +network-based block (iSCSI, *NBD, etc), loopback and LVM/MD devices, +can be exported as a VBD. Each VBD is mapped to a device node in the +guest, specified in the guest's startup configuration. + +Old (Xen 1.2) virtual disks are not supported under Xen 2.0, since +similar functionality can be achieved using the more complete LVM +system, which is already in widespread use. + +\subsection{Data Transfer} + +The single ring between the guest and the block backend supports three +messages: + +\begin{description} +\item [{\small {\tt PROBE}}:] Return a list of the VBDs available to + this guest from the backend. The request includes a descriptor of a + free page into which the reply will be written by the backend. + +\item [{\small {\tt READ}}:] Read data from the specified block + device. The front end identifies the device and location to read + from and attaches pages for the data to be copied to (typically via + DMA from the device). The backend acknowledges completed read + requests as they finish. + +\item [{\small {\tt WRITE}}:] Write data to the specified block + device. This functions essentially as {\small {\tt READ}}, except + that the data moves to the device instead of from it. +\end{description} + +%% um... some old text: In overview, the same style of descriptor-ring +%% that is used for network packets is used here. Each domain has one +%% ring that carries operation requests to the hypervisor and carries +%% the results back again. + +%% Rather than copying data, the backend simply maps the domain's +%% buffers in order to enable direct DMA to them. The act of mapping +%% the buffers also increases the reference counts of the underlying +%% pages, so that the unprivileged domain cannot try to return them to +%% the hypervisor, install them as page tables, or any other unsafe +%% behaviour. 
+%% +%% % block API here diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/interface/further_info.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/interface/further_info.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,49 @@ +\chapter{Further Information} + +If you have questions that are not answered by this manual, the +sources of information listed below may be of interest to you. Note +that bug reports, suggestions and contributions related to the +software (or the documentation) should be sent to the Xen developers' +mailing list (address below). + + +\section{Other documentation} + +If you are mainly interested in using (rather than developing for) +Xen, the \emph{Xen Users' Manual} is distributed in the {\tt docs/} +directory of the Xen source distribution. + +% Various HOWTOs are also available in {\tt docs/HOWTOS}. + + +\section{Online references} + +The official Xen web site is found at: +\begin{quote} +{\tt http://www.cl.cam.ac.uk/Research/SRG/netos/xen/} +\end{quote} + +This contains links to the latest versions of all on-line +documentation. + + +\section{Mailing lists} + +There are currently four official Xen mailing lists: + +\begin{description} +\item[xen-devel@xxxxxxxxxxxxxxxxxxx] Used for development + discussions and bug reports. Subscribe at: \\ + {\small {\tt http://lists.xensource.com/xen-devel}} +\item[xen-users@xxxxxxxxxxxxxxxxxxx] Used for installation and usage + discussions and requests for help. Subscribe at: \\ + {\small {\tt http://lists.xensource.com/xen-users}} +\item[xen-announce@xxxxxxxxxxxxxxxxxxx] Used for announcements only. + Subscribe at: \\ + {\small {\tt http://lists.xensource.com/xen-announce}} +\item[xen-changelog@xxxxxxxxxxxxxxxxxxx] Changelog feed + from the unstable and 2.0 trees - developer oriented. Subscribe at: \\ + {\small {\tt http://lists.xensource.com/xen-changelog}} +\end{description} + +Of these, xen-devel is the most active. 
diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/interface/hypercalls.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/interface/hypercalls.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,524 @@ + +\newcommand{\hypercall}[1]{\vspace{2mm}{\sf #1}} + +\chapter{Xen Hypercalls} +\label{a:hypercalls} + +Hypercalls represent the procedural interface to Xen; this appendix +categorizes and describes the current set of hypercalls. + +\section{Invoking Hypercalls} + +Hypercalls are invoked in a manner analogous to system calls in a +conventional operating system; a software interrupt is issued which +vectors to an entry point within Xen. On x86\_32 machines the +instruction required is {\tt int \$82}; the (real) IDT is setup so +that this may only be issued from within ring 1. The particular +hypercall to be invoked is contained in {\tt EAX} --- a list +mapping these values to symbolic hypercall names can be found +in {\tt xen/include/public/xen.h}. + +On some occasions a set of hypercalls will be required to carry +out a higher-level function; a good example is when a guest +operating wishes to context switch to a new process which +requires updating various privileged CPU state. As an optimization +for these cases, there is a generic mechanism to issue a set of +hypercalls as a batch: + +\begin{quote} +\hypercall{multicall(void *call\_list, int nr\_calls)} + +Execute a series of hypervisor calls; {\tt nr\_calls} is the length of +the array of {\tt multicall\_entry\_t} structures pointed to be {\tt +call\_list}. Each entry contains the hypercall operation code followed +by up to 7 word-sized arguments. +\end{quote} + +Note that multicalls are provided purely as an optimization; there is +no requirement to use them when first porting a guest operating +system. + + +\section{Virtual CPU Setup} + +At start of day, a guest operating system needs to setup the virtual +CPU it is executing on. 
This includes installing vectors for the +virtual IDT so that the guest OS can handle interrupts, page faults, +etc. However the very first thing a guest OS must setup is a pair +of hypervisor callbacks: these are the entry points which Xen will +use when it wishes to notify the guest OS of an occurrence. + +\begin{quote} +\hypercall{set\_callbacks(unsigned long event\_selector, unsigned long + event\_address, unsigned long failsafe\_selector, unsigned long + failsafe\_address) } + +Register the normal (``event'') and failsafe callbacks for +event processing. In each case the code segment selector and +address within that segment are provided. The selectors must +have RPL 1; in XenLinux we simply use the kernel's CS for both +{\tt event\_selector} and {\tt failsafe\_selector}. + +The value {\tt event\_address} specifies the address of the guest OSes +event handling and dispatch routine; the {\tt failsafe\_address} +specifies a separate entry point which is used only if a fault occurs +when Xen attempts to use the normal callback. +\end{quote} + + +After installing the hypervisor callbacks, the guest OS can +install a `virtual IDT' by using the following hypercall: + +\begin{quote} +\hypercall{set\_trap\_table(trap\_info\_t *table)} + +Install one or more entries into the per-domain +trap handler table (essentially a software version of the IDT). +Each entry in the array pointed to by {\tt table} includes the +exception vector number with the corresponding segment selector +and entry point. Most guest OSes can use the same handlers on +Xen as when running on the real hardware; an exception is the +page fault handler (exception vector 14) where a modified +stack-frame layout is used. + + +\end{quote} + + + +\section{Scheduling and Timer} + +Domains are preemptively scheduled by Xen according to the +parameters installed by domain 0 (see Section~\ref{s:dom0ops}). 
+In addition, however, a domain may choose to explicitly +control certain behavior with the following hypercall: + +\begin{quote} +\hypercall{sched\_op(unsigned long op)} + +Request scheduling operation from hypervisor. The options are: {\it +yield}, {\it block}, and {\it shutdown}. {\it yield} keeps the +calling domain runnable but may cause a reschedule if other domains +are runnable. {\it block} removes the calling domain from the run +queue and cause is to sleeps until an event is delivered to it. {\it +shutdown} is used to end the domain's execution; the caller can +additionally specify whether the domain should reboot, halt or +suspend. +\end{quote} + +To aid the implementation of a process scheduler within a guest OS, +Xen provides a virtual programmable timer: + +\begin{quote} +\hypercall{set\_timer\_op(uint64\_t timeout)} + +Request a timer event to be sent at the specified system time (time +in nanoseconds since system boot). The hypercall actually passes the +64-bit timeout value as a pair of 32-bit values. + +\end{quote} + +Note that calling {\tt set\_timer\_op()} prior to {\tt sched\_op} +allows block-with-timeout semantics. + + +\section{Page Table Management} + +Since guest operating systems have read-only access to their page +tables, Xen must be involved when making any changes. The following +multi-purpose hypercall can be used to modify page-table entries, +update the machine-to-physical mapping table, flush the TLB, install +a new page-table base pointer, and more. + +\begin{quote} +\hypercall{mmu\_update(mmu\_update\_t *req, int count, int *success\_count)} + +Update the page table for the domain; a set of {\tt count} updates are +submitted for processing in a batch, with {\tt success\_count} being +updated to report the number of successful updates. 
+ +Each element of {\tt req[]} contains a pointer (address) and value; +the least significant 2-bits of the pointer are used to distinguish +the type of update requested as follows: +\begin{description} + +\item[\it MMU\_NORMAL\_PT\_UPDATE:] update a page directory entry or +page table entry to the associated value; Xen will check that the +update is safe, as described in Chapter~\ref{c:memory}. + +\item[\it MMU\_MACHPHYS\_UPDATE:] update an entry in the + machine-to-physical table. The calling domain must own the machine + page in question (or be privileged). + +\item[\it MMU\_EXTENDED\_COMMAND:] perform additional MMU operations. +The set of additional MMU operations is considerable, and includes +updating {\tt cr3} (or just re-installing it for a TLB flush), +flushing the cache, installing a new LDT, or pinning \& unpinning +page-table pages (to ensure their reference count doesn't drop to zero +which would require a revalidation of all entries). + +Further extended commands are used to deal with granting and +acquiring page ownership; see Section~\ref{s:idc}. + + +\end{description} + +More details on the precise format of all commands can be +found in {\tt xen/include/public/xen.h}. + + +\end{quote} + +Explicitly updating batches of page table entries is extremely +efficient, but can require a number of alterations to the guest +OS. Using the writable page table mode (Chapter~\ref{c:memory}) is +recommended for new OS ports. + +Regardless of which page table update mode is being used, however, +there are some occasions (notably handling a demand page fault) where +a guest OS will wish to modify exactly one PTE rather than a +batch. This is catered for by the following: + +\begin{quote} +\hypercall{update\_va\_mapping(unsigned long page\_nr, unsigned long +val, \\ unsigned long flags)} + +Update the currently installed PTE for the page {\tt page\_nr} to +{\tt val}. As with {\tt mmu\_update()}, Xen checks the modification +is safe before applying it. 
The {\tt flags} determine which kind +of TLB flush, if any, should follow the update. + +\end{quote} + +Finally, sufficiently privileged domains may occasionally wish to manipulate +the pages of others: +\begin{quote} + +\hypercall{update\_va\_mapping\_otherdomain(unsigned long page\_nr, +unsigned long val, unsigned long flags, uint16\_t domid)} + +Identical to {\tt update\_va\_mapping()} save that the pages being +mapped must belong to the domain {\tt domid}. + +\end{quote} + +This privileged operation is currently used by backend virtual device +drivers to safely map pages containing I/O data. + + + +\section{Segmentation Support} + +Xen allows guest OSes to install a custom GDT if they require it; +this is context switched transparently whenever a domain is +[de]scheduled. The following hypercall is effectively a +`safe' version of {\tt lgdt}: + +\begin{quote} +\hypercall{set\_gdt(unsigned long *frame\_list, int entries)} + +Install a global descriptor table for a domain; {\tt frame\_list} is +an array of up to 16 machine page frames within which the GDT resides, +with {\tt entries} being the actual number of descriptor-entry +slots. All page frames must be mapped read-only within the guest's +address space, and the table must be large enough to contain Xen's +reserved entries (see {\tt xen/include/public/arch-x86\_32.h}). + +\end{quote} + +Many guest OSes will also wish to install LDTs; this is achieved by +using {\tt mmu\_update()} with an extended command, passing the +linear address of the LDT base along with the number of entries. No +special safety checks are required; Xen needs to perform this task +simply since {\tt lldt} requires CPL 0. 
+ + +Xen also allows guest operating systems to update just an +individual segment descriptor in the GDT or LDT: + +\begin{quote} +\hypercall{update\_descriptor(unsigned long ma, unsigned long word1, +unsigned long word2)} + +Update the GDT/LDT entry at machine address {\tt ma}; the new +8-byte descriptor is stored in {\tt word1} and {\tt word2}. +Xen performs a number of checks to ensure the descriptor is +valid. + +\end{quote} + +Guest OSes can use the above in place of context switching entire +LDTs (or the GDT) when the number of changing descriptors is small. + +\section{Context Switching} + +When a guest OS wishes to context switch between two processes, +it can use the page table and segmentation hypercalls described +above to perform the the bulk of the privileged work. In addition, +however, it will need to invoke Xen to switch the kernel (ring 1) +stack pointer: + +\begin{quote} +\hypercall{stack\_switch(unsigned long ss, unsigned long esp)} + +Request kernel stack switch from hypervisor; {\tt ss} is the new +stack segment, which {\tt esp} is the new stack pointer. + +\end{quote} + +A final useful hypercall for context switching allows ``lazy'' +save and restore of floating point state: + +\begin{quote} +\hypercall{fpu\_taskswitch(void)} + +This call instructs Xen to set the {\tt TS} bit in the {\tt cr0} +control register; this means that the next attempt to use floating +point will cause a trap which the guest OS can trap. Typically it will +then save/restore the FP state, and clear the {\tt TS} bit. +\end{quote} + +This is provided as an optimization only; guest OSes can also choose +to save and restore FP state on all context switches for simplicity. + + +\section{Physical Memory Management} + +As mentioned previously, each domain has a maximum and current +memory allocation. The maximum allocation, set at domain creation +time, cannot be modified. 
However a domain can choose to reduce +and subsequently grow its current allocation by using the +following call: + +\begin{quote} +\hypercall{dom\_mem\_op(unsigned int op, unsigned long *extent\_list, + unsigned long nr\_extents, unsigned int extent\_order)} + +Increase or decrease current memory allocation (as determined by +the value of {\tt op}). Each invocation provides a list of +extents each of which is $2^s$ pages in size, +where $s$ is the value of {\tt extent\_order}. + +\end{quote} + +In addition to simply reducing or increasing the current memory +allocation via a `balloon driver', this call is also useful for +obtaining contiguous regions of machine memory when required (e.g. +for certain PCI devices, or if using superpages). + + +\section{Inter-Domain Communication} +\label{s:idc} + +Xen provides a simple asynchronous notification mechanism via +\emph{event channels}. Each domain has a set of end-points (or +\emph{ports}) which may be bound to an event source (e.g. a physical +IRQ, a virtual IRQ, or an port in another domain). When a pair of +end-points in two different domains are bound together, then a `send' +operation on one will cause an event to be received by the destination +domain. + +The control and use of event channels involves the following hypercall: + +\begin{quote} +\hypercall{event\_channel\_op(evtchn\_op\_t *op)} + +Inter-domain event-channel management; {\tt op} is a discriminated +union which allows the following 7 operations: + +\begin{description} + +\item[\it alloc\_unbound:] allocate a free (unbound) local + port and prepare for connection from a specified domain. +\item[\it bind\_virq:] bind a local port to a virtual +IRQ; any particular VIRQ can be bound to at most one port per domain. +\item[\it bind\_pirq:] bind a local port to a physical IRQ; +once more, a given pIRQ can be bound to at most one port per +domain. Furthermore the calling domain must be sufficiently +privileged. 
+\item[\it bind\_interdomain:] construct an interdomain event +channel; in general, the target domain must have previously allocated +an unbound port for this channel, although this can be bypassed by +privileged domains during domain setup. +\item[\it close:] close an interdomain event channel. +\item[\it send:] send an event to the remote end of a +interdomain event channel. +\item[\it status:] determine the current status of a local port. +\end{description} + +For more details see +{\tt xen/include/public/event\_channel.h}. + +\end{quote} + +Event channels are the fundamental communication primitive between +Xen domains and seamlessly support SMP. However they provide little +bandwidth for communication {\sl per se}, and hence are typically +married with a piece of shared memory to produce effective and +high-performance inter-domain communication. + +Safe sharing of memory pages between guest OSes is carried out by +granting access on a per page basis to individual domains. This is +achieved by using the {\tt grant\_table\_op()} hypercall. + +\begin{quote} +\hypercall{grant\_table\_op(unsigned int cmd, void *uop, unsigned int count)} + +Grant or remove access to a particular page to a particular domain. + +\end{quote} + +This is not currently widely in use by guest operating systems, but +we intend to integrate support more fully in the near future. + +\section{PCI Configuration} + +Domains with physical device access (i.e.\ driver domains) receive +limited access to certain PCI devices (bus address space and +interrupts). However many guest operating systems attempt to +determine the PCI configuration by directly access the PCI BIOS, +which cannot be allowed for safety. + +Instead, Xen provides the following hypercall: + +\begin{quote} +\hypercall{physdev\_op(void *physdev\_op)} + +Perform a PCI configuration option; depending on the value +of {\tt physdev\_op} this can be a PCI config read, a PCI config +write, or a small number of other queries. 
+ +\end{quote} + + +For examples of using {\tt physdev\_op()}, see the +Xen-specific PCI code in the linux sparse tree. + +\section{Administrative Operations} +\label{s:dom0ops} + +A large number of control operations are available to a sufficiently +privileged domain (typically domain 0). These allow the creation and +management of new domains, for example. A complete list is given +below: for more details on any or all of these, please see +{\tt xen/include/public/dom0\_ops.h} + + +\begin{quote} +\hypercall{dom0\_op(dom0\_op\_t *op)} + +Administrative domain operations for domain management. The options are: + +\begin{description} +\item [\it DOM0\_CREATEDOMAIN:] create a new domain + +\item [\it DOM0\_PAUSEDOMAIN:] remove a domain from the scheduler run +queue. + +\item [\it DOM0\_UNPAUSEDOMAIN:] mark a paused domain as schedulable + once again. + +\item [\it DOM0\_DESTROYDOMAIN:] deallocate all resources associated +with a domain + +\item [\it DOM0\_GETMEMLIST:] get list of pages used by the domain + +\item [\it DOM0\_SCHEDCTL:] + +\item [\it DOM0\_ADJUSTDOM:] adjust scheduling priorities for domain + +\item [\it DOM0\_BUILDDOMAIN:] do final guest OS setup for domain + +\item [\it DOM0\_GETDOMAINFO:] get statistics about the domain + +\item [\it DOM0\_GETPAGEFRAMEINFO:] + +\item [\it DOM0\_GETPAGEFRAMEINFO2:] + +\item [\it DOM0\_IOPL:] set I/O privilege level + +\item [\it DOM0\_MSR:] read or write model specific registers + +\item [\it DOM0\_DEBUG:] interactively invoke the debugger + +\item [\it DOM0\_SETTIME:] set system time + +\item [\it DOM0\_READCONSOLE:] read console content from hypervisor buffer ring + +\item [\it DOM0\_PINCPUDOMAIN:] pin domain to a particular CPU + +\item [\it DOM0\_GETTBUFS:] get information about the size and location of + the trace buffers (only on trace-buffer enabled builds) + +\item [\it DOM0\_PHYSINFO:] get information about the host machine + +\item [\it DOM0\_PCIDEV\_ACCESS:] modify PCI device access permissions + +\item 
[\it DOM0\_SCHED\_ID:] get the ID of the current Xen scheduler + +\item [\it DOM0\_SHADOW\_CONTROL:] switch between shadow page-table modes + +\item [\it DOM0\_SETDOMAININITIALMEM:] set initial memory allocation of a domain + +\item [\it DOM0\_SETDOMAINMAXMEM:] set maximum memory allocation of a domain + +\item [\it DOM0\_SETDOMAINVMASSIST:] set domain VM assist options +\end{description} +\end{quote} + +Most of the above are best understood by looking at the code +implementing them (in {\tt xen/common/dom0\_ops.c}) and in +the user-space tools that use them (mostly in {\tt tools/libxc}). + +\section{Debugging Hypercalls} + +A few additional hypercalls are mainly useful for debugging: + +\begin{quote} +\hypercall{console\_io(int cmd, int count, char *str)} + +Use Xen to interact with the console; operations are: + +{\it CONSOLEIO\_write}: Output count characters from buffer str. + +{\it CONSOLEIO\_read}: Input at most count characters into buffer str. +\end{quote} + +A pair of hypercalls allows access to the underlying debug registers: +\begin{quote} +\hypercall{set\_debugreg(int reg, unsigned long value)} + +Set debug register {\tt reg} to {\tt value} + +\hypercall{get\_debugreg(int reg)} + +Return the contents of the debug register {\tt reg} +\end{quote} + +And finally: +\begin{quote} +\hypercall{xen\_version(int cmd)} + +Request Xen version number. +\end{quote} + +This is useful to ensure that user-space tools are in sync +with the underlying hypervisor. + +\section{Deprecated Hypercalls} + +Xen is under constant development and refinement; as such there +are plans to improve the way in which various pieces of functionality +are exposed to guest OSes. + +\begin{quote} +\hypercall{vm\_assist(unsigned int cmd, unsigned int type)} + +Toggle various memory management modes (in particular wrritable page +tables and superpage support). 
+ +\end{quote} + +This is likely to be replaced with mode values in the shared +information page since this is more resilient for resumption +after migration or checkpoint. diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/interface/memory.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/interface/memory.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,162 @@ +\chapter{Memory} +\label{c:memory} + +Xen is responsible for managing the allocation of physical memory to +domains, and for ensuring safe use of the paging and segmentation +hardware. + + +\section{Memory Allocation} + +Xen resides within a small fixed portion of physical memory; it also +reserves the top 64MB of every virtual address space. The remaining +physical memory is available for allocation to domains at a page +granularity. Xen tracks the ownership and use of each page, which +allows it to enforce secure partitioning between domains. + +Each domain has a maximum and current physical memory allocation. A +guest OS may run a `balloon driver' to dynamically adjust its current +memory allocation up to its limit. + + +%% XXX SMH: I use machine and physical in the next section (which is +%% kinda required for consistency with code); wonder if this section +%% should use same terms? +%% +%% Probably. +%% +%% Merging this and below section at some point prob makes sense. + +\section{Pseudo-Physical Memory} + +Since physical memory is allocated and freed on a page granularity, +there is no guarantee that a domain will receive a contiguous stretch +of physical memory. However most operating systems do not have good +support for operating in a fragmented physical address space. To aid +porting such operating systems to run on top of Xen, we make a +distinction between \emph{machine memory} and \emph{pseudo-physical + memory}. + +Put simply, machine memory refers to the entire amount of memory +installed in the machine, including that reserved by Xen, in use by +various domains, or currently unallocated. 
We consider machine memory +to comprise a set of 4K \emph{machine page frames} numbered +consecutively starting from 0. Machine frame numbers mean the same +within Xen or any domain. + +Pseudo-physical memory, on the other hand, is a per-domain +abstraction. It allows a guest operating system to consider its memory +allocation to consist of a contiguous range of physical page frames +starting at physical frame 0, despite the fact that the underlying +machine page frames may be sparsely allocated and in any order. + +To achieve this, Xen maintains a globally readable {\it + machine-to-physical} table which records the mapping from machine +page frames to pseudo-physical ones. In addition, each domain is +supplied with a {\it physical-to-machine} table which performs the +inverse mapping. Clearly the machine-to-physical table has size +proportional to the amount of RAM installed in the machine, while each +physical-to-machine table has size proportional to the memory +allocation of the given domain. + +Architecture dependent code in guest operating systems can then use +the two tables to provide the abstraction of pseudo-physical memory. +In general, only certain specialized parts of the operating system +(such as page table management) needs to understand the difference +between machine and pseudo-physical addresses. + + +\section{Page Table Updates} + +In the default mode of operation, Xen enforces read-only access to +page tables and requires guest operating systems to explicitly request +any modifications. Xen validates all such requests and only applies +updates that it deems safe. This is necessary to prevent domains from +adding arbitrary mappings to their page tables. + +To aid validation, Xen associates a type and reference count with each +memory page. 
A page has one of the following mutually-exclusive types +at any point in time: page directory ({\sf PD}), page table ({\sf + PT}), local descriptor table ({\sf LDT}), global descriptor table +({\sf GDT}), or writable ({\sf RW}). Note that a guest OS may always +create readable mappings of its own memory regardless of its current +type. + +%%% XXX: possibly explain more about ref count 'lifecycle' here? +This mechanism is used to maintain the invariants required for safety; +for example, a domain cannot have a writable mapping to any part of a +page table as this would require the page concerned to simultaneously +be of types {\sf PT} and {\sf RW}. + + +% \section{Writable Page Tables} + +Xen also provides an alternative mode of operation in which guests +have the illusion that their page tables are directly writable. Of +course this is not really the case, since Xen must still validate +modifications to ensure secure partitioning. To this end, Xen traps +any write attempt to a memory page of type {\sf PT} (i.e., that is +currently part of a page table). If such an access occurs, Xen +temporarily allows write access to that page while at the same time +\emph{disconnecting} it from the page table that is currently in use. +This allows the guest to safely make updates to the page because the +newly-updated entries cannot be used by the MMU until Xen revalidates +and reconnects the page. Reconnection occurs automatically in a +number of situations: for example, when the guest modifies a different +page-table page, when the domain is preempted, or whenever the guest +uses Xen's explicit page-table update interfaces. + +Finally, Xen also supports a form of \emph{shadow page tables} in +which the guest OS uses an independent copy of page tables which are +unknown to the hardware (i.e.\ which are never pointed to by {\tt + cr3}). Instead Xen propagates changes made to the guest's tables to +the real ones, and vice versa. 
This is useful for logging page writes +(e.g.\ for live migration or checkpoint). A full version of the shadow +page tables also allows guest OS porting with less effort. + + +\section{Segment Descriptor Tables} + +On boot a guest is supplied with a default GDT, which does not reside +within its own memory allocation. If the guest wishes to use other +than the default `flat' ring-1 and ring-3 segments that this GDT +provides, it must register a custom GDT and/or LDT with Xen, allocated +from its own memory. Note that a number of GDT entries are reserved by +Xen -- any custom GDT must also include sufficient space for these +entries. + +For example, the following hypercall is used to specify a new GDT: + +\begin{quote} + int {\bf set\_gdt}(unsigned long *{\em frame\_list}, int {\em + entries}) + + \emph{frame\_list}: An array of up to 16 machine page frames within + which the GDT resides. Any frame registered as a GDT frame may only + be mapped read-only within the guest's address space (e.g., no + writable mappings, no use as a page-table page, and so on). + + \emph{entries}: The number of descriptor-entry slots in the GDT. + Note that the table must be large enough to contain Xen's reserved + entries; thus we must have `{\em entries $>$ + LAST\_RESERVED\_GDT\_ENTRY}\ '. Note also that, after registering + the GDT, slots \emph{FIRST\_} through + \emph{LAST\_RESERVED\_GDT\_ENTRY} are no longer usable by the guest + and may be overwritten by Xen. +\end{quote} + +The LDT is updated via the generic MMU update mechanism (i.e., via the +{\tt mmu\_update()} hypercall). + +\section{Start of Day} + +The start-of-day environment for guest operating systems is rather +different to that provided by the underlying hardware. In particular, +the processor is already executing in protected mode with paging +enabled. + +{\it Domain 0} is created and booted by Xen itself. 
For all subsequent +domains, the analogue of the boot-loader is the {\it domain builder}, +user-space software running in {\it domain 0}. The domain builder is +responsible for building the initial page tables for a domain and +loading its kernel image at the appropriate virtual address. diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/interface/scheduling.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/interface/scheduling.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,268 @@ +\chapter{Scheduling API} + +The scheduling API is used by both the schedulers described above and should +also be used by any new schedulers. It provides a generic interface and also +implements much of the ``boilerplate'' code. + +Schedulers conforming to this API are described by the following +structure: + +\begin{verbatim} +struct scheduler +{ + char *name; /* full name for this scheduler */ + char *opt_name; /* option name for this scheduler */ + unsigned int sched_id; /* ID for this scheduler */ + + int (*init_scheduler) (); + int (*alloc_task) (struct task_struct *); + void (*add_task) (struct task_struct *); + void (*free_task) (struct task_struct *); + void (*rem_task) (struct task_struct *); + void (*wake_up) (struct task_struct *); + void (*do_block) (struct task_struct *); + task_slice_t (*do_schedule) (s_time_t); + int (*control) (struct sched_ctl_cmd *); + int (*adjdom) (struct task_struct *, + struct sched_adjdom_cmd *); + s32 (*reschedule) (struct task_struct *); + void (*dump_settings) (void); + void (*dump_cpu_state) (int); + void (*dump_runq_el) (struct task_struct *); +}; +\end{verbatim} + +The only method that {\em must} be implemented is +{\tt do\_schedule()}. However, if there is not some implementation for the +{\tt wake\_up()} method then waking tasks will not get put on the runqueue! + +The fields of the above structure are described in more detail below. + +\subsubsection{name} + +The name field should point to a descriptive ASCII string. 
+ +\subsubsection{opt\_name} + +This field is the value of the {\tt sched=} boot-time option that will select +this scheduler. + +\subsubsection{sched\_id} + +This is an integer that uniquely identifies this scheduler. There should be a +macro corresponding to this scheduler ID in {\tt <xen/sched-if.h>}. + +\subsubsection{init\_scheduler} + +\paragraph*{Purpose} + +This is a function for performing any scheduler-specific initialisation. For +instance, it might allocate memory for per-CPU scheduler data and initialise it +appropriately. + +\paragraph*{Call environment} + +This function is called after the initialisation performed by the generic +layer. The function is called exactly once, for the scheduler that has been +selected. + +\paragraph*{Return values} + +This should return negative on failure --- this will cause an +immediate panic and the system will fail to boot. + +\subsubsection{alloc\_task} + +\paragraph*{Purpose} +Called when a {\tt task\_struct} is allocated by the generic scheduler +layer. A particular scheduler implementation may use this method to +allocate per-task data for this task. It may use the {\tt +sched\_priv} pointer in the {\tt task\_struct} to point to this data. + +\paragraph*{Call environment} +The generic layer guarantees that the {\tt sched\_priv} field will +remain intact from the time this method is called until the task is +deallocated (so long as the scheduler implementation does not change +it explicitly!). + +\paragraph*{Return values} +Negative on failure. + +\subsubsection{add\_task} + +\paragraph*{Purpose} + +Called when a task is initially added by the generic layer. + +\paragraph*{Call environment} + +The fields in the {\tt task\_struct} are now filled out and available for use. +Schedulers should implement appropriate initialisation of any per-task private +information in this method. + +\subsubsection{free\_task} + +\paragraph*{Purpose} + +Schedulers should free the space used by any associated private data +structures. 
+ +\paragraph*{Call environment} + +This is called when a {\tt task\_struct} is about to be deallocated. +The generic layer will have done generic task removal operations and +(if implemented) called the scheduler's {\tt rem\_task} method before +this method is called. + +\subsubsection{rem\_task} + +\paragraph*{Purpose} + +This is called when a task is being removed from scheduling (but is +not yet being freed). + +\subsubsection{wake\_up} + +\paragraph*{Purpose} + +Called when a task is woken up, this method should put the task on the runqueue +(or do the scheduler-specific equivalent action). + +\paragraph*{Call environment} + +The task is already set to state RUNNING. + +\subsubsection{do\_block} + +\paragraph*{Purpose} + +This function is called when a task is blocked. This function should +not remove the task from the runqueue. + +\paragraph*{Call environment} + +The EVENTS\_MASTER\_ENABLE\_BIT is already set and the task state changed to +TASK\_INTERRUPTIBLE on entry to this method. A call to the {\tt + do\_schedule} method will be made after this method returns, in +order to select the next task to run. + +\subsubsection{do\_schedule} + +This method must be implemented. + +\paragraph*{Purpose} + +The method is called each time a new task must be chosen for scheduling on the +current CPU. The current time is passed as the single argument (the current +task can be found using the {\tt current} macro). + +This method should select the next task to run on this CPU and set its minimum +time to run as well as returning the data described below. + +This method should also take the appropriate action if the previous +task has blocked, e.g. removing it from the runqueue. + +\paragraph*{Call environment} + +The other fields in the {\tt task\_struct} are updated by the generic layer, +which also performs all Xen-specific tasks and performs the actual task switch +(unless the previous task has been chosen again). 
+ +This method is called with the {\tt schedule\_lock} held for the current CPU +and local interrupts disabled. + +\paragraph*{Return values} + +Must return a {\tt struct task\_slice} describing what task to run and how long +for (at maximum). + +\subsubsection{control} + +\paragraph*{Purpose} + +This method is called for global scheduler control operations. It takes a +pointer to a {\tt struct sched\_ctl\_cmd}, which it should either +source data from or populate with data, depending on the value of the +{\tt direction} field. + +\paragraph*{Call environment} + +The generic layer guarantees that when this method is called, the +caller selected the correct scheduler ID, hence the scheduler's +implementation does not need to sanity-check these parts of the call. + +\paragraph*{Return values} + +This function should return the value to be passed back to user space, hence it +should either be 0 or an appropriate errno value. + +\subsubsection{sched\_adjdom} + +\paragraph*{Purpose} + +This method is called to adjust the scheduling parameters of a particular +domain, or to query their current values. The function should check +the {\tt direction} field of the {\tt sched\_adjdom\_cmd} it receives in +order to determine which of these operations is being performed. + +\paragraph*{Call environment} + +The generic layer guarantees that the caller has specified the correct +control interface version and scheduler ID and that the supplied {\tt +task\_struct} will not be deallocated during the call (hence it is not +necessary to {\tt get\_task\_struct}). + +\paragraph*{Return values} + +This function should return the value to be passed back to user space, hence it +should either be 0 or an appropriate errno value. + +\subsubsection{reschedule} + +\paragraph*{Purpose} + +This method is called to determine if a reschedule is required as a result of a +particular task. 
+ +\paragraph*{Call environment} +The generic layer will cause a reschedule if the current domain is the idle +task or it has exceeded its minimum time slice before a reschedule. The +generic layer guarantees that the task passed is not currently running but is +on the runqueue. + +\paragraph*{Return values} + +Should return a mask of CPUs to cause a reschedule on. + +\subsubsection{dump\_settings} + +\paragraph*{Purpose} + +If implemented, this should dump any private global settings for this +scheduler to the console. + +\paragraph*{Call environment} + +This function is called with interrupts enabled. + +\subsubsection{dump\_cpu\_state} + +\paragraph*{Purpose} + +This method should dump any private settings for the specified CPU. + +\paragraph*{Call environment} + +This function is called with interrupts disabled and the {\tt schedule\_lock} +for the specified CPU held. + +\subsubsection{dump\_runq\_el} + +\paragraph*{Purpose} + +This method should dump any private settings for the specified task. + +\paragraph*{Call environment} + +This function is called with interrupts disabled and the {\tt schedule\_lock} +for the task's CPU held. diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user/build.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/user/build.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,170 @@ +\chapter{Build, Boot and Debug Options} + +This chapter describes the build- and boot-time options which may be +used to tailor your Xen system. + + +\section{Xen Build Options} + +Xen provides a number of build-time options which should be set as +environment variables or passed on make's command-line. + +\begin{description} +\item[verbose=y] Enable debugging messages when Xen detects an + unexpected condition. Also enables console output from all domains. +\item[debug=y] Enable debug assertions. Implies {\bf verbose=y}. + (Primarily useful for tracing bugs in Xen). +\item[debugger=y] Enable the in-Xen debugger. 
This can be used to + debug Xen, guest OSes, and applications. +\item[perfc=y] Enable performance counters for significant events + within Xen. The counts can be reset or displayed on Xen's console + via console control keys. +\item[trace=y] Enable per-cpu trace buffers which log a range of + events within Xen for collection by control software. +\end{description} + + +\section{Xen Boot Options} +\label{s:xboot} + +These options are used to configure Xen's behaviour at runtime. They +should be appended to Xen's command line, either manually or by +editing \path{grub.conf}. + +\begin{description} +\item [ noreboot ] Don't reboot the machine automatically on errors. + This is useful to catch debug output if you aren't catching console + messages via the serial line. +\item [ nosmp ] Disable SMP support. This option is implied by + `ignorebiostables'. +\item [ watchdog ] Enable NMI watchdog which can report certain + failures. +\item [ noirqbalance ] Disable software IRQ balancing and affinity. + This can be used on systems such as Dell 1850/2850 that have + workarounds in hardware for IRQ-routing issues. +\item [ badpage=$<$page number$>$,$<$page number$>$, \ldots ] Specify + a list of pages not to be allocated for use because they contain bad + bytes. For example, if your memory tester says that byte 0x12345678 + is bad, you would place `badpage=0x12345' on Xen's command line. +\item [ com1=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ + com2=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ ] \mbox{}\\ + Xen supports up to two 16550-compatible serial ports. For example: + `com1=9600, 8n1, 0x408, 5' maps COM1 to a 9600-baud port, 8 data + bits, no parity, 1 stop bit, I/O port base 0x408, IRQ 5. If some + configuration options are standard (e.g., I/O base and IRQ), then + only a prefix of the full configuration string need be specified. If + the baud rate is pre-configured (e.g., by the bootloader) then you + can specify `auto' in place of a numeric baud rate. 
+\item [ console=$<$specifier list$>$ ] Specify the destination for Xen + console I/O. This is a comma-separated list of, for example: + \begin{description} + \item[ vga ] Use VGA console and allow keyboard input. + \item[ com1 ] Use serial port com1. + \item[ com2H ] Use serial port com2. Transmitted chars will have the + MSB set. Received chars must have MSB set. + \item[ com2L] Use serial port com2. Transmitted chars will have the + MSB cleared. Received chars must have MSB cleared. + \end{description} + The latter two examples allow a single port to be shared by two + subsystems (e.g.\ console and debugger). Sharing is controlled by + MSB of each transmitted/received character. [NB. Default for this + option is `com1,vga'] +\item [ sync\_console ] Force synchronous console output. This is + useful if your system fails unexpectedly before it has sent all + available output to the console. In most cases Xen will + automatically enter synchronous mode when an exceptional event + occurs, but this option provides a manual fallback. +\item [ conswitch=$<$switch-char$><$auto-switch-char$>$ ] Specify how + to switch serial-console input between Xen and DOM0. The required + sequence is CTRL-$<$switch-char$>$ pressed three times. Specifying + the backtick character disables switching. The + $<$auto-switch-char$>$ specifies whether Xen should auto-switch + input to DOM0 when it boots --- if it is `x' then auto-switching is + disabled. Any other value, or omitting the character, enables + auto-switching. [NB. Default switch-char is `a'.] +\item [ nmi=xxx ] + Specify what to do with an NMI parity or I/O error. \\ + `nmi=fatal': Xen prints a diagnostic and then hangs. \\ + `nmi=dom0': Inform DOM0 of the NMI. \\ + `nmi=ignore': Ignore the NMI. +\item [ mem=xxx ] Set the physical RAM address limit. Any RAM + appearing beyond this physical address in the memory map will be + ignored. 
This parameter may be specified with a B, K, M or G suffix, + representing bytes, kilobytes, megabytes and gigabytes respectively. + The default unit, if no suffix is specified, is kilobytes. +\item [ dom0\_mem=xxx ] Set the amount of memory to be allocated to + domain0. In Xen 3.x the parameter may be specified with a B, K, M or + G suffix, representing bytes, kilobytes, megabytes and gigabytes + respectively; if no suffix is specified, the parameter defaults to + kilobytes. In previous versions of Xen, suffixes were not supported + and the value is always interpreted as kilobytes. +\item [ tbuf\_size=xxx ] Set the size of the per-cpu trace buffers, in + pages (default 1). Note that the trace buffers are only enabled in + debug builds. Most users can ignore this feature completely. +\item [ sched=xxx ] Select the CPU scheduler Xen should use. The + current possibilities are `bvt' (default), `atropos' and `rrobin'. + For more information see Section~\ref{s:sched}. +\item [ apic\_verbosity=debug,verbose ] Print more detailed + information about local APIC and IOAPIC configuration. +\item [ lapic ] Force use of local APIC even when left disabled by + uniprocessor BIOS. +\item [ nolapic ] Ignore local APIC in a uniprocessor system, even if + enabled by the BIOS. +\item [ apic=bigsmp,default,es7000,summit ] Specify NUMA platform. + This can usually be probed automatically. +\end{description} + +In addition, the following options may be specified on the Xen command +line. Since domain 0 shares responsibility for booting the platform, +Xen will automatically propagate these options to its command line. +These options are taken from Linux's command-line syntax with +unchanged semantics. + +\begin{description} +\item [ acpi=off,force,strict,ht,noirq,\ldots ] Modify how Xen (and + domain 0) parses the BIOS ACPI tables. +\item [ acpi\_skip\_timer\_override ] Instruct Xen (and domain~0) to + ignore timer-interrupt override instructions specified by the BIOS + ACPI tables. 
+\item [ noapic ] Instruct Xen (and domain~0) to ignore any IOAPICs + that are present in the system, and instead continue to use the + legacy PIC. +\end{description} + + +\section{XenLinux Boot Options} + +In addition to the standard Linux kernel boot options, we support: +\begin{description} +\item[ xencons=xxx ] Specify the device node to which the Xen virtual + console driver is attached. The following options are supported: + \begin{center} + \begin{tabular}{l} + `xencons=off': disable virtual console \\ + `xencons=tty': attach console to /dev/tty1 (tty0 at boot-time) \\ + `xencons=ttyS': attach console to /dev/ttyS0 + \end{tabular} +\end{center} +The default is ttyS for dom0 and tty for all other domains. +\end{description} + + +\section{Debugging} +\label{s:keys} + +Xen has a set of debugging features that can be useful to try and +figure out what's going on. Hit `h' on the serial line (if you +specified a baud rate on the Xen command line) or ScrollLock-h on the +keyboard to get a list of supported commands. + +If you have a crash you'll likely get a crash dump containing an EIP +(PC) which, along with an \path{objdump -d image}, can be useful in +figuring out what's happened. Debug a Xenlinux image just as you +would any other Linux kernel. + +%% We supply a handy debug terminal program which you can find in +%% \path{/usr/local/src/xen-2.0.bk/tools/misc/miniterm/} This should +%% be built and executed on another machine that is connected via a +%% null modem cable. Documentation is included. Alternatively, if the +%% Xen machine is connected to a serial-port server then we supply a +%% dumb TCP terminal client, {\tt xencons}. 
diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user/control_software.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/user/control_software.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,115 @@ +\chapter{Control Software} + +The Xen control software includes the \xend\ node control daemon +(which must be running), the xm command line tools, and the prototype +xensv web interface. + +\section{\Xend\ (node control daemon)} +\label{s:xend} + +The Xen Daemon (\Xend) performs system management functions related to +virtual machines. It forms a central point of control for a machine +and can be controlled using an HTTP-based protocol. \Xend\ must be +running in order to start and manage virtual machines. + +\Xend\ must be run as root because it needs access to privileged +system management functions. A small set of commands may be issued on +the \xend\ command line: + +\begin{tabular}{ll} + \verb!# xend start! & start \xend, if not already running \\ + \verb!# xend stop! & stop \xend\ if already running \\ + \verb!# xend restart! & restart \xend\ if running, otherwise start it \\ + % \verb!# xend trace_start! & start \xend, with very detailed debug logging \\ + \verb!# xend status! & indicates \xend\ status by its return code +\end{tabular} + +A SysV init script called {\tt xend} is provided to start \xend\ at +boot time. {\tt make install} installs this script in +\path{/etc/init.d}. To enable it, you have to make symbolic links in +the appropriate runlevel directories or use the {\tt chkconfig} tool, +where available. + +Once \xend\ is running, more sophisticated administration can be done +using the xm tool (see Section~\ref{s:xm}) and the experimental Xensv +web interface (see Section~\ref{s:xensv}). + +As \xend\ runs, events will be logged to \path{/var/log/xend.log} and, +if the migration assistant daemon (\path{xfrd}) has been started, +\path{/var/log/xfrd.log}. These may be of use for troubleshooting +problems. 
+ +\section{Xm (command line interface)} +\label{s:xm} + +The xm tool is the primary tool for managing Xen from the console. +The general format of an xm command line is: + +\begin{verbatim} +# xm command [switches] [arguments] [variables] +\end{verbatim} + +The available \emph{switches} and \emph{arguments} are dependent on +the \emph{command} chosen. The \emph{variables} may be set using +declarations of the form {\tt variable=value} and command line +declarations override any of the values in the configuration file +being used, including the standard variables described above and any +custom variables (for instance, the \path{xmdefconfig} file uses a +{\tt vmid} variable). + +The available commands are as follows: + +\begin{description} +\item[set-mem] Request a domain to adjust its memory footprint. +\item[create] Create a new domain. +\item[destroy] Kill a domain immediately. +\item[list] List running domains. +\item[shutdown] Ask a domain to shutdown. +\item[dmesg] Fetch the Xen (not Linux!) boot output. +\item[consoles] Lists the available consoles. +\item[console] Connect to the console for a domain. +\item[help] Get help on xm commands. +\item[save] Suspend a domain to disk. +\item[restore] Restore a domain from disk. +\item[pause] Pause a domain's execution. +\item[unpause] Un-pause a domain. +\item[pincpu] Pin a domain to a CPU. +\item[bvt] Set BVT scheduler parameters for a domain. +\item[bvt\_ctxallow] Set the BVT context switching allowance for the + system. +\item[atropos] Set the atropos parameters for a domain. +\item[rrobin] Set the round robin time slice for the system. +\item[info] Get information about the Xen host. +\item[call] Call a \xend\ HTTP API function directly. 
+\end{description} + +For a detailed overview of switches, arguments and variables to each +command try +\begin{quote} +\begin{verbatim} +# xm help command +\end{verbatim} +\end{quote} + +\section{Xensv (web control interface)} +\label{s:xensv} + +Xensv is the experimental web control interface for managing a Xen +machine. It can be used to perform some (but not yet all) of the +management tasks that can be done using the xm tool. + +It can be started using: +\begin{quote} + \verb_# xensv start_ +\end{quote} +and stopped using: +\begin{quote} + \verb_# xensv stop_ +\end{quote} + +By default, Xensv will serve out the web interface on port 8080. This +can be changed by editing +\path{/usr/lib/python2.3/site-packages/xen/sv/params.py}. + +Once Xensv is running, the web interface can be used to create and +manage running domains. diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user/debian.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/user/debian.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,154 @@ +\chapter{Installing Xen / XenLinux on Debian} + +The Debian project provides a tool called \path{debootstrap} which +allows a base Debian system to be installed into a filesystem without +requiring the host system to have any Debian-specific software (such +as \path{apt}). + +Here's some info how to install Debian 3.1 (Sarge) for an unprivileged +Xen domain: + +\begin{enumerate} + +\item Set up Xen and test that it's working, as described earlier in + this manual. + +\item Create disk images for rootfs and swap. Alternatively, you might + create dedicated partitions, LVM logical volumes, etc.\ if that + suits your setup. +\begin{verbatim} +dd if=/dev/zero of=/path/diskimage bs=1024k count=size_in_mbytes +dd if=/dev/zero of=/path/swapimage bs=1024k count=size_in_mbytes +\end{verbatim} + + If you're going to use this filesystem / disk image only as a + `template' for other vm disk images, something like 300 MB should be + enough. 
(of course it depends what kind of packages you are planning + to install to the template) + +\item Create the filesystem and initialise the swap image +\begin{verbatim} +mkfs.ext3 /path/diskimage +mkswap /path/swapimage +\end{verbatim} + +\item Mount the disk image for installation +\begin{verbatim} +mount -o loop /path/diskimage /mnt/disk +\end{verbatim} + +\item Install \path{debootstrap}. Make sure you have debootstrap + installed on the host. If you are running Debian Sarge (3.1 / + testing) or unstable you can install it by running \path{apt-get + install debootstrap}. Otherwise, it can be downloaded from the + Debian project website. + +\item Install Debian base to the disk image: +\begin{verbatim} +debootstrap --arch i386 sarge /mnt/disk \ + http://ftp.<countrycode>.debian.org/debian +\end{verbatim} + + You can use any other Debian http/ftp mirror you want. + +\item When debootstrap completes successfully, modify settings: +\begin{verbatim} +chroot /mnt/disk /bin/bash +\end{verbatim} + +Edit the following files using vi or nano and make needed changes: +\begin{verbatim} +/etc/hostname +/etc/hosts +/etc/resolv.conf +/etc/network/interfaces +/etc/networks +\end{verbatim} + +Set up access to the services, edit: +\begin{verbatim} +/etc/hosts.deny +/etc/hosts.allow +/etc/inetd.conf +\end{verbatim} + +Add Debian mirror to: +\begin{verbatim} +/etc/apt/sources.list +\end{verbatim} + +Create fstab like this: +\begin{verbatim} +/dev/sda1 / ext3 errors=remount-ro 0 1 +/dev/sda2 none swap sw 0 0 +proc /proc proc defaults 0 0 +\end{verbatim} + +Logout + +\item Unmount the disk image +\begin{verbatim} +umount /mnt/disk +\end{verbatim} + +\item Create Xen 2.0 configuration file for the new domain. You can + use the example-configurations coming with Xen as a template. 
+ + Make sure you have the following set up: +\begin{verbatim} +disk = [ 'file:/path/diskimage,sda1,w', 'file:/path/swapimage,sda2,w' ] +root = "/dev/sda1 ro" +\end{verbatim} + +\item Start the new domain +\begin{verbatim} +xm create -f domain_config_file +\end{verbatim} + +Check that the new domain is running: +\begin{verbatim} +xm list +\end{verbatim} + +\item Attach to the console of the new domain. You should see + something like this when starting the new domain: + +\begin{verbatim} +Started domain testdomain2, console on port 9626 +\end{verbatim} + + There you can see the ID of the console: 26. You can also list the + consoles with \path{xm consoles} (ID is the last two digits of the + port number.) + + Attach to the console: + +\begin{verbatim} +xm console 26 +\end{verbatim} + + or by telnetting to the port 9626 of localhost (the xm console + program works better). + +\item Log in and run base-config + + As a default there's no password for the root. + + Check that everything looks OK, and the system started without + errors. Check that the swap is active, and the network settings are + correct. + + Run \path{/usr/sbin/base-config} to set up the Debian settings. + + Set up the password for root using passwd. + +\item Done. You can exit the console by pressing {\path{Ctrl + ]}} + +\end{enumerate} + + +If you need to create new domains, you can just copy the contents of +the `template'-image to the new disk images, either by mounting the +template and the new image, and using \path{cp -a} or \path{tar} or by +simply copying the image file. Once this is done, modify the +image-specific settings (hostname, network settings, etc). 
diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user/domain_configuration.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/user/domain_configuration.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,281 @@ +\chapter{Domain Configuration} +\label{cha:config} + +The following contains the syntax of the domain configuration files +and description of how to further specify networking, driver domain +and general scheduling behavior. + + +\section{Configuration Files} +\label{s:cfiles} + +Xen configuration files contain the following standard variables. +Unless otherwise stated, configuration items should be enclosed in +quotes: see \path{/etc/xen/xmexample1} and \path{/etc/xen/xmexample2} +for concrete examples of the syntax. + +\begin{description} +\item[kernel] Path to the kernel image. +\item[ramdisk] Path to a ramdisk image (optional). + % \item[builder] The name of the domain build function (e.g. + % {\tt'linux'} or {\tt'netbsd'}. +\item[memory] Memory size in megabytes. +\item[cpu] CPU to run this domain on, or {\tt -1} for auto-allocation. +\item[console] Port to export the domain console on (default 9600 + + domain ID). +\item[nics] Number of virtual network interfaces. +\item[vif] List of MAC addresses (random addresses are assigned if not + given) and bridges to use for the domain's network interfaces, e.g.\ +\begin{verbatim} +vif = [ 'mac=aa:00:00:00:00:11, bridge=xen-br0', + 'bridge=xen-br1' ] +\end{verbatim} + to assign a MAC address and bridge to the first interface and assign + a different bridge to the second interface, leaving \xend\ to choose + the MAC address. +\item[disk] List of block devices to export to the domain, e.g.\ \\ + \verb_disk = [ 'phy:hda1,sda1,r' ]_ \\ + exports physical device \path{/dev/hda1} to the domain as + \path{/dev/sda1} with read-only access. Exporting a disk read-write + which is currently mounted is dangerous -- if you are \emph{certain} + you wish to do this, you can specify \path{w!} as the mode. 
+\item[dhcp] Set to {\tt `dhcp'} if you want to use DHCP to configure + networking. +\item[netmask] Manually configured IP netmask. +\item[gateway] Manually configured IP gateway. +\item[hostname] Set the hostname for the virtual machine. +\item[root] Specify the root device parameter on the kernel command + line. +\item[nfs\_server] IP address for the NFS server (if any). +\item[nfs\_root] Path of the root filesystem on the NFS server (if + any). +\item[extra] Extra string to append to the kernel command line (if + any) +\item[restart] Three possible options: + \begin{description} + \item[always] Always restart the domain, no matter what its exit + code is. + \item[never] Never restart the domain. + \item[onreboot] Restart the domain iff it requests reboot. + \end{description} +\end{description} + +For additional flexibility, it is also possible to include Python +scripting commands in configuration files. An example of this is the +\path{xmexample2} file, which uses Python code to handle the +\path{vmid} variable. + + +%\part{Advanced Topics} + + +\section{Network Configuration} + +For many users, the default installation should work ``out of the +box''. More complicated network setups, for instance with multiple +Ethernet interfaces and/or existing bridging setups will require some +special configuration. + +The purpose of this section is to describe the mechanisms provided by +\xend\ to allow a flexible configuration for Xen's virtual networking. + +\subsection{Xen virtual network topology} + +Each domain network interface is connected to a virtual network +interface in dom0 by a point to point link (effectively a ``virtual +crossover cable''). These devices are named {\tt + vif$<$domid$>$.$<$vifid$>$} (e.g.\ {\tt vif1.0} for the first +interface in domain~1, {\tt vif3.1} for the second interface in +domain~3). + +Traffic on these virtual interfaces is handled in domain~0 using +standard Linux mechanisms for bridging, routing, rate limiting, etc. 
+Xend calls on two shell scripts to perform initial configuration of +the network and configuration of new virtual interfaces. By default, +these scripts configure a single bridge for all the virtual +interfaces. Arbitrary routing / bridging configurations can be +configured by customizing the scripts, as described in the following +section. + +\subsection{Xen networking scripts} + +Xen's virtual networking is configured by two shell scripts (by +default \path{network} and \path{vif-bridge}). These are called +automatically by \xend\ when certain events occur, with arguments to +the scripts providing further contextual information. These scripts +are found by default in \path{/etc/xen/scripts}. The names and +locations of the scripts can be configured in +\path{/etc/xen/xend-config.sxp}. + +\begin{description} +\item[network:] This script is called whenever \xend\ is started or + stopped to respectively initialize or tear down the Xen virtual + network. In the default configuration initialization creates the + bridge `xen-br0' and moves eth0 onto that bridge, modifying the + routing accordingly. When \xend\ exits, it deletes the Xen bridge + and removes eth0, restoring the normal IP and routing configuration. + + %% In configurations where the bridge already exists, this script + %% could be replaced with a link to \path{/bin/true} (for instance). + +\item[vif-bridge:] This script is called for every domain virtual + interface and can configure firewalling rules and add the vif to the + appropriate bridge. By default, this adds and removes VIFs on the + default Xen bridge. +\end{description} + +For more complex network setups (e.g.\ where routing is required or +integrate with existing bridges) these scripts may be replaced with +customized variants for your site's preferred configuration. + +%% There are two possible types of privileges: IO privileges and +%% administration privileges. 
+ + +\section{Driver Domain Configuration} + +I/O privileges can be assigned to allow a domain to directly access +PCI devices itself. This is used to support driver domains. + +Setting back-end privileges is currently only supported in SXP format +config files. To allow a domain to function as a back-end for others, +somewhere within the {\tt vm} element of its configuration file must +be a {\tt back-end} element of the form {\tt (back-end ({\em type}))} +where {\tt \em type} may be either {\tt netif} or {\tt blkif}, +according to the type of virtual device this domain will service. +%% After this domain has been built, \xend will connect all new and +%% existing {\em virtual} devices (of the appropriate type) to that +%% back-end. + +Note that a block back-end cannot currently import virtual block +devices from other domains, and a network back-end cannot import +virtual network devices from other domains. Thus (particularly in the +case of block back-ends, which cannot import a virtual block device as +their root filesystem), you may need to boot a back-end domain from a +ramdisk or a network device. + +Access to PCI devices may be configured on a per-device basis. Xen +will assign the minimal set of hardware privileges to a domain that +are required to control its devices. This can be configured in either +format of configuration file: + +\begin{itemize} +\item SXP Format: Include device elements of the form: \\ + \centerline{ {\tt (device (pci (bus {\em x}) (dev {\em y}) (func {\em z})))}} \\ + inside the top-level {\tt vm} element. Each one specifies the + address of a device this domain is allowed to access --- the numbers + \emph{x},\emph{y} and \emph{z} may be in either decimal or + hexadecimal format. +\item Flat Format: Include a list of PCI device addresses of the + format: \\ + \centerline{{\tt pci = ['x,y,z', \ldots]}} \\ + where each element in the list is a string specifying the components + of the PCI device address, separated by commas. 
The components + ({\tt \em x}, {\tt \em y} and {\tt \em z}) of the list may be + formatted as either decimal or hexadecimal. +\end{itemize} + +%% \section{Administration Domains} + +%% Administration privileges allow a domain to use the `dom0 +%% operations' (so called because they are usually available only to +%% domain 0). A privileged domain can build other domains, set +%% scheduling parameters, etc. + +% Support for other administrative domains is not yet available... +% perhaps we should plumb it in some time + + +\section{Scheduler Configuration} +\label{s:sched} + +Xen offers a boot time choice between multiple schedulers. To select +a scheduler, pass the boot parameter \emph{sched=sched\_name} to Xen, +substituting the appropriate scheduler name. Details of the +schedulers and their parameters are included below; future versions of +the tools will provide a higher-level interface to these tools. + +It is expected that system administrators configure their system to +use the scheduler most appropriate to their needs. Currently, the BVT +scheduler is the recommended choice. + +\subsection{Borrowed Virtual Time} + +{\tt sched=bvt} (the default) \\ + +BVT provides proportional fair shares of the CPU time. It has been +observed to penalize domains that block frequently (e.g.\ I/O +intensive domains), but this can be compensated for by using warping. + +\subsubsection{Global Parameters} + +\begin{description} +\item[ctx\_allow] The context switch allowance is similar to the + ``quantum'' in traditional schedulers. It is the minimum time that + a scheduled domain will be allowed to run before being preempted. +\end{description} + +\subsubsection{Per-domain parameters} + +\begin{description} +\item[mcuadv] The MCU (Minimum Charging Unit) advance determines the + proportional share of the CPU that a domain receives. It is set + inversely proportionally to a domain's sharing weight. 
+\item[warp] The amount of ``virtual time'' the domain is allowed to + warp backwards. +\item[warpl] The warp limit is the maximum time a domain can run + warped for. +\item[warpu] The unwarp requirement is the minimum time a domain must + run unwarped for before it can warp again. +\end{description} + +\subsection{Atropos} + +{\tt sched=atropos} \\ + +Atropos is a soft real time scheduler. It provides guarantees about +absolute shares of the CPU, with a facility for sharing slack CPU time +on a best-effort basis. It can provide timeliness guarantees for +latency-sensitive domains. + +Every domain has an associated period and slice. The domain should +receive `slice' nanoseconds every `period' nanoseconds. This allows +the administrator to configure both the absolute share of the CPU a +domain receives and the frequency with which it is scheduled. + +%% When domains unblock, their period is reduced to the value of the +%% latency hint (the slice is scaled accordingly so that they still +%% get the same proportion of the CPU). For each subsequent period, +%% the slice and period times are doubled until they reach their +%% original values. + +Note: don't over-commit the CPU when using Atropos (i.e.\ don't reserve +more CPU than is available --- the utilization should be kept to +slightly less than 100\% in order to ensure predictable behavior). + +\subsubsection{Per-domain parameters} + +\begin{description} +\item[period] The regular time interval during which a domain is + guaranteed to receive its allocation of CPU time. +\item[slice] The length of time per period that a domain is guaranteed + to run for (in the absence of voluntary yielding of the CPU). +\item[latency] The latency hint is used to control how soon after + waking up a domain it should be scheduled. +\item[xtratime] This is a boolean flag that specifies whether a domain + should be allowed a share of the system slack time. 
+\end{description} + +\subsection{Round Robin} + +{\tt sched=rrobin} \\ + +The round robin scheduler is included as a simple demonstration of +Xen's internal scheduler API. It is not intended for production use. + +\subsubsection{Global Parameters} + +\begin{description} +\item[rr\_slice] The maximum time each domain runs before the next + scheduling decision is made. +\end{description} diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user/domain_filesystem.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/user/domain_filesystem.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,243 @@ +\chapter{Domain Filesystem Storage} + +It is possible to directly export any Linux block device in dom0 to +another domain, or to export filesystems / devices to virtual machines +using standard network protocols (e.g.\ NBD, iSCSI, NFS, etc.). This +chapter covers some of the possibilities. + + +\section{Exporting Physical Devices as VBDs} +\label{s:exporting-physical-devices-as-vbds} + +One of the simplest configurations is to directly export individual +partitions from domain~0 to other domains. To achieve this use the +\path{phy:} specifier in your domain configuration file. For example a +line like +\begin{quote} + \verb_disk = ['phy:hda3,sda1,w']_ +\end{quote} +specifies that the partition \path{/dev/hda3} in domain~0 should be +exported read-write to the new domain as \path{/dev/sda1}; one could +equally well export it as \path{/dev/hda} or \path{/dev/sdb5} should +one wish. + +In addition to local disks and partitions, it is possible to export +any device that Linux considers to be ``a disk'' in the same manner. +For example, if you have iSCSI disks or GNBD volumes imported into +domain~0 you can export these to other domains using the \path{phy:} +disk syntax. 
E.g.: +\begin{quote} + \verb_disk = ['phy:vg/lvm1,sda2,w']_ +\end{quote} + +\begin{center} + \framebox{\bf Warning: Block device sharing} +\end{center} +\begin{quote} + Block devices should typically only be shared between domains in a + read-only fashion otherwise the Linux kernel's file systems will get + very confused as the file system structure may change underneath + them (having the same ext3 partition mounted \path{rw} twice is a + sure fire way to cause irreparable damage)! \Xend\ will attempt to + prevent you from doing this by checking that the device is not + mounted read-write in domain~0, and hasn't already been exported + read-write to another domain. If you want read-write sharing, + export the directory to other domains via NFS from domain~0 (or use + a cluster file system such as GFS or ocfs2). +\end{quote} + + +\section{Using File-backed VBDs} + +It is also possible to use a file in Domain~0 as the primary storage +for a virtual machine. As well as being convenient, this also has the +advantage that the virtual block device will be \emph{sparse} --- +space will only really be allocated as parts of the file are used. So +if a virtual machine uses only half of its disk space then the file +really takes up half of the size allocated. 
+ +For example, to create a 2GB sparse file-backed virtual block device +(actually only consumes 1KB of disk): +\begin{quote} + \verb_# dd if=/dev/zero of=vm1disk bs=1k seek=2048k count=1_ +\end{quote} + +Make a file system in the disk file: +\begin{quote} + \verb_# mkfs -t ext3 vm1disk_ +\end{quote} + +(when the tool asks for confirmation, answer `y') + +Populate the file system e.g.\ by copying from the current root: +\begin{quote} +\begin{verbatim} +# mount -o loop vm1disk /mnt +# cp -ax /{root,dev,var,etc,usr,bin,sbin,lib} /mnt +# mkdir /mnt/{proc,sys,home,tmp} +\end{verbatim} +\end{quote} + +Tailor the file system by editing \path{/etc/fstab}, +\path{/etc/hostname}, etc.\ Don't forget to edit the files in the +mounted file system, instead of your domain~0 filesystem, e.g.\ you +would edit \path{/mnt/etc/fstab} instead of \path{/etc/fstab}. For +this example put \path{/dev/sda1} to root in fstab. + +Now unmount (this is important!): +\begin{quote} + \verb_# umount /mnt_ +\end{quote} + +In the configuration file set: +\begin{quote} + \verb_disk = ['file:/full/path/to/vm1disk,sda1,w']_ +\end{quote} + +As the virtual machine writes to its `disk', the sparse file will be +filled in and consume more space up to the original 2GB. + +{\bf Note that file-backed VBDs may not be appropriate for backing + I/O-intensive domains.} File-backed VBDs are known to experience +substantial slowdowns under heavy I/O workloads, due to the I/O +handling by the loopback block device used to support file-backed VBDs +in dom0. Better I/O performance can be achieved by using either +LVM-backed VBDs (Section~\ref{s:using-lvm-backed-vbds}) or physical +devices as VBDs (Section~\ref{s:exporting-physical-devices-as-vbds}). + +Linux supports a maximum of eight file-backed VBDs across all domains +by default. 
This limit can be statically increased by using the +\emph{max\_loop} module parameter if CONFIG\_BLK\_DEV\_LOOP is +compiled as a module in the dom0 kernel, or by using the +\emph{max\_loop=n} boot option if CONFIG\_BLK\_DEV\_LOOP is compiled +directly into the dom0 kernel. + + +\section{Using LVM-backed VBDs} +\label{s:using-lvm-backed-vbds} + +A particularly appealing solution is to use LVM volumes as backing for +domain file-systems since this allows dynamic growing/shrinking of +volumes as well as snapshot and other features. + +To initialize a partition to support LVM volumes: +\begin{quote} +\begin{verbatim} +# pvcreate /dev/sda10 +\end{verbatim} +\end{quote} + +Create a volume group named `vg' on the physical partition: +\begin{quote} +\begin{verbatim} +# vgcreate vg /dev/sda10 +\end{verbatim} +\end{quote} + +Create a logical volume of size 4GB named `myvmdisk1': +\begin{quote} +\begin{verbatim} +# lvcreate -L4096M -n myvmdisk1 vg +\end{verbatim} +\end{quote} + +You should now see that you have a \path{/dev/vg/myvmdisk1} Make a +filesystem, mount it and populate it, e.g.: +\begin{quote} +\begin{verbatim} +# mkfs -t ext3 /dev/vg/myvmdisk1 +# mount /dev/vg/myvmdisk1 /mnt +# cp -ax / /mnt +# umount /mnt +\end{verbatim} +\end{quote} + +Now configure your VM with the following disk configuration: +\begin{quote} +\begin{verbatim} + disk = [ 'phy:vg/myvmdisk1,sda1,w' ] +\end{verbatim} +\end{quote} + +LVM enables you to grow the size of logical volumes, but you'll need +to resize the corresponding file system to make use of the new space. +Some file systems (e.g.\ ext3) now support online resize. See the LVM +manuals for more details. + +You can also use LVM for creating copy-on-write (CoW) clones of LVM +volumes (known as writable persistent snapshots in LVM terminology). +This facility is new in Linux 2.6.8, so isn't as stable as one might +hope. 
In particular, using lots of CoW LVM disks consumes a lot of +dom0 memory, and error conditions such as running out of disk space +are not handled well. Hopefully this will improve in future. + +To create two copy-on-write clone of the above file system you would +use the following commands: + +\begin{quote} +\begin{verbatim} +# lvcreate -s -L1024M -n myclonedisk1 /dev/vg/myvmdisk1 +# lvcreate -s -L1024M -n myclonedisk2 /dev/vg/myvmdisk1 +\end{verbatim} +\end{quote} + +Each of these can grow to have 1GB of differences from the master +volume. You can grow the amount of space for storing the differences +using the lvextend command, e.g.: +\begin{quote} +\begin{verbatim} +# lvextend +100M /dev/vg/myclonedisk1 +\end{verbatim} +\end{quote} + +Don't let the `differences volume' ever fill up otherwise LVM gets +rather confused. It may be possible to automate the growing process by +using \path{dmsetup wait} to spot the volume getting full and then +issue an \path{lvextend}. + +In principle, it is possible to continue writing to the volume that +has been cloned (the changes will not be visible to the clones), but +we wouldn't recommend this: have the cloned volume as a `pristine' +file system install that isn't mounted directly by any of the virtual +machines. + + +\section{Using NFS Root} + +First, populate a root filesystem in a directory on the server +machine. This can be on a distinct physical machine, or simply run +within a virtual machine on the same node. + +Now configure the NFS server to export this filesystem over the +network by adding a line to \path{/etc/exports}, for instance: + +\begin{quote} + \begin{small} +\begin{verbatim} +/export/vm1root 1.2.3.4/24 (rw,sync,no_root_squash) +\end{verbatim} + \end{small} +\end{quote} + +Finally, configure the domain to use NFS root. 
In addition to the +normal variables, you should make sure to set the following values in +the domain's configuration file: + +\begin{quote} + \begin{small} +\begin{verbatim} +root = '/dev/nfs' +nfs_server = '2.3.4.5' # substitute IP address of server +nfs_root = '/path/to/root' # path to root FS on the server +\end{verbatim} + \end{small} +\end{quote} + +The domain will need network access at boot time, so either statically +configure an IP address using the config variables \path{ip}, +\path{netmask}, \path{gateway}, \path{hostname}; or enable DHCP +(\path{dhcp='dhcp'}). + +Note that the Linux NFS root implementation is known to have stability +problems under high load (this is not a Xen-specific problem), so this +configuration may not be appropriate for critical servers. diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user/domain_mgmt.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/user/domain_mgmt.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,203 @@ +\chapter{Domain Management Tools} + +The previous chapter described a simple example of how to configure +and start a domain. This chapter summarises the tools available to +manage running domains. + + +\section{Command-line Management} + +Command line management tasks are also performed using the \path{xm} +tool. For online help for the commands available, type: +\begin{quote} + \verb_# xm help_ +\end{quote} + +You can also type \path{xm help $<$command$>$} for more information on +a given command. 
+ +\subsection{Basic Management Commands} + +The most important \path{xm} commands are: +\begin{quote} + \verb_# xm list_: Lists all domains running.\\ + \verb_# xm consoles_: Gives information about the domain consoles.\\ + \verb_# xm console_: Opens a console to a domain (e.g.\ + \verb_# xm console myVM_) +\end{quote} + +\subsection{\tt xm list} + +The output of \path{xm list} is in rows of the following format: +\begin{center} {\tt name domid memory cpu state cputime console} +\end{center} + +\begin{quote} + \begin{description} + \item[name] The descriptive name of the virtual machine. + \item[domid] The number of the domain ID this virtual machine is + running in. + \item[memory] Memory size in megabytes. + \item[cpu] The CPU this domain is running on. + \item[state] Domain state consists of 5 fields: + \begin{description} + \item[r] running + \item[b] blocked + \item[p] paused + \item[s] shutdown + \item[c] crashed + \end{description} + \item[cputime] How much CPU time (in seconds) the domain has used so + far. + \item[console] TCP port accepting connections to the domain's + console. + \end{description} +\end{quote} + +The \path{xm list} command also supports a long output format when the +\path{-l} switch is used. This outputs the fulls details of the +running domains in \xend's SXP configuration format. + +For example, suppose the system is running the ttylinux domain as +described earlier. The list command should produce output somewhat +like the following: +\begin{verbatim} +# xm list +Name Id Mem(MB) CPU State Time(s) Console +Domain-0 0 251 0 r---- 172.2 +ttylinux 5 63 0 -b--- 3.0 9605 +\end{verbatim} + +Here we can see the details for the ttylinux domain, as well as for +domain~0 (which, of course, is always running). Note that the console +port for the ttylinux domain is 9605. This can be connected to by TCP +using a terminal program (e.g. \path{telnet} or, better, +\path{xencons}). 
The simplest way to connect is to use the +\path{xm~console} command, specifying the domain name or ID. To +connect to the console of the ttylinux domain, we could use any of the +following: +\begin{verbatim} +# xm console ttylinux +# xm console 5 +# xencons localhost 9605 +\end{verbatim} + +\section{Domain Save and Restore} + +The administrator of a Xen system may suspend a virtual machine's +current state into a disk file in domain~0, allowing it to be resumed +at a later time. + +The ttylinux domain described earlier can be suspended to disk using +the command: +\begin{verbatim} +# xm save ttylinux ttylinux.xen +\end{verbatim} + +This will stop the domain named `ttylinux' and save its current state +into a file called \path{ttylinux.xen}. + +To resume execution of this domain, use the \path{xm restore} command: +\begin{verbatim} +# xm restore ttylinux.xen +\end{verbatim} + +This will restore the state of the domain and restart it. The domain +will carry on as before and the console may be reconnected using the +\path{xm console} command, as above. + +\section{Live Migration} + +Live migration is used to transfer a domain between physical hosts +whilst that domain continues to perform its usual activities --- from +the user's perspective, the migration should be imperceptible. + +To perform a live migration, both hosts must be running Xen / \xend\ +and the destination host must have sufficient resources (e.g.\ memory +capacity) to accommodate the domain after the move. Furthermore we +currently require both source and destination machines to be on the +same L2 subnet. + +Currently, there is no support for providing automatic remote access +to filesystems stored on local disk when a domain is migrated. +Administrators should choose an appropriate storage solution (i.e.\ +SAN, NAS, etc.) to ensure that domain filesystems are also available +on their destination node. GNBD is a good method for exporting a +volume from one machine to another. 
iSCSI can do a similar job, but is +more complex to set up. + +When a domain migrates, it's MAC and IP address move with it, thus it +is only possible to migrate VMs within the same layer-2 network and IP +subnet. If the destination node is on a different subnet, the +administrator would need to manually configure a suitable etherip or +IP tunnel in the domain~0 of the remote node. + +A domain may be migrated using the \path{xm migrate} command. To live +migrate a domain to another machine, we would use the command: + +\begin{verbatim} +# xm migrate --live mydomain destination.ournetwork.com +\end{verbatim} + +Without the \path{--live} flag, \xend\ simply stops the domain and +copies the memory image over to the new node and restarts it. Since +domains can have large allocations this can be quite time consuming, +even on a Gigabit network. With the \path{--live} flag \xend\ attempts +to keep the domain running while the migration is in progress, +resulting in typical `downtimes' of just 60--300ms. + +For now it will be necessary to reconnect to the domain's console on +the new machine using the \path{xm console} command. If a migrated +domain has any open network connections then they will be preserved, +so SSH connections do not have this limitation. + + +\section{Managing Domain Memory} + +XenLinux domains have the ability to relinquish / reclaim machine +memory at the request of the administrator or the user of the domain. + +\subsection{Setting memory footprints from dom0} + +The machine administrator can request that a domain alter its memory +footprint using the \path{xm set-mem} command. For instance, we can +request that our example ttylinux domain reduce its memory footprint +to 32 megabytes. 
+ +\begin{verbatim} +# xm set-mem ttylinux 32 +\end{verbatim} + +We can now see the result of this in the output of \path{xm list}: + +\begin{verbatim} +# xm list +Name Id Mem(MB) CPU State Time(s) Console +Domain-0 0 251 0 r---- 172.2 +ttylinux 5 31 0 -b--- 4.3 9605 +\end{verbatim} + +The domain has responded to the request by returning memory to Xen. We +can restore the domain to its original size using the command line: + +\begin{verbatim} +# xm set-mem ttylinux 64 +\end{verbatim} + +\subsection{Setting memory footprints from within a domain} + +The virtual file \path{/proc/xen/balloon} allows the owner of a domain +to adjust their own memory footprint. Reading the file (e.g.\ +\path{cat /proc/xen/balloon}) prints out the current memory footprint +of the domain. Writing the file (e.g.\ \path{echo new\_target > + /proc/xen/balloon}) requests that the kernel adjust the domain's +memory footprint to a new value. + +\subsection{Setting memory limits} + +Xen associates a memory size limit with each domain. By default, this +is the amount of memory the domain is originally started with, +preventing the domain from ever growing beyond this size. To permit a +domain to grow beyond its original allocation or to prevent a domain +you've shrunk from reclaiming the memory it relinquished, use the +\path{xm maxmem} command. diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user/glossary.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/user/glossary.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,79 @@ +\chapter{Glossary of Terms} + +\begin{description} + +\item[Atropos] One of the CPU schedulers provided by Xen. Atropos + provides domains with absolute shares of the CPU, with timeliness + guarantees and a mechanism for sharing out `slack time'. + +\item[BVT] The BVT scheduler is used to give proportional fair shares + of the CPU to domains. 
+ +\item[Exokernel] A minimal piece of privileged code, similar to a {\bf + microkernel} but providing a more `hardware-like' interface to the + tasks it manages. This is similar to a paravirtualising VMM like + {\bf Xen} but was designed as a new operating system structure, + rather than specifically to run multiple conventional OSs. + +\item[Domain] A domain is the execution context that contains a + running {\bf virtual machine}. The relationship between virtual + machines and domains on Xen is similar to that between programs and + processes in an operating system: a virtual machine is a persistent + entity that resides on disk (somewhat like a program). When it is + loaded for execution, it runs in a domain. Each domain has a {\bf + domain ID}. + +\item[Domain 0] The first domain to be started on a Xen machine. + Domain 0 is responsible for managing the system. + +\item[Domain ID] A unique identifier for a {\bf domain}, analogous to + a process ID in an operating system. + +\item[Full virtualisation] An approach to virtualisation which + requires no modifications to the hosted operating system, providing + the illusion of a complete system of real hardware devices. + +\item[Hypervisor] An alternative term for {\bf VMM}, used because it + means `beyond supervisor', since it is responsible for managing + multiple `supervisor' kernels. + +\item[Live migration] A technique for moving a running virtual machine + to another physical host, without stopping it or the services + running on it. + +\item[Microkernel] A small base of code running at the highest + hardware privilege level. A microkernel is responsible for sharing + CPU and memory (and sometimes other devices) between less privileged + tasks running on the system. This is similar to a VMM, particularly + a {\bf paravirtualising} VMM but typically addressing a different + problem space and providing different kind of interface. + +\item[NetBSD/Xen] A port of NetBSD to the Xen architecture. 
+ +\item[Paravirtualisation] An approach to virtualisation which requires + modifications to the operating system in order to run in a virtual + machine. Xen uses paravirtualisation but preserves binary + compatibility for user space applications. + +\item[Shadow pagetables] A technique for hiding the layout of machine + memory from a virtual machine's operating system. Used in some {\bf + VMMs} to provide the illusion of contiguous physical memory, in + Xen this is used during {\bf live migration}. + +\item[Virtual Machine] The environment in which a hosted operating + system runs, providing the abstraction of a dedicated machine. A + virtual machine may be identical to the underlying hardware (as in + {\bf full virtualisation}, or it may differ, as in {\bf + paravirtualisation}). + +\item[VMM] Virtual Machine Monitor - the software that allows multiple + virtual machines to be multiplexed on a single physical machine. + +\item[Xen] Xen is a paravirtualising virtual machine monitor, + developed primarily by the Systems Research Group at the University + of Cambridge Computer Laboratory. + +\item[XenLinux] Official name for the port of the Linux kernel that + runs on Xen. + +\end{description} diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user/installation.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/user/installation.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,394 @@ +\chapter{Installation} + +The Xen distribution includes three main components: Xen itself, ports +of Linux 2.4 and 2.6 and NetBSD to run on Xen, and the userspace +tools required to manage a Xen-based system. This chapter describes +how to install the Xen~2.0 distribution from source. Alternatively, +there may be pre-built packages available as part of your operating +system distribution. + + +\section{Prerequisites} +\label{sec:prerequisites} + +The following is a full list of prerequisites. 
Items marked `$\dag$' +are required by the \xend\ control tools, and hence required if you +want to run more than one virtual machine; items marked `$*$' are only +required if you wish to build from source. +\begin{itemize} +\item A working Linux distribution using the GRUB bootloader and + running on a P6-class (or newer) CPU. +\item [$\dag$] The \path{iproute2} package. +\item [$\dag$] The Linux bridge-utils\footnote{Available from {\tt + http://bridge.sourceforge.net}} (e.g., \path{/sbin/brctl}) +\item [$\dag$] An installation of Twisted~v1.3 or + above\footnote{Available from {\tt http://www.twistedmatrix.com}}. + There may be a binary package available for your distribution; + alternatively it can be installed by running `{\sl make + install-twisted}' in the root of the Xen source tree. +\item [$*$] Build tools (gcc v3.2.x or v3.3.x, binutils, GNU make). +\item [$*$] Development installation of libcurl (e.g., libcurl-devel) +\item [$*$] Development installation of zlib (e.g., zlib-dev). +\item [$*$] Development installation of Python v2.2 or later (e.g., + python-dev). +\item [$*$] \LaTeX\ and transfig are required to build the + documentation. +\end{itemize} + +Once you have satisfied the relevant prerequisites, you can now +install either a binary or source distribution of Xen. + + +\section{Installing from Binary Tarball} + +Pre-built tarballs are available for download from the Xen download +page +\begin{quote} {\tt http://xen.sf.net} +\end{quote} + +Once you've downloaded the tarball, simply unpack and install: +\begin{verbatim} +# tar zxvf xen-2.0-install.tgz +# cd xen-2.0-install +# sh ./install.sh +\end{verbatim} + +Once you've installed the binaries you need to configure your system +as described in Section~\ref{s:configure}. + + +\section{Installing from Source} + +This section describes how to obtain, build, and install Xen from +source. 
+ +\subsection{Obtaining the Source} + +The Xen source tree is available as either a compressed source tar +ball or as a clone of our master BitKeeper repository. + +\begin{description} +\item[Obtaining the Source Tarball]\mbox{} \\ + Stable versions (and daily snapshots) of the Xen source tree are + available as compressed tarballs from the Xen download page + \begin{quote} {\tt http://xen.sf.net} + \end{quote} + +\item[Using BitKeeper]\mbox{} \\ + If you wish to install Xen from a clone of our latest BitKeeper + repository then you will need to install the BitKeeper tools. + Download instructions for BitKeeper can be obtained by filling out + the form at: + \begin{quote} {\tt http://www.bitmover.com/cgi-bin/download.cgi} +\end{quote} +The public master BK repository for the 2.0 release lives at: +\begin{quote} {\tt bk://xen.bkbits.net/xen-2.0.bk} +\end{quote} +You can use BitKeeper to download it and keep it updated with the +latest features and fixes. + +Change to the directory in which you want to put the source code, then +run: +\begin{verbatim} +# bk clone bk://xen.bkbits.net/xen-2.0.bk +\end{verbatim} + +Under your current directory, a new directory named \path{xen-2.0.bk} +has been created, which contains all the source code for Xen, the OS +ports, and the control tools. You can update your repository with the +latest changes at any time by running: +\begin{verbatim} +# cd xen-2.0.bk # to change into the local repository +# bk pull # to update the repository +\end{verbatim} +\end{description} + +% \section{The distribution} +% +% The Xen source code repository is structured as follows: +% +% \begin{description} +% \item[\path{tools/}] Xen node controller daemon (Xend), command line +% tools, control libraries +% \item[\path{xen/}] The Xen VMM. +% \item[\path{linux-*-xen-sparse/}] Xen support for Linux. +% \item[\path{linux-*-patches/}] Experimental patches for Linux. +% \item[\path{netbsd-*-xen-sparse/}] Xen support for NetBSD. 
+% \item[\path{docs/}] Various documentation files for users and +% developers. +% \item[\path{extras/}] Bonus extras. +% \end{description} + +\subsection{Building from Source} + +The top-level Xen Makefile includes a target `world' that will do the +following: + +\begin{itemize} +\item Build Xen. +\item Build the control tools, including \xend. +\item Download (if necessary) and unpack the Linux 2.6 source code, + and patch it for use with Xen. +\item Build a Linux kernel to use in domain 0 and a smaller + unprivileged kernel, which can optionally be used for unprivileged + virtual machines. +\end{itemize} + +After the build has completed you should have a top-level directory +called \path{dist/} in which all resulting targets will be placed; of +particular interest are the two kernels XenLinux kernel images, one +with a `-xen0' extension which contains hardware device drivers and +drivers for Xen's virtual devices, and one with a `-xenU' extension +that just contains the virtual ones. These are found in +\path{dist/install/boot/} along with the image for Xen itself and the +configuration files used during the build. + +The NetBSD port can be built using: +\begin{quote} +\begin{verbatim} +# make netbsd20 +\end{verbatim} +\end{quote} +NetBSD port is built using a snapshot of the netbsd-2-0 cvs branch. +The snapshot is downloaded as part of the build process, if it is not +yet present in the \path{NETBSD\_SRC\_PATH} search path. The build +process also downloads a toolchain which includes all the tools +necessary to build the NetBSD kernel under Linux. + +To customize further the set of kernels built you need to edit the +top-level Makefile. 
Look for the line: + +\begin{quote} +\begin{verbatim} +KERNELS ?= mk.linux-2.6-xen0 mk.linux-2.6-xenU +\end{verbatim} +\end{quote} + +You can edit this line to include any set of operating system kernels +which have configurations in the top-level \path{buildconfigs/} +directory, for example \path{mk.linux-2.4-xenU} to build a Linux 2.4 +kernel containing only virtual device drivers. + +%% Inspect the Makefile if you want to see what goes on during a +%% build. Building Xen and the tools is straightforward, but XenLinux +%% is more complicated. The makefile needs a `pristine' Linux kernel +%% tree to which it will then add the Xen architecture files. You can +%% tell the makefile the location of the appropriate Linux compressed +%% tar file by +%% setting the LINUX\_SRC environment variable, e.g. \\ +%% \verb!# LINUX_SRC=/tmp/linux-2.6.11.tar.bz2 make world! \\ or by +%% placing the tar file somewhere in the search path of {\tt +%% LINUX\_SRC\_PATH} which defaults to `{\tt .:..}'. If the +%% makefile can't find a suitable kernel tar file it attempts to +%% download it from kernel.org (this won't work if you're behind a +%% firewall). + +%% After untaring the pristine kernel tree, the makefile uses the {\tt +%% mkbuildtree} script to add the Xen patches to the kernel. + + +%% The procedure is similar to build the Linux 2.4 port: \\ +%% \verb!# LINUX_SRC=/path/to/linux2.4/source make linux24! + + +%% \framebox{\parbox{5in}{ +%% {\bf Distro specific:} \\ +%% {\it Gentoo} --- if not using udev (most installations, +%% currently), you'll need to enable devfs and devfs mount at boot +%% time in the xen0 config. }} + +\subsection{Custom XenLinux Builds} + +% If you have an SMP machine you may wish to give the {\tt '-j4'} +% argument to make to get a parallel build. + +If you wish to build a customized XenLinux kernel (e.g. 
to support +additional devices or enable distribution-required features), you can +use the standard Linux configuration mechanisms, specifying that the +architecture being built for is \path{xen}, e.g: +\begin{quote} +\begin{verbatim} +# cd linux-2.6.11-xen0 +# make ARCH=xen xconfig +# cd .. +# make +\end{verbatim} +\end{quote} + +You can also copy an existing Linux configuration (\path{.config}) +into \path{linux-2.6.11-xen0} and execute: +\begin{quote} +\begin{verbatim} +# make ARCH=xen oldconfig +\end{verbatim} +\end{quote} + +You may be prompted with some Xen-specific options; we advise +accepting the defaults for these options. + +Note that the only difference between the two types of Linux kernel +that are built is the configuration file used for each. The `U' +suffixed (unprivileged) versions don't contain any of the physical +hardware device drivers, leading to a 30\% reduction in size; hence +you may prefer these for your non-privileged domains. The `0' +suffixed privileged versions can be used to boot the system, as well +as in driver domains and unprivileged domains. + +\subsection{Installing the Binaries} + +The files produced by the build process are stored under the +\path{dist/install/} directory. To install them in their default +locations, do: +\begin{quote} +\begin{verbatim} +# make install +\end{verbatim} +\end{quote} + +Alternatively, users with special installation requirements may wish +to install them manually by copying the files to their appropriate +destinations. 
+ +%% Files in \path{install/boot/} include: +%% \begin{itemize} +%% \item \path{install/boot/xen-2.0.gz} Link to the Xen 'kernel' +%% \item \path{install/boot/vmlinuz-2.6-xen0} Link to domain 0 +%% XenLinux kernel +%% \item \path{install/boot/vmlinuz-2.6-xenU} Link to unprivileged +%% XenLinux kernel +%% \end{itemize} + +The \path{dist/install/boot} directory will also contain the config +files used for building the XenLinux kernels, and also versions of Xen +and XenLinux kernels that contain debug symbols (\path{xen-syms-2.0.6} +and \path{vmlinux-syms-2.6.11.11-xen0}) which are essential for +interpreting crash dumps. Retain these files as the developers may +wish to see them if you post on the mailing list. + + +\section{Configuration} +\label{s:configure} + +Once you have built and installed the Xen distribution, it is simple +to prepare the machine for booting and running Xen. + +\subsection{GRUB Configuration} + +An entry should be added to \path{grub.conf} (often found under +\path{/boot/} or \path{/boot/grub/}) to allow Xen / XenLinux to boot. +This file is sometimes called \path{menu.lst}, depending on your +distribution. The entry should look something like the following: + +{\small +\begin{verbatim} +title Xen 2.0 / XenLinux 2.6 + kernel /boot/xen-2.0.gz dom0_mem=131072 + module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro console=tty0 +\end{verbatim} +} + +The kernel line tells GRUB where to find Xen itself and what boot +parameters should be passed to it (in this case, setting domain 0's +memory allocation in kilobytes and the settings for the serial port). +For more details on the various Xen boot parameters see +Section~\ref{s:xboot}. 
+ +The module line of the configuration describes the location of the +XenLinux kernel that Xen should start and the parameters that should +be passed to it (these are standard Linux parameters, identifying the +root device and specifying it be initially mounted read only and +instructing that console output be sent to the screen). Some +distributions such as SuSE do not require the \path{ro} parameter. + +%% \framebox{\parbox{5in}{ +%% {\bf Distro specific:} \\ +%% {\it SuSE} --- Omit the {\tt ro} option from the XenLinux +%% kernel command line, since the partition won't be remounted rw +%% during boot. }} + + +If you want to use an initrd, just add another \path{module} line to +the configuration, as usual: + +{\small +\begin{verbatim} + module /boot/my_initrd.gz +\end{verbatim} +} + +As always when installing a new kernel, it is recommended that you do +not delete existing menu options from \path{menu.lst} --- you may want +to boot your old Linux kernel in future, particularly if you have +problems. + +\subsection{Serial Console (optional)} + +%% kernel /boot/xen-2.0.gz dom0_mem=131072 com1=115200,8n1 +%% module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro + + +In order to configure Xen serial console output, it is necessary to +add a boot option to your GRUB config; e.g.\ replace the above kernel +line with: +\begin{quote} +{\small +\begin{verbatim} + kernel /boot/xen.gz dom0_mem=131072 com1=115200,8n1 +\end{verbatim}} +\end{quote} + +This configures Xen to output on COM1 at 115,200 baud, 8 data bits, 1 +stop bit and no parity. Modify these parameters for your set up. + +One can also configure XenLinux to share the serial console; to +achieve this append ``\path{console=ttyS0}'' to your module line. + +If you wish to be able to log in over the XenLinux serial console it +is necessary to add a line into \path{/etc/inittab}, just as per +regular Linux. 
Simply add the line: +\begin{quote} {\small {\tt c:2345:respawn:/sbin/mingetty ttyS0}} +\end{quote} + +and you should be able to log in. Note that to successfully log in as +root over the serial line will require adding \path{ttyS0} to +\path{/etc/securetty} in most modern distributions. + +\subsection{TLS Libraries} + +Users of the XenLinux 2.6 kernel should disable Thread Local Storage +(e.g.\ by doing a \path{mv /lib/tls /lib/tls.disabled}) before +attempting to run with a XenLinux kernel\footnote{If you boot without + first disabling TLS, you will get a warning message during the boot + process. In this case, simply perform the rename after the machine + is up and then run \texttt{/sbin/ldconfig} to make it take effect.}. +You can always reenable it by restoring the directory to its original +location (i.e.\ \path{mv /lib/tls.disabled /lib/tls}). + +The reason for this is that the current TLS implementation uses +segmentation in a way that is not permissible under Xen. If TLS is +not disabled, an emulation mode is used within Xen which reduces +performance substantially. + +We hope that this issue can be resolved by working with Linux +distribution vendors to implement a minor backward-compatible change +to the TLS library. + + +\section{Booting Xen} + +It should now be possible to restart the system and use Xen. Reboot +as usual but choose the new Xen option when the Grub screen appears. + +What follows should look much like a conventional Linux boot. The +first portion of the output comes from Xen itself, supplying low level +information about itself and the machine it is running on. The +following portion of the output comes from XenLinux. + +You may see some errors during the XenLinux boot. These are not +necessarily anything to worry about --- they may result from kernel +configuration differences between your XenLinux kernel and the one you +usually use. + +When the boot completes, you should be able to log into your system as +usual. 
If you are unable to log in to your system running Xen, you +should still be able to reboot with your normal Linux kernel. diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user/introduction.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/user/introduction.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,143 @@ +\chapter{Introduction} + + +Xen is a \emph{paravirtualising} virtual machine monitor (VMM), or +`hypervisor', for the x86 processor architecture. Xen can securely +execute multiple virtual machines on a single physical system with +close-to-native performance. The virtual machine technology +facilitates enterprise-grade functionality, including: + +\begin{itemize} +\item Virtual machines with performance close to native hardware. +\item Live migration of running virtual machines between physical + hosts. +\item Excellent hardware support (supports most Linux device drivers). +\item Sandboxed, re-startable device drivers. +\end{itemize} + +Paravirtualisation permits very high performance virtualisation, even +on architectures like x86 that are traditionally very hard to +virtualise. + +The drawback of this approach is that it requires operating systems to +be \emph{ported} to run on Xen. Porting an OS to run on Xen is +similar to supporting a new hardware platform, however the process is +simplified because the paravirtual machine architecture is very +similar to the underlying native hardware. Even though operating +system kernels must explicitly support Xen, a key feature is that user +space applications and libraries \emph{do not} require modification. + +Xen support is available for increasingly many operating systems: +right now, Linux 2.4, Linux 2.6 and NetBSD are available for Xen 2.0. +A FreeBSD port is undergoing testing and will be incorporated into the +release soon. Other OS ports, including Plan 9, are in progress. 
We +hope that the arch-xen patches will be incorporated into the +mainstream releases of these operating systems in due course (as has +already happened for NetBSD). + +Possible usage scenarios for Xen include: + +\begin{description} +\item [Kernel development.] Test and debug kernel modifications in a + sandboxed virtual machine --- no need for a separate test machine. +\item [Multiple OS configurations.] Run multiple operating systems + simultaneously, for instance for compatibility or QA purposes. +\item [Server consolidation.] Move multiple servers onto a single + physical host with performance and fault isolation provided at + virtual machine boundaries. +\item [Cluster computing.] Management at VM granularity provides more + flexibility than separately managing each physical host, but better + control and isolation than single-system image solutions, + particularly by using live migration for load balancing. +\item [Hardware support for custom OSes.] Allow development of new + OSes while benefiting from the wide-ranging hardware support of + existing OSes such as Linux. +\end{description} + + +\section{Structure of a Xen-Based System} + +A Xen system has multiple layers, the lowest and most privileged of +which is Xen itself. + +Xen in turn may host multiple \emph{guest} operating systems, each of +which is executed within a secure virtual machine (in Xen terminology, +a \emph{domain}). Domains are scheduled by Xen to make effective use +of the available physical CPUs. Each guest OS manages its own +applications, which includes responsibility for scheduling each +application within the time allotted to the VM by Xen. + +The first domain, \emph{domain 0}, is created automatically when the +system boots and has special management privileges. Domain 0 builds +other domains and manages their virtual devices. It also performs +administrative tasks such as suspending, resuming and migrating other +virtual machines. 
+ +Within domain 0, a process called \emph{xend} runs to manage the +system. \Xend is responsible for managing virtual machines and +providing access to their consoles. Commands are issued to \xend over +an HTTP interface, either from a command-line tool or from a web +browser. + + +\section{Hardware Support} + +Xen currently runs only on the x86 architecture, requiring a `P6' or +newer processor (e.g. Pentium Pro, Celeron, Pentium II, Pentium III, +Pentium IV, Xeon, AMD Athlon, AMD Duron). Multiprocessor machines are +supported, and we also have basic support for HyperThreading (SMT), +although this remains a topic for ongoing research. A port +specifically for x86/64 is in progress, although Xen already runs on +such systems in 32-bit legacy mode. In addition a port to the IA64 +architecture is approaching completion. We hope to add other +architectures such as PPC and ARM in due course. + +Xen can currently use up to 4GB of memory. It is possible for x86 +machines to address up to 64GB of physical memory but there are no +current plans to support these systems: The x86/64 port is the planned +route to supporting larger memory sizes. + +Xen offloads most of the hardware support issues to the guest OS +running in Domain~0. Xen itself contains only the code required to +detect and start secondary processors, set up interrupt routing, and +perform PCI bus enumeration. Device drivers run within a privileged +guest OS rather than within Xen itself. This approach provides +compatibility with the majority of device hardware supported by Linux. +The default XenLinux build contains support for relatively modern +server-class network and disk hardware, but you can add support for +other hardware by configuring your XenLinux kernel in the normal way. + + +\section{History} + +Xen was originally developed by the Systems Research Group at the +University of Cambridge Computer Laboratory as part of the XenoServers +project, funded by the UK-EPSRC. 
+ +XenoServers aim to provide a `public infrastructure for global +distributed computing', and Xen plays a key part in that, allowing us +to efficiently partition a single machine to enable multiple +independent clients to run their operating systems and applications in +an environment providing protection, resource isolation and +accounting. The project web page contains further information along +with pointers to papers and technical reports: +\path{http://www.cl.cam.ac.uk/xeno} + +Xen has since grown into a fully-fledged project in its own right, +enabling us to investigate interesting research issues regarding the +best techniques for virtualising resources such as the CPU, memory, +disk and network. The project has been bolstered by support from +Intel Research Cambridge, and HP Labs, who are now working closely +with us. + +Xen was first described in a paper presented at SOSP in +2003\footnote{\tt + http://www.cl.cam.ac.uk/netos/papers/2003-xensosp.pdf}, and the +first public release (1.0) was made that October. Since then, Xen has +significantly matured and is now used in production scenarios on many +sites. + +Xen 2.0 features greatly enhanced hardware support, configuration +flexibility, usability and a larger complement of supported operating +systems. This latest release takes Xen a step closer to becoming the +definitive open source solution for virtualisation. 
diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user/redhat.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/user/redhat.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,61 @@ +\chapter{Installing Xen / XenLinux on Red~Hat or Fedora Core} + +When using Xen / XenLinux on a standard Linux distribution there are a +couple of things to watch out for: + +Note that, because domains greater than 0 don't have any privileged +access at all, certain commands in the default boot sequence will fail +e.g.\ attempts to update the hwclock, change the console font, update +the keytable map, start apmd (power management), or gpm (mouse +cursor). Either ignore the errors (they should be harmless), or +remove them from the startup scripts. Deleting the following links +is a good start: {\path{S24pcmcia}}, {\path{S09isdn}}, +{\path{S17keytable}}, {\path{S26apmd}}, {\path{S85gpm}}. + +If you want to use a single root file system that works cleanly for +both domain~0 and unprivileged domains, a useful trick is to use +different `init' run levels. For example, use run level 3 for +domain~0, and run level 4 for other domains. This enables different +startup scripts to be run depending on the run level number passed +on the kernel command line. + +If using NFS root file systems mounted either from an external server +or from domain0 there are a couple of other gotchas. The default +{\path{/etc/sysconfig/iptables}} rules block NFS, so part way through +the boot sequence things will suddenly go dead. + +If you're planning on having a separate NFS {\path{/usr}} partition, +the RH9 boot scripts don't make life easy - they attempt to mount NFS +file systems way too late in the boot process. 
The easiest way I found +to do this was to have a {\path{/linuxrc}} script run ahead of +{\path{/sbin/init}} that mounts {\path{/usr}}: + +\begin{quote} + \begin{small}\begin{verbatim} + #!/bin/bash + /sbin/ifconfig lo 127.0.0.1 + /sbin/portmap + /bin/mount /usr + exec /sbin/init "$@" <>/dev/console 2>&1 +\end{verbatim}\end{small} +\end{quote} + +%% $ XXX SMH: font lock fix :-) + +The one slight complication with the above is that +{\path{/sbin/portmap}} is dynamically linked against +{\path{/usr/lib/libwrap.so.0}}. Since this is in {\path{/usr}}, it +won't work. This can be solved by copying the file (and link) below +the {\path{/usr}} mount point, and just let the file be `covered' when +the mount happens. + +In some installations, where a shared read-only {\path{/usr}} is being +used, it may be desirable to move other large directories over into +the read-only {\path{/usr}}. For example, you might replace +{\path{/bin}}, {\path{/lib}} and {\path{/sbin}} with links into +{\path{/usr/root/bin}}, {\path{/usr/root/lib}} and +{\path{/usr/root/sbin}} respectively. This creates other problems for +running the {\path{/linuxrc}} script, requiring bash, portmap, mount, +ifconfig, and a handful of other shared libraries to be copied below +the mount point --- a simple statically-linked C program would solve +this problem. diff -r 97dbd9524a7e -r 06d84bf87159 docs/src/user/start_addl_dom.tex --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/docs/src/user/start_addl_dom.tex Thu Sep 22 17:42:01 2005 @@ -0,0 +1,172 @@ +\chapter{Starting Additional Domains} + +The first step in creating a new domain is to prepare a root +filesystem for it to boot from. Typically, this might be stored in a +normal partition, an LVM or other volume manager partition, a disk +file or on an NFS server. A simple way to do this is simply to boot +from your standard OS install CD and install the distribution into +another partition on your hard drive. 
+ +To start the \xend\ control daemon, type +\begin{quote} + \verb!# xend start! +\end{quote} + +If you wish the daemon to start automatically, see the instructions in +Section~\ref{s:xend}. Once the daemon is running, you can use the +\path{xm} tool to monitor and maintain the domains running on your +system. This chapter provides only a brief tutorial. We provide full +details of the \path{xm} tool in the next chapter. + +% \section{From the web interface} +% +% Boot the Xen machine and start Xensv (see Chapter~\ref{cha:xensv} +% for more details) using the command: \\ +% \verb_# xensv start_ \\ +% This will also start Xend (see Chapter~\ref{cha:xend} for more +% information). +% +% The domain management interface will then be available at {\tt +% http://your\_machine:8080/}. This provides a user friendly wizard +% for starting domains and functions for managing running domains. +% +% \section{From the command line} + + +\section{Creating a Domain Configuration File} + +Before you can start an additional domain, you must create a +configuration file. We provide two example files which you can use as +a starting point: +\begin{itemize} +\item \path{/etc/xen/xmexample1} is a simple template configuration + file for describing a single VM. + +\item \path{/etc/xen/xmexample2} file is a template description that + is intended to be reused for multiple virtual machines. Setting the + value of the \path{vmid} variable on the \path{xm} command line + fills in parts of this template. +\end{itemize} + +Copy one of these files and edit it as appropriate. 
Typical values +you may wish to edit include: + +\begin{quote} +\begin{description} +\item[kernel] Set this to the path of the kernel you compiled for use + with Xen (e.g.\ \path{kernel = `/boot/vmlinuz-2.6-xenU'}) +\item[memory] Set this to the size of the domain's memory in megabytes + (e.g.\ \path{memory = 64}) +\item[disk] Set the first entry in this list to calculate the offset + of the domain's root partition, based on the domain ID. Set the + second to the location of \path{/usr} if you are sharing it between + domains (e.g.\ \path{disk = [`phy:your\_hard\_drive\%d,sda1,w' \% + (base\_partition\_number + vmid), + `phy:your\_usr\_partition,sda6,r' ]} +\item[dhcp] Uncomment the dhcp variable, so that the domain will + receive its IP address from a DHCP server (e.g.\ \path{dhcp=`dhcp'}) +\end{description} +\end{quote} + +You may also want to edit the {\bf vif} variable in order to choose +the MAC address of the virtual ethernet interface yourself. For +example: +\begin{quote} +\verb_vif = [`mac=00:06:AA:F6:BB:B3']_ +\end{quote} +If you do not set this variable, \xend\ will automatically generate a +random MAC address from an unused range. + + +\section{Booting the Domain} + +The \path{xm} tool provides a variety of commands for managing +domains. Use the \path{create} command to start new domains. Assuming +you've created a configuration file \path{myvmconf} based around +\path{/etc/xen/xmexample2}, to start a domain with virtual machine +ID~1 you should type: + +\begin{quote} +\begin{verbatim} +# xm create -c myvmconf vmid=1 +\end{verbatim} +\end{quote} + +The \path{-c} switch causes \path{xm} to turn into the domain's +console after creation. The \path{vmid=1} sets the \path{vmid} +variable used in the \path{myvmconf} file. + +You should see the console boot messages from the new domain appearing +in the terminal in which you typed the command, culminating in a login +prompt. 
+ + +\section{Example: ttylinux} + +Ttylinux is a very small Linux distribution, designed to require very +few resources. We will use it as a concrete example of how to start a +Xen domain. Most users will probably want to install a full-featured +distribution once they have mastered the basics\footnote{ttylinux is + maintained by Pascal Schmidt. You can download source packages from + the distribution's home page: {\tt + http://www.minimalinux.org/ttylinux/}}. + +\begin{enumerate} +\item Download and extract the ttylinux disk image from the Files + section of the project's SourceForge site (see + \path{http://sf.net/projects/xen/}). +\item Create a configuration file like the following: +\begin{verbatim} +kernel = "/boot/vmlinuz-2.6-xenU" +memory = 64 +name = "ttylinux" +nics = 1 +ip = "1.2.3.4" +disk = ['file:/path/to/ttylinux/rootfs,sda1,w'] +root = "/dev/sda1 ro" +\end{verbatim} +\item Now start the domain and connect to its console: +\begin{verbatim} +xm create configfile -c +\end{verbatim} +\item Login as root, password root. +\end{enumerate} + + +\section{Starting / Stopping Domains Automatically} + +It is possible to have certain domains start automatically at boot +time and to have dom0 wait for all running domains to shutdown before +it shuts down the system. + +To specify that a domain is to start at boot-time, place its configuration +file (or a link to it) under \path{/etc/xen/auto/}. + +A Sys-V style init script for Red Hat and LSB-compliant systems is +provided and will be automatically copied to \path{/etc/init.d/} +during install. You can then enable it in the appropriate way for +your distribution. + +For instance, on Red Hat: + +\begin{quote} + \verb_# chkconfig --add xendomains_ +\end{quote} + +By default, this will start the boot-time domains in runlevels 3, 4 +and 5. 
+ +You can also use the \path{service} command to run this script +manually, e.g: + +\begin{quote} + \verb_# service xendomains start_ + + Starts all the domains with config files under /etc/xen/auto/. +\end{quote} + +\begin{quote} + \verb_# service xendomains stop_ + + Shuts down ALL running Xen domains. +\end{quote} diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/util.c --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/util.c Thu Sep 22 17:42:01 2005 @@ -0,0 +1,73 @@ +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <asm/uaccess.h> +#include <asm-xen/driver_util.h> + +static int f(pte_t *pte, struct page *pte_page, unsigned long addr, void *data) +{ + /* generic_page_range() does all the hard work. */ + return 0; +} + +struct vm_struct *alloc_vm_area(unsigned long size) +{ + struct vm_struct *area; + + area = get_vm_area(size, VM_IOREMAP); + if (area == NULL) + return NULL; + + /* + * This ensures that page tables are constructed for this region + * of kernel virtual address space and mapped into init_mm. + */ + if (generic_page_range(&init_mm, (unsigned long)area->addr, + area->size, f, NULL)) { + free_vm_area(area); + return NULL; + } + + return area; +} + +void free_vm_area(struct vm_struct *area) +{ + BUG_ON(remove_vm_area(area->addr) != area); + kfree(area); +} + +void lock_vm_area(struct vm_struct *area) +{ + unsigned long i; + char c; + + /* + * Prevent context switch to a lazy mm that doesn't have this area + * mapped into its page tables. + */ + preempt_disable(); + + /* + * Ensure that the page tables are mapped into the current mm. The + * page-fault path will copy the page directory pointers from init_mm. 
+ */ + for (i = 0; i < area->size; i += PAGE_SIZE) + (void)__get_user(c, (char *)area->addr + i); +} + +void unlock_vm_area(struct vm_struct *area) +{ + preempt_enable(); +} + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/include/asm-xen/driver_util.h --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/driver_util.h Thu Sep 22 17:42:01 2005 @@ -0,0 +1,16 @@ + +#ifndef __ASM_XEN_DRIVER_UTIL_H__ +#define __ASM_XEN_DRIVER_UTIL_H__ + +#include <linux/config.h> +#include <linux/vmalloc.h> + +/* Allocate/destroy a 'vmalloc' VM area. */ +extern struct vm_struct *alloc_vm_area(unsigned long size); +extern void free_vm_area(struct vm_struct *area); + +/* Lock an area so that PTEs are accessible in the current address space. */ +extern void lock_vm_area(struct vm_struct *area); +extern void unlock_vm_area(struct vm_struct *area); + +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */ diff -r 97dbd9524a7e -r 06d84bf87159 patches/linux-2.6.12/tpm_partial_read.patch --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/patches/linux-2.6.12/tpm_partial_read.patch Thu Sep 22 17:42:01 2005 @@ -0,0 +1,74 @@ +--- ref-linux-2.6.12/drivers/char/tpm/tpm.c 2005-06-17 15:48:29.000000000 -0400 ++++ linux-2.6-xen-sparse/drivers/char/tpm/tpm.c 2005-09-15 14:56:05.000000000 -0400 +@@ -473,6 +401,7 @@ ssize_t tpm_write(struct file * file, co + out_size = tpm_transmit(chip, chip->data_buffer, TPM_BUFSIZE); + + atomic_set(&chip->data_pending, out_size); ++ atomic_set(&chip->data_position, 0); + up(&chip->buffer_mutex); + + /* Set a timeout by which the reader must come claim the result */ +@@ -494,29 +423,34 @@ ssize_t tpm_read(struct file * file, cha + { + struct tpm_chip *chip = file->private_data; + int ret_size = -ENODATA; ++ int pos, pending = 0; + +- if (atomic_read(&chip->data_pending) != 0) { /* Result available */ ++ 
down(&chip->buffer_mutex); ++ ret_size = atomic_read(&chip->data_pending); ++ if ( ret_size > 0 ) { /* Result available */ ++ if (size < ret_size) ++ ret_size = size; ++ ++ pos = atomic_read(&chip->data_position); ++ ++ if (copy_to_user((void __user *) buf, ++ &chip->data_buffer[pos], ret_size)) { ++ ret_size = -EFAULT; ++ } else { ++ pending = atomic_read(&chip->data_pending) - ret_size; ++ if ( pending ) { ++ atomic_set( &chip->data_pending, pending ); ++ atomic_set( &chip->data_position, pos+ret_size ); ++ } ++ } ++ } ++ up(&chip->buffer_mutex); ++ ++ if ( ret_size <= 0 || pending == 0 ) { ++ atomic_set( &chip->data_pending, 0 ); + down(&chip->timer_manipulation_mutex); + del_singleshot_timer_sync(&chip->user_read_timer); + up(&chip->timer_manipulation_mutex); +- +- down(&chip->buffer_mutex); +- +- ret_size = atomic_read(&chip->data_pending); +- atomic_set(&chip->data_pending, 0); +- +- if (ret_size == 0) /* timeout just occurred */ +- ret_size = -ETIME; +- else if (ret_size > 0) { /* relay data */ +- if (size < ret_size) +- ret_size = size; +- +- if (copy_to_user((void __user *) buf, +- chip->data_buffer, ret_size)) { +- ret_size = -EFAULT; +- } +- } +- up(&chip->buffer_mutex); + } + + return ret_size; +--- ref-linux-2.6.12/drivers/char/tpm/tpm.h 2005-06-17 15:48:29.000000000 -0400 ++++ linux-2.6-xen-sparse/drivers/char/tpm/tpm.h 2005-09-15 14:56:05.000000000 -0400 +@@ -54,6 +54,7 @@ struct tpm_chip { + /* Data passed to and from the tpm via the read/write calls */ + u8 *data_buffer; + atomic_t data_pending; ++ atomic_t data_position; + struct semaphore buffer_mutex; + + struct timer_list user_read_timer; /* user needs to claim result */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/debugger/gdb/README --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/tools/debugger/gdb/README Thu Sep 22 17:42:01 2005 @@ -0,0 +1,29 @@ + +DomU GDB server for 32-bit (PAE and non-PAE) systems +---------------------------------------------------- + +Lines marked below with [*] are 
optional, if you want full +source-level debugging of your kernel image. + +To build the GDB server: + 1. Run ./gdbbuild from within this directory. + 2. Copy ./gdb-6.2.1-linux-i386-xen/gdb/gdbserver/gdbserver-xen + to your test machine. + +To build a debuggable guest kernel image: + 1. cd linux-2.6.12-xenU + 2. ARCH=xen make menuconfig + 3. From within the configurator, enable the following options: + # Kernel hacking -> Compile the kernel with debug info [*] + -> Compile the kernel with frame pointers + 4. (Re)build and (re)install your xenU kernel image. + +To debug a running guest: + 1. Use 'xm list' to discover its domain id ($domid). + 2. Run 'gdbserver-xen 127.0.0.1:9999 --attach $domid' + 3. Run 'gdb /path/to/vmlinux-syms-2.6.xx-xenU' + 4. From within the gdb client session: + # directory /path/to/linux-2.6.xx-xenU [*] + # target remote 127.0.0.1:9999 + # bt + # disass diff -r 97dbd9524a7e -r 06d84bf87159 tools/firmware/vmxassist/acpi_madt.c --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/tools/firmware/vmxassist/acpi_madt.c Thu Sep 22 17:42:01 2005 @@ -0,0 +1,145 @@ +/* + * acpi_madt.c: Update ACPI MADT table for multiple processor guest. + * + * Yu Ke, ke.yu@xxxxxxxxx + * Copyright (c) 2005, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ */ +#include "../acpi/acpi2_0.h" +#include "../acpi/acpi_madt.h" + +#define NULL ((void*)0) + +extern int puts(const char *s); + +#define VCPU_MAGIC 0x76637075 /* "vcpu" */ + +/* xc_vmx_builder wrote vcpu block at 0x9F800. Return it. */ +static int +get_vcpus(void) +{ + unsigned long *vcpus; + + vcpus = (unsigned long *)0x9F800; + if (vcpus[0] != VCPU_MAGIC) { + puts("Bad vcpus magic, set vcpu number=1\n"); + return 1; + } + + return vcpus[1]; +} + +static void * +acpi_madt_get_madt(unsigned char *acpi_start) +{ + ACPI_2_0_RSDP *rsdp=NULL; + ACPI_2_0_RSDT *rsdt=NULL; + ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE *madt; + + rsdp = (ACPI_2_0_RSDP *)(acpi_start + sizeof(ACPI_2_0_FACS)); + if (rsdp->Signature != ACPI_2_0_RSDP_SIGNATURE) { + puts("Bad RSDP signature\n"); + return NULL; + } + + rsdt= (ACPI_2_0_RSDT *) + (acpi_start + rsdp->RsdtAddress - ACPI_PHYSICAL_ADDRESS); + if (rsdt->Header.Signature != ACPI_2_0_RSDT_SIGNATURE) { + puts("Bad RSDT signature\n"); + return NULL; + } + + madt = (ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE *) + ( acpi_start+ rsdt->Entry[1] - ACPI_PHYSICAL_ADDRESS); + if (madt->Header.Header.Signature != + ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE_SIGNATURE) { + puts("Bad MADT signature \n"); + return NULL; + } + + return madt; +} + +static void +set_checksum(void *start, int checksum_offset, int len) +{ + unsigned char sum = 0; + unsigned char *ptr; + + ptr = start; + ptr[checksum_offset] = 0; + while (len--) + sum += *ptr++; + + ptr = start; + ptr[checksum_offset] = -sum; +} + +static int +acpi_madt_set_local_apics( + int nr_vcpu, + ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE *madt) +{ + int i; + + if ((nr_vcpu > MAX_VIRT_CPUS) || (nr_vcpu < 0) || !madt) + return -1; + + for (i = 0; i < nr_vcpu; i++) { + madt->LocalApic[i].Type = ACPI_PROCESSOR_LOCAL_APIC; + madt->LocalApic[i].Length = sizeof (ACPI_LOCAL_APIC_STRUCTURE); + madt->LocalApic[i].AcpiProcessorId = i; + madt->LocalApic[i].ApicId = i; + madt->LocalApic[i].Flags = 1; + } + + madt->Header.Header.Length 
= + sizeof(ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE) - + (MAX_VIRT_CPUS - nr_vcpu)* sizeof(ACPI_LOCAL_APIC_STRUCTURE); + + return 0; +} + +#define FIELD_OFFSET(TYPE,Field) ((unsigned int)(&(((TYPE *) 0)->Field))) + +int acpi_madt_update(unsigned char *acpi_start) +{ + int rc; + ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE *madt; + + madt = acpi_madt_get_madt(acpi_start); + if (!madt) + return -1; + + rc = acpi_madt_set_local_apics(get_vcpus(), madt); + if (rc != 0) + return rc; + + set_checksum( + madt, FIELD_OFFSET(ACPI_TABLE_HEADER, Checksum), + madt->Header.Header.Length); + + return 0; +} + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/DevController.py --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/tools/python/xen/xend/server/DevController.py Thu Sep 22 17:42:01 2005 @@ -0,0 +1,203 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +# Copyright (C) 2005 XenSource Ltd +#============================================================================ + + +from xen.xend import sxp +from xen.xend.XendError import VmError +from xen.xend.XendLogging import log +from xen.xend.xenstore.xstransact import xstransact + + +class DevController: + """Abstract base class for a device controller. Device controllers create + appropriate entries in the store to trigger the creation, reconfiguration, + and destruction of devices in guest domains. Each subclass of + DevController is responsible for a particular device-class, and + understands the details of configuration specific to that device-class. + + DevController itself provides the functionality common to all device + creation tasks, as well as providing an interface to XendDomainInfo for + triggering those events themselves. + """ + + # Set when registered. + deviceClass = None + + + ## public: + + def __init__(self, vm): + self.vm = vm + + + def createDevice(self, config): + """Trigger the creation of a device with the given configuration. + + @return The ID for the newly created device. + """ + (devid, back, front) = self.getDeviceDetails(config) + + self.writeDetails(config, devid, back, front) + + return devid + + + def reconfigureDevice(self, devid, config): + """Reconfigure the specified device. + + The implementation here just raises VmError. This may be overridden + by those subclasses that can reconfigure their devices. + """ + raise VmError('%s devices may not be reconfigured' % self.deviceClass) + + + def destroyDevice(self, devid): + """Destroy the specified device. 
+ + The implementation here simply deletes the appropriate paths from + the store. This may be overridden by subclasses who need to perform + other tasks on destruction. + """ + + frontpath = self.frontendPath(devid) + backpath = xstransact.Read("%s/backend" % frontpath) + + xstransact.Remove(frontpath) + xstransact.Remove(backpath) + + + def sxpr(self, devid): + """@return an s-expression describing the specified device. + """ + return [self.deviceClass, ['dom', self.vm.getDomid(), + 'id', devid]] + + + ## protected: + + def getDeviceDetails(self, config): + """Compute the details for creation of a device corresponding to the + given configuration. These details consist of a tuple of (devID, + backDetails, frontDetails), where devID is the ID for the new device, + and backDetails and frontDetails are the device configuration + specifics for the backend and frontend respectively. + + backDetails and frontDetails should be dictionaries, the keys and + values of which will be used as paths in the store. There is no need + for these dictionaries to include the references from frontend to + backend, nor vice versa, as these will be handled by DevController. + + Abstract; must be implemented by every subclass. + + @return (devID, backDetails, frontDetails), as specified above. + """ + + raise NotImplementedError() + + + def getDomid(self): + """Stub to {@link XendDomainInfo.getDomid}, for use by our + subclasses. + """ + return self.vm.getDomid() + + + def allocateDeviceID(self): + """Allocate a device ID, allocating them consecutively on a + per-domain, per-device-class basis, and using the store to record the + next available ID. + + This method is available to our subclasses, though it is not + compulsory to use it; subclasses may prefer to allocate IDs based upon + the device configuration instead. 
+ """ + path = self.frontendMiscPath() + t = xstransact(path) + try: + result = t.read("nextDeviceID") + if result: + result = int(result) + else: + result = 1 + t.write("nextDeviceID", str(result + 1)) + t.commit() + return result + except: + t.abort() + raise + + + ## private: + + def writeDetails(self, config, devid, backDetails, frontDetails): + """Write the details in the store to trigger creation of a device. + The backend domain ID is taken from the given config, paths for + frontend and backend are computed, and these are written to the store + appropriately, including references from frontend to backend and vice + versa. + + @param config The configuration of the device, as given to + {@link #createDevice}. + @param devid As returned by {@link #getDeviceDetails}. + @param backDetails As returned by {@link #getDeviceDetails}. + @param frontDetails As returned by {@link #getDeviceDetails}. + """ + + import xen.xend.XendDomain + backdom = xen.xend.XendDomain.instance().domain_lookup_by_name( + sxp.child_value(config, 'backend', '0')) + + frontpath = self.frontendPath(devid) + backpath = self.backendPath(backdom, devid) + + frontDetails.update({ + 'backend' : backpath, + 'backend-id' : "%i" % backdom.getDomid() + }) + + + backDetails.update({ + 'domain' : self.vm.getName(), + 'frontend' : frontpath, + 'frontend-id' : "%i" % self.vm.getDomid() + }) + + log.debug('DevController: writing %s to %s.', str(frontDetails), + frontpath) + log.debug('DevController: writing %s to %s.', str(backDetails), + backpath) + + xstransact.Write(frontpath, frontDetails) + xstransact.Write(backpath, backDetails) + + + def backendPath(self, backdom, devid): + """@param backdom [XendDomainInfo] The backend domain info.""" + + return "%s/backend/%s/%s/%d" % (backdom.getPath(), + self.deviceClass, + self.vm.getUuid(), devid) + + + def frontendPath(self, devid): + return "%s/device/%s/%d" % (self.vm.getPath(), self.deviceClass, + devid) + + + def frontendMiscPath(self): + return 
"%s/device-misc/%s" % (self.vm.getPath(), self.deviceClass) diff -r 97dbd9524a7e -r 06d84bf87159 tools/vtpm/tpm_emulator-0.2b-x86_64.patch --- /dev/null Thu Sep 22 17:34:14 2005 +++ b/tools/vtpm/tpm_emulator-0.2b-x86_64.patch Thu Sep 22 17:42:01 2005 @@ -0,0 +1,499 @@ +diff -uprN tpm_emulator-0.2/crypto/gmp_kernel_wrapper.c tpm_emulator-0.2-x86_64/crypto/gmp_kernel_wrapper.c +--- tpm_emulator-0.2/crypto/gmp_kernel_wrapper.c 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/crypto/gmp_kernel_wrapper.c 2005-09-19 14:10:29.000000000 -0700 +@@ -79,7 +79,7 @@ void __attribute__ ((regparm(0))) *kerne + { + void *ret = (void*)kmalloc(size, GFP_KERNEL); + if (!ret) panic(KERN_CRIT TPM_MODULE_NAME +- "GMP: cannot allocate memory (size=%u)\n", size); ++ "GMP: cannot allocate memory (size=%Zu)\n", size); + return ret; + } + +@@ -88,7 +88,7 @@ void __attribute__ ((regparm(0))) *kerne + { + void *ret = (void*)kmalloc(new_size, GFP_KERNEL); + if (!ret) panic(KERN_CRIT TPM_MODULE_NAME "GMP: Cannot reallocate memory " +- "(old_size=%u new_size=%u)\n", old_size, new_size); ++ "(old_size=%Zu new_size=%Zu)\n", old_size, new_size); + memcpy(ret, oldptr, old_size); + kfree(oldptr); + return ret; +diff -uprN tpm_emulator-0.2/linux_module.c tpm_emulator-0.2-x86_64/linux_module.c +--- tpm_emulator-0.2/linux_module.c 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/linux_module.c 2005-09-19 14:10:29.000000000 -0700 +@@ -66,7 +66,7 @@ static int tpm_release(struct inode *ino + + static ssize_t tpm_read(struct file *file, char *buf, size_t count, loff_t *ppos) + { +- debug("%s(%d)", __FUNCTION__, count); ++ debug("%s(%Zu)", __FUNCTION__, count); + down(&tpm_mutex); + if (tpm_response.data != NULL) { + count = min(count, (size_t)tpm_response.size - (size_t)*ppos); +@@ -81,7 +81,7 @@ static ssize_t tpm_read(struct file *fil + + static ssize_t tpm_write(struct file *file, const char *buf, size_t count, loff_t *ppos) + { +- debug("%s(%d)", __FUNCTION__, count); ++ 
debug("%s(%Zu)", __FUNCTION__, count); + down(&tpm_mutex); + *ppos = 0; + if (tpm_response.data != NULL) kfree(tpm_response.data); +diff -uprN tpm_emulator-0.2/linux_module.h tpm_emulator-0.2-x86_64/linux_module.h +--- tpm_emulator-0.2/linux_module.h 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/linux_module.h 2005-09-19 14:10:29.000000000 -0700 +@@ -28,8 +28,10 @@ + + /* module settings */ + ++#ifndef STR + #define STR(s) __STR__(s) + #define __STR__(s) #s ++#endif + #include "tpm_version.h" + + #define TPM_DEVICE_MINOR 224 +diff -uprN tpm_emulator-0.2/Makefile tpm_emulator-0.2-x86_64/Makefile +--- tpm_emulator-0.2/Makefile 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/Makefile 2005-09-19 14:10:29.000000000 -0700 +@@ -7,6 +7,7 @@ + KERNEL_RELEASE := $(shell uname -r) + KERNEL_BUILD := /lib/modules/$(KERNEL_RELEASE)/build + MOD_SUBDIR := misc ++COMPILE_ARCH ?= $(shell uname -m | sed -e s/i.86/x86_32/) + + # module settings + MODULE_NAME := tpm_emulator +@@ -17,8 +18,14 @@ VERSION_BUILD := $(shell date +"%s") + # enable/disable DEBUG messages + EXTRA_CFLAGS += -DDEBUG -g + ++ifeq ($(COMPILE_ARCH),x86_64) ++LIBDIR = lib64 ++else ++LIBDIR = lib ++endif ++ + # GNU MP configuration +-GMP_LIB := /usr/lib/libgmp.a ++GMP_LIB := /usr/$(LIBDIR)/libgmp.a + GMP_HEADER := /usr/include/gmp.h + + # sources and objects +diff -uprN tpm_emulator-0.2/README tpm_emulator-0.2-x86_64/README +--- tpm_emulator-0.2/README 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/README 2005-09-19 14:21:43.000000000 -0700 +@@ -45,6 +45,12 @@ Example: + GMP_LIB := /usr/lib/libgmp.a + GMP_HEADER := /usr/include/gmp.h + ++GNU MP Library on 64 bit Systems ++-------------------------------------------------------------------------- ++Some 64-bit kernels have problems with importing the user-space gmp ++library (/usr/lib*/libgmp.a) into kernel space. These kernels will require ++that the gmp library be recompiled for kernel space with -mcmodel=kernel. 
++ + Installation + -------------------------------------------------------------------------- + The compilation and installation process uses the build environment for +diff -uprN tpm_emulator-0.2/tpm/tpm_credentials.c tpm_emulator-0.2-x86_64/tpm/tpm_credentials.c +--- tpm_emulator-0.2/tpm/tpm_credentials.c 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/tpm/tpm_credentials.c 2005-09-19 14:10:29.000000000 -0700 +@@ -47,16 +47,16 @@ int tpm_compute_pubkey_checksum(TPM_NONC + + TPM_RESULT tpm_get_pubek(TPM_PUBKEY *pubEndorsementKey) + { +- UINT32 key_length; ++ size_t key_length; + if (!tpmData.permanent.data.endorsementKey.size) return TPM_NO_ENDORSEMENT; + /* setup TPM_PUBKEY structure */ +- key_length = tpmData.permanent.data.endorsementKey.size; +- pubEndorsementKey->pubKey.keyLength = key_length >> 3; ++ pubEndorsementKey->pubKey.keyLength = tpmData.permanent.data.endorsementKey.size >> 3; + pubEndorsementKey->pubKey.key = tpm_malloc(pubEndorsementKey->pubKey.keyLength); + if (pubEndorsementKey->pubKey.key == NULL) return TPM_FAIL; + rsa_export_modulus(&tpmData.permanent.data.endorsementKey, +- pubEndorsementKey->pubKey.key, +- &pubEndorsementKey->pubKey.keyLength); ++ pubEndorsementKey->pubKey.key, ++ &key_length); ++ pubEndorsementKey->pubKey.keyLength = key_length; + pubEndorsementKey->algorithmParms.algorithmID = TPM_ALG_RSA; + pubEndorsementKey->algorithmParms.encScheme = TPM_ES_RSAESOAEP_SHA1_MGF1; + pubEndorsementKey->algorithmParms.sigScheme = TPM_SS_NONE; +@@ -169,6 +169,7 @@ TPM_RESULT TPM_OwnerReadInternalPub(TPM_ + { + TPM_RESULT res; + TPM_KEY_DATA *srk = &tpmData.permanent.data.srk; ++ size_t key_length; + info("TPM_OwnerReadInternalPub()"); + /* verify authorization */ + res = tpm_verify_auth(auth1, tpmData.permanent.data.ownerAuth, TPM_KH_OWNER); +@@ -180,7 +181,8 @@ TPM_RESULT TPM_OwnerReadInternalPub(TPM_ + publicPortion->pubKey.key = tpm_malloc(publicPortion->pubKey.keyLength); + if (publicPortion->pubKey.key == NULL) return 
TPM_FAIL; + rsa_export_modulus(&srk->key, publicPortion->pubKey.key, +- &publicPortion->pubKey.keyLength); ++ &key_length); ++ publicPortion->pubKey.keyLength = key_length; + publicPortion->algorithmParms.algorithmID = TPM_ALG_RSA; + publicPortion->algorithmParms.encScheme = srk->encScheme; + publicPortion->algorithmParms.sigScheme = srk->sigScheme; +diff -uprN tpm_emulator-0.2/tpm/tpm_crypto.c tpm_emulator-0.2-x86_64/tpm/tpm_crypto.c +--- tpm_emulator-0.2/tpm/tpm_crypto.c 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/tpm/tpm_crypto.c 2005-09-19 14:10:29.000000000 -0700 +@@ -182,7 +182,8 @@ TPM_RESULT TPM_CertifyKey(TPM_KEY_HANDLE + TPM_KEY_DATA *cert, *key; + sha1_ctx_t sha1_ctx; + BYTE *buf, *p; +- UINT32 length; ++ UINT32 length32; ++ size_t length; + info("TPM_CertifyKey()"); + /* get keys */ + cert = tpm_get_key(certHandle); +@@ -264,14 +265,15 @@ TPM_RESULT TPM_CertifyKey(TPM_KEY_HANDLE + /* compute the digest of the CERTIFY_INFO[2] structure and sign it */ + length = sizeof_TPM_CERTIFY_INFO((*certifyInfo)); + p = buf = tpm_malloc(length); ++ length32=(UINT32) length; + if (buf == NULL +- || tpm_marshal_TPM_CERTIFY_INFO(&p, &length, certifyInfo)) { ++ || tpm_marshal_TPM_CERTIFY_INFO(&p, &length32, certifyInfo)) { + free_TPM_KEY_PARMS(certifyInfo->algorithmParms); + return TPM_FAIL; + } + length = sizeof_TPM_CERTIFY_INFO((*certifyInfo)); + sha1_init(&sha1_ctx); +- sha1_update(&sha1_ctx, buf, length); ++ sha1_update(&sha1_ctx, buf, (size_t) length); + sha1_final(&sha1_ctx, buf); + res = tpm_sign(cert, auth1, FALSE, buf, SHA1_DIGEST_LENGTH, outData, outDataSize); + tpm_free(buf); +@@ -292,7 +294,8 @@ TPM_RESULT TPM_CertifyKey2(TPM_KEY_HANDL + TPM_KEY_DATA *cert, *key; + sha1_ctx_t sha1_ctx; + BYTE *buf, *p; +- UINT32 length; ++ size_t length; ++ UINT32 length32; + info("TPM_CertifyKey2()"); + /* get keys */ + cert = tpm_get_key(certHandle); +@@ -362,8 +365,9 @@ TPM_RESULT TPM_CertifyKey2(TPM_KEY_HANDL + /* compute the digest of the 
CERTIFY_INFO[2] structure and sign it */ + length = sizeof_TPM_CERTIFY_INFO((*certifyInfo)); + p = buf = tpm_malloc(length); ++ length32 = (UINT32) length; + if (buf == NULL +- || tpm_marshal_TPM_CERTIFY_INFO(&p, &length, certifyInfo)) { ++ || tpm_marshal_TPM_CERTIFY_INFO(&p, &length32, certifyInfo)) { + free_TPM_KEY_PARMS(certifyInfo->algorithmParms); + return TPM_FAIL; + } +diff -uprN tpm_emulator-0.2/tpm/tpm_data.c tpm_emulator-0.2-x86_64/tpm/tpm_data.c +--- tpm_emulator-0.2/tpm/tpm_data.c 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/tpm/tpm_data.c 2005-09-19 14:10:29.000000000 -0700 +@@ -179,7 +179,7 @@ static int read_from_file(uint8_t **data + int tpm_store_permanent_data(void) + { + uint8_t *buf, *ptr; +- size_t buf_length, len; ++ UINT32 buf_length, len; + + /* marshal data */ + buf_length = len = sizeof_TPM_STCLEAR_FLAGS(tpmData.stclear.flags) +@@ -207,13 +207,14 @@ int tpm_store_permanent_data(void) + int tpm_restore_permanent_data(void) + { + uint8_t *buf, *ptr; +- size_t buf_length, len; ++ size_t buf_length; ++ UINT32 len; + TPM_VERSION ver; + + /* read data */ + if (read_from_file(&buf, &buf_length)) return -1; + ptr = buf; +- len = buf_length; ++ len = (uint32_t) buf_length; + /* unmarshal data */ + if (tpm_unmarshal_TPM_VERSION(&ptr, &len, &ver) + || memcmp(&ver, &tpmData.permanent.data.version, sizeof(TPM_VERSION)) +diff -uprN tpm_emulator-0.2/tpm/tpm_marshalling.c tpm_emulator-0.2-x86_64/tpm/tpm_marshalling.c +--- tpm_emulator-0.2/tpm/tpm_marshalling.c 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/tpm/tpm_marshalling.c 2005-09-19 14:10:29.000000000 -0700 +@@ -981,7 +981,7 @@ int tpm_unmarshal_TPM_STANY_FLAGS(BYTE * + + int tpm_marshal_RSA(BYTE **ptr, UINT32 *length, rsa_private_key_t *v) + { +- UINT32 m_len, e_len, q_len; ++ size_t m_len, e_len, q_len; + if (*length < sizeof_RSA((*v))) return -1; + if (v->size > 0) { + rsa_export_modulus(v, &(*ptr)[6], &m_len); +diff -uprN tpm_emulator-0.2/tpm/tpm_owner.c 
tpm_emulator-0.2-x86_64/tpm/tpm_owner.c +--- tpm_emulator-0.2/tpm/tpm_owner.c 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/tpm/tpm_owner.c 2005-09-19 14:10:29.000000000 -0700 +@@ -108,7 +108,7 @@ TPM_RESULT TPM_TakeOwnership(TPM_PROTOCO + TPM_RESULT res; + rsa_private_key_t *ek = &tpmData.permanent.data.endorsementKey; + TPM_KEY_DATA *srk = &tpmData.permanent.data.srk; +- UINT32 buf_size = ek->size >> 3; ++ size_t buf_size = ek->size >> 3, key_length; + BYTE buf[buf_size]; + + info("TPM_TakeOwnership()"); +@@ -172,7 +172,8 @@ TPM_RESULT TPM_TakeOwnership(TPM_PROTOCO + return TPM_FAIL; + } + rsa_export_modulus(&srk->key, srkPub->pubKey.key, +- &srkPub->pubKey.keyLength); ++ &key_length); ++ srkPub->pubKey.keyLength = (UINT32) key_length; + /* setup tpmProof and set state to owned */ + tpm_get_random_bytes(tpmData.permanent.data.tpmProof.nonce, + sizeof(tpmData.permanent.data.tpmProof.nonce)); +diff -uprN tpm_emulator-0.2/tpm/tpm_storage.c tpm_emulator-0.2-x86_64/tpm/tpm_storage.c +--- tpm_emulator-0.2/tpm/tpm_storage.c 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/tpm/tpm_storage.c 2005-09-19 14:10:29.000000000 -0700 +@@ -58,6 +58,7 @@ int encrypt_sealed_data(TPM_KEY_DATA *ke + BYTE *enc, UINT32 *enc_size) + { + UINT32 len; ++ size_t enc_size32 = *enc_size; + BYTE *buf, *ptr; + rsa_public_key_t pub_key; + int scheme; +@@ -72,7 +73,7 @@ int encrypt_sealed_data(TPM_KEY_DATA *ke + if (buf == NULL + || tpm_marshal_TPM_SEALED_DATA(&ptr, &len, seal) + || rsa_encrypt(&pub_key, scheme, buf, sizeof_TPM_SEALED_DATA((*seal)), +- enc, enc_size)) { ++ enc, &enc_size32)) { + tpm_free(buf); + rsa_release_public_key(&pub_key); + return -1; +@@ -85,7 +86,8 @@ int encrypt_sealed_data(TPM_KEY_DATA *ke + int decrypt_sealed_data(TPM_KEY_DATA *key, BYTE *enc, UINT32 enc_size, + TPM_SEALED_DATA *seal, BYTE **buf) + { +- UINT32 len; ++ size_t len; ++ UINT32 len32; + BYTE *ptr; + int scheme; + switch (key->encScheme) { +@@ -96,8 +98,12 @@ int 
decrypt_sealed_data(TPM_KEY_DATA *ke + len = enc_size; + *buf = ptr = tpm_malloc(len); + if (*buf == NULL +- || rsa_decrypt(&key->key, scheme, enc, enc_size, *buf, &len) +- || tpm_unmarshal_TPM_SEALED_DATA(&ptr, &len, seal)) { ++ || rsa_decrypt(&key->key, scheme, enc, enc_size, *buf, &len) ){ ++ tpm_free(*buf); ++ return -1; ++ } ++ len32 = len; ++ if (tpm_unmarshal_TPM_SEALED_DATA(&ptr, &len32, seal)) { + tpm_free(*buf); + return -1; + } +@@ -237,11 +243,12 @@ TPM_RESULT TPM_Unseal(TPM_KEY_HANDLE par + + TPM_RESULT TPM_UnBind(TPM_KEY_HANDLE keyHandle, UINT32 inDataSize, + BYTE *inData, TPM_AUTH *auth1, +- UINT32 *outDataSize, BYTE **outData) ++ UINT32 *outDataSize32, BYTE **outData) + { + TPM_RESULT res; + TPM_KEY_DATA *key; + int scheme; ++ size_t outDataSize; + info("TPM_UnBind()"); + /* get key */ + key = tpm_get_key(keyHandle); +@@ -258,8 +265,8 @@ TPM_RESULT TPM_UnBind(TPM_KEY_HANDLE key + /* the size of the input data muss be greater than zero */ + if (inDataSize == 0) return TPM_BAD_PARAMETER; + /* decrypt data */ +- *outDataSize = inDataSize; +- *outData = tpm_malloc(*outDataSize); ++ outDataSize = inDataSize; ++ *outData = tpm_malloc(outDataSize); + if (*outData == NULL) return TPM_FAIL; + switch (key->encScheme) { + case TPM_ES_RSAESOAEP_SHA1_MGF1: scheme = RSA_ES_OAEP_SHA1; break; +@@ -267,20 +274,21 @@ TPM_RESULT TPM_UnBind(TPM_KEY_HANDLE key + default: tpm_free(*outData); return TPM_DECRYPT_ERROR; + } + if (rsa_decrypt(&key->key, scheme, inData, inDataSize, +- *outData, outDataSize)) { ++ *outData, &outDataSize) ) { + tpm_free(*outData); + return TPM_DECRYPT_ERROR; + } + /* verify data if it is of type TPM_BOUND_DATA */ + if (key->encScheme == TPM_ES_RSAESOAEP_SHA1_MGF1 + || key->keyUsage != TPM_KEY_LEGACY) { +- if (*outDataSize < 5 || memcmp(*outData, "\x01\x01\00\x00\x02", 5) != 0) { ++ if (outDataSize < 5 || memcmp(*outData, "\x01\x01\00\x00\x02", 5) != 0) { + tpm_free(*outData); + return TPM_DECRYPT_ERROR; + } +- *outDataSize -= 5; +- 
memmove(*outData, &(*outData)[5], *outDataSize); +- } ++ outDataSize -= 5; ++ memmove(*outData, &(*outData)[5], outDataSize); ++ } ++ *outDataSize32 = (UINT32) outDataSize; + return TPM_SUCCESS; + } + +@@ -311,12 +319,13 @@ static int verify_key_digest(TPM_KEY *ke + } + + int encrypt_private_key(TPM_KEY_DATA *key, TPM_STORE_ASYMKEY *store, +- BYTE *enc, UINT32 *enc_size) ++ BYTE *enc, UINT32 *enc_size32) + { + UINT32 len; + BYTE *buf, *ptr; + rsa_public_key_t pub_key; + int scheme; ++ size_t enc_size; + switch (key->encScheme) { + case TPM_ES_RSAESOAEP_SHA1_MGF1: scheme = RSA_ES_OAEP_SHA1; break; + case TPM_ES_RSAESPKCSv15: scheme = RSA_ES_PKCSV15; break; +@@ -328,11 +337,12 @@ int encrypt_private_key(TPM_KEY_DATA *ke + if (buf == NULL + || tpm_marshal_TPM_STORE_ASYMKEY(&ptr, &len, store) + || rsa_encrypt(&pub_key, scheme, buf, sizeof_TPM_STORE_ASYMKEY((*store)), +- enc, enc_size)) { ++ enc, &enc_size)) { + tpm_free(buf); + rsa_release_public_key(&pub_key); + return -1; + } ++ *enc_size32 = (UINT32) enc_size; + tpm_free(buf); + rsa_release_public_key(&pub_key); + return 0; +@@ -341,7 +351,8 @@ int encrypt_private_key(TPM_KEY_DATA *ke + int decrypt_private_key(TPM_KEY_DATA *key, BYTE *enc, UINT32 enc_size, + TPM_STORE_ASYMKEY *store, BYTE **buf) + { +- UINT32 len; ++ UINT32 len32; ++ size_t len; + BYTE *ptr; + int scheme; + switch (key->encScheme) { +@@ -352,11 +363,16 @@ int decrypt_private_key(TPM_KEY_DATA *ke + len = enc_size; + *buf = ptr = tpm_malloc(len); + if (*buf == NULL +- || rsa_decrypt(&key->key, scheme, enc, enc_size, *buf, &len) +- || tpm_unmarshal_TPM_STORE_ASYMKEY(&ptr, &len, store)) { ++ || rsa_decrypt(&key->key, scheme, enc, enc_size, *buf, &len) ) { ++ tpm_free(*buf); ++ return -1; ++ } ++ len32 = (UINT32) len; ++ if (tpm_unmarshal_TPM_STORE_ASYMKEY(&ptr, &len32, store)) { + tpm_free(*buf); + return -1; + } ++ + return 0; + } + +@@ -371,7 +387,7 @@ TPM_RESULT TPM_CreateWrapKey(TPM_KEY_HAN + TPM_SESSION_DATA *session; + TPM_STORE_ASYMKEY store; + 
rsa_private_key_t rsa; +- UINT32 key_length; ++ size_t key_length; + + info("TPM_CreateWrapKey()"); + /* get parent key */ +@@ -428,11 +444,11 @@ TPM_RESULT TPM_CreateWrapKey(TPM_KEY_HAN + } + if (compute_key_digest(wrappedKey, &store.pubDataDigest)) return TPM_FAIL; + /* generate key and store it */ +- key_length = keyInfo->algorithmParms.parms.rsa.keyLength; +- if (rsa_generate_key(&rsa, key_length)) return TPM_FAIL; +- wrappedKey->pubKey.keyLength = key_length >> 3; ++ if (rsa_generate_key(&rsa, keyInfo->algorithmParms.parms.rsa.keyLength)) ++ return TPM_FAIL; ++ wrappedKey->pubKey.keyLength = keyInfo->algorithmParms.parms.rsa.keyLength >> 3; + wrappedKey->pubKey.key = tpm_malloc(wrappedKey->pubKey.keyLength); +- store.privKey.keyLength = key_length >> 4; ++ store.privKey.keyLength = keyInfo->algorithmParms.parms.rsa.keyLength >> 4; + store.privKey.key = tpm_malloc(store.privKey.keyLength); + wrappedKey->encDataSize = parent->key.size >> 3; + wrappedKey->encData = tpm_malloc(wrappedKey->encDataSize); +@@ -444,9 +460,11 @@ TPM_RESULT TPM_CreateWrapKey(TPM_KEY_HAN + tpm_free(wrappedKey->encData); + return TPM_FAIL; + } +- rsa_export_modulus(&rsa, wrappedKey->pubKey.key, +- &wrappedKey->pubKey.keyLength); +- rsa_export_prime1(&rsa, store.privKey.key, &store.privKey.keyLength); ++ rsa_export_modulus(&rsa, wrappedKey->pubKey.key, ++ &key_length); ++ wrappedKey->pubKey.keyLength = (UINT32) key_length; ++ rsa_export_prime1(&rsa, store.privKey.key, &key_length); ++ store.privKey.keyLength = (UINT32) key_length; + rsa_release_private_key(&rsa); + /* encrypt private key data */ + if (encrypt_private_key(parent, &store, wrappedKey->encData, +@@ -560,6 +578,7 @@ TPM_RESULT TPM_LoadKey(TPM_KEY_HANDLE pa + + int tpm_setup_key_parms(TPM_KEY_DATA *key, TPM_KEY_PARMS *parms) + { ++ size_t key_length; + parms->algorithmID = TPM_ALG_RSA; + parms->encScheme = key->encScheme; + parms->sigScheme = key->sigScheme; +@@ -569,7 +588,8 @@ int tpm_setup_key_parms(TPM_KEY_DATA *ke + 
parms->parms.rsa.exponent = tpm_malloc(parms->parms.rsa.exponentSize); + if (parms->parms.rsa.exponent == NULL) return -1; + rsa_export_exponent(&key->key, parms->parms.rsa.exponent, +- &parms->parms.rsa.exponentSize); ++ &key_length); ++ parms->parms.rsa.exponentSize = (UINT32) key_length; + parms->parmSize = 12 + parms->parms.rsa.exponentSize; + return 0; + } +@@ -580,6 +600,7 @@ TPM_RESULT TPM_GetPubKey(TPM_KEY_HANDLE + TPM_RESULT res; + TPM_KEY_DATA *key; + TPM_DIGEST digest; ++ size_t key_length; + info("TPM_GetPubKey()"); + /* get key */ + if (keyHandle == TPM_KH_SRK) return TPM_BAD_PARAMETER; +@@ -607,8 +628,8 @@ TPM_RESULT TPM_GetPubKey(TPM_KEY_HANDLE + pubKey->pubKey.keyLength = key->key.size >> 3; + pubKey->pubKey.key = tpm_malloc(pubKey->pubKey.keyLength); + if (pubKey->pubKey.key == NULL) return TPM_FAIL; +- rsa_export_modulus(&key->key, pubKey->pubKey.key, +- &pubKey->pubKey.keyLength); ++ rsa_export_modulus(&key->key, pubKey->pubKey.key, &key_length); ++ pubKey->pubKey.keyLength = (UINT32) key_length; + if (tpm_setup_key_parms(key, &pubKey->algorithmParms) != 0) { + tpm_free(pubKey->pubKey.key); + return TPM_FAIL; +diff -uprN tpm_emulator-0.2/tpm_version.h tpm_emulator-0.2-x86_64/tpm_version.h +--- tpm_emulator-0.2/tpm_version.h 2005-08-15 00:58:57.000000000 -0700 ++++ tpm_emulator-0.2-x86_64/tpm_version.h 1969-12-31 16:00:00.000000000 -0800 +@@ -1,6 +0,0 @@ +-#ifndef _TPM_VERSION_H_ +-#define _TPM_VERSION_H_ +-#define VERSION_MAJOR 0 +-#define VERSION_MINOR 2 +-#define VERSION_BUILD 1123950310 +-#endif /* _TPM_VERSION_H_ */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/char/tpm/tpm.c --- a/linux-2.6-xen-sparse/drivers/char/tpm/tpm.c Thu Sep 22 17:34:14 2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,627 +0,0 @@ -/* - * Copyright (C) 2004 IBM Corporation - * - * Authors: - * Leendert van Doorn <leendert@xxxxxxxxxxxxxx> - * Dave Safford <safford@xxxxxxxxxxxxxx> - * Reiner Sailer <sailer@xxxxxxxxxxxxxx> - * Kylene Hall 
<kjhall@xxxxxxxxxx> - * - * Maintained by: <tpmdd_devel@xxxxxxxxxxxxxxxxxxxxx> - * - * Device driver for TCG/TCPA TPM (trusted platform module). - * Specifications at www.trustedcomputinggroup.org - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - * - * Note, the TPM chip is not interrupt driven (only polling) - * and can have very long timeouts (minutes!). Hence the unusual - * calls to schedule_timeout. - * - */ - -#include <linux/sched.h> -#include <linux/poll.h> -#include <linux/spinlock.h> -#include "tpm.h" - -#define TPM_MINOR 224 /* officially assigned */ - -#define TPM_BUFSIZE 2048 - -static LIST_HEAD(tpm_chip_list); -static DEFINE_SPINLOCK(driver_lock); -static int dev_mask[32]; - -static void user_reader_timeout(unsigned long ptr) -{ - struct tpm_chip *chip = (struct tpm_chip *) ptr; - - down(&chip->buffer_mutex); - atomic_set(&chip->data_pending, 0); - memset(chip->data_buffer, 0, TPM_BUFSIZE); - up(&chip->buffer_mutex); -} - -void tpm_time_expired(unsigned long ptr) -{ - int *exp = (int *) ptr; - *exp = 1; -} - -EXPORT_SYMBOL_GPL(tpm_time_expired); - -/* - * Internal kernel interface to transmit TPM commands - */ -static ssize_t tpm_transmit(struct tpm_chip *chip, const char *buf, - size_t bufsiz) -{ - ssize_t len; - u32 count; - __be32 *native_size; - - native_size = (__force __be32 *) (buf + 2); - count = be32_to_cpu(*native_size); - - if (count == 0) - return -ENODATA; - if (count > bufsiz) { - dev_err(&chip->pci_dev->dev, - "invalid count value %x %zx \n", count, bufsiz); - return -E2BIG; - } - - down(&chip->tpm_mutex); - - if ((len = chip->vendor->send(chip, (u8 *) buf, count)) < 0) { - dev_err(&chip->pci_dev->dev, - "tpm_transmit: tpm_send: error %zd\n", len); - return len; - } - - down(&chip->timer_manipulation_mutex); - chip->time_expired = 0; - init_timer(&chip->device_timer); - 
chip->device_timer.function = tpm_time_expired; - chip->device_timer.expires = jiffies + 2 * 60 * HZ; - chip->device_timer.data = (unsigned long) &chip->time_expired; - add_timer(&chip->device_timer); - up(&chip->timer_manipulation_mutex); - - do { - u8 status = inb(chip->vendor->base + 1); - if ((status & chip->vendor->req_complete_mask) == - chip->vendor->req_complete_val) { - down(&chip->timer_manipulation_mutex); - del_singleshot_timer_sync(&chip->device_timer); - up(&chip->timer_manipulation_mutex); - goto out_recv; - } - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(TPM_TIMEOUT); - rmb(); - } while (!chip->time_expired); - - - chip->vendor->cancel(chip); - dev_err(&chip->pci_dev->dev, "Time expired\n"); - up(&chip->tpm_mutex); - return -EIO; - -out_recv: - len = chip->vendor->recv(chip, (u8 *) buf, bufsiz); - if (len < 0) - dev_err(&chip->pci_dev->dev, - "tpm_transmit: tpm_recv: error %zd\n", len); - up(&chip->tpm_mutex); - return len; -} - -#define TPM_DIGEST_SIZE 20 -#define CAP_PCR_RESULT_SIZE 18 -static u8 cap_pcr[] = { - 0, 193, /* TPM_TAG_RQU_COMMAND */ - 0, 0, 0, 22, /* length */ - 0, 0, 0, 101, /* TPM_ORD_GetCapability */ - 0, 0, 0, 5, - 0, 0, 0, 4, - 0, 0, 1, 1 -}; - -#define READ_PCR_RESULT_SIZE 30 -static u8 pcrread[] = { - 0, 193, /* TPM_TAG_RQU_COMMAND */ - 0, 0, 0, 14, /* length */ - 0, 0, 0, 21, /* TPM_ORD_PcrRead */ - 0, 0, 0, 0 /* PCR index */ -}; - -static ssize_t show_pcrs(struct device *dev, char *buf) -{ - u8 data[READ_PCR_RESULT_SIZE]; - ssize_t len; - int i, j, index, num_pcrs; - char *str = buf; - - struct tpm_chip *chip = - pci_get_drvdata(container_of(dev, struct pci_dev, dev)); - if (chip == NULL) - return -ENODEV; - - memcpy(data, cap_pcr, sizeof(cap_pcr)); - if ((len = tpm_transmit(chip, data, sizeof(data))) - < CAP_PCR_RESULT_SIZE) - return len; - - num_pcrs = be32_to_cpu(*((__force __be32 *) (data + 14))); - - for (i = 0; i < num_pcrs; i++) { - memcpy(data, pcrread, sizeof(pcrread)); - index = cpu_to_be32(i); - 
memcpy(data + 10, &index, 4); - if ((len = tpm_transmit(chip, data, sizeof(data))) - < READ_PCR_RESULT_SIZE) - return len; - str += sprintf(str, "PCR-%02d: ", i); - for (j = 0; j < TPM_DIGEST_SIZE; j++) - str += sprintf(str, "%02X ", *(data + 10 + j)); - str += sprintf(str, "\n"); - } - return str - buf; -} - -static DEVICE_ATTR(pcrs, S_IRUGO, show_pcrs, NULL); - -#define READ_PUBEK_RESULT_SIZE 314 -static u8 readpubek[] = { - 0, 193, /* TPM_TAG_RQU_COMMAND */ - 0, 0, 0, 30, /* length */ - 0, 0, 0, 124, /* TPM_ORD_ReadPubek */ -}; - -static ssize_t show_pubek(struct device *dev, char *buf) -{ - u8 data[READ_PUBEK_RESULT_SIZE]; - ssize_t len; - __be32 *native_val; - int i; - char *str = buf; - - struct tpm_chip *chip = - pci_get_drvdata(container_of(dev, struct pci_dev, dev)); - if (chip == NULL) - return -ENODEV; - - memcpy(data, readpubek, sizeof(readpubek)); - memset(data + sizeof(readpubek), 0, 20); /* zero nonce */ - - if ((len = tpm_transmit(chip, data, sizeof(data))) < - READ_PUBEK_RESULT_SIZE) - return len; - - /* - ignore header 10 bytes - algorithm 32 bits (1 == RSA ) - encscheme 16 bits - sigscheme 16 bits - parameters (RSA 12->bytes: keybit, #primes, expbit) - keylenbytes 32 bits - 256 byte modulus - ignore checksum 20 bytes - */ - - native_val = (__force __be32 *) (data + 34); - - str += - sprintf(str, - "Algorithm: %02X %02X %02X %02X\nEncscheme: %02X %02X\n" - "Sigscheme: %02X %02X\nParameters: %02X %02X %02X %02X" - " %02X %02X %02X %02X %02X %02X %02X %02X\n" - "Modulus length: %d\nModulus: \n", - data[10], data[11], data[12], data[13], data[14], - data[15], data[16], data[17], data[22], data[23], - data[24], data[25], data[26], data[27], data[28], - data[29], data[30], data[31], data[32], data[33], - be32_to_cpu(*native_val) - ); - - for (i = 0; i < 256; i++) { - str += sprintf(str, "%02X ", data[i + 39]); - if ((i + 1) % 16 == 0) - str += sprintf(str, "\n"); - } - return str - buf; -} - -static DEVICE_ATTR(pubek, S_IRUGO, show_pubek, NULL); - 
-#define CAP_VER_RESULT_SIZE 18 -static u8 cap_version[] = { - 0, 193, /* TPM_TAG_RQU_COMMAND */ - 0, 0, 0, 18, /* length */ - 0, 0, 0, 101, /* TPM_ORD_GetCapability */ - 0, 0, 0, 6, - 0, 0, 0, 0 -}; - -#define CAP_MANUFACTURER_RESULT_SIZE 18 -static u8 cap_manufacturer[] = { - 0, 193, /* TPM_TAG_RQU_COMMAND */ - 0, 0, 0, 22, /* length */ - 0, 0, 0, 101, /* TPM_ORD_GetCapability */ - 0, 0, 0, 5, - 0, 0, 0, 4, - 0, 0, 1, 3 -}; - -static ssize_t show_caps(struct device *dev, char *buf) -{ - u8 data[READ_PUBEK_RESULT_SIZE]; - ssize_t len; - char *str = buf; - - struct tpm_chip *chip = - pci_get_drvdata(container_of(dev, struct pci_dev, dev)); - if (chip == NULL) - return -ENODEV; - - memcpy(data, cap_manufacturer, sizeof(cap_manufacturer)); - - if ((len = tpm_transmit(chip, data, sizeof(data))) < - CAP_MANUFACTURER_RESULT_SIZE) - return len; - - str += sprintf(str, "Manufacturer: 0x%x\n", - be32_to_cpu(*(data + 14))); - - memcpy(data, cap_version, sizeof(cap_version)); - - if ((len = tpm_transmit(chip, data, sizeof(data))) < - CAP_VER_RESULT_SIZE) - return len; - - str += - sprintf(str, "TCG version: %d.%d\nFirmware version: %d.%d\n", - (int) data[14], (int) data[15], (int) data[16], - (int) data[17]); - - return str - buf; -} - -static DEVICE_ATTR(caps, S_IRUGO, show_caps, NULL); - -/* - * Device file system interface to the TPM - */ -int tpm_open(struct inode *inode, struct file *file) -{ - int rc = 0, minor = iminor(inode); - struct tpm_chip *chip = NULL, *pos; - - spin_lock(&driver_lock); - - list_for_each_entry(pos, &tpm_chip_list, list) { - if (pos->vendor->miscdev.minor == minor) { - chip = pos; - break; - } - } - - if (chip == NULL) { - rc = -ENODEV; - goto err_out; - } - - if (chip->num_opens) { - dev_dbg(&chip->pci_dev->dev, - "Another process owns this TPM\n"); - rc = -EBUSY; - goto err_out; - } - - chip->num_opens++; - pci_dev_get(chip->pci_dev); - - spin_unlock(&driver_lock); - - chip->data_buffer = kmalloc(TPM_BUFSIZE * sizeof(u8), GFP_KERNEL); - if 
(chip->data_buffer == NULL) { - chip->num_opens--; - pci_dev_put(chip->pci_dev); - return -ENOMEM; - } - - atomic_set(&chip->data_pending, 0); - - file->private_data = chip; - return 0; - -err_out: - spin_unlock(&driver_lock); - return rc; -} - -EXPORT_SYMBOL_GPL(tpm_open); - -int tpm_release(struct inode *inode, struct file *file) -{ - struct tpm_chip *chip = file->private_data; - - file->private_data = NULL; - - spin_lock(&driver_lock); - chip->num_opens--; - spin_unlock(&driver_lock); - - down(&chip->timer_manipulation_mutex); - if (timer_pending(&chip->user_read_timer)) - del_singleshot_timer_sync(&chip->user_read_timer); - else if (timer_pending(&chip->device_timer)) - del_singleshot_timer_sync(&chip->device_timer); - up(&chip->timer_manipulation_mutex); - - kfree(chip->data_buffer); - atomic_set(&chip->data_pending, 0); - - pci_dev_put(chip->pci_dev); - return 0; -} - -EXPORT_SYMBOL_GPL(tpm_release); - -ssize_t tpm_write(struct file * file, const char __user * buf, - size_t size, loff_t * off) -{ - struct tpm_chip *chip = file->private_data; - int in_size = size, out_size; - - /* cannot perform a write until the read has cleared - either via tpm_read or a user_read_timer timeout */ - while (atomic_read(&chip->data_pending) != 0) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(TPM_TIMEOUT); - } - - down(&chip->buffer_mutex); - - if (in_size > TPM_BUFSIZE) - in_size = TPM_BUFSIZE; - - if (copy_from_user - (chip->data_buffer, (void __user *) buf, in_size)) { - up(&chip->buffer_mutex); - return -EFAULT; - } - - /* atomic tpm command send and result receive */ - out_size = tpm_transmit(chip, chip->data_buffer, TPM_BUFSIZE); - - atomic_set(&chip->data_pending, out_size); - atomic_set(&chip->data_position, 0); - up(&chip->buffer_mutex); - - /* Set a timeout by which the reader must come claim the result */ - down(&chip->timer_manipulation_mutex); - init_timer(&chip->user_read_timer); - chip->user_read_timer.function = user_reader_timeout; - 
chip->user_read_timer.data = (unsigned long) chip; - chip->user_read_timer.expires = jiffies + (60 * HZ); - add_timer(&chip->user_read_timer); - up(&chip->timer_manipulation_mutex); - - return in_size; -} - -EXPORT_SYMBOL_GPL(tpm_write); - -ssize_t tpm_read(struct file * file, char __user * buf, - size_t size, loff_t * off) -{ - struct tpm_chip *chip = file->private_data; - int ret_size = -ENODATA; - int pos, pending = 0; - - down(&chip->buffer_mutex); - ret_size = atomic_read(&chip->data_pending); - if ( ret_size > 0 ) { /* Result available */ - if (size < ret_size) - ret_size = size; - - pos = atomic_read(&chip->data_position); - - if (copy_to_user((void __user *) buf, - &chip->data_buffer[pos], ret_size)) { - ret_size = -EFAULT; - } else { - pending = atomic_read(&chip->data_pending) - ret_size; - if ( pending ) { - atomic_set( &chip->data_pending, pending ); - atomic_set( &chip->data_position, pos+ret_size ); - } - } - } - up(&chip->buffer_mutex); - - if ( ret_size <= 0 || pending == 0 ) { - atomic_set( &chip->data_pending, 0 ); - down(&chip->timer_manipulation_mutex); - del_singleshot_timer_sync(&chip->user_read_timer); - up(&chip->timer_manipulation_mutex); - } - - return ret_size; -} - -EXPORT_SYMBOL_GPL(tpm_read); - -void __devexit tpm_remove(struct pci_dev *pci_dev) -{ - struct tpm_chip *chip = pci_get_drvdata(pci_dev); - - if (chip == NULL) { - dev_err(&pci_dev->dev, "No device data found\n"); - return; - } - - spin_lock(&driver_lock); - - list_del(&chip->list); - - spin_unlock(&driver_lock); - - pci_set_drvdata(pci_dev, NULL); - misc_deregister(&chip->vendor->miscdev); - - device_remove_file(&pci_dev->dev, &dev_attr_pubek); - device_remove_file(&pci_dev->dev, &dev_attr_pcrs); - device_remove_file(&pci_dev->dev, &dev_attr_caps); - - pci_disable_device(pci_dev); - - dev_mask[chip->dev_num / 32] &= !(1 << (chip->dev_num % 32)); - - kfree(chip); - - pci_dev_put(pci_dev); -} - -EXPORT_SYMBOL_GPL(tpm_remove); - -static u8 savestate[] = { - 0, 193, /* 
TPM_TAG_RQU_COMMAND */ - 0, 0, 0, 10, /* blob length (in bytes) */ - 0, 0, 0, 152 /* TPM_ORD_SaveState */ -}; - -/* - * We are about to suspend. Save the TPM state - * so that it can be restored. - */ -int tpm_pm_suspend(struct pci_dev *pci_dev, pm_message_t pm_state) -{ - struct tpm_chip *chip = pci_get_drvdata(pci_dev); - if (chip == NULL) - return -ENODEV; - - tpm_transmit(chip, savestate, sizeof(savestate)); - return 0; -} - -EXPORT_SYMBOL_GPL(tpm_pm_suspend); - -/* - * Resume from a power safe. The BIOS already restored - * the TPM state. - */ -int tpm_pm_resume(struct pci_dev *pci_dev) -{ - struct tpm_chip *chip = pci_get_drvdata(pci_dev); - - if (chip == NULL) - return -ENODEV; - - return 0; -} - -EXPORT_SYMBOL_GPL(tpm_pm_resume); - -/* - * Called from tpm_<specific>.c probe function only for devices - * the driver has determined it should claim. Prior to calling - * this function the specific probe function has called pci_enable_device - * upon errant exit from this function specific probe function should call - * pci_disable_device - */ -int tpm_register_hardware(struct pci_dev *pci_dev, - struct tpm_vendor_specific *entry) -{ - char devname[7]; - struct tpm_chip *chip; - int i, j; - - /* Driver specific per-device data */ - chip = kmalloc(sizeof(*chip), GFP_KERNEL); - if (chip == NULL) - return -ENOMEM; - - memset(chip, 0, sizeof(struct tpm_chip)); - - init_MUTEX(&chip->buffer_mutex); - init_MUTEX(&chip->tpm_mutex); - init_MUTEX(&chip->timer_manipulation_mutex); - INIT_LIST_HEAD(&chip->list); - - chip->vendor = entry; - - chip->dev_num = -1; - - for (i = 0; i < 32; i++) - for (j = 0; j < 8; j++) - if ((dev_mask[i] & (1 << j)) == 0) { - chip->dev_num = i * 32 + j; - dev_mask[i] |= 1 << j; - goto dev_num_search_complete; - } - -dev_num_search_complete: - if (chip->dev_num < 0) { - dev_err(&pci_dev->dev, - "No available tpm device numbers\n"); - kfree(chip); - return -ENODEV; - } else if (chip->dev_num == 0) - chip->vendor->miscdev.minor = TPM_MINOR; - else 
- chip->vendor->miscdev.minor = MISC_DYNAMIC_MINOR; - - snprintf(devname, sizeof(devname), "%s%d", "tpm", chip->dev_num); - chip->vendor->miscdev.name = devname; - - chip->vendor->miscdev.dev = &(pci_dev->dev); - chip->pci_dev = pci_dev_get(pci_dev); - - if (misc_register(&chip->vendor->miscdev)) { - dev_err(&chip->pci_dev->dev, - "unable to misc_register %s, minor %d\n", - chip->vendor->miscdev.name, - chip->vendor->miscdev.minor); - pci_dev_put(pci_dev); - kfree(chip); - dev_mask[i] &= !(1 << j); - return -ENODEV; - } - - pci_set_drvdata(pci_dev, chip); - - list_add(&chip->list, &tpm_chip_list); - - device_create_file(&pci_dev->dev, &dev_attr_pubek); - device_create_file(&pci_dev->dev, &dev_attr_pcrs); - device_create_file(&pci_dev->dev, &dev_attr_caps); - - return 0; -} - -EXPORT_SYMBOL_GPL(tpm_register_hardware); - -static int __init init_tpm(void) -{ - return 0; -} - -static void __exit cleanup_tpm(void) -{ - -} - -module_init(init_tpm); -module_exit(cleanup_tpm); - -MODULE_AUTHOR("Leendert van Doorn (leendert@xxxxxxxxxxxxxx)"); -MODULE_DESCRIPTION("TPM Driver"); -MODULE_VERSION("2.0"); -MODULE_LICENSE("GPL"); diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/char/tpm/tpm.h --- a/linux-2.6-xen-sparse/drivers/char/tpm/tpm.h Thu Sep 22 17:34:14 2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,92 +0,0 @@ -/* - * Copyright (C) 2004 IBM Corporation - * - * Authors: - * Leendert van Doorn <leendert@xxxxxxxxxxxxxx> - * Dave Safford <safford@xxxxxxxxxxxxxx> - * Reiner Sailer <sailer@xxxxxxxxxxxxxx> - * Kylene Hall <kjhall@xxxxxxxxxx> - * - * Maintained by: <tpmdd_devel@xxxxxxxxxxxxxxxxxxxxx> - * - * Device driver for TCG/TCPA TPM (trusted platform module). - * Specifications at www.trustedcomputinggroup.org - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. 
- * - */ -#include <linux/module.h> -#include <linux/version.h> -#include <linux/pci.h> -#include <linux/delay.h> -#include <linux/fs.h> -#include <linux/miscdevice.h> - -#define TPM_TIMEOUT msecs_to_jiffies(5) - -/* TPM addresses */ -#define TPM_ADDR 0x4E -#define TPM_DATA 0x4F - -struct tpm_chip; - -struct tpm_vendor_specific { - u8 req_complete_mask; - u8 req_complete_val; - u16 base; /* TPM base address */ - - int (*recv) (struct tpm_chip *, u8 *, size_t); - int (*send) (struct tpm_chip *, u8 *, size_t); - void (*cancel) (struct tpm_chip *); - struct miscdevice miscdev; -}; - -struct tpm_chip { - struct pci_dev *pci_dev; /* PCI device stuff */ - - int dev_num; /* /dev/tpm# */ - int num_opens; /* only one allowed */ - int time_expired; - - /* Data passed to and from the tpm via the read/write calls */ - u8 *data_buffer; - atomic_t data_pending; - atomic_t data_position; - struct semaphore buffer_mutex; - - struct timer_list user_read_timer; /* user needs to claim result */ - struct semaphore tpm_mutex; /* tpm is processing */ - struct timer_list device_timer; /* tpm is processing */ - struct semaphore timer_manipulation_mutex; - - struct tpm_vendor_specific *vendor; - - struct list_head list; -}; - -static inline int tpm_read_index(int index) -{ - outb(index, TPM_ADDR); - return inb(TPM_DATA) & 0xFF; -} - -static inline void tpm_write_index(int index, int value) -{ - outb(index, TPM_ADDR); - outb(value & 0xFF, TPM_DATA); -} - -extern void tpm_time_expired(unsigned long); -extern int tpm_register_hardware(struct pci_dev *, - struct tpm_vendor_specific *); -extern int tpm_open(struct inode *, struct file *); -extern int tpm_release(struct inode *, struct file *); -extern ssize_t tpm_write(struct file *, const char __user *, size_t, - loff_t *); -extern ssize_t tpm_read(struct file *, char __user *, size_t, loff_t *); -extern void __devexit tpm_remove(struct pci_dev *); -extern int tpm_pm_suspend(struct pci_dev *, pm_message_t); -extern int tpm_pm_resume(struct 
pci_dev *); diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/char/tpm/tpm_atmel.c --- a/linux-2.6-xen-sparse/drivers/char/tpm/tpm_atmel.c Thu Sep 22 17:34:14 2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,220 +0,0 @@ -/* - * Copyright (C) 2004 IBM Corporation - * - * Authors: - * Leendert van Doorn <leendert@xxxxxxxxxxxxxx> - * Dave Safford <safford@xxxxxxxxxxxxxx> - * Reiner Sailer <sailer@xxxxxxxxxxxxxx> - * Kylene Hall <kjhall@xxxxxxxxxx> - * - * Maintained by: <tpmdd_devel@xxxxxxxxxxxxxxxxxxxxx> - * - * Device driver for TCG/TCPA TPM (trusted platform module). - * Specifications at www.trustedcomputinggroup.org - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - * - */ - -#include "tpm.h" - -/* Atmel definitions */ -enum tpm_atmel_addr { - TPM_ATMEL_BASE_ADDR_LO = 0x08, - TPM_ATMEL_BASE_ADDR_HI = 0x09 -}; - -/* write status bits */ -#define ATML_STATUS_ABORT 0x01 -#define ATML_STATUS_LASTBYTE 0x04 - -/* read status bits */ -#define ATML_STATUS_BUSY 0x01 -#define ATML_STATUS_DATA_AVAIL 0x02 -#define ATML_STATUS_REWRITE 0x04 - - -static int tpm_atml_recv(struct tpm_chip *chip, u8 * buf, size_t count) -{ - u8 status, *hdr = buf; - u32 size; - int i; - __be32 *native_size; - - /* start reading header */ - if (count < 6) - return -EIO; - - for (i = 0; i < 6; i++) { - status = inb(chip->vendor->base + 1); - if ((status & ATML_STATUS_DATA_AVAIL) == 0) { - dev_err(&chip->pci_dev->dev, - "error reading header\n"); - return -EIO; - } - *buf++ = inb(chip->vendor->base); - } - - /* size of the data received */ - native_size = (__force __be32 *) (hdr + 2); - size = be32_to_cpu(*native_size); - - if (count < size) { - dev_err(&chip->pci_dev->dev, - "Recv size(%d) less than available space\n", size); - for (; i < size; i++) { /* clear the waiting data anyway */ - status = inb(chip->vendor->base 
+ 1); - if ((status & ATML_STATUS_DATA_AVAIL) == 0) { - dev_err(&chip->pci_dev->dev, - "error reading data\n"); - return -EIO; - } - } - return -EIO; - } - - /* read all the data available */ - for (; i < size; i++) { - status = inb(chip->vendor->base + 1); - if ((status & ATML_STATUS_DATA_AVAIL) == 0) { - dev_err(&chip->pci_dev->dev, - "error reading data\n"); - return -EIO; - } - *buf++ = inb(chip->vendor->base); - } - - /* make sure data available is gone */ - status = inb(chip->vendor->base + 1); - if (status & ATML_STATUS_DATA_AVAIL) { - dev_err(&chip->pci_dev->dev, "data available is stuck\n"); - return -EIO; - } - - return size; -} - -static int tpm_atml_send(struct tpm_chip *chip, u8 * buf, size_t count) -{ - int i; - - dev_dbg(&chip->pci_dev->dev, "tpm_atml_send: "); - for (i = 0; i < count; i++) { - dev_dbg(&chip->pci_dev->dev, "0x%x(%d) ", buf[i], buf[i]); - outb(buf[i], chip->vendor->base); - } - - return count; -} - -static void tpm_atml_cancel(struct tpm_chip *chip) -{ - outb(ATML_STATUS_ABORT, chip->vendor->base + 1); -} - -static struct file_operations atmel_ops = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .open = tpm_open, - .read = tpm_read, - .write = tpm_write, - .release = tpm_release, -}; - -static struct tpm_vendor_specific tpm_atmel = { - .recv = tpm_atml_recv, - .send = tpm_atml_send, - .cancel = tpm_atml_cancel, - .req_complete_mask = ATML_STATUS_BUSY | ATML_STATUS_DATA_AVAIL, - .req_complete_val = ATML_STATUS_DATA_AVAIL, - .miscdev = { .fops = &atmel_ops, }, -}; - -static int __devinit tpm_atml_init(struct pci_dev *pci_dev, - const struct pci_device_id *pci_id) -{ - u8 version[4]; - int rc = 0; - int lo, hi; - - if (pci_enable_device(pci_dev)) - return -EIO; - - lo = tpm_read_index( TPM_ATMEL_BASE_ADDR_LO ); - hi = tpm_read_index( TPM_ATMEL_BASE_ADDR_HI ); - - tpm_atmel.base = (hi<<8)|lo; - dev_dbg( &pci_dev->dev, "Operating with base: 0x%x\n", tpm_atmel.base); - - /* verify that it is an Atmel part */ - if (tpm_read_index(4) != 
'A' || tpm_read_index(5) != 'T' - || tpm_read_index(6) != 'M' || tpm_read_index(7) != 'L') { - rc = -ENODEV; - goto out_err; - } - - /* query chip for its version number */ - if ((version[0] = tpm_read_index(0x00)) != 0xFF) { - version[1] = tpm_read_index(0x01); - version[2] = tpm_read_index(0x02); - version[3] = tpm_read_index(0x03); - } else { - dev_info(&pci_dev->dev, "version query failed\n"); - rc = -ENODEV; - goto out_err; - } - - if ((rc = tpm_register_hardware(pci_dev, &tpm_atmel)) < 0) - goto out_err; - - dev_info(&pci_dev->dev, - "Atmel TPM version %d.%d.%d.%d\n", version[0], version[1], - version[2], version[3]); - - return 0; -out_err: - pci_disable_device(pci_dev); - return rc; -} - -static struct pci_device_id tpm_pci_tbl[] __devinitdata = { - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_0)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0)}, - {PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_8111_LPC)}, - {0,} -}; - -MODULE_DEVICE_TABLE(pci, tpm_pci_tbl); - -static struct pci_driver atmel_pci_driver = { - .name = "tpm_atmel", - .id_table = tpm_pci_tbl, - .probe = tpm_atml_init, - .remove = __devexit_p(tpm_remove), - .suspend = tpm_pm_suspend, - .resume = tpm_pm_resume, -}; - -static int __init init_atmel(void) -{ - return pci_register_driver(&atmel_pci_driver); -} - -static void __exit cleanup_atmel(void) -{ - pci_unregister_driver(&atmel_pci_driver); -} - -module_init(init_atmel); -module_exit(cleanup_atmel); - -MODULE_AUTHOR("Leendert van Doorn (leendert@xxxxxxxxxxxxxx)"); -MODULE_DESCRIPTION("TPM Driver"); -MODULE_VERSION("2.0"); -MODULE_LICENSE("GPL"); diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/char/tpm/tpm_nsc.c --- a/linux-2.6-xen-sparse/drivers/char/tpm/tpm_nsc.c Thu Sep 22 17:34:14 
2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,377 +0,0 @@ -/* - * Copyright (C) 2004 IBM Corporation - * - * Authors: - * Leendert van Doorn <leendert@xxxxxxxxxxxxxx> - * Dave Safford <safford@xxxxxxxxxxxxxx> - * Reiner Sailer <sailer@xxxxxxxxxxxxxx> - * Kylene Hall <kjhall@xxxxxxxxxx> - * - * Maintained by: <tpmdd_devel@xxxxxxxxxxxxxxxxxxxxx> - * - * Device driver for TCG/TCPA TPM (trusted platform module). - * Specifications at www.trustedcomputinggroup.org - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - * - */ - -#include "tpm.h" - -/* National definitions */ -#define TPM_NSC_BASE 0x360 -#define TPM_NSC_IRQ 0x07 -#define TPM_NSC_BASE0_HI 0x60 -#define TPM_NSC_BASE0_LO 0x61 -#define TPM_NSC_BASE1_HI 0x62 -#define TPM_NSC_BASE1_LO 0x63 - -#define NSC_LDN_INDEX 0x07 -#define NSC_SID_INDEX 0x20 -#define NSC_LDC_INDEX 0x30 -#define NSC_DIO_INDEX 0x60 -#define NSC_CIO_INDEX 0x62 -#define NSC_IRQ_INDEX 0x70 -#define NSC_ITS_INDEX 0x71 - -#define NSC_STATUS 0x01 -#define NSC_COMMAND 0x01 -#define NSC_DATA 0x00 - -/* status bits */ -#define NSC_STATUS_OBF 0x01 /* output buffer full */ -#define NSC_STATUS_IBF 0x02 /* input buffer full */ -#define NSC_STATUS_F0 0x04 /* F0 */ -#define NSC_STATUS_A2 0x08 /* A2 */ -#define NSC_STATUS_RDY 0x10 /* ready to receive command */ -#define NSC_STATUS_IBR 0x20 /* ready to receive data */ - -/* command bits */ -#define NSC_COMMAND_NORMAL 0x01 /* normal mode */ -#define NSC_COMMAND_EOC 0x03 -#define NSC_COMMAND_CANCEL 0x22 - -/* - * Wait for a certain status to appear - */ -static int wait_for_stat(struct tpm_chip *chip, u8 mask, u8 val, u8 * data) -{ - int expired = 0; - struct timer_list status_timer = - TIMER_INITIALIZER(tpm_time_expired, jiffies + 10 * HZ, - (unsigned long) &expired); - - /* status immediately available check */ - *data = inb(chip->vendor->base + 
NSC_STATUS); - if ((*data & mask) == val) - return 0; - - /* wait for status */ - add_timer(&status_timer); - do { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(TPM_TIMEOUT); - *data = inb(chip->vendor->base + 1); - if ((*data & mask) == val) { - del_singleshot_timer_sync(&status_timer); - return 0; - } - } - while (!expired); - - return -EBUSY; -} - -static int nsc_wait_for_ready(struct tpm_chip *chip) -{ - int status; - int expired = 0; - struct timer_list status_timer = - TIMER_INITIALIZER(tpm_time_expired, jiffies + 100, - (unsigned long) &expired); - - /* status immediately available check */ - status = inb(chip->vendor->base + NSC_STATUS); - if (status & NSC_STATUS_OBF) - status = inb(chip->vendor->base + NSC_DATA); - if (status & NSC_STATUS_RDY) - return 0; - - /* wait for status */ - add_timer(&status_timer); - do { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(TPM_TIMEOUT); - status = inb(chip->vendor->base + NSC_STATUS); - if (status & NSC_STATUS_OBF) - status = inb(chip->vendor->base + NSC_DATA); - if (status & NSC_STATUS_RDY) { - del_singleshot_timer_sync(&status_timer); - return 0; - } - } - while (!expired); - - dev_info(&chip->pci_dev->dev, "wait for ready failed\n"); - return -EBUSY; -} - - -static int tpm_nsc_recv(struct tpm_chip *chip, u8 * buf, size_t count) -{ - u8 *buffer = buf; - u8 data, *p; - u32 size; - __be32 *native_size; - - if (count < 6) - return -EIO; - - if (wait_for_stat(chip, NSC_STATUS_F0, NSC_STATUS_F0, &data) < 0) { - dev_err(&chip->pci_dev->dev, "F0 timeout\n"); - return -EIO; - } - if ((data = - inb(chip->vendor->base + NSC_DATA)) != NSC_COMMAND_NORMAL) { - dev_err(&chip->pci_dev->dev, "not in normal mode (0x%x)\n", - data); - return -EIO; - } - - /* read the whole packet */ - for (p = buffer; p < &buffer[count]; p++) { - if (wait_for_stat - (chip, NSC_STATUS_OBF, NSC_STATUS_OBF, &data) < 0) { - dev_err(&chip->pci_dev->dev, - "OBF timeout (while reading data)\n"); - return -EIO; - } - if (data & 
NSC_STATUS_F0) - break; - *p = inb(chip->vendor->base + NSC_DATA); - } - - if ((data & NSC_STATUS_F0) == 0) { - dev_err(&chip->pci_dev->dev, "F0 not set\n"); - return -EIO; - } - if ((data = inb(chip->vendor->base + NSC_DATA)) != NSC_COMMAND_EOC) { - dev_err(&chip->pci_dev->dev, - "expected end of command(0x%x)\n", data); - return -EIO; - } - - native_size = (__force __be32 *) (buf + 2); - size = be32_to_cpu(*native_size); - - if (count < size) - return -EIO; - - return size; -} - -static int tpm_nsc_send(struct tpm_chip *chip, u8 * buf, size_t count) -{ - u8 data; - int i; - - /* - * If we hit the chip with back to back commands it locks up - * and never set IBF. Hitting it with this "hammer" seems to - * fix it. Not sure why this is needed, we followed the flow - * chart in the manual to the letter. - */ - outb(NSC_COMMAND_CANCEL, chip->vendor->base + NSC_COMMAND); - - if (nsc_wait_for_ready(chip) != 0) - return -EIO; - - if (wait_for_stat(chip, NSC_STATUS_IBF, 0, &data) < 0) { - dev_err(&chip->pci_dev->dev, "IBF timeout\n"); - return -EIO; - } - - outb(NSC_COMMAND_NORMAL, chip->vendor->base + NSC_COMMAND); - if (wait_for_stat(chip, NSC_STATUS_IBR, NSC_STATUS_IBR, &data) < 0) { - dev_err(&chip->pci_dev->dev, "IBR timeout\n"); - return -EIO; - } - - for (i = 0; i < count; i++) { - if (wait_for_stat(chip, NSC_STATUS_IBF, 0, &data) < 0) { - dev_err(&chip->pci_dev->dev, - "IBF timeout (while writing data)\n"); - return -EIO; - } - outb(buf[i], chip->vendor->base + NSC_DATA); - } - - if (wait_for_stat(chip, NSC_STATUS_IBF, 0, &data) < 0) { - dev_err(&chip->pci_dev->dev, "IBF timeout\n"); - return -EIO; - } - outb(NSC_COMMAND_EOC, chip->vendor->base + NSC_COMMAND); - - return count; -} - -static void tpm_nsc_cancel(struct tpm_chip *chip) -{ - outb(NSC_COMMAND_CANCEL, chip->vendor->base + NSC_COMMAND); -} - -static struct file_operations nsc_ops = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .open = tpm_open, - .read = tpm_read, - .write = tpm_write, - .release = 
tpm_release, -}; - -static struct tpm_vendor_specific tpm_nsc = { - .recv = tpm_nsc_recv, - .send = tpm_nsc_send, - .cancel = tpm_nsc_cancel, - .req_complete_mask = NSC_STATUS_OBF, - .req_complete_val = NSC_STATUS_OBF, - .miscdev = { .fops = &nsc_ops, }, - -}; - -static int __devinit tpm_nsc_init(struct pci_dev *pci_dev, - const struct pci_device_id *pci_id) -{ - int rc = 0; - int lo, hi; - - hi = tpm_read_index(TPM_NSC_BASE0_HI); - lo = tpm_read_index(TPM_NSC_BASE0_LO); - - tpm_nsc.base = (hi<<8) | lo; - - if (pci_enable_device(pci_dev)) - return -EIO; - - /* verify that it is a National part (SID) */ - if (tpm_read_index(NSC_SID_INDEX) != 0xEF) { - rc = -ENODEV; - goto out_err; - } - - dev_dbg(&pci_dev->dev, "NSC TPM detected\n"); - dev_dbg(&pci_dev->dev, - "NSC LDN 0x%x, SID 0x%x, SRID 0x%x\n", - tpm_read_index(0x07), tpm_read_index(0x20), - tpm_read_index(0x27)); - dev_dbg(&pci_dev->dev, - "NSC SIOCF1 0x%x SIOCF5 0x%x SIOCF6 0x%x SIOCF8 0x%x\n", - tpm_read_index(0x21), tpm_read_index(0x25), - tpm_read_index(0x26), tpm_read_index(0x28)); - dev_dbg(&pci_dev->dev, "NSC IO Base0 0x%x\n", - (tpm_read_index(0x60) << 8) | tpm_read_index(0x61)); - dev_dbg(&pci_dev->dev, "NSC IO Base1 0x%x\n", - (tpm_read_index(0x62) << 8) | tpm_read_index(0x63)); - dev_dbg(&pci_dev->dev, "NSC Interrupt number and wakeup 0x%x\n", - tpm_read_index(0x70)); - dev_dbg(&pci_dev->dev, "NSC IRQ type select 0x%x\n", - tpm_read_index(0x71)); - dev_dbg(&pci_dev->dev, - "NSC DMA channel select0 0x%x, select1 0x%x\n", - tpm_read_index(0x74), tpm_read_index(0x75)); - dev_dbg(&pci_dev->dev, - "NSC Config " - "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", - tpm_read_index(0xF0), tpm_read_index(0xF1), - tpm_read_index(0xF2), tpm_read_index(0xF3), - tpm_read_index(0xF4), tpm_read_index(0xF5), - tpm_read_index(0xF6), tpm_read_index(0xF7), - tpm_read_index(0xF8), tpm_read_index(0xF9)); - - dev_info(&pci_dev->dev, - "NSC PC21100 TPM revision %d\n", - tpm_read_index(0x27) & 0x1F); - - if 
(tpm_read_index(NSC_LDC_INDEX) == 0) - dev_info(&pci_dev->dev, ": NSC TPM not active\n"); - - /* select PM channel 1 */ - tpm_write_index(NSC_LDN_INDEX, 0x12); - tpm_read_index(NSC_LDN_INDEX); - - /* disable the DPM module */ - tpm_write_index(NSC_LDC_INDEX, 0); - tpm_read_index(NSC_LDC_INDEX); - - /* set the data register base addresses */ - tpm_write_index(NSC_DIO_INDEX, TPM_NSC_BASE >> 8); - tpm_write_index(NSC_DIO_INDEX + 1, TPM_NSC_BASE); - tpm_read_index(NSC_DIO_INDEX); - tpm_read_index(NSC_DIO_INDEX + 1); - - /* set the command register base addresses */ - tpm_write_index(NSC_CIO_INDEX, (TPM_NSC_BASE + 1) >> 8); - tpm_write_index(NSC_CIO_INDEX + 1, (TPM_NSC_BASE + 1)); - tpm_read_index(NSC_DIO_INDEX); - tpm_read_index(NSC_DIO_INDEX + 1); - - /* set the interrupt number to be used for the host interface */ - tpm_write_index(NSC_IRQ_INDEX, TPM_NSC_IRQ); - tpm_write_index(NSC_ITS_INDEX, 0x00); - tpm_read_index(NSC_IRQ_INDEX); - - /* enable the DPM module */ - tpm_write_index(NSC_LDC_INDEX, 0x01); - tpm_read_index(NSC_LDC_INDEX); - - if ((rc = tpm_register_hardware(pci_dev, &tpm_nsc)) < 0) - goto out_err; - - return 0; - -out_err: - pci_disable_device(pci_dev); - return rc; -} - -static struct pci_device_id tpm_pci_tbl[] __devinitdata = { - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_0)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0)}, - {PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_8111_LPC)}, - {0,} -}; - -MODULE_DEVICE_TABLE(pci, tpm_pci_tbl); - -static struct pci_driver nsc_pci_driver = { - .name = "tpm_nsc", - .id_table = tpm_pci_tbl, - .probe = tpm_nsc_init, - .remove = __devexit_p(tpm_remove), - .suspend = tpm_pm_suspend, - .resume = tpm_pm_resume, -}; - -static int __init init_nsc(void) -{ - return 
pci_register_driver(&nsc_pci_driver); -} - -static void __exit cleanup_nsc(void) -{ - pci_unregister_driver(&nsc_pci_driver); -} - -module_init(init_nsc); -module_exit(cleanup_nsc); - -MODULE_AUTHOR("Leendert van Doorn (leendert@xxxxxxxxxxxxxx)"); -MODULE_DESCRIPTION("TPM Driver"); -MODULE_VERSION("2.0"); -MODULE_LICENSE("GPL"); diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h Thu Sep 22 17:34:14 2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,229 +0,0 @@ -/* - * blktap.h - * - * Interfaces for the Xen block tap driver. - * - * (c) 2004, Andrew Warfield, University of Cambridge - * - */ - -#ifndef __BLKTAP_H__ -#define __BLKTAP_H__ - -#include <linux/version.h> -#include <linux/blkdev.h> -#include <linux/config.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/blkdev.h> -#include <asm/io.h> -#include <asm/setup.h> -#include <asm/pgalloc.h> -#include <asm/hypervisor.h> -#include <asm-xen/xen-public/io/blkif.h> -#include <asm-xen/xen-public/io/ring.h> - -/* Used to signal to the backend that this is a tap domain. */ -#define BLKTAP_COOKIE 0xbeadfeed - -/* -------[ debug / pretty printing ]--------------------------------- */ - -#define PRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ - __FILE__ , __LINE__ , ## _a ) -#if 0 -#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ - __FILE__ , __LINE__ , ## _a ) -#else -#define DPRINTK(_f, _a...) ((void)0) -#endif - -#if 1 -#define ASSERT(_p) \ - if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ - __LINE__, __FILE__); *(int*)0=0; } -#else -#define ASSERT(_p) ((void)0) -#endif - -#define WPRINTK(fmt, args...) 
printk(KERN_WARNING "blk_tap: " fmt, ##args) - - -/* -------[ state descriptors ]--------------------------------------- */ - -#define BLKIF_STATE_CLOSED 0 -#define BLKIF_STATE_DISCONNECTED 1 -#define BLKIF_STATE_CONNECTED 2 - -/* -------[ connection tracking ]------------------------------------- */ - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#define VMALLOC_VMADDR(x) ((unsigned long)(x)) -#endif - -extern spinlock_t blkif_io_lock; - -typedef struct blkif_st { - /* Unique identifier for this interface. */ - domid_t domid; - unsigned int handle; - /* Physical parameters of the comms window. */ - unsigned long shmem_frame; - unsigned int evtchn; - /* Comms information. */ - blkif_back_ring_t blk_ring; - - enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; - /* - * DISCONNECT response is deferred until pending requests are ack'ed. - * We therefore need to store the id from the original request. - */ - u8 disconnect_rspid; - struct blkif_st *hash_next; - struct list_head blkdev_list; - spinlock_t blk_ring_lock; - atomic_t refcnt; - struct work_struct work; -#ifdef CONFIG_XEN_BLKDEV_GRANT - u16 shmem_handle; - unsigned long shmem_vaddr; - grant_ref_t shmem_ref; -#endif -} blkif_t; - -blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); -void blkif_disconnect_complete(blkif_t *blkif); -#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) -#define blkif_put(_b) \ - do { \ - if ( atomic_dec_and_test(&(_b)->refcnt) ) \ - blkif_disconnect_complete(_b); \ - } while (0) - - -/* -------[ active request tracking ]--------------------------------- */ - -typedef struct { - blkif_t *blkif; - unsigned long id; - int nr_pages; - int next_free; -} active_req_t; - -typedef unsigned int ACTIVE_RING_IDX; - -active_req_t *lookup_active_req(ACTIVE_RING_IDX idx); - -extern inline unsigned int ID_TO_IDX(unsigned long id) -{ - return ( id & 0x0000ffff ); -} - -extern inline domid_t ID_TO_DOM(unsigned long id) -{ - return (id >> 16); -} - -void active_reqs_init(void); 
- -/* -------[ interposition -> character device interface ]------------- */ - -/* /dev/xen/blktap resides at device number major=10, minor=200 */ -#define BLKTAP_MINOR 202 - -/* size of the extra VMA area to map in attached pages. */ -#define BLKTAP_VMA_PAGES BLKIF_RING_SIZE - -/* blktap IOCTLs: */ -#define BLKTAP_IOCTL_KICK_FE 1 -#define BLKTAP_IOCTL_KICK_BE 2 -#define BLKTAP_IOCTL_SETMODE 3 -#define BLKTAP_IOCTL_PRINT_IDXS 100 - -/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */ -#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */ -#define BLKTAP_MODE_INTERCEPT_FE 0x00000001 -#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 -#define BLKTAP_MODE_COPY_FE 0x00000004 -#define BLKTAP_MODE_COPY_BE 0x00000008 -#define BLKTAP_MODE_COPY_FE_PAGES 0x00000010 -#define BLKTAP_MODE_COPY_BE_PAGES 0x00000020 - -#define BLKTAP_MODE_INTERPOSE \ - (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE) - -#define BLKTAP_MODE_COPY_BOTH \ - (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE) - -#define BLKTAP_MODE_COPY_BOTH_PAGES \ - (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES) - -static inline int BLKTAP_MODE_VALID(unsigned long arg) -{ - return ( - ( arg == BLKTAP_MODE_PASSTHROUGH ) || - ( arg == BLKTAP_MODE_INTERCEPT_FE ) || - ( arg == BLKTAP_MODE_INTERCEPT_BE ) || - ( arg == BLKTAP_MODE_INTERPOSE ) || - ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) || - ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) || - ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH ) - ); -} - - - -/* -------[ Mappings to User VMA ]------------------------------------ */ -#define BATCH_PER_DOMAIN 16 - -/* -------[ Here be globals ]----------------------------------------- */ -extern unsigned long blktap_mode; - -/* Connection to a single backend domain. */ -extern blkif_front_ring_t blktap_be_ring; -extern unsigned int blktap_be_evtchn; -extern unsigned int blktap_be_state; - -/* User ring status. 
*/ -extern unsigned long blktap_ring_ok; - -/* -------[ ...and function prototypes. ]----------------------------- */ - -/* init function for character device interface. */ -int blktap_init(void); - -/* init function for the blkif cache. */ -void __init blkif_interface_init(void); -void __init blkdev_schedule_init(void); -void blkif_deschedule(blkif_t *blkif); - -/* interfaces to the char driver, passing messages to and from apps. */ -void blktap_kick_user(void); - -/* user ring access functions: */ -int blktap_write_fe_ring(blkif_request_t *req); -int blktap_write_be_ring(blkif_response_t *rsp); -int blktap_write_ctrl_ring(ctrl_msg_t *msg); - -/* fe/be ring access functions: */ -int write_resp_to_fe_ring(blkif_t *blkif, blkif_response_t *rsp); -int write_req_to_be_ring(blkif_request_t *req); - -/* event notification functions */ -void kick_fe_domain(blkif_t *blkif); -void kick_be_domain(void); - -/* Interrupt handlers. */ -irqreturn_t blkif_ptbe_int(int irq, void *dev_id, - struct pt_regs *ptregs); -irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs); - -/* Control message receiver. 
*/ -extern void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id); - -/* debug */ -void print_fe_ring_idxs(void); -void print_be_ring_idxs(void); - -#define __BLKINT_H__ -#endif diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/usbback/common.h --- a/linux-2.6-xen-sparse/drivers/xen/usbback/common.h Thu Sep 22 17:34:14 2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,83 +0,0 @@ - -#ifndef __USBIF__BACKEND__COMMON_H__ -#define __USBIF__BACKEND__COMMON_H__ - -#include <linux/config.h> -#include <linux/version.h> -#include <linux/module.h> -#include <linux/rbtree.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/blkdev.h> -#include <asm/io.h> -#include <asm/setup.h> -#include <asm/pgalloc.h> -#include <asm/hypervisor.h> - -#include <asm-xen/xen-public/io/usbif.h> - -#if 0 -#define ASSERT(_p) \ - if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ - __LINE__, __FILE__); *(int*)0=0; } -#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ - __FILE__ , __LINE__ , ## _a ) -#else -#define ASSERT(_p) ((void)0) -#define DPRINTK(_f, _a...) ((void)0) -#endif - -typedef struct usbif_priv_st usbif_priv_t; - -struct usbif_priv_st { - /* Unique identifier for this interface. */ - domid_t domid; - unsigned int handle; - /* Physical parameters of the comms window. */ - unsigned long shmem_frame; - unsigned int evtchn; - /* Comms Information */ - usbif_back_ring_t usb_ring; - /* Private fields. */ - enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; - /* - * DISCONNECT response is deferred until pending requests are ack'ed. - * We therefore need to store the id from the original request. 
- */ - u8 disconnect_rspid; - usbif_priv_t *hash_next; - struct list_head usbif_list; - spinlock_t usb_ring_lock; - atomic_t refcnt; - - struct work_struct work; -}; - -void usbif_create(usbif_be_create_t *create); -void usbif_destroy(usbif_be_destroy_t *destroy); -void usbif_connect(usbif_be_connect_t *connect); -int usbif_disconnect(usbif_be_disconnect_t *disconnect, u8 rsp_id); -void usbif_disconnect_complete(usbif_priv_t *up); - -void usbif_release_port(usbif_be_release_port_t *msg); -int usbif_claim_port(usbif_be_claim_port_t *msg); -void usbif_release_ports(usbif_priv_t *up); - -usbif_priv_t *usbif_find(domid_t domid); -#define usbif_get(_b) (atomic_inc(&(_b)->refcnt)) -#define usbif_put(_b) \ - do { \ - if ( atomic_dec_and_test(&(_b)->refcnt) ) \ - usbif_disconnect_complete(_b); \ - } while (0) - - -void usbif_interface_init(void); -void usbif_ctrlif_init(void); - -void usbif_deschedule(usbif_priv_t *up); -void remove_from_usbif_list(usbif_priv_t *up); - -irqreturn_t usbif_be_int(int irq, void *dev_id, struct pt_regs *regs); - -#endif /* __USBIF__BACKEND__COMMON_H__ */ diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/usbback/control.c --- a/linux-2.6-xen-sparse/drivers/xen/usbback/control.c Thu Sep 22 17:34:14 2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,61 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/usbif/backend/control.c - * - * Routines for interfacing with the control plane. 
- * - * Copyright (c) 2004, Keir Fraser - */ - -#include "common.h" - -static void usbif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - DPRINTK("Received usbif backend message, subtype=%d\n", msg->subtype); - - switch ( msg->subtype ) - { - case CMSG_USBIF_BE_CREATE: - usbif_create((usbif_be_create_t *)&msg->msg[0]); - break; - case CMSG_USBIF_BE_DESTROY: - usbif_destroy((usbif_be_destroy_t *)&msg->msg[0]); - break; - case CMSG_USBIF_BE_CONNECT: - usbif_connect((usbif_be_connect_t *)&msg->msg[0]); - break; - case CMSG_USBIF_BE_DISCONNECT: - if ( !usbif_disconnect((usbif_be_disconnect_t *)&msg->msg[0],msg->id) ) - return; /* Sending the response is deferred until later. */ - break; - case CMSG_USBIF_BE_CLAIM_PORT: - usbif_claim_port((usbif_be_claim_port_t *)&msg->msg[0]); - break; - case CMSG_USBIF_BE_RELEASE_PORT: - usbif_release_port((usbif_be_release_port_t *)&msg->msg[0]); - break; - default: - DPRINTK("Parse error while reading message subtype %d, len %d\n", - msg->subtype, msg->length); - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - -void usbif_ctrlif_init(void) -{ - ctrl_msg_t cmsg; - usbif_be_driver_status_changed_t st; - - (void)ctrl_if_register_receiver(CMSG_USBIF_BE, usbif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - - /* Send a driver-UP notification to the domain controller. 
*/ - cmsg.type = CMSG_USBIF_BE; - cmsg.subtype = CMSG_USBIF_BE_DRIVER_STATUS_CHANGED; - cmsg.length = sizeof(usbif_be_driver_status_changed_t); - st.status = USBIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &st, sizeof(st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/usbback/interface.c --- a/linux-2.6-xen-sparse/drivers/xen/usbback/interface.c Thu Sep 22 17:34:14 2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,242 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/usbif/backend/interface.c - * - * USB device interface management. - * - * by Mark Williamson, Copyright (c) 2004 - */ - -#include "common.h" - -#define USBIF_HASHSZ 1024 -#define USBIF_HASH(_d) (((int)(_d))&(USBIF_HASHSZ-1)) - -static kmem_cache_t *usbif_priv_cachep; -static usbif_priv_t *usbif_priv_hash[USBIF_HASHSZ]; - -usbif_priv_t *usbif_find(domid_t domid) -{ - usbif_priv_t *up = usbif_priv_hash[USBIF_HASH(domid)]; - while ( (up != NULL ) && ( up->domid != domid ) ) - up = up->hash_next; - return up; -} - -static void __usbif_disconnect_complete(void *arg) -{ - usbif_priv_t *usbif = (usbif_priv_t *)arg; - ctrl_msg_t cmsg; - usbif_be_disconnect_t disc; - - /* - * These can't be done in usbif_disconnect() because at that point there - * may be outstanding requests at the device whose asynchronous responses - * must still be notified to the remote driver. - */ - vfree(usbif->usb_ring.sring); - - /* Construct the deferred response message. 
*/ - cmsg.type = CMSG_USBIF_BE; - cmsg.subtype = CMSG_USBIF_BE_DISCONNECT; - cmsg.id = usbif->disconnect_rspid; - cmsg.length = sizeof(usbif_be_disconnect_t); - disc.domid = usbif->domid; - disc.status = USBIF_BE_STATUS_OKAY; - memcpy(cmsg.msg, &disc, sizeof(disc)); - - /* - * Make sure message is constructed /before/ status change, because - * after the status change the 'usbif' structure could be deallocated at - * any time. Also make sure we send the response /after/ status change, - * as otherwise a subsequent CONNECT request could spuriously fail if - * another CPU doesn't see the status change yet. - */ - mb(); - if ( usbif->status != DISCONNECTING ) - BUG(); - usbif->status = DISCONNECTED; - mb(); - - /* Send the successful response. */ - ctrl_if_send_response(&cmsg); -} - -void usbif_disconnect_complete(usbif_priv_t *up) -{ - INIT_WORK(&up->work, __usbif_disconnect_complete, (void *)up); - schedule_work(&up->work); -} - -void usbif_create(usbif_be_create_t *create) -{ - domid_t domid = create->domid; - usbif_priv_t **pup, *up; - - if ( (up = kmem_cache_alloc(usbif_priv_cachep, GFP_KERNEL)) == NULL ) - { - DPRINTK("Could not create usbif: out of memory\n"); - create->status = USBIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - - memset(up, 0, sizeof(*up)); - up->domid = domid; - up->status = DISCONNECTED; - spin_lock_init(&up->usb_ring_lock); - atomic_set(&up->refcnt, 0); - - pup = &usbif_priv_hash[USBIF_HASH(domid)]; - while ( *pup != NULL ) - { - if ( (*pup)->domid == domid ) - { - create->status = USBIF_BE_STATUS_INTERFACE_EXISTS; - kmem_cache_free(usbif_priv_cachep, up); - return; - } - pup = &(*pup)->hash_next; - } - - up->hash_next = *pup; - *pup = up; - - create->status = USBIF_BE_STATUS_OKAY; -} - -void usbif_destroy(usbif_be_destroy_t *destroy) -{ - domid_t domid = destroy->domid; - usbif_priv_t **pup, *up; - - pup = &usbif_priv_hash[USBIF_HASH(domid)]; - while ( (up = *pup) != NULL ) - { - if ( up->domid == domid ) - { - if ( up->status != DISCONNECTED 
) - goto still_connected; - goto destroy; - } - pup = &up->hash_next; - } - - destroy->status = USBIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - - still_connected: - destroy->status = USBIF_BE_STATUS_INTERFACE_CONNECTED; - return; - - destroy: - *pup = up->hash_next; - usbif_release_ports(up); - kmem_cache_free(usbif_priv_cachep, up); - destroy->status = USBIF_BE_STATUS_OKAY; -} - -void usbif_connect(usbif_be_connect_t *connect) -{ - domid_t domid = connect->domid; - unsigned int evtchn = connect->evtchn; - unsigned long shmem_frame = connect->shmem_frame; - struct vm_struct *vma; - pgprot_t prot; - int error; - usbif_priv_t *up; - usbif_sring_t *sring; - - up = usbif_find(domid); - if ( unlikely(up == NULL) ) - { - DPRINTK("usbif_connect attempted for non-existent usbif (%u)\n", - connect->domid); - connect->status = USBIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) - { - connect->status = USBIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - - prot = __pgprot(_KERNPG_TABLE); - error = direct_remap_pfn_range(&init_mm, VMALLOC_VMADDR(vma->addr), - shmem_frame, PAGE_SIZE, - prot, domid); - if ( error != 0 ) - { - if ( error == -ENOMEM ) - connect->status = USBIF_BE_STATUS_OUT_OF_MEMORY; - else if ( error == -EFAULT ) - connect->status = USBIF_BE_STATUS_MAPPING_ERROR; - else - connect->status = USBIF_BE_STATUS_ERROR; - vfree(vma->addr); - return; - } - - if ( up->status != DISCONNECTED ) - { - connect->status = USBIF_BE_STATUS_INTERFACE_CONNECTED; - vfree(vma->addr); - return; - } - - sring = (usbif_sring_t *)vma->addr; - SHARED_RING_INIT(sring); - BACK_RING_INIT(&up->usb_ring, sring, PAGE_SIZE); - - up->evtchn = evtchn; - up->shmem_frame = shmem_frame; - up->status = CONNECTED; - usbif_get(up); - - (void)bind_evtchn_to_irqhandler( - evtchn, usbif_be_int, 0, "usbif-backend", up); - - connect->status = USBIF_BE_STATUS_OKAY; -} - -/* Remove URBs for this interface before destroying it. 
*/ -void usbif_deschedule(usbif_priv_t *up) -{ - remove_from_usbif_list(up); -} - -int usbif_disconnect(usbif_be_disconnect_t *disconnect, u8 rsp_id) -{ - domid_t domid = disconnect->domid; - usbif_priv_t *up; - - up = usbif_find(domid); - if ( unlikely(up == NULL) ) - { - DPRINTK("usbif_disconnect attempted for non-existent usbif" - " (%u)\n", disconnect->domid); - disconnect->status = USBIF_BE_STATUS_INTERFACE_NOT_FOUND; - return 1; /* Caller will send response error message. */ - } - - if ( up->status == CONNECTED ) - { - up->status = DISCONNECTING; - up->disconnect_rspid = rsp_id; - wmb(); /* Let other CPUs see the status change. */ - unbind_evtchn_from_irqhandler(up->evtchn, up); - usbif_deschedule(up); - usbif_put(up); - return 0; /* Caller should not send response message. */ - } - - disconnect->status = USBIF_BE_STATUS_OKAY; - return 1; -} - -void __init usbif_interface_init(void) -{ - usbif_priv_cachep = kmem_cache_create("usbif_priv_cache", - sizeof(usbif_priv_t), - 0, 0, NULL, NULL); - memset(usbif_priv_hash, 0, sizeof(usbif_priv_hash)); -} diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/usbback/usbback.c --- a/linux-2.6-xen-sparse/drivers/xen/usbback/usbback.c Thu Sep 22 17:34:14 2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,1068 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/usbif/backend/main.c - * - * Backend for the Xen virtual USB driver - provides an abstraction of a - * USB host controller to the corresponding frontend driver. - * - * by Mark Williamson - * Copyright (c) 2004 Intel Research Cambridge - * Copyright (c) 2004, 2005 Mark Williamson - * - * Based on arch/xen/drivers/blkif/backend/main.c - * Copyright (c) 2003-2004, Keir Fraser & Steve Hand - */ - -#include "common.h" - - -#include <linux/list.h> -#include <linux/usb.h> -#include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/tqueue.h> - -/* - * This is rather arbitrary. 
- */ -#define MAX_PENDING_REQS 4 -#define BATCH_PER_DOMAIN 1 - -static unsigned long mmap_vstart; - -/* Needs to be sufficiently large that we can map the (large) buffers - * the USB mass storage driver wants. */ -#define MMAP_PAGES_PER_REQUEST \ - (128) -#define MMAP_PAGES \ - (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) - -#define MMAP_VADDR(_req,_seg) \ - (mmap_vstart + \ - ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ - ((_seg) * PAGE_SIZE)) - - -static spinlock_t owned_ports_lock; -LIST_HEAD(owned_ports); - -/* A list of these structures is used to track ownership of physical USB - * ports. */ -typedef struct -{ - usbif_priv_t *usbif_priv; - char path[16]; - int guest_port; - int enabled; - struct list_head list; - unsigned long guest_address; /* The USB device address that has been - * assigned by the guest. */ - int dev_present; /* Is there a device present? */ - struct usb_device * dev; - unsigned long ifaces; /* What interfaces are present on this device? */ -} owned_port_t; - - -/* - * Each outstanding request that we've passed to the lower device layers has a - * 'pending_req' allocated to it. The request is complete, the specified - * domain has a response queued for it, with the saved 'id' passed back. - */ -typedef struct { - usbif_priv_t *usbif_priv; - unsigned long id; - int nr_pages; - unsigned short operation; - int status; -} pending_req_t; - -/* - * We can't allocate pending_req's in order, since they may complete out of - * order. We therefore maintain an allocation ring. This ring also indicates - * when enough work has been passed down -- at that point the allocation ring - * will be empty. - */ -static pending_req_t pending_reqs[MAX_PENDING_REQS]; -static unsigned char pending_ring[MAX_PENDING_REQS]; -static spinlock_t pend_prod_lock; - -/* NB. We use a different index type to differentiate from shared usb rings. 
*/ -typedef unsigned int PEND_RING_IDX; -#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) -static PEND_RING_IDX pending_prod, pending_cons; -#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) - -static int do_usb_io_op(usbif_priv_t *usbif, int max_to_do); -static void make_response(usbif_priv_t *usbif, unsigned long id, - unsigned short op, int st, int inband, - unsigned long actual_length); -static void dispatch_usb_probe(usbif_priv_t *up, unsigned long id, unsigned long port); -static void dispatch_usb_io(usbif_priv_t *up, usbif_request_t *req); -static void dispatch_usb_reset(usbif_priv_t *up, unsigned long portid); -static owned_port_t *usbif_find_port(char *); - -/****************************************************************** - * PRIVATE DEBUG FUNCTIONS - */ - -#undef DEBUG -#ifdef DEBUG - -static void dump_port(owned_port_t *p) -{ - printk(KERN_DEBUG "owned_port_t @ %p\n" - " usbif_priv @ %p\n" - " path: %s\n" - " guest_port: %d\n" - " guest_address: %ld\n" - " dev_present: %d\n" - " dev @ %p\n" - " ifaces: 0x%lx\n", - p, p->usbif_priv, p->path, p->guest_port, p->guest_address, - p->dev_present, p->dev, p->ifaces); -} - - -static void dump_request(usbif_request_t *req) -{ - printk(KERN_DEBUG "id = 0x%lx\n" - "devnum %d\n" - "endpoint 0x%x\n" - "direction %d\n" - "speed %d\n" - "pipe_type 0x%x\n" - "transfer_buffer 0x%lx\n" - "length 0x%lx\n" - "transfer_flags 0x%lx\n" - "setup = { 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x }\n" - "iso_schedule = 0x%lx\n" - "num_iso %ld\n", - req->id, req->devnum, req->endpoint, req->direction, req->speed, - req->pipe_type, req->transfer_buffer, req->length, - req->transfer_flags, req->setup[0], req->setup[1], req->setup[2], - req->setup[3], req->setup[4], req->setup[5], req->setup[6], - req->setup[7], req->iso_schedule, req->num_iso); -} - -static void dump_urb(struct urb *urb) -{ - printk(KERN_DEBUG "dumping urb @ %p\n", urb); - -#define DUMP_URB_FIELD(name, format) \ - printk(KERN_DEBUG " 
" # name " " format "\n", urb-> name) - - DUMP_URB_FIELD(pipe, "0x%x"); - DUMP_URB_FIELD(status, "%d"); - DUMP_URB_FIELD(transfer_flags, "0x%x"); - DUMP_URB_FIELD(transfer_buffer, "%p"); - DUMP_URB_FIELD(transfer_buffer_length, "%d"); - DUMP_URB_FIELD(actual_length, "%d"); -} - -static void dump_response(usbif_response_t *resp) -{ - printk(KERN_DEBUG "usbback: Sending response:\n" - " id = 0x%x\n" - " op = %d\n" - " status = %d\n" - " data = %d\n" - " length = %d\n", - resp->id, resp->op, resp->status, resp->data, resp->length); -} - -#else /* DEBUG */ - -#define dump_port(blah) ((void)0) -#define dump_request(blah) ((void)0) -#define dump_urb(blah) ((void)0) -#define dump_response(blah) ((void)0) - -#endif /* DEBUG */ - -/****************************************************************** - * MEMORY MANAGEMENT - */ - -static void fast_flush_area(int idx, int nr_pages) -{ - multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST]; - int i; - - for ( i = 0; i < nr_pages; i++ ) - { - MULTI_update_va_mapping(mcl+i, MMAP_VADDR(idx, i), - __pte(0), 0); - } - - mcl[nr_pages-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; - if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) ) - BUG(); -} - - -/****************************************************************** - * USB INTERFACE SCHEDULER LIST MAINTENANCE - */ - -static struct list_head usbio_schedule_list; -static spinlock_t usbio_schedule_list_lock; - -static int __on_usbif_list(usbif_priv_t *up) -{ - return up->usbif_list.next != NULL; -} - -void remove_from_usbif_list(usbif_priv_t *up) -{ - unsigned long flags; - if ( !__on_usbif_list(up) ) return; - spin_lock_irqsave(&usbio_schedule_list_lock, flags); - if ( __on_usbif_list(up) ) - { - list_del(&up->usbif_list); - up->usbif_list.next = NULL; - usbif_put(up); - } - spin_unlock_irqrestore(&usbio_schedule_list_lock, flags); -} - -static void add_to_usbif_list_tail(usbif_priv_t *up) -{ - unsigned long flags; - if ( __on_usbif_list(up) ) return; - 
spin_lock_irqsave(&usbio_schedule_list_lock, flags); - if ( !__on_usbif_list(up) && (up->status == CONNECTED) ) - { - list_add_tail(&up->usbif_list, &usbio_schedule_list); - usbif_get(up); - } - spin_unlock_irqrestore(&usbio_schedule_list_lock, flags); -} - -void free_pending(int pending_idx) -{ - unsigned long flags; - - /* Free the pending request. */ - spin_lock_irqsave(&pend_prod_lock, flags); - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - spin_unlock_irqrestore(&pend_prod_lock, flags); -} - -/****************************************************************** - * COMPLETION CALLBACK -- Called as urb->complete() - */ - -static void maybe_trigger_usbio_schedule(void); - -static void __end_usb_io_op(struct urb *purb) -{ - pending_req_t *pending_req; - int pending_idx; - - pending_req = purb->context; - - pending_idx = pending_req - pending_reqs; - - ASSERT(purb->actual_length <= purb->transfer_buffer_length); - ASSERT(purb->actual_length <= pending_req->nr_pages * PAGE_SIZE); - - /* An error fails the entire request. */ - if ( purb->status ) - { - printk(KERN_WARNING "URB @ %p failed. Status %d\n", purb, purb->status); - } - - if ( usb_pipetype(purb->pipe) == 0 ) - { - int i; - usbif_iso_t *sched = (usbif_iso_t *)MMAP_VADDR(pending_idx, pending_req->nr_pages - 1); - - /* If we're dealing with an iso pipe, we need to copy back the schedule. 
*/ - for ( i = 0; i < purb->number_of_packets; i++ ) - { - sched[i].length = purb->iso_frame_desc[i].actual_length; - ASSERT(sched[i].buffer_offset == - purb->iso_frame_desc[i].offset); - sched[i].status = purb->iso_frame_desc[i].status; - } - } - - fast_flush_area(pending_req - pending_reqs, pending_req->nr_pages); - - kfree(purb->setup_packet); - - make_response(pending_req->usbif_priv, pending_req->id, - pending_req->operation, pending_req->status, 0, purb->actual_length); - usbif_put(pending_req->usbif_priv); - - usb_free_urb(purb); - - free_pending(pending_idx); - - rmb(); - - /* Check for anything still waiting in the rings, having freed a request... */ - maybe_trigger_usbio_schedule(); -} - -/****************************************************************** - * SCHEDULER FUNCTIONS - */ - -static DECLARE_WAIT_QUEUE_HEAD(usbio_schedule_wait); - -static int usbio_schedule(void *arg) -{ - DECLARE_WAITQUEUE(wq, current); - - usbif_priv_t *up; - struct list_head *ent; - - daemonize(); - - for ( ; ; ) - { - /* Wait for work to do. */ - add_wait_queue(&usbio_schedule_wait, &wq); - set_current_state(TASK_INTERRUPTIBLE); - if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || - list_empty(&usbio_schedule_list) ) - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&usbio_schedule_wait, &wq); - - /* Queue up a batch of requests. */ - while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && - !list_empty(&usbio_schedule_list) ) - { - ent = usbio_schedule_list.next; - up = list_entry(ent, usbif_priv_t, usbif_list); - usbif_get(up); - remove_from_usbif_list(up); - if ( do_usb_io_op(up, BATCH_PER_DOMAIN) ) - add_to_usbif_list_tail(up); - usbif_put(up); - } - } -} - -static void maybe_trigger_usbio_schedule(void) -{ - /* - * Needed so that two processes, who together make the following predicate - * true, don't both read stale values and evaluate the predicate - * incorrectly. Incredibly unlikely to stall the scheduler on x86, but... 
- */ - smp_mb(); - - if ( !list_empty(&usbio_schedule_list) ) - wake_up(&usbio_schedule_wait); -} - - -/****************************************************************************** - * NOTIFICATION FROM GUEST OS. - */ - -irqreturn_t usbif_be_int(int irq, void *dev_id, struct pt_regs *regs) -{ - usbif_priv_t *up = dev_id; - - smp_mb(); - - add_to_usbif_list_tail(up); - - /* Will in fact /always/ trigger an io schedule in this case. */ - maybe_trigger_usbio_schedule(); - - return IRQ_HANDLED; -} - - - -/****************************************************************** - * DOWNWARD CALLS -- These interface with the usb-device layer proper. - */ - -static int do_usb_io_op(usbif_priv_t *up, int max_to_do) -{ - usbif_back_ring_t *usb_ring = &up->usb_ring; - usbif_request_t *req; - RING_IDX i, rp; - int more_to_do = 0; - - rp = usb_ring->sring->req_prod; - rmb(); /* Ensure we see queued requests up to 'rp'. */ - - /* Take items off the comms ring, taking care not to overflow. */ - for ( i = usb_ring->req_cons; - (i != rp) && !RING_REQUEST_CONS_OVERFLOW(usb_ring, i); - i++ ) - { - if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) ) - { - more_to_do = 1; - break; - } - - req = RING_GET_REQUEST(usb_ring, i); - - switch ( req->operation ) - { - case USBIF_OP_PROBE: - dispatch_usb_probe(up, req->id, req->port); - break; - - case USBIF_OP_IO: - /* Assemble an appropriate URB. 
*/ - dispatch_usb_io(up, req); - break; - - case USBIF_OP_RESET: - dispatch_usb_reset(up, req->port); - break; - - default: - DPRINTK("error: unknown USB io operation [%d]\n", - req->operation); - make_response(up, req->id, req->operation, -EINVAL, 0, 0); - break; - } - } - - usb_ring->req_cons = i; - - return more_to_do; -} - -static owned_port_t *find_guest_port(usbif_priv_t *up, int port) -{ - unsigned long flags; - struct list_head *l; - - spin_lock_irqsave(&owned_ports_lock, flags); - list_for_each(l, &owned_ports) - { - owned_port_t *p = list_entry(l, owned_port_t, list); - if(p->usbif_priv == up && p->guest_port == port) - { - spin_unlock_irqrestore(&owned_ports_lock, flags); - return p; - } - } - spin_unlock_irqrestore(&owned_ports_lock, flags); - - return NULL; -} - -static void dispatch_usb_reset(usbif_priv_t *up, unsigned long portid) -{ - owned_port_t *port = find_guest_port(up, portid); - int ret = 0; - - - /* Allowing the guest to actually reset the device causes more problems - * than it's worth. We just fake it out in software but we will do a real - * reset when the interface is destroyed. */ - - dump_port(port); - - port->guest_address = 0; - /* If there's an attached device then the port is now enabled. */ - if ( port->dev_present ) - port->enabled = 1; - else - port->enabled = 0; - - make_response(up, 0, USBIF_OP_RESET, ret, 0, 0); -} - -static void dispatch_usb_probe(usbif_priv_t *up, unsigned long id, unsigned long portid) -{ - owned_port_t *port = find_guest_port(up, portid); - int ret; - - if ( port != NULL ) - ret = port->dev_present; - else - { - ret = -EINVAL; - printk(KERN_INFO "dispatch_usb_probe(): invalid port probe request " - "(port %ld)\n", portid); - } - - /* Probe result is sent back in-band. Probes don't have an associated id - * right now... 
*/ - make_response(up, id, USBIF_OP_PROBE, ret, portid, 0); -} - -/** - * check_iso_schedule - safety check the isochronous schedule for an URB - * @purb : the URB in question - */ -static int check_iso_schedule(struct urb *purb) -{ - int i; - unsigned long total_length = 0; - - for ( i = 0; i < purb->number_of_packets; i++ ) - { - struct usb_iso_packet_descriptor *desc = &purb->iso_frame_desc[i]; - - if ( desc->offset >= purb->transfer_buffer_length - || ( desc->offset + desc->length) > purb->transfer_buffer_length ) - return -EINVAL; - - total_length += desc->length; - - if ( total_length > purb->transfer_buffer_length ) - return -EINVAL; - } - - return 0; -} - -owned_port_t *find_port_for_request(usbif_priv_t *up, usbif_request_t *req); - -static void dispatch_usb_io(usbif_priv_t *up, usbif_request_t *req) -{ - unsigned long buffer_mach; - int i = 0, offset = 0, - pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; - pending_req_t *pending_req; - unsigned long remap_prot; - multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST]; - struct urb *purb = NULL; - owned_port_t *port; - unsigned char *setup; - - dump_request(req); - - if ( NR_PENDING_REQS == MAX_PENDING_REQS ) - { - printk(KERN_WARNING "usbback: Max requests already queued. " - "Giving up!\n"); - - return; - } - - port = find_port_for_request(up, req); - - if ( port == NULL ) - { - printk(KERN_WARNING "No such device! (%d)\n", req->devnum); - dump_request(req); - - make_response(up, req->id, req->operation, -ENODEV, 0, 0); - return; - } - else if ( !port->dev_present ) - { - /* In normal operation, we'll only get here if a device is unplugged - * and the frontend hasn't noticed yet. */ - make_response(up, req->id, req->operation, -ENODEV, 0, 0); - return; - } - - - setup = kmalloc(8, GFP_KERNEL); - - if ( setup == NULL ) - goto no_mem; - - /* Copy request out for safety. 
*/ - memcpy(setup, req->setup, 8); - - if( setup[0] == 0x0 && setup[1] == 0x5) - { - /* To virtualise the USB address space, we need to intercept - * set_address messages and emulate. From the USB specification: - * bmRequestType = 0x0; - * Brequest = SET_ADDRESS (i.e. 0x5) - * wValue = device address - * wIndex = 0 - * wLength = 0 - * data = None - */ - /* Store into the guest transfer buffer using cpu_to_le16 */ - port->guest_address = le16_to_cpu(*(u16 *)(setup + 2)); - /* Make a successful response. That was easy! */ - - make_response(up, req->id, req->operation, 0, 0, 0); - - kfree(setup); - return; - } - else if ( setup[0] == 0x0 && setup[1] == 0x9 ) - { - /* The host kernel needs to know what device configuration is in use - * because various error checks get confused otherwise. We just do - * configuration settings here, under controlled conditions. - */ - - /* Ignore configuration setting and hope that the host kernel - did it right. */ - /* usb_set_configuration(port->dev, setup[2]); */ - - make_response(up, req->id, req->operation, 0, 0, 0); - - kfree(setup); - return; - } - else if ( setup[0] == 0x1 && setup[1] == 0xB ) - { - /* The host kernel needs to know what device interface is in use - * because various error checks get confused otherwise. We just do - * configuration settings here, under controlled conditions. 
- */ - usb_set_interface(port->dev, (setup[4] | setup[5] << 8), - (setup[2] | setup[3] << 8) ); - - make_response(up, req->id, req->operation, 0, 0, 0); - - kfree(setup); - return; - } - - if ( ( req->transfer_buffer - (req->transfer_buffer & PAGE_MASK) - + req->length ) - > MMAP_PAGES_PER_REQUEST * PAGE_SIZE ) - { - printk(KERN_WARNING "usbback: request of %lu bytes too large\n", - req->length); - make_response(up, req->id, req->operation, -EINVAL, 0, 0); - kfree(setup); - return; - } - - buffer_mach = req->transfer_buffer; - - if( buffer_mach == 0 ) - goto no_remap; - - ASSERT((req->length >> PAGE_SHIFT) <= MMAP_PAGES_PER_REQUEST); - ASSERT(buffer_mach); - - /* Always map writeable for now. */ - remap_prot = _KERNPG_TABLE; - - for ( i = 0, offset = 0; offset < req->length; - i++, offset += PAGE_SIZE ) - { - MULTI_update_va_mapping_otherdomain( - mcl+i, MMAP_VADDR(pending_idx, i), - pfn_pte_ma((buffer_mach + offset) >> PAGE_SHIFT, remap_prot), - 0, up->domid); - - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = - FOREIGN_FRAME((buffer_mach + offset) >> PAGE_SHIFT); - - ASSERT(virt_to_mfn(MMAP_VADDR(pending_idx, i)) - == ((buffer_mach >> PAGE_SHIFT) + i)); - } - - if ( req->pipe_type == 0 && req->num_iso > 0 ) /* Maybe schedule ISO... */ - { - /* Map in ISO schedule, if necessary. 
*/ - MULTI_update_va_mapping_otherdomain( - mcl+i, MMAP_VADDR(pending_idx, i), - pfn_pte_ma(req->iso_schedule >> PAGE_SHIFT, remap_prot), - 0, up->domid); - - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = - FOREIGN_FRAME(req->iso_schedule >> PAGE_SHIFT); - - i++; - } - - if ( unlikely(HYPERVISOR_multicall(mcl, i) != 0) ) - BUG(); - - { - int j; - for ( j = 0; j < i; j++ ) - { - if ( unlikely(mcl[j].result != 0) ) - { - printk(KERN_WARNING - "invalid buffer %d -- could not remap it\n", j); - fast_flush_area(pending_idx, i); - goto bad_descriptor; - } - } - } - - no_remap: - - ASSERT(i <= MMAP_PAGES_PER_REQUEST); - ASSERT(i * PAGE_SIZE >= req->length); - - /* We have to do this because some things might complete out of order. */ - pending_req = &pending_reqs[pending_idx]; - pending_req->usbif_priv= up; - pending_req->id = req->id; - pending_req->operation = req->operation; - pending_req->nr_pages = i; - - pending_cons++; - - usbif_get(up); - - /* Fill out an actual request for the USB layer. */ - purb = usb_alloc_urb(req->num_iso); - - if ( purb == NULL ) - { - usbif_put(up); - free_pending(pending_idx); - goto no_mem; - } - - purb->dev = port->dev; - purb->context = pending_req; - purb->transfer_buffer = - (void *)(MMAP_VADDR(pending_idx, 0) + (buffer_mach & ~PAGE_MASK)); - if(buffer_mach == 0) - purb->transfer_buffer = NULL; - purb->complete = __end_usb_io_op; - purb->transfer_buffer_length = req->length; - purb->transfer_flags = req->transfer_flags; - - purb->pipe = 0; - purb->pipe |= req->direction << 7; - purb->pipe |= port->dev->devnum << 8; - purb->pipe |= req->speed << 26; - purb->pipe |= req->pipe_type << 30; - purb->pipe |= req->endpoint << 15; - - purb->number_of_packets = req->num_iso; - - if ( purb->number_of_packets * sizeof(usbif_iso_t) > PAGE_SIZE ) - goto urb_error; - - /* Make sure there's always some kind of timeout. */ - purb->timeout = ( req->timeout > 0 ) ? 
(req->timeout * HZ) / 1000 - : 1000; - - purb->setup_packet = setup; - - if ( req->pipe_type == 0 ) /* ISO */ - { - int j; - usbif_iso_t *iso_sched = (usbif_iso_t *)MMAP_VADDR(pending_idx, i - 1); - - /* If we're dealing with an iso pipe, we need to copy in a schedule. */ - for ( j = 0; j < purb->number_of_packets; j++ ) - { - purb->iso_frame_desc[j].length = iso_sched[j].length; - purb->iso_frame_desc[j].offset = iso_sched[j].buffer_offset; - iso_sched[j].status = 0; - } - } - - if ( check_iso_schedule(purb) != 0 ) - goto urb_error; - - if ( usb_submit_urb(purb) != 0 ) - goto urb_error; - - return; - - urb_error: - dump_urb(purb); - usbif_put(up); - free_pending(pending_idx); - - bad_descriptor: - kfree ( setup ); - if ( purb != NULL ) - usb_free_urb(purb); - make_response(up, req->id, req->operation, -EINVAL, 0, 0); - return; - - no_mem: - if ( setup != NULL ) - kfree(setup); - make_response(up, req->id, req->operation, -ENOMEM, 0, 0); - return; -} - - - -/****************************************************************** - * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING - */ - - -static void make_response(usbif_priv_t *up, unsigned long id, - unsigned short op, int st, int inband, - unsigned long length) -{ - usbif_response_t *resp; - unsigned long flags; - usbif_back_ring_t *usb_ring = &up->usb_ring; - - /* Place on the response ring for the relevant domain. */ - spin_lock_irqsave(&up->usb_ring_lock, flags); - resp = RING_GET_RESPONSE(usb_ring, usb_ring->rsp_prod_pvt); - resp->id = id; - resp->operation = op; - resp->status = st; - resp->data = inband; - resp->length = length; - wmb(); /* Ensure other side can see the response fields. */ - - dump_response(resp); - - usb_ring->rsp_prod_pvt++; - RING_PUSH_RESPONSES(usb_ring); - spin_unlock_irqrestore(&up->usb_ring_lock, flags); - - /* Kick the relevant domain. 
*/ - notify_via_evtchn(up->evtchn); -} - -/** - * usbif_claim_port - claim devices on a port on behalf of guest - * - * Once completed, this will ensure that any device attached to that - * port is claimed by this driver for use by the guest. - */ -int usbif_claim_port(usbif_be_claim_port_t *msg) -{ - owned_port_t *o_p; - - /* Sanity... */ - if ( usbif_find_port(msg->path) != NULL ) - { - printk(KERN_WARNING "usbback: Attempted to claim USB port " - "we already own!\n"); - return -EINVAL; - } - - /* No need for a slab cache - this should be infrequent. */ - o_p = kmalloc(sizeof(owned_port_t), GFP_KERNEL); - - if ( o_p == NULL ) - return -ENOMEM; - - o_p->enabled = 0; - o_p->usbif_priv = usbif_find(msg->domid); - o_p->guest_port = msg->usbif_port; - o_p->dev_present = 0; - o_p->guest_address = 0; /* Default address. */ - - strcpy(o_p->path, msg->path); - - spin_lock_irq(&owned_ports_lock); - - list_add(&o_p->list, &owned_ports); - - spin_unlock_irq(&owned_ports_lock); - - printk(KERN_INFO "usbback: Claimed USB port (%s) for %d.%d\n", o_p->path, - msg->domid, msg->usbif_port); - - /* Force a reprobe for unclaimed devices. */ - usb_scan_devices(); - - return 0; -} - -owned_port_t *find_port_for_request(usbif_priv_t *up, usbif_request_t *req) -{ - unsigned long flags; - struct list_head *port; - - /* I'm assuming this is not called from IRQ context - correct? I think - * it's probably only called in response to control messages or plug events - * in the USB hub kernel thread, so should be OK. 
*/ - spin_lock_irqsave(&owned_ports_lock, flags); - list_for_each(port, &owned_ports) - { - owned_port_t *p = list_entry(port, owned_port_t, list); - if(p->usbif_priv == up && p->guest_address == req->devnum && p->enabled ) - { - dump_port(p); - - spin_unlock_irqrestore(&owned_ports_lock, flags); - return p; - } - } - spin_unlock_irqrestore(&owned_ports_lock, flags); - - return NULL; -} - -owned_port_t *__usbif_find_port(char *path) -{ - struct list_head *port; - - list_for_each(port, &owned_ports) - { - owned_port_t *p = list_entry(port, owned_port_t, list); - if(!strcmp(path, p->path)) - { - return p; - } - } - - return NULL; -} - -owned_port_t *usbif_find_port(char *path) -{ - owned_port_t *ret; - unsigned long flags; - - spin_lock_irqsave(&owned_ports_lock, flags); - ret = __usbif_find_port(path); - spin_unlock_irqrestore(&owned_ports_lock, flags); - - return ret; -} - - -static void *probe(struct usb_device *dev, unsigned iface, - const struct usb_device_id *id) -{ - owned_port_t *p; - - /* We don't care what the device is - if we own the port, we want it. We - * don't deal with device-specifics in this driver, so we don't care what - * the device actually is ;-) */ - if ( ( p = usbif_find_port(dev->devpath) ) != NULL ) - { - printk(KERN_INFO "usbback: claimed device attached to owned port\n"); - - p->dev_present = 1; - p->dev = dev; - set_bit(iface, &p->ifaces); - - return p->usbif_priv; - } - else - printk(KERN_INFO "usbback: hotplug for non-owned port (%s), ignoring\n", - dev->devpath); - - - return NULL; -} - -static void disconnect(struct usb_device *dev, void *usbif) -{ - /* Note the device is removed so we can tell the guest when it probes. 
*/ - owned_port_t *port = usbif_find_port(dev->devpath); - port->dev_present = 0; - port->dev = NULL; - port->ifaces = 0; -} - - -struct usb_driver driver = -{ - .owner = THIS_MODULE, - .name = "Xen USB Backend", - .probe = probe, - .disconnect = disconnect, - .id_table = NULL, -}; - -/* __usbif_release_port - internal mechanics for releasing a port */ -void __usbif_release_port(owned_port_t *p) -{ - int i; - - for ( i = 0; p->ifaces != 0; i++) - if ( p->ifaces & 1 << i ) - { - usb_driver_release_interface(&driver, usb_ifnum_to_if(p->dev, i)); - clear_bit(i, &p->ifaces); - } - list_del(&p->list); - - /* Reset the real device. We don't simulate disconnect / probe for other - * drivers in this kernel because we assume the device is completely under - * the control of ourselves (i.e. the guest!). This should ensure that the - * device is in a sane state for the next customer ;-) */ - - /* MAW NB: we're not resetting the real device here. This looks perfectly - * valid to me but it causes memory corruption. We seem to get away with not - * resetting for now, although it'd be nice to have this tracked down. 
*/ -/* if ( p->dev != NULL) */ -/* usb_reset_device(p->dev); */ - - kfree(p); -} - - -/** - * usbif_release_port - stop claiming devices on a port on behalf of guest - */ -void usbif_release_port(usbif_be_release_port_t *msg) -{ - owned_port_t *p; - - spin_lock_irq(&owned_ports_lock); - p = __usbif_find_port(msg->path); - __usbif_release_port(p); - spin_unlock_irq(&owned_ports_lock); -} - -void usbif_release_ports(usbif_priv_t *up) -{ - struct list_head *port, *tmp; - unsigned long flags; - - spin_lock_irqsave(&owned_ports_lock, flags); - list_for_each_safe(port, tmp, &owned_ports) - { - owned_port_t *p = list_entry(port, owned_port_t, list); - if ( p->usbif_priv == up ) - __usbif_release_port(p); - } - spin_unlock_irqrestore(&owned_ports_lock, flags); -} - -static int __init usbif_init(void) -{ - int i; - struct page *page; - - if ( !(xen_start_info->flags & SIF_INITDOMAIN) && - !(xen_start_info->flags & SIF_USB_BE_DOMAIN) ) - return 0; - - page = balloon_alloc_empty_page_range(MMAP_PAGES); - BUG_ON(page == NULL); - mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); - - pending_cons = 0; - pending_prod = MAX_PENDING_REQS; - memset(pending_reqs, 0, sizeof(pending_reqs)); - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - pending_ring[i] = i; - - spin_lock_init(&pend_prod_lock); - - spin_lock_init(&owned_ports_lock); - INIT_LIST_HEAD(&owned_ports); - - spin_lock_init(&usbio_schedule_list_lock); - INIT_LIST_HEAD(&usbio_schedule_list); - - if ( kernel_thread(usbio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 ) - BUG(); - - usbif_interface_init(); - - usbif_ctrlif_init(); - - usb_register(&driver); - - printk(KERN_INFO "Xen USB Backend Initialised"); - - return 0; -} - -__initcall(usbif_init); diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/usbfront/usbfront.c --- a/linux-2.6-xen-sparse/drivers/xen/usbfront/usbfront.c Thu Sep 22 17:34:14 2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,1735 +0,0 @@ -/* - * Xen Virtual USB Frontend Driver - * - * 
This file contains the first version of the Xen virtual USB hub - * that I've managed not to delete by mistake (3rd time lucky!). - * - * Based on Linux's uhci.c, original copyright notices are displayed - * below. Portions also (c) 2004 Intel Research Cambridge - * and (c) 2004, 2005 Mark Williamson - * - * Contact <mark.williamson@xxxxxxxxxxxx> or - * <xen-devel@xxxxxxxxxxxxxxxxxxxxx> regarding this code. - * - * Still to be (maybe) implemented: - * - migration / backend restart support? - * - support for building / using as a module - */ - -/* - * Universal Host Controller Interface driver for USB. - * - * Maintainer: Johannes Erdfelt <johannes@xxxxxxxxxxx> - * - * (C) Copyright 1999 Linus Torvalds - * (C) Copyright 1999-2002 Johannes Erdfelt, johannes@xxxxxxxxxxx - * (C) Copyright 1999 Randy Dunlap - * (C) Copyright 1999 Georg Acher, acher@xxxxxxxxx - * (C) Copyright 1999 Deti Fliegl, deti@xxxxxxxxx - * (C) Copyright 1999 Thomas Sailer, sailer@xxxxxxxxxxxxxx - * (C) Copyright 1999 Roman Weissgaerber, weissg@xxxxxxxxx - * (C) Copyright 2000 Yggdrasil Computing, Inc. (port of new PCI interface - * support from usb-ohci.c by Adam Richter, adam@xxxxxxxxxxxxx). - * (C) Copyright 1999 Gregory P. Smith (from usb-ohci.c) - * - * Intel documents this fairly well, and as far as I know there - * are no royalties or anything like that, but even so there are - * people who decided that they want to do the same thing in a - * completely different way. - * - * WARNING! The USB documentation is downright evil. Most of it - * is just crap, written by a committee. 
You're better off ignoring - * most of it, the important stuff is: - * - the low-level protocol (fairly simple but lots of small details) - * - working around the horridness of the rest - */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/delay.h> -#include <linux/slab.h> -#include <linux/smp_lock.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/spinlock.h> -#ifdef CONFIG_USB_DEBUG -#define DEBUG -#else -#undef DEBUG -#endif -#include <linux/usb.h> - -#include <asm/irq.h> -#include <asm/system.h> - -#include "xhci.h" - -#include "../../../../../drivers/usb/hcd.h" - -#include <asm-xen/xen-public/io/usbif.h> -#include <asm/xen-public/io/domain_controller.h> - -/* - * Version Information - */ -#define DRIVER_VERSION "v1.0" -#define DRIVER_AUTHOR "Linus 'Frodo Rabbit' Torvalds, Johannes Erdfelt, " \ - "Randy Dunlap, Georg Acher, Deti Fliegl, " \ - "Thomas Sailer, Roman Weissgaerber, Mark Williamson" -#define DRIVER_DESC "Xen Virtual USB Host Controller Interface" - -/* - * debug = 0, no debugging messages - * debug = 1, dump failed URB's except for stalls - * debug = 2, dump all failed URB's (including stalls) - */ -#ifdef DEBUG -static int debug = 1; -#else -static int debug = 0; -#endif -MODULE_PARM(debug, "i"); -MODULE_PARM_DESC(debug, "Debug level"); -static char *errbuf; -#define ERRBUF_LEN (PAGE_SIZE * 8) - -static int rh_submit_urb(struct urb *urb); -static int rh_unlink_urb(struct urb *urb); -static int xhci_unlink_urb(struct urb *urb); -static void xhci_call_completion(struct urb *urb); -static void xhci_drain_ring(void); -static void xhci_transfer_result(struct xhci *xhci, struct urb *urb); -static void xhci_finish_completion(void); - -#define MAX_URB_LOOP 2048 /* Maximum number of linked URB's */ - -static kmem_cache_t *xhci_up_cachep; /* urb_priv cache */ -static struct xhci *xhci; /* XHCI structure for the interface */ - 
-/****************************************************************************** - * DEBUGGING - */ - -#ifdef DEBUG - -static void dump_urb(struct urb *urb) -{ - printk(KERN_DEBUG "dumping urb @ %p\n" - " hcpriv = %p\n" - " next = %p\n" - " dev = %p\n" - " pipe = 0x%lx\n" - " status = %d\n" - " transfer_flags = 0x%lx\n" - " transfer_buffer = %p\n" - " transfer_buffer_length = %d\n" - " actual_length = %d\n" - " bandwidth = %d\n" - " setup_packet = %p\n", - urb, urb->hcpriv, urb->next, urb->dev, urb->pipe, urb->status, - urb->transfer_flags, urb->transfer_buffer, - urb->transfer_buffer_length, urb->actual_length, urb->bandwidth, - urb->setup_packet); - if ( urb->setup_packet != NULL ) - printk(KERN_DEBUG - "setup = { 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x }\n", - urb->setup_packet[0], urb->setup_packet[1], - urb->setup_packet[2], urb->setup_packet[3], - urb->setup_packet[4], urb->setup_packet[5], - urb->setup_packet[6], urb->setup_packet[7]); - printk(KERN_DEBUG "complete = %p\n" - "interval = %d\n", urb->complete, urb->interval); - -} - -static void xhci_show_resp(usbif_response_t *r) -{ - printk(KERN_DEBUG "dumping response @ %p\n" - " id=0x%lx\n" - " op=0x%x\n" - " data=0x%x\n" - " status=0x%x\n" - " length=0x%lx\n", - r->id, r->operation, r->data, r->status, r->length); -} - -#define DPRINK(...) printk(KERN_DEBUG __VA_ARGS__) - -#else /* DEBUG */ - -#define dump_urb(blah) ((void)0) -#define xhci_show_resp(blah) ((void)0) -#define DPRINTK(blah,...) 
((void)0) - -#endif /* DEBUG */ - -/****************************************************************************** - * RING REQUEST HANDLING - */ - -#define RING_PLUGGED(_hc) ( RING_FULL(&_hc->usb_ring) || _hc->recovery ) - -/** - * xhci_construct_isoc - add isochronous information to a request - */ -static int xhci_construct_isoc(usbif_request_t *req, struct urb *urb) -{ - usbif_iso_t *schedule; - int i; - struct urb_priv *urb_priv = urb->hcpriv; - - req->num_iso = urb->number_of_packets; - schedule = (usbif_iso_t *)__get_free_page(GFP_KERNEL); - - if ( schedule == NULL ) - return -ENOMEM; - - for ( i = 0; i < req->num_iso; i++ ) - { - schedule[i].buffer_offset = urb->iso_frame_desc[i].offset; - schedule[i].length = urb->iso_frame_desc[i].length; - } - - urb_priv->schedule = schedule; - req->iso_schedule = virt_to_mfn(schedule) << PAGE_SHIFT; - - return 0; -} - -/** - * xhci_queue_req - construct and queue request for an URB - */ -static int xhci_queue_req(struct urb *urb) -{ - unsigned long flags; - usbif_request_t *req; - usbif_front_ring_t *usb_ring = &xhci->usb_ring; - -#if DEBUG - printk(KERN_DEBUG - "usbif = %p, req_prod = %d (@ 0x%lx), resp_prod = %d, resp_cons = %d\n", - usbif, usbif->req_prod, virt_to_mfn(&usbif->req_prod), - usbif->resp_prod, xhci->usb_resp_cons); -#endif - - spin_lock_irqsave(&xhci->ring_lock, flags); - - if ( RING_PLUGGED(xhci) ) - { - printk(KERN_WARNING - "xhci_queue_req(): USB ring plugged, not queuing request\n"); - spin_unlock_irqrestore(&xhci->ring_lock, flags); - return -ENOBUFS; - } - - /* Stick something in the shared communications ring. */ - req = RING_GET_REQUEST(usb_ring, usb_ring->req_prod_pvt); - - req->operation = USBIF_OP_IO; - req->port = 0; /* We don't care what the port is. 
*/ - req->id = (unsigned long) urb->hcpriv; - req->transfer_buffer = virt_to_mfn(urb->transfer_buffer) << PAGE_SHIFT; - req->devnum = usb_pipedevice(urb->pipe); - req->direction = usb_pipein(urb->pipe); - req->speed = usb_pipeslow(urb->pipe); - req->pipe_type = usb_pipetype(urb->pipe); - req->length = urb->transfer_buffer_length; - req->transfer_flags = urb->transfer_flags; - req->endpoint = usb_pipeendpoint(urb->pipe); - req->speed = usb_pipeslow(urb->pipe); - req->timeout = urb->timeout * (1000 / HZ); - - if ( usb_pipetype(urb->pipe) == 0 ) /* ISO */ - { - int ret = xhci_construct_isoc(req, urb); - if ( ret != 0 ) - return ret; - } - - if(urb->setup_packet != NULL) - memcpy(req->setup, urb->setup_packet, 8); - else - memset(req->setup, 0, 8); - - usb_ring->req_prod_pvt++; - RING_PUSH_REQUESTS(usb_ring); - - spin_unlock_irqrestore(&xhci->ring_lock, flags); - - notify_via_evtchn(xhci->evtchn); - - DPRINTK("Queued request for an URB.\n"); - dump_urb(urb); - - return -EINPROGRESS; -} - -/** - * xhci_queue_probe - queue a probe request for a particular port - */ -static inline usbif_request_t *xhci_queue_probe(usbif_vdev_t port) -{ - usbif_request_t *req; - usbif_front_ring_t *usb_ring = &xhci->usb_ring; - -#if DEBUG - printk(KERN_DEBUG - "queuing probe: req_prod = %d (@ 0x%lx), resp_prod = %d, " - "resp_cons = %d\n", usbif->req_prod, - virt_to_mfn(&usbif->req_prod), - usbif->resp_prod, xhci->usb_resp_cons); -#endif - - /* This is always called from the timer interrupt. */ - spin_lock(&xhci->ring_lock); - - if ( RING_PLUGGED(xhci) ) - { - printk(KERN_WARNING - "xhci_queue_probe(): ring full, not queuing request\n"); - spin_unlock(&xhci->ring_lock); - return NULL; - } - - /* Stick something in the shared communications ring. 
*/ - req = RING_GET_REQUEST(usb_ring, usb_ring->req_prod_pvt); - - memset(req, 0, sizeof(*req)); - - req->operation = USBIF_OP_PROBE; - req->port = port; - - usb_ring->req_prod_pvt++; - RING_PUSH_REQUESTS(usb_ring); - - spin_unlock(&xhci->ring_lock); - - notify_via_evtchn(xhci->evtchn); - - return req; -} - -/** - * xhci_port_reset - queue a reset request for a particular port - */ -static int xhci_port_reset(usbif_vdev_t port) -{ - usbif_request_t *req; - usbif_front_ring_t *usb_ring = &xhci->usb_ring; - - /* Only ever happens from process context (hub thread). */ - spin_lock_irq(&xhci->ring_lock); - - if ( RING_PLUGGED(xhci) ) - { - printk(KERN_WARNING - "xhci_port_reset(): ring plugged, not queuing request\n"); - spin_unlock_irq(&xhci->ring_lock); - return -ENOBUFS; - } - - /* We only reset one port at a time, so we only need one variable per - * hub. */ - xhci->awaiting_reset = 1; - - /* Stick something in the shared communications ring. */ - req = RING_GET_REQUEST(usb_ring, usb_ring->req_prod_pvt); - - memset(req, 0, sizeof(*req)); - - req->operation = USBIF_OP_RESET; - req->port = port; - - usb_ring->req_prod_pvt++; - RING_PUSH_REQUESTS(usb_ring); - - spin_unlock_irq(&xhci->ring_lock); - - notify_via_evtchn(xhci->evtchn); - - while ( xhci->awaiting_reset > 0 ) - { - mdelay(1); - xhci_drain_ring(); - } - - xhci->rh.ports[port].pe = 1; - xhci->rh.ports[port].pe_chg = 1; - - return xhci->awaiting_reset; -} - - -/****************************************************************************** - * RING RESPONSE HANDLING - */ - -static void receive_usb_reset(usbif_response_t *resp) -{ - xhci->awaiting_reset = resp->status; - rmb(); - -} - -static void receive_usb_probe(usbif_response_t *resp) -{ - spin_lock(&xhci->rh.port_state_lock); - - if ( resp->status >= 0 ) - { - if ( resp->status == 1 ) - { - /* If theres a device there and there wasn't one before there must - * have been a connection status change. 
*/ - if( xhci->rh.ports[resp->data].cs == 0 ) - { - xhci->rh.ports[resp->data].cs = 1; - xhci->rh.ports[resp->data].cs_chg = 1; - } - } - else if ( resp->status == 0 ) - { - if(xhci->rh.ports[resp->data].cs == 1 ) - { - xhci->rh.ports[resp->data].cs = 0; - xhci->rh.ports[resp->data].cs_chg = 1; - xhci->rh.ports[resp->data].pe = 0; - /* According to USB Spec v2.0, 11.24.2.7.2.2, we don't need - * to set pe_chg since an error has not occurred. */ - } - } - else - printk(KERN_WARNING "receive_usb_probe(): unexpected status %d " - "for port %d\n", resp->status, resp->data); - } - else if ( resp->status < 0) - printk(KERN_WARNING "receive_usb_probe(): got error status %d\n", - resp->status); - - spin_unlock(&xhci->rh.port_state_lock); -} - -static void receive_usb_io(usbif_response_t *resp) -{ - struct urb_priv *urbp = (struct urb_priv *)resp->id; - struct urb *urb = urbp->urb; - - urb->actual_length = resp->length; - urbp->in_progress = 0; - - if( usb_pipetype(urb->pipe) == 0 ) /* ISO */ - { - int i; - - /* Copy ISO schedule results back in. */ - for ( i = 0; i < urb->number_of_packets; i++ ) - { - urb->iso_frame_desc[i].status - = urbp->schedule[i].status; - urb->iso_frame_desc[i].actual_length - = urbp->schedule[i].length; - } - free_page((unsigned long)urbp->schedule); - } - - /* Only set status if it's not been changed since submission. It might - * have been changed if the URB has been unlinked asynchronously, for - * instance. */ - if ( urb->status == -EINPROGRESS ) - urbp->status = urb->status = resp->status; -} - -/** - * xhci_drain_ring - drain responses from the ring, calling handlers - * - * This may be called from interrupt context when an event is received from the - * backend domain, or sometimes in process context whilst waiting for a port - * reset or URB completion. 
- */ -static void xhci_drain_ring(void) -{ - struct list_head *tmp, *head; - usbif_front_ring_t *usb_ring = &xhci->usb_ring; - usbif_response_t *resp; - RING_IDX i, rp; - - /* Walk the ring here to get responses, updating URBs to show what - * completed. */ - - rp = usb_ring->sring->rsp_prod; - rmb(); /* Ensure we see queued requests up to 'rp'. */ - - /* Take items off the comms ring, taking care not to overflow. */ - for ( i = usb_ring->rsp_cons; i != rp; i++ ) - { - resp = RING_GET_RESPONSE(usb_ring, i); - - /* May need to deal with batching and with putting a ceiling on - the number dispatched for performance and anti-dos reasons */ - - xhci_show_resp(resp); - - switch ( resp->operation ) - { - case USBIF_OP_PROBE: - receive_usb_probe(resp); - break; - - case USBIF_OP_IO: - receive_usb_io(resp); - break; - - case USBIF_OP_RESET: - receive_usb_reset(resp); - break; - - default: - printk(KERN_WARNING - "error: unknown USB io operation response [%d]\n", - resp->operation); - break; - } - } - - usb_ring->rsp_cons = i; - - /* Walk the list of pending URB's to see which ones completed and do - * callbacks, etc. 
*/ - spin_lock(&xhci->urb_list_lock); - head = &xhci->urb_list; - tmp = head->next; - while (tmp != head) { - struct urb *urb = list_entry(tmp, struct urb, urb_list); - - tmp = tmp->next; - - /* Checks the status and does all of the magic necessary */ - xhci_transfer_result(xhci, urb); - } - spin_unlock(&xhci->urb_list_lock); - - xhci_finish_completion(); -} - - -static void xhci_interrupt(int irq, void *__xhci, struct pt_regs *regs) -{ - xhci_drain_ring(); -} - -/****************************************************************************** - * HOST CONTROLLER FUNCTIONALITY - */ - -/** - * no-op implementation of private device alloc / free routines - */ -static int xhci_do_nothing_dev(struct usb_device *dev) -{ - return 0; -} - -static inline void xhci_add_complete(struct urb *urb) -{ - struct urb_priv *urbp = (struct urb_priv *)urb->hcpriv; - unsigned long flags; - - spin_lock_irqsave(&xhci->complete_list_lock, flags); - list_add_tail(&urbp->complete_list, &xhci->complete_list); - spin_unlock_irqrestore(&xhci->complete_list_lock, flags); -} - -/* When this returns, the owner of the URB may free its - * storage. - * - * We spin and wait for the URB to complete before returning. - * - * Call with urb->lock acquired. - */ -static void xhci_delete_urb(struct urb *urb) -{ - struct urb_priv *urbp; - - urbp = urb->hcpriv; - - /* If there's no urb_priv structure for this URB then it can't have - * been submitted at all. */ - if ( urbp == NULL ) - return; - - /* For now we just spin until the URB completes. It shouldn't take too - * long and we don't expect to have to do this very often. */ - while ( urb->status == -EINPROGRESS ) - { - xhci_drain_ring(); - mdelay(1); - } - - /* Now we know that further transfers to the buffer won't - * occur, so we can safely return. 
*/ -} - -static struct urb_priv *xhci_alloc_urb_priv(struct urb *urb) -{ - struct urb_priv *urbp; - - urbp = kmem_cache_alloc(xhci_up_cachep, SLAB_ATOMIC); - if (!urbp) { - err("xhci_alloc_urb_priv: couldn't allocate memory for urb_priv\n"); - return NULL; - } - - memset((void *)urbp, 0, sizeof(*urbp)); - - urbp->inserttime = jiffies; - urbp->urb = urb; - urbp->dev = urb->dev; - - INIT_LIST_HEAD(&urbp->complete_list); - - urb->hcpriv = urbp; - - return urbp; -} - -/* - * MUST be called with urb->lock acquired - */ -/* When is this called? Do we need to stop the transfer (as we - * currently do)? */ -static void xhci_destroy_urb_priv(struct urb *urb) -{ - struct urb_priv *urbp; - - urbp = (struct urb_priv *)urb->hcpriv; - if (!urbp) - return; - - if (!list_empty(&urb->urb_list)) - warn("xhci_destroy_urb_priv: urb %p still on xhci->urb_list", urb); - - if (!list_empty(&urbp->complete_list)) - warn("xhci_destroy_urb_priv: urb %p still on xhci->complete_list", urb); - - kmem_cache_free(xhci_up_cachep, urb->hcpriv); - - urb->hcpriv = NULL; -} - -/** - * Try to find URBs in progress on the same pipe to the same device. 
- * - * MUST be called with xhci->urb_list_lock acquired - */ -static struct urb *xhci_find_urb_ep(struct xhci *xhci, struct urb *urb) -{ - struct list_head *tmp, *head; - - /* We don't match Isoc transfers since they are special */ - if (usb_pipeisoc(urb->pipe)) - return NULL; - - head = &xhci->urb_list; - tmp = head->next; - while (tmp != head) { - struct urb *u = list_entry(tmp, struct urb, urb_list); - - tmp = tmp->next; - - if (u->dev == urb->dev && u->pipe == urb->pipe && - u->status == -EINPROGRESS) - return u; - } - - return NULL; -} - -static int xhci_submit_urb(struct urb *urb) -{ - int ret = -EINVAL; - unsigned long flags; - struct urb *eurb; - int bustime; - - DPRINTK("URB submitted to XHCI driver.\n"); - dump_urb(urb); - - if (!urb) - return -EINVAL; - - if (!urb->dev || !urb->dev->bus || !urb->dev->bus->hcpriv) { - warn("xhci_submit_urb: urb %p belongs to disconnected device or bus?", urb); - return -ENODEV; - } - - if ( urb->dev->devpath == NULL ) - BUG(); - - usb_inc_dev_use(urb->dev); - - spin_lock_irqsave(&xhci->urb_list_lock, flags); - spin_lock(&urb->lock); - - if (urb->status == -EINPROGRESS || urb->status == -ECONNRESET || - urb->status == -ECONNABORTED) { - dbg("xhci_submit_urb: urb not available to submit (status = %d)", urb->status); - /* Since we can have problems on the out path */ - spin_unlock(&urb->lock); - spin_unlock_irqrestore(&xhci->urb_list_lock, flags); - usb_dec_dev_use(urb->dev); - - return ret; - } - - INIT_LIST_HEAD(&urb->urb_list); - if (!xhci_alloc_urb_priv(urb)) { - ret = -ENOMEM; - - goto out; - } - - ( (struct urb_priv *)urb->hcpriv )->in_progress = 1; - - eurb = xhci_find_urb_ep(xhci, urb); - if (eurb && !(urb->transfer_flags & USB_QUEUE_BULK)) { - ret = -ENXIO; - - goto out; - } - - /* Short circuit the virtual root hub */ - if (urb->dev == xhci->rh.dev) { - ret = rh_submit_urb(urb); - - goto out; - } - - switch (usb_pipetype(urb->pipe)) { - case PIPE_CONTROL: - case PIPE_BULK: - ret = xhci_queue_req(urb); - break; - - 
case PIPE_INTERRUPT: - if (urb->bandwidth == 0) { /* not yet checked/allocated */ - bustime = usb_check_bandwidth(urb->dev, urb); - if (bustime < 0) - ret = bustime; - else { - ret = xhci_queue_req(urb); - if (ret == -EINPROGRESS) - usb_claim_bandwidth(urb->dev, urb, - bustime, 0); - } - } else /* bandwidth is already set */ - ret = xhci_queue_req(urb); - break; - - case PIPE_ISOCHRONOUS: - if (urb->bandwidth == 0) { /* not yet checked/allocated */ - if (urb->number_of_packets <= 0) { - ret = -EINVAL; - break; - } - bustime = usb_check_bandwidth(urb->dev, urb); - if (bustime < 0) { - ret = bustime; - break; - } - - ret = xhci_queue_req(urb); - if (ret == -EINPROGRESS) - usb_claim_bandwidth(urb->dev, urb, bustime, 1); - } else /* bandwidth is already set */ - ret = xhci_queue_req(urb); - break; - } -out: - urb->status = ret; - - if (ret == -EINPROGRESS) { - /* We use _tail to make find_urb_ep more efficient */ - list_add_tail(&urb->urb_list, &xhci->urb_list); - - spin_unlock(&urb->lock); - spin_unlock_irqrestore(&xhci->urb_list_lock, flags); - - return 0; - } - - xhci_delete_urb(urb); - - spin_unlock(&urb->lock); - spin_unlock_irqrestore(&xhci->urb_list_lock, flags); - - /* Only call completion if it was successful */ - if (!ret) - xhci_call_completion(urb); - - return ret; -} - -/* - * Return the result of a transfer - * - * MUST be called with urb_list_lock acquired - */ -static void xhci_transfer_result(struct xhci *xhci, struct urb *urb) -{ - int ret = 0; - unsigned long flags; - struct urb_priv *urbp; - - /* The root hub is special */ - if (urb->dev == xhci->rh.dev) - return; - - spin_lock_irqsave(&urb->lock, flags); - - urbp = (struct urb_priv *)urb->hcpriv; - - if ( ( (struct urb_priv *)urb->hcpriv )->in_progress ) - ret = -EINPROGRESS; - - if (urb->actual_length < urb->transfer_buffer_length) { - if (urb->transfer_flags & USB_DISABLE_SPD) { - ret = -EREMOTEIO; - } - } - - if (urb->status == -EPIPE) - { - ret = urb->status; - /* endpoint has stalled - mark it 
halted */ - usb_endpoint_halt(urb->dev, usb_pipeendpoint(urb->pipe), - usb_pipeout(urb->pipe)); - } - - if ((debug == 1 && ret != 0 && ret != -EPIPE) || - (ret != 0 && debug > 1)) { - /* Some debugging code */ - dbg("xhci_result_interrupt/bulk() failed with status %x", - status); - } - - if (ret == -EINPROGRESS) - goto out; - - switch (usb_pipetype(urb->pipe)) { - case PIPE_CONTROL: - case PIPE_BULK: - case PIPE_ISOCHRONOUS: - /* Release bandwidth for Interrupt or Isoc. transfers */ - /* Spinlock needed ? */ - if (urb->bandwidth) - usb_release_bandwidth(urb->dev, urb, 1); - xhci_delete_urb(urb); - break; - case PIPE_INTERRUPT: - /* Interrupts are an exception */ - if (urb->interval) - goto out_complete; - - /* Release bandwidth for Interrupt or Isoc. transfers */ - /* Spinlock needed ? */ - if (urb->bandwidth) - usb_release_bandwidth(urb->dev, urb, 0); - xhci_delete_urb(urb); - break; - default: - info("xhci_transfer_result: unknown pipe type %d for urb %p\n", - usb_pipetype(urb->pipe), urb); - } - - /* Remove it from xhci->urb_list */ - list_del_init(&urb->urb_list); - -out_complete: - xhci_add_complete(urb); - -out: - spin_unlock_irqrestore(&urb->lock, flags); -} - -static int xhci_unlink_urb(struct urb *urb) -{ - unsigned long flags; - struct urb_priv *urbp = urb->hcpriv; - - if (!urb) - return -EINVAL; - - if (!urb->dev || !urb->dev->bus || !urb->dev->bus->hcpriv) - return -ENODEV; - - spin_lock_irqsave(&xhci->urb_list_lock, flags); - spin_lock(&urb->lock); - - /* Release bandwidth for Interrupt or Isoc. transfers */ - /* Spinlock needed ? 
*/ - if (urb->bandwidth) { - switch (usb_pipetype(urb->pipe)) { - case PIPE_INTERRUPT: - usb_release_bandwidth(urb->dev, urb, 0); - break; - case PIPE_ISOCHRONOUS: - usb_release_bandwidth(urb->dev, urb, 1); - break; - default: - break; - } - } - - if (urb->status != -EINPROGRESS) { - spin_unlock(&urb->lock); - spin_unlock_irqrestore(&xhci->urb_list_lock, flags); - return 0; - } - - list_del_init(&urb->urb_list); - - /* Short circuit the virtual root hub */ - if (urb->dev == xhci->rh.dev) { - rh_unlink_urb(urb); - - spin_unlock(&urb->lock); - spin_unlock_irqrestore(&xhci->urb_list_lock, flags); - - xhci_call_completion(urb); - } else { - if (urb->transfer_flags & USB_ASYNC_UNLINK) { - /* We currently don't currently attempt to cancel URBs - * that have been queued in the ring. We handle async - * unlinked URBs when they complete. */ - urbp->status = urb->status = -ECONNABORTED; - spin_unlock(&urb->lock); - spin_unlock_irqrestore(&xhci->urb_list_lock, flags); - } else { - urb->status = -ENOENT; - - spin_unlock(&urb->lock); - spin_unlock_irqrestore(&xhci->urb_list_lock, flags); - - if (in_interrupt()) { /* wait at least 1 frame */ - static int errorcount = 10; - - if (errorcount--) - dbg("xhci_unlink_urb called from interrupt for urb %p", urb); - udelay(1000); - } else - schedule_timeout(1+1*HZ/1000); - - xhci_delete_urb(urb); - - xhci_call_completion(urb); - } - } - - return 0; -} - -static void xhci_call_completion(struct urb *urb) -{ - struct urb_priv *urbp; - struct usb_device *dev = urb->dev; - int is_ring = 0, killed, resubmit_interrupt, status; - struct urb *nurb; - unsigned long flags; - - spin_lock_irqsave(&urb->lock, flags); - - urbp = (struct urb_priv *)urb->hcpriv; - if (!urbp || !urb->dev) { - spin_unlock_irqrestore(&urb->lock, flags); - return; - } - - killed = (urb->status == -ENOENT || urb->status == -ECONNABORTED || - urb->status == -ECONNRESET); - resubmit_interrupt = (usb_pipetype(urb->pipe) == PIPE_INTERRUPT && - urb->interval); - - nurb = 
urb->next; - if (nurb && !killed) { - int count = 0; - - while (nurb && nurb != urb && count < MAX_URB_LOOP) { - if (nurb->status == -ENOENT || - nurb->status == -ECONNABORTED || - nurb->status == -ECONNRESET) { - killed = 1; - break; - } - - nurb = nurb->next; - count++; - } - - if (count == MAX_URB_LOOP) - err("xhci_call_completion: too many linked URB's, loop? (first loop)"); - - /* Check to see if chain is a ring */ - is_ring = (nurb == urb); - } - - status = urbp->status; - if (!resubmit_interrupt || killed) - /* We don't need urb_priv anymore */ - xhci_destroy_urb_priv(urb); - - if (!killed) - urb->status = status; - - spin_unlock_irqrestore(&urb->lock, flags); - - if (urb->complete) - urb->complete(urb); - - if (resubmit_interrupt) - /* Recheck the status. The completion handler may have */ - /* unlinked the resubmitting interrupt URB */ - killed = (urb->status == -ENOENT || - urb->status == -ECONNABORTED || - urb->status == -ECONNRESET); - - if (resubmit_interrupt && !killed) { - if ( urb->dev != xhci->rh.dev ) - xhci_queue_req(urb); /* XXX What if this fails? */ - /* Don't need to resubmit URBs for the virtual root dev. 
*/ - } else { - if (is_ring && !killed) { - urb->dev = dev; - xhci_submit_urb(urb); - } else { - /* We decrement the usage count after we're done */ - /* with everything */ - usb_dec_dev_use(dev); - } - } -} - -static void xhci_finish_completion(void) -{ - struct list_head *tmp, *head; - unsigned long flags; - - spin_lock_irqsave(&xhci->complete_list_lock, flags); - head = &xhci->complete_list; - tmp = head->next; - while (tmp != head) { - struct urb_priv *urbp = list_entry(tmp, struct urb_priv, - complete_list); - struct urb *urb = urbp->urb; - - list_del_init(&urbp->complete_list); - spin_unlock_irqrestore(&xhci->complete_list_lock, flags); - - xhci_call_completion(urb); - - spin_lock_irqsave(&xhci->complete_list_lock, flags); - head = &xhci->complete_list; - tmp = head->next; - } - spin_unlock_irqrestore(&xhci->complete_list_lock, flags); -} - -static struct usb_operations xhci_device_operations = { - .allocate = xhci_do_nothing_dev, - .deallocate = xhci_do_nothing_dev, - /* It doesn't look like any drivers actually care what the frame number - * is at the moment! If necessary, we could approximate the current - * frame nubmer by passing it from the backend in response messages. 
*/ - .get_frame_number = NULL, - .submit_urb = xhci_submit_urb, - .unlink_urb = xhci_unlink_urb -}; - -/****************************************************************************** - * VIRTUAL ROOT HUB EMULATION - */ - -static __u8 root_hub_dev_des[] = -{ - 0x12, /* __u8 bLength; */ - 0x01, /* __u8 bDescriptorType; Device */ - 0x00, /* __u16 bcdUSB; v1.0 */ - 0x01, - 0x09, /* __u8 bDeviceClass; HUB_CLASSCODE */ - 0x00, /* __u8 bDeviceSubClass; */ - 0x00, /* __u8 bDeviceProtocol; */ - 0x08, /* __u8 bMaxPacketSize0; 8 Bytes */ - 0x00, /* __u16 idVendor; */ - 0x00, - 0x00, /* __u16 idProduct; */ - 0x00, - 0x00, /* __u16 bcdDevice; */ - 0x00, - 0x00, /* __u8 iManufacturer; */ - 0x02, /* __u8 iProduct; */ - 0x01, /* __u8 iSerialNumber; */ - 0x01 /* __u8 bNumConfigurations; */ -}; - - -/* Configuration descriptor */ -static __u8 root_hub_config_des[] = -{ - 0x09, /* __u8 bLength; */ - 0x02, /* __u8 bDescriptorType; Configuration */ - 0x19, /* __u16 wTotalLength; */ - 0x00, - 0x01, /* __u8 bNumInterfaces; */ - 0x01, /* __u8 bConfigurationValue; */ - 0x00, /* __u8 iConfiguration; */ - 0x40, /* __u8 bmAttributes; - Bit 7: Bus-powered, 6: Self-powered, - Bit 5 Remote-wakeup, 4..0: resvd */ - 0x00, /* __u8 MaxPower; */ - - /* interface */ - 0x09, /* __u8 if_bLength; */ - 0x04, /* __u8 if_bDescriptorType; Interface */ - 0x00, /* __u8 if_bInterfaceNumber; */ - 0x00, /* __u8 if_bAlternateSetting; */ - 0x01, /* __u8 if_bNumEndpoints; */ - 0x09, /* __u8 if_bInterfaceClass; HUB_CLASSCODE */ - 0x00, /* __u8 if_bInterfaceSubClass; */ - 0x00, /* __u8 if_bInterfaceProtocol; */ - 0x00, /* __u8 if_iInterface; */ - - /* endpoint */ - 0x07, /* __u8 ep_bLength; */ - 0x05, /* __u8 ep_bDescriptorType; Endpoint */ - 0x81, /* __u8 ep_bEndpointAddress; IN Endpoint 1 */ - 0x03, /* __u8 ep_bmAttributes; Interrupt */ - 0x08, /* __u16 ep_wMaxPacketSize; 8 Bytes */ - 0x00, - 0xff /* __u8 ep_bInterval; 255 ms */ -}; - -static __u8 root_hub_hub_des[] = -{ - 0x09, /* __u8 bLength; */ - 0x29, /* __u8 
bDescriptorType; Hub-descriptor */ - 0x02, /* __u8 bNbrPorts; */ - 0x00, /* __u16 wHubCharacteristics; */ - 0x00, - 0x01, /* __u8 bPwrOn2pwrGood; 2ms */ - 0x00, /* __u8 bHubContrCurrent; 0 mA */ - 0x00, /* __u8 DeviceRemovable; *** 7 Ports max *** */ - 0xff /* __u8 PortPwrCtrlMask; *** 7 ports max *** */ -}; - -/* prepare Interrupt pipe transaction data; HUB INTERRUPT ENDPOINT */ -static int rh_send_irq(struct urb *urb) -{ - struct urb_priv *urbp = (struct urb_priv *)urb->hcpriv; - xhci_port_t *ports = xhci->rh.ports; - unsigned long flags; - int i, len = 1; - __u16 data = 0; - - spin_lock_irqsave(&urb->lock, flags); - for (i = 0; i < xhci->rh.numports; i++) { - /* Set a bit if anything at all has changed on the port, as per - * USB spec 11.12 */ - data |= (ports[i].cs_chg || ports[i].pe_chg ) - ? (1 << (i + 1)) - : 0; - - len = (i + 1) / 8 + 1; - } - - *(__u16 *) urb->transfer_buffer = cpu_to_le16(data); - urb->actual_length = len; - urbp->status = 0; - - spin_unlock_irqrestore(&urb->lock, flags); - - if ((data > 0) && (xhci->rh.send != 0)) { - dbg("root-hub INT complete: data: %x", data); - xhci_call_completion(urb); - } - - return 0; -} - -/* Virtual Root Hub INTs are polled by this timer every "interval" ms */ -static int rh_init_int_timer(struct urb *urb); - -static void rh_int_timer_do(unsigned long ptr) -{ - struct urb *urb = (struct urb *)ptr; - struct list_head list, *tmp, *head; - unsigned long flags; - int i; - - for ( i = 0; i < xhci->rh.numports; i++) - xhci_queue_probe(i); - - if (xhci->rh.send) - rh_send_irq(urb); - - INIT_LIST_HEAD(&list); - - spin_lock_irqsave(&xhci->urb_list_lock, flags); - head = &xhci->urb_list; - tmp = head->next; - while (tmp != head) { - struct urb *u = list_entry(tmp, struct urb, urb_list); - struct urb_priv *up = (struct urb_priv *)u->hcpriv; - - tmp = tmp->next; - - spin_lock(&u->lock); - - /* Check if the URB timed out */ - if (u->timeout && time_after_eq(jiffies, - up->inserttime + u->timeout)) { - 
list_del(&u->urb_list); - list_add_tail(&u->urb_list, &list); - } - - spin_unlock(&u->lock); - } - spin_unlock_irqrestore(&xhci->urb_list_lock, flags); - - head = &list; - tmp = head->next; - while (tmp != head) { - struct urb *u = list_entry(tmp, struct urb, urb_list); - - tmp = tmp->next; - - u->transfer_flags |= USB_ASYNC_UNLINK | USB_TIMEOUT_KILLED; - xhci_unlink_urb(u); - } - - rh_init_int_timer(urb); -} - -/* Root Hub INTs are polled by this timer */ -static int rh_init_int_timer(struct urb *urb) -{ - xhci->rh.interval = urb->interval; - init_timer(&xhci->rh.rh_int_timer); - xhci->rh.rh_int_timer.function = rh_int_timer_do; - xhci->rh.rh_int_timer.data = (unsigned long)urb; - xhci->rh.rh_int_timer.expires = jiffies - + (HZ * (urb->interval < 30 ? 30 : urb->interval)) / 1000; - add_timer(&xhci->rh.rh_int_timer); - - return 0; -} - -#define OK(x) len = (x); break - -/* Root Hub Control Pipe */ -static int rh_submit_urb(struct urb *urb) -{ - unsigned int pipe = urb->pipe; - struct usb_ctrlrequest *cmd = - (struct usb_ctrlrequest *)urb->setup_packet; - void *data = urb->transfer_buffer; - int leni = urb->transfer_buffer_length; - int len = 0; - xhci_port_t *status; - int stat = 0; - int i; - int retstatus; - unsigned long flags; - - __u16 cstatus; - __u16 bmRType_bReq; - __u16 wValue; - __u16 wIndex; - __u16 wLength; - - if (usb_pipetype(pipe) == PIPE_INTERRUPT) { - xhci->rh.urb = urb; - xhci->rh.send = 1; - xhci->rh.interval = urb->interval; - rh_init_int_timer(urb); - - return -EINPROGRESS; - } - - bmRType_bReq = cmd->bRequestType | cmd->bRequest << 8; - wValue = le16_to_cpu(cmd->wValue); - wIndex = le16_to_cpu(cmd->wIndex); - wLength = le16_to_cpu(cmd->wLength); - - for (i = 0; i < 8; i++) - xhci->rh.c_p_r[i] = 0; - - status = &xhci->rh.ports[wIndex - 1]; - - spin_lock_irqsave(&xhci->rh.port_state_lock, flags); - - switch (bmRType_bReq) { - /* Request Destination: - without flags: Device, - RH_INTERFACE: interface, - RH_ENDPOINT: endpoint, - RH_CLASS means HUB 
here, - RH_OTHER | RH_CLASS almost ever means HUB_PORT here - */ - - case RH_GET_STATUS: - *(__u16 *)data = cpu_to_le16(1); - OK(2); - case RH_GET_STATUS | RH_INTERFACE: - *(__u16 *)data = cpu_to_le16(0); - OK(2); - case RH_GET_STATUS | RH_ENDPOINT: - *(__u16 *)data = cpu_to_le16(0); - OK(2); - case RH_GET_STATUS | RH_CLASS: - *(__u32 *)data = cpu_to_le32(0); - OK(4); /* hub power */ - case RH_GET_STATUS | RH_OTHER | RH_CLASS: - cstatus = (status->cs_chg) | - (status->pe_chg << 1) | - (xhci->rh.c_p_r[wIndex - 1] << 4); - retstatus = (status->cs) | - (status->pe << 1) | - (status->susp << 2) | - (1 << 8) | /* power on */ - (status->lsda << 9); - *(__u16 *)data = cpu_to_le16(retstatus); - *(__u16 *)(data + 2) = cpu_to_le16(cstatus); - OK(4); - case RH_CLEAR_FEATURE | RH_ENDPOINT: - switch (wValue) { - case RH_ENDPOINT_STALL: - OK(0); - } - break; - case RH_CLEAR_FEATURE | RH_CLASS: - switch (wValue) { - case RH_C_HUB_OVER_CURRENT: - OK(0); /* hub power over current */ - } - break; - case RH_CLEAR_FEATURE | RH_OTHER | RH_CLASS: - switch (wValue) { - case RH_PORT_ENABLE: - status->pe = 0; - OK(0); - case RH_PORT_SUSPEND: - status->susp = 0; - OK(0); - case RH_PORT_POWER: - OK(0); /* port power */ - case RH_C_PORT_CONNECTION: - status->cs_chg = 0; - OK(0); - case RH_C_PORT_ENABLE: - status->pe_chg = 0; - OK(0); - case RH_C_PORT_SUSPEND: - /*** WR_RH_PORTSTAT(RH_PS_PSSC); */ - OK(0); - case RH_C_PORT_OVER_CURRENT: - OK(0); /* port power over current */ - case RH_C_PORT_RESET: - xhci->rh.c_p_r[wIndex - 1] = 0; - OK(0); - } - break; - case RH_SET_FEATURE | RH_OTHER | RH_CLASS: - switch (wValue) { - case RH_PORT_SUSPEND: - status->susp = 1; - OK(0); - case RH_PORT_RESET: - { - int ret; - xhci->rh.c_p_r[wIndex - 1] = 1; - status->pr = 0; - status->pe = 1; - ret = xhci_port_reset(wIndex - 1); - /* XXX MAW: should probably cancel queued transfers during reset... 
*\/ */ - if ( ret == 0 ) { OK(0); } - else { return ret; } - } - break; - case RH_PORT_POWER: - OK(0); /* port power ** */ - case RH_PORT_ENABLE: - status->pe = 1; - OK(0); - } - break; - case RH_SET_ADDRESS: - xhci->rh.devnum = wValue; - OK(0); - case RH_GET_DESCRIPTOR: - switch ((wValue & 0xff00) >> 8) { - case 0x01: /* device descriptor */ - len = min_t(unsigned int, leni, - min_t(unsigned int, - sizeof(root_hub_dev_des), wLength)); - memcpy(data, root_hub_dev_des, len); - OK(len); - case 0x02: /* configuration descriptor */ - len = min_t(unsigned int, leni, - min_t(unsigned int, - sizeof(root_hub_config_des), wLength)); - memcpy (data, root_hub_config_des, len); - OK(len); - case 0x03: /* string descriptors */ - len = usb_root_hub_string (wValue & 0xff, - 0, "XHCI-alt", - data, wLength); - if (len > 0) { - OK(min_t(int, leni, len)); - } else - stat = -EPIPE; - } - break; - case RH_GET_DESCRIPTOR | RH_CLASS: - root_hub_hub_des[2] = xhci->rh.numports; - len = min_t(unsigned int, leni, - min_t(unsigned int, sizeof(root_hub_hub_des), wLength)); - memcpy(data, root_hub_hub_des, len); - OK(len); - case RH_GET_CONFIGURATION: - *(__u8 *)data = 0x01; - OK(1); - case RH_SET_CONFIGURATION: - OK(0); - case RH_GET_INTERFACE | RH_INTERFACE: - *(__u8 *)data = 0x00; - OK(1); - case RH_SET_INTERFACE | RH_INTERFACE: - OK(0); - default: - stat = -EPIPE; - } - - spin_unlock_irqrestore(&xhci->rh.port_state_lock, flags); - - urb->actual_length = len; - - return stat; -} - -/* - * MUST be called with urb->lock acquired - */ -static int rh_unlink_urb(struct urb *urb) -{ - if (xhci->rh.urb == urb) { - urb->status = -ENOENT; - xhci->rh.send = 0; - xhci->rh.urb = NULL; - del_timer(&xhci->rh.rh_int_timer); - } - return 0; -} - -/****************************************************************************** - * CONTROL PLANE FUNCTIONALITY - */ - -/** - * alloc_xhci - initialise a new virtual root hub for a new USB device channel - */ -static int alloc_xhci(void) -{ - int retval; - struct 
usb_bus *bus; - - retval = -EBUSY; - - xhci = kmalloc(sizeof(*xhci), GFP_KERNEL); - if (!xhci) { - err("couldn't allocate xhci structure"); - retval = -ENOMEM; - goto err_alloc_xhci; - } - - xhci->state = USBIF_STATE_CLOSED; - - spin_lock_init(&xhci->urb_list_lock); - INIT_LIST_HEAD(&xhci->urb_list); - - spin_lock_init(&xhci->complete_list_lock); - INIT_LIST_HEAD(&xhci->complete_list); - - spin_lock_init(&xhci->frame_list_lock); - - bus = usb_alloc_bus(&xhci_device_operations); - - if (!bus) { - err("unable to allocate bus"); - goto err_alloc_bus; - } - - xhci->bus = bus; - bus->bus_name = "XHCI"; - bus->hcpriv = xhci; - - usb_register_bus(xhci->bus); - - /* Initialize the root hub */ - - xhci->rh.numports = 0; - - xhci->bus->root_hub = xhci->rh.dev = usb_alloc_dev(NULL, xhci->bus); - if (!xhci->rh.dev) { - err("unable to allocate root hub"); - goto err_alloc_root_hub; - } - - xhci->state = 0; - - return 0; - -/* - * error exits: - */ -err_alloc_root_hub: - usb_deregister_bus(xhci->bus); - usb_free_bus(xhci->bus); - xhci->bus = NULL; - -err_alloc_bus: - kfree(xhci); - -err_alloc_xhci: - return retval; -} - -/** - * usbif_status_change - deal with an incoming USB_INTERFACE_STATUS_ message - */ -static void usbif_status_change(usbif_fe_interface_status_changed_t *status) -{ - ctrl_msg_t cmsg; - usbif_fe_interface_connect_t up; - long rc; - usbif_sring_t *sring; - - switch ( status->status ) - { - case USBIF_INTERFACE_STATUS_DESTROYED: - printk(KERN_WARNING "Unexpected usbif-DESTROYED message in state %d\n", - xhci->state); - break; - - case USBIF_INTERFACE_STATUS_DISCONNECTED: - if ( xhci->state != USBIF_STATE_CLOSED ) - { - printk(KERN_WARNING "Unexpected usbif-DISCONNECTED message" - " in state %d\n", xhci->state); - break; - /* Not bothering to do recovery here for now. Keep things - * simple. */ - - spin_lock_irq(&xhci->ring_lock); - - /* Clean up resources. 
*/ - free_page((unsigned long)xhci->usb_ring.sring); - unbind_evtchn_from_irqhandler(xhci->evtchn, xhci); - - /* Plug the ring. */ - xhci->recovery = 1; - wmb(); - - spin_unlock_irq(&xhci->ring_lock); - } - - /* Move from CLOSED to DISCONNECTED state. */ - sring = (usbif_sring_t *)__get_free_page(GFP_KERNEL); - SHARED_RING_INIT(sring); - FRONT_RING_INIT(&xhci->usb_ring, sring, PAGE_SIZE); - xhci->state = USBIF_STATE_DISCONNECTED; - - /* Construct an interface-CONNECT message for the domain controller. */ - cmsg.type = CMSG_USBIF_FE; - cmsg.subtype = CMSG_USBIF_FE_INTERFACE_CONNECT; - cmsg.length = sizeof(usbif_fe_interface_connect_t); - up.shmem_frame = virt_to_mfn(sring); - memcpy(cmsg.msg, &up, sizeof(up)); - - /* Tell the controller to bring up the interface. */ - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); - break; - - case USBIF_INTERFACE_STATUS_CONNECTED: - if ( xhci->state == USBIF_STATE_CLOSED ) - { - printk(KERN_WARNING "Unexpected usbif-CONNECTED message" - " in state %d\n", xhci->state); - break; - } - - xhci->evtchn = status->evtchn; - xhci->bandwidth = status->bandwidth; - xhci->rh.numports = status->num_ports; - - xhci->rh.ports = kmalloc (sizeof(xhci_port_t) * xhci->rh.numports, GFP_KERNEL); - - if ( xhci->rh.ports == NULL ) - goto alloc_ports_nomem; - - memset(xhci->rh.ports, 0, sizeof(xhci_port_t) * xhci->rh.numports); - - usb_connect(xhci->rh.dev); - - if (usb_new_device(xhci->rh.dev) != 0) { - err("unable to start root hub"); - } - - /* Allocate the appropriate USB bandwidth here... Need to - * somehow know what the total available is thought to be so we - * can calculate the reservation correctly. 
*/ - usb_claim_bandwidth(xhci->rh.dev, xhci->rh.urb, - 1000 - xhci->bandwidth, 0); - - if ( (rc = bind_evtchn_to_irqhandler(xhci->evtchn, xhci_interrupt, - SA_SAMPLE_RANDOM, "usbif", xhci)) ) - printk(KERN_ALERT"usbfront request_irq failed (%ld)\n",rc); - - DPRINTK(KERN_INFO __FILE__ - ": USB XHCI: SHM at %p (0x%lx), EVTCHN %d\n", - xhci->usb_ring.sring, virt_to_mfn(xhci->usbif), - xhci->evtchn); - - xhci->state = USBIF_STATE_CONNECTED; - - break; - - default: - printk(KERN_WARNING "Status change to unknown value %d\n", - status->status); - break; - } - - return; - - alloc_ports_nomem: - printk(KERN_WARNING "Failed to allocate port memory, XHCI failed to connect.\n"); - return; -} - -/** - * usbif_ctrlif_rx - demux control messages by subtype - */ -static void usbif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - switch ( msg->subtype ) - { - case CMSG_USBIF_FE_INTERFACE_STATUS_CHANGED: - usbif_status_change((usbif_fe_interface_status_changed_t *) - &msg->msg[0]); - break; - - /* New interface...? */ - default: - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - -static void send_driver_up(void) -{ - control_msg_t cmsg; - usbif_fe_interface_status_changed_t st; - - /* Send a driver-UP notification to the domain controller. */ - cmsg.type = CMSG_USBIF_FE; - cmsg.subtype = CMSG_USBIF_FE_DRIVER_STATUS_CHANGED; - cmsg.length = sizeof(usbif_fe_driver_status_changed_t); - st.status = USBIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &st, sizeof(st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} - -void usbif_resume(void) -{ - int i; - - /* Fake disconnection on all virtual USB ports (suspending / migrating - * will destroy hard state associated will the USB devices anyhow). */ - /* No need to lock here. 
*/ - for ( i = 0; i < xhci->rh.numports; i++ ) - { - xhci->rh.ports[i].cs = 0; - xhci->rh.ports[i].cs_chg = 1; - xhci->rh.ports[i].pe = 0; - } - - send_driver_up(); -} - -static int __init xhci_hcd_init(void) -{ - int retval = -ENOMEM, i; - - if ( (xen_start_info->flags & SIF_INITDOMAIN) || - (xen_start_info->flags & SIF_USB_BE_DOMAIN) ) - return 0; - - info(DRIVER_DESC " " DRIVER_VERSION); - - if (debug) { - errbuf = kmalloc(ERRBUF_LEN, GFP_KERNEL); - if (!errbuf) - goto errbuf_failed; - } - - xhci_up_cachep = kmem_cache_create("xhci_urb_priv", - sizeof(struct urb_priv), 0, 0, NULL, NULL); - if (!xhci_up_cachep) - goto up_failed; - - /* Let the domain controller know we're here. For now we wait until - * connection, as for the block and net drivers. This is only strictly - * necessary if we're going to boot off a USB device. */ - printk(KERN_INFO "Initialising Xen virtual USB hub\n"); - - (void)ctrl_if_register_receiver(CMSG_USBIF_FE, usbif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - - alloc_xhci(); - - send_driver_up(); - - /* - * We should read 'nr_interfaces' from response message and wait - * for notifications before proceeding. For now we assume that we - * will be notified of exactly one interface. 
- */ - for ( i=0; (xhci->state != USBIF_STATE_CONNECTED) && (i < 10*HZ); i++ ) - { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); - } - - if (xhci->state != USBIF_STATE_CONNECTED) - printk(KERN_WARNING "Timeout connecting USB frontend driver!\n"); - - return 0; - -up_failed: - if (errbuf) - kfree(errbuf); - -errbuf_failed: - return retval; -} - -module_init(xhci_hcd_init); - -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); -MODULE_LICENSE("GPL"); - diff -r 97dbd9524a7e -r 06d84bf87159 linux-2.6-xen-sparse/drivers/xen/usbfront/xhci.h --- a/linux-2.6-xen-sparse/drivers/xen/usbfront/xhci.h Thu Sep 22 17:34:14 2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,182 +0,0 @@ -/****************************************************************************** - * xhci.h - * - * Private definitions for the Xen Virtual USB Controller. Based on - * drivers/usb/host/uhci.h from Linux. Copyright for the imported content is - * retained by the original authors. - * - * Modifications are: - * Copyright (C) 2004 Intel Research Cambridge - * Copyright (C) 2004, 2005 Mark Williamson - */ - -#ifndef __LINUX_XHCI_H -#define __LINUX_XHCI_H - -#include <linux/list.h> -#include <linux/usb.h> -#include <asm-xen/xen-public/io/usbif.h> -#include <linux/spinlock.h> - -/* xhci_port_t - current known state of a virtual hub ports */ -typedef struct { - unsigned int cs :1; /* Connection status. */ - unsigned int cs_chg :1; /* Connection status change. */ - unsigned int pe :1; /* Port enable. */ - unsigned int pe_chg :1; /* Port enable change. */ - unsigned int susp :1; /* Suspended. */ - unsigned int lsda :1; /* Low speed device attached. */ - unsigned int pr :1; /* Port reset. 
*/ -} xhci_port_t; - -/* struct virt_root_hub - state related to the virtual root hub */ -struct virt_root_hub { - struct usb_device *dev; - int devnum; /* Address of Root Hub endpoint */ - struct urb *urb; - void *int_addr; - int send; - int interval; - int numports; - int c_p_r[8]; - struct timer_list rh_int_timer; - spinlock_t port_state_lock; - xhci_port_t *ports; -}; - -/* struct xhci - contains the state associated with a single USB interface */ -struct xhci { - -#ifdef CONFIG_PROC_FS - /* procfs */ - int num; - struct proc_dir_entry *proc_entry; -#endif - - int evtchn; /* Interdom channel to backend */ - enum { - USBIF_STATE_CONNECTED = 2, - USBIF_STATE_DISCONNECTED = 1, - USBIF_STATE_CLOSED = 0 - } state; /* State of this USB interface */ - unsigned long recovery; /* boolean recovery in progress flag */ - - unsigned long bandwidth; - - struct usb_bus *bus; - - /* Main list of URB's currently controlled by this HC */ - spinlock_t urb_list_lock; - struct list_head urb_list; /* P: xhci->urb_list_lock */ - - /* List of URB's awaiting completion callback */ - spinlock_t complete_list_lock; - struct list_head complete_list; /* P: xhci->complete_list_lock */ - - struct virt_root_hub rh; /* private data of the virtual root hub */ - - spinlock_t ring_lock; - usbif_front_ring_t usb_ring; - - int awaiting_reset; -}; - -/* per-URB private data structure for the host controller */ -struct urb_priv { - struct urb *urb; - usbif_iso_t *schedule; - struct usb_device *dev; - - int in_progress : 1; /* QH was queued (not linked in) */ - int short_control_packet : 1; /* If we get a short packet during */ - /* a control transfer, retrigger */ - /* the status phase */ - - int status; /* Final status */ - - unsigned long inserttime; /* In jiffies */ - - struct list_head complete_list; /* P: xhci->complete_list_lock */ -}; - -/* - * Locking in xhci.c - * - * spinlocks are used extensively to protect the many lists and data - * structures we have. 
It's not that pretty, but it's necessary. We - * need to be done with all of the locks (except complete_list_lock) when - * we call urb->complete. I've tried to make it simple enough so I don't - * have to spend hours racking my brain trying to figure out if the - * locking is safe. - * - * Here's the safe locking order to prevent deadlocks: - * - * #1 xhci->urb_list_lock - * #2 urb->lock - * #3 xhci->urb_remove_list_lock - * #4 xhci->complete_list_lock - * - * If you're going to grab 2 or more locks at once, ALWAYS grab the lock - * at the lowest level FIRST and NEVER grab locks at the same level at the - * same time. - * - * So, if you need xhci->urb_list_lock, grab it before you grab urb->lock - */ - -/* ------------------------------------------------------------------------- - Virtual Root HUB - ------------------------------------------------------------------------- */ -/* destination of request */ -#define RH_DEVICE 0x00 -#define RH_INTERFACE 0x01 -#define RH_ENDPOINT 0x02 -#define RH_OTHER 0x03 - -#define RH_CLASS 0x20 -#define RH_VENDOR 0x40 - -/* Requests: bRequest << 8 | bmRequestType */ -#define RH_GET_STATUS 0x0080 -#define RH_CLEAR_FEATURE 0x0100 -#define RH_SET_FEATURE 0x0300 -#define RH_SET_ADDRESS 0x0500 -#define RH_GET_DESCRIPTOR 0x0680 -#define RH_SET_DESCRIPTOR 0x0700 -#define RH_GET_CONFIGURATION 0x0880 -#define RH_SET_CONFIGURATION 0x0900 -#define RH_GET_STATE 0x0280 -#define RH_GET_INTERFACE 0x0A80 -#define RH_SET_INTERFACE 0x0B00 -#define RH_SYNC_FRAME 0x0C80 -/* Our Vendor Specific Request */ -#define RH_SET_EP 0x2000 - -/* Hub port features */ -#define RH_PORT_CONNECTION 0x00 -#define RH_PORT_ENABLE 0x01 -#define RH_PORT_SUSPEND 0x02 -#define RH_PORT_OVER_CURRENT 0x03 -#define RH_PORT_RESET 0x04 -#define RH_PORT_POWER 0x08 -#define RH_PORT_LOW_SPEED 0x09 -#define RH_C_PORT_CONNECTION 0x10 -#define RH_C_PORT_ENABLE 0x11 -#define RH_C_PORT_SUSPEND 0x12 -#define RH_C_PORT_OVER_CURRENT 0x13 -#define RH_C_PORT_RESET 0x14 - -/* Hub features */ 
-#define RH_C_HUB_LOCAL_POWER 0x00 -#define RH_C_HUB_OVER_CURRENT 0x01 -#define RH_DEVICE_REMOTE_WAKEUP 0x00 -#define RH_ENDPOINT_STALL 0x01 - -/* Our Vendor Specific feature */ -#define RH_REMOVE_EP 0x00 - -#define RH_ACK 0x01 -#define RH_REQ_ERR -1 -#define RH_NACK 0x00 - -#endif - diff -r 97dbd9524a7e -r 06d84bf87159 tools/python/xen/xend/server/controller.py --- a/tools/python/xen/xend/server/controller.py Thu Sep 22 17:34:14 2005 +++ /dev/null Thu Sep 22 17:42:01 2005 @@ -1,423 +0,0 @@ -#============================================================================ -# This library is free software; you can redistribute it and/or -# modify it under the terms of version 2.1 of the GNU Lesser General Public -# License as published by the Free Software Foundation. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -#============================================================================ -# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> -#============================================================================ - -"""General support for controllers, which handle devices -for a domain. -""" - -from xen.xend.XendError import XendError -from xen.xend.xenstore import DBVar - -DEBUG = 0 - -class DevControllerTable: - """Table of device controller classes, indexed by type name. 
- """ - - def __init__(self): - self.controllerClasses = {} - - def getDevControllerClass(self, type): - return self.controllerClasses.get(type) - - def addDevControllerClass(self, cls): - self.controllerClasses[cls.getType()] = cls - - def delDevControllerClass(self, type): - if type in self.controllerClasses: - del self.controllerClasses[type] - - def createDevController(self, type, vm, recreate=False): - cls = self.getDevControllerClass(type) - if not cls: - raise XendError("unknown device type: " + str(type)) - return cls.createDevController(vm, recreate=recreate) - -def getDevControllerTable(): - """Singleton constructor for the controller table. - """ - global devControllerTable - try: - devControllerTable - except: - devControllerTable = DevControllerTable() - return devControllerTable - -def addDevControllerClass(name, cls): - """Add a device controller class to the controller table. - """ - cls.type = name - getDevControllerTable().addDevControllerClass(cls) - - -def isDevControllerClass(name): - """@return True if a device controller class has been registered with - the controller table under the given name.""" - return name in getDevControllerTable().controllerClasses - - -def createDevController(name, vm, recreate=False): - return getDevControllerTable().createDevController(name, vm, recreate=recreate) - -class DevController: - """Abstract class for a device controller attached to a domain. - A device controller manages all the devices of a given type for a domain. - There is exactly one device controller for each device type for - a domain. - - """ - - # State: - # controller/<type> : for controller - # device/<type>/<id> : for each device - - def createDevController(cls, vm, recreate=False): - """Class method to create a dev controller. 
- """ - ctrl = cls(vm, recreate=recreate) - ctrl.initController(recreate=recreate) - ctrl.exportToDB() - return ctrl - - createDevController = classmethod(createDevController) - - def getType(cls): - return cls.type - - getType = classmethod(getType) - - __exports__ = [ - DBVar('type', 'str'), - DBVar('destroyed', 'bool'), - ] - - # Set when registered. - type = None - - def __init__(self, vm, recreate=False): - self.destroyed = False - self.vm = vm - self.db = self.getDB() - self.deviceId = 0 - self.devices = {} - self.device_order = [] - - def getDB(self): - """Get the db node to use for a controller. - """ - return self.vm.db.addChild("/controller/%s" % self.getType()) - - def getDevDB(self, id): - """Get the db node to use for a device. - """ - return self.vm.db.addChild("/device/%s/%s" % (self.getType(), id)) - - def exportToDB(self, save=False): - self.db.exportToDB(self, fields=self.__exports__, save=save) - - def importFromDB(self): - self.db.importFromDB(self, fields=self.__exports__) - - def getDevControllerType(self): - return self.dctype - - def getDomain(self): - return self.vm.getDomain() - - def getDomainName(self): - return self.vm.getName() - - def getDomainInfo(self): - return self.vm - - #---------------------------------------------------------------------------- - # Subclass interface. - # Subclasses should define the unimplemented methods.. - # Redefinitions must have the same arguments. - - def initController(self, recreate=False, reboot=False): - """Initialise the controller. Called when the controller is - first created, and again after the domain is rebooted (with reboot True). - If called with recreate True (and reboot False) the controller is being - recreated after a xend restart. - - As this can be a re-init (after reboot) any controller state should - be reset. For example the destroyed flag. 
- """ - self.destroyed = False - if reboot: - self.rebootDevices() - - def newDevice(self, id, config, recreate=False): - """Create a device with the given config. - Must be defined in subclass. - Called with recreate True when the device is being recreated after a - xend restart. - - @return device - """ - raise NotImplementedError() - - def createDevice(self, config, recreate=False, change=False): - """Create a device and attach to its front- and back-ends. - If recreate is true the device is being recreated after a xend restart. - If change is true the device is a change to an existing domain, - i.e. it is being added at runtime rather than when the domain is created. - """ - dev = self.newDevice(self.nextDeviceId(), config, recreate=recreate) - if self.vm.recreate: - dev.importFromDB() - dev.init(recreate=recreate) - self.addDevice(dev) - if not recreate: - dev.exportToDB() - dev.attach(recreate=recreate, change=change) - dev.exportToDB() - - return dev - - def configureDevice(self, id, config, change=False): - """Reconfigure an existing device. - May be defined in subclass.""" - dev = self.getDevice(id, error=True) - dev.configure(config, change=change) - - def destroyDevice(self, id, change=False, reboot=False): - """Destroy a device. - May be defined in subclass. - - If reboot is true the device is being destroyed for a domain reboot. - - The device is not deleted, since it may be recreated later. - """ - dev = self.getDevice(id, error=True) - dev.destroy(change=change, reboot=reboot) - return dev - - def deleteDevice(self, id, change=True): - """Destroy a device and delete it. - Normally called to remove a device from a domain at runtime. - """ - dev = self.destroyDevice(id, change=change) - self.removeDevice(dev) - - def destroyController(self, reboot=False): - """Destroy all devices and clean up. - May be defined in subclass. - If reboot is true the controller is being destroyed for a domain reboot. - Called at domain shutdown. 
- """ - self.destroyed = True - self.destroyDevices(reboot=reboot) - - #---------------------------------------------------------------------------- - - def isDestroyed(self): - return self.destroyed - - def getDevice(self, id, error=False): - dev = self.devices.get(int(id)) - if error and not dev: - raise XendError("invalid device id: " + str(id)) - return dev - - def getDeviceIds(self): - return [ dev.getId() for dev in self.device_order ] - - def getDevices(self): - return self.device_order - - def getDeviceConfig(self, id): - return self.getDevice(id).getConfig() - - def getDeviceConfigs(self): - return [ dev.getConfig() for dev in self.device_order ] - - def getDeviceSxprs(self): - return [ dev.sxpr() for dev in self.device_order ] - - def addDevice(self, dev): - self.devices[dev.getId()] = dev - self.device_order.append(dev) - return dev - - def removeDevice(self, dev): - if dev.getId() in self.devices: - del self.devices[dev.getId()] - if dev in self.device_order: - self.device_order.remove(dev) - - def rebootDevices(self): - for dev in self.getDevices(): - dev.reboot() - - def destroyDevices(self, reboot=False): - """Destroy all devices. - """ - for dev in self.getDevices(): - dev.destroy(reboot=reboot) - - def getMaxDeviceId(self): - maxid = 0 - for id in self.devices: - if id > maxid: - maxid = id - return maxid - - def nextDeviceId(self): - id = self.deviceId - self.deviceId += 1 - return id - - def getDeviceCount(self): - return len(self.devices) - -class Dev: - """Abstract class for a device attached to a device controller. - - @ivar id: identifier - @type id: int - @ivar controller: device controller - @type controller: DevController - """ - - # ./status : need 2: actual and requested? - # down-down: initial. - # up-up: fully up. - # down-up: down requested, still up. Watch front and back, when both - # down go to down-down. But what if one (or both) is not connected? - # Still have front/back trees with status? Watch front/status, back/status? 
- # up-down: up requested, still down. - # Back-end watches ./status, front/status - # Front-end watches ./status, back/status - # i.e. each watches the other 2. - # Each is status/request status/actual? - # - # backend? - # frontend? - - __exports__ = [ - DBVar('id', ty='int'), - DBVar('type', ty='str'), - DBVar('config', ty='sxpr'), - DBVar('destroyed', ty='bool'), - ] - - def __init__(self, controller, id, config, recreate=False): - self.controller = controller - self.id = id - self.config = config - self.destroyed = False - self.type = self.getType() - - self.db = controller.getDevDB(id) - - def exportToDB(self, save=False): - self.db.exportToDB(self, fields=self.__exports__, save=save) - - def importFromDB(self): - self.db.importFromDB(self, fields=self.__exports__) - - def getDomain(self): - return self.controller.getDomain() - - def getDomainName(self): - return self.controller.getDomainName() - - def getDomainInfo(self): - return self.controller.getDomainInfo() - - def getController(self): - return self.controller - - def getType(self): - return self.controller.getType() - - def getId(self): - return self.id - - def getConfig(self): - return self.config - - def isDestroyed(self): - return self.destroyed - - #---------------------------------------------------------------------------- - # Subclass interface. - # Define methods in subclass as needed. - # Redefinitions must have the same arguments. - - def init(self, recreate=False, reboot=False): - """Initialization. Called on initial create (when reboot is False) - and on reboot (when reboot is True). When xend is restarting is - called with recreate True. Define in subclass if needed. - - Device instance variables must be defined in the class constructor, - but given null or default values. The real values should be initialised - in this method. This allows devices to be re-initialised. - - Since this can be called to re-initialise a device any state flags - should be reset. 
- """ - self.destroyed = False - - def attach(self, recreate=False, change=False): - """Attach the device to its front and back ends. - Define in subclass if needed. - """ - pass - - def reboot(self): - """Reconnect the device when the domain is rebooted. - """ - self.init(reboot=True) - self.attach() - - def sxpr(self): - """Get the s-expression for the deivice. - Implement in a subclass if needed. - - @return: sxpr - """ - return self.getConfig() - - def configure(self, config, change=False): - """Reconfigure the device. - - Implement in subclass. - """ - raise NotImplementedError() - - def refresh(self): - """Refresh the device.. - Default no-op. Define in subclass if needed. - """ - pass - - def destroy(self, change=False, reboot=False): - """Destroy the device. - If change is True notify destruction (runtime change). - If reboot is True the device is being destroyed for a reboot. - Redefine in subclass if needed. - - Called at domain shutdown and when a device is deleted from - a running domain (with change True). - """ - self.destroyed = True - pass - - #---------------------------------------------------------------------------- _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |