Xen project Mailing List

Re: [Minios-devel] [UNIKRAFT PATCH v2 04/23] lib/vfscore: Initial import of OSv vfs
To: Yuri Volchkov <yuri.volchkov@xxxxxxxxx>, <minios-devel@xxxxxxxxxxxxx>
From: Sharan Santhanam <sharan.santhanam@xxxxxxxxx>
Date: Wed, 6 Feb 2019 17:33:55 +0100
Delivery-date: Wed, 06 Feb 2019 16:34:08 +0000
List-id: Mini-os development list <minios-devel.lists.xenproject.org>
Hello Yuri,

This patch seems fine.

Reviewed-by: Sharan Santhanam <sharan.santhanam@xxxxxxxxx>

Thanks & Regards
Sharan

On 2/4/19 3:36 PM, Yuri Volchkov wrote:
The code is imported as is.

Commit f1f42915a33bebe120e70af1f32c1a4d92bac780

Signed-off-by: Yuri Volchkov <yuri.volchkov@xxxxxxxxx>
---
  lib/vfscore/dentry.c                 |  234 +++
  lib/vfscore/fops.c                   |  189 ++
  lib/vfscore/include/vfscore/dentry.h |   45 +
  lib/vfscore/include/vfscore/mount.h  |  171 ++
  lib/vfscore/include/vfscore/prex.h   |   34 +
  lib/vfscore/include/vfscore/uio.h    |   89 +
  lib/vfscore/include/vfscore/vnode.h  |  246 +++
  lib/vfscore/lookup.c                 |  375 ++++
  lib/vfscore/main.c                   | 2413 ++++++++++++++++++++++++++
  lib/vfscore/mount.c                  |  491 ++++++
  lib/vfscore/subr_uio.c               |   73 +
  lib/vfscore/syscalls.c               | 1486 ++++++++++++++++
  lib/vfscore/task.c                   |  167 ++
  lib/vfscore/vfs.h                    |  189 ++
  lib/vfscore/vnode.c                  |  522 ++++++
  15 files changed, 6724 insertions(+)
  create mode 100644 lib/vfscore/dentry.c
  create mode 100644 lib/vfscore/fops.c
  create mode 100644 lib/vfscore/include/vfscore/dentry.h
  create mode 100644 lib/vfscore/include/vfscore/mount.h
  create mode 100644 lib/vfscore/include/vfscore/prex.h
  create mode 100644 lib/vfscore/include/vfscore/uio.h
  create mode 100644 lib/vfscore/include/vfscore/vnode.h
  create mode 100644 lib/vfscore/lookup.c
  create mode 100644 lib/vfscore/main.c
  create mode 100644 lib/vfscore/mount.c
  create mode 100644 lib/vfscore/subr_uio.c
  create mode 100644 lib/vfscore/syscalls.c
  create mode 100644 lib/vfscore/task.c
  create mode 100644 lib/vfscore/vfs.h
  create mode 100644 lib/vfscore/vnode.c

diff --git a/lib/vfscore/dentry.c b/lib/vfscore/dentry.c
new file mode 100644
index 00000000..facd9eaa
--- /dev/null
+++ b/lib/vfscore/dentry.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <sys/param.h>
+
+#include <osv/dentry.h>
+#include <osv/vnode.h>
+#include "vfs.h"
+
+#define DENTRY_BUCKETS 32
+
+static LIST_HEAD(dentry_hash_head, dentry) dentry_hash_table[DENTRY_BUCKETS];
+static LIST_HEAD(fake, dentry) fake;
+static mutex dentry_hash_lock;
+
+/*
+ * Get the hash value from the mount point and path name.
+ * XXX: replace with a better hash for 64-bit pointers.
+ */
+static u_int
+dentry_hash(struct mount *mp, const char *path)
+{
+    u_int val = 0;
+
+    if (path) {
+        while (*path) {
+            val = ((val << 5) + val) + *path++;
+        }
+    }
+    return (val ^ (unsigned long) mp) & (DENTRY_BUCKETS - 1);
+}
+
+
+struct dentry *
+dentry_alloc(struct dentry *parent_dp, struct vnode *vp, const char *path)
+{
+    struct mount *mp = vp->v_mount;
+    struct dentry *dp = (dentry*)calloc(sizeof(*dp), 1);
+
+    if (!dp) {
+        return nullptr;
+    }
+
+    vref(vp);
+
+    dp->d_refcnt = 1;
+    dp->d_vnode = vp;
+    dp->d_mount = mp;
+    dp->d_path = strdup(path);
+    LIST_INIT(&dp->d_children);
+
+    if (parent_dp) {
+        dref(parent_dp);
+        WITH_LOCK(parent_dp->d_lock) {
+            // Insert dp into its parent's children list.
+            LIST_INSERT_HEAD(&parent_dp->d_children, dp, d_children_link);
+        }
+    }
+    dp->d_parent = parent_dp;
+
+    vn_add_name(vp, dp);
+
+    mutex_lock(&dentry_hash_lock);
+    LIST_INSERT_HEAD(&dentry_hash_table[dentry_hash(mp, path)], dp, d_link);
+    mutex_unlock(&dentry_hash_lock);
+    return dp;
+};
+
+struct dentry *
+dentry_lookup(struct mount *mp, char *path)
+{
+    struct dentry *dp;
+
+    mutex_lock(&dentry_hash_lock);
+    LIST_FOREACH(dp, &dentry_hash_table[dentry_hash(mp, path)], d_link) {
+        if (dp->d_mount == mp && !strncmp(dp->d_path, path, PATH_MAX)) {
+            dp->d_refcnt++;
+            mutex_unlock(&dentry_hash_lock);
+            return dp;
+        }
+    }
+    mutex_unlock(&dentry_hash_lock);
+    return nullptr;                /* not found */
+}
+
+static void dentry_children_remove(struct dentry *dp)
+{
+    struct dentry *entry = nullptr;
+
+    WITH_LOCK(dp->d_lock) {
+        LIST_FOREACH(entry, &dp->d_children, d_children_link) {
+            ASSERT(entry);
+            ASSERT(entry->d_refcnt > 0);
+            LIST_REMOVE(entry, d_link);
+        }
+    }
+}
+
+void
+dentry_move(struct dentry *dp, struct dentry *parent_dp, char *path)
+{
+    struct dentry *old_pdp = dp->d_parent;
+    char *old_path = dp->d_path;
+
+    if (old_pdp) {
+        WITH_LOCK(old_pdp->d_lock) {
+            // Remove dp from its old parent's children list.
+            LIST_REMOVE(dp, d_children_link);
+        }
+    }
+
+    if (parent_dp) {
+        dref(parent_dp);
+        WITH_LOCK(parent_dp->d_lock) {
+            // Insert dp into its new parent's children list.
+            LIST_INSERT_HEAD(&parent_dp->d_children, dp, d_children_link);
+        }
+    }
+
+    WITH_LOCK(dentry_hash_lock) {
+        // Remove all dp's child dentries from the hashtable.
+        dentry_children_remove(dp);
+        // Remove dp with outdated hash info from the hashtable.
+        LIST_REMOVE(dp, d_link);
+        // Update dp.
+        dp->d_path = strdup(path);
+        dp->d_parent = parent_dp;
+        // Insert dp updated hash info into the hashtable.
+        LIST_INSERT_HEAD(&dentry_hash_table[dentry_hash(dp->d_mount, path)],
+            dp, d_link);
+    }
+
+    if (old_pdp) {
+        drele(old_pdp);
+    }
+
+    free(old_path);
+}
+
+void
+dentry_remove(struct dentry *dp)
+{
+    mutex_lock(&dentry_hash_lock);
+    LIST_REMOVE(dp, d_link);
+    /* put it on a fake list for drele() to work*/
+    LIST_INSERT_HEAD(&fake, dp, d_link);
+    mutex_unlock(&dentry_hash_lock);
+}
+
+void
+dref(struct dentry *dp)
+{
+    ASSERT(dp);
+    ASSERT(dp->d_refcnt > 0);
+
+    mutex_lock(&dentry_hash_lock);
+    dp->d_refcnt++;
+    mutex_unlock(&dentry_hash_lock);
+}
+
+void
+drele(struct dentry *dp)
+{
+    ASSERT(dp);
+    ASSERT(dp->d_refcnt > 0);
+
+    mutex_lock(&dentry_hash_lock);
+    if (--dp->d_refcnt) {
+        mutex_unlock(&dentry_hash_lock);
+        return;
+    }
+    LIST_REMOVE(dp, d_link);
+    vn_del_name(dp->d_vnode, dp);
+
+    mutex_unlock(&dentry_hash_lock);
+
+    if (dp->d_parent) {
+        WITH_LOCK(dp->d_parent->d_lock) {
+            // Remove dp from its parent's children list.
+            LIST_REMOVE(dp, d_children_link);
+        }
+        drele(dp->d_parent);
+    }
+
+    vrele(dp->d_vnode);
+
+    free(dp->d_path);
+    free(dp);
+}
+
+void
+dentry_init(void)
+{
+    int i;
+
+    for (i = 0; i < DENTRY_BUCKETS; i++) {
+        LIST_INIT(&dentry_hash_table[i]);
+    }
+}
diff --git a/lib/vfscore/fops.c b/lib/vfscore/fops.c
new file mode 100644
index 00000000..3a8f98b4
--- /dev/null
+++ b/lib/vfscore/fops.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <osv/file.h>
+#include <osv/poll.h>
+#include <fs/vfs/vfs.h>
+#include <osv/vfs_file.hh>
+#include <osv/mmu.hh>
+#include <osv/pagecache.hh>
+
+vfs_file::vfs_file(unsigned flags)
+       : file(flags, DTYPE_VNODE)
+{
+}
+
+int vfs_file::close()
+{
+       auto fp = this;
+       struct vnode *vp = fp->f_dentry->d_vnode;
+       int error;
+
+       vn_lock(vp);
+       error = VOP_CLOSE(vp, fp);
+       vn_unlock(vp);
+
+       if (error)
+               return error;
+
+       fp->f_dentry.reset();
+       return 0;
+}
+
+int vfs_file::read(struct uio *uio, int flags)
+{
+       auto fp = this;
+       struct vnode *vp = fp->f_dentry->d_vnode;
+       int error;
+       size_t count;
+       ssize_t bytes;
+
+       bytes = uio->uio_resid;
+
+       vn_lock(vp);
+       if ((flags & FOF_OFFSET) == 0)
+               uio->uio_offset = fp->f_offset;
+
+       error = VOP_READ(vp, fp, uio, 0);
+       if (!error) {
+               count = bytes - uio->uio_resid;
+               if ((flags & FOF_OFFSET) == 0)
+                       fp->f_offset += count;
+       }
+       vn_unlock(vp);
+
+       return error;
+}
+
+
+int vfs_file::write(struct uio *uio, int flags)
+{
+       auto fp = this;
+       struct vnode *vp = fp->f_dentry->d_vnode;
+       int ioflags = 0;
+       int error;
+       size_t count;
+       ssize_t bytes;
+
+       bytes = uio->uio_resid;
+
+       vn_lock(vp);
+
+       if (fp->f_flags & O_APPEND)
+               ioflags |= IO_APPEND;
+       if (fp->f_flags & (O_DSYNC|O_SYNC))
+               ioflags |= IO_SYNC;
+
+       if ((flags & FOF_OFFSET) == 0)
+               uio->uio_offset = fp->f_offset;
+
+       error = VOP_WRITE(vp, uio, ioflags);
+       if (!error) {
+               count = bytes - uio->uio_resid;
+               if ((flags & FOF_OFFSET) == 0)
+                       fp->f_offset += count;
+       }
+
+       vn_unlock(vp);
+       return error;
+}
+
+int vfs_file::ioctl(u_long com, void *data)
+{
+       auto fp = this;
+       struct vnode *vp = fp->f_dentry->d_vnode;
+       int error;
+
+       vn_lock(vp);
+       error = VOP_IOCTL(vp, fp, com, data);
+       vn_unlock(vp);
+
+       return error;
+}
+
+int vfs_file::stat(struct stat *st)
+{
+       auto fp = this;
+       struct vnode *vp = fp->f_dentry->d_vnode;
+       int error;
+
+       vn_lock(vp);
+       error = vn_stat(vp, st);
+       vn_unlock(vp);
+
+       return error;
+}
+
+int vfs_file::poll(int events)
+{
+       return poll_no_poll(events);
+}
+
+int vfs_file::truncate(off_t len)
+{
+       // somehow this is handled outside file ops
+       abort();
+}
+
+int vfs_file::chmod(mode_t mode)
+{
+       // somehow this is handled outside file ops
+       abort();
+}
+
+bool vfs_file::map_page(uintptr_t off, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pte, bool write, bool shared)
+{
+    return pagecache::get(this, off, ptep, pte, write, shared);
+}
+
+bool vfs_file::put_page(void *addr, uintptr_t off, mmu::hw_ptep<0> ptep)
+{
+    return pagecache::release(this, addr, off, ptep);
+}
+
+void vfs_file::sync(off_t start, off_t end)
+{
+    pagecache::sync(this, start, end);
+}
+
+// Locking: VOP_CACHE will call into the filesystem, and that can trigger an
+// eviction that will hold the mmu-side lock that protects the mappings
+// Always follow that order. We however can't just get rid of the mmu-side lock,
+// because not all invalidations will be synchronous.
+int vfs_file::get_arcbuf(void* key, off_t offset)
+{
+    struct vnode *vp = f_dentry->d_vnode;
+
+    iovec io[1];
+
+    io[0].iov_base = key;
+    uio data;
+    data.uio_iov = io;
+    data.uio_iovcnt = 1;
+    data.uio_offset = offset;
+    data.uio_resid = mmu::page_size;
+    data.uio_rw = UIO_READ;
+
+    vn_lock(vp);
+    assert(VOP_CACHE(vp, this, &data) == 0);
+    vn_unlock(vp);
+
+    return (data.uio_resid != 0) ? -1 : 0;
+}
+
+std::unique_ptr<mmu::file_vma> vfs_file::mmap(addr_range range, unsigned flags, unsigned perm, off_t offset)
+{
+       auto fp = this;
+       struct vnode *vp = fp->f_dentry->d_vnode;
+       if (!vp->v_op->vop_cache || (vp->v_size < (off_t)mmu::page_size)) {
+               return mmu::default_file_mmap(this, range, flags, perm, offset);
+       }
+       return mmu::map_file_mmap(this, range, flags, perm, offset);
+}
diff --git a/lib/vfscore/include/vfscore/dentry.h b/lib/vfscore/include/vfscore/dentry.h
new file mode 100644
index 00000000..a2545af8
--- /dev/null
+++ b/lib/vfscore/include/vfscore/dentry.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#ifndef _OSV_DENTRY_H
+#define _OSV_DENTRY_H 1
+
+#include <osv/mutex.h>
+#include <bsd/sys/sys/queue.h>
+
+struct vnode;
+
+struct dentry {
+       LIST_ENTRY(dentry) d_link;      /* link for hash list */
+       int             d_refcnt;       /* reference count */
+       char            *d_path;        /* pointer to path in fs */
+       struct vnode    *d_vnode;
+       struct mount    *d_mount;
+       struct dentry   *d_parent; /* pointer to parent */
+       LIST_ENTRY(dentry) d_names_link; /* link fo vnode::d_names */
+       mutex_t         d_lock;
+       LIST_HEAD(, dentry) d_children;
+       LIST_ENTRY(dentry) d_children_link;
+};
+
+#ifdef __cplusplus
+
+#include <boost/intrusive_ptr.hpp>
+
+using dentry_ref = boost::intrusive_ptr<dentry>;
+
+extern "C" {
+    void dref(struct dentry* dp);
+    void drele(struct dentry* dp);
+};
+
+inline void intrusive_ptr_add_ref(dentry* dp) { dref(dp); }
+inline void intrusive_ptr_release(dentry* dp) { drele(dp); }
+
+#endif
+
+#endif /* _OSV_DENTRY_H */
diff --git a/lib/vfscore/include/vfscore/mount.h b/lib/vfscore/include/vfscore/mount.h
new file mode 100644
index 00000000..7268d8ce
--- /dev/null
+++ b/lib/vfscore/include/vfscore/mount.h
@@ -0,0 +1,171 @@
+/*-
+ * Copyright (c) 1989, 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)mount.h     8.21 (Berkeley) 5/20/95
+ */
+
+#ifndef _SYS_MOUNT_H_
+#define _SYS_MOUNT_H_
+
+#include <sys/cdefs.h>
+#include <sys/statfs.h>
+#include <osv/vnode.h>
+#include <bsd/sys/sys/queue.h>
+
+__BEGIN_DECLS
+
+#ifdef _KERNEL
+
+/*
+ * Mount data
+ */
+struct mount {
+       struct vfsops   *m_op;          /* pointer to vfs operation */
+       int             m_flags;        /* mount flag */
+       int             m_count;        /* reference count */
+       char            m_path[PATH_MAX]; /* mounted path */
+       char            m_special[PATH_MAX]; /* resource */
+       struct device   *m_dev;         /* mounted device */
+       struct dentry   *m_root;        /* root vnode */
+       struct dentry   *m_covered;     /* vnode covered on parent fs */
+       void            *m_data;        /* private data for fs */
+       fsid_t          m_fsid;         /* id that uniquely identifies the fs */
+};
+
+#endif
+
+/*
+ * Mount flags.
+ */
+#define        MNT_RDONLY      0x00000001      /* read only filesystem */
+#define        MNT_SYNCHRONOUS 0x00000002      /* file system written synchronously */
+#define        MNT_NOEXEC      0x00000004      /* can't exec from filesystem */
+#define        MNT_NOSUID      0x00000008      /* don't honor setuid bits on fs */
+#define        MNT_NODEV       0x00000010      /* don't interpret special files */
+#define        MNT_UNION       0x00000020      /* union with underlying filesystem */
+#define        MNT_ASYNC       0x00000040      /* file system written asynchronously */
+
+/*
+ * Unmount flags.
+ */
+#define MNT_FORCE      0x00000001      /* forced unmount */
+
+/*
+ * exported mount flags.
+ */
+#define        MNT_EXRDONLY    0x00000080      /* exported read only */
+#define        MNT_EXPORTED    0x00000100      /* file system is exported */
+#define        MNT_DEFEXPORTED 0x00000200      /* exported to the world */
+#define        MNT_EXPORTANON  0x00000400      /* use anon uid mapping for everyone */
+#define        MNT_EXKERB      0x00000800      /* exported with Kerberos uid mapping */
+
+/*
+ * Flags set by internal operations.
+ */
+#define        MNT_LOCAL       0x00001000      /* filesystem is stored locally */
+#define        MNT_QUOTA       0x00002000      /* quotas are enabled on filesystem */
+#define        MNT_ROOTFS      0x00004000      /* identifies the root filesystem */
+
+/*
+ * Mask of flags that are visible to statfs()
+ */
+#define        MNT_VISFLAGMASK 0x0000ffff
+
+#ifdef _KERNEL
+
+/*
+ * Filesystem type switch table.
+ */
+struct vfssw {
+       const char      *vs_name;       /* name of file system */
+       int             (*vs_init)(void); /* initialize routine */
+       struct vfsops   *vs_op;         /* pointer to vfs operation */
+};
+
+/*
+ * Operations supported on virtual file system.
+ */
+struct vfsops {
+       int (*vfs_mount)        (struct mount *, const char *, int, const void *);
+       int (*vfs_unmount)      (struct mount *, int flags);
+       int (*vfs_sync)         (struct mount *);
+       int (*vfs_vget)         (struct mount *, struct vnode *);
+       int (*vfs_statfs)       (struct mount *, struct statfs *);
+       struct vnops    *vfs_vnops;
+};
+
+typedef int (*vfsop_mount_t)(struct mount *, const char *, int, const void *);
+typedef int (*vfsop_umount_t)(struct mount *, int flags);
+typedef int (*vfsop_sync_t)(struct mount *);
+typedef int (*vfsop_vget_t)(struct mount *, struct vnode *);
+typedef int (*vfsop_statfs_t)(struct mount *, struct statfs *);
+
+/*
+ * VFS interface
+ */
+#define VFS_MOUNT(MP, DEV, FL, DAT) ((MP)->m_op->vfs_mount)(MP, DEV, FL, DAT)
+#define VFS_UNMOUNT(MP, FL)         ((MP)->m_op->vfs_unmount)(MP, FL)
+#define VFS_SYNC(MP)                ((MP)->m_op->vfs_sync)(MP)
+#define VFS_VGET(MP, VP)            ((MP)->m_op->vfs_vget)(MP, VP)
+#define VFS_STATFS(MP, SFP)         ((MP)->m_op->vfs_statfs)(MP, SFP)
+
+#define VFS_NULL                   ((void *)vfs_null)
+
+int    vfs_nullop(void);
+int    vfs_einval(void);
+
+void    vfs_busy(struct mount *mp);
+void    vfs_unbusy(struct mount *mp);
+
+void    release_mp_dentries(struct mount *mp);
+
+#endif
+
+__END_DECLS
+
+#ifdef __cplusplus
+
+#include <vector>
+#include <string>
+
+namespace osv {
+
+struct mount_desc {
+    std::string special;
+    std::string path;
+    std::string type;
+    std::string options;
+};
+
+std::vector<mount_desc> current_mounts();
+
+}
+
+#endif
+
+#endif /* !_SYS_MOUNT_H_ */
diff --git a/lib/vfscore/include/vfscore/prex.h b/lib/vfscore/include/vfscore/prex.h
new file mode 100644
index 00000000..43650340
--- /dev/null
+++ b/lib/vfscore/include/vfscore/prex.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#ifndef _OSV_PREX_H
+#define _OSV_PREX_H 1
+
+
+#include <unistd.h>
+#include <osv/fcntl.h>
+
+__BEGIN_DECLS
+
+#define __packed        __attribute__((__packed__))
+
+#define        BSIZE   512             /* size of secondary block (bytes) */
+
+#define DO_RDWR                0x2
+
+#define PAGE_SIZE      4096
+#define PAGE_MASK      (PAGE_SIZE-1)
+#define round_page(x)  (((x) + PAGE_MASK) & ~PAGE_MASK)
+
+size_t strlcat(char *dst, const char *src, size_t siz);
+size_t strlcpy(char *dst, const char *src, size_t siz);
+
+void sys_panic(const char *);
+
+__END_DECLS
+
+#endif /* _OSV_PREX_H */
diff --git a/lib/vfscore/include/vfscore/uio.h b/lib/vfscore/include/vfscore/uio.h
new file mode 100644
index 00000000..696b01cf
--- /dev/null
+++ b/lib/vfscore/include/vfscore/uio.h
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993, 1994
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)uio.h       8.5 (Berkeley) 2/22/94
+ * $FreeBSD$
+ */
+
+#ifndef _UIO_H_
+#define        _UIO_H_
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <limits.h>
+
+__BEGIN_DECLS
+
+enum   uio_rw { UIO_READ, UIO_WRITE };
+
+/*
+ * Safe default to prevent possible overflows in user code, otherwise could
+ * be SSIZE_T_MAX.
+ */
+#define IOSIZE_MAX      INT_MAX
+
+#define UIO_MAXIOV 1024
+
+#define UIO_SYSSPACE 0
+
+struct uio {
+       struct iovec *uio_iov;          /* scatter/gather list */
+       int     uio_iovcnt;             /* length of scatter/gather list */
+       off_t   uio_offset;             /* offset in target object */
+       ssize_t uio_resid;              /* remaining bytes to process */
+       enum    uio_rw uio_rw;          /* operation */
+};
+
+int    uiomove(void *cp, int n, struct uio *uio);
+
+__END_DECLS
+
+#ifdef __cplusplus
+
+template <typename F>
+static inline void linearize_uio_write(struct uio *uio, int ioflag, F f)
+{
+    while (uio->uio_resid > 0) {
+        struct iovec *iov = uio->uio_iov;
+
+        if (iov->iov_len) {
+            f(reinterpret_cast<const char *>(iov->iov_base),
+                iov->iov_len);
+        }
+
+        uio->uio_iov++;
+        uio->uio_iovcnt--;
+        uio->uio_resid -= iov->iov_len;
+        uio->uio_offset += iov->iov_len;
+    }
+}
+
+#endif
+
+#endif /* !_UIO_H_ */
diff --git a/lib/vfscore/include/vfscore/vnode.h b/lib/vfscore/include/vfscore/vnode.h
new file mode 100644
index 00000000..e35aa830
--- /dev/null
+++ b/lib/vfscore/include/vfscore/vnode.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _SYS_VNODE_H_
+#define _SYS_VNODE_H_
+
+#ifdef _KERNEL
+
+#include <sys/cdefs.h>
+#include <sys/stat.h>
+#include <osv/prex.h>
+#include <osv/uio.h>
+#include <osv/mutex.h>
+#include "file.h"
+#include "dirent.h"
+
+__BEGIN_DECLS
+
+struct vfsops;
+struct vnops;
+struct vnode;
+struct file;
+
+/*
+ * Vnode types.
+ */
+enum vtype {
+       VNON,       /* no type */
+       VREG,       /* regular file  */
+       VDIR,       /* directory */
+       VBLK,       /* block device */
+       VCHR,       /* character device */
+       VLNK,       /* symbolic link */
+       VSOCK,      /* socks */
+       VFIFO,      /* FIFO */
+       VBAD
+};
+
+/*
+ * Reading or writing any of these items requires holding the
+ * appropriate lock.
+ */
+struct vnode {
+       uint64_t        v_ino;          /* inode number */
+       LIST_ENTRY(vnode) v_link;       /* link for hash list */
+       struct mount    *v_mount;       /* mounted vfs pointer */
+       struct vnops    *v_op;          /* vnode operations */
+       int             v_refcnt;       /* reference count */
+       int             v_type;         /* vnode type */
+       int             v_flags;        /* vnode flag */
+       mode_t          v_mode;         /* file mode */
+       off_t           v_size;         /* file size */
+       mutex_t         v_lock;         /* lock for this vnode */
+       LIST_HEAD(, dentry) v_names;    /* directory entries pointing at this */
+       int             v_nrlocks;      /* lock count (for debug) */
+       void            *v_data;        /* private data for fs */
+};
+
+/* flags for vnode */
+#define VROOT          0x0001          /* root of its file system */
+#define VISTTY         0x0002          /* device is tty */
+#define VPROTDEV       0x0004          /* protected device */
+
+/*
+ * Vnode attribute
+ */
+struct vattr {
+       unsigned int    va_mask;
+       enum vtype      va_type;        /* vnode type */
+       mode_t          va_mode;        /* file access mode */
+       nlink_t         va_nlink;
+       uid_t           va_uid;
+       gid_t           va_gid;
+       dev_t           va_fsid;        /* id of the underlying filesystem */
+       ino_t           va_nodeid;
+       struct timespec va_atime;
+       struct timespec va_mtime;
+       struct timespec va_ctime;
+       dev_t           va_rdev;
+       uint64_t        va_nblocks;
+       off_t           va_size;
+};
+
+/*
+ *  Modes.
+ */
+#define VAPPEND 00010
+#define        VREAD   00004           /* read, write, execute permissions */
+#define        VWRITE  00002
+#define        VEXEC   00001
+
+#define IO_APPEND      0x0001
+#define IO_SYNC                0x0002
+
+/*
+ * ARC actions
+ */
+#define ARC_ACTION_QUERY    0
+#define ARC_ACTION_HOLD     1
+#define ARC_ACTION_RELEASE  2
+
+typedef        int (*vnop_open_t)      (struct file *);
+typedef        int (*vnop_close_t)     (struct vnode *, struct file *);
+typedef        int (*vnop_read_t)      (struct vnode *, struct file *, struct uio *, int);
+typedef        int (*vnop_write_t)     (struct vnode *, struct uio *, int);
+typedef        int (*vnop_seek_t)      (struct vnode *, struct file *, off_t, off_t);
+typedef        int (*vnop_ioctl_t)     (struct vnode *, struct file *, u_long, void *);
+typedef        int (*vnop_fsync_t)     (struct vnode *, struct file *);
+typedef        int (*vnop_readdir_t)   (struct vnode *, struct file *, struct 
dirent *);
+typedef        int (*vnop_lookup_t)    (struct vnode *, char *, struct vnode 
**);
+typedef        int (*vnop_create_t)    (struct vnode *, char *, mode_t);
+typedef        int (*vnop_remove_t)    (struct vnode *, struct vnode *, char 
*);
+typedef        int (*vnop_rename_t)    (struct vnode *, struct vnode *, char *,
+                                struct vnode *, struct vnode *, char *);
+typedef        int (*vnop_mkdir_t)     (struct vnode *, char *, mode_t);
+typedef        int (*vnop_rmdir_t)     (struct vnode *, struct vnode *, char 
*);
+typedef        int (*vnop_getattr_t)   (struct vnode *, struct vattr *);
+typedef        int (*vnop_setattr_t)   (struct vnode *, struct vattr *);
+typedef        int (*vnop_inactive_t)  (struct vnode *);
+typedef        int (*vnop_truncate_t)  (struct vnode *, off_t);
+typedef        int (*vnop_link_t)      (struct vnode *, struct vnode *, char 
*);
+typedef int (*vnop_cache_t) (struct vnode *, struct file *, struct uio *);
+typedef int (*vnop_fallocate_t) (struct vnode *, int, loff_t, loff_t);
+typedef int (*vnop_readlink_t)  (struct vnode *, struct uio *);
+typedef int (*vnop_symlink_t)   (struct vnode *, char *, char *);
+
+/*
+ * vnode operations
+ *
+ * Table of operation callbacks, one slot per vnop_*_t type. The VOP_*
+ * macros reach it through the vnode's v_op pointer and call the slot
+ * directly, without checking it for NULL.
+ */
+struct vnops {
+       vnop_open_t             vop_open;
+       vnop_close_t            vop_close;
+       vnop_read_t             vop_read;
+       vnop_write_t            vop_write;
+       vnop_seek_t             vop_seek;
+       vnop_ioctl_t            vop_ioctl;
+       vnop_fsync_t            vop_fsync;
+       vnop_readdir_t          vop_readdir;
+       vnop_lookup_t           vop_lookup;
+       vnop_create_t           vop_create;
+       vnop_remove_t           vop_remove;
+       vnop_rename_t           vop_rename;
+       vnop_mkdir_t            vop_mkdir;
+       vnop_rmdir_t            vop_rmdir;
+       vnop_getattr_t          vop_getattr;
+       vnop_setattr_t          vop_setattr;
+       vnop_inactive_t         vop_inactive;
+       vnop_truncate_t         vop_truncate;
+       vnop_link_t             vop_link;
+       vnop_cache_t            vop_cache;
+       vnop_fallocate_t        vop_fallocate;
+       vnop_readlink_t         vop_readlink;
+       vnop_symlink_t          vop_symlink;
+};
+
+/*
+ * vnode interface
+ *
+ * Each macro dispatches to the matching slot of the vnode's v_op table.
+ * The vnode argument (VP / DVP / DVP1) is expanded twice -- once to find
+ * the table and once as the first call argument -- so it must not have
+ * side effects. VOP_OPEN is the exception: it passes only the file.
+ */
+#define VOP_OPEN(VP, FP)          ((VP)->v_op->vop_open)(FP)
+#define VOP_CLOSE(VP, FP)         ((VP)->v_op->vop_close)(VP, FP)
+#define VOP_READ(VP, FP, U, F)    ((VP)->v_op->vop_read)(VP, FP, U, F)
+#define VOP_CACHE(VP, FP, U)      ((VP)->v_op->vop_cache)(VP, FP, U)
+#define VOP_WRITE(VP, U, F)       ((VP)->v_op->vop_write)(VP, U, F)
+#define VOP_SEEK(VP, FP, OLD, NEW) ((VP)->v_op->vop_seek)(VP, FP, OLD, NEW)
+#define VOP_IOCTL(VP, FP, C, A)           ((VP)->v_op->vop_ioctl)(VP, FP, C, A)
+#define VOP_FSYNC(VP, FP)         ((VP)->v_op->vop_fsync)(VP, FP)
+#define VOP_READDIR(VP, FP, DIR)   ((VP)->v_op->vop_readdir)(VP, FP, DIR)
+#define VOP_LOOKUP(DVP, N, VP)    ((DVP)->v_op->vop_lookup)(DVP, N, VP)
+#define VOP_CREATE(DVP, N, M)     ((DVP)->v_op->vop_create)(DVP, N, M)
+#define VOP_REMOVE(DVP, VP, N)    ((DVP)->v_op->vop_remove)(DVP, VP, N)
+#define VOP_RENAME(DVP1, VP1, N1, DVP2, VP2, N2) \
+                          ((DVP1)->v_op->vop_rename)(DVP1, VP1, N1, \
+                                                     DVP2, VP2, N2)
+#define VOP_MKDIR(DVP, N, M)      ((DVP)->v_op->vop_mkdir)(DVP, N, M)
+#define VOP_RMDIR(DVP, VP, N)     ((DVP)->v_op->vop_rmdir)(DVP, VP, N)
+#define VOP_GETATTR(VP, VAP)      ((VP)->v_op->vop_getattr)(VP, VAP)
+#define VOP_SETATTR(VP, VAP)      ((VP)->v_op->vop_setattr)(VP, VAP)
+#define VOP_INACTIVE(VP)          ((VP)->v_op->vop_inactive)(VP)
+#define VOP_TRUNCATE(VP, N)       ((VP)->v_op->vop_truncate)(VP, N)
+#define VOP_LINK(DVP, SVP, N)     ((DVP)->v_op->vop_link)(DVP, SVP, N)
+#define VOP_FALLOCATE(VP, M, OFF, LEN) \
+                          ((VP)->v_op->vop_fallocate)(VP, M, OFF, LEN)
+#define VOP_READLINK(VP, U)        ((VP)->v_op->vop_readlink)(VP, U)
+#define VOP_SYMLINK(DVP, OP, NP)   ((DVP)->v_op->vop_symlink)(DVP, OP, NP)
+
+/*
+ * NOTE(review): the vop_* functions below are presumably generic stubs a
+ * file system can plug into its vnops table (no-op, EINVAL, EPERM, EROFS,
+ * going by their names) -- confirm against their definitions.
+ */
+int     vop_nullop(void);
+int     vop_einval(void);
+int     vop_eperm(void);
+int     vop_erofs(void);
+/*
+ * Generic vnode helpers. Their definitions are not part of this header;
+ * vn_lock / vn_unlock bracket the VOP_* calls made by the path lookup code.
+ */
+struct vnode *vn_lookup(struct mount *, uint64_t);
+void    vn_lock(struct vnode *);
+void    vn_unlock(struct vnode *);
+int     vn_stat(struct vnode *, struct stat *);
+int     vn_settimes(struct vnode *, struct timespec[2]);
+int     vn_setmode(struct vnode *, mode_t mode);
+int     vn_access(struct vnode *, int);
+int     vget(struct mount *, uint64_t ino, struct vnode **vpp);
+void    vput(struct vnode *);
+void    vref(struct vnode *);
+void    vrele(struct vnode *);
+void    vflush(struct mount *);
+/* Maintain the list of dentries that point at a vnode (v_names). */
+void vn_add_name(struct vnode *, struct dentry *);
+void vn_del_name(struct vnode *, struct dentry *);
+
+/*
+ * Conversion between the file-type bits of a mode (S_IFMT) and enum vtype.
+ * Both tables are defined outside this header.
+ */
+extern enum vtype iftovt_tab[];
+extern int vttoif_tab[];
+/* S_IFMT occupies bits 12..15, so the shift yields the table index. */
+#define IFTOVT(mode)    (iftovt_tab[((mode) & S_IFMT) >> 12])
+#define VTTOIF(indx)   (vttoif_tab[(int)(indx)])
+/*
+ * The whole expansion is parenthesized so the cast cannot combine with
+ * operators written next to the macro call.
+ */
+#define MAKEIMODE(indx, mode)   ((int)(VTTOIF(indx) | (mode)))
+
+/* Reset every field of the vattr pointed to by vp to zero. */
+#define VATTR_NULL(vp) (*(vp) = (vattr_t){})
+
+/* Record @size as the file size (v_size) of @vp. Nothing else is updated. */
+static inline void vnode_pager_setsize(struct vnode *vp, off_t size)
+{
+       vp->v_size = size;
+}
+
+__END_DECLS
+
+#endif
+
+#endif /* !_SYS_VNODE_H_ */
diff --git a/lib/vfscore/lookup.c b/lib/vfscore/lookup.c
new file mode 100644
index 00000000..ad03fe25
--- /dev/null
+++ b/lib/vfscore/lookup.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/param.h>
+
+#include <osv/dentry.h>
+#include <osv/vnode.h>
+#include "vfs.h"
+
+/*
+ * Read the target of the symbolic link @vp into @buf.
+ *
+ * @vp:    vnode of the link; it is locked only around the VOP_READLINK call.
+ * @buf:   destination buffer.
+ * @bufsz: capacity of @buf in bytes.
+ * @sz:    receives the number of bytes stored (0 on failure).
+ *
+ * The result is not NUL-terminated here; the caller has to do that.
+ * Returns 0 on success or the error reported by VOP_READLINK.
+ */
+static ssize_t
+read_link(struct vnode *vp, char *buf, size_t bufsz, ssize_t *sz)
+{
+    struct iovec vec = {buf, bufsz};
+    struct uio   req = {&vec, 1, 0, (ssize_t) bufsz, UIO_READ};
+
+    *sz = 0;
+
+    vn_lock(vp);
+    const int status = VOP_READLINK(vp, &req);
+    vn_unlock(vp);
+
+    if (status == 0) {
+        /* Whatever was not left over in the request has been filled in. */
+        *sz = bufsz - req.uio_resid;
+    }
+    return (status);
+}
+
+/*
+ * Rewrite the path being resolved after a symbolic link was reached.
+ *
+ * @dp:             dentry of the symbolic link.
+ * @node:           path of the link relative to its mount point.
+ * @name:           last component of @node, i.e. the link's own name.
+ * @fp:             the full path under resolution (PATH_MAX bytes); on
+ *                  success it holds the path with the link expanded.
+ * @mountpoint_len: length of the mount point prefix at the start of @fp.
+ *
+ * An absolute link target replaces everything up to and including the
+ * link; a relative one is resolved against the link's parent directory
+ * with path_conv(). In both cases the not yet visited remainder of the
+ * path is appended again. @node and @name are emptied before returning.
+ *
+ * Returns 0 on success or the error from reading the link.
+ */
+int
+namei_follow_link(struct dentry *dp, char *node, char *name, char *fp,
+                  size_t mountpoint_len)
+{
+    std::unique_ptr<char []> link (new char[PATH_MAX]);
+    std::unique_ptr<char []> t (new char[PATH_MAX]);
+    char    *lp;
+    int     error;
+    ssize_t sz;
+    char    *p;
+    int     c;
+
+    lp    = link.get();
+    /*
+     * Keep one byte back for the terminator written below; a target that
+     * filled all PATH_MAX bytes would otherwise put the NUL past the end
+     * of the buffer.
+     */
+    error = read_link(dp->d_vnode, lp, PATH_MAX - 1, &sz);
+    if (error != 0) {
+        return (error);
+    }
+    lp[sz] = 0;
+
+    /* p: the part of the path that follows the link component. */
+    p = fp + mountpoint_len + strlen(node);
+    /* c: length of the link's parent directory inside node. */
+    c = strlen(node) - strlen(name) - 1;
+    node[c] = 0;
+
+    if (lp[0] == '/') {
+        strlcat(lp, p, PATH_MAX);
+        strlcpy(fp, lp, PATH_MAX);
+    } else {
+        /* fp is about to be overwritten, so save the remainder first. */
+        strlcpy(t.get(), p, PATH_MAX);
+        /* node := mount point + parent directory of the link. */
+        strlcpy(node, fp, mountpoint_len + c + 1);
+        path_conv(node, lp, fp);
+        strlcat(fp, t.get(), PATH_MAX);
+    }
+    node[0] = 0;
+    name[0] = 0;
+    return (0);
+}
+/*
+ * Convert a pathname into a pointer to a dentry
+ *
+ * @path: full path name.
+ * @dpp:  dentry to be returned.
+ *
+ * The path is split into its mount point and the node inside that mount.
+ * If the dentry cache already knows the node it is returned right away;
+ * otherwise the path is walked component by component from the root of
+ * the mount, consulting the cache first and VOP_LOOKUP second. When a
+ * component turns out to be a symbolic link, the path is rewritten by
+ * namei_follow_link() and the whole resolution starts over.
+ *
+ * Returns 0 on success, ENOTDIR if no mount matches or an intermediate
+ * component is not a directory, ENOMEM if a dentry cannot be allocated,
+ * ELOOP once MAXSYMLINKS links have been followed, or the error reported
+ * by VOP_LOOKUP / namei_follow_link().
+ * NOTE(review): the returned dentry presumably carries a reference that
+ * the caller drops with drele() (lookup() does so on its error path).
+ */
+int
+namei(const char *path, struct dentry **dpp)
+{
+    char *p;
+    char node[PATH_MAX];
+    char name[PATH_MAX];
+    std::unique_ptr<char []> fp (new char [PATH_MAX]);
+    std::unique_ptr<char []> t (new char [PATH_MAX]);
+    struct mount *mp;
+    struct dentry *dp, *ddp;
+    struct vnode *dvp, *vp;
+    int error, i;
+    int links_followed;
+    bool need_continue;
+
+    DPRINTF(VFSDB_VNODE, ("namei: path=%s\n", path));
+
+    links_followed = 0;
+    strlcpy(fp.get(), path, PATH_MAX);
+
+    do {
+        need_continue = false;
+        /*
+         * Convert a full path name to its mount point and
+         * the local node in the file system.
+         */
+        if (vfs_findroot(fp.get(), &mp, &p)) {
+            return ENOTDIR;
+        }
+        int mountpoint_len = p - fp.get() - 1;
+        strlcpy(node, "/", sizeof(node));
+        strlcat(node, p, sizeof(node));
+        dp = dentry_lookup(mp, node);
+        if (dp) {
+            /* vnode is already active. */
+            *dpp = dp;
+            return 0;
+        }
+        /*
+         * Find target vnode, started from root directory.
+         * This is done to attach the fs specific data to
+         * the target vnode.
+         */
+        ddp = mp->m_root;
+        if (!ddp) {
+            sys_panic("VFS: no root");
+        }
+        dref(ddp);
+
+        /* node is rebuilt one component at a time while walking. */
+        node[0] = '\0';
+
+        while (*p != '\0') {
+            /*
+             * Get lower directory/file name.
+             */
+            while (*p == '/') {
+                p++;
+            }
+
+            if (*p == '\0') {
+                break;
+            }
+
+            for (i = 0; i < PATH_MAX; i++) {
+                if (*p == '\0' || *p == '/') {
+                    break;
+                }
+                name[i] = *p++;
+            }
+            name[i] = '\0';
+
+            /*
+             * Get a vnode for the target.
+             */
+            strlcat(node, "/", sizeof(node));
+            strlcat(node, name, sizeof(node));
+            dvp = ddp->d_vnode;
+            vn_lock(dvp);
+            dp = dentry_lookup(mp, node);
+            if (dp == nullptr) {
+                /* Find a vnode in this directory. */
+                error = VOP_LOOKUP(dvp, name, &vp);
+                if (error) {
+                    vn_unlock(dvp);
+                    drele(ddp);
+                    return error;
+                }
+
+                dp = dentry_alloc(ddp, vp, node);
+                vput(vp);
+
+                if (!dp) {
+                    vn_unlock(dvp);
+                    drele(ddp);
+                    return ENOMEM;
+                }
+            }
+            vn_unlock(dvp);
+            /* Step down: the child becomes the parent of the next round. */
+            drele(ddp);
+            ddp = dp;
+
+            if (dp->d_vnode->v_type == VLNK) {
+                error = namei_follow_link(dp, node, name, fp.get(),
+                                          mountpoint_len);
+                if (error) {
+                    drele(dp);
+                    return (error);
+                }
+
+                drele(dp);
+
+                /* fp now holds the expanded path; start from scratch. */
+                p       = fp.get();
+                dp      = nullptr;
+                ddp     = nullptr;
+                vp      = nullptr;
+                dvp     = nullptr;
+                name[0] = 0;
+                node[0] = 0;
+
+                if (++links_followed >= MAXSYMLINKS) {
+                    return (ELOOP);
+                }
+                need_continue = true;
+                break;
+            }
+
+            /* More components follow, so this one must be a directory. */
+            if (*p == '/' && ddp->d_vnode->v_type != VDIR) {
+                drele(ddp);
+                return ENOTDIR;
+            }
+        }
+    } while (need_continue == true);
+
+    *dpp = dp;
+    return 0;
+}
+
+/*
+ * Convert last component in the path to pointer to dentry
+ *
+ * @path: full path name
+ * @ddp : pointer to dentry of parent
+ * @dpp : dentry to be returned
+ *
+ * Unlike namei(), the type of the resulting vnode is never inspected, so
+ * a symbolic link in the last position is returned as is.
+ *
+ * Returns 0 on success, ENOTDIR if @path is not absolute or no mount
+ * matches it, ENOMEM if the dentry cannot be allocated, or the error
+ * reported by VOP_LOOKUP.
+ */
+int
+namei_last_nofollow(char *path, struct dentry *ddp, struct dentry **dpp)
+{
+    char          *name;
+    int           error;
+    struct mount  *mp;
+    char          *p;
+    struct dentry *dp;
+    struct vnode  *dvp;
+    struct vnode  *vp;
+    std::unique_ptr<char []> node (new char[PATH_MAX]);
+
+    dvp  = nullptr;
+
+    if (path[0] != '/') {
+        return (ENOTDIR);
+    }
+
+    // Cannot be null after the check above; kept as a safety net.
+    name = strrchr(path, '/');
+    if (name == nullptr) {
+        return (ENOENT);
+    }
+    name++;
+
+    error = vfs_findroot(path, &mp, &p);
+    if (error != 0) {
+        return (ENOTDIR);
+    }
+
+    // node is the path relative to the mount, with a leading slash.
+    strlcpy(node.get(), "/", PATH_MAX);
+    strlcat(node.get(), p, PATH_MAX);
+
+    // We want to treat things like /tmp/ the same as /tmp. Best way to do that
+    // is to ignore the last character, except when we're stating the root.
+    auto l = strlen(node.get()) - 1;
+    if (l && node.get()[l] == '/') {
+        node.get()[l] = '\0';
+    }
+
+    // The parent stays locked across the cache lookup and the VOP_LOOKUP.
+    dvp = ddp->d_vnode;
+    vn_lock(dvp);
+    dp = dentry_lookup(mp, node.get());
+    if (dp == nullptr) {
+        error = VOP_LOOKUP(dvp, name, &vp);
+        if (error != 0) {
+            goto out;
+        }
+
+        dp = dentry_alloc(ddp, vp, node.get());
+        vput(vp);
+
+        if (dp == nullptr) {
+            error = ENOMEM;
+            goto out;
+        }
+    }
+
+    *dpp  = dp;
+    error = 0;
+out:
+    if (dvp != nullptr) {
+        vn_unlock(dvp);
+    }
+    return (error);
+}
+
+/*
+ * Search a pathname.
+ * This is a very central but not so complicated routine. ;-P
+ *
+ * @path: full path.
+ * @dpp:  pointer to dentry for directory.
+ * @name: if non-null, pointer to file name in path.
+ *
+ * The directory part of @path (everything before the last '/') is resolved
+ * with namei() and its dentry is stored in *@dpp; no vnode lock is taken
+ * by this routine. When @name is given it is pointed at the character
+ * following the last '/' of @path itself, not at a copy.
+ *
+ * Returns 0 on success, ENOTDIR if @path is empty, contains no '/', or its
+ * directory part is not a directory, or the error reported by namei().
+ */
+int
+lookup(char *path, struct dentry **dpp, char **name)
+{
+    char buf[PATH_MAX];
+    char root[] = "/";
+    char *file, *dir;
+    struct dentry *dp;
+    int error;
+
+    DPRINTF(VFSDB_VNODE, ("lookup: path=%s\n", path));
+
+    /*
+     * Get the path for directory.
+     */
+    strlcpy(buf, path, sizeof(buf));
+    file = strrchr(buf, '/');
+    /*
+     * A path without any '/' has no directory part; reject it here
+     * rather than writing through a null pointer below.
+     */
+    if (!buf[0] || file == nullptr) {
+        return ENOTDIR;
+    }
+    if (file == buf) {
+        /* The only slash is the leading one: the directory is the root. */
+        dir = root;
+    } else {
+        *file = '\0';
+        dir = buf;
+    }
+    /*
+     * Get the vnode for directory
+     */
+    if ((error = namei(dir, &dp)) != 0) {
+        return error;
+    }
+    if (dp->d_vnode->v_type != VDIR) {
+        drele(dp);
+        return ENOTDIR;
+    }
+
+    *dpp = dp;
+
+    if (name) {
+        /*
+         * Get the file name
+         */
+        *name = strrchr(path, '/') + 1;
+    }
+    return 0;
+}
+
+/*
+ * lookup_init() is called once (from vfs_init)
+ * in initialization. All it does is initialize the dentry layer.
+ */
+void
+lookup_init(void)
+{
+    dentry_init();
+}
diff --git a/lib/vfscore/main.c b/lib/vfscore/main.c
new file mode 100644
index 00000000..cd141117
--- /dev/null
+++ b/lib/vfscore/main.c
@@ -0,0 +1,2413 @@
+/*
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/sendfile.h>
+
+#include <limits.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <signal.h>
+#define open __open_variadic
+#define fcntl __fcntl_variadic
+#include <fcntl.h>
+#undef open
+#undef fcntl
+
+#include <osv/prex.h>
+#include <osv/vnode.h>
+#include <osv/stubbing.hh>
+#include <osv/ioctl.h>
+#include <osv/trace.hh>
+#include <osv/run.hh>
+#include <drivers/console.hh>
+
+#include "vfs.h"
+
+#include "libc/internal/libc.h"
+
+#include <algorithm>
+#include <unordered_map>
+
+#include <sys/file.h>
+
+#include "fs/fs.hh"
+#include "libc/libc.hh"
+
+#include <mntent.h>
+#include <sys/mman.h>
+
+#include <osv/clock.hh>
+#include <api/utime.h>
+#include <chrono>
+
+using namespace std;
+
+
+#ifdef DEBUG_VFS
+int    vfs_debug = VFSDB_FLAGS;
+#endif
+
+std::atomic<mode_t> global_umask{S_IWGRP | S_IWOTH};
+
+/* Return @mode with every bit that is set in global_umask cleared. */
+static inline mode_t apply_umask(mode_t mode)
+{
+    const mode_t masked_out = global_umask.load(std::memory_order_relaxed);
+
+    return mode & ~masked_out;
+}
+
+TRACEPOINT(trace_vfs_open, "\"%s\" 0x%x 0%0o", const char*, int, mode_t);
+TRACEPOINT(trace_vfs_open_ret, "%d", int);
+TRACEPOINT(trace_vfs_open_err, "%d", int);
+
+struct task *main_task;        /* we only have a single process */
+
+/*
+ * open(2): open @pathname and return a new file descriptor.
+ *
+ * The optional third argument (the creation mode) is read only when
+ * O_CREAT is set, and the global umask is applied to it.
+ * Returns the descriptor, or -1 with errno set to the failing step's error.
+ */
+extern "C"
+int open(const char *pathname, int flags, ...)
+{
+    mode_t mode = 0;
+    if (flags & O_CREAT) {
+        va_list args;
+        va_start(args, flags);
+        mode = apply_umask(va_arg(args, mode_t));
+        va_end(args);
+    }
+
+    trace_vfs_open(pathname, flags, mode);
+
+    /* Translate the access mode into the bits task_conv() expects. */
+    int acc = 0;
+    const int accmode = flags & O_ACCMODE;
+    if (accmode == O_RDONLY) {
+        acc = VREAD;
+    } else if (accmode == O_WRONLY) {
+        acc = VWRITE;
+    } else if (accmode == O_RDWR) {
+        acc = VREAD | VWRITE;
+    }
+
+    char path[PATH_MAX];
+    struct file *fp;
+    int fd;
+
+    int error = task_conv(main_task, pathname, acc, path);
+    if (!error) {
+        error = sys_open(path, flags, mode, &fp);
+    }
+    if (!error) {
+        error = fdalloc(fp, &fd);
+        /* fp is dropped whether or not a descriptor could be allocated. */
+        fdrop(fp);
+    }
+
+    if (error) {
+        errno = error;
+        trace_vfs_open_err(error);
+        return -1;
+    }
+
+    trace_vfs_open_ret(fd);
+    return fd;
+}
+
+LFS64(open);
+
+/*
+ * openat(2): like open(), but a relative @pathname is interpreted relative
+ * to the directory open on @dirfd instead of the working directory.
+ *
+ * The absolute path is assembled from the mount path and the dentry path
+ * of @dirfd, then handed to open(). The directory vnode stays locked while
+ * that happens.
+ * Returns the new descriptor, or -1 with errno set.
+ */
+int openat(int dirfd, const char *pathname, int flags, ...)
+{
+    mode_t mode = 0;
+    if (flags & O_CREAT) {
+        va_list args;
+        va_start(args, flags);
+        mode = apply_umask(va_arg(args, mode_t));
+        va_end(args);
+    }
+
+    /* Absolute paths and AT_FDCWD need no help from dirfd. */
+    const bool plain_open = (pathname[0] == '/') || (dirfd == AT_FDCWD);
+    if (plain_open) {
+        return open(pathname, flags, mode);
+    }
+
+    struct file *fp;
+    const int error = fget(dirfd, &fp);
+    if (error != 0) {
+        errno = error;
+        return -1;
+    }
+
+    struct vnode *dvp = fp->f_dentry->d_vnode;
+    vn_lock(dvp);
+
+    std::unique_ptr<char []> storage (new char[PATH_MAX]);
+    char *abs_path = storage.get();
+
+    /* mount path + path of the directory + "/" + relative name */
+    strlcpy(abs_path, fp->f_dentry->d_mount->m_path, PATH_MAX);
+    strlcat(abs_path, fp->f_dentry->d_path, PATH_MAX);
+    strlcat(abs_path, "/", PATH_MAX);
+    strlcat(abs_path, pathname, PATH_MAX);
+
+    const int fd = open(abs_path, flags, mode);
+
+    vn_unlock(dvp);
+    fdrop(fp);
+
+    return fd;
+}
+LFS64(openat);
+
+// open() has an optional third argument, "mode", which is only needed in
+// some cases (when the O_CREAT flag is used). As a safety feature, recent
+// versions of Glibc add a feature where open() with two arguments is replaced
+// by a call to __open_2(), which verifies it isn't called with O_CREAT.
+// Here the verification is an assert; the call is then forwarded to open()
+// with a mode of 0.
+extern "C" int __open_2(const char *pathname, int flags)
+{
+    assert(!(flags & O_CREAT));
+    return open(pathname, flags, 0);
+}
+
+// 64-bit counterpart of __open_2(). Unlike __open_2(), a call with O_CREAT
+// is not asserted on but rejected with -1 / EINVAL; any other call is
+// forwarded to open64() without a mode argument.
+extern "C" int __open64_2(const char *file, int flags)
+{
+    if (flags & O_CREAT) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    return open64(file, flags);
+}
+
+/*
+ * creat(2): shorthand for open() with O_CREAT|O_WRONLY|O_TRUNC.
+ * The umask is applied to @mode inside open().
+ */
+int creat(const char *pathname, mode_t mode)
+{
+    return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode);
+}
+LFS64(creat);
+
+TRACEPOINT(trace_vfs_close, "%d", int);
+TRACEPOINT(trace_vfs_close_ret, "");
+TRACEPOINT(trace_vfs_close_err, "%d", int);
+
+/*
+ * close(2): release descriptor @fd through fdclose().
+ * Returns 0, or -1 with errno set to the error fdclose() reported.
+ */
+int close(int fd)
+{
+    trace_vfs_close(fd);
+
+    const int error = fdclose(fd);
+    if (error == 0) {
+        trace_vfs_close_ret();
+        return 0;
+    }
+
+    trace_vfs_close_err(error);
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_mknod, "\"%s\" 0%0o 0x%x", const char*, mode_t, dev_t);
+TRACEPOINT(trace_vfs_mknod_ret, "");
+TRACEPOINT(trace_vfs_mknod_err, "%d", int);
+
+
+/*
+ * Versioned entry point behind mknod(). Only version 0 is accepted.
+ * @dev is dereferenced for tracing only; the node is created from
+ * @pathname and @mode alone.
+ * Returns 0, or -1 with errno set.
+ */
+extern "C"
+int __xmknod(int ver, const char *pathname, mode_t mode, dev_t *dev)
+{
+    assert(ver == 0); // On x86-64 Linux, _MKNOD_VER_LINUX is 0.
+
+    char path[PATH_MAX];
+
+    trace_vfs_mknod(pathname, mode, *dev);
+
+    int error = task_conv(main_task, pathname, VWRITE, path);
+    if (error == 0) {
+        error = sys_mknod(path, mode);
+    }
+
+    if (error != 0) {
+        trace_vfs_mknod_err(error);
+        errno = error;
+        return -1;
+    }
+
+    trace_vfs_mknod_ret();
+    return 0;
+}
+
+/* mknod(2): forwards to __xmknod() with version 0, the only one it accepts. */
+int mknod(const char *pathname, mode_t mode, dev_t dev)
+{
+    return __xmknod(0, pathname, mode, &dev);
+}
+
+
+TRACEPOINT(trace_vfs_lseek, "%d 0x%x %d", int, off_t, int);
+TRACEPOINT(trace_vfs_lseek_ret, "0x%x", off_t);
+TRACEPOINT(trace_vfs_lseek_err, "%d", int);
+
+/*
+ * lseek(2): reposition the offset of @fd via sys_lseek().
+ * Returns the offset sys_lseek() reports, or -1 with errno set.
+ */
+off_t lseek(int fd, off_t offset, int whence)
+{
+    struct file *fp;
+    off_t pos;
+
+    trace_vfs_lseek(fd, offset, whence);
+
+    int error = fget(fd, &fp);
+    if (!error) {
+        error = sys_lseek(fp, offset, whence, &pos);
+        fdrop(fp);
+    }
+
+    if (error) {
+        trace_vfs_lseek_err(error);
+        errno = error;
+        return -1;
+    }
+
+    trace_vfs_lseek_ret(pos);
+    return pos;
+}
+
+LFS64(lseek);
+
+TRACEPOINT(trace_vfs_pread, "%d %p 0x%x 0x%x", int, void*, size_t, off_t);
+TRACEPOINT(trace_vfs_pread_ret, "0x%x", ssize_t);
+TRACEPOINT(trace_vfs_pread_err, "%d", int);
+
+// Decide whether a read/write result has to be reported as a failure.
+//
+// BSD's internal read() and write() code (sosend_generic() for example) may
+// hand back an EWOULDBLOCK error *together with* a non-zero byte count when
+// the transfer was only partial. Such a call must look like a successful
+// short read/write to the application, so the error is ignored in that case.
+// EINTR and ERESTART are treated the same way. FreeBSD's dofilewrite() and
+// dofileread() (sys_generic.c) do this too.
+static inline bool has_error(int error, int bytes)
+{
+    if (!error) {
+        return false;
+    }
+    if (bytes == 0) {
+        // Nothing was transferred: every error counts.
+        return true;
+    }
+    return error != EWOULDBLOCK && error != EINTR && error != ERESTART;
+}
+
+
+/*
+ * pread(2): read up to @count bytes from @fd into @buf at @offset.
+ * Returns the number of bytes read, or -1 with errno set. A partial read
+ * that also produced an interruptible error counts as success (has_error).
+ */
+ssize_t pread(int fd, void *buf, size_t count, off_t offset)
+{
+    trace_vfs_pread(fd, buf, count, offset);
+
+    struct iovec iov = {
+            .iov_base  = buf,
+            .iov_len   = count,
+    };
+    struct file *fp;
+    size_t nread;
+
+    int error = fget(fd, &fp);
+    if (!error) {
+        error = sys_read(fp, &iov, 1, offset, &nread);
+        fdrop(fp);
+
+        if (!has_error(error, nread)) {
+            trace_vfs_pread_ret(nread);
+            return nread;
+        }
+    }
+
+    trace_vfs_pread_err(error);
+    errno = error;
+    return -1;
+}
+
+LFS64(pread);
+
+/*
+ * read(2): pread() with an offset of -1.
+ * NOTE(review): -1 presumably tells sys_read() to use the file's current
+ * position -- confirm in sys_read().
+ */
+ssize_t read(int fd, void *buf, size_t count)
+{
+    return pread(fd, buf, count, -1);
+}
+
+TRACEPOINT(trace_vfs_pwrite, "%d %p 0x%x 0x%x", int, const void*, size_t, 
off_t);
+TRACEPOINT(trace_vfs_pwrite_ret, "0x%x", ssize_t);
+TRACEPOINT(trace_vfs_pwrite_err, "%d", int);
+
+/*
+ * pwrite(2): write up to @count bytes from @buf to @fd at @offset.
+ * Returns the number of bytes written, or -1 with errno set. A partial
+ * write that also produced an interruptible error counts as success.
+ */
+ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+    trace_vfs_pwrite(fd, buf, count, offset);
+
+    /* struct iovec has no const variant, hence the cast. */
+    struct iovec iov = {
+            .iov_base  = (void *)buf,
+            .iov_len   = count,
+    };
+    struct file *fp;
+    size_t nwritten;
+
+    int error = fget(fd, &fp);
+    if (!error) {
+        error = sys_write(fp, &iov, 1, offset, &nwritten);
+        fdrop(fp);
+
+        if (!has_error(error, nwritten)) {
+            trace_vfs_pwrite_ret(nwritten);
+            return nwritten;
+        }
+    }
+
+    trace_vfs_pwrite_err(error);
+    errno = error;
+    return -1;
+}
+
+LFS64(pwrite);
+
+/*
+ * write(2): pwrite() with an offset of -1.
+ * NOTE(review): -1 presumably tells sys_write() to use the file's current
+ * position -- confirm in sys_write().
+ */
+ssize_t write(int fd, const void *buf, size_t count)
+{
+    return pwrite(fd, buf, count, -1);
+}
+
+/*
+ * preadv(2): scatter-read from @fd at @offset into the @iovcnt buffers
+ * described by @iov.
+ * Returns the number of bytes read, or -1 with errno set.
+ */
+ssize_t preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+    struct file *fp;
+    size_t nread;
+
+    int error = fget(fd, &fp);
+    if (!error) {
+        error = sys_read(fp, iov, iovcnt, offset, &nread);
+        fdrop(fp);
+
+        if (!has_error(error, nread)) {
+            return nread;
+        }
+    }
+
+    errno = error;
+    return -1;
+}
+
+LFS64(preadv);
+
+/* readv(2): preadv() with an offset of -1 (see the note on read()'s use). */
+ssize_t readv(int fd, const struct iovec *iov, int iovcnt)
+{
+    return preadv(fd, iov, iovcnt, -1);
+}
+
+TRACEPOINT(trace_vfs_pwritev, "%d %p 0x%x 0x%x", int, const struct iovec*, 
int, off_t);
+TRACEPOINT(trace_vfs_pwritev_ret, "0x%x", ssize_t);
+TRACEPOINT(trace_vfs_pwritev_err, "%d", int);
+
+/*
+ * pwritev(2): gather-write the @iovcnt buffers described by @iov to @fd
+ * at @offset.
+ * Returns the number of bytes written, or -1 with errno set.
+ */
+ssize_t pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+    struct file *fp;
+    size_t nwritten;
+
+    trace_vfs_pwritev(fd, iov, iovcnt, offset);
+
+    int error = fget(fd, &fp);
+    if (!error) {
+        error = sys_write(fp, iov, iovcnt, offset, &nwritten);
+        fdrop(fp);
+
+        if (!has_error(error, nwritten)) {
+            trace_vfs_pwritev_ret(nwritten);
+            return nwritten;
+        }
+    }
+
+    trace_vfs_pwritev_err(error);
+    errno = error;
+    return -1;
+}
+LFS64(pwritev);
+
+/* writev(2): pwritev() with an offset of -1. */
+ssize_t writev(int fd, const struct iovec *iov, int iovcnt)
+{
+    return pwritev(fd, iov, iovcnt, -1);
+}
+
+TRACEPOINT(trace_vfs_ioctl, "%d 0x%x", int, unsigned long);
+TRACEPOINT(trace_vfs_ioctl_ret, "");
+TRACEPOINT(trace_vfs_ioctl_err, "%d", int);
+
+/*
+ * ioctl(2): pass @request and its single pointer-sized argument on to
+ * sys_ioctl() for the file behind @fd.
+ * Returns 0, or -1 with errno set.
+ */
+int ioctl(int fd, unsigned long int request, ...)
+{
+    struct file *fp;
+    va_list args;
+
+    trace_vfs_ioctl(fd, request);
+
+    /* glibc ABI provides a variadic prototype for ioctl so we need to agree
+     * with it, since we now include sys/ioctl.h
+     * read the first argument and pass it to sys_ioctl() */
+    va_start(args, request);
+    void *arg = va_arg(args, void*);
+    va_end(args);
+
+    int error = fget(fd, &fp);
+    if (!error) {
+        error = sys_ioctl(fp, request, arg);
+        fdrop(fp);
+    }
+
+    if (error) {
+        trace_vfs_ioctl_err(error);
+        errno = error;
+        return -1;
+    }
+
+    trace_vfs_ioctl_ret();
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_fsync, "%d", int);
+TRACEPOINT(trace_vfs_fsync_ret, "");
+TRACEPOINT(trace_vfs_fsync_err, "%d", int);
+
+/*
+ * fsync(2): hand the file behind @fd to sys_fsync().
+ * Returns 0, or -1 with errno set.
+ */
+int fsync(int fd)
+{
+    struct file *fp;
+
+    trace_vfs_fsync(fd);
+
+    int error = fget(fd, &fp);
+    if (!error) {
+        error = sys_fsync(fp);
+        fdrop(fp);
+    }
+
+    if (error) {
+        trace_vfs_fsync_err(error);
+        errno = error;
+        return -1;
+    }
+
+    trace_vfs_fsync_ret();
+    return 0;
+}
+
+/* fdatasync(2): currently identical to fsync(). */
+int fdatasync(int fd)
+{
+    // TODO: See if we can do less than fsync().
+    return fsync(fd);
+}
+
+TRACEPOINT(trace_vfs_fstat, "%d %p", int, struct stat*);
+TRACEPOINT(trace_vfs_fstat_ret, "");
+TRACEPOINT(trace_vfs_fstat_err, "%d", int);
+
+/*
+ * Versioned entry point behind fstat(). @ver is accepted but not looked at.
+ * Fills @st through sys_fstat(); returns 0, or -1 with errno set.
+ */
+extern "C"
+int __fxstat(int ver, int fd, struct stat *st)
+{
+    struct file *fp;
+
+    trace_vfs_fstat(fd, st);
+
+    int error = fget(fd, &fp);
+    if (!error) {
+        error = sys_fstat(fp, st);
+        fdrop(fp);
+    }
+
+    if (error) {
+        trace_vfs_fstat_err(error);
+        errno = error;
+        return -1;
+    }
+
+    trace_vfs_fstat_ret();
+    return 0;
+}
+
+LFS64(__fxstat);
+
+/* fstat(2): forwards to __fxstat(), which ignores the version argument. */
+extern "C"
+int fstat(int fd, struct stat *st)
+{
+    return __fxstat(1, fd, st);
+}
+
+LFS64(fstat);
+
+/*
+ * Versioned entry point behind fstatat(). @ver is not looked at.
+ *
+ * Absolute paths and AT_FDCWD go straight to stat(); an empty path with
+ * AT_EMPTY_PATH goes to fstat() on @dirfd. Otherwise the absolute path is
+ * assembled from the mount path and dentry path of @dirfd and passed to
+ * stat() while the directory vnode is locked. AT_SYMLINK_NOFOLLOW is
+ * reported through UNIMPLEMENTED().
+ * Returns 0, or -1 with errno set.
+ */
+extern "C"
+int __fxstatat(int ver, int dirfd, const char *pathname, struct stat *st,
+        int flags)
+{
+    if (flags & AT_SYMLINK_NOFOLLOW) {
+        UNIMPLEMENTED("fstatat() with AT_SYMLINK_NOFOLLOW");
+    }
+
+    const bool plain_stat = (pathname[0] == '/') || (dirfd == AT_FDCWD);
+    if (plain_stat) {
+        return stat(pathname, st);
+    }
+
+    // If AT_EMPTY_PATH and pathname is an empty string, fstatat() operates on
+    // dirfd itself, and in that case it doesn't have to be a directory.
+    const bool stat_dirfd = (flags & AT_EMPTY_PATH) && (pathname[0] == '\0');
+    if (stat_dirfd) {
+        return fstat(dirfd, st);
+    }
+
+    struct file *fp;
+    const int error = fget(dirfd, &fp);
+    if (error != 0) {
+        errno = error;
+        return -1;
+    }
+
+    struct vnode *dvp = fp->f_dentry->d_vnode;
+    vn_lock(dvp);
+
+    std::unique_ptr<char []> storage (new char[PATH_MAX]);
+    char *abs_path = storage.get();
+
+    /* mount path + path of the directory + "/" + relative name */
+    strlcpy(abs_path, fp->f_dentry->d_mount->m_path, PATH_MAX);
+    strlcat(abs_path, fp->f_dentry->d_path, PATH_MAX);
+    strlcat(abs_path, "/", PATH_MAX);
+    strlcat(abs_path, pathname, PATH_MAX);
+
+    const int ret = stat(abs_path, st);
+
+    vn_unlock(dvp);
+    fdrop(fp);
+
+    return ret;
+}
+
+LFS64(__fxstatat);
+
+/* fstatat(2): forwards to __fxstatat(), which ignores the version argument. */
+extern "C"
+int fstatat(int dirfd, const char *path, struct stat *st, int flags)
+{
+    return __fxstatat(1, dirfd, path, st, flags);
+}
+
+LFS64(fstatat);
+
+/*
+ * flock(2) stub: validates its arguments and does nothing else.
+ *
+ * Fails with EBADF if @fd does not refer to an open file and with EINVAL
+ * if @operation is not LOCK_SH, LOCK_EX (each optionally with LOCK_NB) or
+ * LOCK_UN. No lock is actually taken or released.
+ */
+extern "C" int flock(int fd, int operation)
+{
+    if (!fileref_from_fd(fd)) {
+        return libc_error(EBADF);
+    }
+
+    const bool known_operation =
+            operation == LOCK_SH ||
+            operation == (LOCK_SH | LOCK_NB) ||
+            operation == LOCK_EX ||
+            operation == (LOCK_EX | LOCK_NB) ||
+            operation == LOCK_UN;
+    if (!known_operation) {
+        return libc_error(EINVAL);
+    }
+
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_readdir, "%d %p", int, dirent*);
+TRACEPOINT(trace_vfs_readdir_ret, "");
+TRACEPOINT(trace_vfs_readdir_err, "%d", int);
+
+/*
+ * Directory stream state. The functions below allocate it with new,
+ * release it with delete, and use it through the DIR type. The file
+ * descriptor of the directory is the only state kept.
+ */
+struct __dirstream
+{
+    int fd;
+};
+
+/*
+ * opendir(3): open @path read-only and wrap the descriptor in a new DIR.
+ * Returns the stream, or a null pointer if open() failed (errno is then
+ * the one set by open()).
+ * NOTE(review): a plain new-expression presumably throws rather than
+ * returning null, which would make the ENOMEM branch unreachable -- confirm.
+ */
+DIR *opendir(const char *path)
+{
+    DIR *stream = new DIR;
+
+    if (!stream)
+        return libc_error_ptr<DIR>(ENOMEM);
+
+    stream->fd = open(path, O_RDONLY);
+    if (stream->fd >= 0) {
+        return stream;
+    }
+
+    delete stream;
+    return nullptr;
+}
+
+/*
+ * fdopendir(3): wrap the already open descriptor @fd in a new DIR.
+ * Returns a null pointer if fstat() fails (errno from fstat()) or if @fd
+ * is not a directory (errno = ENOTDIR).
+ */
+DIR *fdopendir(int fd)
+{
+    struct stat st;
+
+    if (fstat(fd, &st) < 0) {
+        return nullptr;
+    }
+    if (!S_ISDIR(st.st_mode)) {
+        errno = ENOTDIR;
+        return nullptr;
+    }
+
+    DIR *stream = new DIR;
+    stream->fd = fd;
+    return stream;
+}
+
+/*
+ * dirfd(3): return the descriptor stored in @dirp.
+ * A null @dirp is reported through libc_error(EINVAL).
+ */
+int dirfd(DIR *dirp)
+{
+    if (!dirp) {
+        return libc_error(EINVAL);
+    }
+
+    return dirp->fd;
+}
+
+/*
+ * closedir(3): close the descriptor behind @dir and free the stream.
+ *
+ * Returns 0 on success. If close() fails its result (-1, with errno set by
+ * close()) is passed on; the stream is freed in either case. A null @dir
+ * is rejected through libc_error(EINVAL), matching dirfd().
+ */
+int closedir(DIR *dir)
+{
+    if (!dir) {
+        return libc_error(EINVAL);
+    }
+
+    int ret = close(dir->fd);
+    delete dir;
+    return ret;
+}
+
+/*
+ * readdir() - return the next entry of a directory stream.
+ *
+ * Implemented on top of readdir_r() using a per-thread static entry
+ * buffer.  A non-zero result from readdir_r() is reported through
+ * libc_error_ptr(); otherwise errno is cleared and the result pointer
+ * produced by readdir_r() is returned.
+ */
+struct dirent *readdir(DIR *dir)
+{
+    static __thread struct dirent entry, *result;
+
+    const int err = readdir_r(dir, &entry, &result);
+    if (err != 0) {
+        return libc_error_ptr<struct dirent>(err);
+    }
+
+    errno = 0;
+    return result;
+}
+
+/*
+ * readdir_r() - read the next directory entry into a caller-supplied buffer.
+ *
+ * dir:    directory stream whose descriptor is read.
+ * entry:  buffer filled by sys_readdir(); d_reclen is always overwritten
+ *         with sizeof(*entry), even on failure.
+ * result: set to entry on success, to nullptr on any error.
+ *
+ * Returns 0 on success and also when the error is ENOENT (in which case
+ * *result is nullptr); any other error code is returned as is.
+ */
+int readdir_r(DIR *dir, struct dirent *entry, struct dirent **result)
+{
+    int error;
+    struct file *fp;
+
+    trace_vfs_readdir(dir->fd, entry);
+    error = fget(dir->fd, &fp);
+    if (error) {
+        trace_vfs_readdir_err(error);
+    } else {
+        error = sys_readdir(fp, entry);
+        // Release the reference taken by fget() before looking at the result.
+        fdrop(fp);
+        if (error) {
+            trace_vfs_readdir_err(error);
+        } else {
+            trace_vfs_readdir_ret();
+        }
+    }
+    // Our dirent has (like Linux) a d_reclen field, but a constant size.
+    entry->d_reclen = sizeof(*entry);
+
+    if (error) {
+        *result = nullptr;
+    } else {
+        *result = entry;
+    }
+    // ENOENT is mapped to a 0 return with *result == nullptr.
+    return error == ENOENT ? 0 : error;
+}
+
+// FIXME: in 64bit dirent64 and dirent are identical, so it's safe to alias
+// readdir64_r() and readdir64() onto their non-64 counterparts.
+#undef readdir64_r
+extern "C" int readdir64_r(DIR *dir, struct dirent64 *entry,
+        struct dirent64 **result)
+        __attribute__((alias("readdir_r")));
+
+#undef readdir64
+extern "C" struct dirent *readdir64(DIR *dir)
+        __attribute__((alias("readdir")));
+
+/*
+ * rewinddir() - reset a directory stream via sys_rewinddir().
+ *
+ * Errors are not reported: a failing fget() returns silently and the
+ * result of sys_rewinddir() is discarded.
+ */
+void rewinddir(DIR *dirp)
+{
+    struct file *fp;
+
+    // POSIX specifies that what rewinddir() does in the case of error
+    // is undefined, so a descriptor lookup failure is simply ignored.
+    if (fget(dirp->fd, &fp) != 0) {
+        return;
+    }
+
+    // The error code from sys_rewinddir() is ignored as well.
+    sys_rewinddir(fp);
+    fdrop(fp);
+}
+
+/*
+ * telldir() - return the current location of a directory stream.
+ *
+ * The location is obtained from sys_telldir().  A failure of fget() or
+ * sys_telldir() is reported through libc_error().
+ */
+long telldir(DIR *dirp)
+{
+    struct file *fp;
+    long pos;
+
+    int err = fget(dirp->fd, &fp);
+    if (err) {
+        return libc_error(err);
+    }
+
+    err = sys_telldir(fp, &pos);
+    fdrop(fp);
+
+    if (err) {
+        return libc_error(err);
+    }
+    return pos;
+}
+
+/*
+ * seekdir() - move a directory stream to loc via sys_seekdir().
+ *
+ * No error is ever reported to the caller.
+ */
+void seekdir(DIR *dirp, long loc)
+{
+    struct file *fp;
+
+    // POSIX specifies seekdir() cannot return errors.
+    if (fget(dirp->fd, &fp) != 0) {
+        return;
+    }
+
+    // The error code from sys_seekdir() is ignored too.
+    sys_seekdir(fp, loc);
+    fdrop(fp);
+}
+
+TRACEPOINT(trace_vfs_mkdir, "\"%s\" 0%0o", const char*, mode_t);
+TRACEPOINT(trace_vfs_mkdir_ret, "");
+TRACEPOINT(trace_vfs_mkdir_err, "%d", int);
+
+/*
+ * mkdir() - create a directory.
+ *
+ * The mode is first passed through apply_umask(); the path is converted
+ * with task_conv() (VWRITE) and handed to sys_mkdir().
+ *
+ * Returns 0 on success, or -1 with errno set to the error code from
+ * task_conv() or sys_mkdir().
+ */
+int
+mkdir(const char *pathname, mode_t mode)
+{
+    struct task *task = main_task;
+    char abs_path[PATH_MAX];
+    int err;
+
+    mode = apply_umask(mode);
+    trace_vfs_mkdir(pathname, mode);
+
+    err = task_conv(task, pathname, VWRITE, abs_path);
+    if (err == 0) {
+        err = sys_mkdir(abs_path, mode);
+    }
+
+    if (err != 0) {
+        trace_vfs_mkdir_err(err);
+        errno = err;
+        return -1;
+    }
+
+    trace_vfs_mkdir_ret();
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_rmdir, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_rmdir_ret, "");
+TRACEPOINT(trace_vfs_rmdir_err, "%d", int);
+
+/*
+ * rmdir() - remove a directory.
+ *
+ * A null pathname fails with ENOENT.  Otherwise the path is converted with
+ * task_conv() (VWRITE) and passed to sys_rmdir().
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int rmdir(const char *pathname)
+{
+    struct task *task = main_task;
+    char abs_path[PATH_MAX];
+    int err = ENOENT;
+
+    trace_vfs_rmdir(pathname);
+
+    if (pathname != nullptr) {
+        err = task_conv(task, pathname, VWRITE, abs_path);
+        if (err == 0) {
+            err = sys_rmdir(abs_path);
+        }
+    }
+
+    if (err != 0) {
+        trace_vfs_rmdir_err(err);
+        errno = err;
+        return -1;
+    }
+
+    trace_vfs_rmdir_ret();
+    return 0;
+}
+
+/*
+ * Copy the last component of path into dst as a NUL-terminated string.
+ *
+ * Trailing '/' characters are skipped first; the component is then the run
+ * of non-'/' characters that precedes them.  An empty path, or one made
+ * only of '/', yields an empty string.  dst is not bounds-checked here, so
+ * the caller must provide room for the component plus the terminator.
+ */
+static void
+get_last_component(const char *path, char *dst)
+{
+    /* One past the last character that is not a trailing '/'. */
+    int end = strlen(path);
+    while (end > 0 && path[end - 1] == '/')
+        end--;
+
+    /* Walk back to just after the previous '/', or to the string start. */
+    int start = end;
+    while (start > 0 && path[start - 1] != '/')
+        start--;
+
+    int len = end - start;
+    memcpy(dst, path + start, len);
+    dst[len] = 0;
+}
+
+/* True when str is a null pointer or points at an empty string. */
+static bool null_or_empty(const char *str)
+{
+    if (!str) {
+        return true;
+    }
+    return str[0] == '\0';
+}
+
+TRACEPOINT(trace_vfs_rename, "\"%s\" \"%s\"", const char*, const char*);
+TRACEPOINT(trace_vfs_rename_ret, "");
+TRACEPOINT(trace_vfs_rename_err, "%d", int);
+
+/*
+ * rename() - rename oldpath to newpath.
+ *
+ * Fails with ENOENT when either path is null or empty, with ENAMETOOLONG
+ * when either path does not fit in PATH_MAX, and with EINVAL when the last
+ * component of either path is "." or "..".  Both paths are then converted
+ * with task_conv() and passed to sys_rename().
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int rename(const char *oldpath, const char *newpath)
+{
+    trace_vfs_rename(oldpath, newpath);
+    struct task *t = main_task;
+    char src[PATH_MAX];
+    char dest[PATH_MAX];
+    int error;
+
+    error = ENOENT;
+    if (null_or_empty(oldpath) || null_or_empty(newpath))
+        goto out_errno;
+
+    /*
+     * get_last_component() copies without a bound into the PATH_MAX sized
+     * buffers above, so over-long paths must be rejected before it runs.
+     */
+    error = ENAMETOOLONG;
+    if (strlen(oldpath) >= PATH_MAX || strlen(newpath) >= PATH_MAX)
+        goto out_errno;
+
+    get_last_component(oldpath, src);
+    if (!strcmp(src, ".") || !strcmp(src, "..")) {
+        error = EINVAL;
+        goto out_errno;
+    }
+
+    get_last_component(newpath, dest);
+    if (!strcmp(dest, ".") || !strcmp(dest, "..")) {
+        error = EINVAL;
+        goto out_errno;
+    }
+
+    /* src/dest are reused from here on for the converted paths. */
+    if ((error = task_conv(t, oldpath, VREAD, src)) != 0)
+        goto out_errno;
+
+    if ((error = task_conv(t, newpath, VWRITE, dest)) != 0)
+        goto out_errno;
+
+    error = sys_rename(src, dest);
+    if (error)
+        goto out_errno;
+    trace_vfs_rename_ret();
+    return 0;
+    out_errno:
+    trace_vfs_rename_err(error);
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_chdir, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_chdir_ret, "");
+TRACEPOINT(trace_vfs_chdir_err, "%d", int);
+
+/*
+ * Install new_cwdfp as the task's current-directory file.
+ *
+ * t:          task to update; a null task is a no-op that returns 0.
+ * new_cwdfp:  file to store in t->t_cwdfp on success.
+ * chdir_func: callback that performs the actual directory change and
+ *             returns 0 or an error code.
+ *
+ * When chdir_func fails the task is left untouched: t->t_cwdfp keeps its
+ * previous value, nothing is dropped, and the caller remains responsible
+ * for new_cwdfp.  On success the previous t->t_cwdfp (if any) is released
+ * with fdrop().
+ *
+ * Returns the value produced by chdir_func.
+ */
+static int replace_cwd(struct task *t, struct file *new_cwdfp,
+                       std::function<int (void)> chdir_func)
+{
+    if (!t) {
+        return 0;
+    }
+
+    /* Do the actual chdir operation here */
+    int error = chdir_func();
+    if (error) {
+        /* Keep the old cwd: swapping here would leave t_cwdfp pointing at
+         * a file the caller is about to drop. */
+        return error;
+    }
+
+    struct file *old = t->t_cwdfp;
+    t->t_cwdfp = new_cwdfp;
+    if (old) {
+        fdrop(old);
+    }
+
+    return 0;
+}
+
+/*
+ * chdir() - change the current directory of main_task.
+ *
+ * A null pathname fails with ENOENT.  The path is converted with
+ * task_conv() (VREAD) and opened with O_DIRECTORY through sys_open(); the
+ * resulting file is handed to replace_cwd() together with a callback that
+ * copies the converted path into t->t_cwd.
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int chdir(const char *pathname)
+{
+    trace_vfs_chdir(pathname);
+    struct task *t = main_task;
+    char path[PATH_MAX];
+    struct file *fp;
+    int error;
+
+    error = ENOENT;
+    if (pathname == nullptr)
+        goto out_errno;
+
+    if ((error = task_conv(t, pathname, VREAD, path)) != 0)
+        goto out_errno;
+
+    /* Check if directory exists */
+    error = sys_open(path, O_DIRECTORY, 0, &fp);
+    if (error) {
+        goto out_errno;
+    }
+
+    replace_cwd(t, fp, [&]() {
+        strlcpy(t->t_cwd, path, sizeof(t->t_cwd));
+        return 0;
+    });
+
+    trace_vfs_chdir_ret();
+    return 0;
+    out_errno:
+    errno = error;
+    trace_vfs_chdir_err(errno);
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_fchdir, "%d", int);
+TRACEPOINT(trace_vfs_fchdir_ret, "");
+TRACEPOINT(trace_vfs_fchdir_err, "%d", int);
+
+/*
+ * fchdir() - change the current directory of main_task to the directory
+ * referred to by an open descriptor.
+ *
+ * The file behind fd is looked up with fget() and passed to replace_cwd()
+ * with a callback that runs sys_fchdir(fp, t->t_cwd).
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int fchdir(int fd)
+{
+    trace_vfs_fchdir(fd);
+    struct task *t = main_task;
+    struct file *fp;
+    int error;
+
+    error = fget(fd, &fp);
+    if (error)
+        goto out_errno;
+
+    error = replace_cwd(t, fp, [&]() { return sys_fchdir(fp, t->t_cwd); });
+    if (error) {
+        // Give back the reference obtained from fget() above.
+        fdrop(fp);
+        goto out_errno;
+    }
+
+    // On success the fget() reference is not dropped here.
+    trace_vfs_fchdir_ret();
+    return 0;
+
+    out_errno:
+    trace_vfs_fchdir_err(error);
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_link, "\"%s\" \"%s\"", const char*, const char*);
+TRACEPOINT(trace_vfs_link_ret, "");
+TRACEPOINT(trace_vfs_link_err, "%d", int);
+
+/*
+ * link() - create a hard link newpath referring to oldpath.
+ *
+ * Fails with ENOENT when either argument is null.  Both paths are
+ * converted with task_conv() (VWRITE) before sys_link() is called.
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int link(const char *oldpath, const char *newpath)
+{
+    struct task *task = main_task;
+    char from[PATH_MAX];
+    char to[PATH_MAX];
+    int err = ENOENT;
+
+    trace_vfs_link(oldpath, newpath);
+
+    if (oldpath != nullptr && newpath != nullptr) {
+        err = task_conv(task, oldpath, VWRITE, from);
+        if (err == 0) {
+            err = task_conv(task, newpath, VWRITE, to);
+        }
+        if (err == 0) {
+            err = sys_link(from, to);
+        }
+    }
+
+    if (err != 0) {
+        trace_vfs_link_err(err);
+        errno = err;
+        return -1;
+    }
+
+    trace_vfs_link_ret();
+    return 0;
+}
+
+
+TRACEPOINT(trace_vfs_symlink, "oldpath=%s, newpath=%s", const char*,
+           const char*);
+TRACEPOINT(trace_vfs_symlink_ret, "");
+TRACEPOINT(trace_vfs_symlink_err, "errno=%d", int);
+
+/*
+ * symlink() - create a symbolic link newpath containing oldpath.
+ *
+ * Fails with ENOENT when either argument is null.  The two paths are
+ * passed to sys_symlink() as given; no task_conv() is applied here.
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int symlink(const char *oldpath, const char *newpath)
+{
+    int error;
+
+    trace_vfs_symlink(oldpath, newpath);
+
+    error = ENOENT;
+    if (oldpath == nullptr || newpath == nullptr) {
+        errno = ENOENT;
+        trace_vfs_symlink_err(error);
+        return (-1);
+    }
+
+    error = sys_symlink(oldpath, newpath);
+    if (error) {
+        errno = error;
+        trace_vfs_symlink_err(error);
+        return (-1);
+    }
+
+    trace_vfs_symlink_ret();
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_unlink, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_unlink_ret, "");
+TRACEPOINT(trace_vfs_unlink_err, "%d", int);
+
+/*
+ * unlink() - remove a directory entry.
+ *
+ * A null pathname fails with ENOENT; otherwise the path is converted with
+ * task_conv() (VWRITE) and passed to sys_unlink().
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int unlink(const char *pathname)
+{
+    trace_vfs_unlink(pathname);
+    struct task *task = main_task;
+    char abs_path[PATH_MAX];
+    int err = ENOENT;
+
+    if (pathname != nullptr) {
+        err = task_conv(task, pathname, VWRITE, abs_path);
+        if (err == 0) {
+            err = sys_unlink(abs_path);
+        }
+    }
+
+    if (err != 0) {
+        trace_vfs_unlink_err(err);
+        errno = err;
+        return -1;
+    }
+
+    trace_vfs_unlink_ret();
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_stat, "\"%s\" %p", const char*, struct stat*);
+TRACEPOINT(trace_vfs_stat_ret, "");
+TRACEPOINT(trace_vfs_stat_err, "%d", int);
+
+/*
+ * __xstat() - stat a path.
+ *
+ * ver is accepted but not used.  The path is converted with task_conv()
+ * and the result is filled in by sys_stat().
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+extern "C"
+int __xstat(int ver, const char *pathname, struct stat *st)
+{
+    struct task *task = main_task;
+    char abs_path[PATH_MAX];
+
+    trace_vfs_stat(pathname, st);
+
+    int err = task_conv(task, pathname, 0, abs_path);
+    if (err == 0) {
+        err = sys_stat(abs_path, st);
+    }
+
+    if (err != 0) {
+        trace_vfs_stat_err(err);
+        errno = err;
+        return -1;
+    }
+
+    trace_vfs_stat_ret();
+    return 0;
+}
+
+LFS64(__xstat);
+
+/* stat() - forwards to __xstat() with the version argument fixed to 1. */
+int stat(const char *pathname, struct stat *st)
+{
+    const int ver = 1;
+
+    return __xstat(ver, pathname, st);
+}
+
+LFS64(stat);
+
+TRACEPOINT(trace_vfs_lstat, "pathname=%s, stat=%p", const char*, struct stat*);
+TRACEPOINT(trace_vfs_lstat_ret, "");
+TRACEPOINT(trace_vfs_lstat_err, "errno=%d", int);
+/*
+ * __lxstat() - lstat a path.
+ *
+ * ver is accepted but not used.  The path is converted with task_conv()
+ * and the result is filled in by sys_lstat().
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+extern "C"
+int __lxstat(int ver, const char *pathname, struct stat *st)
+{
+    struct task *task = main_task;
+    char abs_path[PATH_MAX];
+
+    trace_vfs_lstat(pathname, st);
+
+    int err = task_conv(task, pathname, 0, abs_path);
+    if (err == 0) {
+        err = sys_lstat(abs_path, st);
+    }
+
+    if (err != 0) {
+        errno = err;
+        trace_vfs_lstat_err(err);
+        return (-1);
+    }
+
+    trace_vfs_lstat_ret();
+    return 0;
+}
+
+LFS64(__lxstat);
+
+/* lstat() - forwards to __lxstat() with the version argument fixed to 1. */
+int lstat(const char *pathname, struct stat *st)
+{
+    const int ver = 1;
+
+    return __lxstat(ver, pathname, st);
+}
+
+LFS64(lstat);
+
+TRACEPOINT(trace_vfs_statfs, "\"%s\" %p", const char*, struct statfs*);
+TRACEPOINT(trace_vfs_statfs_ret, "");
+TRACEPOINT(trace_vfs_statfs_err, "%d", int);
+
+/*
+ * __statfs() - query filesystem statistics for a path.
+ *
+ * The path is converted with task_conv() and buf is filled in by
+ * sys_statfs().
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+extern "C"
+int __statfs(const char *pathname, struct statfs *buf)
+{
+    trace_vfs_statfs(pathname, buf);
+    struct task *task = main_task;
+    char abs_path[PATH_MAX];
+
+    int err = task_conv(task, pathname, 0, abs_path);
+    if (err == 0) {
+        err = sys_statfs(abs_path, buf);
+    }
+
+    if (err != 0) {
+        trace_vfs_statfs_err(err);
+        errno = err;
+        return -1;
+    }
+
+    trace_vfs_statfs_ret();
+    return 0;
+}
+weak_alias(__statfs, statfs);
+
+LFS64(statfs);
+
+/* The first tracepoint argument is the descriptor, hence "%d". */
+TRACEPOINT(trace_vfs_fstatfs, "%d %p", int, struct statfs*);
+TRACEPOINT(trace_vfs_fstatfs_ret, "");
+TRACEPOINT(trace_vfs_fstatfs_err, "%d", int);
+
+/*
+ * __fstatfs() - query filesystem statistics for an open descriptor.
+ *
+ * The file behind fd is looked up with fget(), passed to sys_fstatfs() and
+ * released again with fdrop().
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+extern "C"
+int __fstatfs(int fd, struct statfs *buf)
+{
+    struct file *fp;
+    int error;
+
+    trace_vfs_fstatfs(fd, buf);
+    error = fget(fd, &fp);
+    if (error)
+        goto out_errno;
+
+    error = sys_fstatfs(fp, buf);
+    fdrop(fp);
+
+    if (error)
+        goto out_errno;
+    trace_vfs_fstatfs_ret();
+    return 0;
+
+    out_errno:
+    trace_vfs_fstatfs_err(error);
+    errno = error;
+    return -1;
+}
+weak_alias(__fstatfs, fstatfs);
+
+LFS64(fstatfs);
+
+/*
+ * Translate a struct statfs into a struct statvfs.
+ *
+ * f_frsize is taken from the source block size, f_favail is always set to
+ * 0 and f_fsid takes the first word of the source fsid.  Always returns 0.
+ */
+static int
+statfs_to_statvfs(struct statvfs *dst, struct statfs *src)
+{
+    /* Block size and block counters. */
+    dst->f_frsize = src->f_bsize;
+    dst->f_bsize = src->f_bsize;
+    dst->f_bavail = src->f_bavail;
+    dst->f_bfree = src->f_bfree;
+    dst->f_blocks = src->f_blocks;
+
+    /* File (inode) counters. */
+    dst->f_favail = 0;
+    dst->f_ffree = src->f_ffree;
+    dst->f_files = src->f_files;
+
+    /* Identification, flags and limits. */
+    dst->f_namemax = src->f_namelen;
+    dst->f_flag = src->f_flags;
+    dst->f_fsid = src->f_fsid.__val[0];
+
+    return 0;
+}
+
+/*
+ * statvfs() - __statfs() on pathname, converted with statfs_to_statvfs().
+ *
+ * Returns -1 when __statfs() fails, otherwise the conversion result.
+ */
+int
+statvfs(const char *pathname, struct statvfs *buf)
+{
+    struct statfs native;
+
+    const bool failed = __statfs(pathname, &native) < 0;
+    if (failed) {
+        return -1;
+    }
+
+    return statfs_to_statvfs(buf, &native);
+}
+
+LFS64(statvfs);
+
+/*
+ * fstatvfs() - __fstatfs() on fd, converted with statfs_to_statvfs().
+ *
+ * Returns -1 when __fstatfs() fails, otherwise the conversion result.
+ */
+int
+fstatvfs(int fd, struct statvfs *buf)
+{
+    struct statfs native;
+
+    const bool failed = __fstatfs(fd, &native) < 0;
+    if (failed) {
+        return -1;
+    }
+
+    return statfs_to_statvfs(buf, &native);
+}
+
+LFS64(fstatvfs);
+
+
+TRACEPOINT(trace_vfs_getcwd, "%p %d", char*, size_t);
+TRACEPOINT(trace_vfs_getcwd_ret, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_getcwd_err, "%d", int);
+
+/*
+ * getcwd() - copy the current directory of main_task into a buffer.
+ *
+ * path: destination buffer, or nullptr to have one allocated with malloc()
+ *       (the caller then owns it and must free() it).
+ * size: size of the buffer.  With a null path, 0 means "exactly as large
+ *       as needed".  With a non-null path, 0 fails with EINVAL.
+ *
+ * Fails with ERANGE when size is too small for the path plus terminator
+ * and with ENOMEM when the allocation fails.  A buffer allocated here is
+ * released again on every failure path.
+ *
+ * Returns the buffer on success, nullptr with errno set on failure.
+ */
+char *getcwd(char *path, size_t size)
+{
+    trace_vfs_getcwd(path, size);
+    struct task *t = main_task;
+    size_t len = strlen(t->t_cwd) + 1;
+    char *allocated = nullptr;
+    int error;
+
+    if (!path) {
+        if (!size)
+            size = len;
+        allocated = (char*)malloc(size);
+        if (!allocated) {
+            error = ENOMEM;
+            goto out_errno;
+        }
+        path = allocated;
+    } else {
+        if (!size) {
+            error = EINVAL;
+            goto out_errno;
+        }
+    }
+
+    if (size < len) {
+        error = ERANGE;
+        goto out_errno;
+    }
+
+    memcpy(path, t->t_cwd, len);
+    trace_vfs_getcwd_ret(path);
+    return path;
+
+    out_errno:
+    /* No-op for caller-supplied buffers (allocated stays nullptr). */
+    free(allocated);
+    trace_vfs_getcwd_err(error);
+    errno = error;
+    return nullptr;
+}
+
+TRACEPOINT(trace_vfs_dup, "%d", int);
+/* The return tracepoint carries the new descriptor, hence "%d". */
+TRACEPOINT(trace_vfs_dup_ret, "%d", int);
+TRACEPOINT(trace_vfs_dup_err, "%d", int);
+/*
+ * Duplicate a file descriptor
+ *
+ * The file behind oldfd is looked up with fget() and given a new
+ * descriptor by fdalloc().  The reference taken by fget() is dropped on
+ * both the success and the failure path.
+ *
+ * Returns the new descriptor, or -1 with errno set on failure.
+ */
+int dup(int oldfd)
+{
+    struct file *fp;
+    int newfd;
+    int error;
+
+    trace_vfs_dup(oldfd);
+    error = fget(oldfd, &fp);
+    if (error)
+        goto out_errno;
+
+    error = fdalloc(fp, &newfd);
+    if (error)
+        goto out_fdrop;
+
+    fdrop(fp);
+    trace_vfs_dup_ret(newfd);
+    return newfd;
+
+    out_fdrop:
+    fdrop(fp);
+    out_errno:
+    trace_vfs_dup_err(error);
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_dup3, "%d %d 0x%x", int, int, int);
+TRACEPOINT(trace_vfs_dup3_ret, "%d", int);
+TRACEPOINT(trace_vfs_dup3_err, "%d", int);
+/*
+ * Duplicate a file descriptor to a particular value.
+ *
+ * Fails with EINVAL when flags contains anything besides O_CLOEXEC or when
+ * oldfd equals newfd.  Otherwise the file behind oldfd is installed at
+ * newfd with fdset().
+ *
+ * Returns newfd on success, -1 with errno set on failure.
+ */
+int dup3(int oldfd, int newfd, int flags)
+{
+    struct file *fp;
+    int err;
+
+    trace_vfs_dup3(oldfd, newfd, flags);
+
+    /*
+     * Don't allow any argument but O_CLOEXEC.  But we even ignore
+     * that as we don't support exec() and thus don't care.
+     */
+    if ((flags & ~O_CLOEXEC) != 0 || oldfd == newfd) {
+        err = EINVAL;
+    } else {
+        err = fget(oldfd, &fp);
+        if (err == 0) {
+            err = fdset(newfd, fp);
+            /* The fget() reference is released whether fdset() worked. */
+            fdrop(fp);
+        }
+    }
+
+    if (err != 0) {
+        trace_vfs_dup3_err(err);
+        errno = err;
+        return -1;
+    }
+
+    trace_vfs_dup3_ret(newfd);
+    return newfd;
+}
+
+/*
+ * dup2() - duplicate oldfd onto newfd.
+ *
+ * When both descriptors are equal nothing is duplicated, but oldfd must
+ * still refer to an open file: an invalid descriptor fails with the error
+ * reported by fget() instead of being returned as if it were valid.
+ * All other cases are delegated to dup3() with no flags.
+ *
+ * Returns newfd on success, -1 with errno set on failure.
+ */
+int dup2(int oldfd, int newfd)
+{
+    if (oldfd == newfd) {
+        struct file *fp;
+        int error = fget(oldfd, &fp);
+        if (error) {
+            errno = error;
+            return -1;
+        }
+        /* Only the validity check was needed; release the reference. */
+        fdrop(fp);
+        return newfd;
+    }
+
+    return dup3(oldfd, newfd, 0);
+}
+
+/*
+ * The file control system call.
+ */
+#define SETFL (O_APPEND | O_ASYNC | O_DIRECT | O_NOATIME | O_NONBLOCK)
+
+TRACEPOINT(trace_vfs_fcntl, "%d %d 0x%x", int, int, int);
+/* The return tracepoint carries the integer result, hence "%d". */
+TRACEPOINT(trace_vfs_fcntl_ret, "%d", int);
+TRACEPOINT(trace_vfs_fcntl_err, "%d", int);
+
+/*
+ * fcntl() - file control.
+ *
+ * Handles F_DUPFD, F_GETFD, F_SETFD, F_GETFL and F_SETFL.  F_SETLK,
+ * F_GETLK, F_SETLKW and F_SETOWN are stubs that only warn once and
+ * succeed.  Any other command fails with EINVAL.
+ *
+ * The reference taken by fget() is released exactly once on every path
+ * that got past the lookup, including command failures.
+ *
+ * Returns the command-specific result (0 unless stated otherwise by the
+ * command), or -1 with errno set on failure.
+ */
+extern "C"
+int fcntl(int fd, int cmd, int arg)
+{
+    struct file *fp;
+    int ret = 0, error;
+    int tmp;
+
+    trace_vfs_fcntl(fd, cmd, arg);
+    error = fget(fd, &fp);
+    if (error)
+        goto out_errno;
+
+    // An important note about our handling of FD_CLOEXEC / O_CLOEXEC:
+    // close-on-exec shouldn't have been a file flag (fp->f_flags) - it is a
+    // file descriptor flag, meaning that that two dup()ed file descriptors
+    // could have different values for FD_CLOEXEC. Our current implementation
+    // *wrongly* makes close-on-exec an f_flag (using the bit O_CLOEXEC).
+    // There is little practical difference, though, because this flag is
+    // ignored in OSv anyway, as it doesn't support exec().
+    switch (cmd) {
+    case F_DUPFD:
+        // A failure is handled after the switch so that fp is dropped
+        // first; jumping straight to out_errno would leak the reference.
+        error = _fdalloc(fp, &ret, arg);
+        break;
+    case F_GETFD:
+        ret = (fp->f_flags & O_CLOEXEC) ? FD_CLOEXEC : 0;
+        break;
+    case F_SETFD:
+        FD_LOCK(fp);
+        fp->f_flags = (fp->f_flags & ~O_CLOEXEC) |
+                ((arg & FD_CLOEXEC) ? O_CLOEXEC : 0);
+        FD_UNLOCK(fp);
+        break;
+    case F_GETFL:
+        // As explained above, the O_CLOEXEC should have been in f_flags,
+        // and shouldn't be returned. Linux always returns 0100000 ("the
+        // flag formerly known as O_LARGEFILE) so let's do it too.
+        ret = (oflags(fp->f_flags) & ~O_CLOEXEC) | 0100000;
+        break;
+    case F_SETFL:
+        FD_LOCK(fp);
+        fp->f_flags = fflags((oflags(fp->f_flags) & ~SETFL) |
+                (arg & SETFL));
+        FD_UNLOCK(fp);
+
+        /* Sync nonblocking/async state with file flags */
+        tmp = fp->f_flags & FNONBLOCK;
+        fp->ioctl(FIONBIO, &tmp);
+        tmp = fp->f_flags & FASYNC;
+        fp->ioctl(FIOASYNC, &tmp);
+
+        break;
+    case F_SETLK:
+        WARN_ONCE("fcntl(F_SETLK) stubbed\n");
+        break;
+    case F_GETLK:
+        WARN_ONCE("fcntl(F_GETLK) stubbed\n");
+        break;
+    case F_SETLKW:
+        WARN_ONCE("fcntl(F_SETLKW) stubbed\n");
+        break;
+    case F_SETOWN:
+        WARN_ONCE("fcntl(F_SETOWN) stubbed\n");
+        break;
+    default:
+        kprintf("unsupported fcntl cmd 0x%x\n", cmd);
+        error = EINVAL;
+    }
+
+    fdrop(fp);
+    if (error)
+        goto out_errno;
+    trace_vfs_fcntl_ret(ret);
+    return ret;
+
+    out_errno:
+    trace_vfs_fcntl_err(error);
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_access, "\"%s\" 0%0o", const char*, int);
+TRACEPOINT(trace_vfs_access_ret, "");
+TRACEPOINT(trace_vfs_access_err, "%d", int);
+
+/*
+ * Check permission for file access
+ *
+ * R_OK and W_OK in mode are translated to VREAD and VWRITE for the
+ * task_conv() call; the unmodified mode is then passed to sys_access().
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int access(const char *pathname, int mode)
+{
+    trace_vfs_access(pathname, mode);
+    struct task *task = main_task;
+    char abs_path[PATH_MAX];
+
+    int acc = 0;
+    acc |= (mode & R_OK) ? VREAD : 0;
+    acc |= (mode & W_OK) ? VWRITE : 0;
+
+    int err = task_conv(task, pathname, acc, abs_path);
+    if (err == 0) {
+        err = sys_access(abs_path, mode);
+    }
+
+    if (err != 0) {
+        errno = err;
+        trace_vfs_access_err(err);
+        return -1;
+    }
+
+    trace_vfs_access_ret();
+    return 0;
+}
+
+/*
+ * faccessat() - access() relative to a directory descriptor.
+ *
+ * AT_SYMLINK_NOFOLLOW is flagged through UNIMPLEMENTED().  Absolute paths
+ * and dirfd == AT_FDCWD are delegated directly to access().  Otherwise an
+ * absolute path is assembled from the mount path and dentry path of dirfd
+ * plus pathname, and access() is called on it.
+ *
+ * Returns the result of access(), or -1 with errno set when dirfd cannot
+ * be looked up.
+ */
+int faccessat(int dirfd, const char *pathname, int mode, int flags)
+{
+    if (flags & AT_SYMLINK_NOFOLLOW) {
+        UNIMPLEMENTED("faccessat() with AT_SYMLINK_NOFOLLOW");
+    }
+
+    // NOTE(review): pathname is dereferenced without a null check.
+    if (pathname[0] == '/' || dirfd == AT_FDCWD) {
+        return access(pathname, mode);
+    }
+
+    struct file *fp;
+    int error = fget(dirfd, &fp);
+    if (error) {
+        errno = error;
+        return -1;
+    }
+
+    // The directory vnode stays locked while the path is built and checked.
+    struct vnode *vp = fp->f_dentry->d_vnode;
+    vn_lock(vp);
+
+    // Heap buffer for the assembled path; released when up goes out of scope.
+    std::unique_ptr<char []> up (new char[PATH_MAX]);
+    char *p = up.get();
+
+    /* build absolute path */
+    // NOTE(review): truncation by strlcpy/strlcat is not detected here.
+    strlcpy(p, fp->f_dentry->d_mount->m_path, PATH_MAX);
+    strlcat(p, fp->f_dentry->d_path, PATH_MAX);
+    strlcat(p, "/", PATH_MAX);
+    strlcat(p, pathname, PATH_MAX);
+
+    error = access(p, mode);
+
+    vn_unlock(vp);
+    fdrop(fp);
+
+    return error;
+}
+
+/* euidaccess() - behaves exactly like access() in this implementation. */
+extern "C"
+int euidaccess(const char *pathname, int mode)
+{
+    const int result = access(pathname, mode);
+
+    return result;
+}
+
+weak_alias(euidaccess,eaccess);
+
+// Disabled code: everything up to the matching #endif is compiled out.
+#if 0
+/*
+ * fs_pipe() - create a FIFO under /mnt/fifo and open it once for reading
+ * and once for writing, storing both descriptors in msg->data[0..1].
+ * Without CONFIG_FIFOFS the function only returns ENOSYS.
+ */
+static int
+fs_pipe(struct task *t, struct msg *msg)
+{
+#ifdef CONFIG_FIFOFS
+    char path[PATH_MAX];
+    file_t rfp, wfp;
+    int error, rfd, wfd;
+
+    DPRINTF(VFSDB_CORE, ("fs_pipe\n"));
+
+    if ((rfd = task_newfd(t)) == -1)
+        return EMFILE;
+    // Placeholder so the second task_newfd() call does not return rfd again.
+    t->t_ofile[rfd] = (file_t)1; /* temp */
+
+    if ((wfd = task_newfd(t)) == -1) {
+        t->t_ofile[rfd] = nullptr;
+        return EMFILE;
+    }
+    sprintf(path, "/mnt/fifo/pipe-%x-%d", (u_int)t->t_taskid, rfd);
+
+    if ((error = sys_mknod(path, S_IFIFO)) != 0)
+        goto out;
+    if ((error = sys_open(path, O_RDONLY | O_NONBLOCK, 0, &rfp)) != 0) {
+        goto out;
+    }
+    if ((error = sys_open(path, O_WRONLY | O_NONBLOCK, 0, &wfp)) != 0) {
+        goto out;
+    }
+    t->t_ofile[rfd] = rfp;
+    t->t_ofile[wfd] = wfp;
+    t->t_nopens += 2;
+    msg->data[0] = rfd;
+    msg->data[1] = wfd;
+    return 0;
+    out:
+    // Clear both descriptor slots again on failure.
+    t->t_ofile[rfd] = nullptr;
+    t->t_ofile[wfd] = nullptr;
+    return error;
+#else
+    return ENOSYS;
+#endif
+}
+#endif
+
+TRACEPOINT(trace_vfs_isatty, "%d", int);
+TRACEPOINT(trace_vfs_isatty_ret, "%d", int);
+TRACEPOINT(trace_vfs_isatty_err, "%d", int);
+
+/*
+ * Return if specified file is a tty
+ *
+ * Returns 1 when the file behind fd is a tty_file or its vnode carries
+ * VISTTY.  Otherwise returns 0 with errno set: EBADF when fd has no file
+ * reference, ENOTTY when the file is not a terminal.  The result is never
+ * negative, so it can be used directly as a truth value.
+ */
+int isatty(int fd)
+{
+    struct file *fp;
+    int istty = 0;
+
+    trace_vfs_isatty(fd);
+    fileref f(fileref_from_fd(fd));
+    if (!f) {
+        errno = EBADF;
+        trace_vfs_isatty_err(errno);
+        /* 0, not -1: callers test isatty() as a boolean. */
+        return 0;
+    }
+
+    fp = f.get();
+    if (dynamic_cast<tty_file*>(fp) ||
+        (fp->f_dentry && fp->f_dentry->d_vnode->v_flags & VISTTY)) {
+        istty = 1;
+    } else {
+        errno = ENOTTY;
+    }
+
+    trace_vfs_isatty_ret(istty);
+    return istty;
+}
+
+TRACEPOINT(trace_vfs_truncate, "\"%s\" 0x%x", const char*, off_t);
+TRACEPOINT(trace_vfs_truncate_ret, "");
+TRACEPOINT(trace_vfs_truncate_err, "%d", int);
+
+/*
+ * truncate() - set the length of the file at pathname.
+ *
+ * A null pathname fails with ENOENT; otherwise the path is converted with
+ * task_conv() (VWRITE) and passed to sys_truncate().
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int truncate(const char *pathname, off_t length)
+{
+    trace_vfs_truncate(pathname, length);
+    struct task *task = main_task;
+    char abs_path[PATH_MAX];
+    int err = ENOENT;
+
+    if (pathname != nullptr) {
+        err = task_conv(task, pathname, VWRITE, abs_path);
+        if (err == 0) {
+            err = sys_truncate(abs_path, length);
+        }
+    }
+
+    if (err != 0) {
+        errno = err;
+        trace_vfs_truncate_err(err);
+        return -1;
+    }
+
+    trace_vfs_truncate_ret();
+    return 0;
+}
+
+LFS64(truncate);
+
+TRACEPOINT(trace_vfs_ftruncate, "%d 0x%x", int, off_t);
+TRACEPOINT(trace_vfs_ftruncate_ret, "");
+TRACEPOINT(trace_vfs_ftruncate_err, "%d", int);
+
+/*
+ * ftruncate() - set the length of the file behind an open descriptor.
+ *
+ * The file is looked up with fget(), passed to sys_ftruncate() and
+ * released with fdrop().
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int ftruncate(int fd, off_t length)
+{
+    trace_vfs_ftruncate(fd, length);
+    struct file *fp;
+
+    int err = fget(fd, &fp);
+    if (err == 0) {
+        err = sys_ftruncate(fp, length);
+        fdrop(fp);
+    }
+
+    if (err != 0) {
+        errno = err;
+        trace_vfs_ftruncate_err(err);
+        return -1;
+    }
+
+    trace_vfs_ftruncate_ret();
+    return 0;
+}
+
+LFS64(ftruncate);
+
+/*
+ * readlink() - read the contents of a symbolic link into buf.
+ *
+ * Fails with EINVAL when bufsize is 0 and with ENOENT when pathname is
+ * null.  The path is converted with task_conv() and the link is read by
+ * sys_readlink(), which reports the number of bytes through its last
+ * argument.
+ *
+ * Returns that byte count on success, -1 with errno set on failure.
+ */
+ssize_t readlink(const char *pathname, char *buf, size_t bufsize)
+{
+    struct task *t = main_task;
+    char path[PATH_MAX];
+    int error;
+    ssize_t size;
+
+    /* errno values are positive; bufsize is unsigned so only 0 is invalid. */
+    error = EINVAL;
+    if (bufsize == 0)
+        goto out_errno;
+
+    error = ENOENT;
+    if (pathname == nullptr)
+        goto out_errno;
+    /* NOTE(review): VWRITE is requested for a read-only operation. */
+    error = task_conv(t, pathname, VWRITE, path);
+    if (error)
+        goto out_errno;
+
+    size  = 0;
+    error = sys_readlink(path, buf, bufsize, &size);
+
+    if (error != 0)
+        goto out_errno;
+
+    return size;
+    out_errno:
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_fallocate, "%d %d 0x%x 0x%x", int, int, loff_t, loff_t);
+TRACEPOINT(trace_vfs_fallocate_ret, "");
+TRACEPOINT(trace_vfs_fallocate_err, "%d", int);
+
+/*
+ * fallocate() - forward an allocation request to sys_fallocate().
+ *
+ * The file behind fd is looked up with fget() and released with fdrop()
+ * after the call.
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int fallocate(int fd, int mode, loff_t offset, loff_t len)
+{
+    struct file *fp;
+
+    trace_vfs_fallocate(fd, mode, offset, len);
+
+    int err = fget(fd, &fp);
+    if (err == 0) {
+        err = sys_fallocate(fp, mode, offset, len);
+        fdrop(fp);
+    }
+
+    if (err != 0) {
+        trace_vfs_fallocate_err(err);
+        errno = err;
+        return -1;
+    }
+
+    trace_vfs_fallocate_ret();
+    return 0;
+}
+
+LFS64(fallocate);
+
+TRACEPOINT(trace_vfs_utimes, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_utimes_ret, "");
+TRACEPOINT(trace_vfs_utimes_err, "%d", int);
+
+/* futimes() - futimesat() on fd itself, i.e. with a null pathname. */
+int futimes(int fd, const struct timeval times[2])
+{
+    const char *no_path = nullptr;
+
+    return futimesat(fd, no_path, times);
+}
+
+/*
+ * futimesat() - utimes() relative to a directory descriptor.
+ *
+ * Absolute paths and dirfd == AT_FDCWD are delegated directly to utimes().
+ * With a relative pathname, dirfd must be a directory (ENOTDIR otherwise).
+ * With a null pathname the call operates on dirfd itself.  In both cases
+ * an absolute path is assembled from the mount path and dentry path of
+ * dirfd and passed to utimes().
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int futimesat(int dirfd, const char *pathname, const struct timeval times[2])
+{
+    struct stat st;
+    struct file *fp;
+    int error;
+    char *absolute_path;
+
+    if ((pathname && pathname[0] == '/') || dirfd == AT_FDCWD)
+        return utimes(pathname, times);
+
+    // Note: if pathname == nullptr, futimesat operates on dirfd itself, and in
+    // that case it doesn't have to be a directory.
+    if (pathname) {
+        error = fstat(dirfd, &st);
+        if (error) {
+            error = errno;
+            goto out_errno;
+        }
+
+        if (!S_ISDIR(st.st_mode)){
+            error = ENOTDIR;
+            goto out_errno;
+        }
+    }
+
+    error = fget(dirfd, &fp);
+    if (error)
+        goto out_errno;
+
+    /* build absolute path */
+    absolute_path = (char*)malloc(PATH_MAX);
+    if (!absolute_path) {
+        fdrop(fp);
+        error = ENOMEM;
+        goto out_errno;
+    }
+    strlcpy(absolute_path, fp->f_dentry->d_mount->m_path, PATH_MAX);
+    strlcat(absolute_path, fp->f_dentry->d_path, PATH_MAX);
+
+    if (pathname) {
+        strlcat(absolute_path, "/", PATH_MAX);
+        strlcat(absolute_path, pathname, PATH_MAX);
+    }
+
+    /*
+     * utimes() reports failure as -1 with the cause in errno.  Capture the
+     * real code right away, before free()/fdrop() can disturb errno, so
+     * that out_errno does not store -1 into errno.
+     */
+    error = utimes(absolute_path, times);
+    if (error)
+        error = errno;
+    free(absolute_path);
+
+    fdrop(fp);
+
+    if (error)
+        goto out_errno;
+    return 0;
+
+    out_errno:
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_utimensat, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_utimensat_ret, "");
+TRACEPOINT(trace_vfs_utimensat_err, "%d", int);
+
+/*
+ * utimensat() - libc wrapper around sys_utimensat().
+ *
+ * All arguments are passed through unchanged.
+ *
+ * Returns 0 on success, -1 with errno set to the code returned by
+ * sys_utimensat() on failure.
+ */
+extern "C"
+int utimensat(int dirfd, const char *pathname, const struct timespec times[2],
+              int flags)
+{
+    trace_vfs_utimensat(pathname);
+
+    auto error = sys_utimensat(dirfd, pathname, times, flags);
+    if (error) {
+        trace_vfs_utimensat_err(error);
+        errno = error;
+        return -1;
+    }
+
+    trace_vfs_utimensat_ret();
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_futimens, "%d", int);
+TRACEPOINT(trace_vfs_futimens_ret, "");
+TRACEPOINT(trace_vfs_futimens_err, "%d", int);
+
+// libc entry point for futimens(): forwards to sys_futimens() and turns its
+// error-code return into the -1/errno convention, with tracepoints on entry,
+// error and successful return.
+extern "C"
+int futimens(int fd, const struct timespec times[2])
+{
+    trace_vfs_futimens(fd);
+
+    const auto err = sys_futimens(fd, times);
+    if (!err) {
+        trace_vfs_futimens_ret();
+        return 0;
+    }
+
+    trace_vfs_futimens_err(err);
+    errno = err;
+    return -1;
+}
+
+// Shared implementation of utimes() and lutimes(): converts pathname with
+// task_conv() for main_task, then applies the timestamps via sys_utimes()
+// with the given flags. Any failure is traced and reported through
+// libc_error(); success returns 0.
+static int do_utimes(const char *pathname, const struct timeval times[2],
+                     int flags)
+{
+    char resolved[PATH_MAX];
+
+    trace_vfs_utimes(pathname);
+
+    int err = task_conv(main_task, pathname, 0, resolved);
+    if (!err)
+        err = sys_utimes(resolved, times, flags);
+
+    if (err) {
+        trace_vfs_utimes_err(err);
+        return libc_error(err);
+    }
+
+    trace_vfs_utimes_ret();
+    return 0;
+}
+
+// Set the timestamps of pathname; calls do_utimes() with no flags.
+extern "C"
+int utimes(const char *pathname, const struct timeval times[2])
+{
+    return do_utimes(pathname, times, 0);
+}
+
+// Like utimes(), but passes AT_SYMLINK_NOFOLLOW to do_utimes().
+extern "C"
+int lutimes(const char *pathname, const struct timeval times[2])
+{
+    return do_utimes(pathname, times, AT_SYMLINK_NOFOLLOW);
+}
+
+// Second-resolution front end to utimes(). With a null t both timestamps
+// are set to the current wall-clock time in whole seconds; otherwise they
+// come from t->actime and t->modtime. The microsecond fields are always 0.
+extern "C"
+int utime(const char *pathname, const struct utimbuf *t)
+{
+    using namespace std::chrono;
+
+    struct timeval times[2];
+    if (t) {
+        times[0].tv_sec = t->actime;
+        times[1].tv_sec = t->modtime;
+    } else {
+        long int now_sec = duration_cast<seconds>(
+            osv::clock::wall::now().time_since_epoch()).count();
+        times[0].tv_sec = now_sec;
+        times[1].tv_sec = now_sec;
+    }
+    times[0].tv_usec = 0;
+    times[1].tv_usec = 0;
+
+    return utimes(pathname, times);
+}
+
+TRACEPOINT(trace_vfs_chmod, "\"%s\" 0%0o", const char*, mode_t);
+TRACEPOINT(trace_vfs_chmod_ret, "");
+TRACEPOINT(trace_vfs_chmod_err, "%d", int);
+
+// Change the permission bits of pathname. A null pathname fails with
+// ENOENT; otherwise the path is converted with task_conv() (VWRITE) and
+// sys_chmod() is called with mode masked by ALLPERMS.
+// Returns 0 on success, or -1 with errno set.
+int chmod(const char *pathname, mode_t mode)
+{
+    trace_vfs_chmod(pathname, mode);
+
+    char resolved[PATH_MAX];
+    int err;
+
+    if (pathname == nullptr) {
+        err = ENOENT;
+    } else {
+        err = task_conv(main_task, pathname, VWRITE, resolved);
+        if (err == 0)
+            err = sys_chmod(resolved, mode & ALLPERMS);
+    }
+
+    if (err == 0) {
+        trace_vfs_chmod_ret();
+        return 0;
+    }
+
+    trace_vfs_chmod_err(err);
+    errno = err;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_fchmod, "\"%d\" 0%0o", int, mode_t);
+TRACEPOINT(trace_vfs_fchmod_ret, "");
+
+// Change the permission bits of the file open on fd via sys_fchmod(), with
+// mode masked by ALLPERMS. The return tracepoint fires whether or not the
+// call succeeded. Returns 0 on success, or -1 with errno set.
+int fchmod(int fd, mode_t mode)
+{
+    trace_vfs_fchmod(fd, mode);
+    const auto err = sys_fchmod(fd, mode & ALLPERMS);
+    trace_vfs_fchmod_ret();
+
+    if (!err)
+        return 0;
+
+    errno = err;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_fchown, "\"%d\" %d %d", int, uid_t, gid_t);
+TRACEPOINT(trace_vfs_fchown_ret, "");
+
+// Stub: emits the tracepoints and WARN_STUBBED(), performs no ownership
+// change in this function, and reports success.
+int fchown(int fd, uid_t owner, gid_t group)
+{
+    trace_vfs_fchown(fd, owner, group);
+    WARN_STUBBED();
+    trace_vfs_fchown_ret();
+    return 0;
+}
+
+// Stub: invokes WARN_STUBBED() and reports success without using its
+// arguments.
+int chown(const char *path, uid_t owner, gid_t group)
+{
+    WARN_STUBBED();
+    return 0;
+}
+
+// Stub: invokes WARN_STUBBED() and reports success without using its
+// arguments.
+int lchown(const char *path, uid_t owner, gid_t group)
+{
+    WARN_STUBBED();
+    return 0;
+}
+
+
+// Copy up to count bytes from in_fd to out_fd.
+//
+// The source range is mapped read-only with mmap() and written to out_fd
+// with a single write(). If _offset is non-null the copy starts at *_offset
+// and *_offset is advanced by the number of bytes written; the position of
+// in_fd is then not moved by this function. If _offset is null the copy
+// starts at the current position of in_fd, which is advanced afterwards.
+//
+// count is clamped to the data available between the start offset and the
+// end of the file (as reported by fstat()).
+//
+// Returns the number of bytes written, 0 when there is nothing to copy, or
+// -1 with errno set.
+ssize_t sendfile(int out_fd, int in_fd, off_t *_offset, size_t count)
+{
+    struct file *in_fp;
+    struct file *out_fp;
+    fileref in_f{fileref_from_fd(in_fd)};
+    fileref out_f{fileref_from_fd(out_fd)};
+
+    if (!in_f || !out_f) {
+        return libc_error(EBADF);
+    }
+
+    in_fp = in_f.get();
+    out_fp = out_f.get();
+
+    if (!in_fp->f_dentry) {
+        return libc_error(EBADF);
+    }
+
+    if (!(in_fp->f_flags & FREAD)) {
+        return libc_error(EBADF);
+    }
+
+    if (out_fp->f_type & DTYPE_VNODE) {
+        if (!out_fp->f_dentry) {
+            return libc_error(EBADF);
+        } else if (!(out_fp->f_flags & FWRITE)) {
+            return libc_error(EBADF);
+        }
+    }
+
+    off_t offset;
+
+    if (_offset != nullptr) {
+        offset = *_offset;
+        if (offset < 0) {
+            return libc_error(EINVAL);
+        }
+    } else {
+        /* if _offset is nullptr, we need to read from the present position
+         * of in_fd */
+        offset = lseek(in_fd, 0, SEEK_CUR);
+        if (offset < 0) {
+            return -1;  // lseek() has already set errno
+        }
+    }
+
+    // Constrain count to the extent of the file...
+    struct stat st;
+    if (fstat(in_fd, &st) < 0) {
+        return -1;
+    }
+    if (offset >= st.st_size) {
+        return 0;
+    }
+    // offset < st_size here, so the subtraction is positive. Comparing
+    // against the remainder avoids the wrap-around "offset + count" can hit
+    // for very large counts.
+    const size_t remaining = st.st_size - offset;
+    if (count > remaining) {
+        count = remaining;
+    }
+    if (count == 0) {
+        return 0;
+    }
+
+    // mmap() needs a page-aligned file offset: map from the start of the
+    // page containing offset and skip the leading in_page bytes when writing.
+    const size_t in_page = offset % mmu::page_size;
+    const size_t bytes_to_mmap = count + in_page;
+    off_t offset_for_mmap = align_down(offset, (off_t)mmu::page_size);
+
+    char *src = static_cast<char *>(mmap(nullptr, bytes_to_mmap, PROT_READ,
+                                         MAP_SHARED, in_fd, offset_for_mmap));
+
+    if (src == MAP_FAILED) {
+        return -1;
+    }
+
+    auto ret = write(out_fd, src + in_page, count);
+    const int write_errno = errno;
+
+    // Always drop the mapping, using the full mapped length, and do it
+    // outside assert() so the call survives NDEBUG builds.
+    const int unmap_rc = munmap(src, bytes_to_mmap);
+    assert(unmap_rc == 0);
+    (void)unmap_rc;
+
+    if (ret < 0) {
+        return libc_error(write_errno);
+    } else if (_offset == nullptr) {
+        lseek(in_fd, ret, SEEK_CUR);
+    } else {
+        *_offset += ret;
+    }
+
+    return ret;
+}
+
+#undef sendfile64
+LFS64(sendfile);
+
+NO_SYS(int fchmodat(int dirfd, const char *pathname, mode_t mode, int flags));
+
+// Install newmask into global_umask and return the value exchange() hands
+// back (presumably the previous mask - global_umask looks like a
+// std::atomic; confirm at its definition).
+mode_t umask(mode_t newmask)
+{
+    return global_umask.exchange(newmask, std::memory_order_relaxed);
+}
+
+// Do-nothing operation that always reports success (0).
+int
+fs_noop(void)
+{
+    return 0;
+}
+
+// Not implemented: invokes WARN_STUBBED(), sets errno to ENOSYS and
+// returns -1. The path argument is ignored.
+int chroot(const char *path)
+{
+    WARN_STUBBED();
+    errno = ENOSYS;
+    return -1;
+}
+
+// unpack_bootfs() unpacks a collection of files stored as part of the OSv
+// executable (in memory location "bootfs_start") into the file system,
+// normally the in-memory filesystem ramfs.
+// The files are packed in the executable in an ad-hoc format defined here.
+// Code in scripts/mkbootfs.py packs files into this format.
+// Capacity of bootfs_metadata::name, including the terminating null.
+// NOTE(review): 111 makes the fields below add up to 128 bytes
+// (8 + 8 + 1 + 111), presumably on purpose - confirm against mkbootfs.py.
+#define BOOTFS_PATH_MAX 111
+enum class bootfs_file_type : char { other = 0, symlink = 1 };
+// One record of the bootfs table. unpack_bootfs() walks an array of these
+// starting at bootfs_start until it finds a record with an empty name.
+struct bootfs_metadata {
+    // Length in bytes of the file's content.
+    uint64_t size;
+    // Position of the content, as a byte offset from bootfs_start.
+    uint64_t offset;
+    // The file's type. Can be "symlink" or "other". A directory is an "other"
+    // file with its name ending with a "/" (and no content).
+    bootfs_file_type type;
+    // name must end with a null. For symlink files, the content must end
+    // with a null as well.
+    char name[BOOTFS_PATH_MAX];
+};
+
+extern char bootfs_start;
+
+int ramfs_set_file_data(struct vnode *vp, const void *data, size_t size);
+// Walk the bootfs metadata table at bootfs_start and recreate every entry
+// in the mounted file system: parent directories are created on demand,
+// symlink records become symlinks, directory records need nothing further,
+// and regular files are created and given their content through
+// ramfs_set_file_data(). Any failure other than mkdir() panics.
+void unpack_bootfs(void)
+{
+    struct bootfs_metadata *md = (struct bootfs_metadata *)&bootfs_start;
+    int fd, i;
+
+    // The table is terminated by a record whose name is empty.
+    for (i = 0; md[i].name[0]; i++) {
+        int ret;
+        char *p;
+
+        // mkdir() directories needed for this path name, as necessary
+        char tmp[BOOTFS_PATH_MAX];
+        strlcpy(tmp, md[i].name, BOOTFS_PATH_MAX);
+        // Temporarily cut the path at each '/' to create every prefix.
+        // When the loop ends, p points at the terminating null of tmp.
+        for (p = tmp; *p; ++p) {
+            if (*p == '/') {
+                *p = '\0';
+                mkdir(tmp, 0666);  // silently ignore errors and existing dirs
+                *p = '/';
+            }
+        }
+
+        if (md[i].type == bootfs_file_type::symlink) {
+            // This is a symbolic link record. The file's content is the
+            // target path, and we assume ends with a null.
+            if (symlink(&bootfs_start + md[i].offset, md[i].name) != 0) {
+                kprintf("couldn't symlink %s: %d\n", md[i].name, errno);
+                sys_panic("unpack_bootfs failed");
+            }
+            continue;
+        }
+        // *(p-1) is the last character of the name (the name is non-empty
+        // here, so p-1 is inside tmp).
+        if (*(p-1) == '/' && md[i].size == 0) {
+            // This is directory record. Nothing else to do
+            continue;
+        }
+
+        fd = creat(md[i].name, 0666);
+        if (fd < 0) {
+            kprintf("couldn't create %s: %d\n",
+                    md[i].name, errno);
+            sys_panic("unpack_bootfs failed");
+        }
+
+        // Reach the vnode behind the new descriptor so the content can be
+        // attached directly.
+        struct file *fp;
+        int error = fget(fd, &fp);
+        if (error) {
+            kprintf("couldn't fget %s: %d\n",
+                    md[i].name, error);
+            sys_panic("unpack_bootfs failed");
+        }
+
+        struct vnode *vp = fp->f_dentry->d_vnode;
+        ret = ramfs_set_file_data(vp, &bootfs_start + md[i].offset, 
md[i].size);
+        if (ret) {
+            kprintf("ramfs_set_file_data failed, ret = %d\n", ret);
+            sys_panic("unpack_bootfs failed");
+        }
+
+        // Drop the reference taken by fget(), then the descriptor itself.
+        fdrop(fp);
+        close(fd);
+    }
+}
+
+// Mount ramfs at "/", create "/dev" and mount devfs on it. Each failing
+// step is only logged; the remaining steps are still attempted.
+void mount_rootfs(void)
+{
+    int err = sys_mount("", "/", "ramfs", 0, nullptr);
+    if (err != 0) {
+        kprintf("failed to mount rootfs, error = %s\n", strerror(err));
+    }
+
+    if (mkdir("/dev", 0755) < 0) {
+        kprintf("failed to create /dev, error = %s\n", strerror(errno));
+    }
+
+    err = sys_mount("", "/dev", "devfs", 0, nullptr);
+    if (err != 0) {
+        kprintf("failed to mount devfs, error = %s\n", strerror(err));
+    }
+}
+
+// BSD-style mount entry point. iov holds niov entries laid out as
+// (name, value) pairs; the values of "fstype", "fspath" and "from" are
+// picked out and passed to sys_mount(). Other names are ignored.
+// Returns whatever sys_mount() returns.
+extern "C"
+int nmount(struct iovec *iov, unsigned niov, int flags)
+{
+    struct args {
+        char* fstype = nullptr;
+        char* fspath = nullptr;
+        char* from = nullptr;
+    };
+    static unordered_map<string, char* args::*> argmap {
+        { "fstype", &args::fstype },
+        { "fspath", &args::fspath },
+        { "from", &args::from },
+    };
+    args a;
+    // "i + 1 < niov" keeps the value access in bounds: with an odd niov the
+    // trailing name has no value and is skipped instead of reading past the
+    // end of iov.
+    for (size_t i = 0; i + 1 < niov; i += 2) {
+        const char *name = static_cast<const char*>(iov[i].iov_base);
+        if (!name) {
+            // Constructing a std::string from a null pointer is undefined.
+            continue;
+        }
+        auto it = argmap.find(std::string(name));
+        if (it != argmap.end()) {
+            a.*(it->second) = static_cast<char*>(iov[i+1].iov_base);
+        }
+    }
+    return sys_mount(a.from, a.fspath, a.fstype, flags, nullptr);
+}
+
+// Run "zpool import -f -a" through zpool.so to import additional pools.
+// Does nothing when /etc/mnttab is missing or zpool.so is not executable.
+static void import_extra_zfs_pools(void)
+{
+    // The file '/etc/mnttab' is a LibZFS requirement and will not
+    // exist during cpiod phase. The functionality provided by this
+    // function isn't needed during that phase, so let's skip it.
+    struct stat st;
+    if (stat("/etc/mnttab" , &st) != 0) {
+        return;
+    }
+
+    // Import extra pools mounting datasets there contained.
+    // Datasets from osv pool will not be mounted here.
+    if (access("zpool.so", X_OK) != 0) {
+        return;
+    }
+
+    int exit_code;
+    vector<string> zpool_args = {"zpool", "import", "-f", "-a" };
+    auto ok = osv::run("zpool.so", zpool_args, &exit_code);
+    assert(ok);
+
+    if (exit_code == 0) {
+        debug("zfs: extra ZFS pool(s) found.\n");
+    }
+}
+
+// Make the file system mounted at path the new root, then mount every
+// /etc/fstab entry other than "/". Failures are logged and do not stop
+// processing; a missing fstab simply ends the function.
+void pivot_rootfs(const char* path)
+{
+    int err = sys_pivot_root(path, "/");
+    if (err) {
+        kprintf("failed to pivot root, error = %s\n", strerror(err));
+    }
+
+    auto fstab = setmntent("/etc/fstab", "r");
+    if (!fstab) {
+        return;
+    }
+
+    for (struct mntent *m = getmntent(fstab); m != nullptr;
+         m = getmntent(fstab)) {
+        if (strcmp(m->mnt_dir, "/") == 0) {
+            continue;
+        }
+
+        if (m->mnt_opts != nullptr &&
+            strcmp(m->mnt_opts, MNTOPT_DEFAULTS) != 0) {
+            printf("Warning: opts %s, ignored for fs %s\n", m->mnt_opts,
+                   m->mnt_type);
+        }
+
+        // FIXME: Right now, ignoring mntops. In the future we may have an
+        // option parser
+        err = sys_mount(m->mnt_fsname, m->mnt_dir, m->mnt_type, 0, nullptr);
+        if (err) {
+            printf("failed to mount %s, error = %s\n", m->mnt_type,
+                   strerror(err));
+        }
+    }
+    endmntent(fstab);
+}
+
+// Unmount "/dev"; a failure is logged but not reported to the caller.
+extern "C" void unmount_devfs()
+{
+    int ret = sys_umount("/dev");
+    if (ret)
+        kprintf("failed to unmount /dev, error = %s\n", strerror(ret));
+}
+
+// Create "/rofs" and mount /dev/vblk0.1 there read-only as rofs. On mount
+// failure the directory is removed and the error code is returned. On
+// success the root is optionally pivoted to "/rofs" and 0 is returned.
+extern "C" int mount_rofs_rootfs(bool pivot_root)
+{
+    if (mkdir("/rofs", 0755) < 0) {
+        kprintf("failed to create /rofs, error = %s\n", strerror(errno));
+    }
+
+    const int err = sys_mount("/dev/vblk0.1", "/rofs", "rofs", MNT_RDONLY, 0);
+    if (err == 0) {
+        if (pivot_root) {
+            pivot_rootfs("/rofs");
+        }
+        return 0;
+    }
+
+    kprintf("failed to mount /rofs, error = %s\n", strerror(err));
+    rmdir("/rofs");
+    return err;
+}
+
+// Create "/zfs" and mount /dev/vblk0.1 there as zfs with "osv/zfs" as mount
+// data. Failures are only logged. When pivot_root is set, the root is then
+// pivoted to "/zfs" and extra ZFS pools are imported.
+extern "C" void mount_zfs_rootfs(bool pivot_root)
+{
+    if (mkdir("/zfs", 0755) < 0) {
+        kprintf("failed to create /zfs, error = %s\n", strerror(errno));
+    }
+
+    const int err = sys_mount("/dev/vblk0.1", "/zfs", "zfs", 0,
+                              (void *)"osv/zfs");
+    if (err != 0) {
+        kprintf("failed to mount /zfs, error = %s\n", strerror(err));
+    }
+
+    if (pivot_root) {
+        pivot_rootfs("/zfs");
+        import_extra_zfs_pools();
+    }
+}
+
+// Unmount "/dev" (result ignored), "/proc" and finally "/" with MNT_FORCE.
+// Failures for "/proc" and "/" are logged as warnings.
+extern "C" void unmount_rootfs(void)
+{
+    sys_umount("/dev");
+
+    int err = sys_umount("/proc");
+    if (err != 0) {
+        kprintf("Warning: unmount_rootfs: failed to unmount /proc, error = %s\n",
+                strerror(err));
+    }
+
+    err = sys_umount2("/", MNT_FORCE);
+    if (err != 0) {
+        kprintf("Warning: unmount_rootfs: failed to unmount /, error = %s\n",
+                strerror(err));
+    }
+}
+
+extern "C" void bio_init(void);
+extern "C" void bio_sync(void);
+
+int vfs_initialized;
+
+// Bring up the VFS layer: initialise the bio, lookup and vnode subsystems,
+// allocate main_task, run the init hook of every file system listed in
+// vfssw, mount the root file systems, unpack the bootfs image and open the
+// console on descriptors 0, 1 and 2. Sets vfs_initialized when done.
+extern "C"
+void
+vfs_init(void)
+{
+    const struct vfssw *fs;
+
+    bio_init();
+    lookup_init();
+    vnode_init();
+    task_alloc(&main_task);
+
+    /*
+     * Initialize each file system.
+     */
+    // vfssw is terminated by an entry with a null vs_name; vs_init is
+    // optional.
+    for (fs = vfssw; fs->vs_name; fs++) {
+        if (fs->vs_init) {
+            DPRINTF(VFSDB_CORE, ("VFS: initializing %s\n",
+                    fs->vs_name));
+            fs->vs_init();
+        }
+    }
+
+    mount_rootfs();
+    unpack_bootfs();
+
+    // if (open("/dev/console", O_RDWR, 0) != 0)
+    // The console is expected to land on descriptor 0; the two dup(0)
+    // calls are then expected to yield descriptors 1 and 2.
+    if (console::open() != 0)
+        kprintf("failed to open console, error = %d\n", errno);
+    if (dup(0) != 1)
+        kprintf("failed to dup console (1)\n");
+    if (dup(0) != 2)
+        kprintf("failed to dup console (2)\n");
+    vfs_initialized = 1;
+}
+
+// Shut the VFS layer down: clear main_task's cwd, unmount the file systems
+// and sync the bio layer, in that order.
+void vfs_exit(void)
+{
+    /* Free up main_task (stores cwd data) resources */
+    // The lambda passed as third argument does nothing and returns 0.
+    replace_cwd(main_task, nullptr, []() { return 0; });
+    /* Unmount all file systems */
+    unmount_rootfs();
+    /* Finish with the bio layer */
+    bio_sync();
+}
+
+// Abort with the message "panic: <str>".
+void sys_panic(const char *str)
+{
+    abort("panic: %s", str);
+}
+
diff --git a/lib/vfscore/mount.c b/lib/vfscore/mount.c
new file mode 100644
index 00000000..dac4d09c
--- /dev/null
+++ b/lib/vfscore/mount.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * vfs_mount.c - mount operations
+ */
+
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <dirent.h>
+
+#include <limits.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <osv/prex.h>
+#include <osv/vnode.h>
+#include <osv/device.h>
+#include <osv/debug.h>
+#include <osv/mutex.h>
+#include "vfs.h"
+
+#include <memory>
+#include <list>
+
+/*
+ * List for VFS mount points.
+ */
+static std::list<mount*> mount_list;
+
+/*
+ * Global lock to access mount point.
+ */
+static mutex mount_lock;
+
+/*
+ * Find a registered file system by name.
+ * @name: file system name; compared over at most FSMAXNAMES characters.
+ *
+ * Returns the matching vfssw entry, or nullptr when none matches.
+ */
+static const struct vfssw *
+fs_getfs(const char *name)
+{
+    for (const struct vfssw *fs = vfssw; fs->vs_name != nullptr; fs++) {
+        if (strncmp(name, fs->vs_name, FSMAXNAMES) == 0)
+            return fs;
+    }
+    return nullptr;
+}
+
+/*
+ * Reverse lookup: return the name of the file system whose operations
+ * table is ops. Aborts when ops is not registered in vfssw.
+ */
+const char*
+fs_getfsname(vfsops* ops)
+{
+    auto fs = vfssw;
+
+    while (fs->vs_name) {
+        if (fs->vs_op == ops)
+            return fs->vs_name;
+        fs++;
+    }
+    abort();
+}
+
+/*
+ * Mount a file system.
+ * @dev:    device or source name; may be nullptr. A name starting with
+ *          "/dev/" is additionally opened as a device (failure to open is
+ *          tolerated); the string itself is handed to the file system's
+ *          mount routine either way.
+ * @dir:    mount point path; must be non-empty.
+ * @fsname: name of a file system registered in vfssw.
+ * @flags:  mount flags, stored in the mount entry. MNT_RDONLY clears the
+ *          owner-write bit of the new root vnode.
+ * @data:   opaque argument forwarded to the file system's mount routine.
+ *
+ * Returns 0 on success or an errno value: ENOENT (empty or unresolvable
+ * dir), ENODEV (unknown file system), EBUSY (dir or device already
+ * mounted), ENOTDIR, ENOMEM, or whatever VFS_MOUNT() reports.
+ */
+int
+sys_mount(const char *dev, const char *dir, const char *fsname, int flags,
+          const void *data)
+{
+    const struct vfssw *fs;
+    struct mount *mp;
+    struct device *device;
+    struct dentry *dp_covered;
+    struct vnode *vp;
+    int error;
+
+    /* Validate the arguments before they are printed or dereferenced. */
+    if (!dir || *dir == '\0')
+        return ENOENT;
+    if (!fsname)
+        return ENODEV;
+
+    kprintf("VFS: mounting %s at %s\n", fsname, dir);
+
+    /* Find a file system. */
+    if (!(fs = fs_getfs(fsname)))
+        return ENODEV;  /* No such file system */
+
+    /* Open device. nullptr can be specified as a device. */
+    // Allow device_open() to fail, in which case dev is interpreted
+    // by the file system mount routine (e.g zfs pools)
+    device = nullptr;
+    if (dev && strncmp(dev, "/dev/", 5) == 0)
+        device_open(dev + 5, DO_RDWR, &device);
+
+    /* Check if device or directory has already been mounted. */
+    // We need to avoid the situation where after we already verified that
+    // the mount point is free, but before we actually add it to mount_list,
+    // another concurrent mount adds it. So we use a new mutex to ensure
+    // that only one sys_mount() runs at a time. We cannot reuse the existing
+    // mount_lock for this purpose: If we take mount_lock and then do
+    // lookups, this is lock order inversion and can result in deadlock.
+    static mutex sys_mount_lock;
+    SCOPE_LOCK(sys_mount_lock);
+    WITH_LOCK(mount_lock) {
+        for (auto&& mp : mount_list) {
+            if (!strcmp(mp->m_path, dir) ||
+                (device && mp->m_dev == device)) {
+                error = EBUSY;  /* Already mounted */
+                goto err1;
+            }
+        }
+    }
+    /*
+     * Create VFS mount entry.
+     */
+    if (!(mp = new mount)) {
+        error = ENOMEM;
+        goto err1;
+    }
+    mp->m_count = 0;
+    mp->m_op = fs->vs_op;
+    mp->m_flags = flags;
+    mp->m_dev = device;
+    mp->m_data = nullptr;
+    strlcpy(mp->m_path, dir, sizeof(mp->m_path));
+    /* dev may legitimately be nullptr; record an empty source name then. */
+    strlcpy(mp->m_special, dev ? dev : "", sizeof(mp->m_special));
+
+    /*
+     * Get vnode to be covered in the upper file system.
+     */
+    if (*dir == '/' && *(dir + 1) == '\0') {
+        /* Ignore if it mounts to global root directory. */
+        dp_covered = nullptr;
+    } else {
+        if ((error = namei(dir, &dp_covered)) != 0) {
+
+            error = ENOENT;
+            goto err2;
+        }
+        if (dp_covered->d_vnode->v_type != VDIR) {
+            error = ENOTDIR;
+            goto err3;
+        }
+    }
+    mp->m_covered = dp_covered;
+
+    /*
+     * Create a root vnode for this file system.
+     */
+    vget(mp, 0, &vp);
+    if (vp == nullptr) {
+        error = ENOMEM;
+        goto err3;
+    }
+    vp->v_type = VDIR;
+    vp->v_flags = VROOT;
+    vp->v_mode = S_IFDIR | S_IRUSR | S_IWUSR | S_IXUSR;
+
+    mp->m_root = dentry_alloc(nullptr, vp, "/");
+    if (!mp->m_root) {
+        vput(vp);
+        /*
+         * Without this, "error" would still hold 0 from namei() (or be
+         * uninitialized when mounting "/"), so the failure could be
+         * reported as success.
+         */
+        error = ENOMEM;
+        goto err3;
+    }
+    vput(vp);
+
+    /*
+     * Call a file system specific routine.
+     */
+    if ((error = VFS_MOUNT(mp, dev, flags, data)) != 0)
+        goto err4;
+
+    if (mp->m_flags & MNT_RDONLY)
+        vp->v_mode &=~S_IWUSR;
+
+    /*
+     * Insert to mount list
+     */
+    WITH_LOCK(mount_lock) {
+        mount_list.push_back(mp);
+    }
+
+    return 0;   /* success */
+ err4:
+    drele(mp->m_root);
+ err3:
+    if (dp_covered)
+        drele(dp_covered);
+ err2:
+    delete mp;
+ err1:
+    if (device)
+        device_close(device);
+
+    return error;
+}
+
+/*
+ * Drop the dentry references held by a mount entry: the covered dentry
+ * (absent for the root mount) and the mount's own root dentry.
+ */
+void
+release_mp_dentries(struct mount *mp)
+{
+    /* Release the covered dentry, if this mount covers one */
+    if (mp->m_covered) {
+        drele(mp->m_covered);
+    }
+
+    /* Release root dentry */
+    drele(mp->m_root);
+}
+
+/*
+ * Unmount the file system mounted at path.
+ * @path:  mount point; must match a mount entry's path exactly.
+ * @flags: unmount flags. MNT_FORCE is required to unmount an entry that
+ *         covers nothing (the root file system).
+ *
+ * Returns 0 on success, ENAMETOOLONG for an over-long path, EINVAL when
+ * nothing is mounted at path or the root is unmounted without MNT_FORCE,
+ * or the error reported by VFS_UNMOUNT().
+ */
+int
+sys_umount2(const char *path, int flags)
+{
+    kprintf("VFS: unmounting %s\n", path);
+
+    SCOPE_LOCK(mount_lock);
+
+    int pathlen = strlen(path);
+    if (pathlen >= MAXPATHLEN)
+        return ENAMETOOLONG;
+
+    /* Get mount entry */
+    struct mount *target = nullptr;
+    for (auto&& candidate : mount_list) {
+        if (strcmp(path, candidate->m_path) == 0) {
+            target = candidate;
+            break;
+        }
+    }
+    if (target == nullptr)
+        return EINVAL;
+
+    /*
+     * Root fs can not be unmounted.
+     */
+    if (target->m_covered == nullptr && !(flags & MNT_FORCE))
+        return EINVAL;
+
+    int error = VFS_UNMOUNT(target, flags);
+    if (error != 0)
+        return error;
+    mount_list.remove(target);
+
+#ifdef HAVE_BUFFERS
+    /* Flush all buffers */
+    binval(target->m_dev);
+#endif
+
+    if (target->m_dev)
+        device_close(target->m_dev);
+    delete target;
+    return error;
+}
+
+/*
+ * Unmount the file system at path with no flags; see sys_umount2().
+ */
+int
+sys_umount(const char *path)
+{
+    return sys_umount2(path, 0);
+}
+
+/*
+ * Make the file system mounted at new_root the root file system, replacing
+ * the one mounted at put_old.
+ *
+ * Returns EINVAL unless both paths are mount points of two different mount
+ * entries, EBUSY when any other mount path starts with put_old, or the
+ * error from unmounting the old file system. On success the old mount is
+ * removed from mount_list, the new mount stops covering anything, its root
+ * dentry loses its parent and its path becomes "/". Returns 0.
+ */
+int
+sys_pivot_root(const char *new_root, const char *put_old)
+{
+    struct mount *newmp = nullptr, *oldmp = nullptr;
+    int error;
+
+    WITH_LOCK(mount_lock) {
+        /* Locate both mount entries by exact path. */
+        for (auto&& mp : mount_list) {
+            if (!strcmp(mp->m_path, new_root)) {
+                newmp = mp;
+            }
+            if (!strcmp(mp->m_path, put_old)) {
+                oldmp = mp;
+            }
+        }
+        if (!newmp || !oldmp || newmp == oldmp) {
+            return EINVAL;
+        }
+        /* Refuse while any third mount has put_old as a path prefix. */
+        for (auto&& mp : mount_list) {
+            if (mp == newmp || mp == oldmp) {
+                continue;
+            }
+            if (!strncmp(mp->m_path, put_old, strlen(put_old))) {
+                return EBUSY;
+            }
+        }
+        if ((error = VFS_UNMOUNT(oldmp, 0)) != 0) {
+            return error;
+        }
+        mount_list.remove(oldmp);
+
+        newmp->m_root->d_vnode->v_mount = newmp;
+
+        /* The new root no longer sits on top of a dentry of the old fs. */
+        if (newmp->m_covered) {
+            drele(newmp->m_covered);
+        }
+        newmp->m_covered = nullptr;
+
+        if (newmp->m_root->d_parent) {
+            drele(newmp->m_root->d_parent);
+        }
+        newmp->m_root->d_parent = nullptr;
+
+        strlcpy(newmp->m_path, "/", sizeof(newmp->m_path));
+    }
+    return 0;
+}
+
+/*
+ * Call VFS_SYNC() on every mounted file system while holding mount_lock,
+ * then (only when built with HAVE_BUFFERS) sync the bio layer.
+ * Always returns 0; the results of VFS_SYNC() are not inspected.
+ */
+int
+sys_sync(void)
+{
+    /* Call each mounted file system. */
+    WITH_LOCK(mount_lock) {
+        for (auto&& mp : mount_list) {
+            VFS_SYNC(mp);
+        }
+    }
+#ifdef HAVE_BUFFERS
+    bio_sync();
+#endif
+    return 0;
+}
+
+/*
+ * Measure how much of path is covered by a mount point.
+ * @path: target path.
+ * @mount_root: mount point path.
+ *
+ * Returns the length of mount_root when it is a prefix of path that ends
+ * on a path component boundary, 1 when the common prefix is the single
+ * character "/", and 0 when mount_root does not match.
+ */
+static size_t
+count_match(const char *path, char *mount_root)
+{
+    size_t matched = 0;
+
+    /* Length of the common prefix of both strings. */
+    while (path[matched] != '\0' && mount_root[matched] != '\0' &&
+           path[matched] == mount_root[matched])
+        matched++;
+
+    /* The whole mount point has to be consumed. */
+    if (mount_root[matched] != '\0')
+        return 0;
+
+    /* A one-character match on "/" is the root mount point. */
+    if (matched == 1 && path[0] == '/')
+        return 1;
+
+    /* Otherwise the match must stop at the end of a path component. */
+    if (path[matched] == '\0' || path[matched] == '/')
+        return matched;
+    return 0;
+}
+
+/*
+ * Find the mount entry with the longest mount point matching path.
+ * @path: full path.
+ * @mp: receives the selected mount entry.
+ * @root: receives a pointer into path just past the mount point, with one
+ *        leading '/' skipped.
+ *
+ * Returns 0 on success, -1 when path is null or no mount point matches.
+ */
+int
+vfs_findroot(const char *path, struct mount **mp, char **root)
+{
+    if (!path)
+        return -1;
+
+    struct mount *best = nullptr;
+    size_t best_len = 0;
+
+    /* Find mount point from nearest path */
+    SCOPE_LOCK(mount_lock);
+    for (auto&& candidate : mount_list) {
+        size_t len = count_match(path, candidate->m_path);
+        if (len > best_len) {
+            best_len = len;
+            best = candidate;
+        }
+    }
+    if (best == nullptr)
+        return -1;
+
+    char *rest = (char *)(path + best_len);
+    if (*rest == '/')
+        rest++;
+    *root = rest;
+    *mp = best;
+    return 0;
+}
+
+/*
+ * Mark a mount point as busy: increment its use count under mount_lock.
+ */
+void
+vfs_busy(struct mount *mp)
+{
+    SCOPE_LOCK(mount_lock);
+    mp->m_count++;
+}
+
+
+/*
+ * Undo one vfs_busy(): decrement the mount point's use count under
+ * mount_lock.
+ */
+void
+vfs_unbusy(struct mount *mp)
+{
+    SCOPE_LOCK(mount_lock);
+    mp->m_count--;
+}
+
+/*
+ * Operation that does nothing and reports success (0).
+ */
+int
+vfs_nullop(void)
+{
+    return 0;
+}
+
+/*
+ * Operation that does nothing and always fails with EINVAL.
+ */
+int
+vfs_einval(void)
+{
+    return EINVAL;
+}
+
+namespace osv {
+
+// Describe one mount entry: its source name, mount path and file system
+// type name. Options are not recorded and come back empty.
+mount_desc to_mount_desc(mount* m)
+{
+    mount_desc desc;
+    desc.special = m->m_special;
+    desc.path = m->m_path;
+    desc.type = fs_getfsname(m->m_op);
+    // FIXME: record options
+    desc.options = "";
+    return desc;
+}
+
+// Return a description of every entry in mount_list, collected while
+// holding mount_lock.
+std::vector<mount_desc>
+current_mounts()
+{
+    std::vector<mount_desc> descs;
+    WITH_LOCK(mount_lock) {
+        for (auto&& entry : mount_list) {
+            descs.push_back(to_mount_desc(entry));
+        }
+    }
+    return descs;
+}
+
+}
+
+#ifdef DEBUG_VFS
+/*
+ * Debug helper: print the device pointer, use count and mount path of
+ * every mount entry while holding mount_lock.
+ */
+void
+mount_dump(void)
+{
+    SCOPE_LOCK(mount_lock);
+
+    kprintf("mount_dump\n");
+    kprintf("dev      count root\n");
+    kprintf("-------- ----- --------\n");
+
+    for (auto&& mp : mount_list) {
+        /*
+         * m_dev is a pointer; passing it for a %x conversion is a varargs
+         * type mismatch, so print it with %p instead.
+         */
+        kprintf("%8p %5d %s\n", (void *)mp->m_dev, mp->m_count, mp->m_path);
+    }
+}
+#endif
diff --git a/lib/vfscore/subr_uio.c b/lib/vfscore/subr_uio.c
new file mode 100644
index 00000000..bf138b8e
--- /dev/null
+++ b/lib/vfscore/subr_uio.c
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <osv/uio.h>
+
+/*
+ * Copy up to n bytes between the buffer cp and the I/O vectors of uio.
+ * For UIO_READ data moves from cp into the vectors; for UIO_WRITE it moves
+ * from the vectors into cp. Copying stops when n bytes have been moved or
+ * uio_resid reaches zero. Each vector consumed has its base and length
+ * advanced, and uio_resid/uio_offset are updated to match.
+ *
+ * Always returns 0.
+ */
+int
+uiomove(void *cp, int n, struct uio *uio)
+{
+       assert(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE);
+
+       while (n > 0 && uio->uio_resid) {
+               struct iovec *iov = uio->uio_iov;
+               /*
+                * Keep the length as size_t: storing iov_len in an int
+                * could truncate a large vector to a negative count and
+                * hand memcpy() a huge size.
+                */
+               size_t cnt = iov->iov_len;
+               if (cnt == 0) {
+                       /* Exhausted vector: move on to the next one. */
+                       uio->uio_iov++;
+                       uio->uio_iovcnt--;
+                       continue;
+               }
+               /* n > 0 here, so the cast is value-preserving. */
+               if (cnt > (size_t)n)
+                       cnt = (size_t)n;
+
+               if (uio->uio_rw == UIO_READ)
+                       memcpy(iov->iov_base, cp, cnt);
+               else
+                       memcpy(cp, iov->iov_base, cnt);
+
+               iov->iov_base = (char *)iov->iov_base + cnt;
+               iov->iov_len -= cnt;
+               uio->uio_resid -= cnt;
+               uio->uio_offset += cnt;
+               cp = (char *)cp + cnt;
+               /* cnt <= n, so it fits in an int. */
+               n -= (int)cnt;
+       }
+
+       return 0;
+}
diff --git a/lib/vfscore/syscalls.c b/lib/vfscore/syscalls.c
new file mode 100644
index 00000000..487d5729
--- /dev/null
+++ b/lib/vfscore/syscalls.c
@@ -0,0 +1,1486 @@
+/*
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * vfs_syscalls.c - everything in this file is a routine implementing
+ *                  a VFS system call.
+ */
+
+#include <sys/stat.h>
+#include <dirent.h>
+
+#include <limits.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <osv/prex.h>
+#include <osv/vnode.h>
+#include <osv/vfs_file.hh>
+#include "vfs.h"
+#include <fs/fs.hh>
+
+extern struct task *main_task;
+
+/*
+ * O_NOFOLLOW pre-check used by sys_open(): resolve the last component of
+ * path without following it and fail with ELOOP when it is a symbolic
+ * link.  Returns 0 when the final component is not a symlink, otherwise
+ * ELOOP or the error reported by lookup()/namei_last_nofollow().
+ */
+static int
+open_no_follow_chk(char *path)
+{
+       struct dentry *parent = nullptr;
+       struct dentry *entry = nullptr;
+       struct vnode *node = nullptr;
+       char *name;
+       int error;
+
+       error = lookup(path, &parent, &name);
+       if (error)
+               return error;
+
+       error = namei_last_nofollow(path, parent, &entry);
+       if (error)
+               goto out;
+
+       node = entry->d_vnode;
+       vn_lock(node);
+       error = (node->v_type == VLNK) ? ELOOP : 0;
+
+out:
+       /* Release, in reverse order, whatever was acquired above. */
+       if (node != nullptr)
+               vn_unlock(node);
+       if (entry != nullptr)
+               drele(entry);
+       if (parent != nullptr)
+               drele(parent);
+
+       return error;
+}
+
+/*
+ * Open (and, with O_CREAT, possibly create) the file named by path.
+ *
+ * path  - path of the file to open
+ * flags - open(2)-style flags; passed through fflags() before use
+ * mode  - permission bits for a newly created file; the file-type bits
+ *         are replaced with S_IFREG
+ * fpp   - on success receives the newly created file object
+ *
+ * Returns 0 on success or an errno value on failure.
+ */
+int
+sys_open(char *path, int flags, mode_t mode, struct file **fpp)
+{
+       file *fp;
+       struct dentry *dp, *ddp;
+       struct vnode *vp;
+       char *filename;
+       int error;
+
+       DPRINTF(VFSDB_SYSCALL, ("sys_open: path=%s flags=%x mode=%x\n",
+                               path, flags, mode));
+
+       /* From here on the code tests FREAD/FWRITE bits in flags. */
+       flags = fflags(flags);
+       if (flags & O_CREAT) {
+               error = namei(path, &dp);
+               if (error == ENOENT) {
+                       /* Create new file. */
+                       if ((error = lookup(path, &ddp, &filename)) != 0)
+                               return error;
+
+                       /* Creating needs write access to the parent directory. */
+                       vn_lock(ddp->d_vnode);
+                       if ((error = vn_access(ddp->d_vnode, VWRITE)) != 0) {
+                               vn_unlock(ddp->d_vnode);
+                               drele(ddp);
+                               return error;
+                       }
+                       mode &= ~S_IFMT;
+                       mode |= S_IFREG;
+                       error = VOP_CREATE(ddp->d_vnode, filename, mode);
+                       vn_unlock(ddp->d_vnode);
+                       drele(ddp);
+
+                       if (error)
+                               return error;
+                       /* Look the path up again to get the new file's dentry. */
+                       if ((error = namei(path, &dp)) != 0)
+                               return error;
+
+                       vp = dp->d_vnode;
+                       /* A file that was just created has nothing to truncate. */
+                       flags &= ~O_TRUNC;
+               } else if (error) {
+                       return error;
+               } else {
+                       /* File already exists */
+                       if (flags & O_EXCL) {
+                               error = EEXIST;
+                               goto out_drele;
+                       }
+               }
+
+               vp = dp->d_vnode;
+               flags &= ~O_CREAT;
+       } else {
+               /* Open */
+               if (flags & O_NOFOLLOW) {
+                       error = open_no_follow_chk(path);
+                       if (error != 0) {
+                               return (error);
+                       }
+               }
+               error = namei(path, &dp);
+               if (error)
+                       return error;
+
+               vp = dp->d_vnode;
+
+               /* Writing or truncating needs write access and a non-directory. */
+               if (flags & FWRITE || flags & O_TRUNC) {
+                       error = vn_access(vp, VWRITE);
+                       if (error)
+                               goto out_drele;
+
+                       error = EISDIR;
+                       if (vp->v_type == VDIR)
+                               goto out_drele;
+               }
+               if (flags & O_DIRECTORY) {
+                   if (vp->v_type != VDIR) {
+                       error = ENOTDIR;
+                       goto out_drele;
+                   }
+               }
+       }
+
+       vn_lock(vp);
+       /* Process truncate request */
+       if (flags & O_TRUNC) {
+               /* Truncation is refused unless opened for writing on a non-directory. */
+               error = EINVAL;
+               if (!(flags & FWRITE) || vp->v_type == VDIR)
+                       goto out_vn_unlock;
+
+               error = VOP_TRUNCATE(vp, 0);
+               if (error)
+                       goto out_vn_unlock;
+       }
+
+       /* make_file reports failure by throwing an int error code. */
+       try {
+           fileref f = make_file<vfs_file>(flags);
+           fp = f.get();
+           fhold(fp);
+       } catch (int err) {
+           error = err;
+           goto out_vn_unlock;
+       }
+       // change to std::move once dp is a dentry_ref
+       fp->f_dentry = dentry_ref(dp, false);
+       // dp is cleared so that no later path drele()s it a second time.
+       dp = nullptr;
+
+       error = VOP_OPEN(vp, fp);
+       if (error) {
+               vn_unlock(vp);
+               // Note direct delete of fp instead of fdrop(fp). fp was never
+               // returned so cannot be in use, and because it wasn't opened
+               // it cannot be close()ed.
+               delete fp;
+               return error;
+       }
+       vn_unlock(vp);
+
+       *fpp = fp;
+       return 0;
+
+       /* Error exits: drop the vnode lock and/or the dentry reference. */
+out_vn_unlock:
+       vn_unlock(vp);
+out_drele:
+       if (dp) {
+               drele(dp);
+       }
+       return error;
+}
+
+/*
+ * Close hook for a VFS file.  Currently a no-op: fp is not used and the
+ * function always returns 0.
+ */
+int
+sys_close(struct file *fp)
+{
+
+       return 0;
+}
+
+/*
+ * Read from fp into the niov buffers described by iov.
+ *
+ * offset - file position to read from, or -1 to call fp->read without
+ *          FOF_OFFSET
+ * count  - receives the number of bytes actually transferred
+ *
+ * Returns EBADF if fp is not open for reading, EINVAL if the combined
+ * buffer length exceeds IOSIZE_MAX, otherwise the result of fp->read.
+ */
+int
+sys_read(struct file *fp, const struct iovec *iov, size_t niov,
+               off_t offset, size_t *count)
+{
+    if (!(fp->f_flags & FREAD))
+        return EBADF;
+
+    // Add up the segment lengths, refusing totals above IOSIZE_MAX.
+    size_t total = 0;
+    for (unsigned i = 0; i < niov; i++) {
+        const size_t len = iov[i].iov_len;
+        if (len > IOSIZE_MAX - total) {
+            return EINVAL;
+        }
+        total += len;
+    }
+
+    // Nothing to transfer: succeed without touching the file.
+    if (!total) {
+        *count = 0;
+        return 0;
+    }
+
+    // Unfortunately, the current implementation of fp->read zeros the
+    // iov_len fields when it reads from disk, so we work on a private
+    // copy of the caller's iov array.
+    std::vector<iovec> segments(iov, iov + niov);
+
+    struct uio uio;
+    uio.uio_rw = UIO_READ;
+    uio.uio_iov = segments.data();
+    uio.uio_iovcnt = niov;
+    uio.uio_resid = total;
+    uio.uio_offset = offset;
+
+    auto error = fp->read(&uio, (offset == -1) ? 0 : FOF_OFFSET);
+    // Whatever is left in uio_resid was not transferred.
+    *count = total - uio.uio_resid;
+    return error;
+}
+
+/*
+ * Write the niov buffers described by iov to fp.
+ *
+ * offset - file position to write at, or -1 to call fp->write without
+ *          FOF_OFFSET
+ * count  - receives the number of bytes actually transferred
+ *
+ * Returns EBADF if fp is not open for writing, EINVAL if the combined
+ * buffer length exceeds IOSIZE_MAX, otherwise the result of fp->write.
+ */
+int
+sys_write(struct file *fp, const struct iovec *iov, size_t niov,
+               off_t offset, size_t *count)
+{
+    if (!(fp->f_flags & FWRITE))
+        return EBADF;
+
+    // Add up the segment lengths, refusing totals above IOSIZE_MAX.
+    size_t total = 0;
+    for (unsigned i = 0; i < niov; i++) {
+        const size_t len = iov[i].iov_len;
+        if (len > IOSIZE_MAX - total) {
+            return EINVAL;
+        }
+        total += len;
+    }
+
+    // Nothing to transfer: succeed without touching the file.
+    if (!total) {
+        *count = 0;
+        return 0;
+    }
+
+    // Unfortunately, the current implementation of fp->write zeros the
+    // iov_len fields when it writes to disk, so we work on a private
+    // copy of the caller's iov array.
+    std::vector<iovec> segments(iov, iov + niov);
+
+    struct uio uio;
+    uio.uio_rw = UIO_WRITE;
+    uio.uio_iov = segments.data();
+    uio.uio_iovcnt = niov;
+    uio.uio_resid = total;
+    uio.uio_offset = offset;
+
+    auto error = fp->write(&uio, (offset == -1) ? 0 : FOF_OFFSET);
+    // Whatever is left in uio_resid was not transferred.
+    *count = total - uio.uio_resid;
+    return error;
+}
+
+/*
+ * Reposition the offset of fp.
+ *
+ * off    - offset, interpreted according to type
+ * type   - SEEK_SET (absolute), SEEK_CUR (relative to the current
+ *          offset) or SEEK_END (relative to the vnode size)
+ * origin - receives the resulting absolute offset on success
+ *
+ * Returns 0 on success, ESPIPE if fp is not backed by a dentry, EINVAL
+ * for an unknown type or a negative resulting offset, or the error
+ * returned by VOP_SEEK.
+ */
+int
+sys_lseek(struct file *fp, off_t off, int type, off_t *origin)
+{
+       struct vnode *vp;
+       int error = EINVAL;
+
+       DPRINTF(VFSDB_SYSCALL, ("sys_seek: fp=%x off=%d type=%d\n",
+                               (u_long)fp, (u_int)off, type));
+
+       if (!fp->f_dentry) {
+           // Linux doesn't implement lseek() on pipes, sockets, or ttys.
+           // In OSV, we only implement lseek() on regular files, backed by
+           // vnode
+           return ESPIPE;
+       }
+
+       vp = fp->f_dentry->d_vnode;
+       vn_lock(vp);
+       switch (type) {
+       case SEEK_SET:
+               /* off is already absolute. */
+               break;
+       case SEEK_CUR:
+               off = fp->f_offset + off;
+               break;
+       case SEEK_END:
+               off = vp->v_size + off;
+               break;
+       default:
+               /*
+                * Unknown whence: fail with EINVAL instead of silently
+                * treating off as an absolute position.
+                */
+               vn_unlock(vp);
+               return EINVAL;
+       }
+       /* A negative target offset leaves error at EINVAL. */
+       if (off >= 0) {
+               error = VOP_SEEK(vp, fp, fp->f_offset, off);
+               if (!error) {
+                       *origin      = off;
+                       fp->f_offset = off;
+               }
+       }
+       vn_unlock(vp);
+       return error;
+}
+
+/*
+ * Forward an ioctl request to the file object.
+ *
+ * Returns EBADF if fp is open neither for reading nor for writing,
+ * otherwise the result of fp->ioctl().
+ */
+int
+sys_ioctl(struct file *fp, u_long request, void *buf)
+{
+       DPRINTF(VFSDB_SYSCALL, ("sys_ioctl: fp=%x request=%x\n", fp, request));
+
+       if (!(fp->f_flags & (FREAD | FWRITE)))
+               return EBADF;
+
+       int error = fp->ioctl(request, buf);
+
+       DPRINTF(VFSDB_SYSCALL, ("sys_ioctl: comp error=%d\n", error));
+       return error;
+}
+
+/*
+ * Run VOP_FSYNC on the vnode behind fp while holding the vnode lock.
+ *
+ * Returns EINVAL if fp has no dentry, otherwise the VOP_FSYNC result.
+ */
+int
+sys_fsync(struct file *fp)
+{
+       DPRINTF(VFSDB_SYSCALL, ("sys_fsync: fp=%x\n", fp));
+
+       if (!fp->f_dentry)
+               return EINVAL;
+
+       struct vnode *vp = fp->f_dentry->d_vnode;
+
+       vn_lock(vp);
+       int error = VOP_FSYNC(vp, fp);
+       vn_unlock(vp);
+
+       return error;
+}
+
+/*
+ * Fill st with the status of the open file fp.
+ * Returns whatever fp->stat() returns.
+ */
+int
+sys_fstat(struct file *fp, struct stat *st)
+{
+       DPRINTF(VFSDB_SYSCALL, ("sys_fstat: fp=%x\n", fp));
+
+       return fp->stat(st);
+}
+
+/*
+ * Return 0 if directory is empty
+ *
+ * Opens path read-only and reads entries until one other than "." or
+ * ".." shows up.  Returns ENOTEMPTY if such an entry exists, 0 if
+ * sys_readdir() reports ENOENT first, or the error from sys_open() /
+ * sys_readdir() otherwise.
+ */
+static int
+check_dir_empty(char *path)
+{
+       int error;
+       struct file *fp;
+       struct dirent dir;
+
+       DPRINTF(VFSDB_SYSCALL, ("check_dir_empty\n"));
+
+       error = sys_open(path, O_RDONLY, 0, &fp);
+       if (error)
+               goto out_error;
+
+       /*
+        * Skip over "." and "..".
+        * NOTE(review): on EACCES the loop goes on to inspect dir.d_name;
+        * confirm that sys_readdir() fills it in that case.
+        */
+       do {
+               error = sys_readdir(fp, &dir);
+               if (error != 0 && error != EACCES)
+                       break;
+       } while (!strcmp(dir.d_name, ".") || !strcmp(dir.d_name, ".."));
+
+       /* ENOENT is treated as "no more entries", i.e. the directory is empty. */
+       if (error == ENOENT)
+               error = 0;
+       else if (error == 0) {
+           // Posix specifies to return EEXIST in this case (rmdir of a
+           // non-empty directory), but Linux actually returns ENOTEMPTY.
+               error = ENOTEMPTY;
+       }
+       fdrop(fp);
+out_error:
+       return error;
+}
+
+/*
+ * Read the next directory entry of fp into dir.
+ *
+ * Returns ENOTDIR if fp has no dentry or its vnode is not a directory,
+ * otherwise the result of VOP_READDIR (called with the vnode locked).
+ */
+int
+sys_readdir(struct file *fp, struct dirent *dir)
+{
+       DPRINTF(VFSDB_SYSCALL, ("sys_readdir: fp=%x\n", fp));
+
+       if (!fp->f_dentry)
+               return ENOTDIR;
+
+       struct vnode *dvp = fp->f_dentry->d_vnode;
+       int error = ENOTDIR;
+
+       vn_lock(dvp);
+       if (dvp->v_type == VDIR) {
+               error = VOP_READDIR(dvp, fp, dir);
+               DPRINTF(VFSDB_SYSCALL, ("sys_readdir: error=%d path=%s\n",
+                                       error, dir->d_name));
+       }
+       vn_unlock(dvp);
+
+       return error;
+}
+
+/*
+ * Reset the directory position of fp to 0.
+ *
+ * Returns 0 on success, ENOTDIR if fp has no dentry, or EBADF if its
+ * vnode is not a directory.
+ */
+int
+sys_rewinddir(struct file *fp)
+{
+       if (!fp->f_dentry)
+               return ENOTDIR;
+
+       struct vnode *dvp = fp->f_dentry->d_vnode;
+       int error = EBADF;
+
+       vn_lock(dvp);
+       if (dvp->v_type == VDIR) {
+               fp->f_offset = 0;
+               error = 0;
+       }
+       vn_unlock(dvp);
+
+       return error;
+}
+
+/*
+ * Set the directory position of fp to loc.
+ *
+ * Returns 0 on success, ENOTDIR if fp has no dentry, or EBADF if its
+ * vnode is not a directory.
+ */
+int
+sys_seekdir(struct file *fp, long loc)
+{
+       if (!fp->f_dentry)
+               return ENOTDIR;
+
+       struct vnode *dvp = fp->f_dentry->d_vnode;
+       int error = EBADF;
+
+       vn_lock(dvp);
+       if (dvp->v_type == VDIR) {
+               fp->f_offset = (off_t)loc;
+               error = 0;
+       }
+       vn_unlock(dvp);
+
+       return error;
+}
+
+/*
+ * Store the current directory position of fp in *loc.
+ *
+ * Returns 0 on success, ENOTDIR if fp has no dentry, or EBADF if its
+ * vnode is not a directory.
+ */
+int
+sys_telldir(struct file *fp, long *loc)
+{
+       if (!fp->f_dentry)
+               return ENOTDIR;
+
+       struct vnode *dvp = fp->f_dentry->d_vnode;
+       int error = EBADF;
+
+       vn_lock(dvp);
+       if (dvp->v_type == VDIR) {
+               *loc = (long)fp->f_offset;
+               error = 0;
+       }
+       vn_unlock(dvp);
+
+       return error;
+}
+
+/*
+ * Create the directory named by path.
+ *
+ * The file-type bits of mode are replaced with S_IFDIR before the
+ * request is handed to VOP_MKDIR on the parent directory.
+ *
+ * Returns 0 on success, EEXIST if path already resolves, or the error
+ * from lookup(), vn_access() or VOP_MKDIR.
+ */
+int
+sys_mkdir(char *path, mode_t mode)
+{
+       struct dentry *existing, *parent;
+       char *name;
+       int error;
+
+       DPRINTF(VFSDB_SYSCALL, ("sys_mkdir: path=%s mode=%d\n",       path, mode));
+
+       /* Anything that already resolves at path means EEXIST. */
+       if (namei(path, &existing) == 0) {
+               drele(existing);
+               return EEXIST;
+       }
+
+       /* Resolve the parent directory and the final path component. */
+       error = lookup(path, &parent, &name);
+       if (error != 0)
+               return error;
+
+       struct vnode *dvp = parent->d_vnode;
+
+       vn_lock(dvp);
+       error = vn_access(dvp, VWRITE);
+       if (error == 0) {
+               mode = (mode & ~S_IFMT) | S_IFDIR;
+               error = VOP_MKDIR(dvp, name, mode);
+       }
+       vn_unlock(dvp);
+       drele(parent);
+
+       return error;
+}
+
+/*
+ * Remove the directory named by path.
+ *
+ * Returns 0 on success; otherwise the error from check_dir_empty(),
+ * namei(), vn_access(), lookup() or VOP_RMDIR, ENOTDIR if path is not a
+ * directory, or EBUSY if the vnode is flagged VROOT or has a reference
+ * count of 2 or more.
+ */
+int
+sys_rmdir(char *path)
+{
+       struct dentry *dp, *ddp;
+       struct vnode *vp;
+       int error;
+       char *name;
+
+       DPRINTF(VFSDB_SYSCALL, ("sys_rmdir: path=%s\n", path));
+
+       /* Only empty directories may be removed. */
+       if ((error = check_dir_empty(path)) != 0)
+               return error;
+       error = namei(path, &dp);
+       if (error)
+               return error;
+
+       vp = dp->d_vnode;
+       vn_lock(vp);
+       if ((error = vn_access(vp, VWRITE)) != 0)
+               goto out;
+       if (vp->v_type != VDIR) {
+               error = ENOTDIR;
+               goto out;
+       }
+       if (vp->v_flags & VROOT || vp->v_refcnt >= 2) {
+               error = EBUSY;
+               goto out;
+       }
+       /* Resolve the parent directory and the last path component. */
+       if ((error = lookup(path, &ddp, &name)) != 0)
+               goto out;
+
+       vn_lock(ddp->d_vnode);
+       error = VOP_RMDIR(ddp->d_vnode, vp, name);
+       vn_unlock(ddp->d_vnode);
+
+       vn_unlock(vp);
+       /* NOTE(review): dentry_remove() runs even when VOP_RMDIR failed - confirm intended. */
+       dentry_remove(dp);
+       drele(ddp);
+       drele(dp);
+       return error;
+
+ out:
+       vn_unlock(vp);
+       drele(dp);
+       return error;
+}
+
+/*
+ * Create a file system node at path.
+ *
+ * Only regular files, directories, FIFOs and sockets are accepted as
+ * the file type in mode; any other type yields EINVAL.  Directories are
+ * created through VOP_MKDIR, everything else through VOP_CREATE.
+ *
+ * Returns 0 on success, EEXIST if path already resolves, or the error
+ * from lookup(), vn_access() or the VOP call.
+ */
+int
+sys_mknod(char *path, mode_t mode)
+{
+       struct dentry *existing, *parent;
+       char *name;
+       int error;
+
+       DPRINTF(VFSDB_SYSCALL, ("sys_mknod: path=%s mode=%d\n",       path, mode));
+
+       const mode_t fmt = mode & S_IFMT;
+       if (fmt != S_IFREG && fmt != S_IFDIR &&
+           fmt != S_IFIFO && fmt != S_IFSOCK)
+               return EINVAL;
+
+       /* Anything that already resolves at path means EEXIST. */
+       if (namei(path, &existing) == 0) {
+               drele(existing);
+               return EEXIST;
+       }
+
+       /* Resolve the parent directory and the final path component. */
+       error = lookup(path, &parent, &name);
+       if (error != 0)
+               return error;
+
+       struct vnode *dvp = parent->d_vnode;
+
+       vn_lock(dvp);
+       error = vn_access(dvp, VWRITE);
+       if (error == 0) {
+               if (S_ISDIR(mode))
+                       error = VOP_MKDIR(dvp, name, mode);
+               else
+                       error = VOP_CREATE(dvp, name, mode);
+       }
+       vn_unlock(dvp);
+       drele(parent);
+
+       return error;
+}
+
+/*
+ * Returns true when @parent path could represent parent directory
+ * of a file or directory represented by @child path.
+ *
+ * @child must start with @parent, and the match must end on a path
+ * component boundary: either @parent itself ends in '/' or the next
+ * character of @child is '/'.
+ *
+ * Assumes both paths do not have trailing slashes.
+ * An empty @parent is never a parent of anything.
+ */
+static bool
+is_parent(const char *parent, const char *child)
+{
+       size_t p_len = strlen(parent);
+
+       /* Guard the parent[p_len - 1] access below against p_len == 0. */
+       if (p_len == 0)
+               return false;
+
+       return !strncmp(parent, child, p_len) &&
+              (parent[p_len - 1] == '/' || child[p_len] == '/');
+}
+
+/*
+ * Returns true when the last character of path is ch.
+ * An empty path never matches.
+ */
+static bool
+has_trailing(const char *path, char ch)
+{
+       const size_t n = strlen(path);
+
+       if (n == 0)
+               return false;
+
+       return path[n - 1] == ch;
+}
+
+/*
+ * Remove, in place, every occurrence of ch at the end of path.
+ * A path consisting only of ch becomes the empty string.
+ */
+static void
+strip_trailing(char *path, char ch)
+{
+       char *end = path + strlen(path);
+
+       /* Walk back from the terminator over the run of ch characters. */
+       while (end != path && end[-1] == ch)
+               end--;
+
+       *end = '\0';
+}
+
+/*
+ * Rename src to dest.
+ *
+ * Both path buffers are modified in place: trailing slashes are
+ * stripped and dest is split at its last '/'.
+ *
+ * Returns 0 on success or an errno value.  Besides errors passed up
+ * from lookup(), namei_last_nofollow(), vn_access() and VOP_RENAME, the
+ * function itself produces ENOTDIR, EISDIR, EEXIST, EINVAL and EXDEV.
+ */
+int
+sys_rename(char *src, char *dest)
+{
+       struct dentry *dp1, *dp2 = 0, *ddp1, *ddp2;
+       struct vnode *vp1, *vp2 = 0, *dvp1, *dvp2;
+       char *sname, *dname;
+       int error;
+       char root[] = "/";
+       bool ts; /* trailing slash */
+
+       DPRINTF(VFSDB_SYSCALL, ("sys_rename: src=%s dest=%s\n", src, dest));
+
+       ts = false;
+       if (has_trailing(src, '/') == true) {
+               if (strlen(src) != 1) {
+                       /* remove trailing slash iff path is not root */
+                       strip_trailing(src, '/');
+                       ts = true;
+               }
+       }
+
+       /* Resolve the source: parent directory first, then the entry itself. */
+       error = lookup(src, &ddp1, &sname);
+       if (error != 0) {
+               return (error);
+       }
+
+       error = namei_last_nofollow(src, ddp1, &dp1);
+       if (error != 0) {
+               drele(ddp1);
+               return (error);
+       }
+
+       vp1 = dp1->d_vnode;
+       vn_lock(vp1);
+
+       /* A trailing slash on the source is only valid for a directory. */
+       if (vp1->v_type != VDIR && ts == true) {
+               error = ENOTDIR;
+               goto err1;
+       }
+
+       ts = false;
+       if (has_trailing(dest, '/') == true) {
+               if (strlen(dest) != 1) {
+                       /* remove trailing slash iff path is not root */
+                       strip_trailing(dest, '/');
+                       ts = true;
+               }
+       }
+
+       error = lookup(dest, &ddp2, &dname);
+       if (error != 0) {
+               goto err1;
+       }
+
+       error = namei_last_nofollow(dest, ddp2, &dp2);
+       if (error == 0) {
+               /* target exists */
+
+               vp2 = dp2->d_vnode;
+               vn_lock(vp2);
+
+               /* Source and target must agree on being a directory or not. */
+               if (vp2->v_type != VDIR && vp2->v_type != VLNK) {
+                       if (vp1->v_type == VDIR || ts == true) {
+                               error = ENOTDIR;
+                               goto err2;
+                       }
+               } else if (vp1->v_type != VDIR && vp2->v_type == VDIR) {
+                       error = EISDIR;
+                       goto err2;
+               }
+               /* An existing target directory must be empty. */
+               if (vp2->v_type == VDIR && check_dir_empty(dest)) {
+                       error = EEXIST;
+                       goto err2;
+               }
+       } else if (error == ENOENT) {
+               /* Target does not exist; a trailing slash requires a directory source. */
+               if (vp1->v_type != VDIR && ts == true) {
+                       error = ENOTDIR;
+                       goto err2;
+               }
+       } else {
+               goto err2;
+       }
+
+       if (strcmp(dest, "/"))
+               strip_trailing(dest, '/');
+
+       if (strcmp(src, "/"))
+               strip_trailing(src, '/');
+
+       /* If source and dest are the same, do nothing */
+       if (!strncmp(src, dest, PATH_MAX))
+               goto err2;
+
+       /* Check if target is directory of source */
+       if (is_parent(src, dest)) {
+               error = EINVAL;
+               goto err2;
+       }
+
+       /* Split dest in place into its directory part and final component. */
+       dname = strrchr(dest, '/');
+       if (dname == nullptr) {
+               error = ENOTDIR;
+               goto err2;
+       }
+       if (dname == dest)
+               dest = root;
+
+       *dname = 0;
+       dname++;
+
+       dvp1 = ddp1->d_vnode;
+       vn_lock(dvp1);
+
+       dvp2 = ddp2->d_vnode;
+       vn_lock(dvp2);
+
+       /* Source and destination directories should be writable */
+       if ((error = vn_access(dvp1, VWRITE)) != 0)
+           goto err3;
+       if ((error = vn_access(dvp2, VWRITE)) != 0)
+           goto err3;
+
+       /* The source and dest must be same file system */
+       if (dvp1->v_mount != dvp2->v_mount) {
+               error = EXDEV;
+               goto err3;
+       }
+
+       error = VOP_RENAME(dvp1, vp1, sname, dvp2, vp2, dname);
+
+       /*
+        * NOTE(review): the dentry cache is updated even when VOP_RENAME
+        * returned an error - confirm this is intended.
+        */
+       dentry_move(dp1, ddp2, dname);
+       if (dp2)
+               dentry_remove(dp2);
+
+       /* Cleanup labels release locks and references in reverse order. */
+ err3:
+       vn_unlock(dvp2);
+       vn_unlock(dvp1);
+ err2:
+       if (vp2) {
+               vn_unlock(vp2);
+               drele(dp2);
+       }
+       drele(ddp2);
+ err1:
+       vn_unlock(vp1);
+       drele(dp1);
+       drele(ddp1);
+       return error;
+}
+
+/*
+ * Create a symbolic link at newpath whose target is oldpath.
+ *
+ * Returns 0 on success, EFAULT if either argument is null, ENOENT if
+ * the parent directory of newpath cannot be looked up, EEXIST if
+ * newpath already exists, ENAMETOOLONG if oldpath is too long, or the
+ * error from task_conv(), vn_access() or VOP_SYMLINK.
+ */
+int
+sys_symlink(const char *oldpath, const char *newpath)
+{
+       struct task     *t = main_task;
+       int             error;
+       /* Heap-allocated PATH_MAX scratch buffers, freed automatically on return. */
+       std::unique_ptr<char []> up_op (new char[PATH_MAX]);
+       char            *op = up_op.get();
+       std::unique_ptr<char []> up_np (new char[PATH_MAX]);
+       char            *np = up_np.get();
+       struct dentry   *newdp;
+       struct dentry   *newdirdp;
+       char            *name;
+
+       if (oldpath == nullptr || newpath == nullptr) {
+               return (EFAULT);
+       }
+
+       /* NOTE(review): the debug tag below reads "sys_link", not "sys_symlink". */
+       DPRINTF(VFSDB_SYSCALL, ("sys_link: oldpath=%s newpath=%s\n",
+                               oldpath, newpath));
+
+       newdp           = nullptr;
+       newdirdp        = nullptr;
+
+       /* task_conv() fills np from newpath; presumably the full path - TODO confirm. */
+       error = task_conv(t, newpath, VWRITE, np);
+       if (error != 0) {
+               return (error);
+       }
+
+       /* parent directory for new path must exist */
+       if ((error = lookup(np, &newdirdp, &name)) != 0) {
+               /* Whatever lookup() reported is replaced by ENOENT. */
+               error = ENOENT;
+               goto out;
+       }
+       vn_lock(newdirdp->d_vnode);
+
+       /* newpath should not already exist */
+       if (namei_last_nofollow(np, newdirdp, &newdp) == 0) {
+               drele(newdp);
+               error = EEXIST;
+               goto out;
+       }
+
+       /* check for write access at newpath */
+       if ((error = vn_access(newdirdp->d_vnode, VWRITE)) != 0) {
+               goto out;
+       }
+
+       /* oldpath may not be const char * to VOP_SYMLINK - need to copy */
+       size_t tocopy;
+       tocopy = strlcpy(op, oldpath, PATH_MAX);
+       if (tocopy >= PATH_MAX - 1) {
+               error = ENAMETOOLONG;
+               goto out;
+       }
+       error = VOP_SYMLINK(newdirdp->d_vnode, name, op);
+
+out:
+       /* newdirdp stays null when lookup() failed; nothing to release then. */
+       if (newdirdp != nullptr) {
+               vn_unlock(newdirdp->d_vnode);
+               drele(newdirdp);
+       }
+
+       return (error);
+}
+
+/*
+ * Create a hard link newpath that refers to the file at oldpath.
+ *
+ * Returns 0 on success, EPERM if oldpath is a directory, EEXIST if
+ * newpath already exists, EXDEV if the two paths are on different
+ * mounts, ENOMEM if no dentry could be allocated for newpath, or the
+ * error from namei(), lookup(), vn_access() or VOP_LINK.
+ */
+int
+sys_link(char *oldpath, char *newpath)
+{
+       struct dentry *olddp, *newdirdp;
+       /*
+        * Start as null: several error paths reach the common exit before
+        * newdp has been assigned, and the exit must only release it when
+        * it actually holds a dentry.
+        */
+       struct dentry *newdp = nullptr;
+       struct vnode *vp;
+       char *name;
+       int error;
+
+       DPRINTF(VFSDB_SYSCALL, ("sys_link: oldpath=%s newpath=%s\n",
+                               oldpath, newpath));
+
+       /* File from oldpath must exist */
+       if ((error = namei(oldpath, &olddp)) != 0)
+               return error;
+
+       vp = olddp->d_vnode;
+       vn_lock(vp);
+
+       if (vp->v_type == VDIR) {
+               error = EPERM;
+               goto out;
+       }
+
+       /* If newpath exists, it shouldn't be overwritten */
+       if (!namei(newpath, &newdp)) {
+               error = EEXIST;
+               goto out;
+       }
+
+       /* Get pointer to the parent dentry of newpath */
+       if ((error = lookup(newpath, &newdirdp, &name)) != 0)
+               goto out;
+
+       vn_lock(newdirdp->d_vnode);
+
+       /* Both files must reside on the same mounted file system */
+       if (olddp->d_mount != newdirdp->d_mount) {
+               error = EXDEV;
+               goto out1;
+       }
+
+       /* Write access to the dir containing newpath is required */
+       if ((error = vn_access(newdirdp->d_vnode, VWRITE)) != 0)
+               goto out1;
+
+       /* Map newpath into dentry hash with the same vnode as oldpath */
+       if (!(newdp = dentry_alloc(newdirdp, vp, newpath))) {
+               error = ENOMEM;
+               goto out1;
+       }
+
+       error = VOP_LINK(newdirdp->d_vnode, vp, name);
+ out1:
+       vn_unlock(newdirdp->d_vnode);
+       drele(newdirdp);
+ out:
+       vn_unlock(vp);
+       drele(olddp);
+       /* newdp is null on paths that never obtained a dentry for newpath. */
+       if (newdp != nullptr)
+               drele(newdp);
+       return error;
+}
+
+/*
+ * Remove the directory entry named by path.  The last path component
+ * is resolved without following a symbolic link.
+ *
+ * Returns 0 on success, EISDIR if path is a directory, EBUSY if the
+ * vnode is flagged VROOT, or the error from lookup(),
+ * namei_last_nofollow(), vn_access() or VOP_REMOVE.
+ */
+int
+sys_unlink(char *path)
+{
+       char *name;
+       struct dentry *dp, *ddp;
+       struct vnode *vp;
+       int error;
+
+       DPRINTF(VFSDB_SYSCALL, ("sys_unlink: path=%s\n", path));
+
+       /* Null-initialised so the common exit can tell what was acquired. */
+       ddp   = nullptr;
+       dp    = nullptr;
+       vp    = nullptr;
+
+       error = lookup(path, &ddp, &name);
+       if (error != 0) {
+               return (error);
+       }
+
+       error = namei_last_nofollow(path, ddp, &dp);
+       if (error != 0) {
+               goto out;
+       }
+
+       vp = dp->d_vnode;
+       vn_lock(vp);
+       if (vp->v_type == VDIR) {
+           // Posix specifies that we should return EPERM here, but Linux
+           // actually returns EISDIR.
+               error = EISDIR;
+               goto out;
+       }
+       if (vp->v_flags & VROOT) {
+               error = EBUSY;
+               goto out;
+       }
+
+       /* Removing an entry needs write access to the parent directory. */
+       vn_lock(ddp->d_vnode);
+       if ((error = vn_access(ddp->d_vnode, VWRITE)) != 0) {
+           vn_unlock(ddp->d_vnode);
+           goto out;
+       }
+       error = VOP_REMOVE(ddp->d_vnode, vp, name);
+       vn_unlock(ddp->d_vnode);
+
+       vn_unlock(vp);
+       /* NOTE(review): dentry_remove() runs even when VOP_REMOVE failed - confirm intended. */
+       dentry_remove(dp);
+       drele(ddp);
+       drele(dp);
+       return error;
+ out:
+       if (vp != nullptr) {
+               vn_unlock(vp);
+       }
+
+       if (dp != nullptr) {
+               drele(dp);
+       }
+
+       if (ddp != nullptr) {
+               drele(ddp);
+       }
+       return error;
+}
+
+/*
+ * Check whether the file at path may be accessed as described by mode
+ * (a combination of R_OK, W_OK and X_OK).
+ *
+ * Returns the namei() error if path cannot be resolved, otherwise the
+ * result of vn_access() for the requested access bits.
+ */
+int
+sys_access(char *path, int mode)
+{
+       struct dentry *dp;
+
+       DPRINTF(VFSDB_SYSCALL, ("sys_access: path=%s mode=%x\n", path, mode));
+
+       /* With plain F_OK the existence check done by namei() is all we need. */
+       int error = namei(path, &dp);
+       if (error)
+               return error;
+
+       /* Translate the access(2) bits into vnode access flags. */
+       int flags = ((mode & R_OK) ? VREAD  : 0) |
+                   ((mode & W_OK) ? VWRITE : 0) |
+                   ((mode & X_OK) ? VEXEC  : 0);
+
+       error = vn_access(dp->d_vnode, flags);
+       drele(dp);
+
+       return error;
+}
+
+/*
+ * Fill st with the status of the file at path.
+ *
+ * Returns ENOENT when namei() yields an empty reference, the code
+ * carried by a thrown `error`, or otherwise the result of vn_stat().
+ */
+int
+sys_stat(char *path, struct stat *st)
+{
+       DPRINTF(VFSDB_SYSCALL, ("sys_stat: path=%s\n", path));
+
+       try {
+               dentry_ref dp = namei(path);
+               if (dp) {
+                       return vn_stat(dp->d_vnode, st);
+               }
+               return ENOENT;
+       } catch (error e) {
+               return e.get();
+       }
+}
+
+/*
+ * Like sys_stat(), but the last component of path is resolved with
+ * namei_last_nofollow(), i.e. without following a symbolic link.
+ *
+ * Returns the error from lookup() or namei_last_nofollow(), otherwise
+ * the result of vn_stat().
+ */
+int sys_lstat(char *path, struct stat *st)
+{
+       struct dentry *ddp;
+       struct dentry *dp;
+       char          *name;
+       int           error;
+
+       DPRINTF(VFSDB_SYSCALL, ("sys_lstat: path=%s\n", path));
+
+       error = lookup(path, &ddp, &name);
+       if (error)
+               return error;
+
+       error = namei_last_nofollow(path, ddp, &dp);
+       if (!error) {
+               error = vn_stat(dp->d_vnode, st);
+               drele(dp);
+       }
+       drele(ddp);
+
+       return error;
+}
+
+/*
+ * Fill buf with statistics of the file system that holds path.
+ * buf is zeroed first, so it is cleared even when the call fails.
+ *
+ * Returns ENOENT when namei() yields an empty reference, the code
+ * carried by a thrown `error`, or otherwise the result of VFS_STATFS.
+ */
+int
+sys_statfs(char *path, struct statfs *buf)
+{
+       memset(buf, 0, sizeof(*buf));
+
+       try {
+               dentry_ref dp = namei(path);
+               if (dp) {
+                       return VFS_STATFS(dp->d_mount, buf);
+               }
+               return ENOENT;
+       } catch (error e) {
+               return e.get();
+       }
+}
+
+/*
+ * Fill buf with statistics of the file system that holds the open file
+ * fp.  buf is zeroed before VFS_STATFS is called.
+ *
+ * Returns EBADF if fp has no dentry, otherwise the VFS_STATFS result.
+ */
+int
+sys_fstatfs(struct file *fp, struct statfs *buf)
+{
+       if (!fp->f_dentry)
+               return EBADF;
+
+       struct vnode *vp = fp->f_dentry->d_vnode;
+
+       memset(buf, 0, sizeof(*buf));
+
+       vn_lock(vp);
+       int error = VFS_STATFS(vp->v_mount, buf);
+       vn_unlock(vp);
+
+       return error;
+}
+
+/*
+ * Set the size of the file at path to length via VOP_TRUNCATE.
+ *
+ * Returns the namei() error if path cannot be resolved, otherwise the
+ * VOP_TRUNCATE result.
+ */
+int
+sys_truncate(char *path, off_t length)
+{
+       struct dentry *dp;
+
+       int error = namei(path, &dp);
+       if (error)
+               return error;
+
+       struct vnode *vp = dp->d_vnode;
+
+       vn_lock(vp);
+       error = VOP_TRUNCATE(vp, length);
+       vn_unlock(vp);
+
+       drele(dp);
+       return error;
+}
+
+/*
+ * Set the size of the open file fp to length via VOP_TRUNCATE.
+ *
+ * Returns EBADF if fp has no dentry, otherwise the VOP_TRUNCATE result.
+ */
+int
+sys_ftruncate(struct file *fp, off_t length)
+{
+       if (!fp->f_dentry)
+               return EBADF;
+
+       struct vnode *vp = fp->f_dentry->d_vnode;
+
+       vn_lock(vp);
+       int error = VOP_TRUNCATE(vp, length);
+       vn_unlock(vp);
+
+       return error;
+}
+
+/*
+ * Copy the path of the directory referred to by fp into cwd (at most
+ * PATH_MAX bytes, via strlcpy).
+ *
+ * Returns 0 on success, or EBADF if fp has no dentry or its vnode is
+ * not a directory.
+ */
+int
+sys_fchdir(struct file *fp, char *cwd)
+{
+       if (!fp->f_dentry)
+               return EBADF;
+
+       struct vnode *dvp = fp->f_dentry->d_vnode;
+       int error = EBADF;
+
+       vn_lock(dvp);
+       if (dvp->v_type == VDIR) {
+               strlcpy(cwd, fp->f_dentry->d_path, PATH_MAX);
+               error = 0;
+       }
+       vn_unlock(dvp);
+
+       return error;
+}
+
+/*
+ * Read the target of the symbolic link at path into buf.
+ *
+ * bufsize - capacity of buf
+ * size    - receives the number of bytes stored in buf; it is set to 0
+ *           on entry and stays 0 on any failure
+ *
+ * Returns 0 on success, EINVAL if path is not a symbolic link, or the
+ * error from lookup(), namei_last_nofollow() or VOP_READLINK.
+ */
+int
+sys_readlink(char *path, char *buf, size_t bufsize, ssize_t *size)
+{
+       struct dentry   *ddp;
+       struct dentry   *dp;
+       char            *name;
+       int             error;
+
+       *size = 0;
+
+       error = lookup(path, &ddp, &name);
+       if (error)
+               return error;
+
+       error = namei_last_nofollow(path, ddp, &dp);
+       if (error) {
+               drele(ddp);
+               return error;
+       }
+
+       struct vnode *vp = dp->d_vnode;
+
+       if (vp->v_type != VLNK) {
+               error = EINVAL;
+       } else {
+               /* Describe buf as a single-segment read request. */
+               struct iovec vec;
+               struct uio   uio;
+
+               vec.iov_base   = buf;
+               vec.iov_len    = bufsize;
+
+               uio.uio_iov    = &vec;
+               uio.uio_iovcnt = 1;
+               uio.uio_offset = 0;
+               uio.uio_resid  = bufsize;
+               uio.uio_rw     = UIO_READ;
+
+               vn_lock(vp);
+               error = VOP_READLINK(vp, &uio);
+               vn_unlock(vp);
+
+               /* Bytes stored = capacity minus what is left in uio_resid. */
+               if (!error)
+                       *size = bufsize - uio.uio_resid;
+       }
+
+       drele(dp);
+       drele(ddp);
+       return error;
+}
+
+/*
+ * A struct timeval is valid when tv_sec is non-negative and tv_usec
+ * lies in the range [0, 1000000).
+ */
+static bool is_timeval_valid(const struct timeval *time)
+{
+    if (time->tv_sec < 0)
+        return false;
+
+    return time->tv_usec >= 0 && time->tv_usec < 1000000;
+}
+
+/*
+ * Fill `to` from a struct timeval.  When `from` is null, `to` is set to
+ * the current CLOCK_REALTIME time instead.
+ */
+static void convert_timeval(struct timespec &to, const struct timeval *from)
+{
+    if (!from) {
+        clock_gettime(CLOCK_REALTIME, &to);
+        return;
+    }
+
+    to.tv_sec = from->tv_sec;
+    // Convert microseconds to nanoseconds
+    to.tv_nsec = from->tv_usec * 1000;
+}
+
+/*
+ * Set the time stamps of the file at path.
+ *
+ * times - two timeval entries that are converted to timespec and passed
+ *         to vn_settimes(); when null, the current CLOCK_REALTIME time
+ *         is used for both
+ * flags - with AT_SYMLINK_NOFOLLOW the last path component is resolved
+ *         without following a symbolic link
+ *
+ * Returns 0 on success, EINVAL for an invalid timeval, EROFS if the
+ * mount is read-only, or the error from the path lookup or
+ * vn_settimes().
+ */
+int
+sys_utimes(char *path, const struct timeval times[2], int flags)
+{
+    int error;
+    struct dentry *dp;
+    struct timespec timespec_times[2];
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_utimes: path=%s\n", path));
+
+    if (times && (!is_timeval_valid(&times[0]) || 
!is_timeval_valid(&times[1])))
+        return EINVAL;
+
+    // Convert each element of timeval array to the timespec type
+    convert_timeval(timespec_times[0], times ? times + 0 : nullptr);
+    convert_timeval(timespec_times[1], times ? times + 1 : nullptr);
+
+    if (flags & AT_SYMLINK_NOFOLLOW) {
+        struct dentry *ddp;
+        // NOTE(review): lookup() is given a null name pointer here, unlike
+        // every other caller in this file - confirm lookup() accepts that.
+        error = lookup(path, &ddp, nullptr);
+        if (error) {
+            return error;
+        }
+
+        // The parent reference is only needed for this lookup.
+        error = namei_last_nofollow(path, ddp, &dp);
+        if (ddp != nullptr) {
+            drele(ddp);
+        }
+        if (error) {
+            return error;
+        }
+    } else {
+        error = namei(path, &dp);
+        if (error)
+            return error;
+    }
+
+    // Time stamps cannot be changed on a read-only mount.
+    if (dp->d_mount->m_flags & MNT_RDONLY) {
+        error = EROFS;
+    } else {
+        error = vn_settimes(dp->d_vnode, timespec_times);
+    }
+
+    drele(dp);
+    return error;
+}
+
+/*
+ * A struct timespec is valid when tv_sec is non-negative and tv_nsec is
+ * either within [0, 999999999] or one of the special values UTIME_NOW
+ * and UTIME_OMIT.
+ */
+static bool is_timespec_valid(const struct timespec &time)
+{
+    if (time.tv_sec < 0)
+        return false;
+
+    const auto nsec = time.tv_nsec;
+    if (nsec >= 0 && nsec <= 999999999)
+        return true;
+
+    return nsec == UTIME_NOW || nsec == UTIME_OMIT;
+}
+
+/*
+ * Initialise _times from times.  A null pointer or a tv_nsec of
+ * UTIME_NOW selects the current CLOCK_REALTIME time; any other value is
+ * copied field by field.
+ */
+void init_timespec(struct timespec &_times, const struct timespec *times)
+{
+    const bool use_now = (times == nullptr) || (times->tv_nsec == UTIME_NOW);
+
+    if (use_now) {
+        clock_gettime(CLOCK_REALTIME, &_times);
+        return;
+    }
+
+    _times.tv_sec = times->tv_sec;
+    _times.tv_nsec = times->tv_nsec;
+}
+
+/*
+ * Set the access and modification times of a file.
+ *
+ * The target path is built as follows:
+ *  - an absolute pathname is used as is;
+ *  - with dirfd == AT_FDCWD, pathname is appended to main_task->t_cwd
+ *    (EFAULT if pathname is null);
+ *  - otherwise dirfd must refer to a directory with a dentry, and the
+ *    path is assembled from the mount path, the directory's d_path and
+ *    pathname (if any).
+ *
+ * Returns 0 or an errno value: ENOENT (empty pathname), EINVAL (bad
+ * flags or times), EFAULT, EBADF, ENOTDIR, EROFS, EACCES, EPERM, or
+ * whatever namei()/vn_settimes() report.
+ */
+int
+sys_utimensat(int dirfd, const char *pathname, const struct timespec times[2],
+              int flags)
+{
+    int error;
+    std::string ap;
+    struct timespec timespec_times[2];
+    extern struct task *main_task;
+    struct dentry *dp;
+
+    /* utimensat should return ENOENT when pathname is empty */
+    if (pathname && pathname[0] == 0)
+        return ENOENT;
+
+    if (flags && !(flags & AT_SYMLINK_NOFOLLOW))
+        return EINVAL;
+
+    if (times && (!is_timespec_valid(times[0]) ||
+                  !is_timespec_valid(times[1])))
+        return EINVAL;
+
+    init_timespec(timespec_times[0], times ? times + 0 : nullptr);
+    init_timespec(timespec_times[1], times ? times + 1 : nullptr);
+
+    if (pathname && pathname[0] == '/') {
+        ap = pathname;
+    } else if (dirfd == AT_FDCWD) {
+        if (!pathname)
+            return EFAULT;
+        ap = std::string(main_task->t_cwd) + "/" + pathname;
+    } else {
+        struct file *fp;
+        fileref f(fileref_from_fd(dirfd));
+
+        if (!f)
+            return EBADF;
+
+        fp = f.get();
+
+        if (!fp->f_dentry)
+            return EBADF;
+
+        /*
+         * v_type is an enumeration, not a bit mask (sys_fallocate
+         * compares it with ==/!=), so test for equality; a bitwise
+         * AND would also match other types sharing a bit with VDIR.
+         */
+        if (fp->f_dentry->d_vnode->v_type != VDIR)
+            return ENOTDIR;
+
+        if (pathname)
+            ap = std::string(fp->f_dentry->d_path) + "/" + pathname;
+        else
+            ap = fp->f_dentry->d_path;
+
+        ap = std::string(fp->f_dentry->d_mount->m_path) + "/" + ap;
+    }
+
+    /* FIXME: Add support for AT_SYMLINK_NOFOLLOW */
+
+    error = namei(ap.c_str(), &dp);
+
+    if (error)
+        return error;
+
+    /*
+     * From here on dp holds a reference: every outcome falls through
+     * to the single drele() below instead of returning early.
+     */
+    if (dp->d_mount->m_flags & MNT_RDONLY) {
+        error = EROFS;
+    } else if (vn_access(dp->d_vnode, VWRITE)) {
+        error = EACCES;
+    } else if (times &&
+               (times[0].tv_nsec != UTIME_NOW || times[1].tv_nsec != UTIME_NOW) &&
+               (times[0].tv_nsec != UTIME_OMIT || times[1].tv_nsec != UTIME_OMIT) &&
+               (!(dp->d_vnode->v_mode & ~VAPPEND))) {
+        /* Explicit timestamps requested and no mode bit other than VAPPEND set */
+        error = EPERM;
+    } else {
+        error = vn_settimes(dp->d_vnode, timespec_times);
+    }
+
+    drele(dp);
+    return error;
+}
+
+/*
+ * Set file times through an open descriptor by forwarding the dentry's
+ * d_path to sys_utimensat(AT_FDCWD, ...).  Returns EBADF when fd is not
+ * open or has no dentry.
+ */
+int
+sys_futimens(int fd, const struct timespec times[2])
+{
+    fileref f(fileref_from_fd(fd));
+    if (!f)
+        return EBADF;
+
+    struct file *fp = f.get();
+    if (!fp->f_dentry)
+        return EBADF;
+
+    std::string pathname = fp->f_dentry->d_path;
+    return sys_utimensat(AT_FDCWD, pathname.c_str(), times, 0);
+}
+
+/*
+ * Forward an fallocate request to the file system behind fp.
+ * Returns EBADF (no dentry or not opened for writing), EINVAL (negative
+ * offset or non-positive len), ENOTSUP, ENODEV, EOPNOTSUPP, or the
+ * result of VOP_FALLOCATE().
+ */
+int
+sys_fallocate(struct file *fp, int mode, loff_t offset, loff_t len)
+{
+    struct vnode *vp;
+    int error;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_fallocate: fp=%x", fp));
+
+    if (!fp->f_dentry || !(fp->f_flags & FWRITE))
+        return EBADF;
+
+    if (offset < 0 || len <= 0)
+        return EINVAL;
+
+    // Strange, but that's what Linux returns.
+    if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
+        return ENOTSUP;
+
+    vp = fp->f_dentry->d_vnode;
+    vn_lock(vp);
+
+    if (vp->v_type != VREG && vp->v_type != VDIR) {
+        // NOTE: It's not detected here whether or not the device underlying
+        // the fs is a block device. It's up to the fs itself tell us whether
+        // or not fallocate is supported.
+        error = ENODEV;
+    } else if (!vp->v_op->vop_fallocate) {
+        // EOPNOTSUPP here means that the underlying file system
+        // referred by vp doesn't support fallocate.
+        error = EOPNOTSUPP;
+    } else {
+        error = VOP_FALLOCATE(vp, mode, offset, len);
+    }
+
+    vn_unlock(vp);
+    return error;
+}
+
+/*
+ * Change the mode of the file named by path.
+ * Returns a namei() error, EROFS on a read-only mount, or the result
+ * of vn_setmode().
+ */
+int
+sys_chmod(const char *path, mode_t mode)
+{
+    struct dentry *dp;
+    int error;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_chmod: path=%s\n", path));
+
+    error = namei(path, &dp);
+    if (error)
+        return error;
+
+    error = (dp->d_mount->m_flags & MNT_RDONLY)
+            ? EROFS
+            : vn_setmode(dp->d_vnode, mode);
+
+    drele(dp);
+    return error;
+}
+
+/*
+ * Change the mode of the file behind descriptor fd.
+ * Returns EBADF for an unknown fd, 0 when the file has no dentry,
+ * EROFS on a read-only mount, otherwise the result of vn_setmode().
+ */
+int
+sys_fchmod(int fd, mode_t mode)
+{
+    fileref f(fileref_from_fd(fd));
+
+    if (!f)
+        return EBADF;
+
+    // Posix is ambivalent on what fchmod() should do on an fd that does not
+    // refer to a real file. It suggests an implementation may (but not must)
+    // fail EINVAL on a pipe, can behave in an "unspecified" manner on a
+    // socket, and for a STREAM, it must succeed and do nothing. Linux seems
+    // to just do the last thing (do nothing and succeed).
+    if (!f->f_dentry)
+        return 0;
+
+    if (f->f_dentry->d_mount->m_flags & MNT_RDONLY)
+        return EROFS;
+
+    return vn_setmode(f->f_dentry->d_vnode, mode);
+}
diff --git a/lib/vfscore/task.c b/lib/vfscore/task.c
new file mode 100644
index 00000000..7a355034
--- /dev/null
+++ b/lib/vfscore/task.c
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2007, Kohsuke Ohtani All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * vfs_task.c - Routines to manage the per task data.
+ */
+
+
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+
+#include <osv/prex.h>
+#include "vfs.h"
+
+/*
+ * Allocate a new, zero-filled task whose working directory is "/".
+ * On success the task is stored in *pt and 0 is returned; ENOMEM is
+ * returned if the allocation yields a null pointer.
+ */
+int
+task_alloc(struct task **pt)
+{
+       struct task *t = new task;
+
+       // FIXME: where do we free task ?
+       if (t == nullptr)
+               return ENOMEM;
+
+       memset(t, 0, sizeof(*t));
+       strlcpy(t->t_cwd, "/", sizeof(t->t_cwd));
+       *pt = t;
+
+       return 0;
+}
+
+/*
+ * Convert to full path from the cwd of task and path.
+ * @wd:    working directory
+ * @cpath: target path (absolute, or relative to @wd)
+ * @full:  full path to be returned; it is filled with
+ *         strlcpy(full, wd, PATH_MAX), i.e. treated as a buffer of at
+ *         least PATH_MAX bytes
+ *
+ * "." components are dropped and ".." components remove the previously
+ * emitted component.  Returns 0 on success, or ENAMETOOLONG when @cpath
+ * alone, or @wd and @cpath combined, do not fit in PATH_MAX bytes.
+ */
+int
+path_conv(char *wd, const char *cpath, char *full)
+{
+       char path[PATH_MAX];
+       char *src, *tgt, *p, *end;
+       size_t len = 0;
+
+       /*
+        * strlcpy() returns strlen(cpath), so a result >= PATH_MAX means
+        * the copy was truncated.  Measuring the truncated copy instead
+        * could never detect an over-long path.
+        */
+       len = strlcpy(path, cpath, PATH_MAX);
+       if (len >= PATH_MAX)
+               return ENAMETOOLONG;
+       if (strlen(wd) + len >= PATH_MAX)
+               return ENAMETOOLONG;
+       src = path;
+       tgt = full;
+       end = src + len;
+       if (path[0] == '/') {
+               *tgt++ = *src++;
+               len = 1;
+       } else {
+               size_t wdlen = strlen(wd);
+               /*
+                * Only a leading "." or ".." component may follow wd
+                * without a separator (the loop below handles it).  A
+                * name that merely starts with a dot, such as ".hidden",
+                * is a regular component and needs the '/'.
+                */
+               int dot_first = path[0] == '.' &&
+                       (path[1] == '\0' || path[1] == '/' ||
+                        (path[1] == '.' &&
+                         (path[2] == '\0' || path[2] == '/')));
+               int need_sep = (wdlen > 1 && !dot_first);
+
+               /* Count the inserted '/' too, so full cannot overflow */
+               if (wdlen + need_sep + len >= PATH_MAX)
+                       return ENAMETOOLONG;
+
+               strlcpy(full, wd, PATH_MAX);
+               len = wdlen;
+               tgt += len;
+               if (need_sep) {
+                       *tgt = '/';
+                       tgt++;
+                       len++;
+               }
+       }
+       while (*src) {
+               /* Cut out the next component of path in place */
+               p = src;
+               while (*p != '/' && *p != '\0')
+                       p++;
+               *p = '\0';
+               if (!strcmp(src, "..")) {
+                       if (len >= 2) {
+                               len -= 2;
+                               tgt -= 2;       /* skip previous '/' */
+                               while (*tgt != '/') {
+                                       tgt--;
+                                       len--;
+                               }
+                               if (len == 0) {
+                                       /* Never back up past the root '/' */
+                                       tgt++;
+                                       len++;
+                               }
+                       }
+               } else if (!strcmp(src, ".")) {
+                       /* Ignore "." */
+               } else {
+                       while (*src != '\0') {
+                               *tgt++ = *src++;
+                               len++;
+                       }
+               }
+               if (p == end)
+                       break;
+               if (len > 0 && *(tgt - 1) != '/') {
+                       *tgt++ = '/';
+                       len++;
+               }
+               src = p + 1;
+       }
+       *tgt = '\0';
+
+       return (0);
+}
+
+/*
+ * Build the full path for cpath relative to the task's working
+ * directory.
+ * @t:     task structure
+ * @cpath: target path
+ * @acc:   access mode (currently unused, the permission check is
+ *         commented out)
+ * @full:  full path to be returned
+ *
+ * Returns the result of path_conv().
+ */
+int
+task_conv(struct task *t, const char *cpath, int acc, char *full)
+{
+       int rc = path_conv(t->t_cwd, cpath, full);
+
+       /* Check if the client task has required permission */
+       //sec_file_permission(t->t_taskid, full, acc);
+       return (rc);
+}
+
+/*
+ * Bounded string copy that reports truncation.
+ * Returns 0 when src (including its terminator) fits in size bytes,
+ * -1 when strlcpy() reports that it did not.
+ */
+int vfs_dname_copy(char *dest, const char *src, size_t size)
+{
+    size_t needed = strlcpy(dest, src, size);
+
+    return (needed < size) ? 0 : -1;
+}
diff --git a/lib/vfscore/vfs.h b/lib/vfscore/vfs.h
new file mode 100644
index 00000000..d86ef957
--- /dev/null
+++ b/lib/vfscore/vfs.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _VFS_H
+#define _VFS_H
+
+#include <sys/cdefs.h>
+#include <assert.h>
+#include <dirent.h>
+#include <limits.h>
+
+#include <osv/prex.h>
+#include <osv/file.h>
+#include <osv/mount.h>
+#include <osv/vnode.h>
+#include <osv/dentry.h>
+#include <osv/error.h>
+
+/*
+ * Import vnode attributes flags
+ */
+#include <osv/vnode_attr.h>
+
+/* #define DEBUG_VFS 1 */
+
+/*
+ * Tunable parameters
+ */
+#define FSMAXNAMES     16              /* max length of 'file system' name */
+
+#ifdef DEBUG_VFS
+#include <osv/debug.h>
+
+extern int vfs_debug;
+
+#define        VFSDB_CORE      0x00000001
+#define        VFSDB_SYSCALL   0x00000002
+#define        VFSDB_VNODE     0x00000004
+#define        VFSDB_BIO       0x00000008
+#define        VFSDB_CAP       0x00000010
+
+#define VFSDB_FLAGS    0x00000013
+
+#define        DPRINTF(_m,X)   if (vfs_debug & (_m)) kprintf X
+#else
+#define        DPRINTF(_m, X)
+#endif
+
+#define ASSERT(e)      assert(e)
+
+#define OPEN_MAX       256
+
+/*
+ * per task data
+ *
+ * t_cwd holds the working directory as a NUL-terminated string of at
+ * most PATH_MAX bytes; t_cwdfp is the file object for that directory.
+ */
+struct task {
+       char        t_cwd[PATH_MAX];    /* current working directory */
+       struct file *t_cwdfp;           /* directory for cwd */
+};
+
+extern const struct vfssw vfssw[];
+
+__BEGIN_DECLS
+int     sys_open(char *path, int flags, mode_t mode, struct file **fp);
+int     sys_read(struct file *fp, const struct iovec *iov, size_t niov,
+               off_t offset, size_t *count);
+int     sys_write(struct file *fp, const struct iovec *iov, size_t niov,
+               off_t offset, size_t *count);
+int     sys_lseek(struct file *fp, off_t off, int type, off_t * cur_off);
+int     sys_ioctl(struct file *fp, u_long request, void *buf);
+int     sys_fstat(struct file *fp, struct stat *st);
+int     sys_fstatfs(struct file *fp, struct statfs *buf);
+int     sys_fsync(struct file *fp);
+int     sys_ftruncate(struct file *fp, off_t length);
+
+int     sys_readdir(struct file *fp, struct dirent *dirent);
+int     sys_rewinddir(struct file *fp);
+int     sys_seekdir(struct file *fp, long loc);
+int     sys_telldir(struct file *fp, long *loc);
+int     sys_fchdir(struct file *fp, char *path);
+
+int     sys_mkdir(char *path, mode_t mode);
+int     sys_rmdir(char *path);
+int     sys_mknod(char *path, mode_t mode);
+int     sys_rename(char *src, char *dest);
+int     sys_link(char *oldpath, char *newpath);
+int     sys_unlink(char *path);
+int     sys_symlink(const char *oldpath, const char *newpath);
+int     sys_access(char *path, int mode);
+int     sys_stat(char *path, struct stat *st);
+int     sys_lstat(char *path, struct stat *st);
+int     sys_statfs(char *path, struct statfs *buf);
+int     sys_truncate(char *path, off_t length);
+int     sys_readlink(char *path, char *buf, size_t bufsize, ssize_t *size);
+int  sys_utimes(char *path, const struct timeval times[2], int flags);
+int  sys_utimensat(int dirfd, const char *pathname,
+                   const struct timespec times[2], int flags);
+int  sys_futimens(int fd, const struct timespec times[2]);
+int  sys_fallocate(struct file *fp, int mode, loff_t offset, loff_t len);
+
+int     sys_mount(const char *dev, const char *dir, const char *fsname, int 
flags, const void *data);
+int     sys_umount2(const char *path, int flags);
+int     sys_umount(const char *path);
+int     sys_pivot_root(const char *new_root, const char *old_put);
+int     sys_sync(void);
+int     sys_chmod(const char *path, mode_t mode);
+int     sys_fchmod(int fd, mode_t mode);
+
+
+int     task_alloc(struct task **pt);
+int     task_conv(struct task *t, const char *path, int mode, char *full);
+int     path_conv(char *wd, const char *cpath, char *full);
+
+//int   sec_file_permission(task_t task, char *path, int mode);
+int     sec_vnode_permission(char *path);
+
+int     namei(const char *path, struct dentry **dpp);
+int     namei_last_nofollow(char *path, struct dentry *ddp, struct dentry 
**dp);
+int     lookup(char *path, struct dentry **dpp, char **name);
+void    vnode_init(void);
+void    lookup_init(void);
+
+int     vfs_findroot(const char *path, struct mount **mp, char **root);
+int     vfs_dname_copy(char *dest, const char *src, size_t size);
+
+int     fs_noop(void);
+
+struct dentry *dentry_alloc(struct dentry *parent_dp, struct vnode *vp, const 
char *path);
+struct dentry *dentry_lookup(struct mount *mp, char *path);
+void dentry_move(struct dentry *dp, struct dentry *parent_dp, char *path);
+void dentry_remove(struct dentry *dp);
+void dref(struct dentry *dp);
+void drele(struct dentry *dp);
+void dentry_init(void);
+
+#ifdef DEBUG_VFS
+void    vnode_dump(void);
+void    mount_dump(void);
+#endif
+
+__END_DECLS
+
+#ifdef __cplusplus
+
+// Resolve a path to a dentry_ref.  A missing path (ENOENT) yields an
+// empty reference rather than an exception, for efficiency; any other
+// failure is thrown as an error built by make_error().
+inline dentry_ref namei(char* path)
+{
+       dentry* found;
+       auto err = namei(path, &found);
+
+       if (!err) {
+               return dentry_ref(found, false);
+       }
+       if (err == ENOENT) {
+               return dentry_ref();
+       }
+       throw make_error(err);
+}
+
+#endif
+
+#endif /* !_VFS_H */
diff --git a/lib/vfscore/vnode.c b/lib/vfscore/vnode.c
new file mode 100644
index 00000000..a292344f
--- /dev/null
+++ b/lib/vfscore/vnode.c
@@ -0,0 +1,522 @@
+/*
+ * Copyright (c) 2005-2008, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * vfs_vnode.c - vnode service
+ */
+
+#include <limits.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/stat.h>
+
+#include <osv/prex.h>
+#include <osv/vnode.h>
+#include "vfs.h"
+
+enum vtype iftovt_tab[16] = {  /* 16 vnode types; presumably indexed by the S_IFMT nibble of a mode - TODO confirm */
+       VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+       VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
+};
+int vttoif_tab[10] = {         /* 10 S_IF* values; presumably indexed by enum vtype - TODO confirm */
+       0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
+       S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
+};
+
+/*
+ * Memo:
+ *
+ * Function   Ref count Lock
+ * ---------- --------- ----------
+ * vn_lock     *        Lock
+ * vn_unlock   *        Unlock
+ * vget        1        Lock
+ * vput       -1        Unlock
+ * vref       +1        *
+ * vrele      -1        *
+ */
+
+#define VNODE_BUCKETS 32               /* size of vnode hash table */
+
+/*
+ * vnode table.
+ * All active (opened) vnodes are stored on this hash table.
+ * They can be accessed by its path name.
+ */
+static LIST_HEAD(vnode_hash_head, vnode) vnode_table[VNODE_BUCKETS];
+
+/*
+ * Global lock to access all vnodes and vnode table.
+ * If a vnode is already locked, there is no need to
+ * lock this global lock to access internal data.
+ */
+static mutex_t vnode_lock = MUTEX_INITIALIZER;
+#define VNODE_LOCK()   mutex_lock(&vnode_lock)
+#define VNODE_UNLOCK() mutex_unlock(&vnode_lock)
+#define VNODE_OWNED()  mutex_owned(&vnode_lock)
+
+/*
+ * Compute the vnode_table bucket for a (mount point, inode number)
+ * pair: the inode number is XORed with the mount pointer value and
+ * masked to the table size.
+ * XXX(hch): replace with a better hash for 64-bit pointers.
+ */
+static u_int
+vn_hash(struct mount *mp, uint64_t ino)
+{
+       const unsigned long mount_bits = (unsigned long)mp;
+
+       return (ino ^ mount_bits) & (VNODE_BUCKETS - 1);
+}
+
+/*
+ * Find the vnode for the given mount point and inode number.
+ * On a hit the vnode's reference count is incremented, its mutex is
+ * taken, v_nrlocks is bumped and the vnode is returned; nullptr is
+ * returned when no entry matches.
+ *
+ * Locking: VNODE_LOCK must be held.
+ */
+struct vnode *
+vn_lookup(struct mount *mp, uint64_t ino)
+{
+       struct vnode *vp;
+
+       assert(VNODE_OWNED());
+       LIST_FOREACH(vp, &vnode_table[vn_hash(mp, ino)], v_link) {
+               if (vp->v_mount != mp || vp->v_ino != ino)
+                       continue;
+
+               vp->v_refcnt++;
+               mutex_lock(&vp->v_lock);
+               vp->v_nrlocks++;
+               return vp;
+       }
+
+       /* not found */
+       return nullptr;
+}
+
+#ifdef DEBUG_VFS
+/*
+ * Debug helper: path of the first dentry on vp->v_names, or " " when
+ * the vnode has no names.
+ */
+static const char *
+vn_path(struct vnode *vp)
+{
+       if (LIST_EMPTY(&vp->v_names))
+               return (" ");
+
+       struct dentry *first = LIST_FIRST(&vp->v_names);
+
+       return (first->d_path);
+}
+#endif
+
+/*
+ * Lock vnode
+ *
+ * Takes vp->v_lock and then increments vp->v_nrlocks.  The vnode must
+ * be non-null and referenced (v_refcnt > 0); both are asserted.
+ */
+void
+vn_lock(struct vnode *vp)
+{
+       ASSERT(vp);
+       ASSERT(vp->v_refcnt > 0);
+
+       mutex_lock(&vp->v_lock);
+       vp->v_nrlocks++;        /* counted only once the mutex is held */
+       DPRINTF(VFSDB_VNODE, ("vn_lock:   %s\n", vn_path(vp)));
+}
+
+/*
+ * Unlock vnode
+ *
+ * Decrements vp->v_nrlocks and releases vp->v_lock.  The vnode must be
+ * non-null, referenced and currently locked; all three are asserted.
+ */
+void
+vn_unlock(struct vnode *vp)
+{
+       ASSERT(vp);
+       ASSERT(vp->v_refcnt > 0);
+       ASSERT(vp->v_nrlocks > 0);
+
+       /*
+        * Trace before dropping the mutex: vn_path() walks vp->v_names,
+        * which vn_add_name()/vn_del_name() modify under this lock.
+        */
+       DPRINTF(VFSDB_VNODE, ("vn_unlock: %s\n", vn_path(vp)));
+       vp->v_nrlocks--;
+       mutex_unlock(&vp->v_lock);
+}
+
+/*
+ * Get the vnode for (mp, ino), allocating a new one if it is not in
+ * the vnode table.
+ *
+ * Cache hit:  *vpp is the vnode returned by vn_lookup() (reference
+ *             taken and locked there) and 1 is returned.
+ * Cache miss: a new vnode is created with v_refcnt 1, VFS_VGET() is
+ *             called for the fs specific part, the vnode is locked and
+ *             inserted into the table, *vpp is set and 0 is returned.
+ *
+ * NOTE(review): the return value alone does not signal failure.  If
+ * the allocation yields a null pointer 0 is returned with *vpp left as
+ * nullptr, and a VFS_VGET() failure returns that error code, also with
+ * *vpp == nullptr.  Callers should check *vpp.
+ */
+int
+vget(struct mount *mp, uint64_t ino, struct vnode **vpp)
+{
+       struct vnode *vp;
+       int error;
+
+       *vpp = nullptr;
+
+       DPRINTF(VFSDB_VNODE, ("vget %LLu\n", ino));
+
+       VNODE_LOCK();
+
+       vp = vn_lookup(mp, ino);
+       if (vp) {
+               VNODE_UNLOCK();
+               *vpp = vp;
+               return 1;
+       }
+
+       if (!(vp = new vnode())) {
+               VNODE_UNLOCK();
+               return 0;
+       }
+
+       LIST_INIT(&vp->v_names);
+       vp->v_ino = ino;
+       vp->v_mount = mp;
+       vp->v_refcnt = 1;
+       vp->v_op = mp->m_op->vfs_vnops;
+       vp->v_nrlocks = 0;
+
+       /*
+        * Request to allocate fs specific data for vnode.
+        */
+       if ((error = VFS_VGET(mp, vp)) != 0) {
+               VNODE_UNLOCK();
+               delete vp;
+               return error;
+       }
+       vfs_busy(vp->v_mount);
+       mutex_lock(&vp->v_lock);
+       vp->v_nrlocks++;
+
+       LIST_INSERT_HEAD(&vnode_table[vn_hash(mp, ino)], vp, v_link);
+       VNODE_UNLOCK();
+
+       *vpp = vp;
+
+       return 0;
+}
+
+/*
+ * Unlock vnode and decrement its reference count.
+ *
+ * The vnode must be locked and referenced (asserted).  If references
+ * remain afterwards the vnode is simply unlocked.  When the last
+ * reference goes away the vnode is removed from the vnode table, the
+ * optional vop_inactive hook is called, the mount is released with
+ * vfs_unbusy(), the lock is dropped and the vnode is deleted.
+ */
+void
+vput(struct vnode *vp)
+{
+       ASSERT(vp);
+       ASSERT(vp->v_nrlocks > 0);
+       ASSERT(vp->v_refcnt > 0);
+       DPRINTF(VFSDB_VNODE, ("vput: ref=%d %s\n", vp->v_refcnt, vn_path(vp)));
+
+       VNODE_LOCK();
+       vp->v_refcnt--;
+       if (vp->v_refcnt > 0) {
+           VNODE_UNLOCK();
+               vn_unlock(vp);
+               return;
+       }
+       LIST_REMOVE(vp, v_link);
+       VNODE_UNLOCK();
+
+       /*
+        * Deallocate fs specific vnode data
+        */
+       if (vp->v_op->vop_inactive)
+               VOP_INACTIVE(vp);
+       vfs_unbusy(vp->v_mount);
+       vp->v_nrlocks--;
+       ASSERT(vp->v_nrlocks == 0);
+       mutex_unlock(&vp->v_lock);
+       delete vp;
+}
+
+/*
+ * Increment the reference count on an active vnode.
+ *
+ * The vnode must already hold at least one reference (asserted); the
+ * count is changed under the global vnode lock.
+ */
+void
+vref(struct vnode *vp)
+{
+       ASSERT(vp);
+       ASSERT(vp->v_refcnt > 0); /* Need vget */
+
+       VNODE_LOCK();
+       DPRINTF(VFSDB_VNODE, ("vref: ref=%d\n", vp->v_refcnt)); /* logs the count before the increment */
+       vp->v_refcnt++;
+       VNODE_UNLOCK();
+}
+
+/*
+ * Decrement the reference count of the vnode.
+ * Any code in the system which is using vnode should call vrele()
+ * when it is finished with the vnode.
+ * If the count drops to zero the vnode is removed from the vnode
+ * table, the optional vop_inactive hook is called, the mount is
+ * released with vfs_unbusy() and the vnode is deleted.
+ */
+void
+vrele(struct vnode *vp)
+{
+       ASSERT(vp);
+       ASSERT(vp->v_refcnt > 0);
+
+       VNODE_LOCK();
+       DPRINTF(VFSDB_VNODE, ("vrele: ref=%d\n", vp->v_refcnt));
+       vp->v_refcnt--;
+       if (vp->v_refcnt > 0) {
+               VNODE_UNLOCK();
+               return;
+       }
+       LIST_REMOVE(vp, v_link);
+       VNODE_UNLOCK();
+
+       /*
+        * Deallocate fs specific vnode data.  The hook is optional, so
+        * guard the call the same way vput() does instead of jumping
+        * through a null function pointer.
+        */
+       if (vp->v_op->vop_inactive)
+               VOP_INACTIVE(vp);
+       vfs_unbusy(vp->v_mount);
+       delete vp;
+}
+
+/*
+ * Remove all vnode in the vnode table for unmount.
+ *
+ * NOTE: currently an empty stub; mp is ignored and nothing is removed.
+ */
+void
+vflush(struct mount *mp)
+{
+}
+
+/*
+ * Fill *st from the attributes of vp.
+ * st is zeroed first, then populated from VOP_GETATTR(); the file type
+ * bits of st_mode are derived from vp->v_type.  Returns the
+ * VOP_GETATTR() error, EBADF for a vnode type without an S_IF*
+ * equivalent, or 0.
+ */
+int
+vn_stat(struct vnode *vp, struct stat *st)
+{
+       struct vattr va;
+       mode_t mode;
+       int error;
+
+       memset(st, 0, sizeof(*st));
+       memset(&va, 0, sizeof(va));
+
+       error = VOP_GETATTR(vp, &va);
+       if (error)
+               return error;
+
+       st->st_ino = (ino_t)va.va_nodeid;
+       st->st_size = va.va_size;
+
+       /* Combine the permission bits with the file type bits */
+       mode = va.va_mode;
+       switch (vp->v_type) {
+       case VREG:      mode |= S_IFREG;        break;
+       case VDIR:      mode |= S_IFDIR;        break;
+       case VBLK:      mode |= S_IFBLK;        break;
+       case VCHR:      mode |= S_IFCHR;        break;
+       case VLNK:      mode |= S_IFLNK;        break;
+       case VSOCK:     mode |= S_IFSOCK;       break;
+       case VFIFO:     mode |= S_IFIFO;        break;
+       default:
+               return EBADF;
+       }
+       st->st_mode = mode;
+
+       st->st_nlink = va.va_nlink;
+       st->st_blksize = BSIZE;
+       st->st_blocks = va.va_size / S_BLKSIZE;
+       st->st_uid = va.va_uid;
+       st->st_gid = va.va_gid;
+       st->st_dev = va.va_fsid;
+       /* st_rdev is only filled in for character and block devices */
+       if (vp->v_type == VCHR || vp->v_type == VBLK)
+               st->st_rdev = va.va_rdev;
+
+       st->st_atim = va.va_atime;
+       st->st_mtim = va.va_mtime;
+       st->st_ctim = va.va_ctime;
+
+       return 0;
+}
+
+/*
+ * Set access (times[0]) and modification (times[1]) times of the vnode.
+ * An entry whose tv_nsec is UTIME_OMIT is left out of the attribute
+ * mask.  Returns the result of VOP_SETATTR(), called with vp locked.
+ */
+int
+vn_settimes(struct vnode *vp, struct timespec times[2])
+{
+    struct vattr va;
+    int error;
+
+    memset(&va, 0, sizeof(va));
+    va.va_atime = times[0];
+    va.va_mtime = times[1];
+
+    /* va_mask starts at zero; add one bit per timestamp to apply */
+    if (times[0].tv_nsec != UTIME_OMIT)
+        va.va_mask |= AT_ATIME;
+    if (times[1].tv_nsec != UTIME_OMIT)
+        va.va_mask |= AT_MTIME;
+
+    vn_lock(vp);
+    error = VOP_SETATTR(vp, &va);
+    vn_unlock(vp);
+
+    return error;
+}
+
+/*
+ * Set chmod permissions on the vnode.
+ * With vp locked, v_mode is updated and VOP_SETATTR() is called with
+ * only AT_MODE in the mask; its result is returned.
+ */
+int
+vn_setmode(struct vnode *vp, mode_t new_mode)
+{
+    struct vattr va;
+    int error;
+
+    memset(&va, 0, sizeof(va));
+    va.va_mask = AT_MODE;
+    va.va_mode = new_mode;
+
+    vn_lock(vp);
+    vp->v_mode = new_mode;
+    error = VOP_SETATTR(vp, &va);
+    vn_unlock(vp);
+
+    return error;
+}
+
+/*
+ * Check permission on vnode pointer.
+ * Each requested access (VEXEC, VREAD, VWRITE) needs at least one
+ * matching bit in v_mode (0111, 0444, 0222 respectively); VWRITE is
+ * additionally refused with EROFS on a read-only mount.  Returns 0
+ * when every requested access is allowed, EACCES or EROFS otherwise.
+ */
+int
+vn_access(struct vnode *vp, int flags)
+{
+       if ((flags & VEXEC) && (vp->v_mode & 0111) == 0)
+               return EACCES;
+
+       if ((flags & VREAD) && (vp->v_mode & 0444) == 0)
+               return EACCES;
+
+       if (!(flags & VWRITE))
+               return 0;
+
+       /* The read-only mount check takes precedence over the mode bits */
+       if (vp->v_mount->m_flags & MNT_RDONLY)
+               return EROFS;
+
+       return ((vp->v_mode & 0222) == 0) ? EACCES : 0;
+}
+
+#ifdef DEBUG_VFS
+/*
+ * Dump all vnodes.
+ *
+ * Walks every bucket of vnode_table under the global vnode lock and
+ * prints one line per vnode: vnode address, mount address, type,
+ * reference count and path (mount path followed by vn_path()).
+ */
+void
+vnode_dump(void)
+{
+       int i;
+       struct vnode *vp;
+       struct mount *mp;
+       char type[][6] = { "VNON ", "VREG ", "VDIR ", "VBLK ", "VCHR ",
+                          "VLNK ", "VSOCK", "VFIFO" };
+       const size_t ntypes = sizeof(type) / sizeof(type[0]);
+
+       VNODE_LOCK();
+       kprintf("Dump vnode\n");
+       /* No blkno column: no block number is passed to kprintf below */
+       kprintf(" vnode    mount    type  refcnt path\n");
+       kprintf(" -------- -------- ----- ------ ------------------------------\n");
+
+       for (i = 0; i < VNODE_BUCKETS; i++) {
+               LIST_FOREACH(vp, &vnode_table[i], v_link) {
+                       mp = vp->v_mount;
+
+                       /*
+                        * One conversion per argument; %lx matches the
+                        * u_long casts.  Types beyond the name table
+                        * print as "?????" instead of reading past it.
+                        */
+                       kprintf(" %08lx %08lx %s %6d %s%s\n", (u_long)vp,
+                               (u_long)mp,
+                               ((size_t)vp->v_type < ntypes) ?
+                                       type[vp->v_type] : "?????",
+                               vp->v_refcnt,
+                               (strlen(mp->m_path) == 1) ? "\0" : mp->m_path,
+                               vn_path(vp));
+               }
+       }
+       kprintf("\n");
+       VNODE_UNLOCK();
+}
+#endif
+
+int
+vop_nullop(void)
+{
+       return 0;       /* always reports success */
+}
+
+int
+vop_einval(void)
+{
+       return EINVAL;  /* always fails with EINVAL */
+}
+
+int
+vop_eperm(void)
+{
+       return EPERM;   /* always fails with EPERM */
+}
+
+int
+vop_erofs(void)
+{
+       return EROFS;   /* always fails with EROFS */
+}
+
+/*
+ * Initialise every bucket of the vnode hash table.
+ * vnode_init() is called once (from vfs_init) in initialization.
+ */
+void
+vnode_init(void)
+{
+       int bucket = 0;
+
+       while (bucket < VNODE_BUCKETS) {
+               LIST_INIT(&vnode_table[bucket]);
+               bucket++;
+       }
+}
+
+void vn_add_name(struct vnode *vp, struct dentry *dp)
+{
+       vn_lock(vp);    /* v_names is modified with the vnode locked */
+       LIST_INSERT_HEAD(&vp->v_names, dp, d_names_link);       /* dp becomes the first name of vp */
+       vn_unlock(vp);
+}
+
+void vn_del_name(struct vnode *vp, struct dentry *dp)
+{
+       vn_lock(vp);    /* the names list is modified with the vnode locked */
+       LIST_REMOVE(dp, d_names_link);  /* unlink dp from whichever names list it is on */
+       vn_unlock(vp);
+}
+
_______________________________________________ Minios-devel mailing list Minios-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/minios-devel
©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.