[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[qemu-xen staging-4.13] block: posix: Always allocate the first block



commit 3d018ff3bdd8aec260254036b600cfa8d694ced4
Author:     Nir Soffer <nirsof@xxxxxxxxx>
AuthorDate: Tue Aug 27 04:05:27 2019 +0300
Commit:     Michael Roth <mdroth@xxxxxxxxxxxxxxxxxx>
CommitDate: Tue Nov 12 11:59:58 2019 -0600

    block: posix: Always allocate the first block
    
    When creating an image with preallocation "off" or "falloc", the first
    block of the image is typically not allocated. When using Gluster
    storage backed by XFS filesystem, reading this block using direct I/O
    succeeds regardless of request length, fooling alignment detection.
    
    In this case we fallback to a safe value (4096) instead of the optimal
    value (512), which may lead to unneeded data copying when aligning
    requests.  Allocating the first block avoids the fallback.
    
    Since we allocate the first block even with preallocation=off, we no
    longer create images with zero disk size:
    
        $ ./qemu-img create -f raw test.raw 1g
        Formatting 'test.raw', fmt=raw size=1073741824
    
        $ ls -lhs test.raw
        4.0K -rw-r--r--. 1 nsoffer nsoffer 1.0G Aug 16 23:48 test.raw
    
    And converting the image requires additional cluster:
    
        $ ./qemu-img measure -f raw -O qcow2 test.raw
        required size: 458752
        fully allocated size: 1074135040
    
    When using format like vmdk with multiple files per image, we allocate
    one block per file:
    
        $ ./qemu-img create -f vmdk -o subformat=twoGbMaxExtentFlat test.vmdk 4g
        Formatting 'test.vmdk', fmt=vmdk size=4294967296 compat6=off 
hwversion=undefined subformat=twoGbMaxExtentFlat
    
        $ ls -lhs test*.vmdk
        4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f001.vmdk
        4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f002.vmdk
        4.0K -rw-r--r--. 1 nsoffer nsoffer  353 Aug 27 03:23 test.vmdk
    
    I did quick performance test for copying disks with qemu-img convert to
    new raw target image to Gluster storage with sector size of 512 bytes:
    
        for i in $(seq 10); do
            rm -f dst.raw
            sleep 10
            time ./qemu-img convert -f raw -O raw -t none -T none src.raw 
dst.raw
        done
    
    Here is a table comparing the total time spent:
    
    Type    Before(s)   After(s)    Diff(%)
    ---------------------------------------
    real      530.028    469.123      -11.4
    user       17.204     10.768      -37.4
    sys        17.881      7.011      -60.7
    
    We can see very clear improvement in CPU usage.
    
    Signed-off-by: Nir Soffer <nsoffer@xxxxxxxxxx>
    Message-id: 20190827010528.8818-2-nsoffer@xxxxxxxxxx
    Reviewed-by: Max Reitz <mreitz@xxxxxxxxxx>
    Signed-off-by: Max Reitz <mreitz@xxxxxxxxxx>
    
    (cherry picked from commit 3a20013fbb26d2a1bd11ef148eefdb1508783787)
    
    Signed-off-by: Michael Roth <mdroth@xxxxxxxxxxxxxxxxxx>
---
 block/file-posix.c               | 51 ++++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/059.out       |  2 +-
 tests/qemu-iotests/150.out       | 11 ---------
 tests/qemu-iotests/150.out.qcow2 | 11 +++++++++
 tests/qemu-iotests/150.out.raw   | 12 ++++++++++
 tests/qemu-iotests/175           | 19 ++++++++++-----
 tests/qemu-iotests/175.out       |  8 +++----
 tests/qemu-iotests/178.out.qcow2 |  4 ++--
 tests/qemu-iotests/221.out       | 12 ++++++----
 tests/qemu-iotests/253.out       | 12 ++++++----
 10 files changed, 110 insertions(+), 32 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index be32dd8c51..2184aa980c 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1674,6 +1674,43 @@ static int handle_aiocb_discard(void *opaque)
     return ret;
 }
 
+/*
+ * Help alignment probing by allocating the first block.
+ *
+ * When reading with direct I/O from unallocated area on Gluster backed by XFS,
+ * reading succeeds regardless of request length. In this case we fallback to
+ * safe alignment which is not optimal. Allocating the first block avoids this
+ * fallback.
+ *
+ * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
+ * request alignment, so we use safe values.
+ *
+ * Returns: 0 on success, -errno on failure. Since this is an optimization,
+ * caller may ignore failures.
+ */
+static int allocate_first_block(int fd, size_t max_size)
+{
+    size_t write_size = (max_size < MAX_BLOCKSIZE)
+        ? BDRV_SECTOR_SIZE
+        : MAX_BLOCKSIZE;
+    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
+    void *buf;
+    ssize_t n;
+    int ret;
+
+    buf = qemu_memalign(max_align, write_size);
+    memset(buf, 0, write_size);
+
+    do {
+        n = pwrite(fd, buf, write_size, 0);
+    } while (n == -1 && errno == EINTR);
+
+    ret = (n == -1) ? -errno : 0;
+
+    qemu_vfree(buf);
+    return ret;
+}
+
 static int handle_aiocb_truncate(void *opaque)
 {
     RawPosixAIOData *aiocb = opaque;
@@ -1713,6 +1750,17 @@ static int handle_aiocb_truncate(void *opaque)
                 /* posix_fallocate() doesn't set errno. */
                 error_setg_errno(errp, -result,
                                  "Could not preallocate new data");
+            } else if (current_length == 0) {
+                /*
+                 * posix_fallocate() uses fallocate() if the filesystem
+                 * supports it, or fallback to manually writing zeroes. If
+                 * fallocate() was used, unaligned reads from the fallocated
+                 * area in raw_probe_alignment() will succeed, hence we need to
+                 * allocate the first block.
+                 *
+                 * Optimize future alignment probing; ignore failures.
+                 */
+                allocate_first_block(fd, offset);
             }
         } else {
             result = 0;
@@ -1774,6 +1822,9 @@ static int handle_aiocb_truncate(void *opaque)
         if (ftruncate(fd, offset) != 0) {
             result = -errno;
             error_setg_errno(errp, -result, "Could not resize file");
+        } else if (current_length == 0 && offset > current_length) {
+            /* Optimize future alignment probing; ignore failures. */
+            allocate_first_block(fd, offset);
         }
         return result;
     default:
diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out
index 4fab42a28c..fe3f861f3c 100644
--- a/tests/qemu-iotests/059.out
+++ b/tests/qemu-iotests/059.out
@@ -27,7 +27,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824000 
subformat=twoGbMax
 image: TEST_DIR/t.vmdk
 file format: vmdk
 virtual size: 0.977 TiB (1073741824000 bytes)
-disk size: 16 KiB
+disk size: 1.97 MiB
 Format specific information:
     cid: XXXXXXXX
     parent cid: XXXXXXXX
diff --git a/tests/qemu-iotests/150.out b/tests/qemu-iotests/150.out
deleted file mode 100644
index 2a54e8dcfa..0000000000
--- a/tests/qemu-iotests/150.out
+++ /dev/null
@@ -1,11 +0,0 @@
-QA output created by 150
-
-=== Mapping sparse conversion ===
-
-Offset          Length          File
-
-=== Mapping non-sparse conversion ===
-
-Offset          Length          File
-0               0x100000        TEST_DIR/t.IMGFMT
-*** done
diff --git a/tests/qemu-iotests/150.out.qcow2 b/tests/qemu-iotests/150.out.qcow2
new file mode 100644
index 0000000000..2a54e8dcfa
--- /dev/null
+++ b/tests/qemu-iotests/150.out.qcow2
@@ -0,0 +1,11 @@
+QA output created by 150
+
+=== Mapping sparse conversion ===
+
+Offset          Length          File
+
+=== Mapping non-sparse conversion ===
+
+Offset          Length          File
+0               0x100000        TEST_DIR/t.IMGFMT
+*** done
diff --git a/tests/qemu-iotests/150.out.raw b/tests/qemu-iotests/150.out.raw
new file mode 100644
index 0000000000..3cdc7727a5
--- /dev/null
+++ b/tests/qemu-iotests/150.out.raw
@@ -0,0 +1,12 @@
+QA output created by 150
+
+=== Mapping sparse conversion ===
+
+Offset          Length          File
+0               0x1000          TEST_DIR/t.IMGFMT
+
+=== Mapping non-sparse conversion ===
+
+Offset          Length          File
+0               0x100000        TEST_DIR/t.IMGFMT
+*** done
diff --git a/tests/qemu-iotests/175 b/tests/qemu-iotests/175
index 51e62c8276..7ba28b3c1b 100755
--- a/tests/qemu-iotests/175
+++ b/tests/qemu-iotests/175
@@ -37,14 +37,16 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 # the file size.  This function hides the resulting difference in the
 # stat -c '%b' output.
 # Parameter 1: Number of blocks an empty file occupies
-# Parameter 2: Image size in bytes
+# Parameter 2: Minimal number of blocks in an image
+# Parameter 3: Image size in bytes
 _filter_blocks()
 {
     extra_blocks=$1
-    img_size=$2
+    min_blocks=$2
+    img_size=$3
 
-    sed -e "s/blocks=$extra_blocks\\(\$\\|[^0-9]\\)/nothing allocated/" \
-        -e "s/blocks=$((extra_blocks + img_size / 
512))\\(\$\\|[^0-9]\\)/everything allocated/"
+    sed -e "s/blocks=$min_blocks\\(\$\\|[^0-9]\\)/min allocation/" \
+        -e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/max 
allocation/"
 }
 
 # get standard environment, filters and checks
@@ -60,16 +62,21 @@ size=$((1 * 1024 * 1024))
 touch "$TEST_DIR/empty"
 extra_blocks=$(stat -c '%b' "$TEST_DIR/empty")
 
+# We always write the first byte; check how many blocks this filesystem
+# allocates to match empty image alloation.
+printf "\0" > "$TEST_DIR/empty"
+min_blocks=$(stat -c '%b' "$TEST_DIR/empty")
+
 echo
 echo "== creating image with default preallocation =="
 _make_test_img $size | _filter_imgfmt
-stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
+stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks 
$min_blocks $size
 
 for mode in off full falloc; do
     echo
     echo "== creating image with preallocation $mode =="
     IMGOPTS=preallocation=$mode _make_test_img $size | _filter_imgfmt
-    stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
+    stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks 
$min_blocks $size
 done
 
 # success, all done
diff --git a/tests/qemu-iotests/175.out b/tests/qemu-iotests/175.out
index 6d9a5ed84e..263e521262 100644
--- a/tests/qemu-iotests/175.out
+++ b/tests/qemu-iotests/175.out
@@ -2,17 +2,17 @@ QA output created by 175
 
 == creating image with default preallocation ==
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
-size=1048576, nothing allocated
+size=1048576, min allocation
 
 == creating image with preallocation off ==
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=off
-size=1048576, nothing allocated
+size=1048576, min allocation
 
 == creating image with preallocation full ==
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=full
-size=1048576, everything allocated
+size=1048576, max allocation
 
 == creating image with preallocation falloc ==
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=falloc
-size=1048576, everything allocated
+size=1048576, max allocation
  *** done
diff --git a/tests/qemu-iotests/178.out.qcow2 b/tests/qemu-iotests/178.out.qcow2
index 55a8dc926f..9e7d8c44df 100644
--- a/tests/qemu-iotests/178.out.qcow2
+++ b/tests/qemu-iotests/178.out.qcow2
@@ -101,7 +101,7 @@ converted image file size in bytes: 196608
 == raw input image with data (human) ==
 
 Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
-required size: 393216
+required size: 458752
 fully allocated size: 1074135040
 wrote 512/512 bytes at offset 512
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
@@ -257,7 +257,7 @@ converted image file size in bytes: 196608
 
 Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
 {
-    "required": 393216,
+    "required": 458752,
     "fully-allocated": 1074135040
 }
 wrote 512/512 bytes at offset 512
diff --git a/tests/qemu-iotests/221.out b/tests/qemu-iotests/221.out
index 9f9dd52bb0..dca024a0c3 100644
--- a/tests/qemu-iotests/221.out
+++ b/tests/qemu-iotests/221.out
@@ -3,14 +3,18 @@ QA output created by 221
 === Check mapping of unaligned raw image ===
 
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65537
-[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET}]
-[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET}]
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
+{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET}]
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
+{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET}]
 wrote 1/1 bytes at offset 65536
 1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET},
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
+{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET},
 { "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
 { "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET}]
-[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET},
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
+{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET},
 { "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
 { "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET}]
 *** done
diff --git a/tests/qemu-iotests/253.out b/tests/qemu-iotests/253.out
index 607c0baa0b..3d08b305d7 100644
--- a/tests/qemu-iotests/253.out
+++ b/tests/qemu-iotests/253.out
@@ -3,12 +3,16 @@ QA output created by 253
 === Check mapping of unaligned raw image ===
 
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048575
-[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET}]
-[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET}]
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
+{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET}]
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
+{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET}]
 wrote 65535/65535 bytes at offset 983040
 63.999 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET},
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
+{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET},
 { "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET}]
-[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET},
+[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
+{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, 
"offset": OFFSET},
 { "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET}]
 *** done
--
generated by git-patchbot for /home/xen/git/qemu-xen.git#staging-4.13



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.