[Spice-commits] 122 commits - Makefile.objs block.c block/backup.c block/blkdebug.c block/blkverify.c block/cow.c block/curl.c block/gluster.c block/iscsi.c block/mirror.c block/qapi.c block/qcow.c block/qcow2.c block/qcow2.h block/qed.c block/raw-posix.c block/raw-win32.c block/rbd.c block/sheepdog.c block/stream.c block/vhdx.c block/vmdk.c blockdev.c configure exec.c hmp-commands.hx hmp.c hw/audio hw/block hw/i386 hw/ide hw/scsi hw/usb hw/virtio include/block include/hw include/monitor include/qapi include/qemu include/qemu-io.h kvm-all.c monitor.c qapi-schema.json qemu-doc.texi qemu-img.texi qemu-io-cmds.c qemu-io.c qemu-seccomp.c qmp-commands.hx qobject/qdict.c readline.c scripts/qapi.py target-i386/cpu.c target-i386/cpu.h target-i386/kvm.c target-i386/machine.c tests/Makefile tests/check-qdict.c tests/fdc-test.c tests/ide-test.c tests/qemu-iotests trace-events ui/gtk.c util/Makefile.objs util/error.c util/oslib-posix.c util/oslib-win32.c util/qemu-config.c util/qemu-progress.c util/rea dline.c

Gerd Hoffmann kraxel at kemper.freedesktop.org
Tue Jan 28 01:38:04 PST 2014


 Makefile.objs                |    1 
 block.c                      | 1024 ++++++++++++++++++++++++++++++++-----------
 block/backup.c               |    7 
 block/blkdebug.c             |   81 ++-
 block/blkverify.c            |   31 -
 block/cow.c                  |    3 
 block/curl.c                 |   81 ++-
 block/gluster.c              |  318 +++++++------
 block/iscsi.c                |   67 +-
 block/mirror.c               |   39 +
 block/qapi.c                 |  114 ++--
 block/qcow.c                 |    3 
 block/qcow2.c                |   13 
 block/qcow2.h                |    6 
 block/qed.c                  |   15 
 block/raw-posix.c            |  102 +++-
 block/raw-win32.c            |   41 +
 block/rbd.c                  |  130 -----
 block/sheepdog.c             |   25 -
 block/stream.c               |    2 
 block/vhdx.c                 |    2 
 block/vmdk.c                 |   45 +
 blockdev.c                   |  112 +++-
 configure                    |    8 
 exec.c                       |    2 
 hmp-commands.hx              |    5 
 hmp.c                        |   14 
 hw/audio/hda-codec.c         |   18 
 hw/block/virtio-blk.c        |    2 
 hw/i386/pc_piix.c            |    4 
 hw/ide/core.c                |    2 
 hw/scsi/scsi-bus.c           |    2 
 hw/scsi/scsi-disk.c          |    3 
 hw/scsi/scsi-generic.c       |    2 
 hw/scsi/virtio-scsi.c        |    6 
 hw/usb/Makefile.objs         |    2 
 hw/usb/bus.c                 |    2 
 hw/usb/desc-msos.c           |  234 +++++++++
 hw/usb/desc.c                |   37 +
 hw/usb/desc.h                |   11 
 hw/usb/dev-hid.c             |    8 
 hw/virtio/dataplane/vring.c  |    2 
 include/block/block.h        |   44 +
 include/block/block_int.h    |   48 +-
 include/block/qapi.h         |    1 
 include/hw/i386/pc.h         |    9 
 include/hw/usb.h             |    3 
 include/monitor/monitor.h    |    2 
 include/monitor/readline.h   |   56 --
 include/qapi/qmp/qdict.h     |    1 
 include/qemu-io.h            |    3 
 include/qemu/config-file.h   |    6 
 include/qemu/osdep.h         |    2 
 include/qemu/readline.h      |   62 ++
 kvm-all.c                    |   14 
 monitor.c                    |   41 +
 qapi-schema.json             |  183 ++++++-
 qemu-doc.texi                |    8 
 qemu-img.texi                |   19 
 qemu-io-cmds.c               |   57 ++
 qemu-io.c                    |  119 ++--
 qemu-seccomp.c               |    7 
 qmp-commands.hx              |  117 ++++
 qobject/qdict.c              |   91 +++
 readline.c                   |  493 --------------------
 scripts/qapi.py              |    2 
 target-i386/cpu.c            |    7 
 target-i386/cpu.h            |   25 -
 target-i386/kvm.c            |   69 ++
 target-i386/machine.c        |   51 ++
 tests/Makefile               |    2 
 tests/check-qdict.c          |  156 ++++++
 tests/fdc-test.c             |    5 
 tests/ide-test.c             |    3 
 tests/qemu-iotests/017       |    1 
 tests/qemu-iotests/018       |    1 
 tests/qemu-iotests/019       |    3 
 tests/qemu-iotests/020       |    3 
 tests/qemu-iotests/034       |    3 
 tests/qemu-iotests/037       |    3 
 tests/qemu-iotests/051.out   |    2 
 tests/qemu-iotests/059       |   16 
 tests/qemu-iotests/059.out   |   79 +++
 tests/qemu-iotests/063       |    3 
 tests/qemu-iotests/069       |    1 
 tests/qemu-iotests/071       |  239 ++++++++++
 tests/qemu-iotests/071.out   |   90 +++
 tests/qemu-iotests/072       |   69 ++
 tests/qemu-iotests/072.out   |   21 
 tests/qemu-iotests/077       |  278 +++++++++++
 tests/qemu-iotests/077.out   |  202 ++++++++
 tests/qemu-iotests/common.rc |   28 +
 tests/qemu-iotests/group     |    3 
 trace-events                 |    1 
 ui/gtk.c                     |   18 
 util/Makefile.objs           |    1 
 util/error.c                 |    8 
 util/oslib-posix.c           |   23 
 util/oslib-win32.c           |   19 
 util/qemu-config.c           |  100 ++++
 util/qemu-progress.c         |   11 
 util/readline.c              |  495 ++++++++++++++++++++
 102 files changed, 4609 insertions(+), 1444 deletions(-)

New commits:
commit 0169c511554cb0014a00290b0d3d26c31a49818f
Merge: 1c51e68 439d19f
Author: Anthony Liguori <aliguori at amazon.com>
Date:   Fri Jan 24 15:52:44 2014 -0800

    Merge remote-tracking branch 'qemu-kvm/uq/master' into staging
    
    * qemu-kvm/uq/master:
      kvm: always update the MPX model specific register
      KVM: fix addr type for KVM_IOEVENTFD
      KVM: Retry KVM_CREATE_VM on EINTR
      mempath prefault: fix off-by-one error
      kvm: x86: Separately write feature control MSR on reset
      roms: Flush icache when writing roms to guest memory
      target-i386: clear guest TSC on reset
      target-i386: do not special case TSC writeback
      target-i386: Intel MPX
    
    Conflicts:
    	exec.c
    
    aliguori: fix trivial merge conflict in exec.c
    
    Signed-off-by: Anthony Liguori <aliguori at amazon.com>

commit 1c51e68b182bb335464bb19ad2517fd43c58c127
Merge: 7d64b2c 918b94e
Author: Anthony Liguori <aliguori at amazon.com>
Date:   Fri Jan 24 15:52:16 2014 -0800

    Merge remote-tracking branch 'otubo/seccomp' into staging
    
    * otubo/seccomp:
      seccomp: add some basic shared memory syscalls to the whitelist
      seccomp: add mkdir() and fchmod() to the whitelist
    
    Message-id: 1390231004-18392-1-git-send-email-otubo at linux.vnet.ibm.com
    Signed-off-by: Anthony Liguori <aliguori at amazon.com>

commit 7d64b2c2e22d956b358a97323f0d70060dcd9a06
Merge: 14ac4fe 2777ccc
Author: Anthony Liguori <aliguori at amazon.com>
Date:   Fri Jan 24 15:52:08 2014 -0800

    Merge remote-tracking branch 'sweil/tags/for_anthony' into staging
    
    Initial patch for QEMU GTK support on Windows
    
    # gpg: Signature made Mon 20 Jan 2014 11:37:58 AM PST using RSA key ID FAD62069
    # gpg: Can't check signature: public key not found
    
    * sweil/tags/for_anthony:
      gtk: Support keyboard translation for hosts running Windows
    
    Message-id: 1390246909-18757-1-git-send-email-sw at weilnetz.de
    Signed-off-by: Anthony Liguori <aliguori at amazon.com>

commit 14ac4febb22b4083a5a64b251ab15c94d7d65833
Merge: f4b2779 39e6a38
Author: Anthony Liguori <aliguori at amazon.com>
Date:   Fri Jan 24 15:51:38 2014 -0800

    Merge remote-tracking branch 'kraxel/tags/pull-audio-2' into staging
    
    hda-codec: disable streams on reset
    
    # gpg: Signature made Tue 21 Jan 2014 02:17:12 AM PST using RSA key ID D3E87138
    # gpg: Can't check signature: public key not found
    
    * kraxel/tags/pull-audio-2:
      hda-codec: disable streams on reset
    
    Message-id: 1390299589-5082-1-git-send-email-kraxel at redhat.com
    Signed-off-by: Anthony Liguori <aliguori at amazon.com>

commit f4b27793a8b948178ced486d1d32d1919bea81b2
Merge: e9f526a 88678fb
Author: Anthony Liguori <aliguori at amazon.com>
Date:   Fri Jan 24 15:51:23 2014 -0800

    Merge remote-tracking branch 'kraxel/tags/pull-usb-2' into staging
    
    usb core+hid: add support for microsoft os descriptors
    
    # gpg: Signature made Tue 21 Jan 2014 02:21:29 AM PST using RSA key ID D3E87138
    # gpg: Can't check signature: public key not found
    
    * kraxel/tags/pull-usb-2:
      usb-hid: add microsoft os descriptor support
      usb: add support for microsoft os descriptors
    
    Message-id: 1390299772-5368-1-git-send-email-kraxel at redhat.com
    Signed-off-by: Anthony Liguori <aliguori at amazon.com>

commit e9f526ab7b01662c323a47446e22308968221ac1
Merge: 0d688cf 1cb27d9
Author: Anthony Liguori <aliguori at amazon.com>
Date:   Fri Jan 24 15:50:14 2014 -0800

    Merge remote-tracking branch 'bonzini/scsi-next' into staging
    
    * bonzini/scsi-next:
      scsi: Support TEST UNIT READY in the dummy LUN0
      block: add .bdrv_reopen_prepare() stub for iscsi
      virtio-scsi: Prevent assertion on missed events
      virtio-scsi: Cleanup of I/Os that never started
      scsi: Assign cancel_io vector for scsi_disk_emulate_ops
    
    Conflicts:
    	block/iscsi.c
    
    aliguori: resolve trivial merge conflict in block/iscsi.c
    
    Signed-off-by: Anthony Liguori <aliguori at amazon.com>

diff --cc block/iscsi.c
index 890bd81,5976bd1..6f4af72
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@@ -1310,31 -1326,11 +1310,38 @@@ static void iscsi_close(BlockDriverStat
      memset(iscsilun, 0, sizeof(IscsiLun));
  }
  
 +static int iscsi_refresh_limits(BlockDriverState *bs)
 +{
 +    IscsiLun *iscsilun = bs->opaque;
 +
 +    /* We don't actually refresh here, but just return data queried in
 +     * iscsi_open(): iscsi targets don't change their limits. */
 +    if (iscsilun->lbp.lbpu || iscsilun->lbp.lbpws) {
 +        if (iscsilun->bl.max_unmap < 0xffffffff) {
 +            bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap,
 +                                                 iscsilun);
 +        }
 +        bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
 +                                                   iscsilun);
 +
 +        if (iscsilun->bl.max_ws_len < 0xffffffff) {
 +            bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len,
 +                                                      iscsilun);
 +        }
 +        bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
 +                                                        iscsilun);
 +
 +        bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len,
 +                                                     iscsilun);
 +    }
++    return 0;
++}
 +
+ /* We have nothing to do for iSCSI reopen, stub just returns
+  * success */
+ static int iscsi_reopen_prepare(BDRVReopenState *state,
+                                 BlockReopenQueue *queue, Error **errp)
+ {
      return 0;
  }
  
commit 0d688cf7d8d71bce2aab83173552a784e96b6729
Merge: 732c66c d510358
Author: Anthony Liguori <aliguori at amazon.com>
Date:   Fri Jan 24 15:43:30 2014 -0800

    Merge remote-tracking branch 'kwolf/tags/for-anthony' into staging
    
    Block patches
    
    # gpg: Signature made Fri 24 Jan 2014 08:40:53 AM PST using RSA key ID C88F2FD6
    # gpg: Can't check signature: public key not found
    
    * kwolf/tags/for-anthony: (93 commits)
      block: Switch bdrv_io_limits_intercept() to byte granularity
      qemu-iotests: Test pwritev RMW logic
      qemu-io: New command 'sleep'
      blkdebug: Make required alignment configurable
      iscsi: Set bs->request_alignment
      block: Make bdrv_pwrite() a bdrv_prwv_co() wrapper
      block: Make bdrv_pread() a bdrv_prwv_co() wrapper
      block: Change coroutine wrapper to byte granularity
      block: Assert serialisation assumptions in pwritev
      block: Align requests in bdrv_co_do_pwritev()
      block: Allow wait_serialising_requests() at any point
      block: Make overlap range for serialisation dynamic
      block: Generalise and optimise COR serialisation
      block: Make zero-after-EOF work with larger alignment
      block: Allow waiting for overlapping requests between begin/end
      block: Switch BdrvTrackedRequest to byte granularity
      block: Introduce bdrv_co_do_pwritev()
      block: write: Handle COR dependency after I/O throttling
      block: Introduce bdrv_aligned_pwritev()
      block: Introduce bdrv_co_do_preadv()
      ...
    
    Message-id: 1390584136-24703-1-git-send-email-kwolf at redhat.com
    Signed-off-by: Anthony Liguori <aliguori at amazon.com>

commit d5103588aa39157c8eea3bb5fb6780bbd8be21b7
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Thu Jan 16 13:29:10 2014 +0100

    block: Switch bdrv_io_limits_intercept() to byte granularity
    
    Request sizes used to be rounded down to the next sector boundary,
    allowing to bypass the I/O limit. Now all requests are accounted for
    with their exact byte size.
    
    Reported-by: Wenchao Xia <xiawenc at linux.vnet.ibm.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>

diff --git a/block.c b/block.c
index 932ce58..cb21a5f 100644
--- a/block.c
+++ b/block.c
@@ -192,7 +192,7 @@ void bdrv_io_limits_enable(BlockDriverState *bs)
  * @is_write:   is the IO a write
  */
 static void bdrv_io_limits_intercept(BlockDriverState *bs,
-                                     int nb_sectors,
+                                     unsigned int bytes,
                                      bool is_write)
 {
     /* does this io must wait */
@@ -205,9 +205,8 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
     }
 
     /* the IO will be executed, do the accounting */
-    throttle_account(&bs->throttle_state,
-                     is_write,
-                     nb_sectors * BDRV_SECTOR_SIZE);
+    throttle_account(&bs->throttle_state, is_write, bytes);
+
 
     /* if the next request must wait -> do nothing */
     if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
@@ -2968,8 +2967,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        /* TODO Switch to byte granularity */
-        bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, false);
+        bdrv_io_limits_intercept(bs, bytes, false);
     }
 
     /* Align read if necessary by padding qiov */
@@ -3193,8 +3191,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        /* TODO Switch to byte granularity */
-        bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, true);
+        bdrv_io_limits_intercept(bs, bytes, true);
     }
 
     /*
commit 9e1cb96d9a5e434f389a4d7b7ff4dcdd71e8ec0f
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Tue Jan 14 15:37:03 2014 +0100

    qemu-iotests: Test pwritev RMW logic
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>

diff --git a/block.c b/block.c
index 179d27a..932ce58 100644
--- a/block.c
+++ b/block.c
@@ -3141,10 +3141,13 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     if (ret < 0) {
         /* Do nothing, write notifier decided to fail this request */
     } else if (flags & BDRV_REQ_ZERO_WRITE) {
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
     } else {
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
     }
+    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
 
     if (ret == 0 && !bs->enable_write_cache) {
         ret = bdrv_co_flush(bs);
@@ -3215,11 +3218,13 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
         };
         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
 
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                   align, &head_qiov, 0);
         if (ret < 0) {
             goto fail;
         }
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
 
         qemu_iovec_init(&local_qiov, qiov->niov + 2);
         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
@@ -3247,11 +3252,13 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
         };
         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
 
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                   align, &tail_qiov, 0);
         if (ret < 0) {
             goto fail;
         }
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
 
         if (!use_local_qiov) {
             qemu_iovec_init(&local_qiov, qiov->niov + 1);
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 2c03698..56c4cd0 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -186,6 +186,14 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
 
     [BLKDBG_FLUSH_TO_OS]                    = "flush_to_os",
     [BLKDBG_FLUSH_TO_DISK]                  = "flush_to_disk",
+
+    [BLKDBG_PWRITEV_RMW_HEAD]               = "pwritev_rmw.head",
+    [BLKDBG_PWRITEV_RMW_AFTER_HEAD]         = "pwritev_rmw.after_head",
+    [BLKDBG_PWRITEV_RMW_TAIL]               = "pwritev_rmw.tail",
+    [BLKDBG_PWRITEV_RMW_AFTER_TAIL]         = "pwritev_rmw.after_tail",
+    [BLKDBG_PWRITEV]                        = "pwritev",
+    [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
+    [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
 };
 
 static int get_event_by_name(const char *name, BlkDebugEvent *event)
diff --git a/include/block/block.h b/include/block/block.h
index 1085992..963a61f 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -527,6 +527,14 @@ typedef enum {
     BLKDBG_FLUSH_TO_OS,
     BLKDBG_FLUSH_TO_DISK,
 
+    BLKDBG_PWRITEV_RMW_HEAD,
+    BLKDBG_PWRITEV_RMW_AFTER_HEAD,
+    BLKDBG_PWRITEV_RMW_TAIL,
+    BLKDBG_PWRITEV_RMW_AFTER_TAIL,
+    BLKDBG_PWRITEV,
+    BLKDBG_PWRITEV_ZERO,
+    BLKDBG_PWRITEV_DONE,
+
     BLKDBG_EVENT_MAX,
 } BlkDebugEvent;
 
diff --git a/tests/qemu-iotests/077 b/tests/qemu-iotests/077
new file mode 100755
index 0000000..bbf7b51
--- /dev/null
+++ b/tests/qemu-iotests/077
@@ -0,0 +1,278 @@
+#!/bin/bash
+#
+# Test concurrent pread/pwrite
+#
+# Copyright (C) 2014 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=kwolf at redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt generic
+_supported_proto generic
+_supported_os Linux
+
+CLUSTER_SIZE=4k
+size=128M
+
+_make_test_img $size
+
+echo
+echo "== Some concurrent requests involving RMW =="
+
+function test_io()
+{
+echo "open -o file.align=4k blkdebug::$TEST_IMG"
+# A simple RMW request
+cat  <<EOF
+aio_write -P 10 0x200 0x200
+aio_flush
+EOF
+
+# Sequential RMW requests on the same physical sector
+off=0x1000
+for ev in "head" "after_head" "tail" "after_tail"; do
+cat  <<EOF
+break pwritev_rmw.$ev A
+aio_write -P 10 $((off + 0x200)) 0x200
+wait_break A
+aio_write -P 11 $((off + 0x400)) 0x200
+sleep 100
+resume A
+aio_flush
+EOF
+off=$((off + 0x1000))
+done
+
+# Chained dependencies
+cat  <<EOF
+break pwritev_rmw.after_tail A
+aio_write -P 10 0x5000 0x200
+wait_break A
+aio_write -P 11 0x5200 0x200
+aio_write -P 12 0x5400 0x200
+aio_write -P 13 0x5600 0x200
+aio_write -P 14 0x5800 0x200
+aio_write -P 15 0x5a00 0x200
+aio_write -P 16 0x5c00 0x200
+aio_write -P 17 0x5e00 0x200
+sleep 100
+resume A
+aio_flush
+EOF
+
+# Overlapping multiple requests
+cat  <<EOF
+break pwritev_rmw.after_tail A
+aio_write -P 10 0x6000 0x200
+wait_break A
+break pwritev_rmw.after_head B
+aio_write -P 10 0x7e00 0x200
+wait_break B
+aio_write -P 11 0x6800 0x1000
+resume A
+sleep 100
+resume B
+aio_flush
+EOF
+
+cat  <<EOF
+break pwritev_rmw.after_tail A
+aio_write -P 10 0x8000 0x200
+wait_break A
+break pwritev_rmw.after_head B
+aio_write -P 10 0x9e00 0x200
+wait_break B
+aio_write -P 11 0x8800 0x1000
+resume B
+sleep 100
+resume A
+aio_flush
+EOF
+
+cat  <<EOF
+break pwritev_rmw.after_tail A
+aio_write -P 10 0xa000 0x200
+wait_break A
+aio_write -P 11 0xa800 0x1000
+break pwritev_rmw.after_head B
+aio_write -P 10 0xbe00 0x200
+wait_break B
+resume A
+sleep 100
+resume B
+aio_flush
+EOF
+
+cat  <<EOF
+break pwritev_rmw.after_tail A
+aio_write -P 10 0xc000 0x200
+wait_break A
+aio_write -P 11 0xc800 0x1000
+break pwritev_rmw.after_head B
+aio_write -P 10 0xde00 0x200
+wait_break B
+resume B
+sleep 100
+resume A
+aio_flush
+EOF
+
+# Only RMW for the tail part
+cat  <<EOF
+break pwritev_rmw.after_tail A
+aio_write -P 10 0xe000 0x1800
+wait_break A
+aio_write -P 11 0xf000 0xc00
+sleep 100
+resume A
+aio_flush
+EOF
+
+cat  <<EOF
+break pwritev A
+aio_write -P 10 0x10000 0x800
+wait_break A
+break pwritev_rmw.after_tail B
+aio_write -P 11 0x10000 0x400
+break pwritev_done C
+resume A
+wait_break C
+resume C
+sleep 100
+wait_break B
+resume B
+aio_flush
+EOF
+
+cat  <<EOF
+break pwritev A
+aio_write -P 10 0x11000 0x800
+wait_break A
+aio_write -P 11 0x11000 0x1000
+sleep 100
+resume A
+aio_flush
+EOF
+}
+
+test_io | $QEMU_IO  | _filter_qemu_io | \
+    sed -e 's,[0-9/]* bytes at offset [0-9]*,XXX/XXX bytes at offset XXX,g' \
+        -e 's/^[0-9]* \(bytes\|KiB\)/XXX bytes/' \
+        -e '/Suspended/d'
+
+echo
+echo "== Verify image content =="
+
+function verify_io()
+{
+    # A simple RMW request
+    echo read -P 0       0 0x200
+    echo read -P 10  0x200 0x200
+    echo read -P 0   0x400 0xc00
+
+    # Sequential RMW requests on the same physical sector
+    echo read -P 0  0x1000 0x200
+    echo read -P 10 0x1200 0x200
+    echo read -P 11 0x1400 0x200
+    echo read -P 0  0x1600 0xa00
+
+    echo read -P 0  0x2000 0x200
+    echo read -P 10 0x2200 0x200
+    echo read -P 11 0x2400 0x200
+    echo read -P 0  0x2600 0xa00
+
+    echo read -P 0  0x3000 0x200
+    echo read -P 10 0x3200 0x200
+    echo read -P 11 0x3400 0x200
+    echo read -P 0  0x3600 0xa00
+
+    echo read -P 0  0x4000 0x200
+    echo read -P 10 0x4200 0x200
+    echo read -P 11 0x4400 0x200
+    echo read -P 0  0x4600 0xa00
+
+    # Chained dependencies
+    echo read -P 10 0x5000 0x200
+    echo read -P 11 0x5200 0x200
+    echo read -P 12 0x5400 0x200
+    echo read -P 13 0x5600 0x200
+    echo read -P 14 0x5800 0x200
+    echo read -P 15 0x5a00 0x200
+    echo read -P 16 0x5c00 0x200
+    echo read -P 17 0x5e00 0x200
+
+    # Overlapping multiple requests
+    echo read -P 10 0x6000 0x200
+    echo read -P  0 0x6200 0x600
+    echo read -P 11 0x6800 0x1000
+    echo read -P  0 0x7800 0x600
+    echo read -P 10 0x7e00 0x200
+
+    echo read -P 10 0x8000 0x200
+    echo read -P  0 0x8200 0x600
+    echo read -P 11 0x8800 0x1000
+    echo read -P  0 0x9800 0x600
+    echo read -P 10 0x9e00 0x200
+
+    echo read -P 10 0xa000 0x200
+    echo read -P  0 0xa200 0x600
+    echo read -P 11 0xa800 0x1000
+    echo read -P  0 0xb800 0x600
+    echo read -P 10 0xbe00 0x200
+
+    echo read -P 10 0xc000 0x200
+    echo read -P  0 0xc200 0x600
+    echo read -P 11 0xc800 0x1000
+    echo read -P  0 0xd800 0x600
+    echo read -P 10 0xde00 0x200
+
+    # Only RMW for the tail part
+    echo read -P 10 0xe000 0x1000
+    echo read -P 11 0xf800 0x400
+    echo read -P  0 0xfc00 0x400
+
+    echo read -P 11 0x10000 0x400
+    echo read -P 10 0x10400 0x400
+
+    echo read -P 11 0x11800 0x800
+}
+
+verify_io | $QEMU_IO "$TEST_IMG" | _filter_qemu_io
+
+_check_test_img
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/077.out b/tests/qemu-iotests/077.out
new file mode 100644
index 0000000..ab61234
--- /dev/null
+++ b/tests/qemu-iotests/077.out
@@ -0,0 +1,202 @@
+QA output created by 077
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
+
+== Some concurrent requests involving RMW ==
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'B'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'B'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'B'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'B'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+blkdebug: Resuming request 'C'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'B'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkdebug: Resuming request 'A'
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== Verify image content ==
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 512
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 3072/3072 bytes at offset 1024
+3 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 4096
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 4608
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 5120
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2560/2560 bytes at offset 5632
+2.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 8192
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 8704
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 9216
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2560/2560 bytes at offset 9728
+2.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 12288
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 12800
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 13312
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2560/2560 bytes at offset 13824
+2.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 16384
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 16896
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 17408
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2560/2560 bytes at offset 17920
+2.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 20480
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 20992
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 21504
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 22016
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 22528
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 23040
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 23552
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 24064
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 24576
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1536/1536 bytes at offset 25088
+1.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 26624
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1536/1536 bytes at offset 30720
+1.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 32256
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 32768
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1536/1536 bytes at offset 33280
+1.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 34816
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1536/1536 bytes at offset 38912
+1.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 40448
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 40960
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1536/1536 bytes at offset 41472
+1.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 43008
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1536/1536 bytes at offset 47104
+1.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 48640
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 49152
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1536/1536 bytes at offset 49664
+1.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 51200
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1536/1536 bytes at offset 55296
+1.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 56832
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 57344
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 63488
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 64512
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 65536
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 66560
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 2048/2048 bytes at offset 71680
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+No errors were found on the image.
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 5860c40..03c762f 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -81,3 +81,4 @@
 072 rw auto
 073 rw auto
 074 rw auto
+077 rw auto
commit cd33d02a1012e58ee0d3c8259159e8c60cfa0a4d
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Wed Jan 15 15:39:10 2014 +0100

    qemu-io: New command 'sleep'
    
    There is no easy way to check that a request correctly waits for a
    different request. With a sleep command we can at least approximate it.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 6dfb4a5..f1de24c 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -12,6 +12,7 @@
 #include "block/block_int.h"
 #include "block/qapi.h"
 #include "qemu/main-loop.h"
+#include "qemu/timer.h"
 
 #define CMD_NOFILE_OK   0x01
 
@@ -2053,6 +2054,46 @@ static const cmdinfo_t abort_cmd = {
        .oneline        = "simulate a program crash using abort(3)",
 };
 
+static void sleep_cb(void *opaque)
+{
+    bool *expired = opaque;
+    *expired = true;
+}
+
+static int sleep_f(BlockDriverState *bs, int argc, char **argv)
+{
+    char *endptr;
+    long ms;
+    struct QEMUTimer *timer;
+    bool expired = false;
+
+    ms = strtol(argv[1], &endptr, 0);
+    if (ms < 0 || *endptr != '\0') {
+        printf("%s is not a valid number\n", argv[1]);
+        return 0;
+    }
+
+    timer = timer_new_ns(QEMU_CLOCK_HOST, sleep_cb, &expired);
+    timer_mod(timer, qemu_clock_get_ns(QEMU_CLOCK_HOST) + SCALE_MS * ms);
+
+    while (!expired) {
+        main_loop_wait(false);
+    }
+
+    timer_free(timer);
+
+    return 0;
+}
+
+static const cmdinfo_t sleep_cmd = {
+       .name           = "sleep",
+       .argmin         = 1,
+       .argmax         = 1,
+       .cfunc          = sleep_f,
+       .flags          = CMD_NOFILE_OK,
+       .oneline        = "waits for the given value in milliseconds",
+};
+
 static void help_oneline(const char *cmd, const cmdinfo_t *ct)
 {
     if (cmd) {
@@ -2166,4 +2207,5 @@ static void __attribute((constructor)) init_qemuio_commands(void)
     qemuio_add_command(&resume_cmd);
     qemuio_add_command(&wait_break_cmd);
     qemuio_add_command(&abort_cmd);
+    qemuio_add_command(&sleep_cmd);
 }
commit b35ee7fb2308e09092488029b5a9e456ce61bbe6
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Tue Jan 14 13:44:35 2014 +0100

    blkdebug: Make required alignment configurable
    
    The new 'align' option of blkdebug can be used in order to emulate
    backends with a required 4k alignment on hosts which only really require
    512 byte alignment.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/blkdebug.c b/block/blkdebug.c
index c8f8d56..2c03698 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -364,6 +364,11 @@ static QemuOptsList runtime_opts = {
             .type = QEMU_OPT_STRING,
             .help = "[internal use only, will be removed]",
         },
+        {
+            .name = "align",
+            .type = QEMU_OPT_SIZE,
+            .help = "Required alignment in bytes",
+        },
         { /* end of list */ }
     },
 };
@@ -375,6 +380,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     QemuOpts *opts;
     Error *local_err = NULL;
     const char *config;
+    uint64_t align;
     int ret;
 
     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
@@ -403,6 +409,16 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
+    /* Set request alignment */
+    align = qemu_opt_get_size(opts, "align", bs->request_alignment);
+    if (align > 0 && align < INT_MAX && !(align & (align - 1))) {
+        bs->request_alignment = align;
+    } else {
+        error_setg(errp, "Invalid alignment");
+        ret = -EINVAL;
+        goto fail;
+    }
+
     ret = 0;
 fail:
     qemu_opts_del(opts);
diff --git a/qapi-schema.json b/qapi-schema.json
index 1ff607a..05ced9d 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -4321,6 +4321,8 @@
 #
 # @config:          #optional filename of the configuration file
 #
+# @align:           #optional required alignment for requests in bytes
+#
 # @inject-error:    #optional array of error injection descriptions
 #
 # @set-state:       #optional array of state-change descriptions
@@ -4330,6 +4332,7 @@
 { 'type': 'BlockdevOptionsBlkdebug',
   'data': { 'image': 'BlockdevRef',
             '*config': 'str',
+            '*align': 'int',
             '*inject-error': ['BlkdebugInjectErrorOptions'],
             '*set-state': ['BlkdebugSetStateOptions'] } }
 
commit 2c9880c45e2f9a98d11d44ce9966515c23870a86
Author: Paolo Bonzini <pbonzini at redhat.com>
Date:   Tue Nov 29 12:41:35 2011 +0100

    iscsi: Set bs->request_alignment
    
    The iSCSI backend already gets the block size from the READ CAPACITY
    command it sends.  Save it so that the generic block layer gets it
    too.
    
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>

diff --git a/block/iscsi.c b/block/iscsi.c
index c8bf8dc..890bd81 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1217,6 +1217,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
         goto out;
     }
     bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun);
+    bs->request_alignment = iscsilun->block_size;
 
     /* Medium changer or tape. We dont have any emulation for this so this must
      * be sg ioctl compatible. We force it to be sg, otherwise qemu will try
commit 8407d5d7e265911b05949ee2ffd9e45c97bf0505
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Thu Dec 5 12:34:02 2013 +0100

    block: Make bdrv_pwrite() a bdrv_prwv_co() wrapper
    
    Instead of implementing the alignment adjustment here, use the now
    existing functionality of bdrv_co_do_pwritev().
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>

diff --git a/block.c b/block.c
index 07114f3..179d27a 100644
--- a/block.c
+++ b/block.c
@@ -2667,11 +2667,6 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num,
     return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 }
 
-int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
-{
-    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, qiov, true, 0);
-}
-
 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
                       int nb_sectors, BdrvRequestFlags flags)
 {
@@ -2745,70 +2740,29 @@ int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
 
 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
 {
-    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
-    int len, nb_sectors, count;
-    int64_t sector_num;
     int ret;
 
-    count = qiov->size;
-
-    /* first write to align to sector start */
-    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
-    if (len > count)
-        len = count;
-    sector_num = offset >> BDRV_SECTOR_BITS;
-    if (len > 0) {
-        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-        qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)),
-                          len);
-        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-        count -= len;
-        if (count == 0)
-            return qiov->size;
-        sector_num++;
-    }
-
-    /* write the sectors "in place" */
-    nb_sectors = count >> BDRV_SECTOR_BITS;
-    if (nb_sectors > 0) {
-        QEMUIOVector qiov_inplace;
-
-        qemu_iovec_init(&qiov_inplace, qiov->niov);
-        qemu_iovec_concat(&qiov_inplace, qiov, len,
-                          nb_sectors << BDRV_SECTOR_BITS);
-        ret = bdrv_writev(bs, sector_num, &qiov_inplace);
-        qemu_iovec_destroy(&qiov_inplace);
-        if (ret < 0) {
-            return ret;
-        }
-
-        sector_num += nb_sectors;
-        len = nb_sectors << BDRV_SECTOR_BITS;
-        count -= len;
+    ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
+    if (ret < 0) {
+        return ret;
     }
 
-    /* add data from the last sector */
-    if (count > 0) {
-        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-        qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count);
-        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-    }
     return qiov->size;
 }
 
 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
-                const void *buf, int count1)
+                const void *buf, int bytes)
 {
     QEMUIOVector qiov;
     struct iovec iov = {
         .iov_base   = (void *) buf,
-        .iov_len    = count1,
+        .iov_len    = bytes,
     };
 
+    if (bytes < 0) {
+        return -EINVAL;
+    }
+
     qemu_iovec_init_external(&qiov, &iov, 1);
     return bdrv_pwritev(bs, offset, &qiov);
 }
diff --git a/include/block/block.h b/include/block/block.h
index a2f5657..1085992 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -224,7 +224,6 @@ BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, int64_t sector_num
                                         int nb_sectors, BdrvRequestFlags flags,
                                         BlockDriverCompletionFunc *cb, void *opaque);
 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags);
-int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov);
 int bdrv_pread(BlockDriverState *bs, int64_t offset,
                void *buf, int count);
 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
commit a3ef65718506fb94cb9e5a903ef9bf9ad8fbe6de
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Thu Dec 5 12:29:59 2013 +0100

    block: Make bdrv_pread() a bdrv_prwv_co() wrapper
    
    Instead of implementing the alignment adjustment here, use the now
    existing functionality of bdrv_co_do_preadv().
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>

diff --git a/block.c b/block.c
index fefa3f2..07114f3 100644
--- a/block.c
+++ b/block.c
@@ -2721,49 +2721,26 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
     }
 }
 
-int bdrv_pread(BlockDriverState *bs, int64_t offset,
-               void *buf, int count1)
+int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
 {
-    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
-    int len, nb_sectors, count;
-    int64_t sector_num;
+    QEMUIOVector qiov;
+    struct iovec iov = {
+        .iov_base = (void *)buf,
+        .iov_len = bytes,
+    };
     int ret;
 
-    count = count1;
-    /* first read to align to sector start */
-    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
-    if (len > count)
-        len = count;
-    sector_num = offset >> BDRV_SECTOR_BITS;
-    if (len > 0) {
-        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
-        count -= len;
-        if (count == 0)
-            return count1;
-        sector_num++;
-        buf += len;
+    if (bytes < 0) {
+        return -EINVAL;
     }
 
-    /* read the sectors "in place" */
-    nb_sectors = count >> BDRV_SECTOR_BITS;
-    if (nb_sectors > 0) {
-        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
-            return ret;
-        sector_num += nb_sectors;
-        len = nb_sectors << BDRV_SECTOR_BITS;
-        buf += len;
-        count -= len;
+    qemu_iovec_init_external(&qiov, &iov, 1);
+    ret = bdrv_prwv_co(bs, offset, &qiov, false, 0);
+    if (ret < 0) {
+        return ret;
     }
 
-    /* add data from the last sector */
-    if (count > 0) {
-        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
-            return ret;
-        memcpy(buf, tmp_buf, count);
-    }
-    return count1;
+    return bytes;
 }
 
 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
commit 775aa8b6e0ea25f8cca74d0fcb1e30a764cf624f
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Thu Dec 5 12:09:38 2013 +0100

    block: Change coroutine wrapper to byte granularity
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>

diff --git a/block.c b/block.c
index e1b2c8d..fefa3f2 100644
--- a/block.c
+++ b/block.c
@@ -70,11 +70,11 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
-static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags);
-static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags);
 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                                int64_t sector_num,
@@ -2554,8 +2554,7 @@ static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
 
 typedef struct RwCo {
     BlockDriverState *bs;
-    int64_t sector_num;
-    int nb_sectors;
+    int64_t offset;
     QEMUIOVector *qiov;
     bool is_write;
     int ret;
@@ -2567,34 +2566,32 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque)
     RwCo *rwco = opaque;
 
     if (!rwco->is_write) {
-        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
-                                     rwco->nb_sectors, rwco->qiov,
-                                     rwco->flags);
-    } else {
-        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
-                                      rwco->nb_sectors, rwco->qiov,
+        rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset,
+                                      rwco->qiov->size, rwco->qiov,
                                       rwco->flags);
+    } else {
+        rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset,
+                                       rwco->qiov->size, rwco->qiov,
+                                       rwco->flags);
     }
 }
 
 /*
  * Process a vectored synchronous request using coroutines
  */
-static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
-                       QEMUIOVector *qiov, bool is_write,
-                       BdrvRequestFlags flags)
+static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
+                        QEMUIOVector *qiov, bool is_write,
+                        BdrvRequestFlags flags)
 {
     Coroutine *co;
     RwCo rwco = {
         .bs = bs,
-        .sector_num = sector_num,
-        .nb_sectors = qiov->size >> BDRV_SECTOR_BITS,
+        .offset = offset,
         .qiov = qiov,
         .is_write = is_write,
         .ret = NOT_DONE,
         .flags = flags,
     };
-    assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0);
 
     /**
      * In sync call context, when the vcpu is blocked, this throttling timer
@@ -2633,7 +2630,8 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
     };
 
     qemu_iovec_init_external(&qiov, &iov, 1);
-    return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags);
+    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS,
+                        &qiov, is_write, flags);
 }
 
 /* return < 0 if error. See bdrv_write() for the return codes */
@@ -2671,7 +2669,7 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num,
 
 int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
 {
-    return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
+    return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, qiov, true, 0);
 }
 
 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
@@ -4856,9 +4854,15 @@ int bdrv_flush(BlockDriverState *bs)
     return rwco.ret;
 }
 
+typedef struct DiscardCo {
+    BlockDriverState *bs;
+    int64_t sector_num;
+    int nb_sectors;
+    int ret;
+} DiscardCo;
 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
 {
-    RwCo *rwco = opaque;
+    DiscardCo *rwco = opaque;
 
     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
 }
@@ -4942,7 +4946,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
 {
     Coroutine *co;
-    RwCo rwco = {
+    DiscardCo rwco = {
         .bs = bs,
         .sector_num = sector_num,
         .nb_sectors = nb_sectors,
commit 28de2dcd88de31f50bbd43d9c2fcb046c3a727cb
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Tue Jan 14 11:41:35 2014 +0100

    block: Assert serialisation assumptions in pwritev
    
    If a request calls wait_serialising_requests() and actually has to wait
    in this function (i.e. a coroutine yield), other requests can run and
    previously read data (like the head or tail buffer) could become
    outdated. In this case, we would have to restart from the beginning to
    read in the updated data.
    
    However, we're lucky and don't actually need to do that: A request can
    only wait in the first call of wait_serialising_requests() because we
    mark it as serialising before that call, so any later requests would
    wait. So as we don't wait in practice, we don't have to reload the data.
    
    This is an important assumption that may not be broken or data
    corruption will happen. Document it with some assertions.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>

diff --git a/block.c b/block.c
index 04a3c5a..e1b2c8d 100644
--- a/block.c
+++ b/block.c
@@ -2303,14 +2303,15 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req,
     return true;
 }
 
-static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
+static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 {
     BlockDriverState *bs = self->bs;
     BdrvTrackedRequest *req;
     bool retry;
+    bool waited = false;
 
     if (!bs->serialising_in_flight) {
-        return;
+        return false;
     }
 
     do {
@@ -2336,11 +2337,14 @@ static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
                     qemu_co_queue_wait(&req->wait_queue);
                     self->waiting_for = NULL;
                     retry = true;
+                    waited = true;
                     break;
                 }
             }
         }
     } while (retry);
+
+    return waited;
 }
 
 /*
@@ -3191,6 +3195,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     QEMUIOVector *qiov, int flags)
 {
     BlockDriver *drv = bs->drv;
+    bool waited;
     int ret;
 
     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
@@ -3199,7 +3204,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
-    wait_serialising_requests(req);
+    waited = wait_serialising_requests(req);
+    assert(!waited || !req->serialising);
 
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
 
@@ -3299,9 +3305,11 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
         QEMUIOVector tail_qiov;
         struct iovec tail_iov;
         size_t tail_bytes;
+        bool waited;
 
         mark_request_serialising(&req, align);
-        wait_serialising_requests(&req);
+        waited = wait_serialising_requests(&req);
+        assert(!waited || !use_local_qiov);
 
         tail_buf = qemu_blockalign(bs, align);
         tail_iov = (struct iovec) {
commit 3b8242e0ea2a2c201ef3d1bd24080490dae33080
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Tue Dec 3 16:34:41 2013 +0100

    block: Align requests in bdrv_co_do_pwritev()
    
    This patch changes bdrv_co_do_pwritev() to actually be what its name
    promises. If requests aren't properly aligned, it performs a RMW.
    
    Requests touching the same block are serialised against the RMW request.
    Further optimisation of this is possible by differentiating types of
    requests (concurrent reads should actually be okay here).
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 9690585..04a3c5a 100644
--- a/block.c
+++ b/block.c
@@ -3235,6 +3235,12 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
     BdrvRequestFlags flags)
 {
     BdrvTrackedRequest req;
+    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+    uint8_t *head_buf = NULL;
+    uint8_t *tail_buf = NULL;
+    QEMUIOVector local_qiov;
+    bool use_local_qiov = false;
     int ret;
 
     if (!bs->drv) {
@@ -3253,10 +3259,88 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
         bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, true);
     }
 
+    /*
+     * Align write if necessary by performing a read-modify-write cycle.
+     * Pad qiov with the read parts and be sure to have a tracked request not
+     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
+     */
     tracked_request_begin(&req, bs, offset, bytes, true);
-    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, qiov, flags);
+
+    if (offset & (align - 1)) {
+        QEMUIOVector head_qiov;
+        struct iovec head_iov;
+
+        mark_request_serialising(&req, align);
+        wait_serialising_requests(&req);
+
+        head_buf = qemu_blockalign(bs, align);
+        head_iov = (struct iovec) {
+            .iov_base   = head_buf,
+            .iov_len    = align,
+        };
+        qemu_iovec_init_external(&head_qiov, &head_iov, 1);
+
+        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
+                                  align, &head_qiov, 0);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        qemu_iovec_init(&local_qiov, qiov->niov + 2);
+        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
+        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+        use_local_qiov = true;
+
+        bytes += offset & (align - 1);
+        offset = offset & ~(align - 1);
+    }
+
+    if ((offset + bytes) & (align - 1)) {
+        QEMUIOVector tail_qiov;
+        struct iovec tail_iov;
+        size_t tail_bytes;
+
+        mark_request_serialising(&req, align);
+        wait_serialising_requests(&req);
+
+        tail_buf = qemu_blockalign(bs, align);
+        tail_iov = (struct iovec) {
+            .iov_base   = tail_buf,
+            .iov_len    = align,
+        };
+        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
+
+        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
+                                  align, &tail_qiov, 0);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        if (!use_local_qiov) {
+            qemu_iovec_init(&local_qiov, qiov->niov + 1);
+            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+            use_local_qiov = true;
+        }
+
+        tail_bytes = (offset + bytes) & (align - 1);
+        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
+
+        bytes = ROUND_UP(bytes, align);
+    }
+
+    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
+                               use_local_qiov ? &local_qiov : qiov,
+                               flags);
+
+fail:
     tracked_request_end(&req);
 
+    if (use_local_qiov) {
+        qemu_iovec_destroy(&local_qiov);
+        qemu_vfree(head_buf);
+        qemu_vfree(tail_buf);
+    }
+
     return ret;
 }
 
commit 6460440f34c709461b84375cfd8a86b27d433225
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Fri Dec 13 13:04:35 2013 +0100

    block: Allow wait_serialising_requests() at any point
    
    We can only have a single wait_serialising_requests() call per request
    because otherwise we can run into deadlocks where requests are waiting
    for each other. The same is true when wait_serialising_requests() is not
    at the very beginning of a request, so that other requests can be issued
    between the start of the tracking and wait_serialising_requests().
    
    Fix this by changing wait_serialising_requests() to ignore requests that
    are already (directly or indirectly) waiting for the calling request.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 784ff07..9690585 100644
--- a/block.c
+++ b/block.c
@@ -2328,9 +2328,16 @@ static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
                  */
                 assert(qemu_coroutine_self() != req->co);
 
-                qemu_co_queue_wait(&req->wait_queue);
-                retry = true;
-                break;
+                /* If the request is already (indirectly) waiting for us, or
+                 * will wait for us as soon as it wakes up, then just go on
+                 * (instead of producing a deadlock in the former case). */
+                if (!req->waiting_for) {
+                    self->waiting_for = req;
+                    qemu_co_queue_wait(&req->wait_queue);
+                    self->waiting_for = NULL;
+                    retry = true;
+                    break;
+                }
             }
         }
     } while (retry);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 0ee955c..0bcf1c9 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -68,6 +68,8 @@ typedef struct BdrvTrackedRequest {
     QLIST_ENTRY(BdrvTrackedRequest) list;
     Coroutine *co; /* owner, used for deadlock detection */
     CoQueue wait_queue; /* coroutines blocked on this request */
+
+    struct BdrvTrackedRequest *waiting_for;
 } BdrvTrackedRequest;
 
 struct BlockDriver {
commit 7327145f63a224c9ba9c16d0c29781feffef8dc6
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Wed Dec 4 17:08:50 2013 +0100

    block: Make overlap range for serialisation dynamic
    
    Copy on Read wants to serialise with all requests touching the same
    cluster, so wait_serialising_requests() rounded to cluster boundaries.
    Other users like alignment RMW will have different requirements, though
    (requests touching the same sector), so make it dynamic.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 493c75f..784ff07 100644
--- a/block.c
+++ b/block.c
@@ -2231,6 +2231,8 @@ static void tracked_request_begin(BdrvTrackedRequest *req,
         .is_write       = is_write,
         .co             = qemu_coroutine_self(),
         .serialising    = false,
+        .overlap_offset = offset,
+        .overlap_bytes  = bytes,
     };
 
     qemu_co_queue_init(&req->wait_queue);
@@ -2238,12 +2240,19 @@ static void tracked_request_begin(BdrvTrackedRequest *req,
     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 }
 
-static void mark_request_serialising(BdrvTrackedRequest *req)
+static void mark_request_serialising(BdrvTrackedRequest *req, size_t align)
 {
+    int64_t overlap_offset = req->offset & ~(align - 1);
+    int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
+                      - overlap_offset;
+
     if (!req->serialising) {
         req->bs->serialising_in_flight++;
         req->serialising = true;
     }
+
+    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
+    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 }
 
 /**
@@ -2267,20 +2276,16 @@ void bdrv_round_to_clusters(BlockDriverState *bs,
     }
 }
 
-static void round_bytes_to_clusters(BlockDriverState *bs,
-                                    int64_t offset, unsigned int bytes,
-                                    int64_t *cluster_offset,
-                                    unsigned int *cluster_bytes)
+static int bdrv_get_cluster_size(BlockDriverState *bs)
 {
     BlockDriverInfo bdi;
+    int ret;
 
-    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
-        *cluster_offset = offset;
-        *cluster_bytes = bytes;
+    ret = bdrv_get_info(bs, &bdi);
+    if (ret < 0 || bdi.cluster_size == 0) {
+        return bs->request_alignment;
     } else {
-        *cluster_offset = QEMU_ALIGN_DOWN(offset, bdi.cluster_size);
-        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes,
-                                       bdi.cluster_size);
+        return bdi.cluster_size;
     }
 }
 
@@ -2288,11 +2293,11 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                      int64_t offset, unsigned int bytes)
 {
     /*        aaaa   bbbb */
-    if (offset >= req->offset + req->bytes) {
+    if (offset >= req->overlap_offset + req->overlap_bytes) {
         return false;
     }
     /* bbbb   aaaa        */
-    if (req->offset >= offset + bytes) {
+    if (req->overlap_offset >= offset + bytes) {
         return false;
     }
     return true;
@@ -2302,30 +2307,21 @@ static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 {
     BlockDriverState *bs = self->bs;
     BdrvTrackedRequest *req;
-    int64_t cluster_offset;
-    unsigned int cluster_bytes;
     bool retry;
 
     if (!bs->serialising_in_flight) {
         return;
     }
 
-    /* If we touch the same cluster it counts as an overlap.  This guarantees
-     * that allocating writes will be serialized and not race with each other
-     * for the same cluster.  For example, in copy-on-read it ensures that the
-     * CoR read and write operations are atomic and guest writes cannot
-     * interleave between them.
-     */
-    round_bytes_to_clusters(bs, self->offset, self->bytes,
-                            &cluster_offset, &cluster_bytes);
-
     do {
         retry = false;
         QLIST_FOREACH(req, &bs->tracked_requests, list) {
             if (req == self || (!req->serialising && !self->serialising)) {
                 continue;
             }
-            if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) {
+            if (tracked_request_overlaps(req, self->overlap_offset,
+                                         self->overlap_bytes))
+            {
                 /* Hitting this means there was a reentrant request, for
                  * example, a block driver issuing nested requests.  This must
                  * never happen since it means deadlock.
@@ -2941,7 +2937,12 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 
     /* Handle Copy on Read and associated serialisation */
     if (flags & BDRV_REQ_COPY_ON_READ) {
-        mark_request_serialising(req);
+        /* If we touch the same cluster it counts as an overlap.  This
+         * guarantees that allocating writes will be serialized and not race
+         * with each other for the same cluster.  For example, in copy-on-read
+         * it ensures that the CoR read and write operations are atomic and
+         * guest writes cannot interleave between them. */
+        mark_request_serialising(req, bdrv_get_cluster_size(bs));
     }
 
     wait_serialising_requests(req);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index c1153cb..0ee955c 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -60,7 +60,11 @@ typedef struct BdrvTrackedRequest {
     int64_t offset;
     unsigned int bytes;
     bool is_write;
+
     bool serialising;
+    int64_t overlap_offset;
+    unsigned int overlap_bytes;
+
     QLIST_ENTRY(BdrvTrackedRequest) list;
     Coroutine *co; /* owner, used for deadlock detection */
     CoQueue wait_queue; /* coroutines blocked on this request */
commit 2dbafdc012d3ea81a97fec6226ca82d644539c9a
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Wed Dec 4 16:43:44 2013 +0100

    block: Generalise and optimise COR serialisation
    
    Change the API so that specific requests can be marked serialising. Only
    these requests are checked for overlaps then.
    
    This means that during a Copy on Read operation, not all requests
    overlapping other requests are serialised any more, but only those that
    actually overlap with the specific COR request.
    
    Also remove COR from function and variable names because this
    functionality can be useful in other contexts.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 0a391bd..493c75f 100644
--- a/block.c
+++ b/block.c
@@ -2208,6 +2208,10 @@ int bdrv_commit_all(void)
  */
 static void tracked_request_end(BdrvTrackedRequest *req)
 {
+    if (req->serialising) {
+        req->bs->serialising_in_flight--;
+    }
+
     QLIST_REMOVE(req, list);
     qemu_co_queue_restart_all(&req->wait_queue);
 }
@@ -2222,10 +2226,11 @@ static void tracked_request_begin(BdrvTrackedRequest *req,
 {
     *req = (BdrvTrackedRequest){
         .bs = bs,
-        .offset = offset,
-        .bytes = bytes,
-        .is_write = is_write,
-        .co = qemu_coroutine_self(),
+        .offset         = offset,
+        .bytes          = bytes,
+        .is_write       = is_write,
+        .co             = qemu_coroutine_self(),
+        .serialising    = false,
     };
 
     qemu_co_queue_init(&req->wait_queue);
@@ -2233,6 +2238,14 @@ static void tracked_request_begin(BdrvTrackedRequest *req,
     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 }
 
+static void mark_request_serialising(BdrvTrackedRequest *req)
+{
+    if (!req->serialising) {
+        req->bs->serialising_in_flight++;
+        req->serialising = true;
+    }
+}
+
 /**
  * Round a region to cluster boundaries
  */
@@ -2285,26 +2298,31 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req,
     return true;
 }
 
-static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
-        BdrvTrackedRequest *self, int64_t offset, unsigned int bytes)
+static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 {
+    BlockDriverState *bs = self->bs;
     BdrvTrackedRequest *req;
     int64_t cluster_offset;
     unsigned int cluster_bytes;
     bool retry;
 
+    if (!bs->serialising_in_flight) {
+        return;
+    }
+
     /* If we touch the same cluster it counts as an overlap.  This guarantees
      * that allocating writes will be serialized and not race with each other
      * for the same cluster.  For example, in copy-on-read it ensures that the
      * CoR read and write operations are atomic and guest writes cannot
      * interleave between them.
      */
-    round_bytes_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
+    round_bytes_to_clusters(bs, self->offset, self->bytes,
+                            &cluster_offset, &cluster_bytes);
 
     do {
         retry = false;
         QLIST_FOREACH(req, &bs->tracked_requests, list) {
-            if (req == self) {
+            if (req == self || (!req->serialising && !self->serialising)) {
                 continue;
             }
             if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) {
@@ -2923,12 +2941,10 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 
     /* Handle Copy on Read and associated serialisation */
     if (flags & BDRV_REQ_COPY_ON_READ) {
-        bs->copy_on_read_in_flight++;
+        mark_request_serialising(req);
     }
 
-    if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, req, offset, bytes);
-    }
+    wait_serialising_requests(req);
 
     if (flags & BDRV_REQ_COPY_ON_READ) {
         int pnum;
@@ -2977,10 +2993,6 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     }
 
 out:
-    if (flags & BDRV_REQ_COPY_ON_READ) {
-        bs->copy_on_read_in_flight--;
-    }
-
     return ret;
 }
 
@@ -3179,9 +3191,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
-    if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, req, offset, bytes);
-    }
+    wait_serialising_requests(req);
 
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
 
diff --git a/include/block/block_int.h b/include/block/block_int.h
index bcdd98c..c1153cb 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -60,6 +60,7 @@ typedef struct BdrvTrackedRequest {
     int64_t offset;
     unsigned int bytes;
     bool is_write;
+    bool serialising;
     QLIST_ENTRY(BdrvTrackedRequest) list;
     Coroutine *co; /* owner, used for deadlock detection */
     CoQueue wait_queue; /* coroutines blocked on this request */
@@ -302,8 +303,8 @@ struct BlockDriverState {
     /* Callback before write request is processed */
     NotifierWithReturnList before_write_notifiers;
 
-    /* number of in-flight copy-on-read requests */
-    unsigned int copy_on_read_in_flight;
+    /* number of in-flight serialising requests */
+    unsigned int serialising_in_flight;
 
     /* I/O throttling */
     ThrottleState throttle_state;
commit ec746e10cb2e6276a8d2e036454792fe0674864a
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Wed Dec 4 12:13:10 2013 +0100

    block: Make zero-after-EOF work with larger alignment
    
    Odd file sizes could make bdrv_aligned_preadv() shorten the request in
    non-aligned ways. Fix it by rounding to the required alignment instead
    of 512 bytes.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 1591649..0a391bd 100644
--- a/block.c
+++ b/block.c
@@ -2910,7 +2910,7 @@ err:
  */
 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
-    QEMUIOVector *qiov, int flags)
+    int64_t align, QEMUIOVector *qiov, int flags)
 {
     BlockDriver *drv = bs->drv;
     int ret;
@@ -2958,7 +2958,8 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
         }
 
         total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
-        max_nb_sectors = MAX(0, total_sectors - sector_num);
+        max_nb_sectors = MAX(0, ROUND_UP(total_sectors - sector_num,
+                                         align >> BDRV_SECTOR_BITS));
         if (max_nb_sectors > 0) {
             ret = drv->bdrv_co_readv(bs, sector_num,
                                      MIN(nb_sectors, max_nb_sectors), qiov);
@@ -3044,7 +3045,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
     }
 
     tracked_request_begin(&req, bs, offset, bytes, false);
-    ret = bdrv_aligned_preadv(bs, &req, offset, bytes,
+    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);
     tracked_request_end(&req);
commit 65afd211c71fc91750d8a18f9604c1e57a5202fb
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Tue Dec 3 14:55:55 2013 +0100

    block: Allow waiting for overlapping requests between begin/end
    
    Previously, it was not possible to use wait_for_overlapping_requests()
    between tracked_request_begin()/end() because it would wait for itself.
    
    Ignore the current request in the overlap check and run more of the
    bdrv_co_do_preadv/pwritev code with a BdrvTrackedRequest present.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index a32c81d..1591649 100644
--- a/block.c
+++ b/block.c
@@ -2286,7 +2286,7 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 }
 
 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
-        int64_t offset, unsigned int bytes)
+        BdrvTrackedRequest *self, int64_t offset, unsigned int bytes)
 {
     BdrvTrackedRequest *req;
     int64_t cluster_offset;
@@ -2304,6 +2304,9 @@ static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
     do {
         retry = false;
         QLIST_FOREACH(req, &bs->tracked_requests, list) {
+            if (req == self) {
+                continue;
+            }
             if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) {
                 /* Hitting this means there was a reentrant request, for
                  * example, a block driver issuing nested requests.  This must
@@ -2906,10 +2909,10 @@ err:
  * implemented by the caller.
  */
 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
-    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags)
+    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+    QEMUIOVector *qiov, int flags)
 {
     BlockDriver *drv = bs->drv;
-    BdrvTrackedRequest req;
     int ret;
 
     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
@@ -2924,11 +2927,9 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     }
 
     if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, offset, bytes);
+        wait_for_overlapping_requests(bs, req, offset, bytes);
     }
 
-    tracked_request_begin(&req, bs, offset, bytes, false);
-
     if (flags & BDRV_REQ_COPY_ON_READ) {
         int pnum;
 
@@ -2975,8 +2976,6 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     }
 
 out:
-    tracked_request_end(&req);
-
     if (flags & BDRV_REQ_COPY_ON_READ) {
         bs->copy_on_read_in_flight--;
     }
@@ -2992,6 +2991,8 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
     BdrvRequestFlags flags)
 {
     BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest req;
+
     /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
     uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
     uint8_t *head_buf = NULL;
@@ -3042,9 +3043,11 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
         bytes = ROUND_UP(bytes, align);
     }
 
-    ret = bdrv_aligned_preadv(bs, offset, bytes,
+    tracked_request_begin(&req, bs, offset, bytes, false);
+    ret = bdrv_aligned_preadv(bs, &req, offset, bytes,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);
+    tracked_request_end(&req);
 
     if (use_local_qiov) {
         qemu_iovec_destroy(&local_qiov);
@@ -3163,10 +3166,10 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
  * Forwards an already correctly aligned write request to the BlockDriver.
  */
 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
-    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags)
+    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+    QEMUIOVector *qiov, int flags)
 {
     BlockDriver *drv = bs->drv;
-    BdrvTrackedRequest req;
     int ret;
 
     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
@@ -3176,12 +3179,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
     if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, offset, bytes);
+        wait_for_overlapping_requests(bs, req, offset, bytes);
     }
 
-    tracked_request_begin(&req, bs, offset, bytes, true);
-
-    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
+    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
 
     if (ret < 0) {
         /* Do nothing, write notifier decided to fail this request */
@@ -3204,8 +3205,6 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
     }
 
-    tracked_request_end(&req);
-
     return ret;
 }
 
@@ -3216,6 +3215,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags)
 {
+    BdrvTrackedRequest req;
     int ret;
 
     if (!bs->drv) {
@@ -3234,7 +3234,9 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
         bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, true);
     }
 
-    ret = bdrv_aligned_pwritev(bs, offset, bytes, qiov, flags);
+    tracked_request_begin(&req, bs, offset, bytes, true);
+    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, qiov, flags);
+    tracked_request_end(&req);
 
     return ret;
 }
commit 793ed47a7a2b09b67cb2a8863dff531436532b5c
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Tue Dec 3 15:31:25 2013 +0100

    block: Switch BdrvTrackedRequest to byte granularity
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 576859e..a32c81d 100644
--- a/block.c
+++ b/block.c
@@ -2217,13 +2217,13 @@ static void tracked_request_end(BdrvTrackedRequest *req)
  */
 static void tracked_request_begin(BdrvTrackedRequest *req,
                                   BlockDriverState *bs,
-                                  int64_t sector_num,
-                                  int nb_sectors, bool is_write)
+                                  int64_t offset,
+                                  unsigned int bytes, bool is_write)
 {
     *req = (BdrvTrackedRequest){
         .bs = bs,
-        .sector_num = sector_num,
-        .nb_sectors = nb_sectors,
+        .offset = offset,
+        .bytes = bytes,
         .is_write = is_write,
         .co = qemu_coroutine_self(),
     };
@@ -2254,25 +2254,43 @@ void bdrv_round_to_clusters(BlockDriverState *bs,
     }
 }
 
+static void round_bytes_to_clusters(BlockDriverState *bs,
+                                    int64_t offset, unsigned int bytes,
+                                    int64_t *cluster_offset,
+                                    unsigned int *cluster_bytes)
+{
+    BlockDriverInfo bdi;
+
+    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
+        *cluster_offset = offset;
+        *cluster_bytes = bytes;
+    } else {
+        *cluster_offset = QEMU_ALIGN_DOWN(offset, bdi.cluster_size);
+        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes,
+                                       bdi.cluster_size);
+    }
+}
+
 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
-                                     int64_t sector_num, int nb_sectors) {
+                                     int64_t offset, unsigned int bytes)
+{
     /*        aaaa   bbbb */
-    if (sector_num >= req->sector_num + req->nb_sectors) {
+    if (offset >= req->offset + req->bytes) {
         return false;
     }
     /* bbbb   aaaa        */
-    if (req->sector_num >= sector_num + nb_sectors) {
+    if (req->offset >= offset + bytes) {
         return false;
     }
     return true;
 }
 
 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors)
+        int64_t offset, unsigned int bytes)
 {
     BdrvTrackedRequest *req;
-    int64_t cluster_sector_num;
-    int cluster_nb_sectors;
+    int64_t cluster_offset;
+    unsigned int cluster_bytes;
     bool retry;
 
     /* If we touch the same cluster it counts as an overlap.  This guarantees
@@ -2281,14 +2299,12 @@ static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
      * CoR read and write operations are atomic and guest writes cannot
      * interleave between them.
      */
-    bdrv_round_to_clusters(bs, sector_num, nb_sectors,
-                           &cluster_sector_num, &cluster_nb_sectors);
+    round_bytes_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
 
     do {
         retry = false;
         QLIST_FOREACH(req, &bs->tracked_requests, list) {
-            if (tracked_request_overlaps(req, cluster_sector_num,
-                                         cluster_nb_sectors)) {
+            if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) {
                 /* Hitting this means there was a reentrant request, for
                  * example, a block driver issuing nested requests.  This must
                  * never happen since it means deadlock.
@@ -2908,10 +2924,10 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     }
 
     if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+        wait_for_overlapping_requests(bs, offset, bytes);
     }
 
-    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
+    tracked_request_begin(&req, bs, offset, bytes, false);
 
     if (flags & BDRV_REQ_COPY_ON_READ) {
         int pnum;
@@ -3160,10 +3176,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
     if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+        wait_for_overlapping_requests(bs, offset, bytes);
     }
 
-    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
+    tracked_request_begin(&req, bs, offset, bytes, true);
 
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
 
diff --git a/block/backup.c b/block/backup.c
index 0198514..15a2e55 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -181,8 +181,13 @@ static int coroutine_fn backup_before_write_notify(
         void *opaque)
 {
     BdrvTrackedRequest *req = opaque;
+    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;
 
-    return backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL);
+    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL);
 }
 
 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 5f9cecd..bcdd98c 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -57,8 +57,8 @@
 
 typedef struct BdrvTrackedRequest {
     BlockDriverState *bs;
-    int64_t sector_num;
-    int nb_sectors;
+    int64_t offset;
+    unsigned int bytes;
     bool is_write;
     QLIST_ENTRY(BdrvTrackedRequest) list;
     Coroutine *co; /* owner, used for deadlock detection */
commit 6601553e27091ffe240bea69227adce941fe12e8
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Tue Dec 3 14:40:18 2013 +0100

    block: Introduce bdrv_co_do_pwritev()
    
    This is going to become the bdrv_co_do_preadv() equivalent for writes.
    In this patch, however, just a function taking byte offsets is created,
    it doesn't align anything yet.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 20a3853..576859e 100644
--- a/block.c
+++ b/block.c
@@ -3196,8 +3196,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
 /*
  * Handle a write request in coroutine context
  */
-static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags)
 {
     int ret;
@@ -3208,21 +3208,33 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
     if (bs->read_only) {
         return -EACCES;
     }
-    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+    if (bdrv_check_byte_request(bs, offset, bytes)) {
         return -EIO;
     }
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, nb_sectors, true);
+        /* TODO Switch to byte granularity */
+        bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, true);
     }
 
-    ret = bdrv_aligned_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
-                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+    ret = bdrv_aligned_pwritev(bs, offset, bytes, qiov, flags);
 
     return ret;
 }
 
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+    BdrvRequestFlags flags)
+{
+    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
+        return -EINVAL;
+    }
+
+    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
+                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+}
+
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov)
 {
commit 244eadef5c797c674b0aef96366671be4b33d03a
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Tue Dec 3 14:30:44 2013 +0100

    block: write: Handle COR dependency after I/O throttling
    
    First waiting for all COR requests to complete and calling the
    throttling function afterwards means that the request could be delayed
    and we still need to wait for the COR request even if it was issued only
    after the throttled write request.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index d9c472b..20a3853 100644
--- a/block.c
+++ b/block.c
@@ -3159,6 +3159,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
+    if (bs->copy_on_read_in_flight) {
+        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+    }
+
     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
 
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
@@ -3208,10 +3212,6 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         return -EIO;
     }
 
-    if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
-    }
-
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
         bdrv_io_limits_intercept(bs, nb_sectors, true);
commit b404f72036716ab8ace04b83a8f0a93be4739a6a
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Tue Dec 3 14:02:23 2013 +0100

    block: Introduce bdrv_aligned_pwritev()
    
    This separates the part of bdrv_co_do_writev() that needs to happen
    before the request is modified to match the backend alignment, and a
    part that needs to be executed afterwards and passes the request to the
    BlockDriver.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index fa4feb0..d9c472b 100644
--- a/block.c
+++ b/block.c
@@ -3144,34 +3144,20 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
 }
 
 /*
- * Handle a write request in coroutine context
+ * Forwards an already correctly aligned write request to the BlockDriver.
  */
-static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
-    BdrvRequestFlags flags)
+static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
+    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags)
 {
     BlockDriver *drv = bs->drv;
     BdrvTrackedRequest req;
     int ret;
 
-    if (!bs->drv) {
-        return -ENOMEDIUM;
-    }
-    if (bs->read_only) {
-        return -EACCES;
-    }
-    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
-        return -EIO;
-    }
-
-    if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
-    }
+    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
 
-    /* throttling disk I/O */
-    if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, nb_sectors, true);
-    }
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
 
@@ -3203,6 +3189,40 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
     return ret;
 }
 
+/*
+ * Handle a write request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+    BdrvRequestFlags flags)
+{
+    int ret;
+
+    if (!bs->drv) {
+        return -ENOMEDIUM;
+    }
+    if (bs->read_only) {
+        return -EACCES;
+    }
+    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+        return -EIO;
+    }
+
+    if (bs->copy_on_read_in_flight) {
+        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+    }
+
+    /* throttling disk I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_intercept(bs, nb_sectors, true);
+    }
+
+    ret = bdrv_aligned_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
+                               nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+
+    return ret;
+}
+
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov)
 {
commit 1b0288ae7fc695a8e652973f75e92464bbc13416
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Mon Dec 2 16:09:46 2013 +0100

    block: Introduce bdrv_co_do_preadv()
    
    Similar to bdrv_pread(), which aligns byte-aligned request to 512 byte
    sectors, bdrv_co_do_preadv() takes a byte-aligned request and aligns it
    to the alignment specified in bs->request_alignment.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 701e95af..fa4feb0 100644
--- a/block.c
+++ b/block.c
@@ -2971,17 +2971,23 @@ out:
 /*
  * Handle a read request in coroutine context
  */
-static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
     BdrvRequestFlags flags)
 {
     BlockDriver *drv = bs->drv;
+    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+    uint8_t *head_buf = NULL;
+    uint8_t *tail_buf = NULL;
+    QEMUIOVector local_qiov;
+    bool use_local_qiov = false;
     int ret;
 
     if (!drv) {
         return -ENOMEDIUM;
     }
-    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+    if (bdrv_check_byte_request(bs, offset, bytes)) {
         return -EIO;
     }
 
@@ -2991,14 +2997,60 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, nb_sectors, false);
+        /* TODO Switch to byte granularity */
+        bdrv_io_limits_intercept(bs, bytes >> BDRV_SECTOR_BITS, false);
+    }
+
+    /* Align read if necessary by padding qiov */
+    if (offset & (align - 1)) {
+        head_buf = qemu_blockalign(bs, align);
+        qemu_iovec_init(&local_qiov, qiov->niov + 2);
+        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
+        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+        use_local_qiov = true;
+
+        bytes += offset & (align - 1);
+        offset = offset & ~(align - 1);
+    }
+
+    if ((offset + bytes) & (align - 1)) {
+        if (!use_local_qiov) {
+            qemu_iovec_init(&local_qiov, qiov->niov + 1);
+            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+            use_local_qiov = true;
+        }
+        tail_buf = qemu_blockalign(bs, align);
+        qemu_iovec_add(&local_qiov, tail_buf,
+                       align - ((offset + bytes) & (align - 1)));
+
+        bytes = ROUND_UP(bytes, align);
+    }
+
+    ret = bdrv_aligned_preadv(bs, offset, bytes,
+                              use_local_qiov ? &local_qiov : qiov,
+                              flags);
+
+    if (use_local_qiov) {
+        qemu_iovec_destroy(&local_qiov);
+        qemu_vfree(head_buf);
+        qemu_vfree(tail_buf);
     }
 
-    ret = bdrv_aligned_preadv(bs, sector_num << BDRV_SECTOR_BITS,
-                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
     return ret;
 }
 
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+    BdrvRequestFlags flags)
+{
+    if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
+        return -EINVAL;
+    }
+
+    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
+                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+}
+
 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov)
 {
commit d0c7f642f5eb2cb21d0c3acf766cb375eaaf4666
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Mon Dec 2 15:07:48 2013 +0100

    block: Introduce bdrv_aligned_preadv()
    
    This separates the part of bdrv_co_do_readv() that needs to happen
    before the request is modified to match the backend alignment, and a
    part that needs to be executed afterwards and passes the request to the
    BlockDriver.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Wenchao Xia <xiawenc at linux.vnet.ibm.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>

diff --git a/block.c b/block.c
index d4dd7fe..701e95af 100644
--- a/block.c
+++ b/block.c
@@ -2885,26 +2885,24 @@ err:
 }
 
 /*
- * Handle a read request in coroutine context
+ * Forwards an already correctly aligned request to the BlockDriver. This
+ * handles copy on read and zeroing after EOF; any other features must be
+ * implemented by the caller.
  */
-static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
-    BdrvRequestFlags flags)
+static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
+    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags)
 {
     BlockDriver *drv = bs->drv;
     BdrvTrackedRequest req;
     int ret;
 
-    if (!drv) {
-        return -ENOMEDIUM;
-    }
-    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
-        return -EIO;
-    }
+    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
+    unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS;
 
-    if (bs->copy_on_read) {
-        flags |= BDRV_REQ_COPY_ON_READ;
-    }
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    /* Handle Copy on Read and associated serialisation */
     if (flags & BDRV_REQ_COPY_ON_READ) {
         bs->copy_on_read_in_flight++;
     }
@@ -2913,11 +2911,6 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
     }
 
-    /* throttling disk I/O */
-    if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, nb_sectors, false);
-    }
-
     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
 
     if (flags & BDRV_REQ_COPY_ON_READ) {
@@ -2934,6 +2927,7 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         }
     }
 
+    /* Forward the request to the BlockDriver */
     if (!(bs->zero_beyond_eof && bs->growable)) {
         ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
     } else {
@@ -2974,6 +2968,37 @@ out:
     return ret;
 }
 
+/*
+ * Handle a read request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+    BdrvRequestFlags flags)
+{
+    BlockDriver *drv = bs->drv;
+    int ret;
+
+    if (!drv) {
+        return -ENOMEDIUM;
+    }
+    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+        return -EIO;
+    }
+
+    if (bs->copy_on_read) {
+        flags |= BDRV_REQ_COPY_ON_READ;
+    }
+
+    /* throttling disk I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_intercept(bs, nb_sectors, false);
+    }
+
+    ret = bdrv_aligned_preadv(bs, sector_num << BDRV_SECTOR_BITS,
+                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+    return ret;
+}
+
 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov)
 {
commit c25f53b06eba1575d5d0e92a0132455c97825b83
Author: Paolo Bonzini <pbonzini at redhat.com>
Date:   Tue Nov 29 12:42:20 2011 +0100

    raw: Probe required direct I/O alignment
    
    Add a bs->request_alignment field that contains the required
    offset/length alignment for I/O requests and fill it in the raw block
    drivers. Use ioctls if possible, else see what alignment it takes for
    O_DIRECT to succeed.
    
    While at it, also expose the memory alignment requirements, which may be
    (and in practice are) different from the disk alignment requirements.
    
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>

diff --git a/block.c b/block.c
index 195beff..d4dd7fe 100644
--- a/block.c
+++ b/block.c
@@ -852,6 +852,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
 
     bs->open_flags = flags;
     bs->guest_block_size = 512;
+    bs->request_alignment = 512;
     bs->zero_beyond_eof = true;
     open_flags = bdrv_open_flags(bs, flags);
     bs->read_only = !(open_flags & BDRV_O_RDWR);
@@ -920,6 +921,8 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
     }
 
     bdrv_refresh_limits(bs);
+    assert(bdrv_opt_mem_align(bs) != 0);
+    assert(bs->request_alignment != 0);
 
 #ifndef _WIN32
     if (bs->is_temporary) {
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 0676037..126a634 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -127,6 +127,8 @@ typedef struct BDRVRawState {
     int fd;
     int type;
     int open_flags;
+    size_t buf_align;
+
 #if defined(__linux__)
     /* linux floppy specific */
     int64_t fd_open_time;
@@ -213,6 +215,76 @@ static int raw_normalize_devicepath(const char **filename)
 }
 #endif
 
+static void raw_probe_alignment(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    char *buf;
+    unsigned int sector_size;
+
+    /* For /dev/sg devices the alignment is not really used.
+       With buffered I/O, we don't have any restrictions. */
+    if (bs->sg || !(s->open_flags & O_DIRECT)) {
+        bs->request_alignment = 1;
+        s->buf_align = 1;
+        return;
+    }
+
+    /* Try a few ioctls to get the right size */
+    bs->request_alignment = 0;
+    s->buf_align = 0;
+
+#ifdef BLKSSZGET
+    if (ioctl(s->fd, BLKSSZGET, &sector_size) >= 0) {
+        bs->request_alignment = sector_size;
+    }
+#endif
+#ifdef DKIOCGETBLOCKSIZE
+    if (ioctl(s->fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) {
+        bs->request_alignment = sector_size;
+    }
+#endif
+#ifdef DIOCGSECTORSIZE
+    if (ioctl(s->fd, DIOCGSECTORSIZE, &sector_size) >= 0) {
+        bs->request_alignment = sector_size;
+    }
+#endif
+#ifdef CONFIG_XFS
+    if (s->is_xfs) {
+        struct dioattr da;
+        if (xfsctl(NULL, s->fd, XFS_IOC_DIOINFO, &da) >= 0) {
+            bs->request_alignment = da.d_miniosz;
+            /* The kernel returns wrong information for d_mem */
+            /* s->buf_align = da.d_mem; */
+        }
+    }
+#endif
+
+    /* If we could not get the sizes so far, we can only guess them */
+    if (!s->buf_align) {
+        size_t align;
+        buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
+        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
+            if (pread(s->fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) {
+                s->buf_align = align;
+                break;
+            }
+        }
+        qemu_vfree(buf);
+    }
+
+    if (!bs->request_alignment) {
+        size_t align;
+        buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
+        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
+            if (pread(s->fd, buf, align, 0) >= 0) {
+                bs->request_alignment = align;
+                break;
+            }
+        }
+        qemu_vfree(buf);
+    }
+}
+
 static void raw_parse_flags(int bdrv_flags, int *open_flags)
 {
     assert(open_flags != NULL);
@@ -463,7 +535,6 @@ static int raw_reopen_prepare(BDRVReopenState *state,
     return ret;
 }
 
-
 static void raw_reopen_commit(BDRVReopenState *state)
 {
     BDRVRawReopenState *raw_s = state->opaque;
@@ -499,23 +570,15 @@ static void raw_reopen_abort(BDRVReopenState *state)
     state->opaque = NULL;
 }
 
+static int raw_refresh_limits(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
 
-/* XXX: use host sector size if necessary with:
-#ifdef DIOCGSECTORSIZE
-        {
-            unsigned int sectorsize = 512;
-            if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) &&
-                sectorsize > bufsize)
-                bufsize = sectorsize;
-        }
-#endif
-#ifdef CONFIG_COCOA
-        uint32_t blockSize = 512;
-        if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) {
-            bufsize = blockSize;
-        }
-#endif
-*/
+    raw_probe_alignment(bs);
+    bs->bl.opt_mem_alignment = s->buf_align;
+
+    return 0;
+}
 
 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
 {
@@ -1363,6 +1426,7 @@ static BlockDriver bdrv_file = {
     .bdrv_aio_writev = raw_aio_writev,
     .bdrv_aio_flush = raw_aio_flush,
     .bdrv_aio_discard = raw_aio_discard,
+    .bdrv_refresh_limits = raw_refresh_limits,
 
     .bdrv_truncate = raw_truncate,
     .bdrv_getlength = raw_getlength,
@@ -1740,6 +1804,7 @@ static BlockDriver bdrv_host_device = {
     .bdrv_aio_writev	= raw_aio_writev,
     .bdrv_aio_flush	= raw_aio_flush,
     .bdrv_aio_discard   = hdev_aio_discard,
+    .bdrv_refresh_limits = raw_refresh_limits,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength	= raw_getlength,
@@ -1871,6 +1936,7 @@ static BlockDriver bdrv_host_floppy = {
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
     .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_refresh_limits = raw_refresh_limits,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength      = raw_getlength,
@@ -1981,6 +2047,7 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
     .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_refresh_limits = raw_refresh_limits,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength      = raw_getlength,
@@ -2110,6 +2177,7 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
     .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_refresh_limits = raw_refresh_limits,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength      = raw_getlength,
diff --git a/block/raw-win32.c b/block/raw-win32.c
index ce314fd..beb7f23 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -202,6 +202,35 @@ static int set_sparse(int fd)
 				 NULL, 0, NULL, 0, &returned, NULL);
 }
 
+static void raw_probe_alignment(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    DWORD sectorsPerCluster, freeClusters, totalClusters, count;
+    DISK_GEOMETRY_EX dg;
+    BOOL status;
+
+    if (s->type == FTYPE_CD) {
+        bs->request_alignment = 2048;
+        return;
+    }
+    if (s->type == FTYPE_HARDDISK) {
+        status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX,
+                                 NULL, 0, &dg, sizeof(dg), &count, NULL);
+        if (status != 0) {
+            bs->request_alignment = dg.Geometry.BytesPerSector;
+            return;
+        }
+        /* try GetDiskFreeSpace too */
+    }
+
+    if (s->drive_path[0]) {
+        GetDiskFreeSpace(s->drive_path, &sectorsPerCluster,
+                         &dg.Geometry.BytesPerSector,
+                         &freeClusters, &totalClusters);
+        bs->request_alignment = dg.Geometry.BytesPerSector;
+    }
+}
+
 static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
 {
     assert(access_flags != NULL);
@@ -269,6 +298,17 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
         }
     }
 
+    if (filename[0] && filename[1] == ':') {
+        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]);
+    } else if (filename[0] == '\\' && filename[1] == '\\') {
+        s->drive_path[0] = 0;
+    } else {
+        /* Relative path.  */
+        char buf[MAX_PATH];
+        GetCurrentDirectory(MAX_PATH, buf);
+        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", buf[0]);
+    }
+
     s->hfile = CreateFile(filename, access_flags,
                           FILE_SHARE_READ, NULL,
                           OPEN_EXISTING, overlapped, NULL);
@@ -293,6 +333,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
         s->aio = aio;
     }
 
+    raw_probe_alignment(bs);
     ret = 0;
 fail:
     qemu_opts_del(opts);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index ae609bd..5f9cecd 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -325,6 +325,9 @@ struct BlockDriverState {
     /* Whether produces zeros when read beyond eof */
     bool zero_beyond_eof;
 
+    /* Alignment requirement for offset/length of I/O requests */
+    unsigned int request_alignment;
+
     /* the block size for which the guest device expects atomicity */
     int guest_block_size;
 
commit 1b7fd729559c6d3b273303aa48bc653ceef08747
Author: Paolo Bonzini <pbonzini at redhat.com>
Date:   Tue Nov 29 11:35:47 2011 +0100

    block: rename buffer_alignment to guest_block_size
    
    The alignment field is now set to the value that is promised to the
    guest, rather than required by the host.  The next patches will make
    QEMU aware of the host-provided values, so make this clear.
    
    The alignment is also not about memory buffers, but about the sectors on
    the disk, change the documentation of the field.
    
    At this point, the field is set by the device emulation, but completely
    ignored by the block layer.
    
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Wenchao Xia <xiawenc at linux.vnet.ibm.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index e5f214d..195beff 100644
--- a/block.c
+++ b/block.c
@@ -851,7 +851,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
     }
 
     bs->open_flags = flags;
-    bs->buffer_alignment = 512;
+    bs->guest_block_size = 512;
     bs->zero_beyond_eof = true;
     open_flags = bdrv_open_flags(bs, flags);
     bs->read_only = !(open_flags & BDRV_O_RDWR);
@@ -1796,7 +1796,7 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
     bs_dest->dev_ops            = bs_src->dev_ops;
     bs_dest->dev_opaque         = bs_src->dev_opaque;
     bs_dest->dev                = bs_src->dev;
-    bs_dest->buffer_alignment   = bs_src->buffer_alignment;
+    bs_dest->guest_block_size   = bs_src->guest_block_size;
     bs_dest->copy_on_read       = bs_src->copy_on_read;
 
     bs_dest->enable_write_cache = bs_src->enable_write_cache;
@@ -1953,7 +1953,7 @@ void bdrv_detach_dev(BlockDriverState *bs, void *dev)
     bs->dev = NULL;
     bs->dev_ops = NULL;
     bs->dev_opaque = NULL;
-    bs->buffer_alignment = 512;
+    bs->guest_block_size = 512;
 }
 
 /* TODO change to return DeviceState * when all users are qdevified */
@@ -4806,9 +4806,9 @@ BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
     return NULL;
 }
 
-void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
+void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
 {
-    bs->buffer_alignment = align;
+    bs->guest_block_size = align;
 }
 
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 19d0961..8a568e5 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -731,7 +731,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
     register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
                     virtio_blk_save, virtio_blk_load, s);
     bdrv_set_dev_ops(s->bs, &virtio_block_ops, s);
-    bdrv_set_buffer_alignment(s->bs, s->conf->logical_block_size);
+    bdrv_set_guest_block_size(s->bs, s->conf->logical_block_size);
 
     bdrv_iostatus_enable(s->bs);
 
diff --git a/hw/ide/core.c b/hw/ide/core.c
index e1f4c33..036cd4a 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -2103,7 +2103,7 @@ int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
     s->smart_selftest_count = 0;
     if (kind == IDE_CD) {
         bdrv_set_dev_ops(bs, &ide_cd_block_ops, s);
-        bdrv_set_buffer_alignment(bs, 2048);
+        bdrv_set_guest_block_size(bs, 2048);
     } else {
         if (!bdrv_is_inserted(s->bs)) {
             error_report("Device needs media, but drive is empty");
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index bce617c..6491091 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2254,7 +2254,7 @@ static int scsi_initfn(SCSIDevice *dev)
     } else {
         bdrv_set_dev_ops(s->qdev.conf.bs, &scsi_disk_block_ops, s);
     }
-    bdrv_set_buffer_alignment(s->qdev.conf.bs, s->qdev.blocksize);
+    bdrv_set_guest_block_size(s->qdev.conf.bs, s->qdev.blocksize);
 
     bdrv_iostatus_enable(s->qdev.conf.bs);
     add_boot_device_path(s->qdev.conf.bootindex, &dev->qdev, NULL);
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index 8f195be..f08b64e 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -210,7 +210,7 @@ static void scsi_read_complete(void * opaque, int ret)
             s->blocksize = ldl_be_p(&r->buf[8]);
             s->max_lba = ldq_be_p(&r->buf[0]);
         }
-        bdrv_set_buffer_alignment(s->conf.bs, s->blocksize);
+        bdrv_set_guest_block_size(s->conf.bs, s->blocksize);
 
         scsi_req_data(&r->req, len);
         if (!r->req.io_canceled) {
diff --git a/include/block/block.h b/include/block/block.h
index 332ebb9..a2f5657 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -431,7 +431,7 @@ void bdrv_img_create(const char *filename, const char *fmt,
 /* Returns the alignment in bytes that is required so that no bounce buffer
  * is required throughout the stack */
 size_t bdrv_opt_mem_align(BlockDriverState *bs);
-void bdrv_set_buffer_alignment(BlockDriverState *bs, int align);
+void bdrv_set_guest_block_size(BlockDriverState *bs, int align);
 void *qemu_blockalign(BlockDriverState *bs, size_t size);
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
 
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 74a78a6..ae609bd 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -325,8 +325,8 @@ struct BlockDriverState {
     /* Whether produces zeros when read beyond eof */
     bool zero_beyond_eof;
 
-    /* the memory alignment required for the buffers handled by this driver */
-    int buffer_alignment;
+    /* the block size for which the guest device expects atomicity */
+    int guest_block_size;
 
     /* do we need to tell the quest if we have a volatile write cache? */
     int enable_write_cache;
commit 339064d5063924e5176842abbf6c8089f3479c5b
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Thu Nov 28 10:23:32 2013 +0100

    block: Don't use guest sector size for qemu_blockalign()
    
    bs->buffer_alignment is set by the device emulation and contains the
    logical block size of the guest device. This isn't something that the
    block layer should know, and even less something to use for determining
    the right alignment of buffers to be used for the host.
    
    The new BlockLimits field opt_mem_alignment tells the qemu block layer
    the optimal alignment to be used so that no bounce buffer must be used
    in the driver.
    
    This patch may change the buffer alignment from 4k to 512 for all
    callers that used qemu_blockalign() with the top-level image format
    BlockDriverState. The value was never propagated to other levels in the
    tree, so in particular raw-posix never required anything else than 512.
    
    While on disks with 4k sectors direct I/O requires a 4k alignment,
    memory may still be okay when aligned to 512 byte boundaries. This is
    what must have happened in practice, because otherwise this would
    already have failed earlier. Therefore I don't expect regressions even
    with this intermediate state. Later, raw-posix can implement the hook
    and expose a different memory alignment requirement.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Wenchao Xia <xiawenc at linux.vnet.ibm.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>

diff --git a/block.c b/block.c
index 275d387..e5f214d 100644
--- a/block.c
+++ b/block.c
@@ -218,6 +218,16 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
 }
 
+size_t bdrv_opt_mem_align(BlockDriverState *bs)
+{
+    if (!bs || !bs->drv) {
+        /* 4k should be on the safe side */
+        return 4096;
+    }
+
+    return bs->bl.opt_mem_alignment;
+}
+
 /* check if the path starts with "<protocol>:" */
 static int path_has_protocol(const char *path)
 {
@@ -497,6 +507,9 @@ int bdrv_refresh_limits(BlockDriverState *bs)
     if (bs->file) {
         bdrv_refresh_limits(bs->file);
         bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
+        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
+    } else {
+        bs->bl.opt_mem_alignment = 512;
     }
 
     if (bs->backing_hd) {
@@ -504,6 +517,9 @@ int bdrv_refresh_limits(BlockDriverState *bs)
         bs->bl.opt_transfer_length =
             MAX(bs->bl.opt_transfer_length,
                 bs->backing_hd->bl.opt_transfer_length);
+        bs->bl.opt_mem_alignment =
+            MAX(bs->bl.opt_mem_alignment,
+                bs->backing_hd->bl.opt_mem_alignment);
     }
 
     /* Then let the driver override it */
@@ -4797,7 +4813,7 @@ void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
 
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
-    return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
+    return qemu_memalign(bdrv_opt_mem_align(bs), size);
 }
 
 /*
@@ -4806,12 +4822,13 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size)
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
 {
     int i;
+    size_t alignment = bdrv_opt_mem_align(bs);
 
     for (i = 0; i < qiov->niov; i++) {
-        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
+        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
             return false;
         }
-        if (qiov->iov[i].iov_len % bs->buffer_alignment) {
+        if (qiov->iov[i].iov_len % alignment) {
             return false;
         }
     }
diff --git a/include/block/block.h b/include/block/block.h
index 00a8790..332ebb9 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -428,6 +428,9 @@ void bdrv_img_create(const char *filename, const char *fmt,
                      char *options, uint64_t img_size, int flags,
                      Error **errp, bool quiet);
 
+/* Returns the alignment in bytes that is required so that no bounce buffer
+ * is required throughout the stack */
+size_t bdrv_opt_mem_align(BlockDriverState *bs);
 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align);
 void *qemu_blockalign(BlockDriverState *bs, size_t size);
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index f6fa1f6..74a78a6 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -258,6 +258,9 @@ typedef struct BlockLimits {
 
     /* optimal transfer length in sectors */
     int opt_transfer_length;
+
+    /* memory alignment so that no bounce buffer is needed */
+    size_t opt_mem_alignment;
 } BlockLimits;
 
 /*
commit 1ff735bdc417945bc6df1857861b127644b3f461
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Thu Dec 5 13:01:46 2013 +0100

    block: Detect unaligned length in bdrv_qiov_is_aligned()
    
    For an O_DIRECT request to succeed, it's not only necessary that all
    base addresses in the qiov are aligned, but also that each length in it
    is aligned.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Wenchao Xia <xiawenc at linux.vnet.ibm.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>

diff --git a/block.c b/block.c
index 9d0cfc4..275d387 100644
--- a/block.c
+++ b/block.c
@@ -4811,6 +4811,9 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
         if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
             return false;
         }
+        if (qiov->iov[i].iov_len % bs->buffer_alignment) {
+            return false;
+        }
     }
 
     return true;
commit e5354657a626b325c31888f33de88ac6d39e2fcb
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Fri Nov 29 21:29:17 2013 +0100

    qemu_memalign: Allow small alignments
    
    The functions used by qemu_memalign() require an alignment that is at
    least sizeof(void*). Adjust it if it is too small.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Wenchao Xia <xiawenc at linux.vnet.ibm.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoît Canet <benoit at irqsave.net>

diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index f5c4016..d5dca47 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -88,6 +88,11 @@ void *qemu_oom_check(void *ptr)
 void *qemu_memalign(size_t alignment, size_t size)
 {
     void *ptr;
+
+    if (alignment < sizeof(void*)) {
+        alignment = sizeof(void*);
+    }
+
 #if defined(_POSIX_C_SOURCE) && !defined(__sun__)
     int ret;
     ret = posix_memalign(&ptr, alignment, size);
commit 355ef4ac95a7a47d5c7201ccd910056a100d2fdf
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Wed Dec 11 20:14:09 2013 +0100

    block: Update BlockLimits when they might have changed
    
    When reopening with different flags, or when backing files disappear
    from the chain, the limits may change. Make sure they get updated in
    these cases.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Wenchao Xia <xiawenc at linux.vnet.ibm.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoît Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 8a66927..9d0cfc4 100644
--- a/block.c
+++ b/block.c
@@ -483,7 +483,7 @@ int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
     return ret;
 }
 
-static int bdrv_refresh_limits(BlockDriverState *bs)
+int bdrv_refresh_limits(BlockDriverState *bs)
 {
     BlockDriver *drv = bs->drv;
 
@@ -1607,6 +1607,8 @@ void bdrv_reopen_commit(BDRVReopenState *reopen_state)
     reopen_state->bs->enable_write_cache = !!(reopen_state->flags &
                                               BDRV_O_CACHE_WB);
     reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR);
+
+    bdrv_refresh_limits(reopen_state->bs);
 }
 
 /*
@@ -2441,6 +2443,7 @@ int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
     }
     new_top_bs->backing_hd = base_bs;
 
+    bdrv_refresh_limits(new_top_bs);
 
     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
         /* so that bdrv_close() does not recursively close the chain */
diff --git a/block/stream.c b/block/stream.c
index 46bec7d..dd0b4ac 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -75,6 +75,8 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base,
         unused->backing_hd = NULL;
         bdrv_unref(unused);
     }
+
+    bdrv_refresh_limits(top);
 }
 
 static void coroutine_fn stream_run(void *opaque)
diff --git a/include/block/block.h b/include/block/block.h
index 59d9f12..00a8790 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -253,6 +253,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset);
 int64_t bdrv_getlength(BlockDriverState *bs);
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
+int bdrv_refresh_limits(BlockDriverState *bs);
 int bdrv_commit(BlockDriverState *bs);
 int bdrv_commit_all(void);
 int bdrv_change_backing_file(BlockDriverState *bs,
commit 466ad822deef3a03757d505218a52993c5d56b5d
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Wed Dec 11 19:50:32 2013 +0100

    block: Inherit opt_transfer_length
    
    When there is a format driver between the backend, it's not guaranteed
    that exposing the opt_transfer_length for the format driver results in
    the optimal requests (because of fragmentation etc.), but it can't make
    things worse, so let's just do it.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Wenchao Xia <xiawenc at linux.vnet.ibm.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoît Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 0a3d12c..8a66927 100644
--- a/block.c
+++ b/block.c
@@ -489,7 +489,25 @@ static int bdrv_refresh_limits(BlockDriverState *bs)
 
     memset(&bs->bl, 0, sizeof(bs->bl));
 
-    if (drv && drv->bdrv_refresh_limits) {
+    if (!drv) {
+        return 0;
+    }
+
+    /* Take some limits from the children as a default */
+    if (bs->file) {
+        bdrv_refresh_limits(bs->file);
+        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
+    }
+
+    if (bs->backing_hd) {
+        bdrv_refresh_limits(bs->backing_hd);
+        bs->bl.opt_transfer_length =
+            MAX(bs->bl.opt_transfer_length,
+                bs->backing_hd->bl.opt_transfer_length);
+    }
+
+    /* Then let the driver override it */
+    if (drv->bdrv_refresh_limits) {
         return drv->bdrv_refresh_limits(bs);
     }
 
commit d34682cd4a06efe9ee3fc8cb7e8a0ea445299989
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Wed Dec 11 19:26:16 2013 +0100

    block: Move initialisation of BlockLimits to bdrv_refresh_limits()
    
    This function separates filling the BlockLimits from bdrv_open(), which
    allows it to call it from other operations which may change the limits
    (e.g. modifications to the backing file chain or bdrv_reopen)
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 2106ae9..0a3d12c 100644
--- a/block.c
+++ b/block.c
@@ -483,6 +483,19 @@ int bdrv_create_file(const char* filename, QEMUOptionParameter *options,
     return ret;
 }
 
+static int bdrv_refresh_limits(BlockDriverState *bs)
+{
+    BlockDriver *drv = bs->drv;
+
+    memset(&bs->bl, 0, sizeof(bs->bl));
+
+    if (drv && drv->bdrv_refresh_limits) {
+        return drv->bdrv_refresh_limits(bs);
+    }
+
+    return 0;
+}
+
 /*
  * Create a uniquely-named empty temporary file.
  * Return 0 upon success, otherwise a negative errno value.
@@ -872,6 +885,8 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
         goto free_and_fail;
     }
 
+    bdrv_refresh_limits(bs);
+
 #ifndef _WIN32
     if (bs->is_temporary) {
         assert(bs->filename[0] != '\0');
@@ -1085,6 +1100,9 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
                 bs->backing_hd->file->filename);
     }
 
+    /* Recalculate the BlockLimits with the backing file */
+    bdrv_refresh_limits(bs);
+
     return 0;
 }
 
diff --git a/block/iscsi.c b/block/iscsi.c
index 76b3c96..c8bf8dc 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1265,23 +1265,6 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
                sizeof(struct scsi_inquiry_block_limits));
         scsi_free_scsi_task(task);
         task = NULL;
-
-        if (iscsilun->bl.max_unmap < 0xffffffff) {
-            bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap,
-                                                 iscsilun);
-        }
-        bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
-                                                   iscsilun);
-
-        if (iscsilun->bl.max_ws_len < 0xffffffff) {
-            bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len,
-                                                      iscsilun);
-        }
-        bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
-                                                        iscsilun);
-
-        bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len,
-                                                     iscsilun);
     }
 
 #if defined(LIBISCSI_FEATURE_NOP_COUNTER)
@@ -1326,6 +1309,34 @@ static void iscsi_close(BlockDriverState *bs)
     memset(iscsilun, 0, sizeof(IscsiLun));
 }
 
+static int iscsi_refresh_limits(BlockDriverState *bs)
+{
+    IscsiLun *iscsilun = bs->opaque;
+
+    /* We don't actually refresh here, but just return data queried in
+     * iscsi_open(): iscsi targets don't change their limits. */
+    if (iscsilun->lbp.lbpu || iscsilun->lbp.lbpws) {
+        if (iscsilun->bl.max_unmap < 0xffffffff) {
+            bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap,
+                                                 iscsilun);
+        }
+        bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
+                                                   iscsilun);
+
+        if (iscsilun->bl.max_ws_len < 0xffffffff) {
+            bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len,
+                                                      iscsilun);
+        }
+        bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran,
+                                                        iscsilun);
+
+        bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len,
+                                                     iscsilun);
+    }
+
+    return 0;
+}
+
 static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
 {
     IscsiLun *iscsilun = bs->opaque;
@@ -1438,6 +1449,7 @@ static BlockDriver bdrv_iscsi = {
     .bdrv_getlength  = iscsi_getlength,
     .bdrv_get_info   = iscsi_get_info,
     .bdrv_truncate   = iscsi_truncate,
+    .bdrv_refresh_limits = iscsi_refresh_limits,
 
 #if defined(LIBISCSI_FEATURE_IOVECTOR)
     .bdrv_co_get_block_status = iscsi_co_get_block_status,
diff --git a/block/qcow2.c b/block/qcow2.c
index e15a4dd..2da62b8 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -718,7 +718,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     qemu_opts_del(opts);
-    bs->bl.write_zeroes_alignment = s->cluster_sectors;
 
     if (s->use_lazy_refcounts && s->qcow_version < 3) {
         error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
@@ -751,6 +750,15 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
     return ret;
 }
 
+static int qcow2_refresh_limits(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    bs->bl.write_zeroes_alignment = s->cluster_sectors;
+
+    return 0;
+}
+
 static int qcow2_set_key(BlockDriverState *bs, const char *key)
 {
     BDRVQcowState *s = bs->opaque;
@@ -2268,6 +2276,7 @@ static BlockDriver bdrv_qcow2 = {
 
     .bdrv_change_backing_file   = qcow2_change_backing_file,
 
+    .bdrv_refresh_limits        = qcow2_refresh_limits,
     .bdrv_invalidate_cache      = qcow2_invalidate_cache,
 
     .create_options = qcow2_create_options,
diff --git a/block/qed.c b/block/qed.c
index 0dd5c58..694e6e2 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -495,7 +495,6 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
         }
     }
 
-    bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS;
     s->need_check_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                             qed_need_check_timer_cb, s);
 
@@ -507,6 +506,15 @@ out:
     return ret;
 }
 
+static int bdrv_qed_refresh_limits(BlockDriverState *bs)
+{
+    BDRVQEDState *s = bs->opaque;
+
+    bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS;
+
+    return 0;
+}
+
 /* We have nothing to do for QED reopen, stubs just return
  * success */
 static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
@@ -1616,6 +1624,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_truncate            = bdrv_qed_truncate,
     .bdrv_getlength           = bdrv_qed_getlength,
     .bdrv_get_info            = bdrv_qed_get_info,
+    .bdrv_refresh_limits      = bdrv_qed_refresh_limits,
     .bdrv_change_backing_file = bdrv_qed_change_backing_file,
     .bdrv_invalidate_cache    = bdrv_qed_invalidate_cache,
     .bdrv_check               = bdrv_qed_check,
diff --git a/block/vmdk.c b/block/vmdk.c
index 67b5f96..99ca60f 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -428,10 +428,6 @@ static int vmdk_add_extent(BlockDriverState *bs,
     extent->l2_size = l2_size;
     extent->cluster_sectors = flat ? sectors : cluster_sectors;
 
-    if (!flat) {
-        bs->bl.write_zeroes_alignment =
-            MAX(bs->bl.write_zeroes_alignment, cluster_sectors);
-    }
     if (s->num_extents > 1) {
         extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
     } else {
@@ -902,6 +898,23 @@ fail:
     return ret;
 }
 
+
+static int vmdk_refresh_limits(BlockDriverState *bs)
+{
+    BDRVVmdkState *s = bs->opaque;
+    int i;
+
+    for (i = 0; i < s->num_extents; i++) {
+        if (!s->extents[i].flat) {
+            bs->bl.write_zeroes_alignment =
+                MAX(bs->bl.write_zeroes_alignment,
+                    s->extents[i].cluster_sectors);
+        }
+    }
+
+    return 0;
+}
+
 static int get_whole_cluster(BlockDriverState *bs,
                 VmdkExtent *extent,
                 uint64_t cluster_offset,
@@ -2013,6 +2026,7 @@ static BlockDriver bdrv_vmdk = {
     .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
     .bdrv_has_zero_init           = vmdk_has_zero_init,
     .bdrv_get_specific_info       = vmdk_get_specific_info,
+    .bdrv_refresh_limits          = vmdk_refresh_limits,
 
     .create_options               = vmdk_create_options,
 };
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 611a955..f6fa1f6 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -232,6 +232,8 @@ struct BlockDriver {
     int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
     bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);
 
+    int (*bdrv_refresh_limits)(BlockDriverState *bs);
+
     /*
      * Returns 1 if newly created images are guaranteed to contain only
      * zeros, 0 otherwise.
commit dabfa6cc2e2a06269026fcb42772894f67bd0c3e
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Fri Jan 24 14:00:43 2014 +0100

    block: Fix bdrv_commit return value
    
    bdrv_commit() could return 0 or 1 on success, depending on whether or
    not the last sector was allocated in the overlay and whether the overlay
    format had a .bdrv_make_empty callback.
    
    Most callers ignored it, but qemu-img commit would print an error
    message while the operation actually succeeded.
    
    Also clean up the handling of I/O errors to return the real error code
    instead of -EIO.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/block.c b/block.c
index 72bccb1..2106ae9 100644
--- a/block.c
+++ b/block.c
@@ -2089,13 +2089,13 @@ int bdrv_commit(BlockDriverState *bs)
             goto ro_cleanup;
         }
         if (ret) {
-            if (bdrv_read(bs, sector, buf, n) != 0) {
-                ret = -EIO;
+            ret = bdrv_read(bs, sector, buf, n);
+            if (ret < 0) {
                 goto ro_cleanup;
             }
 
-            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
-                ret = -EIO;
+            ret = bdrv_write(bs->backing_hd, sector, buf, n);
+            if (ret < 0) {
                 goto ro_cleanup;
             }
         }
@@ -2103,6 +2103,9 @@ int bdrv_commit(BlockDriverState *bs)
 
     if (drv->bdrv_make_empty) {
         ret = drv->bdrv_make_empty(bs);
+        if (ret < 0) {
+            goto ro_cleanup;
+        }
         bdrv_flush(bs);
     }
 
@@ -2110,9 +2113,11 @@ int bdrv_commit(BlockDriverState *bs)
      * Make sure all data we wrote to the backing device is actually
      * stable on disk.
      */
-    if (bs->backing_hd)
+    if (bs->backing_hd) {
         bdrv_flush(bs->backing_hd);
+    }
 
+    ret = 0;
 ro_cleanup:
     g_free(buf);
 
commit 37222900743962e146a82b7077a18c3f39859a19
Author: Jeff Cody <jcody at redhat.com>
Date:   Fri Jan 24 09:02:37 2014 -0500

    block: update block commit documentation regarding image truncation
    
    This updates the documentation for commiting snapshot images.
    Specifically, this highlights what happens when the base image
    is either smaller or larger than the snapshot image being committed.
    
    In the case of the base image being smaller, it is resized to the
    larger size of the snapshot image.  In the case of the base image
    being larger, it is not resized automatically, but once the commit
    has completed it is safe for the user to truncate the base image.
    
    Signed-off-by: Jeff Cody <jcody at redhat.com>
    Reviewed-by: Fam Zheng <famz at redhat.com>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/hmp-commands.hx b/hmp-commands.hx
index feca084..f3fc514 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -35,6 +35,11 @@ STEXI
 @item commit
 @findex commit
 Commit changes to the disk images (if -snapshot is used) or backing files.
+If the backing file is smaller than the snapshot, then the backing file will be
+resized to be the same size as the snapshot.  If the snapshot is smaller than
+the backing file, the backing file will not be truncated.  If you want the
+backing file to match the size of the smaller snapshot, you can safely truncate
+it yourself once the commit operation successfully completes.
 ETEXI
 
     {
diff --git a/qapi-schema.json b/qapi-schema.json
index e04949d..1ff607a 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1994,6 +1994,13 @@
 #                    user needs to complete the job with the block-job-complete
 #                    command after getting the ready event. (Since 2.0)
 #
+#                    If the base image is smaller than top, then the base image
+#                    will be resized to be the same size as top.  If top is
+#                    smaller than the base image, the base will not be
+#                    truncated.  If you want the base image size to match the
+#                    size of the smaller top, you can safely truncate it
+#                    yourself once the commit operation successfully completes.
+#
 #
 # @speed:  #optional the maximum speed, in bytes per second
 #
diff --git a/qemu-img.texi b/qemu-img.texi
index f86a86d..526d56a 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -142,7 +142,12 @@ it doesn't need to be specified separately in this case.
 
 @item commit [-f @var{fmt}] [-t @var{cache}] @var{filename}
 
-Commit the changes recorded in @var{filename} in its base image.
+Commit the changes recorded in @var{filename} in its base image or backing file.
+If the backing file is smaller than the snapshot, then the backing file will be
+resized to be the same size as the snapshot.  If the snapshot is smaller than
+the backing file, the backing file will not be truncated.  If you want the
+backing file to match the size of the smaller snapshot, you can safely truncate
+it yourself once the commit operation successfully completes.
 
 @item compare [-f @var{fmt}] [-F @var{fmt}] [-p] [-s] [-q] @var{filename1} @var{filename2}
 
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 4f28250..cce6b81 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -966,6 +966,45 @@ EQMP
         .mhandler.cmd_new = qmp_marshal_input_block_commit,
     },
 
+SQMP
+block-commit
+------------
+
+Live commit of data from overlay image nodes into backing nodes - i.e., writes
+data between 'top' and 'base' into 'base'.
+
+Arguments:
+
+- "device": The device's ID, must be unique (json-string)
+- "base": The file name of the backing image to write data into.
+          If not specified, this is the deepest backing image
+          (json-string, optional)
+- "top":  The file name of the backing image within the image chain,
+          which contains the topmost data to be committed down.
+
+          If top == base, that is an error.
+          If top == active, the job will not be completed by itself,
+          user needs to complete the job with the block-job-complete
+          command after getting the ready event. (Since 2.0)
+
+          If the base image is smaller than top, then the base image
+          will be resized to be the same size as top.  If top is
+          smaller than the base image, the base will not be
+          truncated.  If you want the base image size to match the
+          size of the smaller top, you can safely truncate it
+          yourself once the commit operation successfully completes.
+          (json-string)
+- "speed":  the maximum speed, in bytes per second (json-int, optional)
+
+
+Example:
+
+-> { "execute": "block-commit", "arguments": { "device": "virtio0",
+                                              "top": "/tmp/snap1.qcow2" } }
+<- { "return": {} }
+
+EQMP
+
     {
         .name       = "drive-backup",
         .args_type  = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?,"
commit 4da83585961631bfc10831dd26c4afda2a8b23e8
Author: Jeff Cody <jcody at redhat.com>
Date:   Fri Jan 24 09:02:36 2014 -0500

    block: resize backing image during active layer commit, if needed
    
    If the top image to commit is the active layer, and also larger than
    the base image, then an I/O error will likely be returned during
    block-commit.
    
    For instance, if we have a base image with a virtual size 10G, and a
    active layer image of size 20G, then committing the snapshot via
    'block-commit' will likely fail.
    
    This will automatically attempt to resize the base image, if the
    active layer image to be committed is larger.
    
    Signed-off-by: Jeff Cody <jcody at redhat.com>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/mirror.c b/block/mirror.c
index 05758e5..2a43334 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -631,11 +631,49 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
                          BlockDriverCompletionFunc *cb,
                          void *opaque, Error **errp)
 {
+    int64_t length, base_length;
+    int orig_base_flags;
+
+    orig_base_flags = bdrv_get_flags(base);
+
     if (bdrv_reopen(base, bs->open_flags, errp)) {
         return;
     }
+
+    length = bdrv_getlength(bs);
+    if (length < 0) {
+        error_setg(errp, "Unable to determine length of %s", bs->filename);
+        goto error_restore_flags;
+    }
+
+    base_length = bdrv_getlength(base);
+    if (base_length < 0) {
+        error_setg(errp, "Unable to determine length of %s", base->filename);
+        goto error_restore_flags;
+    }
+
+    if (length > base_length) {
+        if (bdrv_truncate(base, length) < 0) {
+            error_setg(errp, "Top image %s is larger than base image %s, and "
+                             "resize of base image failed",
+                             bs->filename, base->filename);
+            goto error_restore_flags;
+        }
+    }
+
     bdrv_ref(base);
     mirror_start_job(bs, base, speed, 0, 0,
                      on_error, on_error, cb, opaque, errp,
                      &commit_active_job_driver, false, base);
+    if (error_is_set(errp)) {
+        goto error_restore_flags;
+    }
+
+    return;
+
+error_restore_flags:
+    /* ignore error and errp for bdrv_reopen, because we want to propagate
+     * the original error */
+    bdrv_reopen(base, orig_base_flags, NULL);
+    return;
 }
commit 72706ea4cd38bfcb151265df0178ba21863d7518
Author: Jeff Cody <jcody at redhat.com>
Date:   Fri Jan 24 09:02:35 2014 -0500

    block: resize backing file image during offline commit, if necessary
    
    Currently, if an image file is logically larger than its backing file,
    committing it via 'qemu-img commit' will fail.
    
    For instance, if we have a base image with a virtual size 10G, and a
    snapshot image of size 20G, then committing the snapshot offline with
    'qemu-img commit' will likely fail.
    
    This will automatically attempt to resize the base image, if the
    snapshot image to be committed is larger.
    
    Signed-off-by: Jeff Cody <jcody at redhat.com>
    Reviewed-by: Fam Zheng <famz at redhat.com>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index 18c0a8d..72bccb1 100644
--- a/block.c
+++ b/block.c
@@ -2030,10 +2030,10 @@ int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
 int bdrv_commit(BlockDriverState *bs)
 {
     BlockDriver *drv = bs->drv;
-    int64_t sector, total_sectors;
+    int64_t sector, total_sectors, length, backing_length;
     int n, ro, open_flags;
     int ret = 0;
-    uint8_t *buf;
+    uint8_t *buf = NULL;
     char filename[PATH_MAX];
 
     if (!drv)
@@ -2058,7 +2058,29 @@ int bdrv_commit(BlockDriverState *bs)
         }
     }
 
-    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+    length = bdrv_getlength(bs);
+    if (length < 0) {
+        ret = length;
+        goto ro_cleanup;
+    }
+
+    backing_length = bdrv_getlength(bs->backing_hd);
+    if (backing_length < 0) {
+        ret = backing_length;
+        goto ro_cleanup;
+    }
+
+    /* If our top snapshot is larger than the backing file image,
+     * grow the backing file image if possible.  If not possible,
+     * we must return an error */
+    if (length > backing_length) {
+        ret = bdrv_truncate(bs->backing_hd, length);
+        if (ret < 0) {
+            goto ro_cleanup;
+        }
+    }
+
+    total_sectors = length >> BDRV_SECTOR_BITS;
     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
 
     for (sector = 0; sector < total_sectors; sector += n) {
commit 031fd1be5618c347f9aeb44ec294f14a541e42b2
Author: Peter Maydell <peter.maydell at linaro.org>
Date:   Fri Jan 24 14:56:17 2014 +0100

    block/curl: Implement the libcurl timer callback interface
    
    libcurl versions 7.16.0 and later have a timer callback interface which
    must be implemented in order for libcurl to make forward progress (it
    will sometimes rely on being called back on the timeout if there are
    no file descriptors registered). Implement the callback, and use a
    QEMU AIO timer to ensure we prod libcurl again when it asks us to.
    
    Based on Peter's original patch plus my fix to add curl_multi_timeout_do.
    Should compile just fine even on older versions of libcurl.
    
    I also tried copy-on-read and streaming:
    
        $ ./qemu-img create -f qcow2 -o \
             backing_file=http://download.fedoraproject.org/pub/fedora/linux/releases/20/Live/x86_64/Fedora-Live-Desktop-x86_64-20-1.iso \
             foo.qcow2 1G
        $ x86_64-softmmu/qemu-system-x86_64 \
             -drive if=none,file=foo.qcow2,copy-on-read=on,id=cd \
             -device ide-cd,drive=cd --enable-kvm -m 1024
    
    Direct http usage is probably too slow, but with copy-on-read ultimately
    the image does boot!
    
    After some time, streaming gets canceled by an EIO, which needs further
    investigation.
    
    Signed-off-by: Peter Maydell <peter.maydell at linaro.org>
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/curl.c b/block/curl.c
index a603936..a807584 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -34,6 +34,11 @@
 #define DPRINTF(fmt, ...) do { } while (0)
 #endif
 
+#if LIBCURL_VERSION_NUM >= 0x071000
+/* The multi interface timer callback was introduced in 7.16.0 */
+#define NEED_CURL_TIMER_CALLBACK
+#endif
+
 #define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \
                    CURLPROTO_FTP | CURLPROTO_FTPS | \
                    CURLPROTO_TFTP)
@@ -77,6 +82,7 @@ typedef struct CURLState
 
 typedef struct BDRVCURLState {
     CURLM *multi;
+    QEMUTimer timer;
     size_t len;
     CURLState states[CURL_NUM_STATES];
     char *url;
@@ -87,6 +93,23 @@ typedef struct BDRVCURLState {
 static void curl_clean_state(CURLState *s);
 static void curl_multi_do(void *arg);
 
+#ifdef NEED_CURL_TIMER_CALLBACK
+static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque)
+{
+    BDRVCURLState *s = opaque;
+
+    DPRINTF("CURL: timer callback timeout_ms %ld\n", timeout_ms);
+    if (timeout_ms == -1) {
+        timer_del(&s->timer);
+    } else {
+        int64_t timeout_ns = (int64_t)timeout_ms * 1000 * 1000;
+        timer_mod(&s->timer,
+                  qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ns);
+    }
+    return 0;
+}
+#endif
+
 static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action,
                         void *s, void *sp)
 {
@@ -209,20 +232,10 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len,
     return FIND_RET_NONE;
 }
 
-static void curl_multi_do(void *arg)
+static void curl_multi_read(BDRVCURLState *s)
 {
-    BDRVCURLState *s = (BDRVCURLState *)arg;
-    int running;
-    int r;
     int msgs_in_queue;
 
-    if (!s->multi)
-        return;
-
-    do {
-        r = curl_multi_socket_all(s->multi, &running);
-    } while(r == CURLM_CALL_MULTI_PERFORM);
-
     /* Try to find done transfers, so we can free the easy
      * handle again. */
     do {
@@ -266,6 +279,41 @@ static void curl_multi_do(void *arg)
     } while(msgs_in_queue);
 }
 
+static void curl_multi_do(void *arg)
+{
+    BDRVCURLState *s = (BDRVCURLState *)arg;
+    int running;
+    int r;
+
+    if (!s->multi) {
+        return;
+    }
+
+    do {
+        r = curl_multi_socket_all(s->multi, &running);
+    } while(r == CURLM_CALL_MULTI_PERFORM);
+
+    curl_multi_read(s);
+}
+
+static void curl_multi_timeout_do(void *arg)
+{
+#ifdef NEED_CURL_TIMER_CALLBACK
+    BDRVCURLState *s = (BDRVCURLState *)arg;
+    int running;
+
+    if (!s->multi) {
+        return;
+    }
+
+    curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
+
+    curl_multi_read(s);
+#else
+    abort();
+#endif
+}
+
 static CURLState *curl_init_state(BDRVCURLState *s)
 {
     CURLState *state = NULL;
@@ -473,12 +521,20 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
     curl_easy_cleanup(state->curl);
     state->curl = NULL;
 
+    aio_timer_init(bdrv_get_aio_context(bs), &s->timer,
+                   QEMU_CLOCK_REALTIME, SCALE_NS,
+                   curl_multi_timeout_do, s);
+
     // Now we know the file exists and its size, so let's
     // initialize the multi interface!
 
     s->multi = curl_multi_init();
     curl_multi_setopt(s->multi, CURLMOPT_SOCKETDATA, s);
     curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb);
+#ifdef NEED_CURL_TIMER_CALLBACK
+    curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s);
+    curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb);
+#endif
     curl_multi_do(s);
 
     qemu_opts_del(opts);
@@ -597,6 +653,9 @@ static void curl_close(BlockDriverState *bs)
     }
     if (s->multi)
         curl_multi_cleanup(s->multi);
+
+    timer_del(&s->timer);
+
     g_free(s->url);
 }
 
commit 0901f67ecdb74d9ba1451e3b4367194cd43f96b4
Author: Benoît Canet <benoit at irqsave.net>
Date:   Thu Jan 23 21:31:38 2014 +0100

    qmp: Allow to take external snapshots on bs graphs node.
    
    Signed-off-by: Benoit Canet <benoit at irqsave.net>
    Reviewed-by: Fam Zheng <famz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/blockdev.c b/blockdev.c
index d192370..36ceece 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -947,14 +947,22 @@ static void blockdev_do_action(int kind, void *data, Error **errp)
     qmp_transaction(&list, errp);
 }
 
-void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file,
+void qmp_blockdev_snapshot_sync(bool has_device, const char *device,
+                                bool has_node_name, const char *node_name,
+                                const char *snapshot_file,
+                                bool has_snapshot_node_name,
+                                const char *snapshot_node_name,
                                 bool has_format, const char *format,
-                                bool has_mode, enum NewImageMode mode,
-                                Error **errp)
+                                bool has_mode, NewImageMode mode, Error **errp)
 {
     BlockdevSnapshot snapshot = {
+        .has_device = has_device,
         .device = (char *) device,
+        .has_node_name = has_node_name,
+        .node_name = (char *) node_name,
         .snapshot_file = (char *) snapshot_file,
+        .has_snapshot_node_name = has_snapshot_node_name,
+        .snapshot_node_name = (char *) snapshot_node_name,
         .has_format = has_format,
         .format = (char *) format,
         .has_mode = has_mode,
@@ -1192,8 +1200,14 @@ static void external_snapshot_prepare(BlkTransactionState *common,
 {
     BlockDriver *drv;
     int flags, ret;
+    QDict *options = NULL;
     Error *local_err = NULL;
+    bool has_device = false;
     const char *device;
+    bool has_node_name = false;
+    const char *node_name;
+    bool has_snapshot_node_name = false;
+    const char *snapshot_node_name;
     const char *new_image_file;
     const char *format = "qcow2";
     enum NewImageMode mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS;
@@ -1204,7 +1218,14 @@ static void external_snapshot_prepare(BlkTransactionState *common,
     /* get parameters */
     g_assert(action->kind == TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC);
 
+    has_device = action->blockdev_snapshot_sync->has_device;
     device = action->blockdev_snapshot_sync->device;
+    has_node_name = action->blockdev_snapshot_sync->has_node_name;
+    node_name = action->blockdev_snapshot_sync->node_name;
+    has_snapshot_node_name =
+        action->blockdev_snapshot_sync->has_snapshot_node_name;
+    snapshot_node_name = action->blockdev_snapshot_sync->snapshot_node_name;
+
     new_image_file = action->blockdev_snapshot_sync->snapshot_file;
     if (action->blockdev_snapshot_sync->has_format) {
         format = action->blockdev_snapshot_sync->format;
@@ -1220,9 +1241,21 @@ static void external_snapshot_prepare(BlkTransactionState *common,
         return;
     }
 
-    state->old_bs = bdrv_find(device);
-    if (!state->old_bs) {
-        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+    state->old_bs = bdrv_lookup_bs(has_device ? device : NULL,
+                                   has_node_name ? node_name : NULL,
+                                   &local_err);
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    if (has_node_name && !has_snapshot_node_name) {
+        error_setg(errp, "New snapshot node name missing");
+        return;
+    }
+
+    if (has_snapshot_node_name && bdrv_find_node(snapshot_node_name)) {
+        error_setg(errp, "New snapshot node name already existing");
         return;
     }
 
@@ -1262,15 +1295,23 @@ static void external_snapshot_prepare(BlkTransactionState *common,
         }
     }
 
+    if (has_snapshot_node_name) {
+        options = qdict_new();
+        qdict_put(options, "node-name",
+                  qstring_from_str(snapshot_node_name));
+    }
+
     /* We will manually add the backing_hd field to the bs later */
     state->new_bs = bdrv_new("");
     /* TODO Inherit bs->options or only take explicit options with an
      * extended QMP command? */
-    ret = bdrv_open(state->new_bs, new_image_file, NULL,
+    ret = bdrv_open(state->new_bs, new_image_file, options,
                     flags | BDRV_O_NO_BACKING, drv, &local_err);
     if (ret != 0) {
         error_propagate(errp, local_err);
     }
+
+    QDECREF(options);
 }
 
 static void external_snapshot_commit(BlkTransactionState *common)
diff --git a/hmp.c b/hmp.c
index 66c8d7e..1af0809 100644
--- a/hmp.c
+++ b/hmp.c
@@ -972,7 +972,9 @@ void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict)
     }
 
     mode = reuse ? NEW_IMAGE_MODE_EXISTING : NEW_IMAGE_MODE_ABSOLUTE_PATHS;
-    qmp_blockdev_snapshot_sync(device, filename, !!format, format,
+    qmp_blockdev_snapshot_sync(true, device, false, NULL,
+                               filename, false, NULL,
+                               !!format, format,
                                true, mode, &errp);
     hmp_handle_error(mon, &errp);
 }
diff --git a/qapi-schema.json b/qapi-schema.json
index f21243a..e04949d 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1761,18 +1761,25 @@
 ##
 # @BlockdevSnapshot
 #
-# @device:  the name of the device to generate the snapshot from.
+# Either @device or @node-name must be set but not both.
+#
+# @device: #optional the name of the device to generate the snapshot from.
+#
+# @node-name: #optional graph node name to generate the snapshot from (Since 2.0)
 #
 # @snapshot-file: the target of the new image. A new file will be created.
 #
+# @snapshot-node-name: #optional the graph node name of the new image (Since 2.0)
+#
 # @format: #optional the format of the snapshot image, default is 'qcow2'.
 #
 # @mode: #optional whether and how QEMU should create a new image, default is
 #        'absolute-paths'.
 ##
 { 'type': 'BlockdevSnapshot',
-  'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str',
-            '*mode': 'NewImageMode' } }
+  'data': { '*device': 'str', '*node-name': 'str',
+            'snapshot-file': 'str', '*snapshot-node-name': 'str',
+            '*format': 'str', '*mode': 'NewImageMode' } }
 
 ##
 # @BlockdevSnapshotInternal
diff --git a/qmp-commands.hx b/qmp-commands.hx
index a45f26c..4f28250 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -1089,7 +1089,9 @@ actions array:
     - "data": a dictionary.  The contents depend on the value
       of "type".  When "type" is "blockdev-snapshot-sync":
       - "device": device name to snapshot (json-string)
+      - "node-name": graph node name to snapshot (json-string)
       - "snapshot-file": name of new image file (json-string)
+      - "snapshot-node-name": graph node name of the new snapshot (json-string)
       - "format": format of new image (json-string, optional)
       - "mode": whether and how QEMU should create the snapshot file
         (NewImageMode, optional, default "absolute-paths")
@@ -1104,6 +1106,11 @@ Example:
          { 'type': 'blockdev-snapshot-sync', 'data' : { "device": "ide-hd0",
                                          "snapshot-file": "/some/place/my-image",
                                          "format": "qcow2" } },
+         { 'type': 'blockdev-snapshot-sync', 'data' : { "node-name": "myfile",
+                                         "snapshot-file": "/some/place/my-image2",
+                                         "snapshot-node-name": "node3432",
+                                         "mode": "existing",
+                                         "format": "qcow2" } },
          { 'type': 'blockdev-snapshot-sync', 'data' : { "device": "ide-hd1",
                                          "snapshot-file": "/some/place/my-image2",
                                          "mode": "existing",
@@ -1117,7 +1124,7 @@ EQMP
 
     {
         .name       = "blockdev-snapshot-sync",
-        .args_type  = "device:B,snapshot-file:s,format:s?,mode:s?",
+        .args_type  = "device:s?,node-name:s?,snapshot-file:s,snapshot-node-name:s?,format:s?,mode:s?",
         .mhandler.cmd_new = qmp_marshal_input_blockdev_snapshot_sync,
     },
 
@@ -1134,7 +1141,9 @@ snapshot image, default is qcow2.
 Arguments:
 
 - "device": device name to snapshot (json-string)
+- "node-name": graph node name to snapshot (json-string)
 - "snapshot-file": name of new image file (json-string)
+- "snapshot-node-name": graph node name of the new snapshot (json-string)
 - "mode": whether and how QEMU should create the snapshot file
   (NewImageMode, optional, default "absolute-paths")
 - "format": format of new image (json-string, optional)
commit 3b1dbd11a60d75e99af5fc9b73c34f4af9d4f510
Author: Benoît Canet <benoit at irqsave.net>
Date:   Thu Jan 23 21:31:37 2014 +0100

    qmp: Allow block_resize to manipulate bs graph nodes.
    
    Signed-off-by: Benoit Canet <benoit at irqsave.net>
    Reviewed-by: Fam Zheng <famz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/blockdev.c b/blockdev.c
index 32a356e..d192370 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1683,14 +1683,24 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
     return 0;
 }
 
-void qmp_block_resize(const char *device, int64_t size, Error **errp)
+void qmp_block_resize(bool has_device, const char *device,
+                      bool has_node_name, const char *node_name,
+                      int64_t size, Error **errp)
 {
+    Error *local_err = NULL;
     BlockDriverState *bs;
     int ret;
 
-    bs = bdrv_find(device);
-    if (!bs) {
-        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+    bs = bdrv_lookup_bs(has_device ? device : NULL,
+                        has_node_name ? node_name : NULL,
+                        &local_err);
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    if (!bdrv_is_first_non_filter(bs)) {
+        error_set(errp, QERR_FEATURE_DISABLED, "resize");
         return;
     }
 
diff --git a/hmp.c b/hmp.c
index 5804a5a..66c8d7e 100644
--- a/hmp.c
+++ b/hmp.c
@@ -893,7 +893,7 @@ void hmp_block_resize(Monitor *mon, const QDict *qdict)
     int64_t size = qdict_get_int(qdict, "size");
     Error *errp = NULL;
 
-    qmp_block_resize(device, size, &errp);
+    qmp_block_resize(true, device, false, NULL, size, &errp);
     hmp_handle_error(mon, &errp);
 }
 
diff --git a/qapi-schema.json b/qapi-schema.json
index 3f48ae9..f21243a 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1724,7 +1724,11 @@
 #
 # Resize a block image while a guest is running.
 #
-# @device:  the name of the device to get the image resized
+# Either @device or @node-name must be set but not both.
+#
+# @device: #optional the name of the device to get the image resized
+#
+# @node-name: #optional graph node name to get the image resized (Since 2.0)
 #
 # @size:  new image size in bytes
 #
@@ -1733,7 +1737,9 @@
 #
 # Since: 0.14.0
 ##
-{ 'command': 'block_resize', 'data': { 'device': 'str', 'size': 'int' }}
+{ 'command': 'block_resize', 'data': { '*device': 'str',
+                                       '*node-name': 'str',
+                                       'size': 'int' }}
 
 ##
 # @NewImageMode
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 5492eb0..a45f26c 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -931,7 +931,7 @@ EQMP
 
     {
         .name       = "block_resize",
-        .args_type  = "device:B,size:o",
+        .args_type  = "device:s?,node-name:s?,size:o",
         .mhandler.cmd_new = qmp_marshal_input_block_resize,
     },
 
@@ -944,6 +944,7 @@ Resize a block image while a guest is running.
 Arguments:
 
 - "device": the device's ID, must be unique (json-string)
+- "node-name": the node name in the block driver state graph (json-string)
 - "size": new size
 
 Example:
commit 212a5a8f095de9a1624de6b4a589d60688b02747
Author: Benoît Canet <benoit at irqsave.net>
Date:   Thu Jan 23 21:31:36 2014 +0100

    block: Create authorizations mechanism for external snapshot and resize.
    
    Signed-off-by: Benoit Canet <benoit at irqsave.net>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index 064df90..18c0a8d 100644
--- a/block.c
+++ b/block.c
@@ -5094,21 +5094,68 @@ int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options)
     return bs->drv->bdrv_amend_options(bs, options);
 }
 
-ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs)
+/* Used to recurse on single child block filters.
+ * Single child block filter will store their child in bs->file.
+ */
+bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
+                                      BlockDriverState *candidate)
 {
-    if (bs->drv->bdrv_check_ext_snapshot) {
-        return bs->drv->bdrv_check_ext_snapshot(bs);
+    if (!bs->drv) {
+        return false;
+    }
+
+    if (!bs->drv->authorizations[BS_IS_A_FILTER]) {
+        if (bs == candidate) {
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) {
+        return false;
     }
 
-    if (bs->file && bs->file->drv && bs->file->drv->bdrv_check_ext_snapshot) {
-        return bs->file->drv->bdrv_check_ext_snapshot(bs);
+    if (!bs->file) {
+        return false;
+    }
+
+    return bdrv_recurse_is_first_non_filter(bs->file, candidate);
+}
+
+bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
+                                      BlockDriverState *candidate)
+{
+    if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) {
+        return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate);
     }
 
-    /* external snapshots are allowed by default */
-    return EXT_SNAPSHOT_ALLOWED;
+    return bdrv_generic_is_first_non_filter(bs, candidate);
 }
 
-ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs)
+/* This function checks if the candidate is the first non filter bs down it's
+ * bs chain. Since we don't have pointers to parents it explore all bs chains
+ * from the top. Some filters can choose not to pass down the recursion.
+ */
+bool bdrv_is_first_non_filter(BlockDriverState *candidate)
 {
-    return EXT_SNAPSHOT_FORBIDDEN;
+    BlockDriverState *bs;
+
+    /* walk down the bs forest recursively */
+    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
+        bool perm;
+
+        if (!bs->file) {
+            continue;
+        }
+
+        perm = bdrv_recurse_is_first_non_filter(bs->file, candidate);
+
+        /* candidate is the first non filter */
+        if (perm) {
+            return true;
+        }
+    }
+
+    return false;
 }
diff --git a/block/blkverify.c b/block/blkverify.c
index a2e8f5f..cfcbcf4 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -404,7 +404,7 @@ static BlockDriver bdrv_blkverify = {
     .bdrv_aio_writev        = blkverify_aio_writev,
     .bdrv_aio_flush         = blkverify_aio_flush,
 
-    .bdrv_check_ext_snapshot = bdrv_check_ext_snapshot_forbidden,
+    .authorizations         = { true, false },
 };
 
 static void bdrv_blkverify_init(void)
diff --git a/blockdev.c b/blockdev.c
index 2f4065e..32a356e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1243,7 +1243,7 @@ static void external_snapshot_prepare(BlkTransactionState *common,
         }
     }
 
-    if (bdrv_check_ext_snapshot(state->old_bs) != EXT_SNAPSHOT_ALLOWED) {
+    if (!bdrv_is_first_non_filter(state->old_bs)) {
         error_set(errp, QERR_FEATURE_DISABLED, "snapshot");
         return;
     }
diff --git a/include/block/block.h b/include/block/block.h
index b4a77e6..59d9f12 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -287,16 +287,16 @@ int bdrv_amend_options(BlockDriverState *bs_new, QEMUOptionParameter *options);
 /* external snapshots */
 
 typedef enum {
-    EXT_SNAPSHOT_ALLOWED,
-    EXT_SNAPSHOT_FORBIDDEN,
-} ExtSnapshotPerm;
-
-/* return EXT_SNAPSHOT_ALLOWED if external snapshot is allowed
- * return EXT_SNAPSHOT_FORBIDDEN if external snapshot is forbidden
- */
-ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs);
-/* helper used to forbid external snapshots like in blkverify */
-ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs);
+    BS_IS_A_FILTER,
+    BS_FILTER_PASS_DOWN,
+    BS_AUTHORIZATION_COUNT,
+} BsAuthorization;
+
+bool bdrv_generic_is_first_non_filter(BlockDriverState *bs,
+                                      BlockDriverState *candidate);
+bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
+                                      BlockDriverState *candidate);
+bool bdrv_is_first_non_filter(BlockDriverState *candidate);
 
 /* async block I/O */
 typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector,
diff --git a/include/block/block_int.h b/include/block/block_int.h
index f3f518c..611a955 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -69,10 +69,16 @@ struct BlockDriver {
     const char *format_name;
     int instance_size;
 
-    /* if not defined external snapshots are allowed
-     * future block filters will query their children to build the response
+    /* this table of boolean contains authorizations for the block operations */
+    bool authorizations[BS_AUTHORIZATION_COUNT];
+    /* for snapshots complex block filter like Quorum can implement the
+     * following recursive callback instead of BS_IS_A_FILTER.
+     * It's purpose is to recurse on the filter children while calling
+     * bdrv_recurse_is_first_non_filter on them.
+     * For a sample implementation look in the future Quorum block filter.
      */
-    ExtSnapshotPerm (*bdrv_check_ext_snapshot)(BlockDriverState *bs);
+    bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs,
+                                             BlockDriverState *candidate);
 
     int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
     int (*bdrv_probe_device)(const char *filename);
commit 12d3ba821da9f8a039240a8a1bc01e27a12f9c22
Author: Benoît Canet <benoit at irqsave.net>
Date:   Thu Jan 23 21:31:35 2014 +0100

    qmp: Allow to change password on named block driver states.
    
    Signed-off-by: Benoit Canet <benoit at irqsave.net>
    Reviewed-by: Fam Zheng <famz at redhat.com>
    
    There was two candidate ways to implement named node manipulation:
    
    1)
    { 'command': 'block_passwd', 'data': {'*device': 'str',
                                          '*node-name': 'str', 'password': 'str'}
    }
    
    2)
    
    { 'command': 'block_passwd', 'data': {'device': 'str',
                                          '*device-is-node': 'bool',
                                          'password': 'str'} }
    
    Luiz proposed 1 and says 2 was an abuse of the QMP interface and proposed to
    rewrite the QMP block interface for 2.0.
    
    Luiz does not like in 1 the fact that 2 fields are optional but one of them must
    be specified leading to an abuse of the QMP semantic.
    
    Kevin argumented that 2 what a clear abuse of the device field and would not be
    practical when reading fast some log file because the user would read "device"
    and think that a device is manipulated when it's in fact a node name.
    Documentation of 1 make it pretty clear what to do for the user.
    
    Kevin argued that all bs are node including devices ones so 2 does not make
    sense.
    
    Kevin also argued that rewriting the QMP block interface would not make disapear
    the current one.
    
    Kevin pushed the argument that making the QAPI generator compatible with the
    semantic of the operation would need a rewrite that no one has done yet.
    
    A vote has been done on the list to elect the version to use and 1 won.
    
    For reference the complete thread is:
    "[Qemu-devel] [PATCH V4 4/7] qmp: Allow to change password on names block driver
    states."
    
    Signed-off-by: Benoit Canet <benoit at irqsave.net>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index 5f6308d..064df90 100644
--- a/block.c
+++ b/block.c
@@ -3309,6 +3309,38 @@ BlockDeviceInfoList *bdrv_named_nodes_list(void)
     return list;
 }
 
+BlockDriverState *bdrv_lookup_bs(const char *device,
+                                 const char *node_name,
+                                 Error **errp)
+{
+    BlockDriverState *bs = NULL;
+
+    if ((!device && !node_name) || (device && node_name)) {
+        error_setg(errp, "Use either device or node-name but not both");
+        return NULL;
+    }
+
+    if (device) {
+        bs = bdrv_find(device);
+
+        if (!bs) {
+            error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+            return NULL;
+        }
+
+        return bs;
+    }
+
+    bs = bdrv_find_node(node_name);
+
+    if (!bs) {
+        error_set(errp, QERR_DEVICE_NOT_FOUND, node_name);
+        return NULL;
+    }
+
+    return bs;
+}
+
 BlockDriverState *bdrv_next(BlockDriverState *bs)
 {
     if (!bs) {
diff --git a/blockdev.c b/blockdev.c
index 0bfe380..2f4065e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1481,14 +1481,19 @@ void qmp_eject(const char *device, bool has_force, bool force, Error **errp)
     eject_device(bs, force, errp);
 }
 
-void qmp_block_passwd(const char *device, const char *password, Error **errp)
+void qmp_block_passwd(bool has_device, const char *device,
+                      bool has_node_name, const char *node_name,
+                      const char *password, Error **errp)
 {
+    Error *local_err = NULL;
     BlockDriverState *bs;
     int err;
 
-    bs = bdrv_find(device);
-    if (!bs) {
-        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+    bs = bdrv_lookup_bs(has_device ? device : NULL,
+                        has_node_name ? node_name : NULL,
+                        &local_err);
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
         return;
     }
 
diff --git a/hmp.c b/hmp.c
index 468f97d..5804a5a 100644
--- a/hmp.c
+++ b/hmp.c
@@ -871,7 +871,7 @@ void hmp_block_passwd(Monitor *mon, const QDict *qdict)
     const char *password = qdict_get_str(qdict, "password");
     Error *errp = NULL;
 
-    qmp_block_passwd(device, password, &errp);
+    qmp_block_passwd(true, device, false, NULL, password, &errp);
     hmp_handle_error(mon, &errp);
 }
 
diff --git a/include/block/block.h b/include/block/block.h
index 13654ed..b4a77e6 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -380,6 +380,9 @@ const char *bdrv_get_format_name(BlockDriverState *bs);
 BlockDriverState *bdrv_find(const char *name);
 BlockDriverState *bdrv_find_node(const char *node_name);
 BlockDeviceInfoList *bdrv_named_nodes_list(void);
+BlockDriverState *bdrv_lookup_bs(const char *device,
+                                 const char *node_name,
+                                 Error **errp);
 BlockDriverState *bdrv_next(BlockDriverState *bs);
 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs),
                   void *opaque);
diff --git a/qapi-schema.json b/qapi-schema.json
index b619f01..3f48ae9 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1678,7 +1678,11 @@
 # determine which ones are encrypted, set the passwords with this command, and
 # then start the guest with the @cont command.
 #
-# @device:   the name of the device to set the password on
+# Either @device or @node-name must be set but not both.
+#
+# @device: #optional the name of the block backend device to set the password on
+#
+# @node-name: #optional graph node name to set the password on (Since 2.0)
 #
 # @password: the password to use for the device
 #
@@ -1692,7 +1696,8 @@
 #
 # Since: 0.14.0
 ##
-{ 'command': 'block_passwd', 'data': {'device': 'str', 'password': 'str'} }
+{ 'command': 'block_passwd', 'data': {'*device': 'str',
+                                      '*node-name': 'str', 'password': 'str'} }
 
 ##
 # @balloon:
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 11b44c5..5492eb0 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -1503,7 +1503,7 @@ EQMP
 
     {
         .name       = "block_passwd",
-        .args_type  = "device:B,password:s",
+        .args_type  = "device:s?,node-name:s?,password:s",
         .mhandler.cmd_new = qmp_marshal_input_block_passwd,
     },
 
@@ -1516,6 +1516,7 @@ Set the password of encrypted block devices.
 Arguments:
 
 - "device": device name (json-string)
+- "node-name": name in the block driver state graph (json-string)
 - "password": password (json-string)
 
 Example:
commit c13163fba151f0be5176eaf55907bc1dbff3a1d4
Author: Benoît Canet <benoit at irqsave.net>
Date:   Thu Jan 23 21:31:34 2014 +0100

    qmp: Add QMP query-named-block-nodes to list the named BlockDriverState nodes.
    
    Signed-off-by: Benoit Canet <benoit at irqsave.net>
    Reviewed-by: Fam Zheng <famz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index f043669..5f6308d 100644
--- a/block.c
+++ b/block.c
@@ -32,6 +32,7 @@
 #include "sysemu/sysemu.h"
 #include "qemu/notify.h"
 #include "block/coroutine.h"
+#include "block/qapi.h"
 #include "qmp-commands.h"
 #include "qemu/timer.h"
 
@@ -3291,6 +3292,23 @@ BlockDriverState *bdrv_find_node(const char *node_name)
     return NULL;
 }
 
+/* Put this QMP function here so it can access the static graph_bdrv_states. */
+BlockDeviceInfoList *bdrv_named_nodes_list(void)
+{
+    BlockDeviceInfoList *list, *entry;
+    BlockDriverState *bs;
+
+    list = NULL;
+    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
+        entry = g_malloc0(sizeof(*entry));
+        entry->value = bdrv_block_device_info(bs);
+        entry->next = list;
+        list = entry;
+    }
+
+    return list;
+}
+
 BlockDriverState *bdrv_next(BlockDriverState *bs)
 {
     if (!bs) {
diff --git a/block/qapi.c b/block/qapi.c
index 98b1b83..8f4134b 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -29,6 +29,60 @@
 #include "qapi/qmp-output-visitor.h"
 #include "qapi/qmp/types.h"
 
+BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
+{
+    BlockDeviceInfo *info = g_malloc0(sizeof(*info));
+
+    info->file                   = g_strdup(bs->filename);
+    info->ro                     = bs->read_only;
+    info->drv                    = g_strdup(bs->drv->format_name);
+    info->encrypted              = bs->encrypted;
+    info->encryption_key_missing = bdrv_key_required(bs);
+
+    if (bs->node_name[0]) {
+        info->has_node_name = true;
+        info->node_name = g_strdup(bs->node_name);
+    }
+
+    if (bs->backing_file[0]) {
+        info->has_backing_file = true;
+        info->backing_file = g_strdup(bs->backing_file);
+    }
+
+    info->backing_file_depth = bdrv_get_backing_file_depth(bs);
+
+    if (bs->io_limits_enabled) {
+        ThrottleConfig cfg;
+        throttle_get_config(&bs->throttle_state, &cfg);
+        info->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
+        info->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg;
+        info->bps_wr  = cfg.buckets[THROTTLE_BPS_WRITE].avg;
+
+        info->iops    = cfg.buckets[THROTTLE_OPS_TOTAL].avg;
+        info->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg;
+        info->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg;
+
+        info->has_bps_max     = cfg.buckets[THROTTLE_BPS_TOTAL].max;
+        info->bps_max         = cfg.buckets[THROTTLE_BPS_TOTAL].max;
+        info->has_bps_rd_max  = cfg.buckets[THROTTLE_BPS_READ].max;
+        info->bps_rd_max      = cfg.buckets[THROTTLE_BPS_READ].max;
+        info->has_bps_wr_max  = cfg.buckets[THROTTLE_BPS_WRITE].max;
+        info->bps_wr_max      = cfg.buckets[THROTTLE_BPS_WRITE].max;
+
+        info->has_iops_max    = cfg.buckets[THROTTLE_OPS_TOTAL].max;
+        info->iops_max        = cfg.buckets[THROTTLE_OPS_TOTAL].max;
+        info->has_iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max;
+        info->iops_rd_max     = cfg.buckets[THROTTLE_OPS_READ].max;
+        info->has_iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max;
+        info->iops_wr_max     = cfg.buckets[THROTTLE_OPS_WRITE].max;
+
+        info->has_iops_size = cfg.op_size;
+        info->iops_size = cfg.op_size;
+    }
+
+    return info;
+}
+
 /*
  * Returns 0 on success, with *p_list either set to describe snapshot
  * information, or NULL because there are no snapshots.  Returns -errno on
@@ -211,60 +265,7 @@ void bdrv_query_info(BlockDriverState *bs,
 
     if (bs->drv) {
         info->has_inserted = true;
-        info->inserted = g_malloc0(sizeof(*info->inserted));
-        info->inserted->file = g_strdup(bs->filename);
-        info->inserted->ro = bs->read_only;
-        info->inserted->drv = g_strdup(bs->drv->format_name);
-        info->inserted->encrypted = bs->encrypted;
-        info->inserted->encryption_key_missing = bdrv_key_required(bs);
-
-        if (bs->backing_file[0]) {
-            info->inserted->has_backing_file = true;
-            info->inserted->backing_file = g_strdup(bs->backing_file);
-        }
-
-        info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);
-
-        if (bs->io_limits_enabled) {
-            ThrottleConfig cfg;
-            throttle_get_config(&bs->throttle_state, &cfg);
-            info->inserted->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
-            info->inserted->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg;
-            info->inserted->bps_wr  = cfg.buckets[THROTTLE_BPS_WRITE].avg;
-
-            info->inserted->iops    = cfg.buckets[THROTTLE_OPS_TOTAL].avg;
-            info->inserted->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg;
-            info->inserted->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg;
-
-            info->inserted->has_bps_max     =
-                cfg.buckets[THROTTLE_BPS_TOTAL].max;
-            info->inserted->bps_max         =
-                cfg.buckets[THROTTLE_BPS_TOTAL].max;
-            info->inserted->has_bps_rd_max  =
-                cfg.buckets[THROTTLE_BPS_READ].max;
-            info->inserted->bps_rd_max      =
-                cfg.buckets[THROTTLE_BPS_READ].max;
-            info->inserted->has_bps_wr_max  =
-                cfg.buckets[THROTTLE_BPS_WRITE].max;
-            info->inserted->bps_wr_max      =
-                cfg.buckets[THROTTLE_BPS_WRITE].max;
-
-            info->inserted->has_iops_max    =
-                cfg.buckets[THROTTLE_OPS_TOTAL].max;
-            info->inserted->iops_max        =
-                cfg.buckets[THROTTLE_OPS_TOTAL].max;
-            info->inserted->has_iops_rd_max =
-                cfg.buckets[THROTTLE_OPS_READ].max;
-            info->inserted->iops_rd_max     =
-                cfg.buckets[THROTTLE_OPS_READ].max;
-            info->inserted->has_iops_wr_max =
-                cfg.buckets[THROTTLE_OPS_WRITE].max;
-            info->inserted->iops_wr_max     =
-                cfg.buckets[THROTTLE_OPS_WRITE].max;
-
-            info->inserted->has_iops_size = cfg.op_size;
-            info->inserted->iops_size = cfg.op_size;
-        }
+        info->inserted = bdrv_block_device_info(bs);
 
         bs0 = bs;
         p_image_info = &info->inserted->image;
diff --git a/blockdev.c b/blockdev.c
index 386109a..0bfe380 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1952,6 +1952,11 @@ void qmp_drive_backup(const char *device, const char *target,
     }
 }
 
+BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp)
+{
+    return bdrv_named_nodes_list();
+}
+
 #define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)
 
 void qmp_drive_mirror(const char *device, const char *target,
diff --git a/include/block/block.h b/include/block/block.h
index 501b555..13654ed 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -379,6 +379,7 @@ void bdrv_eject(BlockDriverState *bs, bool eject_flag);
 const char *bdrv_get_format_name(BlockDriverState *bs);
 BlockDriverState *bdrv_find(const char *name);
 BlockDriverState *bdrv_find_node(const char *node_name);
+BlockDeviceInfoList *bdrv_named_nodes_list(void);
 BlockDriverState *bdrv_next(BlockDriverState *bs);
 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs),
                   void *opaque);
diff --git a/include/block/qapi.h b/include/block/qapi.h
index 9518ee4..e92c00d 100644
--- a/include/block/qapi.h
+++ b/include/block/qapi.h
@@ -29,6 +29,7 @@
 #include "block/block.h"
 #include "block/snapshot.h"
 
+BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs);
 int bdrv_query_snapshot_info_list(BlockDriverState *bs,
                                   SnapshotInfoList **p_list,
                                   Error **errp);
diff --git a/qapi-schema.json b/qapi-schema.json
index 26e370b..b619f01 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -810,6 +810,8 @@
 #
 # @file: the filename of the backing device
 #
+# @node-name: #optional the name of the block driver node (Since 2.0)
+#
 # @ro: true if the backing device was open read-only
 #
 # @drv: the name of the block format used to open the backing device. As of
@@ -857,10 +859,9 @@
 #
 # Since: 0.14.0
 #
-# Notes: This interface is only found in @BlockInfo.
 ##
 { 'type': 'BlockDeviceInfo',
-  'data': { 'file': 'str', 'ro': 'bool', 'drv': 'str',
+  'data': { 'file': 'str', '*node-name': 'str', 'ro': 'bool', 'drv': 'str',
             '*backing_file': 'str', 'backing_file_depth': 'int',
             'encrypted': 'bool', 'encryption_key_missing': 'bool',
             'bps': 'int', 'bps_rd': 'int', 'bps_wr': 'int',
@@ -2011,6 +2012,17 @@
 { 'command': 'drive-backup', 'data': 'DriveBackup' }
 
 ##
+# @query-named-block-nodes
+#
+# Get the named block driver list
+#
+# Returns: the list of BlockDeviceInfo
+#
+# Since 2.0
+##
+{ 'command': 'query-named-block-nodes', 'returns': [ 'BlockDeviceInfo' ] }
+
+##
 # @drive-mirror
 #
 # Start mirroring a block device's writes to a new destination.
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 02cc815..11b44c5 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -3346,3 +3346,64 @@ Example (2):
 <- { "return": {} }
 
 EQMP
+
+    {
+        .name       = "query-named-block-nodes",
+        .args_type  = "",
+        .mhandler.cmd_new = qmp_marshal_input_query_named_block_nodes,
+    },
+
+SQMP
+ at query-named-block-nodes
+------------------------
+
+Return a list of BlockDeviceInfo for all the named block driver nodes
+
+Example:
+
+-> { "execute": "query-named-block-nodes" }
+<- { "return": [ { "ro":false,
+                   "drv":"qcow2",
+                   "encrypted":false,
+                   "file":"disks/test.qcow2",
+                   "node-name": "my-node",
+                   "backing_file_depth":1,
+                   "bps":1000000,
+                   "bps_rd":0,
+                   "bps_wr":0,
+                   "iops":1000000,
+                   "iops_rd":0,
+                   "iops_wr":0,
+                   "bps_max": 8000000,
+                   "bps_rd_max": 0,
+                   "bps_wr_max": 0,
+                   "iops_max": 0,
+                   "iops_rd_max": 0,
+                   "iops_wr_max": 0,
+                   "iops_size": 0,
+                   "image":{
+                      "filename":"disks/test.qcow2",
+                      "format":"qcow2",
+                      "virtual-size":2048000,
+                      "backing_file":"base.qcow2",
+                      "full-backing-filename":"disks/base.qcow2",
+                      "backing-filename-format:"qcow2",
+                      "snapshots":[
+                         {
+                            "id": "1",
+                            "name": "snapshot1",
+                            "vm-state-size": 0,
+                            "date-sec": 10000200,
+                            "date-nsec": 12,
+                            "vm-clock-sec": 206,
+                            "vm-clock-nsec": 30
+                         }
+                      ],
+                      "backing-image":{
+                          "filename":"disks/base.qcow2",
+                          "format":"qcow2",
+                          "virtual-size":2048000
+                      }
+                   } } ] }
+
+EQMP
commit 6913c0c2ce00c0e886b2bd20b05073090fa5308a
Author: Benoît Canet <benoit at irqsave.net>
Date:   Thu Jan 23 21:31:33 2014 +0100

    block: Allow the user to define "node-name" option both on command line and QMP.
    
    Signed-off-by: Benoit Canet <benoit at irqsave.net>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index 8562685..f043669 100644
--- a/block.c
+++ b/block.c
@@ -735,6 +735,33 @@ static int bdrv_open_flags(BlockDriverState *bs, int flags)
     return open_flags;
 }
 
+static int bdrv_assign_node_name(BlockDriverState *bs,
+                                 const char *node_name,
+                                 Error **errp)
+{
+    if (!node_name) {
+        return 0;
+    }
+
+    /* empty string node name is invalid */
+    if (node_name[0] == '\0') {
+        error_setg(errp, "Empty node name");
+        return -EINVAL;
+    }
+
+    /* takes care of avoiding duplicates node names */
+    if (bdrv_find_node(node_name)) {
+        error_setg(errp, "Duplicate node name");
+        return -EINVAL;
+    }
+
+    /* copy node name into the bs and insert it into the graph list */
+    pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
+    QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
+
+    return 0;
+}
+
 /*
  * Common part for opening disk images and files
  *
@@ -745,6 +772,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
 {
     int ret, open_flags;
     const char *filename;
+    const char *node_name = NULL;
     Error *local_err = NULL;
 
     assert(drv != NULL);
@@ -759,6 +787,13 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
 
     trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name);
 
+    node_name = qdict_get_try_str(options, "node-name");
+    ret = bdrv_assign_node_name(bs, node_name, errp);
+    if (ret < 0) {
+        return ret;
+    }
+    qdict_del(options, "node-name");
+
     /* bdrv_open() with directly using a protocol as drv. This layer is already
      * opened, so assign it to bs (while file becomes a closed BlockDriverState)
      * and return immediately. */
diff --git a/qapi-schema.json b/qapi-schema.json
index a433869..26e370b 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -4092,6 +4092,7 @@
 # @id:          #optional id by which the new block device can be referred to.
 #               This is a required option on the top level of blockdev-add, and
 #               currently not allowed on any other level.
+# @node-name:   #optional the name of a block driver state node (Since 2.0)
 # @discard:     #optional discard-related options (default: ignore)
 # @cache:       #optional cache-related options
 # @aio:         #optional AIO backend (default: threads)
@@ -4107,6 +4108,7 @@
 { 'type': 'BlockdevOptionsBase',
   'data': { 'driver': 'str',
             '*id': 'str',
+            '*node-name': 'str',
             '*discard': 'BlockdevDiscardOptions',
             '*cache': 'BlockdevCacheOptions',
             '*aio': 'BlockdevAioOptions',
commit dc364f4cdca0c49e37376b16c3ee0bf3b4a96f4c
Author: Benoît Canet <benoit at irqsave.net>
Date:   Thu Jan 23 21:31:32 2014 +0100

    block: Add bs->node_name to hold the name of a bs node of the bs graph.
    
    Add the minimum of code to prepare for the following patches.
    
    Signed-off-by: Benoit Canet <benoit at irqsave.net>
    Reviewed-by: Fam Zheng <famz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index 53cc9e0..8562685 100644
--- a/block.c
+++ b/block.c
@@ -90,6 +90,9 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
+static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
+    QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
+
 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
     QLIST_HEAD_INITIALIZER(bdrv_drivers);
 
@@ -327,7 +330,7 @@ BlockDriverState *bdrv_new(const char *device_name)
     QLIST_INIT(&bs->dirty_bitmaps);
     pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
     if (device_name[0] != '\0') {
-        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
+        QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
     }
     bdrv_iostatus_disable(bs);
     notifier_list_init(&bs->close_notifiers);
@@ -1606,7 +1609,7 @@ void bdrv_close_all(void)
 {
     BlockDriverState *bs;
 
-    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
         bdrv_close(bs);
     }
 }
@@ -1635,7 +1638,7 @@ static bool bdrv_requests_pending(BlockDriverState *bs)
 static bool bdrv_requests_pending_all(void)
 {
     BlockDriverState *bs;
-    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
         if (bdrv_requests_pending(bs)) {
             return true;
         }
@@ -1662,7 +1665,7 @@ void bdrv_drain_all(void)
     BlockDriverState *bs;
 
     while (busy) {
-        QTAILQ_FOREACH(bs, &bdrv_states, list) {
+        QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
             bdrv_start_throttled_reqs(bs);
         }
 
@@ -1671,14 +1674,19 @@ void bdrv_drain_all(void)
     }
 }
 
-/* make a BlockDriverState anonymous by removing from bdrv_state list.
+/* make a BlockDriverState anonymous by removing from bdrv_state and
+ * graph_bdrv_state list.
    Also, NULL terminate the device_name to prevent double remove */
 void bdrv_make_anon(BlockDriverState *bs)
 {
     if (bs->device_name[0] != '\0') {
-        QTAILQ_REMOVE(&bdrv_states, bs, list);
+        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
     }
     bs->device_name[0] = '\0';
+    if (bs->node_name[0] != '\0') {
+        QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list);
+    }
+    bs->node_name[0] = '\0';
 }
 
 static void bdrv_rebind(BlockDriverState *bs)
@@ -1732,7 +1740,12 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
     /* keep the same entry in bdrv_states */
     pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name),
             bs_src->device_name);
-    bs_dest->list = bs_src->list;
+    bs_dest->device_list = bs_src->device_list;
+
+    /* keep the same entry in graph_bdrv_states
+     * We do want to swap name but don't want to swap linked list entries
+     */
+    bs_dest->node_list   = bs_src->node_list;
 }
 
 /*
@@ -2057,7 +2070,7 @@ int bdrv_commit_all(void)
 {
     BlockDriverState *bs;
 
-    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
         if (bs->drv && bs->backing_hd) {
             int ret = bdrv_commit(bs);
             if (ret < 0) {
@@ -3215,11 +3228,12 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
     }
 }
 
+/* This function is to find block backend bs */
 BlockDriverState *bdrv_find(const char *name)
 {
     BlockDriverState *bs;
 
-    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
         if (!strcmp(name, bs->device_name)) {
             return bs;
         }
@@ -3227,19 +3241,34 @@ BlockDriverState *bdrv_find(const char *name)
     return NULL;
 }
 
+/* This function is to find a node in the bs graph */
+BlockDriverState *bdrv_find_node(const char *node_name)
+{
+    BlockDriverState *bs;
+
+    assert(node_name);
+
+    QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
+        if (!strcmp(node_name, bs->node_name)) {
+            return bs;
+        }
+    }
+    return NULL;
+}
+
 BlockDriverState *bdrv_next(BlockDriverState *bs)
 {
     if (!bs) {
         return QTAILQ_FIRST(&bdrv_states);
     }
-    return QTAILQ_NEXT(bs, list);
+    return QTAILQ_NEXT(bs, device_list);
 }
 
 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
 {
     BlockDriverState *bs;
 
-    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
         it(opaque, bs);
     }
 }
@@ -3259,7 +3288,7 @@ int bdrv_flush_all(void)
     BlockDriverState *bs;
     int result = 0;
 
-    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
         int ret = bdrv_flush(bs);
         if (ret < 0 && !result) {
             result = ret;
@@ -4383,7 +4412,7 @@ void bdrv_invalidate_cache_all(void)
 {
     BlockDriverState *bs;
 
-    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
         bdrv_invalidate_cache(bs);
     }
 }
@@ -4392,7 +4421,7 @@ void bdrv_clear_incoming_migration_all(void)
 {
     BlockDriverState *bs;
 
-    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
         bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
     }
 }
diff --git a/include/block/block.h b/include/block/block.h
index a47f3d4..501b555 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -378,6 +378,7 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked);
 void bdrv_eject(BlockDriverState *bs, bool eject_flag);
 const char *bdrv_get_format_name(BlockDriverState *bs);
 BlockDriverState *bdrv_find(const char *name);
+BlockDriverState *bdrv_find_node(const char *node_name);
 BlockDriverState *bdrv_next(BlockDriverState *bs);
 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs),
                   void *opaque);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 2772f2f..f3f518c 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -325,11 +325,18 @@ struct BlockDriverState {
     BlockdevOnError on_read_error, on_write_error;
     bool iostatus_enabled;
     BlockDeviceIoStatus iostatus;
+
+    /* the following member gives a name to every node on the bs graph. */
+    char node_name[32];
+    /* element of the list of named nodes building the graph */
+    QTAILQ_ENTRY(BlockDriverState) node_list;
+    /* Device name is the name associated with the "drive" the guest sees */
     char device_name[32];
+    /* element of the list of "drives" the guest sees */
+    QTAILQ_ENTRY(BlockDriverState) device_list;
     QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
     int refcnt;
     int in_use; /* users other than guest access, eg. block migration */
-    QTAILQ_ENTRY(BlockDriverState) list;
 
     QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
 
commit c8059b97e1f9b4635b836ee98373a0f72f9fc0b4
Author: Fam Zheng <famz at redhat.com>
Date:   Thu Jan 23 10:03:26 2014 +0800

    qapi: Add "backing" to BlockStats
    
    Currently there is no way to query BlockStats of the backing chain. This
    adds "backing" field into BlockStats to make it possible.
    
    The comment of "parent" is reworded.
    
    Signed-off-by: Fam Zheng <famz at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/qapi.c b/block/qapi.c
index a32cb79..98b1b83 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -318,6 +318,11 @@ BlockStats *bdrv_query_stats(const BlockDriverState *bs)
         s->parent = bdrv_query_stats(bs->file);
     }
 
+    if (bs->backing_hd) {
+        s->has_backing = true;
+        s->backing = bdrv_query_stats(bs->backing_hd);
+    }
+
     return s;
 }
 
diff --git a/qapi-schema.json b/qapi-schema.json
index 35f7b34..a433869 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1022,15 +1022,17 @@
 #
 # @stats:  A @BlockDeviceStats for the device.
 #
-# @parent: #optional This may point to the backing block device if this is a
-#          a virtual block device.  If it's a backing block, this will point
-#          to the backing file is one is present.
+# @parent: #optional This describes the file block device if it has one.
+#
+# @backing: #optional This describes the backing block device if it has one.
+#           (Since 2.0)
 #
 # Since: 0.14.0
 ##
 { 'type': 'BlockStats',
   'data': {'*device': 'str', 'stats': 'BlockDeviceStats',
-           '*parent': 'BlockStats'} }
+           '*parent': 'BlockStats',
+           '*backing': 'BlockStats'} }
 
 ##
 # @query-blockstats:
commit d8a7b061ae01e5692cc994f05ad6480d8c170125
Author: Fam Zheng <famz at redhat.com>
Date:   Thu Jan 23 15:10:52 2014 +0800

    vmdk: Fix format specific information (create type) for streamOptimized
    
    Previously the field is wrong:
    
        $ ./qemu-img create -f vmdk -o subformat=streamOptimized /tmp/a.vmdk 1G
    
        $ ./qemu-img info /tmp/a.vmdk
        image: /tmp/a.vmdk
        file format: vmdk
        virtual size: 1.0G (1073741824 bytes)
        disk size: 12K
        Format specific information:
            cid: 1390460459
            parent cid: 4294967295
    >>>     create type: monolithicSparse
            <snip>
    
    Signed-off-by: Fam Zheng <famz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/vmdk.c b/block/vmdk.c
index 74c44bd..67b5f96 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -661,6 +661,10 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
     }
     extent->compressed =
         le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
+    if (extent->compressed) {
+        g_free(s->create_type);
+        s->create_type = g_strdup("streamOptimized");
+    }
     extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
     extent->version = le32_to_cpu(header.version);
     extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
commit 6df3bf8eb3ed428015c85cfbd554ac9b32164f40
Author: Zhang Min <rudy.zhangmin at huawei.com>
Date:   Thu Jan 23 15:59:16 2014 +0800

    drive mirror:fix memory leak
    
    In the function mirror_iteration() -> qemu_iovec_init(),
    it allocates memory for op->qiov.iov, when the write request calls back,
    but in the function mirror_iteration_done(), it only frees the op,
    not free the op->qiov.iov, so this causes memory leak.
    
    It should use qemu_iovec_destroy() to free op->qiov.
    
    Signed-off-by: Zhang Min <rudy.zhangmin at huawei.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/mirror.c b/block/mirror.c
index 2932bab..05758e5 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -96,6 +96,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret)
         bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
     }
 
+    qemu_iovec_destroy(&op->qiov);
     g_slice_free(MirrorOp, op);
     qemu_coroutine_enter(s->common.co, NULL);
 }
commit 9cd767376f137918dbe90abb452dfe119ae7d8f3
Author: Liu Yuan <namei.unix at gmail.com>
Date:   Wed Jan 22 01:14:11 2014 +0800

    sheepdog: fix 'qemu-img map'
    
    It was muted in the previous commit 4bc74be9. Let's revive it since nothing
    prevents us to do it.
    
    With this patch, following command will work as other formats:
    
    $ qemu-img map sheepdog:image
    
    Cc: qemu-devel at nongnu.org
    Cc: Kevin Wolf <kwolf at redhat.com>
    Cc: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Liu Yuan <namei.unix at gmail.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 2ce3d9b..672b9c9 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -2442,11 +2442,12 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 {
     BDRVSheepdogState *s = bs->opaque;
     SheepdogInode *inode = &s->inode;
-    unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE,
+    uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
+    unsigned long start = offset / SD_DATA_OBJ_SIZE,
                   end = DIV_ROUND_UP((sector_num + nb_sectors) *
                                      BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);
     unsigned long idx;
-    int64_t ret = BDRV_BLOCK_DATA;
+    int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 
     for (idx = start; idx < end; idx++) {
         if (inode->data_vdi_id[idx] == 0) {
commit 0e3bd9932f862c1c1e4926939b4d0c602ce214ef
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Mon Jan 20 15:12:16 2014 +0100

    Documentation: qemu-img: Mention SIGUSR1 progress report
    
    Document the SIGUSR1 behaviour of qemu-img. Also, added compare to the
    list of subcommands that support -p.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/qemu-img.texi b/qemu-img.texi
index 778e967..f86a86d 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -57,7 +57,9 @@ indicates that target image must be compressed (qcow format only)
 @item -h
 with or without a command shows help and lists the supported formats
 @item -p
-display progress bar (convert and rebase commands only)
+display progress bar (compare, convert and rebase commands only).
+If the @var{-p} option is not used for a command that supports it, the
+progress is reported when the process receives a @code{SIGUSR1} signal.
 @item -q
 Quiet mode - do not print any output (except errors). There's no progress bar
 in case both @var{-q} and @var{-p} options are used.
commit 3c4b4e383e82ab3db307ee01f12ab0d4a28584dc
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Mon Jan 20 15:06:03 2014 +0100

    qemu-progress: Fix progress printing on SIGUSR1
    
    Since commit a7aae221 ('Switch SIG_IPI to SIGUSR1'), SIGUSR1 is blocked
    during startup, breaking the progress report in tools.
    
    This patch reenables the signal when initialising a progress report.
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/util/qemu-progress.c b/util/qemu-progress.c
index ad33fee..4ee5cd0 100644
--- a/util/qemu-progress.c
+++ b/util/qemu-progress.c
@@ -82,12 +82,22 @@ static void progress_dummy_init(void)
 {
 #ifdef CONFIG_POSIX
     struct sigaction action;
+    sigset_t set;
 
     memset(&action, 0, sizeof(action));
     sigfillset(&action.sa_mask);
     action.sa_handler = sigusr_print;
     action.sa_flags = 0;
     sigaction(SIGUSR1, &action, NULL);
+
+    /*
+     * SIGUSR1 is SIG_IPI and gets blocked in qemu_init_main_loop(). In the
+     * tools that use the progress report SIGUSR1 isn't used in this meaning
+     * and instead should print the progress, so reenable it.
+     */
+    sigemptyset(&set);
+    sigaddset(&set, SIGUSR1);
+    pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 #endif
 
     state.print = progress_dummy_print;
commit e69968d472bd020a08c677c814237548090d2e59
Author: Kevin Wolf <kwolf at redhat.com>
Date:   Mon Jan 20 15:05:25 2014 +0100

    qemu-progress: Drop unused include
    
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Benoit Canet <benoit at irqsave.net>

diff --git a/util/qemu-progress.c b/util/qemu-progress.c
index 9a3f96c..ad33fee 100644
--- a/util/qemu-progress.c
+++ b/util/qemu-progress.c
@@ -24,7 +24,6 @@
 
 #include "qemu-common.h"
 #include "qemu/osdep.h"
-#include "sysemu/sysemu.h"
 #include <stdio.h>
 
 struct progress_state {
commit 34ceed81f9ca31829448276dafe3d9151d66962c
Author: Fam Zheng <famz at redhat.com>
Date:   Tue Jan 21 15:07:43 2014 +0800

    vmdk: Check for overhead when opening
    
    Report an error if file size is even smaller than metadata.
    
    Signed-off-by: Fam Zheng <famz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/vmdk.c b/block/vmdk.c
index 599a928..74c44bd 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -640,6 +640,13 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
     if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
         l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
     }
+    if (bdrv_getlength(file) <
+            le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) {
+        error_report("File truncated, expecting at least %lld bytes",
+                le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE);
+        return -EINVAL;
+    }
+
     ret = vmdk_add_extent(bs, file, false,
                           le64_to_cpu(header.capacity),
                           le64_to_cpu(header.gd_offset) << 9,
diff --git a/tests/qemu-iotests/059 b/tests/qemu-iotests/059
index 64ed04c..2d604d3 100755
--- a/tests/qemu-iotests/059
+++ b/tests/qemu-iotests/059
@@ -98,6 +98,12 @@ EOF
 _img_info
 
 echo
+echo "=== Testing truncated sparse ==="
+IMGOPTS="subformat=monolithicSparse" _make_test_img 100G
+truncate -s 10M $TEST_IMG
+_img_info
+
+echo
 echo "=== Testing version 3 ==="
 _use_sample_img iotest-version3.vmdk.bz2
 _img_info
diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out
index 5e30e69..4ffeb54 100644
--- a/tests/qemu-iotests/059.out
+++ b/tests/qemu-iotests/059.out
@@ -2043,6 +2043,11 @@ qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Invalid extent lines:
 RW 12582912 VMFS "dummy.IMGFMT" 1
 
 
+=== Testing truncated sparse ===
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=107374182400
+qemu-img: File truncated, expecting at least 13172736 bytes
+qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Could not open 'TEST_DIR/t.IMGFMT': Wrong medium type
+
 === Testing version 3 ===
 image: TEST_DIR/iotest-version3.IMGFMT
 file format: IMGFMT
commit 46bae927134468d27f5e2508c3ced67ff58fa45b
Author: Hu Tao <hutao at cn.fujitsu.com>
Date:   Tue Jan 21 11:30:02 2014 +0800

    qcow2: fix wrong value of L1E_OFFSET_MASK, L2E_OFFSET_MASK and REFT_OFFSET_MASK
    
    Accoring to qcow spec, the offset fields in l1e, l2e and ref table entry
    start at bit 9. The offset is cluster offset, and the smallest possible
    cluster size is 512 bytes.
    
    Signed-off-by: Hu Tao <hutao at cn.fujitsu.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/qcow2.h b/block/qcow2.h
index 303eb26..b5b7d13 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -340,11 +340,11 @@ typedef enum QCow2MetadataOverlap {
 #define QCOW2_OL_ALL \
     (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2)
 
-#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL
-#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL
+#define L1E_OFFSET_MASK 0x00fffffffffffe00ULL
+#define L2E_OFFSET_MASK 0x00fffffffffffe00ULL
 #define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL
 
-#define REFT_OFFSET_MASK 0xffffffffffffff00ULL
+#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL
 
 static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset)
 {
commit 385c04d0b66917457b6a12fc2cfd99a6a40b2d89
Author: Stefan Hajnoczi <stefanha at redhat.com>
Date:   Mon Jan 13 18:47:39 2014 +0800

    dataplane: fix shadowed return value
    
    Propagate the error return value from get_indirect().  This bug was
    introduced in commit 4d684832 ("vring: create a common function to parse
    descriptors").
    
    Reviewed-by: Markus Armbruster <armbru at redhat.com>
    Reviewed-by: Peter Maydell <peter.maydell at linaro.org>
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index 250d45e..665a1ff 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -376,7 +376,7 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
         barrier();
 
         if (desc.flags & VRING_DESC_F_INDIRECT) {
-            int ret = get_indirect(vring, elem, &desc);
+            ret = get_indirect(vring, elem, &desc);
             if (ret < 0) {
                 goto out;
             }
commit d80ac658f2caacfb14ea386211c4a9bea0cea280
Author: Peter Feiner <peter at gridcentric.ca>
Date:   Wed Jan 8 19:43:25 2014 +0000

    block: fix backing file segfault
    
    When a backing file is opened such that (1) a protocol is directly
    used as the block driver and (2) the block driver has bdrv_file_open,
    bdrv_open_backing_file segfaults. The problem arises because
    bdrv_open_common returns without setting bd->backing_hd->file.
    
    To effect (1), you seem to have to use the -F flag in qemu-img. There
    are several block drivers that satisfy (2), such as "file" and "nbd".
    Here are some concrete examples:
    
        #!/bin/bash
    
        echo Test file format
        ./qemu-img create -f file base.file 1m
        ./qemu-img create -f qcow2 -F file -o backing_file=base.file\
            file-overlay.qcow2
        ./qemu-img convert -O raw file-overlay.qcow2 file-convert.raw
    
        echo Test nbd format
        SOCK=$PWD/nbd.sock
        ./qemu-img create -f raw base.raw 1m
        ./qemu-nbd -t -k $SOCK base.raw &
        trap "kill $!" EXIT
        while ! test -e $SOCK; do sleep 1; done
        ./qemu-img create -f qcow2 -F nbd -o backing_file=nbd:unix:$SOCK\
            nbd-overlay.qcow2
        ./qemu-img convert -O raw nbd-overlay.qcow2 nbd-convert.raw
    
    Without this patch, the two qemu-img convert commands segfault.
    
    This is a regression that was introduced in v1.7 by
    dbecebddfa4932d1c83915bcb9b5ba5984eb91be.
    
    Signed-off-by: Peter Feiner <peter at gridcentric.ca>
    Reviewed-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index 6af5f6e..53cc9e0 100644
--- a/block.c
+++ b/block.c
@@ -1040,8 +1040,12 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
         error_free(local_err);
         return ret;
     }
-    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
-            bs->backing_hd->file->filename);
+
+    if (bs->backing_hd->file) {
+        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
+                bs->backing_hd->file->filename);
+    }
+
     return 0;
 }
 
commit 91f84f652de14329d5ad0666499a78fd0db0f1c7
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:24 2013 +0100

    iotests: Test file format nesting
    
    Add a test for nested image formats.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/tests/qemu-iotests/072 b/tests/qemu-iotests/072
new file mode 100755
index 0000000..a3876c2
--- /dev/null
+++ b/tests/qemu-iotests/072
@@ -0,0 +1,69 @@
+#!/bin/bash
+#
+# Test case for nested image formats
+#
+# Copyright (C) 2013 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=mreitz at redhat.com
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+
+here="$PWD"
+tmp=/tmp/$$
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt vpc vmdk vhdx vdi qed qcow2 qcow cow
+_supported_proto generic
+_supported_os Linux
+
+IMG_SIZE=64M
+
+echo
+echo "=== Testing nested image formats ==="
+echo
+
+TEST_IMG="$TEST_IMG.base" _make_test_img $IMG_SIZE
+
+$QEMU_IO -c 'write -P 42 0 512' -c 'write -P 23 512 512' \
+         -c 'write -P 66 1024 512' "$TEST_IMG.base" | _filter_qemu_io
+
+$QEMU_IMG convert -f raw -O $IMGFMT "$TEST_IMG.base" "$TEST_IMG"
+
+$QEMU_IO -c "open -o driver=$IMGFMT,file.driver=$IMGFMT,file.file.filename=$TEST_IMG" \
+         -c 'read -P 42 0 512' -c 'read -P 23 512 512' \
+         -c 'read -P 66 1024 512' | _filter_qemu_io
+
+# When not giving any format, qemu should open only one "layer". Therefore, this
+# should not work for any image formats with a header.
+$QEMU_IO -c 'read -P 42 0 512' "$TEST_IMG" | _filter_qemu_io
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/072.out b/tests/qemu-iotests/072.out
new file mode 100644
index 0000000..efe577c
--- /dev/null
+++ b/tests/qemu-iotests/072.out
@@ -0,0 +1,21 @@
+QA output created by 072
+
+=== Testing nested image formats ===
+
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 
+wrote 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset 512
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset 1024
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 512
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 1024
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Pattern verification failed at offset 0, 512 bytes
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 1194339..5860c40 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -78,5 +78,6 @@
 069 rw auto
 070 rw auto
 071 rw auto
+072 rw auto
 073 rw auto
 074 rw auto
commit 30bd6a4dafe2f79909451ef5769561c9a9d3eaca
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:23 2013 +0100

    iotests: Test new blkdebug/blkverify interface
    
    Add a test for the new blkdebug/blkverify interface.
    
    This test is not written in Python, although it uses QMP. This is
    because it invokes the qemu-io HMP command, which outputs errors to
    stderr instead of returning them through QMP. Filtering and testing that
    output is easier in a shell script than with the Python infrastructure.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/tests/qemu-iotests/071 b/tests/qemu-iotests/071
new file mode 100755
index 0000000..2a22546
--- /dev/null
+++ b/tests/qemu-iotests/071
@@ -0,0 +1,239 @@
+#!/bin/bash
+#
+# Test case for the QMP blkdebug and blkverify interfaces
+#
+# Copyright (C) 2013 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=mreitz at redhat.com
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+
+here="$PWD"
+tmp=/tmp/$$
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt generic
+_supported_proto generic
+_supported_os Linux
+
+function do_run_qemu()
+{
+    echo Testing: "$@" | _filter_imgfmt
+    $QEMU -nographic -qmp stdio -serial none "$@"
+    echo
+}
+
+function run_qemu()
+{
+    do_run_qemu "$@" 2>&1 | _filter_testdir | _filter_qmp | _filter_qemu_io
+}
+
+IMG_SIZE=64M
+
+echo
+echo "=== Testing blkverify through filename ==="
+echo
+
+TEST_IMG="$TEST_IMG.base" IMGOPTS="" IMGFMT="raw" _make_test_img $IMG_SIZE |\
+    _filter_imgfmt
+_make_test_img $IMG_SIZE
+$QEMU_IO -c "open -o file.driver=blkverify,file.raw.filename=$TEST_IMG.base $TEST_IMG" \
+         -c 'read 0 512' -c 'write -P 42 0x38000 512' -c 'read -P 42 0x38000 512' | _filter_qemu_io
+
+$QEMU_IO -c 'write -P 42 0 512' "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IO -c "open -o file.driver=blkverify,file.raw.filename=$TEST_IMG.base $TEST_IMG" \
+         -c 'read -P 42 0 512' | _filter_qemu_io
+
+echo
+echo "=== Testing blkverify through file blockref ==="
+echo
+
+TEST_IMG="$TEST_IMG.base" IMGOPTS="" IMGFMT="raw" _make_test_img $IMG_SIZE |\
+    _filter_imgfmt
+_make_test_img $IMG_SIZE
+$QEMU_IO -c "open -o file.driver=blkverify,file.raw.filename=$TEST_IMG.base,file.test.driver=$IMGFMT,file.test.file.filename=$TEST_IMG" \
+         -c 'read 0 512' -c 'write -P 42 0x38000 512' -c 'read -P 42 0x38000 512' | _filter_qemu_io
+
+$QEMU_IO -c 'write -P 42 0 512' "$TEST_IMG" | _filter_qemu_io
+
+$QEMU_IO -c "open -o file.driver=blkverify,file.raw.filename=$TEST_IMG.base $TEST_IMG" \
+         -c 'read -P 42 0 512' | _filter_qemu_io
+
+echo
+echo "=== Testing blkdebug through filename ==="
+echo
+
+$QEMU_IO -c "open -o file.driver=blkdebug,file.inject-error.event=l2_load $TEST_IMG" \
+         -c 'read -P 42 0x38000 512'
+
+echo
+echo "=== Testing blkdebug through file blockref ==="
+echo
+
+$QEMU_IO -c "open -o driver=$IMGFMT,file.driver=blkdebug,file.inject-error.event=l2_load,file.image.filename=$TEST_IMG" \
+         -c 'read -P 42 0x38000 512'
+
+echo
+echo "=== Testing blkdebug on existing block device ==="
+echo
+
+run_qemu -drive "file=$TEST_IMG,format=raw,if=none,id=drive0" <<EOF
+{ "execute": "qmp_capabilities" }
+{ "execute": "blockdev-add",
+    "arguments": {
+        "options": {
+            "driver": "$IMGFMT",
+            "id": "drive0-debug",
+            "file": {
+                "driver": "blkdebug",
+                "image": "drive0",
+                "inject-error": [{
+                    "event": "l2_load"
+                }]
+            }
+        }
+    }
+}
+{ "execute": "human-monitor-command",
+    "arguments": {
+        "command-line": 'qemu-io drive0-debug "read 0 512"'
+    }
+}
+{ "execute": "quit" }
+EOF
+
+echo
+echo "=== Testing blkverify on existing block device ==="
+echo
+
+run_qemu -drive "file=$TEST_IMG,format=$IMGFMT,if=none,id=drive0" <<EOF
+{ "execute": "qmp_capabilities" }
+{ "execute": "blockdev-add",
+    "arguments": {
+        "options": {
+            "driver": "blkverify",
+            "id": "drive0-verify",
+            "test": "drive0",
+            "raw": {
+                "driver": "raw",
+                "file": {
+                    "driver": "file",
+                    "filename": "$TEST_IMG.base"
+                }
+            }
+        }
+    }
+}
+{ "execute": "human-monitor-command",
+    "arguments": {
+        "command-line": 'qemu-io drive0-verify "read 0 512"'
+    }
+}
+{ "execute": "quit" }
+EOF
+
+echo
+echo "=== Testing blkverify on existing raw block device ==="
+echo
+
+run_qemu -drive "file=$TEST_IMG.base,if=none,id=drive0" <<EOF
+{ "execute": "qmp_capabilities" }
+{ "execute": "blockdev-add",
+    "arguments": {
+        "options": {
+            "driver": "blkverify",
+            "id": "drive0-verify",
+            "test": {
+                "driver": "$IMGFMT",
+                "file": {
+                    "driver": "file",
+                    "filename": "$TEST_IMG"
+                }
+            },
+            "raw": "drive0"
+        }
+    }
+}
+{ "execute": "human-monitor-command",
+    "arguments": {
+        "command-line": 'qemu-io drive0-verify "read 0 512"'
+    }
+}
+{ "execute": "quit" }
+EOF
+
+echo
+echo "=== Testing blkdebug's set-state through QMP ==="
+echo
+
+run_qemu -drive "file=$TEST_IMG,format=raw,if=none,id=drive0" <<EOF
+{ "execute": "qmp_capabilities" }
+{ "execute": "blockdev-add",
+    "arguments": {
+        "options": {
+            "driver": "$IMGFMT",
+            "id": "drive0-debug",
+            "file": {
+                "driver": "blkdebug",
+                "image": "drive0",
+                "inject-error": [{
+                    "event": "read_aio",
+                    "state": 42
+                }],
+                "set-state": [{
+                    "event": "write_aio",
+                    "new_state": 42
+                }]
+            }
+        }
+    }
+}
+{ "execute": "human-monitor-command",
+    "arguments": {
+        "command-line": 'qemu-io drive0-debug "read 0 512"'
+    }
+}
+{ "execute": "human-monitor-command",
+    "arguments": {
+        "command-line": 'qemu-io drive0-debug "write 0 512"'
+    }
+}
+{ "execute": "human-monitor-command",
+    "arguments": {
+        "command-line": 'qemu-io drive0-debug "read 0 512"'
+    }
+}
+{ "execute": "quit" }
+EOF
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/071.out b/tests/qemu-iotests/071.out
new file mode 100644
index 0000000..5f840a9
--- /dev/null
+++ b/tests/qemu-iotests/071.out
@@ -0,0 +1,90 @@
+QA output created by 071
+
+=== Testing blkverify through filename ===
+
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset 229376
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 229376
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0
+
+=== Testing blkverify through file blockref ===
+
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset 229376
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 229376
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0
+
+=== Testing blkdebug through filename ===
+
+read failed: Input/output error
+
+=== Testing blkdebug through file blockref ===
+
+read failed: Input/output error
+
+=== Testing blkdebug on existing block device ===
+
+Testing: -drive file=TEST_DIR/t.IMGFMT,format=raw,if=none,id=drive0
+QMP_VERSION
+{"return": {}}
+{"return": {}}
+read failed: Input/output error
+{"return": ""}
+{"return": {}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "SHUTDOWN"}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "DEVICE_TRAY_MOVED", "data": {"device": "ide1-cd0", "tray-open": true}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "DEVICE_TRAY_MOVED", "data": {"device": "floppy0", "tray-open": true}}
+
+
+=== Testing blkverify on existing block device ===
+
+Testing: -drive file=TEST_DIR/t.IMGFMT,format=IMGFMT,if=none,id=drive0
+QMP_VERSION
+{"return": {}}
+{"return": {}}
+blkverify: read sector_num=0 nb_sectors=1 contents mismatch in sector 0
+
+
+=== Testing blkverify on existing raw block device ===
+
+Testing: -drive file=TEST_DIR/t.IMGFMT.base,if=none,id=drive0
+QMP_VERSION
+{"return": {}}
+{"return": {}}
+blkverify: read sector_num=0 nb_sectors=1 contents mismatch in sector 0
+
+
+=== Testing blkdebug's set-state through QMP ===
+
+Testing: -drive file=TEST_DIR/t.IMGFMT,format=raw,if=none,id=drive0
+QMP_VERSION
+{"return": {}}
+{"return": {}}
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+{"return": ""}
+wrote 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+{"return": ""}
+read failed: Input/output error
+{"return": ""}
+{"return": {}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "SHUTDOWN"}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "DEVICE_TRAY_MOVED", "data": {"device": "ide1-cd0", "tray-open": true}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "DEVICE_TRAY_MOVED", "data": {"device": "floppy0", "tray-open": true}}
+
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index cc750c9..1194339 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -77,5 +77,6 @@
 068 rw auto
 069 rw auto
 070 rw auto
+071 rw auto
 073 rw auto
 074 rw auto
commit 3fb11779ca5f1d601adeb5870ba79e61e81a4cce
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:22 2013 +0100

    tests: Add test for qdict_flatten()
    
    Add a test case for qdict_flatten() in tests/check-qdict.c. This test
    case covers the flattening of subordinate QLists as well.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/tests/check-qdict.c b/tests/check-qdict.c
index cab7dd0..7a7461b 100644
--- a/tests/check-qdict.c
+++ b/tests/check-qdict.c
@@ -227,6 +227,81 @@ static void qdict_iterapi_test(void)
     QDECREF(tests_dict);
 }
 
+static void qdict_flatten_test(void)
+{
+    QList *list1 = qlist_new();
+    QList *list2 = qlist_new();
+    QDict *dict1 = qdict_new();
+    QDict *dict2 = qdict_new();
+    QDict *dict3 = qdict_new();
+
+    /*
+     * Test the flattening of
+     *
+     * {
+     *     "e": [
+     *         42,
+     *         [
+     *             23,
+     *             66,
+     *             {
+     *                 "a": 0,
+     *                 "b": 1
+     *             }
+     *         ]
+     *     ],
+     *     "f": {
+     *         "c": 2,
+     *         "d": 3,
+     *     },
+     *     "g": 4
+     * }
+     *
+     * to
+     *
+     * {
+     *     "e.0": 42,
+     *     "e.1.0": 23,
+     *     "e.1.1": 66,
+     *     "e.1.2.a": 0,
+     *     "e.1.2.b": 1,
+     *     "f.c": 2,
+     *     "f.d": 3,
+     *     "g": 4
+     * }
+     */
+
+    qdict_put(dict1, "a", qint_from_int(0));
+    qdict_put(dict1, "b", qint_from_int(1));
+
+    qlist_append_obj(list1, QOBJECT(qint_from_int(23)));
+    qlist_append_obj(list1, QOBJECT(qint_from_int(66)));
+    qlist_append_obj(list1, QOBJECT(dict1));
+    qlist_append_obj(list2, QOBJECT(qint_from_int(42)));
+    qlist_append_obj(list2, QOBJECT(list1));
+
+    qdict_put(dict2, "c", qint_from_int(2));
+    qdict_put(dict2, "d", qint_from_int(3));
+    qdict_put_obj(dict3, "e", QOBJECT(list2));
+    qdict_put_obj(dict3, "f", QOBJECT(dict2));
+    qdict_put(dict3, "g", qint_from_int(4));
+
+    qdict_flatten(dict3);
+
+    g_assert(qdict_get_int(dict3, "e.0") == 42);
+    g_assert(qdict_get_int(dict3, "e.1.0") == 23);
+    g_assert(qdict_get_int(dict3, "e.1.1") == 66);
+    g_assert(qdict_get_int(dict3, "e.1.2.a") == 0);
+    g_assert(qdict_get_int(dict3, "e.1.2.b") == 1);
+    g_assert(qdict_get_int(dict3, "f.c") == 2);
+    g_assert(qdict_get_int(dict3, "f.d") == 3);
+    g_assert(qdict_get_int(dict3, "g") == 4);
+
+    g_assert(qdict_size(dict3) == 8);
+
+    QDECREF(dict3);
+}
+
 static void qdict_array_split_test(void)
 {
     QDict *test_dict = qdict_new();
@@ -444,6 +519,7 @@ int main(int argc, char **argv)
     g_test_add_func("/public/del", qdict_del_test);
     g_test_add_func("/public/to_qdict", qobject_to_qdict_test);
     g_test_add_func("/public/iterapi", qdict_iterapi_test);
+    g_test_add_func("/public/flatten", qdict_flatten_test);
     g_test_add_func("/public/array_split", qdict_array_split_test);
 
     g_test_add_func("/errors/put_exists", qdict_put_exists_test);
commit be331341a1f35c2de2fcc05cc78e0342d2edeb8a
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:21 2013 +0100

    tests: Add test for qdict_array_split()
    
    Add a test case for qdict_array_split() in tests/check-qdict.c.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/tests/check-qdict.c b/tests/check-qdict.c
index dc5f05a..cab7dd0 100644
--- a/tests/check-qdict.c
+++ b/tests/check-qdict.c
@@ -227,6 +227,85 @@ static void qdict_iterapi_test(void)
     QDECREF(tests_dict);
 }
 
+static void qdict_array_split_test(void)
+{
+    QDict *test_dict = qdict_new();
+    QDict *dict1, *dict2;
+    QList *test_list;
+
+    /*
+     * Test the split of
+     *
+     * {
+     *     "1.x": 0,
+     *     "3.y": 1,
+     *     "0.a": 42,
+     *     "o.o": 7,
+     *     "0.b": 23
+     * }
+     *
+     * to
+     *
+     * [
+     *     {
+     *         "a": 42,
+     *         "b": 23
+     *     },
+     *     {
+     *         "x": 0
+     *     }
+     * ]
+     *
+     * and
+     *
+     * {
+     *     "3.y": 1,
+     *     "o.o": 7
+     * }
+     *
+     * (remaining in the old QDict)
+     *
+     * This example is given in the comment of qdict_array_split().
+     */
+
+    qdict_put(test_dict, "1.x", qint_from_int(0));
+    qdict_put(test_dict, "3.y", qint_from_int(1));
+    qdict_put(test_dict, "0.a", qint_from_int(42));
+    qdict_put(test_dict, "o.o", qint_from_int(7));
+    qdict_put(test_dict, "0.b", qint_from_int(23));
+
+    qdict_array_split(test_dict, &test_list);
+
+    dict1 = qobject_to_qdict(qlist_pop(test_list));
+    dict2 = qobject_to_qdict(qlist_pop(test_list));
+
+    g_assert(dict1);
+    g_assert(dict2);
+    g_assert(qlist_empty(test_list));
+
+    QDECREF(test_list);
+
+    g_assert(qdict_get_int(dict1, "a") == 42);
+    g_assert(qdict_get_int(dict1, "b") == 23);
+
+    g_assert(qdict_size(dict1) == 2);
+
+    QDECREF(dict1);
+
+    g_assert(qdict_get_int(dict2, "x") == 0);
+
+    g_assert(qdict_size(dict2) == 1);
+
+    QDECREF(dict2);
+
+    g_assert(qdict_get_int(test_dict, "3.y") == 1);
+    g_assert(qdict_get_int(test_dict, "o.o") == 7);
+
+    g_assert(qdict_size(test_dict) == 2);
+
+    QDECREF(test_dict);
+}
+
 /*
  * Errors test-cases
  */
@@ -365,6 +444,7 @@ int main(int argc, char **argv)
     g_test_add_func("/public/del", qdict_del_test);
     g_test_add_func("/public/to_qdict", qobject_to_qdict_test);
     g_test_add_func("/public/iterapi", qdict_iterapi_test);
+    g_test_add_func("/public/array_split", qdict_array_split_test);
 
     g_test_add_func("/errors/put_exists", qdict_put_exists_test);
     g_test_add_func("/errors/get_not_exists", qdict_get_not_exists_test);
commit fd0fee34b5ae7699dc558c12ddc3663bdb580060
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:20 2013 +0100

    qemu-io: Make filename optional
    
    Giving a filename is actually not essential, since it can be specified
    through the options as well - on the contrary: Sometimes a filename must
    not be given.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Kevin Wolf <kwolf at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/qemu-io.c b/qemu-io.c
index bfb773e..d669028 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -163,11 +163,13 @@ static int open_f(BlockDriverState *bs, int argc, char **argv)
         flags |= BDRV_O_RDWR;
     }
 
-    if (optind != argc - 1) {
+    if (optind == argc - 1) {
+        return openfile(argv[optind], flags, growable, opts);
+    } else if (optind == argc) {
+        return openfile(NULL, flags, growable, opts);
+    } else {
         return qemuio_command_usage(&open_cmd);
     }
-
-    return openfile(argv[optind], flags, growable, opts);
 }
 
 static int quit_f(BlockDriverState *bs, int argc, char **argv)
commit 1bf20b8280186299c750018bbfa3b52f4afd71ea
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:19 2013 +0100

    qapi: QMP interface for blkdebug and blkverify
    
    Add structures to support blkdebug and blkverify in blockdev-add.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/qapi-schema.json b/qapi-schema.json
index f27c48a..35f7b34 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -4201,6 +4201,113 @@
             '*pass-discard-other': 'bool' } }
 
 ##
+# @BlkdebugEvent
+#
+# Trigger events supported by blkdebug.
+##
+{ 'enum': 'BlkdebugEvent',
+  'data': [ 'l1_update', 'l1_grow.alloc_table', 'l1_grow.write_table',
+            'l1_grow.activate_table', 'l2_load', 'l2_update',
+            'l2_update_compressed', 'l2_alloc.cow_read', 'l2_alloc.write',
+            'read_aio', 'read_backing_aio', 'read_compressed', 'write_aio',
+            'write_compressed', 'vmstate_load', 'vmstate_save', 'cow_read',
+            'cow_write', 'reftable_load', 'reftable_grow', 'reftable_update',
+            'refblock_load', 'refblock_update', 'refblock_update_part',
+            'refblock_alloc', 'refblock_alloc.hookup', 'refblock_alloc.write',
+            'refblock_alloc.write_blocks', 'refblock_alloc.write_table',
+            'refblock_alloc.switch_table', 'cluster_alloc',
+            'cluster_alloc_bytes', 'cluster_free', 'flush_to_os',
+            'flush_to_disk' ] }
+
+##
+# @BlkdebugInjectErrorOptions
+#
+# Describes a single error injection for blkdebug.
+#
+# @event:       trigger event
+#
+# @state:       #optional the state identifier blkdebug needs to be in to
+#               actually trigger the event; defaults to "any"
+#
+# @errno:       #optional error identifier (errno) to be returned; defaults to
+#               EIO
+#
+# @sector:      #optional specifies the sector index which has to be affected
+#               in order to actually trigger the event; defaults to "any
+#               sector"
+#
+# @once:        #optional disables further events after this one has been
+#               triggered; defaults to false
+#
+# @immediately: #optional fail immediately; defaults to false
+#
+# Since: 2.0
+##
+{ 'type': 'BlkdebugInjectErrorOptions',
+  'data': { 'event': 'BlkdebugEvent',
+            '*state': 'int',
+            '*errno': 'int',
+            '*sector': 'int',
+            '*once': 'bool',
+            '*immediately': 'bool' } }
+
+##
+# @BlkdebugSetStateOptions
+#
+# Describes a single state-change event for blkdebug.
+#
+# @event:       trigger event
+#
+# @state:       #optional the current state identifier blkdebug needs to be in;
+#               defaults to "any"
+#
+# @new_state:   the state identifier blkdebug is supposed to assume if
+#               this event is triggered
+#
+# Since: 2.0
+##
+{ 'type': 'BlkdebugSetStateOptions',
+  'data': { 'event': 'BlkdebugEvent',
+            '*state': 'int',
+            'new_state': 'int' } }
+
+##
+# @BlockdevOptionsBlkdebug
+#
+# Driver specific block device options for blkdebug.
+#
+# @image:           underlying raw block device (or image file)
+#
+# @config:          #optional filename of the configuration file
+#
+# @inject-error:    #optional array of error injection descriptions
+#
+# @set-state:       #optional array of state-change descriptions
+#
+# Since: 2.0
+##
+{ 'type': 'BlockdevOptionsBlkdebug',
+  'data': { 'image': 'BlockdevRef',
+            '*config': 'str',
+            '*inject-error': ['BlkdebugInjectErrorOptions'],
+            '*set-state': ['BlkdebugSetStateOptions'] } }
+
+##
+# @BlockdevOptionsBlkverify
+#
+# Driver specific block device options for blkverify.
+#
+# @test:    block device to be tested
+#
+# @raw:     raw image used for verification
+#
+# Since: 2.0
+##
+{ 'type': 'BlockdevOptionsBlkverify',
+  'data': { 'test': 'BlockdevRef',
+            'raw': 'BlockdevRef' } }
+
+##
 # @BlockdevOptions
 #
 # Options for creating a block device.
@@ -4224,10 +4331,8 @@
 # TODO sheepdog: Wait for structured options
 # TODO ssh: Should take InetSocketAddress for 'host'?
       'vvfat':      'BlockdevOptionsVVFAT',
-
-# TODO blkdebug: Wait for structured options
-# TODO blkverify: Wait for structured options
-
+      'blkdebug':   'BlockdevOptionsBlkdebug',
+      'blkverify':  'BlockdevOptionsBlkverify',
       'bochs':      'BlockdevOptionsGenericFormat',
       'cloop':      'BlockdevOptionsGenericFormat',
       'cow':        'BlockdevOptionsGenericCOWFormat',
commit 8592a545b61b99114a86ee7cecef7a5f284d1b6c
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:18 2013 +0100

    qapi: Add "errno" to the list of polluted words
    
    Using "errno" directly as an identifier results in various syntax
    errors; therefore it should be added to the list of polluted words.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/scripts/qapi.py b/scripts/qapi.py
index 750e9fb..9b3de4c 100644
--- a/scripts/qapi.py
+++ b/scripts/qapi.py
@@ -247,7 +247,7 @@ def c_var(name, protect=True):
                      'and', 'and_eq', 'bitand', 'bitor', 'compl', 'not',
                      'not_eq', 'or', 'or_eq', 'xor', 'xor_eq'])
     # namespace pollution:
-    polluted_words = set(['unix'])
+    polluted_words = set(['unix', 'errno'])
     if protect and (name in c89_words | c99_words | c11_words | gcc_words | cpp_words | polluted_words):
         return "q_" + name
     return name.replace('-', '_').lstrip("*")
commit 22511ad681348cc4e500ebafdc324b0909d41c95
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:17 2013 +0100

    blkverify: Don't require protocol filename
    
    If the filename is not prefixed by "blkverify:" in
    blkverify_parse_filename(), the blkverify driver was not selected
    through that protocol prefix, but by an explicit command line (or QMP)
    option (like driver=blkverify).
    
    If blkverify_parse_filename() has been called, a filename has been
    given. If it is not prefixed, it is probably really just a plain
    filename. This is no problem, since we can use it as the test image
    filename and rely on the user to specify the raw image filename through
    the new corresponding option.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/blkverify.c b/block/blkverify.c
index dc14290..a2e8f5f 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -78,7 +78,9 @@ static void blkverify_parse_filename(const char *filename, QDict *options,
 
     /* Parse the blkverify: prefix */
     if (!strstart(filename, "blkverify:", &filename)) {
-        error_setg(errp, "File name string must start with 'blkverify:'");
+        /* There was no prefix; therefore, all options have to be already
+           present in the QDict (except for the filename) */
+        qdict_put(options, "x-image", qstring_from_str(filename));
         return;
     }
 
commit 70b6198acc9643c3ce801e5cf4c24274722f2f4a
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:16 2013 +0100

    blkverify: Allow command-line configuration
    
    Introduce the "test" and "raw" options for specifying images.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/blkverify.c b/block/blkverify.c
index e15ac4c..dc14290 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -122,7 +122,6 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVBlkverifyState *s = bs->opaque;
     QemuOpts *opts;
     Error *local_err = NULL;
-    const char *filename, *raw;
     int ret;
 
     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
@@ -133,33 +132,19 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
-    /* Parse the raw image filename */
-    raw = qemu_opt_get(opts, "x-raw");
-    if (raw == NULL) {
-        error_setg(errp, "Could not retrieve raw image filename");
-        ret = -EINVAL;
-        goto fail;
-    }
-
-    ret = bdrv_file_open(&bs->file, raw, NULL, NULL, flags, &local_err);
+    /* Open the raw file */
+    ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options,
+                          "raw", flags, true, false, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         goto fail;
     }
 
     /* Open the test file */
-    filename = qemu_opt_get(opts, "x-image");
-    if (filename == NULL) {
-        error_setg(errp, "Could not retrieve test image filename");
-        ret = -EINVAL;
-        goto fail;
-    }
-
-    s->test_file = bdrv_new("");
-    ret = bdrv_open(s->test_file, filename, NULL, flags, NULL, &local_err);
+    ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options,
+                          "test", flags, false, false, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
-        bdrv_unref(s->test_file);
         s->test_file = NULL;
         goto fail;
     }
commit 4373593d5111a8ed3b6d47ad4a458ee28ec942e3
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:15 2013 +0100

    blkdebug: Allow command-line file configuration
    
    Introduce the "image" option as an alternative to specifying the image
    through the filename.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/blkdebug.c b/block/blkdebug.c
index 21a4931..c8f8d56 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -374,7 +374,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVBlkdebugState *s = bs->opaque;
     QemuOpts *opts;
     Error *local_err = NULL;
-    const char *filename, *config;
+    const char *config;
     int ret;
 
     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
@@ -396,14 +396,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     s->state = 1;
 
     /* Open the backing file */
-    filename = qemu_opt_get(opts, "x-image");
-    if (filename == NULL) {
-        error_setg(errp, "Could not retrieve image file name");
-        ret = -EINVAL;
-        goto fail;
-    }
-
-    ret = bdrv_file_open(&bs->file, filename, NULL, NULL, flags, &local_err);
+    ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image",
+                          flags, true, false, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         goto fail;
commit d095b465339b79929fd2adc25c0ab3598e80fd39
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:14 2013 +0100

    blockdev: Move "file" to legacy_opts
    
    Specifying the image filename through the "file" option is a legacy
    option and should not be supported by blockdev-add (in that case, giving
    a string for "file" references an existing block device).
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Kevin Wolf <kwolf at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/blockdev.c b/blockdev.c
index e457494..386109a 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -307,12 +307,11 @@ static bool check_throttle_config(ThrottleConfig *cfg, Error **errp)
 typedef enum { MEDIA_DISK, MEDIA_CDROM } DriveMediaType;
 
 /* Takes the ownership of bs_opts */
-static DriveInfo *blockdev_init(QDict *bs_opts,
+static DriveInfo *blockdev_init(const char *file, QDict *bs_opts,
                                 BlockInterfaceType type,
                                 Error **errp)
 {
     const char *buf;
-    const char *file = NULL;
     const char *serial;
     int ro = 0;
     int bdrv_flags = 0;
@@ -354,7 +353,6 @@ static DriveInfo *blockdev_init(QDict *bs_opts,
     ro = qemu_opt_get_bool(opts, "read-only", 0);
     copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", false);
 
-    file = qemu_opt_get(opts, "file");
     serial = qemu_opt_get(opts, "serial");
 
     if ((buf = qemu_opt_get(opts, "discard")) != NULL) {
@@ -599,6 +597,10 @@ QemuOptsList qemu_legacy_drive_opts = {
             .name = "addr",
             .type = QEMU_OPT_STRING,
             .help = "pci address (virtio only)",
+        },{
+            .name = "file",
+            .type = QEMU_OPT_STRING,
+            .help = "file name",
         },
 
         /* Options that are passed on, but have special semantics with -drive */
@@ -629,6 +631,7 @@ DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type)
     const char *devaddr;
     bool read_only = false;
     bool copy_on_read;
+    const char *filename;
     Error *local_err = NULL;
 
     /* Change legacy command line options into QMP ones */
@@ -867,8 +870,10 @@ DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type)
         }
     }
 
+    filename = qemu_opt_get(legacy_opts, "file");
+
     /* Actual block device init: Functionality shared with blockdev-add */
-    dinfo = blockdev_init(bs_opts, type, &local_err);
+    dinfo = blockdev_init(filename, bs_opts, type, &local_err);
     if (dinfo == NULL) {
         if (error_is_set(&local_err)) {
             qerror_report_err(local_err);
@@ -2210,7 +2215,7 @@ void qmp_blockdev_add(BlockdevOptions *options, Error **errp)
 
     qdict_flatten(qdict);
 
-    blockdev_init(qdict, IF_NONE, &local_err);
+    blockdev_init(NULL, qdict, IF_NONE, &local_err);
     if (error_is_set(&local_err)) {
         error_propagate(errp, local_err);
         goto fail;
@@ -2251,10 +2256,6 @@ QemuOptsList qemu_common_drive_opts = {
             .type = QEMU_OPT_BOOL,
             .help = "enable/disable snapshot mode",
         },{
-            .name = "file",
-            .type = QEMU_OPT_STRING,
-            .help = "disk image",
-        },{
             .name = "discard",
             .type = QEMU_OPT_STRING,
             .help = "discard operation (ignore/off, unmap/on)",
commit 505d758334afcee07eb40aa1b33f2353c612c8ec
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:13 2013 +0100

    block: Allow recursive "file"s
    
    It should be possible to use a format as a driver for a file which in
    turn requires another file, i.e., nesting file formats.
    
    Allowing nested file formats results in e.g. qcow2 BlockDriverStates
    never being directly passed to bdrv_open_common() from bdrv_file_open(),
    but instead being handed through bdrv_open(). This changes the error
    message when trying to give a filename to qcow2, i.e. trying to use it
    as a driver for the protocol level. Therefore, change the reference
    output of I/O test 051 accordingly.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index 9e4e85f..6af5f6e 100644
--- a/block.c
+++ b/block.c
@@ -948,14 +948,19 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename,
         goto fail;
     }
 
-    ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
+    if (!drv->bdrv_file_open) {
+        ret = bdrv_open(bs, filename, options, flags, drv, &local_err);
+        options = NULL;
+    } else {
+        ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err);
+    }
     if (ret < 0) {
         error_propagate(errp, local_err);
         goto fail;
     }
 
     /* Check if any unknown options were used */
-    if (qdict_size(options) != 0) {
+    if (options && (qdict_size(options) != 0)) {
         const QDictEntry *entry = qdict_first(options);
         error_setg(errp, "Block protocol '%s' doesn't support the option '%s'",
                    drv->format_name, entry->key);
diff --git a/tests/qemu-iotests/051.out b/tests/qemu-iotests/051.out
index c2cadba..d0c5173 100644
--- a/tests/qemu-iotests/051.out
+++ b/tests/qemu-iotests/051.out
@@ -222,7 +222,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
 (qemu) qququiquit
 
 Testing: -drive file=TEST_DIR/t.qcow2,file.driver=qcow2
-QEMU_PROG: -drive file=TEST_DIR/t.qcow2,file.driver=qcow2: could not open disk image TEST_DIR/t.qcow2: Can't use 'qcow2' as a block driver for the protocol level
+QEMU_PROG: -drive file=TEST_DIR/t.qcow2,file.driver=qcow2: could not open disk image TEST_DIR/t.qcow2: Block format 'qcow2' used by device '' doesn't support the option 'filename'
 
 
 === Parsing protocol from file name ===
commit 054963f8f082695ecb1f169024c83ce3e4eea3d8
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:12 2013 +0100

    block: Use bdrv_open_image() in bdrv_open()
    
    Using bdrv_open_image() instead of bdrv_file_open() directly in
    bdrv_open() is easier.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index 76b6c25..9e4e85f 100644
--- a/block.c
+++ b/block.c
@@ -1128,8 +1128,6 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
     /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
     char tmp_filename[PATH_MAX + 1];
     BlockDriverState *file = NULL;
-    QDict *file_options = NULL;
-    const char *file_reference;
     const char *drvname;
     Error *local_err = NULL;
 
@@ -1215,17 +1213,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
         flags |= BDRV_O_ALLOW_RDWR;
     }
 
-    qdict_extract_subqdict(options, &file_options, "file.");
-    file_reference = qdict_get_try_str(options, "file");
-
-    if (filename || file_reference || qdict_size(file_options)) {
-        ret = bdrv_file_open(&file, filename, file_reference, file_options,
-                             bdrv_open_flags(bs, flags | BDRV_O_UNMAP),
-                             &local_err);
-        qdict_del(options, "file");
-        if (ret < 0) {
-            goto fail;
-        }
+    ret = bdrv_open_image(&file, filename, options, "file",
+                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true,
+                          &local_err);
+    if (ret < 0) {
+        goto fail;
     }
 
     /* Find the right image format driver */
commit da557aac181fa71fde6a2a7c7a1eb2aea20caf64
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:11 2013 +0100

    block: Add bdrv_open_image()
    
    Add a common function for opening images to be used for block drivers
    specified through BlockdevRefs in an option QDict. The difference from
    bdrv_file_open() is that this function may invoke bdrv_open() instead,
    allowing auto-detection of the driver to be used; and second, it
    automatically extracts the BlockdevRef from the option QDict.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index 7464fb2..76b6c25 100644
--- a/block.c
+++ b/block.c
@@ -1041,6 +1041,79 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp)
 }
 
 /*
+ * Opens a disk image whose options are given as BlockdevRef in another block
+ * device's options.
+ *
+ * If force_raw is true, bdrv_file_open() will be used, thereby preventing any
+ * image format auto-detection. If it is false and a filename is given,
+ * bdrv_open() will be used for auto-detection.
+ *
+ * If allow_none is true, no image will be opened if filename is false and no
+ * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned.
+ *
+ * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
+ * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
+ * itself, all options starting with "${bdref_key}." are considered part of the
+ * BlockdevRef.
+ *
+ * The BlockdevRef will be removed from the options QDict.
+ */
+int bdrv_open_image(BlockDriverState **pbs, const char *filename,
+                    QDict *options, const char *bdref_key, int flags,
+                    bool force_raw, bool allow_none, Error **errp)
+{
+    QDict *image_options;
+    int ret;
+    char *bdref_key_dot;
+    const char *reference;
+
+    bdref_key_dot = g_strdup_printf("%s.", bdref_key);
+    qdict_extract_subqdict(options, &image_options, bdref_key_dot);
+    g_free(bdref_key_dot);
+
+    reference = qdict_get_try_str(options, bdref_key);
+    if (!filename && !reference && !qdict_size(image_options)) {
+        if (allow_none) {
+            ret = 0;
+        } else {
+            error_setg(errp, "A block device must be specified for \"%s\"",
+                       bdref_key);
+            ret = -EINVAL;
+        }
+        goto done;
+    }
+
+    if (filename && !force_raw) {
+        /* If a filename is given and the block driver should be detected
+           automatically (instead of using none), use bdrv_open() in order to do
+           that auto-detection. */
+        BlockDriverState *bs;
+
+        if (reference) {
+            error_setg(errp, "Cannot reference an existing block device while "
+                       "giving a filename");
+            ret = -EINVAL;
+            goto done;
+        }
+
+        bs = bdrv_new("");
+        ret = bdrv_open(bs, filename, image_options, flags, NULL, errp);
+        if (ret < 0) {
+            bdrv_unref(bs);
+        } else {
+            *pbs = bs;
+        }
+    } else {
+        ret = bdrv_file_open(pbs, filename, reference, image_options, flags,
+                             errp);
+    }
+
+done:
+    qdict_del(options, bdref_key);
+    return ret;
+}
+
+/*
  * Opens a disk image (raw, qcow2, vmdk, ...)
  *
  * options is a QDict of options to pass to the block drivers, or NULL for an
diff --git a/include/block/block.h b/include/block/block.h
index e2b2a15..a47f3d4 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -186,6 +186,9 @@ int bdrv_parse_discard_flags(const char *mode, int *flags);
 int bdrv_file_open(BlockDriverState **pbs, const char *filename,
                    const char *reference, QDict *options, int flags,
                    Error **errp);
+int bdrv_open_image(BlockDriverState **pbs, const char *filename,
+                    QDict *options, const char *bdref_key, int flags,
+                    bool force_raw, bool allow_none, Error **errp);
 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp);
 int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
               int flags, BlockDriver *drv, Error **errp);
commit 2a05cbe426a7a3ddec63dbc67c9ac93013aebf77
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:10 2013 +0100

    block: Allow block devices without files
    
    blkdebug and blkverify will, in order to retain compatibility, not
    support the field "file" implicitly through bdrv_open(). In order to be
    able to use those drivers without giving a filename anyway, it is
    necessary to be able to have block devices without files implicitly
    opened by bdrv_open(). This is the case, if there was neither a file
    name, a reference to an existing block device to use as a file nor
    options specific to the file.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index bef4f82..7464fb2 100644
--- a/block.c
+++ b/block.c
@@ -1145,11 +1145,14 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
     qdict_extract_subqdict(options, &file_options, "file.");
     file_reference = qdict_get_try_str(options, "file");
 
-    ret = bdrv_file_open(&file, filename, file_reference, file_options,
-                         bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err);
-    qdict_del(options, "file");
-    if (ret < 0) {
-        goto fail;
+    if (filename || file_reference || qdict_size(file_options)) {
+        ret = bdrv_file_open(&file, filename, file_reference, file_options,
+                             bdrv_open_flags(bs, flags | BDRV_O_UNMAP),
+                             &local_err);
+        qdict_del(options, "file");
+        if (ret < 0) {
+            goto fail;
+        }
     }
 
     /* Find the right image format driver */
@@ -1165,7 +1168,13 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
     }
 
     if (!drv) {
-        ret = find_image_format(file, filename, &drv, &local_err);
+        if (file) {
+            ret = find_image_format(file, filename, &drv, &local_err);
+        } else {
+            error_setg(errp, "Must specify either driver or file");
+            ret = -EINVAL;
+            goto unlink_and_fail;
+        }
     }
 
     if (!drv) {
@@ -1178,7 +1187,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
         goto unlink_and_fail;
     }
 
-    if (bs->file != file) {
+    if (file && (bs->file != file)) {
         bdrv_unref(file);
         file = NULL;
     }
commit 2258e3fe20990a13c9aa2c1adccafae073b7ce13
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:09 2013 +0100

    block: Pass reference to bdrv_file_open()
    
    With that now being possible, bdrv_open() should try to extract a block
    device reference from the options and pass it to bdrv_file_open().
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Kevin Wolf <kwolf at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index 1e53b3d..bef4f82 100644
--- a/block.c
+++ b/block.c
@@ -1056,6 +1056,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
     char tmp_filename[PATH_MAX + 1];
     BlockDriverState *file = NULL;
     QDict *file_options = NULL;
+    const char *file_reference;
     const char *drvname;
     Error *local_err = NULL;
 
@@ -1142,9 +1143,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
     }
 
     qdict_extract_subqdict(options, &file_options, "file.");
+    file_reference = qdict_get_try_str(options, "file");
 
-    ret = bdrv_file_open(&file, filename, NULL, file_options,
+    ret = bdrv_file_open(&file, filename, file_reference, file_options,
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err);
+    qdict_del(options, "file");
     if (ret < 0) {
         goto fail;
     }
commit 72daa72eeecb6b2ee06ab7d836ac3aa01ad7e6df
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:08 2013 +0100

    block: Allow reference for bdrv_file_open()
    
    Allow specifying a reference to an existing block device (by name) for
    bdrv_file_open() instead of a filename and/or options.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Kevin Wolf <kwolf at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block.c b/block.c
index 64e7d22..1e53b3d 100644
--- a/block.c
+++ b/block.c
@@ -858,9 +858,10 @@ free_and_fail:
  * dictionary, it needs to use QINCREF() before calling bdrv_file_open.
  */
 int bdrv_file_open(BlockDriverState **pbs, const char *filename,
-                   QDict *options, int flags, Error **errp)
+                   const char *reference, QDict *options, int flags,
+                   Error **errp)
 {
-    BlockDriverState *bs;
+    BlockDriverState *bs = NULL;
     BlockDriver *drv;
     const char *drvname;
     bool allow_protocol_prefix = false;
@@ -872,6 +873,24 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename,
         options = qdict_new();
     }
 
+    if (reference) {
+        if (filename || qdict_size(options)) {
+            error_setg(errp, "Cannot reference an existing block device with "
+                       "additional options or a new filename");
+            return -EINVAL;
+        }
+        QDECREF(options);
+
+        bs = bdrv_find(reference);
+        if (!bs) {
+            error_setg(errp, "Cannot find block device '%s'", reference);
+            return -ENODEV;
+        }
+        bdrv_ref(bs);
+        *pbs = bs;
+        return 0;
+    }
+
     bs = bdrv_new("");
     bs->options = options;
     options = qdict_clone_shallow(options);
@@ -1124,7 +1143,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
 
     qdict_extract_subqdict(options, &file_options, "file.");
 
-    ret = bdrv_file_open(&file, filename, file_options,
+    ret = bdrv_file_open(&file, filename, NULL, file_options,
                          bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err);
     if (ret < 0) {
         goto fail;
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 0bf3bb5..21a4931 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -403,7 +403,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
-    ret = bdrv_file_open(&bs->file, filename, NULL, flags, &local_err);
+    ret = bdrv_file_open(&bs->file, filename, NULL, NULL, flags, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         goto fail;
diff --git a/block/blkverify.c b/block/blkverify.c
index 1c1637f..e15ac4c 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -141,7 +141,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
-    ret = bdrv_file_open(&bs->file, raw, NULL, flags, &local_err);
+    ret = bdrv_file_open(&bs->file, raw, NULL, NULL, flags, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         goto fail;
diff --git a/block/cow.c b/block/cow.c
index dc15e46..7fc0b12 100644
--- a/block/cow.c
+++ b/block/cow.c
@@ -351,7 +351,8 @@ static int cow_create(const char *filename, QEMUOptionParameter *options,
         return ret;
     }
 
-    ret = bdrv_file_open(&cow_bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&cow_bs, filename, NULL, NULL, BDRV_O_RDWR,
+                         &local_err);
     if (ret < 0) {
         qerror_report_err(local_err);
         error_free(local_err);
diff --git a/block/qcow.c b/block/qcow.c
index c470e05..948b0c5 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -691,7 +691,8 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options,
         return ret;
     }
 
-    ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&qcow_bs, filename, NULL, NULL, BDRV_O_RDWR,
+                         &local_err);
     if (ret < 0) {
         qerror_report_err(local_err);
         error_free(local_err);
diff --git a/block/qcow2.c b/block/qcow2.c
index 8ec9db1..e15a4dd 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1483,7 +1483,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
         return ret;
     }
 
-    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         return ret;
diff --git a/block/qed.c b/block/qed.c
index 450a1fa..0dd5c58 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -563,8 +563,8 @@ static int qed_create(const char *filename, uint32_t cluster_size,
         return ret;
     }
 
-    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB,
-                         &local_err);
+    ret = bdrv_file_open(&bs, filename, NULL, NULL,
+                         BDRV_O_RDWR | BDRV_O_CACHE_WB, &local_err);
     if (ret < 0) {
         qerror_report_err(local_err);
         error_free(local_err);
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 6088fa5..2ce3d9b 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1534,7 +1534,7 @@ static int sd_prealloc(const char *filename)
     Error *local_err = NULL;
     int ret;
 
-    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err);
     if (ret < 0) {
         qerror_report_err(local_err);
         error_free(local_err);
@@ -1695,7 +1695,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options,
             goto out;
         }
 
-        ret = bdrv_file_open(&bs, backing_file, NULL, 0, &local_err);
+        ret = bdrv_file_open(&bs, backing_file, NULL, NULL, 0, &local_err);
         if (ret < 0) {
             qerror_report_err(local_err);
             error_free(local_err);
diff --git a/block/vhdx.c b/block/vhdx.c
index 1995778..9ee0a61 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1797,7 +1797,7 @@ static int vhdx_create(const char *filename, QEMUOptionParameter *options,
         goto exit;
     }
 
-    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         goto exit;
diff --git a/block/vmdk.c b/block/vmdk.c
index 22b99b0..599a928 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -769,8 +769,8 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
 
         path_combine(extent_path, sizeof(extent_path),
                 desc_file_path, fname);
-        ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags,
-                             errp);
+        ret = bdrv_file_open(&extent_file, extent_path, NULL, NULL,
+                             bs->open_flags, errp);
         if (ret) {
             return ret;
         }
@@ -1469,7 +1469,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
         goto exit;
     }
 
-    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err);
     if (ret < 0) {
         error_propagate(errp, local_err);
         goto exit;
@@ -1807,7 +1807,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
             goto exit;
         }
     }
-    ret = bdrv_file_open(&new_bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    ret = bdrv_file_open(&new_bs, filename, NULL, NULL, BDRV_O_RDWR, &local_err);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "Could not write description");
         goto exit;
diff --git a/include/block/block.h b/include/block/block.h
index 36efaea..e2b2a15 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -184,7 +184,8 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top);
 int bdrv_parse_cache_flags(const char *mode, int *flags);
 int bdrv_parse_discard_flags(const char *mode, int *flags);
 int bdrv_file_open(BlockDriverState **pbs, const char *filename,
-                   QDict *options, int flags, Error **errp);
+                   const char *reference, QDict *options, int flags,
+                   Error **errp);
 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp);
 int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
               int flags, BlockDriver *drv, Error **errp);
diff --git a/qemu-io.c b/qemu-io.c
index fdc46a9..bfb773e 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -59,7 +59,7 @@ static int openfile(char *name, int flags, int growable, QDict *opts)
     }
 
     if (growable) {
-        if (bdrv_file_open(&qemuio_bs, name, opts, flags, &local_err)) {
+        if (bdrv_file_open(&qemuio_bs, name, NULL, opts, flags, &local_err)) {
             fprintf(stderr, "%s: can't open device %s: %s\n", progname, name,
                     error_get_pretty(local_err));
             error_free(local_err);
commit 89f2b21e36cce948c39fa7cf24226f6e5f042cc8
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:07 2013 +0100

    blkdebug: Use command-line in read_config()
    
    Use qemu_config_parse_qdict() to parse the command-line options in
    addition to the config file.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Kevin Wolf <kwolf at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/blkdebug.c b/block/blkdebug.c
index acf23f2..0bf3bb5 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -271,11 +271,13 @@ static void remove_rule(BlkdebugRule *rule)
     g_free(rule);
 }
 
-static int read_config(BDRVBlkdebugState *s, const char *filename, Error **errp)
+static int read_config(BDRVBlkdebugState *s, const char *filename,
+                       QDict *options, Error **errp)
 {
     FILE *f = NULL;
     int ret;
     struct add_rule_data d;
+    Error *local_err = NULL;
 
     if (filename) {
         f = fopen(filename, "r");
@@ -292,6 +294,13 @@ static int read_config(BDRVBlkdebugState *s, const char *filename, Error **errp)
         }
     }
 
+    qemu_config_parse_qdict(options, config_groups, &local_err);
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto fail;
+    }
+
     d.s = s;
     d.action = ACTION_INJECT_ERROR;
     qemu_opts_foreach(&inject_error_opts, add_rule, &d, 0);
@@ -376,9 +385,9 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
-    /* Read rules from config file */
+    /* Read rules from config file or command line options */
     config = qemu_opt_get(opts, "config");
-    ret = read_config(s, config, errp);
+    ret = read_config(s, config, options, errp);
     if (ret) {
         goto fail;
     }
commit 85a040e5485413333da4fcf98bc8b28c92fa623f
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:06 2013 +0100

    blkdebug: Always call read_config()
    
    Move the check whether there actually is a config file into the
    read_config() function.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/blkdebug.c b/block/blkdebug.c
index fab76ce..acf23f2 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -273,21 +273,23 @@ static void remove_rule(BlkdebugRule *rule)
 
 static int read_config(BDRVBlkdebugState *s, const char *filename, Error **errp)
 {
-    FILE *f;
+    FILE *f = NULL;
     int ret;
     struct add_rule_data d;
 
-    f = fopen(filename, "r");
-    if (f == NULL) {
-        error_setg_errno(errp, errno, "Could not read blkdebug config file");
-        return -errno;
-    }
+    if (filename) {
+        f = fopen(filename, "r");
+        if (f == NULL) {
+            error_setg_errno(errp, errno, "Could not read blkdebug config file");
+            return -errno;
+        }
 
-    ret = qemu_config_parse(f, config_groups, filename);
-    if (ret < 0) {
-        error_setg(errp, "Could not parse blkdebug config file");
-        ret = -EINVAL;
-        goto fail;
+        ret = qemu_config_parse(f, config_groups, filename);
+        if (ret < 0) {
+            error_setg(errp, "Could not parse blkdebug config file");
+            ret = -EINVAL;
+            goto fail;
+        }
     }
 
     d.s = s;
@@ -301,7 +303,9 @@ static int read_config(BDRVBlkdebugState *s, const char *filename, Error **errp)
 fail:
     qemu_opts_reset(&inject_error_opts);
     qemu_opts_reset(&set_state_opts);
-    fclose(f);
+    if (f) {
+        fclose(f);
+    }
     return ret;
 }
 
@@ -374,11 +378,9 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
 
     /* Read rules from config file */
     config = qemu_opt_get(opts, "config");
-    if (config) {
-        ret = read_config(s, config, errp);
-        if (ret) {
-            goto fail;
-        }
+    ret = read_config(s, config, errp);
+    if (ret) {
+        goto fail;
     }
 
     /* Set initial state */
commit adf5c449e5beb163999e4ba7366d5f9aebb504a1
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:05 2013 +0100

    qemu-option: Add qemu_config_parse_qdict()
    
    This function basically parses command-line options given as a QDict
    replacing a config file.
    
    For instance, the QDict {"section.opt1": 42, "section.opt2": 23}
    corresponds to the config file:
    
    [section]
    opt1 = 42
    opt2 = 23
    
    It is possible to specify multiple sections and also multiple sections
    of the same type. On the command line, this looks like the following:
    
    inject-error.0.event=reftable_load,\
    inject-error.1.event=l2_load,\
    set-state.event=l1_update
    
    This would correspond to the following config file:
    
    [inject-error "inject-error.0"]
    event = reftable_load
    
    [inject-error "inject-error.1"]
    event = l2_load
    
    [set-state]
    event = l1_update
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/include/qemu/config-file.h b/include/qemu/config-file.h
index 508428f..dbd97c4 100644
--- a/include/qemu/config-file.h
+++ b/include/qemu/config-file.h
@@ -4,6 +4,7 @@
 #include <stdio.h>
 #include "qemu/option.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
 
 QemuOptsList *qemu_find_opts(const char *group);
 QemuOptsList *qemu_find_opts_err(const char *group, Error **errp);
@@ -18,6 +19,11 @@ int qemu_config_parse(FILE *fp, QemuOptsList **lists, const char *fname);
 
 int qemu_read_config_file(const char *filename);
 
+/* Parse QDict options as a replacement for a config file (allowing multiple
+   enumerated (0..(n-1)) configuration "sections") */
+void qemu_config_parse_qdict(QDict *options, QemuOptsList **lists,
+                             Error **errp);
+
 /* Read default QEMU config files
  */
 int qemu_read_default_config_files(bool userconfig);
diff --git a/util/qemu-config.c b/util/qemu-config.c
index 7973659..9298f55 100644
--- a/util/qemu-config.c
+++ b/util/qemu-config.c
@@ -356,3 +356,103 @@ int qemu_read_config_file(const char *filename)
         return -EINVAL;
     }
 }
+
+static void config_parse_qdict_section(QDict *options, QemuOptsList *opts,
+                                       Error **errp)
+{
+    QemuOpts *subopts;
+    QDict *subqdict;
+    QList *list = NULL;
+    Error *local_err = NULL;
+    size_t orig_size, enum_size;
+    char *prefix;
+
+    prefix = g_strdup_printf("%s.", opts->name);
+    qdict_extract_subqdict(options, &subqdict, prefix);
+    g_free(prefix);
+    orig_size = qdict_size(subqdict);
+    if (!orig_size) {
+        goto out;
+    }
+
+    subopts = qemu_opts_create(opts, NULL, 0, &local_err);
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
+        goto out;
+    }
+
+    qemu_opts_absorb_qdict(subopts, subqdict, &local_err);
+    if (error_is_set(&local_err)) {
+        error_propagate(errp, local_err);
+        goto out;
+    }
+
+    enum_size = qdict_size(subqdict);
+    if (enum_size < orig_size && enum_size) {
+        error_setg(errp, "Unknown option '%s' for [%s]",
+                   qdict_first(subqdict)->key, opts->name);
+        goto out;
+    }
+
+    if (enum_size) {
+        /* Multiple, enumerated sections */
+        QListEntry *list_entry;
+        unsigned i = 0;
+
+        /* Not required anymore */
+        qemu_opts_del(subopts);
+
+        qdict_array_split(subqdict, &list);
+        if (qdict_size(subqdict)) {
+            error_setg(errp, "Unused option '%s' for [%s]",
+                       qdict_first(subqdict)->key, opts->name);
+            goto out;
+        }
+
+        QLIST_FOREACH_ENTRY(list, list_entry) {
+            QDict *section = qobject_to_qdict(qlist_entry_obj(list_entry));
+            char *opt_name;
+
+            opt_name = g_strdup_printf("%s.%u", opts->name, i++);
+            subopts = qemu_opts_create(opts, opt_name, 1, &local_err);
+            g_free(opt_name);
+            if (error_is_set(&local_err)) {
+                error_propagate(errp, local_err);
+                goto out;
+            }
+
+            qemu_opts_absorb_qdict(subopts, section, &local_err);
+            if (error_is_set(&local_err)) {
+                error_propagate(errp, local_err);
+                qemu_opts_del(subopts);
+                goto out;
+            }
+
+            if (qdict_size(section)) {
+                error_setg(errp, "[%s] section doesn't support the option '%s'",
+                           opts->name, qdict_first(section)->key);
+                qemu_opts_del(subopts);
+                goto out;
+            }
+        }
+    }
+
+out:
+    QDECREF(subqdict);
+    QDECREF(list);
+}
+
+void qemu_config_parse_qdict(QDict *options, QemuOptsList **lists,
+                             Error **errp)
+{
+    int i;
+    Error *local_err = NULL;
+
+    for (i = 0; lists[i]; i++) {
+        config_parse_qdict_section(options, lists[i], &local_err);
+        if (error_is_set(&local_err)) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
commit 9f23fc0c23ab16e9c16b41ed300786924f7a7768
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:04 2013 +0100

    qapi: extend qdict_flatten() for QLists
    
    Reversing qdict_array_split(), qdict_flatten() should flatten QLists as
    well by interpreting them as QDicts where every entry's key is its
    index.
    
    This allows bringing QDicts with QLists from QMP commands to the same
    form as they would be given as command-line options, thereby allowing
    them to be parsed the same way.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/qobject/qdict.c b/qobject/qdict.c
index 2d5848d..a3924f2 100644
--- a/qobject/qdict.c
+++ b/qobject/qdict.c
@@ -477,7 +477,43 @@ static void qdict_destroy_obj(QObject *obj)
     g_free(qdict);
 }
 
-static void qdict_do_flatten(QDict *qdict, QDict *target, const char *prefix)
+static void qdict_flatten_qdict(QDict *qdict, QDict *target,
+                                const char *prefix);
+
+static void qdict_flatten_qlist(QList *qlist, QDict *target, const char *prefix)
+{
+    QObject *value;
+    const QListEntry *entry;
+    char *new_key;
+    int i;
+
+    /* This function is never called with prefix == NULL, i.e., it is always
+     * called from within qdict_flatten_q(list|dict)(). Therefore, it does not
+     * need to remove list entries during the iteration (the whole list will be
+     * deleted eventually anyway from qdict_flatten_qdict()). */
+    assert(prefix);
+
+    entry = qlist_first(qlist);
+
+    for (i = 0; entry; entry = qlist_next(entry), i++) {
+        value = qlist_entry_obj(entry);
+        new_key = g_strdup_printf("%s.%i", prefix, i);
+
+        if (qobject_type(value) == QTYPE_QDICT) {
+            qdict_flatten_qdict(qobject_to_qdict(value), target, new_key);
+        } else if (qobject_type(value) == QTYPE_QLIST) {
+            qdict_flatten_qlist(qobject_to_qlist(value), target, new_key);
+        } else {
+            /* All other types are moved to the target unchanged. */
+            qobject_incref(value);
+            qdict_put_obj(target, new_key, value);
+        }
+
+        g_free(new_key);
+    }
+}
+
+static void qdict_flatten_qdict(QDict *qdict, QDict *target, const char *prefix)
 {
     QObject *value;
     const QDictEntry *entry, *next;
@@ -500,8 +536,12 @@ static void qdict_do_flatten(QDict *qdict, QDict *target, const char *prefix)
         if (qobject_type(value) == QTYPE_QDICT) {
             /* Entries of QDicts are processed recursively, the QDict object
              * itself disappears. */
-            qdict_do_flatten(qobject_to_qdict(value), target,
-                             new_key ? new_key : entry->key);
+            qdict_flatten_qdict(qobject_to_qdict(value), target,
+                                new_key ? new_key : entry->key);
+            delete = true;
+        } else if (qobject_type(value) == QTYPE_QLIST) {
+            qdict_flatten_qlist(qobject_to_qlist(value), target,
+                                new_key ? new_key : entry->key);
             delete = true;
         } else if (prefix) {
             /* All other objects are moved to the target unchanged. */
@@ -526,12 +566,14 @@ static void qdict_do_flatten(QDict *qdict, QDict *target, const char *prefix)
 
 /**
  * qdict_flatten(): For each nested QDict with key x, all fields with key y
- * are moved to this QDict and their key is renamed to "x.y". This operation
- * is applied recursively for nested QDicts.
+ * are moved to this QDict and their key is renamed to "x.y". For each nested
+ * QList with key x, the field at index y is moved to this QDict with the key
+ * "x.y" (i.e., the reverse of what qdict_array_split() does).
+ * This operation is applied recursively for nested QDicts and QLists.
  */
 void qdict_flatten(QDict *qdict)
 {
-    qdict_do_flatten(qdict, qdict, NULL);
+    qdict_flatten_qdict(qdict, qdict, NULL);
 }
 
 /* extract all the src QDict entries starting by start into dst */
commit 05a8c2227157eda2540404999c4615d3bf343c18
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:03 2013 +0100

    qdict: Add qdict_array_split()
    
    This function splits a QDict consisting of entries prefixed by
    incrementally enumerated indices into a QList of QDicts.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/include/qapi/qmp/qdict.h b/include/qapi/qmp/qdict.h
index 5cefd80..1ddf97b 100644
--- a/include/qapi/qmp/qdict.h
+++ b/include/qapi/qmp/qdict.h
@@ -68,5 +68,6 @@ QDict *qdict_clone_shallow(const QDict *src);
 void qdict_flatten(QDict *qdict);
 
 void qdict_extract_subqdict(QDict *src, QDict **dst, const char *start);
+void qdict_array_split(QDict *src, QList **dst);
 
 #endif /* QDICT_H */
diff --git a/qobject/qdict.c b/qobject/qdict.c
index 17e14f0..2d5848d 100644
--- a/qobject/qdict.c
+++ b/qobject/qdict.c
@@ -554,3 +554,40 @@ void qdict_extract_subqdict(QDict *src, QDict **dst, const char *start)
         entry = next;
     }
 }
+
+/**
+ * qdict_array_split(): This function moves array-like elements of a QDict into
+ * a new QList of QDicts. Every entry in the original QDict with a key prefixed
+ * "%u.", where %u designates an unsigned integer starting at 0 and
+ * incrementally counting up, will be moved to a new QDict at index %u in the
+ * output QList with the key prefix removed. The function terminates when there
+ * is no entry in the QDict with a prefix directly (incrementally) following the
+ * last one.
+ * Example: {"0.a": 42, "0.b": 23, "1.x": 0, "3.y": 1, "o.o": 7}
+ *      (or {"1.x": 0, "3.y": 1, "0.a": 42, "o.o": 7, "0.b": 23})
+ *       => [{"a": 42, "b": 23}, {"x": 0}]
+ *      and {"3.y": 1, "o.o": 7} (remainder of the old QDict)
+ */
+void qdict_array_split(QDict *src, QList **dst)
+{
+    unsigned i;
+
+    *dst = qlist_new();
+
+    for (i = 0; i < UINT_MAX; i++) {
+        QDict *subqdict;
+        char prefix[32];
+        size_t snprintf_ret;
+
+        snprintf_ret = snprintf(prefix, 32, "%u.", i);
+        assert(snprintf_ret < 32);
+
+        qdict_extract_subqdict(src, &subqdict, prefix);
+        if (!qdict_size(subqdict)) {
+            QDECREF(subqdict);
+            break;
+        }
+
+        qlist_append_obj(*dst, QOBJECT(subqdict));
+    }
+}
commit d4881b9bcbbadc83ffa5d8e6d2d6deb36cd8faa6
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:02 2013 +0100

    blkdebug: Don't require sophisticated filename
    
    If the filename is not prefixed by "blkdebug:" in
    blkdebug_parse_filename(), the blkdebug driver was not selected through
    that protocol prefix, but by an explicit command line option
    (file.driver=blkdebug or something similar). Contrary to the current
    reaction, this is not a problem at all; we just need to store the
    filename (in the x-image option) and can go on; the user just has to
    manually specify the config option.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/blkdebug.c b/block/blkdebug.c
index 2eb2e8b..fab76ce 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -313,7 +313,9 @@ static void blkdebug_parse_filename(const char *filename, QDict *options,
 
     /* Parse the blkdebug: prefix */
     if (!strstart(filename, "blkdebug:", &filename)) {
-        error_setg(errp, "File name string must start with 'blkdebug:'");
+        /* There was no prefix; therefore, all options have to be already
+           present in the QDict (except for the filename) */
+        qdict_put(options, "x-image", qstring_from_str(filename));
         return;
     }
 
commit 466b49f276310952ad64485d8b9fa87a5c8a9451
Author: Max Reitz <mreitz at redhat.com>
Date:   Fri Dec 20 19:28:01 2013 +0100

    blkdebug: Use errp for read_config()
    
    Use an Error variable in the read_config() function.
    
    Signed-off-by: Max Reitz <mreitz at redhat.com>
    Reviewed-by: Kevin Wolf <kwolf at redhat.com>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/blkdebug.c b/block/blkdebug.c
index ebc5f13..2eb2e8b 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -271,7 +271,7 @@ static void remove_rule(BlkdebugRule *rule)
     g_free(rule);
 }
 
-static int read_config(BDRVBlkdebugState *s, const char *filename)
+static int read_config(BDRVBlkdebugState *s, const char *filename, Error **errp)
 {
     FILE *f;
     int ret;
@@ -279,11 +279,14 @@ static int read_config(BDRVBlkdebugState *s, const char *filename)
 
     f = fopen(filename, "r");
     if (f == NULL) {
+        error_setg_errno(errp, errno, "Could not read blkdebug config file");
         return -errno;
     }
 
     ret = qemu_config_parse(f, config_groups, filename);
     if (ret < 0) {
+        error_setg(errp, "Could not parse blkdebug config file");
+        ret = -EINVAL;
         goto fail;
     }
 
@@ -370,9 +373,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
     /* Read rules from config file */
     config = qemu_opt_get(opts, "config");
     if (config) {
-        ret = read_config(s, config);
-        if (ret < 0) {
-            error_setg_errno(errp, -ret, "Could not read blkdebug config file");
+        ret = read_config(s, config, errp);
+        if (ret) {
             goto fail;
         }
     }
commit 4694020d3c0d21f02408d5cc6f44b8fb55b4ee15
Author: Stefan Hajnoczi <stefanha at redhat.com>
Date:   Thu Nov 14 11:54:18 2013 +0100

    qemu-io: add command completion
    
    Autocomplete qemu-io commands at the interactive prompt.
    
    Note this only completes command names and not their options.
    
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/include/qemu-io.h b/include/qemu-io.h
index a418b46..7e7c07c 100644
--- a/include/qemu-io.h
+++ b/include/qemu-io.h
@@ -42,5 +42,8 @@ bool qemuio_command(BlockDriverState *bs, const char *cmd);
 
 void qemuio_add_command(const cmdinfo_t *ci);
 int qemuio_command_usage(const cmdinfo_t *ci);
+void qemuio_complete_command(const char *input,
+                             void (*fn)(const char *cmd, void *opaque),
+                             void *opaque);
 
 #endif /* QEMU_IO_H */
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 85e4982..6dfb4a5 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -94,6 +94,21 @@ static const cmdinfo_t *find_command(const char *cmd)
     return NULL;
 }
 
+/* Invoke fn() for commands with a matching prefix */
+void qemuio_complete_command(const char *input,
+                             void (*fn)(const char *cmd, void *opaque),
+                             void *opaque)
+{
+    cmdinfo_t *ct;
+    size_t input_len = strlen(input);
+
+    for (ct = cmdtab; ct < &cmdtab[ncmds]; ct++) {
+        if (strncmp(input, ct->name, input_len) == 0) {
+            fn(ct->name, opaque);
+        }
+    }
+}
+
 static char **breakline(char *input, int *count)
 {
     int c = 0;
diff --git a/qemu-io.c b/qemu-io.c
index d7c26d3..fdc46a9 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -236,9 +236,15 @@ static void readline_func(void *opaque, const char *str, void *readline_opaque)
     *line = g_strdup(str);
 }
 
+static void completion_match(const char *cmd, void *opaque)
+{
+    readline_add_completion(readline_state, cmd);
+}
+
 static void readline_completion_func(void *opaque, const char *str)
 {
-    /* No command or argument completion implemented yet */
+    readline_set_completion_index(readline_state, strlen(str));
+    qemuio_complete_command(str, completion_match, NULL);
 }
 
 static char *fetchline_readline(void)
commit 0cf17e181798063c3824c8200ba46f25f54faa1a
Author: Stefan Hajnoczi <stefanha at redhat.com>
Date:   Thu Nov 14 11:54:17 2013 +0100

    qemu-io: use readline.c
    
    Use readline.c for command-line history.  There was support for GNU
    Readline and BSD Editline but it was never compiled in.  Since QEMU has
    its own readline.c, just use that when qemu-io runs with stdin attached
    to a terminal.
    
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/qemu-io.c b/qemu-io.c
index 3b3340a..d7c26d3 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -18,6 +18,7 @@
 #include "qemu/main-loop.h"
 #include "qemu/option.h"
 #include "qemu/config-file.h"
+#include "qemu/readline.h"
 #include "block/block_int.h"
 #include "trace/control.h"
 
@@ -32,6 +33,8 @@ extern int qemuio_misalign;
 static int ncmdline;
 static char **cmdline;
 
+static ReadLineState *readline_state;
+
 static int close_f(BlockDriverState *bs, int argc, char **argv)
 {
     bdrv_unref(bs);
@@ -203,14 +206,6 @@ static void usage(const char *name)
     name);
 }
 
-
-#if defined(ENABLE_READLINE)
-# include <readline/history.h>
-# include <readline/readline.h>
-#elif defined(ENABLE_EDITLINE)
-# include <histedit.h>
-#endif
-
 static char *get_prompt(void)
 {
     static char prompt[FILENAME_MAX + 2 /*"> "*/ + 1 /*"\0"*/ ];
@@ -222,52 +217,47 @@ static char *get_prompt(void)
     return prompt;
 }
 
-#if defined(ENABLE_READLINE)
-static char *fetchline(void)
+static void readline_printf_func(void *opaque, const char *fmt, ...)
 {
-    char *line = readline(get_prompt());
-    if (line && *line) {
-        add_history(line);
-    }
-    return line;
+    va_list ap;
+    va_start(ap, fmt);
+    vprintf(fmt, ap);
+    va_end(ap);
 }
-#elif defined(ENABLE_EDITLINE)
-static char *el_get_prompt(EditLine *e)
+
+static void readline_flush_func(void *opaque)
 {
-    return get_prompt();
+    fflush(stdout);
 }
 
-static char *fetchline(void)
+static void readline_func(void *opaque, const char *str, void *readline_opaque)
 {
-    static EditLine *el;
-    static History *hist;
-    HistEvent hevent;
-    char *line;
-    int count;
-
-    if (!el) {
-        hist = history_init();
-        history(hist, &hevent, H_SETSIZE, 100);
-        el = el_init(progname, stdin, stdout, stderr);
-        el_source(el, NULL);
-        el_set(el, EL_SIGNAL, 1);
-        el_set(el, EL_PROMPT, el_get_prompt);
-        el_set(el, EL_HIST, history, (const char *)hist);
-    }
-    line = strdup(el_gets(el, &count));
-    if (line) {
-        if (count > 0) {
-            line[count-1] = '\0';
-        }
-        if (*line) {
-            history(hist, &hevent, H_ENTER, line);
+    char **line = readline_opaque;
+    *line = g_strdup(str);
+}
+
+static void readline_completion_func(void *opaque, const char *str)
+{
+    /* No command or argument completion implemented yet */
+}
+
+static char *fetchline_readline(void)
+{
+    char *line = NULL;
+
+    readline_start(readline_state, get_prompt(), 0, readline_func, &line);
+    while (!line) {
+        int ch = getchar();
+        if (ch == EOF) {
+            break;
         }
+        readline_handle_byte(readline_state, ch);
     }
     return line;
 }
-#else
-# define MAXREADLINESZ 1024
-static char *fetchline(void)
+
+#define MAXREADLINESZ 1024
+static char *fetchline_fgets(void)
 {
     char *p, *line = g_malloc(MAXREADLINESZ);
 
@@ -283,7 +273,15 @@ static char *fetchline(void)
 
     return line;
 }
-#endif
+
+static char *fetchline(void)
+{
+    if (readline_state) {
+        return fetchline_readline();
+    } else {
+        return fetchline_fgets();
+    }
+}
 
 static void prep_fetchline(void *opaque)
 {
@@ -339,6 +337,11 @@ static void add_user_command(char *optarg)
     cmdline[ncmdline-1] = optarg;
 }
 
+static void reenable_tty_echo(void)
+{
+    qemu_set_tty_echo(STDIN_FILENO, true);
+}
+
 int main(int argc, char **argv)
 {
     int readonly = 0;
@@ -435,6 +438,15 @@ int main(int argc, char **argv)
     qemuio_add_command(&open_cmd);
     qemuio_add_command(&close_cmd);
 
+    if (isatty(STDIN_FILENO)) {
+        readline_state = readline_init(readline_printf_func,
+                                       readline_flush_func,
+                                       NULL,
+                                       readline_completion_func);
+        qemu_set_tty_echo(STDIN_FILENO, false);
+        atexit(reenable_tty_echo);
+    }
+
     /* open the device */
     if (!readonly) {
         flags |= BDRV_O_RDWR;
@@ -453,5 +465,6 @@ int main(int argc, char **argv)
     if (qemuio_bs) {
         bdrv_unref(qemuio_bs);
     }
+    g_free(readline_state);
     return 0;
 }
commit 13401ba0b982024b62a99388032bbb889dc98b43
Author: Stefan Hajnoczi <stefanha at redhat.com>
Date:   Thu Nov 14 11:54:16 2013 +0100

    osdep: add qemu_set_tty_echo()
    
    Using stdin with readline.c requires disabling echo and line buffering.
    Add a portable wrapper to set the terminal attributes under Linux and
    Windows.
    
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index b3e2b6d..eac7172 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -240,4 +240,6 @@ static inline void qemu_init_auxval(char **envp) { }
 void qemu_init_auxval(char **envp);
 #endif
 
+void qemu_set_tty_echo(int fd, bool echo);
+
 #endif
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index e00a44c..f5c4016 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -47,6 +47,9 @@ extern int daemon(int, int);
 #  define QEMU_VMALLOC_ALIGN getpagesize()
 #endif
 
+#include <termios.h>
+#include <unistd.h>
+
 #include <glib/gprintf.h>
 
 #include "config-host.h"
@@ -251,3 +254,18 @@ qemu_get_local_state_pathname(const char *relative_pathname)
     return g_strdup_printf("%s/%s", CONFIG_QEMU_LOCALSTATEDIR,
                            relative_pathname);
 }
+
+void qemu_set_tty_echo(int fd, bool echo)
+{
+    struct termios tty;
+
+    tcgetattr(fd, &tty);
+
+    if (echo) {
+        tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN;
+    } else {
+        tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN);
+    }
+
+    tcsetattr(fd, TCSANOW, &tty);
+}
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index 776ccfa..50be044 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -189,3 +189,22 @@ qemu_get_local_state_pathname(const char *relative_pathname)
     return g_strdup_printf("%s" G_DIR_SEPARATOR_S "%s", base_path,
                            relative_pathname);
 }
+
+void qemu_set_tty_echo(int fd, bool echo)
+{
+    HANDLE handle = (HANDLE)_get_osfhandle(fd);
+    DWORD dwMode = 0;
+
+    if (handle == INVALID_HANDLE_VALUE) {
+        return;
+    }
+
+    GetConsoleMode(handle, &dwMode);
+
+    if (echo) {
+        SetConsoleMode(handle, dwMode | ENABLE_ECHO_INPUT | ENABLE_LINE_INPUT);
+    } else {
+        SetConsoleMode(handle,
+                       dwMode & ~(ENABLE_ECHO_INPUT | ENABLE_LINE_INPUT));
+    }
+}
commit 0150cd81cf608b93778a067189829f354fe27e4b
Author: Stefan Hajnoczi <stefanha at redhat.com>
Date:   Thu Nov 14 11:54:15 2013 +0100

    readline: move readline to a generic location
    
    Now that the monitor and readline are decoupled, readline.h no longer
    belongs in include/monitor/.  Put the header into include/qemu/.
    
    Move the source file into util/ so it can be linked as part of
    libqemuutil.a.
    
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/Makefile.objs b/Makefile.objs
index 857bb53..ac1d0e1 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -43,7 +43,6 @@ libcacard-y += libcacard/vcardt.o
 ifeq ($(CONFIG_SOFTMMU),y)
 common-obj-y = $(block-obj-y) blockdev.o blockdev-nbd.o block/
 common-obj-y += net/
-common-obj-y += readline.o
 common-obj-y += qdev-monitor.o device-hotplug.o
 common-obj-$(CONFIG_WIN32) += os-win32.o
 common-obj-$(CONFIG_POSIX) += os-posix.o
diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index 22d8b8f..7e5f752 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -5,7 +5,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qapi/qmp/qdict.h"
 #include "block/block.h"
-#include "monitor/readline.h"
+#include "qemu/readline.h"
 
 extern Monitor *cur_mon;
 extern Monitor *default_mon;
diff --git a/include/monitor/readline.h b/include/monitor/readline.h
deleted file mode 100644
index a89fe4a..0000000
--- a/include/monitor/readline.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef READLINE_H
-#define READLINE_H
-
-#define READLINE_CMD_BUF_SIZE 4095
-#define READLINE_MAX_CMDS 64
-#define READLINE_MAX_COMPLETIONS 256
-
-typedef void ReadLinePrintfFunc(void *opaque, const char *fmt, ...);
-typedef void ReadLineFlushFunc(void *opaque);
-typedef void ReadLineFunc(void *opaque, const char *str,
-                          void *readline_opaque);
-typedef void ReadLineCompletionFunc(void *opaque,
-                                    const char *cmdline);
-
-typedef struct ReadLineState {
-    char cmd_buf[READLINE_CMD_BUF_SIZE + 1];
-    int cmd_buf_index;
-    int cmd_buf_size;
-
-    char last_cmd_buf[READLINE_CMD_BUF_SIZE + 1];
-    int last_cmd_buf_index;
-    int last_cmd_buf_size;
-
-    int esc_state;
-    int esc_param;
-
-    char *history[READLINE_MAX_CMDS];
-    int hist_entry;
-
-    ReadLineCompletionFunc *completion_finder;
-    char *completions[READLINE_MAX_COMPLETIONS];
-    int nb_completions;
-    int completion_index;
-
-    ReadLineFunc *readline_func;
-    void *readline_opaque;
-    int read_password;
-    char prompt[256];
-
-    ReadLinePrintfFunc *printf_func;
-    ReadLineFlushFunc *flush_func;
-    void *opaque;
-} ReadLineState;
-
-void readline_add_completion(ReadLineState *rs, const char *str);
-void readline_set_completion_index(ReadLineState *rs, int completion_index);
-
-const char *readline_get_history(ReadLineState *rs, unsigned int index);
-
-void readline_handle_byte(ReadLineState *rs, int ch);
-
-void readline_start(ReadLineState *rs, const char *prompt, int read_password,
-                    ReadLineFunc *readline_func, void *readline_opaque);
-void readline_restart(ReadLineState *rs);
-void readline_show_prompt(ReadLineState *rs);
-
-ReadLineState *readline_init(ReadLinePrintfFunc *printf_func,
-                             ReadLineFlushFunc *flush_func,
-                             void *opaque,
-                             ReadLineCompletionFunc *completion_finder);
-
-#endif /* !READLINE_H */
diff --git a/include/qemu/readline.h b/include/qemu/readline.h
new file mode 100644
index 0000000..a89fe4a
--- /dev/null
+++ b/include/qemu/readline.h
@@ -0,0 +1,62 @@
+#ifndef READLINE_H
+#define READLINE_H
+
+#define READLINE_CMD_BUF_SIZE 4095
+#define READLINE_MAX_CMDS 64
+#define READLINE_MAX_COMPLETIONS 256
+
+typedef void ReadLinePrintfFunc(void *opaque, const char *fmt, ...);
+typedef void ReadLineFlushFunc(void *opaque);
+typedef void ReadLineFunc(void *opaque, const char *str,
+                          void *readline_opaque);
+typedef void ReadLineCompletionFunc(void *opaque,
+                                    const char *cmdline);
+
+typedef struct ReadLineState {
+    char cmd_buf[READLINE_CMD_BUF_SIZE + 1];
+    int cmd_buf_index;
+    int cmd_buf_size;
+
+    char last_cmd_buf[READLINE_CMD_BUF_SIZE + 1];
+    int last_cmd_buf_index;
+    int last_cmd_buf_size;
+
+    int esc_state;
+    int esc_param;
+
+    char *history[READLINE_MAX_CMDS];
+    int hist_entry;
+
+    ReadLineCompletionFunc *completion_finder;
+    char *completions[READLINE_MAX_COMPLETIONS];
+    int nb_completions;
+    int completion_index;
+
+    ReadLineFunc *readline_func;
+    void *readline_opaque;
+    int read_password;
+    char prompt[256];
+
+    ReadLinePrintfFunc *printf_func;
+    ReadLineFlushFunc *flush_func;
+    void *opaque;
+} ReadLineState;
+
+void readline_add_completion(ReadLineState *rs, const char *str);
+void readline_set_completion_index(ReadLineState *rs, int completion_index);
+
+const char *readline_get_history(ReadLineState *rs, unsigned int index);
+
+void readline_handle_byte(ReadLineState *rs, int ch);
+
+void readline_start(ReadLineState *rs, const char *prompt, int read_password,
+                    ReadLineFunc *readline_func, void *readline_opaque);
+void readline_restart(ReadLineState *rs);
+void readline_show_prompt(ReadLineState *rs);
+
+ReadLineState *readline_init(ReadLinePrintfFunc *printf_func,
+                             ReadLineFlushFunc *flush_func,
+                             void *opaque,
+                             ReadLineCompletionFunc *completion_finder);
+
+#endif /* !READLINE_H */
diff --git a/monitor.c b/monitor.c
index 32d0264..80456fb 100644
--- a/monitor.c
+++ b/monitor.c
@@ -37,7 +37,7 @@
 #include "ui/qemu-spice.h"
 #include "sysemu/sysemu.h"
 #include "monitor/monitor.h"
-#include "monitor/readline.h"
+#include "qemu/readline.h"
 #include "ui/console.h"
 #include "sysemu/blockdev.h"
 #include "audio/audio.h"
diff --git a/readline.c b/readline.c
deleted file mode 100644
index ca894d1..0000000
--- a/readline.c
+++ /dev/null
@@ -1,495 +0,0 @@
-/*
- * QEMU readline utility
- *
- * Copyright (c) 2003-2004 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include "qemu-common.h"
-#include "monitor/readline.h"
-
-#define IS_NORM 0
-#define IS_ESC  1
-#define IS_CSI  2
-#define IS_SS3  3
-
-void readline_show_prompt(ReadLineState *rs)
-{
-    rs->printf_func(rs->opaque, "%s", rs->prompt);
-    rs->flush_func(rs->opaque);
-    rs->last_cmd_buf_index = 0;
-    rs->last_cmd_buf_size = 0;
-    rs->esc_state = IS_NORM;
-}
-
-/* update the displayed command line */
-static void readline_update(ReadLineState *rs)
-{
-    int i, delta, len;
-
-    if (rs->cmd_buf_size != rs->last_cmd_buf_size ||
-        memcmp(rs->cmd_buf, rs->last_cmd_buf, rs->cmd_buf_size) != 0) {
-        for(i = 0; i < rs->last_cmd_buf_index; i++) {
-            rs->printf_func(rs->opaque, "\033[D");
-        }
-        rs->cmd_buf[rs->cmd_buf_size] = '\0';
-        if (rs->read_password) {
-            len = strlen(rs->cmd_buf);
-            for(i = 0; i < len; i++)
-                rs->printf_func(rs->opaque, "*");
-        } else {
-            rs->printf_func(rs->opaque, "%s", rs->cmd_buf);
-        }
-        rs->printf_func(rs->opaque, "\033[K");
-        memcpy(rs->last_cmd_buf, rs->cmd_buf, rs->cmd_buf_size);
-        rs->last_cmd_buf_size = rs->cmd_buf_size;
-        rs->last_cmd_buf_index = rs->cmd_buf_size;
-    }
-    if (rs->cmd_buf_index != rs->last_cmd_buf_index) {
-        delta = rs->cmd_buf_index - rs->last_cmd_buf_index;
-        if (delta > 0) {
-            for(i = 0;i < delta; i++) {
-                rs->printf_func(rs->opaque, "\033[C");
-            }
-        } else {
-            delta = -delta;
-            for(i = 0;i < delta; i++) {
-                rs->printf_func(rs->opaque, "\033[D");
-            }
-        }
-        rs->last_cmd_buf_index = rs->cmd_buf_index;
-    }
-    rs->flush_func(rs->opaque);
-}
-
-static void readline_insert_char(ReadLineState *rs, int ch)
-{
-    if (rs->cmd_buf_index < READLINE_CMD_BUF_SIZE) {
-        memmove(rs->cmd_buf + rs->cmd_buf_index + 1,
-                rs->cmd_buf + rs->cmd_buf_index,
-                rs->cmd_buf_size - rs->cmd_buf_index);
-        rs->cmd_buf[rs->cmd_buf_index] = ch;
-        rs->cmd_buf_size++;
-        rs->cmd_buf_index++;
-    }
-}
-
-static void readline_backward_char(ReadLineState *rs)
-{
-    if (rs->cmd_buf_index > 0) {
-        rs->cmd_buf_index--;
-    }
-}
-
-static void readline_forward_char(ReadLineState *rs)
-{
-    if (rs->cmd_buf_index < rs->cmd_buf_size) {
-        rs->cmd_buf_index++;
-    }
-}
-
-static void readline_delete_char(ReadLineState *rs)
-{
-    if (rs->cmd_buf_index < rs->cmd_buf_size) {
-        memmove(rs->cmd_buf + rs->cmd_buf_index,
-                rs->cmd_buf + rs->cmd_buf_index + 1,
-                rs->cmd_buf_size - rs->cmd_buf_index - 1);
-        rs->cmd_buf_size--;
-    }
-}
-
-static void readline_backspace(ReadLineState *rs)
-{
-    if (rs->cmd_buf_index > 0) {
-        readline_backward_char(rs);
-        readline_delete_char(rs);
-    }
-}
-
-static void readline_backword(ReadLineState *rs)
-{
-    int start;
-
-    if (rs->cmd_buf_index == 0 || rs->cmd_buf_index > rs->cmd_buf_size) {
-        return;
-    }
-
-    start = rs->cmd_buf_index - 1;
-
-    /* find first word (backwards) */
-    while (start > 0) {
-        if (!qemu_isspace(rs->cmd_buf[start])) {
-            break;
-        }
-
-        --start;
-    }
-
-    /* find first space (backwards) */
-    while (start > 0) {
-        if (qemu_isspace(rs->cmd_buf[start])) {
-            ++start;
-            break;
-        }
-
-        --start;
-    }
-
-    /* remove word */
-    if (start < rs->cmd_buf_index) {
-        memmove(rs->cmd_buf + start,
-                rs->cmd_buf + rs->cmd_buf_index,
-                rs->cmd_buf_size - rs->cmd_buf_index);
-        rs->cmd_buf_size -= rs->cmd_buf_index - start;
-        rs->cmd_buf_index = start;
-    }
-}
-
-static void readline_bol(ReadLineState *rs)
-{
-    rs->cmd_buf_index = 0;
-}
-
-static void readline_eol(ReadLineState *rs)
-{
-    rs->cmd_buf_index = rs->cmd_buf_size;
-}
-
-static void readline_up_char(ReadLineState *rs)
-{
-    int idx;
-
-    if (rs->hist_entry == 0)
-	return;
-    if (rs->hist_entry == -1) {
-	/* Find latest entry */
-	for (idx = 0; idx < READLINE_MAX_CMDS; idx++) {
-	    if (rs->history[idx] == NULL)
-		break;
-	}
-	rs->hist_entry = idx;
-    }
-    rs->hist_entry--;
-    if (rs->hist_entry >= 0) {
-	pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf),
-                rs->history[rs->hist_entry]);
-	rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf);
-    }
-}
-
-static void readline_down_char(ReadLineState *rs)
-{
-    if (rs->hist_entry == -1)
-        return;
-    if (rs->hist_entry < READLINE_MAX_CMDS - 1 &&
-        rs->history[++rs->hist_entry] != NULL) {
-	pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf),
-                rs->history[rs->hist_entry]);
-    } else {
-        rs->cmd_buf[0] = 0;
-	rs->hist_entry = -1;
-    }
-    rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf);
-}
-
-static void readline_hist_add(ReadLineState *rs, const char *cmdline)
-{
-    char *hist_entry, *new_entry;
-    int idx;
-
-    if (cmdline[0] == '\0')
-	return;
-    new_entry = NULL;
-    if (rs->hist_entry != -1) {
-	/* We were editing an existing history entry: replace it */
-	hist_entry = rs->history[rs->hist_entry];
-	idx = rs->hist_entry;
-	if (strcmp(hist_entry, cmdline) == 0) {
-	    goto same_entry;
-	}
-    }
-    /* Search cmdline in history buffers */
-    for (idx = 0; idx < READLINE_MAX_CMDS; idx++) {
-	hist_entry = rs->history[idx];
-	if (hist_entry == NULL)
-	    break;
-	if (strcmp(hist_entry, cmdline) == 0) {
-	same_entry:
-	    new_entry = hist_entry;
-	    /* Put this entry at the end of history */
-	    memmove(&rs->history[idx], &rs->history[idx + 1],
-		    (READLINE_MAX_CMDS - (idx + 1)) * sizeof(char *));
-	    rs->history[READLINE_MAX_CMDS - 1] = NULL;
-	    for (; idx < READLINE_MAX_CMDS; idx++) {
-		if (rs->history[idx] == NULL)
-		    break;
-	    }
-	    break;
-	}
-    }
-    if (idx == READLINE_MAX_CMDS) {
-	/* Need to get one free slot */
-        g_free(rs->history[0]);
-	memmove(rs->history, &rs->history[1],
-	        (READLINE_MAX_CMDS - 1) * sizeof(char *));
-	rs->history[READLINE_MAX_CMDS - 1] = NULL;
-	idx = READLINE_MAX_CMDS - 1;
-    }
-    if (new_entry == NULL)
-        new_entry = g_strdup(cmdline);
-    rs->history[idx] = new_entry;
-    rs->hist_entry = -1;
-}
-
-/* completion support */
-
-void readline_add_completion(ReadLineState *rs, const char *str)
-{
-    if (rs->nb_completions < READLINE_MAX_COMPLETIONS) {
-        rs->completions[rs->nb_completions++] = g_strdup(str);
-    }
-}
-
-void readline_set_completion_index(ReadLineState *rs, int index)
-{
-    rs->completion_index = index;
-}
-
-static void readline_completion(ReadLineState *rs)
-{
-    int len, i, j, max_width, nb_cols, max_prefix;
-    char *cmdline;
-
-    rs->nb_completions = 0;
-
-    cmdline = g_malloc(rs->cmd_buf_index + 1);
-    memcpy(cmdline, rs->cmd_buf, rs->cmd_buf_index);
-    cmdline[rs->cmd_buf_index] = '\0';
-    rs->completion_finder(rs->opaque, cmdline);
-    g_free(cmdline);
-
-    /* no completion found */
-    if (rs->nb_completions <= 0)
-        return;
-    if (rs->nb_completions == 1) {
-        len = strlen(rs->completions[0]);
-        for(i = rs->completion_index; i < len; i++) {
-            readline_insert_char(rs, rs->completions[0][i]);
-        }
-        /* extra space for next argument. XXX: make it more generic */
-        if (len > 0 && rs->completions[0][len - 1] != '/')
-            readline_insert_char(rs, ' ');
-    } else {
-        rs->printf_func(rs->opaque, "\n");
-        max_width = 0;
-        max_prefix = 0;	
-        for(i = 0; i < rs->nb_completions; i++) {
-            len = strlen(rs->completions[i]);
-            if (i==0) {
-                max_prefix = len;
-            } else {
-                if (len < max_prefix)
-                    max_prefix = len;
-                for(j=0; j<max_prefix; j++) {
-                    if (rs->completions[i][j] != rs->completions[0][j])
-                        max_prefix = j;
-                }
-            }
-            if (len > max_width)
-                max_width = len;
-        }
-        if (max_prefix > 0) 
-            for(i = rs->completion_index; i < max_prefix; i++) {
-                readline_insert_char(rs, rs->completions[0][i]);
-            }
-        max_width += 2;
-        if (max_width < 10)
-            max_width = 10;
-        else if (max_width > 80)
-            max_width = 80;
-        nb_cols = 80 / max_width;
-        j = 0;
-        for(i = 0; i < rs->nb_completions; i++) {
-            rs->printf_func(rs->opaque, "%-*s", max_width, rs->completions[i]);
-            if (++j == nb_cols || i == (rs->nb_completions - 1)) {
-                rs->printf_func(rs->opaque, "\n");
-                j = 0;
-            }
-        }
-        readline_show_prompt(rs);
-    }
-    for (i = 0; i < rs->nb_completions; i++) {
-        g_free(rs->completions[i]);
-    }
-}
-
-/* return true if command handled */
-void readline_handle_byte(ReadLineState *rs, int ch)
-{
-    switch(rs->esc_state) {
-    case IS_NORM:
-        switch(ch) {
-        case 1:
-            readline_bol(rs);
-            break;
-        case 4:
-            readline_delete_char(rs);
-            break;
-        case 5:
-            readline_eol(rs);
-            break;
-        case 9:
-            readline_completion(rs);
-            break;
-        case 10:
-        case 13:
-            rs->cmd_buf[rs->cmd_buf_size] = '\0';
-            if (!rs->read_password)
-                readline_hist_add(rs, rs->cmd_buf);
-            rs->printf_func(rs->opaque, "\n");
-            rs->cmd_buf_index = 0;
-            rs->cmd_buf_size = 0;
-            rs->last_cmd_buf_index = 0;
-            rs->last_cmd_buf_size = 0;
-            rs->readline_func(rs->opaque, rs->cmd_buf, rs->readline_opaque);
-            break;
-        case 23:
-            /* ^W */
-            readline_backword(rs);
-            break;
-        case 27:
-            rs->esc_state = IS_ESC;
-            break;
-        case 127:
-        case 8:
-            readline_backspace(rs);
-            break;
-	case 155:
-            rs->esc_state = IS_CSI;
-	    break;
-        default:
-            if (ch >= 32) {
-                readline_insert_char(rs, ch);
-            }
-            break;
-        }
-        break;
-    case IS_ESC:
-        if (ch == '[') {
-            rs->esc_state = IS_CSI;
-            rs->esc_param = 0;
-        } else if (ch == 'O') {
-            rs->esc_state = IS_SS3;
-            rs->esc_param = 0;
-        } else {
-            rs->esc_state = IS_NORM;
-        }
-        break;
-    case IS_CSI:
-        switch(ch) {
-	case 'A':
-	case 'F':
-	    readline_up_char(rs);
-	    break;
-	case 'B':
-	case 'E':
-	    readline_down_char(rs);
-	    break;
-        case 'D':
-            readline_backward_char(rs);
-            break;
-        case 'C':
-            readline_forward_char(rs);
-            break;
-        case '0' ... '9':
-            rs->esc_param = rs->esc_param * 10 + (ch - '0');
-            goto the_end;
-        case '~':
-            switch(rs->esc_param) {
-            case 1:
-                readline_bol(rs);
-                break;
-            case 3:
-                readline_delete_char(rs);
-                break;
-            case 4:
-                readline_eol(rs);
-                break;
-            }
-            break;
-        default:
-            break;
-        }
-        rs->esc_state = IS_NORM;
-    the_end:
-        break;
-    case IS_SS3:
-        switch(ch) {
-        case 'F':
-            readline_eol(rs);
-            break;
-        case 'H':
-            readline_bol(rs);
-            break;
-        }
-        rs->esc_state = IS_NORM;
-        break;
-    }
-    readline_update(rs);
-}
-
-void readline_start(ReadLineState *rs, const char *prompt, int read_password,
-                    ReadLineFunc *readline_func, void *opaque)
-{
-    pstrcpy(rs->prompt, sizeof(rs->prompt), prompt);
-    rs->readline_func = readline_func;
-    rs->readline_opaque = opaque;
-    rs->read_password = read_password;
-    readline_restart(rs);
-}
-
-void readline_restart(ReadLineState *rs)
-{
-    rs->cmd_buf_index = 0;
-    rs->cmd_buf_size = 0;
-}
-
-const char *readline_get_history(ReadLineState *rs, unsigned int index)
-{
-    if (index >= READLINE_MAX_CMDS)
-        return NULL;
-    return rs->history[index];
-}
-
-ReadLineState *readline_init(ReadLinePrintfFunc *printf_func,
-                             ReadLineFlushFunc *flush_func,
-                             void *opaque,
-                             ReadLineCompletionFunc *completion_finder)
-{
-    ReadLineState *rs = g_malloc0(sizeof(*rs));
-
-    rs->hist_entry = -1;
-    rs->opaque = opaque;
-    rs->printf_func = printf_func;
-    rs->flush_func = flush_func;
-    rs->completion_finder = completion_finder;
-
-    return rs;
-}
diff --git a/util/Makefile.objs b/util/Makefile.objs
index af3e5cb..937376b 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -13,3 +13,4 @@ util-obj-y += hexdump.o
 util-obj-y += crc32c.o
 util-obj-y += throttle.o
 util-obj-y += getauxval.o
+util-obj-y += readline.o
diff --git a/util/readline.c b/util/readline.c
new file mode 100644
index 0000000..8441be4
--- /dev/null
+++ b/util/readline.c
@@ -0,0 +1,495 @@
+/*
+ * QEMU readline utility
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "qemu/readline.h"
+
+#define IS_NORM 0
+#define IS_ESC  1
+#define IS_CSI  2
+#define IS_SS3  3
+
+void readline_show_prompt(ReadLineState *rs)
+{
+    rs->printf_func(rs->opaque, "%s", rs->prompt);
+    rs->flush_func(rs->opaque);
+    rs->last_cmd_buf_index = 0;
+    rs->last_cmd_buf_size = 0;
+    rs->esc_state = IS_NORM;
+}
+
+/* update the displayed command line */
+static void readline_update(ReadLineState *rs)
+{
+    int i, delta, len;
+
+    if (rs->cmd_buf_size != rs->last_cmd_buf_size ||
+        memcmp(rs->cmd_buf, rs->last_cmd_buf, rs->cmd_buf_size) != 0) {
+        for(i = 0; i < rs->last_cmd_buf_index; i++) {
+            rs->printf_func(rs->opaque, "\033[D");
+        }
+        rs->cmd_buf[rs->cmd_buf_size] = '\0';
+        if (rs->read_password) {
+            len = strlen(rs->cmd_buf);
+            for(i = 0; i < len; i++)
+                rs->printf_func(rs->opaque, "*");
+        } else {
+            rs->printf_func(rs->opaque, "%s", rs->cmd_buf);
+        }
+        rs->printf_func(rs->opaque, "\033[K");
+        memcpy(rs->last_cmd_buf, rs->cmd_buf, rs->cmd_buf_size);
+        rs->last_cmd_buf_size = rs->cmd_buf_size;
+        rs->last_cmd_buf_index = rs->cmd_buf_size;
+    }
+    if (rs->cmd_buf_index != rs->last_cmd_buf_index) {
+        delta = rs->cmd_buf_index - rs->last_cmd_buf_index;
+        if (delta > 0) {
+            for(i = 0;i < delta; i++) {
+                rs->printf_func(rs->opaque, "\033[C");
+            }
+        } else {
+            delta = -delta;
+            for(i = 0;i < delta; i++) {
+                rs->printf_func(rs->opaque, "\033[D");
+            }
+        }
+        rs->last_cmd_buf_index = rs->cmd_buf_index;
+    }
+    rs->flush_func(rs->opaque);
+}
+
+static void readline_insert_char(ReadLineState *rs, int ch)
+{
+    if (rs->cmd_buf_index < READLINE_CMD_BUF_SIZE) {
+        memmove(rs->cmd_buf + rs->cmd_buf_index + 1,
+                rs->cmd_buf + rs->cmd_buf_index,
+                rs->cmd_buf_size - rs->cmd_buf_index);
+        rs->cmd_buf[rs->cmd_buf_index] = ch;
+        rs->cmd_buf_size++;
+        rs->cmd_buf_index++;
+    }
+}
+
+static void readline_backward_char(ReadLineState *rs)
+{
+    if (rs->cmd_buf_index > 0) {
+        rs->cmd_buf_index--;
+    }
+}
+
+static void readline_forward_char(ReadLineState *rs)
+{
+    if (rs->cmd_buf_index < rs->cmd_buf_size) {
+        rs->cmd_buf_index++;
+    }
+}
+
+static void readline_delete_char(ReadLineState *rs)
+{
+    if (rs->cmd_buf_index < rs->cmd_buf_size) {
+        memmove(rs->cmd_buf + rs->cmd_buf_index,
+                rs->cmd_buf + rs->cmd_buf_index + 1,
+                rs->cmd_buf_size - rs->cmd_buf_index - 1);
+        rs->cmd_buf_size--;
+    }
+}
+
+static void readline_backspace(ReadLineState *rs)
+{
+    if (rs->cmd_buf_index > 0) {
+        readline_backward_char(rs);
+        readline_delete_char(rs);
+    }
+}
+
+static void readline_backword(ReadLineState *rs)
+{
+    int start;
+
+    if (rs->cmd_buf_index == 0 || rs->cmd_buf_index > rs->cmd_buf_size) {
+        return;
+    }
+
+    start = rs->cmd_buf_index - 1;
+
+    /* find first word (backwards) */
+    while (start > 0) {
+        if (!qemu_isspace(rs->cmd_buf[start])) {
+            break;
+        }
+
+        --start;
+    }
+
+    /* find first space (backwards) */
+    while (start > 0) {
+        if (qemu_isspace(rs->cmd_buf[start])) {
+            ++start;
+            break;
+        }
+
+        --start;
+    }
+
+    /* remove word */
+    if (start < rs->cmd_buf_index) {
+        memmove(rs->cmd_buf + start,
+                rs->cmd_buf + rs->cmd_buf_index,
+                rs->cmd_buf_size - rs->cmd_buf_index);
+        rs->cmd_buf_size -= rs->cmd_buf_index - start;
+        rs->cmd_buf_index = start;
+    }
+}
+
+static void readline_bol(ReadLineState *rs)
+{
+    rs->cmd_buf_index = 0;
+}
+
+static void readline_eol(ReadLineState *rs)
+{
+    rs->cmd_buf_index = rs->cmd_buf_size;
+}
+
+static void readline_up_char(ReadLineState *rs)
+{
+    int idx;
+
+    if (rs->hist_entry == 0)
+	return;
+    if (rs->hist_entry == -1) {
+	/* Find latest entry */
+	for (idx = 0; idx < READLINE_MAX_CMDS; idx++) {
+	    if (rs->history[idx] == NULL)
+		break;
+	}
+	rs->hist_entry = idx;
+    }
+    rs->hist_entry--;
+    if (rs->hist_entry >= 0) {
+	pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf),
+                rs->history[rs->hist_entry]);
+	rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf);
+    }
+}
+
+static void readline_down_char(ReadLineState *rs)
+{
+    if (rs->hist_entry == -1)
+        return;
+    if (rs->hist_entry < READLINE_MAX_CMDS - 1 &&
+        rs->history[++rs->hist_entry] != NULL) {
+	pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf),
+                rs->history[rs->hist_entry]);
+    } else {
+        rs->cmd_buf[0] = 0;
+	rs->hist_entry = -1;
+    }
+    rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf);
+}
+
+static void readline_hist_add(ReadLineState *rs, const char *cmdline)
+{
+    char *hist_entry, *new_entry;
+    int idx;
+
+    if (cmdline[0] == '\0')
+	return;
+    new_entry = NULL;
+    if (rs->hist_entry != -1) {
+	/* We were editing an existing history entry: replace it */
+	hist_entry = rs->history[rs->hist_entry];
+	idx = rs->hist_entry;
+	if (strcmp(hist_entry, cmdline) == 0) {
+	    goto same_entry;
+	}
+    }
+    /* Search cmdline in history buffers */
+    for (idx = 0; idx < READLINE_MAX_CMDS; idx++) {
+	hist_entry = rs->history[idx];
+	if (hist_entry == NULL)
+	    break;
+	if (strcmp(hist_entry, cmdline) == 0) {
+	same_entry:
+	    new_entry = hist_entry;
+	    /* Put this entry at the end of history */
+	    memmove(&rs->history[idx], &rs->history[idx + 1],
+		    (READLINE_MAX_CMDS - (idx + 1)) * sizeof(char *));
+	    rs->history[READLINE_MAX_CMDS - 1] = NULL;
+	    for (; idx < READLINE_MAX_CMDS; idx++) {
+		if (rs->history[idx] == NULL)
+		    break;
+	    }
+	    break;
+	}
+    }
+    if (idx == READLINE_MAX_CMDS) {
+	/* Need to get one free slot */
+        g_free(rs->history[0]);
+	memmove(rs->history, &rs->history[1],
+	        (READLINE_MAX_CMDS - 1) * sizeof(char *));
+	rs->history[READLINE_MAX_CMDS - 1] = NULL;
+	idx = READLINE_MAX_CMDS - 1;
+    }
+    if (new_entry == NULL)
+        new_entry = g_strdup(cmdline);
+    rs->history[idx] = new_entry;
+    rs->hist_entry = -1;
+}
+
+/* completion support */
+
+void readline_add_completion(ReadLineState *rs, const char *str)
+{
+    if (rs->nb_completions < READLINE_MAX_COMPLETIONS) {
+        rs->completions[rs->nb_completions++] = g_strdup(str);
+    }
+}
+
+void readline_set_completion_index(ReadLineState *rs, int index)
+{
+    rs->completion_index = index;
+}
+
+static void readline_completion(ReadLineState *rs)
+{
+    int len, i, j, max_width, nb_cols, max_prefix;
+    char *cmdline;
+
+    rs->nb_completions = 0;
+
+    cmdline = g_malloc(rs->cmd_buf_index + 1);
+    memcpy(cmdline, rs->cmd_buf, rs->cmd_buf_index);
+    cmdline[rs->cmd_buf_index] = '\0';
+    rs->completion_finder(rs->opaque, cmdline);
+    g_free(cmdline);
+
+    /* no completion found */
+    if (rs->nb_completions <= 0)
+        return;
+    if (rs->nb_completions == 1) {
+        len = strlen(rs->completions[0]);
+        for(i = rs->completion_index; i < len; i++) {
+            readline_insert_char(rs, rs->completions[0][i]);
+        }
+        /* extra space for next argument. XXX: make it more generic */
+        if (len > 0 && rs->completions[0][len - 1] != '/')
+            readline_insert_char(rs, ' ');
+    } else {
+        rs->printf_func(rs->opaque, "\n");
+        max_width = 0;
+        max_prefix = 0;	
+        for(i = 0; i < rs->nb_completions; i++) {
+            len = strlen(rs->completions[i]);
+            if (i==0) {
+                max_prefix = len;
+            } else {
+                if (len < max_prefix)
+                    max_prefix = len;
+                for(j=0; j<max_prefix; j++) {
+                    if (rs->completions[i][j] != rs->completions[0][j])
+                        max_prefix = j;
+                }
+            }
+            if (len > max_width)
+                max_width = len;
+        }
+        if (max_prefix > 0) 
+            for(i = rs->completion_index; i < max_prefix; i++) {
+                readline_insert_char(rs, rs->completions[0][i]);
+            }
+        max_width += 2;
+        if (max_width < 10)
+            max_width = 10;
+        else if (max_width > 80)
+            max_width = 80;
+        nb_cols = 80 / max_width;
+        j = 0;
+        for(i = 0; i < rs->nb_completions; i++) {
+            rs->printf_func(rs->opaque, "%-*s", max_width, rs->completions[i]);
+            if (++j == nb_cols || i == (rs->nb_completions - 1)) {
+                rs->printf_func(rs->opaque, "\n");
+                j = 0;
+            }
+        }
+        readline_show_prompt(rs);
+    }
+    for (i = 0; i < rs->nb_completions; i++) {
+        g_free(rs->completions[i]);
+    }
+}
+
+/* return true if command handled */
+void readline_handle_byte(ReadLineState *rs, int ch)
+{
+    switch(rs->esc_state) {
+    case IS_NORM:
+        switch(ch) {
+        case 1:
+            readline_bol(rs);
+            break;
+        case 4:
+            readline_delete_char(rs);
+            break;
+        case 5:
+            readline_eol(rs);
+            break;
+        case 9:
+            readline_completion(rs);
+            break;
+        case 10:
+        case 13:
+            rs->cmd_buf[rs->cmd_buf_size] = '\0';
+            if (!rs->read_password)
+                readline_hist_add(rs, rs->cmd_buf);
+            rs->printf_func(rs->opaque, "\n");
+            rs->cmd_buf_index = 0;
+            rs->cmd_buf_size = 0;
+            rs->last_cmd_buf_index = 0;
+            rs->last_cmd_buf_size = 0;
+            rs->readline_func(rs->opaque, rs->cmd_buf, rs->readline_opaque);
+            break;
+        case 23:
+            /* ^W */
+            readline_backword(rs);
+            break;
+        case 27:
+            rs->esc_state = IS_ESC;
+            break;
+        case 127:
+        case 8:
+            readline_backspace(rs);
+            break;
+	case 155:
+            rs->esc_state = IS_CSI;
+	    break;
+        default:
+            if (ch >= 32) {
+                readline_insert_char(rs, ch);
+            }
+            break;
+        }
+        break;
+    case IS_ESC:
+        if (ch == '[') {
+            rs->esc_state = IS_CSI;
+            rs->esc_param = 0;
+        } else if (ch == 'O') {
+            rs->esc_state = IS_SS3;
+            rs->esc_param = 0;
+        } else {
+            rs->esc_state = IS_NORM;
+        }
+        break;
+    case IS_CSI:
+        switch(ch) {
+	case 'A':
+	case 'F':
+	    readline_up_char(rs);
+	    break;
+	case 'B':
+	case 'E':
+	    readline_down_char(rs);
+	    break;
+        case 'D':
+            readline_backward_char(rs);
+            break;
+        case 'C':
+            readline_forward_char(rs);
+            break;
+        case '0' ... '9':
+            rs->esc_param = rs->esc_param * 10 + (ch - '0');
+            goto the_end;
+        case '~':
+            switch(rs->esc_param) {
+            case 1:
+                readline_bol(rs);
+                break;
+            case 3:
+                readline_delete_char(rs);
+                break;
+            case 4:
+                readline_eol(rs);
+                break;
+            }
+            break;
+        default:
+            break;
+        }
+        rs->esc_state = IS_NORM;
+    the_end:
+        break;
+    case IS_SS3:
+        switch(ch) {
+        case 'F':
+            readline_eol(rs);
+            break;
+        case 'H':
+            readline_bol(rs);
+            break;
+        }
+        rs->esc_state = IS_NORM;
+        break;
+    }
+    readline_update(rs);
+}
+
+void readline_start(ReadLineState *rs, const char *prompt, int read_password,
+                    ReadLineFunc *readline_func, void *opaque)
+{
+    pstrcpy(rs->prompt, sizeof(rs->prompt), prompt);
+    rs->readline_func = readline_func;
+    rs->readline_opaque = opaque;
+    rs->read_password = read_password;
+    readline_restart(rs);
+}
+
+void readline_restart(ReadLineState *rs)
+{
+    rs->cmd_buf_index = 0;
+    rs->cmd_buf_size = 0;
+}
+
+const char *readline_get_history(ReadLineState *rs, unsigned int index)
+{
+    if (index >= READLINE_MAX_CMDS)
+        return NULL;
+    return rs->history[index];
+}
+
+ReadLineState *readline_init(ReadLinePrintfFunc *printf_func,
+                             ReadLineFlushFunc *flush_func,
+                             void *opaque,
+                             ReadLineCompletionFunc *completion_finder)
+{
+    ReadLineState *rs = g_malloc0(sizeof(*rs));
+
+    rs->hist_entry = -1;
+    rs->opaque = opaque;
+    rs->printf_func = printf_func;
+    rs->flush_func = flush_func;
+    rs->completion_finder = completion_finder;
+
+    return rs;
+}
commit c60bf3391bf4cb79b7adc6650094e21671ddaabd
Author: Stefan Hajnoczi <stefanha at redhat.com>
Date:   Thu Nov 14 11:54:14 2013 +0100

    readline: decouple readline from the monitor
    
    Make the readline.c functionality reusable.  Instead of calling
    monitor_printf() and monitor_flush() directly, invoke function pointers
    provided by the user.
    
    This way readline.c does not know about Monitor and other users will be
    able to make use of readline.c.
    
    Note that there is already an "opaque" argument to the ReadLineFunc
    callback.  Consistently call it "readline_opaque" from now on to
    distinguish from the ReadLinePrintfFunc/ReadLineFlushFunc "opaque"
    argument.
    
    I also dropped the printf macro trickery since it's now highly unlikely
    that anyone modifying readline.c would call printf(3) directly.  We no
    longer need this protection.
    
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/hmp.c b/hmp.c
index 79f9c7d..468f97d 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1092,11 +1092,11 @@ void hmp_eject(Monitor *mon, const QDict *qdict)
     hmp_handle_error(mon, &err);
 }
 
-static void hmp_change_read_arg(Monitor *mon, const char *password,
-                                void *opaque)
+static void hmp_change_read_arg(void *opaque, const char *password,
+                                void *readline_opaque)
 {
     qmp_change_vnc_password(password, NULL);
-    monitor_read_command(mon, 1);
+    monitor_read_command(opaque, 1);
 }
 
 void hmp_change(Monitor *mon, const QDict *qdict)
diff --git a/include/monitor/readline.h b/include/monitor/readline.h
index 0faf6e1..a89fe4a 100644
--- a/include/monitor/readline.h
+++ b/include/monitor/readline.h
@@ -1,14 +1,15 @@
 #ifndef READLINE_H
 #define READLINE_H
 
-#include "qemu-common.h"
-
 #define READLINE_CMD_BUF_SIZE 4095
 #define READLINE_MAX_CMDS 64
 #define READLINE_MAX_COMPLETIONS 256
 
-typedef void ReadLineFunc(Monitor *mon, const char *str, void *opaque);
-typedef void ReadLineCompletionFunc(Monitor *mon,
+typedef void ReadLinePrintfFunc(void *opaque, const char *fmt, ...);
+typedef void ReadLineFlushFunc(void *opaque);
+typedef void ReadLineFunc(void *opaque, const char *str,
+                          void *readline_opaque);
+typedef void ReadLineCompletionFunc(void *opaque,
                                     const char *cmdline);
 
 typedef struct ReadLineState {
@@ -35,7 +36,10 @@ typedef struct ReadLineState {
     void *readline_opaque;
     int read_password;
     char prompt[256];
-    Monitor *mon;
+
+    ReadLinePrintfFunc *printf_func;
+    ReadLineFlushFunc *flush_func;
+    void *opaque;
 } ReadLineState;
 
 void readline_add_completion(ReadLineState *rs, const char *str);
@@ -46,11 +50,13 @@ const char *readline_get_history(ReadLineState *rs, unsigned int index);
 void readline_handle_byte(ReadLineState *rs, int ch);
 
 void readline_start(ReadLineState *rs, const char *prompt, int read_password,
-                    ReadLineFunc *readline_func, void *opaque);
+                    ReadLineFunc *readline_func, void *readline_opaque);
 void readline_restart(ReadLineState *rs);
 void readline_show_prompt(ReadLineState *rs);
 
-ReadLineState *readline_init(Monitor *mon,
+ReadLineState *readline_init(ReadLinePrintfFunc *printf_func,
+                             ReadLineFlushFunc *flush_func,
+                             void *opaque,
                              ReadLineCompletionFunc *completion_finder);
 
 #endif /* !READLINE_H */
diff --git a/monitor.c b/monitor.c
index 845f608..32d0264 100644
--- a/monitor.c
+++ b/monitor.c
@@ -217,8 +217,8 @@ static const mon_cmd_t qmp_cmds[];
 Monitor *cur_mon;
 Monitor *default_mon;
 
-static void monitor_command_cb(Monitor *mon, const char *cmdline,
-                               void *opaque);
+static void monitor_command_cb(void *opaque, const char *cmdline,
+                               void *readline_opaque);
 
 static inline int qmp_cmd_mode(const Monitor *mon)
 {
@@ -4338,9 +4338,10 @@ static void monitor_find_completion_by_table(Monitor *mon,
     }
 }
 
-static void monitor_find_completion(Monitor *mon,
+static void monitor_find_completion(void *opaque,
                                     const char *cmdline)
 {
+    Monitor *mon = opaque;
     char *args[MAX_ARGS];
     int nb_args, len;
 
@@ -4751,8 +4752,11 @@ static void monitor_read(void *opaque, const uint8_t *buf, int size)
     cur_mon = old_mon;
 }
 
-static void monitor_command_cb(Monitor *mon, const char *cmdline, void *opaque)
+static void monitor_command_cb(void *opaque, const char *cmdline,
+                               void *readline_opaque)
 {
+    Monitor *mon = opaque;
+
     monitor_suspend(mon);
     handle_user_command(mon, cmdline);
     monitor_resume(mon);
@@ -4881,6 +4885,22 @@ static void sortcmdlist(void)
  * End:
  */
 
+/* These functions just adapt the readline interface in a typesafe way.  We
+ * could cast function pointers but that discards compiler checks.
+ */
+static void monitor_readline_printf(void *opaque, const char *fmt, ...)
+{
+    va_list ap;
+    va_start(ap, fmt);
+    monitor_vprintf(opaque, fmt, ap);
+    va_end(ap);
+}
+
+static void monitor_readline_flush(void *opaque)
+{
+    monitor_flush(opaque);
+}
+
 void monitor_init(CharDriverState *chr, int flags)
 {
     static int is_first_init = 1;
@@ -4898,7 +4918,10 @@ void monitor_init(CharDriverState *chr, int flags)
     mon->chr = chr;
     mon->flags = flags;
     if (flags & MONITOR_USE_READLINE) {
-        mon->rs = readline_init(mon, monitor_find_completion);
+        mon->rs = readline_init(monitor_readline_printf,
+                                monitor_readline_flush,
+                                mon,
+                                monitor_find_completion);
         monitor_read_command(mon, 0);
     }
 
@@ -4920,9 +4943,11 @@ void monitor_init(CharDriverState *chr, int flags)
         default_mon = mon;
 }
 
-static void bdrv_password_cb(Monitor *mon, const char *password, void *opaque)
+static void bdrv_password_cb(void *opaque, const char *password,
+                             void *readline_opaque)
 {
-    BlockDriverState *bs = opaque;
+    Monitor *mon = opaque;
+    BlockDriverState *bs = readline_opaque;
     int ret = 0;
 
     if (bdrv_set_key(bs, password) != 0) {
diff --git a/readline.c b/readline.c
index abf27dd..ca894d1 100644
--- a/readline.c
+++ b/readline.c
@@ -21,21 +21,19 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
+
+#include "qemu-common.h"
 #include "monitor/readline.h"
-#include "monitor/monitor.h"
 
 #define IS_NORM 0
 #define IS_ESC  1
 #define IS_CSI  2
 #define IS_SS3  3
 
-#undef printf
-#define printf do_not_use_printf
-
 void readline_show_prompt(ReadLineState *rs)
 {
-    monitor_printf(rs->mon, "%s", rs->prompt);
-    monitor_flush(rs->mon);
+    rs->printf_func(rs->opaque, "%s", rs->prompt);
+    rs->flush_func(rs->opaque);
     rs->last_cmd_buf_index = 0;
     rs->last_cmd_buf_size = 0;
     rs->esc_state = IS_NORM;
@@ -49,17 +47,17 @@ static void readline_update(ReadLineState *rs)
     if (rs->cmd_buf_size != rs->last_cmd_buf_size ||
         memcmp(rs->cmd_buf, rs->last_cmd_buf, rs->cmd_buf_size) != 0) {
         for(i = 0; i < rs->last_cmd_buf_index; i++) {
-            monitor_printf(rs->mon, "\033[D");
+            rs->printf_func(rs->opaque, "\033[D");
         }
         rs->cmd_buf[rs->cmd_buf_size] = '\0';
         if (rs->read_password) {
             len = strlen(rs->cmd_buf);
             for(i = 0; i < len; i++)
-                monitor_printf(rs->mon, "*");
+                rs->printf_func(rs->opaque, "*");
         } else {
-            monitor_printf(rs->mon, "%s", rs->cmd_buf);
+            rs->printf_func(rs->opaque, "%s", rs->cmd_buf);
         }
-        monitor_printf(rs->mon, "\033[K");
+        rs->printf_func(rs->opaque, "\033[K");
         memcpy(rs->last_cmd_buf, rs->cmd_buf, rs->cmd_buf_size);
         rs->last_cmd_buf_size = rs->cmd_buf_size;
         rs->last_cmd_buf_index = rs->cmd_buf_size;
@@ -68,17 +66,17 @@ static void readline_update(ReadLineState *rs)
         delta = rs->cmd_buf_index - rs->last_cmd_buf_index;
         if (delta > 0) {
             for(i = 0;i < delta; i++) {
-                monitor_printf(rs->mon, "\033[C");
+                rs->printf_func(rs->opaque, "\033[C");
             }
         } else {
             delta = -delta;
             for(i = 0;i < delta; i++) {
-                monitor_printf(rs->mon, "\033[D");
+                rs->printf_func(rs->opaque, "\033[D");
             }
         }
         rs->last_cmd_buf_index = rs->cmd_buf_index;
     }
-    monitor_flush(rs->mon);
+    rs->flush_func(rs->opaque);
 }
 
 static void readline_insert_char(ReadLineState *rs, int ch)
@@ -284,7 +282,7 @@ static void readline_completion(ReadLineState *rs)
     cmdline = g_malloc(rs->cmd_buf_index + 1);
     memcpy(cmdline, rs->cmd_buf, rs->cmd_buf_index);
     cmdline[rs->cmd_buf_index] = '\0';
-    rs->completion_finder(rs->mon, cmdline);
+    rs->completion_finder(rs->opaque, cmdline);
     g_free(cmdline);
 
     /* no completion found */
@@ -299,7 +297,7 @@ static void readline_completion(ReadLineState *rs)
         if (len > 0 && rs->completions[0][len - 1] != '/')
             readline_insert_char(rs, ' ');
     } else {
-        monitor_printf(rs->mon, "\n");
+        rs->printf_func(rs->opaque, "\n");
         max_width = 0;
         max_prefix = 0;	
         for(i = 0; i < rs->nb_completions; i++) {
@@ -329,9 +327,9 @@ static void readline_completion(ReadLineState *rs)
         nb_cols = 80 / max_width;
         j = 0;
         for(i = 0; i < rs->nb_completions; i++) {
-            monitor_printf(rs->mon, "%-*s", max_width, rs->completions[i]);
+            rs->printf_func(rs->opaque, "%-*s", max_width, rs->completions[i]);
             if (++j == nb_cols || i == (rs->nb_completions - 1)) {
-                monitor_printf(rs->mon, "\n");
+                rs->printf_func(rs->opaque, "\n");
                 j = 0;
             }
         }
@@ -365,12 +363,12 @@ void readline_handle_byte(ReadLineState *rs, int ch)
             rs->cmd_buf[rs->cmd_buf_size] = '\0';
             if (!rs->read_password)
                 readline_hist_add(rs, rs->cmd_buf);
-            monitor_printf(rs->mon, "\n");
+            rs->printf_func(rs->opaque, "\n");
             rs->cmd_buf_index = 0;
             rs->cmd_buf_size = 0;
             rs->last_cmd_buf_index = 0;
             rs->last_cmd_buf_size = 0;
-            rs->readline_func(rs->mon, rs->cmd_buf, rs->readline_opaque);
+            rs->readline_func(rs->opaque, rs->cmd_buf, rs->readline_opaque);
             break;
         case 23:
             /* ^W */
@@ -480,13 +478,17 @@ const char *readline_get_history(ReadLineState *rs, unsigned int index)
     return rs->history[index];
 }
 
-ReadLineState *readline_init(Monitor *mon,
+ReadLineState *readline_init(ReadLinePrintfFunc *printf_func,
+                             ReadLineFlushFunc *flush_func,
+                             void *opaque,
                              ReadLineCompletionFunc *completion_finder)
 {
     ReadLineState *rs = g_malloc0(sizeof(*rs));
 
     rs->hist_entry = -1;
-    rs->mon = mon;
+    rs->opaque = opaque;
+    rs->printf_func = printf_func;
+    rs->flush_func = flush_func;
     rs->completion_finder = completion_finder;
 
     return rs;
commit 585ea0c841df47c1542d33e17c5c6d532316ef74
Author: Fam Zheng <famz at redhat.com>
Date:   Wed Jan 8 09:42:07 2014 +0800

    vmdk: Fix big flat extent IO
    
    Local variable "n" as int64_t avoids overflow with large sector number
    calculation. See test case change for failure case.
    
    Signed-off-by: Fam Zheng <famz at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/vmdk.c b/block/vmdk.c
index c6b60b4..22b99b0 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -1325,8 +1325,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
 {
     BDRVVmdkState *s = bs->opaque;
     VmdkExtent *extent = NULL;
-    int n, ret;
-    int64_t index_in_cluster;
+    int ret;
+    int64_t index_in_cluster, n;
     uint64_t extent_begin_sector, extent_relative_sector_num;
     uint64_t cluster_offset;
     VmdkMetaData m_data;
diff --git a/tests/qemu-iotests/059 b/tests/qemu-iotests/059
index d8215ae..64ed04c 100755
--- a/tests/qemu-iotests/059
+++ b/tests/qemu-iotests/059
@@ -102,6 +102,13 @@ echo "=== Testing version 3 ==="
 _use_sample_img iotest-version3.vmdk.bz2
 _img_info
 
+echo
+echo "=== Testing 4TB monolithicFlat creation and IO ==="
+IMGOPTS="subformat=monolithicFlat" _make_test_img 4T
+_img_info
+$QEMU_IO -c "write -P 0xa 900G 512" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -v 900G 1024" "$TEST_IMG" | _filter_qemu_io
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out
index 16ab7c6..5e30e69 100644
--- a/tests/qemu-iotests/059.out
+++ b/tests/qemu-iotests/059.out
@@ -2047,4 +2047,78 @@ RW 12582912 VMFS "dummy.IMGFMT" 1
 image: TEST_DIR/iotest-version3.IMGFMT
 file format: IMGFMT
 virtual size: 1.0G (1073741824 bytes)
+
+=== Testing 4TB monolithicFlat creation and IO ===
+Formatting 'TEST_DIR/iotest-version3.IMGFMT', fmt=IMGFMT size=4398046511104
+image: TEST_DIR/iotest-version3.IMGFMT
+file format: IMGFMT
+virtual size: 4.0T (4398046511104 bytes)
+wrote 512/512 bytes at offset 966367641600
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+e100000000:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000010:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000020:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000030:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000040:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000050:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000060:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000070:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000080:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000090:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000000a0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000000b0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000000c0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000000d0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000000e0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000000f0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000100:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000110:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000120:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000130:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000140:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000150:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000160:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000170:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000180:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000190:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000001a0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000001b0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000001c0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000001d0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000001e0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e1000001f0:  0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a 0a  ................
+e100000200:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000210:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000220:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000230:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000240:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000250:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000260:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000270:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000280:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000290:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000002a0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000002b0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000002c0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000002d0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000002e0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000002f0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000300:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000310:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000320:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000330:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000340:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000350:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000360:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000370:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000380:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e100000390:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000003a0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000003b0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000003c0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000003d0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000003e0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+e1000003f0:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+read 1024/1024 bytes at offset 966367641600
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 *** done
commit 7fa9e1f941b4be1f71bb42de2f2ed8805d7e7326
Author: Stefan Hajnoczi <stefanha at redhat.com>
Date:   Mon Jan 6 12:39:01 2014 +0800

    docs: qcow2 compat=1.1 is now the default
    
    Commit 9117b47717ad208b12786ce88eacb013f9b3dd1c ("qcow2: Change default
    for new images to compat=1.1") changed the default qcow2 image format
    version but forgot to update qemu-doc.texi and qemu-img.texi.
    
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Reviewed-by: Eric Blake <eblake at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/qemu-doc.texi b/qemu-doc.texi
index 4e9c6e9..ce61f30 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -536,11 +536,11 @@ support of multiple VM snapshots.
 Supported options:
 @table @code
 @item compat
-Determines the qcow2 version to use. @code{compat=0.10} uses the traditional
-image format that can be read by any QEMU since 0.10 (this is the default).
+Determines the qcow2 version to use. @code{compat=0.10} uses the
+traditional image format that can be read by any QEMU since 0.10.
 @code{compat=1.1} enables image format extensions that only QEMU 1.1 and
-newer understand. Amongst others, this includes zero clusters, which allow
-efficient copy-on-read for sparse images.
+newer understand (this is the default). Amongst others, this includes
+zero clusters, which allow efficient copy-on-read for sparse images.
 
 @item backing_file
 File name of a base image (see @option{create} subcommand)
diff --git a/qemu-img.texi b/qemu-img.texi
index 1bba91e..778e967 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -391,11 +391,11 @@ support of multiple VM snapshots.
 Supported options:
 @table @code
 @item compat
-Determines the qcow2 version to use. @code{compat=0.10} uses the traditional
-image format that can be read by any QEMU since 0.10 (this is the default).
+Determines the qcow2 version to use. @code{compat=0.10} uses the
+traditional image format that can be read by any QEMU since 0.10.
 @code{compat=1.1} enables image format extensions that only QEMU 1.1 and
-newer understand. Amongst others, this includes zero clusters, which allow
-efficient copy-on-read for sparse images.
+newer understand (this is the default). Amongst others, this includes zero
+clusters, which allow efficient copy-on-read for sparse images.
 
 @item backing_file
 File name of a base image (see @option{create} subcommand)
commit b7fcff01790d25f48d81ef6c8c3399577096a555
Author: Kewei Yu <keweihk at gmail.com>
Date:   Mon Jan 6 14:05:24 2014 +0800

    qtest: Fix the bug about disable vnc causes "make check" fail
    
    When we disable vnc from "./configure", QEMU can't use the vnc option.
    So qtest can't use the "vnc -none ", otherwise "make check" fails.
    If QEMU uses "-display none", "-vnc none" is excrescent, So we just need to drop it.
    
    Signed-off-by: Kewei Yu <keweihk at gmail.com>
    Reviewed-by: Paolo Bonzini <pbonzini at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/tests/fdc-test.c b/tests/fdc-test.c
index 38b5b17..37096dc 100644
--- a/tests/fdc-test.c
+++ b/tests/fdc-test.c
@@ -518,7 +518,6 @@ static void fuzz_registers(void)
 int main(int argc, char **argv)
 {
     const char *arch = qtest_get_arch();
-    char *cmdline;
     int fd;
     int ret;
 
@@ -538,9 +537,7 @@ int main(int argc, char **argv)
     /* Run the tests */
     g_test_init(&argc, &argv, NULL);
 
-    cmdline = g_strdup_printf("-vnc none ");
-
-    qtest_start(cmdline);
+    qtest_start(NULL);
     qtest_irq_intercept_in(global_qtest, "ioapic");
     qtest_add_func("/fdc/cmos", test_cmos);
     qtest_add_func("/fdc/no_media_on_start", test_no_media_on_start);
diff --git a/tests/ide-test.c b/tests/ide-test.c
index d5cec5a..4a0d97f 100644
--- a/tests/ide-test.c
+++ b/tests/ide-test.c
@@ -380,7 +380,6 @@ static void test_bmdma_no_busmaster(void)
 static void test_bmdma_setup(void)
 {
     ide_test_start(
-        "-vnc none "
         "-drive file=%s,if=ide,serial=%s,cache=writeback "
         "-global ide-hd.ver=%s",
         tmp_path, "testdisk", "version");
@@ -410,7 +409,6 @@ static void test_identify(void)
     int ret;
 
     ide_test_start(
-        "-vnc none "
         "-drive file=%s,if=ide,serial=%s,cache=writeback "
         "-global ide-hd.ver=%s",
         tmp_path, "testdisk", "version");
@@ -455,7 +453,6 @@ static void test_flush(void)
     uint8_t data;
 
     ide_test_start(
-        "-vnc none "
         "-drive file=blkdebug::%s,if=ide,cache=writeback",
         tmp_path);
 
commit 9f23fce7b2e78b917f03ccd366e3e151c0a1a419
Author: Liu Yuan <namei.unix at gmail.com>
Date:   Fri Jan 3 20:13:12 2014 +0800

    sheepdog: fix clone operation by 'qemu-img create -b'
    
    We should pass base_inode->vdi_id to base_vdi_id of SheepdogVdiReq so that sheep
    can create a clone instead a fresh volume.
    
    This fixes following command:
    
    qemu-create -b sheepdog:base sheepdog:clone
    
    so users can boot sheepdog:clone as a normal volume.
    
    Cc: qemu-devel at nongnu.org
    Cc: Kevin Wolf <kwolf at redhat.com>
    Cc: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Liu Yuan <namei.unix at gmail.com>
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/sheepdog.c b/block/sheepdog.c
index b94ab6e..6088fa5 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -161,7 +161,7 @@ typedef struct SheepdogVdiReq {
     uint32_t id;
     uint32_t data_length;
     uint64_t vdi_size;
-    uint32_t vdi_id;
+    uint32_t base_vdi_id;
     uint8_t copies;
     uint8_t copy_policy;
     uint8_t reserved[2];
@@ -1493,7 +1493,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot)
 
     memset(&hdr, 0, sizeof(hdr));
     hdr.opcode = SD_OP_NEW_VDI;
-    hdr.vdi_id = s->inode.vdi_id;
+    hdr.base_vdi_id = s->inode.vdi_id;
 
     wlen = SD_MAX_VDI_LEN;
 
@@ -1684,7 +1684,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options,
 
     if (backing_file) {
         BlockDriverState *bs;
-        BDRVSheepdogState *s;
+        BDRVSheepdogState *base;
         BlockDriver *drv;
 
         /* Currently, only Sheepdog backing image is supported. */
@@ -1702,15 +1702,15 @@ static int sd_create(const char *filename, QEMUOptionParameter *options,
             goto out;
         }
 
-        s = bs->opaque;
+        base = bs->opaque;
 
-        if (!is_snapshot(&s->inode)) {
+        if (!is_snapshot(&base->inode)) {
             error_report("cannot clone from a non snapshot vdi");
             bdrv_unref(bs);
             ret = -EINVAL;
             goto out;
         }
-
+        s->inode.vdi_id = base->inode.vdi_id;
         bdrv_unref(bs);
     }
 
@@ -1743,7 +1743,7 @@ static void sd_close(BlockDriverState *bs)
     memset(&hdr, 0, sizeof(hdr));
 
     hdr.opcode = SD_OP_RELEASE_VDI;
-    hdr.vdi_id = s->inode.vdi_id;
+    hdr.base_vdi_id = s->inode.vdi_id;
     wlen = strlen(s->name) + 1;
     hdr.data_length = wlen;
     hdr.flags = SD_FLAG_CMD_WRITE;
@@ -1846,7 +1846,7 @@ static bool sd_delete(BDRVSheepdogState *s)
     unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
     SheepdogVdiReq hdr = {
         .opcode = SD_OP_DEL_VDI,
-        .vdi_id = s->inode.vdi_id,
+        .base_vdi_id = s->inode.vdi_id,
         .data_length = wlen,
         .flags = SD_FLAG_CMD_WRITE,
     };
commit cf7f616b9d846b1cc21c7b692b5c9ff6f757a5e7
Author: Bharata B Rao <bharata at linux.vnet.ibm.com>
Date:   Sat Dec 21 14:51:26 2013 +0530

    gluster: Add support for creating zero-filled image
    
    GlusterFS supports creation of zero-filled file on GlusterFS volume
    by means of an API called glfs_zerofill(). Use this API from QEMU to
    create an image that is filled with zeroes by using the preallocation
    option of qemu-img.
    
    qemu-img create gluster://server/volume/image -o preallocation=full 10G
    
    The allowed values for preallocation are 'full' and 'off'. By default
    preallocation is off and image is not zero-filled.
    
    glfs_zerofill() offloads the writing of zeroes to the server and if
    the storage supports SCSI WRITESAME, GlusterFS server can issue
    BLKZEROOUT ioctl to achieve the zeroing.
    
    Signed-off-by: Bharata B Rao <bharata at linux.vnet.ibm.com>
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/gluster.c b/block/gluster.c
index c11f60c..a009b15 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -354,6 +354,29 @@ out:
     g_slice_free(GlusterAIOCB, acb);
     return ret;
 }
+
+static inline bool gluster_supports_zerofill(void)
+{
+    return 1;
+}
+
+static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
+        int64_t size)
+{
+    return glfs_zerofill(fd, offset, size);
+}
+
+#else
+static inline bool gluster_supports_zerofill(void)
+{
+    return 0;
+}
+
+static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
+        int64_t size)
+{
+    return 0;
+}
 #endif
 
 static int qemu_gluster_create(const char *filename,
@@ -362,6 +385,7 @@ static int qemu_gluster_create(const char *filename,
     struct glfs *glfs;
     struct glfs_fd *fd;
     int ret = 0;
+    int prealloc = 0;
     int64_t total_size = 0;
     GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
 
@@ -374,6 +398,19 @@ static int qemu_gluster_create(const char *filename,
     while (options && options->name) {
         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
             total_size = options->value.n / BDRV_SECTOR_SIZE;
+        } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
+            if (!options->value.s || !strcmp(options->value.s, "off")) {
+                prealloc = 0;
+            } else if (!strcmp(options->value.s, "full") &&
+                    gluster_supports_zerofill()) {
+                prealloc = 1;
+            } else {
+                error_setg(errp, "Invalid preallocation mode: '%s'"
+                    " or GlusterFS doesn't support zerofill API",
+                           options->value.s);
+                ret = -EINVAL;
+                goto out;
+            }
         }
         options++;
     }
@@ -383,9 +420,15 @@ static int qemu_gluster_create(const char *filename,
     if (!fd) {
         ret = -errno;
     } else {
-        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
+        if (!glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE)) {
+            if (prealloc && qemu_gluster_zerofill(fd, 0,
+                    total_size * BDRV_SECTOR_SIZE)) {
+                ret = -errno;
+            }
+        } else {
             ret = -errno;
         }
+
         if (glfs_close(fd) != 0) {
             ret = -errno;
         }
@@ -560,6 +603,11 @@ static QEMUOptionParameter qemu_gluster_create_options[] = {
         .type = OPT_SIZE,
         .help = "Virtual disk size"
     },
+    {
+        .name = BLOCK_OPT_PREALLOC,
+        .type = OPT_STRING,
+        .help = "Preallocation mode (allowed values: off, full)"
+    },
     { NULL }
 };
 
commit 7c815372f3b37754b2a568e82f0521c7f77a6f66
Author: Bharata B Rao <bharata at linux.vnet.ibm.com>
Date:   Sat Dec 21 14:51:25 2013 +0530

    gluster: Implement .bdrv_co_write_zeroes for gluster
    
    Support .bdrv_co_write_zeroes() from gluster driver by using GlusterFS API
    glfs_zerofill() that off-loads the writing of zeroes to GlusterFS server.
    
    Signed-off-by: Bharata B Rao <bharata at linux.vnet.ibm.com>
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/gluster.c b/block/gluster.c
index f9aea0e..c11f60c 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -236,6 +236,25 @@ static void qemu_gluster_complete_aio(void *opaque)
     qemu_coroutine_enter(acb->coroutine, NULL);
 }
 
+/*
+ * AIO callback routine called from GlusterFS thread.
+ */
+static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
+{
+    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
+
+    if (!ret || ret == acb->size) {
+        acb->ret = 0; /* Success */
+    } else if (ret < 0) {
+        acb->ret = ret; /* Read/Write failed */
+    } else {
+        acb->ret = -EIO; /* Partial read/write - fail it */
+    }
+
+    acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb);
+    qemu_bh_schedule(acb->bh);
+}
+
 /* TODO Convert to fine grained options */
 static QemuOptsList runtime_opts = {
     .name = "gluster",
@@ -308,6 +327,35 @@ out:
     return ret;
 }
 
+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+{
+    int ret;
+    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    BDRVGlusterState *s = bs->opaque;
+    off_t size = nb_sectors * BDRV_SECTOR_SIZE;
+    off_t offset = sector_num * BDRV_SECTOR_SIZE;
+
+    acb->size = size;
+    acb->ret = 0;
+    acb->coroutine = qemu_coroutine_self();
+
+    ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
+    if (ret < 0) {
+        ret = -errno;
+        goto out;
+    }
+
+    qemu_coroutine_yield();
+    ret = acb->ret;
+
+out:
+    g_slice_free(GlusterAIOCB, acb);
+    return ret;
+}
+#endif
+
 static int qemu_gluster_create(const char *filename,
         QEMUOptionParameter *options, Error **errp)
 {
@@ -350,25 +398,6 @@ out:
     return ret;
 }
 
-/*
- * AIO callback routine called from GlusterFS thread.
- */
-static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
-{
-    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
-
-    if (!ret || ret == acb->size) {
-        acb->ret = 0; /* Success */
-    } else if (ret < 0) {
-        acb->ret = ret; /* Read/Write failed */
-    } else {
-        acb->ret = -EIO; /* Partial read/write - fail it */
-    }
-
-    acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb);
-    qemu_bh_schedule(acb->bh);
-}
-
 static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
 {
@@ -552,6 +581,9 @@ static BlockDriver bdrv_gluster = {
 #ifdef CONFIG_GLUSTERFS_DISCARD
     .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
@@ -573,6 +605,9 @@ static BlockDriver bdrv_gluster_tcp = {
 #ifdef CONFIG_GLUSTERFS_DISCARD
     .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
@@ -594,6 +629,9 @@ static BlockDriver bdrv_gluster_unix = {
 #ifdef CONFIG_GLUSTERFS_DISCARD
     .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
@@ -615,6 +653,9 @@ static BlockDriver bdrv_gluster_rdma = {
 #ifdef CONFIG_GLUSTERFS_DISCARD
     .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
+#ifdef CONFIG_GLUSTERFS_ZEROFILL
+    .bdrv_co_write_zeroes         = qemu_gluster_co_write_zeroes,
+#endif
     .create_options               = qemu_gluster_create_options,
 };
 
diff --git a/configure b/configure
index 3782a6a..b472694 100755
--- a/configure
+++ b/configure
@@ -256,6 +256,7 @@ coroutine_pool=""
 seccomp=""
 glusterfs=""
 glusterfs_discard="no"
+glusterfs_zerofill="no"
 virtio_blk_data_plane=""
 gtk=""
 gtkabi="2.0"
@@ -2701,6 +2702,9 @@ if test "$glusterfs" != "no" ; then
     if $pkg_config --atleast-version=5 glusterfs-api; then
       glusterfs_discard="yes"
     fi
+    if $pkg_config --atleast-version=6 glusterfs-api; then
+      glusterfs_zerofill="yes"
+    fi
   else
     if test "$glusterfs" = "yes" ; then
       feature_not_found "GlusterFS backend support"
@@ -4229,6 +4233,10 @@ if test "$glusterfs_discard" = "yes" ; then
   echo "CONFIG_GLUSTERFS_DISCARD=y" >> $config_host_mak
 fi
 
+if test "$glusterfs_zerofill" = "yes" ; then
+  echo "CONFIG_GLUSTERFS_ZEROFILL=y" >> $config_host_mak
+fi
+
 if test "$libssh2" = "yes" ; then
   echo "CONFIG_LIBSSH2=y" >> $config_host_mak
 fi
commit 15744b0b8f63d624bdd5825011cd201541a62094
Author: Bharata B Rao <bharata at linux.vnet.ibm.com>
Date:   Sat Dec 21 14:51:24 2013 +0530

    gluster: Convert aio routines into coroutines
    
    Convert the read, write, flush and discard implementations from aio-based
    ones to coroutine based ones.
    
    Signed-off-by: Bharata B Rao <bharata at linux.vnet.ibm.com>
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/gluster.c b/block/gluster.c
index 563d497..f9aea0e 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -21,19 +21,15 @@
 #include "qemu/uri.h"
 
 typedef struct GlusterAIOCB {
-    BlockDriverAIOCB common;
     int64_t size;
     int ret;
-    bool *finished;
     QEMUBH *bh;
+    Coroutine *coroutine;
 } GlusterAIOCB;
 
 typedef struct BDRVGlusterState {
     struct glfs *glfs;
-    int fds[2];
     struct glfs_fd *fd;
-    int event_reader_pos;
-    GlusterAIOCB *event_acb;
 } BDRVGlusterState;
 
 #define GLUSTER_FD_READ  0
@@ -231,46 +227,13 @@ out:
     return NULL;
 }
 
-static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s)
+static void qemu_gluster_complete_aio(void *opaque)
 {
-    int ret;
-    bool *finished = acb->finished;
-    BlockDriverCompletionFunc *cb = acb->common.cb;
-    void *opaque = acb->common.opaque;
-
-    if (!acb->ret || acb->ret == acb->size) {
-        ret = 0; /* Success */
-    } else if (acb->ret < 0) {
-        ret = acb->ret; /* Read/Write failed */
-    } else {
-        ret = -EIO; /* Partial read/write - fail it */
-    }
+    GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
 
-    qemu_aio_release(acb);
-    cb(opaque, ret);
-    if (finished) {
-        *finished = true;
-    }
-}
-
-static void qemu_gluster_aio_event_reader(void *opaque)
-{
-    BDRVGlusterState *s = opaque;
-    ssize_t ret;
-
-    do {
-        char *p = (char *)&s->event_acb;
-
-        ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos,
-                   sizeof(s->event_acb) - s->event_reader_pos);
-        if (ret > 0) {
-            s->event_reader_pos += ret;
-            if (s->event_reader_pos == sizeof(s->event_acb)) {
-                s->event_reader_pos = 0;
-                qemu_gluster_complete_aio(s->event_acb, s);
-            }
-        }
-    } while (ret < 0 && errno == EINTR);
+    qemu_bh_delete(acb->bh);
+    acb->bh = NULL;
+    qemu_coroutine_enter(acb->coroutine, NULL);
 }
 
 /* TODO Convert to fine grained options */
@@ -309,7 +272,6 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
 
     filename = qemu_opt_get(opts, "filename");
 
-
     s->glfs = qemu_gluster_init(gconf, filename);
     if (!s->glfs) {
         ret = -errno;
@@ -329,17 +291,7 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
     s->fd = glfs_open(s->glfs, gconf->image, open_flags);
     if (!s->fd) {
         ret = -errno;
-        goto out;
-    }
-
-    ret = qemu_pipe(s->fds);
-    if (ret < 0) {
-        ret = -errno;
-        goto out;
     }
-    fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK);
-    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
-        qemu_gluster_aio_event_reader, NULL, s);
 
 out:
     qemu_opts_del(opts);
@@ -398,58 +350,37 @@ out:
     return ret;
 }
 
-static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
-    bool finished = false;
-
-    acb->finished = &finished;
-    while (!finished) {
-        qemu_aio_wait();
-    }
-}
-
-static const AIOCBInfo gluster_aiocb_info = {
-    .aiocb_size = sizeof(GlusterAIOCB),
-    .cancel = qemu_gluster_aio_cancel,
-};
-
+/*
+ * AIO callback routine called from GlusterFS thread.
+ */
 static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
 {
     GlusterAIOCB *acb = (GlusterAIOCB *)arg;
-    BlockDriverState *bs = acb->common.bs;
-    BDRVGlusterState *s = bs->opaque;
-    int retval;
-
-    acb->ret = ret;
-    retval = qemu_write_full(s->fds[GLUSTER_FD_WRITE], &acb, sizeof(acb));
-    if (retval != sizeof(acb)) {
-        /*
-         * Gluster AIO callback thread failed to notify the waiting
-         * QEMU thread about IO completion.
-         */
-        error_report("Gluster AIO completion failed: %s", strerror(errno));
-        abort();
+
+    if (!ret || ret == acb->size) {
+        acb->ret = 0; /* Success */
+    } else if (ret < 0) {
+        acb->ret = ret; /* Read/Write failed */
+    } else {
+        acb->ret = -EIO; /* Partial read/write - fail it */
     }
+
+    acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb);
+    qemu_bh_schedule(acb->bh);
 }
 
-static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int write)
+static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
 {
     int ret;
-    GlusterAIOCB *acb;
+    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
     BDRVGlusterState *s = bs->opaque;
-    size_t size;
-    off_t offset;
+    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
+    off_t offset = sector_num * BDRV_SECTOR_SIZE;
 
-    offset = sector_num * BDRV_SECTOR_SIZE;
-    size = nb_sectors * BDRV_SECTOR_SIZE;
-
-    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
     acb->size = size;
     acb->ret = 0;
-    acb->finished = NULL;
+    acb->coroutine = qemu_coroutine_self();
 
     if (write) {
         ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
@@ -460,13 +391,16 @@ static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
     }
 
     if (ret < 0) {
+        ret = -errno;
         goto out;
     }
-    return &acb->common;
+
+    qemu_coroutine_yield();
+    ret = acb->ret;
 
 out:
-    qemu_aio_release(acb);
-    return NULL;
+    g_slice_free(GlusterAIOCB, acb);
+    return ret;
 }
 
 static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
@@ -482,71 +416,68 @@ static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
     return 0;
 }
 
-static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
-    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+    return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0);
 }
 
-static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
-    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+    return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
 }
 
-static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
 {
     int ret;
-    GlusterAIOCB *acb;
+    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
     BDRVGlusterState *s = bs->opaque;
 
-    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
     acb->size = 0;
     acb->ret = 0;
-    acb->finished = NULL;
+    acb->coroutine = qemu_coroutine_self();
 
     ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
     if (ret < 0) {
+        ret = -errno;
         goto out;
     }
-    return &acb->common;
+
+    qemu_coroutine_yield();
+    ret = acb->ret;
 
 out:
-    qemu_aio_release(acb);
-    return NULL;
+    g_slice_free(GlusterAIOCB, acb);
+    return ret;
 }
 
 #ifdef CONFIG_GLUSTERFS_DISCARD
-static BlockDriverAIOCB *qemu_gluster_aio_discard(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb,
-        void *opaque)
+static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors)
 {
     int ret;
-    GlusterAIOCB *acb;
+    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
     BDRVGlusterState *s = bs->opaque;
-    size_t size;
-    off_t offset;
-
-    offset = sector_num * BDRV_SECTOR_SIZE;
-    size = nb_sectors * BDRV_SECTOR_SIZE;
+    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
+    off_t offset = sector_num * BDRV_SECTOR_SIZE;
 
-    acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque);
     acb->size = 0;
     acb->ret = 0;
-    acb->finished = NULL;
+    acb->coroutine = qemu_coroutine_self();
 
     ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
     if (ret < 0) {
+        ret = -errno;
         goto out;
     }
-    return &acb->common;
+
+    qemu_coroutine_yield();
+    ret = acb->ret;
 
 out:
-    qemu_aio_release(acb);
-    return NULL;
+    g_slice_free(GlusterAIOCB, acb);
+    return ret;
 }
 #endif
 
@@ -581,10 +512,6 @@ static void qemu_gluster_close(BlockDriverState *bs)
 {
     BDRVGlusterState *s = bs->opaque;
 
-    close(s->fds[GLUSTER_FD_READ]);
-    close(s->fds[GLUSTER_FD_WRITE]);
-    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL);
-
     if (s->fd) {
         glfs_close(s->fd);
         s->fd = NULL;
@@ -618,12 +545,12 @@ static BlockDriver bdrv_gluster = {
     .bdrv_getlength               = qemu_gluster_getlength,
     .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
     .bdrv_truncate                = qemu_gluster_truncate,
-    .bdrv_aio_readv               = qemu_gluster_aio_readv,
-    .bdrv_aio_writev              = qemu_gluster_aio_writev,
-    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_co_readv                = qemu_gluster_co_readv,
+    .bdrv_co_writev               = qemu_gluster_co_writev,
+    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+    .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
     .create_options               = qemu_gluster_create_options,
 };
@@ -639,12 +566,12 @@ static BlockDriver bdrv_gluster_tcp = {
     .bdrv_getlength               = qemu_gluster_getlength,
     .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
     .bdrv_truncate                = qemu_gluster_truncate,
-    .bdrv_aio_readv               = qemu_gluster_aio_readv,
-    .bdrv_aio_writev              = qemu_gluster_aio_writev,
-    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_co_readv                = qemu_gluster_co_readv,
+    .bdrv_co_writev               = qemu_gluster_co_writev,
+    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+    .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
     .create_options               = qemu_gluster_create_options,
 };
@@ -660,12 +587,12 @@ static BlockDriver bdrv_gluster_unix = {
     .bdrv_getlength               = qemu_gluster_getlength,
     .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
     .bdrv_truncate                = qemu_gluster_truncate,
-    .bdrv_aio_readv               = qemu_gluster_aio_readv,
-    .bdrv_aio_writev              = qemu_gluster_aio_writev,
-    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_co_readv                = qemu_gluster_co_readv,
+    .bdrv_co_writev               = qemu_gluster_co_writev,
+    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+    .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
     .create_options               = qemu_gluster_create_options,
 };
@@ -681,12 +608,12 @@ static BlockDriver bdrv_gluster_rdma = {
     .bdrv_getlength               = qemu_gluster_getlength,
     .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
     .bdrv_truncate                = qemu_gluster_truncate,
-    .bdrv_aio_readv               = qemu_gluster_aio_readv,
-    .bdrv_aio_writev              = qemu_gluster_aio_writev,
-    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .bdrv_co_readv                = qemu_gluster_co_readv,
+    .bdrv_co_writev               = qemu_gluster_co_writev,
+    .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_aio_discard             = qemu_gluster_aio_discard,
+    .bdrv_co_discard              = qemu_gluster_co_discard,
 #endif
     .create_options               = qemu_gluster_create_options,
 };
commit 92397116a6789ed4455c6dacea0f378cae096d8d
Author: Peter Lieven <pl at kamp.de>
Date:   Fri Dec 20 10:02:47 2013 +0100

    block/iscsi: return -ENOMEM if an async call fails immediately
    
    if an async libiscsi call fails directly it can only be due
    to an out of memory condition. All other errors are returned
    through the callback.
    
    Signed-off-by: Peter Lieven <pl at kamp.de>
    Reviewed-by: Ronnie Sahlberg <ronniesahlberg at gmail.com>
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/iscsi.c b/block/iscsi.c
index c0ea0c4..76b3c96 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -308,7 +308,7 @@ retry:
                                     iscsi_co_generic_cb, &iTask);
     if (iTask.task == NULL) {
         g_free(buf);
-        return -EIO;
+        return -ENOMEM;
     }
 #if defined(LIBISCSI_FEATURE_IOVECTOR)
     scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov,
@@ -376,7 +376,7 @@ retry:
         break;
     }
     if (iTask.task == NULL) {
-        return -EIO;
+        return -ENOMEM;
     }
 #if defined(LIBISCSI_FEATURE_IOVECTOR)
     scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov);
@@ -419,7 +419,7 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs)
 retry:
     if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0,
                                       0, iscsi_co_generic_cb, &iTask) == NULL) {
-        return -EIO;
+        return -ENOMEM;
     }
 
     while (!iTask.complete) {
@@ -669,7 +669,7 @@ retry:
                                   sector_qemu2lun(sector_num, iscsilun),
                                   8 + 16, iscsi_co_generic_cb,
                                   &iTask) == NULL) {
-        ret = -EIO;
+        ret = -ENOMEM;
         goto out;
     }
 
@@ -753,7 +753,7 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num,
 retry:
     if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1,
                      iscsi_co_generic_cb, &iTask) == NULL) {
-        return -EIO;
+        return -ENOMEM;
     }
 
     while (!iTask.complete) {
@@ -822,7 +822,7 @@ retry:
                                iscsilun->zeroblock, iscsilun->block_size,
                                nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP),
                                0, 0, iscsi_co_generic_cb, &iTask) == NULL) {
-        return -EIO;
+        return -ENOMEM;
     }
 
     while (!iTask.complete) {
commit 487c1910023c83fa6d550a50c8ad7ee730e60bfa
Author: Fam Zheng <famz at redhat.com>
Date:   Tue Nov 26 14:40:34 2013 +0800

    qemu-iotests: Clean up all extents for vmdk
    
    This modifies _cleanup_test_img to remove all the extent files listed by
    "qemu-img info"'s format specific information.
    
    Signed-off-by: Fam Zheng <famz at redhat.com>
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 2489c43..0f68156 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -170,6 +170,17 @@ _make_test_img()
     fi
 }
 
+_rm_test_img()
+{
+    local img=$1
+    if [ "$IMGFMT" = "vmdk" ]; then
+        # Remove all the extents for vmdk
+        $QEMU_IMG info $img 2>/dev/null | grep 'filename:' | cut -f 2 -d: \
+            | xargs -I {} rm -f "{}"
+    fi
+    rm -f $img
+}
+
 _cleanup_test_img()
 {
     case "$IMGPROTO" in
@@ -179,9 +190,9 @@ _cleanup_test_img()
             rm -f "$TEST_IMG_FILE"
             ;;
         file)
-            rm -f "$TEST_DIR/t.$IMGFMT"
-            rm -f "$TEST_DIR/t.$IMGFMT.orig"
-            rm -f "$TEST_DIR/t.$IMGFMT.base"
+            _rm_test_img "$TEST_DIR/t.$IMGFMT"
+            _rm_test_img "$TEST_DIR/t.$IMGFMT.orig"
+            _rm_test_img "$TEST_DIR/t.$IMGFMT.base"
             if [ -n "$SAMPLE_IMG_FILE" ]
             then
                 rm -f "$TEST_DIR/$SAMPLE_IMG_FILE"
commit d2329f27c9c8408d4134c7243313dbaa37270384
Author: Fam Zheng <famz at redhat.com>
Date:   Tue Nov 26 14:40:33 2013 +0800

    qemu-iotests: Add _unsupported_imgopts for vmdk subformats
    
    Some cases are not applicable for vmdk subformats those don't support
    certain features, e.g. backing file, and some others can't run on
    mult-file image, e.g. monolithicFlat. This adds declaration in test
    cases to skip them automatically, so that iotests on vmdk can go
    more smoothly (without manually picking of cases for each subformat).
    
    Signed-off-by: Fam Zheng <famz at redhat.com>
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/tests/qemu-iotests/017 b/tests/qemu-iotests/017
index aba3faf..3af3cdf 100755
--- a/tests/qemu-iotests/017
+++ b/tests/qemu-iotests/017
@@ -43,6 +43,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt qcow qcow2 vmdk qed
 _supported_proto generic
 _supported_os Linux
+_unsupported_imgopts "subformat=monolithicFlat" "subformat=twoGbMaxExtentFlat"
 
 TEST_OFFSETS="0 4294967296"
 
diff --git a/tests/qemu-iotests/018 b/tests/qemu-iotests/018
index 15fcfe5..6f7f054 100755
--- a/tests/qemu-iotests/018
+++ b/tests/qemu-iotests/018
@@ -43,6 +43,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt qcow qcow2 vmdk qed
 _supported_proto generic
 _supported_os Linux
+_unsupported_imgopts "subformat=monolithicFlat" "subformat=twoGbMaxExtentFlat"
 
 TEST_OFFSETS="0 4294967296"
 
diff --git a/tests/qemu-iotests/019 b/tests/qemu-iotests/019
index 5bb18d0..b43e70f 100755
--- a/tests/qemu-iotests/019
+++ b/tests/qemu-iotests/019
@@ -47,6 +47,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt qcow qcow2 vmdk qed
 _supported_proto generic
 _supported_os Linux
+_unsupported_imgopts "subformat=monolithicFlat" \
+                     "subformat=twoGbMaxExtentFlat" \
+                     "subformat=twoGbMaxExtentSparse"
 
 TEST_OFFSETS="0 4294967296"
 CLUSTER_SIZE=65536
diff --git a/tests/qemu-iotests/020 b/tests/qemu-iotests/020
index b3c86d8..73a0429 100755
--- a/tests/qemu-iotests/020
+++ b/tests/qemu-iotests/020
@@ -45,6 +45,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt qcow qcow2 vmdk qed
 _supported_proto generic
 _supported_os Linux
+_unsupported_imgopts "subformat=monolithicFlat" \
+                     "subformat=twoGbMaxExtentFlat" \
+                     "subformat=twoGbMaxExtentSparse"
 
 TEST_OFFSETS="0 4294967296"
 
diff --git a/tests/qemu-iotests/034 b/tests/qemu-iotests/034
index 67f1959..7349789 100755
--- a/tests/qemu-iotests/034
+++ b/tests/qemu-iotests/034
@@ -41,6 +41,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt qcow qcow2 vmdk qed
 _supported_proto generic
 _supported_os Linux
+_unsupported_imgopts "subformat=monolithicFlat" \
+                     "subformat=twoGbMaxExtentFlat" \
+                     "subformat=twoGbMaxExtentSparse"
 
 CLUSTER_SIZE=4k
 size=128M
diff --git a/tests/qemu-iotests/037 b/tests/qemu-iotests/037
index 743bae3..e444349 100755
--- a/tests/qemu-iotests/037
+++ b/tests/qemu-iotests/037
@@ -41,6 +41,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt qcow qcow2 vmdk qed
 _supported_proto generic
 _supported_os Linux
+_unsupported_imgopts "subformat=monolithicFlat" \
+                     "subformat=twoGbMaxExtentFlat" \
+                     "subformat=twoGbMaxExtentSparse"
 
 CLUSTER_SIZE=4k
 size=128M
diff --git a/tests/qemu-iotests/059 b/tests/qemu-iotests/059
index 65bea1d..d8215ae 100755
--- a/tests/qemu-iotests/059
+++ b/tests/qemu-iotests/059
@@ -42,6 +42,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt vmdk
 _supported_proto generic
 _supported_os Linux
+_unsupported_imgopts "subformat=monolithicFlat" \
+                     "subformat=twoGbMaxExtentFlat" \
+                     "subformat=twoGbMaxExtentSparse"
 
 capacity_offset=16
 granularity_offset=20
diff --git a/tests/qemu-iotests/063 b/tests/qemu-iotests/063
index 2ab8f20..77503a2 100755
--- a/tests/qemu-iotests/063
+++ b/tests/qemu-iotests/063
@@ -44,6 +44,9 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt qcow qcow2 vmdk qed raw
 _supported_proto generic
 _supported_os Linux
+_unsupported_imgopts "subformat=monolithicFlat" \
+                     "subformat=twoGbMaxExtentFlat" \
+                     "subformat=twoGbMaxExtentSparse"
 
 _make_test_img 4M
 
diff --git a/tests/qemu-iotests/069 b/tests/qemu-iotests/069
index 3042803..50347d9 100755
--- a/tests/qemu-iotests/069
+++ b/tests/qemu-iotests/069
@@ -41,6 +41,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt cow qed qcow qcow2 vmdk
 _supported_proto generic
 _supported_os Linux
+_unsupported_imgopts "subformat=monolithicFlat" "subformat=twoGbMaxExtentFlat"
 
 IMG_SIZE=128K
 
commit 2c77f52e39ff2ba071e3b549ad7a3ebea0758edd
Author: Fam Zheng <famz at redhat.com>
Date:   Tue Nov 26 14:40:32 2013 +0800

    qemu-iotests: Introduce _unsupported_imgopts
    
    Introduce _unsupported_imgopts that causes _notrun for specific image
    options.
    
    Signed-off-by: Fam Zheng <famz at redhat.com>
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 28ba0d9..2489c43 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -406,6 +406,17 @@ _default_cache_mode()
     fi
 }
 
+_unsupported_imgopts()
+{
+    for bad_opt
+    do
+        if echo "$IMGOPTS" | grep -q 2>/dev/null "$bad_opt"
+        then
+            _notrun "not suitable for image option: $bad_opt"
+        fi
+    done
+}
+
 # this test requires that a specified command (executable) exists
 #
 _require_command()
commit e04fb07fd1676e9facd7f3f878c1bbe03bccd26b
Author: Stefan Hajnoczi <stefanha at redhat.com>
Date:   Thu Dec 5 16:38:33 2013 +0100

    rbd: switch from pipe to QEMUBH completion notification
    
    rbd callbacks are called from non-QEMU threads.  Up until now a pipe was
    used to signal completion back to the QEMU iothread.
    
    The pipe writer code handles EAGAIN using select(2).  The select(2) API
    is not scalable since fd_set size is static.  FD_SET() can write beyond
    the end of fd_set if the file descriptor number is too high.  (QEMU's
    main loop uses poll(2) to avoid this issue with select(2).)
    
    Since the pipe itself is quite clumsy to use and QEMUBH is now
    thread-safe, just schedule a BH from the rbd callback function.  This
    way we can simplify I/O completion in addition to eliminating the
    potential FD_SET() crash when file descriptor numbers become too high.
    
    Crash scenario: QEMU already has 1024 file descriptors open.  Hotplug an
    rbd drive and get the pipe writer to take the select(2) code path.
    
    Reviewed-by: Josh Durgin <josh.durgin at inktank.com>
    Tested-by: Josh Durgin <josh.durgin at inktank.com>
    Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
    Signed-off-by: Kevin Wolf <kwolf at redhat.com>

diff --git a/block/rbd.c b/block/rbd.c
index f453f04..121fae2 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -95,18 +95,13 @@ typedef struct RADOSCB {
 #define RBD_FD_WRITE 1
 
 typedef struct BDRVRBDState {
-    int fds[2];
     rados_t cluster;
     rados_ioctx_t io_ctx;
     rbd_image_t image;
     char name[RBD_MAX_IMAGE_NAME_SIZE];
     char *snap;
-    int event_reader_pos;
-    RADOSCB *event_rcb;
 } BDRVRBDState;
 
-static void rbd_aio_bh_cb(void *opaque);
-
 static int qemu_rbd_next_tok(char *dst, int dst_len,
                              char *src, char delim,
                              const char *name,
@@ -369,9 +364,8 @@ static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options,
 }
 
 /*
- * This aio completion is being called from qemu_rbd_aio_event_reader()
- * and runs in qemu context. It schedules a bh, but just in case the aio
- * was not cancelled before.
+ * This aio completion is being called from rbd_finish_bh() and runs in qemu
+ * BH context.
  */
 static void qemu_rbd_complete_aio(RADOSCB *rcb)
 {
@@ -401,36 +395,19 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
             acb->ret = r;
         }
     }
-    /* Note that acb->bh can be NULL in case where the aio was cancelled */
-    acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb);
-    qemu_bh_schedule(acb->bh);
-    g_free(rcb);
-}
 
-/*
- * aio fd read handler. It runs in the qemu context and calls the
- * completion handling of completed rados aio operations.
- */
-static void qemu_rbd_aio_event_reader(void *opaque)
-{
-    BDRVRBDState *s = opaque;
+    g_free(rcb);
 
-    ssize_t ret;
+    if (acb->cmd == RBD_AIO_READ) {
+        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
+    }
+    qemu_vfree(acb->bounce);
+    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+    acb->status = 0;
 
-    do {
-        char *p = (char *)&s->event_rcb;
-
-        /* now read the rcb pointer that was sent from a non qemu thread */
-        ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos,
-                   sizeof(s->event_rcb) - s->event_reader_pos);
-        if (ret > 0) {
-            s->event_reader_pos += ret;
-            if (s->event_reader_pos == sizeof(s->event_rcb)) {
-                s->event_reader_pos = 0;
-                qemu_rbd_complete_aio(s->event_rcb);
-            }
-        }
-    } while (ret < 0 && errno == EINTR);
+    if (!acb->cancelled) {
+        qemu_aio_release(acb);
+    }
 }
 
 /* TODO Convert to fine grained options */
@@ -538,23 +515,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
 
     bs->read_only = (s->snap != NULL);
 
-    s->event_reader_pos = 0;
-    r = qemu_pipe(s->fds);
-    if (r < 0) {
-        error_report("error opening eventfd");
-        goto failed;
-    }
-    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
-    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
-    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader,
-                            NULL, s);
-
-
     qemu_opts_del(opts);
     return 0;
 
-failed:
-    rbd_close(s->image);
 failed_open:
     rados_ioctx_destroy(s->io_ctx);
 failed_shutdown:
@@ -569,10 +532,6 @@ static void qemu_rbd_close(BlockDriverState *bs)
 {
     BDRVRBDState *s = bs->opaque;
 
-    close(s->fds[0]);
-    close(s->fds[1]);
-    qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL);
-
     rbd_close(s->image);
     rados_ioctx_destroy(s->io_ctx);
     g_free(s->snap);
@@ -600,34 +559,11 @@ static const AIOCBInfo rbd_aiocb_info = {
     .cancel = qemu_rbd_aio_cancel,
 };
 
-static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb)
+static void rbd_finish_bh(void *opaque)
 {
-    int ret = 0;
-    while (1) {
-        fd_set wfd;
-        int fd = s->fds[RBD_FD_WRITE];
-
-        /* send the op pointer to the qemu thread that is responsible
-           for the aio/op completion. Must do it in a qemu thread context */
-        ret = write(fd, (void *)&rcb, sizeof(rcb));
-        if (ret >= 0) {
-            break;
-        }
-        if (errno == EINTR) {
-            continue;
-        }
-        if (errno != EAGAIN) {
-            break;
-        }
-
-        FD_ZERO(&wfd);
-        FD_SET(fd, &wfd);
-        do {
-            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
-        } while (ret < 0 && errno == EINTR);
-    }
-
-    return ret;
+    RADOSCB *rcb = opaque;
+    qemu_bh_delete(rcb->acb->bh);
+    qemu_rbd_complete_aio(rcb);
 }
 
 /*
@@ -635,40 +571,18 @@ static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb)
  *
  * Note: this function is being called from a non qemu thread so
  * we need to be careful about what we do here. Generally we only
- * write to the block notification pipe, and do the rest of the
- * io completion handling from qemu_rbd_aio_event_reader() which
- * runs in a qemu context.
+ * schedule a BH, and do the rest of the io completion handling
+ * from rbd_finish_bh() which runs in a qemu context.
  */
 static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
 {
-    int ret;
+    RBDAIOCB *acb = rcb->acb;
+
     rcb->ret = rbd_aio_get_return_value(c);
     rbd_aio_release(c);
-    ret = qemu_rbd_send_pipe(rcb->s, rcb);
-    if (ret < 0) {
-        error_report("failed writing to acb->s->fds");
-        g_free(rcb);
-    }
-}
-
-/* Callback when all queued rbd_aio requests are complete */
 
-static void rbd_aio_bh_cb(void *opaque)
-{
-    RBDAIOCB *acb = opaque;
-
-    if (acb->cmd == RBD_AIO_READ) {
-        qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
-    }
-    qemu_vfree(acb->bounce);
-    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-    acb->status = 0;
-
-    if (!acb->cancelled) {
-        qemu_aio_release(acb);
-    }
+    acb->bh = qemu_bh_new(rbd_finish_bh, rcb);
+    qemu_bh_schedule(acb->bh);
 }
 
 static int rbd_aio_discard_wrapper(rbd_image_t image,
commit 2777ccc55bfe90bfa813b01faf36fa6ea16fbea8
Author: Stefan Weil <sw at weilnetz.de>
Date:   Sat Dec 7 16:25:17 2013 +0100

    gtk: Support keyboard translation for hosts running Windows
    
    GTK uses different hardware keycodes on Windows hosts, so some special
    handling is needed to get the QEMU keycode.
    
    Signed-off-by: Stefan Weil <sw at weilnetz.de>

diff --git a/ui/gtk.c b/ui/gtk.c
index 6316f5b..a633d89 100644
--- a/ui/gtk.c
+++ b/ui/gtk.c
@@ -34,6 +34,10 @@
 #define GETTEXT_PACKAGE "qemu"
 #define LOCALEDIR "po"
 
+#ifdef _WIN32
+# define _WIN32_WINNT 0x0601 /* needed to get definition of MAPVK_VK_TO_VSC */
+#endif
+
 #include "qemu-common.h"
 
 #ifdef CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE
@@ -704,11 +708,18 @@ static gboolean gd_button_event(GtkWidget *widget, GdkEventButton *button,
 static gboolean gd_key_event(GtkWidget *widget, GdkEventKey *key, void *opaque)
 {
     GtkDisplayState *s = opaque;
-    int gdk_keycode;
-    int qemu_keycode;
+    int gdk_keycode = key->hardware_keycode;
     int i;
 
-    gdk_keycode = key->hardware_keycode;
+#ifdef _WIN32
+    UINT qemu_keycode = MapVirtualKey(gdk_keycode, MAPVK_VK_TO_VSC);
+    switch (qemu_keycode) {
+    case 103:   /* alt gr */
+        qemu_keycode = 56 | SCANCODE_GREY;
+        break;
+    }
+#else
+    int qemu_keycode;
 
     if (gdk_keycode < 9) {
         qemu_keycode = 0;
@@ -723,6 +734,7 @@ static gboolean gd_key_event(GtkWidget *widget, GdkEventKey *key, void *opaque)
     } else {
         qemu_keycode = 0;
     }
+#endif
 
     trace_gd_key_event(gdk_keycode, qemu_keycode,
                        (key->type == GDK_KEY_PRESS) ? "down" : "up");
commit 439d19f2922ac409ee224bc1e5522cee7009d829
Author: Paolo Bonzini <pbonzini at redhat.com>
Date:   Mon Jan 20 14:22:25 2014 +0100

    kvm: always update the MPX model specific register
    
    The original patch from Liu Jinsong restricted them to reset or full
    state updates, but that's unnecessary (and wrong) since the BNDCFGS
    MSR has no side effects.
    
    Cc: Liu Jinsong <jinsong.liu at intel.com>
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 221c8a0..d34981f 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1161,6 +1161,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
         kvm_msr_entry_set(&msrs[n++], MSR_IA32_MISC_ENABLE,
                           env->msr_ia32_misc_enable);
     }
+    if (has_msr_bndcfgs) {
+        kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
+    }
 #ifdef TARGET_X86_64
     if (lm_capable_kernel) {
         kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
@@ -1224,9 +1227,6 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
         if (cpu->hyperv_vapic) {
             kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE, 0);
         }
-        if (has_msr_bndcfgs) {
-            kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
-        }
 
         /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
          *       kvm_put_msr_feature_control. */
commit 918b94e2873cd5fe8aef06d269b4a4c7d0832ce7
Author: Paul Moore <pmoore at redhat.com>
Date:   Wed Jan 15 14:38:58 2014 -0500

    seccomp: add some basic shared memory syscalls to the whitelist
    
    PulseAudio requires the use of shared memory so add shmget(), shmat(),
    and shmdt() to the syscall whitelist.
    
    Reported-by: xuhan at redhat.com
    Signed-off-by: Paul Moore <pmoore at redhat.com>

diff --git a/qemu-seccomp.c b/qemu-seccomp.c
index 89f244f..caa926e 100644
--- a/qemu-seccomp.c
+++ b/qemu-seccomp.c
@@ -222,7 +222,10 @@ static const struct QemuSeccompSyscall seccomp_whitelist[] = {
     { SCMP_SYS(io_destroy), 241 },
     { SCMP_SYS(arch_prctl), 240 },
     { SCMP_SYS(mkdir), 240 },
-    { SCMP_SYS(fchmod), 240 }
+    { SCMP_SYS(fchmod), 240 },
+    { SCMP_SYS(shmget), 240 },
+    { SCMP_SYS(shmat), 240 },
+    { SCMP_SYS(shmdt), 240 }
 };
 
 int seccomp_start(void)
commit 0c2acb163fbb4579dad2d45595570b0a9ff71149
Author: Paul Moore <pmoore at redhat.com>
Date:   Wed Jan 15 14:38:51 2014 -0500

    seccomp: add mkdir() and fchmod() to the whitelist
    
    The PulseAudio library attempts to do a mkdir(2) and fchmod(2) on
    "/run/user/<UID>/pulse" which is currently blocked by the syscall
    filter; this patch adds the two missing syscalls to the whitelist.
    You can reproduce this problem with the following command:
    
     # qemu -monitor stdio -device intel-hda -device hda-duplex
    
    If watched under strace the following syscalls are shown:
    
     mkdir("/run/user/0/pulse", 0700)
     fchmod(11, 0700) [NOTE: 11 is the fd for /run/user/0/pulse]
    
    Reported-by: xuhan at redhat.com
    Signed-off-by: Paul Moore <pmoore at redhat.com>

diff --git a/qemu-seccomp.c b/qemu-seccomp.c
index b7c1253..89f244f 100644
--- a/qemu-seccomp.c
+++ b/qemu-seccomp.c
@@ -220,7 +220,9 @@ static const struct QemuSeccompSyscall seccomp_whitelist[] = {
     { SCMP_SYS(io_cancel), 241 },
     { SCMP_SYS(io_setup), 241 },
     { SCMP_SYS(io_destroy), 241 },
-    { SCMP_SYS(arch_prctl), 240 }
+    { SCMP_SYS(arch_prctl), 240 },
+    { SCMP_SYS(mkdir), 240 },
+    { SCMP_SYS(fchmod), 240 }
 };
 
 int seccomp_start(void)
commit 39e6a38cdd4b235b2918b4977f31fde2c0da3bc4
Author: Gerd Hoffmann <kraxel at redhat.com>
Date:   Mon Dec 16 13:23:53 2013 +0100

    hda-codec: disable streams on reset
    
    Signed-off-by: Gerd Hoffmann <kraxel at redhat.com>

diff --git a/hw/audio/hda-codec.c b/hw/audio/hda-codec.c
index 07a43bf..986f2a9 100644
--- a/hw/audio/hda-codec.c
+++ b/hw/audio/hda-codec.c
@@ -559,6 +559,21 @@ static int hda_audio_post_load(void *opaque, int version)
     return 0;
 }
 
+static void hda_audio_reset(DeviceState *dev)
+{
+    HDAAudioState *a = DO_UPCAST(HDAAudioState, hda.qdev, dev);
+    HDAAudioStream *st;
+    int i;
+
+    dprint(a, 1, "%s\n", __func__);
+    for (i = 0; i < ARRAY_SIZE(a->st); i++) {
+        st = a->st + i;
+        if (st->node != NULL) {
+            hda_audio_set_running(st, false);
+        }
+    }
+}
+
 static const VMStateDescription vmstate_hda_audio_stream = {
     .name = "hda-audio-stream",
     .version_id = 1,
@@ -640,6 +655,7 @@ static void hda_audio_output_class_init(ObjectClass *klass, void *data)
     k->stream = hda_audio_stream;
     set_bit(DEVICE_CATEGORY_SOUND, dc->categories);
     dc->desc = "HDA Audio Codec, output-only (line-out)";
+    dc->reset = hda_audio_reset;
     dc->vmsd = &vmstate_hda_audio;
     dc->props = hda_audio_properties;
 }
@@ -662,6 +678,7 @@ static void hda_audio_duplex_class_init(ObjectClass *klass, void *data)
     k->stream = hda_audio_stream;
     set_bit(DEVICE_CATEGORY_SOUND, dc->categories);
     dc->desc = "HDA Audio Codec, duplex (line-out, line-in)";
+    dc->reset = hda_audio_reset;
     dc->vmsd = &vmstate_hda_audio;
     dc->props = hda_audio_properties;
 }
@@ -684,6 +701,7 @@ static void hda_audio_micro_class_init(ObjectClass *klass, void *data)
     k->stream = hda_audio_stream;
     set_bit(DEVICE_CATEGORY_SOUND, dc->categories);
     dc->desc = "HDA Audio Codec, duplex (speaker, microphone)";
+    dc->reset = hda_audio_reset;
     dc->vmsd = &vmstate_hda_audio;
     dc->props = hda_audio_properties;
 }
commit 732c66ce641c69702a7e7fdb73b68f0c1b583ab5
Author: Peter Crosthwaite <peter.crosthwaite at xilinx.com>
Date:   Wed Jan 15 03:06:11 2014 -0800

    Revert "error: Don't use error_report() for assertion msgs."
    
    This reverts commit d32934c84c72f57e78d430c22974677b7bcabe5d.
    
    The original implementation before this patch makes abortive error
    messages much more friendly. The underlying bug that required this
    change is now fixed. Revert.
    
    Signed-off-by: Peter Crosthwaite <peter.crosthwaite at xilinx.com>
    Signed-off-by: Edgar E. Iglesias <edgar.iglesias at xilinx.com>

diff --git a/util/error.c b/util/error.c
index e5de34f..f11f1d5 100644
--- a/util/error.c
+++ b/util/error.c
@@ -44,7 +44,7 @@ void error_set(Error **errp, ErrorClass err_class, const char *fmt, ...)
     err->err_class = err_class;
 
     if (errp == &error_abort) {
-        fprintf(stderr, "%s\n", error_get_pretty(err));
+        error_report("%s", error_get_pretty(err));
         abort();
     }
 
@@ -80,7 +80,7 @@ void error_set_errno(Error **errp, int os_errno, ErrorClass err_class,
     err->err_class = err_class;
 
     if (errp == &error_abort) {
-        fprintf(stderr, "%s\n", error_get_pretty(err));
+        error_report("%s", error_get_pretty(err));
         abort();
     }
 
@@ -125,7 +125,7 @@ void error_set_win32(Error **errp, int win32_err, ErrorClass err_class,
     err->err_class = err_class;
 
     if (errp == &error_abort) {
-        fprintf(stderr, "%s\n", error_get_pretty(err));
+        error_report("%s", error_get_pretty(err));
         abort();
     }
 
@@ -171,7 +171,7 @@ void error_free(Error *err)
 void error_propagate(Error **dst_err, Error *local_err)
 {
     if (local_err && dst_err == &error_abort) {
-        fprintf(stderr, "%s\n", error_get_pretty(local_err));
+        error_report("%s", error_get_pretty(local_err));
         abort();
     } else if (dst_err && !*dst_err) {
         *dst_err = local_err;
commit 3dbe85b8404fa479ad0a75d5adb464949257f129
Author: Peter Crosthwaite <peter.crosthwaite at xilinx.com>
Date:   Wed Jan 15 03:05:36 2014 -0800

    tests: Add libqemustub to qom-interface-check
    
    The recent addition of util/error.c's dependency on error_report()
    causes this test to fail to link due to a number of missing monitor
    related symbols. All these symbols are however defined by libqemustub.
    Add this libary to the link.
    
    Signed-off-by: Peter Crosthwaite <peter.crosthwaite at xilinx.com>
    Reviewed-by: Igor Mammedov <imammedo at redhat.com>
    Signed-off-by: Edgar E. Iglesias <edgar.iglesias at xilinx.com>

diff --git a/tests/Makefile b/tests/Makefile
index 0aaf657..fd36eee 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -151,7 +151,7 @@ tests/check-qdict$(EXESUF): tests/check-qdict.o libqemuutil.a
 tests/check-qlist$(EXESUF): tests/check-qlist.o libqemuutil.a
 tests/check-qfloat$(EXESUF): tests/check-qfloat.o libqemuutil.a
 tests/check-qjson$(EXESUF): tests/check-qjson.o libqemuutil.a libqemustub.a
-tests/check-qom-interface$(EXESUF): tests/check-qom-interface.o $(qom-core-obj) libqemuutil.a
+tests/check-qom-interface$(EXESUF): tests/check-qom-interface.o $(qom-core-obj) libqemuutil.a libqemustub.a
 tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(block-obj-y) libqemuutil.a libqemustub.a
 tests/test-aio$(EXESUF): tests/test-aio.o $(block-obj-y) libqemuutil.a libqemustub.a
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(block-obj-y) libqemuutil.a libqemustub.a
commit 1cb27d9233d572826b45bd8498d2fab1b6f01df9
Author: Paolo Bonzini <pbonzini at redhat.com>
Date:   Thu Jan 16 13:06:13 2014 +0100

    scsi: Support TEST UNIT READY in the dummy LUN0
    
    SeaBIOS waits for LUN0 to respond to the TEST UNIT READY command
    in order to decide whether it should part of the boot sequence.
    If LUN0 does not respond to the command, boot is delayed by up
    to 5 seconds.  This currently happens when there is no LUN0 on
    a target.  Fix that by adding a trivial implementation of the
    command.
    
    Cc: qemu-stable at nongnu.org
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>

diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 3496c0b..50b89ad 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -469,6 +469,8 @@ static int32_t scsi_target_send_command(SCSIRequest *req, uint8_t *buf)
             r->req.dev->sense_is_ua = false;
         }
         break;
+    case TEST_UNIT_READY:
+        break;
     default:
         scsi_req_build_sense(req, SENSE_CODE(LUN_NOT_SUPPORTED));
         scsi_req_complete(req, CHECK_CONDITION);
commit 88678fbd9dbf01fd0988bcb651508378d85e868a
Author: Gerd Hoffmann <kraxel at redhat.com>
Date:   Wed Nov 20 07:33:50 2013 +0100

    usb-hid: add microsoft os descriptor support
    
    Set SelectiveSuspendEnabled registy entry to one.
    This makes Windows use remote suspend by default,
    without manual registry fiddeling.
    
    Signed-off-by: Gerd Hoffmann <kraxel at redhat.com>

diff --git a/hw/usb/dev-hid.c b/hw/usb/dev-hid.c
index 5e667f0..2966066 100644
--- a/hw/usb/dev-hid.c
+++ b/hw/usb/dev-hid.c
@@ -261,6 +261,10 @@ static const USBDescDevice desc_device_keyboard = {
     },
 };
 
+static const USBDescMSOS desc_msos_suspend = {
+    .SelectiveSuspendEnabled = true,
+};
+
 static const USBDesc desc_mouse = {
     .id = {
         .idVendor          = 0x0627,
@@ -272,6 +276,7 @@ static const USBDesc desc_mouse = {
     },
     .full = &desc_device_mouse,
     .str  = desc_strings,
+    .msos = &desc_msos_suspend,
 };
 
 static const USBDesc desc_tablet = {
@@ -285,6 +290,7 @@ static const USBDesc desc_tablet = {
     },
     .full = &desc_device_tablet,
     .str  = desc_strings,
+    .msos = &desc_msos_suspend,
 };
 
 static const USBDesc desc_tablet2 = {
@@ -299,6 +305,7 @@ static const USBDesc desc_tablet2 = {
     .full = &desc_device_tablet,
     .high = &desc_device_tablet2,
     .str  = desc_strings,
+    .msos = &desc_msos_suspend,
 };
 
 static const USBDesc desc_keyboard = {
@@ -312,6 +319,7 @@ static const USBDesc desc_keyboard = {
     },
     .full = &desc_device_keyboard,
     .str  = desc_strings,
+    .msos = &desc_msos_suspend,
 };
 
 static const uint8_t qemu_mouse_hid_report_descriptor[] = {
commit 5319dc7b42610575cbd3a33f4340c1fb4f19b939
Author: Gerd Hoffmann <kraxel at redhat.com>
Date:   Wed Nov 20 07:32:31 2013 +0100

    usb: add support for microsoft os descriptors
    
    This patch adds support for special usb descriptors used by microsoft
    windows.  They allow more fine-grained control over driver binding and
    adding entries to the registry for configuration.
    
    As this is a guest-visible change the "msos-desc" compat property
    has been added to turn this off for 1.7 + older
    
    Signed-off-by: Gerd Hoffmann <kraxel at redhat.com>

diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 2766414..a327d71 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -393,6 +393,10 @@ static QEMUMachine pc_i440fx_machine_v1_7 = {
     PC_I440FX_1_7_MACHINE_OPTIONS,
     .name = "pc-i440fx-1.7",
     .init = pc_init_pci_1_7,
+    .compat_props = (GlobalProperty[]) {
+        PC_COMPAT_1_7,
+        { /* end of list */ }
+    },
 };
 
 #define PC_I440FX_1_6_MACHINE_OPTIONS PC_I440FX_MACHINE_OPTIONS
diff --git a/hw/usb/Makefile.objs b/hw/usb/Makefile.objs
index a3eac3e..97b4575 100644
--- a/hw/usb/Makefile.objs
+++ b/hw/usb/Makefile.objs
@@ -1,5 +1,5 @@
 # usb subsystem core
-common-obj-y += core.o combined-packet.o bus.o desc.o
+common-obj-y += core.o combined-packet.o bus.o desc.o desc-msos.o
 common-obj-y += libhw.o
 
 # usb host adapters
diff --git a/hw/usb/bus.c b/hw/usb/bus.c
index 09848c6..fe70429 100644
--- a/hw/usb/bus.c
+++ b/hw/usb/bus.c
@@ -16,6 +16,8 @@ static Property usb_props[] = {
     DEFINE_PROP_STRING("serial", USBDevice, serial),
     DEFINE_PROP_BIT("full-path", USBDevice, flags,
                     USB_DEV_FLAG_FULL_PATH, true),
+    DEFINE_PROP_BIT("msos-desc", USBDevice, flags,
+                    USB_DEV_FLAG_MSOS_DESC_ENABLE, true),
     DEFINE_PROP_END_OF_LIST()
 };
 
diff --git a/hw/usb/desc-msos.c b/hw/usb/desc-msos.c
new file mode 100644
index 0000000..ed8d62c
--- /dev/null
+++ b/hw/usb/desc-msos.c
@@ -0,0 +1,234 @@
+#include "hw/usb.h"
+#include "hw/usb/desc.h"
+
+/*
+ * Microsoft OS Descriptors
+ *
+ * Windows tries to fetch some special descriptors with informations
+ * specifically for windows.  Presence is indicated using a special
+ * string @ index 0xee.  There are two kinds of descriptors:
+ *
+ * compatid descriptor
+ *   Used to bind drivers, if usb class isn't specific enougth.
+ *   Used for PTP/MTP for example (both share the same usb class).
+ *
+ * properties descriptor
+ *   Does carry registry entries.  They show up in
+ *   HLM\SYSTEM\CurrentControlSet\Enum\USB\<devid>\<serial>\Device Parameters
+ *
+ * Note that Windows caches the stuff it got in the registry, so when
+ * playing with this you have to delete registry subtrees to make
+ * windows query the device again:
+ *   HLM\SYSTEM\CurrentControlSet\Control\usbflags
+ *   HLM\SYSTEM\CurrentControlSet\Enum\USB
+ * Windows will complain it can't delete entries on the second one.
+ * It has deleted everything it had permissions too, which is enouth
+ * as this includes "Device Parameters".
+ *
+ * http://msdn.microsoft.com/en-us/library/windows/hardware/ff537430.aspx
+ *
+ */
+
+/* ------------------------------------------------------------------ */
+
+typedef struct msos_compat_hdr {
+    uint32_t dwLength;
+    uint8_t  bcdVersion_lo;
+    uint8_t  bcdVersion_hi;
+    uint8_t  wIndex_lo;
+    uint8_t  wIndex_hi;
+    uint8_t  bCount;
+    uint8_t  reserved[7];
+} QEMU_PACKED msos_compat_hdr;
+
+typedef struct msos_compat_func {
+    uint8_t  bFirstInterfaceNumber;
+    uint8_t  reserved_1;
+    uint8_t  compatibleId[8];
+    uint8_t  subCompatibleId[8];
+    uint8_t  reserved_2[6];
+} QEMU_PACKED msos_compat_func;
+
+static int usb_desc_msos_compat(const USBDesc *desc, uint8_t *dest)
+{
+    msos_compat_hdr *hdr = (void *)dest;
+    msos_compat_func *func;
+    int length = sizeof(*hdr);
+    int count = 0;
+
+    func = (void *)(dest + length);
+    func->bFirstInterfaceNumber = 0;
+    func->reserved_1 = 0x01;
+    length += sizeof(*func);
+    count++;
+
+    hdr->dwLength      = cpu_to_le32(length);
+    hdr->bcdVersion_lo = 0x00;
+    hdr->bcdVersion_hi = 0x01;
+    hdr->wIndex_lo     = 0x04;
+    hdr->wIndex_hi     = 0x00;
+    hdr->bCount        = count;
+    return length;
+}
+
+/* ------------------------------------------------------------------ */
+
+typedef struct msos_prop_hdr {
+    uint32_t dwLength;
+    uint8_t  bcdVersion_lo;
+    uint8_t  bcdVersion_hi;
+    uint8_t  wIndex_lo;
+    uint8_t  wIndex_hi;
+    uint8_t  wCount_lo;
+    uint8_t  wCount_hi;
+} QEMU_PACKED msos_prop_hdr;
+
+typedef struct msos_prop {
+    uint32_t dwLength;
+    uint32_t dwPropertyDataType;
+    uint8_t  dwPropertyNameLength_lo;
+    uint8_t  dwPropertyNameLength_hi;
+    uint8_t  bPropertyName[];
+} QEMU_PACKED msos_prop;
+
+typedef struct msos_prop_data {
+    uint32_t dwPropertyDataLength;
+    uint8_t  bPropertyData[];
+} QEMU_PACKED msos_prop_data;
+
+typedef enum msos_prop_type {
+    MSOS_REG_SZ        = 1,
+    MSOS_REG_EXPAND_SZ = 2,
+    MSOS_REG_BINARY    = 3,
+    MSOS_REG_DWORD_LE  = 4,
+    MSOS_REG_DWORD_BE  = 5,
+    MSOS_REG_LINK      = 6,
+    MSOS_REG_MULTI_SZ  = 7,
+} msos_prop_type;
+
+static int usb_desc_msos_prop_name(struct msos_prop *prop,
+                                   const wchar_t *name)
+{
+    int length = wcslen(name) + 1;
+    int i;
+
+    prop->dwPropertyNameLength_lo = usb_lo(length*2);
+    prop->dwPropertyNameLength_hi = usb_hi(length*2);
+    for (i = 0; i < length; i++) {
+        prop->bPropertyName[i*2]   = usb_lo(name[i]);
+        prop->bPropertyName[i*2+1] = usb_hi(name[i]);
+    }
+    return length*2;
+}
+
+static int usb_desc_msos_prop_str(uint8_t *dest, msos_prop_type type,
+                                  const wchar_t *name, const wchar_t *value)
+{
+    struct msos_prop *prop = (void *)dest;
+    struct msos_prop_data *data;
+    int length = sizeof(*prop);
+    int i, vlen = wcslen(value) + 1;
+
+    prop->dwPropertyDataType = cpu_to_le32(type);
+    length += usb_desc_msos_prop_name(prop, name);
+    data = (void *)(dest + length);
+
+    data->dwPropertyDataLength = cpu_to_le32(vlen*2);
+    length += sizeof(*prop);
+
+    for (i = 0; i < vlen; i++) {
+        data->bPropertyData[i*2]   = usb_lo(value[i]);
+        data->bPropertyData[i*2+1] = usb_hi(value[i]);
+    }
+    length += vlen*2;
+
+    prop->dwLength = cpu_to_le32(length);
+    return length;
+}
+
+static int usb_desc_msos_prop_dword(uint8_t *dest, const wchar_t *name,
+                                    uint32_t value)
+{
+    struct msos_prop *prop = (void *)dest;
+    struct msos_prop_data *data;
+    int length = sizeof(*prop);
+
+    prop->dwPropertyDataType = cpu_to_le32(MSOS_REG_DWORD_LE);
+    length += usb_desc_msos_prop_name(prop, name);
+    data = (void *)(dest + length);
+
+    data->dwPropertyDataLength = cpu_to_le32(4);
+    data->bPropertyData[0] = (value)       & 0xff;
+    data->bPropertyData[1] = (value >>  8) & 0xff;
+    data->bPropertyData[2] = (value >> 16) & 0xff;
+    data->bPropertyData[3] = (value >> 24) & 0xff;
+    length += sizeof(*prop) + 4;
+
+    prop->dwLength = cpu_to_le32(length);
+    return length;
+}
+
+static int usb_desc_msos_prop(const USBDesc *desc, uint8_t *dest)
+{
+    msos_prop_hdr *hdr = (void *)dest;
+    int length = sizeof(*hdr);
+    int count = 0;
+
+    if (desc->msos->Label) {
+        /*
+         * Given as example in the specs.  Havn't figured yet where
+         * this label shows up in the windows gui.
+         */
+        length += usb_desc_msos_prop_str(dest+length, MSOS_REG_SZ,
+                                         L"Label", desc->msos->Label);
+        count++;
+    }
+
+    if (desc->msos->SelectiveSuspendEnabled) {
+        /*
+         * Signaling remote wakeup capability in the standard usb
+         * descriptors isn't enouth to make windows actually use it.
+         * This is the "Yes, we really mean it" registy entry to flip
+         * the switch in the windows drivers.
+         */
+        length += usb_desc_msos_prop_dword(dest+length,
+                                           L"SelectiveSuspendEnabled", 1);
+        count++;
+    }
+
+    hdr->dwLength      = cpu_to_le32(length);
+    hdr->bcdVersion_lo = 0x00;
+    hdr->bcdVersion_hi = 0x01;
+    hdr->wIndex_lo     = 0x05;
+    hdr->wIndex_hi     = 0x00;
+    hdr->wCount_lo     = usb_lo(count);
+    hdr->wCount_hi     = usb_hi(count);
+    return length;
+}
+
+/* ------------------------------------------------------------------ */
+
+int usb_desc_msos(const USBDesc *desc,  USBPacket *p,
+                  int index, uint8_t *dest, size_t len)
+{
+    void *buf = g_malloc0(4096);
+    int length = 0;
+
+    switch (index) {
+    case 0x0004:
+        length = usb_desc_msos_compat(desc, buf);
+        break;
+    case 0x0005:
+        length = usb_desc_msos_prop(desc, buf);
+        break;
+    }
+
+    if (length > len) {
+        length = len;
+    }
+    memcpy(dest, buf, length);
+    free(buf);
+
+    p->actual_length = length;
+    return 0;
+}
diff --git a/hw/usb/desc.c b/hw/usb/desc.c
index f18a043..f133ddb 100644
--- a/hw/usb/desc.c
+++ b/hw/usb/desc.c
@@ -7,7 +7,7 @@
 /* ------------------------------------------------------------------ */
 
 int usb_desc_device(const USBDescID *id, const USBDescDevice *dev,
-                    uint8_t *dest, size_t len)
+                    bool msos, uint8_t *dest, size_t len)
 {
     uint8_t bLength = 0x12;
     USBDescriptor *d = (void *)dest;
@@ -19,8 +19,18 @@ int usb_desc_device(const USBDescID *id, const USBDescDevice *dev,
     d->bLength                     = bLength;
     d->bDescriptorType             = USB_DT_DEVICE;
 
-    d->u.device.bcdUSB_lo          = usb_lo(dev->bcdUSB);
-    d->u.device.bcdUSB_hi          = usb_hi(dev->bcdUSB);
+    if (msos && dev->bcdUSB < 0x0200) {
+        /*
+         * Version 2.0+ required for microsoft os descriptors to work.
+         * Done this way so msos-desc compat property will handle both
+         * the version and the new descriptors being present.
+         */
+        d->u.device.bcdUSB_lo          = usb_lo(0x0200);
+        d->u.device.bcdUSB_hi          = usb_hi(0x0200);
+    } else {
+        d->u.device.bcdUSB_lo          = usb_lo(dev->bcdUSB);
+        d->u.device.bcdUSB_hi          = usb_hi(dev->bcdUSB);
+    }
     d->u.device.bDeviceClass       = dev->bDeviceClass;
     d->u.device.bDeviceSubClass    = dev->bDeviceSubClass;
     d->u.device.bDeviceProtocol    = dev->bDeviceProtocol;
@@ -499,6 +509,10 @@ void usb_desc_init(USBDevice *dev)
     if (desc->super) {
         dev->speedmask |= USB_SPEED_MASK_SUPER;
     }
+    if (desc->msos && (dev->flags & (1 << USB_DEV_FLAG_MSOS_DESC_ENABLE))) {
+        dev->flags |= (1 << USB_DEV_FLAG_MSOS_DESC_IN_USE);
+        usb_desc_set_string(dev, 0xee, "MSFT100Q");
+    }
     usb_desc_setdefaults(dev);
 }
 
@@ -626,6 +640,7 @@ int usb_desc_string(USBDevice *dev, int index, uint8_t *dest, size_t len)
 int usb_desc_get_descriptor(USBDevice *dev, USBPacket *p,
                             int value, uint8_t *dest, size_t len)
 {
+    bool msos = (dev->flags & (1 << USB_DEV_FLAG_MSOS_DESC_IN_USE));
     const USBDesc *desc = usb_device_get_usb_desc(dev);
     const USBDescDevice *other_dev;
     uint8_t buf[256];
@@ -646,7 +661,7 @@ int usb_desc_get_descriptor(USBDevice *dev, USBPacket *p,
 
     switch(type) {
     case USB_DT_DEVICE:
-        ret = usb_desc_device(&desc->id, dev->device, buf, sizeof(buf));
+        ret = usb_desc_device(&desc->id, dev->device, msos, buf, sizeof(buf));
         trace_usb_desc_device(dev->addr, len, ret);
         break;
     case USB_DT_CONFIG:
@@ -703,6 +718,7 @@ int usb_desc_get_descriptor(USBDevice *dev, USBPacket *p,
 int usb_desc_handle_control(USBDevice *dev, USBPacket *p,
         int request, int value, int index, int length, uint8_t *data)
 {
+    bool msos = (dev->flags & (1 << USB_DEV_FLAG_MSOS_DESC_IN_USE));
     const USBDesc *desc = usb_device_get_usb_desc(dev);
     int ret = -1;
 
@@ -782,6 +798,19 @@ int usb_desc_handle_control(USBDevice *dev, USBPacket *p,
         trace_usb_set_interface(dev->addr, index, value, ret);
         break;
 
+    case VendorDeviceRequest | 'Q':
+        if (msos) {
+            ret = usb_desc_msos(desc, p, index, data, length);
+            trace_usb_desc_msos(dev->addr, index, length, ret);
+        }
+        break;
+    case VendorInterfaceRequest | 'Q':
+        if (msos) {
+            ret = usb_desc_msos(desc, p, index, data, length);
+            trace_usb_desc_msos(dev->addr, index, length, ret);
+        }
+        break;
+
     }
     return ret;
 }
diff --git a/hw/usb/desc.h b/hw/usb/desc.h
index 81327b0..2b4fcda 100644
--- a/hw/usb/desc.h
+++ b/hw/usb/desc.h
@@ -2,6 +2,7 @@
 #define QEMU_HW_USB_DESC_H
 
 #include <inttypes.h>
+#include <wchar.h>
 
 /* binary representation */
 typedef struct USBDescriptor {
@@ -182,6 +183,11 @@ struct USBDescOther {
     const uint8_t             *data;
 };
 
+struct USBDescMSOS {
+    const wchar_t             *Label;
+    bool                      SelectiveSuspendEnabled;
+};
+
 typedef const char *USBDescStrings[256];
 
 struct USBDesc {
@@ -190,6 +196,7 @@ struct USBDesc {
     const USBDescDevice       *high;
     const USBDescDevice       *super;
     const char* const         *str;
+    const USBDescMSOS         *msos;
 };
 
 #define USB_DESC_FLAG_SUPER (1 << 1)
@@ -207,7 +214,7 @@ static inline uint8_t usb_hi(uint16_t val)
 
 /* generate usb packages from structs */
 int usb_desc_device(const USBDescID *id, const USBDescDevice *dev,
-                    uint8_t *dest, size_t len);
+                    bool msos, uint8_t *dest, size_t len);
 int usb_desc_device_qualifier(const USBDescDevice *dev,
                               uint8_t *dest, size_t len);
 int usb_desc_config(const USBDescConfig *conf, int flags,
@@ -219,6 +226,8 @@ int usb_desc_iface(const USBDescIface *iface, int flags,
 int usb_desc_endpoint(const USBDescEndpoint *ep, int flags,
                       uint8_t *dest, size_t len);
 int usb_desc_other(const USBDescOther *desc, uint8_t *dest, size_t len);
+int usb_desc_msos(const USBDesc *desc, USBPacket *p,
+                  int index, uint8_t *dest, size_t len);
 
 /* control message emulation helpers */
 void usb_desc_init(USBDevice *dev);
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index eb3da96..7fe2bd1 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -241,6 +241,7 @@ uint16_t pvpanic_port(void);
 int e820_add_entry(uint64_t, uint64_t, uint32_t);
 
 #define PC_Q35_COMPAT_1_7 \
+        PC_COMPAT_1_7, \
         {\
             .driver   = "hpet",\
             .property = HPET_INTCAP,\
@@ -259,7 +260,15 @@ int e820_add_entry(uint64_t, uint64_t, uint32_t);
         PC_COMPAT_1_4, \
         PC_Q35_COMPAT_1_5
 
+#define PC_COMPAT_1_7 \
+        {\
+            .driver   = TYPE_USB_DEVICE,\
+            .property = "msos-desc",\
+            .value    = "no",\
+        }
+
 #define PC_COMPAT_1_6 \
+        PC_COMPAT_1_7, \
         {\
             .driver   = "e1000",\
             .property = "mitigation",\
diff --git a/include/hw/usb.h b/include/hw/usb.h
index 2a3ea0c..3ef7af7 100644
--- a/include/hw/usb.h
+++ b/include/hw/usb.h
@@ -182,6 +182,7 @@ typedef struct USBDescIface USBDescIface;
 typedef struct USBDescEndpoint USBDescEndpoint;
 typedef struct USBDescOther USBDescOther;
 typedef struct USBDescString USBDescString;
+typedef struct USBDescMSOS USBDescMSOS;
 
 struct USBDescString {
     uint8_t index;
@@ -208,6 +209,8 @@ struct USBEndpoint {
 enum USBDeviceFlags {
     USB_DEV_FLAG_FULL_PATH,
     USB_DEV_FLAG_IS_HOST,
+    USB_DEV_FLAG_MSOS_DESC_ENABLE,
+    USB_DEV_FLAG_MSOS_DESC_IN_USE,
 };
 
 /* definition of a USB device */
diff --git a/trace-events b/trace-events
index 9f4456a..1b668d1 100644
--- a/trace-events
+++ b/trace-events
@@ -402,6 +402,7 @@ usb_desc_config(int addr, int index, int len, int ret) "dev %d query config %d,
 usb_desc_other_speed_config(int addr, int index, int len, int ret) "dev %d query config %d, len %d, ret %d"
 usb_desc_string(int addr, int index, int len, int ret) "dev %d query string %d, len %d, ret %d"
 usb_desc_bos(int addr, int len, int ret) "dev %d bos, len %d, ret %d"
+usb_desc_msos(int addr, int index, int len, int ret) "dev %d msos, index 0x%x, len %d, ret %d"
 usb_set_addr(int addr) "dev %d"
 usb_set_config(int addr, int config, int ret) "dev %d, config %d, ret %d"
 usb_set_interface(int addr, int iface, int alt, int ret) "dev %d, interface %d, altsetting %d, ret %d"
commit 584f2be79de148b0765a758ac0c1036a29c5e830
Author: Alexey Kardashevskiy <aik at ozlabs.ru>
Date:   Fri Jan 10 18:20:18 2014 +1100

    KVM: fix addr type for KVM_IOEVENTFD
    
    The @addr here is a guest physical address and can easily be bigger
    than 4G.
    
    This changes uint32_t to hwaddr.
    
    Cc: qemu-stable at nongnu.org
    Cc: Michael S. Tsirkin <mst at redhat.com>
    Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>

diff --git a/kvm-all.c b/kvm-all.c
index 6df2ee1..eb38ee4 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -519,7 +519,7 @@ int kvm_check_extension(KVMState *s, unsigned int extension)
     return ret;
 }
 
-static int kvm_set_ioeventfd_mmio(int fd, uint32_t addr, uint32_t val,
+static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
                                   bool assign, uint32_t size, bool datamatch)
 {
     int ret;
commit 94ccff133820552a859c0fb95e33a539e0b90a75
Author: thomas knych <thomaswk at google.com>
Date:   Thu Jan 9 13:14:23 2014 -0800

    KVM: Retry KVM_CREATE_VM on EINTR
    
    Upstreaming this change from Android (https://android-review.googlesource.com/54211).
    
    On heavily loaded machines with many VM instances we see KVM_CREATE_VM
    failing with EINTR on this path:
    
    kvm_dev_ioctl_create_vm -> kvm_create_vm -> kvm_init_mmu_notifier -> mmu_notifier_register ->  do_mmu_notifier_register -> mm_take_all_locks
    
    which checks if any signals have been raised while it was attaining locks
    and returns EINTR.  Retrying the system call greatly improves reliability.
    
    Cc: qemu-stable at nongnu.org
    Signed-off-by: thomas knych <thomaswk at google.com>
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>

diff --git a/kvm-all.c b/kvm-all.c
index 3937754..6df2ee1 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1442,16 +1442,22 @@ int kvm_init(void)
         nc++;
     }
 
-    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
-    if (s->vmfd < 0) {
+    do {
+        ret = kvm_ioctl(s, KVM_CREATE_VM, 0);
+    } while (ret == -EINTR);
+
+    if (ret < 0) {
+        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -s->vmfd,
+                strerror(-ret));
+
 #ifdef TARGET_S390X
         fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                         "your host kernel command line\n");
 #endif
-        ret = s->vmfd;
         goto err;
     }
 
+    s->vmfd = ret;
     missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
     if (!missing_cap) {
         missing_cap =
commit dc6afb99b39a78cf416c6d19e35f680f202016be
Author: Jeff Cody <jcody at redhat.com>
Date:   Tue Jan 14 13:10:24 2014 -0500

    block: add .bdrv_reopen_prepare() stub for iscsi
    
    To suppport reopen(), the .bdrv_reopen_prepare() stub must exist.
    iSCSI does not have anything that needs to be done to support reopen,
    so we can just implement the _prepare() stub.
    
    Signed-off-by: Jeff Cody <jcody at redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>

diff --git a/block/iscsi.c b/block/iscsi.c
index c0ea0c4..5976bd1 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1326,6 +1326,14 @@ static void iscsi_close(BlockDriverState *bs)
     memset(iscsilun, 0, sizeof(IscsiLun));
 }
 
+/* We have nothing to do for iSCSI reopen, stub just returns
+ * success */
+static int iscsi_reopen_prepare(BDRVReopenState *state,
+                                BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
 {
     IscsiLun *iscsilun = bs->opaque;
@@ -1434,6 +1442,7 @@ static BlockDriver bdrv_iscsi = {
     .bdrv_close      = iscsi_close,
     .bdrv_create     = iscsi_create,
     .create_options  = iscsi_create_options,
+    .bdrv_reopen_prepare  = iscsi_reopen_prepare,
 
     .bdrv_getlength  = iscsi_getlength,
     .bdrv_get_info   = iscsi_get_info,
commit 49fb65c7f985baa56d2964e0a85c1f098e3e2a9d
Author: Eric Farman <farman at linux.vnet.ibm.com>
Date:   Tue Jan 14 14:16:26 2014 -0500

    virtio-scsi: Prevent assertion on missed events
    
    In some cases, an unplug can cause events to be dropped, which
    leads to an assertion failure when preparing to notify the guest
    kernel.
    
    Signed-off-by: Eric Farman <farman at linux.vnet.ibm.com>
    Cc: qemu-stable at nongnu.org
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>

diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 1da98cd..6610b3a 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -520,7 +520,7 @@ static void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev,
     evt->event = event;
     evt->reason = reason;
     if (!dev) {
-        assert(event == VIRTIO_SCSI_T_NO_EVENT);
+        assert(event == VIRTIO_SCSI_T_EVENTS_MISSED);
     } else {
         evt->lun[0] = 1;
         evt->lun[1] = dev->id;
commit e9c0f0f58ad0a41c3c4b19e1911cfe095afc09ca
Author: Eric Farman <farman at linux.vnet.ibm.com>
Date:   Tue Jan 14 14:16:25 2014 -0500

    virtio-scsi: Cleanup of I/Os that never started
    
    There is still a small window that occurs when a cancel I/O affects
    an asynchronous I/O operation that hasn't started.  In other words,
    when the residual data length equals the expected data length.
    
    Today, the routine virtio_scsi_command_complete fails because the
    VirtIOSCSIReq pointer (from the hba_private field in SCSIRequest)
    was cleared earlier when virtio_scsi_complete_req was called by
    the virtio_scsi_request_cancelled routine.  As a result, the
    virtio_scsi_command_complete routine needs to simply return when
    it is processing a SCSIRequest block that was marked canceled.
    
    Signed-off-by: Eric Farman <farman at linux.vnet.ibm.com>
    Cc: qemu-stable at nongnu.org
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>

diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 6dcdd1b..1da98cd 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -306,6 +306,10 @@ static void virtio_scsi_command_complete(SCSIRequest *r, uint32_t status,
     VirtIOSCSIReq *req = r->hba_private;
     uint32_t sense_len;
 
+    if (r->io_canceled) {
+        return;
+    }
+
     req->resp.cmd->response = VIRTIO_SCSI_S_OK;
     req->resp.cmd->status = status;
     if (req->resp.cmd->status == GOOD) {
commit 33325a53f15ab5370e1917b2a11cadffc77c5a52
Author: Paolo Bonzini <pbonzini at redhat.com>
Date:   Wed Jan 15 10:35:36 2014 +0100

    scsi: Assign cancel_io vector for scsi_disk_emulate_ops
    
    Some emulated disk operations (MODE SELECT, UNMAP, WRITE SAME)
    can trigger asynchronous I/Os.  Provide the cancel_io callback
    to ensure that AIOCBs are properly cleaned up.
    
    Signed-off-by: Eric Farman <farman at linux.vnet.ibm.com>
    Cc: qemu-stable at nongnu.org
    [Tweak commit message. - Paolo]
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index bce617c..ee1f5eb 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2306,6 +2306,7 @@ static const SCSIReqOps scsi_disk_emulate_reqops = {
     .send_command = scsi_disk_emulate_command,
     .read_data    = scsi_disk_emulate_read_data,
     .write_data   = scsi_disk_emulate_write_data,
+    .cancel_io    = scsi_cancel_io,
     .get_buf      = scsi_get_buf,
 };
 
commit 2ba82852894c762299b7d05e9a2be184116b80f0
Author: Marcelo Tosatti <mtosatti at redhat.com>
Date:   Wed Dec 18 16:42:17 2013 -0200

    mempath prefault: fix off-by-one error
    
    Fix off-by-one error (noticed by Andrea Arcangeli).
    
    Reviewed-by: Paolo Bonzini <pbonzini at redhat.com>
    Signed-off-by: Marcelo Tosatti <mtosatti at redhat.com>

diff --git a/exec.c b/exec.c
index 896f7b8..52d451b 100644
--- a/exec.c
+++ b/exec.c
@@ -1001,7 +1001,7 @@ static void *file_ram_alloc(RAMBlock *block,
         }
 
         /* MAP_POPULATE silently ignores failures */
-        for (i = 0; i < (memory/hpagesize)-1; i++) {
+        for (i = 0; i < (memory/hpagesize); i++) {
             memset(area + (hpagesize*i), 0, 1);
         }
 
commit 6bdf863d942a267f984e4bd82be80cb2ac5b9915
Author: Jan Kiszka <jan.kiszka at siemens.com>
Date:   Tue Dec 17 20:05:13 2013 +0100

    kvm: x86: Separately write feature control MSR on reset
    
    If the guest is running in nested mode on system reset, clearing the
    feature MSR signals the kernel to leave this mode. Recent kernels
    processes this properly, but leave the VCPU state undefined behind. It
    is the job of userspace to bring it to a proper shape. Therefore, write
    this specific MSR first so that no state transfer gets lost.
    
    This allows to cleanly reset a guest with VMX in use.
    
    Signed-off-by: Jan Kiszka <jan.kiszka at siemens.com>
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 285e1a3..221c8a0 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1115,6 +1115,25 @@ static int kvm_put_tscdeadline_msr(X86CPU *cpu)
     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
 }
 
+/*
+ * Provide a separate write service for the feature control MSR in order to
+ * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
+ * before writing any other state because forcibly leaving nested mode
+ * invalidates the VCPU state.
+ */
+static int kvm_put_msr_feature_control(X86CPU *cpu)
+{
+    struct {
+        struct kvm_msrs info;
+        struct kvm_msr_entry entry;
+    } msr_data;
+
+    kvm_msr_entry_set(&msr_data.entry, MSR_IA32_FEATURE_CONTROL,
+                      cpu->env.msr_ia32_feature_control);
+    msr_data.info.nmsrs = 1;
+    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
+}
+
 static int kvm_put_msrs(X86CPU *cpu, int level)
 {
     CPUX86State *env = &cpu->env;
@@ -1205,13 +1224,12 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
         if (cpu->hyperv_vapic) {
             kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE, 0);
         }
-        if (has_msr_feature_control) {
-            kvm_msr_entry_set(&msrs[n++], MSR_IA32_FEATURE_CONTROL,
-                              env->msr_ia32_feature_control);
-        }
         if (has_msr_bndcfgs) {
             kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
         }
+
+        /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
+         *       kvm_put_msr_feature_control. */
     }
     if (env->mcg_cap) {
         int i;
@@ -1815,6 +1833,13 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
 
     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
 
+    if (level >= KVM_PUT_RESET_STATE && has_msr_feature_control) {
+        ret = kvm_put_msr_feature_control(x86_cpu);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
     ret = kvm_getput_regs(x86_cpu, 1);
     if (ret < 0) {
         return ret;
commit a94b36ddd6af28593c8a5171b5100e6c3dfc063e
Author: Alexander Graf <agraf at suse.de>
Date:   Thu Dec 12 10:29:19 2013 +0100

    roms: Flush icache when writing roms to guest memory
    
    We use the rom infrastructure to write firmware and/or initial kernel
    blobs into guest address space. So we're basically emulating the cache
    off phase on very early system bootup.
    
    That phase is usually responsible for clearing the instruction cache for
    anything it writes into cachable memory, to ensure that after reboot we
    don't happen to execute stale bits from the instruction cache.
    
    So we need to invalidate the icache every time we write a rom into guest
    address space. We do not need to do this for every DMA since the guest
    expects it has to flush the icache manually in that case.
    
    This fixes random reboot issues on e5500 (booke ppc) for me.
    
    Signed-off-by: Alexander Graf <agraf at suse.de>
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>

diff --git a/exec.c b/exec.c
index f4b9ef2..896f7b8 100644
--- a/exec.c
+++ b/exec.c
@@ -50,6 +50,7 @@
 #include "translate-all.h"
 
 #include "exec/memory-internal.h"
+#include "qemu/cache-utils.h"
 
 //#define DEBUG_SUBPAGE
 
@@ -2010,9 +2011,13 @@ void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
     address_space_rw(&address_space_memory, addr, buf, len, is_write);
 }
 
-/* used for ROM loading : can write in RAM and ROM */
-void cpu_physical_memory_write_rom(hwaddr addr,
-                                   const uint8_t *buf, int len)
+enum write_rom_type {
+    WRITE_DATA,
+    FLUSH_CACHE,
+};
+
+static inline void cpu_physical_memory_write_rom_internal(
+    hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
 {
     hwaddr l;
     uint8_t *ptr;
@@ -2031,8 +2036,15 @@ void cpu_physical_memory_write_rom(hwaddr addr,
             addr1 += memory_region_get_ram_addr(mr);
             /* ROM/RAM case */
             ptr = qemu_get_ram_ptr(addr1);
-            memcpy(ptr, buf, l);
-            invalidate_and_set_dirty(addr1, l);
+            switch (type) {
+            case WRITE_DATA:
+                memcpy(ptr, buf, l);
+                invalidate_and_set_dirty(addr1, l);
+                break;
+            case FLUSH_CACHE:
+                flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
+                break;
+            }
         }
         len -= l;
         buf += l;
@@ -2040,6 +2052,28 @@ void cpu_physical_memory_write_rom(hwaddr addr,
     }
 }
 
+/* used for ROM loading : can write in RAM and ROM */
+void cpu_physical_memory_write_rom(hwaddr addr,
+                                   const uint8_t *buf, int len)
+{
+    cpu_physical_memory_write_rom_internal(addr, buf, len, WRITE_DATA);
+}
+
+void cpu_flush_icache_range(hwaddr start, int len)
+{
+    /*
+     * This function should do the same thing as an icache flush that was
+     * triggered from within the guest. For TCG we are always cache coherent,
+     * so there is no need to flush anything. For KVM / Xen we need to flush
+     * the host's instruction cache at least.
+     */
+    if (tcg_enabled()) {
+        return;
+    }
+
+    cpu_physical_memory_write_rom_internal(start, NULL, len, FLUSH_CACHE);
+}
+
 typedef struct {
     MemoryRegion *mr;
     void *buffer;
diff --git a/hw/core/loader.c b/hw/core/loader.c
index 60d2ebd..0634bee 100644
--- a/hw/core/loader.c
+++ b/hw/core/loader.c
@@ -785,6 +785,13 @@ static void rom_reset(void *unused)
             g_free(rom->data);
             rom->data = NULL;
         }
+        /*
+         * The rom loader is really on the same level as firmware in the guest
+         * shadowing a ROM into RAM. Such a shadowing mechanism needs to ensure
+         * that the instruction cache for that new region is clear, so that the
+         * CPU definitely fetches its instructions from the just written data.
+         */
+        cpu_flush_icache_range(rom->addr, rom->datasize);
     }
 }
 
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index e4996e1..8f33122 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -110,6 +110,7 @@ void stq_phys(hwaddr addr, uint64_t val);
 
 void cpu_physical_memory_write_rom(hwaddr addr,
                                    const uint8_t *buf, int len);
+void cpu_flush_icache_range(hwaddr start, int len);
 
 extern struct MemoryRegion io_mem_rom;
 extern struct MemoryRegion io_mem_notdirty;
commit 0522604b09b8cff54ba2450a7478da2a4d084817
Author: Fernando Luis Vázquez Cao <fernando_b1 at lab.ntt.co.jp>
Date:   Fri Dec 6 17:33:01 2013 +0900

    target-i386: clear guest TSC on reset
    
    VCPU TSC is not cleared by a warm reset (*), which leaves some types of Linux
     guests (non-pvops guests and those with the kernel parameter no-kvmclock set)
    vulnerable to the overflow in cyc2ns_offset fixed by upstream commit
    9993bc635d01a6ee7f6b833b4ee65ce7c06350b1 ("sched/x86: Fix overflow in
    cyc2ns_offset").
    
    To put it in a nutshell, if such a Linux guest without the patch above applied
    has been up more than 208 days and attempts a warm reset chances are that
    the newly booted kernel will panic or hang.
    
    (*) Intel Xeon E5 processors show the same broken behavior due to
        the errata "TSC is Not Affected by Warm Reset" (Intel® Xeon®
        Processor E5 Family Specification Update - August 2013): "The
        TSC (Time Stamp Counter MSR 10H) should be cleared on
        reset. Due to this erratum the TSC is not affected by warm
        reset."
    
    Cc: Will Auld <will.auld at intel.com>
    Cc: Marcelo Tosatti <mtosatti at redhat.com>
    Signed-off-by: Fernando Luis Vazquez Cao <fernando at oss.ntt.co.jp>
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
    Signed-off-by: Fernando Luis Vázquez Cao <fernando_b1 at lab.ntt.co.jp>

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 5076a94..bc4cb9d 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -2450,6 +2450,9 @@ static void x86_cpu_reset(CPUState *s)
     cpu_breakpoint_remove_all(env, BP_CPU);
     cpu_watchpoint_remove_all(env, BP_CPU);
 
+    env->tsc_adjust = 0;
+    env->tsc = 0;
+
 #if !defined(CONFIG_USER_ONLY)
     /* We hard-wire the BSP to the first CPU. */
     if (s->cpu_index == 0) {
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 312a46b..285e1a3 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1150,14 +1150,12 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
         kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
     }
 #endif
-    if (level == KVM_PUT_FULL_STATE) {
-        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
-    }
     /*
      * The following MSRs have side effects on the guest or are too heavy
      * for normal writeback. Limit them to reset or full state updates.
      */
     if (level >= KVM_PUT_RESET_STATE) {
+        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
         kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
                           env->system_time_msr);
         kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
commit f86746c263753cf7a7e4bdb8829c70272dfcf36c
Author: Fernando Luis Vázquez Cao <fernando_b1 at lab.ntt.co.jp>
Date:   Fri Dec 6 17:38:24 2013 +0900

    target-i386: do not special case TSC writeback
    
    Newer kernels are capable of synchronizing TSC values of multiple VCPUs
    on writeback, but we were excluding the power up case, which is not needed
    anymore.
    
    Signed-off-by: Fernando Luis Vazquez Cao <fernando at oss.ntt.co.jp>
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
    
    Signed-off-by: Fernando Luis Vázquez Cao <fernando_b1 at lab.ntt.co.jp>

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 01ebca2..312a46b 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1151,15 +1151,7 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
     }
 #endif
     if (level == KVM_PUT_FULL_STATE) {
-        /*
-         * KVM is yet unable to synchronize TSC values of multiple VCPUs on
-         * writeback. Until this is fixed, we only write the offset to SMP
-         * guests after migration, desynchronizing the VCPUs, but avoiding
-         * huge jump-backs that would occur without any writeback at all.
-         */
-        if (smp_cpus == 1 || env->tsc != 0) {
-            kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
-        }
+        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
     }
     /*
      * The following MSRs have side effects on the guest or are too heavy
commit 79e9ebebbf2a00c46fcedb6dc7dd5e12bbd30216
Author: Liu Jinsong <jinsong.liu at intel.com>
Date:   Thu Dec 5 08:32:12 2013 +0800

    target-i386: Intel MPX
    
    Add some MPX related definiation, and hardcode sizes and offsets
    of xsave features 3 and 4. It also add corresponding part to
    kvm_get/put_xsave, and vmstate.
    
    Signed-off-by: Liu Jinsong <jinsong.liu at intel.com>
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index bb98f6d..5076a94 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -336,6 +336,10 @@ typedef struct ExtSaveArea {
 static const ExtSaveArea ext_save_areas[] = {
     [2] = { .feature = FEAT_1_ECX, .bits = CPUID_EXT_AVX,
             .offset = 0x240, .size = 0x100 },
+    [3] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_MPX,
+            .offset = 0x3c0, .size = 0x40  },
+    [4] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_MPX,
+            .offset = 0x400, .size = 0x10  },
 };
 
 const char *get_register_name_32(unsigned int reg)
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index ea373e8..bbec228 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -380,9 +380,14 @@
 
 #define MSR_VM_HSAVE_PA                 0xc0010117
 
-#define XSTATE_FP                       1
-#define XSTATE_SSE                      2
-#define XSTATE_YMM                      4
+#define MSR_IA32_BNDCFGS                0x00000d90
+
+#define XSTATE_FP                       (1ULL << 0)
+#define XSTATE_SSE                      (1ULL << 1)
+#define XSTATE_YMM                      (1ULL << 2)
+#define XSTATE_BNDREGS                  (1ULL << 3)
+#define XSTATE_BNDCSR                   (1ULL << 4)
+
 
 /* CPUID feature words */
 typedef enum FeatureWord {
@@ -545,6 +550,7 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_7_0_EBX_ERMS     (1 << 9)
 #define CPUID_7_0_EBX_INVPCID  (1 << 10)
 #define CPUID_7_0_EBX_RTM      (1 << 11)
+#define CPUID_7_0_EBX_MPX      (1 << 14)
 #define CPUID_7_0_EBX_RDSEED   (1 << 18)
 #define CPUID_7_0_EBX_ADX      (1 << 19)
 #define CPUID_7_0_EBX_SMAP     (1 << 20)
@@ -695,6 +701,16 @@ typedef union {
     uint64_t q;
 } MMXReg;
 
+typedef struct BNDReg {
+    uint64_t lb;
+    uint64_t ub;
+} BNDReg;
+
+typedef struct BNDCSReg {
+    uint64_t cfgu;
+    uint64_t sts;
+} BNDCSReg;
+
 #ifdef HOST_WORDS_BIGENDIAN
 #define XMM_B(n) _b[15 - (n)]
 #define XMM_W(n) _w[7 - (n)]
@@ -912,6 +928,9 @@ typedef struct CPUX86State {
 
     uint64_t xstate_bv;
     XMMReg ymmh_regs[CPU_NB_REGS];
+    BNDReg bnd_regs[4];
+    BNDCSReg bndcs_regs;
+    uint64_t msr_bndcfgs;
 
     uint64_t xcr0;
 
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 1188482..01ebca2 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -69,6 +69,7 @@ static bool has_msr_feature_control;
 static bool has_msr_async_pf_en;
 static bool has_msr_pv_eoi_en;
 static bool has_msr_misc_enable;
+static bool has_msr_bndcfgs;
 static bool has_msr_kvm_steal_time;
 static int lm_capable_kernel;
 
@@ -772,6 +773,10 @@ static int kvm_get_supported_msrs(KVMState *s)
                     has_msr_misc_enable = true;
                     continue;
                 }
+                if (kvm_msr_list->indices[i] == MSR_IA32_BNDCFGS) {
+                    has_msr_bndcfgs = true;
+                    continue;
+                }
             }
         }
 
@@ -975,6 +980,8 @@ static int kvm_put_fpu(X86CPU *cpu)
 #define XSAVE_XMM_SPACE   40
 #define XSAVE_XSTATE_BV   128
 #define XSAVE_YMMH_SPACE  144
+#define XSAVE_BNDREGS     240
+#define XSAVE_BNDCSR      256
 
 static int kvm_put_xsave(X86CPU *cpu)
 {
@@ -1007,6 +1014,10 @@ static int kvm_put_xsave(X86CPU *cpu)
     *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV] = env->xstate_bv;
     memcpy(&xsave->region[XSAVE_YMMH_SPACE], env->ymmh_regs,
             sizeof env->ymmh_regs);
+    memcpy(&xsave->region[XSAVE_BNDREGS], env->bnd_regs,
+            sizeof env->bnd_regs);
+    memcpy(&xsave->region[XSAVE_BNDCSR], &env->bndcs_regs,
+            sizeof(env->bndcs_regs));
     r = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
     return r;
 }
@@ -1208,6 +1219,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
             kvm_msr_entry_set(&msrs[n++], MSR_IA32_FEATURE_CONTROL,
                               env->msr_ia32_feature_control);
         }
+        if (has_msr_bndcfgs) {
+            kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
+        }
     }
     if (env->mcg_cap) {
         int i;
@@ -1289,6 +1303,10 @@ static int kvm_get_xsave(X86CPU *cpu)
     env->xstate_bv = *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV];
     memcpy(env->ymmh_regs, &xsave->region[XSAVE_YMMH_SPACE],
             sizeof env->ymmh_regs);
+    memcpy(env->bnd_regs, &xsave->region[XSAVE_BNDREGS],
+            sizeof env->bnd_regs);
+    memcpy(&env->bndcs_regs, &xsave->region[XSAVE_BNDCSR],
+            sizeof(env->bndcs_regs));
     return 0;
 }
 
@@ -1435,6 +1453,9 @@ static int kvm_get_msrs(X86CPU *cpu)
     if (has_msr_feature_control) {
         msrs[n++].index = MSR_IA32_FEATURE_CONTROL;
     }
+    if (has_msr_bndcfgs) {
+        msrs[n++].index = MSR_IA32_BNDCFGS;
+    }
 
     if (!env->tsc_valid) {
         msrs[n++].index = MSR_IA32_TSC;
@@ -1550,6 +1571,9 @@ static int kvm_get_msrs(X86CPU *cpu)
         case MSR_IA32_FEATURE_CONTROL:
             env->msr_ia32_feature_control = msrs[i].data;
             break;
+        case MSR_IA32_BNDCFGS:
+            env->msr_bndcfgs = msrs[i].data;
+            break;
         default:
             if (msrs[i].index >= MSR_MC0_CTL &&
                 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
diff --git a/target-i386/machine.c b/target-i386/machine.c
index e568da2..2de1964 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -63,6 +63,21 @@ static const VMStateDescription vmstate_ymmh_reg = {
 #define VMSTATE_YMMH_REGS_VARS(_field, _state, _n, _v)                         \
     VMSTATE_STRUCT_ARRAY(_field, _state, _n, _v, vmstate_ymmh_reg, XMMReg)
 
+static const VMStateDescription vmstate_bnd_regs = {
+    .name = "bnd_regs",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields      = (VMStateField[]) {
+        VMSTATE_UINT64(lb, BNDReg),
+        VMSTATE_UINT64(ub, BNDReg),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+#define VMSTATE_BND_REGS(_field, _state, _n)          \
+    VMSTATE_STRUCT_ARRAY(_field, _state, _n, 0, vmstate_bnd_regs, BNDReg)
+
 static const VMStateDescription vmstate_mtrr_var = {
     .name = "mtrr_var",
     .version_id = 1,
@@ -506,6 +521,39 @@ static const VMStateDescription vmstate_msr_architectural_pmu = {
     }
 };
 
+static bool mpx_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+    CPUX86State *env = &cpu->env;
+    unsigned int i;
+
+    for (i = 0; i < 4; i++) {
+        if (env->bnd_regs[i].lb || env->bnd_regs[i].ub) {
+            return true;
+        }
+    }
+
+    if (env->bndcs_regs.cfgu || env->bndcs_regs.sts) {
+        return true;
+    }
+
+    return !!env->msr_bndcfgs;
+}
+
+static const VMStateDescription vmstate_mpx = {
+    .name = "cpu/mpx",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields      = (VMStateField[]) {
+        VMSTATE_BND_REGS(env.bnd_regs, X86CPU, 4),
+        VMSTATE_UINT64(env.bndcs_regs.cfgu, X86CPU),
+        VMSTATE_UINT64(env.bndcs_regs.sts, X86CPU),
+        VMSTATE_UINT64(env.msr_bndcfgs, X86CPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 const VMStateDescription vmstate_x86_cpu = {
     .name = "cpu",
     .version_id = 12,
@@ -638,6 +686,9 @@ const VMStateDescription vmstate_x86_cpu = {
             .vmsd = &vmstate_msr_architectural_pmu,
             .needed = pmu_enable_needed,
         } , {
+            .vmsd = &vmstate_mpx,
+            .needed = mpx_needed,
+        } , {
             /* empty */
         }
     }


More information about the Spice-commits mailing list