From 8e1b02b8ef2eefcb2ff3855531d7bc2ea71e1fb4 Mon Sep 17 00:00:00 2001
From: Jeff Cody <jcody@redhat.com>
Date: Mon, 29 Apr 2013 14:48:16 -0400
Subject: [PATCH 01/15] qemu: add castagnoli crc32c checksum algorithm

This adds the Castagnoli CRC32C algorithm, using the 0x11EDC6F41
polynomial.

This is extracted from the linux kernel cryptographic crc32.c module.

The algorithm is based on:

Castagnoli93: Guy Castagnoli and Stefan Braeuer and Martin Herrman
             "Optimization of Cyclic Redundancy-Check Codes with 24
              and 32 Parity Bits", IEEE Transactions on Communication,
              Volume 41, Number 6, June 1993

Signed-off-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/crc32c.h |  35 +++++++++++++
 util/Makefile.objs    |   1 +
 util/crc32c.c         | 115 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 151 insertions(+)
 create mode 100644 include/qemu/crc32c.h
 create mode 100644 util/crc32c.c

diff --git a/include/qemu/crc32c.h b/include/qemu/crc32c.h
new file mode 100644
index 0000000000..56d1c3bfde
--- /dev/null
+++ b/include/qemu/crc32c.h
@@ -0,0 +1,35 @@
+/*
+ *  Castagnoli CRC32C Checksum Algorithm
+ *
+ *  Polynomial: 0x11EDC6F41
+ *
+ *  Castagnoli93: Guy Castagnoli and Stefan Braeuer and Martin Herrman
+ *               "Optimization of Cyclic Redundancy-Check Codes with 24
+ *                 and 32 Parity Bits",IEEE Transactions on Communication,
+ *                Volume 41, Number 6, June 1993
+ *
+ *  Copyright (c) 2013 Red Hat, Inc.,
+ *
+ *  Authors:
+ *   Jeff Cody <jcody@redhat.com>
+ *
+ *  Based on the Linux kernel cryptographic crc32c module,
+ *
+ *  Copyright (c) 2004 Cisco Systems, Inc.
+ *  Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#ifndef QEMU_CRC32_H
+#define QEMU_CRC32_H
+
+#include "qemu-common.h"
+
+uint32_t crc32c(uint32_t crc, const uint8_t *data, unsigned int length);
+
+#endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index c5652f5260..4a1bd4ee3a 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -10,3 +10,4 @@ util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-y += iov.o aes.o qemu-config.o qemu-sockets.o uri.o notify.o
 util-obj-y += qemu-option.o qemu-progress.o
 util-obj-y += hexdump.o
+util-obj-y += crc32c.o
diff --git a/util/crc32c.c b/util/crc32c.c
new file mode 100644
index 0000000000..8866327801
--- /dev/null
+++ b/util/crc32c.c
@@ -0,0 +1,115 @@
+/*
+ *  Castagnoli CRC32C Checksum Algorithm
+ *
+ *  Polynomial: 0x11EDC6F41
+ *
+ *  Castagnoli93: Guy Castagnoli and Stefan Braeuer and Martin Herrman
+ *               "Optimization of Cyclic Redundancy-Check Codes with 24
+ *                 and 32 Parity Bits",IEEE Transactions on Communication,
+ *                Volume 41, Number 6, June 1993
+ *
+ *  Copyright (c) 2013 Red Hat, Inc.,
+ *
+ *  Authors:
+ *   Jeff Cody <jcody@redhat.com>
+ *
+ *  Based on the Linux kernel cryptographic crc32c module,
+ *
+ *  Copyright (c) 2004 Cisco Systems, Inc.
+ *  Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include "qemu-common.h"
+#include "qemu/crc32c.h"
+
+/*
+ * This is the CRC-32C table
+ * Generated with:
+ * width = 32 bits
+ * poly = 0x1EDC6F41
+ * reflect input bytes = true
+ * reflect output bytes = true
+ */
+
+static const uint32_t crc32c_table[256] = {
+    0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
+    0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
+    0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
+    0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
+    0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
+    0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
+    0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+    0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
+    0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+    0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
+    0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
+    0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
+    0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
+    0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+    0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
+    0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
+    0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
+    0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+    0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
+    0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
+    0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+    0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
+    0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
+    0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
+    0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
+    0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
+    0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+    0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+    0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
+    0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
+    0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
+    0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
+    0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
+    0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
+    0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+    0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+    0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
+    0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
+    0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
+    0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
+    0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
+    0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+    0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
+    0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
+    0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+    0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
+    0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
+    0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
+    0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+    0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
+    0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
+    0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
+    0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
+    0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+    0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
+    0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+    0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
+    0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+    0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+    0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+    0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+    0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+    0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+    0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+
+uint32_t crc32c(uint32_t crc, const uint8_t *data, unsigned int length)
+{
+    while (length--) {
+        crc = crc32c_table[(crc ^ *data++) & 0xFFL] ^ (crc >> 8);
+    }
+    return crc^0xffffffff;
+}
+

From 203cdba3bc6fb6022dbece90483fe03ed33bcffb Mon Sep 17 00:00:00 2001
From: Jeff Cody <jcody@redhat.com>
Date: Mon, 29 Apr 2013 14:48:17 -0400
Subject: [PATCH 02/15] block: vhdx header for the QEMU support of VHDX images

This is based on Microsoft's VHDX specification:
    "VHDX Format Specification v0.95", published 4/12/2012
    https://www.microsoft.com/en-us/download/details.aspx?id=29681

These structures define the various header, metadata, and other
block structures defined in the VHDX specification.

Signed-off-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vhdx.h | 311 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 311 insertions(+)
 create mode 100644 block/vhdx.h

diff --git a/block/vhdx.h b/block/vhdx.h
new file mode 100644
index 0000000000..fcddd37d98
--- /dev/null
+++ b/block/vhdx.h
@@ -0,0 +1,311 @@
+/*
+ * Block driver for Hyper-V VHDX Images
+ *
+ * Copyright (c) 2013 Red Hat, Inc.,
+ *
+ * Authors:
+ *  Jeff Cody <jcody@redhat.com>
+ *
+ *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
+ *  by Microsoft:
+ *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_VHDX_H
+#define BLOCK_VHDX_H
+
+/* Structures and fields present in the VHDX file */
+
+/* The header section has the following blocks,
+ * each block is 64KB:
+ *
+ * _____________________________________________________________________________
+ * | File Id. |   Header 1    | Header 2   | Region Table |  Reserved (768KB)  |
+ * |----------|---------------|------------|--------------|--------------------|
+ * |          |               |            |              |                    |
+ * 0.........64KB...........128KB........192KB..........256KB................1MB
+ */
+
+#define VHDX_HEADER_BLOCK_SIZE      (64*1024)
+
+#define VHDX_FILE_ID_OFFSET         0
+#define VHDX_HEADER1_OFFSET         (VHDX_HEADER_BLOCK_SIZE*1)
+#define VHDX_HEADER2_OFFSET         (VHDX_HEADER_BLOCK_SIZE*2)
+#define VHDX_REGION_TABLE_OFFSET    (VHDX_HEADER_BLOCK_SIZE*3)
+
+
+/*
+ * A note on the use of MS-GUID fields.  For more details on the GUID,
+ * please see: https://en.wikipedia.org/wiki/Globally_unique_identifier.
+ *
+ * The VHDX specification only states that these are MS GUIDs, and which
+ * bytes are data1-data4. It makes no mention of what algorithm should be used
+ * to generate the GUID, nor what standard.  However, looking at the specified
+ * known GUID fields, it appears the GUIDs are:
+ *  Standard/DCE GUID type  (noted by 10b in the MSB of byte 0 of .data4)
+ *  Random algorithm        (noted by 0x4XXX for .data3)
+ */
+
+/* ---- HEADER SECTION STRUCTURES ---- */
+
+/* These structures are ones that are defined in the VHDX specification
+ * document */
+
+typedef struct VHDXFileIdentifier {
+    uint64_t    signature;              /* "vhdxfile" in ASCII */
+    uint16_t    creator[256];           /* optional; utf-16 string to identify
+                                           the vhdx file creator.  Diagnotistic
+                                           only */
+} VHDXFileIdentifier;
+
+
+/* the guid is a 16 byte unique ID - the definition for this used by
+ * Microsoft is not just 16 bytes though - it is a structure that is defined,
+ * so we need to follow it here so that endianness does not trip us up */
+
+typedef struct MSGUID {
+    uint32_t  data1;
+    uint16_t  data2;
+    uint16_t  data3;
+    uint8_t   data4[8];
+} MSGUID;
+
+#define guid_eq(a, b) \
+    (memcmp(&(a), &(b), sizeof(MSGUID)) == 0)
+
+#define VHDX_HEADER_SIZE (4*1024)   /* although the vhdx_header struct in disk
+                                       is only 582 bytes, for purposes of crc
+                                       the header is the first 4KB of the 64KB
+                                       block */
+
+/* The full header is 4KB, although the actual header data is much smaller.
+ * But for the checksum calculation, it is over the entire 4KB structure,
+ * not just the defined portion of it */
+typedef struct QEMU_PACKED VHDXHeader {
+    uint32_t    signature;              /* "head" in ASCII */
+    uint32_t    checksum;               /* CRC-32C hash of the whole header */
+    uint64_t    sequence_number;        /* Seq number of this header.  Each
+                                           VHDX file has 2 of these headers,
+                                           and only the header with the highest
+                                           sequence number is valid */
+    MSGUID      file_write_guid;       /* 128 bit unique identifier. Must be
+                                           updated to new, unique value before
+                                           the first modification is made to
+                                           file */
+    MSGUID      data_write_guid;        /* 128 bit unique identifier. Must be
+                                           updated to new, unique value before
+                                           the first modification is made to
+                                           visible data.   Visbile data is
+                                           defined as:
+                                                    - system & user metadata
+                                                    - raw block data
+                                                    - disk size
+                                                    - any change that will
+                                                      cause the virtual disk
+                                                      sector read to differ
+
+                                           This does not need to change if
+                                           blocks are re-arranged */
+    MSGUID      log_guid;               /* 128 bit unique identifier. If zero,
+                                           there is no valid log. If non-zero,
+                                           log entries with this guid are
+                                           valid. */
+    uint16_t    log_version;            /* version of the log format. Mustn't be
+                                           zero, unless log_guid is also zero */
+    uint16_t    version;                /* version of th evhdx file.  Currently,
+                                           only supported version is "1" */
+    uint32_t    log_length;             /* length of the log.  Must be multiple
+                                           of 1MB */
+    uint64_t    log_offset;             /* byte offset in the file of the log.
+                                           Must also be a multiple of 1MB */
+} VHDXHeader;
+
+/* Header for the region table block */
+typedef struct QEMU_PACKED VHDXRegionTableHeader {
+    uint32_t    signature;              /* "regi" in ASCII */
+    uint32_t    checksum;               /* CRC-32C hash of the 64KB table */
+    uint32_t    entry_count;            /* number of valid entries */
+    uint32_t    reserved;
+} VHDXRegionTableHeader;
+
+/* Individual region table entry.  There may be a maximum of 2047 of these
+ *
+ *  There are two known region table properties.  Both are required.
+ *  BAT (block allocation table):  2DC27766F62342009D64115E9BFD4A08
+ *  Metadata:                      8B7CA20647904B9AB8FE575F050F886E
+ */
+#define VHDX_REGION_ENTRY_REQUIRED  0x01    /* if set, parser must understand
+                                               this entry in order to open
+                                               file */
+typedef struct QEMU_PACKED VHDXRegionTableEntry {
+    MSGUID      guid;                   /* 128-bit unique identifier */
+    uint64_t    file_offset;            /* offset of the object in the file.
+                                           Must be multiple of 1MB */
+    uint32_t    length;                 /* length, in bytes, of the object */
+    uint32_t    data_bits;
+} VHDXRegionTableEntry;
+
+
+/* ---- LOG ENTRY STRUCTURES ---- */
+#define VHDX_LOG_HDR_SIZE 64
+typedef struct QEMU_PACKED VHDXLogEntryHeader {
+    uint32_t    signature;              /* "loge" in ASCII */
+    uint32_t    checksum;               /* CRC-32C hash of the 64KB table */
+    uint32_t    entry_length;           /* length in bytes, multiple of 1MB */
+    uint32_t    tail;                   /* byte offset of first log entry of a
+                                           seq, where this entry is the last
+                                           entry */
+    uint64_t    sequence_number;        /* incremented with each log entry.
+                                           May not be zero. */
+    uint32_t    descriptor_count;       /* number of descriptors in this log
+                                           entry, must be >= 0 */
+    uint32_t    reserved;
+    MSGUID      log_guid;               /* value of the log_guid from
+                                           vhdx_header.  If not found in
+                                           vhdx_header, it is invalid */
+    uint64_t    flushed_file_offset;    /* see spec for full details - this
+                                           sould be vhdx file size in bytes */
+    uint64_t    last_file_offset;       /* size in bytes that all allocated
+                                           file structures fit into */
+} VHDXLogEntryHeader;
+
+#define VHDX_LOG_DESC_SIZE 32
+
+typedef struct QEMU_PACKED VHDXLogDescriptor {
+    uint32_t    signature;              /* "zero" or "desc" in ASCII */
+    union  {
+        uint32_t    reserved;           /* zero desc */
+        uint32_t    trailing_bytes;     /* data desc: bytes 4092-4096 of the
+                                           data sector */
+    };
+    union {
+        uint64_t    zero_length;        /* zero desc: length of the section to
+                                           zero */
+        uint64_t    leading_bytes;      /* data desc: bytes 0-7 of the data
+                                           sector */
+    };
+    uint64_t    file_offset;            /* file offset to write zeros - multiple
+                                           of 4kB */
+    uint64_t    sequence_number;        /* must match same field in
+                                           vhdx_log_entry_header */
+} VHDXLogDescriptor;
+
+typedef struct QEMU_PACKED VHDXLogDataSector {
+    uint32_t    data_signature;         /* "data" in ASCII */
+    uint32_t    sequence_high;          /* 4 MSB of 8 byte sequence_number */
+    uint8_t     data[4084];             /* raw data, bytes 8-4091 (inclusive).
+                                           see the data descriptor field for the
+                                           other mising bytes */
+    uint32_t    sequence_low;           /* 4 LSB of 8 byte sequence_number */
+} VHDXLogDataSector;
+
+
+
+/* block states - different state values depending on whether it is a
+ * payload block, or a sector block. */
+
+#define PAYLOAD_BLOCK_NOT_PRESENT       0
+#define PAYLOAD_BLOCK_UNDEFINED         1
+#define PAYLOAD_BLOCK_ZERO              2
+#define PAYLOAD_BLOCK_UNMAPPED          5
+#define PAYLOAD_BLOCK_FULL_PRESENT      6
+#define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7
+
+#define SB_BLOCK_NOT_PRESENT    0
+#define SB_BLOCK_PRESENT        6
+
+/* per the spec */
+#define VHDX_MAX_SECTORS_PER_BLOCK  (1<<23)
+
+/* upper 44 bits are the file offset in 1MB units lower 3 bits are the state
+   other bits are reserved */
+#define VHDX_BAT_STATE_BIT_MASK 0x07
+#define VHDX_BAT_FILE_OFF_BITS (64-44)
+typedef uint64_t VHDXBatEntry;
+
+/* ---- METADATA REGION STRUCTURES ---- */
+
+#define VHDX_METADATA_ENTRY_SIZE 32
+#define VHDX_METADATA_MAX_ENTRIES 2047  /* not including the header */
+#define VHDX_METADATA_TABLE_MAX_SIZE \
+    (VHDX_METADATA_ENTRY_SIZE * (VHDX_METADATA_MAX_ENTRIES+1))
+typedef struct QEMU_PACKED VHDXMetadataTableHeader {
+    uint64_t    signature;              /* "metadata" in ASCII */
+    uint16_t    reserved;
+    uint16_t    entry_count;            /* number table entries. <= 2047 */
+    uint32_t    reserved2[5];
+} VHDXMetadataTableHeader;
+
+#define VHDX_META_FLAGS_IS_USER         0x01    /* max 1024 entries */
+#define VHDX_META_FLAGS_IS_VIRTUAL_DISK 0x02    /* virtual disk metadata if set,
+                                                   otherwise file metdata */
+#define VHDX_META_FLAGS_IS_REQUIRED     0x04    /* parse must understand this
+                                                   entry to open the file */
+typedef struct QEMU_PACKED VHDXMetadataTableEntry {
+    MSGUID      item_id;                /* 128-bit identifier for metadata */
+    uint32_t    offset;                 /* byte offset of the metadata.  At
+                                           least 64kB.  Relative to start of
+                                           metadata region */
+                                        /* note: if length = 0, so is offset */
+    uint32_t    length;                 /* length of metadata. <= 1MB. */
+    uint32_t    data_bits;      /* least-significant 3 bits are flags, the
+                                   rest are reserved (see above) */
+    uint32_t    reserved2;
+} VHDXMetadataTableEntry;
+
+#define VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED 0x01   /* Do not change any blocks to
+                                                   be BLOCK_NOT_PRESENT.
+                                                   If set indicates a fixed
+                                                   size VHDX file */
+#define VHDX_PARAMS_HAS_PARENT           0x02    /* has parent / backing file */
+typedef struct QEMU_PACKED VHDXFileParameters {
+    uint32_t    block_size;             /* size of each payload block, always
+                                           power of 2, <= 256MB and >= 1MB. */
+    uint32_t data_bits;     /* least-significant 2 bits are flags, the rest
+                               are reserved (see above) */
+} VHDXFileParameters;
+
+typedef struct QEMU_PACKED VHDXVirtualDiskSize {
+    uint64_t    virtual_disk_size;      /* Size of the virtual disk, in bytes.
+                                           Must be multiple of the sector size,
+                                           max of 64TB */
+} VHDXVirtualDiskSize;
+
+typedef struct QEMU_PACKED VHDXPage83Data {
+    MSGUID      page_83_data[16];       /* unique id for scsi devices that
+                                           support page 0x83 */
+} VHDXPage83Data;
+
+typedef struct QEMU_PACKED VHDXVirtualDiskLogicalSectorSize {
+    uint32_t    logical_sector_size;    /* virtual disk sector size (in bytes).
+                                           Can only be 512 or 4096 bytes */
+} VHDXVirtualDiskLogicalSectorSize;
+
+typedef struct QEMU_PACKED VHDXVirtualDiskPhysicalSectorSize {
+    uint32_t    physical_sector_size;   /* physical sector size (in bytes).
+                                           Can only be 512 or 4096 bytes */
+} VHDXVirtualDiskPhysicalSectorSize;
+
+typedef struct QEMU_PACKED VHDXParentLocatorHeader {
+    MSGUID      locator_type[16];       /* type of the parent virtual disk. */
+    uint16_t    reserved;
+    uint16_t    key_value_count;        /* number of key/value pairs for this
+                                           locator */
+} VHDXParentLocatorHeader;
+
+/* key and value strings are UNICODE strings, UTF-16 LE encoding, no NULs */
+typedef struct QEMU_PACKED VHDXParentLocatorEntry {
+    uint32_t    key_offset;             /* offset in metadata for key, > 0 */
+    uint32_t    value_offset;           /* offset in metadata for value, >0 */
+    uint16_t    key_length;             /* length of entry key, > 0 */
+    uint16_t    value_length;           /* length of entry value, > 0 */
+} VHDXParentLocatorEntry;
+
+
+/* ----- END VHDX SPECIFICATION STRUCTURES ---- */
+
+#endif

From e8d4e5ffdb015959551726a5700c19b5d772ada4 Mon Sep 17 00:00:00 2001
From: Jeff Cody <jcody@redhat.com>
Date: Mon, 29 Apr 2013 14:48:18 -0400
Subject: [PATCH 03/15] block: initial VHDX driver support framework - supports
 open and probe

This is the initial block driver framework for VHDX image support
(i.e. Hyper-V image file formats), that supports opening VHDX files, and
parsing the headers.

This commit does not yet enable:
    - reading
    - writing
    - updating the header
    - differencing files (images with parents)
    - log replay / dirty logs (only clean images)

This is based on Microsoft's VHDX specification:
    "VHDX Format Specification v0.95", published 4/12/2012
    https://www.microsoft.com/en-us/download/details.aspx?id=29681

Signed-off-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/Makefile.objs |   1 +
 block/vhdx.c        | 853 ++++++++++++++++++++++++++++++++++++++++++++
 block/vhdx.h        |  14 +
 3 files changed, 868 insertions(+)
 create mode 100644 block/vhdx.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 6c4b5bc6b0..5f0358a3d7 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -2,6 +2,7 @@ block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
+block-obj-y += vhdx.o
 block-obj-y += parallels.o blkdebug.o blkverify.o
 block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
diff --git a/block/vhdx.c b/block/vhdx.c
new file mode 100644
index 0000000000..0ee10a76ba
--- /dev/null
+++ b/block/vhdx.c
@@ -0,0 +1,853 @@
+/*
+ * Block driver for Hyper-V VHDX Images
+ *
+ * Copyright (c) 2013 Red Hat, Inc.,
+ *
+ * Authors:
+ *  Jeff Cody <jcody@redhat.com>
+ *
+ *  This is based on the "VHDX Format Specification v0.95", published 4/12/2012
+ *  by Microsoft:
+ *      https://www.microsoft.com/en-us/download/details.aspx?id=29681
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block/block_int.h"
+#include "qemu/module.h"
+#include "qemu/crc32c.h"
+#include "block/vhdx.h"
+
+
+/* Several metadata and region table data entries are identified by
+ * guids in  a MS-specific GUID format. */
+
+
+/* ------- Known Region Table GUIDs ---------------------- */
+static const MSGUID bat_guid =      { .data1 = 0x2dc27766,
+                                      .data2 = 0xf623,
+                                      .data3 = 0x4200,
+                                      .data4 = { 0x9d, 0x64, 0x11, 0x5e,
+                                                 0x9b, 0xfd, 0x4a, 0x08} };
+
+static const MSGUID metadata_guid = { .data1 = 0x8b7ca206,
+                                      .data2 = 0x4790,
+                                      .data3 = 0x4b9a,
+                                      .data4 = { 0xb8, 0xfe, 0x57, 0x5f,
+                                                 0x05, 0x0f, 0x88, 0x6e} };
+
+
+
+/* ------- Known Metadata Entry GUIDs ---------------------- */
+static const MSGUID file_param_guid =   { .data1 = 0xcaa16737,
+                                          .data2 = 0xfa36,
+                                          .data3 = 0x4d43,
+                                          .data4 = { 0xb3, 0xb6, 0x33, 0xf0,
+                                                     0xaa, 0x44, 0xe7, 0x6b} };
+
+static const MSGUID virtual_size_guid = { .data1 = 0x2FA54224,
+                                          .data2 = 0xcd1b,
+                                          .data3 = 0x4876,
+                                          .data4 = { 0xb2, 0x11, 0x5d, 0xbe,
+                                                     0xd8, 0x3b, 0xf4, 0xb8} };
+
+static const MSGUID page83_guid =       { .data1 = 0xbeca12ab,
+                                          .data2 = 0xb2e6,
+                                          .data3 = 0x4523,
+                                          .data4 = { 0x93, 0xef, 0xc3, 0x09,
+                                                     0xe0, 0x00, 0xc7, 0x46} };
+
+
+static const MSGUID phys_sector_guid =  { .data1 = 0xcda348c7,
+                                          .data2 = 0x445d,
+                                          .data3 = 0x4471,
+                                          .data4 = { 0x9c, 0xc9, 0xe9, 0x88,
+                                                     0x52, 0x51, 0xc5, 0x56} };
+
+static const MSGUID parent_locator_guid = { .data1 = 0xa8d35f2d,
+                                            .data2 = 0xb30b,
+                                            .data3 = 0x454d,
+                                            .data4 = { 0xab, 0xf7, 0xd3,
+                                                       0xd8, 0x48, 0x34,
+                                                       0xab, 0x0c} };
+
+static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d,
+                                            .data2 = 0xa96f,
+                                            .data3 = 0x4709,
+                                            .data4 = { 0xba, 0x47, 0xf2,
+                                                       0x33, 0xa8, 0xfa,
+                                                       0xab, 0x5f} };
+
+/* Each parent type must have a valid GUID; this is for parent images
+ * of type 'VHDX'.  If we were to allow e.g. a QCOW2 parent, we would
+ * need to make up our own QCOW2 GUID type */
+static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7,
+                                         .data2 = 0xd19e,
+                                         .data3 = 0x4a81,
+                                         .data4 = { 0xb7, 0x89, 0x25, 0xb8,
+                                                    0xe9, 0x44, 0x59, 0x13} };
+
+
+#define META_FILE_PARAMETER_PRESENT      0x01
+#define META_VIRTUAL_DISK_SIZE_PRESENT   0x02
+#define META_PAGE_83_PRESENT             0x04
+#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08
+#define META_PHYS_SECTOR_SIZE_PRESENT    0x10
+#define META_PARENT_LOCATOR_PRESENT      0x20
+
+#define META_ALL_PRESENT    \
+    (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \
+     META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \
+     META_PHYS_SECTOR_SIZE_PRESENT)
+
+typedef struct VHDXMetadataEntries {
+    VHDXMetadataTableEntry file_parameters_entry;
+    VHDXMetadataTableEntry virtual_disk_size_entry;
+    VHDXMetadataTableEntry page83_data_entry;
+    VHDXMetadataTableEntry logical_sector_size_entry;
+    VHDXMetadataTableEntry phys_sector_size_entry;
+    VHDXMetadataTableEntry parent_locator_entry;
+    uint16_t present;
+} VHDXMetadataEntries;
+
+
+typedef struct BDRVVHDXState {
+    CoMutex lock;
+
+    int curr_header;
+    VHDXHeader *headers[2];
+
+    VHDXRegionTableHeader rt;
+    VHDXRegionTableEntry bat_rt;         /* region table for the BAT */
+    VHDXRegionTableEntry metadata_rt;    /* region table for the metadata */
+
+    VHDXMetadataTableHeader metadata_hdr;
+    VHDXMetadataEntries metadata_entries;
+
+    VHDXFileParameters params;
+    uint32_t block_size;
+    uint32_t block_size_bits;
+    uint32_t sectors_per_block;
+    uint32_t sectors_per_block_bits;
+
+    uint64_t virtual_disk_size;
+    uint32_t logical_sector_size;
+    uint32_t physical_sector_size;
+
+    uint64_t chunk_ratio;
+    uint32_t chunk_ratio_bits;
+    uint32_t logical_sector_size_bits;
+
+    uint32_t bat_entries;
+    VHDXBatEntry *bat;
+    uint64_t bat_offset;
+
+    VHDXParentLocatorHeader parent_header;
+    VHDXParentLocatorEntry *parent_entries;
+
+} BDRVVHDXState;
+
+uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
+                            int crc_offset)
+{
+    uint32_t crc_new;
+    uint32_t crc_orig;
+    assert(buf != NULL);
+
+    if (crc_offset > 0) {
+        memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig));
+        memset(buf + crc_offset, 0, sizeof(crc_orig));
+    }
+
+    crc_new = crc32c(crc, buf, size);
+    if (crc_offset > 0) {
+        memcpy(buf + crc_offset, &crc_orig, sizeof(crc_orig));
+    }
+
+    return crc_new;
+}
+
+/* Validates the checksum of the buffer, with an in-place CRC.
+ *
+ * Zero is substituted during crc calculation for the original crc field,
+ * and the crc field is restored afterwards.  But the buffer will be modifed
+ * during the calculation, so this may not be not suitable for multi-threaded
+ * use.
+ *
+ * crc_offset: byte offset in buf of the buffer crc
+ * buf: buffer pointer
+ * size: size of buffer (must be > crc_offset+4)
+ *
+ * returns true if checksum is valid, false otherwise
+ */
+bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset)
+{
+    uint32_t crc_orig;
+    uint32_t crc;
+
+    assert(buf != NULL);
+    assert(size > (crc_offset + 4));
+
+    memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig));
+    crc_orig = le32_to_cpu(crc_orig);
+
+    crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset);
+
+    return crc == crc_orig;
+}
+
+
+/*
+ * Per the MS VHDX Specification, for every VHDX file:
+ *      - The header section is fixed size - 1 MB
+ *      - The header section is always the first "object"
+ *      - The first 64KB of the header is the File Identifier
+ *      - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile")
+ *      - The following 512 bytes constitute a UTF-16 string identifiying the
+ *        software that created the file, and is optional and diagnostic only.
+ *
+ *  Therefore, we probe by looking for the vhdxfile signature "vhdxfile"
+ */
+static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) {
+        return 100;
+    }
+    return 0;
+}
+
+/* All VHDX structures on disk are little endian */
+static void vhdx_header_le_import(VHDXHeader *h)
+{
+    assert(h != NULL);
+
+    le32_to_cpus(&h->signature);
+    le32_to_cpus(&h->checksum);
+    le64_to_cpus(&h->sequence_number);
+
+    leguid_to_cpus(&h->file_write_guid);
+    leguid_to_cpus(&h->data_write_guid);
+    leguid_to_cpus(&h->log_guid);
+
+    le16_to_cpus(&h->log_version);
+    le16_to_cpus(&h->version);
+    le32_to_cpus(&h->log_length);
+    le64_to_cpus(&h->log_offset);
+}
+
+
+/* opens the specified header block from the VHDX file header section */
+static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    VHDXHeader *header1;
+    VHDXHeader *header2;
+    bool h1_valid = false;
+    bool h2_valid = false;
+    uint64_t h1_seq = 0;
+    uint64_t h2_seq = 0;
+    uint8_t *buffer;
+
+    header1 = qemu_blockalign(bs, sizeof(VHDXHeader));
+    header2 = qemu_blockalign(bs, sizeof(VHDXHeader));
+
+    buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE);
+
+    s->headers[0] = header1;
+    s->headers[1] = header2;
+
+    /* We have to read the whole VHDX_HEADER_SIZE instead of
+     * sizeof(VHDXHeader), because the checksum is over the whole
+     * region */
+    ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+    /* copy over just the relevant portion that we need */
+    memcpy(header1, buffer, sizeof(VHDXHeader));
+    vhdx_header_le_import(header1);
+
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
+        !memcmp(&header1->signature, "head", 4)             &&
+        header1->version == 1) {
+        h1_seq = header1->sequence_number;
+        h1_valid = true;
+    }
+
+    ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+    /* copy over just the relevant portion that we need */
+    memcpy(header2, buffer, sizeof(VHDXHeader));
+    vhdx_header_le_import(header2);
+
+    if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) &&
+        !memcmp(&header2->signature, "head", 4)             &&
+        header2->version == 1) {
+        h2_seq = header2->sequence_number;
+        h2_valid = true;
+    }
+
+    /* If there is only 1 valid header (or no valid headers), we
+     * don't care what the sequence numbers are */
+    if (h1_valid && !h2_valid) {
+        s->curr_header = 0;
+    } else if (!h1_valid && h2_valid) {
+        s->curr_header = 1;
+    } else if (!h1_valid && !h2_valid) {
+        ret = -EINVAL;
+        goto fail;
+    } else {
+        /* If both headers are valid, then we choose the active one by the
+         * highest sequence number.  If the sequence numbers are equal, that is
+         * invalid */
+        if (h1_seq > h2_seq) {
+            s->curr_header = 0;
+        } else if (h2_seq > h1_seq) {
+            s->curr_header = 1;
+        } else {
+            ret = -EINVAL;
+            goto fail;
+        }
+    }
+
+    ret = 0;
+
+    goto exit;
+
+fail:
+    qerror_report(ERROR_CLASS_GENERIC_ERROR, "No valid VHDX header found");
+    qemu_vfree(header1);
+    qemu_vfree(header2);
+    s->headers[0] = NULL;
+    s->headers[1] = NULL;
+exit:
+    qemu_vfree(buffer);
+    return ret;
+}
+
+
+static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    uint8_t *buffer;
+    int offset = 0;
+    VHDXRegionTableEntry rt_entry;
+    uint32_t i;
+    bool bat_rt_found = false;
+    bool metadata_rt_found = false;
+
+    /* We have to read the whole 64KB block, because the crc32 is over the
+     * whole block */
+    buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE);
+
+    ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer,
+                     VHDX_HEADER_BLOCK_SIZE);
+    if (ret < 0) {
+        goto fail;
+    }
+    memcpy(&s->rt, buffer, sizeof(s->rt));
+    le32_to_cpus(&s->rt.signature);
+    le32_to_cpus(&s->rt.checksum);
+    le32_to_cpus(&s->rt.entry_count);
+    le32_to_cpus(&s->rt.reserved);
+    offset += sizeof(s->rt);
+
+    if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) ||
+        memcmp(&s->rt.signature, "regi", 4)) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* Per spec, maximum region table entry count is 2047 */
+    if (s->rt.entry_count > 2047) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    for (i = 0; i < s->rt.entry_count; i++) {
+        memcpy(&rt_entry, buffer + offset, sizeof(rt_entry));
+        offset += sizeof(rt_entry);
+
+        leguid_to_cpus(&rt_entry.guid);
+        le64_to_cpus(&rt_entry.file_offset);
+        le32_to_cpus(&rt_entry.length);
+        le32_to_cpus(&rt_entry.data_bits);
+
+        /* see if we recognize the entry */
+        if (guid_eq(rt_entry.guid, bat_guid)) {
+            /* must be unique; if we have already found it this is invalid */
+            if (bat_rt_found) {
+                ret = -EINVAL;
+                goto fail;
+            }
+            bat_rt_found = true;
+            s->bat_rt = rt_entry;
+            continue;
+        }
+
+        if (guid_eq(rt_entry.guid, metadata_guid)) {
+            /* must be unique; if we have already found it this is invalid */
+            if (metadata_rt_found) {
+                ret = -EINVAL;
+                goto fail;
+            }
+            metadata_rt_found = true;
+            s->metadata_rt = rt_entry;
+            continue;
+        }
+
+        if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) {
+            /* cannot read vhdx file - required region table entry that
+             * we do not understand.  per spec, we must fail to open */
+            ret = -ENOTSUP;
+            goto fail;
+        }
+    }
+    ret = 0;
+
+fail:
+    qemu_vfree(buffer);
+    return ret;
+}
+
+
+
+/* Metadata initial parser
+ *
+ * This loads all the metadata entry fields.  This may cause additional
+ * fields to be processed (e.g. parent locator, etc..).
+ *
+ * There are 5 Metadata items that are always required:
+ *      - File Parameters (block size, has a parent)
+ *      - Virtual Disk Size (size, in bytes, of the virtual drive)
+ *      - Page 83 Data (scsi page 83 guid)
+ *      - Logical Sector Size (logical sector size in bytes, either 512 or
+ *                             4096.  We only support 512 currently)
+ *      - Physical Sector Size (512 or 4096)
+ *
+ * Also, if the File Parameters indicate this is a differencing file,
+ * we must also look for the Parent Locator metadata item.
+ */
+static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    uint8_t *buffer;
+    int offset = 0;
+    uint32_t i = 0;
+    VHDXMetadataTableEntry md_entry;
+
+    buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE);
+
+    ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer,
+                     VHDX_METADATA_TABLE_MAX_SIZE);
+    if (ret < 0) {
+        goto exit;
+    }
+    memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr));
+    offset += sizeof(s->metadata_hdr);
+
+    le64_to_cpus(&s->metadata_hdr.signature);
+    le16_to_cpus(&s->metadata_hdr.reserved);
+    le16_to_cpus(&s->metadata_hdr.entry_count);
+
+    if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    s->metadata_entries.present = 0;
+
+    if ((s->metadata_hdr.entry_count * sizeof(md_entry)) >
+        (VHDX_METADATA_TABLE_MAX_SIZE - offset)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    for (i = 0; i < s->metadata_hdr.entry_count; i++) {
+        memcpy(&md_entry, buffer + offset, sizeof(md_entry));
+        offset += sizeof(md_entry);
+
+        leguid_to_cpus(&md_entry.item_id);
+        le32_to_cpus(&md_entry.offset);
+        le32_to_cpus(&md_entry.length);
+        le32_to_cpus(&md_entry.data_bits);
+        le32_to_cpus(&md_entry.reserved2);
+
+        if (guid_eq(md_entry.item_id, file_param_guid)) {
+            if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.file_parameters_entry = md_entry;
+            s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, virtual_size_guid)) {
+            if (s->metadata_entries.present & META_VIRTUAL_DISK_SIZE_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.virtual_disk_size_entry = md_entry;
+            s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, page83_guid)) {
+            if (s->metadata_entries.present & META_PAGE_83_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.page83_data_entry = md_entry;
+            s->metadata_entries.present |= META_PAGE_83_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, logical_sector_guid)) {
+            if (s->metadata_entries.present &
+                META_LOGICAL_SECTOR_SIZE_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.logical_sector_size_entry = md_entry;
+            s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, phys_sector_guid)) {
+            if (s->metadata_entries.present & META_PHYS_SECTOR_SIZE_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.phys_sector_size_entry = md_entry;
+            s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT;
+            continue;
+        }
+
+        if (guid_eq(md_entry.item_id, parent_locator_guid)) {
+            if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
+                ret = -EINVAL;
+                goto exit;
+            }
+            s->metadata_entries.parent_locator_entry = md_entry;
+            s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT;
+            continue;
+        }
+
+        if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) {
+            /* cannot read vhdx file - required region table entry that
+             * we do not understand.  per spec, we must fail to open */
+            ret = -ENOTSUP;
+            goto exit;
+        }
+    }
+
+    if (s->metadata_entries.present != META_ALL_PRESENT) {
+        ret = -ENOTSUP;
+        goto exit;
+    }
+
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.file_parameters_entry.offset
+                                         + s->metadata_rt.file_offset,
+                     &s->params,
+                     sizeof(s->params));
+
+    if (ret < 0) {
+        goto exit;
+    }
+
+    le32_to_cpus(&s->params.block_size);
+    le32_to_cpus(&s->params.data_bits);
+
+
+    /* We now have the file parameters, so we can tell if this is a
+     * differencing file (i.e.. has_parent), is dynamic or fixed
+     * sized (leave_blocks_allocated), and the block size */
+
+    /* The parent locator required iff the file parameters has_parent set */
+    if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
+        if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) {
+            /* TODO: parse  parent locator fields */
+            ret = -ENOTSUP; /* temp, until differencing files are supported */
+            goto exit;
+        } else {
+            /* if has_parent is set, but there is not parent locator present,
+             * then that is an invalid combination */
+            ret = -EINVAL;
+            goto exit;
+        }
+    }
+
+    /* determine virtual disk size, logical sector size,
+     * and phys sector size */
+
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.virtual_disk_size_entry.offset
+                                           + s->metadata_rt.file_offset,
+                     &s->virtual_disk_size,
+                     sizeof(uint64_t));
+    if (ret < 0) {
+        goto exit;
+    }
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.logical_sector_size_entry.offset
+                                             + s->metadata_rt.file_offset,
+                     &s->logical_sector_size,
+                     sizeof(uint32_t));
+    if (ret < 0) {
+        goto exit;
+    }
+    ret = bdrv_pread(bs->file,
+                     s->metadata_entries.phys_sector_size_entry.offset
+                                          + s->metadata_rt.file_offset,
+                     &s->physical_sector_size,
+                     sizeof(uint32_t));
+    if (ret < 0) {
+        goto exit;
+    }
+
+    le64_to_cpus(&s->virtual_disk_size);
+    le32_to_cpus(&s->logical_sector_size);
+    le32_to_cpus(&s->physical_sector_size);
+
+    if (s->logical_sector_size == 0 || s->params.block_size == 0) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    /* both block_size and sector_size are guaranteed powers of 2 */
+    s->sectors_per_block = s->params.block_size / s->logical_sector_size;
+    s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) *
+                     (uint64_t)s->logical_sector_size /
+                     (uint64_t)s->params.block_size;
+
+    /* These values are ones we will want to use for division / multiplication
+     * later on, and they are all guaranteed (per the spec) to be powers of 2,
+     * so we can take advantage of that for shift operations during
+     * reads/writes */
+    if (s->logical_sector_size & (s->logical_sector_size - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+    if (s->sectors_per_block & (s->sectors_per_block - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+    if (s->chunk_ratio & (s->chunk_ratio - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+    s->block_size = s->params.block_size;
+    if (s->block_size & (s->block_size - 1)) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size);
+    s->sectors_per_block_bits =   31 - clz32(s->sectors_per_block);
+    s->chunk_ratio_bits =         63 - clz64(s->chunk_ratio);
+    s->block_size_bits =          31 - clz32(s->block_size);
+
+    ret = 0;
+
+exit:
+    qemu_vfree(buffer);
+    return ret;
+}
+
+/* Parse the replay log.  Per the VHDX spec, if the log is present
+ * it must be replayed prior to opening the file, even read-only.
+ *
+ * If read-only, we must replay the log in RAM (or refuse to open
+ * a dirty VHDX file read-only */
+static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s)
+{
+    int ret = 0;
+    int i;
+    VHDXHeader *hdr;
+
+    hdr = s->headers[s->curr_header];
+
+    /* either the log guid, or log length is zero,
+     * then a replay log is present */
+    for (i = 0; i < sizeof(hdr->log_guid.data4); i++) {
+        ret |= hdr->log_guid.data4[i];
+    }
+    if (hdr->log_guid.data1 == 0 &&
+        hdr->log_guid.data2 == 0 &&
+        hdr->log_guid.data3 == 0 &&
+        ret == 0) {
+        goto exit;
+    }
+
+    /* per spec, only log version of 0 is supported */
+    if (hdr->log_version != 0) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    if (hdr->log_length == 0) {
+        goto exit;
+    }
+
+    /* We currently do not support images with logs to replay */
+    ret = -ENOTSUP;
+
+exit:
+    return ret;
+}
+
+
+static int vhdx_open(BlockDriverState *bs, QDict *options, int flags)
+{
+    BDRVVHDXState *s = bs->opaque;
+    int ret = 0;
+    uint32_t i;
+    uint64_t signature;
+    uint32_t data_blocks_cnt, bitmap_blocks_cnt;
+
+
+    s->bat = NULL;
+
+    qemu_co_mutex_init(&s->lock);
+
+    /* validate the file signature */
+    ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t));
+    if (ret < 0) {
+        goto fail;
+    }
+    if (memcmp(&signature, "vhdxfile", 8)) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    ret = vhdx_parse_header(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    ret = vhdx_parse_log(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    ret = vhdx_open_region_tables(bs, s);
+    if (ret) {
+        goto fail;
+    }
+
+    ret = vhdx_parse_metadata(bs, s);
+    if (ret) {
+        goto fail;
+    }
+    s->block_size = s->params.block_size;
+
+    /* the VHDX spec dictates that virtual_disk_size is always a multiple of
+     * logical_sector_size */
+    bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits;
+
+    data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits;
+    if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) {
+        data_blocks_cnt++;
+    }
+    bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits;
+    if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) {
+        bitmap_blocks_cnt++;
+    }
+
+    if (s->parent_entries) {
+        s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1);
+    } else {
+        s->bat_entries = data_blocks_cnt +
+                         ((data_blocks_cnt - 1) >> s->chunk_ratio_bits);
+    }
+
+    s->bat_offset = s->bat_rt.file_offset;
+
+    if (s->bat_entries > s->bat_rt.length / sizeof(VHDXBatEntry)) {
+        /* BAT allocation is not large enough for all entries */
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    s->bat = qemu_blockalign(bs, s->bat_rt.length);
+
+    ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    for (i = 0; i < s->bat_entries; i++) {
+        le64_to_cpus(&s->bat[i]);
+    }
+
+    if (flags & BDRV_O_RDWR) {
+        ret = -ENOTSUP;
+        goto fail;
+    }
+
+    /* TODO: differencing files, read, write */
+
+    return 0;
+fail:
+    qemu_vfree(s->headers[0]);
+    qemu_vfree(s->headers[1]);
+    qemu_vfree(s->bat);
+    qemu_vfree(s->parent_entries);
+    return ret;
+}
+
+static int vhdx_reopen_prepare(BDRVReopenState *state,
+                               BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
+
+static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, QEMUIOVector *qiov)
+{
+    return -ENOTSUP;
+}
+
+
+
+static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, QEMUIOVector *qiov)
+{
+    return -ENOTSUP;
+}
+
+
+static void vhdx_close(BlockDriverState *bs)
+{
+    BDRVVHDXState *s = bs->opaque;
+    qemu_vfree(s->headers[0]);
+    qemu_vfree(s->headers[1]);
+    qemu_vfree(s->bat);
+    qemu_vfree(s->parent_entries);
+}
+
+static BlockDriver bdrv_vhdx = {
+    .format_name            = "vhdx",
+    .instance_size          = sizeof(BDRVVHDXState),
+    .bdrv_probe             = vhdx_probe,
+    .bdrv_open              = vhdx_open,
+    .bdrv_close             = vhdx_close,
+    .bdrv_reopen_prepare    = vhdx_reopen_prepare,
+    .bdrv_co_readv          = vhdx_co_readv,
+    .bdrv_co_writev         = vhdx_co_writev,
+};
+
+static void bdrv_vhdx_init(void)
+{
+    bdrv_register(&bdrv_vhdx);
+}
+
+block_init(bdrv_vhdx_init);
diff --git a/block/vhdx.h b/block/vhdx.h
index fcddd37d98..c3b64c6ff6 100644
--- a/block/vhdx.h
+++ b/block/vhdx.h
@@ -308,4 +308,18 @@ typedef struct QEMU_PACKED VHDXParentLocatorEntry {
 
 /* ----- END VHDX SPECIFICATION STRUCTURES ---- */
 
+
+uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
+                            int crc_offset);
+
+bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset);
+
+
+static void leguid_to_cpus(MSGUID *guid)
+{
+    le32_to_cpus(&guid->data1);
+    le16_to_cpus(&guid->data2);
+    le16_to_cpus(&guid->data3);
+}
+
 #endif

From 059e2fbbca484a68ac7fd99d1d76409294551586 Mon Sep 17 00:00:00 2001
From: Jeff Cody <jcody@redhat.com>
Date: Mon, 29 Apr 2013 14:48:19 -0400
Subject: [PATCH 04/15] block: add read-only support to VHDX image format.

This adds in read-only support to the VHDX image format.  This supports
reads for fixed-size, and dynamic sized VHDX images.

Differencing files are still unsupported.

The image must be opened without BDRV_O_RDWR set, because we do not
yet update the headers.  I.e., pass 'readonly=on' in the drive image
options from the QEMU commandline.

Signed-off-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vhdx.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 121 insertions(+), 2 deletions(-)

diff --git a/block/vhdx.c b/block/vhdx.c
index 0ee10a76ba..e9704b1fdc 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -114,6 +114,17 @@ typedef struct VHDXMetadataEntries {
 } VHDXMetadataEntries;
 
 
+typedef struct VHDXSectorInfo {
+    uint32_t bat_idx;       /* BAT entry index */
+    uint32_t sectors_avail; /* sectors available in payload block */
+    uint32_t bytes_left;    /* bytes left in the block after data to r/w */
+    uint32_t bytes_avail;   /* bytes available in payload block */
+    uint64_t file_offset;   /* absolute offset in bytes, in file */
+    uint64_t block_offset;  /* block offset, in bytes */
+} VHDXSectorInfo;
+
+
+
 typedef struct BDRVVHDXState {
     CoMutex lock;
 
@@ -792,7 +803,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags)
         goto fail;
     }
 
-    /* TODO: differencing files, read, write */
+    /* TODO: differencing files, write */
 
     return 0;
 fail:
@@ -810,10 +821,118 @@ static int vhdx_reopen_prepare(BDRVReopenState *state,
 }
 
 
+/*
+ * Perform sector to block offset translations, to get various
+ * sector and file offsets into the image.  See VHDXSectorInfo
+ */
+static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num,
+                                 int nb_sectors, VHDXSectorInfo *sinfo)
+{
+    uint32_t block_offset;
+
+    sinfo->bat_idx = sector_num >> s->sectors_per_block_bits;
+    /* effectively a modulo - this gives us the offset into the block
+     * (in sector sizes) for our sector number */
+    block_offset = sector_num - (sinfo->bat_idx << s->sectors_per_block_bits);
+    /* the chunk ratio gives us the interleaving of the sector
+     * bitmaps, so we need to advance our page block index by the
+     * sector bitmaps entry number */
+    sinfo->bat_idx += sinfo->bat_idx >> s->chunk_ratio_bits;
+
+    /* the number of sectors we can read/write in this cycle */
+    sinfo->sectors_avail = s->sectors_per_block - block_offset;
+
+    sinfo->bytes_left = sinfo->sectors_avail << s->logical_sector_size_bits;
+
+    if (sinfo->sectors_avail > nb_sectors) {
+        sinfo->sectors_avail = nb_sectors;
+    }
+
+    sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits;
+
+    sinfo->file_offset = s->bat[sinfo->bat_idx] >> VHDX_BAT_FILE_OFF_BITS;
+
+    sinfo->block_offset = block_offset << s->logical_sector_size_bits;
+
+    /* The file offset must be past the header section, so must be > 0 */
+    if (sinfo->file_offset == 0) {
+        return;
+    }
+
+    /* block offset is the offset in vhdx logical sectors, in
+     * the payload data block. Convert that to a byte offset
+     * in the block, and add in the payload data block offset
+     * in the file, in bytes, to get the final read address */
+
+    sinfo->file_offset <<= 20;  /* now in bytes, rather than 1MB units */
+    sinfo->file_offset += sinfo->block_offset;
+}
+
+
+
 static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num,
                                       int nb_sectors, QEMUIOVector *qiov)
 {
-    return -ENOTSUP;
+    BDRVVHDXState *s = bs->opaque;
+    int ret = 0;
+    VHDXSectorInfo sinfo;
+    uint64_t bytes_done = 0;
+    QEMUIOVector hd_qiov;
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+
+    qemu_co_mutex_lock(&s->lock);
+
+    while (nb_sectors > 0) {
+        /* We are a differencing file, so we need to inspect the sector bitmap
+         * to see if we have the data or not */
+        if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) {
+            /* not supported yet */
+            ret = -ENOTSUP;
+            goto exit;
+        } else {
+            vhdx_block_translate(s, sector_num, nb_sectors, &sinfo);
+
+            qemu_iovec_reset(&hd_qiov);
+            qemu_iovec_concat(&hd_qiov, qiov,  bytes_done, sinfo.bytes_avail);
+
+            /* check the payload block state */
+            switch (s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK) {
+            case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */
+            case PAYLOAD_BLOCK_UNDEFINED:   /* fall through */
+            case PAYLOAD_BLOCK_UNMAPPED:    /* fall through */
+            case PAYLOAD_BLOCK_ZERO:
+                /* return zero */
+                qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail);
+                break;
+            case PAYLOAD_BLOCK_FULL_PRESENT:
+                qemu_co_mutex_unlock(&s->lock);
+                ret = bdrv_co_readv(bs->file,
+                                    sinfo.file_offset >> BDRV_SECTOR_BITS,
+                                    sinfo.sectors_avail, &hd_qiov);
+                qemu_co_mutex_lock(&s->lock);
+                if (ret < 0) {
+                    goto exit;
+                }
+                break;
+            case PAYLOAD_BLOCK_PARTIALLY_PRESENT:
+                /* we don't yet support difference files, fall through
+                 * to error */
+            default:
+                ret = -EIO;
+                goto exit;
+                break;
+            }
+            nb_sectors -= sinfo.sectors_avail;
+            sector_num += sinfo.sectors_avail;
+            bytes_done += sinfo.bytes_avail;
+        }
+    }
+    ret = 0;
+exit:
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
 }
 
 

From 8732901e1b8a5add02a2b747875202133413d565 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 2 May 2013 15:32:55 +0200
Subject: [PATCH 05/15] blockdev: Replace "undefined error" in qmp_block_resize

We have an errno value that can be displayed, so we should just do that.
An easy way to reproduce this case is to resize a raw image to a size
that is too large for the host file system.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 blockdev.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 6e293e9550..7c9d8dd0ac 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1118,6 +1118,7 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
 void qmp_block_resize(const char *device, int64_t size, Error **errp)
 {
     BlockDriverState *bs;
+    int ret;
 
     bs = bdrv_find(device);
     if (!bs) {
@@ -1133,7 +1134,8 @@ void qmp_block_resize(const char *device, int64_t size, Error **errp)
     /* complete all in-flight operations before resizing the device */
     bdrv_drain_all();
 
-    switch (bdrv_truncate(bs, size)) {
+    ret = bdrv_truncate(bs, size);
+    switch (ret) {
     case 0:
         break;
     case -ENOMEDIUM:
@@ -1149,7 +1151,7 @@ void qmp_block_resize(const char *device, int64_t size, Error **errp)
         error_set(errp, QERR_DEVICE_IN_USE, device);
         break;
     default:
-        error_set(errp, QERR_UNDEFINED_ERROR);
+        error_setg_errno(errp, -ret, "Could not resize");
         break;
     }
 }

From 65f7472577f9409d285b0216fae76c7c3f03fc41 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Thu, 2 May 2013 10:25:22 +0800
Subject: [PATCH 06/15] vmdk: named return code.

Internal routines in vmdk.c previously return -1 on error and 0 on
success. More return values are useful for future changes such as
zeroed-grain GTE. Change all the magic `return 0` and `return -1` to
macro names:

 * VMDK_OK      0
 * VMDK_ERROR   (-1)
 * VMDK_UNALLOC (-2)
 * VMDK_ZEROED  (-3)

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vmdk.c | 60 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 26 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 7bad757a33..16aa29c81d 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -37,6 +37,14 @@
 #define VMDK4_FLAG_MARKER (1 << 17)
 #define VMDK4_GD_AT_END 0xffffffffffffffffULL
 
+
+/* VMDK internal error codes */
+#define VMDK_OK      0
+#define VMDK_ERROR   (-1)
+/* Cluster not allocated */
+#define VMDK_UNALLOC (-2)
+#define VMDK_ZEROED  (-3)
+
 typedef struct {
     uint32_t version;
     uint32_t flags;
@@ -578,22 +586,22 @@ static int vmdk_parse_description(const char *desc, const char *opt_name,
 
     opt_pos = strstr(desc, opt_name);
     if (!opt_pos) {
-        return -1;
+        return VMDK_ERROR;
     }
     /* Skip "=\"" following opt_name */
     opt_pos += strlen(opt_name) + 2;
     if (opt_pos >= end) {
-        return -1;
+        return VMDK_ERROR;
     }
     opt_end = opt_pos;
     while (opt_end < end && *opt_end != '"') {
         opt_end++;
     }
     if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
-        return -1;
+        return VMDK_ERROR;
     }
     pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
-    return 0;
+    return VMDK_OK;
 }
 
 /* Open an extent file and append to bs array */
@@ -772,7 +780,7 @@ static int get_whole_cluster(BlockDriverState *bs,
         int ret;
 
         if (!vmdk_is_cid_valid(bs)) {
-            return -1;
+            return VMDK_ERROR;
         }
 
         /* floor offset to cluster */
@@ -780,17 +788,17 @@ static int get_whole_cluster(BlockDriverState *bs,
         ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
                 extent->cluster_sectors);
         if (ret < 0) {
-            return -1;
+            return VMDK_ERROR;
         }
 
         /* Write grain only into the active image */
         ret = bdrv_write(extent->file, cluster_offset, whole_grain,
                 extent->cluster_sectors);
         if (ret < 0) {
-            return -1;
+            return VMDK_ERROR;
         }
     }
-    return 0;
+    return VMDK_OK;
 }
 
 static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
@@ -803,7 +811,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
                 &(m_data->offset),
                 sizeof(m_data->offset)
             ) < 0) {
-        return -1;
+        return VMDK_ERROR;
     }
     /* update backup L2 table */
     if (extent->l1_backup_table_offset != 0) {
@@ -814,11 +822,11 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
                         + (m_data->l2_index * sizeof(m_data->offset)),
                     &(m_data->offset), sizeof(m_data->offset)
                 ) < 0) {
-            return -1;
+            return VMDK_ERROR;
         }
     }
 
-    return 0;
+    return VMDK_OK;
 }
 
 static int get_cluster_offset(BlockDriverState *bs,
@@ -837,17 +845,17 @@ static int get_cluster_offset(BlockDriverState *bs,
     }
     if (extent->flat) {
         *cluster_offset = extent->flat_start_offset;
-        return 0;
+        return VMDK_OK;
     }
 
     offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
     l1_index = (offset >> 9) / extent->l1_entry_sectors;
     if (l1_index >= extent->l1_size) {
-        return -1;
+        return VMDK_ERROR;
     }
     l2_offset = extent->l1_table[l1_index];
     if (!l2_offset) {
-        return -1;
+        return VMDK_UNALLOC;
     }
     for (i = 0; i < L2_CACHE_SIZE; i++) {
         if (l2_offset == extent->l2_cache_offsets[i]) {
@@ -877,7 +885,7 @@ static int get_cluster_offset(BlockDriverState *bs,
                 l2_table,
                 extent->l2_size * sizeof(uint32_t)
             ) != extent->l2_size * sizeof(uint32_t)) {
-        return -1;
+        return VMDK_ERROR;
     }
 
     extent->l2_cache_offsets[min_index] = l2_offset;
@@ -888,7 +896,7 @@ static int get_cluster_offset(BlockDriverState *bs,
 
     if (!*cluster_offset) {
         if (!allocate) {
-            return -1;
+            return VMDK_UNALLOC;
         }
 
         /* Avoid the L2 tables update for the images that have snapshots. */
@@ -911,7 +919,7 @@ static int get_cluster_offset(BlockDriverState *bs,
          */
         if (get_whole_cluster(
                 bs, extent, *cluster_offset, offset, allocate) == -1) {
-            return -1;
+            return VMDK_ERROR;
         }
 
         if (m_data) {
@@ -923,7 +931,7 @@ static int get_cluster_offset(BlockDriverState *bs,
         }
     }
     *cluster_offset <<= 9;
-    return 0;
+    return VMDK_OK;
 }
 
 static VmdkExtent *find_extent(BDRVVmdkState *s,
@@ -1173,7 +1181,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                                 sector_num << 9, !extent->compressed,
                                 &cluster_offset);
         if (extent->compressed) {
-            if (ret == 0) {
+            if (ret == VMDK_OK) {
                 /* Refuse write to allocated cluster for streamOptimized */
                 fprintf(stderr,
                         "VMDK: can't write to allocated cluster"
@@ -1357,7 +1365,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
 
     if (filename == NULL || !strlen(filename)) {
         fprintf(stderr, "Vmdk: no filename provided.\n");
-        return -1;
+        return VMDK_ERROR;
     }
     p = strrchr(filename, '/');
     if (p == NULL) {
@@ -1369,7 +1377,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
     if (p != NULL) {
         p++;
         if (p - filename >= buf_len) {
-            return -1;
+            return VMDK_ERROR;
         }
         pstrcpy(path, p - filename + 1, filename);
     } else {
@@ -1382,12 +1390,12 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
         postfix[0] = '\0';
     } else {
         if (q - p >= buf_len) {
-            return -1;
+            return VMDK_ERROR;
         }
         pstrcpy(prefix, q - p + 1, p);
         pstrcpy(postfix, buf_len, q);
     }
-    return 0;
+    return VMDK_OK;
 }
 
 static int relative_path(char *dest, int dest_size,
@@ -1403,11 +1411,11 @@ static int relative_path(char *dest, int dest_size,
 #endif
 
     if (!(dest && base && target)) {
-        return -1;
+        return VMDK_ERROR;
     }
     if (path_is_absolute(target)) {
         pstrcpy(dest, dest_size, target);
-        return 0;
+        return VMDK_OK;
     }
     while (base[i] == target[i]) {
         i++;
@@ -1426,7 +1434,7 @@ static int relative_path(char *dest, int dest_size,
         pstrcat(dest, dest_size, sep);
     }
     pstrcat(dest, dest_size, q);
-    return 0;
+    return VMDK_OK;
 }
 
 static int vmdk_create(const char *filename, QEMUOptionParameter *options)

From 14ead646fe7d5163c342ebd1e0aeb556ecf343a1 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Thu, 2 May 2013 10:25:23 +0800
Subject: [PATCH 07/15] =?UTF-8?q?vmdk:=20add=20support=20for=20=E2=80=9Cze?=
 =?UTF-8?q?roed=E2=80=90grain=E2=80=9D=20GTE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduced support for zeroed-grain GTE, as specified in Virtual Disk
Format 5.0[1].

    Recent VMware hosted platform products support a new “zeroed‐grain”
    grain table entry (GTE). The zeroed‐grain GTE returns all zeros on
    read.  In other words, the zeroed‐grain GTE indicates that a grain
    in the child disk is zero‐filled but does not actually occupy space
    in storage.  A sparse extent with zeroed‐grain GTE has the following
    in its header:

     * SparseExtentHeader.version = 2
     * SparseExtentHeader.flags has bit 2 set

    Other than the new flag and the possibly zeroed‐grain GTE, version 2
    sparse extents are identical to version 1.  Also, a zeroed‐grain GTE
    has value 0x1 in the GT table.

[1] Virtual Disk Format 5.0, http://www.vmware.com/support/developer/vddk/vmdk_50_technote.pdf?src=vmdk
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vmdk.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 16aa29c81d..7e07c0f6e6 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -33,10 +33,13 @@
 #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
 #define VMDK4_COMPRESSION_DEFLATE 1
 #define VMDK4_FLAG_RGD (1 << 1)
+/* Zeroed-grain enable bit */
+#define VMDK4_FLAG_ZERO_GRAIN   (1 << 2)
 #define VMDK4_FLAG_COMPRESS (1 << 16)
 #define VMDK4_FLAG_MARKER (1 << 17)
 #define VMDK4_GD_AT_END 0xffffffffffffffffULL
 
+#define VMDK_GTE_ZEROED 0x1
 
 /* VMDK internal error codes */
 #define VMDK_OK      0
@@ -81,6 +84,8 @@ typedef struct VmdkExtent {
     bool flat;
     bool compressed;
     bool has_marker;
+    bool has_zero_grain;
+    int version;
     int64_t sectors;
     int64_t end_sector;
     int64_t flat_start_offset;
@@ -569,6 +574,8 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
     extent->compressed =
         le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
     extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
+    extent->version = le32_to_cpu(header.version);
+    extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
     ret = vmdk_init_tables(bs, extent);
     if (ret) {
         /* free extent allocated by vmdk_add_extent */
@@ -839,6 +846,7 @@ static int get_cluster_offset(BlockDriverState *bs,
     unsigned int l1_index, l2_offset, l2_index;
     int min_index, i, j;
     uint32_t min_count, *l2_table, tmp = 0;
+    bool zeroed = false;
 
     if (m_data) {
         m_data->valid = 0;
@@ -894,9 +902,13 @@ static int get_cluster_offset(BlockDriverState *bs,
     l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
     *cluster_offset = le32_to_cpu(l2_table[l2_index]);
 
-    if (!*cluster_offset) {
+    if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
+        zeroed = true;
+    }
+
+    if (!*cluster_offset || zeroed) {
         if (!allocate) {
-            return VMDK_UNALLOC;
+            return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
         }
 
         /* Avoid the L2 tables update for the images that have snapshots. */
@@ -967,8 +979,8 @@ static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs,
     ret = get_cluster_offset(bs, extent, NULL,
                             sector_num * 512, 0, &offset);
     qemu_co_mutex_unlock(&s->lock);
-    /* get_cluster_offset returning 0 means success */
-    ret = !ret;
+
+    ret = (ret == VMDK_OK || ret == VMDK_ZEROED);
 
     index_in_cluster = sector_num % extent->cluster_sectors;
     n = extent->cluster_sectors - index_in_cluster;
@@ -1111,9 +1123,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
         if (n > nb_sectors) {
             n = nb_sectors;
         }
-        if (ret) {
+        if (ret != VMDK_OK) {
             /* if not allocated, try to read from parent image, if exist */
-            if (bs->backing_hd) {
+            if (bs->backing_hd && ret != VMDK_ZEROED) {
                 if (!vmdk_is_cid_valid(bs)) {
                     return -EINVAL;
                 }

From 69e0b6dfa455924842bfcbfa810fde34899f1350 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Thu, 2 May 2013 10:25:24 +0800
Subject: [PATCH 08/15] vmdk: Add option to create zeroed-grain image

Add image create option "zeroed-grain" to enable zeroed-grain GTE
feature of vmdk sparse extents. When this option is on, header version
of newly created extent will be 2 and VMDK4_FLAG_ZERO_GRAIN flag bit
will be set.

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vmdk.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 7e07c0f6e6..cc19e207d1 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -48,6 +48,8 @@
 #define VMDK_UNALLOC (-2)
 #define VMDK_ZEROED  (-3)
 
+#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"
+
 typedef struct {
     uint32_t version;
     uint32_t flags;
@@ -1262,7 +1264,7 @@ static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
 
 
 static int vmdk_create_extent(const char *filename, int64_t filesize,
-                              bool flat, bool compress)
+                              bool flat, bool compress, bool zeroed_grain)
 {
     int ret, i;
     int fd = 0;
@@ -1284,9 +1286,10 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     }
     magic = cpu_to_be32(VMDK4_MAGIC);
     memset(&header, 0, sizeof(header));
-    header.version = 1;
-    header.flags =
-        3 | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0);
+    header.version = zeroed_grain ? 2 : 1;
+    header.flags = 3
+                   | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
+                   | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
     header.capacity = filesize / 512;
     header.granularity = 128;
@@ -1467,6 +1470,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
     char parent_desc_line[BUF_SIZE] = "";
     uint32_t parent_cid = 0xffffffff;
     uint32_t number_heads = 16;
+    bool zeroed_grain = false;
     const char desc_template[] =
         "# Disk DescriptorFile\n"
         "version=1\n"
@@ -1502,6 +1506,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
             flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0;
         } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) {
             fmt = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_ZEROED_GRAIN)) {
+            zeroed_grain |= options->value.n;
         }
         options++;
     }
@@ -1588,7 +1594,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
         snprintf(ext_filename, sizeof(ext_filename), "%s%s",
                 path, desc_filename);
 
-        if (vmdk_create_extent(ext_filename, size, flat, compress)) {
+        if (vmdk_create_extent(ext_filename, size,
+                               flat, compress, zeroed_grain)) {
             return -EINVAL;
         }
         filesize -= size;
@@ -1714,6 +1721,11 @@ static QEMUOptionParameter vmdk_create_options[] = {
             "VMDK flat extent format, can be one of "
             "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
     },
+    {
+        .name = BLOCK_OPT_ZEROED_GRAIN,
+        .type = OPT_FLAG,
+        .help = "Enable efficient zero writes using the zeroed-grain GTE feature"
+    },
     { NULL }
 };
 

From 95b0aa4231dfc827ddc189138d6502cbca66d2b6 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Thu, 2 May 2013 10:25:25 +0800
Subject: [PATCH 09/15] vmdk: change magic number to macro

Two hard coded flag bits are changed to macros.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vmdk.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index cc19e207d1..0463d3b36d 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -32,6 +32,7 @@
 #define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
 #define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
 #define VMDK4_COMPRESSION_DEFLATE 1
+#define VMDK4_FLAG_NL_DETECT (1 << 0)
 #define VMDK4_FLAG_RGD (1 << 1)
 /* Zeroed-grain enable bit */
 #define VMDK4_FLAG_ZERO_GRAIN   (1 << 2)
@@ -1287,7 +1288,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     magic = cpu_to_be32(VMDK4_MAGIC);
     memset(&header, 0, sizeof(header));
     header.version = zeroed_grain ? 2 : 1;
-    header.flags = 3
+    header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
                    | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
                    | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;

From e304e8e5a05bc54c71c5f6fc76abd47410e59857 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Thu, 2 May 2013 10:25:26 +0800
Subject: [PATCH 10/15] vmdk: store fields of VmdkMetaData in cpu endian

Previously VmdkMetaData.offset is stored little endian while other
fields are cpu endian. This changes offset to cpu endian and convert
before writing to image.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vmdk.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 0463d3b36d..d98f304591 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -813,14 +813,15 @@ static int get_whole_cluster(BlockDriverState *bs,
 
 static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
 {
+    uint32_t offset;
+    QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset));
+    offset = cpu_to_le32(m_data->offset);
     /* update L2 table */
     if (bdrv_pwrite_sync(
                 extent->file,
                 ((int64_t)m_data->l2_offset * 512)
                     + (m_data->l2_index * sizeof(m_data->offset)),
-                &(m_data->offset),
-                sizeof(m_data->offset)
-            ) < 0) {
+                &offset, sizeof(offset)) < 0) {
         return VMDK_ERROR;
     }
     /* update backup L2 table */
@@ -830,8 +831,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
                     extent->file,
                     ((int64_t)m_data->l2_offset * 512)
                         + (m_data->l2_index * sizeof(m_data->offset)),
-                    &(m_data->offset), sizeof(m_data->offset)
-                ) < 0) {
+                    &offset, sizeof(offset)) < 0) {
             return VMDK_ERROR;
         }
     }
@@ -848,7 +848,7 @@ static int get_cluster_offset(BlockDriverState *bs,
 {
     unsigned int l1_index, l2_offset, l2_index;
     int min_index, i, j;
-    uint32_t min_count, *l2_table, tmp = 0;
+    uint32_t min_count, *l2_table;
     bool zeroed = false;
 
     if (m_data) {
@@ -924,8 +924,7 @@ static int get_cluster_offset(BlockDriverState *bs,
         }
 
         *cluster_offset >>= 9;
-        tmp = cpu_to_le32(*cluster_offset);
-        l2_table[l2_index] = tmp;
+        l2_table[l2_index] = cpu_to_le32(*cluster_offset);
 
         /* First of all we write grain itself, to avoid race condition
          * that may to corrupt the image.
@@ -938,7 +937,7 @@ static int get_cluster_offset(BlockDriverState *bs,
         }
 
         if (m_data) {
-            m_data->offset = tmp;
+            m_data->offset = *cluster_offset;
             m_data->l1_index = l1_index;
             m_data->l2_index = l2_index;
             m_data->l2_offset = l2_offset;

From cdeaf1f15909e2e8af38f45aea7cfa467a729c52 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Thu, 2 May 2013 10:25:27 +0800
Subject: [PATCH 11/15] vmdk: add bdrv_co_write_zeroes

Use special offset to write zeroes efficiently, when zeroed-grain GTE is
available. If zero-write an allocated cluster, cluster is leaked because
its offset pointer is overwritten by "0x1".

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vmdk.c | 86 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 68 insertions(+), 18 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index d98f304591..608daaf930 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -124,6 +124,7 @@ typedef struct VmdkMetaData {
     unsigned int l2_index;
     unsigned int l2_offset;
     int valid;
+    uint32_t *l2_cache_entry;
 } VmdkMetaData;
 
 typedef struct VmdkGrainMarker {
@@ -835,6 +836,9 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
             return VMDK_ERROR;
         }
     }
+    if (m_data->l2_cache_entry) {
+        *m_data->l2_cache_entry = offset;
+    }
 
     return VMDK_OK;
 }
@@ -905,6 +909,14 @@ static int get_cluster_offset(BlockDriverState *bs,
     l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
     *cluster_offset = le32_to_cpu(l2_table[l2_index]);
 
+    if (m_data) {
+        m_data->valid = 1;
+        m_data->l1_index = l1_index;
+        m_data->l2_index = l2_index;
+        m_data->offset = *cluster_offset;
+        m_data->l2_offset = l2_offset;
+        m_data->l2_cache_entry = &l2_table[l2_index];
+    }
     if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) {
         zeroed = true;
     }
@@ -938,10 +950,6 @@ static int get_cluster_offset(BlockDriverState *bs,
 
         if (m_data) {
             m_data->offset = *cluster_offset;
-            m_data->l1_index = l1_index;
-            m_data->l2_index = l2_index;
-            m_data->l2_offset = l2_offset;
-            m_data->valid = 1;
         }
     }
     *cluster_offset <<= 9;
@@ -1164,8 +1172,17 @@ static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
     return ret;
 }
 
+/**
+ * vmdk_write:
+ * @zeroed:       buf is ignored (data is zero), use zeroed_grain GTE feature
+ * if possible, otherwise return -ENOTSUP.
+ * @zero_dry_run: used for zeroed == true only, don't update L2 table, just
+ *
+ * Returns: error code with 0 for success.
+ */
 static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
-                     const uint8_t *buf, int nb_sectors)
+                      const uint8_t *buf, int nb_sectors,
+                      bool zeroed, bool zero_dry_run)
 {
     BDRVVmdkState *s = bs->opaque;
     VmdkExtent *extent = NULL;
@@ -1211,7 +1228,7 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
                                         &cluster_offset);
             }
         }
-        if (ret) {
+        if (ret == VMDK_ERROR) {
             return -EINVAL;
         }
         extent_begin_sector = extent->end_sector - extent->sectors;
@@ -1221,17 +1238,34 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
         if (n > nb_sectors) {
             n = nb_sectors;
         }
-
-        ret = vmdk_write_extent(extent,
-                        cluster_offset, index_in_cluster * 512,
-                        buf, n, sector_num);
-        if (ret) {
-            return ret;
-        }
-        if (m_data.valid) {
-            /* update L2 tables */
-            if (vmdk_L2update(extent, &m_data) == -1) {
-                return -EIO;
+        if (zeroed) {
+            /* Do zeroed write, buf is ignored */
+            if (extent->has_zero_grain &&
+                    index_in_cluster == 0 &&
+                    n >= extent->cluster_sectors) {
+                n = extent->cluster_sectors;
+                if (!zero_dry_run) {
+                    m_data.offset = VMDK_GTE_ZEROED;
+                    /* update L2 tables */
+                    if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+                        return -EIO;
+                    }
+                }
+            } else {
+                return -ENOTSUP;
+            }
+        } else {
+            ret = vmdk_write_extent(extent,
+                            cluster_offset, index_in_cluster * 512,
+                            buf, n, sector_num);
+            if (ret) {
+                return ret;
+            }
+            if (m_data.valid) {
+                /* update L2 tables */
+                if (vmdk_L2update(extent, &m_data) != VMDK_OK) {
+                    return -EIO;
+                }
             }
         }
         nb_sectors -= n;
@@ -1257,7 +1291,22 @@ static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
     int ret;
     BDRVVmdkState *s = bs->opaque;
     qemu_co_mutex_lock(&s->lock);
-    ret = vmdk_write(bs, sector_num, buf, nb_sectors);
+    ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
+    qemu_co_mutex_unlock(&s->lock);
+    return ret;
+}
+
+static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
+                                             int64_t sector_num,
+                                             int nb_sectors)
+{
+    int ret;
+    BDRVVmdkState *s = bs->opaque;
+    qemu_co_mutex_lock(&s->lock);
+    ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true);
+    if (!ret) {
+        ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false);
+    }
     qemu_co_mutex_unlock(&s->lock);
     return ret;
 }
@@ -1737,6 +1786,7 @@ static BlockDriver bdrv_vmdk = {
     .bdrv_reopen_prepare = vmdk_reopen_prepare,
     .bdrv_read      = vmdk_co_read,
     .bdrv_write     = vmdk_co_write,
+    .bdrv_co_write_zeroes = vmdk_co_write_zeroes,
     .bdrv_close     = vmdk_close,
     .bdrv_create    = vmdk_create,
     .bdrv_co_flush_to_disk  = vmdk_co_flush,

From 50522d969be315457e3296d8a32990b58a203902 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Fri, 3 May 2013 09:31:40 +0800
Subject: [PATCH 12/15] qemu-iotests: Filter out vmdk creation options

Cover new image creation options for vmdk, so we can use '-o
zeroed_grain=XXX' and '-o subformat=XXX' to run the tests successfully.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/qemu-iotests/common.rc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index a536bf70a7..442cf516ef 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -127,6 +127,8 @@ _make_test_img()
             -e "s# compat='[^']*'##g" \
             -e "s# compat6=\\(on\\|off\\)##g" \
             -e "s# static=\\(on\\|off\\)##g" \
+            -e "s# zeroed_grain=\\(on\\|off\\)##g" \
+            -e "s# subformat='[^']*'##g" \
             -e "s# lazy_refcounts=\\(on\\|off\\)##g"
 
     # Start an NBD server on the image file, which is what we'll be talking to

From e1adb27a83f19799c454cead7bf930a0b3f2bb28 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 2 May 2013 14:23:07 +0200
Subject: [PATCH 13/15] nbd: use g_slice_new() instead of a freelist

Use GLib's efficient slice allocator instead of open-coding the request
freelist.  This patch simplifies the NBDRequest code.

Now we qemu_blockalign() the req->data buffer each time but the next
patch switches from a fixed size buffer to a dynamic size anyway.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 nbd.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/nbd.c b/nbd.c
index 85187fff9f..761f4ec826 100644
--- a/nbd.c
+++ b/nbd.c
@@ -98,7 +98,6 @@ struct NBDExport {
     off_t size;
     uint32_t nbdflags;
     QTAILQ_HEAD(, NBDClient) clients;
-    QSIMPLEQ_HEAD(, NBDRequest) requests;
     QTAILQ_ENTRY(NBDExport) next;
 };
 
@@ -850,13 +849,8 @@ static NBDRequest *nbd_request_get(NBDClient *client)
     assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
     client->nb_requests++;
 
-    if (QSIMPLEQ_EMPTY(&exp->requests)) {
-        req = g_malloc0(sizeof(NBDRequest));
-        req->data = qemu_blockalign(exp->bs, NBD_BUFFER_SIZE);
-    } else {
-        req = QSIMPLEQ_FIRST(&exp->requests);
-        QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
-    }
+    req = g_slice_new0(NBDRequest);
+    req->data = qemu_blockalign(exp->bs, NBD_BUFFER_SIZE);
     nbd_client_get(client);
     req->client = client;
     return req;
@@ -865,7 +859,10 @@ static NBDRequest *nbd_request_get(NBDClient *client)
 static void nbd_request_put(NBDRequest *req)
 {
     NBDClient *client = req->client;
-    QSIMPLEQ_INSERT_HEAD(&client->exp->requests, req, entry);
+
+    qemu_vfree(req->data);
+    g_slice_free(NBDRequest, req);
+
     if (client->nb_requests-- == MAX_NBD_REQUESTS) {
         qemu_notify_event();
     }
@@ -877,7 +874,6 @@ NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
                           void (*close)(NBDExport *))
 {
     NBDExport *exp = g_malloc0(sizeof(NBDExport));
-    QSIMPLEQ_INIT(&exp->requests);
     exp->refcount = 1;
     QTAILQ_INIT(&exp->clients);
     exp->bs = bs;
@@ -953,13 +949,6 @@ void nbd_export_put(NBDExport *exp)
             exp->close(exp);
         }
 
-        while (!QSIMPLEQ_EMPTY(&exp->requests)) {
-            NBDRequest *first = QSIMPLEQ_FIRST(&exp->requests);
-            QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
-            qemu_vfree(first->data);
-            g_free(first);
-        }
-
         g_free(exp);
     }
 }

From 2d8214885942becb8f4371a66d6f8c9a9580108a Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 2 May 2013 14:23:08 +0200
Subject: [PATCH 14/15] nbd: support large NBD requests

The Linux nbd driver recently increased the maximum supported request
size up to 32 MB:

  commit 078be02b80359a541928c899c2631f39628f56df
  Author: Michal Belczyk <belczyk@bsd.krakow.pl>
  Date:   Tue Apr 30 15:28:28 2013 -0700

      nbd: increase default and max request sizes

      Raise the default max request size for nbd to 128KB (from 127KB) to get it
      4KB aligned.  This patch also allows the max request size to be increased
      (via /sys/block/nbd<x>/queue/max_sectors_kb) to 32MB.

QEMU's 1 MB buffers are too small to handle these requests.

This patch allocates data buffers dynamically and allows up to 32 MB per
request.

Reported-by: Nick Thomas <nick@bytemark.co.uk>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/block/nbd.h |  3 ++-
 nbd.c               | 17 +++++++++++------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index 0903d7a603..c90f5e4d9e 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -58,7 +58,8 @@ enum {
 
 #define NBD_DEFAULT_PORT	10809
 
-#define NBD_BUFFER_SIZE (1024*1024)
+/* Maximum size of a single READ/WRITE data buffer */
+#define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024)
 
 ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read);
 int tcp_socket_incoming(const char *address, uint16_t port);
diff --git a/nbd.c b/nbd.c
index 761f4ec826..2606403a41 100644
--- a/nbd.c
+++ b/nbd.c
@@ -844,13 +844,11 @@ void nbd_client_close(NBDClient *client)
 static NBDRequest *nbd_request_get(NBDClient *client)
 {
     NBDRequest *req;
-    NBDExport *exp = client->exp;
 
     assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
     client->nb_requests++;
 
     req = g_slice_new0(NBDRequest);
-    req->data = qemu_blockalign(exp->bs, NBD_BUFFER_SIZE);
     nbd_client_get(client);
     req->client = client;
     return req;
@@ -860,7 +858,9 @@ static void nbd_request_put(NBDRequest *req)
 {
     NBDClient *client = req->client;
 
-    qemu_vfree(req->data);
+    if (req->data) {
+        qemu_vfree(req->data);
+    }
     g_slice_free(NBDRequest, req);
 
     if (client->nb_requests-- == MAX_NBD_REQUESTS) {
@@ -1007,6 +1007,7 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *reque
 {
     NBDClient *client = req->client;
     int csock = client->sock;
+    uint32_t command;
     ssize_t rc;
 
     client->recv_coroutine = qemu_coroutine_self();
@@ -1018,9 +1019,9 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *reque
         goto out;
     }
 
-    if (request->len > NBD_BUFFER_SIZE) {
+    if (request->len > NBD_MAX_BUFFER_SIZE) {
         LOG("len (%u) is larger than max len (%u)",
-            request->len, NBD_BUFFER_SIZE);
+            request->len, NBD_MAX_BUFFER_SIZE);
         rc = -EINVAL;
         goto out;
     }
@@ -1034,7 +1035,11 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *reque
 
     TRACE("Decoding type");
 
-    if ((request->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
+    command = request->type & NBD_CMD_MASK_COMMAND;
+    if (command == NBD_CMD_READ || command == NBD_CMD_WRITE) {
+        req->data = qemu_blockalign(client->exp->bs, request->len);
+    }
+    if (command == NBD_CMD_WRITE) {
         TRACE("Reading %u byte(s)", request->len);
 
         if (qemu_co_recv(csock, req->data, request->len) != request->len) {

From 86abefd61e23325162e59e5bfb8f0346eda62541 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Fri, 3 May 2013 15:31:16 +0800
Subject: [PATCH 15/15] qemu-iotests: Filter out 'adapter_type'

Filter out vmdk creation option 'adapter_type' for vmdk. So that tests
with an explicit './check -o adapter_type=XXX' will not fail.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/qemu-iotests/common.rc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 442cf516ef..31eb62b604 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -129,6 +129,7 @@ _make_test_img()
             -e "s# static=\\(on\\|off\\)##g" \
             -e "s# zeroed_grain=\\(on\\|off\\)##g" \
             -e "s# subformat='[^']*'##g" \
+            -e "s# adapter_type='[^']*'##g" \
             -e "s# lazy_refcounts=\\(on\\|off\\)##g"
 
     # Start an NBD server on the image file, which is what we'll be talking to