mirror of
https://github.com/xemu-project/xemu.git
synced 2024-11-27 13:30:52 +00:00
45e62b464a
There is a bug in handling BDRV_REQ_NO_WAIT flag: we still may wait in wait_serialising_requests() if request is unaligned. And this is possible for the only user of this flag (preallocate filter) if underlying file is unaligned to its request_alignment on start. So, we have to fix preallocate filter to do only aligned preallocate requests. Next, we should fix generic block/io.c somehow. Keeping in mind that preallocate is the only user of BDRV_REQ_NO_WAIT and that we have to fix its behavior now, it seems more safe to just assert that we never use BDRV_REQ_NO_WAIT with unaligned requests and add corresponding comment. Let's do so. Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> Reviewed-by: Denis V. Lunev <den@openvz.org> Message-Id: <20220215121609.38570-1-vsementsov@virtuozzo.com> [hreitz: Rebased on block GS/IO split] Signed-off-by: Hanna Reitz <hreitz@redhat.com>
420 lines
15 KiB
C
420 lines
15 KiB
C
/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
|
|
#ifndef BLOCK_COMMON_H
|
|
#define BLOCK_COMMON_H
|
|
|
|
#include "block/aio.h"
|
|
#include "block/aio-wait.h"
|
|
#include "qemu/iov.h"
|
|
#include "qemu/coroutine.h"
|
|
#include "block/accounting.h"
|
|
#include "block/dirty-bitmap.h"
|
|
#include "block/blockjob.h"
|
|
#include "qemu/hbitmap.h"
|
|
#include "qemu/transactions.h"
|
|
|
|
/*
 * generated_co_wrapper
 *
 * Function specifier, which does nothing but mark functions to be
 * generated by scripts/block-coroutine-wrapper.py
 *
 * The macro expands to nothing, so it has no effect on the declaration it
 * is attached to.
 *
 * Read more in docs/devel/block-coroutine-wrapper.rst
 */
#define generated_co_wrapper
|
|
|
|
/* block.c */

/*
 * Forward declarations only: this header names the types so that pointers
 * to them can be used; the struct bodies are not defined here.
 */
typedef struct BlockDriver BlockDriver;
typedef struct BdrvChild BdrvChild;
typedef struct BdrvChildClass BdrvChildClass;
|
|
|
|
/* Per-driver information record; see the field comments for each member. */
typedef struct BlockDriverInfo {
    /* in bytes, 0 if irrelevant */
    int cluster_size;
    /* offset at which the VM state can be saved (0 if not possible) */
    int64_t vm_state_offset;
    bool is_dirty;
    /*
     * True if this block driver only supports compressed writes
     */
    bool needs_compressed_writes;
} BlockDriverInfo;
|
|
|
|
/* Cluster counters: allocated, total, fragmented and compressed clusters. */
typedef struct BlockFragInfo {
    uint64_t allocated_clusters;
    uint64_t total_clusters;
    uint64_t fragmented_clusters;
    uint64_t compressed_clusters;
} BlockFragInfo;
|
|
|
|
/*
 * Per-request flag bits. The values are distinct bits and may be OR-ed
 * together; BDRV_REQ_MASK is the mask of accepted bits.
 */
typedef enum {
    BDRV_REQ_COPY_ON_READ       = 0x1,
    BDRV_REQ_ZERO_WRITE         = 0x2,

    /*
     * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate
     * that the block driver should unmap (discard) blocks if it is guaranteed
     * that the result will read back as zeroes. The flag is only passed to the
     * driver if the block device is opened with BDRV_O_UNMAP.
     */
    BDRV_REQ_MAY_UNMAP          = 0x4,

    /* Note: bit 0x8 is not assigned to any flag in this enum. */

    BDRV_REQ_FUA                = 0x10,
    BDRV_REQ_WRITE_COMPRESSED   = 0x20,

    /*
     * Signifies that this write request will not change the visible disk
     * content.
     */
    BDRV_REQ_WRITE_UNCHANGED    = 0x40,

    /*
     * Forces request serialisation. Use only with write requests.
     */
    BDRV_REQ_SERIALISING        = 0x80,

    /*
     * Execute the request only if the operation can be offloaded or otherwise
     * be executed efficiently, but return an error instead of using a slow
     * fallback.
     */
    BDRV_REQ_NO_FALLBACK        = 0x100,

    /*
     * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read
     * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR
     * filter is involved), in which case it signals that the COR operation
     * need not read the data into memory (qiov) but only ensure they are
     * copied to the top layer (i.e., that COR operation is done).
     */
    BDRV_REQ_PREFETCH           = 0x200,

    /*
     * If we need to wait for other requests, just fail immediately. Used
     * only together with BDRV_REQ_SERIALISING. Used only with requests aligned
     * to request_alignment (corresponding assertions are in block/io.c).
     */
    BDRV_REQ_NO_WAIT            = 0x400,

    /* Mask of valid flags */
    BDRV_REQ_MASK               = 0x7ff,
} BdrvRequestFlags;
|
|
|
|
/*
 * BDRV_O_* open flags. Each value is a distinct bit; bit 0x0040 is not
 * assigned to any flag in this list.
 */
#define BDRV_O_NO_SHARE    0x0001 /* don't share permissions */
#define BDRV_O_RDWR        0x0002
#define BDRV_O_RESIZE      0x0004 /* request permission for resizing the node */
#define BDRV_O_SNAPSHOT    0x0008 /* open the file read only and save
                                     writes in a snapshot */
#define BDRV_O_TEMPORARY   0x0010 /* delete the file after use */
#define BDRV_O_NOCACHE     0x0020 /* do not use the host page cache */
#define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the
                                     thread pool */
#define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
#define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
#define BDRV_O_INACTIVE    0x0800 /* consistency hint for migration handoff */
#define BDRV_O_CHECK       0x1000 /* open solely for consistency check */
#define BDRV_O_ALLOW_RDWR  0x2000 /* allow reopen to change from r/o to r/w */
#define BDRV_O_UNMAP       0x4000 /* execute guest UNMAP/TRIM operations */
#define BDRV_O_PROTOCOL    0x8000 /* if no block driver is explicitly given:
                                     select an appropriate protocol driver,
                                     ignoring the format layer */
#define BDRV_O_NO_IO       0x10000 /* don't initialize for I/O */
#define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening
                                      read-write fails */
#define BDRV_O_IO_URING    0x40000 /* use io_uring instead of the thread pool */

/* The two cache-related open flags combined. */
#define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
|
|
|
|
|
|
/* Option names of options parsed by the block layer */

#define BDRV_OPT_CACHE_WB       "cache.writeback"
#define BDRV_OPT_CACHE_DIRECT   "cache.direct"
#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush"
#define BDRV_OPT_READ_ONLY      "read-only"
#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only"
#define BDRV_OPT_DISCARD        "discard"
#define BDRV_OPT_FORCE_SHARE    "force-share"
|
|
|
|
|
|
/* Sector size used by the block layer: 1 << 9 = 512 bytes. */
#define BDRV_SECTOR_BITS   9
#define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)

/*
 * Largest request size, in sectors and in bytes: the smaller of SIZE_MAX and
 * INT_MAX, each shifted down to whole sectors. MIN_CONST is not defined in
 * this header (presumably it comes from a QEMU-wide header -- confirm).
 */
#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                           INT_MAX >> BDRV_SECTOR_BITS)
#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
|
|
|
|
/*
 * We want to allow aligning requests and disk length up to any 32bit
 * alignment without being afraid of overflow.
 * To achieve that, and at the same time use a "pretty" number as the maximum
 * disk size, define the maximum "length" (a limit for any offset/bytes
 * request and for the disk size) as INT64_MAX aligned down to
 * BDRV_MAX_ALIGNMENT (2^30). Note that the result is a multiple of
 * BDRV_MAX_ALIGNMENT, not itself a power of two.
 */
#define BDRV_MAX_ALIGNMENT (1L << 30)
#define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))
|
|
|
|
/*
 * Allocation status flags for bdrv_block_status() and friends.
 *
 * Public flags:
 * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
 * BDRV_BLOCK_ZERO: offset reads as zero
 * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
 * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
 *                       layer rather than any backing, set by block layer
 * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
 *                 layer, set by block layer
 *
 * Internal flags:
 * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
 *                 that the block layer recompute the answer from the returned
 *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
 * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for
 *                     zeroes in file child of current block node inside
 *                     returned region. Only valid together with both
 *                     BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not
 *                     appear with BDRV_BLOCK_ZERO.
 *
 * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
 * host offset within the returned BDS that is allocated for the
 * corresponding raw guest data.  However, whether that offset
 * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
 *
 * DATA ZERO OFFSET_VALID
 *  t    t        t       sectors read as zero, returned file is zero at offset
 *  t    f        t       sectors read as valid from file at offset
 *  f    t        t       sectors preallocated, read as zero, returned file not
 *                        necessarily zero at offset
 *  f    f        t       sectors preallocated but read from backing_hd,
 *                        returned file contains garbage at offset
 *  t    t        f       sectors preallocated, read as zero, unknown offset
 *  t    f        f       sectors read from unknown file or offset
 *  f    t        f       not allocated or unknown offset, read as zero
 *  f    f        f       not allocated or unknown offset, read from backing_hd
 */
#define BDRV_BLOCK_DATA         0x01
#define BDRV_BLOCK_ZERO         0x02
#define BDRV_BLOCK_OFFSET_VALID 0x04
#define BDRV_BLOCK_RAW          0x08
#define BDRV_BLOCK_ALLOCATED    0x10
#define BDRV_BLOCK_EOF          0x20
#define BDRV_BLOCK_RECURSE      0x40
|
|
|
|
/*
 * BlockReopenQueue: the head type produced by QTAILQ_HEAD for elements of
 * type BlockReopenQueueEntry. Neither the macro nor the entry type is
 * defined in this header (presumably a tail-queue head from QEMU's queue
 * macros -- confirm).
 */
typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
|
|
|
|
typedef struct BDRVReopenState {
|
|
BlockDriverState *bs;
|
|
int flags;
|
|
BlockdevDetectZeroesOptions detect_zeroes;
|
|
bool backing_missing;
|
|
BlockDriverState *old_backing_bs; /* keep pointer for permissions update */
|
|
BlockDriverState *old_file_bs; /* keep pointer for permissions update */
|
|
QDict *options;
|
|
QDict *explicit_options;
|
|
void *opaque;
|
|
} BDRVReopenState;
|
|
|
|
/*
 * Block operation types
 *
 * The enumerators take the implicit values 0, 1, 2, ...; BLOCK_OP_TYPE_MAX
 * is last and therefore equals the number of operation types.
 */
typedef enum BlockOpType {
    BLOCK_OP_TYPE_BACKUP_SOURCE,
    BLOCK_OP_TYPE_BACKUP_TARGET,
    BLOCK_OP_TYPE_CHANGE,
    BLOCK_OP_TYPE_COMMIT_SOURCE,
    BLOCK_OP_TYPE_COMMIT_TARGET,
    BLOCK_OP_TYPE_DATAPLANE,
    BLOCK_OP_TYPE_DRIVE_DEL,
    BLOCK_OP_TYPE_EJECT,
    BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
    BLOCK_OP_TYPE_MIRROR_SOURCE,
    BLOCK_OP_TYPE_MIRROR_TARGET,
    BLOCK_OP_TYPE_RESIZE,
    BLOCK_OP_TYPE_STREAM,
    BLOCK_OP_TYPE_REPLACE,
    BLOCK_OP_TYPE_MAX,
} BlockOpType;
|
|
|
|
/* Block node permission constants */
enum {
    /**
     * A user that has the "permission" of consistent reads is guaranteed that
     * their view of the contents of the block device is complete and
     * self-consistent, representing the contents of a disk at a specific
     * point.
     *
     * For most block devices (including their backing files) this is true, but
     * the property cannot be maintained in a few situations like for
     * intermediate nodes of a commit block job.
     */
    BLK_PERM_CONSISTENT_READ    = 0x01,

    /** This permission is required to change the visible disk contents. */
    BLK_PERM_WRITE              = 0x02,

    /**
     * This permission (which is weaker than BLK_PERM_WRITE) is both enough and
     * required for writes to the block node when the caller promises that
     * the visible disk content doesn't change.
     *
     * As the BLK_PERM_WRITE permission is strictly stronger, either is
     * sufficient to perform an unchanging write.
     */
    BLK_PERM_WRITE_UNCHANGED    = 0x04,

    /** This permission is required to change the size of a block node. */
    BLK_PERM_RESIZE             = 0x08,

    /**
     * There was a now-removed bit BLK_PERM_GRAPH_MOD, with value of 0x10. QEMU
     * 6.1 and earlier may still lock the corresponding byte in block/file-posix
     * locking.  So, when implementing a new permission, be very careful not
     * to interfere with this old unused bit.
     */

    /** All permission bits defined above. */
    BLK_PERM_ALL                = 0x0f,

    DEFAULT_PERM_PASSTHROUGH    = BLK_PERM_CONSISTENT_READ
                                 | BLK_PERM_WRITE
                                 | BLK_PERM_WRITE_UNCHANGED
                                 | BLK_PERM_RESIZE,

    /*
     * Every bit of BLK_PERM_ALL that is not in DEFAULT_PERM_PASSTHROUGH; with
     * the values above this evaluates to 0.
     */
    DEFAULT_PERM_UNCHANGED      = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
};
|
|
|
|
/*
 * Flags that parent nodes assign to child nodes to specify what kind of
 * role(s) they take.
 *
 * At least one of DATA, METADATA, FILTERED, or COW must be set for
 * every child.
 */
enum BdrvChildRoleBits {
    /*
     * This child stores data.
     * Any node may have an arbitrary number of such children.
     */
    BDRV_CHILD_DATA         = (1 << 0),

    /*
     * This child stores metadata.
     * Any node may have an arbitrary number of metadata-storing
     * children.
     */
    BDRV_CHILD_METADATA     = (1 << 1),

    /*
     * A child that always presents exactly the same visible data as
     * the parent, e.g. by virtue of the parent forwarding all reads
     * and writes.
     * This flag is mutually exclusive with DATA, METADATA, and COW.
     * Any node may have at most one filtered child at a time.
     */
    BDRV_CHILD_FILTERED     = (1 << 2),

    /*
     * Child from which to read all data that isn't allocated in the
     * parent (i.e., the backing child); such data is copied to the
     * parent through COW (and optionally COR).
     * This field is mutually exclusive with DATA, METADATA, and
     * FILTERED.
     * Any node may have at most one such backing child at a time.
     */
    BDRV_CHILD_COW          = (1 << 3),

    /*
     * The primary child.  For most drivers, this is the child whose
     * filename applies best to the parent node.
     * Any node may have at most one primary child at a time.
     */
    BDRV_CHILD_PRIMARY      = (1 << 4),

    /* Useful combination of flags */
    BDRV_CHILD_IMAGE        = BDRV_CHILD_DATA
                              | BDRV_CHILD_METADATA
                              | BDRV_CHILD_PRIMARY,
};
|
|
|
|
/* Mask of BdrvChildRoleBits values */
typedef unsigned int BdrvChildRole;
|
|
|
|
typedef struct BdrvCheckResult {
|
|
int corruptions;
|
|
int leaks;
|
|
int check_errors;
|
|
int corruptions_fixed;
|
|
int leaks_fixed;
|
|
int64_t image_end_offset;
|
|
BlockFragInfo bfi;
|
|
} BdrvCheckResult;
|
|
|
|
/* Check-mode flags; the two values are distinct bits and can be OR-ed. */
typedef enum {
    BDRV_FIX_LEAKS    = 1,
    BDRV_FIX_ERRORS   = 2,
} BdrvCheckMode;
|
|
|
|
/*
 * Pair of 32-bit block sizes.
 * NOTE(review): "phys" and "log" presumably mean physical and logical block
 * size -- confirm against the users of this struct.
 */
typedef struct BlockSizes {
    uint32_t phys;
    uint32_t log;
} BlockSizes;
|
|
|
|
/* Disk geometry expressed as heads, sectors and cylinders counts. */
typedef struct HDGeometry {
    uint32_t heads;
    uint32_t sectors;
    uint32_t cylinders;
} HDGeometry;
|
|
|
|
/*
|
|
* Common functions that are neither I/O nor Global State.
|
|
*
|
|
* These functions must never call any function from other categories
|
|
* (I/O, "I/O or GS", Global State) except this one, but can be invoked by
|
|
* all of them.
|
|
*/
|
|
|
|
char *bdrv_perm_names(uint64_t perm);
|
|
uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm);
|
|
|
|
void bdrv_init_with_whitelist(void);
|
|
bool bdrv_uses_whitelist(void);
|
|
int bdrv_is_whitelisted(BlockDriver *drv, bool read_only);
|
|
|
|
int bdrv_parse_aio(const char *mode, int *flags);
|
|
int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
|
|
int bdrv_parse_discard_flags(const char *mode, int *flags);
|
|
|
|
int path_has_protocol(const char *path);
|
|
int path_is_absolute(const char *path);
|
|
char *path_combine(const char *base_path, const char *filename);
|
|
|
|
char *bdrv_get_full_backing_filename_from_filename(const char *backed,
|
|
const char *backing,
|
|
Error **errp);
|
|
|
|
#endif /* BLOCK_COMMON_H */
|