mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-24 09:13:20 -05:00
iomap: support IOCB_DIO_CALLER_COMP
If IOCB_DIO_CALLER_COMP is set, utilize that to set kiocb->dio_complete
handler and data for that callback. Rather than punt the completion to a
workqueue, we pass back the handler and data to the issuer and will get
a callback from a safe task context.

Using the following fio job to randomly dio write 4k blocks at
queue depths of 1..16:

fio --name=dio-write --filename=/data1/file --time_based=1 \
    --runtime=10 --bs=4096 --rw=randwrite --norandommap --buffered=0 \
    --cpus_allowed=4 --ioengine=io_uring --iodepth=$depth

shows the following results before and after this patch:

        Stock   Patched   Diff
=======================================
QD1     155K    162K      + 4.5%
QD2     290K    313K      + 7.9%
QD4     533K    597K      +12.0%
QD8     604K    827K      +36.9%
QD16    615K    845K      +37.4%

which shows nice wins all around. If we factored in per-IOP efficiency,
the wins look even nicer. This becomes apparent as queue depth rises,
as the offloaded workqueue completions run out of steam.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent
099ada2c87
commit
8c052fb300
1 changed files with 60 additions and 2 deletions
|
@ -20,6 +20,7 @@
|
|||
* Private flags for iomap_dio, must not overlap with the public ones in
|
||||
* iomap.h:
|
||||
*/
|
||||
#define IOMAP_DIO_CALLER_COMP (1U << 26)
|
||||
#define IOMAP_DIO_INLINE_COMP (1U << 27)
|
||||
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
|
||||
#define IOMAP_DIO_NEED_SYNC (1U << 29)
|
||||
|
@ -132,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_dio_complete);
|
||||
|
||||
static ssize_t iomap_dio_deferred_complete(void *data)
|
||||
{
|
||||
return iomap_dio_complete(data);
|
||||
}
|
||||
|
||||
static void iomap_dio_complete_work(struct work_struct *work)
|
||||
{
|
||||
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
|
||||
|
@ -182,6 +188,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
|
|||
goto release_bio;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
|
||||
* our completion that way to avoid an async punt to a workqueue.
|
||||
*/
|
||||
if (dio->flags & IOMAP_DIO_CALLER_COMP) {
|
||||
/* only polled IO cares about private cleared */
|
||||
iocb->private = dio;
|
||||
iocb->dio_complete = iomap_dio_deferred_complete;
|
||||
|
||||
/*
|
||||
* Invoke ->ki_complete() directly. We've assigned our
|
||||
* dio_complete callback handler, and since the issuer set
|
||||
* IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
|
||||
* notice ->dio_complete being set and will defer calling that
|
||||
* handler until it can be done from a safe task context.
|
||||
*
|
||||
* Note that the 'res' being passed in here is not important
|
||||
* for this case. The actual completion value of the request
|
||||
* will be gotten from dio_complete when that is run by the
|
||||
* issuer.
|
||||
*/
|
||||
iocb->ki_complete(iocb, 0);
|
||||
goto release_bio;
|
||||
}
|
||||
|
||||
/*
|
||||
* Async DIO completion that requires filesystem level completion work
|
||||
* gets punted to a work queue to complete as the operation may require
|
||||
|
@ -278,12 +309,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
|
|||
* after IO completion such as unwritten extent conversion) and
|
||||
* the underlying device either supports FUA or doesn't have
|
||||
* a volatile write cache. This allows us to avoid cache flushes
|
||||
* on IO completion.
|
||||
* on IO completion. If we can't use writethrough and need to
|
||||
* sync, disable in-task completions as dio completion will
|
||||
* need to call generic_write_sync() which will do a blocking
|
||||
* fsync / cache flush call.
|
||||
*/
|
||||
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
|
||||
(dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
|
||||
(bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
|
||||
use_fua = true;
|
||||
else if (dio->flags & IOMAP_DIO_NEED_SYNC)
|
||||
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -298,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
|
|||
goto out;
|
||||
|
||||
/*
|
||||
* We can only poll for single bio I/Os.
|
||||
* We can only do deferred completion for pure overwrites that
|
||||
* don't require additional IO at completion. This rules out
|
||||
* writes that need zeroing or extent conversion, extend
|
||||
* the file size, or issue journal IO or cache flushes
|
||||
* during completion processing.
|
||||
*/
|
||||
if (need_zeroout ||
|
||||
((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
|
||||
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
|
||||
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
|
||||
|
||||
/*
|
||||
* The rules for polled IO completions follow the guidelines as the
|
||||
* ones we set for inline and deferred completions. If none of those
|
||||
* are available for this IO, clear the polled flag.
|
||||
*/
|
||||
if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
|
||||
dio->iocb->ki_flags &= ~IOCB_HIPRI;
|
||||
|
||||
if (need_zeroout) {
|
||||
|
@ -547,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
|||
iomi.flags |= IOMAP_WRITE;
|
||||
dio->flags |= IOMAP_DIO_WRITE;
|
||||
|
||||
/*
|
||||
* Flag as supporting deferred completions, if the issuer
|
||||
* groks it. This can avoid a workqueue punt for writes.
|
||||
* We may later clear this flag if we need to do other IO
|
||||
* as part of this IO completion.
|
||||
*/
|
||||
if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
|
||||
dio->flags |= IOMAP_DIO_CALLER_COMP;
|
||||
|
||||
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
|
||||
ret = -EAGAIN;
|
||||
if (iomi.pos >= dio->i_size ||
|
||||
|
|
Loading…
Add table
Reference in a new issue