mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-23 16:53:58 -05:00
for-6.7/io_uring-2023-10-30
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmU/vcMQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpmnaD/4spcYjSSdeHVh3J60QuWjMYOM//E/BNb6e 3I2L6Is2RLuDGhVhHKfRfkJQy1UPKYKu5TZewUnwC3bz12kWGc8CZBF4WgM0159T 0uBm2ZtsstSCONA16tQdmE7gt5MJ6KFO0rsubm/AxNWxTnpyrbrX512TkkJTBrfC ZluAKxGviZOcrl9ROoVMc/FeMmaKVcT79mDuLp0y+Pmb2KO3y9bWTs/wpmEPNVro P7n/j9B4dBQC3Saij/wCdcsodkHUaCfCnRK3g34JKeACb+Kclg7QSzinb3TZjeEw o98l1XMiejkPJDIxYmWPTmdzqu6AUnT3Geq6eL463/PUOjgkzet6idYfk6XQgRyz AhFzA6KruMJ+IhOs974KtmDJj+7LbGkMUpW0kEqKWpXFEO2t+yG6Ue4cdC2FtsqV m/ojTTeejVqJ1RLng9IqVMT/X6sqpTtBOikNIJeWyDZQGpOOBxkG9qyoYxNQTOAr 280UwcFMgsRDQMpi9uIsc7uE7QvN/RYL9nqm49bxJTRm/sRsABPb71yWcbrHSAjh y2tprYqG0V4qK7ogCiqDt8qdq/nZS6d1mN/th33yGAHtWEStTyFKNuYmPOrzLtWb tvnmYGA7YxcpSMEPHQbYG5TlmoWoTlzUlwJ1OWGzqdlPw7USCwjFfTZVJuKm6wkR u0uTkYhn4A== =okQ8 -----END PGP SIGNATURE----- Merge tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux Pull io_uring updates from Jens Axboe: "This contains the core io_uring updates, of which there are not many, and adds support for using WAITID through io_uring and hence not needing to block on these kinds of events. 
Outside of that, tweaks to the legacy provided buffer handling and some cleanups related to cancelations for uring_cmd support"

* tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux:
  io_uring/poll: use IOU_F_TWQ_LAZY_WAKE for wakeups
  io_uring/kbuf: Use slab for struct io_buffer objects
  io_uring/kbuf: Allow the full buffer id space for provided buffers
  io_uring/kbuf: Fix check of BID wrapping in provided buffers
  io_uring/rsrc: cleanup io_pin_pages()
  io_uring: cancelable uring_cmd
  io_uring: retain top 8bits of uring_cmd flags for kernel internal use
  io_uring: add IORING_OP_WAITID support
  exit: add internal include file with helpers
  exit: add kernel_waitid_prepare() helper
  exit: move core of do_wait() into helper
  exit: abstract out should_wake helper for child_wait_callback()
  io_uring/rw: add support for IORING_OP_READ_MULTISHOT
  io_uring/rw: mark readv/writev as vectored in the opcode definition
  io_uring/rw: split io_read() into a helper
This commit is contained in:
commit
ffa059b262
19 changed files with 783 additions and 121 deletions
|
@ -20,8 +20,15 @@ enum io_uring_cmd_flags {
|
|||
IO_URING_F_SQE128 = (1 << 8),
|
||||
IO_URING_F_CQE32 = (1 << 9),
|
||||
IO_URING_F_IOPOLL = (1 << 10),
|
||||
|
||||
/* set when uring wants to cancel a previously issued command */
|
||||
IO_URING_F_CANCEL = (1 << 11),
|
||||
};
|
||||
|
||||
/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
|
||||
#define IORING_URING_CMD_CANCELABLE (1U << 30)
|
||||
#define IORING_URING_CMD_POLLED (1U << 31)
|
||||
|
||||
struct io_uring_cmd {
|
||||
struct file *file;
|
||||
const struct io_uring_sqe *sqe;
|
||||
|
@ -82,6 +89,9 @@ static inline void io_uring_free(struct task_struct *tsk)
|
|||
__io_uring_free(tsk);
|
||||
}
|
||||
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
|
||||
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
|
||||
unsigned int issue_flags);
|
||||
struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
|
||||
#else
|
||||
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
|
||||
struct iov_iter *iter, void *ioucmd)
|
||||
|
@ -122,6 +132,14 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
|
|||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
}
|
||||
static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -265,6 +265,12 @@ struct io_ring_ctx {
|
|||
*/
|
||||
struct io_wq_work_list iopoll_list;
|
||||
bool poll_multi_queue;
|
||||
|
||||
/*
|
||||
* Any cancelable uring_cmd is added to this list in
|
||||
* ->uring_cmd() by io_uring_cmd_mark_cancelable()
|
||||
*/
|
||||
struct hlist_head cancelable_uring_cmd;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
struct {
|
||||
|
@ -313,6 +319,8 @@ struct io_ring_ctx {
|
|||
struct list_head cq_overflow_list;
|
||||
struct io_hash_table cancel_table;
|
||||
|
||||
struct hlist_head waitid_list;
|
||||
|
||||
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
|
||||
struct io_sq_data *sq_data; /* if using sq thread polling */
|
||||
|
||||
|
@ -342,8 +350,6 @@ struct io_ring_ctx {
|
|||
struct wait_queue_head rsrc_quiesce_wq;
|
||||
unsigned rsrc_quiesce;
|
||||
|
||||
struct list_head io_buffers_pages;
|
||||
|
||||
#if defined(CONFIG_UNIX)
|
||||
struct socket *ring_sock;
|
||||
#endif
|
||||
|
|
|
@ -65,6 +65,7 @@ struct io_uring_sqe {
|
|||
__u32 xattr_flags;
|
||||
__u32 msg_ring_flags;
|
||||
__u32 uring_cmd_flags;
|
||||
__u32 waitid_flags;
|
||||
};
|
||||
__u64 user_data; /* data to be passed back at completion time */
|
||||
/* pack this to avoid bogus arm OABI complaints */
|
||||
|
@ -240,19 +241,20 @@ enum io_uring_op {
|
|||
IORING_OP_URING_CMD,
|
||||
IORING_OP_SEND_ZC,
|
||||
IORING_OP_SENDMSG_ZC,
|
||||
IORING_OP_READ_MULTISHOT,
|
||||
IORING_OP_WAITID,
|
||||
|
||||
/* this goes last, obviously */
|
||||
IORING_OP_LAST,
|
||||
};
|
||||
|
||||
/*
|
||||
* sqe->uring_cmd_flags
|
||||
* sqe->uring_cmd_flags top 8bits aren't available for userspace
|
||||
* IORING_URING_CMD_FIXED use registered buffer; pass this flag
|
||||
* along with setting sqe->buf_index.
|
||||
* IORING_URING_CMD_POLLED driver use only
|
||||
*/
|
||||
#define IORING_URING_CMD_FIXED (1U << 0)
|
||||
#define IORING_URING_CMD_POLLED (1U << 31)
|
||||
#define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED
|
||||
|
||||
|
||||
/*
|
||||
|
|
|
@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
|
|||
openclose.o uring_cmd.o epoll.o \
|
||||
statx.o net.o msg_ring.o timeout.o \
|
||||
sqpoll.o fdinfo.o tctx.o poll.o \
|
||||
cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
|
||||
cancel.o kbuf.o rsrc.o rw.o opdef.o \
|
||||
notif.o waitid.o
|
||||
obj-$(CONFIG_IO_WQ) += io-wq.o
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "tctx.h"
|
||||
#include "poll.h"
|
||||
#include "timeout.h"
|
||||
#include "waitid.h"
|
||||
#include "cancel.h"
|
||||
|
||||
struct io_cancel {
|
||||
|
@ -119,6 +120,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
|
|||
if (ret != -ENOENT)
|
||||
return ret;
|
||||
|
||||
ret = io_waitid_cancel(ctx, cd, issue_flags);
|
||||
if (ret != -ENOENT)
|
||||
return ret;
|
||||
|
||||
spin_lock(&ctx->completion_lock);
|
||||
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
|
||||
ret = io_timeout_cancel(ctx, cd);
|
||||
|
|
|
@ -92,6 +92,7 @@
|
|||
#include "cancel.h"
|
||||
#include "net.h"
|
||||
#include "notif.h"
|
||||
#include "waitid.h"
|
||||
|
||||
#include "timeout.h"
|
||||
#include "poll.h"
|
||||
|
@ -338,7 +339,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
|
|||
spin_lock_init(&ctx->completion_lock);
|
||||
spin_lock_init(&ctx->timeout_lock);
|
||||
INIT_WQ_LIST(&ctx->iopoll_list);
|
||||
INIT_LIST_HEAD(&ctx->io_buffers_pages);
|
||||
INIT_LIST_HEAD(&ctx->io_buffers_comp);
|
||||
INIT_LIST_HEAD(&ctx->defer_list);
|
||||
INIT_LIST_HEAD(&ctx->timeout_list);
|
||||
|
@ -348,8 +348,10 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
|
|||
INIT_LIST_HEAD(&ctx->tctx_list);
|
||||
ctx->submit_state.free_list.next = NULL;
|
||||
INIT_WQ_LIST(&ctx->locked_free_list);
|
||||
INIT_HLIST_HEAD(&ctx->waitid_list);
|
||||
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
|
||||
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
|
||||
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
|
||||
return ctx;
|
||||
err:
|
||||
kfree(ctx->cancel_table.hbs);
|
||||
|
@ -3276,6 +3278,37 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
|
||||
struct task_struct *task, bool cancel_all)
|
||||
{
|
||||
struct hlist_node *tmp;
|
||||
struct io_kiocb *req;
|
||||
bool ret = false;
|
||||
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
|
||||
hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
|
||||
hash_node) {
|
||||
struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
|
||||
struct io_uring_cmd);
|
||||
struct file *file = req->file;
|
||||
|
||||
if (!cancel_all && req->task != task)
|
||||
continue;
|
||||
|
||||
if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
|
||||
/* ->sqe isn't available if no async data */
|
||||
if (!req_has_async_data(req))
|
||||
cmd->sqe = NULL;
|
||||
file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
|
||||
ret = true;
|
||||
}
|
||||
}
|
||||
io_submit_flush_completions(ctx);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
|
||||
struct task_struct *task,
|
||||
bool cancel_all)
|
||||
|
@ -3323,6 +3356,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
|
|||
ret |= io_cancel_defer_files(ctx, task, cancel_all);
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
ret |= io_poll_remove_all(ctx, task, cancel_all);
|
||||
ret |= io_waitid_remove_all(ctx, task, cancel_all);
|
||||
ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
ret |= io_kill_timeouts(ctx, task, cancel_all);
|
||||
if (task)
|
||||
|
@ -4686,6 +4721,9 @@ static int __init io_uring_init(void)
|
|||
|
||||
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
|
||||
|
||||
/* top 8bits are for internal use */
|
||||
BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
|
||||
|
||||
io_uring_optable_init();
|
||||
|
||||
/*
|
||||
|
@ -4701,6 +4739,9 @@ static int __init io_uring_init(void)
|
|||
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
|
||||
offsetof(struct io_kiocb, cmd.data),
|
||||
sizeof_field(struct io_kiocb, cmd.data), NULL);
|
||||
io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
|
||||
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
|
||||
NULL);
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
register_sysctl_init("kernel", kernel_io_uring_disabled_table);
|
||||
|
|
|
@ -343,6 +343,7 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
|
|||
}
|
||||
|
||||
extern struct kmem_cache *req_cachep;
|
||||
extern struct kmem_cache *io_buf_cachep;
|
||||
|
||||
static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
|
||||
{
|
||||
|
|
|
@ -19,12 +19,17 @@
|
|||
|
||||
#define BGID_ARRAY 64
|
||||
|
||||
/* BIDs are addressed by a 16-bit field in a CQE */
|
||||
#define MAX_BIDS_PER_BGID (1 << 16)
|
||||
|
||||
struct kmem_cache *io_buf_cachep;
|
||||
|
||||
struct io_provide_buf {
|
||||
struct file *file;
|
||||
__u64 addr;
|
||||
__u32 len;
|
||||
__u32 bgid;
|
||||
__u16 nbufs;
|
||||
__u32 nbufs;
|
||||
__u16 bid;
|
||||
};
|
||||
|
||||
|
@ -255,6 +260,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
|
|||
void io_destroy_buffers(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_buffer_list *bl;
|
||||
struct list_head *item, *tmp;
|
||||
struct io_buffer *buf;
|
||||
unsigned long index;
|
||||
int i;
|
||||
|
||||
|
@ -270,12 +277,9 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
|
|||
kfree(bl);
|
||||
}
|
||||
|
||||
while (!list_empty(&ctx->io_buffers_pages)) {
|
||||
struct page *page;
|
||||
|
||||
page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
|
||||
list_del_init(&page->lru);
|
||||
__free_page(page);
|
||||
list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
|
||||
buf = list_entry(item, struct io_buffer, list);
|
||||
kmem_cache_free(io_buf_cachep, buf);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -289,7 +293,7 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|||
return -EINVAL;
|
||||
|
||||
tmp = READ_ONCE(sqe->fd);
|
||||
if (!tmp || tmp > USHRT_MAX)
|
||||
if (!tmp || tmp > MAX_BIDS_PER_BGID)
|
||||
return -EINVAL;
|
||||
|
||||
memset(p, 0, sizeof(*p));
|
||||
|
@ -332,7 +336,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
|
|||
return -EINVAL;
|
||||
|
||||
tmp = READ_ONCE(sqe->fd);
|
||||
if (!tmp || tmp > USHRT_MAX)
|
||||
if (!tmp || tmp > MAX_BIDS_PER_BGID)
|
||||
return -E2BIG;
|
||||
p->nbufs = tmp;
|
||||
p->addr = READ_ONCE(sqe->addr);
|
||||
|
@ -352,17 +356,18 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
|
|||
tmp = READ_ONCE(sqe->off);
|
||||
if (tmp > USHRT_MAX)
|
||||
return -E2BIG;
|
||||
if (tmp + p->nbufs >= USHRT_MAX)
|
||||
if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
|
||||
return -EINVAL;
|
||||
p->bid = tmp;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define IO_BUFFER_ALLOC_BATCH 64
|
||||
|
||||
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_buffer *buf;
|
||||
struct page *page;
|
||||
int bufs_in_page;
|
||||
struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
|
||||
int allocated;
|
||||
|
||||
/*
|
||||
* Completions that don't happen inline (eg not under uring_lock) will
|
||||
|
@ -382,22 +387,25 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
|
|||
|
||||
/*
|
||||
* No free buffers and no completion entries either. Allocate a new
|
||||
* page worth of buffer entries and add those to our freelist.
|
||||
* batch of buffer entries and add those to our freelist.
|
||||
*/
|
||||
page = alloc_page(GFP_KERNEL_ACCOUNT);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
|
||||
list_add(&page->lru, &ctx->io_buffers_pages);
|
||||
|
||||
buf = page_address(page);
|
||||
bufs_in_page = PAGE_SIZE / sizeof(*buf);
|
||||
while (bufs_in_page) {
|
||||
list_add_tail(&buf->list, &ctx->io_buffers_cache);
|
||||
buf++;
|
||||
bufs_in_page--;
|
||||
allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
|
||||
ARRAY_SIZE(bufs), (void **) bufs);
|
||||
if (unlikely(!allocated)) {
|
||||
/*
|
||||
* Bulk alloc is all-or-nothing. If we fail to get a batch,
|
||||
* retry single alloc to be on the safe side.
|
||||
*/
|
||||
bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
|
||||
if (!bufs[0])
|
||||
return -ENOMEM;
|
||||
allocated = 1;
|
||||
}
|
||||
|
||||
while (allocated)
|
||||
list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
#include "poll.h"
|
||||
#include "cancel.h"
|
||||
#include "rw.h"
|
||||
#include "waitid.h"
|
||||
|
||||
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
|
@ -63,6 +64,7 @@ const struct io_issue_def io_issue_defs[] = {
|
|||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.vectored = 1,
|
||||
.prep = io_prep_rw,
|
||||
.issue = io_read,
|
||||
},
|
||||
|
@ -76,6 +78,7 @@ const struct io_issue_def io_issue_defs[] = {
|
|||
.ioprio = 1,
|
||||
.iopoll = 1,
|
||||
.iopoll_queue = 1,
|
||||
.vectored = 1,
|
||||
.prep = io_prep_rw,
|
||||
.issue = io_write,
|
||||
},
|
||||
|
@ -428,9 +431,21 @@ const struct io_issue_def io_issue_defs[] = {
|
|||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
},
|
||||
[IORING_OP_READ_MULTISHOT] = {
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollin = 1,
|
||||
.buffer_select = 1,
|
||||
.audit_skip = 1,
|
||||
.prep = io_read_mshot_prep,
|
||||
.issue = io_read_mshot,
|
||||
},
|
||||
[IORING_OP_WAITID] = {
|
||||
.prep = io_waitid_prep,
|
||||
.issue = io_waitid,
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
const struct io_cold_def io_cold_defs[] = {
|
||||
[IORING_OP_NOP] = {
|
||||
.name = "NOP",
|
||||
|
@ -648,6 +663,13 @@ const struct io_cold_def io_cold_defs[] = {
|
|||
.fail = io_sendrecv_fail,
|
||||
#endif
|
||||
},
|
||||
[IORING_OP_READ_MULTISHOT] = {
|
||||
.name = "READ_MULTISHOT",
|
||||
},
|
||||
[IORING_OP_WAITID] = {
|
||||
.name = "WAITID",
|
||||
.async_size = sizeof(struct io_waitid_async),
|
||||
},
|
||||
};
|
||||
|
||||
const char *io_uring_get_opcode(u8 opcode)
|
||||
|
|
|
@ -29,6 +29,8 @@ struct io_issue_def {
|
|||
unsigned iopoll_queue : 1;
|
||||
/* opcode specific path will handle ->async_data allocation if needed */
|
||||
unsigned manual_alloc : 1;
|
||||
/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
|
||||
unsigned vectored : 1;
|
||||
|
||||
int (*issue)(struct io_kiocb *, unsigned int);
|
||||
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
|
||||
|
|
|
@ -370,7 +370,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask)
|
|||
req->io_task_work.func = io_poll_task_func;
|
||||
|
||||
trace_io_uring_task_add(req, mask);
|
||||
io_req_task_work_add(req);
|
||||
__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
|
||||
}
|
||||
|
||||
static inline void io_poll_execute(struct io_kiocb *req, int res)
|
||||
|
|
|
@ -1037,39 +1037,36 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
|
|||
{
|
||||
unsigned long start, end, nr_pages;
|
||||
struct page **pages = NULL;
|
||||
int pret, ret = -ENOMEM;
|
||||
int ret;
|
||||
|
||||
end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
start = ubuf >> PAGE_SHIFT;
|
||||
nr_pages = end - start;
|
||||
WARN_ON(!nr_pages);
|
||||
|
||||
pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
|
||||
if (!pages)
|
||||
goto done;
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
ret = 0;
|
||||
mmap_read_lock(current->mm);
|
||||
pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
|
||||
pages);
|
||||
if (pret == nr_pages)
|
||||
*npages = nr_pages;
|
||||
else
|
||||
ret = pret < 0 ? pret : -EFAULT;
|
||||
|
||||
ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages);
|
||||
mmap_read_unlock(current->mm);
|
||||
if (ret) {
|
||||
|
||||
/* success, mapped all pages */
|
||||
if (ret == nr_pages) {
|
||||
*npages = nr_pages;
|
||||
return pages;
|
||||
}
|
||||
|
||||
/* partial map, or didn't map anything */
|
||||
if (ret >= 0) {
|
||||
/* if we did partial map, release any pages we did get */
|
||||
if (pret > 0)
|
||||
unpin_user_pages(pages, pret);
|
||||
goto done;
|
||||
if (ret)
|
||||
unpin_user_pages(pages, ret);
|
||||
ret = -EFAULT;
|
||||
}
|
||||
ret = 0;
|
||||
done:
|
||||
if (ret < 0) {
|
||||
kvfree(pages);
|
||||
pages = ERR_PTR(ret);
|
||||
}
|
||||
return pages;
|
||||
kvfree(pages);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
|
||||
|
|
|
@ -123,6 +123,22 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Multishot read is prepared just like a normal read/write request, only
|
||||
* difference is that we set the MULTISHOT flag.
|
||||
*/
|
||||
int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = io_prep_rw(req, sqe);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
req->flags |= REQ_F_APOLL_MULTISHOT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void io_readv_writev_cleanup(struct io_kiocb *req)
|
||||
{
|
||||
struct io_async_rw *io = req->async_data;
|
||||
|
@ -388,8 +404,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
|
|||
buf = u64_to_user_ptr(rw->addr);
|
||||
sqe_len = rw->len;
|
||||
|
||||
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
|
||||
(req->flags & REQ_F_BUFFER_SELECT)) {
|
||||
if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) {
|
||||
if (io_do_buffer_select(req)) {
|
||||
buf = io_buffer_select(req, &sqe_len, issue_flags);
|
||||
if (!buf)
|
||||
|
@ -708,7 +723,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
|
|||
return 0;
|
||||
}
|
||||
|
||||
int io_read(struct io_kiocb *req, unsigned int issue_flags)
|
||||
static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
||||
struct io_rw_state __s, *s = &__s;
|
||||
|
@ -776,8 +791,11 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
|
|||
|
||||
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
|
||||
req->flags &= ~REQ_F_REISSUE;
|
||||
/* if we can poll, just do that */
|
||||
if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
|
||||
/*
|
||||
* If we can poll, just do that. For a vectored read, we'll
|
||||
* need to copy state first.
|
||||
*/
|
||||
if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored)
|
||||
return -EAGAIN;
|
||||
/* IOPOLL retry should happen for io-wq threads */
|
||||
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
|
||||
|
@ -853,7 +871,69 @@ done:
|
|||
/* it's faster to check here than delegate to kfree */
|
||||
if (iovec)
|
||||
kfree(iovec);
|
||||
return kiocb_done(req, ret, issue_flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_read(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = __io_read(req, issue_flags);
|
||||
if (ret >= 0)
|
||||
return kiocb_done(req, ret, issue_flags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
unsigned int cflags = 0;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Multishot MUST be used on a pollable file
|
||||
*/
|
||||
if (!file_can_poll(req->file))
|
||||
return -EBADFD;
|
||||
|
||||
ret = __io_read(req, issue_flags);
|
||||
|
||||
/*
|
||||
* If we get -EAGAIN, recycle our buffer and just let normal poll
|
||||
* handling arm it.
|
||||
*/
|
||||
if (ret == -EAGAIN) {
|
||||
io_kbuf_recycle(req, issue_flags);
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
/*
|
||||
* Any successful return value will keep the multishot read armed.
|
||||
*/
|
||||
if (ret > 0) {
|
||||
/*
|
||||
* Put our buffer and post a CQE. If we fail to post a CQE, then
|
||||
* jump to the termination path. This request is then done.
|
||||
*/
|
||||
cflags = io_put_kbuf(req, issue_flags);
|
||||
|
||||
if (io_fill_cqe_req_aux(req,
|
||||
issue_flags & IO_URING_F_COMPLETE_DEFER,
|
||||
ret, cflags | IORING_CQE_F_MORE)) {
|
||||
if (issue_flags & IO_URING_F_MULTISHOT)
|
||||
return IOU_ISSUE_SKIP_COMPLETE;
|
||||
return -EAGAIN;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Either an error, or we've hit overflow posting the CQE. For any
|
||||
* multishot request, hitting overflow will terminate it.
|
||||
*/
|
||||
io_req_set_res(req, ret, cflags);
|
||||
if (issue_flags & IO_URING_F_MULTISHOT)
|
||||
return IOU_STOP_MULTISHOT;
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
int io_write(struct io_kiocb *req, unsigned int issue_flags)
|
||||
|
|
|
@ -23,3 +23,5 @@ int io_writev_prep_async(struct io_kiocb *req);
|
|||
void io_readv_writev_cleanup(struct io_kiocb *req);
|
||||
void io_rw_fail(struct io_kiocb *req);
|
||||
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);
|
||||
int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
|
|
@ -13,6 +13,51 @@
|
|||
#include "rsrc.h"
|
||||
#include "uring_cmd.h"
|
||||
|
||||
static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
struct io_kiocb *req = cmd_to_io_kiocb(cmd);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE))
|
||||
return;
|
||||
|
||||
cmd->flags &= ~IORING_URING_CMD_CANCELABLE;
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
hlist_del(&req->hash_node);
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark this command as cancelable, then io_uring_try_cancel_uring_cmd()
|
||||
* will try to cancel this issued command by sending ->uring_cmd() with
|
||||
* issue_flags of IO_URING_F_CANCEL.
|
||||
*
|
||||
* The command is guaranteed to not be done when calling ->uring_cmd()
|
||||
* with IO_URING_F_CANCEL, but it is driver's responsibility to deal
|
||||
* with race between io_uring canceling and normal completion.
|
||||
*/
|
||||
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
struct io_kiocb *req = cmd_to_io_kiocb(cmd);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
|
||||
cmd->flags |= IORING_URING_CMD_CANCELABLE;
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd);
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
|
||||
|
||||
struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
|
||||
{
|
||||
return cmd_to_io_kiocb(cmd)->task;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(io_uring_cmd_get_task);
|
||||
|
||||
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
|
||||
{
|
||||
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
||||
|
@ -56,6 +101,8 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
|
|||
{
|
||||
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
|
||||
|
||||
io_uring_cmd_del_cancelable(ioucmd, issue_flags);
|
||||
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
|
||||
|
@ -91,7 +138,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|||
return -EINVAL;
|
||||
|
||||
ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags);
|
||||
if (ioucmd->flags & ~IORING_URING_CMD_FIXED)
|
||||
if (ioucmd->flags & ~IORING_URING_CMD_MASK)
|
||||
return -EINVAL;
|
||||
|
||||
if (ioucmd->flags & IORING_URING_CMD_FIXED) {
|
||||
|
|
372
io_uring/waitid.c
Normal file
372
io_uring/waitid.c
Normal file
|
@ -0,0 +1,372 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Support for async notification of waitid
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/compat.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "cancel.h"
|
||||
#include "waitid.h"
|
||||
#include "../kernel/exit.h"
|
||||
|
||||
static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts);
|
||||
|
||||
#define IO_WAITID_CANCEL_FLAG BIT(31)
|
||||
#define IO_WAITID_REF_MASK GENMASK(30, 0)
|
||||
|
||||
struct io_waitid {
|
||||
struct file *file;
|
||||
int which;
|
||||
pid_t upid;
|
||||
int options;
|
||||
atomic_t refs;
|
||||
struct wait_queue_head *head;
|
||||
struct siginfo __user *infop;
|
||||
struct waitid_info info;
|
||||
};
|
||||
|
||||
static void io_waitid_free(struct io_kiocb *req)
|
||||
{
|
||||
struct io_waitid_async *iwa = req->async_data;
|
||||
|
||||
put_pid(iwa->wo.wo_pid);
|
||||
kfree(req->async_data);
|
||||
req->async_data = NULL;
|
||||
req->flags &= ~REQ_F_ASYNC_DATA;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_COMPAT
|
||||
static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo)
|
||||
{
|
||||
struct compat_siginfo __user *infop;
|
||||
bool ret;
|
||||
|
||||
infop = (struct compat_siginfo __user *) iw->infop;
|
||||
|
||||
if (!user_write_access_begin(infop, sizeof(*infop)))
|
||||
return false;
|
||||
|
||||
unsafe_put_user(signo, &infop->si_signo, Efault);
|
||||
unsafe_put_user(0, &infop->si_errno, Efault);
|
||||
unsafe_put_user(iw->info.cause, &infop->si_code, Efault);
|
||||
unsafe_put_user(iw->info.pid, &infop->si_pid, Efault);
|
||||
unsafe_put_user(iw->info.uid, &infop->si_uid, Efault);
|
||||
unsafe_put_user(iw->info.status, &infop->si_status, Efault);
|
||||
ret = true;
|
||||
done:
|
||||
user_write_access_end();
|
||||
return ret;
|
||||
Efault:
|
||||
ret = false;
|
||||
goto done;
|
||||
}
|
||||
#endif
|
||||
|
||||
static bool io_waitid_copy_si(struct io_kiocb *req, int signo)
|
||||
{
|
||||
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
|
||||
bool ret;
|
||||
|
||||
if (!iw->infop)
|
||||
return true;
|
||||
|
||||
#ifdef CONFIG_COMPAT
|
||||
if (req->ctx->compat)
|
||||
return io_waitid_compat_copy_si(iw, signo);
|
||||
#endif
|
||||
|
||||
if (!user_write_access_begin(iw->infop, sizeof(*iw->infop)))
|
||||
return false;
|
||||
|
||||
unsafe_put_user(signo, &iw->infop->si_signo, Efault);
|
||||
unsafe_put_user(0, &iw->infop->si_errno, Efault);
|
||||
unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault);
|
||||
unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault);
|
||||
unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault);
|
||||
unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault);
|
||||
ret = true;
|
||||
done:
|
||||
user_write_access_end();
|
||||
return ret;
|
||||
Efault:
|
||||
ret = false;
|
||||
goto done;
|
||||
}
|
||||
|
||||
static int io_waitid_finish(struct io_kiocb *req, int ret)
|
||||
{
|
||||
int signo = 0;
|
||||
|
||||
if (ret > 0) {
|
||||
signo = SIGCHLD;
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
if (!io_waitid_copy_si(req, signo))
|
||||
ret = -EFAULT;
|
||||
io_waitid_free(req);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void io_waitid_complete(struct io_kiocb *req, int ret)
|
||||
{
|
||||
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
|
||||
struct io_tw_state ts = { .locked = true };
|
||||
|
||||
/* anyone completing better be holding a reference */
|
||||
WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK));
|
||||
|
||||
lockdep_assert_held(&req->ctx->uring_lock);
|
||||
|
||||
/*
|
||||
* Did cancel find it meanwhile?
|
||||
*/
|
||||
if (hlist_unhashed(&req->hash_node))
|
||||
return;
|
||||
|
||||
hlist_del_init(&req->hash_node);
|
||||
|
||||
ret = io_waitid_finish(req, ret);
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
io_req_set_res(req, ret, 0);
|
||||
io_req_task_complete(req, &ts);
|
||||
}
|
||||
|
||||
static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
|
||||
{
|
||||
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
|
||||
struct io_waitid_async *iwa = req->async_data;
|
||||
|
||||
/*
|
||||
* Mark us canceled regardless of ownership. This will prevent a
|
||||
* potential retry from a spurious wakeup.
|
||||
*/
|
||||
atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs);
|
||||
|
||||
/* claim ownership */
|
||||
if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
|
||||
return false;
|
||||
|
||||
spin_lock_irq(&iw->head->lock);
|
||||
list_del_init(&iwa->wo.child_wait.entry);
|
||||
spin_unlock_irq(&iw->head->lock);
|
||||
io_waitid_complete(req, -ECANCELED);
|
||||
return true;
|
||||
}
|
||||
|
||||
int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
struct hlist_node *tmp;
|
||||
struct io_kiocb *req;
|
||||
int nr = 0;
|
||||
|
||||
if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
|
||||
return -ENOENT;
|
||||
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
|
||||
if (req->cqe.user_data != cd->data &&
|
||||
!(cd->flags & IORING_ASYNC_CANCEL_ANY))
|
||||
continue;
|
||||
if (__io_waitid_cancel(ctx, req))
|
||||
nr++;
|
||||
if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
|
||||
break;
|
||||
}
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
|
||||
if (nr)
|
||||
return nr;
|
||||
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
/*
 * Cancel every pending waitid request on @ctx that io_match_task_safe()
 * accepts for @task / @cancel_all. The caller must hold ctx->uring_lock.
 *
 * Returns true if at least one request matched, even if canceling it did
 * not take ownership of the request.
 */
bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
			  bool cancel_all)
{
	struct hlist_node *next;
	struct io_kiocb *req;
	bool matched = false;

	lockdep_assert_held(&ctx->uring_lock);

	hlist_for_each_entry_safe(req, next, &ctx->waitid_list, hash_node) {
		if (io_match_task_safe(req, task, cancel_all)) {
			__io_waitid_cancel(ctx, req);
			matched = true;
		}
	}

	return matched;
}
|
||||
|
||||
static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req)
|
||||
{
|
||||
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
|
||||
struct io_waitid_async *iwa = req->async_data;
|
||||
|
||||
if (!atomic_sub_return(1, &iw->refs))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Wakeup triggered, racing with us. It was prevented from
|
||||
* completing because of that, queue up the tw to do that.
|
||||
*/
|
||||
req->io_task_work.func = io_waitid_cb;
|
||||
io_req_task_work_add(req);
|
||||
remove_wait_queue(iw->head, &iwa->wo.child_wait);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts)
|
||||
{
|
||||
struct io_waitid_async *iwa = req->async_data;
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
int ret;
|
||||
|
||||
io_tw_lock(ctx, ts);
|
||||
|
||||
ret = __do_wait(&iwa->wo);
|
||||
|
||||
/*
|
||||
* If we get -ERESTARTSYS here, we need to re-arm and check again
|
||||
* to ensure we get another callback. If the retry works, then we can
|
||||
* just remove ourselves from the waitqueue again and finish the
|
||||
* request.
|
||||
*/
|
||||
if (unlikely(ret == -ERESTARTSYS)) {
|
||||
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
|
||||
|
||||
/* Don't retry if cancel found it meanwhile */
|
||||
ret = -ECANCELED;
|
||||
if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) {
|
||||
iw->head = ¤t->signal->wait_chldexit;
|
||||
add_wait_queue(iw->head, &iwa->wo.child_wait);
|
||||
ret = __do_wait(&iwa->wo);
|
||||
if (ret == -ERESTARTSYS) {
|
||||
/* retry armed, drop our ref */
|
||||
io_waitid_drop_issue_ref(req);
|
||||
return;
|
||||
}
|
||||
|
||||
remove_wait_queue(iw->head, &iwa->wo.child_wait);
|
||||
}
|
||||
}
|
||||
|
||||
io_waitid_complete(req, ret);
|
||||
}
|
||||
|
||||
static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
|
||||
int sync, void *key)
|
||||
{
|
||||
struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait);
|
||||
struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo);
|
||||
struct io_kiocb *req = iwa->req;
|
||||
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
|
||||
struct task_struct *p = key;
|
||||
|
||||
if (!pid_child_should_wake(wo, p))
|
||||
return 0;
|
||||
|
||||
/* cancel is in progress */
|
||||
if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
|
||||
return 1;
|
||||
|
||||
req->io_task_work.func = io_waitid_cb;
|
||||
io_req_task_work_add(req);
|
||||
list_del_init(&wait->entry);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
|
||||
|
||||
if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags)
|
||||
return -EINVAL;
|
||||
|
||||
iw->which = READ_ONCE(sqe->len);
|
||||
iw->upid = READ_ONCE(sqe->fd);
|
||||
iw->options = READ_ONCE(sqe->file_index);
|
||||
iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2));
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_waitid_async *iwa;
|
||||
int ret;
|
||||
|
||||
if (io_alloc_async_data(req))
|
||||
return -ENOMEM;
|
||||
|
||||
iwa = req->async_data;
|
||||
iwa->req = req;
|
||||
|
||||
ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,
|
||||
iw->options, NULL);
|
||||
if (ret)
|
||||
goto done;
|
||||
|
||||
/*
|
||||
* Mark the request as busy upfront, in case we're racing with the
|
||||
* wakeup. If we are, then we'll notice when we drop this initial
|
||||
* reference again after arming.
|
||||
*/
|
||||
atomic_set(&iw->refs, 1);
|
||||
|
||||
/*
|
||||
* Cancel must hold the ctx lock, so there's no risk of cancelation
|
||||
* finding us until a) we remain on the list, and b) the lock is
|
||||
* dropped. We only need to worry about racing with the wakeup
|
||||
* callback.
|
||||
*/
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
hlist_add_head(&req->hash_node, &ctx->waitid_list);
|
||||
|
||||
init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait);
|
||||
iwa->wo.child_wait.private = req->task;
|
||||
iw->head = ¤t->signal->wait_chldexit;
|
||||
add_wait_queue(iw->head, &iwa->wo.child_wait);
|
||||
|
||||
ret = __do_wait(&iwa->wo);
|
||||
if (ret == -ERESTARTSYS) {
|
||||
/*
|
||||
* Nobody else grabbed a reference, it'll complete when we get
|
||||
* a waitqueue callback, or if someone cancels it.
|
||||
*/
|
||||
if (!io_waitid_drop_issue_ref(req)) {
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
return IOU_ISSUE_SKIP_COMPLETE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Wakeup triggered, racing with us. It was prevented from
|
||||
* completing because of that, queue up the tw to do that.
|
||||
*/
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
return IOU_ISSUE_SKIP_COMPLETE;
|
||||
}
|
||||
|
||||
hlist_del_init(&req->hash_node);
|
||||
remove_wait_queue(iw->head, &iwa->wo.child_wait);
|
||||
ret = io_waitid_finish(req, ret);
|
||||
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
done:
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
15
io_uring/waitid.h
Normal file
15
io_uring/waitid.h
Normal file
|
@ -0,0 +1,15 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "../kernel/exit.h"
|
||||
|
||||
struct io_waitid_async {
|
||||
struct io_kiocb *req;
|
||||
struct wait_opts wo;
|
||||
};
|
||||
|
||||
int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_waitid(struct io_kiocb *req, unsigned int issue_flags);
|
||||
int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
|
||||
unsigned int issue_flags);
|
||||
bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
|
||||
bool cancel_all);
|
131
kernel/exit.c
131
kernel/exit.c
|
@ -74,6 +74,8 @@
|
|||
#include <asm/unistd.h>
|
||||
#include <asm/mmu_context.h>
|
||||
|
||||
#include "exit.h"
|
||||
|
||||
/*
|
||||
* The default value should be high enough to not crash a system that randomly
|
||||
* crashes its kernel from time to time, but low enough to at least not permit
|
||||
|
@ -1037,26 +1039,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
|
|||
return 0;
|
||||
}
|
||||
|
||||
struct waitid_info {
|
||||
pid_t pid;
|
||||
uid_t uid;
|
||||
int status;
|
||||
int cause;
|
||||
};
|
||||
|
||||
struct wait_opts {
|
||||
enum pid_type wo_type;
|
||||
int wo_flags;
|
||||
struct pid *wo_pid;
|
||||
|
||||
struct waitid_info *wo_info;
|
||||
int wo_stat;
|
||||
struct rusage *wo_rusage;
|
||||
|
||||
wait_queue_entry_t child_wait;
|
||||
int notask_error;
|
||||
};
|
||||
|
||||
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
|
||||
{
|
||||
return wo->wo_type == PIDTYPE_MAX ||
|
||||
|
@ -1520,6 +1502,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
|
|||
return 0;
|
||||
}
|
||||
|
||||
bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
|
||||
{
|
||||
if (!eligible_pid(wo, p))
|
||||
return false;
|
||||
|
||||
if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
|
||||
int sync, void *key)
|
||||
{
|
||||
|
@ -1527,13 +1520,10 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
|
|||
child_wait);
|
||||
struct task_struct *p = key;
|
||||
|
||||
if (!eligible_pid(wo, p))
|
||||
return 0;
|
||||
if (pid_child_should_wake(wo, p))
|
||||
return default_wake_function(wait, mode, sync, key);
|
||||
|
||||
if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
|
||||
return 0;
|
||||
|
||||
return default_wake_function(wait, mode, sync, key);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
|
||||
|
@ -1582,16 +1572,10 @@ static int do_wait_pid(struct wait_opts *wo)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static long do_wait(struct wait_opts *wo)
|
||||
long __do_wait(struct wait_opts *wo)
|
||||
{
|
||||
int retval;
|
||||
long retval;
|
||||
|
||||
trace_sched_process_wait(wo->wo_pid);
|
||||
|
||||
init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
|
||||
wo->child_wait.private = current;
|
||||
add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
|
||||
repeat:
|
||||
/*
|
||||
* If there is nothing that can match our criteria, just get out.
|
||||
* We will clear ->notask_error to zero if we see any child that
|
||||
|
@ -1603,24 +1587,23 @@ repeat:
|
|||
(!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
|
||||
goto notask;
|
||||
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
read_lock(&tasklist_lock);
|
||||
|
||||
if (wo->wo_type == PIDTYPE_PID) {
|
||||
retval = do_wait_pid(wo);
|
||||
if (retval)
|
||||
goto end;
|
||||
return retval;
|
||||
} else {
|
||||
struct task_struct *tsk = current;
|
||||
|
||||
do {
|
||||
retval = do_wait_thread(wo, tsk);
|
||||
if (retval)
|
||||
goto end;
|
||||
return retval;
|
||||
|
||||
retval = ptrace_do_wait(wo, tsk);
|
||||
if (retval)
|
||||
goto end;
|
||||
return retval;
|
||||
|
||||
if (wo->wo_flags & __WNOTHREAD)
|
||||
break;
|
||||
|
@ -1630,27 +1613,44 @@ repeat:
|
|||
|
||||
notask:
|
||||
retval = wo->notask_error;
|
||||
if (!retval && !(wo->wo_flags & WNOHANG)) {
|
||||
retval = -ERESTARTSYS;
|
||||
if (!signal_pending(current)) {
|
||||
schedule();
|
||||
goto repeat;
|
||||
}
|
||||
}
|
||||
end:
|
||||
if (!retval && !(wo->wo_flags & WNOHANG))
|
||||
return -ERESTARTSYS;
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
static long do_wait(struct wait_opts *wo)
|
||||
{
|
||||
int retval;
|
||||
|
||||
trace_sched_process_wait(wo->wo_pid);
|
||||
|
||||
init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
|
||||
wo->child_wait.private = current;
|
||||
add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait);
|
||||
|
||||
do {
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
retval = __do_wait(wo);
|
||||
if (retval != -ERESTARTSYS)
|
||||
break;
|
||||
if (signal_pending(current))
|
||||
break;
|
||||
schedule();
|
||||
} while (1);
|
||||
|
||||
__set_current_state(TASK_RUNNING);
|
||||
remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait);
|
||||
return retval;
|
||||
}
|
||||
|
||||
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
|
||||
int options, struct rusage *ru)
|
||||
int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
|
||||
struct waitid_info *infop, int options,
|
||||
struct rusage *ru)
|
||||
{
|
||||
struct wait_opts wo;
|
||||
unsigned int f_flags = 0;
|
||||
struct pid *pid = NULL;
|
||||
enum pid_type type;
|
||||
long ret;
|
||||
unsigned int f_flags = 0;
|
||||
|
||||
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
|
||||
__WNOTHREAD|__WCLONE|__WALL))
|
||||
|
@ -1693,19 +1693,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
wo.wo_type = type;
|
||||
wo.wo_pid = pid;
|
||||
wo.wo_flags = options;
|
||||
wo.wo_info = infop;
|
||||
wo.wo_rusage = ru;
|
||||
wo->wo_type = type;
|
||||
wo->wo_pid = pid;
|
||||
wo->wo_flags = options;
|
||||
wo->wo_info = infop;
|
||||
wo->wo_rusage = ru;
|
||||
if (f_flags & O_NONBLOCK)
|
||||
wo.wo_flags |= WNOHANG;
|
||||
wo->wo_flags |= WNOHANG;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
|
||||
int options, struct rusage *ru)
|
||||
{
|
||||
struct wait_opts wo;
|
||||
long ret;
|
||||
|
||||
ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = do_wait(&wo);
|
||||
if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
|
||||
if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
|
||||
ret = -EAGAIN;
|
||||
|
||||
put_pid(pid);
|
||||
put_pid(wo.wo_pid);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
30
kernel/exit.h
Normal file
30
kernel/exit.h
Normal file
|
@ -0,0 +1,30 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
#ifndef LINUX_WAITID_H
|
||||
#define LINUX_WAITID_H
|
||||
|
||||
/*
 * Result record for a waitid-style wait. A pointer to one is passed as
 * @infop to kernel_waitid_prepare() and stored in wait_opts.wo_info.
 */
struct waitid_info {
	pid_t pid;
	uid_t uid;
	int status;
	int cause;
};
|
||||
|
||||
struct wait_opts {
|
||||
enum pid_type wo_type;
|
||||
int wo_flags;
|
||||
struct pid *wo_pid;
|
||||
|
||||
struct waitid_info *wo_info;
|
||||
int wo_stat;
|
||||
struct rusage *wo_rusage;
|
||||
|
||||
wait_queue_entry_t child_wait;
|
||||
int notask_error;
|
||||
};
|
||||
|
||||
bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p);
|
||||
long __do_wait(struct wait_opts *wo);
|
||||
int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
|
||||
struct waitid_info *infop, int options,
|
||||
struct rusage *ru);
|
||||
#endif
|
Loading…
Add table
Reference in a new issue