diff --git a/Documentation/filesystems/fiemap.rst b/Documentation/filesystems/fiemap.rst index 93fc96f760aa..23b3ed229e49 100644 --- a/Documentation/filesystems/fiemap.rst +++ b/Documentation/filesystems/fiemap.rst @@ -12,21 +12,10 @@ returns a list of extents. Request Basics -------------- -A fiemap request is encoded within struct fiemap:: - - struct fiemap { - __u64 fm_start; /* logical offset (inclusive) at - * which to start mapping (in) */ - __u64 fm_length; /* logical length of mapping which - * userspace cares about (in) */ - __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ - __u32 fm_mapped_extents; /* number of extents that were - * mapped (out) */ - __u32 fm_extent_count; /* size of fm_extents array (in) */ - __u32 fm_reserved; - struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ - }; +A fiemap request is encoded within struct fiemap: +.. kernel-doc:: include/uapi/linux/fiemap.h + :identifiers: fiemap fm_start, and fm_length specify the logical range within the file which the process would like mappings for. Extents returned mirror @@ -60,6 +49,8 @@ FIEMAP_FLAG_XATTR If this flag is set, the extents returned will describe the inodes extended attribute lookup tree, instead of its data tree. +FIEMAP_FLAG_CACHE + This flag requests caching of the extents. Extent Mapping -------------- @@ -77,18 +68,10 @@ complete the requested range and will not have the FIEMAP_EXTENT_LAST flag set (see the next section on extent flags). Each extent is described by a single fiemap_extent structure as -returned in fm_extents:: +returned in fm_extents: - struct fiemap_extent { - __u64 fe_logical; /* logical offset in bytes for the start of - * the extent */ - __u64 fe_physical; /* physical offset in bytes for the start - * of the extent */ - __u64 fe_length; /* length in bytes for the extent */ - __u64 fe_reserved64[2]; - __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ - __u32 fe_reserved[3]; - }; +.. kernel-doc:: include/uapi/linux/fiemap.h + :identifiers: fiemap_extent All offsets and lengths are in bytes and mirror those on disk. It is valid for an extents logical offset to start before the request or its logical @@ -175,6 +158,8 @@ FIEMAP_EXTENT_MERGED userspace would be highly inefficient, the kernel will try to merge most adjacent blocks into 'extents'. +FIEMAP_EXTENT_SHARED + This flag is set to request that space be shared with other files. VFS -> File System Implementation --------------------------------- @@ -191,14 +176,10 @@ each discovered extent:: u64 len); ->fiemap is passed struct fiemap_extent_info which describes the -fiemap request:: +fiemap request: - struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */ - }; +.. kernel-doc:: include/linux/fiemap.h + :identifiers: fiemap_extent_info It is intended that the file system should not need to access any of this structure directly. Filesystem handlers should be tolerant to signals and return diff --git a/drivers/virt/vboxguest/Kconfig b/drivers/virt/vboxguest/Kconfig index cc329887bfae..11b153e7454e 100644 --- a/drivers/virt/vboxguest/Kconfig +++ b/drivers/virt/vboxguest/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config VBOXGUEST tristate "Virtual Box Guest integration support" - depends on X86 && PCI && INPUT + depends on (ARM64 || X86) && PCI && INPUT help This is a driver for the Virtual Box Guest PCI device used in Virtual Box virtual machines. Enabling this driver will add diff --git a/fs/dcache.c b/fs/dcache.c index b4d5e9e1e43d..1a01d7a6a7a9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1681,9 +1681,8 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ - dentry->d_lockref.count = 1; dentry->d_flags = 0; - spin_lock_init(&dentry->d_lock); + lockref_init(&dentry->d_lockref, 1); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 19ef4ff2a134..254f6ad2c336 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -747,8 +747,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (IS_ERR(pcl)) return PTR_ERR(pcl); - spin_lock_init(&pcl->lockref.lock); - pcl->lockref.count = 1; /* one ref for this request */ + lockref_init(&pcl->lockref, 1); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 89aade6f45f6..7c54ae5fcbd4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5006,10 +5006,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { - inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); + inode_set_cached_link(inode, (char *)ei->i_data, + inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bcf2737078b8..536d56d15072 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3418,7 +3418,6 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, inode->i_op = &ext4_symlink_inode_operations; } else { inode->i_op = &ext4_fast_symlink_inode_operations; - inode->i_link = (char *)&EXT4_I(inode)->i_data; } } @@ -3434,6 +3433,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, disk_link.len); inode->i_size = disk_link.len - 1; EXT4_I(inode)->i_disksize = inode->i_size; + if (!IS_ENCRYPTED(inode)) + inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data, + inode->i_size); } err = ext4_add_nondir(handle, dentry, &inode); if (handle) diff --git a/fs/file.c b/fs/file.c index 25c6e53b03f8..d868cdb95d1e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -279,10 +279,6 @@ static int expand_files(struct files_struct *files, unsigned int nr) if (nr < fdt->max_fds) return 0; - /* Can we expand? */ - if (nr >= sysctl_nr_open) - return -EMFILE; - if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); @@ -290,6 +286,10 @@ static int expand_files(struct files_struct *files, unsigned int nr) goto repeat; } + /* Can we expand? */ + if (unlikely(nr >= sysctl_nr_open)) + return -EMFILE; + /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); @@ -1231,17 +1231,9 @@ __releases(&files->file_lock) /* * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. NB: OpenBSD avoids that at the price of - * extra work in their equivalent of fget() - they insert struct - * file immediately after grabbing descriptor, mark it larval if - * more work (e.g. actual opening) is needed and make sure that - * fget() treats larval files as absent. Potentially interesting, - * but while extra work in fget() is trivial, locking implications - * and amount of surgery on open()-related paths in VFS are not. - * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" - * deadlocks in rather amusing ways, AFAICS. All of that is out of - * scope of POSIX or SUS, since neither considers shared descriptor - * tables and this condition does not arise without those. + * not finished descriptor. + * + * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); diff --git a/fs/file_table.c b/fs/file_table.c index 976736be47cb..a32171d2b83f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -128,7 +128,7 @@ static struct ctl_table fs_stat_sysctls[] = { .data = &sysctl_nr_open, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_douintvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, @@ -478,6 +478,8 @@ static void ____fput(struct callback_head *work) __fput(container_of(work, struct file, f_task_work)); } +static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); + /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. The only user right now is the boot - we @@ -491,11 +493,10 @@ static void ____fput(struct callback_head *work) void flush_delayed_fput(void) { delayed_fput(NULL); + flush_delayed_work(&delayed_fput_work); } EXPORT_SYMBOL_GPL(flush_delayed_fput); -static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); - void fput(struct file *file) { if (file_ref_put(&file->f_ref)) { diff --git a/fs/fs_context.c b/fs/fs_context.c index 98589aae5208..582d33e81117 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -493,7 +493,7 @@ static void put_fc_log(struct fs_context *fc) if (log) { if (refcount_dec_and_test(&log->usage)) { fc->log.log = NULL; - for (i = 0; i <= 7; i++) + for (i = 0; i < ARRAY_SIZE(log->buffer) ; i++) if (log->need_free & (1 << i)) kfree(log->buffer[i]); kfree(log); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 72b48f6f5561..58bc5013ca49 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -236,8 +236,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str return NULL; qd->qd_sbd = sdp; - qd->qd_lockref.count = 0; - spin_lock_init(&qd->qd_lockref.lock); + lockref_init(&qd->qd_lockref, 0); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); diff --git a/fs/namei.c b/fs/namei.c index 9d30c7aa9aa6..e56c29a22d26 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5272,19 +5272,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna getname(newname), 0); } -int readlink_copy(char __user *buffer, int buflen, const char *link) +int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) { - int len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; + int copylen; - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; -out: - return len; + copylen = linklen; + if (unlikely(copylen > (unsigned) buflen)) + copylen = buflen; + if (copy_to_user(buffer, link, copylen)) + copylen = -EFAULT; + return copylen; } /** @@ -5304,6 +5301,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) const char *link; int res; + if (inode->i_opflags & IOP_CACHED_LINK) + return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); + if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); @@ -5322,7 +5322,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(link)) return PTR_ERR(link); } - res = readlink_copy(buffer, buflen, link); + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } @@ -5391,10 +5391,14 @@ EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { + const char *link; + int res; + DEFINE_DELAYED_CALL(done); - int res = readlink_copy(buffer, buflen, - page_get_link(dentry, d_inode(dentry), - &done)); + link = page_get_link(dentry, d_inode(dentry), &done); + res = PTR_ERR(link); + if (!IS_ERR(link)) + res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } diff --git a/fs/namespace.c b/fs/namespace.c index eac057e56948..851af89e8d72 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3839,7 +3839,7 @@ int path_mount(const char *dev_name, struct path *path, data_page); } -long do_mount(const char *dev_name, const char __user *dir_name, +int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; diff --git a/fs/pipe.c b/fs/pipe.c index 12b22c2723b7..82fede0f2111 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -253,7 +253,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - bool was_full, wake_next_reader = false; + bool wake_writer = false, wake_next_reader = false; ssize_t ret; /* Null read succeeds. */ @@ -264,14 +264,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) mutex_lock(&pipe->mutex); /* - * We only wake up writers if the pipe was full when we started - * reading in order to avoid unnecessary wakeups. + * We only wake up writers if the pipe was full when we started reading + * and it is no longer full after reading to avoid unnecessary wakeups. * * But when we do wake up writers, we do so using a sync wakeup * (WF_SYNC), because we want them to get going and generate more * data for us. */ - was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); @@ -340,8 +339,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) buf->len = 0; } - if (!buf->len) + if (!buf->len) { + wake_writer |= pipe_full(head, tail, pipe->max_usage); tail = pipe_update_tail(pipe, buf, tail); + } total_len -= chars; if (!total_len) break; /* common path: read succeeded */ @@ -377,7 +378,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) * _very_ unlikely case that the pipe was full, but we got * no data. */ - if (unlikely(was_full)) + if (unlikely(wake_writer)) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); @@ -390,15 +391,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - mutex_lock(&pipe->mutex); - was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); + wake_writer = false; wake_next_reader = true; + mutex_lock(&pipe->mutex); } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; mutex_unlock(&pipe->mutex); - if (was_full) + if (wake_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); diff --git a/fs/pnode.c b/fs/pnode.c index a799e0315cc9..ef048f008bdd 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -611,10 +611,10 @@ int propagate_umount(struct list_head *list) continue; } else if (child->mnt.mnt_flags & MNT_UMOUNT) { /* - * We have come accross an partially unmounted - * mount in list that has not been visited yet. - * Remember it has been visited and continue - * about our merry way. + * We have come across a partially unmounted + * mount in a list that has not been visited + * yet. Remember it has been visited and + * continue about our merry way. */ list_add_tail(&child->mnt_umounting, &visited); continue; diff --git a/fs/proc/array.c b/fs/proc/array.c index 55ed3510d2bb..d6a0369caa93 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -500,7 +500,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * a program is not able to use ptrace(2) in that case. It is * safe because the task has stopped executing permanently. */ - if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) { + if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) { if (try_get_task_stack(task)) { eip = KSTK_EIP(task); esp = KSTK_ESP(task); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 8e159fc78c0a..c610224faf10 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) - res = readlink_copy(buffer, buflen, name); + res = readlink_copy(buffer, buflen, name, strlen(name)); } put_task_struct(task); return res; diff --git a/fs/select.c b/fs/select.c index e223d1fe9d55..7da531b1cf6b 100644 --- a/fs/select.c +++ b/fs/select.c @@ -786,7 +786,7 @@ static inline int get_sigset_argpack(struct sigset_argpack *to, } return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } @@ -1355,7 +1355,7 @@ static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, } return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } diff --git a/fs/vboxsf/Kconfig b/fs/vboxsf/Kconfig index b84586ae08b3..d4694026db8b 100644 --- a/fs/vboxsf/Kconfig +++ b/fs/vboxsf/Kconfig @@ -1,6 +1,6 @@ config VBOXSF_FS tristate "VirtualBox guest shared folder (vboxsf) support" - depends on X86 && VBOXGUEST + depends on (ARM64 || X86) && VBOXGUEST select NLS help VirtualBox hosts can share folders with guests, this driver diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h index c50882f19235..966092ffa89a 100644 --- a/include/linux/fiemap.h +++ b/include/linux/fiemap.h @@ -5,12 +5,18 @@ #include #include +/** + * struct fiemap_extent_info - fiemap request to a filesystem + * @fi_flags: Flags as passed from user + * @fi_extents_mapped: Number of mapped extents + * @fi_extents_max: Size of fiemap_extent array + * @fi_extents_start: Start of fiemap_extent array + */ struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent __user *fi_extents_start; /* Start of - fiemap_extent array */ + unsigned int fi_flags; + unsigned int fi_extents_mapped; + unsigned int fi_extents_max; + struct fiemap_extent __user *fi_extents_start; }; int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..e06ea7e9ca15 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -322,6 +322,7 @@ struct readahead_control; #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -356,7 +357,8 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ - { IOCB_ATOMIC, "ATOMIC"}, \ + { IOCB_ATOMIC, "ATOMIC" }, \ + { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -626,6 +628,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 +#define IOP_CACHED_LINK 0x0040 /* * Keep mostly read-only and often accessed (especially for @@ -723,7 +726,10 @@ struct inode { }; struct file_lock_context *i_flctx; struct address_space i_data; - struct list_head i_devices; + union { + struct list_head i_devices; + int i_linklen; + }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; @@ -749,6 +755,13 @@ struct inode { void *i_private; /* fs or device private pointer */ } __randomize_layout; +static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) +{ + inode->i_link = link; + inode->i_linklen = linklen; + inode->i_opflags |= IOP_CACHED_LINK; +} + /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. @@ -2127,6 +2140,8 @@ struct file_operations { #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) +/* File system supports uncached read/write buffered IO */ +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, @@ -3351,7 +3366,7 @@ extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -extern int readlink_copy(char __user *, int, const char *); +extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); @@ -3614,6 +3629,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; diff --git a/include/linux/lockref.h b/include/linux/lockref.h index c3a1f78bc884..c39f119659ba 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -34,14 +34,24 @@ struct lockref { }; }; -extern void lockref_get(struct lockref *); -extern int lockref_put_return(struct lockref *); -extern int lockref_get_not_zero(struct lockref *); -extern int lockref_put_not_zero(struct lockref *); -extern int lockref_put_or_lock(struct lockref *); +/** + * lockref_init - Initialize a lockref + * @lockref: pointer to lockref structure + * @count: initial count + */ +static inline void lockref_init(struct lockref *lockref, unsigned int count) +{ + spin_lock_init(&lockref->lock); + lockref->count = count; +} -extern void lockref_mark_dead(struct lockref *); -extern int lockref_get_not_dead(struct lockref *); +void lockref_get(struct lockref *lockref); +int lockref_put_return(struct lockref *lockref); +bool lockref_get_not_zero(struct lockref *lockref); +bool lockref_put_or_lock(struct lockref *lockref); + +void lockref_mark_dead(struct lockref *lockref); +bool lockref_get_not_dead(struct lockref *lockref); /* Must be called under spinlock for reliable results */ static inline bool __lockref_is_dead(const struct lockref *l) diff --git a/include/linux/mount.h b/include/linux/mount.h index 04213d8ef837..dcc17ce8a959 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -75,7 +75,7 @@ struct vfsmount { static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) { /* Pairs with smp_store_release() in do_idmap_mount(). */ - return smp_load_acquire(&mnt->mnt_idmap); + return READ_ONCE(mnt->mnt_idmap); } extern int mnt_want_write(struct vfsmount *mnt); @@ -113,7 +113,7 @@ extern struct vfsmount *kern_mount(struct file_system_type *); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); -extern long do_mount(const char *, const char __user *, +int do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct vfsmount *collect_mounts(const struct path *); extern void drop_collected_mounts(struct vfsmount *); diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..eb20dcaa51b5 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -272,7 +272,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) ({ \ unsigned __seq; \ \ - while ((__seq = seqprop_sequence(s)) & 1) \ + while (unlikely((__seq = seqprop_sequence(s)) & 1)) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 24ca0c00cae3..9d9e8ae32b41 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -14,37 +14,56 @@ #include +/** + * struct fiemap_extent - description of one fiemap extent + * @fe_logical: byte offset of the extent in the file + * @fe_physical: byte offset of extent on disk + * @fe_length: length in bytes for this extent + * @fe_flags: FIEMAP_EXTENT_* flags for this extent + */ struct fiemap_extent { - __u64 fe_logical; /* logical offset in bytes for the start of - * the extent from the beginning of the file */ - __u64 fe_physical; /* physical offset in bytes for the start - * of the extent from the beginning of the disk */ - __u64 fe_length; /* length in bytes for this extent */ + __u64 fe_logical; + __u64 fe_physical; + __u64 fe_length; + /* private: */ __u64 fe_reserved64[2]; - __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + /* public: */ + __u32 fe_flags; + /* private: */ __u32 fe_reserved[3]; }; +/** + * struct fiemap - file extent mappings + * @fm_start: byte offset (inclusive) at which to start mapping (in) + * @fm_length: logical length of mapping which userspace wants (in) + * @fm_flags: FIEMAP_FLAG_* flags for request (in/out) + * @fm_mapped_extents: number of extents that were mapped (out) + * @fm_extent_count: size of fm_extents array (in) + * @fm_extents: array of mapped extents (out) + */ struct fiemap { - __u64 fm_start; /* logical offset (inclusive) at - * which to start mapping (in) */ - __u64 fm_length; /* logical length of mapping which - * userspace wants (in) */ - __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ - __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ - __u32 fm_extent_count; /* size of fm_extents array (in) */ + __u64 fm_start; + __u64 fm_length; + __u32 fm_flags; + __u32 fm_mapped_extents; + __u32 fm_extent_count; + /* private: */ __u32 fm_reserved; - struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */ + /* public: */ + struct fiemap_extent fm_extents[]; }; #define FIEMAP_MAX_OFFSET (~0ULL) +/* flags used in fm_flags: */ #define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ #define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ #define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */ #define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) +/* flags used in fe_flags: */ #define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ #define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ #define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..56a4f93a08f4 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -332,9 +332,13 @@ typedef int __bitwise __kernel_rwf_t; /* Atomic Write */ #define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) +/* buffered IO that drops the cache after reading or writing data */ +#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ + RWF_DONTCACHE) #define PROCFS_IOCTL_MAGIC 'f' diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 1895fbc32bcb..5267adeaa403 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -71,7 +71,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, bit /= WATCH_QUEUE_NOTE_SIZE; page = buf->page; - bit += page->index; + bit += page->private; set_bit(bit, wqueue->notes_bitmap); generic_pipe_buf_release(pipe, buf); @@ -278,7 +278,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) goto error_p; - pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; + pages[i]->private = i * WATCH_QUEUE_NOTES_PER_PAGE; } bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); diff --git a/lib/lockref.c b/lib/lockref.c index 2afe4c5d8919..5d8e3ef3860e 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -58,63 +58,34 @@ EXPORT_SYMBOL(lockref_get); * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count was zero */ -int lockref_get_not_zero(struct lockref *lockref) +bool lockref_get_not_zero(struct lockref *lockref) { - int retval; + bool retval = false; CMPXCHG_LOOP( new.count++; if (old.count <= 0) - return 0; + return false; , - return 1; + return true; ); spin_lock(&lockref->lock); - retval = 0; if (lockref->count > 0) { lockref->count++; - retval = 1; + retval = true; } spin_unlock(&lockref->lock); return retval; } EXPORT_SYMBOL(lockref_get_not_zero); -/** - * lockref_put_not_zero - Decrements count unless count <= 1 before decrement - * @lockref: pointer to lockref structure - * Return: 1 if count updated successfully or 0 if count would become zero - */ -int lockref_put_not_zero(struct lockref *lockref) -{ - int retval; - - CMPXCHG_LOOP( - new.count--; - if (old.count <= 1) - return 0; - , - return 1; - ); - - spin_lock(&lockref->lock); - retval = 0; - if (lockref->count > 1) { - lockref->count--; - retval = 1; - } - spin_unlock(&lockref->lock); - return retval; -} -EXPORT_SYMBOL(lockref_put_not_zero); - /** * lockref_put_return - Decrement reference count if possible * @lockref: pointer to lockref structure * * Decrement the reference count and return the new value. - * If the lockref was dead or locked, return an error. + * If the lockref was dead or locked, return -1. */ int lockref_put_return(struct lockref *lockref) { @@ -134,22 +105,22 @@ EXPORT_SYMBOL(lockref_put_return); * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken */ -int lockref_put_or_lock(struct lockref *lockref) +bool lockref_put_or_lock(struct lockref *lockref) { CMPXCHG_LOOP( new.count--; if (old.count <= 1) break; , - return 1; + return true; ); spin_lock(&lockref->lock); if (lockref->count <= 1) - return 0; + return false; lockref->count--; spin_unlock(&lockref->lock); - return 1; + return true; } EXPORT_SYMBOL(lockref_put_or_lock); @@ -169,23 +140,22 @@ EXPORT_SYMBOL(lockref_mark_dead); * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if lockref was dead */ -int lockref_get_not_dead(struct lockref *lockref) +bool lockref_get_not_dead(struct lockref *lockref) { - int retval; + bool retval = false; CMPXCHG_LOOP( new.count++; if (old.count < 0) - return 0; + return false; , - return 1; + return true; ); spin_lock(&lockref->lock); - retval = 0; if (lockref->count >= 0) { lockref->count++; - retval = 1; + retval = true; } spin_unlock(&lockref->lock); return retval; diff --git a/mm/shmem.c b/mm/shmem.c index fdb5afa1cfe9..333866f81aea 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3917,6 +3917,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, int len; struct inode *inode; struct folio *folio; + char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3938,12 +3939,13 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - inode->i_link = kmemdup(symname, len, GFP_KERNEL); - if (!inode->i_link) { + link = kmemdup(symname, len, GFP_KERNEL); + if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; + inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 2c0185ebc900..c07d150685d7 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -2612,7 +2612,7 @@ static int policy_readlink(struct dentry *dentry, char __user *buffer, res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME, d_inode(dentry)->i_ino); if (res > 0 && res < sizeof(name)) - res = readlink_copy(buffer, buflen, name); + res = readlink_copy(buffer, buflen, name, strlen(name)); else res = -ENOENT; diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile new file mode 100644 index 000000000000..ed210037b29d --- /dev/null +++ b/tools/testing/selftests/coredump/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS = $(KHDR_INCLUDES) + +TEST_GEN_PROGS := stackdump_test +TEST_FILES := stackdump + +include ../lib.mk diff --git a/tools/testing/selftests/coredump/README.rst b/tools/testing/selftests/coredump/README.rst new file mode 100644 index 000000000000..164a7aa181c8 --- /dev/null +++ b/tools/testing/selftests/coredump/README.rst @@ -0,0 +1,50 @@ +coredump selftest +================= + +Background context +------------------ + +`coredump` is a feature which dumps a process's memory space when the process terminates +unexpectedly (e.g. due to segmentation fault), which can be useful for debugging. By default, +`coredump` dumps the memory to the file named `core`, but this behavior can be changed by writing a +different file name to `/proc/sys/kernel/core_pattern`. Furthermore, `coredump` can be piped to a +user-space program by writing the pipe symbol (`|`) followed by the command to be executed to +`/proc/sys/kernel/core_pattern`. For the full description, see `man 5 core`. + +The piped user program may be interested in reading the stack pointers of the crashed process. The +crashed process's stack pointers can be read from `procfs`: it is the `kstkesp` field in +`/proc/$PID/stat`. See `man 5 proc` for all the details. + +The problem +----------- +While a thread is active, the stack pointer is unsafe to read and therefore the `kstkesp` field +reads zero. But when the thread is dead (e.g. during a coredump), this field should have valid +value. + +However, this was broken in the past and `kstkesp` was zero even during coredump: + +* commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat") changed kstkesp to + always be zero + +* commit fd7d56270b52 ("fs/proc: Report eip/esp in /prod/PID/stat for coredumping") fixed it for the + coredumping thread. However, other threads in a coredumping process still had the problem. + +* commit cb8f381f1613 ("fs/proc/array.c: allow reporting eip/esp for all coredumping threads") fixed + for all threads in a coredumping process. + +* commit 92307383082d ("coredump: Don't perform any cleanups before dumping core") broke it again + for the other threads in a coredumping process. + +The problem has been fixed now, but considering the history, it may appear again in the future. + +The goal of this test +--------------------- +This test detects problem with reading `kstkesp` during coredump by doing the following: + +#. Tell the kernel to execute the "stackdump" script when a coredump happens. This script + reads the stack pointers of all threads of crashed processes. + +#. Spawn a child process who creates some threads and then crashes. + +#. Read the output from the "stackdump" script, and make sure all stack pointer values are + non-zero. diff --git a/tools/testing/selftests/coredump/stackdump b/tools/testing/selftests/coredump/stackdump new file mode 100755 index 000000000000..96714ce42d12 --- /dev/null +++ b/tools/testing/selftests/coredump/stackdump @@ -0,0 +1,14 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +CRASH_PROGRAM_ID=$1 +STACKDUMP_FILE=$2 + +TMP=$(mktemp) + +for t in /proc/$CRASH_PROGRAM_ID/task/*; do + tid=$(basename $t) + cat /proc/$tid/stat | awk '{print $29}' >> $TMP +done + +mv $TMP $STACKDUMP_FILE diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c new file mode 100644 index 000000000000..137b2364a082 --- /dev/null +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#define STACKDUMP_FILE "stack_values" +#define STACKDUMP_SCRIPT "stackdump" +#define NUM_THREAD_SPAWN 128 + +static void *do_nothing(void *) +{ + while (1) + pause(); +} + +static void crashing_child(void) +{ + pthread_t thread; + int i; + + for (i = 0; i < NUM_THREAD_SPAWN; ++i) + pthread_create(&thread, NULL, do_nothing, NULL); + + /* crash on purpose */ + i = *(int *)NULL; +} + +FIXTURE(coredump) +{ + char original_core_pattern[256]; +}; + +FIXTURE_SETUP(coredump) +{ + char buf[PATH_MAX]; + FILE *file; + char *dir; + int ret; + + file = fopen("/proc/sys/kernel/core_pattern", "r"); + ASSERT_NE(NULL, file); + + ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file); + ASSERT_TRUE(ret || feof(file)); + ASSERT_LT(ret, sizeof(self->original_core_pattern)); + + self->original_core_pattern[ret] = '\0'; + + ret = fclose(file); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(coredump) +{ + const char *reason; + FILE *file; + int ret; + + unlink(STACKDUMP_FILE); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) { + reason = "Unable to open core_pattern"; + goto fail; + } + + ret = fprintf(file, "%s", self->original_core_pattern); + if (ret < 0) { + reason = "Unable to write to core_pattern"; + goto fail; + } + + ret = fclose(file); + if (ret) { + reason = "Unable to close core_pattern"; + goto fail; + } + + return; +fail: + /* This should never happen */ + fprintf(stderr, "Failed to cleanup stackdump test: %s\n", reason); +} + +TEST_F(coredump, stackdump) +{ + struct sigaction action = {}; + unsigned long long stack; + char *test_dir, *line; + size_t line_length; + char buf[PATH_MAX]; + int ret, i; + FILE *file; + pid_t pid; + + /* + * Step 1: Setup core_pattern so that the stackdump script is executed when the child + * process crashes + */ + ret = readlink("/proc/self/exe", buf, sizeof(buf)); + ASSERT_NE(-1, ret); + ASSERT_LT(ret, sizeof(buf)); + buf[ret] = '\0'; + + test_dir = dirname(buf); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + ASSERT_NE(NULL, file); + + ret = fprintf(file, "|%1$s/%2$s %%P %1$s/%3$s", test_dir, STACKDUMP_SCRIPT, STACKDUMP_FILE); + ASSERT_LT(0, ret); + + ret = fclose(file); + ASSERT_EQ(0, ret); + + /* Step 2: Create a process who spawns some threads then crashes */ + pid = fork(); + ASSERT_TRUE(pid >= 0); + if (pid == 0) + crashing_child(); + + /* + * Step 3: Wait for the stackdump script to write the stack pointers to the stackdump file + */ + for (i = 0; i < 10; ++i) { + file = fopen(STACKDUMP_FILE, "r"); + if (file) + break; + sleep(1); + } + ASSERT_NE(file, NULL); + + /* Step 4: Make sure all stack pointer values are non-zero */ + for (i = 0; -1 != getline(&line, &line_length, file); ++i) { + stack = strtoull(line, NULL, 10); + ASSERT_NE(stack, 0); + } + + ASSERT_EQ(i, 1 + NUM_THREAD_SPAWN); + + fclose(file); +} + +TEST_HARNESS_MAIN