1
0
Fork 0
mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2025-01-22 07:53:11 -05:00

vfs-6.14-rc1.misc

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZ4pRjQAKCRCRxhvAZXjc
 omUyAP9k31Qr7RY1zNtmpPfejqc+3Xx+xXD7NwHr+tONWtUQiQEA/F94qU2U3ivS
 AzyDABWrEQ5ZNsm+Rq2Y3zyoH7of3ww=
 =s3Bu
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.14-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull misc vfs updates from Christian Brauner:
 "Features:

   - Support caching symlink lengths in inodes

     The size is stored in a new union utilizing the same space as
     i_devices, thus avoiding growing the struct or taking up any more
     space

     When utilized it dodges strlen() in vfs_readlink(), giving about
     1.5% speed up when issuing readlink on /initrd.img on ext4

   - Add RWF_DONTCACHE iocb and FOP_DONTCACHE file_operations flag

     If a file system supports uncached buffered IO, it may set
     FOP_DONTCACHE and enable support for RWF_DONTCACHE.

     If RWF_DONTCACHE is requested on a file system that does not
     support it, the request fails with -EOPNOTSUPP

   - Enable VBOXGUEST and VBOXSF_FS on ARM64

     Now that VirtualBox is able to run as a host on arm64 (e.g. the
     Apple M3 processors) we can enable VBOXSF_FS (and in turn
     VBOXGUEST) for this architecture.

     Tested with various runs of bonnie++ and dbench on an Apple MacBook
     Pro with the latest VirtualBox 7.1.4 r165100 installed

  Cleanups:

   - Delay sysctl_nr_open check in expand_files()

   - Use kernel-doc includes in fiemap docbook

   - Use page->private instead of page->index in watch_queue

   - Use a consume fence in mnt_idmap() as it's heavily used in
     link_path_walk()

   - Replace magic number 7 with ARRAY_SIZE() in fc_log

   - Sort out a stale comment about races between fd alloc and dup2()

   - Fix return type of do_mount() from long to int

   - Various cosmetic cleanups for the lockref code

  Fixes:

   - Annotate spinning as unlikely() in __read_seqcount_begin

     The annotation already used to be there, but got lost in commit
     52ac39e5db ("seqlock: seqcount_t: Implement all read APIs as
     statement expressions")

   - Fix proc_handler for sysctl_nr_open

   - Flush delayed work in delayed fput()

   - Fix grammar and spelling in propagate_umount()

   - Fix ESP not readable during coredump

     In /proc/PID/stat, there is the kstkesp field which is the stack
     pointer of a thread. While the thread is active, this field reads
     zero. But during a coredump, it should have a valid value

     However, at the moment, kstkesp is zero even during coredump

   - Don't wake up the writer if the pipe is still full

   - Fix unbalanced user_access_end() in select code"

* tag 'vfs-6.14-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (28 commits)
  gfs2: use lockref_init for qd_lockref
  erofs: use lockref_init for pcl->lockref
  dcache: use lockref_init for d_lockref
  lockref: add a lockref_init helper
  lockref: drop superfluous externs
  lockref: use bool for false/true returns
  lockref: improve the lockref_get_not_zero description
  lockref: remove lockref_put_not_zero
  fs: Fix return type of do_mount() from long to int
  select: Fix unbalanced user_access_end()
  vbox: Enable VBOXGUEST and VBOXSF_FS on ARM64
  pipe_read: don't wake up the writer if the pipe is still full
  selftests: coredump: Add stackdump test
  fs/proc: do_task_stat: Fix ESP not readable during coredump
  fs: add RWF_DONTCACHE iocb and FOP_DONTCACHE file_operations flag
  fs: sort out a stale comment about races between fd alloc and dup2
  fs: Fix grammar and spelling in propagate_umount()
  fs: fc_log replace magic number 7 with ARRAY_SIZE()
  fs: use a consume fence in mnt_idmap()
  file: flush delayed work in delayed fput()
  ...
This commit is contained in:
Linus Torvalds 2025-01-20 09:40:49 -08:00
commit 4b84a4c8d4
33 changed files with 412 additions and 177 deletions

View file

@ -12,21 +12,10 @@ returns a list of extents.
Request Basics Request Basics
-------------- --------------
A fiemap request is encoded within struct fiemap:: A fiemap request is encoded within struct fiemap:
struct fiemap {
__u64 fm_start; /* logical offset (inclusive) at
* which to start mapping (in) */
__u64 fm_length; /* logical length of mapping which
* userspace cares about (in) */
__u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
__u32 fm_mapped_extents; /* number of extents that were
* mapped (out) */
__u32 fm_extent_count; /* size of fm_extents array (in) */
__u32 fm_reserved;
struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
};
.. kernel-doc:: include/uapi/linux/fiemap.h
:identifiers: fiemap
fm_start, and fm_length specify the logical range within the file fm_start, and fm_length specify the logical range within the file
which the process would like mappings for. Extents returned mirror which the process would like mappings for. Extents returned mirror
@ -60,6 +49,8 @@ FIEMAP_FLAG_XATTR
If this flag is set, the extents returned will describe the inodes If this flag is set, the extents returned will describe the inodes
extended attribute lookup tree, instead of its data tree. extended attribute lookup tree, instead of its data tree.
FIEMAP_FLAG_CACHE
This flag requests caching of the extents.
Extent Mapping Extent Mapping
-------------- --------------
@ -77,18 +68,10 @@ complete the requested range and will not have the FIEMAP_EXTENT_LAST
flag set (see the next section on extent flags). flag set (see the next section on extent flags).
Each extent is described by a single fiemap_extent structure as Each extent is described by a single fiemap_extent structure as
returned in fm_extents:: returned in fm_extents:
struct fiemap_extent { .. kernel-doc:: include/uapi/linux/fiemap.h
__u64 fe_logical; /* logical offset in bytes for the start of :identifiers: fiemap_extent
* the extent */
__u64 fe_physical; /* physical offset in bytes for the start
* of the extent */
__u64 fe_length; /* length in bytes for the extent */
__u64 fe_reserved64[2];
__u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
__u32 fe_reserved[3];
};
All offsets and lengths are in bytes and mirror those on disk. It is valid All offsets and lengths are in bytes and mirror those on disk. It is valid
for an extents logical offset to start before the request or its logical for an extents logical offset to start before the request or its logical
@ -175,6 +158,8 @@ FIEMAP_EXTENT_MERGED
userspace would be highly inefficient, the kernel will try to merge most userspace would be highly inefficient, the kernel will try to merge most
adjacent blocks into 'extents'. adjacent blocks into 'extents'.
FIEMAP_EXTENT_SHARED
This flag is set when the space of the extent is shared with other files.
VFS -> File System Implementation VFS -> File System Implementation
--------------------------------- ---------------------------------
@ -191,14 +176,10 @@ each discovered extent::
u64 len); u64 len);
->fiemap is passed struct fiemap_extent_info which describes the ->fiemap is passed struct fiemap_extent_info which describes the
fiemap request:: fiemap request:
struct fiemap_extent_info { .. kernel-doc:: include/linux/fiemap.h
unsigned int fi_flags; /* Flags as passed from user */ :identifiers: fiemap_extent_info
unsigned int fi_extents_mapped; /* Number of mapped extents */
unsigned int fi_extents_max; /* Size of fiemap_extent array */
struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */
};
It is intended that the file system should not need to access any of this It is intended that the file system should not need to access any of this
structure directly. Filesystem handlers should be tolerant to signals and return structure directly. Filesystem handlers should be tolerant to signals and return

View file

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config VBOXGUEST config VBOXGUEST
tristate "Virtual Box Guest integration support" tristate "Virtual Box Guest integration support"
depends on X86 && PCI && INPUT depends on (ARM64 || X86) && PCI && INPUT
help help
This is a driver for the Virtual Box Guest PCI device used in This is a driver for the Virtual Box Guest PCI device used in
Virtual Box virtual machines. Enabling this driver will add Virtual Box virtual machines. Enabling this driver will add

View file

@ -1681,9 +1681,8 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
/* Make sure we always see the terminating NUL character */ /* Make sure we always see the terminating NUL character */
smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */
dentry->d_lockref.count = 1;
dentry->d_flags = 0; dentry->d_flags = 0;
spin_lock_init(&dentry->d_lock); lockref_init(&dentry->d_lockref, 1);
seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
dentry->d_inode = NULL; dentry->d_inode = NULL;
dentry->d_parent = dentry; dentry->d_parent = dentry;

View file

@ -747,8 +747,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
if (IS_ERR(pcl)) if (IS_ERR(pcl))
return PTR_ERR(pcl); return PTR_ERR(pcl);
spin_lock_init(&pcl->lockref.lock); lockref_init(&pcl->lockref, 1); /* one ref for this request */
pcl->lockref.count = 1; /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat; pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0; pcl->length = 0;
pcl->partial = true; pcl->partial = true;

View file

@ -5006,10 +5006,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (IS_ENCRYPTED(inode)) { if (IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations; inode->i_op = &ext4_encrypted_symlink_inode_operations;
} else if (ext4_inode_is_fast_symlink(inode)) { } else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_link = (char *)ei->i_data;
inode->i_op = &ext4_fast_symlink_inode_operations; inode->i_op = &ext4_fast_symlink_inode_operations;
nd_terminate_link(ei->i_data, inode->i_size, nd_terminate_link(ei->i_data, inode->i_size,
sizeof(ei->i_data) - 1); sizeof(ei->i_data) - 1);
inode_set_cached_link(inode, (char *)ei->i_data,
inode->i_size);
} else { } else {
inode->i_op = &ext4_symlink_inode_operations; inode->i_op = &ext4_symlink_inode_operations;
} }

View file

@ -3418,7 +3418,6 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode->i_op = &ext4_symlink_inode_operations; inode->i_op = &ext4_symlink_inode_operations;
} else { } else {
inode->i_op = &ext4_fast_symlink_inode_operations; inode->i_op = &ext4_fast_symlink_inode_operations;
inode->i_link = (char *)&EXT4_I(inode)->i_data;
} }
} }
@ -3434,6 +3433,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
disk_link.len); disk_link.len);
inode->i_size = disk_link.len - 1; inode->i_size = disk_link.len - 1;
EXT4_I(inode)->i_disksize = inode->i_size; EXT4_I(inode)->i_disksize = inode->i_size;
if (!IS_ENCRYPTED(inode))
inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data,
inode->i_size);
} }
err = ext4_add_nondir(handle, dentry, &inode); err = ext4_add_nondir(handle, dentry, &inode);
if (handle) if (handle)

View file

@ -279,10 +279,6 @@ static int expand_files(struct files_struct *files, unsigned int nr)
if (nr < fdt->max_fds) if (nr < fdt->max_fds)
return 0; return 0;
/* Can we expand? */
if (nr >= sysctl_nr_open)
return -EMFILE;
if (unlikely(files->resize_in_progress)) { if (unlikely(files->resize_in_progress)) {
spin_unlock(&files->file_lock); spin_unlock(&files->file_lock);
wait_event(files->resize_wait, !files->resize_in_progress); wait_event(files->resize_wait, !files->resize_in_progress);
@ -290,6 +286,10 @@ static int expand_files(struct files_struct *files, unsigned int nr)
goto repeat; goto repeat;
} }
/* Can we expand? */
if (unlikely(nr >= sysctl_nr_open))
return -EMFILE;
/* All good, so we try */ /* All good, so we try */
files->resize_in_progress = true; files->resize_in_progress = true;
error = expand_fdtable(files, nr); error = expand_fdtable(files, nr);
@ -1231,17 +1231,9 @@ __releases(&files->file_lock)
/* /*
* We need to detect attempts to do dup2() over allocated but still * We need to detect attempts to do dup2() over allocated but still
* not finished descriptor. NB: OpenBSD avoids that at the price of * not finished descriptor.
* extra work in their equivalent of fget() - they insert struct *
* file immediately after grabbing descriptor, mark it larval if * POSIX is silent on the issue, we return -EBUSY.
* more work (e.g. actual opening) is needed and make sure that
* fget() treats larval files as absent. Potentially interesting,
* but while extra work in fget() is trivial, locking implications
* and amount of surgery on open()-related paths in VFS are not.
* FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
* deadlocks in rather amusing ways, AFAICS. All of that is out of
* scope of POSIX or SUS, since neither considers shared descriptor
* tables and this condition does not arise without those.
*/ */
fdt = files_fdtable(files); fdt = files_fdtable(files);
fd = array_index_nospec(fd, fdt->max_fds); fd = array_index_nospec(fd, fdt->max_fds);

View file

@ -128,7 +128,7 @@ static struct ctl_table fs_stat_sysctls[] = {
.data = &sysctl_nr_open, .data = &sysctl_nr_open,
.maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_douintvec_minmax,
.extra1 = &sysctl_nr_open_min, .extra1 = &sysctl_nr_open_min,
.extra2 = &sysctl_nr_open_max, .extra2 = &sysctl_nr_open_max,
}, },
@ -478,6 +478,8 @@ static void ____fput(struct callback_head *work)
__fput(container_of(work, struct file, f_task_work)); __fput(container_of(work, struct file, f_task_work));
} }
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
/* /*
* If kernel thread really needs to have the final fput() it has done * If kernel thread really needs to have the final fput() it has done
* to complete, call this. The only user right now is the boot - we * to complete, call this. The only user right now is the boot - we
@ -491,11 +493,10 @@ static void ____fput(struct callback_head *work)
void flush_delayed_fput(void) void flush_delayed_fput(void)
{ {
delayed_fput(NULL); delayed_fput(NULL);
flush_delayed_work(&delayed_fput_work);
} }
EXPORT_SYMBOL_GPL(flush_delayed_fput); EXPORT_SYMBOL_GPL(flush_delayed_fput);
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
void fput(struct file *file) void fput(struct file *file)
{ {
if (file_ref_put(&file->f_ref)) { if (file_ref_put(&file->f_ref)) {

View file

@ -493,7 +493,7 @@ static void put_fc_log(struct fs_context *fc)
if (log) { if (log) {
if (refcount_dec_and_test(&log->usage)) { if (refcount_dec_and_test(&log->usage)) {
fc->log.log = NULL; fc->log.log = NULL;
for (i = 0; i <= 7; i++) for (i = 0; i < ARRAY_SIZE(log->buffer) ; i++)
if (log->need_free & (1 << i)) if (log->need_free & (1 << i))
kfree(log->buffer[i]); kfree(log->buffer[i]);
kfree(log); kfree(log);

View file

@ -236,8 +236,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str
return NULL; return NULL;
qd->qd_sbd = sdp; qd->qd_sbd = sdp;
qd->qd_lockref.count = 0; lockref_init(&qd->qd_lockref, 0);
spin_lock_init(&qd->qd_lockref.lock);
qd->qd_id = qid; qd->qd_id = qid;
qd->qd_slot = -1; qd->qd_slot = -1;
INIT_LIST_HEAD(&qd->qd_lru); INIT_LIST_HEAD(&qd->qd_lru);

View file

@ -5272,19 +5272,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
getname(newname), 0); getname(newname), 0);
} }
int readlink_copy(char __user *buffer, int buflen, const char *link) int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen)
{ {
int len = PTR_ERR(link); int copylen;
if (IS_ERR(link))
goto out;
len = strlen(link); copylen = linklen;
if (len > (unsigned) buflen) if (unlikely(copylen > (unsigned) buflen))
len = buflen; copylen = buflen;
if (copy_to_user(buffer, link, len)) if (copy_to_user(buffer, link, copylen))
len = -EFAULT; copylen = -EFAULT;
out: return copylen;
return len;
} }
/** /**
@ -5304,6 +5301,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
const char *link; const char *link;
int res; int res;
if (inode->i_opflags & IOP_CACHED_LINK)
return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen);
if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
if (unlikely(inode->i_op->readlink)) if (unlikely(inode->i_op->readlink))
return inode->i_op->readlink(dentry, buffer, buflen); return inode->i_op->readlink(dentry, buffer, buflen);
@ -5322,7 +5322,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
if (IS_ERR(link)) if (IS_ERR(link))
return PTR_ERR(link); return PTR_ERR(link);
} }
res = readlink_copy(buffer, buflen, link); res = readlink_copy(buffer, buflen, link, strlen(link));
do_delayed_call(&done); do_delayed_call(&done);
return res; return res;
} }
@ -5391,10 +5391,14 @@ EXPORT_SYMBOL(page_put_link);
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{ {
const char *link;
int res;
DEFINE_DELAYED_CALL(done); DEFINE_DELAYED_CALL(done);
int res = readlink_copy(buffer, buflen, link = page_get_link(dentry, d_inode(dentry), &done);
page_get_link(dentry, d_inode(dentry), res = PTR_ERR(link);
&done)); if (!IS_ERR(link))
res = readlink_copy(buffer, buflen, link, strlen(link));
do_delayed_call(&done); do_delayed_call(&done);
return res; return res;
} }

View file

@ -3839,7 +3839,7 @@ int path_mount(const char *dev_name, struct path *path,
data_page); data_page);
} }
long do_mount(const char *dev_name, const char __user *dir_name, int do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page) const char *type_page, unsigned long flags, void *data_page)
{ {
struct path path; struct path path;

View file

@ -253,7 +253,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
size_t total_len = iov_iter_count(to); size_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp; struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data; struct pipe_inode_info *pipe = filp->private_data;
bool was_full, wake_next_reader = false; bool wake_writer = false, wake_next_reader = false;
ssize_t ret; ssize_t ret;
/* Null read succeeds. */ /* Null read succeeds. */
@ -264,14 +264,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
mutex_lock(&pipe->mutex); mutex_lock(&pipe->mutex);
/* /*
* We only wake up writers if the pipe was full when we started * We only wake up writers if the pipe was full when we started reading
* reading in order to avoid unnecessary wakeups. * and it is no longer full after reading to avoid unnecessary wakeups.
* *
* But when we do wake up writers, we do so using a sync wakeup * But when we do wake up writers, we do so using a sync wakeup
* (WF_SYNC), because we want them to get going and generate more * (WF_SYNC), because we want them to get going and generate more
* data for us. * data for us.
*/ */
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
for (;;) { for (;;) {
/* Read ->head with a barrier vs post_one_notification() */ /* Read ->head with a barrier vs post_one_notification() */
unsigned int head = smp_load_acquire(&pipe->head); unsigned int head = smp_load_acquire(&pipe->head);
@ -340,8 +339,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
buf->len = 0; buf->len = 0;
} }
if (!buf->len) if (!buf->len) {
wake_writer |= pipe_full(head, tail, pipe->max_usage);
tail = pipe_update_tail(pipe, buf, tail); tail = pipe_update_tail(pipe, buf, tail);
}
total_len -= chars; total_len -= chars;
if (!total_len) if (!total_len)
break; /* common path: read succeeded */ break; /* common path: read succeeded */
@ -377,7 +378,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
* _very_ unlikely case that the pipe was full, but we got * _very_ unlikely case that the pipe was full, but we got
* no data. * no data.
*/ */
if (unlikely(was_full)) if (unlikely(wake_writer))
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
@ -390,15 +391,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
return -ERESTARTSYS; return -ERESTARTSYS;
mutex_lock(&pipe->mutex); wake_writer = false;
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
wake_next_reader = true; wake_next_reader = true;
mutex_lock(&pipe->mutex);
} }
if (pipe_empty(pipe->head, pipe->tail)) if (pipe_empty(pipe->head, pipe->tail))
wake_next_reader = false; wake_next_reader = false;
mutex_unlock(&pipe->mutex); mutex_unlock(&pipe->mutex);
if (was_full) if (wake_writer)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
if (wake_next_reader) if (wake_next_reader)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);

View file

@ -611,10 +611,10 @@ int propagate_umount(struct list_head *list)
continue; continue;
} else if (child->mnt.mnt_flags & MNT_UMOUNT) { } else if (child->mnt.mnt_flags & MNT_UMOUNT) {
/* /*
* We have come accross an partially unmounted * We have come across a partially unmounted
* mount in list that has not been visited yet. * mount in a list that has not been visited
* Remember it has been visited and continue * yet. Remember it has been visited and
* about our merry way. * continue about our merry way.
*/ */
list_add_tail(&child->mnt_umounting, &visited); list_add_tail(&child->mnt_umounting, &visited);
continue; continue;

View file

@ -500,7 +500,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
* a program is not able to use ptrace(2) in that case. It is * a program is not able to use ptrace(2) in that case. It is
* safe because the task has stopped executing permanently. * safe because the task has stopped executing permanently.
*/ */
if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) { if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE|PF_POSTCOREDUMP))) {
if (try_get_task_stack(task)) { if (try_get_task_stack(task)) {
eip = KSTK_EIP(task); eip = KSTK_EIP(task);
esp = KSTK_ESP(task); esp = KSTK_ESP(task);

View file

@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
res = ns_get_name(name, sizeof(name), task, ns_ops); res = ns_get_name(name, sizeof(name), task, ns_ops);
if (res >= 0) if (res >= 0)
res = readlink_copy(buffer, buflen, name); res = readlink_copy(buffer, buflen, name, strlen(name));
} }
put_task_struct(task); put_task_struct(task);
return res; return res;

View file

@ -786,7 +786,7 @@ static inline int get_sigset_argpack(struct sigset_argpack *to,
} }
return 0; return 0;
Efault: Efault:
user_access_end(); user_read_access_end();
return -EFAULT; return -EFAULT;
} }
@ -1355,7 +1355,7 @@ static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to,
} }
return 0; return 0;
Efault: Efault:
user_access_end(); user_read_access_end();
return -EFAULT; return -EFAULT;
} }

View file

@ -1,6 +1,6 @@
config VBOXSF_FS config VBOXSF_FS
tristate "VirtualBox guest shared folder (vboxsf) support" tristate "VirtualBox guest shared folder (vboxsf) support"
depends on X86 && VBOXGUEST depends on (ARM64 || X86) && VBOXGUEST
select NLS select NLS
help help
VirtualBox hosts can share folders with guests, this driver VirtualBox hosts can share folders with guests, this driver

View file

@ -5,12 +5,18 @@
#include <uapi/linux/fiemap.h> #include <uapi/linux/fiemap.h>
#include <linux/fs.h> #include <linux/fs.h>
/**
* struct fiemap_extent_info - fiemap request to a filesystem
* @fi_flags: Flags as passed from user
* @fi_extents_mapped: Number of mapped extents
* @fi_extents_max: Size of fiemap_extent array
* @fi_extents_start: Start of fiemap_extent array
*/
struct fiemap_extent_info { struct fiemap_extent_info {
unsigned int fi_flags; /* Flags as passed from user */ unsigned int fi_flags;
unsigned int fi_extents_mapped; /* Number of mapped extents */ unsigned int fi_extents_mapped;
unsigned int fi_extents_max; /* Size of fiemap_extent array */ unsigned int fi_extents_max;
struct fiemap_extent __user *fi_extents_start; /* Start of struct fiemap_extent __user *fi_extents_start;
fiemap_extent array */
}; };
int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,

View file

@ -322,6 +322,7 @@ struct readahead_control;
#define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_NOWAIT (__force int) RWF_NOWAIT
#define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_APPEND (__force int) RWF_APPEND
#define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_ATOMIC (__force int) RWF_ATOMIC
#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE
/* non-RWF related bits - start at 16 */ /* non-RWF related bits - start at 16 */
#define IOCB_EVENTFD (1 << 16) #define IOCB_EVENTFD (1 << 16)
@ -356,7 +357,8 @@ struct readahead_control;
{ IOCB_SYNC, "SYNC" }, \ { IOCB_SYNC, "SYNC" }, \
{ IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_NOWAIT, "NOWAIT" }, \
{ IOCB_APPEND, "APPEND" }, \ { IOCB_APPEND, "APPEND" }, \
{ IOCB_ATOMIC, "ATOMIC"}, \ { IOCB_ATOMIC, "ATOMIC" }, \
{ IOCB_DONTCACHE, "DONTCACHE" }, \
{ IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_EVENTFD, "EVENTFD"}, \
{ IOCB_DIRECT, "DIRECT" }, \ { IOCB_DIRECT, "DIRECT" }, \
{ IOCB_WRITE, "WRITE" }, \ { IOCB_WRITE, "WRITE" }, \
@ -626,6 +628,7 @@ is_uncached_acl(struct posix_acl *acl)
#define IOP_XATTR 0x0008 #define IOP_XATTR 0x0008
#define IOP_DEFAULT_READLINK 0x0010 #define IOP_DEFAULT_READLINK 0x0010
#define IOP_MGTIME 0x0020 #define IOP_MGTIME 0x0020
#define IOP_CACHED_LINK 0x0040
/* /*
* Keep mostly read-only and often accessed (especially for * Keep mostly read-only and often accessed (especially for
@ -723,7 +726,10 @@ struct inode {
}; };
struct file_lock_context *i_flctx; struct file_lock_context *i_flctx;
struct address_space i_data; struct address_space i_data;
struct list_head i_devices; union {
struct list_head i_devices;
int i_linklen;
};
union { union {
struct pipe_inode_info *i_pipe; struct pipe_inode_info *i_pipe;
struct cdev *i_cdev; struct cdev *i_cdev;
@ -749,6 +755,13 @@ struct inode {
void *i_private; /* fs or device private pointer */ void *i_private; /* fs or device private pointer */
} __randomize_layout; } __randomize_layout;
/*
 * Cache a symlink target on the inode: store the target pointer in i_link
 * and its length in i_linklen, then set IOP_CACHED_LINK in i_opflags.
 * vfs_readlink() tests that flag and, when set, copies i_link using
 * i_linklen instead of calling strlen() on the target.
 *
 * linklen takes the place of strlen(link) in readlink_copy(), so it is
 * presumably the length without the terminating NUL -- confirm at callers.
 *
 * NOTE(review): i_linklen sits in an anonymous union with i_devices, so
 * this looks intended only for inodes that never use i_devices -- confirm.
 */
static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
{
inode->i_link = link;
inode->i_linklen = linklen;
inode->i_opflags |= IOP_CACHED_LINK;
}
/* /*
* Get bit address from inode->i_state to use with wait_var_event() * Get bit address from inode->i_state to use with wait_var_event()
* infrastructure. * infrastructure.
@ -2127,6 +2140,8 @@ struct file_operations {
#define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5))
/* Supports asynchronous lock callbacks */ /* Supports asynchronous lock callbacks */
#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6))
/* File system supports uncached read/write buffered IO */
#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7))
/* Wrap a directory iterator that needs exclusive inode access */ /* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *, int wrap_directory_iterator(struct file *, struct dir_context *,
@ -3351,7 +3366,7 @@ extern const struct file_operations generic_ro_fops;
#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
extern int readlink_copy(char __user *, int, const char *); extern int readlink_copy(char __user *, int, const char *, int);
extern int page_readlink(struct dentry *, char __user *, int); extern int page_readlink(struct dentry *, char __user *, int);
extern const char *page_get_link(struct dentry *, struct inode *, extern const char *page_get_link(struct dentry *, struct inode *,
struct delayed_call *); struct delayed_call *);
@ -3614,6 +3629,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
if (flags & RWF_DONTCACHE) {
/* file system must support it */
if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE))
return -EOPNOTSUPP;
/* DAX mappings not supported */
if (IS_DAX(ki->ki_filp->f_mapping->host))
return -EOPNOTSUPP;
}
kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
if (flags & RWF_SYNC) if (flags & RWF_SYNC)
kiocb_flags |= IOCB_DSYNC; kiocb_flags |= IOCB_DSYNC;

View file

@ -34,14 +34,24 @@ struct lockref {
}; };
}; };
extern void lockref_get(struct lockref *); /**
extern int lockref_put_return(struct lockref *); * lockref_init - Initialize a lockref
extern int lockref_get_not_zero(struct lockref *); * @lockref: pointer to lockref structure
extern int lockref_put_not_zero(struct lockref *); * @count: initial count
extern int lockref_put_or_lock(struct lockref *); */
static inline void lockref_init(struct lockref *lockref, unsigned int count)
{
/* Initialize the lock member, then store the caller-supplied starting count. */
spin_lock_init(&lockref->lock);
lockref->count = count;
}
extern void lockref_mark_dead(struct lockref *); void lockref_get(struct lockref *lockref);
extern int lockref_get_not_dead(struct lockref *); int lockref_put_return(struct lockref *lockref);
bool lockref_get_not_zero(struct lockref *lockref);
bool lockref_put_or_lock(struct lockref *lockref);
void lockref_mark_dead(struct lockref *lockref);
bool lockref_get_not_dead(struct lockref *lockref);
/* Must be called under spinlock for reliable results */ /* Must be called under spinlock for reliable results */
static inline bool __lockref_is_dead(const struct lockref *l) static inline bool __lockref_is_dead(const struct lockref *l)

View file

@ -75,7 +75,7 @@ struct vfsmount {
static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt)
{ {
/* Pairs with smp_store_release() in do_idmap_mount(). */ /* Pairs with smp_store_release() in do_idmap_mount(). */
return smp_load_acquire(&mnt->mnt_idmap); return READ_ONCE(mnt->mnt_idmap);
} }
extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write(struct vfsmount *mnt);
@ -113,7 +113,7 @@ extern struct vfsmount *kern_mount(struct file_system_type *);
extern void kern_unmount(struct vfsmount *mnt); extern void kern_unmount(struct vfsmount *mnt);
extern int may_umount_tree(struct vfsmount *); extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *); extern int may_umount(struct vfsmount *);
extern long do_mount(const char *, const char __user *, int do_mount(const char *, const char __user *,
const char *, unsigned long, void *); const char *, unsigned long, void *);
extern struct vfsmount *collect_mounts(const struct path *); extern struct vfsmount *collect_mounts(const struct path *);
extern void drop_collected_mounts(struct vfsmount *); extern void drop_collected_mounts(struct vfsmount *);

View file

@ -272,7 +272,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
({ \ ({ \
unsigned __seq; \ unsigned __seq; \
\ \
while ((__seq = seqprop_sequence(s)) & 1) \ while (unlikely((__seq = seqprop_sequence(s)) & 1)) \
cpu_relax(); \ cpu_relax(); \
\ \
kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \

View file

@ -14,37 +14,56 @@
#include <linux/types.h> #include <linux/types.h>
/**
* struct fiemap_extent - description of one fiemap extent
* @fe_logical: byte offset of the extent in the file
* @fe_physical: byte offset of extent on disk
* @fe_length: length in bytes for this extent
* @fe_flags: FIEMAP_EXTENT_* flags for this extent
*/
struct fiemap_extent { struct fiemap_extent {
__u64 fe_logical; /* logical offset in bytes for the start of __u64 fe_logical;
* the extent from the beginning of the file */ __u64 fe_physical;
__u64 fe_physical; /* physical offset in bytes for the start __u64 fe_length;
* of the extent from the beginning of the disk */ /* private: */
__u64 fe_length; /* length in bytes for this extent */
__u64 fe_reserved64[2]; __u64 fe_reserved64[2];
__u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ /* public: */
__u32 fe_flags;
/* private: */
__u32 fe_reserved[3]; __u32 fe_reserved[3];
}; };
/**
* struct fiemap - file extent mappings
* @fm_start: byte offset (inclusive) at which to start mapping (in)
* @fm_length: logical length of mapping which userspace wants (in)
* @fm_flags: FIEMAP_FLAG_* flags for request (in/out)
* @fm_mapped_extents: number of extents that were mapped (out)
* @fm_extent_count: size of fm_extents array (in)
* @fm_extents: array of mapped extents (out)
*/
struct fiemap { struct fiemap {
__u64 fm_start; /* logical offset (inclusive) at __u64 fm_start;
* which to start mapping (in) */ __u64 fm_length;
__u64 fm_length; /* logical length of mapping which __u32 fm_flags;
* userspace wants (in) */ __u32 fm_mapped_extents;
__u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ __u32 fm_extent_count;
__u32 fm_mapped_extents;/* number of extents that were mapped (out) */ /* private: */
__u32 fm_extent_count; /* size of fm_extents array (in) */
__u32 fm_reserved; __u32 fm_reserved;
struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */ /* public: */
struct fiemap_extent fm_extents[];
}; };
#define FIEMAP_MAX_OFFSET (~0ULL) #define FIEMAP_MAX_OFFSET (~0ULL)
/* flags used in fm_flags: */
#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ #define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ #define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
#define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */ #define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */
#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) #define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
/* flags used in fe_flags: */
#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ #define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */
#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ #define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */
#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. #define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending.

View file

@ -332,9 +332,13 @@ typedef int __bitwise __kernel_rwf_t;
/* Atomic Write */ /* Atomic Write */
#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) #define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040)
/* buffered IO that drops the cache after reading or writing data */
#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080)
/* mask of flags supported by the kernel */ /* mask of flags supported by the kernel */
#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\
RWF_DONTCACHE)
#define PROCFS_IOCTL_MAGIC 'f' #define PROCFS_IOCTL_MAGIC 'f'

View file

@ -71,7 +71,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
bit /= WATCH_QUEUE_NOTE_SIZE; bit /= WATCH_QUEUE_NOTE_SIZE;
page = buf->page; page = buf->page;
bit += page->index; bit += page->private;
set_bit(bit, wqueue->notes_bitmap); set_bit(bit, wqueue->notes_bitmap);
generic_pipe_buf_release(pipe, buf); generic_pipe_buf_release(pipe, buf);
@ -278,7 +278,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
pages[i] = alloc_page(GFP_KERNEL); pages[i] = alloc_page(GFP_KERNEL);
if (!pages[i]) if (!pages[i])
goto error_p; goto error_p;
pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; pages[i]->private = i * WATCH_QUEUE_NOTES_PER_PAGE;
} }
bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); bitmap = bitmap_alloc(nr_notes, GFP_KERNEL);

View file

@ -58,63 +58,34 @@ EXPORT_SYMBOL(lockref_get);
* @lockref: pointer to lockref structure * @lockref: pointer to lockref structure
* Return: 1 if count updated successfully or 0 if count was zero * Return: 1 if count updated successfully or 0 if count was zero
*/ */
int lockref_get_not_zero(struct lockref *lockref) bool lockref_get_not_zero(struct lockref *lockref)
{ {
int retval; bool retval = false;
CMPXCHG_LOOP( CMPXCHG_LOOP(
new.count++; new.count++;
if (old.count <= 0) if (old.count <= 0)
return 0; return false;
, ,
return 1; return true;
); );
spin_lock(&lockref->lock); spin_lock(&lockref->lock);
retval = 0;
if (lockref->count > 0) { if (lockref->count > 0) {
lockref->count++; lockref->count++;
retval = 1; retval = true;
} }
spin_unlock(&lockref->lock); spin_unlock(&lockref->lock);
return retval; return retval;
} }
EXPORT_SYMBOL(lockref_get_not_zero); EXPORT_SYMBOL(lockref_get_not_zero);
/**
* lockref_put_not_zero - Decrements count unless count <= 1 before decrement
* @lockref: pointer to lockref structure
* Return: 1 if count updated successfully or 0 if count would become zero
*/
int lockref_put_not_zero(struct lockref *lockref)
{
int retval;
CMPXCHG_LOOP(
new.count--;
if (old.count <= 1)
return 0;
,
return 1;
);
spin_lock(&lockref->lock);
retval = 0;
if (lockref->count > 1) {
lockref->count--;
retval = 1;
}
spin_unlock(&lockref->lock);
return retval;
}
EXPORT_SYMBOL(lockref_put_not_zero);
/** /**
* lockref_put_return - Decrement reference count if possible * lockref_put_return - Decrement reference count if possible
* @lockref: pointer to lockref structure * @lockref: pointer to lockref structure
* *
* Decrement the reference count and return the new value. * Decrement the reference count and return the new value.
* If the lockref was dead or locked, return an error. * If the lockref was dead or locked, return -1.
*/ */
int lockref_put_return(struct lockref *lockref) int lockref_put_return(struct lockref *lockref)
{ {
@ -134,22 +105,22 @@ EXPORT_SYMBOL(lockref_put_return);
* @lockref: pointer to lockref structure * @lockref: pointer to lockref structure
* Return: 1 if count updated successfully or 0 if count <= 1 and lock taken * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
*/ */
int lockref_put_or_lock(struct lockref *lockref) bool lockref_put_or_lock(struct lockref *lockref)
{ {
CMPXCHG_LOOP( CMPXCHG_LOOP(
new.count--; new.count--;
if (old.count <= 1) if (old.count <= 1)
break; break;
, ,
return 1; return true;
); );
spin_lock(&lockref->lock); spin_lock(&lockref->lock);
if (lockref->count <= 1) if (lockref->count <= 1)
return 0; return false;
lockref->count--; lockref->count--;
spin_unlock(&lockref->lock); spin_unlock(&lockref->lock);
return 1; return true;
} }
EXPORT_SYMBOL(lockref_put_or_lock); EXPORT_SYMBOL(lockref_put_or_lock);
@ -169,23 +140,22 @@ EXPORT_SYMBOL(lockref_mark_dead);
* @lockref: pointer to lockref structure * @lockref: pointer to lockref structure
* Return: 1 if count updated successfully or 0 if lockref was dead * Return: 1 if count updated successfully or 0 if lockref was dead
*/ */
int lockref_get_not_dead(struct lockref *lockref) bool lockref_get_not_dead(struct lockref *lockref)
{ {
int retval; bool retval = false;
CMPXCHG_LOOP( CMPXCHG_LOOP(
new.count++; new.count++;
if (old.count < 0) if (old.count < 0)
return 0; return false;
, ,
return 1; return true;
); );
spin_lock(&lockref->lock); spin_lock(&lockref->lock);
retval = 0;
if (lockref->count >= 0) { if (lockref->count >= 0) {
lockref->count++; lockref->count++;
retval = 1; retval = true;
} }
spin_unlock(&lockref->lock); spin_unlock(&lockref->lock);
return retval; return retval;

View file

@ -3917,6 +3917,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
int len; int len;
struct inode *inode; struct inode *inode;
struct folio *folio; struct folio *folio;
char *link;
len = strlen(symname) + 1; len = strlen(symname) + 1;
if (len > PAGE_SIZE) if (len > PAGE_SIZE)
@ -3938,12 +3939,13 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode->i_size = len-1; inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) { if (len <= SHORT_SYMLINK_LEN) {
inode->i_link = kmemdup(symname, len, GFP_KERNEL); link = kmemdup(symname, len, GFP_KERNEL);
if (!inode->i_link) { if (!link) {
error = -ENOMEM; error = -ENOMEM;
goto out_remove_offset; goto out_remove_offset;
} }
inode->i_op = &shmem_short_symlink_operations; inode->i_op = &shmem_short_symlink_operations;
inode_set_cached_link(inode, link, len - 1);
} else { } else {
inode_nohighmem(inode); inode_nohighmem(inode);
inode->i_mapping->a_ops = &shmem_aops; inode->i_mapping->a_ops = &shmem_aops;

View file

@ -2612,7 +2612,7 @@ static int policy_readlink(struct dentry *dentry, char __user *buffer,
res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME, res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME,
d_inode(dentry)->i_ino); d_inode(dentry)->i_ino);
if (res > 0 && res < sizeof(name)) if (res > 0 && res < sizeof(name))
res = readlink_copy(buffer, buflen, name); res = readlink_copy(buffer, buflen, name, strlen(name));
else else
res = -ENOENT; res = -ENOENT;

View file

@ -0,0 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
# Compile against the include paths in KHDR_INCLUDES.
# NOTE(review): KHDR_INCLUDES is not defined here; presumably supplied by the selftests build — confirm.
CFLAGS = $(KHDR_INCLUDES)
# Test program name handed to ../lib.mk.
TEST_GEN_PROGS := stackdump_test
# Extra file listed alongside the test (the "stackdump" script).
TEST_FILES := stackdump
# Shared rules come from the parent directory's lib.mk.
include ../lib.mk

View file

@ -0,0 +1,50 @@
coredump selftest
=================
Background context
------------------
`coredump` is a feature which dumps a process's memory space when the process terminates
unexpectedly (e.g. due to segmentation fault), which can be useful for debugging. By default,
`coredump` dumps the memory to the file named `core`, but this behavior can be changed by writing a
different file name to `/proc/sys/kernel/core_pattern`. Furthermore, `coredump` can be piped to a
user-space program by writing the pipe symbol (`|`) followed by the command to be executed to
`/proc/sys/kernel/core_pattern`. For the full description, see `man 5 core`.
The piped user program may be interested in reading the stack pointers of the crashed process. The
crashed process's stack pointers can be read from `procfs`: it is the `kstkesp` field in
`/proc/$PID/stat`. See `man 5 proc` for all the details.
The problem
-----------
While a thread is active, the stack pointer is unsafe to read and therefore the `kstkesp` field
reads zero. But when the thread is dead (e.g. during a coredump), this field should have a valid
value.
However, this was broken in the past and `kstkesp` was zero even during coredump:
* commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat") changed kstkesp to
always be zero
* commit fd7d56270b52 ("fs/proc: Report eip/esp in /prod/PID/stat for coredumping") fixed it for the
coredumping thread. However, other threads in a coredumping process still had the problem.
* commit cb8f381f1613 ("fs/proc/array.c: allow reporting eip/esp for all coredumping threads") fixed
it for all threads in a coredumping process.
* commit 92307383082d ("coredump: Don't perform any cleanups before dumping core") broke it again
for the other threads in a coredumping process.
The problem has been fixed now, but considering the history, it may appear again in the future.
The goal of this test
---------------------
This test detects problems with reading `kstkesp` during coredump by doing the following:
#. Tell the kernel to execute the "stackdump" script when a coredump happens. This script
reads the stack pointers of all threads of crashed processes.
#. Spawn a child process that creates some threads and then crashes.
#. Read the output from the "stackdump" script, and make sure all stack pointer values are
non-zero.

View file

@ -0,0 +1,14 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
#
# Record field 29 of /proc/<tid>/stat for every thread of a process.
#
# Usage: stackdump <pid> <output-file>
#   $1  PID of the process whose threads are inspected
#   $2  path of the file that receives one value per line, one line per thread

CRASH_PROGRAM_ID=$1
STACKDUMP_FILE=$2

# Collect into a temporary file first and rename at the end, so the output
# file only appears once every thread has been recorded.
TMP=$(mktemp)

for t in /proc/"$CRASH_PROGRAM_ID"/task/*; do
	tid=$(basename "$t")
	awk '{print $29}' "/proc/$tid/stat" >> "$TMP"
done

mv "$TMP" "$STACKDUMP_FILE"

View file

@ -0,0 +1,151 @@
// SPDX-License-Identifier: GPL-2.0
#include <fcntl.h>
#include <libgen.h>
#include <linux/limits.h>
#include <pthread.h>
#include <string.h>
#include <sys/resource.h>
#include <unistd.h>
#include "../kselftest_harness.h"
#define STACKDUMP_FILE "stack_values"
#define STACKDUMP_SCRIPT "stackdump"
#define NUM_THREAD_SPAWN 128
/*
 * Thread entry point for the helper threads: sleep in pause() forever.
 *
 * @arg: unused. It is named because omitting a parameter name in a function
 *       definition is only valid C from C23 onwards.
 *
 * Never returns.
 */
static void *do_nothing(void *arg)
{
	(void)arg;

	while (1)
		pause();
}
/*
 * Spawn NUM_THREAD_SPAWN threads running do_nothing(), then crash the calling
 * process on purpose.
 *
 * pthread_create() failures are not reported here; the caller's check of the
 * number of reported threads catches a short count.
 */
static void crashing_child(void)
{
	pthread_t thread;
	int i;

	for (i = 0; i < NUM_THREAD_SPAWN; ++i)
		pthread_create(&thread, NULL, do_nothing, NULL);

	/*
	 * Crash on purpose. The access goes through a volatile pointer because
	 * a plain NULL dereference is undefined behaviour that the compiler is
	 * allowed to delete, in which case no crash would happen.
	 */
	i = *(volatile int *)NULL;
}
/* Per-test state for the coredump fixture. */
FIXTURE(coredump)
{
/* NUL-terminated copy of /proc/sys/kernel/core_pattern taken in setup and written back in teardown. */
char original_core_pattern[256];
};
/*
 * Save the current contents of /proc/sys/kernel/core_pattern into the fixture
 * so that teardown can restore them.
 */
FIXTURE_SETUP(coredump)
{
	size_t nread;
	FILE *file;
	int ret;

	file = fopen("/proc/sys/kernel/core_pattern", "r");
	ASSERT_NE(NULL, file);

	nread = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file);
	/* Reading zero bytes is only acceptable if the file is genuinely empty. */
	ASSERT_TRUE(nread || feof(file));
	/* Must be strictly shorter than the buffer so the terminator below fits. */
	ASSERT_LT(nread, sizeof(self->original_core_pattern));

	self->original_core_pattern[nread] = '\0';

	ret = fclose(file);
	ASSERT_EQ(0, ret);
}
/*
 * Remove the stack dump produced by the test and write the saved core_pattern
 * back to /proc/sys/kernel/core_pattern. Failures are reported on stderr; they
 * do not fail the test.
 */
FIXTURE_TEARDOWN(coredump)
{
	const char *reason;
	FILE *file;
	int ret;

	unlink(STACKDUMP_FILE);

	file = fopen("/proc/sys/kernel/core_pattern", "w");
	if (!file) {
		reason = "Unable to open core_pattern";
		goto fail;
	}

	ret = fprintf(file, "%s", self->original_core_pattern);
	if (ret < 0) {
		reason = "Unable to write to core_pattern";
		/* Do not leak the stream on the error path. */
		fclose(file);
		goto fail;
	}

	ret = fclose(file);
	if (ret) {
		reason = "Unable to close core_pattern";
		goto fail;
	}

	return;
fail:
	/* This should never happen */
	fprintf(stderr, "Failed to cleanup stackdump test: %s\n", reason);
}
/*
 * Check that every thread of a crashing process reports a non-zero stack
 * pointer while the coredump helper runs.
 */
TEST_F(coredump, stackdump)
{
	unsigned long long stack;
	/* getline() requires a NULL (or malloc'ed) buffer and matching length. */
	char *test_dir, *line = NULL;
	size_t line_length = 0;
	char buf[PATH_MAX];
	int ret, i;
	FILE *file;
	pid_t pid;

	/*
	 * Step 1: Setup core_pattern so that the stackdump script is executed when the child
	 * process crashes
	 */
	ret = readlink("/proc/self/exe", buf, sizeof(buf));
	ASSERT_NE(-1, ret);
	ASSERT_LT(ret, sizeof(buf));
	/* readlink() does not NUL-terminate its result. */
	buf[ret] = '\0';

	test_dir = dirname(buf);

	file = fopen("/proc/sys/kernel/core_pattern", "w");
	ASSERT_NE(NULL, file);

	ret = fprintf(file, "|%1$s/%2$s %%P %1$s/%3$s", test_dir, STACKDUMP_SCRIPT, STACKDUMP_FILE);
	ASSERT_LT(0, ret);

	ret = fclose(file);
	ASSERT_EQ(0, ret);

	/* Step 2: Create a process who spawns some threads then crashes */
	pid = fork();
	ASSERT_TRUE(pid >= 0);
	if (pid == 0) {
		crashing_child();
		/*
		 * Only reached if the crash did not happen: do not let the
		 * child carry on running the parent's part of the test.
		 */
		_exit(1);
	}

	/*
	 * Step 3: Wait for the stackdump script to write the stack pointers to the stackdump file
	 */
	for (i = 0; i < 10; ++i) {
		file = fopen(STACKDUMP_FILE, "r");
		if (file)
			break;
		sleep(1);
	}
	ASSERT_NE(file, NULL);

	/* Step 4: Make sure all stack pointer values are non-zero */
	for (i = 0; -1 != getline(&line, &line_length, file); ++i) {
		stack = strtoull(line, NULL, 10);
		ASSERT_NE(stack, 0);
	}
	free(line);

	/* One line per spawned thread plus one for the main thread. */
	ASSERT_EQ(i, 1 + NUM_THREAD_SPAWN);

	fclose(file);
}
TEST_HARNESS_MAIN