mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-23 16:53:58 -05:00
shm: extend forced shm destroy to support objects from several IPC nses
Currently, the exit_shm() function not designed to work properly when
task->sysvshm.shm_clist holds shm objects from different IPC namespaces.
This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it
leads to use-after-free (reproducer exists).
This is an attempt to fix the problem by extending exit_shm mechanism to
handle shm's destroy from several IPC ns'es.
To achieve that we do several things:
1. add a namespace (non-refcounted) pointer to the struct shmid_kernel
2. during new shm object creation (newseg()/shmget syscall) we
initialize this pointer by current task IPC ns
3. exit_shm() fully reworked such that it traverses over all shp's in
task->sysvshm.shm_clist and gets IPC namespace not from current task
as it was before but from shp's object itself, then call
shm_destroy(shp, ns).
Note: We need to be really careful here, because as it was said before
(1), our pointer to IPC ns non-refcnt'ed. To be on the safe side we
using special helper get_ipc_ns_not_zero() which allows to get IPC ns
refcounter only if IPC ns not in the "state of destruction".
Q/A
Q: Why can we access shp->ns memory using non-refcounted pointer?
A: Because shp object lifetime is always shorther than IPC namespace
lifetime, so, if we get shp object from the task->sysvshm.shm_clist
while holding task_lock(task) nobody can steal our namespace.
Q: Does this patch change semantics of unshare/setns/clone syscalls?
A: No. It's just fixes non-covered case when process may leave IPC
namespace without getting task->sysvshm.shm_clist list cleaned up.
Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com
Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com
Fixes: ab602f7991
("shm: make exit_shm work proportional to task activity")
Co-developed-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Cc: Vasily Averin <vvs@virtuozzo.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
126e8bee94
commit
85b6d24646
3 changed files with 163 additions and 51 deletions
|
@ -131,6 +131,16 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
|
||||||
return ns;
|
return ns;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
|
||||||
|
{
|
||||||
|
if (ns) {
|
||||||
|
if (refcount_inc_not_zero(&ns->ns.count))
|
||||||
|
return ns;
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
extern void put_ipc_ns(struct ipc_namespace *ns);
|
extern void put_ipc_ns(struct ipc_namespace *ns);
|
||||||
#else
|
#else
|
||||||
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
|
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
|
||||||
|
@ -147,6 +157,11 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
|
||||||
return ns;
|
return ns;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
|
||||||
|
{
|
||||||
|
return ns;
|
||||||
|
}
|
||||||
|
|
||||||
static inline void put_ipc_ns(struct ipc_namespace *ns)
|
static inline void put_ipc_ns(struct ipc_namespace *ns)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
|
@ -158,7 +158,7 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
|
||||||
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
|
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
|
||||||
* subscriptions and synchronises with wait4(). Also used in procfs. Also
|
* subscriptions and synchronises with wait4(). Also used in procfs. Also
|
||||||
* pins the final release of task.io_context. Also protects ->cpuset and
|
* pins the final release of task.io_context. Also protects ->cpuset and
|
||||||
* ->cgroup.subsys[]. And ->vfork_done.
|
* ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
|
||||||
*
|
*
|
||||||
* Nests both inside and outside of read_lock(&tasklist_lock).
|
* Nests both inside and outside of read_lock(&tasklist_lock).
|
||||||
* It must not be nested with write_lock_irq(&tasklist_lock),
|
* It must not be nested with write_lock_irq(&tasklist_lock),
|
||||||
|
|
169
ipc/shm.c
169
ipc/shm.c
|
@ -62,9 +62,18 @@ struct shmid_kernel /* private to the kernel */
|
||||||
struct pid *shm_lprid;
|
struct pid *shm_lprid;
|
||||||
struct ucounts *mlock_ucounts;
|
struct ucounts *mlock_ucounts;
|
||||||
|
|
||||||
/* The task created the shm object. NULL if the task is dead. */
|
/*
|
||||||
|
* The task created the shm object, for
|
||||||
|
* task_lock(shp->shm_creator)
|
||||||
|
*/
|
||||||
struct task_struct *shm_creator;
|
struct task_struct *shm_creator;
|
||||||
struct list_head shm_clist; /* list by creator */
|
|
||||||
|
/*
|
||||||
|
* List by creator. task_lock(->shm_creator) required for read/write.
|
||||||
|
* If list_empty(), then the creator is dead already.
|
||||||
|
*/
|
||||||
|
struct list_head shm_clist;
|
||||||
|
struct ipc_namespace *ns;
|
||||||
} __randomize_layout;
|
} __randomize_layout;
|
||||||
|
|
||||||
/* shm_mode upper byte flags */
|
/* shm_mode upper byte flags */
|
||||||
|
@ -115,6 +124,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
|
||||||
struct shmid_kernel *shp;
|
struct shmid_kernel *shp;
|
||||||
|
|
||||||
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
|
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
|
||||||
|
WARN_ON(ns != shp->ns);
|
||||||
|
|
||||||
if (shp->shm_nattch) {
|
if (shp->shm_nattch) {
|
||||||
shp->shm_perm.mode |= SHM_DEST;
|
shp->shm_perm.mode |= SHM_DEST;
|
||||||
|
@ -225,10 +235,43 @@ static void shm_rcu_free(struct rcu_head *head)
|
||||||
kfree(shp);
|
kfree(shp);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
|
/*
|
||||||
|
* It has to be called with shp locked.
|
||||||
|
* It must be called before ipc_rmid()
|
||||||
|
*/
|
||||||
|
static inline void shm_clist_rm(struct shmid_kernel *shp)
|
||||||
{
|
{
|
||||||
list_del(&s->shm_clist);
|
struct task_struct *creator;
|
||||||
ipc_rmid(&shm_ids(ns), &s->shm_perm);
|
|
||||||
|
/* ensure that shm_creator does not disappear */
|
||||||
|
rcu_read_lock();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A concurrent exit_shm may do a list_del_init() as well.
|
||||||
|
* Just do nothing if exit_shm already did the work
|
||||||
|
*/
|
||||||
|
if (!list_empty(&shp->shm_clist)) {
|
||||||
|
/*
|
||||||
|
* shp->shm_creator is guaranteed to be valid *only*
|
||||||
|
* if shp->shm_clist is not empty.
|
||||||
|
*/
|
||||||
|
creator = shp->shm_creator;
|
||||||
|
|
||||||
|
task_lock(creator);
|
||||||
|
/*
|
||||||
|
* list_del_init() is a nop if the entry was already removed
|
||||||
|
* from the list.
|
||||||
|
*/
|
||||||
|
list_del_init(&shp->shm_clist);
|
||||||
|
task_unlock(creator);
|
||||||
|
}
|
||||||
|
rcu_read_unlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void shm_rmid(struct shmid_kernel *s)
|
||||||
|
{
|
||||||
|
shm_clist_rm(s);
|
||||||
|
ipc_rmid(&shm_ids(s->ns), &s->shm_perm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -283,7 +326,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
|
||||||
shm_file = shp->shm_file;
|
shm_file = shp->shm_file;
|
||||||
shp->shm_file = NULL;
|
shp->shm_file = NULL;
|
||||||
ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||||
shm_rmid(ns, shp);
|
shm_rmid(shp);
|
||||||
shm_unlock(shp);
|
shm_unlock(shp);
|
||||||
if (!is_file_hugepages(shm_file))
|
if (!is_file_hugepages(shm_file))
|
||||||
shmem_lock(shm_file, 0, shp->mlock_ucounts);
|
shmem_lock(shm_file, 0, shp->mlock_ucounts);
|
||||||
|
@ -303,10 +346,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
|
||||||
*
|
*
|
||||||
* 2) sysctl kernel.shm_rmid_forced is set to 1.
|
* 2) sysctl kernel.shm_rmid_forced is set to 1.
|
||||||
*/
|
*/
|
||||||
static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
|
static bool shm_may_destroy(struct shmid_kernel *shp)
|
||||||
{
|
{
|
||||||
return (shp->shm_nattch == 0) &&
|
return (shp->shm_nattch == 0) &&
|
||||||
(ns->shm_rmid_forced ||
|
(shp->ns->shm_rmid_forced ||
|
||||||
(shp->shm_perm.mode & SHM_DEST));
|
(shp->shm_perm.mode & SHM_DEST));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -337,7 +380,7 @@ static void shm_close(struct vm_area_struct *vma)
|
||||||
ipc_update_pid(&shp->shm_lprid, task_tgid(current));
|
ipc_update_pid(&shp->shm_lprid, task_tgid(current));
|
||||||
shp->shm_dtim = ktime_get_real_seconds();
|
shp->shm_dtim = ktime_get_real_seconds();
|
||||||
shp->shm_nattch--;
|
shp->shm_nattch--;
|
||||||
if (shm_may_destroy(ns, shp))
|
if (shm_may_destroy(shp))
|
||||||
shm_destroy(ns, shp);
|
shm_destroy(ns, shp);
|
||||||
else
|
else
|
||||||
shm_unlock(shp);
|
shm_unlock(shp);
|
||||||
|
@ -358,10 +401,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
|
||||||
*
|
*
|
||||||
* As shp->* are changed under rwsem, it's safe to skip shp locking.
|
* As shp->* are changed under rwsem, it's safe to skip shp locking.
|
||||||
*/
|
*/
|
||||||
if (shp->shm_creator != NULL)
|
if (!list_empty(&shp->shm_clist))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if (shm_may_destroy(ns, shp)) {
|
if (shm_may_destroy(shp)) {
|
||||||
shm_lock_by_ptr(shp);
|
shm_lock_by_ptr(shp);
|
||||||
shm_destroy(ns, shp);
|
shm_destroy(ns, shp);
|
||||||
}
|
}
|
||||||
|
@ -379,48 +422,97 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
|
||||||
/* Locking assumes this will only be called with task == current */
|
/* Locking assumes this will only be called with task == current */
|
||||||
void exit_shm(struct task_struct *task)
|
void exit_shm(struct task_struct *task)
|
||||||
{
|
{
|
||||||
struct ipc_namespace *ns = task->nsproxy->ipc_ns;
|
for (;;) {
|
||||||
struct shmid_kernel *shp, *n;
|
struct shmid_kernel *shp;
|
||||||
|
struct ipc_namespace *ns;
|
||||||
|
|
||||||
if (list_empty(&task->sysvshm.shm_clist))
|
task_lock(task);
|
||||||
return;
|
|
||||||
|
if (list_empty(&task->sysvshm.shm_clist)) {
|
||||||
|
task_unlock(task);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
|
||||||
|
shm_clist);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If kernel.shm_rmid_forced is not set then only keep track of
|
* 1) Get pointer to the ipc namespace. It is worth to say
|
||||||
|
* that this pointer is guaranteed to be valid because
|
||||||
|
* shp lifetime is always shorter than namespace lifetime
|
||||||
|
* in which shp lives.
|
||||||
|
* We taken task_lock it means that shp won't be freed.
|
||||||
|
*/
|
||||||
|
ns = shp->ns;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 2) If kernel.shm_rmid_forced is not set then only keep track of
|
||||||
* which shmids are orphaned, so that a later set of the sysctl
|
* which shmids are orphaned, so that a later set of the sysctl
|
||||||
* can clean them up.
|
* can clean them up.
|
||||||
*/
|
*/
|
||||||
if (!ns->shm_rmid_forced) {
|
if (!ns->shm_rmid_forced)
|
||||||
down_read(&shm_ids(ns).rwsem);
|
goto unlink_continue;
|
||||||
list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
|
|
||||||
shp->shm_creator = NULL;
|
|
||||||
/*
|
/*
|
||||||
* Only under read lock but we are only called on current
|
* 3) get a reference to the namespace.
|
||||||
* so no entry on the list will be shared.
|
* The refcount could be already 0. If it is 0, then
|
||||||
|
* the shm objects will be free by free_ipc_work().
|
||||||
*/
|
*/
|
||||||
list_del(&task->sysvshm.shm_clist);
|
ns = get_ipc_ns_not_zero(ns);
|
||||||
up_read(&shm_ids(ns).rwsem);
|
if (!ns) {
|
||||||
return;
|
unlink_continue:
|
||||||
|
list_del_init(&shp->shm_clist);
|
||||||
|
task_unlock(task);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Destroy all already created segments, that were not yet mapped,
|
* 4) get a reference to shp.
|
||||||
* and mark any mapped as orphan to cover the sysctl toggling.
|
* This cannot fail: shm_clist_rm() is called before
|
||||||
* Destroy is skipped if shm_may_destroy() returns false.
|
* ipc_rmid(), thus the refcount cannot be 0.
|
||||||
|
*/
|
||||||
|
WARN_ON(!ipc_rcu_getref(&shp->shm_perm));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 5) unlink the shm segment from the list of segments
|
||||||
|
* created by current.
|
||||||
|
* This must be done last. After unlinking,
|
||||||
|
* only the refcounts obtained above prevent IPC_RMID
|
||||||
|
* from destroying the segment or the namespace.
|
||||||
|
*/
|
||||||
|
list_del_init(&shp->shm_clist);
|
||||||
|
|
||||||
|
task_unlock(task);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 6) we have all references
|
||||||
|
* Thus lock & if needed destroy shp.
|
||||||
*/
|
*/
|
||||||
down_write(&shm_ids(ns).rwsem);
|
down_write(&shm_ids(ns).rwsem);
|
||||||
list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
|
|
||||||
shp->shm_creator = NULL;
|
|
||||||
|
|
||||||
if (shm_may_destroy(ns, shp)) {
|
|
||||||
shm_lock_by_ptr(shp);
|
shm_lock_by_ptr(shp);
|
||||||
|
/*
|
||||||
|
* rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's
|
||||||
|
* safe to call ipc_rcu_putref here
|
||||||
|
*/
|
||||||
|
ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
|
||||||
|
|
||||||
|
if (ipc_valid_object(&shp->shm_perm)) {
|
||||||
|
if (shm_may_destroy(shp))
|
||||||
shm_destroy(ns, shp);
|
shm_destroy(ns, shp);
|
||||||
}
|
else
|
||||||
|
shm_unlock(shp);
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* Someone else deleted the shp from namespace
|
||||||
|
* idr/kht while we have waited.
|
||||||
|
* Just unlock and continue.
|
||||||
|
*/
|
||||||
|
shm_unlock(shp);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Remove the list head from any segments still attached. */
|
|
||||||
list_del(&task->sysvshm.shm_clist);
|
|
||||||
up_write(&shm_ids(ns).rwsem);
|
up_write(&shm_ids(ns).rwsem);
|
||||||
|
put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static vm_fault_t shm_fault(struct vm_fault *vmf)
|
static vm_fault_t shm_fault(struct vm_fault *vmf)
|
||||||
|
@ -676,7 +768,11 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
|
||||||
if (error < 0)
|
if (error < 0)
|
||||||
goto no_id;
|
goto no_id;
|
||||||
|
|
||||||
|
shp->ns = ns;
|
||||||
|
|
||||||
|
task_lock(current);
|
||||||
list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist);
|
list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist);
|
||||||
|
task_unlock(current);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* shmid gets reported as "inode#" in /proc/pid/maps.
|
* shmid gets reported as "inode#" in /proc/pid/maps.
|
||||||
|
@ -1567,7 +1663,8 @@ out_nattch:
|
||||||
down_write(&shm_ids(ns).rwsem);
|
down_write(&shm_ids(ns).rwsem);
|
||||||
shp = shm_lock(ns, shmid);
|
shp = shm_lock(ns, shmid);
|
||||||
shp->shm_nattch--;
|
shp->shm_nattch--;
|
||||||
if (shm_may_destroy(ns, shp))
|
|
||||||
|
if (shm_may_destroy(shp))
|
||||||
shm_destroy(ns, shp);
|
shm_destroy(ns, shp);
|
||||||
else
|
else
|
||||||
shm_unlock(shp);
|
shm_unlock(shp);
|
||||||
|
|
Loading…
Add table
Reference in a new issue