mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-23 16:53:58 -05:00
Reimplement RLIMIT_NPROC on top of ucounts
The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. To illustrate the impact of rlimits, let's say there is a program that does not fork. Some service-A wants to run this program as user X in multiple containers. Since the program never forks, the service wants to set RLIMIT_NPROC=1. service-A \- program (uid=1000, container1, rlimit_nproc=1) \- program (uid=1000, container2, rlimit_nproc=1) The service-A sets RLIMIT_NPROC=1 and runs the program in container1. When the service-A tries to run a program with RLIMIT_NPROC=1 in container2 it fails since user X already has one running process. We cannot use existing inc_ucounts / dec_ucounts because they do not allow us to exceed the maximum for the counter. Some rlimits can be overlimited by root or if the user has the appropriate capability. Changelog v11: * Change inc_rlimit_ucounts() which now returns top value of ucounts. * Drop inc_rlimit_ucounts_and_test() because the return code of inc_rlimit_ucounts() can be checked. Signed-off-by: Alexey Gladkov <legion@kernel.org> Link: https://lkml.kernel.org/r/c5286a8aa16d2d698c222f7532f3d735c82bc6bc.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
This commit is contained in:
parent
b6c3365289
commit
21d1c5e386
11 changed files with 73 additions and 15 deletions
|
@ -1878,7 +1878,7 @@ static int do_execveat_common(int fd, struct filename *filename,
|
|||
* whether NPROC limit is still exceeded.
|
||||
*/
|
||||
if ((current->flags & PF_NPROC_EXCEEDED) &&
|
||||
atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
|
||||
is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
|
||||
retval = -EAGAIN;
|
||||
goto out_ret;
|
||||
}
|
||||
|
|
|
@ -372,6 +372,7 @@ static inline void put_cred(const struct cred *_cred)
|
|||
|
||||
#define task_uid(task) (task_cred_xxx((task), uid))
|
||||
#define task_euid(task) (task_cred_xxx((task), euid))
|
||||
#define task_ucounts(task) (task_cred_xxx((task), ucounts))
|
||||
|
||||
#define current_cred_xxx(xxx) \
|
||||
({ \
|
||||
|
@ -388,6 +389,7 @@ static inline void put_cred(const struct cred *_cred)
|
|||
#define current_fsgid() (current_cred_xxx(fsgid))
|
||||
#define current_cap() (current_cred_xxx(cap_effective))
|
||||
#define current_user() (current_cred_xxx(user))
|
||||
#define current_ucounts() (current_cred_xxx(ucounts))
|
||||
|
||||
extern struct user_namespace init_user_ns;
|
||||
#ifdef CONFIG_USER_NS
|
||||
|
|
|
@ -12,7 +12,6 @@
|
|||
*/
|
||||
struct user_struct {
|
||||
refcount_t __count; /* reference count */
|
||||
atomic_t processes; /* How many processes does this user have? */
|
||||
atomic_t sigpending; /* How many pending signals does this user have? */
|
||||
#ifdef CONFIG_FANOTIFY
|
||||
atomic_t fanotify_listeners;
|
||||
|
|
|
@ -50,9 +50,12 @@ enum ucount_type {
|
|||
UCOUNT_INOTIFY_INSTANCES,
|
||||
UCOUNT_INOTIFY_WATCHES,
|
||||
#endif
|
||||
UCOUNT_RLIMIT_NPROC,
|
||||
UCOUNT_COUNTS,
|
||||
};
|
||||
|
||||
#define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC
|
||||
|
||||
struct user_namespace {
|
||||
struct uid_gid_map uid_map;
|
||||
struct uid_gid_map gid_map;
|
||||
|
@ -110,6 +113,15 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
|
|||
struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
|
||||
void put_ucounts(struct ucounts *ucounts);
|
||||
|
||||
static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type)
|
||||
{
|
||||
return atomic_long_read(&ucounts->ucount[type]);
|
||||
}
|
||||
|
||||
long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
|
||||
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
|
||||
bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max);
|
||||
|
||||
#ifdef CONFIG_USER_NS
|
||||
|
||||
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
|
||||
|
|
|
@ -360,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
|
|||
kdebug("share_creds(%p{%d,%d})",
|
||||
p->cred, atomic_read(&p->cred->usage),
|
||||
read_cred_subscribers(p->cred));
|
||||
atomic_inc(&p->cred->user->processes);
|
||||
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -395,8 +395,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
|
|||
}
|
||||
#endif
|
||||
|
||||
atomic_inc(&new->user->processes);
|
||||
p->cred = p->real_cred = get_cred(new);
|
||||
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
|
||||
alter_cred_subscribers(new, 2);
|
||||
validate_creds(new);
|
||||
return 0;
|
||||
|
@ -496,12 +496,12 @@ int commit_creds(struct cred *new)
|
|||
* in set_user().
|
||||
*/
|
||||
alter_cred_subscribers(new, 2);
|
||||
if (new->user != old->user)
|
||||
atomic_inc(&new->user->processes);
|
||||
if (new->user != old->user || new->user_ns != old->user_ns)
|
||||
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
|
||||
rcu_assign_pointer(task->real_cred, new);
|
||||
rcu_assign_pointer(task->cred, new);
|
||||
if (new->user != old->user)
|
||||
atomic_dec(&old->user->processes);
|
||||
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
|
||||
alter_cred_subscribers(old, -2);
|
||||
|
||||
/* send notifications */
|
||||
|
|
|
@ -188,7 +188,7 @@ repeat:
|
|||
/* don't need to get the RCU readlock here - the process is dead and
|
||||
* can't be modifying its own credentials. But shut RCU-lockdep up */
|
||||
rcu_read_lock();
|
||||
atomic_dec(&__task_cred(p)->user->processes);
|
||||
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
|
||||
rcu_read_unlock();
|
||||
|
||||
cgroup_release(p);
|
||||
|
|
|
@ -819,9 +819,11 @@ void __init fork_init(void)
|
|||
init_task.signal->rlim[RLIMIT_SIGPENDING] =
|
||||
init_task.signal->rlim[RLIMIT_NPROC];
|
||||
|
||||
for (i = 0; i < UCOUNT_COUNTS; i++)
|
||||
for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
|
||||
init_user_ns.ucount_max[i] = max_threads/2;
|
||||
|
||||
init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
|
||||
|
||||
#ifdef CONFIG_VMAP_STACK
|
||||
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
|
||||
NULL, free_vm_stack_cache);
|
||||
|
@ -1978,8 +1980,7 @@ static __latent_entropy struct task_struct *copy_process(
|
|||
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
|
||||
#endif
|
||||
retval = -EAGAIN;
|
||||
if (atomic_read(&p->real_cred->user->processes) >=
|
||||
task_rlimit(p, RLIMIT_NPROC)) {
|
||||
if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
|
||||
if (p->real_cred->user != INIT_USER &&
|
||||
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
|
||||
goto bad_fork_free;
|
||||
|
@ -2382,7 +2383,7 @@ bad_fork_cleanup_threadgroup_lock:
|
|||
#endif
|
||||
delayacct_tsk_free(p);
|
||||
bad_fork_cleanup_count:
|
||||
atomic_dec(&p->cred->user->processes);
|
||||
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
|
||||
exit_creds(p);
|
||||
bad_fork_free:
|
||||
p->state = TASK_DEAD;
|
||||
|
|
|
@ -473,7 +473,7 @@ static int set_user(struct cred *new)
|
|||
* for programs doing set*uid()+execve() by harmlessly deferring the
|
||||
* failure to the execve() stage.
|
||||
*/
|
||||
if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
|
||||
if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
|
||||
new_user != INIT_USER)
|
||||
current->flags |= PF_NPROC_EXCEEDED;
|
||||
else
|
||||
|
|
|
@ -80,6 +80,7 @@ static struct ctl_table user_table[] = {
|
|||
UCOUNT_ENTRY("max_inotify_instances"),
|
||||
UCOUNT_ENTRY("max_inotify_watches"),
|
||||
#endif
|
||||
{ },
|
||||
{ }
|
||||
};
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
|
@ -240,6 +241,48 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
|
|||
put_ucounts(ucounts);
|
||||
}
|
||||
|
||||
long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
|
||||
{
|
||||
struct ucounts *iter;
|
||||
long ret = 0;
|
||||
|
||||
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
||||
long max = READ_ONCE(iter->ns->ucount_max[type]);
|
||||
long new = atomic_long_add_return(v, &iter->ucount[type]);
|
||||
if (new < 0 || new > max)
|
||||
ret = LONG_MAX;
|
||||
else if (iter == ucounts)
|
||||
ret = new;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
|
||||
{
|
||||
struct ucounts *iter;
|
||||
long new;
|
||||
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
||||
long dec = atomic_long_add_return(-v, &iter->ucount[type]);
|
||||
WARN_ON_ONCE(dec < 0);
|
||||
if (iter == ucounts)
|
||||
new = dec;
|
||||
}
|
||||
return (new == 0);
|
||||
}
|
||||
|
||||
bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
|
||||
{
|
||||
struct ucounts *iter;
|
||||
if (get_ucounts_value(ucounts, type) > max)
|
||||
return true;
|
||||
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
||||
max = READ_ONCE(iter->ns->ucount_max[type]);
|
||||
if (get_ucounts_value(iter, type) > max)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static __init int user_namespace_sysctl_init(void)
|
||||
{
|
||||
#ifdef CONFIG_SYSCTL
|
||||
|
@ -256,6 +299,7 @@ static __init int user_namespace_sysctl_init(void)
|
|||
BUG_ON(!setup_userns_sysctls(&init_user_ns));
|
||||
#endif
|
||||
hlist_add_ucounts(&init_ucounts);
|
||||
inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(user_namespace_sysctl_init);
|
||||
|
|
|
@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
|
|||
/* root_user.__count is 1, for init task cred */
|
||||
struct user_struct root_user = {
|
||||
.__count = REFCOUNT_INIT(1),
|
||||
.processes = ATOMIC_INIT(1),
|
||||
.sigpending = ATOMIC_INIT(0),
|
||||
.locked_shm = 0,
|
||||
.uid = GLOBAL_ROOT_UID,
|
||||
|
|
|
@ -119,9 +119,10 @@ int create_user_ns(struct cred *new)
|
|||
ns->owner = owner;
|
||||
ns->group = group;
|
||||
INIT_WORK(&ns->work, free_user_ns);
|
||||
for (i = 0; i < UCOUNT_COUNTS; i++) {
|
||||
for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
|
||||
ns->ucount_max[i] = INT_MAX;
|
||||
}
|
||||
ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
|
||||
ns->ucounts = ucounts;
|
||||
|
||||
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
|
||||
|
|
Loading…
Add table
Reference in a new issue