/* * Copyright (c) 2018-2021, Andreas Kling * Copyright (c) 2022, the SerenityOS developers. * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace Kernel { extern Memory::Region* g_signal_trampoline_region; struct LoadResult { FlatPtr load_base { 0 }; FlatPtr entry_eip { 0 }; size_t size { 0 }; LockWeakPtr tls_region; size_t tls_size { 0 }; size_t tls_alignment { 0 }; LockWeakPtr stack_region; }; static constexpr size_t auxiliary_vector_size = 15; static Array generate_auxiliary_vector(FlatPtr load_base, FlatPtr entry_eip, UserID uid, UserID euid, GroupID gid, GroupID egid, StringView executable_path, Optional const& main_program_fd_allocation); static bool validate_stack_size(Vector> const& arguments, Vector>& environment, Array const& auxiliary) { size_t total_arguments_size = 0; size_t total_environment_size = 0; size_t total_auxiliary_size = 0; for (auto const& a : arguments) total_arguments_size += a->length() + 1; for (auto const& e : environment) total_environment_size += e->length() + 1; for (auto const& v : auxiliary) { if (!v.optional_string.is_empty()) total_auxiliary_size += round_up_to_power_of_two(v.optional_string.length() + 1, sizeof(FlatPtr)); if (v.auxv.a_type == ELF::AuxiliaryValue::Random) total_auxiliary_size += round_up_to_power_of_two(16, sizeof(FlatPtr)); } total_arguments_size += sizeof(char*) * (arguments.size() + 1); total_environment_size += sizeof(char*) * (environment.size() + 1); total_auxiliary_size += sizeof(auxv_t) * auxiliary.size(); if (total_arguments_size > Process::max_arguments_size) return false; if (total_environment_size > Process::max_environment_size) return false; if (total_auxiliary_size > Process::max_auxiliary_size) return false; return true; } static ErrorOr make_userspace_context_for_main_thread([[maybe_unused]] ThreadRegisters& regs, Memory::Region& region, Vector> const& arguments, Vector> const& environment, Array auxiliary_values) { FlatPtr new_sp = region.range().end().get(); // Add some bits of randomness to the user stack pointer. new_sp -= round_up_to_power_of_two(get_fast_random() % 4096, 16); auto push_on_new_stack = [&new_sp](FlatPtr value) { new_sp -= sizeof(FlatPtr); Userspace stack_ptr = new_sp; auto result = copy_to_user(stack_ptr, &value); VERIFY(!result.is_error()); }; auto push_aux_value_on_new_stack = [&new_sp](auxv_t value) { new_sp -= sizeof(auxv_t); Userspace stack_ptr = new_sp; auto result = copy_to_user(stack_ptr, &value); VERIFY(!result.is_error()); }; auto push_string_on_new_stack = [&new_sp](StringView string) { new_sp -= round_up_to_power_of_two(string.length() + 1, sizeof(FlatPtr)); Userspace stack_ptr = new_sp; auto result = copy_to_user(stack_ptr, string.characters_without_null_termination(), string.length() + 1); VERIFY(!result.is_error()); }; Vector argv_entries; for (auto const& argument : arguments) { push_string_on_new_stack(argument->view()); TRY(argv_entries.try_append(new_sp)); } Vector env_entries; for (auto const& variable : environment) { push_string_on_new_stack(variable->view()); TRY(env_entries.try_append(new_sp)); } for (auto& value : auxiliary_values) { if (!value.optional_string.is_empty()) { push_string_on_new_stack(value.optional_string); value.auxv.a_un.a_ptr = (void*)new_sp; } if (value.auxv.a_type == ELF::AuxiliaryValue::Random) { u8 random_bytes[16] {}; get_fast_random_bytes({ random_bytes, sizeof(random_bytes) }); push_string_on_new_stack({ random_bytes, sizeof(random_bytes) }); value.auxv.a_un.a_ptr = (void*)new_sp; } } for (ssize_t i = auxiliary_values.size() - 1; i >= 0; --i) { auto& value = auxiliary_values[i]; push_aux_value_on_new_stack(value.auxv); } push_on_new_stack(0); for (ssize_t i = env_entries.size() - 1; i >= 0; --i) push_on_new_stack(env_entries[i]); FlatPtr envp = new_sp; push_on_new_stack(0); for (ssize_t i = argv_entries.size() - 1; i >= 0; --i) push_on_new_stack(argv_entries[i]); FlatPtr argv = new_sp; // NOTE: The stack needs to be 16-byte aligned. new_sp -= new_sp % 16; #if ARCH(X86_64) regs.rdi = argv_entries.size(); regs.rsi = argv; regs.rdx = envp; #elif ARCH(AARCH64) regs.x[0] = argv_entries.size(); regs.x[1] = argv; regs.x[2] = envp; #else # error Unknown architecture #endif VERIFY(new_sp % 16 == 0); // FIXME: The way we're setting up the stack and passing arguments to the entry point isn't ABI-compliant return new_sp; } struct RequiredLoadRange { FlatPtr start { 0 }; FlatPtr end { 0 }; }; static ErrorOr get_required_load_range(OpenFileDescription& program_description) { auto& inode = *(program_description.inode()); auto vmobject = TRY(Memory::SharedInodeVMObject::try_create_with_inode(inode)); size_t executable_size = inode.size(); size_t rounded_executable_size = TRY(Memory::page_round_up(executable_size)); auto region = TRY(MM.allocate_kernel_region_with_vmobject(*vmobject, rounded_executable_size, "ELF memory range calculation"sv, Memory::Region::Access::Read)); auto elf_image = ELF::Image(region->vaddr().as_ptr(), executable_size); if (!elf_image.is_valid()) { return EINVAL; } RequiredLoadRange range {}; elf_image.for_each_program_header([&range](auto const& pheader) { if (pheader.type() != PT_LOAD) return; auto region_start = (FlatPtr)pheader.vaddr().as_ptr(); auto region_end = region_start + pheader.size_in_memory(); if (range.start == 0 || region_start < range.start) range.start = region_start; if (range.end == 0 || region_end > range.end) range.end = region_end; }); VERIFY(range.end > range.start); return range; } static ErrorOr get_load_offset(const ElfW(Ehdr) & main_program_header, OpenFileDescription& main_program_description, OpenFileDescription* interpreter_description) { constexpr FlatPtr load_range_start = 0x08000000; constexpr FlatPtr load_range_size = 65536 * PAGE_SIZE; // 2**16 * PAGE_SIZE = 256MB constexpr FlatPtr minimum_load_offset_randomization_size = 10 * MiB; auto random_load_offset_in_range([](auto start, auto size) { return Memory::page_round_down(start + get_good_random() % size); }); if (main_program_header.e_type == ET_DYN) { return random_load_offset_in_range(load_range_start, load_range_size); } if (main_program_header.e_type != ET_EXEC) return EINVAL; auto main_program_load_range = TRY(get_required_load_range(main_program_description)); RequiredLoadRange selected_range {}; if (interpreter_description) { auto interpreter_load_range = TRY(get_required_load_range(*interpreter_description)); auto interpreter_size_in_memory = interpreter_load_range.end - interpreter_load_range.start; auto interpreter_load_range_end = load_range_start + load_range_size - interpreter_size_in_memory; // No intersection if (main_program_load_range.end < load_range_start || main_program_load_range.start > interpreter_load_range_end) return random_load_offset_in_range(load_range_start, load_range_size); RequiredLoadRange first_available_part = { load_range_start, main_program_load_range.start }; RequiredLoadRange second_available_part = { main_program_load_range.end, interpreter_load_range_end }; // Select larger part if (first_available_part.end - first_available_part.start > second_available_part.end - second_available_part.start) selected_range = first_available_part; else selected_range = second_available_part; } else selected_range = main_program_load_range; // If main program is too big and leaves us without enough space for adequate loader randomization if (selected_range.end - selected_range.start < minimum_load_offset_randomization_size) return E2BIG; return random_load_offset_in_range(selected_range.start, selected_range.end - selected_range.start); } enum class ShouldAllocateTls { No, Yes, }; enum class ShouldAllowSyscalls { No, Yes, }; static ErrorOr load_elf_object(Memory::AddressSpace& new_space, OpenFileDescription& object_description, FlatPtr load_offset, ShouldAllocateTls should_allocate_tls, ShouldAllowSyscalls should_allow_syscalls, Optional minimum_stack_size = {}) { auto& inode = *(object_description.inode()); auto vmobject = TRY(Memory::SharedInodeVMObject::try_create_with_inode(inode)); if (vmobject->writable_mappings()) { dbgln("Refusing to execute a write-mapped program"); return ETXTBSY; } size_t executable_size = inode.size(); size_t rounded_executable_size = TRY(Memory::page_round_up(executable_size)); auto executable_region = TRY(MM.allocate_kernel_region_with_vmobject(*vmobject, rounded_executable_size, "ELF loading"sv, Memory::Region::Access::Read)); auto elf_image = ELF::Image(executable_region->vaddr().as_ptr(), executable_size); if (!elf_image.is_valid()) return ENOEXEC; Memory::Region* master_tls_region { nullptr }; size_t master_tls_size = 0; size_t master_tls_alignment = 0; FlatPtr load_base_address = 0; size_t stack_size = Thread::default_userspace_stack_size; if (minimum_stack_size.has_value() && minimum_stack_size.value() > stack_size) stack_size = minimum_stack_size.value(); auto elf_name = TRY(object_description.pseudo_path()); VERIFY(!Processor::in_critical()); Memory::MemoryManager::enter_address_space(new_space); auto load_tls_section = [&](auto& program_header) -> ErrorOr { VERIFY(should_allocate_tls == ShouldAllocateTls::Yes); VERIFY(program_header.size_in_memory()); if (!elf_image.is_within_image(program_header.raw_data(), program_header.size_in_image())) { dbgln("Shenanigans! ELF PT_TLS header sneaks outside of executable."); return ENOEXEC; } auto region_name = TRY(KString::formatted("{} (master-tls)", elf_name)); master_tls_region = TRY(new_space.allocate_region(Memory::RandomizeVirtualAddress::Yes, {}, program_header.size_in_memory(), PAGE_SIZE, region_name->view(), PROT_READ | PROT_WRITE, AllocationStrategy::Reserve)); master_tls_size = program_header.size_in_memory(); master_tls_alignment = program_header.alignment(); TRY(copy_to_user(master_tls_region->vaddr().as_ptr(), program_header.raw_data(), program_header.size_in_image())); return {}; }; auto load_writable_section = [&](auto& program_header) -> ErrorOr { // Writable section: create a copy in memory. VERIFY(program_header.alignment() % PAGE_SIZE == 0); if (!elf_image.is_within_image(program_header.raw_data(), program_header.size_in_image())) { dbgln("Shenanigans! Writable ELF PT_LOAD header sneaks outside of executable."); return ENOEXEC; } int prot = 0; if (program_header.is_readable()) prot |= PROT_READ; if (program_header.is_writable()) prot |= PROT_WRITE; auto region_name = TRY(KString::formatted("{} (data-{}{})", elf_name, program_header.is_readable() ? "r" : "", program_header.is_writable() ? "w" : "")); auto range_base = VirtualAddress { Memory::page_round_down(program_header.vaddr().offset(load_offset).get()) }; size_t rounded_range_end = TRY(Memory::page_round_up(program_header.vaddr().offset(load_offset).offset(program_header.size_in_memory()).get())); auto range_end = VirtualAddress { rounded_range_end }; auto region = TRY(new_space.allocate_region(Memory::RandomizeVirtualAddress::Yes, range_base, range_end.get() - range_base.get(), PAGE_SIZE, region_name->view(), prot, AllocationStrategy::Reserve)); // It's not always the case with PIE executables (and very well shouldn't be) that the // virtual address in the program header matches the one we end up giving the process. // In order to copy the data image correctly into memory, we need to copy the data starting at // the right initial page offset into the pages allocated for the elf_alloc-XX section. // FIXME: There's an opportunity to munmap, or at least mprotect, the padding space between // the .text and .data PT_LOAD sections of the executable. // Accessing it would definitely be a bug. auto page_offset = program_header.vaddr(); page_offset.mask(~PAGE_MASK); TRY(copy_to_user((u8*)region->vaddr().as_ptr() + page_offset.get(), program_header.raw_data(), program_header.size_in_image())); return {}; }; auto load_section = [&](auto& program_header) -> ErrorOr { if (program_header.size_in_memory() == 0) return {}; if (program_header.is_writable()) return load_writable_section(program_header); // Non-writable section: map the executable itself in memory. VERIFY(program_header.alignment() % PAGE_SIZE == 0); int prot = 0; if (program_header.is_readable()) prot |= PROT_READ; if (program_header.is_writable()) prot |= PROT_WRITE; if (program_header.is_executable()) prot |= PROT_EXEC; auto range_base = VirtualAddress { Memory::page_round_down(program_header.vaddr().offset(load_offset).get()) }; size_t rounded_range_end = TRY(Memory::page_round_up(program_header.vaddr().offset(load_offset).offset(program_header.size_in_memory()).get())); auto range_end = VirtualAddress { rounded_range_end }; auto region = TRY(new_space.allocate_region_with_vmobject(Memory::RandomizeVirtualAddress::Yes, range_base, range_end.get() - range_base.get(), program_header.alignment(), *vmobject, program_header.offset(), elf_name->view(), prot, true)); if (should_allow_syscalls == ShouldAllowSyscalls::Yes) region->set_syscall_region(true); if (program_header.offset() == 0) load_base_address = (FlatPtr)region->vaddr().as_ptr(); return {}; }; auto load_elf_program_header = [&](auto& program_header) -> ErrorOr { if (program_header.type() == PT_TLS) return load_tls_section(program_header); if (program_header.type() == PT_LOAD) return load_section(program_header); // NOTE: We ignore other program header types. return {}; }; TRY([&] { ErrorOr result; elf_image.for_each_program_header([&](ELF::Image::ProgramHeader const& program_header) { result = load_elf_program_header(program_header); return result.is_error() ? IterationDecision::Break : IterationDecision::Continue; }); return result; }()); if (!elf_image.entry().offset(load_offset).get()) { dbgln("do_exec: Failure loading program, entry pointer is invalid! {})", elf_image.entry().offset(load_offset)); return ENOEXEC; } auto* stack_region = TRY(new_space.allocate_region(Memory::RandomizeVirtualAddress::Yes, {}, stack_size, PAGE_SIZE, "Stack (Main thread)"sv, PROT_READ | PROT_WRITE, AllocationStrategy::Reserve)); stack_region->set_stack(true); return LoadResult { load_base_address, elf_image.entry().offset(load_offset).get(), executable_size, TRY(AK::try_make_weak_ptr_if_nonnull(master_tls_region)), master_tls_size, master_tls_alignment, TRY(stack_region->try_make_weak_ptr()) }; } ErrorOr Process::load(Memory::AddressSpace& new_space, NonnullRefPtr main_program_description, RefPtr interpreter_description, const ElfW(Ehdr) & main_program_header, Optional minimum_stack_size) { auto load_offset = TRY(get_load_offset(main_program_header, main_program_description, interpreter_description)); if (interpreter_description.is_null()) { auto load_result = TRY(load_elf_object(new_space, main_program_description, load_offset, ShouldAllocateTls::Yes, ShouldAllowSyscalls::No, minimum_stack_size)); m_master_tls_region = load_result.tls_region; m_master_tls_size = load_result.tls_size; m_master_tls_alignment = load_result.tls_alignment; return load_result; } auto interpreter_load_result = TRY(load_elf_object(new_space, *interpreter_description, load_offset, ShouldAllocateTls::No, ShouldAllowSyscalls::Yes, minimum_stack_size)); // TLS allocation will be done in userspace by the loader VERIFY(!interpreter_load_result.tls_region); VERIFY(!interpreter_load_result.tls_alignment); VERIFY(!interpreter_load_result.tls_size); return interpreter_load_result; } void Process::clear_signal_handlers_for_exec() { // Comments are as they are presented in the POSIX specification, but slightly out of order. for (size_t signal = 0; signal < m_signal_action_data.size(); signal++) { // Except for SIGCHLD, signals set to be ignored by the calling process image shall be set to be ignored by the new process image. // If the SIGCHLD signal is set to be ignored by the calling process image, it is unspecified whether the SIGCHLD signal is set // to be ignored or to the default action in the new process image. if (signal != SIGCHLD && m_signal_action_data[signal].handler_or_sigaction.get() == reinterpret_cast(SIG_IGN)) { m_signal_action_data[signal] = {}; m_signal_action_data[signal].handler_or_sigaction.set(reinterpret_cast(SIG_IGN)); continue; } // Signals set to the default action in the calling process image shall be set to the default action in the new process image. // Signals set to be caught by the calling process image shall be set to the default action in the new process image. m_signal_action_data[signal] = {}; } } ErrorOr Process::do_exec(NonnullRefPtr main_program_description, Vector> arguments, Vector> environment, RefPtr interpreter_description, Thread*& new_main_thread, InterruptsState& previous_interrupts_state, const ElfW(Ehdr) & main_program_header, Optional minimum_stack_size) { VERIFY(is_user_process()); VERIFY(!Processor::in_critical()); auto main_program_metadata = main_program_description->metadata(); // NOTE: Don't allow running SUID binaries at all if we are in a jail. TRY(Process::current().jail().with([&](auto const& my_jail) -> ErrorOr { if (my_jail && (main_program_metadata.is_setuid() || main_program_metadata.is_setgid())) { return Error::from_errno(EPERM); } return {}; })); // Although we *could* handle a pseudo_path here, trying to execute something that doesn't have // a custody (e.g. BlockDevice or RandomDevice) is pretty suspicious anyway. auto path = TRY(main_program_description->original_absolute_path()); dbgln_if(EXEC_DEBUG, "do_exec: {}", path); auto last_part = path->view().find_last_split_view('/'); auto new_process_name = TRY(KString::try_create(last_part)); auto new_main_thread_name = TRY(new_process_name->try_clone()); auto allocated_space = TRY(Memory::AddressSpace::try_create(*this, nullptr)); OwnPtr old_space; auto old_master_tls_region = m_master_tls_region; auto old_master_tls_size = m_master_tls_size; auto old_master_tls_alignment = m_master_tls_alignment; auto& new_space = m_space.with([&](auto& space) -> Memory::AddressSpace& { old_space = move(space); space = move(allocated_space); return *space; }); m_master_tls_region = nullptr; m_master_tls_size = 0; m_master_tls_alignment = 0; ArmedScopeGuard space_guard([&]() { // If we failed at any point from now on we have to revert back to the old address space m_space.with([&](auto& space) { space = old_space.release_nonnull(); }); m_master_tls_region = old_master_tls_region; m_master_tls_size = old_master_tls_size; m_master_tls_alignment = old_master_tls_alignment; Memory::MemoryManager::enter_process_address_space(*this); }); auto load_result = TRY(load(new_space, main_program_description, interpreter_description, main_program_header, minimum_stack_size)); // NOTE: We don't need the interpreter executable description after this point. // We destroy it here to prevent it from getting destroyed when we return from this function. // That's important because when we're returning from this function, we're in a very delicate // state where we can't block (e.g by trying to acquire a mutex in description teardown.) bool has_interpreter = interpreter_description; interpreter_description = nullptr; auto* signal_trampoline_region = TRY(new_space.allocate_region_with_vmobject(Memory::RandomizeVirtualAddress::Yes, {}, PAGE_SIZE, PAGE_SIZE, g_signal_trampoline_region->vmobject(), 0, "Signal trampoline"sv, PROT_READ | PROT_EXEC, true)); signal_trampoline_region->set_syscall_region(true); // (For dynamically linked executable) Allocate an FD for passing the main executable to the dynamic loader. Optional main_program_fd_allocation; if (has_interpreter) main_program_fd_allocation = TRY(allocate_fd()); auto old_credentials = this->credentials(); auto new_credentials = old_credentials; auto old_process_attached_jail = m_attached_jail.with([&](auto& jail) -> RefPtr { return jail; }); auto old_scoped_list = m_jail_process_list.with([&](auto& list) -> RefPtr { return list; }); bool executable_is_setid = false; if (!(main_program_description->custody()->mount_flags() & MS_NOSUID)) { auto new_euid = old_credentials->euid(); auto new_egid = old_credentials->egid(); auto new_suid = old_credentials->suid(); auto new_sgid = old_credentials->sgid(); if (main_program_metadata.is_setuid()) { executable_is_setid = true; new_euid = main_program_metadata.uid; new_suid = main_program_metadata.uid; } if (main_program_metadata.is_setgid()) { executable_is_setid = true; new_egid = main_program_metadata.gid; new_sgid = main_program_metadata.gid; } if (executable_is_setid) { new_credentials = TRY(Credentials::create( old_credentials->uid(), old_credentials->gid(), new_euid, new_egid, new_suid, new_sgid, old_credentials->extra_gids(), old_credentials->sid(), old_credentials->pgid())); } } // We commit to the new executable at this point. There is no turning back! space_guard.disarm(); // Prevent other processes from attaching to us with ptrace while we're doing this. MutexLocker ptrace_locker(ptrace_lock()); // Disable profiling temporarily in case it's running on this process. auto was_profiling = m_profiling; TemporaryChange profiling_disabler(m_profiling, false); kill_threads_except_self(); with_mutable_protected_data([&](auto& protected_data) { protected_data.credentials = move(new_credentials); protected_data.dumpable = !executable_is_setid; protected_data.executable_is_setid = executable_is_setid; }); m_executable.with([&](auto& executable) { executable = main_program_description->custody(); }); m_arguments = move(arguments); m_attached_jail.with([&](auto& jail) { jail = old_process_attached_jail; }); m_jail_process_list.with([&](auto& list) { list = old_scoped_list; }); m_environment = move(environment); TRY(m_unveil_data.with([&](auto& unveil_data) -> ErrorOr { TRY(m_exec_unveil_data.with([&](auto& exec_unveil_data) -> ErrorOr { // Note: If we have exec unveil data being waiting to be dispatched // to the current execve'd program, then we apply the unveil data and // ensure it is locked in the new program. if (exec_unveil_data.state == VeilState::Dropped) { unveil_data.state = VeilState::LockedInherited; exec_unveil_data.state = VeilState::None; unveil_data.paths = TRY(exec_unveil_data.paths.deep_copy()); } else { unveil_data.state = VeilState::None; exec_unveil_data.state = VeilState::None; unveil_data.paths.clear(); unveil_data.paths.set_metadata({ TRY(KString::try_create("/"sv)), UnveilAccess::None, false }); } exec_unveil_data.paths.clear(); exec_unveil_data.paths.set_metadata({ TRY(KString::try_create("/"sv)), UnveilAccess::None, false }); return {}; })); return {}; })); m_coredump_properties.for_each([](auto& property) { property = {}; }); auto* current_thread = Thread::current(); current_thread->reset_signals_for_exec(); clear_signal_handlers_for_exec(); clear_futex_queues_on_exec(); m_fds.with_exclusive([&](auto& fds) { fds.change_each([&](auto& file_description_metadata) { if (file_description_metadata.is_valid() && file_description_metadata.flags() & FD_CLOEXEC) file_description_metadata = {}; }); }); if (main_program_fd_allocation.has_value()) { main_program_description->set_readable(true); m_fds.with_exclusive([&](auto& fds) { fds[main_program_fd_allocation->fd].set(move(main_program_description), FD_CLOEXEC); }); } new_main_thread = nullptr; if (¤t_thread->process() == this) { new_main_thread = current_thread; } else { for_each_thread([&](auto& thread) { new_main_thread = &thread; return IterationDecision::Break; }); } VERIFY(new_main_thread); auto credentials = this->credentials(); auto auxv = generate_auxiliary_vector(load_result.load_base, load_result.entry_eip, credentials->uid(), credentials->euid(), credentials->gid(), credentials->egid(), path->view(), main_program_fd_allocation); // FIXME: How much stack space does process startup need? if (!validate_stack_size(m_arguments, m_environment, auxv)) return E2BIG; // NOTE: We create the new stack before disabling interrupts since it will zero-fault // and we don't want to deal with faults after this point. auto new_userspace_sp = TRY(make_userspace_context_for_main_thread(new_main_thread->regs(), *load_result.stack_region.unsafe_ptr(), m_arguments, m_environment, move(auxv))); set_name(move(new_process_name)); new_main_thread->set_name(move(new_main_thread_name)); if (wait_for_tracer_at_next_execve()) { // Make sure we release the ptrace lock here or the tracer will block forever. ptrace_locker.unlock(); Thread::current()->send_urgent_signal_to_self(SIGSTOP); } else { // Unlock regardless before disabling interrupts. // Ensure we always unlock after checking ptrace status to avoid TOCTOU ptrace issues ptrace_locker.unlock(); } // We enter a critical section here because we don't want to get interrupted between do_exec() // and Processor::assume_context() or the next context switch. // If we used an InterruptDisabler that calls enable_interrupts() on exit, we might timer tick'd too soon in exec(). Processor::enter_critical(); previous_interrupts_state = processor_interrupts_state(); Processor::disable_interrupts(); // NOTE: Be careful to not trigger any page faults below! with_mutable_protected_data([&](auto& protected_data) { protected_data.promises = protected_data.execpromises; protected_data.has_promises = protected_data.has_execpromises; protected_data.execpromises = 0; protected_data.has_execpromises = false; protected_data.signal_trampoline = signal_trampoline_region->vaddr(); // FIXME: PID/TID ISSUE protected_data.pid = new_main_thread->tid().value(); }); auto tsr_result = new_main_thread->make_thread_specific_region({}); if (tsr_result.is_error()) { // FIXME: We cannot fail this late. Refactor this so the allocation happens before we commit to the new executable. VERIFY_NOT_REACHED(); } new_main_thread->reset_fpu_state(); auto& regs = new_main_thread->m_regs; address_space().with([&](auto& space) { regs.set_exec_state(load_result.entry_eip, new_userspace_sp, *space); }); { TemporaryChange profiling_disabler(m_profiling, was_profiling); PerformanceManager::add_process_exec_event(*this); } u32 lock_count_to_restore; [[maybe_unused]] auto rc = big_lock().force_unlock_exclusive_if_locked(lock_count_to_restore); VERIFY_INTERRUPTS_DISABLED(); VERIFY(Processor::in_critical()); return {}; } static Array generate_auxiliary_vector(FlatPtr load_base, FlatPtr entry_eip, UserID uid, UserID euid, GroupID gid, GroupID egid, StringView executable_path, Optional const& main_program_fd_allocation) { return { { // PHDR/EXECFD // PH* { ELF::AuxiliaryValue::PageSize, PAGE_SIZE }, { ELF::AuxiliaryValue::BaseAddress, (void*)load_base }, { ELF::AuxiliaryValue::Entry, (void*)entry_eip }, // NOTELF { ELF::AuxiliaryValue::Uid, (long)uid.value() }, { ELF::AuxiliaryValue::EUid, (long)euid.value() }, { ELF::AuxiliaryValue::Gid, (long)gid.value() }, { ELF::AuxiliaryValue::EGid, (long)egid.value() }, { ELF::AuxiliaryValue::Platform, Processor::platform_string() }, // FIXME: This is platform specific #if ARCH(X86_64) { ELF::AuxiliaryValue::HwCap, (long)CPUID(1).edx() }, #elif ARCH(AARCH64) { ELF::AuxiliaryValue::HwCap, (long)0 }, #else # error "Unknown architecture" #endif { ELF::AuxiliaryValue::ClockTick, (long)TimeManagement::the().ticks_per_second() }, // FIXME: Also take into account things like extended filesystem permissions? That's what linux does... { ELF::AuxiliaryValue::Secure, ((uid != euid) || (gid != egid)) ? 1 : 0 }, { ELF::AuxiliaryValue::Random, nullptr }, { ELF::AuxiliaryValue::ExecFilename, executable_path }, main_program_fd_allocation.has_value() ? ELF::AuxiliaryValue { ELF::AuxiliaryValue::ExecFileDescriptor, main_program_fd_allocation->fd } : ELF::AuxiliaryValue { ELF::AuxiliaryValue::Ignore, 0L }, { ELF::AuxiliaryValue::Null, 0L }, } }; } static ErrorOr>> find_shebang_interpreter_for_executable(char const first_page[], size_t nread) { int word_start = 2; size_t word_length = 0; if (nread > 2 && first_page[0] == '#' && first_page[1] == '!') { Vector> interpreter_words; for (size_t i = 2; i < nread; ++i) { if (first_page[i] == '\n') { break; } if (first_page[i] != ' ') { ++word_length; } if (first_page[i] == ' ') { if (word_length > 0) { auto word = TRY(KString::try_create(StringView { &first_page[word_start], word_length })); TRY(interpreter_words.try_append(move(word))); } word_length = 0; word_start = i + 1; } } if (word_length > 0) { auto word = TRY(KString::try_create(StringView { &first_page[word_start], word_length })); TRY(interpreter_words.try_append(move(word))); } if (!interpreter_words.is_empty()) return interpreter_words; } return ENOEXEC; } ErrorOr> Process::find_elf_interpreter_for_executable(StringView path, ElfW(Ehdr) const& main_executable_header, size_t main_executable_header_size, size_t file_size, Optional& minimum_stack_size) { // Not using ErrorOr here because we'll want to do the same thing in userspace in the RTLD StringBuilder interpreter_path_builder; Optional main_executable_requested_stack_size {}; if (!TRY(ELF::validate_program_headers(main_executable_header, file_size, { &main_executable_header, main_executable_header_size }, &interpreter_path_builder, &main_executable_requested_stack_size))) { dbgln("exec({}): File has invalid ELF Program headers", path); return ENOEXEC; } auto interpreter_path = interpreter_path_builder.string_view(); if (main_executable_requested_stack_size.has_value() && (!minimum_stack_size.has_value() || *minimum_stack_size < *main_executable_requested_stack_size)) minimum_stack_size = main_executable_requested_stack_size; if (!interpreter_path.is_empty()) { dbgln_if(EXEC_DEBUG, "exec({}): Using program interpreter {}", path, interpreter_path); auto interpreter_description = TRY(VirtualFileSystem::the().open(credentials(), interpreter_path, O_EXEC, 0, current_directory())); auto interp_metadata = interpreter_description->metadata(); VERIFY(interpreter_description->inode()); // Validate the program interpreter as a valid elf binary. // If your program interpreter is a #! file or something, it's time to stop playing games :) if (interp_metadata.size < (int)sizeof(ElfW(Ehdr))) return ENOEXEC; char first_page[PAGE_SIZE] = {}; auto first_page_buffer = UserOrKernelBuffer::for_kernel_buffer((u8*)&first_page); auto nread = TRY(interpreter_description->read(first_page_buffer, sizeof(first_page))); if (nread < sizeof(ElfW(Ehdr))) return ENOEXEC; auto* elf_header = (ElfW(Ehdr)*)first_page; if (!ELF::validate_elf_header(*elf_header, interp_metadata.size)) { dbgln("exec({}): Interpreter ({}) has invalid ELF header", path, interpreter_path); return ENOEXEC; } // Not using ErrorOr here because we'll want to do the same thing in userspace in the RTLD StringBuilder interpreter_interpreter_path_builder; Optional interpreter_requested_stack_size {}; if (!TRY(ELF::validate_program_headers(*elf_header, interp_metadata.size, { first_page, nread }, &interpreter_interpreter_path_builder, &interpreter_requested_stack_size))) { dbgln("exec({}): Interpreter ({}) has invalid ELF Program headers", path, interpreter_path); return ENOEXEC; } auto interpreter_interpreter_path = interpreter_interpreter_path_builder.string_view(); if (interpreter_requested_stack_size.has_value() && (!minimum_stack_size.has_value() || *minimum_stack_size < *interpreter_requested_stack_size)) minimum_stack_size = interpreter_requested_stack_size; if (!interpreter_interpreter_path.is_empty()) { dbgln("exec({}): Interpreter ({}) has its own interpreter ({})! No thank you!", path, interpreter_path, interpreter_interpreter_path); return ELOOP; } return interpreter_description; } if (main_executable_header.e_type == ET_REL) { // We can't exec an ET_REL, that's just an object file from the compiler return ENOEXEC; } if (main_executable_header.e_type == ET_DYN) { // If it's ET_DYN with no PT_INTERP, then it's a dynamic executable responsible // for its own relocation (i.e. it's /usr/lib/Loader.so) if (path != "/usr/lib/Loader.so") dbgln("exec({}): WARNING - Dynamic ELF executable without a PT_INTERP header, and isn't /usr/lib/Loader.so", path); return nullptr; } // No interpreter, but, path refers to a valid elf image return nullptr; } ErrorOr Process::exec(NonnullOwnPtr path, Vector> arguments, Vector> environment, Thread*& new_main_thread, InterruptsState& previous_interrupts_state, int recursion_depth) { if (recursion_depth > 2) { dbgln("exec({}): SHENANIGANS! recursed too far trying to find #! interpreter", path); return ELOOP; } // Open the file to check what kind of binary format it is // Currently supported formats: // - #! interpreted file // - ELF32 // * ET_EXEC binary that just gets loaded // * ET_DYN binary that requires a program interpreter // auto description = TRY(VirtualFileSystem::the().open(credentials(), path->view(), O_EXEC, 0, current_directory())); auto metadata = description->metadata(); if (!metadata.is_regular_file()) return EACCES; // Always gonna need at least 3 bytes. these are for #!X if (metadata.size < 3) return ENOEXEC; VERIFY(description->inode()); // Read the first page of the program into memory so we can validate the binfmt of it char first_page[PAGE_SIZE]; auto first_page_buffer = UserOrKernelBuffer::for_kernel_buffer((u8*)&first_page); auto nread = TRY(description->read(first_page_buffer, sizeof(first_page))); // 1) #! interpreted file auto shebang_result = find_shebang_interpreter_for_executable(first_page, nread); if (!shebang_result.is_error()) { auto shebang_words = shebang_result.release_value(); auto shebang_path = TRY(shebang_words.first()->try_clone()); arguments[0] = move(path); TRY(arguments.try_prepend(move(shebang_words))); return exec(move(shebang_path), move(arguments), move(environment), new_main_thread, previous_interrupts_state, ++recursion_depth); } // #2) ELF32 for i386 if (nread < sizeof(ElfW(Ehdr))) return ENOEXEC; auto const* main_program_header = (ElfW(Ehdr)*)first_page; if (!ELF::validate_elf_header(*main_program_header, metadata.size)) { dbgln("exec({}): File has invalid ELF header", path); return ENOEXEC; } Optional minimum_stack_size {}; auto interpreter_description = TRY(find_elf_interpreter_for_executable(path->view(), *main_program_header, nread, metadata.size, minimum_stack_size)); return do_exec(move(description), move(arguments), move(environment), move(interpreter_description), new_main_thread, previous_interrupts_state, *main_program_header, minimum_stack_size); } ErrorOr Process::sys$execve(Userspace user_params) { VERIFY_PROCESS_BIG_LOCK_ACQUIRED(this); TRY(require_promise(Pledge::exec)); Thread* new_main_thread = nullptr; InterruptsState previous_interrupts_state = InterruptsState::Enabled; // NOTE: Be extremely careful with allocating any kernel memory in this function. // On success, the kernel stack will be lost. // The explicit block scope below is specifically placed to minimize the number // of stack locals in this function. { auto params = TRY(copy_typed_from_user(user_params)); if (params.arguments.length > ARG_MAX || params.environment.length > ARG_MAX) return E2BIG; // NOTE: The caller is expected to always pass at least one argument by convention, // the program path that was passed as params.path. if (params.arguments.length == 0) return EINVAL; auto path = TRY(get_syscall_path_argument(params.path)); auto copy_user_strings = [](auto const& list, auto& output) -> ErrorOr { if (!list.length) return {}; Checked size = sizeof(*list.strings); size *= list.length; if (size.has_overflow()) return EOVERFLOW; Vector strings; TRY(strings.try_resize(list.length)); TRY(copy_from_user(strings.data(), list.strings, size.value())); for (size_t i = 0; i < list.length; ++i) { auto string = TRY(try_copy_kstring_from_user(strings[i])); TRY(output.try_append(move(string))); } return {}; }; Vector> arguments; TRY(copy_user_strings(params.arguments, arguments)); Vector> environment; TRY(copy_user_strings(params.environment, environment)); TRY(exec(move(path), move(arguments), move(environment), new_main_thread, previous_interrupts_state)); } // NOTE: If we're here, the exec has succeeded and we've got a new executable image! // We will not return normally from this function. Instead, the next time we // get scheduled, it'll be at the entry point of the new executable. VERIFY_INTERRUPTS_DISABLED(); VERIFY(Processor::in_critical()); auto* current_thread = Thread::current(); if (current_thread == new_main_thread) { // We need to enter the scheduler lock before changing the state // and it will be released after the context switch into that // thread. We should also still be in our critical section VERIFY(!g_scheduler_lock.is_locked_by_current_processor()); VERIFY(Processor::in_critical() == 1); g_scheduler_lock.lock(); current_thread->set_state(Thread::State::Running); Processor::assume_context(*current_thread, previous_interrupts_state); VERIFY_NOT_REACHED(); } // NOTE: This code path is taken in the non-syscall case, i.e when the kernel spawns // a userspace process directly (such as /bin/SystemServer on startup) restore_processor_interrupts_state(previous_interrupts_state); Processor::leave_critical(); return 0; } }