Kernel/Ext2FS: Add full support for large inodes

128-byte inodes are more-or-less deprecated since they store timestamps
as unsigned 32-bit integer offsets from the UNIX epoch. Large inodes, on
the other hand, contain records that may specify a different epoch, and
are thus less susceptible to overflows. These very same records also
have the capability to store timestamps with nanosecond precision, which
this commit adds full support for as well.
This commit is contained in:
implicitfield 2024-11-23 00:43:31 +02:00 committed by Nico Weber
parent 2a205768b6
commit 0e368bb71a
5 changed files with 111 additions and 25 deletions

View file

@ -414,6 +414,10 @@ struct ext2_inode_large {
#define i_size_high i_dir_acl
#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS)
#if defined(__KERNEL__) || defined(__linux__)
# define i_reserved1 osd1.linux1.l_i_reserved1
# define i_frag osd2.linux2.l_i_frag

View file

@ -210,18 +210,28 @@ u64 Ext2FS::inode_size() const
{
return EXT2_INODE_SIZE(&super_block());
}
// Returns the blocks-per-group value that EXT2_BLOCKS_PER_GROUP derives from the superblock.
u64 Ext2FS::blocks_per_group() const
{
    auto const& sb = super_block();
    return EXT2_BLOCKS_PER_GROUP(&sb);
}
// Writes the raw inode `e2inode` into the on-disk slot belonging to `inode`.
// Returns EINVAL when the block containing the inode cannot be located; failures from
// resizing the staging buffer or from write_block() are propagated to the caller.
// NOTE(review): this listing is a rendered diff, so the first signature line and the first
// `buffer` line look like the pre-change versions of the lines that follow them — confirm against the tree.
ErrorOr<void> Ext2FS::write_ext2_inode(InodeIndex inode, ext2_inode const& e2inode)
ErrorOr<void> Ext2FS::write_ext2_inode(InodeIndex inode, ext2_inode_large const& e2inode)
{
BlockIndex block_index;
unsigned offset;
if (!find_block_containing_inode(inode, block_index, offset))
return EINVAL;
auto buffer = UserOrKernelBuffer::for_kernel_buffer(const_cast<u8*>((u8 const*)&e2inode));
// Stage the inode in a scratch buffer of inode_size() bytes, which is also the amount written below.
Vector<u8> inode_storage;
TRY(inode_storage.try_resize(inode_size()));
// With large inodes, the used portion is the classic 128 bytes plus i_extra_isize;
// otherwise the whole (classic-sized) inode is used.
size_t used_inode_size = inode_size() > EXT2_GOOD_OLD_INODE_SIZE ? EXT2_GOOD_OLD_INODE_SIZE + e2inode.i_extra_isize : inode_size();
VERIFY(used_inode_size >= EXT2_GOOD_OLD_INODE_SIZE && used_inode_size <= inode_size());
// Never copy more than the in-memory structure holds, even if the used size is larger.
memcpy(inode_storage.data(), &e2inode, min(used_inode_size, sizeof(ext2_inode_large)));
auto buffer = UserOrKernelBuffer::for_kernel_buffer(inode_storage.data());
return write_block(block_index, buffer, inode_size(), offset);
}
@ -493,20 +503,31 @@ ErrorOr<NonnullRefPtr<Inode>> Ext2FS::create_inode(Ext2FSInode& parent_inode, St
if (parent_inode.m_raw_inode.i_links_count == 0)
return ENOENT;
ext2_inode e2inode {};
auto now = kgettimeofday().truncated_seconds_since_epoch();
ext2_inode_large e2inode {};
auto now = kgettimeofday().to_timespec();
u32 extra = Ext2FSInode::encode_time_to_extra(now.tv_sec, now.tv_nsec);
e2inode.i_mode = mode;
e2inode.i_uid = static_cast<u16>(uid.value());
ext2fs_set_i_uid_high(e2inode, uid.value() >> 16);
e2inode.i_gid = static_cast<u16>(gid.value());
ext2fs_set_i_gid_high(e2inode, gid.value() >> 16);
e2inode.i_size = 0;
e2inode.i_atime = now;
e2inode.i_ctime = now;
e2inode.i_mtime = now;
e2inode.i_atime = now.tv_sec;
e2inode.i_ctime = now.tv_sec;
e2inode.i_mtime = now.tv_sec;
e2inode.i_crtime = now.tv_sec;
e2inode.i_atime_extra = extra;
e2inode.i_ctime_extra = extra;
e2inode.i_mtime_extra = extra;
e2inode.i_crtime_extra = extra;
e2inode.i_dtime = 0;
e2inode.i_flags = 0;
if (inode_size() > EXT2_GOOD_OLD_INODE_SIZE)
e2inode.i_extra_isize = min(inode_size(), sizeof(ext2_inode_large)) - EXT2_GOOD_OLD_INODE_SIZE;
// For directories, add +1 link count for the "." entry in self.
e2inode.i_links_count = is_directory(mode);
@ -606,7 +627,7 @@ ErrorOr<void> Ext2FS::free_inode(Ext2FSInode& inode)
}
// NOTE: After this point, the inode metadata is wiped.
memset(&inode.m_raw_inode, 0, sizeof(ext2_inode));
memset(&inode.m_raw_inode, 0, sizeof(ext2_inode_large));
inode.m_raw_inode.i_dtime = kgettimeofday().truncated_seconds_since_epoch();
TRY(write_ext2_inode(inode.index(), inode.m_raw_inode));
@ -707,7 +728,11 @@ ErrorOr<NonnullRefPtr<Ext2FSInode>> Ext2FS::build_root_inode() const
auto inode = TRY(adopt_nonnull_ref_or_enomem(new (nothrow) Ext2FSInode(const_cast<Ext2FS&>(*this), EXT2_ROOT_INO)));
auto buffer = UserOrKernelBuffer::for_kernel_buffer(reinterpret_cast<u8*>(&inode->m_raw_inode));
TRY(read_block(block_index, &buffer, sizeof(ext2_inode), offset));
size_t size = min(inode_size(), sizeof(ext2_inode_large));
VERIFY(size >= EXT2_GOOD_OLD_INODE_SIZE);
TRY(read_block(block_index, &buffer, size, offset));
return inode;
}
@ -744,7 +769,11 @@ ErrorOr<NonnullRefPtr<Inode>> Ext2FS::get_inode(InodeIdentifier inode) const
auto new_inode = TRY(adopt_nonnull_ref_or_enomem(new (nothrow) Ext2FSInode(const_cast<Ext2FS&>(*this), inode.index())));
auto buffer = UserOrKernelBuffer::for_kernel_buffer(reinterpret_cast<u8*>(&new_inode->m_raw_inode));
TRY(read_block(block_index, &buffer, sizeof(ext2_inode), offset));
size_t size = min(inode_size(), sizeof(ext2_inode_large));
VERIFY(size >= EXT2_GOOD_OLD_INODE_SIZE);
TRY(read_block(block_index, &buffer, size, offset));
TRY(m_inode_cache.try_set(inode.index(), new_inode));
return new_inode;

View file

@ -79,7 +79,10 @@ private:
ErrorOr<NonnullRefPtr<Ext2FSInode>> build_root_inode() const;
ErrorOr<void> write_ext2_inode(InodeIndex, ext2_inode const&);
// NOTE: The large Ext2 inode structure is a strict superset of the classic 128-byte inode structure,
// so this function simply ignores all the extra data if the filesystem doesn't support large inodes.
ErrorOr<void> write_ext2_inode(InodeIndex, ext2_inode_large const&);
bool find_block_containing_inode(InodeIndex, BlockIndex& block_index, unsigned& offset) const;
ErrorOr<void> flush_super_block();

View file

@ -38,6 +38,16 @@ static u8 to_ext2_file_type(mode_t mode)
return EXT2_FT_UNKNOWN;
}
// Reports whether a field of the raw inode lies entirely inside the portion of the
// on-disk inode that is actually in use.
//
// base:         address of the raw inode structure.
// value_offset: address of the field being checked (callers pass an absolute address,
//               so the offset into the inode is `value_offset - base`).
// value_size:   size of the field in bytes.
//
// Fields that end within the classic 128-byte inode are always considered present.
// Anything beyond that must end within EXT2_GOOD_OLD_INODE_SIZE + i_extra_isize.
bool Ext2FSInode::is_within_inode_bounds(FlatPtr base, FlatPtr value_offset, size_t value_size) const
{
    // One-past-the-end offset of the field, relative to the start of the inode.
    // The size is added, not subtracted: subtracting it underflows for fields near the start
    // of the inode and wrongly accepts fields that begin right after the classic area.
    auto const field_end = value_offset - base + value_size;
    if (field_end <= EXT2_GOOD_OLD_INODE_SIZE)
        return true;

    auto const used_inode_size = static_cast<u64>(EXT2_GOOD_OLD_INODE_SIZE + m_raw_inode.i_extra_isize);
    VERIFY(used_inode_size <= fs().inode_size());
    return field_end <= used_inode_size;
}
ErrorOr<void> Ext2FSInode::write_singly_indirect_block_pointer(BlockBasedFileSystem::BlockIndex logical_block_index, BlockBasedFileSystem::BlockIndex on_disk_index)
{
auto const entries_per_block = EXT2_ADDR_PER_BLOCK(&fs().super_block());
@ -417,10 +427,35 @@ InodeMetadata Ext2FSInode::metadata() const
metadata.uid = inode_uid(m_raw_inode);
metadata.gid = inode_gid(m_raw_inode);
metadata.link_count = m_raw_inode.i_links_count;
metadata.atime = UnixDateTime::from_seconds_since_epoch(static_cast<i32>(m_raw_inode.i_atime));
metadata.ctime = UnixDateTime::from_seconds_since_epoch(static_cast<i32>(m_raw_inode.i_ctime));
metadata.mtime = UnixDateTime::from_seconds_since_epoch(static_cast<i32>(m_raw_inode.i_mtime));
metadata.dtime = UnixDateTime::from_seconds_since_epoch(static_cast<i32>(m_raw_inode.i_dtime));
auto decode_time = [this](u32 const& time, u32 const& time_extra) {
// NOTE: All the *_extra fields have to be bounds-checked in case we have oddly-sized inodes.
// This is simply a correctness measure, since an OOB read wouldn't happen anyway due to the
// fact that we always store the raw inode as an ext2_inode_large.
if (is_within_inode_bounds(bit_cast<FlatPtr>(&m_raw_inode), bit_cast<FlatPtr>(&time_extra), sizeof(time_extra))) {
time_t seconds = decode_seconds_with_extra(time, time_extra);
u32 nanoseconds = decode_nanoseconds_from_extra(time_extra);
return UnixDateTime::from_unix_timespec({ .tv_sec = seconds, .tv_nsec = nanoseconds });
}
return UnixDateTime::from_seconds_since_epoch(static_cast<i32>(time));
};
metadata.atime = decode_time(m_raw_inode.i_atime, m_raw_inode.i_atime_extra);
metadata.mtime = decode_time(m_raw_inode.i_mtime, m_raw_inode.i_mtime_extra);
// NOTE: There's no i_dtime_extra, so we use i_ctime_extra to approximate the right epoch for metadata.dtime.
if (is_within_inode_bounds(bit_cast<FlatPtr>(&m_raw_inode), bit_cast<FlatPtr>(&m_raw_inode.i_ctime_extra), sizeof(m_raw_inode.i_ctime_extra))) {
time_t ctime_seconds = decode_seconds_with_extra(m_raw_inode.i_ctime, m_raw_inode.i_ctime_extra);
u32 ctime_nanoseconds = decode_nanoseconds_from_extra(m_raw_inode.i_ctime_extra);
metadata.ctime = UnixDateTime::from_unix_timespec({ .tv_sec = ctime_seconds, .tv_nsec = ctime_nanoseconds });
metadata.dtime = UnixDateTime::from_seconds_since_epoch(decode_seconds_with_extra(m_raw_inode.i_dtime, m_raw_inode.i_ctime_extra));
} else {
metadata.ctime = UnixDateTime::from_seconds_since_epoch(static_cast<i32>(m_raw_inode.i_ctime));
metadata.dtime = UnixDateTime::from_seconds_since_epoch(static_cast<i32>(m_raw_inode.i_dtime));
}
metadata.block_size = fs().logical_block_size();
metadata.block_count = m_raw_inode.i_blocks;
@ -895,18 +930,27 @@ ErrorOr<void> Ext2FSInode::update_timestamps(Optional<UnixDateTime> atime, Optio
MutexLocker locker(m_inode_lock);
if (fs().is_readonly())
return EROFS;
if (atime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max())
if (atime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max() && !is_within_inode_bounds(bit_cast<FlatPtr>(&m_raw_inode), bit_cast<FlatPtr>(&m_raw_inode.i_atime_extra), sizeof(m_raw_inode.i_atime_extra)))
return EINVAL;
if (ctime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max())
if (ctime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max() && !is_within_inode_bounds(bit_cast<FlatPtr>(&m_raw_inode), bit_cast<FlatPtr>(&m_raw_inode.i_ctime_extra), sizeof(m_raw_inode.i_ctime_extra)))
return EINVAL;
if (mtime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max())
if (mtime.value_or({}).to_timespec().tv_sec > NumericLimits<i32>::max() && !is_within_inode_bounds(bit_cast<FlatPtr>(&m_raw_inode), bit_cast<FlatPtr>(&m_raw_inode.i_mtime_extra), sizeof(m_raw_inode.i_mtime_extra)))
return EINVAL;
if (atime.has_value())
m_raw_inode.i_atime = atime.value().to_timespec().tv_sec;
if (ctime.has_value())
m_raw_inode.i_ctime = ctime.value().to_timespec().tv_sec;
if (mtime.has_value())
m_raw_inode.i_mtime = mtime.value().to_timespec().tv_sec;
auto maybe_encode_time = [](auto const& source, u32& time, u32& time_extra) {
if (!source.has_value())
return;
time_t seconds = source.value().to_timespec().tv_sec;
u32 nanoseconds = source.value().to_timespec().tv_nsec;
time = static_cast<u32>(seconds);
time_extra = encode_time_to_extra(seconds, nanoseconds);
};
maybe_encode_time(atime, m_raw_inode.i_atime, m_raw_inode.i_atime_extra);
maybe_encode_time(ctime, m_raw_inode.i_ctime, m_raw_inode.i_ctime_extra);
maybe_encode_time(mtime, m_raw_inode.i_mtime, m_raw_inode.i_mtime_extra);
set_metadata_dirty(true);
return {};
}

View file

@ -47,6 +47,12 @@ private:
virtual ErrorOr<void> truncate_locked(u64) override;
virtual ErrorOr<int> get_block_address(int) override;
bool is_within_inode_bounds(FlatPtr base, FlatPtr value_offset, size_t value_size) const;
// Combines the 32-bit seconds field with the epoch bits (the low EXT4_EPOCH_BITS bits of
// `extra`) into a full time_t. Each epoch step adds 2^32 seconds.
static time_t decode_seconds_with_extra(i32 seconds, u32 extra)
{
    auto const epoch = extra & EXT4_EPOCH_MASK;
    auto const base_seconds = static_cast<time_t>(seconds);
    if (epoch == 0)
        return base_seconds;
    return base_seconds + (static_cast<time_t>(epoch) << 32);
}
// Extracts the nanosecond value stored above the epoch bits of an *_extra field.
static u32 decode_nanoseconds_from_extra(u32 extra)
{
    auto const nanosecond_bits = extra & EXT4_NSEC_MASK;
    return nanosecond_bits >> EXT4_EPOCH_BITS;
}
// Builds the value of an *_extra field: the epoch bits in the low EXT4_EPOCH_BITS bits and
// the nanoseconds shifted above them.
static u32 encode_time_to_extra(time_t seconds, u32 nanoseconds)
{
    // The part of `seconds` that does not fit into a signed 32-bit value is a multiple of 2^32;
    // shifting it down yields the epoch count.
    auto const overflow = static_cast<time_t>(seconds) - static_cast<i32>(seconds);
    auto const epoch_bits = (overflow >> 32) & EXT4_EPOCH_MASK;
    return epoch_bits | (nanoseconds << EXT4_EPOCH_BITS);
}
ErrorOr<BlockBasedFileSystem::BlockIndex> allocate_block(BlockBasedFileSystem::BlockIndex, bool zero_newly_allocated_block, bool allow_cache);
ErrorOr<u32> allocate_and_zero_block();
@ -86,7 +92,7 @@ private:
mutable Ext2FSBlockView m_block_view;
HashMap<NonnullOwnPtr<KString>, InodeIndex> m_lookup_cache;
ext2_inode m_raw_inode {};
ext2_inode_large m_raw_inode {};
};
inline Ext2FS& Ext2FSInode::fs()