mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-23 00:20:52 -05:00
vfs-6.11.iomap
-----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZpEHLQAKCRCRxhvAZXjc ot3sAP9TBUM+vzUcQ5SVcUnSX+y3dhOGYnquORBbRc/Y6AzLMAEAu3TcsvdoaWfy 6ImUaju6iLqy9cCY3uDlNmJR16E4IgE= =Bwpy -----END PGP SIGNATURE----- Merge tag 'vfs-6.11.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs Pull iomap updates from Christian Brauner: "This contains some minor work for the iomap subsystem: - Add documentation on the design of iomap and how to port to it - Optimize iomap_read_folio() - Bring back the change to iomap_write_end() to no increase i_size. This is accompanied by a change to xfs to reserve blocks for truncating large realtime inodes to avoid exposing stale data when iomap_write_end() stops increasing i_size" * tag 'vfs-6.11.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: iomap: don't increase i_size in iomap_write_end() xfs: reserve blocks for truncating large realtime inode Documentation: the design of iomap and how to port iomap: Optimize iomap_read_folio
This commit is contained in:
commit
4f5e249ec0
8 changed files with 1352 additions and 27 deletions
|
@ -34,6 +34,7 @@ algorithms work.
|
|||
seq_file
|
||||
sharedsubtree
|
||||
idmappings
|
||||
iomap/index
|
||||
|
||||
automount-support
|
||||
|
||||
|
|
441
Documentation/filesystems/iomap/design.rst
Normal file
441
Documentation/filesystems/iomap/design.rst
Normal file
|
@ -0,0 +1,441 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. _iomap_design:
|
||||
|
||||
..
|
||||
Dumb style notes to maintain the author's sanity:
|
||||
Please try to start sentences on separate lines so that
|
||||
sentence changes don't bleed colors in diff.
|
||||
Heading decorations are documented in sphinx.rst.
|
||||
|
||||
==============
|
||||
Library Design
|
||||
==============
|
||||
|
||||
.. contents:: Table of Contents
|
||||
:local:
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
iomap is a filesystem library for handling common file operations.
|
||||
The library has two layers:
|
||||
|
||||
1. A lower layer that provides an iterator over ranges of file offsets.
|
||||
This layer tries to obtain mappings of each file range to storage
|
||||
from the filesystem, but the storage information is not necessarily
|
||||
required.
|
||||
|
||||
2. An upper layer that acts upon the space mappings provided by the
|
||||
lower layer iterator.
|
||||
|
||||
The iteration can involve mappings of file's logical offset ranges to
|
||||
physical extents, but the storage layer information is not necessarily
|
||||
required, e.g. for walking cached file information.
|
||||
The library exports various APIs for implementing file operations such
|
||||
as:
|
||||
|
||||
* Pagecache reads and writes
|
||||
* Folio write faults to the pagecache
|
||||
* Writeback of dirty folios
|
||||
* Direct I/O reads and writes
|
||||
* fsdax I/O reads, writes, loads, and stores
|
||||
* FIEMAP
|
||||
* lseek ``SEEK_DATA`` and ``SEEK_HOLE``
|
||||
* swapfile activation
|
||||
|
||||
The origin of this library is the file I/O path that XFS once used; it
|
||||
has now been extended to cover several other operations.
|
||||
|
||||
Who Should Read This?
|
||||
=====================
|
||||
|
||||
The target audience for this document are filesystem, storage, and
|
||||
pagecache programmers and code reviewers.
|
||||
|
||||
If you are working on PCI, machine architectures, or device drivers, you
|
||||
are most likely in the wrong place.
|
||||
|
||||
How Is This Better?
|
||||
===================
|
||||
|
||||
Unlike the classic Linux I/O model which breaks file I/O into small
|
||||
units (generally memory pages or blocks) and looks up space mappings on
|
||||
the basis of that unit, the iomap model asks the filesystem for the
|
||||
largest space mappings that it can create for a given file operation and
|
||||
initiates operations on that basis.
|
||||
This strategy improves the filesystem's visibility into the size of the
|
||||
operation being performed, which enables it to combat fragmentation with
|
||||
larger space allocations when possible.
|
||||
Larger space mappings improve runtime performance by amortizing the cost
|
||||
of mapping function calls into the filesystem across a larger amount of
|
||||
data.
|
||||
|
||||
At a high level, an iomap operation `looks like this
|
||||
<https://lore.kernel.org/all/ZGbVaewzcCysclPt@dread.disaster.area/>`_:
|
||||
|
||||
1. For each byte in the operation range...
|
||||
|
||||
1. Obtain a space mapping via ``->iomap_begin``
|
||||
|
||||
2. For each sub-unit of work...
|
||||
|
||||
1. Revalidate the mapping and go back to (1) above, if necessary.
|
||||
So far only the pagecache operations need to do this.
|
||||
|
||||
2. Do the work
|
||||
|
||||
3. Increment operation cursor
|
||||
|
||||
4. Release the mapping via ``->iomap_end``, if necessary
|
||||
|
||||
Each iomap operation will be covered in more detail below.
|
||||
This library was covered previously by an `LWN article
|
||||
<https://lwn.net/Articles/935934/>`_ and a `KernelNewbies page
|
||||
<https://kernelnewbies.org/KernelProjects/iomap>`_.
|
||||
|
||||
The goal of this document is to provide a brief discussion of the
|
||||
design and capabilities of iomap, followed by a more detailed catalog
|
||||
of the interfaces presented by iomap.
|
||||
If you change iomap, please update this design document.
|
||||
|
||||
File Range Iterator
|
||||
===================
|
||||
|
||||
Definitions
|
||||
-----------
|
||||
|
||||
* **buffer head**: Shattered remnants of the old buffer cache.
|
||||
|
||||
* ``fsblock``: The block size of a file, also known as ``i_blocksize``.
|
||||
|
||||
* ``i_rwsem``: The VFS ``struct inode`` rwsemaphore.
|
||||
Processes hold this in shared mode to read file state and contents.
|
||||
Some filesystems may allow shared mode for writes.
|
||||
Processes often hold this in exclusive mode to change file state and
|
||||
contents.
|
||||
|
||||
* ``invalidate_lock``: The pagecache ``struct address_space``
|
||||
rwsemaphore that protects against folio insertion and removal for
|
||||
filesystems that support punching out folios below EOF.
|
||||
Processes wishing to insert folios must hold this lock in shared
|
||||
mode to prevent removal, though concurrent insertion is allowed.
|
||||
Processes wishing to remove folios must hold this lock in exclusive
|
||||
mode to prevent insertions.
|
||||
Concurrent removals are not allowed.
|
||||
|
||||
* ``dax_read_lock``: The RCU read lock that dax takes to prevent a
|
||||
device pre-shutdown hook from returning before other threads have
|
||||
released resources.
|
||||
|
||||
* **filesystem mapping lock**: This synchronization primitive is
|
||||
internal to the filesystem and must protect the file mapping data
|
||||
from updates while a mapping is being sampled.
|
||||
The filesystem author must determine how this coordination should
|
||||
happen; it does not need to be an actual lock.
|
||||
|
||||
* **iomap internal operation lock**: This is a general term for
|
||||
synchronization primitives that iomap functions take while holding a
|
||||
mapping.
|
||||
A specific example would be taking the folio lock while reading or
|
||||
writing the pagecache.
|
||||
|
||||
* **pure overwrite**: A write operation that does not require any
|
||||
metadata or zeroing operations to perform during either submission
|
||||
or completion.
|
||||
This implies that the filesystem must have already allocated space
|
||||
on disk as ``IOMAP_MAPPED`` and the filesystem must not place any
|
||||
constraints on IO alignment or size.
|
||||
The only constraints on I/O alignment are device level (minimum I/O
|
||||
size and alignment, typically sector size).
|
||||
|
||||
``struct iomap``
|
||||
----------------
|
||||
|
||||
The filesystem communicates to the iomap iterator the mapping of
|
||||
byte ranges of a file to byte ranges of a storage device with the
|
||||
structure below:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct iomap {
|
||||
u64 addr;
|
||||
loff_t offset;
|
||||
u64 length;
|
||||
u16 type;
|
||||
u16 flags;
|
||||
struct block_device *bdev;
|
||||
struct dax_device *dax_dev;
|
||||
void *inline_data;
|
||||
void *private;
|
||||
const struct iomap_folio_ops *folio_ops;
|
||||
u64 validity_cookie;
|
||||
};
|
||||
|
||||
The fields are as follows:
|
||||
|
||||
* ``offset`` and ``length`` describe the range of file offsets, in
|
||||
bytes, covered by this mapping.
|
||||
These fields must always be set by the filesystem.
|
||||
|
||||
* ``type`` describes the type of the space mapping:
|
||||
|
||||
* **IOMAP_HOLE**: No storage has been allocated.
|
||||
This type must never be returned in response to an ``IOMAP_WRITE``
|
||||
operation because writes must allocate and map space, and return
|
||||
the mapping.
|
||||
The ``addr`` field must be set to ``IOMAP_NULL_ADDR``.
|
||||
iomap does not support writing (whether via pagecache or direct
|
||||
I/O) to a hole.
|
||||
|
||||
* **IOMAP_DELALLOC**: A promise to allocate space at a later time
|
||||
("delayed allocation").
|
||||
If the filesystem returns IOMAP_F_NEW here and the write fails, the
|
||||
``->iomap_end`` function must delete the reservation.
|
||||
The ``addr`` field must be set to ``IOMAP_NULL_ADDR``.
|
||||
|
||||
* **IOMAP_MAPPED**: The file range maps to specific space on the
|
||||
storage device.
|
||||
The device is returned in ``bdev`` or ``dax_dev``.
|
||||
The device address, in bytes, is returned via ``addr``.
|
||||
|
||||
* **IOMAP_UNWRITTEN**: The file range maps to specific space on the
|
||||
storage device, but the space has not yet been initialized.
|
||||
The device is returned in ``bdev`` or ``dax_dev``.
|
||||
The device address, in bytes, is returned via ``addr``.
|
||||
Reads from this type of mapping will return zeroes to the caller.
|
||||
For a write or writeback operation, the ioend should update the
|
||||
mapping to MAPPED.
|
||||
Refer to the sections about ioends for more details.
|
||||
|
||||
* **IOMAP_INLINE**: The file range maps to the memory buffer
|
||||
specified by ``inline_data``.
|
||||
For write operations, the ``->iomap_end`` function presumably
|
||||
handles persisting the data.
|
||||
The ``addr`` field must be set to ``IOMAP_NULL_ADDR``.
|
||||
|
||||
* ``flags`` describe the status of the space mapping.
|
||||
These flags should be set by the filesystem in ``->iomap_begin``:
|
||||
|
||||
* **IOMAP_F_NEW**: The space under the mapping is newly allocated.
|
||||
Areas that will not be written to must be zeroed.
|
||||
If a write fails and the mapping is a space reservation, the
|
||||
reservation must be deleted.
|
||||
|
||||
* **IOMAP_F_DIRTY**: The inode will have uncommitted metadata needed
|
||||
to access any data written.
|
||||
fdatasync is required to commit these changes to persistent
|
||||
storage.
|
||||
This needs to take into account metadata changes that *may* be made
|
||||
at I/O completion, such as file size updates from direct I/O.
|
||||
|
||||
* **IOMAP_F_SHARED**: The space under the mapping is shared.
|
||||
Copy on write is necessary to avoid corrupting other file data.
|
||||
|
||||
* **IOMAP_F_BUFFER_HEAD**: This mapping requires the use of buffer
|
||||
heads for pagecache operations.
|
||||
Do not add more uses of this.
|
||||
|
||||
* **IOMAP_F_MERGED**: Multiple contiguous block mappings were
|
||||
coalesced into this single mapping.
|
||||
This is only useful for FIEMAP.
|
||||
|
||||
* **IOMAP_F_XATTR**: The mapping is for extended attribute data, not
|
||||
regular file data.
|
||||
This is only useful for FIEMAP.
|
||||
|
||||
* **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can
|
||||
be set by the filesystem for its own purposes.
|
||||
|
||||
These flags can be set by iomap itself during file operations.
|
||||
The filesystem should supply an ``->iomap_end`` function if it needs
|
||||
to observe these flags:
|
||||
|
||||
* **IOMAP_F_SIZE_CHANGED**: The file size has changed as a result of
|
||||
using this mapping.
|
||||
|
||||
* **IOMAP_F_STALE**: The mapping was found to be stale.
|
||||
iomap will call ``->iomap_end`` on this mapping and then
|
||||
``->iomap_begin`` to obtain a new mapping.
|
||||
|
||||
Currently, these flags are only set by pagecache operations.
|
||||
|
||||
* ``addr`` describes the device address, in bytes.
|
||||
|
||||
* ``bdev`` describes the block device for this mapping.
|
||||
This only needs to be set for mapped or unwritten operations.
|
||||
|
||||
* ``dax_dev`` describes the DAX device for this mapping.
|
||||
This only needs to be set for mapped or unwritten operations, and
|
||||
only for a fsdax operation.
|
||||
|
||||
* ``inline_data`` points to a memory buffer for I/O involving
|
||||
``IOMAP_INLINE`` mappings.
|
||||
This value is ignored for all other mapping types.
|
||||
|
||||
* ``private`` is a pointer to `filesystem-private information
|
||||
<https://lore.kernel.org/all/20180619164137.13720-7-hch@lst.de/>`_.
|
||||
This value will be passed unchanged to ``->iomap_end``.
|
||||
|
||||
* ``folio_ops`` will be covered in the section on pagecache operations.
|
||||
|
||||
* ``validity_cookie`` is a magic freshness value set by the filesystem
|
||||
that should be used to detect stale mappings.
|
||||
For pagecache operations this is critical for correct operation
|
||||
because page faults can occur, which implies that filesystem locks
|
||||
should not be held between ``->iomap_begin`` and ``->iomap_end``.
|
||||
Filesystems with completely static mappings need not set this value.
|
||||
Only pagecache operations revalidate mappings; see the section about
|
||||
``iomap_valid`` for details.
|
||||
|
||||
``struct iomap_ops``
|
||||
--------------------
|
||||
|
||||
Every iomap function requires the filesystem to pass an operations
|
||||
structure to obtain a mapping and (optionally) to release the mapping:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct iomap_ops {
|
||||
int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
|
||||
unsigned flags, struct iomap *iomap,
|
||||
struct iomap *srcmap);
|
||||
|
||||
int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
|
||||
ssize_t written, unsigned flags,
|
||||
struct iomap *iomap);
|
||||
};
|
||||
|
||||
``->iomap_begin``
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
iomap operations call ``->iomap_begin`` to obtain one file mapping for
|
||||
the range of bytes specified by ``pos`` and ``length`` for the file
|
||||
``inode``.
|
||||
This mapping should be returned through the ``iomap`` pointer.
|
||||
The mapping must cover at least the first byte of the supplied file
|
||||
range, but it does not need to cover the entire requested range.
|
||||
|
||||
Each iomap operation describes the requested operation through the
|
||||
``flags`` argument.
|
||||
The exact value of ``flags`` will be documented in the
|
||||
operation-specific sections below.
|
||||
These flags can, at least in principle, apply generally to iomap
|
||||
operations:
|
||||
|
||||
* ``IOMAP_DIRECT`` is set when the caller wishes to issue file I/O to
|
||||
block storage.
|
||||
|
||||
* ``IOMAP_DAX`` is set when the caller wishes to issue file I/O to
|
||||
memory-like storage.
|
||||
|
||||
* ``IOMAP_NOWAIT`` is set when the caller wishes to perform a best
|
||||
effort attempt to avoid any operation that would result in blocking
|
||||
the submitting task.
|
||||
This is similar in intent to ``O_NONBLOCK`` for network APIs - it is
|
||||
intended for asynchronous applications to keep doing other work
|
||||
instead of waiting for the specific unavailable filesystem resource
|
||||
to become available.
|
||||
Filesystems implementing ``IOMAP_NOWAIT`` semantics need to use
|
||||
trylock algorithms.
|
||||
They need to be able to satisfy the entire I/O request range with a
|
||||
single iomap mapping.
|
||||
They need to avoid reading or writing metadata synchronously.
|
||||
They need to avoid blocking memory allocations.
|
||||
They need to avoid waiting on transaction reservations to allow
|
||||
modifications to take place.
|
||||
They probably should not be allocating new space.
|
||||
And so on.
|
||||
If there is any doubt in the filesystem developer's mind as to
|
||||
whether any specific ``IOMAP_NOWAIT`` operation may end up blocking,
|
||||
then they should return ``-EAGAIN`` as early as possible rather than
|
||||
start the operation and force the submitting task to block.
|
||||
``IOMAP_NOWAIT`` is often set on behalf of ``IOCB_NOWAIT`` or
|
||||
``RWF_NOWAIT``.
|
||||
|
||||
If it is necessary to read existing file contents from a `different
|
||||
<https://lore.kernel.org/all/20191008071527.29304-9-hch@lst.de/>`_
|
||||
device or address range on a device, the filesystem should return that
|
||||
information via ``srcmap``.
|
||||
Only pagecache and fsdax operations support reading from one mapping and
|
||||
writing to another.
|
||||
|
||||
``->iomap_end``
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
After the operation completes, the ``->iomap_end`` function, if present,
|
||||
is called to signal that iomap is finished with a mapping.
|
||||
Typically, implementations will use this function to tear down any
|
||||
context that was set up in ``->iomap_begin``.
|
||||
For example, a write might wish to commit the reservations for the bytes
|
||||
that were operated upon and unreserve any space that was not operated
|
||||
upon.
|
||||
``written`` might be zero if no bytes were touched.
|
||||
``flags`` will contain the same value passed to ``->iomap_begin``.
|
||||
iomap ops for reads are not likely to need to supply this function.
|
||||
|
||||
Both functions should return a negative errno code on error, or zero on
|
||||
success.
|
||||
|
||||
Preparing for File Operations
|
||||
=============================
|
||||
|
||||
iomap only handles mapping and I/O.
|
||||
Filesystems must still call out to the VFS to check input parameters
|
||||
and file state before initiating an I/O operation.
|
||||
It does not handle obtaining filesystem freeze protection, updating of
|
||||
timestamps, stripping privileges, or access control.
|
||||
|
||||
Locking Hierarchy
|
||||
=================
|
||||
|
||||
iomap requires that filesystems supply their own locking model.
|
||||
There are three categories of synchronization primitives, as far as
|
||||
iomap is concerned:
|
||||
|
||||
* The **upper** level primitive is provided by the filesystem to
|
||||
coordinate access to different iomap operations.
|
||||
The exact primitive is specific to the filesystem and operation,
|
||||
but is often a VFS inode, pagecache invalidation, or folio lock.
|
||||
For example, a filesystem might take ``i_rwsem`` before calling
|
||||
``iomap_file_buffered_write`` and ``iomap_file_unshare`` to prevent
|
||||
these two file operations from clobbering each other.
|
||||
Pagecache writeback may lock a folio to prevent other threads from
|
||||
accessing the folio until writeback is underway.
|
||||
|
||||
* The **lower** level primitive is taken by the filesystem in the
|
||||
``->iomap_begin`` and ``->iomap_end`` functions to coordinate
|
||||
access to the file space mapping information.
|
||||
The fields of the iomap object should be filled out while holding
|
||||
this primitive.
|
||||
The upper level synchronization primitive, if any, remains held
|
||||
while acquiring the lower level synchronization primitive.
|
||||
For example, XFS takes ``ILOCK_EXCL`` and ext4 takes ``i_data_sem``
|
||||
while sampling mappings.
|
||||
Filesystems with immutable mapping information may not require
|
||||
synchronization here.
|
||||
|
||||
* The **operation** primitive is taken by an iomap operation to
|
||||
coordinate access to its own internal data structures.
|
||||
The upper level synchronization primitive, if any, remains held
|
||||
while acquiring this primitive.
|
||||
The lower level primitive is not held while acquiring this
|
||||
primitive.
|
||||
For example, pagecache write operations will obtain a file mapping,
|
||||
then grab and lock a folio to copy new contents.
|
||||
It may also lock an internal folio state object to update metadata.
|
||||
|
||||
The exact locking requirements are specific to the filesystem; for
|
||||
certain operations, some of these locks can be elided.
|
||||
All further mentions of locking are *recommendations*, not mandates.
|
||||
Each filesystem author must figure out the locking for themself.
|
||||
|
||||
Bugs and Limitations
|
||||
====================
|
||||
|
||||
* No support for fscrypt.
|
||||
* No support for compression.
|
||||
* No support for fsverity yet.
|
||||
* Strong assumptions that IO should work the way it does on XFS.
|
||||
* Does iomap *actually* work for non-regular file data?
|
||||
|
||||
Patches welcome!
|
13
Documentation/filesystems/iomap/index.rst
Normal file
13
Documentation/filesystems/iomap/index.rst
Normal file
|
@ -0,0 +1,13 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=======================
|
||||
VFS iomap Documentation
|
||||
=======================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:numbered:
|
||||
|
||||
design
|
||||
operations
|
||||
porting
|
713
Documentation/filesystems/iomap/operations.rst
Normal file
713
Documentation/filesystems/iomap/operations.rst
Normal file
|
@ -0,0 +1,713 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. _iomap_operations:
|
||||
|
||||
..
|
||||
Dumb style notes to maintain the author's sanity:
|
||||
Please try to start sentences on separate lines so that
|
||||
sentence changes don't bleed colors in diff.
|
||||
Heading decorations are documented in sphinx.rst.
|
||||
|
||||
=========================
|
||||
Supported File Operations
|
||||
=========================
|
||||
|
||||
.. contents:: Table of Contents
|
||||
:local:
|
||||
|
||||
Below is a discussion of the high level file operations that iomap
|
||||
implements.
|
||||
|
||||
Buffered I/O
|
||||
============
|
||||
|
||||
Buffered I/O is the default file I/O path in Linux.
|
||||
File contents are cached in memory ("pagecache") to satisfy reads and
|
||||
writes.
|
||||
Dirty cache will be written back to disk at some point that can be
|
||||
forced via ``fsync`` and variants.
|
||||
|
||||
iomap implements nearly all the folio and pagecache management that
|
||||
filesystems have to implement themselves under the legacy I/O model.
|
||||
This means that the filesystem need not know the details of allocating,
|
||||
mapping, managing uptodate and dirty state, or writeback of pagecache
|
||||
folios.
|
||||
Under the legacy I/O model, this was managed very inefficiently with
|
||||
linked lists of buffer heads instead of the per-folio bitmaps that iomap
|
||||
uses.
|
||||
Unless the filesystem explicitly opts in to buffer heads, they will not
|
||||
be used, which makes buffered I/O much more efficient, and the pagecache
|
||||
maintainer much happier.
|
||||
|
||||
``struct address_space_operations``
|
||||
-----------------------------------
|
||||
|
||||
The following iomap functions can be referenced directly from the
|
||||
address space operations structure:
|
||||
|
||||
* ``iomap_dirty_folio``
|
||||
* ``iomap_release_folio``
|
||||
* ``iomap_invalidate_folio``
|
||||
* ``iomap_is_partially_uptodate``
|
||||
|
||||
The following address space operations can be wrapped easily:
|
||||
|
||||
* ``read_folio``
|
||||
* ``readahead``
|
||||
* ``writepages``
|
||||
* ``bmap``
|
||||
* ``swap_activate``
|
||||
|
||||
``struct iomap_folio_ops``
|
||||
--------------------------
|
||||
|
||||
The ``->iomap_begin`` function for pagecache operations may set the
|
||||
``struct iomap::folio_ops`` field to an ops structure to override
|
||||
default behaviors of iomap:
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct iomap_folio_ops {
|
||||
struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos,
|
||||
unsigned len);
|
||||
void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied,
|
||||
struct folio *folio);
|
||||
bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
|
||||
};
|
||||
|
||||
iomap calls these functions:
|
||||
|
||||
- ``get_folio``: Called to allocate and return an active reference to
|
||||
a locked folio prior to starting a write.
|
||||
If this function is not provided, iomap will call
|
||||
``iomap_get_folio``.
|
||||
This could be used to `set up per-folio filesystem state
|
||||
<https://lore.kernel.org/all/20190429220934.10415-5-agruenba@redhat.com/>`_
|
||||
for a write.
|
||||
|
||||
- ``put_folio``: Called to unlock and put a folio after a pagecache
|
||||
operation completes.
|
||||
If this function is not provided, iomap will ``folio_unlock`` and
|
||||
``folio_put`` on its own.
|
||||
This could be used to `commit per-folio filesystem state
|
||||
<https://lore.kernel.org/all/20180619164137.13720-6-hch@lst.de/>`_
|
||||
that was set up by ``->get_folio``.
|
||||
|
||||
- ``iomap_valid``: The filesystem may not hold locks between
|
||||
``->iomap_begin`` and ``->iomap_end`` because pagecache operations
|
||||
can take folio locks, fault on userspace pages, initiate writeback
|
||||
for memory reclamation, or engage in other time-consuming actions.
|
||||
If a file's space mapping data are mutable, it is possible that the
|
||||
mapping for a particular pagecache folio can `change in the time it
|
||||
takes
|
||||
<https://lore.kernel.org/all/20221123055812.747923-8-david@fromorbit.com/>`_
|
||||
to allocate, install, and lock that folio.
|
||||
|
||||
For the pagecache, races can happen if writeback doesn't take
|
||||
``i_rwsem`` or ``invalidate_lock`` and updates mapping information.
|
||||
Races can also happen if the filesystem allows concurrent writes.
|
||||
For such files, the mapping *must* be revalidated after the folio
|
||||
lock has been taken so that iomap can manage the folio correctly.
|
||||
|
||||
fsdax does not need this revalidation because there's no writeback
|
||||
and no support for unwritten extents.
|
||||
|
||||
Filesystems subject to this kind of race must provide a
|
||||
``->iomap_valid`` function to decide if the mapping is still valid.
|
||||
If the mapping is not valid, the mapping will be sampled again.
|
||||
|
||||
To support making the validity decision, the filesystem's
|
||||
``->iomap_begin`` function may set ``struct iomap::validity_cookie``
|
||||
at the same time that it populates the other iomap fields.
|
||||
A simple validation cookie implementation is a sequence counter.
|
||||
If the filesystem bumps the sequence counter every time it modifies
|
||||
the inode's extent map, it can be placed in the ``struct
|
||||
iomap::validity_cookie`` during ``->iomap_begin``.
|
||||
If the value in the cookie is found to be different to the value
|
||||
the filesystem holds when the mapping is passed back to
|
||||
``->iomap_valid``, then the iomap should be considered stale and the
|
||||
validation failed.
|
||||
|
||||
These ``struct kiocb`` flags are significant for buffered I/O with iomap:
|
||||
|
||||
* ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``.
|
||||
|
||||
Internal per-Folio State
|
||||
------------------------
|
||||
|
||||
If the fsblock size matches the size of a pagecache folio, it is assumed
|
||||
that all disk I/O operations will operate on the entire folio.
|
||||
The uptodate (memory contents are at least as new as what's on disk) and
|
||||
dirty (memory contents are newer than what's on disk) status of the
|
||||
folio are all that's needed for this case.
|
||||
|
||||
If the fsblock size is less than the size of a pagecache folio, iomap
|
||||
tracks the per-fsblock uptodate and dirty state itself.
|
||||
This enables iomap to handle both "bs < ps" `filesystems
|
||||
<https://lore.kernel.org/all/20230725122932.144426-1-ritesh.list@gmail.com/>`_
|
||||
and large folios in the pagecache.
|
||||
|
||||
iomap internally tracks two state bits per fsblock:
|
||||
|
||||
* ``uptodate``: iomap will try to keep folios fully up to date.
|
||||
If there are read(ahead) errors, those fsblocks will not be marked
|
||||
uptodate.
|
||||
The folio itself will be marked uptodate when all fsblocks within the
|
||||
folio are uptodate.
|
||||
|
||||
* ``dirty``: iomap will set the per-block dirty state when programs
|
||||
write to the file.
|
||||
The folio itself will be marked dirty when any fsblock within the
|
||||
folio is dirty.
|
||||
|
||||
iomap also tracks the amount of read and write disk IOs that are in
|
||||
flight.
|
||||
This structure is much lighter weight than ``struct buffer_head``
|
||||
because there is only one per folio, and the per-fsblock overhead is two
|
||||
bits vs. 104 bytes.
|
||||
|
||||
Filesystems wishing to turn on large folios in the pagecache should call
|
||||
``mapping_set_large_folios`` when initializing the incore inode.
|
||||
|
||||
Buffered Readahead and Reads
|
||||
----------------------------
|
||||
|
||||
The ``iomap_readahead`` function initiates readahead to the pagecache.
|
||||
The ``iomap_read_folio`` function reads one folio's worth of data into
|
||||
the pagecache.
|
||||
The ``flags`` argument to ``->iomap_begin`` will be set to zero.
|
||||
The pagecache takes whatever locks it needs before calling the
|
||||
filesystem.
|
||||
|
||||
Buffered Writes
|
||||
---------------
|
||||
|
||||
The ``iomap_file_buffered_write`` function writes an ``iocb`` to the
|
||||
pagecache.
|
||||
``IOMAP_WRITE`` or ``IOMAP_WRITE`` | ``IOMAP_NOWAIT`` will be passed as
|
||||
the ``flags`` argument to ``->iomap_begin``.
|
||||
Callers commonly take ``i_rwsem`` in either shared or exclusive mode
|
||||
before calling this function.
|
||||
|
||||
mmap Write Faults
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
The ``iomap_page_mkwrite`` function handles a write fault to a folio in
|
||||
the pagecache.
|
||||
``IOMAP_WRITE | IOMAP_FAULT`` will be passed as the ``flags`` argument
|
||||
to ``->iomap_begin``.
|
||||
Callers commonly take the mmap ``invalidate_lock`` in shared or
|
||||
exclusive mode before calling this function.
|
||||
|
||||
Buffered Write Failures
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
After a short write to the pagecache, the areas not written will not
|
||||
become marked dirty.
|
||||
The filesystem must arrange to `cancel
|
||||
<https://lore.kernel.org/all/20221123055812.747923-6-david@fromorbit.com/>`_
|
||||
such `reservations
|
||||
<https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/>`_
|
||||
because writeback will not consume the reservation.
|
||||
The ``iomap_file_buffered_write_punch_delalloc`` function can be called from a
|
||||
``->iomap_end`` function to find all the clean areas of the folios
|
||||
caching a fresh (``IOMAP_F_NEW``) delalloc mapping.
|
||||
It takes the ``invalidate_lock``.
|
||||
|
||||
The filesystem must supply a function ``punch`` to be called for
|
||||
each file range in this state.
|
||||
This function must *only* remove delayed allocation reservations, in
|
||||
case another thread racing with the current thread writes successfully
|
||||
to the same region and triggers writeback to flush the dirty data out to
|
||||
disk.
|
||||
|
||||
Zeroing for File Operations
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Filesystems can call ``iomap_zero_range`` to perform zeroing of the
|
||||
pagecache for non-truncation file operations that are not aligned to
|
||||
the fsblock size.
|
||||
``IOMAP_ZERO`` will be passed as the ``flags`` argument to
|
||||
``->iomap_begin``.
|
||||
Callers typically hold ``i_rwsem`` and ``invalidate_lock`` in exclusive
|
||||
mode before calling this function.
|
||||
|
||||
Unsharing Reflinked File Data
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Filesystems can call ``iomap_file_unshare`` to force a file sharing
|
||||
storage with another file to preemptively copy the shared data to newly
|
||||
allocated storage.
|
||||
``IOMAP_WRITE | IOMAP_UNSHARE`` will be passed as the ``flags`` argument
|
||||
to ``->iomap_begin``.
|
||||
Callers typically hold ``i_rwsem`` and ``invalidate_lock`` in exclusive
|
||||
mode before calling this function.
|
||||
|
||||
Truncation
|
||||
----------
|
||||
|
||||
Filesystems can call ``iomap_truncate_page`` to zero the bytes in the
|
||||
pagecache from EOF to the end of the fsblock during a file truncation
|
||||
operation.
|
||||
``truncate_setsize`` or ``truncate_pagecache`` will take care of
|
||||
everything after the EOF block.
|
||||
``IOMAP_ZERO`` will be passed as the ``flags`` argument to
|
||||
``->iomap_begin``.
|
||||
Callers typically hold ``i_rwsem`` and ``invalidate_lock`` in exclusive
|
||||
mode before calling this function.
|
||||
|
||||
Pagecache Writeback
|
||||
-------------------
|
||||
|
||||
Filesystems can call ``iomap_writepages`` to respond to a request to
|
||||
write dirty pagecache folios to disk.
|
||||
The ``mapping`` and ``wbc`` parameters should be passed unchanged.
|
||||
The ``wpc`` pointer should be allocated by the filesystem and must
|
||||
be initialized to zero.
|
||||
|
||||
The pagecache will lock each folio before trying to schedule it for
|
||||
writeback.
|
||||
It does not lock ``i_rwsem`` or ``invalidate_lock``.
|
||||
|
||||
The dirty bit will be cleared for all folios run through the
|
||||
``->map_blocks`` machinery described below even if the writeback fails.
|
||||
This is to prevent dirty folio clots when storage devices fail; an
|
||||
``-EIO`` is recorded for userspace to collect via ``fsync``.
|
||||
|
||||
The ``ops`` structure must be specified and is as follows:
|
||||
|
||||
``struct iomap_writeback_ops``
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
struct iomap_writeback_ops {
|
||||
int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
|
||||
loff_t offset, unsigned len);
|
||||
int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
|
||||
void (*discard_folio)(struct folio *folio, loff_t pos);
|
||||
};
|
||||
|
||||
The fields are as follows:
|
||||
|
||||
- ``map_blocks``: Sets ``wpc->iomap`` to the space mapping of the file
|
||||
range (in bytes) given by ``offset`` and ``len``.
|
||||
iomap calls this function for each dirty fs block in each dirty folio,
|
||||
though it will `reuse mappings
|
||||
<https://lore.kernel.org/all/20231207072710.176093-15-hch@lst.de/>`_
|
||||
for runs of contiguous dirty fsblocks within a folio.
|
||||
Do not return ``IOMAP_INLINE`` mappings here; the ``->iomap_end``
|
||||
function must deal with persisting written data.
|
||||
Do not return ``IOMAP_DELALLOC`` mappings here; iomap currently
|
||||
requires mapping to allocated space.
|
||||
Filesystems can skip a potentially expensive mapping lookup if the
|
||||
mappings have not changed.
|
||||
This revalidation must be open-coded by the filesystem; it is
|
||||
unclear if ``iomap::validity_cookie`` can be reused for this
|
||||
purpose.
|
||||
This function must be supplied by the filesystem.
|
||||
|
||||
- ``prepare_ioend``: Enables filesystems to transform the writeback
|
||||
ioend or perform any other preparatory work before the writeback I/O
|
||||
is submitted.
|
||||
This might include pre-write space accounting updates, or installing
|
||||
a custom ``->bi_end_io`` function for internal purposes, such as
|
||||
deferring the ioend completion to a workqueue to run metadata update
|
||||
transactions from process context.
|
||||
This function is optional.
|
||||
|
||||
- ``discard_folio``: iomap calls this function after ``->map_blocks``
|
||||
fails to schedule I/O for any part of a dirty folio.
|
||||
The function should throw away any reservations that may have been
|
||||
made for the write.
|
||||
The folio will be marked clean and an ``-EIO`` recorded in the
|
||||
pagecache.
|
||||
Filesystems can use this callback to `remove
|
||||
<https://lore.kernel.org/all/20201029163313.1766967-1-bfoster@redhat.com/>`_
|
||||
delalloc reservations to avoid having delalloc reservations for
|
||||
clean pagecache.
|
||||
This function is optional.
|
||||
|
||||
Pagecache Writeback Completion
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
To handle the bookkeeping that must happen after disk I/O for writeback
|
||||
completes, iomap creates chains of ``struct iomap_ioend`` objects that
|
||||
wrap the ``bio`` that is used to write pagecache data to disk.
|
||||
By default, iomap finishes writeback ioends by clearing the writeback
|
||||
bit on the folios attached to the ``ioend``.
|
||||
If the write failed, it will also set the error bits on the folios and
|
||||
the address space.
|
||||
This can happen in interrupt or process context, depending on the
|
||||
storage device.
|
||||
|
||||
Filesystems that need to update internal bookkeeping (e.g. unwritten
|
||||
extent conversions) should provide a ``->prepare_ioend`` function to
|
||||
set ``struct iomap_ioend::bio::bi_end_io`` to its own function.
|
||||
This function should call ``iomap_finish_ioends`` after finishing its
|
||||
own work (e.g. unwritten extent conversion).
|
||||
|
||||
Some filesystems may wish to `amortize the cost of running metadata
|
||||
transactions
|
||||
<https://lore.kernel.org/all/20220120034733.221737-1-david@fromorbit.com/>`_
|
||||
for post-writeback updates by batching them.
|
||||
They may also require transactions to run from process context, which
|
||||
implies punting batches to a workqueue.
|
||||
iomap ioends contain a ``list_head`` to enable batching.
|
||||
|
||||
Given a batch of ioends, iomap has a few helpers to assist with
|
||||
amortization:
|
||||
|
||||
* ``iomap_sort_ioends``: Sort all the ioends in the list by file
|
||||
offset.
|
||||
|
||||
* ``iomap_ioend_try_merge``: Given an ioend that is not in any list and
|
||||
a separate list of sorted ioends, merge as many of the ioends from
|
||||
the head of the list into the given ioend.
|
||||
ioends can only be merged if the file range and storage addresses are
|
||||
contiguous; the unwritten and shared status are the same; and the
|
||||
write I/O outcome is the same.
|
||||
The merged ioends become their own list.
|
||||
|
||||
* ``iomap_finish_ioends``: Finish an ioend that possibly has other
|
||||
ioends linked to it.
|
||||
|
||||
Direct I/O
|
||||
==========
|
||||
|
||||
In Linux, direct I/O is defined as file I/O that is issued directly to
|
||||
storage, bypassing the pagecache.
|
||||
The ``iomap_dio_rw`` function implements O_DIRECT (direct I/O) reads and
|
||||
writes for files.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
const struct iomap_ops *ops,
|
||||
const struct iomap_dio_ops *dops,
|
||||
unsigned int dio_flags, void *private,
|
||||
size_t done_before);
|
||||
|
||||
The filesystem can provide the ``dops`` parameter if it needs to perform
|
||||
extra work before or after the I/O is issued to storage.
|
||||
The ``done_before`` parameter tells iomap how much of the request has
|
||||
already been transferred.
|
||||
It is used to continue a request asynchronously when `part of the
|
||||
request
|
||||
<https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c03098d4b9ad76bca2966a8769dcfe59f7f85103>`_
|
||||
has already been completed synchronously.
|
||||
|
||||
The ``done_before`` parameter should be set if writes for the ``iocb``
|
||||
have been initiated prior to the call.
|
||||
The direction of the I/O is determined from the ``iocb`` passed in.
|
||||
|
||||
The ``dio_flags`` argument can be set to any combination of the
|
||||
following values:
|
||||
|
||||
* ``IOMAP_DIO_FORCE_WAIT``: Wait for the I/O to complete even if the
|
||||
kiocb is not synchronous.
|
||||
|
||||
* ``IOMAP_DIO_OVERWRITE_ONLY``: Perform a pure overwrite for this range
|
||||
or fail with ``-EAGAIN``.
|
||||
This can be used by filesystems with complex unaligned I/O
|
||||
write paths to provide an optimised fast path for unaligned writes.
|
||||
If a pure overwrite can be performed, then serialisation against
|
||||
other I/Os to the same filesystem block(s) is unnecessary as there is
|
||||
no risk of stale data exposure or data loss.
|
||||
If a pure overwrite cannot be performed, then the filesystem can
|
||||
perform the serialisation steps needed to provide exclusive access
|
||||
to the unaligned I/O range so that it can perform allocation and
|
||||
sub-block zeroing safely.
|
||||
Filesystems can use this flag to try to reduce locking contention,
|
||||
but a lot of `detailed checking
|
||||
<https://lore.kernel.org/linux-ext4/20230314130759.642710-1-bfoster@redhat.com/>`_
|
||||
is required to do it `correctly
|
||||
<https://lore.kernel.org/linux-ext4/20230810165559.946222-1-bfoster@redhat.com/>`_.
|
||||
|
||||
* ``IOMAP_DIO_PARTIAL``: If a page fault occurs, return whatever
|
||||
progress has already been made.
|
||||
The caller may deal with the page fault and retry the operation.
|
||||
If the caller decides to retry the operation, it should pass the
|
||||
accumulated return values of all previous calls as the
|
||||
``done_before`` parameter to the next call.
|
||||
|
||||
These ``struct kiocb`` flags are significant for direct I/O with iomap:
|
||||
|
||||
* ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``.
|
||||
|
||||
* ``IOCB_SYNC``: Ensure that the device has persisted data to disk
|
||||
before completing the call.
|
||||
In the case of pure overwrites, the I/O may be issued with FUA
|
||||
enabled.
|
||||
|
||||
* ``IOCB_HIPRI``: Poll for I/O completion instead of waiting for an
|
||||
interrupt.
|
||||
Only meaningful for asynchronous I/O, and only if the entire I/O can
|
||||
be issued as a single ``struct bio``.
|
||||
|
||||
* ``IOCB_DIO_CALLER_COMP``: Try to run I/O completion from the caller's
|
||||
process context.
|
||||
See ``linux/fs.h`` for more details.
|
||||
|
||||
Filesystems should call ``iomap_dio_rw`` from ``->read_iter`` and
|
||||
``->write_iter``, and set ``FMODE_CAN_ODIRECT`` in the ``->open``
|
||||
function for the file.
|
||||
They should not set ``->direct_IO``, which is deprecated.
|
||||
|
||||
If a filesystem wishes to perform its own work before direct I/O
|
||||
completion, it should call ``__iomap_dio_rw``.
|
||||
If its return value is not an error pointer or a NULL pointer, the
|
||||
filesystem should pass the return value to ``iomap_dio_complete`` after
|
||||
finishing its internal work.
|
||||
|
||||
Return Values
|
||||
-------------
|
||||
|
||||
``iomap_dio_rw`` can return one of the following:
|
||||
|
||||
* A non-negative number of bytes transferred.
|
||||
|
||||
* ``-ENOTBLK``: Fall back to buffered I/O.
|
||||
iomap itself will return this value if it cannot invalidate the page
|
||||
cache before issuing the I/O to storage.
|
||||
The ``->iomap_begin`` or ``->iomap_end`` functions may also return
|
||||
this value.
|
||||
|
||||
* ``-EIOCBQUEUED``: The asynchronous direct I/O request has been
|
||||
queued and will be completed separately.
|
||||
|
||||
* Any of the other negative error codes.
|
||||
|
||||
Direct Reads
|
||||
------------
|
||||
|
||||
A direct I/O read initiates a read I/O from the storage device to the
|
||||
caller's buffer.
|
||||
Dirty parts of the pagecache are flushed to storage before initiating
|
||||
the read io.
|
||||
The ``flags`` value for ``->iomap_begin`` will be ``IOMAP_DIRECT`` with
|
||||
any combination of the following enhancements:
|
||||
|
||||
* ``IOMAP_NOWAIT``, as defined previously.
|
||||
|
||||
Callers commonly hold ``i_rwsem`` in shared mode before calling this
|
||||
function.
|
||||
|
||||
Direct Writes
|
||||
-------------
|
||||
|
||||
A direct I/O write initiates a write I/O to the storage device from the
|
||||
caller's buffer.
|
||||
Dirty parts of the pagecache are flushed to storage before initiating
|
||||
the write io.
|
||||
The pagecache is invalidated both before and after the write io.
|
||||
The ``flags`` value for ``->iomap_begin`` will be ``IOMAP_DIRECT |
|
||||
IOMAP_WRITE`` with any combination of the following enhancements:
|
||||
|
||||
* ``IOMAP_NOWAIT``, as defined previously.
|
||||
|
||||
* ``IOMAP_OVERWRITE_ONLY``: Allocating blocks and zeroing partial
|
||||
blocks is not allowed.
|
||||
The entire file range must map to a single written or unwritten
|
||||
extent.
|
||||
The file I/O range must be aligned to the filesystem block size
|
||||
if the mapping is unwritten and the filesystem cannot handle zeroing
|
||||
the unaligned regions without exposing stale contents.
|
||||
|
||||
Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
|
||||
calling this function.
|
||||
|
||||
``struct iomap_dio_ops``
|
||||
-------------------------
|
||||
.. code-block:: c
|
||||
|
||||
struct iomap_dio_ops {
|
||||
void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
|
||||
loff_t file_offset);
|
||||
int (*end_io)(struct kiocb *iocb, ssize_t size, int error,
|
||||
unsigned flags);
|
||||
struct bio_set *bio_set;
|
||||
};
|
||||
|
||||
The fields of this structure are as follows:
|
||||
|
||||
- ``submit_io``: iomap calls this function when it has constructed a
|
||||
``struct bio`` object for the I/O requested, and wishes to submit it
|
||||
to the block device.
|
||||
If no function is provided, ``submit_bio`` will be called directly.
|
||||
Filesystems that would like to perform additional work beforehand (e.g.
|
||||
data replication for btrfs) should implement this function.
|
||||
|
||||
- ``end_io``: This is called after the ``struct bio`` completes.
|
||||
This function should perform post-write conversions of unwritten
|
||||
extent mappings, handle write failures, etc.
|
||||
The ``flags`` argument may be set to a combination of the following:
|
||||
|
||||
* ``IOMAP_DIO_UNWRITTEN``: The mapping was unwritten, so the ioend
|
||||
should mark the extent as written.
|
||||
|
||||
* ``IOMAP_DIO_COW``: Writing to the space in the mapping required a
|
||||
copy on write operation, so the ioend should switch mappings.
|
||||
|
||||
- ``bio_set``: This allows the filesystem to provide a custom bio_set
|
||||
for allocating direct I/O bios.
|
||||
This enables filesystems to `stash additional per-bio information
|
||||
<https://lore.kernel.org/all/20220505201115.937837-3-hch@lst.de/>`_
|
||||
for private use.
|
||||
If this field is NULL, generic ``struct bio`` objects will be used.
|
||||
|
||||
Filesystems that want to perform extra work after an I/O completion
|
||||
should set a custom ``->bi_end_io`` function via ``->submit_io``.
|
||||
Afterwards, the custom endio function must call
|
||||
``iomap_dio_bio_end_io`` to finish the direct I/O.
|
||||
|
||||
DAX I/O
|
||||
=======
|
||||
|
||||
Some storage devices can be directly mapped as memory.
|
||||
These devices support a new access mode known as "fsdax" that allows
|
||||
loads and stores through the CPU and memory controller.
|
||||
|
||||
fsdax Reads
|
||||
-----------
|
||||
|
||||
A fsdax read performs a memcpy from storage device to the caller's
|
||||
buffer.
|
||||
The ``flags`` value for ``->iomap_begin`` will be ``IOMAP_DAX`` with any
|
||||
combination of the following enhancements:
|
||||
|
||||
* ``IOMAP_NOWAIT``, as defined previously.
|
||||
|
||||
Callers commonly hold ``i_rwsem`` in shared mode before calling this
|
||||
function.
|
||||
|
||||
fsdax Writes
|
||||
------------
|
||||
|
||||
A fsdax write initiates a memcpy to the storage device from the caller's
|
||||
buffer.
|
||||
The ``flags`` value for ``->iomap_begin`` will be ``IOMAP_DAX |
|
||||
IOMAP_WRITE`` with any combination of the following enhancements:
|
||||
|
||||
* ``IOMAP_NOWAIT``, as defined previously.
|
||||
|
||||
* ``IOMAP_OVERWRITE_ONLY``: The caller requires a pure overwrite to be
|
||||
performed from this mapping.
|
||||
This requires the filesystem extent mapping to already exist as an
|
||||
``IOMAP_MAPPED`` type and span the entire range of the write I/O
|
||||
request.
|
||||
If the filesystem cannot map this request in a way that allows the
|
||||
iomap infrastructure to perform a pure overwrite, it must fail the
|
||||
mapping operation with ``-EAGAIN``.
|
||||
|
||||
Callers commonly hold ``i_rwsem`` in exclusive mode before calling this
|
||||
function.
|
||||
|
||||
fsdax mmap Faults
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
The ``dax_iomap_fault`` function handles read and write faults to fsdax
|
||||
storage.
|
||||
For a read fault, ``IOMAP_DAX | IOMAP_FAULT`` will be passed as the
|
||||
``flags`` argument to ``->iomap_begin``.
|
||||
For a write fault, ``IOMAP_DAX | IOMAP_FAULT | IOMAP_WRITE`` will be
|
||||
passed as the ``flags`` argument to ``->iomap_begin``.
|
||||
|
||||
Callers commonly hold the same locks as they do to call their iomap
|
||||
pagecache counterparts.
|
||||
|
||||
fsdax Truncation, fallocate, and Unsharing
|
||||
------------------------------------------
|
||||
|
||||
For fsdax files, the following functions are provided to replace their
|
||||
iomap pagecache I/O counterparts.
|
||||
The ``flags`` argument to ``->iomap_begin`` is the same as for the
|
||||
pagecache counterparts, with ``IOMAP_DAX`` added.
|
||||
|
||||
* ``dax_file_unshare``
|
||||
* ``dax_zero_range``
|
||||
* ``dax_truncate_page``
|
||||
|
||||
Callers commonly hold the same locks as they do to call their iomap
|
||||
pagecache counterparts.
|
||||
|
||||
fsdax Deduplication
|
||||
-------------------
|
||||
|
||||
Filesystems implementing the ``FIDEDUPERANGE`` ioctl must call the
|
||||
``dax_remap_file_range_prep`` function with their own iomap read ops.
|
||||
|
||||
Seeking Files
|
||||
=============
|
||||
|
||||
iomap implements the two iterating whence modes of the ``llseek`` system
|
||||
call.
|
||||
|
||||
SEEK_DATA
|
||||
---------
|
||||
|
||||
The ``iomap_seek_data`` function implements the SEEK_DATA "whence" value
|
||||
for llseek.
|
||||
``IOMAP_REPORT`` will be passed as the ``flags`` argument to
|
||||
``->iomap_begin``.
|
||||
|
||||
For unwritten mappings, the pagecache will be searched.
|
||||
Regions of the pagecache with a folio mapped and uptodate fsblocks
|
||||
within those folios will be reported as data areas.
|
||||
|
||||
Callers commonly hold ``i_rwsem`` in shared mode before calling this
|
||||
function.
|
||||
|
||||
SEEK_HOLE
|
||||
---------
|
||||
|
||||
The ``iomap_seek_hole`` function implements the SEEK_HOLE "whence" value
|
||||
for llseek.
|
||||
``IOMAP_REPORT`` will be passed as the ``flags`` argument to
|
||||
``->iomap_begin``.
|
||||
|
||||
For unwritten mappings, the pagecache will be searched.
|
||||
Regions of the pagecache with no folio mapped, or a !uptodate fsblock
|
||||
within a folio will be reported as sparse hole areas.
|
||||
|
||||
Callers commonly hold ``i_rwsem`` in shared mode before calling this
|
||||
function.
|
||||
|
||||
Swap File Activation
|
||||
====================
|
||||
|
||||
The ``iomap_swapfile_activate`` function finds all the base-page aligned
|
||||
regions in a file and sets them up as swap space.
|
||||
The file will be ``fsync()``'d before activation.
|
||||
``IOMAP_REPORT`` will be passed as the ``flags`` argument to
|
||||
``->iomap_begin``.
|
||||
All mappings must be mapped or unwritten; they cannot be dirty or shared, and
|
||||
cannot span multiple block devices.
|
||||
Callers must hold ``i_rwsem`` in exclusive mode; this is already
|
||||
provided by ``swapon``.
|
||||
|
||||
File Space Mapping Reporting
|
||||
============================
|
||||
|
||||
iomap implements two of the file space mapping system calls.
|
||||
|
||||
FS_IOC_FIEMAP
|
||||
-------------
|
||||
|
||||
The ``iomap_fiemap`` function exports file extent mappings to userspace
|
||||
in the format specified by the ``FS_IOC_FIEMAP`` ioctl.
|
||||
``IOMAP_REPORT`` will be passed as the ``flags`` argument to
|
||||
``->iomap_begin``.
|
||||
Callers commonly hold ``i_rwsem`` in shared mode before calling this
|
||||
function.
|
||||
|
||||
FIBMAP (deprecated)
|
||||
-------------------
|
||||
|
||||
``iomap_bmap`` implements FIBMAP.
|
||||
The calling conventions are the same as for FIEMAP.
|
||||
This function is only provided to maintain compatibility for filesystems
|
||||
that implemented FIBMAP prior to conversion.
|
||||
This ioctl is deprecated; do **not** add a FIBMAP implementation to
|
||||
filesystems that do not have it.
|
||||
Callers should probably hold ``i_rwsem`` in shared mode before calling
|
||||
this function, but this is unclear.
|
120
Documentation/filesystems/iomap/porting.rst
Normal file
120
Documentation/filesystems/iomap/porting.rst
Normal file
|
@ -0,0 +1,120 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. _iomap_porting:
|
||||
|
||||
..
|
||||
Dumb style notes to maintain the author's sanity:
|
||||
Please try to start sentences on separate lines so that
|
||||
sentence changes don't bleed colors in diff.
|
||||
Heading decorations are documented in sphinx.rst.
|
||||
|
||||
=======================
|
||||
Porting Your Filesystem
|
||||
=======================
|
||||
|
||||
.. contents:: Table of Contents
|
||||
:local:
|
||||
|
||||
Why Convert?
|
||||
============
|
||||
|
||||
There are several reasons to convert a filesystem to iomap:
|
||||
|
||||
1. The classic Linux I/O path is not terribly efficient.
|
||||
Pagecache operations lock a single base page at a time and then call
|
||||
into the filesystem to return a mapping for only that page.
|
||||
Direct I/O operations build I/O requests a single file block at a
|
||||
time.
|
||||
This worked well enough for direct/indirect-mapped filesystems such
|
||||
as ext2, but is very inefficient for extent-based filesystems such
|
||||
as XFS.
|
||||
|
||||
2. Large folios are only supported via iomap; there are no plans to
|
||||
convert the old buffer_head path to use them.
|
||||
|
||||
3. Direct access to storage on memory-like devices (fsdax) is only
|
||||
supported via iomap.
|
||||
|
||||
4. Lower maintenance overhead for individual filesystem maintainers.
|
||||
iomap handles common pagecache related operations itself, such as
|
||||
allocating, instantiating, locking, and unlocking of folios.
|
||||
No ->write_begin(), ->write_end() or ->direct_IO
|
||||
address_space_operations are required to be implemented by
|
||||
filesystems using iomap.
|
||||
|
||||
How Do I Convert a Filesystem?
|
||||
==============================
|
||||
|
||||
First, add ``#include <linux/iomap.h>`` to your source code and add
|
||||
``select FS_IOMAP`` to your filesystem's Kconfig option.
|
||||
Build the kernel, run fstests with the ``-g all`` option across a wide
|
||||
variety of your filesystem's supported configurations to build a
|
||||
baseline of which tests pass and which ones fail.
|
||||
|
||||
The recommended approach is first to implement ``->iomap_begin`` (and
|
||||
``->iomap_end`` if necessary) to allow iomap to obtain a read-only
|
||||
mapping of a file range.
|
||||
In most cases, this is a relatively trivial conversion of the existing
|
||||
``get_block()`` function for read-only mappings.
|
||||
``FS_IOC_FIEMAP`` is a good first target because it is trivial to
|
||||
implement support for it and then to determine that the extent map
|
||||
iteration is correct from userspace.
|
||||
If FIEMAP is returning the correct information, it's a good sign that
|
||||
other read-only mapping operations will do the right thing.
|
||||
|
||||
Next, modify the filesystem's ``get_block(create = false)``
|
||||
implementation to use the new ``->iomap_begin`` implementation to map
|
||||
file space for selected read operations.
|
||||
Hide behind a debugging knob the ability to switch on the iomap mapping
|
||||
functions for selected call paths.
|
||||
It is necessary to write some code to fill out the bufferhead-based
|
||||
mapping information from the ``iomap`` structure, but the new functions
|
||||
can be tested without needing to implement any iomap APIs.
|
||||
|
||||
Once the read-only functions are working like this, convert each high
|
||||
level file operation one by one to use iomap native APIs instead of
|
||||
going through ``get_block()``.
|
||||
Done one at a time, regressions should be self-evident.
|
||||
You *do* have a regression test baseline for fstests, right?
|
||||
It is suggested to convert swap file activation, ``SEEK_DATA``, and
|
||||
``SEEK_HOLE`` before tackling the I/O paths.
|
||||
A likely complexity at this point will be converting the buffered read
|
||||
I/O path because of bufferheads.
|
||||
The buffered read I/O path doesn't need to be converted yet, though the
|
||||
direct I/O read path should be converted in this phase.
|
||||
|
||||
At this point, you should look over your ``->iomap_begin`` function.
|
||||
If it switches between large blocks of code based on dispatching of the
|
||||
``flags`` argument, you should consider breaking it up into
|
||||
per-operation iomap ops with smaller, more cohesive functions.
|
||||
XFS is a good example of this.
|
||||
|
||||
The next thing to do is implement ``get_block(create = true)``
|
||||
functionality in the ``->iomap_begin``/``->iomap_end`` methods.
|
||||
It is strongly recommended to create separate mapping functions and
|
||||
iomap ops for write operations.
|
||||
Then convert the direct I/O write path to iomap, and start running fsx
|
||||
w/ DIO enabled in earnest on the filesystem.
|
||||
This will flush out lots of data integrity corner case bugs that the new
|
||||
write mapping implementation introduces.
|
||||
|
||||
Now, convert any remaining file operations to call the iomap functions.
|
||||
This will get the entire filesystem using the new mapping functions, and
|
||||
they should largely be debugged and working correctly after this step.
|
||||
|
||||
Most likely at this point, the buffered read and write paths will still
|
||||
need to be converted.
|
||||
The mapping functions should all work correctly, so all that needs to be
|
||||
done is rewriting all the code that interfaces with bufferheads to
|
||||
interface with iomap and folios.
|
||||
It is much easier first to get regular file I/O (without any fancy
|
||||
features like fscrypt, fsverity, compression, or data=journaling)
|
||||
converted to use iomap.
|
||||
Some of those fancy features (fscrypt and compression) aren't
|
||||
implemented yet in iomap.
|
||||
For unjournalled filesystems that use the pagecache for symbolic links
|
||||
and directories, you might also try converting their handling to iomap.
|
||||
|
||||
The rest is left as an exercise for the reader, as it will be different
|
||||
for every filesystem.
|
||||
If you encounter problems, email the people and lists in
|
||||
``get_maintainers.pl`` for help.
|
|
@ -8460,6 +8460,7 @@ R: Darrick J. Wong <djwong@kernel.org>
|
|||
L: linux-xfs@vger.kernel.org
|
||||
L: linux-fsdevel@vger.kernel.org
|
||||
S: Supported
|
||||
F: Documentation/filesystems/iomap/*
|
||||
F: fs/iomap/
|
||||
F: include/linux/iomap.h
|
||||
|
||||
|
|
|
@ -442,6 +442,24 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
|
|||
return pos - orig_pos + plen;
|
||||
}
|
||||
|
||||
/*
 * Read the part of ctx->cur_folio that is covered by the current mapping.
 *
 * The range starts at iter->pos within the folio and is limited to the
 * smaller of the bytes remaining in the folio and iomap_length(iter).
 * iomap_readpage_iter() is called repeatedly, with the running byte count
 * passed as its third argument, until the whole range has been handled.
 *
 * Returns the number of bytes processed, or the value returned by
 * iomap_readpage_iter() as soon as that value is zero or negative.
 */
static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
|
||||
struct iomap_readpage_ctx *ctx)
|
||||
{
|
||||
struct folio *folio = ctx->cur_folio;
|
||||
size_t offset = offset_in_folio(folio, iter->pos);
|
||||
loff_t length = min_t(loff_t, folio_size(folio) - offset,
|
||||
iomap_length(iter));
|
||||
loff_t done, ret;
|
||||
|
||||
for (done = 0; done < length; done += ret) {
|
||||
ret = iomap_readpage_iter(iter, ctx, done);
|
||||
if (ret <= 0)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return done;
|
||||
}
|
||||
|
||||
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
|
||||
{
|
||||
struct iomap_iter iter = {
|
||||
|
@ -457,7 +475,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
|
|||
trace_iomap_readpage(iter.inode, 1);
|
||||
|
||||
while ((ret = iomap_iter(&iter, ops)) > 0)
|
||||
iter.processed = iomap_readpage_iter(&iter, &ctx, 0);
|
||||
iter.processed = iomap_read_folio_iter(&iter, &ctx);
|
||||
|
||||
if (ctx.bio) {
|
||||
submit_bio(ctx.bio);
|
||||
|
@ -872,37 +890,22 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
|
|||
size_t copied, struct folio *folio)
|
||||
{
|
||||
const struct iomap *srcmap = iomap_iter_srcmap(iter);
|
||||
loff_t old_size = iter->inode->i_size;
|
||||
size_t written;
|
||||
|
||||
if (srcmap->type == IOMAP_INLINE) {
|
||||
iomap_write_end_inline(iter, folio, pos, copied);
|
||||
written = copied;
|
||||
} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
|
||||
written = block_write_end(NULL, iter->inode->i_mapping, pos,
|
||||
return true;
|
||||
}
|
||||
|
||||
if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
|
||||
size_t bh_written;
|
||||
|
||||
bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
|
||||
len, copied, &folio->page, NULL);
|
||||
WARN_ON_ONCE(written != copied && written != 0);
|
||||
} else {
|
||||
written = __iomap_write_end(iter->inode, pos, len, copied,
|
||||
folio) ? copied : 0;
|
||||
WARN_ON_ONCE(bh_written != copied && bh_written != 0);
|
||||
return bh_written == copied;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update the in-memory inode size after copying the data into the page
|
||||
* cache. It's up to the file system to write the updated size to disk,
|
||||
* preferably after I/O completion so that no stale data is exposed.
|
||||
* Only once that's done can we unlock and release the folio.
|
||||
*/
|
||||
if (pos + written > old_size) {
|
||||
i_size_write(iter->inode, pos + written);
|
||||
iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
|
||||
}
|
||||
__iomap_put_folio(iter, pos, written, folio);
|
||||
|
||||
if (old_size < pos)
|
||||
pagecache_isize_extended(iter->inode, old_size, pos);
|
||||
|
||||
return written == copied;
|
||||
return __iomap_write_end(iter->inode, pos, len, copied, folio);
|
||||
}
|
||||
|
||||
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
|
||||
|
@ -917,6 +920,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
|
|||
|
||||
do {
|
||||
struct folio *folio;
|
||||
loff_t old_size;
|
||||
size_t offset; /* Offset into folio */
|
||||
size_t bytes; /* Bytes to write to folio */
|
||||
size_t copied; /* Bytes copied from user */
|
||||
|
@ -968,6 +972,23 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
|
|||
written = iomap_write_end(iter, pos, bytes, copied, folio) ?
|
||||
copied : 0;
|
||||
|
||||
/*
|
||||
* Update the in-memory inode size after copying the data into
|
||||
* the page cache. It's up to the file system to write the
|
||||
* updated size to disk, preferably after I/O completion so that
|
||||
* no stale data is exposed. Only once that's done can we
|
||||
* unlock and release the folio.
|
||||
*/
|
||||
old_size = iter->inode->i_size;
|
||||
if (pos + written > old_size) {
|
||||
i_size_write(iter->inode, pos + written);
|
||||
iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
|
||||
}
|
||||
__iomap_put_folio(iter, pos, written, folio);
|
||||
|
||||
if (old_size < pos)
|
||||
pagecache_isize_extended(iter->inode, old_size, pos);
|
||||
|
||||
cond_resched();
|
||||
if (unlikely(written == 0)) {
|
||||
/*
|
||||
|
@ -1338,6 +1359,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
|
|||
bytes = folio_size(folio) - offset;
|
||||
|
||||
ret = iomap_write_end(iter, pos, bytes, bytes, folio);
|
||||
__iomap_put_folio(iter, pos, bytes, folio);
|
||||
if (WARN_ON_ONCE(!ret))
|
||||
return -EIO;
|
||||
|
||||
|
@ -1403,6 +1425,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
|
|||
folio_mark_accessed(folio);
|
||||
|
||||
ret = iomap_write_end(iter, pos, bytes, bytes, folio);
|
||||
__iomap_put_folio(iter, pos, bytes, folio);
|
||||
if (WARN_ON_ONCE(!ret))
|
||||
return -EIO;
|
||||
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
#include "xfs_da_btree.h"
|
||||
#include "xfs_attr.h"
|
||||
#include "xfs_trans.h"
|
||||
#include "xfs_trans_space.h"
|
||||
#include "xfs_bmap_btree.h"
|
||||
#include "xfs_trace.h"
|
||||
#include "xfs_icache.h"
|
||||
#include "xfs_symlink.h"
|
||||
|
@ -811,6 +813,7 @@ xfs_setattr_size(
|
|||
struct xfs_trans *tp;
|
||||
int error;
|
||||
uint lock_flags = 0;
|
||||
uint resblks = 0;
|
||||
bool did_zeroing = false;
|
||||
|
||||
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
|
||||
|
@ -917,7 +920,17 @@ xfs_setattr_size(
|
|||
return error;
|
||||
}
|
||||
|
||||
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
|
||||
/*
|
||||
* For realtime inode with more than one block rtextsize, we need the
|
||||
* block reservation for bmap btree block allocations/splits that can
|
||||
* happen since it could split the tail written extent and convert the
|
||||
* right beyond EOF one to unwritten.
|
||||
*/
|
||||
if (xfs_inode_has_bigrtalloc(ip))
|
||||
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
|
||||
|
||||
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
|
||||
0, 0, &tp);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
|
|
Loading…
Reference in a new issue