mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-24 01:09:38 -05:00
libnvdimm for 4.18
* DAX broke a fundamental assumption of truncate of file mapped pages. The truncate path assumed that it is safe to disconnect a pinned page from a file and let the filesystem reclaim the physical block. With DAX the page is equivalent to the filesystem block. Introduce dax_layout_busy_page() to enable filesystems to wait for pinned DAX pages to be released. Without this wait a filesystem could allocate blocks under active device-DMA to a new file. * DAX arranges for the block layer to be bypassed and uses dax_direct_access() + copy_to_iter() to satisfy read(2) calls. However, the memcpy_mcsafe() facility is available through the pmem block driver. In order to safely handle media errors, via the DAX block-layer bypass, introduce copy_to_iter_mcsafe(). * Fix cache management policy relative to the ACPI NFIT Platform Capabilities Structure to properly elide cache flushes when they are not necessary. The table indicates whether CPU caches are power-fail protected. Clarify that a deep flush is always performed on REQ_{FUA,PREFLUSH} requests. 
-----BEGIN PGP SIGNATURE----- iQIcBAABAgAGBQJbGxI7AAoJEB7SkWpmfYgCDjsP/2Lcibu9Kf4tKIzuInsle6iE 6qP29qlkpHVTpDKbhvIxTYTYL9sMU0DNUrpPCJR/EYdeyztLWDFC5EAT1wF240vf maV37s/uP331jSC/2VJnKWzBs2ztQxmKLEIQCxh6aT0qs9cbaOvJgB/WlVu+qtsl aGJFLmb6vdQacp31noU5plKrMgMA1pADyF5qx9I9K2HwowHE7T368ZEFS/3S//c3 LXmpx/Nfq52sGu/qbRbu6B1CTJhIGhmarObyQnvBYoKntK1Ov4e8DS95wD3EhNDe FuRkOCUKhjl6cFy7QVWh1ct1bFm84ny+b4/AtbpOmv9l/+0mveJ7e+5mu8HQTifT wYiEe2xzXJ+OG/xntv8SvlZKMpjP3BqI0jYsTutsjT4oHrciiXdXM186cyS+BiGp KtFmWyncQJgfiTq6+Hj5XpP9BapNS+OYdYgUagw9ZwzdzptuGFYUMSVOBrYrn6c/ fwqtxjubykJoW0P3pkIoT91arFSea7nxOKnGwft06imQ7TwR4ARsI308feQ9itJq 2P2e7/20nYMsw2aRaUDDA70Yu+Lagn1m8WL87IybUGeUDLb1BAkjphAlWa6COJ+u PhvAD2tvyM9m0c7O5Mytvz7iWKG6SVgatoAyOPkaeplQK8khZ+wEpuK58sO6C1w8 4GBvt9ri9i/Ww/A+ppWs =4bfw -----END PGP SIGNATURE----- Merge tag 'libnvdimm-for-4.18' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm Pull libnvdimm updates from Dan Williams: "This adds a user for the new 'bytes-remaining' updates to memcpy_mcsafe() that you already received through Ingo via the x86-dax- for-linus pull. Not included here, but still targeting this cycle, is support for handling memory media errors (poison) consumed via userspace dax mappings. Summary: - DAX broke a fundamental assumption of truncate of file mapped pages. The truncate path assumed that it is safe to disconnect a pinned page from a file and let the filesystem reclaim the physical block. With DAX the page is equivalent to the filesystem block. Introduce dax_layout_busy_page() to enable filesystems to wait for pinned DAX pages to be released. Without this wait a filesystem could allocate blocks under active device-DMA to a new file. - DAX arranges for the block layer to be bypassed and uses dax_direct_access() + copy_to_iter() to satisfy read(2) calls. However, the memcpy_mcsafe() facility is available through the pmem block driver. In order to safely handle media errors, via the DAX block-layer bypass, introduce copy_to_iter_mcsafe(). 
- Fix cache management policy relative to the ACPI NFIT Platform Capabilities Structure to properly elide cache flushes when they are not necessary. The table indicates whether CPU caches are power-fail protected. Clarify that a deep flush is always performed on REQ_{FUA,PREFLUSH} requests" * tag 'libnvdimm-for-4.18' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (21 commits) dax: Use dax_write_cache* helpers libnvdimm, pmem: Do not flush power-fail protected CPU caches libnvdimm, pmem: Unconditionally deep flush on *sync libnvdimm, pmem: Complete REQ_FLUSH => REQ_PREFLUSH acpi, nfit: Remove ecc_unit_size dax: dax_insert_mapping_entry always succeeds libnvdimm, e820: Register all pmem resources libnvdimm: Debug probe times linvdimm, pmem: Preserve read-only setting for pmem devices x86, nfit_test: Add unit test for memcpy_mcsafe() pmem: Switch to copy_to_iter_mcsafe() dax: Report bytes remaining in dax_iomap_actor() dax: Introduce a ->copy_to_iter dax operation uio, lib: Fix CONFIG_ARCH_HAS_UACCESS_MCSAFE compilation xfs, dax: introduce xfs_break_dax_layouts() xfs: prepare xfs_break_layouts() for another layout type xfs: prepare xfs_break_layouts() to be called with XFS_MMAPLOCK_EXCL mm, fs, dax: handle layout changes to pinned dax mappings mm: fix __gup_device_huge vs unmap mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS ...
This commit is contained in:
commit
7d3bf613e9
40 changed files with 925 additions and 380 deletions
17
Documentation/ABI/removed/sysfs-bus-nfit
Normal file
17
Documentation/ABI/removed/sysfs-bus-nfit
Normal file
|
@ -0,0 +1,17 @@
|
|||
What: /sys/bus/nd/devices/regionX/nfit/ecc_unit_size
|
||||
Date: Aug, 2017
|
||||
KernelVersion: v4.14 (Removed v4.18)
|
||||
Contact: linux-nvdimm@lists.01.org
|
||||
Description:
|
||||
(RO) Size of a write request to a DIMM that will not incur a
|
||||
read-modify-write cycle at the memory controller.
|
||||
|
||||
When the nfit driver initializes it runs an ARS (Address Range
|
||||
Scrub) operation across every pmem range. Part of that process
|
||||
involves determining the ARS capabilities of a given address
|
||||
range. One of the capabilities that is reported is the 'Clear
|
||||
Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2
|
||||
section 9.20.7.4 Function Index 1 - Query ARS Capabilities).
|
||||
This property indicates the boundary at which the NVDIMM may
|
||||
need to perform read-modify-write cycles to maintain ECC (Error
|
||||
Correcting Code) blocks.
|
|
@ -212,22 +212,3 @@ Description:
|
|||
range. Used by NVDIMM Region Mapping Structure to uniquely refer
|
||||
to this structure. Value of 0 is reserved and not used as an
|
||||
index.
|
||||
|
||||
|
||||
What: /sys/bus/nd/devices/regionX/nfit/ecc_unit_size
|
||||
Date: Aug, 2017
|
||||
KernelVersion: v4.14
|
||||
Contact: linux-nvdimm@lists.01.org
|
||||
Description:
|
||||
(RO) Size of a write request to a DIMM that will not incur a
|
||||
read-modify-write cycle at the memory controller.
|
||||
|
||||
When the nfit driver initializes it runs an ARS (Address Range
|
||||
Scrub) operation across every pmem range. Part of that process
|
||||
involves determining the ARS capabilities of a given address
|
||||
range. One of the capabilities that is reported is the 'Clear
|
||||
Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2
|
||||
section 9.20.7.4 Function Index 1 - Query ARS Capabilities).
|
||||
This property indicates the boundary at which the NVDIMM may
|
||||
need to perform read-modify-write cycles to maintain ECC (Error
|
||||
Correcting Code) blocks.
|
||||
|
|
|
@ -72,6 +72,9 @@ config EARLY_PRINTK_USB_XDBC
|
|||
You should normally say N here, unless you want to debug early
|
||||
crashes or need a very simple printk logging facility.
|
||||
|
||||
config MCSAFE_TEST
|
||||
def_bool n
|
||||
|
||||
config X86_PTDUMP_CORE
|
||||
def_bool n
|
||||
|
||||
|
|
75
arch/x86/include/asm/mcsafe_test.h
Normal file
75
arch/x86/include/asm/mcsafe_test.h
Normal file
|
@ -0,0 +1,75 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _MCSAFE_TEST_H_
|
||||
#define _MCSAFE_TEST_H_
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
#ifdef CONFIG_MCSAFE_TEST
|
||||
extern unsigned long mcsafe_test_src;
|
||||
extern unsigned long mcsafe_test_dst;
|
||||
|
||||
/*
 * Record @addr in mcsafe_test_src, the control word that the
 * MCSAFE_TEST_SRC assembly macro compares source addresses against.
 */
static inline void mcsafe_inject_src(void *addr)
|
||||
{
|
||||
if (addr)
|
||||
mcsafe_test_src = (unsigned long) addr;
|
||||
else
|
||||
/* NULL stores the all-ones value; presumably this disarms injection, since no address can be above it */
mcsafe_test_src = ~0UL;
|
||||
}
|
||||
|
||||
/*
 * Record @addr in mcsafe_test_dst, the control word that the
 * MCSAFE_TEST_DST assembly macro compares destination addresses against.
 */
static inline void mcsafe_inject_dst(void *addr)
|
||||
{
|
||||
if (addr)
|
||||
mcsafe_test_dst = (unsigned long) addr;
|
||||
else
|
||||
/* NULL stores the all-ones value; presumably this disarms injection, since no address can be above it */
mcsafe_test_dst = ~0UL;
|
||||
}
|
||||
#else /* CONFIG_MCSAFE_TEST */
|
||||
/* No-op stand-in for builds where CONFIG_MCSAFE_TEST is not defined. */
static inline void mcsafe_inject_src(void *addr)
|
||||
{
|
||||
}
|
||||
|
||||
/* No-op stand-in for builds where CONFIG_MCSAFE_TEST is not defined. */
static inline void mcsafe_inject_dst(void *addr)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_MCSAFE_TEST */
|
||||
|
||||
#else /* __ASSEMBLY__ */
|
||||
#include <asm/export.h>
|
||||
|
||||
#ifdef CONFIG_MCSAFE_TEST
|
||||
/*
 * Emit the two injection control words, mcsafe_test_src and
 * mcsafe_test_dst, into .data as zero-initialized quads after an
 * ".align 8", make both global, and export them GPL-only.
 */
.macro MCSAFE_TEST_CTL
|
||||
.pushsection .data
|
||||
.align 8
|
||||
.globl mcsafe_test_src
|
||||
mcsafe_test_src:
|
||||
.quad 0
|
||||
EXPORT_SYMBOL_GPL(mcsafe_test_src)
|
||||
.globl mcsafe_test_dst
|
||||
mcsafe_test_dst:
|
||||
.quad 0
|
||||
EXPORT_SYMBOL_GPL(mcsafe_test_dst)
|
||||
.popsection
|
||||
.endm
|
||||
|
||||
/*
 * Branch to \target when \reg + \count is above (unsigned compare) the
 * value stored in mcsafe_test_src.  Overwrites %r9 and the flags.
 */
.macro MCSAFE_TEST_SRC reg count target
|
||||
leaq \count(\reg), %r9
|
||||
cmp mcsafe_test_src, %r9
|
||||
ja \target
|
||||
.endm
|
||||
|
||||
/*
 * Branch to \target when \reg + \count is above (unsigned compare) the
 * value stored in mcsafe_test_dst.  Overwrites %r9 and the flags.
 */
.macro MCSAFE_TEST_DST reg count target
|
||||
leaq \count(\reg), %r9
|
||||
cmp mcsafe_test_dst, %r9
|
||||
ja \target
|
||||
.endm
|
||||
#else
|
||||
/*
 * CONFIG_MCSAFE_TEST is not defined: all three macros are empty, so
 * their uses expand to nothing.
 */
.macro MCSAFE_TEST_CTL
|
||||
.endm
|
||||
|
||||
.macro MCSAFE_TEST_SRC reg count target
|
||||
.endm
|
||||
|
||||
.macro MCSAFE_TEST_DST reg count target
|
||||
.endm
|
||||
#endif /* CONFIG_MCSAFE_TEST */
|
||||
#endif /* __ASSEMBLY__ */
|
||||
#endif /* _MCSAFE_TEST_H_ */
|
|
@ -3,6 +3,7 @@
|
|||
#include <linux/linkage.h>
|
||||
#include <asm/errno.h>
|
||||
#include <asm/cpufeatures.h>
|
||||
#include <asm/mcsafe_test.h>
|
||||
#include <asm/alternative-asm.h>
|
||||
#include <asm/export.h>
|
||||
|
||||
|
@ -183,6 +184,9 @@ ENTRY(memcpy_orig)
|
|||
ENDPROC(memcpy_orig)
|
||||
|
||||
#ifndef CONFIG_UML
|
||||
|
||||
MCSAFE_TEST_CTL
|
||||
|
||||
/*
|
||||
* __memcpy_mcsafe - memory copy with machine check exception handling
|
||||
* Note that we only catch machine checks when reading the source addresses.
|
||||
|
@ -206,6 +210,8 @@ ENTRY(__memcpy_mcsafe)
|
|||
subl %ecx, %edx
|
||||
.L_read_leading_bytes:
|
||||
movb (%rsi), %al
|
||||
MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
|
||||
MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
|
||||
.L_write_leading_bytes:
|
||||
movb %al, (%rdi)
|
||||
incq %rsi
|
||||
|
@ -221,6 +227,8 @@ ENTRY(__memcpy_mcsafe)
|
|||
|
||||
.L_read_words:
|
||||
movq (%rsi), %r8
|
||||
MCSAFE_TEST_SRC %rsi 8 .E_read_words
|
||||
MCSAFE_TEST_DST %rdi 8 .E_write_words
|
||||
.L_write_words:
|
||||
movq %r8, (%rdi)
|
||||
addq $8, %rsi
|
||||
|
@ -237,6 +245,8 @@ ENTRY(__memcpy_mcsafe)
|
|||
movl %edx, %ecx
|
||||
.L_read_trailing_bytes:
|
||||
movb (%rsi), %al
|
||||
MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
|
||||
MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
|
||||
.L_write_trailing_bytes:
|
||||
movb %al, (%rdi)
|
||||
incq %rsi
|
||||
|
|
|
@ -1978,19 +1978,8 @@ static ssize_t range_index_show(struct device *dev,
|
|||
}
|
||||
static DEVICE_ATTR_RO(range_index);
|
||||
|
||||
static ssize_t ecc_unit_size_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct nd_region *nd_region = to_nd_region(dev);
|
||||
struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region);
|
||||
|
||||
return sprintf(buf, "%d\n", nfit_spa->clear_err_unit);
|
||||
}
|
||||
static DEVICE_ATTR_RO(ecc_unit_size);
|
||||
|
||||
static struct attribute *acpi_nfit_region_attributes[] = {
|
||||
&dev_attr_range_index.attr,
|
||||
&dev_attr_ecc_unit_size.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
|
|
|
@ -85,6 +85,7 @@ EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
|
|||
bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
|
||||
{
|
||||
struct dax_device *dax_dev;
|
||||
bool dax_enabled = false;
|
||||
pgoff_t pgoff;
|
||||
int err, id;
|
||||
void *kaddr;
|
||||
|
@ -134,14 +135,21 @@ bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
|
|||
* on being able to do (page_address(pfn_to_page())).
|
||||
*/
|
||||
WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
|
||||
dax_enabled = true;
|
||||
} else if (pfn_t_devmap(pfn)) {
|
||||
/* pass */;
|
||||
} else {
|
||||
struct dev_pagemap *pgmap;
|
||||
|
||||
pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
|
||||
if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
|
||||
dax_enabled = true;
|
||||
put_dev_pagemap(pgmap);
|
||||
}
|
||||
|
||||
if (!dax_enabled) {
|
||||
pr_debug("%s: error: dax support not enabled\n",
|
||||
bdevname(bdev, buf));
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__bdev_dax_supported);
|
||||
|
@ -182,8 +190,7 @@ static ssize_t write_cache_show(struct device *dev,
|
|||
if (!dax_dev)
|
||||
return -ENXIO;
|
||||
|
||||
rc = sprintf(buf, "%d\n", !!test_bit(DAXDEV_WRITE_CACHE,
|
||||
&dax_dev->flags));
|
||||
rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev));
|
||||
put_dax(dax_dev);
|
||||
return rc;
|
||||
}
|
||||
|
@ -201,10 +208,8 @@ static ssize_t write_cache_store(struct device *dev,
|
|||
|
||||
if (rc)
|
||||
len = rc;
|
||||
else if (write_cache)
|
||||
set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
|
||||
else
|
||||
clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
|
||||
dax_write_cache(dax_dev, write_cache);
|
||||
|
||||
put_dax(dax_dev);
|
||||
return len;
|
||||
|
@ -282,11 +287,21 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(dax_copy_from_iter);
|
||||
|
||||
/*
 * dax_copy_to_iter - forward a copy into the iterator @i to the driver
 *
 * Returns 0 without calling the driver when dax_alive() is false for
 * @dax_dev; otherwise returns whatever the device's ->copy_to_iter
 * operation returns.
 */
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
|
||||
size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
if (!dax_alive(dax_dev))
|
||||
return 0;
|
||||
|
||||
return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dax_copy_to_iter);
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_PMEM_API
|
||||
void arch_wb_cache_pmem(void *addr, size_t size);
|
||||
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
|
||||
{
|
||||
if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
|
||||
if (unlikely(!dax_write_cache_enabled(dax_dev)))
|
||||
return;
|
||||
|
||||
arch_wb_cache_pmem(addr, size);
|
||||
|
|
|
@ -185,9 +185,24 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
|
|||
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
|
||||
}
|
||||
|
||||
/*
 * Linear-target copy-to-iter hook: convert @pgoff to a sector, map it
 * with linear_map_sector(), turn the result back into a page offset on
 * the underlying block device, then delegate to dax_copy_to_iter() on
 * that device's dax_dev.  Returns 0 if bdev_dax_pgoff() returns non-zero.
 */
static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
|
||||
void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
struct linear_c *lc = ti->private;
|
||||
struct block_device *bdev = lc->dev->bdev;
|
||||
struct dax_device *dax_dev = lc->dev->dax_dev;
|
||||
sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
|
||||
|
||||
dev_sector = linear_map_sector(ti, sector);
|
||||
/* on success @pgoff is overwritten with the offset computed by bdev_dax_pgoff() */
if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
|
||||
return 0;
|
||||
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
|
||||
}
|
||||
|
||||
#else
|
||||
#define linear_dax_direct_access NULL
|
||||
#define linear_dax_copy_from_iter NULL
|
||||
#define linear_dax_copy_to_iter NULL
|
||||
#endif
|
||||
|
||||
static struct target_type linear_target = {
|
||||
|
@ -204,6 +219,7 @@ static struct target_type linear_target = {
|
|||
.iterate_devices = linear_iterate_devices,
|
||||
.direct_access = linear_dax_direct_access,
|
||||
.dax_copy_from_iter = linear_dax_copy_from_iter,
|
||||
.dax_copy_to_iter = linear_dax_copy_to_iter,
|
||||
};
|
||||
|
||||
int __init dm_linear_init(void)
|
||||
|
|
|
@ -962,9 +962,23 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
|
|||
dax_copy:
|
||||
return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
|
||||
}
|
||||
|
||||
/*
 * log-writes copy-to-iter hook: convert @pgoff to a sector, translate it
 * to a page offset on the underlying block device and delegate to
 * dax_copy_to_iter() on that device's dax_dev.  Nothing else is done on
 * this path.  Returns 0 if bdev_dax_pgoff() returns non-zero.
 */
static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
|
||||
pgoff_t pgoff, void *addr, size_t bytes,
|
||||
struct iov_iter *i)
|
||||
{
|
||||
struct log_writes_c *lc = ti->private;
|
||||
sector_t sector = pgoff * PAGE_SECTORS;
|
||||
|
||||
/* on success @pgoff is overwritten with the offset computed by bdev_dax_pgoff() */
if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
|
||||
return 0;
|
||||
return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
|
||||
}
|
||||
|
||||
#else
|
||||
#define log_writes_dax_direct_access NULL
|
||||
#define log_writes_dax_copy_from_iter NULL
|
||||
#define log_writes_dax_copy_to_iter NULL
|
||||
#endif
|
||||
|
||||
static struct target_type log_writes_target = {
|
||||
|
@ -982,6 +996,7 @@ static struct target_type log_writes_target = {
|
|||
.io_hints = log_writes_io_hints,
|
||||
.direct_access = log_writes_dax_direct_access,
|
||||
.dax_copy_from_iter = log_writes_dax_copy_from_iter,
|
||||
.dax_copy_to_iter = log_writes_dax_copy_to_iter,
|
||||
};
|
||||
|
||||
static int __init dm_log_writes_init(void)
|
||||
|
|
|
@ -354,9 +354,29 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
|
|||
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
|
||||
}
|
||||
|
||||
/*
 * Stripe-target copy-to-iter hook: use stripe_map_sector() to pick the
 * stripe and the sector within it for @pgoff, add that stripe's
 * physical_start, and delegate to dax_copy_to_iter() on the selected
 * stripe's dax_dev.  Returns 0 if bdev_dax_pgoff() returns non-zero.
 */
static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
|
||||
void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
|
||||
struct stripe_c *sc = ti->private;
|
||||
struct dax_device *dax_dev;
|
||||
struct block_device *bdev;
|
||||
uint32_t stripe;
|
||||
|
||||
stripe_map_sector(sc, sector, &stripe, &dev_sector);
|
||||
dev_sector += sc->stripe[stripe].physical_start;
|
||||
dax_dev = sc->stripe[stripe].dev->dax_dev;
|
||||
bdev = sc->stripe[stripe].dev->bdev;
|
||||
|
||||
/* on success @pgoff is overwritten with the offset computed by bdev_dax_pgoff() */
if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
|
||||
return 0;
|
||||
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
|
||||
}
|
||||
|
||||
#else
|
||||
#define stripe_dax_direct_access NULL
|
||||
#define stripe_dax_copy_from_iter NULL
|
||||
#define stripe_dax_copy_to_iter NULL
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
@ -478,6 +498,7 @@ static struct target_type stripe_target = {
|
|||
.io_hints = stripe_io_hints,
|
||||
.direct_access = stripe_dax_direct_access,
|
||||
.dax_copy_from_iter = stripe_dax_copy_from_iter,
|
||||
.dax_copy_to_iter = stripe_dax_copy_to_iter,
|
||||
};
|
||||
|
||||
int __init dm_stripe_init(void)
|
||||
|
|
|
@ -1089,6 +1089,30 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
 * dax_operations ->copy_to_iter for device-mapper: look up the live
 * target covering @pgoff and call its dax_copy_to_iter hook.
 *
 * Returns 0 when no live target is found.  A target type without a
 * dax_copy_to_iter hook falls back to plain copy_to_iter().
 */
static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
|
||||
void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
struct mapped_device *md = dax_get_private(dax_dev);
|
||||
sector_t sector = pgoff * PAGE_SECTORS;
|
||||
struct dm_target *ti;
|
||||
long ret = 0;
|
||||
int srcu_idx;
|
||||
|
||||
ti = dm_dax_get_live_target(md, sector, &srcu_idx);
|
||||
|
||||
if (!ti)
|
||||
goto out;
|
||||
if (!ti->type->dax_copy_to_iter) {
|
||||
ret = copy_to_iter(addr, bytes, i);
|
||||
goto out;
|
||||
}
|
||||
ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
|
||||
out:
|
||||
/* reached on every path, including the !ti case */
dm_put_live_table(md, srcu_idx);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* A target may call dm_accept_partial_bio only from the map routine. It is
|
||||
* allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
|
||||
|
@ -3137,6 +3161,7 @@ static const struct block_device_operations dm_blk_dops = {
|
|||
static const struct dax_operations dm_dax_ops = {
|
||||
.direct_access = dm_dax_direct_access,
|
||||
.copy_from_iter = dm_dax_copy_from_iter,
|
||||
.copy_to_iter = dm_dax_copy_to_iter,
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
|
@ -100,6 +100,9 @@ static int nvdimm_bus_probe(struct device *dev)
|
|||
if (!try_module_get(provider))
|
||||
return -ENXIO;
|
||||
|
||||
dev_dbg(&nvdimm_bus->dev, "START: %s.probe(%s)\n",
|
||||
dev->driver->name, dev_name(dev));
|
||||
|
||||
nvdimm_bus_probe_start(nvdimm_bus);
|
||||
rc = nd_drv->probe(dev);
|
||||
if (rc == 0)
|
||||
|
@ -108,7 +111,7 @@ static int nvdimm_bus_probe(struct device *dev)
|
|||
nd_region_disable(nvdimm_bus, dev);
|
||||
nvdimm_bus_probe_end(nvdimm_bus);
|
||||
|
||||
dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name,
|
||||
dev_dbg(&nvdimm_bus->dev, "END: %s.probe(%s) = %d\n", dev->driver->name,
|
||||
dev_name(dev), rc);
|
||||
|
||||
if (rc != 0)
|
||||
|
@ -566,14 +569,18 @@ int nvdimm_revalidate_disk(struct gendisk *disk)
|
|||
{
|
||||
struct device *dev = disk_to_dev(disk)->parent;
|
||||
struct nd_region *nd_region = to_nd_region(dev->parent);
|
||||
const char *pol = nd_region->ro ? "only" : "write";
|
||||
int disk_ro = get_disk_ro(disk);
|
||||
|
||||
if (nd_region->ro == get_disk_ro(disk))
|
||||
/*
|
||||
* Upgrade to read-only if the region is read-only; preserve as
|
||||
* read-only if the disk is already read-only.
|
||||
*/
|
||||
if (disk_ro || nd_region->ro == disk_ro)
|
||||
return 0;
|
||||
|
||||
dev_info(dev, "%s read-%s, marking %s read-%s\n",
|
||||
dev_name(&nd_region->dev), pol, disk->disk_name, pol);
|
||||
set_disk_ro(disk, nd_region->ro);
|
||||
dev_info(dev, "%s read-only, marking %s read-only\n",
|
||||
dev_name(&nd_region->dev), disk->disk_name);
|
||||
set_disk_ro(disk, 1);
|
||||
|
||||
return 0;
|
||||
|
||||
|
|
|
@ -38,12 +38,27 @@ static int e820_range_to_nid(resource_size_t addr)
|
|||
}
|
||||
#endif
|
||||
|
||||
/*
 * Resource-walk callback: register @res as a pmem region on the
 * nvdimm bus passed in @data.
 *
 * Returns 0 on success, or -ENXIO when nvdimm_pmem_region_create()
 * returns NULL.
 */
static int e820_register_one(struct resource *res, void *data)
|
||||
{
|
||||
struct nd_region_desc ndr_desc;
|
||||
struct nvdimm_bus *nvdimm_bus = data;
|
||||
|
||||
/* start from an all-zero descriptor; only the fields below are set */
memset(&ndr_desc, 0, sizeof(ndr_desc));
|
||||
ndr_desc.res = res;
|
||||
ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
|
||||
ndr_desc.numa_node = e820_range_to_nid(res->start);
|
||||
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
|
||||
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
|
||||
return -ENXIO;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int e820_pmem_probe(struct platform_device *pdev)
|
||||
{
|
||||
static struct nvdimm_bus_descriptor nd_desc;
|
||||
struct device *dev = &pdev->dev;
|
||||
struct nvdimm_bus *nvdimm_bus;
|
||||
struct resource *p;
|
||||
int rc = -ENXIO;
|
||||
|
||||
nd_desc.attr_groups = e820_pmem_attribute_groups;
|
||||
nd_desc.provider_name = "e820";
|
||||
|
@ -53,27 +68,15 @@ static int e820_pmem_probe(struct platform_device *pdev)
|
|||
goto err;
|
||||
platform_set_drvdata(pdev, nvdimm_bus);
|
||||
|
||||
for (p = iomem_resource.child; p ; p = p->sibling) {
|
||||
struct nd_region_desc ndr_desc;
|
||||
|
||||
if (p->desc != IORES_DESC_PERSISTENT_MEMORY_LEGACY)
|
||||
continue;
|
||||
|
||||
memset(&ndr_desc, 0, sizeof(ndr_desc));
|
||||
ndr_desc.res = p;
|
||||
ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
|
||||
ndr_desc.numa_node = e820_range_to_nid(p->start);
|
||||
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
|
||||
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
|
||||
goto err;
|
||||
}
|
||||
|
||||
rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
|
||||
IORESOURCE_MEM, 0, -1, nvdimm_bus, e820_register_one);
|
||||
if (rc)
|
||||
goto err;
|
||||
return 0;
|
||||
|
||||
err:
|
||||
err:
|
||||
nvdimm_bus_unregister(nvdimm_bus);
|
||||
dev_err(dev, "failed to register legacy persistent memory ranges\n");
|
||||
return -ENXIO;
|
||||
return rc;
|
||||
}
|
||||
|
||||
static struct platform_driver e820_pmem_driver = {
|
||||
|
|
|
@ -561,8 +561,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
|
|||
res->start += start_pad;
|
||||
res->end -= end_trunc;
|
||||
|
||||
pgmap->type = MEMORY_DEVICE_HOST;
|
||||
|
||||
if (nd_pfn->mode == PFN_MODE_RAM) {
|
||||
if (offset < SZ_8K)
|
||||
return -EINVAL;
|
||||
|
|
|
@ -164,11 +164,6 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
|
|||
return rc;
|
||||
}
|
||||
|
||||
/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
|
||||
#ifndef REQ_FLUSH
|
||||
#define REQ_FLUSH REQ_PREFLUSH
|
||||
#endif
|
||||
|
||||
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
blk_status_t rc = 0;
|
||||
|
@ -179,7 +174,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
|
|||
struct pmem_device *pmem = q->queuedata;
|
||||
struct nd_region *nd_region = to_region(pmem);
|
||||
|
||||
if (bio->bi_opf & REQ_FLUSH)
|
||||
if (bio->bi_opf & REQ_PREFLUSH)
|
||||
nvdimm_flush(nd_region);
|
||||
|
||||
do_acct = nd_iostat_start(bio, &start);
|
||||
|
@ -264,9 +259,16 @@ static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
|
|||
return copy_from_iter_flushcache(addr, bytes, i);
|
||||
}
|
||||
|
||||
/*
 * dax_operations ->copy_to_iter for pmem: @dax_dev and @pgoff are
 * unused; the copy is done by copy_to_iter_mcsafe() and its return
 * value is passed straight back.
 */
static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
|
||||
void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
return copy_to_iter_mcsafe(addr, bytes, i);
|
||||
}
|
||||
|
||||
static const struct dax_operations pmem_dax_ops = {
|
||||
.direct_access = pmem_dax_direct_access,
|
||||
.copy_from_iter = pmem_copy_from_iter,
|
||||
.copy_to_iter = pmem_copy_to_iter,
|
||||
};
|
||||
|
||||
static const struct attribute_group *pmem_attribute_groups[] = {
|
||||
|
@ -294,12 +296,33 @@ static void pmem_release_disk(void *__pmem)
|
|||
put_disk(pmem->disk);
|
||||
}
|
||||
|
||||
/*
 * devm action registered by setup_pagemap_fsdax(): calls
 * dev_pagemap_put_ops(), pairing with the dev_pagemap_get_ops() made
 * there.  @__pgmap is unused.
 */
static void pmem_release_pgmap_ops(void *__pgmap)
|
||||
{
|
||||
dev_pagemap_put_ops();
|
||||
}
|
||||
|
||||
/*
 * ->page_free callback installed by setup_pagemap_fsdax(): wakes any
 * waiter sleeping on the address of @page's _refcount.  @data is unused.
 */
static void fsdax_pagefree(struct page *page, void *data)
|
||||
{
|
||||
wake_up_var(&page->_refcount);
|
||||
}
|
||||
|
||||
/*
 * Prepare @pgmap for filesystem-DAX use: call dev_pagemap_get_ops(),
 * register pmem_release_pgmap_ops() as a devm action on @dev, then set
 * the pagemap type to MEMORY_DEVICE_FS_DAX and install fsdax_pagefree()
 * as its page_free callback.
 *
 * Returns 0 on success, or -ENOMEM if registering the devm action
 * fails (whatever error code that call reported).
 */
static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
|
||||
{
|
||||
dev_pagemap_get_ops();
|
||||
if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
|
||||
return -ENOMEM;
|
||||
pgmap->type = MEMORY_DEVICE_FS_DAX;
|
||||
pgmap->page_free = fsdax_pagefree;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pmem_attach_disk(struct device *dev,
|
||||
struct nd_namespace_common *ndns)
|
||||
{
|
||||
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
|
||||
struct nd_region *nd_region = to_nd_region(dev->parent);
|
||||
int nid = dev_to_node(dev), fua, wbc;
|
||||
int nid = dev_to_node(dev), fua;
|
||||
struct resource *res = &nsio->res;
|
||||
struct resource bb_res;
|
||||
struct nd_pfn *nd_pfn = NULL;
|
||||
|
@ -335,7 +358,6 @@ static int pmem_attach_disk(struct device *dev,
|
|||
dev_warn(dev, "unable to guarantee persistence of writes\n");
|
||||
fua = 0;
|
||||
}
|
||||
wbc = nvdimm_has_cache(nd_region);
|
||||
|
||||
if (!devm_request_mem_region(dev, res->start, resource_size(res),
|
||||
dev_name(&ndns->dev))) {
|
||||
|
@ -353,6 +375,8 @@ static int pmem_attach_disk(struct device *dev,
|
|||
pmem->pfn_flags = PFN_DEV;
|
||||
pmem->pgmap.ref = &q->q_usage_counter;
|
||||
if (is_nd_pfn(dev)) {
|
||||
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
|
||||
return -ENOMEM;
|
||||
addr = devm_memremap_pages(dev, &pmem->pgmap);
|
||||
pfn_sb = nd_pfn->pfn_sb;
|
||||
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
|
||||
|
@ -364,6 +388,8 @@ static int pmem_attach_disk(struct device *dev,
|
|||
} else if (pmem_should_map_pages(dev)) {
|
||||
memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
|
||||
pmem->pgmap.altmap_valid = false;
|
||||
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
|
||||
return -ENOMEM;
|
||||
addr = devm_memremap_pages(dev, &pmem->pgmap);
|
||||
pmem->pfn_flags |= PFN_MAP;
|
||||
memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
|
||||
|
@ -382,7 +408,7 @@ static int pmem_attach_disk(struct device *dev,
|
|||
return PTR_ERR(addr);
|
||||
pmem->virt_addr = addr;
|
||||
|
||||
blk_queue_write_cache(q, wbc, fua);
|
||||
blk_queue_write_cache(q, true, fua);
|
||||
blk_queue_make_request(q, pmem_make_request);
|
||||
blk_queue_physical_block_size(q, PAGE_SIZE);
|
||||
blk_queue_logical_block_size(q, pmem_sector_size(ndns));
|
||||
|
@ -413,7 +439,7 @@ static int pmem_attach_disk(struct device *dev,
|
|||
put_disk(disk);
|
||||
return -ENOMEM;
|
||||
}
|
||||
dax_write_cache(dax_dev, wbc);
|
||||
dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
|
||||
pmem->dax_dev = dax_dev;
|
||||
|
||||
gendev = disk_to_dev(disk);
|
||||
|
|
|
@ -1132,7 +1132,8 @@ EXPORT_SYMBOL_GPL(nvdimm_has_flush);
|
|||
|
||||
int nvdimm_has_cache(struct nd_region *nd_region)
|
||||
{
|
||||
return is_nd_pmem(&nd_region->dev);
|
||||
return is_nd_pmem(&nd_region->dev) &&
|
||||
!test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvdimm_has_cache);
|
||||
|
||||
|
|
|
@ -51,9 +51,16 @@ static size_t dcssblk_dax_copy_from_iter(struct dax_device *dax_dev,
|
|||
return copy_from_iter(addr, bytes, i);
|
||||
}
|
||||
|
||||
/*
 * dax_operations ->copy_to_iter for dcssblk: @dax_dev and @pgoff are
 * unused; the copy is a plain copy_to_iter() and its return value is
 * passed straight back.
 */
static size_t dcssblk_dax_copy_to_iter(struct dax_device *dax_dev,
|
||||
pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
return copy_to_iter(addr, bytes, i);
|
||||
}
|
||||
|
||||
static const struct dax_operations dcssblk_dax_ops = {
|
||||
.direct_access = dcssblk_dax_direct_access,
|
||||
.copy_from_iter = dcssblk_dax_copy_from_iter,
|
||||
.copy_to_iter = dcssblk_dax_copy_to_iter,
|
||||
};
|
||||
|
||||
struct dcssblk_dev_info {
|
||||
|
|
|
@ -38,6 +38,7 @@ config FS_DAX
|
|||
bool "Direct Access (DAX) support"
|
||||
depends on MMU
|
||||
depends on !(ARM || MIPS || SPARC)
|
||||
select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
|
||||
select FS_IOMAP
|
||||
select DAX
|
||||
help
|
||||
|
|
136
fs/dax.c
136
fs/dax.c
|
@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Walk every pfn covered by @entry and return the first page whose
 * reference count is greater than 1, or NULL if there is none.
 */
static struct page *dax_busy_page(void *entry)
|
||||
{
|
||||
unsigned long pfn;
|
||||
|
||||
for_each_mapped_pfn(entry, pfn) {
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
if (page_ref_count(page) > 1)
|
||||
return page;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find radix tree entry at given index. If it points to an exceptional entry,
|
||||
* return it with the radix tree entry locked. If the radix tree doesn't
|
||||
|
@ -492,6 +505,90 @@ restart:
|
|||
return entry;
|
||||
}
|
||||
|
||||
/**
|
||||
* dax_layout_busy_page - find first pinned page in @mapping
|
||||
* @mapping: address space to scan for a page with ref count > 1
|
||||
*
|
||||
* DAX requires ZONE_DEVICE mapped pages. These pages are never
|
||||
* 'onlined' to the page allocator so they are considered idle when
|
||||
* page->count == 1. A filesystem uses this interface to determine if
|
||||
* any page in the mapping is busy, i.e. for DMA, or other
|
||||
* get_user_pages() usages.
|
||||
*
|
||||
* It is expected that the filesystem is holding locks to block the
|
||||
* establishment of new mappings in this address_space. I.e. it expects
|
||||
* to be able to run unmap_mapping_range() and subsequently not race
|
||||
* mapping_mapped() becoming true.
|
||||
*/
|
||||
struct page *dax_layout_busy_page(struct address_space *mapping)
|
||||
{
|
||||
pgoff_t indices[PAGEVEC_SIZE];
|
||||
struct page *page = NULL;
|
||||
struct pagevec pvec;
|
||||
pgoff_t index, end;
|
||||
unsigned i;
|
||||
|
||||
/*
|
||||
* In the 'limited' case get_user_pages() for dax is disabled.
|
||||
*/
|
||||
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
|
||||
return NULL;
|
||||
|
||||
if (!dax_mapping(mapping) || !mapping_mapped(mapping))
|
||||
return NULL;
|
||||
|
||||
pagevec_init(&pvec);
|
||||
/* scan from index 0; end = -1 is presumably the largest pgoff_t, i.e. no upper bound */
index = 0;
|
||||
end = -1;
|
||||
|
||||
/*
|
||||
* If we race get_user_pages_fast() here either we'll see the
|
||||
* elevated page count in the pagevec_lookup and wait, or
|
||||
* get_user_pages_fast() will see that the page it took a reference
|
||||
* against is no longer mapped in the page tables and bail to the
|
||||
* get_user_pages() slow path. The slow path is protected by
|
||||
* pte_lock() and pmd_lock(). New references are not taken without
|
||||
* holding those locks, and unmap_mapping_range() will not zero the
|
||||
* pte or pmd without holding the respective lock, so we are
|
||||
* guaranteed to either see new references or prevent new
|
||||
* references from being established.
|
||||
*/
|
||||
unmap_mapping_range(mapping, 0, 0, 1);
|
||||
|
||||
while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
|
||||
min(end - index, (pgoff_t)PAGEVEC_SIZE),
|
||||
indices)) {
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
struct page *pvec_ent = pvec.pages[i];
|
||||
void *entry;
|
||||
|
||||
index = indices[i];
|
||||
if (index >= end)
|
||||
break;
|
||||
|
||||
/* only exceptional entries are examined */
if (!radix_tree_exceptional_entry(pvec_ent))
|
||||
continue;
|
||||
|
||||
/* look the entry up again with the i_pages lock held before testing its pages */
xa_lock_irq(&mapping->i_pages);
|
||||
entry = get_unlocked_mapping_entry(mapping, index, NULL);
|
||||
if (entry)
|
||||
page = dax_busy_page(entry);
|
||||
put_unlocked_mapping_entry(mapping, index, entry);
|
||||
xa_unlock_irq(&mapping->i_pages);
|
||||
if (page)
|
||||
break;
|
||||
}
|
||||
pagevec_remove_exceptionals(&pvec);
|
||||
pagevec_release(&pvec);
|
||||
/* the next lookup starts just past the last index examined */
index++;
|
||||
|
||||
if (page)
|
||||
break;
|
||||
}
|
||||
/* NULL unless dax_busy_page() reported a page with an elevated reference count */
return page;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
|
||||
|
||||
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
|
||||
pgoff_t index, bool trunc)
|
||||
{
|
||||
|
@ -912,7 +1009,6 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
|
|||
unsigned long vaddr = vmf->address;
|
||||
vm_fault_t ret = VM_FAULT_NOPAGE;
|
||||
struct page *zero_page;
|
||||
void *entry2;
|
||||
pfn_t pfn;
|
||||
|
||||
zero_page = ZERO_PAGE(0);
|
||||
|
@ -922,13 +1018,8 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
|
|||
}
|
||||
|
||||
pfn = page_to_pfn_t(zero_page);
|
||||
entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
|
||||
RADIX_DAX_ZERO_PAGE, false);
|
||||
if (IS_ERR(entry2)) {
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
goto out;
|
||||
}
|
||||
|
||||
dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
|
||||
false);
|
||||
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
|
||||
out:
|
||||
trace_dax_load_hole(inode, vmf, ret);
|
||||
|
@ -991,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
|||
struct iov_iter *iter = data;
|
||||
loff_t end = pos + length, done = 0;
|
||||
ssize_t ret = 0;
|
||||
size_t xfer;
|
||||
int id;
|
||||
|
||||
if (iov_iter_rw(iter) == READ) {
|
||||
|
@ -1054,18 +1146,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
|||
* vfs_write(), depending on which operation we are doing.
|
||||
*/
|
||||
if (iov_iter_rw(iter) == WRITE)
|
||||
map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
|
||||
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
|
||||
map_len, iter);
|
||||
else
|
||||
map_len = copy_to_iter(kaddr, map_len, iter);
|
||||
if (map_len <= 0) {
|
||||
ret = map_len ? map_len : -EFAULT;
|
||||
break;
|
||||
}
|
||||
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
|
||||
map_len, iter);
|
||||
|
||||
pos += map_len;
|
||||
length -= map_len;
|
||||
done += map_len;
|
||||
pos += xfer;
|
||||
length -= xfer;
|
||||
done += xfer;
|
||||
|
||||
if (xfer == 0)
|
||||
ret = -EFAULT;
|
||||
if (xfer < map_len)
|
||||
break;
|
||||
}
|
||||
dax_read_unlock(id);
|
||||
|
||||
|
@ -1240,10 +1334,6 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
|
|||
|
||||
entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
|
||||
0, write && !sync);
|
||||
if (IS_ERR(entry)) {
|
||||
error = PTR_ERR(entry);
|
||||
goto error_finish_iomap;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we are doing synchronous page fault and inode needs fsync,
|
||||
|
@ -1324,8 +1414,6 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
|
|||
pfn = page_to_pfn_t(zero_page);
|
||||
ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
|
||||
RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
|
||||
if (IS_ERR(ret))
|
||||
goto fallback;
|
||||
|
||||
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
|
||||
if (!pmd_none(*(vmf->pmd))) {
|
||||
|
@ -1447,8 +1535,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
|
|||
|
||||
entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
|
||||
RADIX_DAX_PMD, write && !sync);
|
||||
if (IS_ERR(entry))
|
||||
goto finish_iomap;
|
||||
|
||||
/*
|
||||
* If we are doing synchronous page fault and inode needs fsync,
|
||||
|
|
|
@ -312,7 +312,7 @@ restart:
|
|||
if (error <= 0)
|
||||
return error;
|
||||
|
||||
error = xfs_break_layouts(inode, iolock);
|
||||
error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
|
@ -731,6 +731,69 @@ xfs_file_write_iter(
|
|||
return xfs_file_buffered_aio_write(iocb, from);
|
||||
}
|
||||
|
||||
static void
|
||||
xfs_wait_dax_page(
|
||||
struct inode *inode,
|
||||
bool *did_unlock)
|
||||
{
|
||||
struct xfs_inode *ip = XFS_I(inode);
|
||||
|
||||
*did_unlock = true;
|
||||
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
|
||||
schedule();
|
||||
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
|
||||
}
|
||||
|
||||
static int
|
||||
xfs_break_dax_layouts(
|
||||
struct inode *inode,
|
||||
uint iolock,
|
||||
bool *did_unlock)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
|
||||
|
||||
page = dax_layout_busy_page(inode->i_mapping);
|
||||
if (!page)
|
||||
return 0;
|
||||
|
||||
return ___wait_var_event(&page->_refcount,
|
||||
atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
|
||||
0, 0, xfs_wait_dax_page(inode, did_unlock));
|
||||
}
|
||||
|
||||
int
|
||||
xfs_break_layouts(
|
||||
struct inode *inode,
|
||||
uint *iolock,
|
||||
enum layout_break_reason reason)
|
||||
{
|
||||
bool retry;
|
||||
int error;
|
||||
|
||||
ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
|
||||
|
||||
do {
|
||||
retry = false;
|
||||
switch (reason) {
|
||||
case BREAK_UNMAP:
|
||||
error = xfs_break_dax_layouts(inode, *iolock, &retry);
|
||||
if (error || retry)
|
||||
break;
|
||||
/* fall through */
|
||||
case BREAK_WRITE:
|
||||
error = xfs_break_leased_layouts(inode, iolock, &retry);
|
||||
break;
|
||||
default:
|
||||
WARN_ON_ONCE(1);
|
||||
error = -EINVAL;
|
||||
}
|
||||
} while (error == 0 && retry);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
#define XFS_FALLOC_FL_SUPPORTED \
|
||||
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
|
||||
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
|
||||
|
@ -747,7 +810,7 @@ xfs_file_fallocate(
|
|||
struct xfs_inode *ip = XFS_I(inode);
|
||||
long error;
|
||||
enum xfs_prealloc_flags flags = 0;
|
||||
uint iolock = XFS_IOLOCK_EXCL;
|
||||
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
|
||||
loff_t new_size = 0;
|
||||
bool do_file_insert = false;
|
||||
|
||||
|
@ -757,13 +820,10 @@ xfs_file_fallocate(
|
|||
return -EOPNOTSUPP;
|
||||
|
||||
xfs_ilock(ip, iolock);
|
||||
error = xfs_break_layouts(inode, &iolock);
|
||||
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
|
||||
if (error)
|
||||
goto out_unlock;
|
||||
|
||||
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
|
||||
iolock |= XFS_MMAPLOCK_EXCL;
|
||||
|
||||
if (mode & FALLOC_FL_PUNCH_HOLE) {
|
||||
error = xfs_free_file_space(ip, offset, len);
|
||||
if (error)
|
||||
|
|
|
@ -378,6 +378,20 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
|
|||
#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \
|
||||
>> XFS_ILOCK_SHIFT)
|
||||
|
||||
/*
|
||||
* Layouts are broken in the BREAK_WRITE case to ensure that
|
||||
* layout-holders do not collide with local writes. Additionally,
|
||||
* layouts are broken in the BREAK_UNMAP case to make sure the
|
||||
* layout-holder has a consistent view of the file's extent map. While
|
||||
* BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases,
|
||||
* BREAK_UNMAP breaks additionally require waiting for busy dax-pages to
|
||||
* go idle.
|
||||
*/
|
||||
enum layout_break_reason {
	/* Break satisfied by recalling FL_LAYOUT leases. */
	BREAK_WRITE = 0,
	/* Additionally requires waiting for busy dax-pages to go idle. */
	BREAK_UNMAP = 1,
};
|
||||
|
||||
/*
|
||||
* For multiple groups support: if S_ISGID bit is set in the parent
|
||||
* directory, group of new file is set to that of the parent, and
|
||||
|
@ -453,6 +467,8 @@ enum xfs_prealloc_flags {
|
|||
|
||||
int xfs_update_prealloc_flags(struct xfs_inode *ip,
|
||||
enum xfs_prealloc_flags flags);
|
||||
int xfs_break_layouts(struct inode *inode, uint *iolock,
|
||||
enum layout_break_reason reason);
|
||||
|
||||
/* from xfs_iops.c */
|
||||
extern void xfs_setup_inode(struct xfs_inode *ip);
|
||||
|
|
|
@ -39,7 +39,6 @@
|
|||
#include "xfs_icache.h"
|
||||
#include "xfs_symlink.h"
|
||||
#include "xfs_trans.h"
|
||||
#include "xfs_pnfs.h"
|
||||
#include "xfs_acl.h"
|
||||
#include "xfs_btree.h"
|
||||
#include <linux/fsmap.h>
|
||||
|
@ -614,7 +613,7 @@ xfs_ioc_space(
|
|||
struct xfs_inode *ip = XFS_I(inode);
|
||||
struct iattr iattr;
|
||||
enum xfs_prealloc_flags flags = 0;
|
||||
uint iolock = XFS_IOLOCK_EXCL;
|
||||
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
|
||||
int error;
|
||||
|
||||
/*
|
||||
|
@ -644,13 +643,10 @@ xfs_ioc_space(
|
|||
return error;
|
||||
|
||||
xfs_ilock(ip, iolock);
|
||||
error = xfs_break_layouts(inode, &iolock);
|
||||
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
|
||||
if (error)
|
||||
goto out_unlock;
|
||||
|
||||
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
|
||||
iolock |= XFS_MMAPLOCK_EXCL;
|
||||
|
||||
switch (bf->l_whence) {
|
||||
case 0: /*SEEK_SET*/
|
||||
break;
|
||||
|
|
|
@ -37,7 +37,6 @@
|
|||
#include "xfs_da_btree.h"
|
||||
#include "xfs_dir2.h"
|
||||
#include "xfs_trans_space.h"
|
||||
#include "xfs_pnfs.h"
|
||||
#include "xfs_iomap.h"
|
||||
|
||||
#include <linux/capability.h>
|
||||
|
@ -1030,14 +1029,19 @@ xfs_vn_setattr(
|
|||
int error;
|
||||
|
||||
if (iattr->ia_valid & ATTR_SIZE) {
|
||||
struct xfs_inode *ip = XFS_I(d_inode(dentry));
|
||||
uint iolock = XFS_IOLOCK_EXCL;
|
||||
|
||||
error = xfs_break_layouts(d_inode(dentry), &iolock);
|
||||
if (error)
|
||||
return error;
|
||||
struct inode *inode = d_inode(dentry);
|
||||
struct xfs_inode *ip = XFS_I(inode);
|
||||
uint iolock;
|
||||
|
||||
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
|
||||
iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
|
||||
|
||||
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
|
||||
if (error) {
|
||||
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
|
||||
return error;
|
||||
}
|
||||
|
||||
error = xfs_vn_setattr_size(dentry, iattr);
|
||||
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
|
||||
} else {
|
||||
|
|
|
@ -31,19 +31,20 @@
|
|||
* rules in the page fault path we don't bother.
|
||||
*/
|
||||
int
|
||||
xfs_break_layouts(
|
||||
xfs_break_leased_layouts(
|
||||
struct inode *inode,
|
||||
uint *iolock)
|
||||
uint *iolock,
|
||||
bool *did_unlock)
|
||||
{
|
||||
struct xfs_inode *ip = XFS_I(inode);
|
||||
int error;
|
||||
|
||||
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
|
||||
|
||||
while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
|
||||
xfs_iunlock(ip, *iolock);
|
||||
*did_unlock = true;
|
||||
error = break_layout(inode, true);
|
||||
*iolock = XFS_IOLOCK_EXCL;
|
||||
*iolock &= ~XFS_IOLOCK_SHARED;
|
||||
*iolock |= XFS_IOLOCK_EXCL;
|
||||
xfs_ilock(ip, *iolock);
|
||||
}
|
||||
|
||||
|
@ -120,8 +121,8 @@ xfs_fs_map_blocks(
|
|||
* Lock out any other I/O before we flush and invalidate the pagecache,
|
||||
* and then hand out a layout to the remote system. This is very
|
||||
* similar to direct I/O, except that the synchronization is much more
|
||||
* complicated. See the comment near xfs_break_layouts for a detailed
|
||||
* explanation.
|
||||
* complicated. See the comment near xfs_break_leased_layouts
|
||||
* for a detailed explanation.
|
||||
*/
|
||||
xfs_ilock(ip, XFS_IOLOCK_EXCL);
|
||||
|
||||
|
|
|
@ -9,10 +9,11 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
|
|||
int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
|
||||
struct iattr *iattr);
|
||||
|
||||
int xfs_break_layouts(struct inode *inode, uint *iolock);
|
||||
int xfs_break_leased_layouts(struct inode *inode, uint *iolock,
|
||||
bool *did_unlock);
|
||||
#else
|
||||
static inline int
|
||||
xfs_break_layouts(struct inode *inode, uint *iolock)
|
||||
xfs_break_leased_layouts(struct inode *inode, uint *iolock, bool *did_unlock)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -20,6 +20,9 @@ struct dax_operations {
|
|||
/* copy_from_iter: required operation for fs-dax direct-i/o */
|
||||
size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
|
||||
struct iov_iter *);
|
||||
/* copy_to_iter: required operation for fs-dax direct-i/o */
|
||||
size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
|
||||
struct iov_iter *);
|
||||
};
|
||||
|
||||
extern struct attribute_group dax_attribute_group;
|
||||
|
@ -83,6 +86,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
|
|||
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
|
||||
int dax_writeback_mapping_range(struct address_space *mapping,
|
||||
struct block_device *bdev, struct writeback_control *wbc);
|
||||
|
||||
struct page *dax_layout_busy_page(struct address_space *mapping);
|
||||
#else
|
||||
static inline bool bdev_dax_supported(struct block_device *bdev,
|
||||
int blocksize)
|
||||
|
@ -104,6 +109,11 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/* Stub for this #else branch: never reports a busy page. */
static inline struct page *dax_layout_busy_page(struct address_space *mapping)
{
	return NULL;
}
|
||||
|
||||
static inline int dax_writeback_mapping_range(struct address_space *mapping,
|
||||
struct block_device *bdev, struct writeback_control *wbc)
|
||||
{
|
||||
|
@ -119,6 +129,8 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
|
|||
void **kaddr, pfn_t *pfn);
|
||||
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
|
||||
size_t bytes, struct iov_iter *i);
|
||||
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
|
||||
size_t bytes, struct iov_iter *i);
|
||||
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
|
||||
|
||||
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
|
|
|
@ -133,7 +133,7 @@ typedef int (*dm_busy_fn) (struct dm_target *ti);
|
|||
*/
|
||||
typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
|
||||
long nr_pages, void **kaddr, pfn_t *pfn);
|
||||
typedef size_t (*dm_dax_copy_from_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
|
||||
typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
|
||||
void *addr, size_t bytes, struct iov_iter *i);
|
||||
#define PAGE_SECTORS (PAGE_SIZE / 512)
|
||||
|
||||
|
@ -184,7 +184,8 @@ struct target_type {
|
|||
dm_iterate_devices_fn iterate_devices;
|
||||
dm_io_hints_fn io_hints;
|
||||
dm_dax_direct_access_fn direct_access;
|
||||
dm_dax_copy_from_iter_fn dax_copy_from_iter;
|
||||
dm_dax_copy_iter_fn dax_copy_from_iter;
|
||||
dm_dax_copy_iter_fn dax_copy_to_iter;
|
||||
|
||||
/* For internal device-mapper use. */
|
||||
struct list_head list;
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _LINUX_MEMREMAP_H_
|
||||
#define _LINUX_MEMREMAP_H_
|
||||
#include <linux/mm.h>
|
||||
#include <linux/ioport.h>
|
||||
#include <linux/percpu-refcount.h>
|
||||
|
||||
|
@ -30,13 +29,6 @@ struct vmem_altmap {
|
|||
* Specialize ZONE_DEVICE memory into multiple types each having different
|
||||
* usage.
|
||||
*
|
||||
* MEMORY_DEVICE_HOST:
|
||||
* Persistent device memory (pmem): struct page might be allocated in different
|
||||
* memory and architecture might want to perform special actions. It is similar
|
||||
* to regular memory, in that the CPU can access it transparently. However,
|
||||
* it is likely to have different bandwidth and latency than regular memory.
|
||||
* See Documentation/nvdimm/nvdimm.txt for more information.
|
||||
*
|
||||
* MEMORY_DEVICE_PRIVATE:
|
||||
* Device memory that is not directly addressable by the CPU: CPU can neither
|
||||
* read nor write private memory. In this case, we do still have struct pages
|
||||
|
@ -53,11 +45,19 @@ struct vmem_altmap {
|
|||
* driver can hotplug the device memory using ZONE_DEVICE and with that memory
|
||||
* type. Any page of a process can be migrated to such memory. However no one
|
||||
* should be allowed to pin such memory so that it can always be evicted.
|
||||
*
|
||||
* MEMORY_DEVICE_FS_DAX:
|
||||
* Host memory that has similar access semantics as System RAM i.e. DMA
|
||||
* coherent and supports page pinning. In support of coordinating page
|
||||
* pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a
|
||||
* wakeup event whenever a page is unpinned and becomes idle. This
|
||||
* wakeup is used to coordinate physical address space management (ex:
|
||||
* fs truncate/hole punch) vs pinned pages (ex: device dma).
|
||||
*/
|
||||
enum memory_type {
|
||||
MEMORY_DEVICE_HOST = 0,
|
||||
MEMORY_DEVICE_PRIVATE,
|
||||
MEMORY_DEVICE_PRIVATE = 1,
|
||||
MEMORY_DEVICE_PUBLIC,
|
||||
MEMORY_DEVICE_FS_DAX,
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -129,8 +129,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
|
|||
|
||||
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
|
||||
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
|
||||
|
||||
static inline bool is_zone_device_page(const struct page *page);
|
||||
#else
|
||||
static inline void *devm_memremap_pages(struct device *dev,
|
||||
struct dev_pagemap *pgmap)
|
||||
|
@ -161,20 +159,6 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
|
|||
}
|
||||
#endif /* CONFIG_ZONE_DEVICE */
|
||||
|
||||
#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
|
||||
static inline bool is_device_private_page(const struct page *page)
|
||||
{
|
||||
return is_zone_device_page(page) &&
|
||||
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
|
||||
}
|
||||
|
||||
static inline bool is_device_public_page(const struct page *page)
|
||||
{
|
||||
return is_zone_device_page(page) &&
|
||||
page->pgmap->type == MEMORY_DEVICE_PUBLIC;
|
||||
}
|
||||
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
|
||||
|
||||
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (pgmap)
|
||||
|
|
|
@ -830,27 +830,65 @@ static inline bool is_zone_device_page(const struct page *page)
|
|||
}
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
|
||||
void put_zone_device_private_or_public_page(struct page *page);
|
||||
DECLARE_STATIC_KEY_FALSE(device_private_key);
|
||||
#define IS_HMM_ENABLED static_branch_unlikely(&device_private_key)
|
||||
static inline bool is_device_private_page(const struct page *page);
|
||||
static inline bool is_device_public_page(const struct page *page);
|
||||
#else /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
|
||||
static inline void put_zone_device_private_or_public_page(struct page *page)
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
void dev_pagemap_get_ops(void);
|
||||
void dev_pagemap_put_ops(void);
|
||||
void __put_devmap_managed_page(struct page *page);
|
||||
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
|
||||
static inline bool put_devmap_managed_page(struct page *page)
|
||||
{
|
||||
if (!static_branch_unlikely(&devmap_managed_key))
|
||||
return false;
|
||||
if (!is_zone_device_page(page))
|
||||
return false;
|
||||
switch (page->pgmap->type) {
|
||||
case MEMORY_DEVICE_PRIVATE:
|
||||
case MEMORY_DEVICE_PUBLIC:
|
||||
case MEMORY_DEVICE_FS_DAX:
|
||||
__put_devmap_managed_page(page);
|
||||
return true;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool is_device_private_page(const struct page *page)
|
||||
{
|
||||
return is_zone_device_page(page) &&
|
||||
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
|
||||
}
|
||||
|
||||
static inline bool is_device_public_page(const struct page *page)
|
||||
{
|
||||
return is_zone_device_page(page) &&
|
||||
page->pgmap->type == MEMORY_DEVICE_PUBLIC;
|
||||
}
|
||||
|
||||
#else /* CONFIG_DEV_PAGEMAP_OPS */
|
||||
/* No-op stub used when CONFIG_DEV_PAGEMAP_OPS is not enabled. */
static inline void dev_pagemap_get_ops(void)
{
}
|
||||
#define IS_HMM_ENABLED 0
|
||||
|
||||
/* No-op stub used when CONFIG_DEV_PAGEMAP_OPS is not enabled. */
static inline void dev_pagemap_put_ops(void)
{
}
|
||||
|
||||
/* Stub for !CONFIG_DEV_PAGEMAP_OPS: no page is ever devmap managed. */
static inline bool put_devmap_managed_page(struct page *page)
{
	return false;
}
|
||||
|
||||
/* Stub for !CONFIG_DEV_PAGEMAP_OPS: always reports false. */
static inline bool is_device_private_page(const struct page *page)
{
	return false;
}
|
||||
|
||||
/* Stub for !CONFIG_DEV_PAGEMAP_OPS: always reports false. */
static inline bool is_device_public_page(const struct page *page)
{
	return false;
}
|
||||
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
|
||||
|
||||
#endif /* CONFIG_DEV_PAGEMAP_OPS */
|
||||
|
||||
static inline void get_page(struct page *page)
|
||||
{
|
||||
|
@ -868,16 +906,13 @@ static inline void put_page(struct page *page)
|
|||
page = compound_head(page);
|
||||
|
||||
/*
|
||||
* For private device pages we need to catch refcount transition from
|
||||
* 2 to 1, when refcount reach one it means the private device page is
|
||||
* free and we need to inform the device driver through callback. See
|
||||
* For devmap managed pages we need to catch refcount transition from
|
||||
* 2 to 1; when the refcount reaches one it means the page is free and we
|
||||
* need to inform the device driver through callback. See
|
||||
* include/linux/memremap.h and HMM for details.
|
||||
*/
|
||||
if (IS_HMM_ENABLED && unlikely(is_device_private_page(page) ||
|
||||
unlikely(is_device_public_page(page)))) {
|
||||
put_zone_device_private_or_public_page(page);
|
||||
if (put_devmap_managed_page(page))
|
||||
return;
|
||||
}
|
||||
|
||||
if (put_page_testzero(page))
|
||||
__put_page(page);
|
||||
|
|
|
@ -155,7 +155,7 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
|
|||
#endif
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
|
||||
size_t _copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i);
|
||||
size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i);
|
||||
#else
|
||||
#define _copy_to_iter_mcsafe _copy_to_iter
|
||||
#endif
|
||||
|
|
|
@ -112,7 +112,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o
|
|||
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
|
||||
obj-$(CONFIG_TORTURE_TEST) += torture.o
|
||||
|
||||
obj-$(CONFIG_HAS_IOMEM) += memremap.o
|
||||
obj-$(CONFIG_HAS_IOMEM) += iomem.o
|
||||
obj-$(CONFIG_ZONE_DEVICE) += memremap.o
|
||||
|
||||
$(obj)/configs.o: $(obj)/config_data.h
|
||||
|
||||
|
|
167
kernel/iomem.c
Normal file
167
kernel/iomem.c
Normal file
|
@ -0,0 +1,167 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#include <linux/device.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/mm.h>
|
||||
|
||||
#ifndef ioremap_cache
|
||||
/* temporary while we convert existing ioremap_cache users to memremap */
|
||||
/* Weak default: forwards to plain ioremap() with the same arguments. */
__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
{
	return ioremap(offset, size);
}
|
||||
#endif
|
||||
|
||||
#ifndef arch_memremap_wb
|
||||
/*
 * Default write-back remap: use ioremap_cache() and drop the __iomem
 * annotation from the result.
 */
static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
{
	void __iomem *mapped = ioremap_cache(offset, size);

	return (__force void *)mapped;
}
|
||||
#endif
|
||||
|
||||
#ifndef arch_memremap_can_ram_remap
|
||||
/* Default policy when the arch provides no override: always allow. */
static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
					unsigned long flags)
{
	return true;
}
|
||||
#endif
|
||||
|
||||
/*
 * Try to satisfy a remap of RAM from the existing linear mapping.
 *
 * Returns __va(@offset) when the first pfn is valid, is not highmem and
 * the arch hook allows it; otherwise NULL so the caller can fall back to
 * arch_memremap_wb().
 */
static void *try_ram_remap(resource_size_t offset, size_t size,
			   unsigned long flags)
{
	unsigned long pfn = PHYS_PFN(offset);

	if (!pfn_valid(pfn))
		return NULL;

	if (PageHighMem(pfn_to_page(pfn)))
		return NULL;

	if (!arch_memremap_can_ram_remap(offset, size, flags))
		return NULL;

	/* In the simple case just return the existing linear address */
	return __va(offset);
}
|
||||
|
||||
/**
|
||||
* memremap() - remap an iomem_resource as cacheable memory
|
||||
* @offset: iomem resource start address
|
||||
* @size: size of remap
|
||||
* @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
|
||||
* MEMREMAP_ENC, MEMREMAP_DEC
|
||||
*
|
||||
* memremap() is "ioremap" for cases where it is known that the resource
|
||||
* being mapped does not have i/o side effects and the __iomem
|
||||
* annotation is not applicable. In the case of multiple flags, the different
|
||||
* mapping types will be attempted in the order listed below until one of
|
||||
* them succeeds.
|
||||
*
|
||||
* MEMREMAP_WB - matches the default mapping for System RAM on
|
||||
* the architecture. This is usually a read-allocate write-back cache.
|
||||
* Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
|
||||
* memremap() will bypass establishing a new mapping and instead return
|
||||
* a pointer into the direct map.
|
||||
*
|
||||
* MEMREMAP_WT - establish a mapping whereby writes either bypass the
|
||||
* cache or are written through to memory and never exist in a
|
||||
* cache-dirty state with respect to program visibility. Attempts to
|
||||
* map System RAM with this mapping type will fail.
|
||||
*
|
||||
* MEMREMAP_WC - establish a writecombine mapping, whereby writes may
|
||||
* be coalesced together (e.g. in the CPU's write buffers), but is otherwise
|
||||
* uncached. Attempts to map System RAM with this mapping type will fail.
|
||||
*/
|
||||
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
|
||||
{
|
||||
int is_ram = region_intersects(offset, size,
|
||||
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
|
||||
void *addr = NULL;
|
||||
|
||||
if (!flags)
|
||||
return NULL;
|
||||
|
||||
if (is_ram == REGION_MIXED) {
|
||||
WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
|
||||
&offset, (unsigned long) size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Try all mapping types requested until one returns non-NULL */
|
||||
if (flags & MEMREMAP_WB) {
|
||||
/*
|
||||
* MEMREMAP_WB is special in that it can be satisifed
|
||||
* from the direct map. Some archs depend on the
|
||||
* capability of memremap() to autodetect cases where
|
||||
* the requested range is potentially in System RAM.
|
||||
*/
|
||||
if (is_ram == REGION_INTERSECTS)
|
||||
addr = try_ram_remap(offset, size, flags);
|
||||
if (!addr)
|
||||
addr = arch_memremap_wb(offset, size);
|
||||
}
|
||||
|
||||
/*
|
||||
* If we don't have a mapping yet and other request flags are
|
||||
* present then we will be attempting to establish a new virtual
|
||||
* address mapping. Enforce that this mapping is not aliasing
|
||||
* System RAM.
|
||||
*/
|
||||
if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) {
|
||||
WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
|
||||
&offset, (unsigned long) size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!addr && (flags & MEMREMAP_WT))
|
||||
addr = ioremap_wt(offset, size);
|
||||
|
||||
if (!addr && (flags & MEMREMAP_WC))
|
||||
addr = ioremap_wc(offset, size);
|
||||
|
||||
return addr;
|
||||
}
|
||||
EXPORT_SYMBOL(memremap);
|
||||
|
||||
void memunmap(void *addr)
|
||||
{
|
||||
if (is_vmalloc_addr(addr))
|
||||
iounmap((void __iomem *) addr);
|
||||
}
|
||||
EXPORT_SYMBOL(memunmap);
|
||||
|
||||
/* devres release callback: @res holds the mapped address; unmap it. */
static void devm_memremap_release(struct device *dev, void *res)
{
	void **slot = res;

	memunmap(*slot);
}
|
||||
|
||||
/*
 * devres match callback: non-zero when the address stored in @res equals
 * @match_data.
 */
static int devm_memremap_match(struct device *dev, void *res, void *match_data)
{
	void **slot = res;

	return *slot == match_data;
}
|
||||
|
||||
void *devm_memremap(struct device *dev, resource_size_t offset,
|
||||
size_t size, unsigned long flags)
|
||||
{
|
||||
void **ptr, *addr;
|
||||
|
||||
ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
|
||||
dev_to_node(dev));
|
||||
if (!ptr)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
addr = memremap(offset, size, flags);
|
||||
if (addr) {
|
||||
*ptr = addr;
|
||||
devres_add(dev, ptr);
|
||||
} else {
|
||||
devres_free(ptr);
|
||||
return ERR_PTR(-ENXIO);
|
||||
}
|
||||
|
||||
return addr;
|
||||
}
|
||||
EXPORT_SYMBOL(devm_memremap);
|
||||
|
||||
void devm_memunmap(struct device *dev, void *addr)
|
||||
{
|
||||
WARN_ON(devres_release(dev, devm_memremap_release,
|
||||
devm_memremap_match, addr));
|
||||
}
|
||||
EXPORT_SYMBOL(devm_memunmap);
|
|
@ -1,15 +1,5 @@
|
|||
/*
|
||||
* Copyright(c) 2015 Intel Corporation. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of version 2 of the GNU General Public License as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
|
||||
#include <linux/radix-tree.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/types.h>
|
||||
|
@ -19,170 +9,8 @@
|
|||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/wait_bit.h>
|
||||
|
||||
#ifndef ioremap_cache
|
||||
/* temporary while we convert existing ioremap_cache users to memremap */
|
||||
__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
|
||||
{
|
||||
return ioremap(offset, size);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef arch_memremap_wb
|
||||
static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
|
||||
{
|
||||
return (__force void *)ioremap_cache(offset, size);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef arch_memremap_can_ram_remap
|
||||
static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
|
||||
unsigned long flags)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void *try_ram_remap(resource_size_t offset, size_t size,
|
||||
unsigned long flags)
|
||||
{
|
||||
unsigned long pfn = PHYS_PFN(offset);
|
||||
|
||||
/* In the simple case just return the existing linear address */
|
||||
if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
|
||||
arch_memremap_can_ram_remap(offset, size, flags))
|
||||
return __va(offset);
|
||||
|
||||
return NULL; /* fallback to arch_memremap_wb */
|
||||
}
|
||||
|
||||
/**
|
||||
* memremap() - remap an iomem_resource as cacheable memory
|
||||
* @offset: iomem resource start address
|
||||
* @size: size of remap
|
||||
* @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
|
||||
* MEMREMAP_ENC, MEMREMAP_DEC
|
||||
*
|
||||
* memremap() is "ioremap" for cases where it is known that the resource
|
||||
* being mapped does not have i/o side effects and the __iomem
|
||||
* annotation is not applicable. In the case of multiple flags, the different
|
||||
* mapping types will be attempted in the order listed below until one of
|
||||
* them succeeds.
|
||||
*
|
||||
* MEMREMAP_WB - matches the default mapping for System RAM on
|
||||
* the architecture. This is usually a read-allocate write-back cache.
|
||||
* Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
|
||||
* memremap() will bypass establishing a new mapping and instead return
|
||||
* a pointer into the direct map.
|
||||
*
|
||||
* MEMREMAP_WT - establish a mapping whereby writes either bypass the
|
||||
* cache or are written through to memory and never exist in a
|
||||
* cache-dirty state with respect to program visibility. Attempts to
|
||||
* map System RAM with this mapping type will fail.
|
||||
*
|
||||
* MEMREMAP_WC - establish a writecombine mapping, whereby writes may
|
||||
* be coalesced together (e.g. in the CPU's write buffers), but is otherwise
|
||||
* uncached. Attempts to map System RAM with this mapping type will fail.
|
||||
*/
|
||||
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
|
||||
{
|
||||
int is_ram = region_intersects(offset, size,
|
||||
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
|
||||
void *addr = NULL;
|
||||
|
||||
if (!flags)
|
||||
return NULL;
|
||||
|
||||
if (is_ram == REGION_MIXED) {
|
||||
WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
|
||||
&offset, (unsigned long) size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Try all mapping types requested until one returns non-NULL */
|
||||
if (flags & MEMREMAP_WB) {
|
||||
/*
|
||||
* MEMREMAP_WB is special in that it can be satisifed
|
||||
* from the direct map. Some archs depend on the
|
||||
* capability of memremap() to autodetect cases where
|
||||
* the requested range is potentially in System RAM.
|
||||
*/
|
||||
if (is_ram == REGION_INTERSECTS)
|
||||
addr = try_ram_remap(offset, size, flags);
|
||||
if (!addr)
|
||||
addr = arch_memremap_wb(offset, size);
|
||||
}
|
||||
|
||||
/*
|
||||
* If we don't have a mapping yet and other request flags are
|
||||
* present then we will be attempting to establish a new virtual
|
||||
* address mapping. Enforce that this mapping is not aliasing
|
||||
* System RAM.
|
||||
*/
|
||||
if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) {
|
||||
WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
|
||||
&offset, (unsigned long) size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!addr && (flags & MEMREMAP_WT))
|
||||
addr = ioremap_wt(offset, size);
|
||||
|
||||
if (!addr && (flags & MEMREMAP_WC))
|
||||
addr = ioremap_wc(offset, size);
|
||||
|
||||
return addr;
|
||||
}
|
||||
EXPORT_SYMBOL(memremap);
|
||||
|
||||
void memunmap(void *addr)
|
||||
{
|
||||
if (is_vmalloc_addr(addr))
|
||||
iounmap((void __iomem *) addr);
|
||||
}
|
||||
EXPORT_SYMBOL(memunmap);
|
||||
|
||||
/*
 * devres release callback: @res holds the mapped address stored by
 * devm_memremap(); unmap it.
 */
static void devm_memremap_release(struct device *dev, void *res)
{
	void **slot = res;

	memunmap(*slot);
}
|
||||
|
||||
/*
 * devres match callback: non-zero when the address stored in @res is the
 * one passed as @match_data.
 */
static int devm_memremap_match(struct device *dev, void *res, void *match_data)
{
	void **slot = res;

	return *slot == match_data;
}
|
||||
|
||||
void *devm_memremap(struct device *dev, resource_size_t offset,
|
||||
size_t size, unsigned long flags)
|
||||
{
|
||||
void **ptr, *addr;
|
||||
|
||||
ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
|
||||
dev_to_node(dev));
|
||||
if (!ptr)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
addr = memremap(offset, size, flags);
|
||||
if (addr) {
|
||||
*ptr = addr;
|
||||
devres_add(dev, ptr);
|
||||
} else {
|
||||
devres_free(ptr);
|
||||
return ERR_PTR(-ENXIO);
|
||||
}
|
||||
|
||||
return addr;
|
||||
}
|
||||
EXPORT_SYMBOL(devm_memremap);
|
||||
|
||||
void devm_memunmap(struct device *dev, void *addr)
|
||||
{
|
||||
WARN_ON(devres_release(dev, devm_memremap_release,
|
||||
devm_memremap_match, addr));
|
||||
}
|
||||
EXPORT_SYMBOL(devm_memunmap);
|
||||
|
||||
#ifdef CONFIG_ZONE_DEVICE
|
||||
static DEFINE_MUTEX(pgmap_lock);
|
||||
static RADIX_TREE(pgmap_radix, GFP_KERNEL);
|
||||
/* Mask that clears the low PA_SECTION_SHIFT bits of an unsigned long. */
#define SECTION_MASK (~((1UL << PA_SECTION_SHIFT) - 1))
|
||||
|
@ -473,10 +301,32 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
|
|||
|
||||
return pgmap;
|
||||
}
|
||||
#endif /* CONFIG_ZONE_DEVICE */
|
||||
EXPORT_SYMBOL_GPL(get_dev_pagemap);
|
||||
|
||||
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
|
||||
void put_zone_device_private_or_public_page(struct page *page)
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
|
||||
EXPORT_SYMBOL_GPL(devmap_managed_key);
|
||||
static atomic_t devmap_enable;
|
||||
|
||||
/*
|
||||
* Toggle the static key for ->page_free() callbacks when dev_pagemap
|
||||
* pages go idle.
|
||||
*/
|
||||
void dev_pagemap_get_ops(void)
|
||||
{
|
||||
if (atomic_inc_return(&devmap_enable) == 1)
|
||||
static_branch_enable(&devmap_managed_key);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);
|
||||
|
||||
void dev_pagemap_put_ops(void)
|
||||
{
|
||||
if (atomic_dec_and_test(&devmap_enable))
|
||||
static_branch_disable(&devmap_managed_key);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
|
||||
|
||||
void __put_devmap_managed_page(struct page *page)
|
||||
{
|
||||
int count = page_ref_dec_return(page);
|
||||
|
||||
|
@ -496,5 +346,5 @@ void put_zone_device_private_or_public_page(struct page *page)
|
|||
} else if (!count)
|
||||
__put_page(page);
|
||||
}
|
||||
EXPORT_SYMBOL(put_zone_device_private_or_public_page);
|
||||
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
|
||||
EXPORT_SYMBOL_GPL(__put_devmap_managed_page);
|
||||
#endif /* CONFIG_DEV_PAGEMAP_OPS */
|
||||
|
|
|
@ -415,6 +415,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
|
|||
|
||||
return __walk_iomem_res_desc(&res, desc, false, arg, func);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
|
||||
|
||||
/*
|
||||
* This function calls the @func callback against all memory ranges of type
|
||||
|
|
|
@ -621,6 +621,9 @@ config ARCH_HAS_PMEM_API
|
|||
config ARCH_HAS_UACCESS_FLUSHCACHE
|
||||
bool
|
||||
|
||||
config ARCH_HAS_UACCESS_MCSAFE
|
||||
bool
|
||||
|
||||
config STACKDEPOT
|
||||
bool
|
||||
select STACKTRACE
|
||||
|
|
|
@ -694,6 +694,9 @@ config ARCH_HAS_HMM
|
|||
config MIGRATE_VMA_HELPER
|
||||
bool
|
||||
|
||||
config DEV_PAGEMAP_OPS
|
||||
bool
|
||||
|
||||
config HMM
|
||||
bool
|
||||
select MIGRATE_VMA_HELPER
|
||||
|
@ -714,6 +717,7 @@ config DEVICE_PRIVATE
|
|||
bool "Unaddressable device memory (GPU memory, ...)"
|
||||
depends on ARCH_HAS_HMM
|
||||
select HMM
|
||||
select DEV_PAGEMAP_OPS
|
||||
|
||||
help
|
||||
Allows creation of struct pages to represent unaddressable device
|
||||
|
@ -724,6 +728,7 @@ config DEVICE_PUBLIC
|
|||
bool "Addressable device memory (like GPU memory)"
|
||||
depends on ARCH_HAS_HMM
|
||||
select HMM
|
||||
select DEV_PAGEMAP_OPS
|
||||
|
||||
help
|
||||
Allows creation of struct pages to represent addressable device
|
||||
|
|
36
mm/gup.c
36
mm/gup.c
|
@ -1475,32 +1475,48 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
|
|||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Collect device pages mapped by a huge pmd.
 *
 * @orig is the pmd value the caller sampled and @pmdp points at the live
 * entry.  The pages for [@addr, @end) are gathered via __gup_device_huge()
 * starting from the pfn that corresponds to @addr inside the huge mapping.
 * Afterwards the live entry is compared with @orig; if it changed, the
 * entries added to @pages since entry are backed out with
 * undo_dev_pagemap() and the lookup fails.
 *
 * Returns 1 on success, 0 on failure.
 */
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	unsigned long fault_pfn;
	int nr_start = *nr;	/* rollback point if the pmd changed under us */

	fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
		return 0;

	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
		undo_dev_pagemap(nr, nr_start, pages);
		return 0;
	}
	return 1;
}
|
||||
|
||||
/*
 * Collect device pages mapped by a huge pud.
 *
 * @orig is the pud value the caller sampled and @pudp points at the live
 * entry.  The pages for [@addr, @end) are gathered via __gup_device_huge()
 * starting from the pfn that corresponds to @addr inside the huge mapping.
 * Afterwards the live entry is compared with @orig; if it changed, the
 * entries added to @pages since entry are backed out with
 * undo_dev_pagemap() and the lookup fails.
 *
 * Returns 1 on success, 0 on failure.
 */
static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	unsigned long fault_pfn;
	int nr_start = *nr;	/* rollback point if the pud changed under us */

	fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
		return 0;

	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
		undo_dev_pagemap(nr, nr_start, pages);
		return 0;
	}
	return 1;
}
|
||||
#else
|
||||
/*
 * Stub for the #else branch (the controlling #if is outside this view).
 * It must never be reached: BUILD_BUG() breaks the build if a call to it
 * survives.  The signature mirrors the real implementation so callers
 * compile unchanged.
 */
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	BUILD_BUG();
	return 0;
}
|
||||
|
||||
static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
|
||||
static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
|
||||
unsigned long end, struct page **pages, int *nr)
|
||||
{
|
||||
BUILD_BUG();
|
||||
|
@ -1518,7 +1534,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
|
|||
return 0;
|
||||
|
||||
if (pmd_devmap(orig))
|
||||
return __gup_device_huge_pmd(orig, addr, end, pages, nr);
|
||||
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
|
||||
|
||||
refs = 0;
|
||||
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
|
||||
|
@ -1556,7 +1572,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
|
|||
return 0;
|
||||
|
||||
if (pud_devmap(orig))
|
||||
return __gup_device_huge_pud(orig, addr, end, pages, nr);
|
||||
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
|
||||
|
||||
refs = 0;
|
||||
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
|
||||
|
|
13
mm/hmm.c
13
mm/hmm.c
|
@ -35,15 +35,6 @@
|
|||
|
||||
#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
|
||||
|
||||
#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
|
||||
/*
|
||||
* Device private memory see HMM (Documentation/vm/hmm.rst) or hmm.h
|
||||
*/
|
||||
DEFINE_STATIC_KEY_FALSE(device_private_key);
|
||||
EXPORT_SYMBOL(device_private_key);
|
||||
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
|
||||
|
||||
|
||||
#if IS_ENABLED(CONFIG_HMM_MIRROR)
|
||||
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
|
||||
|
||||
|
@ -1167,7 +1158,7 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
|
|||
resource_size_t addr;
|
||||
int ret;
|
||||
|
||||
static_branch_enable(&device_private_key);
|
||||
dev_pagemap_get_ops();
|
||||
|
||||
devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
|
||||
GFP_KERNEL, dev_to_node(device));
|
||||
|
@ -1261,7 +1252,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
|
|||
if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
static_branch_enable(&device_private_key);
|
||||
dev_pagemap_get_ops();
|
||||
|
||||
devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
|
||||
GFP_KERNEL, dev_to_node(device));
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include <linux/cpu.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/uio.h>
|
||||
|
@ -743,7 +744,7 @@ void release_pages(struct page **pages, int nr)
|
|||
flags);
|
||||
locked_pgdat = NULL;
|
||||
}
|
||||
put_zone_device_private_or_public_page(page);
|
||||
put_devmap_managed_page(page);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
|
@ -29,6 +29,8 @@
|
|||
#include "nfit_test.h"
|
||||
#include "../watermark.h"
|
||||
|
||||
#include <asm/mcsafe_test.h>
|
||||
|
||||
/*
|
||||
* Generate an NFIT table to describe the following topology:
|
||||
*
|
||||
|
@ -2681,6 +2683,107 @@ static struct platform_driver nfit_test_driver = {
|
|||
.id_table = nfit_test_id,
|
||||
};
|
||||
|
||||
/*
 * Page-sized, page-aligned scratch buffer for mcsafe_test(); the test
 * places its source and destination windows around offsets 1024 and 2048.
 */
static char mcsafe_buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
|
||||
|
||||
/*
 * Injection mode for one mcsafe_test() pass.  The values index the
 * description table in mcsafe_test(), so they must stay 0, 1, 2.
 */
enum INJECT {
	INJECT_NONE = 0,
	INJECT_SRC = 1,
	INJECT_DST = 2,
};
|
||||
|
||||
/*
 * Prepare one copy test: fill @dst with 0xff and write the byte pattern
 * 0, 1, 2, ... (the index truncated to char) into @src.
 */
static void mcsafe_test_init(char *dst, char *src, size_t size)
{
	char *out = src;
	size_t n;

	memset(dst, 0xff, size);

	for (n = 0; n != size; n++)
		*out++ = (char) n;
}
|
||||
|
||||
/*
 * Check the destination buffer after a copy that reported @rem bytes
 * left uncopied.
 *
 * @dst:  destination buffer, pre-filled with 0xff by mcsafe_test_init()
 * @src:  source buffer; not inspected here, the expected pattern is
 *        recomputed from the offset
 * @size: number of bytes the copy was asked to transfer
 * @rem:  number of trailing bytes expected to be untouched
 *
 * The first @size - @rem bytes must hold the 0, 1, 2, ... pattern and the
 * last @rem bytes must still be 0xff.  The first mismatch is logged once.
 *
 * Returns true when the buffer matches, false otherwise.  A @rem larger
 * than @size is rejected up front: size - rem would otherwise wrap and
 * the loops would run far past the end of @dst.
 */
static bool mcsafe_test_validate(unsigned char *dst, unsigned char *src,
		size_t size, unsigned long rem)
{
	size_t copied;
	size_t i;

	if (rem > size) {
		pr_info_once("%s:%d: rem: %lu exceeds size: %zu\n",
				__func__, __LINE__, rem, size);
		return false;
	}
	copied = size - rem;

	for (i = 0; i < copied; i++)
		if (dst[i] != (unsigned char) i) {
			pr_info_once("%s:%d: offset: %zu got: %#x expect: %#x\n",
					__func__, __LINE__, i, dst[i],
					(unsigned char) i);
			return false;
		}
	for (i = copied; i < size; i++)
		if (dst[i] != 0xffU) {
			pr_info_once("%s:%d: offset: %zu got: %#x expect: 0xff\n",
					__func__, __LINE__, i, dst[i]);
			return false;
		}
	return true;
}
|
||||
|
||||
void mcsafe_test(void)
|
||||
{
|
||||
char *inject_desc[] = { "none", "source", "destination" };
|
||||
enum INJECT inj;
|
||||
|
||||
if (IS_ENABLED(CONFIG_MCSAFE_TEST)) {
|
||||
pr_info("%s: run...\n", __func__);
|
||||
} else {
|
||||
pr_info("%s: disabled, skip.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
for (inj = INJECT_NONE; inj <= INJECT_DST; inj++) {
|
||||
int i;
|
||||
|
||||
pr_info("%s: inject: %s\n", __func__, inject_desc[inj]);
|
||||
for (i = 0; i < 512; i++) {
|
||||
unsigned long expect, rem;
|
||||
void *src, *dst;
|
||||
bool valid;
|
||||
|
||||
switch (inj) {
|
||||
case INJECT_NONE:
|
||||
mcsafe_inject_src(NULL);
|
||||
mcsafe_inject_dst(NULL);
|
||||
dst = &mcsafe_buf[2048];
|
||||
src = &mcsafe_buf[1024 - i];
|
||||
expect = 0;
|
||||
break;
|
||||
case INJECT_SRC:
|
||||
mcsafe_inject_src(&mcsafe_buf[1024]);
|
||||
mcsafe_inject_dst(NULL);
|
||||
dst = &mcsafe_buf[2048];
|
||||
src = &mcsafe_buf[1024 - i];
|
||||
expect = 512 - i;
|
||||
break;
|
||||
case INJECT_DST:
|
||||
mcsafe_inject_src(NULL);
|
||||
mcsafe_inject_dst(&mcsafe_buf[2048]);
|
||||
dst = &mcsafe_buf[2048 - i];
|
||||
src = &mcsafe_buf[1024];
|
||||
expect = 512 - i;
|
||||
break;
|
||||
}
|
||||
|
||||
mcsafe_test_init(dst, src, 512);
|
||||
rem = __memcpy_mcsafe(dst, src, 512);
|
||||
valid = mcsafe_test_validate(dst, src, 512, expect);
|
||||
if (rem == expect && valid)
|
||||
continue;
|
||||
pr_info("%s: copy(%#lx, %#lx, %d) off: %d rem: %ld %s expect: %ld\n",
|
||||
__func__,
|
||||
((unsigned long) dst) & ~PAGE_MASK,
|
||||
((unsigned long ) src) & ~PAGE_MASK,
|
||||
512, i, rem, valid ? "valid" : "bad",
|
||||
expect);
|
||||
}
|
||||
}
|
||||
|
||||
mcsafe_inject_src(NULL);
|
||||
mcsafe_inject_dst(NULL);
|
||||
}
|
||||
|
||||
static __init int nfit_test_init(void)
|
||||
{
|
||||
int rc, i;
|
||||
|
@ -2689,6 +2792,7 @@ static __init int nfit_test_init(void)
|
|||
libnvdimm_test();
|
||||
acpi_nfit_test();
|
||||
device_dax_test();
|
||||
mcsafe_test();
|
||||
|
||||
nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue