mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-23 08:35:19 -05:00
dma-mapping updates for 5.11:
- support for a partial IOMMU bypass (Alexey Kardashevskiy) - add a DMA API benchmark (Barry Song) - misc fixes (Tiezhu Yang, tangjianqiang) -----BEGIN PGP SIGNATURE----- iQI/BAABCgApFiEEgdbnc3r/njty3Iq9D55TZVIEUYMFAl/iF+wLHGhjaEBsc3Qu ZGUACgkQD55TZVIEUYP/HQ//beE+HGi0+5yiWdLY/Q3nqT/VExgdY2CAE2en0jcs kpUEZPfhE2dlKPf9nBl+ZsLIgqIwVP+oSawxZ65r0z/w95vgWANAmBg2m/FcnEUx Vl6jUUPmViY0AGzMR1+55voNNor4aX9rLUdZEN+vBy62Z6fvQVmKOtVPUf0ekkUh n3factrKIt7mplsIgJPO3v6G2XHugBaNmcrp9LkkmhniYkH8S31l4uKCSKI+6atT CsTTVqpQ+qVAyrgp30Xs+N9QoOefI4tFdXmvXzIIFe3JyDvIpniaiT+HHVXuEQJR 5Yukj9sKJXS1ipjc+yWsjvn4Wax/tubJ7eqIo9alsIkcJXI3AlKljwBtFrKPMq+L NK54N3N31A+/hrxOLQyfqz5vH6cUicFr7STIedb8pD3y3/f/tiP+/qown6Wj3rB/ E0IlCK5w87OVrJT1f5fHxh9Xe4R2W9rpeU/v/AZ/DrQllwp3PbG0kWqu3m4mujih CF3D+rOoB5tNtBxrNU+TpOVm+OOdX1IDtBleKePHTTIQeBY3p3UZuwINO0axFr9p oQndj1fiYvRwI6GS31GPpduFG7PrrwCb5zD0Rm07+aNCIoSmu+ADEngfgxnj/rGz q+c9Te1dDn58OktkiEONPNu+iuGBxhBR7AeP6UDdcC1vkO/hvW2HnppSY0qiWthU IFY= =dOug -----END PGP SIGNATURE----- Merge tag 'dma-mapping-5.11' of git://git.infradead.org/users/hch/dma-mapping Pull dma-mapping updates from Christoph Hellwig: - support for a partial IOMMU bypass (Alexey Kardashevskiy) - add a DMA API benchmark (Barry Song) - misc fixes (Tiezhu Yang, tangjianqiang) * tag 'dma-mapping-5.11' of git://git.infradead.org/users/hch/dma-mapping: selftests/dma: add test application for DMA_MAP_BENCHMARK dma-mapping: add benchmark support for streaming DMA APIs dma-contiguous: fix a typo error in a comment dma-pool: no need to check return value of debugfs_create functions powerpc/dma: Fallback to dma_ops when persistent memory present dma-mapping: Allow mixing bypass and mapped DMA operation
This commit is contained in:
commit
347d81b68b
14 changed files with 645 additions and 20 deletions
|
@ -5297,6 +5297,12 @@ F: include/linux/dma-mapping.h
|
|||
F: include/linux/dma-map-ops.h
|
||||
F: kernel/dma/
|
||||
|
||||
DMA MAPPING BENCHMARK
|
||||
M: Barry Song <song.bao.hua@hisilicon.com>
|
||||
L: iommu@lists.linux-foundation.org
|
||||
F: kernel/dma/map_benchmark.c
|
||||
F: tools/testing/selftests/dma/
|
||||
|
||||
DMA-BUF HEAPS FRAMEWORK
|
||||
M: Sumit Semwal <sumit.semwal@linaro.org>
|
||||
R: Benjamin Gaignard <benjamin.gaignard@linaro.org>
|
||||
|
|
|
@ -161,6 +161,7 @@ config PPC
|
|||
select DCACHE_WORD_ACCESS if PPC64 && CPU_LITTLE_ENDIAN
|
||||
select DMA_OPS if PPC64
|
||||
select DMA_OPS_BYPASS if PPC64
|
||||
select ARCH_HAS_DMA_MAP_DIRECT if PPC64 && PPC_PSERIES
|
||||
select DYNAMIC_FTRACE if FUNCTION_TRACER
|
||||
select EDAC_ATOMIC_SCRUB
|
||||
select EDAC_SUPPORT
|
||||
|
|
|
@ -10,6 +10,63 @@
|
|||
#include <linux/pci.h>
|
||||
#include <asm/iommu.h>
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT
|
||||
#define can_map_direct(dev, addr) \
|
||||
((dev)->bus_dma_limit >= phys_to_dma((dev), (addr)))
|
||||
|
||||
bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr)
|
||||
{
|
||||
if (likely(!dev->bus_dma_limit))
|
||||
return false;
|
||||
|
||||
return can_map_direct(dev, addr);
|
||||
}
|
||||
|
||||
#define is_direct_handle(dev, h) ((h) >= (dev)->archdata.dma_offset)
|
||||
|
||||
bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle)
|
||||
{
|
||||
if (likely(!dev->bus_dma_limit))
|
||||
return false;
|
||||
|
||||
return is_direct_handle(dev, dma_handle);
|
||||
}
|
||||
|
||||
bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg,
|
||||
int nents)
|
||||
{
|
||||
struct scatterlist *s;
|
||||
int i;
|
||||
|
||||
if (likely(!dev->bus_dma_limit))
|
||||
return false;
|
||||
|
||||
for_each_sg(sg, s, nents, i) {
|
||||
if (!can_map_direct(dev, sg_phys(s) + s->offset + s->length))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg,
|
||||
int nents)
|
||||
{
|
||||
struct scatterlist *s;
|
||||
int i;
|
||||
|
||||
if (likely(!dev->bus_dma_limit))
|
||||
return false;
|
||||
|
||||
for_each_sg(sg, s, nents, i) {
|
||||
if (!is_direct_handle(dev, s->dma_address + s->length))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif /* CONFIG_ARCH_HAS_DMA_MAP_DIRECT */
|
||||
|
||||
/*
|
||||
* Generic iommu implementation
|
||||
*/
|
||||
|
@ -90,8 +147,18 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
|
|||
struct iommu_table *tbl = get_iommu_table_base(dev);
|
||||
|
||||
if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
|
||||
dev->dma_ops_bypass = true;
|
||||
dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
|
||||
/*
|
||||
* dma_iommu_bypass_supported() sets dma_max when there is
|
||||
* 1:1 mapping but it is somehow limited.
|
||||
* ibm,pmemory is one example.
|
||||
*/
|
||||
dev->dma_ops_bypass = dev->bus_dma_limit == 0;
|
||||
if (!dev->dma_ops_bypass)
|
||||
dev_warn(dev,
|
||||
"iommu: 64-bit OK but direct DMA is limited by %llx\n",
|
||||
dev->bus_dma_limit);
|
||||
else
|
||||
dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -839,7 +839,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
|
|||
np, ret);
|
||||
}
|
||||
|
||||
static u64 find_existing_ddw(struct device_node *pdn)
|
||||
static u64 find_existing_ddw(struct device_node *pdn, int *window_shift)
|
||||
{
|
||||
struct direct_window *window;
|
||||
const struct dynamic_dma_window_prop *direct64;
|
||||
|
@ -851,6 +851,7 @@ static u64 find_existing_ddw(struct device_node *pdn)
|
|||
if (window->device == pdn) {
|
||||
direct64 = window->prop;
|
||||
dma_addr = be64_to_cpu(direct64->dma_base);
|
||||
*window_shift = be32_to_cpu(direct64->window_shift);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -1111,11 +1112,12 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
|
|||
*/
|
||||
static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
|
||||
{
|
||||
int len, ret;
|
||||
int len = 0, ret;
|
||||
int max_ram_len = order_base_2(ddw_memory_hotplug_max());
|
||||
struct ddw_query_response query;
|
||||
struct ddw_create_response create;
|
||||
int page_shift;
|
||||
u64 dma_addr, max_addr;
|
||||
u64 dma_addr;
|
||||
struct device_node *dn;
|
||||
u32 ddw_avail[DDW_APPLICABLE_SIZE];
|
||||
struct direct_window *window;
|
||||
|
@ -1123,10 +1125,15 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
|
|||
struct dynamic_dma_window_prop *ddwprop;
|
||||
struct failed_ddw_pdn *fpdn;
|
||||
bool default_win_removed = false;
|
||||
bool pmem_present;
|
||||
|
||||
dn = of_find_node_by_type(NULL, "ibm,pmemory");
|
||||
pmem_present = dn != NULL;
|
||||
of_node_put(dn);
|
||||
|
||||
mutex_lock(&direct_window_init_mutex);
|
||||
|
||||
dma_addr = find_existing_ddw(pdn);
|
||||
dma_addr = find_existing_ddw(pdn, &len);
|
||||
if (dma_addr != 0)
|
||||
goto out_unlock;
|
||||
|
||||
|
@ -1212,14 +1219,29 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
|
|||
}
|
||||
/* verify the window * number of ptes will map the partition */
|
||||
/* check largest block * page size > max memory hotplug addr */
|
||||
max_addr = ddw_memory_hotplug_max();
|
||||
if (query.largest_available_block < (max_addr >> page_shift)) {
|
||||
dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
|
||||
"%llu-sized pages\n", max_addr, query.largest_available_block,
|
||||
1ULL << page_shift);
|
||||
/*
|
||||
* The "ibm,pmemory" can appear anywhere in the address space.
|
||||
* Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
|
||||
* for the upper limit and fallback to max RAM otherwise but this
|
||||
* disables device::dma_ops_bypass.
|
||||
*/
|
||||
len = max_ram_len;
|
||||
if (pmem_present) {
|
||||
if (query.largest_available_block >=
|
||||
(1ULL << (MAX_PHYSMEM_BITS - page_shift)))
|
||||
len = MAX_PHYSMEM_BITS - page_shift;
|
||||
else
|
||||
dev_info(&dev->dev, "Skipping ibm,pmemory");
|
||||
}
|
||||
|
||||
if (query.largest_available_block < (1ULL << (len - page_shift))) {
|
||||
dev_dbg(&dev->dev,
|
||||
"can't map partition max 0x%llx with %llu %llu-sized pages\n",
|
||||
1ULL << len,
|
||||
query.largest_available_block,
|
||||
1ULL << page_shift);
|
||||
goto out_failed;
|
||||
}
|
||||
len = order_base_2(max_addr);
|
||||
win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
|
||||
if (!win64) {
|
||||
dev_info(&dev->dev,
|
||||
|
@ -1299,6 +1321,15 @@ out_failed:
|
|||
|
||||
out_unlock:
|
||||
mutex_unlock(&direct_window_init_mutex);
|
||||
|
||||
/*
|
||||
* If we have persistent memory and the window size is only as big
|
||||
* as RAM, then we failed to create a window to cover persistent
|
||||
* memory and need to set the DMA limit.
|
||||
*/
|
||||
if (pmem_present && dma_addr && (len == max_ram_len))
|
||||
dev->dev.bus_dma_limit = dma_addr + (1ULL << len);
|
||||
|
||||
return dma_addr;
|
||||
}
|
||||
|
||||
|
|
|
@ -317,6 +317,20 @@ static inline void arch_dma_mark_clean(phys_addr_t paddr, size_t size)
|
|||
void *arch_dma_set_uncached(void *addr, size_t size);
|
||||
void arch_dma_clear_uncached(void *addr, size_t size);
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT
|
||||
bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr);
|
||||
bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle);
|
||||
bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg,
|
||||
int nents);
|
||||
bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg,
|
||||
int nents);
|
||||
#else
|
||||
#define arch_dma_map_page_direct(d, a) (false)
|
||||
#define arch_dma_unmap_page_direct(d, a) (false)
|
||||
#define arch_dma_map_sg_direct(d, s, n) (false)
|
||||
#define arch_dma_unmap_sg_direct(d, s, n) (false)
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
|
||||
void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
|
||||
const struct iommu_ops *iommu, bool coherent);
|
||||
|
|
|
@ -20,6 +20,10 @@ config DMA_OPS
|
|||
config DMA_OPS_BYPASS
|
||||
bool
|
||||
|
||||
# Lets platform IOMMU driver choose between bypass and IOMMU
|
||||
config ARCH_HAS_DMA_MAP_DIRECT
|
||||
bool
|
||||
|
||||
config NEED_SG_DMA_LENGTH
|
||||
bool
|
||||
|
||||
|
@ -220,3 +224,12 @@ config DMA_API_DEBUG_SG
|
|||
is technically out-of-spec.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config DMA_MAP_BENCHMARK
|
||||
bool "Enable benchmarking of streaming DMA mapping"
|
||||
depends on DEBUG_FS
|
||||
help
|
||||
Provides /sys/kernel/debug/dma_map_benchmark that helps with testing
|
||||
performance of dma_(un)map_page.
|
||||
|
||||
See tools/testing/selftests/dma/dma_map_benchmark.c
|
||||
|
|
|
@ -9,3 +9,4 @@ obj-$(CONFIG_DMA_API_DEBUG) += debug.o
|
|||
obj-$(CONFIG_SWIOTLB) += swiotlb.o
|
||||
obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
|
||||
obj-$(CONFIG_DMA_REMAP) += remap.o
|
||||
obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
* coders, etc.
|
||||
*
|
||||
* Such devices often require big memory buffers (a full HD frame
|
||||
* is, for instance, more then 2 mega pixels large, i.e. more than 6
|
||||
* is, for instance, more than 2 mega pixels large, i.e. more than 6
|
||||
* MB of memory), which makes mechanisms such as kmalloc() or
|
||||
* alloc_page() ineffective.
|
||||
*
|
||||
|
|
361
kernel/dma/map_benchmark.c
Normal file
361
kernel/dma/map_benchmark.c
Normal file
|
@ -0,0 +1,361 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Copyright (C) 2020 Hisilicon Limited.
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/math64.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/platform_device.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/timekeeping.h>
|
||||
|
||||
#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
|
||||
#define DMA_MAP_MAX_THREADS 1024
|
||||
#define DMA_MAP_MAX_SECONDS 300
|
||||
|
||||
#define DMA_MAP_BIDIRECTIONAL 0
|
||||
#define DMA_MAP_TO_DEVICE 1
|
||||
#define DMA_MAP_FROM_DEVICE 2
|
||||
|
||||
struct map_benchmark {
|
||||
__u64 avg_map_100ns; /* average map latency in 100ns */
|
||||
__u64 map_stddev; /* standard deviation of map latency */
|
||||
__u64 avg_unmap_100ns; /* as above */
|
||||
__u64 unmap_stddev;
|
||||
__u32 threads; /* how many threads will do map/unmap in parallel */
|
||||
__u32 seconds; /* how long the test will last */
|
||||
__s32 node; /* which numa node this benchmark will run on */
|
||||
__u32 dma_bits; /* DMA addressing capability */
|
||||
__u32 dma_dir; /* DMA data direction */
|
||||
__u64 expansion[10]; /* For future use */
|
||||
};
|
||||
|
||||
struct map_benchmark_data {
|
||||
struct map_benchmark bparam;
|
||||
struct device *dev;
|
||||
struct dentry *debugfs;
|
||||
enum dma_data_direction dir;
|
||||
atomic64_t sum_map_100ns;
|
||||
atomic64_t sum_unmap_100ns;
|
||||
atomic64_t sum_sq_map;
|
||||
atomic64_t sum_sq_unmap;
|
||||
atomic64_t loops;
|
||||
};
|
||||
|
||||
static int map_benchmark_thread(void *data)
|
||||
{
|
||||
void *buf;
|
||||
dma_addr_t dma_addr;
|
||||
struct map_benchmark_data *map = data;
|
||||
int ret = 0;
|
||||
|
||||
buf = (void *)__get_free_page(GFP_KERNEL);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
u64 map_100ns, unmap_100ns, map_sq, unmap_sq;
|
||||
ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
|
||||
ktime_t map_delta, unmap_delta;
|
||||
|
||||
/*
|
||||
* for a non-coherent device, if we don't stain them in the
|
||||
* cache, this will give an underestimate of the real-world
|
||||
* overhead of BIDIRECTIONAL or TO_DEVICE mappings;
|
||||
* 66 means evertything goes well! 66 is lucky.
|
||||
*/
|
||||
if (map->dir != DMA_FROM_DEVICE)
|
||||
memset(buf, 0x66, PAGE_SIZE);
|
||||
|
||||
map_stime = ktime_get();
|
||||
dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir);
|
||||
if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
|
||||
pr_err("dma_map_single failed on %s\n",
|
||||
dev_name(map->dev));
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
map_etime = ktime_get();
|
||||
map_delta = ktime_sub(map_etime, map_stime);
|
||||
|
||||
unmap_stime = ktime_get();
|
||||
dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
|
||||
unmap_etime = ktime_get();
|
||||
unmap_delta = ktime_sub(unmap_etime, unmap_stime);
|
||||
|
||||
/* calculate sum and sum of squares */
|
||||
|
||||
map_100ns = div64_ul(map_delta, 100);
|
||||
unmap_100ns = div64_ul(unmap_delta, 100);
|
||||
map_sq = map_100ns * map_100ns;
|
||||
unmap_sq = unmap_100ns * unmap_100ns;
|
||||
|
||||
atomic64_add(map_100ns, &map->sum_map_100ns);
|
||||
atomic64_add(unmap_100ns, &map->sum_unmap_100ns);
|
||||
atomic64_add(map_sq, &map->sum_sq_map);
|
||||
atomic64_add(unmap_sq, &map->sum_sq_unmap);
|
||||
atomic64_inc(&map->loops);
|
||||
}
|
||||
|
||||
out:
|
||||
free_page((unsigned long)buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int do_map_benchmark(struct map_benchmark_data *map)
|
||||
{
|
||||
struct task_struct **tsk;
|
||||
int threads = map->bparam.threads;
|
||||
int node = map->bparam.node;
|
||||
const cpumask_t *cpu_mask = cpumask_of_node(node);
|
||||
u64 loops;
|
||||
int ret = 0;
|
||||
int i;
|
||||
|
||||
tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL);
|
||||
if (!tsk)
|
||||
return -ENOMEM;
|
||||
|
||||
get_device(map->dev);
|
||||
|
||||
for (i = 0; i < threads; i++) {
|
||||
tsk[i] = kthread_create_on_node(map_benchmark_thread, map,
|
||||
map->bparam.node, "dma-map-benchmark/%d", i);
|
||||
if (IS_ERR(tsk[i])) {
|
||||
pr_err("create dma_map thread failed\n");
|
||||
ret = PTR_ERR(tsk[i]);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (node != NUMA_NO_NODE)
|
||||
kthread_bind_mask(tsk[i], cpu_mask);
|
||||
}
|
||||
|
||||
/* clear the old value in the previous benchmark */
|
||||
atomic64_set(&map->sum_map_100ns, 0);
|
||||
atomic64_set(&map->sum_unmap_100ns, 0);
|
||||
atomic64_set(&map->sum_sq_map, 0);
|
||||
atomic64_set(&map->sum_sq_unmap, 0);
|
||||
atomic64_set(&map->loops, 0);
|
||||
|
||||
for (i = 0; i < threads; i++)
|
||||
wake_up_process(tsk[i]);
|
||||
|
||||
msleep_interruptible(map->bparam.seconds * 1000);
|
||||
|
||||
/* wait for the completion of benchmark threads */
|
||||
for (i = 0; i < threads; i++) {
|
||||
ret = kthread_stop(tsk[i]);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
loops = atomic64_read(&map->loops);
|
||||
if (likely(loops > 0)) {
|
||||
u64 map_variance, unmap_variance;
|
||||
u64 sum_map = atomic64_read(&map->sum_map_100ns);
|
||||
u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns);
|
||||
u64 sum_sq_map = atomic64_read(&map->sum_sq_map);
|
||||
u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap);
|
||||
|
||||
/* average latency */
|
||||
map->bparam.avg_map_100ns = div64_u64(sum_map, loops);
|
||||
map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops);
|
||||
|
||||
/* standard deviation of latency */
|
||||
map_variance = div64_u64(sum_sq_map, loops) -
|
||||
map->bparam.avg_map_100ns *
|
||||
map->bparam.avg_map_100ns;
|
||||
unmap_variance = div64_u64(sum_sq_unmap, loops) -
|
||||
map->bparam.avg_unmap_100ns *
|
||||
map->bparam.avg_unmap_100ns;
|
||||
map->bparam.map_stddev = int_sqrt64(map_variance);
|
||||
map->bparam.unmap_stddev = int_sqrt64(unmap_variance);
|
||||
}
|
||||
|
||||
out:
|
||||
put_device(map->dev);
|
||||
kfree(tsk);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
|
||||
unsigned long arg)
|
||||
{
|
||||
struct map_benchmark_data *map = file->private_data;
|
||||
void __user *argp = (void __user *)arg;
|
||||
u64 old_dma_mask;
|
||||
|
||||
int ret;
|
||||
|
||||
if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
|
||||
return -EFAULT;
|
||||
|
||||
switch (cmd) {
|
||||
case DMA_MAP_BENCHMARK:
|
||||
if (map->bparam.threads == 0 ||
|
||||
map->bparam.threads > DMA_MAP_MAX_THREADS) {
|
||||
pr_err("invalid thread number\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (map->bparam.seconds == 0 ||
|
||||
map->bparam.seconds > DMA_MAP_MAX_SECONDS) {
|
||||
pr_err("invalid duration seconds\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (map->bparam.node != NUMA_NO_NODE &&
|
||||
!node_possible(map->bparam.node)) {
|
||||
pr_err("invalid numa node\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
switch (map->bparam.dma_dir) {
|
||||
case DMA_MAP_BIDIRECTIONAL:
|
||||
map->dir = DMA_BIDIRECTIONAL;
|
||||
break;
|
||||
case DMA_MAP_FROM_DEVICE:
|
||||
map->dir = DMA_FROM_DEVICE;
|
||||
break;
|
||||
case DMA_MAP_TO_DEVICE:
|
||||
map->dir = DMA_TO_DEVICE;
|
||||
break;
|
||||
default:
|
||||
pr_err("invalid DMA direction\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
old_dma_mask = dma_get_mask(map->dev);
|
||||
|
||||
ret = dma_set_mask(map->dev,
|
||||
DMA_BIT_MASK(map->bparam.dma_bits));
|
||||
if (ret) {
|
||||
pr_err("failed to set dma_mask on device %s\n",
|
||||
dev_name(map->dev));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = do_map_benchmark(map);
|
||||
|
||||
/*
|
||||
* restore the original dma_mask as many devices' dma_mask are
|
||||
* set by architectures, acpi, busses. When we bind them back
|
||||
* to their original drivers, those drivers shouldn't see
|
||||
* dma_mask changed by benchmark
|
||||
*/
|
||||
dma_set_mask(map->dev, old_dma_mask);
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (copy_to_user(argp, &map->bparam, sizeof(map->bparam)))
|
||||
return -EFAULT;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct file_operations map_benchmark_fops = {
|
||||
.open = simple_open,
|
||||
.unlocked_ioctl = map_benchmark_ioctl,
|
||||
};
|
||||
|
||||
static void map_benchmark_remove_debugfs(void *data)
|
||||
{
|
||||
struct map_benchmark_data *map = (struct map_benchmark_data *)data;
|
||||
|
||||
debugfs_remove(map->debugfs);
|
||||
}
|
||||
|
||||
static int __map_benchmark_probe(struct device *dev)
|
||||
{
|
||||
struct dentry *entry;
|
||||
struct map_benchmark_data *map;
|
||||
int ret;
|
||||
|
||||
map = devm_kzalloc(dev, sizeof(*map), GFP_KERNEL);
|
||||
if (!map)
|
||||
return -ENOMEM;
|
||||
map->dev = dev;
|
||||
|
||||
ret = devm_add_action(dev, map_benchmark_remove_debugfs, map);
|
||||
if (ret) {
|
||||
pr_err("Can't add debugfs remove action\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* we only permit a device bound with this driver, 2nd probe
|
||||
* will fail
|
||||
*/
|
||||
entry = debugfs_create_file("dma_map_benchmark", 0600, NULL, map,
|
||||
&map_benchmark_fops);
|
||||
if (IS_ERR(entry))
|
||||
return PTR_ERR(entry);
|
||||
map->debugfs = entry;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int map_benchmark_platform_probe(struct platform_device *pdev)
|
||||
{
|
||||
return __map_benchmark_probe(&pdev->dev);
|
||||
}
|
||||
|
||||
static struct platform_driver map_benchmark_platform_driver = {
|
||||
.driver = {
|
||||
.name = "dma_map_benchmark",
|
||||
},
|
||||
.probe = map_benchmark_platform_probe,
|
||||
};
|
||||
|
||||
static int
|
||||
map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
|
||||
{
|
||||
return __map_benchmark_probe(&pdev->dev);
|
||||
}
|
||||
|
||||
static struct pci_driver map_benchmark_pci_driver = {
|
||||
.name = "dma_map_benchmark",
|
||||
.probe = map_benchmark_pci_probe,
|
||||
};
|
||||
|
||||
/*
 * Register the PCI driver, then the platform driver.  If the platform
 * driver fails to register, the PCI driver is unregistered again so that
 * initialisation is all-or-nothing.  Returns 0 or the registration error.
 */
static int __init map_benchmark_init(void)
{
	int err = pci_register_driver(&map_benchmark_pci_driver);

	if (err)
		return err;

	err = platform_driver_register(&map_benchmark_platform_driver);
	if (err)
		pci_unregister_driver(&map_benchmark_pci_driver);

	return err;
}
|
||||
|
||||
/* Unregister both drivers, in the reverse order of map_benchmark_init(). */
static void __exit map_benchmark_cleanup(void)
{
	/* platform driver was registered last, so it goes first */
	platform_driver_unregister(&map_benchmark_platform_driver);

	pci_unregister_driver(&map_benchmark_pci_driver);
}
|
||||
|
||||
module_init(map_benchmark_init);
|
||||
module_exit(map_benchmark_cleanup);
|
||||
|
||||
MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
|
||||
MODULE_DESCRIPTION("dma_map benchmark driver");
|
||||
MODULE_LICENSE("GPL");
|
|
@ -149,7 +149,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
|
|||
if (WARN_ON_ONCE(!dev->dma_mask))
|
||||
return DMA_MAPPING_ERROR;
|
||||
|
||||
if (dma_map_direct(dev, ops))
|
||||
if (dma_map_direct(dev, ops) ||
|
||||
arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
|
||||
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
|
||||
else
|
||||
addr = ops->map_page(dev, page, offset, size, dir, attrs);
|
||||
|
@ -165,7 +166,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
|
|||
const struct dma_map_ops *ops = get_dma_ops(dev);
|
||||
|
||||
BUG_ON(!valid_dma_direction(dir));
|
||||
if (dma_map_direct(dev, ops))
|
||||
if (dma_map_direct(dev, ops) ||
|
||||
arch_dma_unmap_page_direct(dev, addr + size))
|
||||
dma_direct_unmap_page(dev, addr, size, dir, attrs);
|
||||
else if (ops->unmap_page)
|
||||
ops->unmap_page(dev, addr, size, dir, attrs);
|
||||
|
@ -188,7 +190,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
|
|||
if (WARN_ON_ONCE(!dev->dma_mask))
|
||||
return 0;
|
||||
|
||||
if (dma_map_direct(dev, ops))
|
||||
if (dma_map_direct(dev, ops) ||
|
||||
arch_dma_map_sg_direct(dev, sg, nents))
|
||||
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
|
||||
else
|
||||
ents = ops->map_sg(dev, sg, nents, dir, attrs);
|
||||
|
@ -207,7 +210,8 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
|
|||
|
||||
BUG_ON(!valid_dma_direction(dir));
|
||||
debug_dma_unmap_sg(dev, sg, nents, dir);
|
||||
if (dma_map_direct(dev, ops))
|
||||
if (dma_map_direct(dev, ops) ||
|
||||
arch_dma_unmap_sg_direct(dev, sg, nents))
|
||||
dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
|
||||
else if (ops->unmap_sg)
|
||||
ops->unmap_sg(dev, sg, nents, dir, attrs);
|
||||
|
|
|
@ -38,9 +38,6 @@ static void __init dma_atomic_pool_debugfs_init(void)
|
|||
struct dentry *root;
|
||||
|
||||
root = debugfs_create_dir("dma_pools", NULL);
|
||||
if (IS_ERR_OR_NULL(root))
|
||||
return;
|
||||
|
||||
debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma);
|
||||
debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32);
|
||||
debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel);
|
||||
|
|
6
tools/testing/selftests/dma/Makefile
Normal file
6
tools/testing/selftests/dma/Makefile
Normal file
|
@ -0,0 +1,6 @@
|
|||
# SPDX-License-Identifier: GPL-2.0
|
||||
CFLAGS += -I../../../../usr/include/
|
||||
|
||||
TEST_GEN_PROGS := dma_map_benchmark
|
||||
|
||||
include ../lib.mk
|
1
tools/testing/selftests/dma/config
Normal file
1
tools/testing/selftests/dma/config
Normal file
|
@ -0,0 +1 @@
|
|||
CONFIG_DMA_MAP_BENCHMARK=y
|
123
tools/testing/selftests/dma/dma_map_benchmark.c
Normal file
123
tools/testing/selftests/dma/dma_map_benchmark.c
Normal file
|
@ -0,0 +1,123 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Copyright (C) 2020 Hisilicon Limited.
|
||||
*/
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
/* ioctl request understood by /sys/kernel/debug/dma_map_benchmark */
#define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)

/* upper bounds accepted for the thread count and the test duration */
#define DMA_MAP_MAX_THREADS	1024
#define DMA_MAP_MAX_SECONDS	300

/* values accepted for map_benchmark::dma_dir */
#define DMA_MAP_BIDIRECTIONAL	0
#define DMA_MAP_TO_DEVICE	1
#define DMA_MAP_FROM_DEVICE	2

/*
 * Printable names of the DMA directions, indexed by the DMA_MAP_* value.
 * The entries are string literals and the table is never modified, so
 * both the strings and the array are const.
 */
static const char * const directions[] = {
	"BIDIRECTIONAL",
	"TO_DEVICE",
	"FROM_DEVICE",
};
|
||||
|
||||
struct map_benchmark {
|
||||
__u64 avg_map_100ns; /* average map latency in 100ns */
|
||||
__u64 map_stddev; /* standard deviation of map latency */
|
||||
__u64 avg_unmap_100ns; /* as above */
|
||||
__u64 unmap_stddev;
|
||||
__u32 threads; /* how many threads will do map/unmap in parallel */
|
||||
__u32 seconds; /* how long the test will last */
|
||||
__s32 node; /* which numa node this benchmark will run on */
|
||||
__u32 dma_bits; /* DMA addressing capability */
|
||||
__u32 dma_dir; /* DMA data direction */
|
||||
__u64 expansion[10]; /* For future use */
|
||||
};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct map_benchmark map;
|
||||
int fd, opt;
|
||||
/* default single thread, run 20 seconds on NUMA_NO_NODE */
|
||||
int threads = 1, seconds = 20, node = -1;
|
||||
/* default dma mask 32bit, bidirectional DMA */
|
||||
int bits = 32, dir = DMA_MAP_BIDIRECTIONAL;
|
||||
|
||||
int cmd = DMA_MAP_BENCHMARK;
|
||||
char *p;
|
||||
|
||||
while ((opt = getopt(argc, argv, "t:s:n:b:d:")) != -1) {
|
||||
switch (opt) {
|
||||
case 't':
|
||||
threads = atoi(optarg);
|
||||
break;
|
||||
case 's':
|
||||
seconds = atoi(optarg);
|
||||
break;
|
||||
case 'n':
|
||||
node = atoi(optarg);
|
||||
break;
|
||||
case 'b':
|
||||
bits = atoi(optarg);
|
||||
break;
|
||||
case 'd':
|
||||
dir = atoi(optarg);
|
||||
break;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) {
|
||||
fprintf(stderr, "invalid number of threads, must be in 1-%d\n",
|
||||
DMA_MAP_MAX_THREADS);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (seconds <= 0 || seconds > DMA_MAP_MAX_SECONDS) {
|
||||
fprintf(stderr, "invalid number of seconds, must be in 1-%d\n",
|
||||
DMA_MAP_MAX_SECONDS);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* suppose the mininum DMA zone is 1MB in the world */
|
||||
if (bits < 20 || bits > 64) {
|
||||
fprintf(stderr, "invalid dma mask bit, must be in 20-64\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (dir != DMA_MAP_BIDIRECTIONAL && dir != DMA_MAP_TO_DEVICE &&
|
||||
dir != DMA_MAP_FROM_DEVICE) {
|
||||
fprintf(stderr, "invalid dma direction\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR);
|
||||
if (fd == -1) {
|
||||
perror("open");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
map.seconds = seconds;
|
||||
map.threads = threads;
|
||||
map.node = node;
|
||||
map.dma_bits = bits;
|
||||
map.dma_dir = dir;
|
||||
if (ioctl(fd, cmd, &map)) {
|
||||
perror("ioctl");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s\n",
|
||||
threads, seconds, node, dir[directions]);
|
||||
printf("average map latency(us):%.1f standard deviation:%.1f\n",
|
||||
map.avg_map_100ns/10.0, map.map_stddev/10.0);
|
||||
printf("average unmap latency(us):%.1f standard deviation:%.1f\n",
|
||||
map.avg_unmap_100ns/10.0, map.unmap_stddev/10.0);
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Add table
Reference in a new issue