mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-24 01:09:38 -05:00
md/raid5: activate raid6 rmw feature
Glue it all together. The raid6 rmw path should work the same as the already existing raid5 logic. So emulate the prexor handling/flags and split functions as needed.

1) Enable xor_syndrome() in the async layer.
2) Split ops_run_prexor() into RAID4/5 and RAID6 logic. Xor the syndrome at the start of a rmw run as we did before for the single parity.
3) Take care of the rmw run in ops_run_reconstruct6(). Again process only the changed pages to get the syndrome back into sync.
4) Enhance set_syndrome_sources() to fill NULL pages if we are in a rmw run. The lower layers will calculate start & end pages from that and call xor_syndrome() correspondingly.
5) Adapt the several places where we ignored Q handling up to now.

Performance numbers for a single E5630 system with a mix of 10 7200k desktop/server disks. 300 seconds random write with 8 threads onto a 3.2TB (10*400GB) RAID6 64K chunk without spare (group_thread_cnt=4)

bsize   rmw_level=1   rmw_level=0   rmw_level=1   rmw_level=0
        skip_copy=1   skip_copy=1   skip_copy=0   skip_copy=0
4K         115 KB/s      141 KB/s      165 KB/s      140 KB/s
8K         225 KB/s      275 KB/s      324 KB/s      274 KB/s
16K        434 KB/s      536 KB/s      640 KB/s      534 KB/s
32K        751 KB/s    1,051 KB/s    1,234 KB/s    1,045 KB/s
64K      1,339 KB/s    1,958 KB/s    2,282 KB/s    1,962 KB/s
128K     2,673 KB/s    3,862 KB/s    4,113 KB/s    3,898 KB/s
256K     7,685 KB/s    7,539 KB/s    7,557 KB/s    7,638 KB/s
512K    19,556 KB/s   19,558 KB/s   19,652 KB/s   19,688 KB/s

Signed-off-by: Markus Stockhausen <stockhausen@collogia.de>
Signed-off-by: NeilBrown <neilb@suse.de>
This commit is contained in:
parent
a582564b24
commit
584acdd49c
4 changed files with 115 additions and 30 deletions
|
@ -124,6 +124,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
|
|||
{
|
||||
void **srcs;
|
||||
int i;
|
||||
int start = -1, stop = disks - 3;
|
||||
|
||||
if (submit->scribble)
|
||||
srcs = submit->scribble;
|
||||
|
@ -134,10 +135,21 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
|
|||
if (blocks[i] == NULL) {
|
||||
BUG_ON(i > disks - 3); /* P or Q can't be zero */
|
||||
srcs[i] = (void*)raid6_empty_zero_page;
|
||||
} else
|
||||
} else {
|
||||
srcs[i] = page_address(blocks[i]) + offset;
|
||||
if (i < disks - 2) {
|
||||
stop = i;
|
||||
if (start == -1)
|
||||
start = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
raid6_call.gen_syndrome(disks, len, srcs);
|
||||
if (submit->flags & ASYNC_TX_PQ_XOR_DST) {
|
||||
BUG_ON(!raid6_call.xor_syndrome);
|
||||
if (start >= 0)
|
||||
raid6_call.xor_syndrome(disks, start, stop, len, srcs);
|
||||
} else
|
||||
raid6_call.gen_syndrome(disks, len, srcs);
|
||||
async_tx_sync_epilog(submit);
|
||||
}
|
||||
|
||||
|
@ -178,7 +190,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
|
|||
if (device)
|
||||
unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO);
|
||||
|
||||
if (unmap &&
|
||||
/* XORing P/Q is only implemented in software */
|
||||
if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) &&
|
||||
(src_cnt <= dma_maxpq(device, 0) ||
|
||||
dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
|
||||
is_dma_pq_aligned(device, offset, 0, len)) {
|
||||
|
|
|
@ -1317,7 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
|
|||
* destination buffer is recorded in srcs[count] and the Q destination
|
||||
* is recorded in srcs[count+1]].
|
||||
*/
|
||||
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
|
||||
static int set_syndrome_sources(struct page **srcs,
|
||||
struct stripe_head *sh,
|
||||
int srctype)
|
||||
{
|
||||
int disks = sh->disks;
|
||||
int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
|
||||
|
@ -1332,8 +1334,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
|
|||
i = d0_idx;
|
||||
do {
|
||||
int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
|
||||
struct r5dev *dev = &sh->dev[i];
|
||||
|
||||
srcs[slot] = sh->dev[i].page;
|
||||
if (i == sh->qd_idx || i == sh->pd_idx ||
|
||||
(srctype == SYNDROME_SRC_ALL) ||
|
||||
(srctype == SYNDROME_SRC_WANT_DRAIN &&
|
||||
test_bit(R5_Wantdrain, &dev->flags)) ||
|
||||
(srctype == SYNDROME_SRC_WRITTEN &&
|
||||
dev->written))
|
||||
srcs[slot] = sh->dev[i].page;
|
||||
i = raid6_next_disk(i, disks);
|
||||
} while (i != d0_idx);
|
||||
|
||||
|
@ -1373,7 +1382,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
|
|||
atomic_inc(&sh->count);
|
||||
|
||||
if (target == qd_idx) {
|
||||
count = set_syndrome_sources(blocks, sh);
|
||||
count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
|
||||
blocks[count] = NULL; /* regenerating p is not necessary */
|
||||
BUG_ON(blocks[count+1] != dest); /* q should already be set */
|
||||
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
|
||||
|
@ -1481,7 +1490,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
|
|||
tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
|
||||
&submit);
|
||||
|
||||
count = set_syndrome_sources(blocks, sh);
|
||||
count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
|
||||
init_async_submit(&submit, ASYNC_TX_FENCE, tx,
|
||||
ops_complete_compute, sh,
|
||||
to_addr_conv(sh, percpu, 0));
|
||||
|
@ -1515,8 +1524,8 @@ static void ops_complete_prexor(void *stripe_head_ref)
|
|||
}
|
||||
|
||||
static struct dma_async_tx_descriptor *
|
||||
ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
|
||||
struct dma_async_tx_descriptor *tx)
|
||||
ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
|
||||
struct dma_async_tx_descriptor *tx)
|
||||
{
|
||||
int disks = sh->disks;
|
||||
struct page **xor_srcs = to_addr_page(percpu, 0);
|
||||
|
@ -1544,6 +1553,26 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
|
|||
return tx;
|
||||
}
|
||||
|
||||
/*
 * ops_run_prexor6 - prexor step of a RAID6 read-modify-write run.
 *
 * Selected by raid_run_ops() instead of ops_run_prexor5() when
 * STRIPE_OP_PREXOR is requested and the array level is not below 6.
 *
 * @sh:     stripe being processed
 * @percpu: per-cpu scratch area that provides the page pointer list and
 *          the address conversion buffer
 * @tx:     descriptor the new operation is chained after
 *
 * Returns the descriptor handed back by async_gen_syndrome().
 */
static struct dma_async_tx_descriptor *
|
||||
ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
|
||||
struct dma_async_tx_descriptor *tx)
|
||||
{
|
||||
struct page **blocks = to_addr_page(percpu, 0);
|
||||
int count;
|
||||
struct async_submit_ctl submit;
|
||||
|
||||
pr_debug("%s: stripe %llu\n", __func__,
|
||||
(unsigned long long)sh->sector);
|
||||
|
||||
/* Only P, Q and the devices flagged R5_Wantdrain get a page assigned. */
count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
|
||||
|
||||
/*
 * ASYNC_TX_PQ_XOR_DST: XOR into the existing P/Q instead of overwriting
 * them. count+2 covers the source slots plus the P and Q destinations
 * stored at blocks[count] and blocks[count+1].
 */
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
|
||||
ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
|
||||
tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
|
||||
|
||||
return tx;
|
||||
}
|
||||
|
||||
static struct dma_async_tx_descriptor *
|
||||
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
|
||||
{
|
||||
|
@ -1746,6 +1775,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
|
|||
int count, i, j = 0;
|
||||
struct stripe_head *head_sh = sh;
|
||||
int last_stripe;
|
||||
int synflags;
|
||||
unsigned long txflags;
|
||||
|
||||
pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
|
||||
|
||||
|
@ -1765,14 +1796,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
|
|||
|
||||
again:
|
||||
blocks = to_addr_page(percpu, j);
|
||||
count = set_syndrome_sources(blocks, sh);
|
||||
|
||||
if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
|
||||
synflags = SYNDROME_SRC_WRITTEN;
|
||||
txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
|
||||
} else {
|
||||
synflags = SYNDROME_SRC_ALL;
|
||||
txflags = ASYNC_TX_ACK;
|
||||
}
|
||||
|
||||
count = set_syndrome_sources(blocks, sh, synflags);
|
||||
last_stripe = !head_sh->batch_head ||
|
||||
list_first_entry(&sh->batch_list,
|
||||
struct stripe_head, batch_list) == head_sh;
|
||||
|
||||
if (last_stripe) {
|
||||
atomic_inc(&head_sh->count);
|
||||
init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
|
||||
init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
|
||||
head_sh, to_addr_conv(sh, percpu, j));
|
||||
} else
|
||||
init_async_submit(&submit, 0, tx, NULL, NULL,
|
||||
|
@ -1843,7 +1883,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
|
|||
(unsigned long long)sh->sector, checkp);
|
||||
|
||||
BUG_ON(sh->batch_head);
|
||||
count = set_syndrome_sources(srcs, sh);
|
||||
count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
|
||||
if (!checkp)
|
||||
srcs[count] = NULL;
|
||||
|
||||
|
@ -1884,8 +1924,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
|
|||
async_tx_ack(tx);
|
||||
}
|
||||
|
||||
if (test_bit(STRIPE_OP_PREXOR, &ops_request))
|
||||
tx = ops_run_prexor(sh, percpu, tx);
|
||||
if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
|
||||
if (level < 6)
|
||||
tx = ops_run_prexor5(sh, percpu, tx);
|
||||
else
|
||||
tx = ops_run_prexor6(sh, percpu, tx);
|
||||
}
|
||||
|
||||
if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
|
||||
tx = ops_run_biodrain(sh, tx);
|
||||
|
@ -2770,7 +2814,7 @@ static void
|
|||
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
|
||||
int rcw, int expand)
|
||||
{
|
||||
int i, pd_idx = sh->pd_idx, disks = sh->disks;
|
||||
int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
|
||||
struct r5conf *conf = sh->raid_conf;
|
||||
int level = conf->level;
|
||||
|
||||
|
@ -2806,13 +2850,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
|
|||
if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
|
||||
atomic_inc(&conf->pending_full_writes);
|
||||
} else {
|
||||
BUG_ON(level == 6);
|
||||
BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
|
||||
test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
|
||||
BUG_ON(level == 6 &&
|
||||
(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
|
||||
test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
|
||||
|
||||
for (i = disks; i--; ) {
|
||||
struct r5dev *dev = &sh->dev[i];
|
||||
if (i == pd_idx)
|
||||
if (i == pd_idx || i == qd_idx)
|
||||
continue;
|
||||
|
||||
if (dev->towrite &&
|
||||
|
@ -3476,28 +3522,27 @@ static void handle_stripe_dirtying(struct r5conf *conf,
|
|||
int rmw = 0, rcw = 0, i;
|
||||
sector_t recovery_cp = conf->mddev->recovery_cp;
|
||||
|
||||
/* RAID6 requires 'rcw' in current implementation.
|
||||
* Otherwise, check whether resync is now happening or should start.
|
||||
/* Check whether resync is now happening or should start.
|
||||
* If yes, then the array is dirty (after unclean shutdown or
|
||||
* initial creation), so parity in some stripes might be inconsistent.
|
||||
* In this case, we need to always do reconstruct-write, to ensure
|
||||
* that in case of drive failure or read-error correction, we
|
||||
* generate correct data from the parity.
|
||||
*/
|
||||
if (conf->max_degraded == 2 ||
|
||||
if (conf->rmw_level == PARITY_DISABLE_RMW ||
|
||||
(recovery_cp < MaxSector && sh->sector >= recovery_cp &&
|
||||
s->failed == 0)) {
|
||||
/* Calculate the real rcw later - for now make it
|
||||
* look like rcw is cheaper
|
||||
*/
|
||||
rcw = 1; rmw = 2;
|
||||
pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
|
||||
conf->max_degraded, (unsigned long long)recovery_cp,
|
||||
pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
|
||||
conf->rmw_level, (unsigned long long)recovery_cp,
|
||||
(unsigned long long)sh->sector);
|
||||
} else for (i = disks; i--; ) {
|
||||
/* would I have to read this buffer for read_modify_write */
|
||||
struct r5dev *dev = &sh->dev[i];
|
||||
if ((dev->towrite || i == sh->pd_idx) &&
|
||||
if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
|
||||
!test_bit(R5_LOCKED, &dev->flags) &&
|
||||
!(test_bit(R5_UPTODATE, &dev->flags) ||
|
||||
test_bit(R5_Wantcompute, &dev->flags))) {
|
||||
|
@ -3507,7 +3552,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
|
|||
rmw += 2*disks; /* cannot read it */
|
||||
}
|
||||
/* Would I have to read this buffer for reconstruct_write */
|
||||
if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
|
||||
if (!test_bit(R5_OVERWRITE, &dev->flags) &&
|
||||
i != sh->pd_idx && i != sh->qd_idx &&
|
||||
!test_bit(R5_LOCKED, &dev->flags) &&
|
||||
!(test_bit(R5_UPTODATE, &dev->flags) ||
|
||||
test_bit(R5_Wantcompute, &dev->flags))) {
|
||||
|
@ -3520,7 +3566,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
|
|||
pr_debug("for sector %llu, rmw=%d rcw=%d\n",
|
||||
(unsigned long long)sh->sector, rmw, rcw);
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
if (rmw < rcw && rmw > 0) {
|
||||
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
|
||||
/* prefer read-modify-write, but need to get some data */
|
||||
if (conf->mddev->queue)
|
||||
blk_add_trace_msg(conf->mddev->queue,
|
||||
|
@ -3528,7 +3574,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
|
|||
(unsigned long long)sh->sector, rmw);
|
||||
for (i = disks; i--; ) {
|
||||
struct r5dev *dev = &sh->dev[i];
|
||||
if ((dev->towrite || i == sh->pd_idx) &&
|
||||
if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
|
||||
!test_bit(R5_LOCKED, &dev->flags) &&
|
||||
!(test_bit(R5_UPTODATE, &dev->flags) ||
|
||||
test_bit(R5_Wantcompute, &dev->flags)) &&
|
||||
|
@ -3547,7 +3593,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
|
|||
}
|
||||
}
|
||||
}
|
||||
if (rcw <= rmw && rcw > 0) {
|
||||
if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
|
||||
/* want reconstruct write, but need to get some data */
|
||||
int qread =0;
|
||||
rcw = 0;
|
||||
|
@ -6344,10 +6390,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
|
|||
}
|
||||
|
||||
conf->level = mddev->new_level;
|
||||
if (conf->level == 6)
|
||||
if (conf->level == 6) {
|
||||
conf->max_degraded = 2;
|
||||
else
|
||||
if (raid6_call.xor_syndrome)
|
||||
conf->rmw_level = PARITY_ENABLE_RMW;
|
||||
else
|
||||
conf->rmw_level = PARITY_DISABLE_RMW;
|
||||
} else {
|
||||
conf->max_degraded = 1;
|
||||
conf->rmw_level = PARITY_ENABLE_RMW;
|
||||
}
|
||||
conf->algorithm = mddev->new_layout;
|
||||
conf->reshape_progress = mddev->reshape_position;
|
||||
if (conf->reshape_progress != MaxSector) {
|
||||
|
|
|
@ -355,6 +355,23 @@ enum {
|
|||
STRIPE_OP_RECONSTRUCT,
|
||||
STRIPE_OP_CHECK,
|
||||
};
|
||||
|
||||
/*
|
||||
* RAID parity calculation preferences
|
||||
*/
|
||||
enum {
|
||||
/*
 * handle_stripe_dirtying() always forces reconstruct-write.
 * setup_conf() selects this for RAID6 when raid6_call.xor_syndrome
 * is not provided.
 */
PARITY_DISABLE_RMW = 0,
|
||||
/*
 * Read-modify-write is allowed; when the rmw and rcw costs are equal,
 * handle_stripe_dirtying() prefers rmw.
 */
PARITY_ENABLE_RMW,
|
||||
};
|
||||
|
||||
/*
|
||||
* Pages requested from set_syndrome_sources()
|
||||
*/
|
||||
enum {
|
||||
/* Every device page of the stripe is used as a source. */
SYNDROME_SRC_ALL,
|
||||
/* Besides P and Q, only devices with R5_Wantdrain set get a page. */
SYNDROME_SRC_WANT_DRAIN,
|
||||
/* Besides P and Q, only devices with a non-NULL ->written get a page. */
SYNDROME_SRC_WRITTEN,
|
||||
};
|
||||
/*
|
||||
* Plugging:
|
||||
*
|
||||
|
@ -411,7 +428,7 @@ struct r5conf {
|
|||
spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
|
||||
struct mddev *mddev;
|
||||
int chunk_sectors;
|
||||
int level, algorithm;
|
||||
int level, algorithm, rmw_level;
|
||||
int max_degraded;
|
||||
int raid_disks;
|
||||
int max_nr_stripes;
|
||||
|
|
|
@ -60,12 +60,15 @@ struct dma_chan_ref {
|
|||
* dependency chain
|
||||
* @ASYNC_TX_FENCE: specify that the next operation in the dependency
|
||||
* chain uses this operation's result as an input
|
||||
* @ASYNC_TX_PQ_XOR_DST: do not overwrite the syndrome but XOR it with the
|
||||
* input data. Required for rmw case.
|
||||
*/
|
||||
enum async_tx_flags {
|
||||
ASYNC_TX_XOR_ZERO_DST = (1 << 0),
|
||||
ASYNC_TX_XOR_DROP_DST = (1 << 1),
|
||||
ASYNC_TX_ACK = (1 << 2),
|
||||
ASYNC_TX_FENCE = (1 << 3),
|
||||
ASYNC_TX_PQ_XOR_DST = (1 << 4),
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
Loading…
Add table
Reference in a new issue