mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-24 01:09:38 -05:00
e63890f38a
In the current implementation of unaligned aio+dio, the lock order behaves as follows: in user process context: -> call io_submit() -> get i_mutex <== window1 -> get ip_unaligned_aio -> submit direct io to block device -> release i_mutex -> io_submit() returns; in dio work queue context (the work queue is created in __blockdev_direct_IO): -> release ip_unaligned_aio <== window2 -> get i_mutex -> clear unwritten flag & change i_size -> release i_mutex. There is a limit on the number of dio work queue threads, 256 by default. If all 256 threads are in the above 'window2' stage and there is a user process in the 'window1' stage, the system will deadlock: the user process holds i_mutex while waiting for the ip_unaligned_aio lock, and the direct bio that holds the ip_unaligned_aio mutex is waiting for a dio work queue thread to be scheduled, but all the dio work queue threads are waiting for the i_mutex lock in 'window2'. This case only happened in a test which sends a large number (more than 256) of aio requests in one io_submit() call. My design is to remove the ip_unaligned_aio lock and change unaligned aio to sync io instead, which, just like the ip_unaligned_aio lock, serializes the unaligned aio dio. [akpm@linux-foundation.org: remove OCFS2_IOCB_UNALIGNED_IO, per Junxiao Bi] Signed-off-by: Ryan Ding <ryan.ding@oracle.com> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> Cc: Joseph Qi <joseph.qi@huawei.com> Cc: Mark Fasheh <mfasheh@suse.de> Cc: Joel Becker <jlbec@evilplan.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
193 lines
6.2 KiB
C
193 lines
6.2 KiB
C
/* -*- mode: c; c-basic-offset: 8; -*-
|
|
* vim: noexpandtab sw=8 ts=8 sts=0:
|
|
*
|
|
* inode.h
|
|
*
|
|
* Function prototypes
|
|
*
|
|
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2 of the License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public
|
|
* License along with this program; if not, write to the
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
* Boston, MA 02111-1307, USA.
|
|
*/
|
|
|
|
#ifndef OCFS2_INODE_H
|
|
#define OCFS2_INODE_H
|
|
|
|
#include "extent_map.h"
|
|
|
|
/* OCFS2 Inode Private Data */
|
|
struct ocfs2_inode_info
|
|
{
|
|
u64 ip_blkno;
|
|
|
|
struct ocfs2_lock_res ip_rw_lockres;
|
|
struct ocfs2_lock_res ip_inode_lockres;
|
|
struct ocfs2_lock_res ip_open_lockres;
|
|
|
|
/* protects allocation changes on this inode. */
|
|
struct rw_semaphore ip_alloc_sem;
|
|
|
|
/* protects extended attribute changes on this inode */
|
|
struct rw_semaphore ip_xattr_sem;
|
|
|
|
/* These fields are protected by ip_lock */
|
|
spinlock_t ip_lock;
|
|
u32 ip_open_count;
|
|
struct list_head ip_io_markers;
|
|
u32 ip_clusters;
|
|
|
|
u16 ip_dyn_features;
|
|
struct mutex ip_io_mutex;
|
|
u32 ip_flags; /* see below */
|
|
u32 ip_attr; /* inode attributes */
|
|
|
|
/* Record unwritten extents during direct io. */
|
|
struct list_head ip_unwritten_list;
|
|
|
|
/* protected by recovery_lock. */
|
|
struct inode *ip_next_orphan;
|
|
|
|
struct ocfs2_caching_info ip_metadata_cache;
|
|
struct ocfs2_extent_map ip_extent_map;
|
|
struct inode vfs_inode;
|
|
struct jbd2_inode ip_jinode;
|
|
|
|
u32 ip_dir_start_lookup;
|
|
|
|
/* Only valid if the inode is the dir. */
|
|
u32 ip_last_used_slot;
|
|
u64 ip_last_used_group;
|
|
u32 ip_dir_lock_gen;
|
|
|
|
struct ocfs2_alloc_reservation ip_la_data_resv;
|
|
|
|
/*
|
|
* Transactions that contain inode's metadata needed to complete
|
|
* fsync and fdatasync, respectively.
|
|
*/
|
|
tid_t i_sync_tid;
|
|
tid_t i_datasync_tid;
|
|
|
|
struct dquot *i_dquot[MAXQUOTAS];
|
|
};
|
|
|
|
/*
 * Flags for the ip_flags field
 *
 * Each flag is a distinct single bit, so flags can be OR-ed together
 * and tested with a bitwise AND.
 */
/* System file inodes */
#define OCFS2_INODE_SYSTEM_FILE		0x00000001
#define OCFS2_INODE_JOURNAL		0x00000002
#define OCFS2_INODE_BITMAP		0x00000004
/* This inode has been wiped from disk */
#define OCFS2_INODE_DELETED		0x00000008
/* Has the inode been orphaned on another node?
 *
 * This hints to ocfs2_drop_inode that it should clear i_nlink before
 * continuing.
 *
 * We *only* set this on unlink vote from another node. If the inode
 * was locally orphaned, then we're sure of the state and don't need
 * to twiddle i_nlink later - it's either zero or not depending on
 * whether our unlink succeeded. Otherwise we got this from a node
 * whose intention was to orphan the inode, however he may have
 * crashed, failed etc, so we let ocfs2_drop_inode zero the value and
 * rely on ocfs2_delete_inode to sort things out under the proper
 * cluster locks.
 */
#define OCFS2_INODE_MAYBE_ORPHANED	0x00000010
/* Does someone have the file open O_DIRECT */
#define OCFS2_INODE_OPEN_DIRECT		0x00000020
/* Tell the inode wipe code it's not in orphan dir */
#define OCFS2_INODE_SKIP_ORPHAN_DIR	0x00000040
/* Entry in orphan dir with 'dio-' prefix */
#define OCFS2_INODE_DIO_ORPHAN_ENTRY	0x00000080
|
|
|
|
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
|
|
{
|
|
return container_of(inode, struct ocfs2_inode_info, vfs_inode);
|
|
}
|
|
|
|
/*
 * Test / set the OCFS2_INODE_JOURNAL bit in ip_flags of VFS inode @i.
 * INODE_JOURNAL() yields the masked flag value (non-zero when set);
 * SET_INODE_JOURNAL() yields the updated ip_flags value.
 */
#define INODE_JOURNAL(i)	(OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
#define SET_INODE_JOURNAL(i)	(OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
|
|
|
|
/* Objects declared here and defined outside this header. */
extern struct kmem_cache *ocfs2_inode_cache;

extern const struct address_space_operations ocfs2_aops;
extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
|
|
|
|
static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
|
|
{
|
|
return &OCFS2_I(inode)->ip_metadata_cache;
|
|
}
|
|
|
|
/*
 * Inode teardown entry points, implemented outside this header.
 * NOTE(review): presumably wired up as the evict_inode / drop_inode
 * super_operations hooks - confirm against the super_operations table.
 */
void ocfs2_evict_inode(struct inode *inode);
int ocfs2_drop_inode(struct inode *inode);
|
|
|
|
/*
 * Flags for ocfs2_iget()
 *
 * Distinct single bits, intended for the 'flags' argument.
 */
#define OCFS2_FI_FLAG_SYSFILE		0x1
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY	0x2
#define OCFS2_FI_FLAG_FILECHECK_CHK	0x4
#define OCFS2_FI_FLAG_FILECHECK_FIX	0x8
|
|
|
|
struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
|
|
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
|
|
int sysfile_type);
|
|
int ocfs2_inode_init_private(struct inode *inode);
|
|
int ocfs2_inode_revalidate(struct dentry *dentry);
|
|
void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
|
|
int create_ino);
|
|
void ocfs2_read_inode(struct inode *inode);
|
|
void ocfs2_read_inode2(struct inode *inode, void *opaque);
|
|
ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
|
|
size_t size, loff_t *offp);
|
|
void ocfs2_sync_blockdev(struct super_block *sb);
|
|
void ocfs2_refresh_inode(struct inode *inode,
|
|
struct ocfs2_dinode *fe);
|
|
int ocfs2_mark_inode_dirty(handle_t *handle,
|
|
struct inode *inode,
|
|
struct buffer_head *bh);
|
|
struct buffer_head *ocfs2_bread(struct inode *inode,
|
|
int block, int *err, int reada);
|
|
|
|
void ocfs2_set_inode_flags(struct inode *inode);
|
|
void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
|
|
|
|
static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
|
|
{
|
|
int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
|
|
|
|
return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
|
|
}
|
|
|
|
/* Validate that a bh contains a valid inode */
int ocfs2_validate_inode_block(struct super_block *sb,
			       struct buffer_head *bh);
/*
 * Read an inode block into *bh.  If *bh is NULL, a bh will be allocated.
 * This is a cached read.  The inode will be validated with
 * ocfs2_validate_inode_block().
 */
int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
/* The same, but can be passed OCFS2_BH_* flags */
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
				int flags);
|
|
|
|
static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci)
|
|
{
|
|
return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);
|
|
}
|
|
|
|
#endif /* OCFS2_INODE_H */
|