前言
io hang对于数据库/存储系统而言是致命的,因此,如何模拟一个较为真实的io hang环境,并对自己的系统代码进行测试显得尤为重要。io hang的模拟根据模拟的程度可以有很多方法,比较简单的有使用LD_PRELOAD或Fuse实现用户态文件系统对用户态函数进行偷梁换柱、稍微深一点的有使用systemtap工具hook内核函数,还有一些其他的内核错误注入工具等。这些io hang模拟方法虽然层次不同,但是它们其实并没有模拟到最底层。通常,我们所谓的io hang,都是指最底层的物理设备(磁盘)出现问题,比如坏块过多导致整理gc延迟过大、或者磁盘直接出现硬件故障无法正常读写数据等。可以看到,我们需要模拟这种块设备的io hang,才能最接近真实的hang场景。
那么如何才能对一个块设备进行模拟,其实,模拟一个磁盘还是很简单的,就是写一个简单的块设备驱动就好了(LDD3中就有一个subll的例子)。当然,如果懒得写,在linux内核中,我们也可以在源码/driver/block目录下找到一个块设备驱动,如果你用过linux下的ramdisk设备(通常为/dev/ram0、/dev/ram1等等),那么就会对这个代码很熟悉。
这里我们直接使用ramdisk驱动,稍作修改。
块设备驱动
首先,在内核中找到你机器上对应内核版本的代码,比如我的是:https://elixir.bootlin.com/linux/v3.10/source/drivers/block/brd.c ,然后在文件中DECLARE_WAIT_QUEUE_HEAD(io_hang_q)声明一个等待队列,因为我们主要测试write/read hang,因此这里我们直接在copy_to_brd(表示从用户态拷贝数据到ramdisk)和copy_from_brd(表示从ramdisk读数据到用户态)两个函数中分别加入如下代码,表示如果io_hang不为0,则在TASK_UNINTERRUPTIBLE状态睡眠io_hang秒(或者io_hang被修改为0)。
if(io_hang){
pr_info("Simulation of IO hang for read ,will hang %d seconds\n",io_hang);
wait_event_timeout(io_hang_q, io_hang == 0,msecs_to_jiffies(io_hang * 1000));
pr_info("Simulation of IO hang for read finish\n");
}
if(io_hang){
pr_info("Simulation of IO hang for read ,will hang %d seconds\n",io_hang);
wait_event_timeout(io_hang_q, io_hang == 0,msecs_to_jiffies(io_hang * 1000));
pr_info("Simulation of IO hang for read finish\n");
}
同时,我们需要一个参数来控制ramdisk的磁盘行为(它一开始不能hang,因为我们需要格式化文件系统,写superblock等),这里只要加一个开关控制:
module_param(io_hang,int,S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
MODULE_PARM_DESC(io_hang,"io hang switch");
这里注意参数的权限设置,必须要能在用户态可以修改,后续我们将通过用户态sysfs对这个参数进行修改。
这些改动做完之后,写一个简单的Makefile将其便以为内核模块(注:为了简明我把brd.c改为ramdisk.c):
obj-m := ramdisk.o
SRC = /lib/modules/$(shell uname -r)/build
ko:
make -C $(SRC) M=$(PWD) modules
clean:
make -C $(SRC) M=$(PWD) clean
然后直接make即可生成ramdisk.ko。
ramdisk.c代码如下:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/fs.h>
#include <linux/slab.h>
#ifdef CONFIG_BLK_DEV_RAM_DAX
#include <linux/pfn_t.h>
#endif
#include <asm/uaccess.h>
#define SECTOR_SHIFT 9
#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
DECLARE_WAIT_QUEUE_HEAD(io_hang_q);
static int io_hang = 0;
/*
* Each block ramdisk device has a radix_tree brd_pages of pages that stores
* the pages containing the block device's contents. A brd page's ->index is
* its offset in PAGE_SIZE units. This is similar to, but in no way connected
* with, the kernel's pagecache or buffer cache (which sit above our block
* device).
*/
struct brd_device {
int brd_number;
struct request_queue *brd_queue;
struct gendisk *brd_disk;
struct list_head brd_list;
/*
* Backing store of pages and lock to protect it. This is the contents
* of the block device.
*/
spinlock_t brd_lock;
struct radix_tree_root brd_pages;
};
/*
* Look up and return a brd's page for a given sector.
*/
static DEFINE_MUTEX(brd_mutex);
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
pgoff_t idx;
struct page *page;
/*
* The page lifetime is protected by the fact that we have opened the
* device node -- brd pages will never be deleted under us, so we
* don't need any further locking or refcounting.
*
* This is strictly true for the radix-tree nodes as well (ie. we
* don't actually need the rcu_read_lock()), however that is not a
* documented feature of the radix-tree API so it is better to be
* safe here (we don't have total exclusion from radix tree updates
* here, only deletes).
*/
rcu_read_lock();
idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
page = radix_tree_lookup(&brd->brd_pages, idx);
rcu_read_unlock();
BUG_ON(page && page->index != idx);
return page;
}
/*
* Look up and return a brd's page for a given sector.
* If one does not exist, allocate an empty page, and insert that. Then
* return it.
*/
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
{
pgoff_t idx;
struct page *page;
gfp_t gfp_flags;
page = brd_lookup_page(brd, sector);
if (page)
return page;
/*
* Must use NOIO because we don't want to recurse back into the
* block or filesystem layers from page reclaim.
*
* Cannot support DAX and highmem, because our ->direct_access
* routine for DAX must return memory that is always addressable.
* If DAX was reworked to use pfns and kmap throughout, this
* restriction might be able to be lifted.
*/
gfp_flags = GFP_NOIO | __GFP_ZERO;
#ifndef CONFIG_BLK_DEV_RAM_DAX
gfp_flags |= __GFP_HIGHMEM;
#endif
page = alloc_page(gfp_flags);
if (!page)
return NULL;
if (radix_tree_preload(GFP_NOIO)) {
__free_page(page);
return NULL;
}
spin_lock(&brd->brd_lock);
idx = sector >> PAGE_SECTORS_SHIFT;
page->index = idx;
if (radix_tree_insert(&brd->brd_pages, idx, page)) {
__free_page(page);
page = radix_tree_lookup(&brd->brd_pages, idx);
BUG_ON(!page);
BUG_ON(page->index != idx);
}
spin_unlock(&brd->brd_lock);
radix_tree_preload_end();
return page;
}
static void brd_free_page(struct brd_device *brd, sector_t sector)
{
struct page *page;
pgoff_t idx;
spin_lock(&brd->brd_lock);
idx = sector >> PAGE_SECTORS_SHIFT;
page = radix_tree_delete(&brd->brd_pages, idx);
spin_unlock(&brd->brd_lock);
if (page)
__free_page(page);
}
static void brd_zero_page(struct brd_device *brd, sector_t sector)
{
struct page *page;
page = brd_lookup_page(brd, sector);
if (page)
clear_highpage(page);
}
/*
* Free all backing store pages and radix tree. This must only be called when
* there are no other users of the device.
*/
#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd)
{
unsigned long pos = 0;
struct page *pages[FREE_BATCH];
int nr_pages;
do {
int i;
nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
(void **)pages, pos, FREE_BATCH);
for (i = 0; i < nr_pages; i++) {
void *ret;
BUG_ON(pages[i]->index < pos);
pos = pages[i]->index;
ret = radix_tree_delete(&brd->brd_pages, pos);
BUG_ON(!ret || ret != pages[i]);
__free_page(pages[i]);
}
pos++;
/*
* This assumes radix_tree_gang_lookup always returns as
* many pages as possible. If the radix-tree code changes,
* so will this have to.
*/
} while (nr_pages == FREE_BATCH);
}
/*
* copy_to_brd_setup must be called before copy_to_brd. It may sleep.
*/
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
{
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
size_t copy;
copy = min_t(size_t, n, PAGE_SIZE - offset);
if (!brd_insert_page(brd, sector))
return -ENOSPC;
if (copy < n) {
sector += copy >> SECTOR_SHIFT;
if (!brd_insert_page(brd, sector))
return -ENOSPC;
}
return 0;
}
static void discard_from_brd(struct brd_device *brd,
sector_t sector, size_t n)
{
while (n >= PAGE_SIZE) {
/*
* Don't want to actually discard pages here because
* re-allocating the pages can result in writeback
* deadlocks under heavy load.
*/
if (0)
brd_free_page(brd, sector);
else
brd_zero_page(brd, sector);
sector += PAGE_SIZE >> SECTOR_SHIFT;
n -= PAGE_SIZE;
}
}
/*
* Copy n bytes from src to the brd starting at sector. Does not sleep.
*/
static void copy_to_brd(struct brd_device *brd, const void *src,
sector_t sector, size_t n)
{
struct page *page;
void *dst;
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
size_t copy;
/**
* Simulation of IO hang
*
* per 1 second
*/
if(io_hang){
pr_info("Simulation of IO hang for write,will hang %d seconds\n",io_hang);
wait_event_timeout(io_hang_q, io_hang == 0,msecs_to_jiffies(io_hang * 1000));
pr_info("Simulation of IO hang for write finish\n");
}
copy = min_t(size_t, n, PAGE_SIZE - offset);
page = brd_lookup_page(brd, sector);
BUG_ON(!page);
dst = kmap_atomic(page);
memcpy(dst + offset, src, copy);
kunmap_atomic(dst);
if (copy < n) {
src += copy;
sector += copy >> SECTOR_SHIFT;
copy = n - copy;
page = brd_lookup_page(brd, sector);
BUG_ON(!page);
dst = kmap_atomic(page);
memcpy(dst, src, copy);
kunmap_atomic(dst);
}
}
/*
* Copy n bytes to dst from the brd starting at sector. Does not sleep.
*/
static void copy_from_brd(void *dst, struct brd_device *brd,
sector_t sector, size_t n)
{
struct page *page;
void *src;
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
size_t copy;
/**
* Simulation of IO hang
*
* per 1 second
*/
if(io_hang){
pr_info("Simulation of IO hang for read ,will hang %d seconds\n",io_hang);
wait_event_timeout(io_hang_q, io_hang == 0,msecs_to_jiffies(io_hang * 1000));
pr_info("Simulation of IO hang for read finish\n");
}
copy = min_t(size_t, n, PAGE_SIZE - offset);
page = brd_lookup_page(brd, sector);
if (page) {
src = kmap_atomic(page);
memcpy(dst, src + offset, copy);
kunmap_atomic(src);
} else
memset(dst, 0, copy);
if (copy < n) {
dst += copy;
sector += copy >> SECTOR_SHIFT;
copy = n - copy;
page = brd_lookup_page(brd, sector);
if (page) {
src = kmap_atomic(page);
memcpy(dst, src, copy);
kunmap_atomic(src);
} else
memset(dst, 0, copy);
}
}
/*
* Process a single bvec of a bio.
*/
static int brd_do_bvec(struct brd_device *brd, struct page *page,
unsigned int len, unsigned int off, bool is_write,
sector_t sector)
{
void *mem;
int err = 0;
if (is_write) {
err = copy_to_brd_setup(brd, sector, len);
if (err)
goto out;
}
mem = kmap_atomic(page);
if (!is_write) {
copy_from_brd(mem + off, brd, sector, len);
flush_dcache_page(page);
} else {
flush_dcache_page(page);
copy_to_brd(brd, mem + off, sector, len);
}
kunmap_atomic(mem);
out:
return err;
}
static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
{
struct block_device *bdev = bio->bi_bdev;
struct brd_device *brd = bdev->bd_disk->private_data;
struct bio_vec bvec;
sector_t sector;
struct bvec_iter iter;
sector = bio->bi_iter.bi_sector;
if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
goto io_error;
if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
if (sector & ((PAGE_SIZE >> SECTOR_SHIFT) - 1) ||
bio->bi_iter.bi_size & ~PAGE_MASK)
goto io_error;
discard_from_brd(brd, sector, bio->bi_iter.bi_size);
goto out;
}
bio_for_each_segment(bvec, bio, iter) {
unsigned int len = bvec.bv_len;
int err;
err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
op_is_write(bio_op(bio)), sector);
if (err)
goto io_error;
sector += len >> SECTOR_SHIFT;
}
out:
bio_endio(bio);
return BLK_QC_T_NONE;
io_error:
bio_io_error(bio);
return BLK_QC_T_NONE;
}
static int brd_rw_page(struct block_device *bdev, sector_t sector,
struct page *page, bool is_write)
{
struct brd_device *brd = bdev->bd_disk->private_data;
int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
page_endio(page, is_write, err);
return err;
}
#ifdef CONFIG_BLK_DEV_RAM_DAX
static long brd_direct_access(struct block_device *bdev, sector_t sector,
void **kaddr, pfn_t *pfn, long size)
{
struct brd_device *brd = bdev->bd_disk->private_data;
struct page *page;
if (!brd)
return -ENODEV;
page = brd_insert_page(brd, sector);
if (!page)
return -ENOSPC;
*kaddr = page_address(page);
*pfn = page_to_pfn_t(page);
return PAGE_SIZE;
}
#else
#define brd_direct_access NULL
#endif
static int brd_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
int error;
struct brd_device *brd = bdev->bd_disk->private_data;
if (cmd != BLKFLSBUF)
return -ENOTTY;
/*
* ram device BLKFLSBUF has special semantics, we want to actually
* release and destroy the ramdisk data.
*/
mutex_lock(&brd_mutex);
mutex_lock(&bdev->bd_mutex);
error = -EBUSY;
if (bdev->bd_openers <= 1) {
/*
* Kill the cache first, so it isn't written back to the
* device.
*
* Another thread might instantiate more buffercache here,
* but there is not much we can do to close that race.
*/
kill_bdev(bdev);
brd_free_pages(brd);
error = 0;
}
mutex_unlock(&bdev->bd_mutex);
mutex_unlock(&brd_mutex);
return error;
}
static const struct block_device_operations brd_fops = {
.owner = THIS_MODULE,
.rw_page = brd_rw_page,
.ioctl = brd_ioctl,
.direct_access = brd_direct_access,
};
/*
* And now the modules code and kernel interface.
*/
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, S_IRUGO);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, int, S_IRUGO);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
static int max_part = 1;
module_param(max_part, int, S_IRUGO);
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
module_param(io_hang,int,S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
MODULE_PARM_DESC(io_hang,"io hang switch");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");
#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
rd_size = simple_strtol(str, NULL, 0);
return 1;
}
__setup("ramdisk_size=", ramdisk_size);
#endif
/*
* The device scheme is derived from loop.c. Keep them in synch where possible
* (should share code eventually).
*/
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);
static struct brd_device *brd_alloc(int i)
{
struct brd_device *brd;
struct gendisk *disk;
brd = kzalloc(sizeof(*brd), GFP_KERNEL);
if (!brd)
goto out;
brd->brd_number = i;
spin_lock_init(&brd->brd_lock);
INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
if (!brd->brd_queue)
goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
/* This is so fdisk will align partitions on 4k, because of
* direct_access API needing 4k alignment, returning a PFN
* (This is only a problem on very small devices <= 4M,
* otherwise fdisk will align on 1M. Regardless this call
* is harmless)
*/
blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX);
brd->brd_queue->limits.discard_zeroes_data = 1;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
#ifdef CONFIG_BLK_DEV_RAM_DAX
queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
#endif
disk = brd->brd_disk = alloc_disk(max_part);
if (!disk)
goto out_free_queue;
disk->major = RAMDISK_MAJOR;
disk->first_minor = i * max_part;
disk->fops = &brd_fops;
disk->private_data = brd;
disk->queue = brd->brd_queue;
disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "ramdisk%d", i);
set_capacity(disk, rd_size * 2);
return brd;
out_free_queue:
blk_cleanup_queue(brd->brd_queue);
out_free_dev:
kfree(brd);
out:
return NULL;
}
static void brd_free(struct brd_device *brd)
{
put_disk(brd->brd_disk);
blk_cleanup_queue(brd->brd_queue);
brd_free_pages(brd);
kfree(brd);
}
static struct brd_device *brd_init_one(int i, bool *new)
{
struct brd_device *brd;
*new = false;
list_for_each_entry(brd, &brd_devices, brd_list) {
if (brd->brd_number == i)
goto out;
}
brd = brd_alloc(i);
if (brd) {
add_disk(brd->brd_disk);
list_add_tail(&brd->brd_list, &brd_devices);
}
*new = true;
out:
return brd;
}
static void brd_del_one(struct brd_device *brd)
{
list_del(&brd->brd_list);
del_gendisk(brd->brd_disk);
brd_free(brd);
}
static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
struct brd_device *brd;
struct kobject *kobj;
bool new;
mutex_lock(&brd_devices_mutex);
brd = brd_init_one(MINOR(dev) / max_part, &new);
kobj = brd ? get_disk(brd->brd_disk) : NULL;
mutex_unlock(&brd_devices_mutex);
if (new)
*part = 0;
return kobj;
}
static int __init brd_init(void)
{
struct brd_device *brd, *next;
int i;
/*
* brd module now has a feature to instantiate underlying device
* structure on-demand, provided that there is an access dev node.
*
* (1) if rd_nr is specified, create that many upfront. else
* it defaults to CONFIG_BLK_DEV_RAM_COUNT
* (2) User can further extend brd devices by create dev node themselves
* and have kernel automatically instantiate actual device
* on-demand. Example:
* mknod /path/devnod_name b 1 X # 1 is the rd major
* fdisk -l /path/devnod_name
* If (X / max_part) was not already created it will be created
* dynamically.
*/
if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
return -EIO;
if (unlikely(!max_part))
max_part = 1;
for (i = 0; i < rd_nr; i++) {
brd = brd_alloc(i);
if (!brd)
goto out_free;
list_add_tail(&brd->brd_list, &brd_devices);
}
/* point of no return */
list_for_each_entry(brd, &brd_devices, brd_list)
add_disk(brd->brd_disk);
blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
THIS_MODULE, brd_probe, NULL, NULL);
pr_info("brd: module loaded\n");
return 0;
out_free:
list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
list_del(&brd->brd_list);
brd_free(brd);
}
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
pr_info("brd: module NOT loaded !!!\n");
return -ENOMEM;
}
static void __exit brd_exit(void)
{
struct brd_device *brd, *next;
list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
brd_del_one(brd);
blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
pr_info("brd: module unloaded\n");
}
module_init(brd_init);
module_exit(brd_exit);
格式化、挂载
上面我们实现了一个块设备驱动,并编译出来一个ramdisk.ko,现在可以注册这个块设备驱动:
sudo insmod ramdisk.ko

可以看到/dev下多出来很多ramdisk设备,这些都是块设备,你可以把它当做一个真实的磁盘,只是这写磁盘都是memory backend。
这里我使用ramdisk0做测试,首先对齐格式化:
mkfs -t ext4 /dev/ramdisk0
然后将其挂载到一个目录(选一个测试目录,比如io_hang_test)
sudo mount -t ext4 /dev/ramdisk0 /mnt/io_hang_test
挂载之后可以运行mount命令看下挂载效果(最后一行)。

控制
好了,现在/mnt/io_hang_test目录对应的就是我们的ramdisk0,所有在/mnt/io_hang_test目录下进行的io操作,都会经过ext4文件系统后转为对ramdisk0的读写操作,可以直接在上面创建一个文件实时,这是这些文件是在内存里,重启后就丢失了。
如果前面一切正常,则可以开始模拟io hang。还记得之前那个内核参数io_hang,它目前还是0,因为不会io hang,现在来修改它:
sudo bash -c " echo 10 > /sys/module/ramdisk/parameters/io_hang"
这条命令会修改内核参数io_hang,将其值改为10,这样的话下一次读写的时候,ramdisk驱动会对每一次的read/write hang 10 秒钟(不精确)。如果想关闭io hang,执行:
sudo bash -c " echo 0 > /sys/module/ramdisk/parameters/io_hang"
测试
为了测试read和write,我这里写一个简单的测试代码,测试read和write,同时,为了避开page cache的影响和使用同步io,使用O_DIRECT和O_SYNC方式open。
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#define BUF_SIZE 1024
int main(){
int fd,size,len;
int ret = 0;
char *write_buf,*read_buf;
ret = posix_memalign((void **)&write_buf, 512, BUF_SIZE);
if (ret) {
perror("posix_memalign:");
exit(1);
}
ret = posix_memalign((void **)&read_buf, 512, BUF_SIZE);
if (ret) {
perror("posix_memalign:");
exit(1);
}
strcpy(write_buf,"direct osync mode");
len = strlen("direct osync mode");
if ((fd = open("hello.c", O_CREAT | O_TRUNC | O_RDWR | O_DIRECT | O_SYNC,0666 ))<0) {
perror("open:");
exit(1);
}
if ((size = write(fd, write_buf, BUF_SIZE)) < 0){
perror("write:");
exit(1);
}
lseek(fd, 0, SEEK_SET );
if ((size = read( fd, read_buf, BUF_SIZE))<0) {
perror("read:");
exit(1);
}
if(strncmp(read_buf,write_buf,len) != 0){
perror("strncmp:");
exit(1);
}
if (close(fd) < 0 ) {
perror("close:");
exit(1);
}
}
假设测试代码为test.c,则为了测试方便可以写一个简单的shell:
mkdir io_hang_test
gcc -o test test.c
sudo umount io_hang_test
sudo rmmod ramdisk.ko
make
sudo insmod ramdisk.ko
sudo mkfs -t ext4 /dev/ramdisk0
sudo mount -t ext4 /dev/ramdisk0 io_hang_test
sudo cp test io_hang_test/
然后在io_hang_test目录下sudo ./test,然后看test近处处于D状态。

同时sudo tail -f /var/log/kern.log看下内核的持续输出。

总结
本文只是提供了一种较为简单、较为真实、扩展性较强的方式模拟io hang。在次基础上,可以扩展自己的io hang策略(本文只阐述了一种延时hang策略)。
这里之所以不使用自己写块设备驱动(参考LDD demo很简单),是因为内核版本很多,这样在调试不同内核版本时需要移植,如果直接使用brd.c,那么内核中不同版本下都直接有相应的源码,稍微加一下自己的代码就可以编译使用。非常方便。