You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
5044 lines
124 KiB
5044 lines
124 KiB
#include <linux/bitops.h> |
|
#include <linux/slab.h> |
|
#include <linux/bio.h> |
|
#include <linux/mm.h> |
|
#include <linux/pagemap.h> |
|
#include <linux/page-flags.h> |
|
#include <linux/module.h> |
|
#include <linux/spinlock.h> |
|
#include <linux/blkdev.h> |
|
#include <linux/swap.h> |
|
#include <linux/writeback.h> |
|
#include <linux/pagevec.h> |
|
#include <linux/prefetch.h> |
|
#include <linux/cleancache.h> |
|
#include "extent_io.h" |
|
#include "extent_map.h" |
|
#include "compat.h" |
|
#include "ctree.h" |
|
#include "btrfs_inode.h" |
|
#include "volumes.h" |
|
#include "check-integrity.h" |
|
#include "locking.h" |
|
#include "rcu-string.h" |
|
|
|
static struct kmem_cache *extent_state_cache; |
|
static struct kmem_cache *extent_buffer_cache; |
|
|
|
static LIST_HEAD(buffers); |
|
static LIST_HEAD(states); |
|
|
|
#define LEAK_DEBUG 0 |
|
#if LEAK_DEBUG |
|
static DEFINE_SPINLOCK(leak_lock); |
|
#endif |
|
|
|
#define BUFFER_LRU_MAX 64 |
|
|
|
/*
 * One non-overlapping byte range in an extent io tree's rb-tree.
 * Both bounds are inclusive.
 */
struct tree_entry {
	u64 start;		/* first byte of the range */
	u64 end;		/* last byte of the range (inclusive) */
	struct rb_node rb_node;	/* linkage into extent_io_tree->state */
};
|
|
|
/*
 * Per-call context threaded through the extent writepage paths: the bio
 * being built, the tree being written and the hook used to map extents.
 */
struct extent_page_data {
	struct bio *bio;		/* bio under construction, submitted on flush */
	struct extent_io_tree *tree;
	get_extent_t *get_extent;	/* callback mapping file offsets to extents */
	unsigned long bio_flags;

	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};
|
|
|
static noinline void flush_write_bio(void *data);

/*
 * Map an extent io tree back to the btrfs_fs_info of the superblock
 * that owns its backing address space.
 */
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
	return btrfs_sb(tree->mapping->host->i_sb);
}
|
|
|
/*
 * Create the slab caches used for extent_state and extent_buffer
 * objects.  Returns 0 on success or -ENOMEM, in which case nothing is
 * left allocated.
 */
int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
					sizeof(struct extent_state), 0,
					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
					NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
					sizeof(struct extent_buffer), 0,
					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
					NULL);
	if (extent_buffer_cache)
		return 0;

	/* second cache failed: undo the first before reporting ENOMEM */
	kmem_cache_destroy(extent_state_cache);
	return -ENOMEM;
}
|
|
|
/*
 * Tear down the extent io slab caches.  Any extent_state or
 * extent_buffer still on the global leak lists at this point is a leak;
 * report it and free it before destroying the caches.
 */
void extent_io_exit(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       (unsigned long long)state->start,
		       (unsigned long long)state->end,
		       state->state, state->tree, atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);

	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
		       "refs %d\n", (unsigned long long)eb->start,
		       eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}

	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
}
|
|
|
void extent_io_tree_init(struct extent_io_tree *tree, |
|
struct address_space *mapping) |
|
{ |
|
tree->state = RB_ROOT; |
|
INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); |
|
tree->ops = NULL; |
|
tree->dirty_bytes = 0; |
|
spin_lock_init(&tree->lock); |
|
spin_lock_init(&tree->buffer_lock); |
|
tree->mapping = mapping; |
|
} |
|
|
|
/*
 * Allocate a fresh extent_state from the slab cache.
 *
 * The new state carries one reference, no bits, no private data and no
 * backing tree.  Returns NULL on allocation failure.
 */
static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
#if LEAK_DEBUG
	/* record the allocation on the global leak list */
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}
|
|
|
/*
 * Drop one reference on 'state' and free it back to the slab cache when
 * the last reference goes away.  NULL is silently ignored.
 */
void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
#if LEAK_DEBUG
		unsigned long flags;
#endif
		/* a state being freed must already be out of its tree */
		WARN_ON(state->tree);
#if LEAK_DEBUG
		spin_lock_irqsave(&leak_lock, flags);
		list_del(&state->leak_list);
		spin_unlock_irqrestore(&leak_lock, flags);
#endif
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}
|
|
|
/*
 * Link 'node' into 'root', ordered against the [start, end] ranges of
 * the entries already present.  'offset' is the key being inserted.
 *
 * Returns NULL on success, or the existing node whose range contains
 * 'offset' (in which case nothing is inserted).
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent_node = NULL;

	while (*link) {
		struct tree_entry *cur;

		parent_node = *link;
		cur = rb_entry(parent_node, struct tree_entry, rb_node);

		if (offset < cur->start)
			link = &parent_node->rb_left;
		else if (offset > cur->end)
			link = &parent_node->rb_right;
		else
			return parent_node;	/* offset overlaps 'cur' */
	}

	rb_link_node(node, parent_node, link);
	rb_insert_color(node, root);
	return NULL;
}
|
|
|
/*
 * Search the state tree for the entry containing 'offset'.
 *
 * Returns the matching rb_node, or NULL on a miss.  On a miss, if
 * prev_ret is non-NULL it is set to the first entry ending at or after
 * 'offset' (or NULL if none); if next_ret is non-NULL it is set to the
 * last entry starting at or before 'offset' (or NULL if none).
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;	/* exact containment hit */
	}

	if (prev_ret) {
		orig_prev = prev;
		/* walk forward until we pass or reach 'offset' */
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			/* rb_entry is pointer math only, safe on NULL prev */
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		/* walk backward until we are at or before 'offset' */
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}
|
|
|
static inline struct rb_node *tree_search(struct extent_io_tree *tree, |
|
u64 offset) |
|
{ |
|
struct rb_node *prev = NULL; |
|
struct rb_node *ret; |
|
|
|
ret = __etree_search(tree, offset, &prev, NULL); |
|
if (!ret) |
|
return prev; |
|
return ret; |
|
} |
|
|
|
static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, |
|
struct extent_state *other) |
|
{ |
|
if (tree->ops && tree->ops->merge_extent_hook) |
|
tree->ops->merge_extent_hook(tree->mapping->host, new, |
|
other); |
|
} |
|
|
|
/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	/* locked/boundary ranges must stay as distinct tree entries */
	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return;

	/* try to absorb the immediately adjacent range on the left */
	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	/* and the immediately adjacent range on the right */
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->end = other->end;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
}
|
|
|
static void set_state_cb(struct extent_io_tree *tree, |
|
struct extent_state *state, int *bits) |
|
{ |
|
if (tree->ops && tree->ops->set_bit_hook) |
|
tree->ops->set_bit_hook(tree->mapping->host, state, bits); |
|
} |
|
|
|
static void clear_state_cb(struct extent_io_tree *tree, |
|
struct extent_state *state, int *bits) |
|
{ |
|
if (tree->ops && tree->ops->clear_bit_hook) |
|
tree->ops->clear_bit_hook(tree->mapping->host, state, bits); |
|
} |
|
|
|
static void set_state_bits(struct extent_io_tree *tree, |
|
struct extent_state *state, int *bits); |
|
|
|
/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int *bits)
{
	struct rb_node *node;

	if (end < start)
		WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
		       (unsigned long long)end,
		       (unsigned long long)start);
	state->start = start;
	state->end = end;

	/* bits must be set before insertion so accounting hooks fire once */
	set_state_bits(tree, state, bits);

	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		/* an existing entry overlaps [start, end] */
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
		       "%llu %llu\n", (unsigned long long)found->start,
		       (unsigned long long)found->end,
		       (unsigned long long)start, (unsigned long long)end);
		return -EEXIST;
	}
	state->tree = tree;
	merge_state(tree, state);
	return 0;
}
|
|
|
static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, |
|
u64 split) |
|
{ |
|
if (tree->ops && tree->ops->split_extent_hook) |
|
tree->ops->split_extent_hook(tree->mapping->host, orig, split); |
|
} |
|
|
|
/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function.  They need to be held
 * by the caller.
 *
 * Returns 0 on success; -EEXIST (freeing 'prealloc') if the insert of
 * the new first half collides with an existing entry.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	/* prealloc becomes the lower half, orig shrinks to the upper half */
	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	prealloc->tree = tree;
	return 0;
}
|
|
|
static struct extent_state *next_state(struct extent_state *state) |
|
{ |
|
struct rb_node *next = rb_next(&state->rb_node); |
|
if (next) |
|
return rb_entry(next, struct extent_state, rb_node); |
|
else |
|
return NULL; |
|
} |
|
|
|
/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 *
 * Returns the next in-tree state after 'state' (or NULL), so callers
 * can keep iterating even when 'state' itself was freed.
 * Caller must hold the tree lock.
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    int *bits, int wake)
{
	struct extent_state *next;
	int bits_to_clear = *bits & ~EXTENT_CTLBITS;

	/* keep the tree-wide dirty byte accounting in sync */
	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		/* grab the successor before the node is erased */
		next = next_state(state);
		if (state->tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->tree = NULL;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}
|
|
|
static struct extent_state * |
|
alloc_extent_state_atomic(struct extent_state *prealloc) |
|
{ |
|
if (!prealloc) |
|
prealloc = alloc_extent_state(GFP_ATOMIC); |
|
|
|
return prealloc; |
|
} |
|
|
|
/*
 * Panic the filesystem: the extent tree was modified underneath us even
 * though we believed we held it locked.  'err' is the errno returned by
 * the failed tree operation.
 */
void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
		    "Extent tree was modified by another "
		    "thread while locked.");
}
|
|
|
/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 * NOTE(review): every return path below actually returns 0; splits that
 * fail panic the fs via extent_io_tree_panic instead of propagating.
 */
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete,
		     struct extent_state **cached_state,
		     gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	/* delete means: clear everything except the control bits */
	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	/* clearing lock/boundary bits invalidates any cached state */
	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	/* preallocate outside the lock when the mask allows sleeping */
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		/* fast path: the cached state still covers 'start' */
		if (cached && cached->tree && cached->start <= start &&
		    cached->end > start) {
			if (clear)
				atomic_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		/* prealloc now holds [state->start, end]: clear it */
		clear_state_bit(tree, prealloc, &bits, wake);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	/* keep walking adjacent states under the same lock hold */
	if (start <= end && state && !need_resched())
		goto hit_next;
	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
|
|
|
/*
 * Sleep until 'state' is woken (clear_state_bit with wake set signals
 * its waitqueue).  Drops and re-acquires the tree lock around the
 * sleep; the caller must hold a reference on 'state' so it cannot be
 * freed while we wait.
 */
static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}
|
|
|
/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			/*
			 * hold a ref across the sleep so the state can't be
			 * freed, then restart the search from its start.
			 */
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		cond_resched_lock(&tree->lock);
	}
out:
	spin_unlock(&tree->lock);
}
|
|
|
/*
 * Set bits on one state struct.  Control bits (EXTENT_CTLBITS) are
 * masked out; the tree-wide dirty byte count is bumped when
 * EXTENT_DIRTY is newly set.  Caller must hold the tree lock.
 */
static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   int *bits)
{
	int bits_to_set = *bits & ~EXTENT_CTLBITS;

	set_state_cb(tree, state, bits);
	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	state->state |= bits_to_set;
}
|
|
|
static void cache_state(struct extent_state *state, |
|
struct extent_state **cached_ptr) |
|
{ |
|
if (cached_ptr && !(*cached_ptr)) { |
|
if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { |
|
*cached_ptr = state; |
|
atomic_inc(&state->refs); |
|
} |
|
} |
|
} |
|
|
|
static void uncache_state(struct extent_state **cached_ptr) |
|
{ |
|
if (cached_ptr && (*cached_ptr)) { |
|
struct extent_state *state = *cached_ptr; |
|
*cached_ptr = NULL; |
|
free_extent_state(state); |
|
} |
|
} |
|
|
|
/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive This takes the tree lock.
 */

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 int bits, int exclusive_bits, u64 *failed_start,
		 struct extent_state **cached_state, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

	bits |= EXTENT_FIRST_DELALLOC;
again:
	/* preallocate outside the lock when the mask allows sleeping */
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		BUG_ON(!prealloc);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		/* fast path: the cached state still covers 'start' */
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		/* nothing at or past 'start': insert the whole range */
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end, &bits);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		/* keep walking adjacent states under the same lock hold */
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		/* prealloc now holds [state->start, end]: set bits on it */
		set_state_bits(tree, prealloc, &bits);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
|
|
|
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, |
|
u64 *failed_start, struct extent_state **cached_state, |
|
gfp_t mask) |
|
{ |
|
return __set_extent_bit(tree, start, end, bits, 0, failed_start, |
|
cached_state, mask); |
|
} |
|
|
|
|
|
/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 * 			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 * @mask:	the allocation mask
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * Returns 0 on success or -ENOMEM if a state allocation fails.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       int bits, int clear_bits,
		       struct extent_state **cached_state, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

again:
	/* preallocate outside the lock when the mask allows sleeping */
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		/* fast path: the cached state still covers 'start' */
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		/* nothing at or past 'start': insert the whole range */
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end, &bits);
		prealloc = NULL;
		if (err)
			extent_io_tree_panic(tree, err);
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		/* keep walking adjacent states under the same lock hold */
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		/* prealloc now holds [state->start, end]: convert it */
		set_state_bits(tree, prealloc, &bits);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
|
|
|
/* wrappers around set/clear extent bit */ |
|
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, |
|
gfp_t mask) |
|
{ |
|
return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, |
|
NULL, mask); |
|
} |
|
|
|
/* Set arbitrary caller-supplied bits on [start, end]. */
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		    int bits, gfp_t mask)
{
	return set_extent_bit(tree, start, end, bits, NULL, NULL, mask);
}
|
|
|
/* Clear arbitrary caller-supplied bits on [start, end]; no wake, no delete. */
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		      int bits, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, bits,
				0, 0, NULL, mask);
}
|
|
|
/* Mark [start, end] delalloc (and uptodate), caching the result. */
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	int bits = EXTENT_DELALLOC | EXTENT_UPTODATE;

	return set_extent_bit(tree, start, end, bits,
			      NULL, cached_state, mask);
}
|
|
|
/* Like set_extent_delalloc, but also flags the range for defrag. */
int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
		      struct extent_state **cached_state, gfp_t mask)
{
	int bits = EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG;

	return set_extent_bit(tree, start, end, bits,
			      NULL, cached_state, mask);
}
|
|
|
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, |
|
gfp_t mask) |
|
{ |
|
return clear_extent_bit(tree, start, end, |
|
EXTENT_DIRTY | EXTENT_DELALLOC | |
|
EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); |
|
} |
|
|
|
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, |
|
gfp_t mask) |
|
{ |
|
return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, |
|
NULL, mask); |
|
} |
|
|
|
/*
 * Mark [start, end] uptodate in the tree, caching the resulting state.
 *
 * Fix: pass NULL instead of the integer literal 0 for the unused
 * failed-start pointer argument, matching every sibling wrapper above
 * (set_extent_dirty/new/delalloc all pass NULL there).
 */
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
			      cached_state, mask);
}
|
|
|
/* remove the uptodate bit from [start, end] */
int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			  struct extent_state **cached_state, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
				0, 0, cached_state, mask);
}
|
|
|
/*
 * either insert or lock the state struct(s) between start and end; if part
 * of the range is already locked, wait (GFP_NOFS, blocking) for the
 * conflicting lock to clear and retry from the first contended offset
 */
|
/*
 * Lock [start, end] in the tree, also setting 'bits'.
 *
 * Loops until the whole range is locked: when __set_extent_bit returns
 * -EEXIST it stores the first already-locked offset in failed_start; we
 * wait for that lock to clear and retry from there.  Returns the last
 * __set_extent_bit result (0 on success).
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, struct extent_state **cached_state)
{
	int err;
	u64 failed_start;
	while (1) {
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
				       EXTENT_LOCKED, &failed_start,
				       cached_state, GFP_NOFS);
		if (err == -EEXIST) {
			/* someone else holds part of the range; wait it out */
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else
			break;
		WARN_ON(start > end);
	}
	return err;
}
|
|
|
/* lock [start, end] with no extra bits and no cached-state hint */
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	return lock_extent_bits(tree, start, end, 0, NULL);
}
|
|
|
/*
 * Non-blocking attempt to lock [start, end].
 * Returns 1 when the whole range was locked, 0 when any part of it was
 * already locked (in which case any partial lock we took is released).
 */
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	u64 failed_start;
	int err;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			       &failed_start, NULL, GFP_NOFS);
	if (err != -EEXIST)
		return 1;

	/* contention: back out the portion we did manage to lock */
	if (failed_start > start)
		clear_extent_bit(tree, start, failed_start - 1,
				 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
	return 0;
}
|
|
|
/* drop EXTENT_LOCKED from [start, end], consuming a cached state hint */
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
			 struct extent_state **cached, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED,
				1, 0, cached, mask);
}
|
|
|
/* drop EXTENT_LOCKED from [start, end] with no cached state */
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED,
				1, 0, NULL, GFP_NOFS);
}
|
|
|
/* |
|
* helper function to set both pages and extents in the tree writeback |
|
*/ |
|
/*
 * Mark every page backing [start, end] as under writeback.  The pages
 * must already exist in the mapping (enforced by the BUG_ON).
 */
static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long cur = start >> PAGE_CACHE_SHIFT;
	unsigned long last = end >> PAGE_CACHE_SHIFT;

	for (; cur <= last; cur++) {
		struct page *page = find_get_page(tree->mapping, cur);

		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		set_page_writeback(page);
		page_cache_release(page);
	}
	return 0;
}
|
|
|
/* find the first state struct with 'bits' set after 'start', and |
|
* return it. tree->lock must be held. NULL will returned if |
|
* nothing was found after 'start' |
|
*/ |
|
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, |
|
u64 start, int bits) |
|
{ |
|
struct rb_node *node; |
|
struct extent_state *state; |
|
|
|
/* |
|
* this search will find all the extents that end after |
|
* our range starts. |
|
*/ |
|
node = tree_search(tree, start); |
|
if (!node) |
|
goto out; |
|
|
|
while (1) { |
|
state = rb_entry(node, struct extent_state, rb_node); |
|
if (state->end >= start && (state->state & bits)) |
|
return state; |
|
|
|
node = rb_next(node); |
|
if (!node) |
|
break; |
|
} |
|
out: |
|
return NULL; |
|
} |
|
|
|
/* |
|
* find the first offset in the io tree with 'bits' set. zero is |
|
* returned if we find something, and *start_ret and *end_ret are |
|
* set to reflect the state struct that was found. |
|
* |
|
* If nothing was found, 1 is returned. If found something, return 0. |
|
*/ |
|
/*
 * Find the first offset at or after 'start' with 'bits' set.
 *
 * Returns 0 and fills *start_ret/*end_ret when a matching state exists,
 * 1 otherwise.  *cached_state is used as a search hint when it ends at
 * exactly start - 1 and is still in the tree; it is replaced by the
 * state that was found (or dropped on a miss).
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, int bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	struct rb_node *n;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && state->tree) {
			/* cache hit: scan forward from the cached state */
			n = rb_next(&state->rb_node);
			while (n) {
				state = rb_entry(n, struct extent_state,
						 rb_node);
				if (state->state & bits)
					goto got_it;
				n = rb_next(n);
			}
			/* walked off the tree; nothing matches */
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		/* stale hint; drop it and do a full search */
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state(state, cached_state);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}
|
|
|
/* |
|
* find a contiguous range of bytes in the file marked as delalloc, not |
|
* more than 'max_bytes'. start and end are used to return the range, |
|
* |
|
* 1 is returned if we find something, 0 if nothing was in the tree |
|
*/ |
|
/*
 * Find a contiguous run of EXTENT_DELALLOC states starting at or after
 * *start, counting at most max_bytes.  *start/*end return the range and
 * *cached_state takes a reference on the first state of the run.
 * Returns the number of states found (0 if none).
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			/* a gap or an explicit boundary ends the run */
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			/* first delalloc state: pin it for the caller */
			*start = state->start;
			*cached_state = state;
			atomic_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		if (!node)
			break;
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}
|
|
|
/*
 * Unlock every page in [start, end] except locked_page, which the
 * caller keeps.  Counterpart of lock_delalloc_pages().
 */
static noinline void __unlock_for_delalloc(struct inode *inode,
					   struct page *locked_page,
					   u64 start, u64 end)
{
	struct page *batch[16];
	unsigned long first = start >> PAGE_CACHE_SHIFT;
	unsigned long last = end >> PAGE_CACHE_SHIFT;
	unsigned long remaining = last - first + 1;
	int found;
	int i;

	/* the only page in range is the caller's locked_page: nothing to do */
	if (first == locked_page->index && last == first)
		return;

	while (remaining > 0) {
		found = find_get_pages_contig(inode->i_mapping, first,
					      min_t(unsigned long, remaining,
						    ARRAY_SIZE(batch)), batch);
		for (i = 0; i < found; i++) {
			if (batch[i] != locked_page)
				unlock_page(batch[i]);
			page_cache_release(batch[i]);
		}
		remaining -= found;
		first += found;
		cond_resched();
	}
}
|
|
|
/*
 * Lock every page in [delalloc_start, delalloc_end] except locked_page,
 * which the caller already holds locked.
 *
 * Returns 0 on success, -EAGAIN if a page went missing from the mapping
 * or is no longer dirty (raced with writeback/truncate); on failure all
 * pages locked so far are unlocked again.
 */
static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
	unsigned long start_index = index;
	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
	unsigned long pages_locked = 0;
	struct page *pages[16];
	unsigned long nrpages;
	int ret;
	int i;

	/* the caller is responsible for locking the start index */
	if (index == locked_page->index && index == end_index)
		return 0;

	/* skip the page at the start index */
	nrpages = end_index - index + 1;
	while (nrpages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nrpages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			/* pages were dropped from the mapping under us */
			ret = -EAGAIN;
			goto done;
		}
		/* now we have an array of pages, lock them all */
		for (i = 0; i < ret; i++) {
			/*
			 * the caller is taking responsibility for
			 * locked_page
			 */
			if (pages[i] != locked_page) {
				lock_page(pages[i]);
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != inode->i_mapping) {
					/* raced; unwind and report -EAGAIN */
					ret = -EAGAIN;
					unlock_page(pages[i]);
					page_cache_release(pages[i]);
					goto done;
				}
			}
			page_cache_release(pages[i]);
			pages_locked++;
		}
		nrpages -= ret;
		index += ret;
		cond_resched();
	}
	ret = 0;
done:
	if (ret && pages_locked) {
		/* unlock everything we managed to lock before the failure */
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start,
			      ((u64)(start_index + pages_locked - 1)) <<
			       PAGE_CACHE_SHIFT);
	}
	return ret;
}
|
|
|
/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes', then lock both the pages and the extent state
 * bits covering it.  start and end are used to return the range.
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
|
static noinline u64 find_lock_delalloc_range(struct inode *inode,
					     struct extent_io_tree *tree,
					     struct page *locked_page,
					     u64 *start, u64 *end,
					     u64 max_bytes)
{
	u64 delalloc_start;
	u64 delalloc_end;
	u64 found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;	/* set once we have retried after -EAGAIN */

again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				    max_bytes, &cached_state);
	if (!found || delalloc_end <= *start) {
		/* nothing usable at or after *start */
		*start = delalloc_start;
		*end = delalloc_end;
		free_extent_state(cached_state);
		return found;
	}

	/*
	 * start comes from the offset of locked_page.  We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 * if we're looping.
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		if (!loops) {
			/* retry once, limited to the page holding *start */
			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
			max_bytes = PAGE_CACHE_SIZE - offset;
			loops = 1;
			goto again;
		} else {
			found = 0;
			goto out_failed;
		}
	}
	BUG_ON(ret);	/* Only valid values are 0 and -EAGAIN */

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		/* raced with someone clearing delalloc; unwind and retry */
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state, GFP_NOFS);
		__unlock_for_delalloc(inode, locked_page,
				      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}
|
|
|
/*
 * Clear the extent bits selected by 'op' over [start, end], then apply
 * the requested per-page operations (clear dirty, set/end writeback,
 * set Private2, unlock) to every page in the range except locked_page,
 * which remains under the caller's control.  Always returns 0.
 */
int extent_clear_unlock_delalloc(struct inode *inode,
				struct extent_io_tree *tree,
				u64 start, u64 end, struct page *locked_page,
				unsigned long op)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;
	int clear_bits = 0;

	/* translate EXTENT_CLEAR_* op flags into extent tree bits */
	if (op & EXTENT_CLEAR_UNLOCK)
		clear_bits |= EXTENT_LOCKED;
	if (op & EXTENT_CLEAR_DIRTY)
		clear_bits |= EXTENT_DIRTY;

	if (op & EXTENT_CLEAR_DELALLOC)
		clear_bits |= EXTENT_DELALLOC;

	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
	/* done early when no page-level work was requested */
	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
		    EXTENT_SET_PRIVATE2)))
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {

			if (op & EXTENT_SET_PRIVATE2)
				SetPagePrivate2(pages[i]);

			if (pages[i] == locked_page) {
				/* caller keeps its own page locked */
				page_cache_release(pages[i]);
				continue;
			}
			if (op & EXTENT_CLEAR_DIRTY)
				clear_page_dirty_for_io(pages[i]);
			if (op & EXTENT_SET_WRITEBACK)
				set_page_writeback(pages[i]);
			if (op & EXTENT_END_WRITEBACK)
				end_page_writeback(pages[i]);
			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}
|
|
|
/* |
|
* count the number of bytes in the tree that have a given bit(s) |
|
* set. This can be fairly slow, except for EXTENT_DIRTY which is |
|
* cached. The total number found is returned. |
|
*/ |
|
/*
 * Count bytes in [*start, search_end] whose states have all of 'bits'
 * set, stopping once max_bytes is reached.  When 'contig' is set, only a
 * contiguous run (no gaps) is counted.  *start is advanced to the first
 * matching byte.  Returns the byte count.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     unsigned long bits, int contig)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (search_end <= cur_start) {
		WARN_ON(1);
		return 0;
	}

	spin_lock(&tree->lock);
	if (cur_start == 0 && bits == EXTENT_DIRTY) {
		/* fast path: the dirty byte total is cached on the tree */
		total_bytes = tree->dirty_bytes;
		goto out;
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;	/* gap breaks a contiguous count */
		if (state->end >= cur_start && (state->state & bits) == bits) {
			/* clamp the overlap to [cur_start, search_end] */
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return total_bytes;
}
|
|
|
/* |
|
* set the private field for a given byte offset in the tree. If there isn't |
|
* an extent_state there already, this does nothing. |
|
*/ |
|
/*
 * Store 'private' in the extent_state that begins exactly at 'start'.
 * Returns 0 on success, -ENOENT when no state starts at that offset.
 */
int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
{
	struct extent_state *state;
	struct rb_node *node;
	int ret = 0;

	spin_lock(&tree->lock);
	/* find the first extent that ends at or after 'start' */
	node = tree_search(tree, start);
	if (node) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start == start)
			state->private = private;
		else
			ret = -ENOENT;
	} else {
		ret = -ENOENT;
	}
	spin_unlock(&tree->lock);
	return ret;
}
|
|
|
/*
 * Fetch the private value of the extent_state that begins exactly at
 * 'start'.  Returns 0 on success, -ENOENT when no such state exists.
 */
int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
{
	struct extent_state *state;
	struct rb_node *node;
	int ret = 0;

	spin_lock(&tree->lock);
	/* find the first extent that ends at or after 'start' */
	node = tree_search(tree, start);
	if (node) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start == start)
			*private = state->private;
		else
			ret = -ENOENT;
	} else {
		ret = -ENOENT;
	}
	spin_unlock(&tree->lock);
	return ret;
}
|
|
|
/* |
|
* searches a range in the state tree for a given mask. |
|
* If 'filled' == 1, this returns 1 only if every extent in the tree |
|
* has the bits set. Otherwise, 1 is returned if any bit in the |
|
* range is found set. |
|
*/ |
|
/*
 * Test [start, end] for 'bits'.  With filled == 1 the whole range must
 * be covered by states carrying the bits (any hole fails); with
 * filled == 0 a single state with any of the bits suffices.  'cached'
 * is an optional starting hint.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int filled, struct extent_state *cached)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	spin_lock(&tree->lock);
	/* start the walk at the cached state when it still covers 'start' */
	if (cached && cached->tree && cached->start <= start &&
	    cached->end > start)
		node = &cached->rb_node;
	else
		node = tree_search(tree, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);

		if (filled && state->start > start) {
			/* hole before this state; 'filled' cannot hold */
			bitset = 0;
			break;
		}

		if (state->start > end)
			break;

		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;	/* any hit is enough */
		} else if (filled) {
			bitset = 0;
			break;
		}

		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
		if (!node) {
			/* ran off the tree before covering the range */
			if (filled)
				bitset = 0;
			break;
		}
	}
	spin_unlock(&tree->lock);
	return bitset;
}
|
|
|
/* |
|
* helper function to set a given page up to date if all the |
|
* extents in the tree for that page are up to date |
|
*/ |
|
static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) |
|
{ |
|
u64 start = (u64)page->index << PAGE_CACHE_SHIFT; |
|
u64 end = start + PAGE_CACHE_SIZE - 1; |
|
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) |
|
SetPageUptodate(page); |
|
} |
|
|
|
/* |
|
* helper function to unlock a page if all the extents in the tree |
|
* for that page are unlocked |
|
*/ |
|
static void check_page_locked(struct extent_io_tree *tree, struct page *page) |
|
{ |
|
u64 start = (u64)page->index << PAGE_CACHE_SHIFT; |
|
u64 end = start + PAGE_CACHE_SIZE - 1; |
|
if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) |
|
unlock_page(page); |
|
} |
|
|
|
/*
 * helper called when a writepage bio covers only part of a page; ends
 * writeback on the page (note: no per-extent writeback state is checked
 * here despite the name)
 */
|
static void check_page_writeback(struct extent_io_tree *tree,
				 struct page *page)
{
	/*
	 * NOTE(review): 'tree' is unused and writeback is ended
	 * unconditionally — there is no per-extent writeback check here.
	 */
	end_page_writeback(page);
}
|
|
|
/* |
|
* When IO fails, either with EIO or csum verification fails, we |
|
* try other mirrors that might have a good copy of the data. This |
|
* io_failure_record is used to record state as we go through all the |
|
* mirrors. If another mirror has good data, the page is set up to date |
|
* and things continue. If a good mirror can't be found, the original |
|
* bio end_io callback is called to indicate things have failed. |
|
*/ |
|
struct io_failure_record {
	struct page *page;		/* page the failed read targeted */
	u64 start;			/* file offset of the failed range */
	u64 len;			/* length of the failed range */
	u64 logical;			/* logical (btrfs) address of the data */
	unsigned long bio_flags;	/* e.g. EXTENT_BIO_COMPRESSED */
	int this_mirror;		/* mirror currently being retried */
	int failed_mirror;		/* mirror the original error came from */
	int in_validation;		/* re-read issued only for validation */
};
|
|
|
/*
 * Drop the tracking state for one io_failure_record and free it.
 *
 * Clears the record's range from the per-inode io_failure_tree and,
 * when the failure was actually repaired, the EXTENT_DAMAGED bit from
 * the inode's io tree.  Returns the first error seen, or 0.
 */
static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
			   int did_repair)
{
	int ret;
	int err = 0;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;

	/* wipe the stashed record pointer before clearing the bits */
	set_state_private(failure_tree, rec->start, 0);
	ret = clear_extent_bits(failure_tree, rec->start,
				rec->start + rec->len - 1,
				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
	if (ret)
		err = ret;

	if (did_repair) {
		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
					rec->start + rec->len - 1,
					EXTENT_DAMAGED, GFP_NOFS);
		if (ret && !err)
			err = ret;
	}

	kfree(rec);
	return err;
}
|
|
|
/* bio completion callback for the synchronous write in repair_io_failure() */
static void repair_io_failure_callback(struct bio *bio, int err)
{
	complete(bio->bi_private);
}
|
|
|
/* |
|
* this bypasses the standard btrfs submit functions deliberately, as |
|
* the standard behavior is to write all copies in a raid setup. here we only |
|
* want to write the one bad copy. so we do the mapping for ourselves and issue |
|
* submit_bio directly. |
|
* to avoid any synchonization issues, wait for the data after writing, which |
|
* actually prevents the read that triggered the error from finishing. |
|
* currently, there can be no more than two copies of every data bit. thus, |
|
* exactly one rewrite is required. |
|
*/ |
|
int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
			u64 length, u64 logical, struct page *page,
			int mirror_num)
{
	struct bio *bio;
	struct btrfs_device *dev;
	DECLARE_COMPLETION_ONSTACK(compl);
	u64 map_length = 0;
	u64 sector;
	struct btrfs_bio *bbio = NULL;
	int ret;

	BUG_ON(!mirror_num);

	bio = bio_alloc(GFP_NOFS, 1);
	if (!bio)
		return -EIO;
	bio->bi_private = &compl;
	bio->bi_end_io = repair_io_failure_callback;
	bio->bi_size = 0;
	map_length = length;

	/* map the logical address to the single mirror we want to rewrite */
	ret = btrfs_map_block(map_tree, WRITE, logical,
			      &map_length, &bbio, mirror_num);
	if (ret) {
		bio_put(bio);
		return -EIO;
	}
	BUG_ON(mirror_num != bbio->mirror_num);
	sector = bbio->stripes[mirror_num-1].physical >> 9;
	bio->bi_sector = sector;
	dev = bbio->stripes[mirror_num-1].dev;
	kfree(bbio);
	/* the target device must exist and be writable */
	if (!dev || !dev->bdev || !dev->writeable) {
		bio_put(bio);
		return -EIO;
	}
	bio->bi_bdev = dev->bdev;
	bio_add_page(bio, page, length, start-page_offset(page));
	/* synchronous write: wait until the repair reaches the disk */
	btrfsic_submit_bio(WRITE_SYNC, bio);
	wait_for_completion(&compl);

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		/* try to remap that extent elsewhere? */
		bio_put(bio);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
		return -EIO;
	}

	printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
		      "(dev %s sector %llu)\n", page->mapping->host->i_ino,
		      start, rcu_str_deref(dev->name), sector);

	bio_put(bio);
	return 0;
}
|
|
|
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, |
|
int mirror_num) |
|
{ |
|
struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; |
|
u64 start = eb->start; |
|
unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); |
|
int ret = 0; |
|
|
|
for (i = 0; i < num_pages; i++) { |
|
struct page *p = extent_buffer_page(eb, i); |
|
ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, |
|
start, p, mirror_num); |
|
if (ret) |
|
break; |
|
start += PAGE_CACHE_SIZE; |
|
} |
|
|
|
return ret; |
|
} |
|
|
|
/* |
|
* each time an IO finishes, we do a fast check in the IO failure tree |
|
* to see if we need to process or clean up an io_failure_record |
|
*/ |
|
/*
 * Check the failure tree for a record at 'start'; if one exists, try to
 * repair the bad copy (while the reader still holds the range locked)
 * and free the record.  Returns 0 when there is nothing to do, else the
 * result of free_io_failure().
 */
static int clean_io_failure(u64 start, struct page *page)
{
	u64 private;
	u64 private_failure;
	struct io_failure_record *failrec;
	struct btrfs_mapping_tree *map_tree;
	struct extent_state *state;
	int num_copies;
	int did_repair = 0;
	int ret;
	struct inode *inode = page->mapping->host;

	/* fast path: bail if the failure tree tracks nothing at all */
	private = 0;
	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
				(u64)-1, 1, EXTENT_DIRTY, 0);
	if (!ret)
		return 0;

	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
				&private_failure);
	if (ret)
		return 0;

	/* the private value stashes a pointer to the io_failure_record */
	failrec = (struct io_failure_record *)(unsigned long) private_failure;
	BUG_ON(!failrec->this_mirror);

	if (failrec->in_validation) {
		/* there was no real error, just free the record */
		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
			 failrec->start);
		did_repair = 1;
		goto out;
	}

	/* only repair while the reader still holds the range locked */
	spin_lock(&BTRFS_I(inode)->io_tree.lock);
	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
					    failrec->start,
					    EXTENT_LOCKED);
	spin_unlock(&BTRFS_I(inode)->io_tree.lock);

	if (state && state->start == failrec->start) {
		num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
					      failrec->logical, failrec->len);
		if (num_copies > 1) {
			map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
			ret = repair_io_failure(map_tree, start, failrec->len,
						failrec->logical, page,
						failrec->failed_mirror);
			did_repair = !ret;
		}
	}

out:
	if (!ret)
		ret = free_io_failure(inode, failrec, did_repair);

	return ret;
}
|
|
|
/* |
|
* this is a generic handler for readpage errors (default |
|
* readpage_io_failed_hook). if other copies exist, read those and write back |
|
* good data to the failed position. does not investigate in remapping the |
|
* failed extent elsewhere, hoping the device will be smart enough to do this as |
|
* needed |
|
*/ |
|
|
|
static int bio_readpage_error(struct bio *failed_bio, struct page *page,
			      u64 start, u64 end, int failed_mirror,
			      struct extent_state *state)
{
	struct io_failure_record *failrec = NULL;
	u64 private;
	struct extent_map *em;
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct bio *bio;
	int num_copies;
	int ret;
	int read_mode;
	u64 logical;

	/* only failed reads are repaired this way */
	BUG_ON(failed_bio->bi_rw & REQ_WRITE);

	ret = get_state_private(failure_tree, start, &private);
	if (ret) {
		/* first failure for this range: build a new record */
		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
		if (!failrec)
			return -ENOMEM;
		failrec->start = start;
		failrec->len = end - start + 1;
		failrec->this_mirror = 0;
		failrec->bio_flags = 0;
		failrec->in_validation = 0;

		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, failrec->len);
		if (!em) {
			read_unlock(&em_tree->lock);
			kfree(failrec);
			return -EIO;
		}

		if (em->start > start || em->start + em->len < start) {
			/* mapping doesn't actually cover 'start' */
			free_extent_map(em);
			em = NULL;
		}
		read_unlock(&em_tree->lock);

		if (!em) {
			kfree(failrec);
			return -EIO;
		}
		/* translate the file offset to a logical disk address */
		logical = start - em->start;
		logical = em->block_start + logical;
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			/* compressed extents are addressed by their start */
			logical = em->block_start;
			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
			extent_set_compress_type(&failrec->bio_flags,
						 em->compress_type);
		}
		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
			 "len=%llu\n", logical, start, failrec->len);
		failrec->logical = logical;
		free_extent_map(em);

		/* set the bits in the private failure tree */
		ret = set_extent_bits(failure_tree, start, end,
					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
		if (ret >= 0)
			ret = set_state_private(failure_tree, start,
						(u64)(unsigned long)failrec);
		/* set the bits in the inode's tree */
		if (ret >= 0)
			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
						GFP_NOFS);
		if (ret < 0) {
			kfree(failrec);
			return ret;
		}
	} else {
		/* a record already exists; reuse it */
		failrec = (struct io_failure_record *)(unsigned long)private;
		pr_debug("bio_readpage_error: (found) logical=%llu, "
			 "start=%llu, len=%llu, validation=%d\n",
			 failrec->logical, failrec->start, failrec->len,
			 failrec->in_validation);
		/*
		 * when data can be on disk more than twice, add to failrec here
		 * (e.g. with a list for failed_mirror) to make
		 * clean_io_failure() clean all those errors at once.
		 */
	}
	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
				      failrec->logical, failrec->len);
	if (num_copies == 1) {
		/*
		 * we only have a single copy of the data, so don't bother with
		 * all the retry and error correction code that follows. no
		 * matter what the error is, it is very likely to persist.
		 */
		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
			 "state=%p, num_copies=%d, next_mirror %d, "
			 "failed_mirror %d\n", state, num_copies,
			 failrec->this_mirror, failed_mirror);
		free_io_failure(inode, failrec, 0);
		return -EIO;
	}

	if (!state) {
		/* recover the extent state the original read locked */
		spin_lock(&tree->lock);
		state = find_first_extent_bit_state(tree, failrec->start,
						    EXTENT_LOCKED);
		if (state && state->start != failrec->start)
			state = NULL;
		spin_unlock(&tree->lock);
	}

	/*
	 * there are two premises:
	 *	a) deliver good data to the caller
	 *	b) correct the bad sectors on disk
	 */
	if (failed_bio->bi_vcnt > 1) {
		/*
		 * to fulfill b), we need to know the exact failing sectors, as
		 * we don't want to rewrite any more than the failed ones. thus,
		 * we need separate read requests for the failed bio
		 *
		 * if the following BUG_ON triggers, our validation request got
		 * merged. we need separate requests for our algorithm to work.
		 */
		BUG_ON(failrec->in_validation);
		failrec->in_validation = 1;
		failrec->this_mirror = failed_mirror;
		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
	} else {
		/*
		 * we're ready to fulfill a) and b) alongside. get a good copy
		 * of the failed sector and if we succeed, we have setup
		 * everything for repair_io_failure to do the rest for us.
		 */
		if (failrec->in_validation) {
			BUG_ON(failrec->this_mirror != failed_mirror);
			failrec->in_validation = 0;
			failrec->this_mirror = 0;
		}
		failrec->failed_mirror = failed_mirror;
		failrec->this_mirror++;
		if (failrec->this_mirror == failed_mirror)
			failrec->this_mirror++;
		read_mode = READ_SYNC;
	}

	if (!state || failrec->this_mirror > num_copies) {
		/* ran out of mirrors (or lost the state): give up */
		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
			 "next_mirror %d, failed_mirror %d\n", state,
			 num_copies, failrec->this_mirror, failed_mirror);
		free_io_failure(inode, failrec, 0);
		return -EIO;
	}

	bio = bio_alloc(GFP_NOFS, 1);
	if (!bio) {
		free_io_failure(inode, failrec, 0);
		return -EIO;
	}
	/* reuse the failed bio's completion path for the retry read */
	bio->bi_private = state;
	bio->bi_end_io = failed_bio->bi_end_io;
	bio->bi_sector = failrec->logical >> 9;
	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
	bio->bi_size = 0;

	bio_add_page(bio, page, failrec->len, start - page_offset(page));

	pr_debug("bio_readpage_error: submitting new read[%#x] to "
		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
		 failrec->this_mirror, num_copies, failrec->in_validation);

	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
					 failrec->this_mirror,
					 failrec->bio_flags, 0);
	return ret;
}
|
|
|
/* lots and lots of room for performance fixes in the end_bio funcs */ |
|
|
|
/*
 * Common tail of a writepage completion for one page range: let the fs
 * hook veto/override the success state, then mark the page as errored
 * when the write did not succeed.  Always returns 0.
 */
int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
	struct extent_io_tree *tree;
	int uptodate = (err == 0);

	tree = &BTRFS_I(page->mapping->host)->io_tree;

	/* a non-zero hook return downgrades the page to "not uptodate" */
	if (tree->ops && tree->ops->writepage_end_io_hook &&
	    tree->ops->writepage_end_io_hook(page, start, end, NULL, uptodate))
		uptodate = 0;

	if (!uptodate) {
		ClearPageUptodate(page);
		SetPageError(page);
	}
	return 0;
}
|
|
|
/* |
|
* after a writepage IO is done, we need to: |
|
* clear the uptodate bits on error |
|
* clear the writeback bits in the extent tree for this IO |
|
* end_page_writeback if the page has no more pending IO |
|
* |
|
* Scheduling is not allowed, so the extent state tree is expected |
|
* to have one and only one object corresponding to this IO. |
|
*/ |
|
static void end_bio_extent_writepage(struct bio *bio, int err)
{
	/* walk the bvecs backwards, starting from the last one */
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;
	int whole_page;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		/* file byte range covered by this bvec */
		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			 bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		/* step back before processing so we can prefetch the next page */
		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (end_extent_writepage(page, err, start, end))
			continue;

		if (whole_page)
			end_page_writeback(page);
		else
			check_page_writeback(tree, page);
	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
}
|
|
|
/* |
|
* after a readpage IO is done, we need to: |
|
* clear the uptodate bits on error |
|
* set the uptodate bits if things worked |
|
* set the page up to date if all extents in the tree are uptodate |
|
* clear the lock bit in the extent tree |
|
* unlock the page if there are no other extents locked for it |
|
* |
|
* Scheduling is not allowed, so the extent state tree is expected |
|
* to have one and only one object corresponding to this IO. |
|
*/ |
|
static void end_bio_extent_readpage(struct bio *bio, int err) |
|
{ |
|
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
|
struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; |
|
struct bio_vec *bvec = bio->bi_io_vec; |
|
struct extent_io_tree *tree; |
|
u64 start; |
|
u64 end; |
|
int whole_page; |
|
int mirror; |
|
int ret; |
|
|
|
if (err) |
|
uptodate = 0; |
|
|
|
do { |
|
struct page *page = bvec->bv_page; |
|
struct extent_state *cached = NULL; |
|
struct extent_state *state; |
|
|
|
pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " |
|
"mirror=%ld\n", (u64)bio->bi_sector, err, |
|
(long int)bio->bi_bdev); |
|
tree = &BTRFS_I(page->mapping->host)->io_tree; |
|
|
|
start = ((u64)page->index << PAGE_CACHE_SHIFT) + |
|
bvec->bv_offset; |
|
end = start + bvec->bv_len - 1; |
|
|
|
if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) |
|
whole_page = 1; |
|
else |
|
whole_page = 0; |
|
|
|
if (++bvec <= bvec_end) |
|
prefetchw(&bvec->bv_page->flags); |
|
|
|
spin_lock(&tree->lock); |
|
state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); |
|
if (state && state->start == start) { |
|
/* |
|
* take a reference on the state, unlock will drop |
|
* the ref |
|
*/ |
|
cache_state(state, &cached); |
|
} |
|
spin_unlock(&tree->lock); |
|
|
|
mirror = (int)(unsigned long)bio->bi_bdev; |
|
if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { |
|
ret = tree->ops->readpage_end_io_hook(page, start, end, |
|
state, mirror); |
|
if (ret) |
|
uptodate = 0; |
|
else |
|
clean_io_failure(start, page); |
|
} |
|
|
|
if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { |
|
ret = tree->ops->readpage_io_failed_hook(page, mirror); |
|
if (!ret && !err && |
|
test_bit(BIO_UPTODATE, &bio->bi_flags)) |
|
uptodate = 1; |
|
} else if (!uptodate) { |
|
/* |
|
* The generic bio_readpage_error handles errors the |
|
* following way: If possible, new read requests are |
|
* created and submitted and will end up in |
|
* end_bio_extent_readpage as well (if we're lucky, not |
|
* in the !uptodate case). In that case it returns 0 and |
|
* we just go on with the next page in our bio. If it |
|
* can't handle the error it will return -EIO and we |
|
* remain responsible for that page. |
|
*/ |
|