/*
 * Copyright (C) 2019 Versity Software, Inc.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/overflow.h>
#include <linux/math64.h>

#include "format.h"
#include "super.h"
#include "inode.h"
#include "key.h"
#include "alloc.h"
#include "data.h"
#include "trans.h"
#include "counters.h"
#include "scoutfs_trace.h"
#include "item.h"
#include "ioctl.h"
#include "btree.h"
#include "lock.h"
#include "file.h"
#include "msg.h"
#include "ext.h"
#include "util.h"

/*
 * We want to amortize work done after dirtying the shared transaction
 * accounting, but we don't want to blow out dirty allocator btree
 * blocks.  Each allocation can dirty quite a few allocator btree blocks
 * so we check in pretty often.
 */
#define EXTENTS_PER_HOLD 8

struct data_info {
	struct super_block *sb;
	struct mutex mutex;
	struct scoutfs_alloc *alloc;
	struct scoutfs_block_writer *wri;
	struct scoutfs_alloc_root data_freed;
	struct scoutfs_data_alloc dalloc;
};

#define DECLARE_DATA_INFO(sb, name) \
	struct data_info *name = SCOUTFS_SB(sb)->data_info

struct data_ext_args {
	u64 ino;
	struct inode *inode;
	struct scoutfs_lock *lock;
};

static void item_from_extent(struct scoutfs_key *key,
			     struct scoutfs_data_extent_val *dv,
			     u64 ino, u64 start, u64 len, u64 map, u8 flags)
{
	*key = (struct scoutfs_key) {
		.sk_zone = SCOUTFS_FS_ZONE,
		.skdx_ino = cpu_to_le64(ino),
		.sk_type = SCOUTFS_DATA_EXTENT_TYPE,
		.skdx_end = cpu_to_le64(start + len - 1),
		.skdx_len = cpu_to_le64(len),
	};

	*dv = (struct scoutfs_data_extent_val) {
		.blkno = cpu_to_le64(map),
		.flags = flags,
	};
}

static void ext_from_item(struct scoutfs_extent *ext, struct scoutfs_key *key,
			  struct scoutfs_data_extent_val *dv)
{
	ext->start = le64_to_cpu(key->skdx_end) - le64_to_cpu(key->skdx_len) + 1;
	ext->len = le64_to_cpu(key->skdx_len);
	ext->map = le64_to_cpu(dv->blkno);
	ext->flags = dv->flags;
}

static void data_ext_op_warn(struct inode *inode)
{
	struct scoutfs_inode_info *si;

	if (inode) {
		si = SCOUTFS_I(inode);
		WARN_ON_ONCE(!rwsem_is_locked(&si->extent_sem));
	}
}

static int data_ext_next(struct super_block *sb, void *arg, u64 start, u64 len,
			 struct scoutfs_extent *ext)
{
	struct data_ext_args *args = arg;
	struct scoutfs_data_extent_val dv;
	struct scoutfs_key key;
	struct scoutfs_key last;
	int ret;

	data_ext_op_warn(args->inode);

	item_from_extent(&last, &dv, args->ino, U64_MAX, 1, 0, 0);
	item_from_extent(&key, &dv, args->ino, start, len, 0, 0);

	ret = scoutfs_item_next(sb, &key, &last, &dv, sizeof(dv), args->lock);
	if (ret == sizeof(dv)) {
		ext_from_item(ext, &key, &dv);
		ret = 0;
	} else if (ret >= 0) {
		ret = -EIO;
	}

	if (ret < 0)
		memset(ext, 0, sizeof(struct scoutfs_extent));

	return ret;
}

static void add_onoff(struct inode *inode, u64 map, u8 flags, s64 len)
{
	s64 on = 0;
	s64 off = 0;

	if (map && !(flags & SEF_UNWRITTEN))
		on += len;
	else if (flags & SEF_OFFLINE)
		off += len;

	scoutfs_inode_add_onoff(inode, on, off);
}
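/*
 * The callbacks below implement scoutfs_ext_ops for file data extent
 * items.  insert and remove also keep the inode's online/offline block
 * counts in sync whenever a vfs inode is provided in the args.
 */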
static int data_ext_insert(struct super_block *sb, void *arg, u64 start,
			   u64 len, u64 map, u8 flags)
{
	struct data_ext_args *args = arg;
	struct scoutfs_data_extent_val dv;
	struct scoutfs_key key;
	int ret;

	data_ext_op_warn(args->inode);

	item_from_extent(&key, &dv, args->ino, start, len, map, flags);
	ret = scoutfs_item_create(sb, &key, &dv, sizeof(dv), args->lock);
	if (ret == 0 && args->inode)
		add_onoff(args->inode, map, flags, len);

	return ret;
}

static int data_ext_remove(struct super_block *sb, void *arg, u64 start,
			   u64 len, u64 map, u8 flags)
{
	struct data_ext_args *args = arg;
	struct scoutfs_data_extent_val dv;
	struct scoutfs_key key;
	int ret;

	data_ext_op_warn(args->inode);

	item_from_extent(&key, &dv, args->ino, start, len, map, flags);
	ret = scoutfs_item_delete(sb, &key, args->lock);
	if (ret == 0 && args->inode)
		add_onoff(args->inode, map, flags, -len);

	return ret;
}

static struct scoutfs_ext_ops data_ext_ops = {
	.next = data_ext_next,
	.insert = data_ext_insert,
	.remove = data_ext_remove,
};

/*
 * Find and remove or mark offline the block mappings that intersect
 * with the caller's range.  The caller is responsible for transactions
 * and locks.
 *
 * Returns:
 *  - -errno on errors
 *  - 0 if there are no more extents, to stop iteration
 *  - +iblock of the next logical block to continue truncating from
 */
static s64 truncate_extents(struct super_block *sb, struct inode *inode,
			    u64 ino, u64 iblock, u64 last, bool offline,
			    struct scoutfs_lock *lock)
{
	DECLARE_DATA_INFO(sb, datinf);
	struct data_ext_args args = {
		.ino = ino,
		.inode = inode,
		.lock = lock,
	};
	struct scoutfs_extent ext;
	struct scoutfs_extent tr;
	u64 offset;
	s64 ret;
	u8 flags;
	int err;
	int i;

	flags = offline ? SEF_OFFLINE : 0;

	ret = 0;
	for (i = 0; iblock <= last; i++) {
		if (i == EXTENTS_PER_HOLD) {
			ret = iblock;
			break;
		}

		ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext);
		if (ret < 0) {
			if (ret == -ENOENT)
				ret = 0;
			break;
		}

		/* done if we went past the region */
		if (ext.start > last) {
			ret = 0;
			break;
		}

		/* nothing to do when already offline and unmapped */
		if ((offline && (ext.flags & SEF_OFFLINE)) && !ext.map) {
			iblock = ext.start + ext.len;
			continue;
		}

		iblock = max(ext.start, iblock);
		offset = iblock - ext.start;
		tr.start = iblock;
		tr.map = ext.map ? ext.map + offset : 0;
		tr.len = min(ext.len - offset, last - iblock + 1);
		tr.flags = ext.flags;

		trace_scoutfs_data_extent_truncated(sb, ino, &tr);

		ret = scoutfs_ext_set(sb, &data_ext_ops, &args, tr.start, tr.len, 0, flags);
		if (ret < 0) {
			if (WARN_ON_ONCE(ret == -EINVAL)) {
				scoutfs_err(sb, "unexpected truncate inconsistency: ino %llu iblock %llu last %llu, start %llu len %llu",
					    ino, iblock, last, tr.start, tr.len);
			}
			break;
		}

		if (tr.map) {
			mutex_lock(&datinf->mutex);
			ret = scoutfs_free_data(sb, datinf->alloc, datinf->wri,
						&datinf->data_freed, tr.map, tr.len);
			mutex_unlock(&datinf->mutex);
			if (ret < 0) {
				err = scoutfs_ext_set(sb, &data_ext_ops, &args,
						      tr.start, tr.len, tr.map, tr.flags);
				if (err < 0)
					scoutfs_err(sb, "truncate err %d restoring extent after error %lld: ino %llu start %llu len %llu",
						    err, ret, ino, tr.start, tr.len);
				break;
			}
		}

		iblock += tr.len;
	}

	return ret;
}
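/*
 * Note that freed data extents don't go straight back to the allocator:
 * scoutfs_free_data() accumulates them in datinf->data_freed, and
 * scoutfs_data_get_btrees() later copies that root back into the
 * caller's log trees structure.
 */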
/*
 * Free blocks inside the logical block range from 'iblock' to 'last',
 * inclusive.
 *
 * If 'offline' is given then blocks are freed and an offline mapping is
 * left behind.  Only blocks that have been allocated can be marked
 * offline.
 *
 * If the inode is provided then we update its tracking of the online
 * and offline blocks.  If it's not provided then the inode is being
 * destroyed and isn't reachable, so we don't need to update it.
 *
 * The caller is in charge of locking the inode and data, but we may
 * have to modify far more items than fit in a transaction so we're in
 * charge of batching updates into transactions.  If the inode is
 * provided then we're responsible for updating its item as we go.
 */
int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
				u64 ino, u64 iblock, u64 last, bool offline,
				struct scoutfs_lock *lock)
{
	struct scoutfs_inode_info *si = NULL;
	LIST_HEAD(ind_locks);
	s64 ret = 0;

	WARN_ON_ONCE(inode && !inode_is_locked(inode));

	/* clamp last to the last possible block? */
	if (last > SCOUTFS_BLOCK_SM_MAX)
		last = SCOUTFS_BLOCK_SM_MAX;

	trace_scoutfs_data_truncate_items(sb, iblock, last, offline);

	if (WARN_ON_ONCE(last < iblock))
		return -EINVAL;

	if (inode) {
		si = SCOUTFS_I(inode);
		down_write(&si->extent_sem);
	}

	while (iblock <= last) {
		if (inode)
			ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
		else
			ret = scoutfs_hold_trans(sb, false);
		if (ret)
			break;

		if (inode)
			ret = scoutfs_dirty_inode_item(inode, lock);
		else
			ret = 0;

		if (ret == 0)
			ret = truncate_extents(sb, inode, ino, iblock, last, offline, lock);

		if (inode)
			scoutfs_update_inode_item(inode, lock, &ind_locks);
		scoutfs_release_trans(sb);
		if (inode)
			scoutfs_inode_index_unlock(sb, &ind_locks);

		if (ret <= 0)
			break;

		iblock = ret;
		ret = 0;
	}

	if (si)
		up_write(&si->extent_sem);

	return ret;
}

static inline u64 ext_last(struct scoutfs_extent *ext)
{
	return ext->start + ext->len - 1;
}

/*
 * The caller is writing to a logical iblock that doesn't have an
 * allocated extent.  The caller has searched for an extent containing
 * iblock.  If it already existed then it must be unallocated and
 * offline.
 *
 * We implement two preallocation strategies.  Typically we only
 * preallocate for simple streaming writes and limit preallocation while
 * the file is small.  The largest efficient allocation size is
 * typically large enough that it would be unreasonable to allocate that
 * much for all small files.
 *
 * Optionally, we can simply preallocate large empty aligned regions.
 * This can waste a lot of space for small or sparse files but is
 * reasonable when a file population is known to be large and dense but
 * known to be written with non-streaming write patterns.
 */
static int alloc_block(struct super_block *sb, struct inode *inode,
		       struct scoutfs_extent *ext, u64 iblock,
		       struct scoutfs_lock *lock)
{
	DECLARE_DATA_INFO(sb, datinf);
	struct scoutfs_mount_options opts;
	const u64 ino = scoutfs_ino(inode);
	struct data_ext_args args = {
		.ino = ino,
		.inode = inode,
		.lock = lock,
	};
	struct scoutfs_extent found;
	struct scoutfs_extent pre = {0,};
	bool undo_pre = false;
	u64 blkno = 0;
	u64 online;
	u64 offline;
	u8 flags;
	u64 start;
	u64 count;
	u64 rem;
	int ret;
	int err;

	trace_scoutfs_data_alloc_block_enter(sb, ino, iblock, ext);

	scoutfs_options_read(sb, &opts);

	/* can only allocate over existing unallocated offline extent */
	if (WARN_ON_ONCE(ext->len &&
			 !(iblock >= ext->start && iblock <= ext_last(ext) &&
			   ext->map == 0 && (ext->flags & SEF_OFFLINE))))
		return -EINVAL;

	mutex_lock(&datinf->mutex);

	/* default to single allocation at the written block */
	start = iblock;
	count = 1;
	/* copy existing flags for preallocated regions */
	flags = ext->len ? ext->flags : 0;
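	/*
	 * A sketch of the aligned prealloc case below, assuming a
	 * data_prealloc_blocks option of 1024 (an arbitrary example
	 * value): a write at iblock 2500 falls in the region
	 * [2048, 3071], so start becomes 2500 - (2500 % 1024) = 2048
	 * and count 1024, which is then trimmed against any
	 * neighbouring extents found in the region.
	 */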
	if (ext->len) {
		/*
		 * Assume that offline writers are going to be writing
		 * all the offline extents and try to preallocate the
		 * rest of the unwritten extent.
		 */
		count = ext->len - (iblock - ext->start);

	} else if (opts.data_prealloc_contig_only) {
		/*
		 * Only preallocate when a quick test of the online
		 * block counts looks like we're a simple streaming
		 * write.  Try to write until the next extent but limit
		 * the preallocation size to the number of online
		 * blocks.
		 */
		scoutfs_inode_get_onoff(inode, &online, &offline);
		if (iblock > 1 && iblock == online) {
			ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &found);
			if (ret < 0 && ret != -ENOENT)
				goto out;
			if (found.len && found.start > iblock)
				count = found.start - iblock;
			else
				count = opts.data_prealloc_blocks;
			count = min(iblock, count);
		}

	} else {
		/*
		 * Preallocation within aligned regions tries to
		 * allocate an extent to fill the hole in the region
		 * that contains iblock.  We'd have to add a bit of plumbing
		 * to find previous extents so we only search for a next
		 * extent from the front of the region and from iblock.
		 */
		div64_u64_rem(iblock, opts.data_prealloc_blocks, &rem);
		start = iblock - rem;
		count = opts.data_prealloc_blocks;

		ret = scoutfs_ext_next(sb, &data_ext_ops, &args, start, 1, &found);
		if (ret < 0 && ret != -ENOENT)
			goto out;

		/* trim count if there's an extent in the region before iblock */
		if (found.len && found.start < iblock) {
			count -= iblock - start;
			start = iblock;

			/* see if there's also an extent after iblock */
			ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &found);
			if (ret < 0 && ret != -ENOENT)
				goto out;
		}

		/* trim count by next extent after iblock */
		if (found.len && found.start > start && found.start < start + count)
			count = (found.start - start);
	}

	/* overall prealloc limit */
	count = min_t(u64, count, opts.data_prealloc_blocks);

	ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri,
				 &datinf->dalloc, count, &blkno, &count);
	if (ret < 0)
		goto out;

	/*
	 * An aligned prealloc attempt that gets a smaller extent can
	 * fail to cover iblock, make sure that it does.  This is a
	 * pathological case so we don't try to move the window past
	 * iblock.  Just enough to cover it, which we know is safe.
	 */
	if (start + count <= iblock)
		start += (iblock - (start + count) + 1);

	if (count > 1) {
		pre.start = start;
		pre.len = count;
		pre.map = blkno;
		pre.flags = flags | SEF_UNWRITTEN;

		ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
				      pre.start, pre.len, pre.map, pre.flags);
		if (ret < 0)
			goto out;
		undo_pre = true;
	}

	ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, 1,
			      blkno + (iblock - start), 0);
	if (ret < 0)
		goto out;
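	/*
	 * When count > 1 the preallocated region stays SEF_UNWRITTEN and
	 * only the single block at iblock has been converted to a plain
	 * mapping, so reads of the rest of the preallocated region still
	 * return zeros until those blocks are written.
	 */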
	/* tell the caller we have a single block, could check next? */
	ext->start = iblock;
	ext->len = 1;
	ext->map = blkno + (iblock - start);
	ext->flags = 0;
	ret = 0;
out:
	if (ret < 0 && blkno > 0) {
		if (undo_pre) {
			err = scoutfs_ext_set(sb, &data_ext_ops, &args,
					      pre.start, pre.len, 0, flags);
			BUG_ON(err); /* leaked preallocated extent */
		}
		err = scoutfs_free_data(sb, datinf->alloc, datinf->wri,
					&datinf->data_freed, blkno, count);
		BUG_ON(err); /* leaked free blocks */
	}

	if (ret == 0) {
		trace_scoutfs_data_alloc(sb, ino, ext);
		trace_scoutfs_data_prealloc(sb, ino, &pre);
	}

	mutex_unlock(&datinf->mutex);

	return ret;
}

static int scoutfs_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh, int create)
{
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	const u64 ino = scoutfs_ino(inode);
	struct super_block *sb = inode->i_sb;
	struct data_ext_args args;
	struct scoutfs_lock *lock = NULL;
	struct scoutfs_extent ext = {0,};
	struct scoutfs_extent un;
	u64 offset;
	int ret;

	WARN_ON_ONCE(create && !rwsem_is_locked(&si->extent_sem));

	/* make sure caller holds a cluster lock */
	lock = scoutfs_per_task_get(&si->pt_data_lock);
	if (WARN_ON_ONCE(!lock)) {
		ret = -EINVAL;
		goto out;
	}

	args.ino = ino;
	args.inode = inode;
	args.lock = lock;

	ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext);
	if (ret == -ENOENT || (ret == 0 && ext.start > iblock))
		memset(&ext, 0, sizeof(ext));
	else if (ret < 0)
		goto out;

	if (ext.len)
		trace_scoutfs_data_get_block_found(sb, ino, &ext);

	/* non-staging callers should have waited on offline blocks */
	if (WARN_ON_ONCE(ext.map && (ext.flags & SEF_OFFLINE) && !si->staging)) {
		ret = -EIO;
		goto out;
	}

	if (create && !si->staging) {
		ret = scoutfs_inode_check_retention(inode);
		if (ret < 0)
			goto out;
	}

	/* convert unwritten to written, could be staging */
	if (create && ext.map && (ext.flags & SEF_UNWRITTEN)) {
		un.start = iblock;
		un.len = 1;
		un.map = ext.map + (iblock - ext.start);
		un.flags = ext.flags & ~(SEF_OFFLINE|SEF_UNWRITTEN);
		ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
				      un.start, un.len, un.map, un.flags);
		if (ret == 0) {
			ext = un;
			set_buffer_new(bh);
		}
		goto out;
	}

	/* allocate and map blocks containing our logical block */
	if (create && !ext.map) {
		ret = alloc_block(sb, inode, &ext, iblock, lock);
		if (ret == 0)
			set_buffer_new(bh);
	} else {
		ret = 0;
	}

out:
	/* map usable extent, else leave bh unmapped for sparse reads */
	if (ret == 0 && ext.map && !(ext.flags & SEF_UNWRITTEN)) {
		offset = iblock - ext.start;
		map_bh(bh, inode->i_sb, ext.map + offset);
		bh->b_size = min_t(u64, bh->b_size,
				   (ext.len - offset) << SCOUTFS_BLOCK_SM_SHIFT);
		trace_scoutfs_data_get_block_mapped(sb, ino, &ext);
	}

	trace_scoutfs_get_block(sb, scoutfs_ino(inode), iblock, create,
				&ext, ret, bh->b_blocknr, bh->b_size);
	return ret;
}

/*
 * Typically extent item users are serialized by i_mutex.  But page
 * readers only hold the page lock and need to be protected from writers
 * in other pages which can be manipulating neighbouring extents as
 * they split and merge.
 */
static int scoutfs_get_block_read(struct inode *inode, sector_t iblock,
				  struct buffer_head *bh, int create)
{
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	int ret;

	down_read(&si->extent_sem);
	ret = scoutfs_get_block(inode, iblock, bh, create);
	up_read(&si->extent_sem);

	return ret;
}

int scoutfs_get_block_write(struct inode *inode, sector_t iblock,
			    struct buffer_head *bh, int create)
{
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	int ret;

	down_write(&si->extent_sem);
	ret = scoutfs_get_block(inode, iblock, bh, create);
	up_write(&si->extent_sem);

	return ret;
}
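/*
 * Note that scoutfs_get_block() above maps as much of the found extent
 * into bh->b_size as the caller's buffer_head allows, e.g. finding a 16
 * block extent at an offset of 3 blocks can map the remaining 13 blocks
 * at once so mpage callers can build larger bios instead of coming back
 * for every block.
 */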
/*
 * This is almost never used.  We can't block on a cluster lock while
 * holding the page lock because lock invalidation gets the page lock
 * while blocking locks.  If a non blocking lock attempt fails we unlock
 * the page and block acquiring the lock.  We unlocked the page so it
 * could have been truncated away, or whatever, so we return
 * AOP_TRUNCATED_PAGE to have the caller try again.
 *
 * A similar process happens if we try to read from an offline extent
 * that a caller hasn't already waited for.  Instead of blocking
 * acquiring the lock we block waiting for the offline extent.  The page
 * lock protects the page from release while we're checking and
 * reading the extent.
 *
 * We can return errors from locking and checking offline extents.  The
 * page is unlocked if we return an error.
 */
#ifdef KC_MPAGE_READ_FOLIO
static int scoutfs_read_folio(struct file *file, struct folio *folio)
{
	struct page *page = &folio->page;
#else
static int scoutfs_readpage(struct file *file, struct page *page)
{
#endif
	struct inode *inode = file->f_inode;
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct scoutfs_lock *inode_lock = NULL;
	SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
	DECLARE_DATA_WAIT(dw);
	int flags;
	int ret;

	flags = SCOUTFS_LKF_REFRESH_INODE | SCOUTFS_LKF_NONBLOCK;
	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, flags, inode, &inode_lock);
	if (ret < 0) {
		unlock_page(page);
		if (ret == -EAGAIN) {
			flags &= ~SCOUTFS_LKF_NONBLOCK;
			ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, flags,
						 inode, &inode_lock);
			if (ret == 0) {
				scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
				ret = AOP_TRUNCATED_PAGE;
			}
		}
		return ret;
	}

	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
		ret = scoutfs_data_wait_check(inode, page_offset(page),
					      PAGE_SIZE, SEF_OFFLINE,
					      SCOUTFS_IOC_DWO_READ, &dw,
					      inode_lock);
		if (ret != 0) {
			unlock_page(page);
			scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
			scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
		}
		if (ret > 0) {
			ret = scoutfs_data_wait(inode, &dw);
			if (ret == 0)
				ret = AOP_TRUNCATED_PAGE;
		}
		if (ret != 0)
			return ret;
	}

#ifdef KC_MPAGE_READ_FOLIO
	ret = mpage_read_folio(folio, scoutfs_get_block_read);
#else
	ret = mpage_readpage(page, scoutfs_get_block_read);
#endif

	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
	scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);

	return ret;
}
#ifndef KC_FILE_AOPS_READAHEAD
/*
 * This is used for opportunistic read-ahead which can throw the pages
 * away if it needs to.  If the caller didn't deal with offline extents
 * then we drop those pages rather than trying to wait.  Whoever is
 * staging offline extents should be doing it in enormous chunks so that
 * read-ahead can ramp up within each staged region.  The check for
 * offline extents is cheap when the inode has no offline extents.
 */
static int scoutfs_readpages(struct file *file, struct address_space *mapping,
			     struct list_head *pages, unsigned nr_pages)
{
	struct inode *inode = file->f_inode;
	struct super_block *sb = inode->i_sb;
	struct scoutfs_lock *inode_lock = NULL;
	struct page *page;
	struct page *tmp;
	int ret;

	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, SCOUTFS_LKF_REFRESH_INODE,
				 inode, &inode_lock);
	if (ret)
		goto out;

	list_for_each_entry_safe(page, tmp, pages, lru) {
		ret = scoutfs_data_wait_check(inode, page_offset(page),
					      PAGE_SIZE, SEF_OFFLINE,
					      SCOUTFS_IOC_DWO_READ, NULL,
					      inode_lock);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			list_del(&page->lru);
			put_page(page);
			if (--nr_pages == 0) {
				ret = 0;
				goto out;
			}
		}
	}

	ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block_read);
out:
	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);

	BUG_ON(!list_empty(pages));

	return ret;
}
#else
static void scoutfs_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->file->f_inode;
	struct super_block *sb = inode->i_sb;
	struct scoutfs_lock *inode_lock = NULL;
	int ret;

	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, SCOUTFS_LKF_REFRESH_INODE,
				 inode, &inode_lock);
	if (ret)
		return;

	ret = scoutfs_data_wait_check(inode, readahead_pos(rac),
				      readahead_length(rac), SEF_OFFLINE,
				      SCOUTFS_IOC_DWO_READ, NULL, inode_lock);
	if (ret == 0)
		mpage_readahead(rac, scoutfs_get_block_read);

	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
}
#endif

static int scoutfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, scoutfs_get_block_write, wbc);
}

static int scoutfs_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	return mpage_writepages(mapping, wbc, scoutfs_get_block_write);
}

/* fsdata allocated in write_begin and freed in write_end */
struct write_begin_data {
	struct list_head ind_locks;
	struct scoutfs_lock *lock;
};

static int scoutfs_write_begin(struct file *file,
			       struct address_space *mapping, loff_t pos,
			       unsigned len,
#ifdef KC_BLOCK_WRITE_BEGIN_AOP_FLAGS
			       unsigned flags,
#endif
			       struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct write_begin_data *wbd;
	u64 ind_seq;
	int ret;

	trace_scoutfs_write_begin(sb, scoutfs_ino(inode), (__u64)pos, len);

	wbd = kmalloc(sizeof(struct write_begin_data), GFP_NOFS);
	if (!wbd)
		return -ENOMEM;
	INIT_LIST_HEAD(&wbd->ind_locks);
	*fsdata = wbd;

	wbd->lock = scoutfs_per_task_get(&si->pt_data_lock);
	if (WARN_ON_ONCE(!wbd->lock)) {
		ret = -EINVAL;
		goto out;
	}

retry:
	do {
		ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
		      scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode, true) ?:
		      scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks, ind_seq, true);
	} while (ret > 0);
	if (ret < 0)
		goto out;

#ifdef KC_BLOCK_WRITE_BEGIN_AOP_FLAGS
	/* can't re-enter fs, have trans */
	flags |= AOP_FLAG_NOFS;
#endif

	/* generic write_end updates i_size and calls dirty_inode */
	ret = scoutfs_dirty_inode_item(inode, wbd->lock) ?:
	      block_write_begin(mapping, pos, len,
#ifdef KC_BLOCK_WRITE_BEGIN_AOP_FLAGS
				flags,
#endif
				pagep, scoutfs_get_block_write);
	if (ret < 0) {
		scoutfs_release_trans(sb);
		scoutfs_inode_index_unlock(sb, &wbd->ind_locks);
		if (ret == -ENOBUFS) {
			/* Retry with a new transaction. */
			scoutfs_inc_counter(sb, data_write_begin_enobufs_retry);
			goto retry;
		}
	}
out:
	if (ret < 0)
		kfree(wbd);
	return ret;
}
/* kinda like __filemap_fdatawrite_range!  :P */
static int writepages_sync_none(struct address_space *mapping, loff_t start,
				loff_t end)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	return mapping->a_ops->writepages(mapping, &wbc);
}

static int scoutfs_write_end(struct file *file, struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned copied,
			     struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct write_begin_data *wbd = fsdata;
	int ret;

	trace_scoutfs_write_end(sb, scoutfs_ino(inode), page->index, (u64)pos,
				len, copied);

	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
	if (ret > 0) {
		if (!si->staging) {
			scoutfs_inode_set_data_seq(inode);
			scoutfs_inode_inc_data_version(inode);
		}
		inode_inc_iversion(inode);
		scoutfs_update_inode_item(inode, wbd->lock, &wbd->ind_locks);
		scoutfs_inode_queue_writeback(inode);
	}
	scoutfs_release_trans(sb);
	scoutfs_inode_index_unlock(sb, &wbd->ind_locks);
	kfree(wbd);

	/*
	 * Currently transactions are kept very simple.  Only one is
	 * open at a time and commit excludes concurrent dirtying.  It
	 * writes out all dirty file data during commit.  This can lead
	 * to very long commit latencies with lots of dirty file data.
	 *
	 * This hack tries to minimize these writeback latencies while
	 * keeping concurrent large file streaming writes from
	 * suffering too terribly.  Every N bytes we kick off background
	 * writeback on the previous N bytes.  By the time transaction
	 * commit comes along it will find that dirty file blocks have
	 * already been written.
	 */
#define BACKGROUND_WRITEBACK_BYTES (16 * 1024 * 1024)
#define BACKGROUND_WRITEBACK_MASK (BACKGROUND_WRITEBACK_BYTES - 1)
	if (ret > 0 && ((pos + ret) & BACKGROUND_WRITEBACK_MASK) == 0)
		writepages_sync_none(mapping,
				     pos + ret - BACKGROUND_WRITEBACK_BYTES,
				     pos + ret - 1);

	return ret;
}
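/*
 * Worked example of the background writeback hack above: with 16MB
 * windows, a streaming write whose (pos + ret) lands exactly on a 16MB
 * boundary, say 48MB, kicks off WB_SYNC_NONE writeback for the
 * preceding window [32MB, 48MB), so a later transaction commit finds
 * most of those blocks already written.
 */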
/*
 * Try to allocate unwritten extents for any unallocated regions of the
 * logical block extent from the caller.  The caller manages locks and
 * transactions.  We limit ourselves to a reasonable number of extents
 * before returning to open another transaction.
 *
 * We return an error or the number of blocks starting at iblock that
 * were successfully processed.  The caller will continue after those
 * blocks until they reach last.
 */
static s64 fallocate_extents(struct super_block *sb, struct inode *inode,
			     u64 iblock, u64 last, struct scoutfs_lock *lock)
{
	DECLARE_DATA_INFO(sb, datinf);
	struct data_ext_args args = {
		.ino = scoutfs_ino(inode),
		.inode = inode,
		.lock = lock,
	};
	struct scoutfs_extent ext;
	u8 ext_fl;
	u64 blkno;
	u64 count;
	s64 done = 0;
	int ret = 0;
	int err;
	int i;

	for (i = 0; iblock <= last && i < EXTENTS_PER_HOLD; i++) {
		ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext);
		if (ret == -ENOENT)
			ret = 0;
		else if (ret < 0)
			break;

		/* default to allocate to end of region */
		count = last - iblock + 1;
		ext_fl = 0;

		if (!ext.len) {
			/* no extent, default alloc from above */

		} else if (ext.start <= iblock && ext.map) {
			/* skip portion of allocated extent */
			count = min_t(u64, count, ext.len - (iblock - ext.start));
			iblock += count;
			done += count;
			continue;

		} else if (ext.start <= iblock && !ext.map) {
			/* alloc portion of unallocated extent */
			count = min_t(u64, count, ext.len - (iblock - ext.start));
			ext_fl = ext.flags;

		} else if (iblock < ext.start) {
			/* alloc hole until next extent */
			count = min_t(u64, count, ext.start - iblock);
		}

		/* limit allocation attempts */
		count = min_t(u64, count, SCOUTFS_FALLOCATE_ALLOC_LIMIT);

		mutex_lock(&datinf->mutex);

		ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri,
					 &datinf->dalloc, count, &blkno, &count);
		if (ret == 0) {
			ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock,
					      count, blkno, ext_fl | SEF_UNWRITTEN);
			if (ret < 0) {
				err = scoutfs_free_data(sb, datinf->alloc,
							datinf->wri,
							&datinf->data_freed,
							blkno, count);
				BUG_ON(err); /* inconsistent */
			}
		}

		mutex_unlock(&datinf->mutex);

		if (ret < 0)
			break;

		iblock += count;
		done += count;
	}

	if (ret == 0)
		ret = done;

	return ret;
}
/*
 * Modify the extents that map the blocks that store the len byte region
 * starting at offset.
 *
 * The caller has only prevented freezing by entering a fs write
 * context.  We're responsible for all other locking and consistency.
 *
 * This can be used to preallocate files for staging.  We find existing
 * offline extents and allocate blocks for them and set unwritten.
 */
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	const u64 ino = scoutfs_ino(inode);
	struct scoutfs_lock *lock = NULL;
	LIST_HEAD(ind_locks);
	loff_t end;
	u64 iblock;
	u64 last;
	loff_t tmp;
	s64 ret;

	/* XXX support more flags */
	if (mode & ~(FALLOC_FL_KEEP_SIZE)) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	if (len == 0) {
		ret = 0;
		goto out;
	}

	/* catch wrapping */
	if (check_add_overflow(offset, len - 1, &tmp)) {
		ret = -EINVAL;
		goto out;
	}

	inode_lock(inode);

	ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE,
				 inode, &lock);
	if (ret)
		goto out_mutex;

	inode_dio_wait(inode);
	down_write(&si->extent_sem);

	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    (offset + len > i_size_read(inode))) {
		ret = inode_newsize_ok(inode, offset + len);
		if (ret)
			goto out_extent;
	}

	iblock = offset >> SCOUTFS_BLOCK_SM_SHIFT;
	last = (offset + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;

	while (iblock <= last) {
		ret = scoutfs_quota_check_data(sb, inode);
		if (ret)
			goto out_extent;

		ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false, true);
		if (ret)
			goto out_extent;

		ret = fallocate_extents(sb, inode, iblock, last, lock);

		if (ret >= 0 && !(mode & FALLOC_FL_KEEP_SIZE)) {
			end = (iblock + ret) << SCOUTFS_BLOCK_SM_SHIFT;
			if (end > offset + len)
				end = offset + len;
			if (end > i_size_read(inode)) {
				i_size_write(inode, end);
				inode_inc_iversion(inode);
				scoutfs_inode_inc_data_version(inode);
			}
		}
		if (ret >= 0)
			scoutfs_update_inode_item(inode, lock, &ind_locks);

		scoutfs_release_trans(sb);
		scoutfs_inode_index_unlock(sb, &ind_locks);

		/* txn couldn't meet the request.  Let's try with a new txn */
		if (ret == -ENOBUFS) {
			scoutfs_inc_counter(sb, data_fallocate_enobufs_retry);
			continue;
		}

		if (ret <= 0)
			goto out_extent;

		iblock += ret;
		ret = 0;
	}

out_extent:
	up_write(&si->extent_sem);

out_mutex:
	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
	inode_unlock(inode);

out:
	trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret);
	return ret;
}

/*
 * A special case of initializing a single large offline extent.  This
 * chooses not to deal with any existing extents.  It can only be used
 * on regular files with no data extents.  It's used to restore a file
 * with an offline extent which can then trigger staging.
 *
 * The caller must take care of cluster locking, transactions, inode
 * updates, and index updates (so that they can atomically make this
 * change along with other metadata changes).
 */
int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
				     struct scoutfs_lock *lock)
{
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct data_ext_args args = {
		.ino = scoutfs_ino(inode),
		.inode = inode,
		.lock = lock,
	};
	const u64 count = DIV_ROUND_UP(size, SCOUTFS_BLOCK_SM_SIZE);
	u64 on;
	u64 off;
	int ret;

	scoutfs_inode_get_onoff(inode, &on, &off);

	/* caller should have checked */
	if (on > 0 || off > 0) {
		ret = -EINVAL;
		goto out;
	}

	down_write(&si->extent_sem);
	ret = scoutfs_ext_insert(sb, &data_ext_ops, &args, 0, count, 0, SEF_OFFLINE);
	up_write(&si->extent_sem);
out:
	return ret;
}
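/*
 * A restore path would typically call scoutfs_data_init_offline_extent()
 * right after creating the inode, inside its own cluster lock and
 * transaction, and then rely on the data waiters and staging machinery
 * below to fill the offline region on demand.  The exact callers live
 * outside this file, so treat this as an assumed usage sketch.
 */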
/*
 * We're using truncate_inode_pages_range to maintain consistency
 * between the page cache and extents that just changed.  We have to
 * call with full aligned page offsets or it thinks that it should leave
 * behind a zeroed partial page.
 */
static void truncate_inode_pages_extent(struct inode *inode, u64 start, u64 len)
{
	truncate_inode_pages_range(&inode->i_data,
				   start << SCOUTFS_BLOCK_SM_SHIFT,
				   ((start + len) << SCOUTFS_BLOCK_SM_SHIFT) - 1);
}

/*
 * Move extents from one file to another.  The behaviour is more fully
 * explained above the move_blocks ioctl argument structure definition.
 *
 * The caller has processed the ioctl args and performed the most basic
 * argument sanity and inode checks, but we perform more detailed inode
 * checks once we have the inode lock and refreshed inodes.  Our job is
 * to safely lock the two files and move the extents.
 */
#define MOVE_DATA_EXTENTS_PER_HOLD 16
int scoutfs_data_move_blocks(struct inode *from, u64 from_off, u64 byte_len,
			     struct inode *to, u64 to_off, bool is_stage,
			     u64 data_version)
{
	struct scoutfs_inode_info *from_si = SCOUTFS_I(from);
	struct scoutfs_inode_info *to_si = SCOUTFS_I(to);
	struct super_block *sb = from->i_sb;
	struct scoutfs_lock *from_lock = NULL;
	struct scoutfs_lock *to_lock = NULL;
	struct data_ext_args from_args;
	struct data_ext_args to_args;
	struct scoutfs_extent ext;
	struct kc_timespec cur_time;
	LIST_HEAD(locks);
	bool done = false;
	loff_t from_size;
	loff_t to_size;
	u64 from_offline;
	u64 to_offline;
	u64 from_start;
	u64 to_start;
	u64 from_iblock;
	u64 to_iblock;
	u64 count;
	u64 junk;
	u64 seq;
	u64 map;
	u64 len;
	int ret;
	int err;
	int i;

	lock_two_nondirectories(from, to);

	ret = scoutfs_lock_inodes(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE,
				  from, &from_lock, to, &to_lock,
				  NULL, NULL, NULL, NULL);
	if (ret)
		goto out;

	if (!is_stage && (ret = scoutfs_inode_check_retention(to)))
		goto out;

	if ((from_off & SCOUTFS_BLOCK_SM_MASK) ||
	    (to_off & SCOUTFS_BLOCK_SM_MASK) ||
	    ((byte_len & SCOUTFS_BLOCK_SM_MASK) &&
	     (from_off + byte_len != i_size_read(from)))) {
		ret = -EINVAL;
		goto out;
	}

	if (is_stage && (data_version != SCOUTFS_I(to)->data_version)) {
		ret = -ESTALE;
		goto out;
	}

	from_iblock = from_off >> SCOUTFS_BLOCK_SM_SHIFT;
	count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
	to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT;
	from_start = from_iblock;

	/* only move extent blocks inside i_size, careful not to wrap */
	from_size = i_size_read(from);
	if (from_off >= from_size) {
		ret = 0;
		goto out;
	}
	if (from_off + byte_len > from_size)
		count = ((from_size - from_off) + SCOUTFS_BLOCK_SM_MASK) >>
			SCOUTFS_BLOCK_SM_SHIFT;

	if (S_ISDIR(from->i_mode) || S_ISDIR(to->i_mode)) {
		ret = -EISDIR;
		goto out;
	}
	if (!S_ISREG(from->i_mode) || !S_ISREG(to->i_mode)) {
		ret = -EINVAL;
		goto out;
	}

	ret = inode_permission(KC_VFS_INIT_NS from, MAY_WRITE) ?:
	      inode_permission(KC_VFS_INIT_NS to, MAY_WRITE);
	if (ret < 0)
		goto out;

	/* can't stage once data_version changes */
	scoutfs_inode_get_onoff(from, &junk, &from_offline);
	scoutfs_inode_get_onoff(to, &junk, &to_offline);
	if (from_offline || (to_offline && !is_stage)) {
		ret = -ENODATA;
		goto out;
	}

	from_args = (struct data_ext_args) {
		.ino = scoutfs_ino(from),
		.inode = from,
		.lock = from_lock,
	};
	to_args = (struct data_ext_args) {
		.ino = scoutfs_ino(to),
		.inode = to,
		.lock = to_lock,
	};

	inode_dio_wait(from);
	inode_dio_wait(to);

	ret = filemap_write_and_wait_range(&from->i_data, from_off,
					   from_off + byte_len - 1);
	if (ret < 0)
		goto out;

	for (;;) {
		ret = scoutfs_inode_index_start(sb, &seq) ?:
		      scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
		      scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
		      scoutfs_inode_index_try_lock_hold(sb, &locks, seq, false);
		if (ret > 0)
			continue;
		if (ret < 0)
			goto out;
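		/* dirty both inode items in this transaction before we start moving extents */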
		ret = scoutfs_dirty_inode_item(from, from_lock) ?:
		      scoutfs_dirty_inode_item(to, to_lock);
		if (ret < 0)
			goto out;

		down_write_two(&from_si->extent_sem, &to_si->extent_sem);

		/* arbitrarily limit the number of extents per trans hold */
		for (i = 0; i < MOVE_DATA_EXTENTS_PER_HOLD; i++) {
			struct scoutfs_extent off_ext;

			/* find the next extent to move */
			ret = scoutfs_ext_next(sb, &data_ext_ops, &from_args,
					       from_start, 1, &ext);
			if (ret < 0) {
				if (ret == -ENOENT) {
					done = true;
					ret = 0;
				}
				break;
			}

			/* done if next extent starts after moving region */
			if (ext.start >= from_iblock + count) {
				done = true;
				ret = 0;
				break;
			}

			from_start = max(ext.start, from_iblock);
			map = ext.map + (from_start - ext.start);
			len = min(from_iblock + count, ext.start + ext.len) - from_start;
			to_start = to_iblock + (from_start - from_iblock);

			/* we'd get stuck, shouldn't happen */
			if (WARN_ON_ONCE(len == 0)) {
				ret = -EIO;
				goto out;
			}

			if (is_stage) {
				ret = scoutfs_ext_next(sb, &data_ext_ops, &to_args,
						       to_start, 1, &off_ext);
				if (ret)
					break;

				if (!scoutfs_ext_inside(to_start, len, &off_ext) ||
				    !(off_ext.flags & SEF_OFFLINE)) {
					ret = -EINVAL;
					break;
				}

				ret = scoutfs_ext_set(sb, &data_ext_ops, &to_args,
						      to_start, len, map, ext.flags);
			} else {
				/* insert the new, fails if it overlaps */
				ret = scoutfs_ext_insert(sb, &data_ext_ops, &to_args,
							 to_start, len, map, ext.flags);
			}
			if (ret < 0)
				break;

			/* remove the old, possibly splitting */
			ret = scoutfs_ext_set(sb, &data_ext_ops, &from_args,
					      from_start, len, 0, 0);
			if (ret < 0) {
				if (is_stage) {
					/* re-mark dest range as offline */
					WARN_ON_ONCE(!(off_ext.flags & SEF_OFFLINE));
					err = scoutfs_ext_set(sb, &data_ext_ops, &to_args,
							      to_start, len, 0,
							      off_ext.flags);
				} else {
					/* remove inserted new on err */
					err = scoutfs_ext_remove(sb, &data_ext_ops,
								 &to_args, to_start, len);
				}
				BUG_ON(err); /* XXX inconsistent */
				break;
			}

			trace_scoutfs_data_move_blocks(sb, scoutfs_ino(from),
						       from_start, len, map,
						       ext.flags, scoutfs_ino(to),
						       to_start);

			/* moved extent might extend i_size */
			to_size = (to_start + len) << SCOUTFS_BLOCK_SM_SHIFT;
			if (to_size > i_size_read(to)) {
				/* while maintaining final partial */
				from_size = (from_start + len) << SCOUTFS_BLOCK_SM_SHIFT;
				if (from_size > i_size_read(from))
					to_size -= from_size - i_size_read(from);
				i_size_write(to, to_size);
			}

			/* find next after moved extent, avoiding wrapping */
			if (from_start + len < from_start)
				from_start = from_iblock + count + 1;
			else
				from_start += len;
		}

		up_write(&from_si->extent_sem);
		up_write(&to_si->extent_sem);

		cur_time = current_time(from);
		if (!is_stage) {
			to->i_ctime = to->i_mtime = cur_time;
			inode_inc_iversion(to);
			scoutfs_inode_inc_data_version(to);
			scoutfs_inode_set_data_seq(to);
		}
		from->i_ctime = from->i_mtime = cur_time;
		inode_inc_iversion(from);
		scoutfs_inode_inc_data_version(from);
		scoutfs_inode_set_data_seq(from);

		scoutfs_update_inode_item(from, from_lock, &locks);
		scoutfs_update_inode_item(to, to_lock, &locks);

		scoutfs_release_trans(sb);
		scoutfs_inode_index_unlock(sb, &locks);

		if (ret < 0 || done)
			break;
	}

	/* remove any cached pages from old extents */
	truncate_inode_pages_extent(from, from_iblock, count);
	truncate_inode_pages_extent(to, to_iblock, count);

out:
	scoutfs_unlock(sb, from_lock, SCOUTFS_LOCK_WRITE);
	scoutfs_unlock(sb, to_lock, SCOUTFS_LOCK_WRITE);
	unlock_two_nondirectories(from, to);

	return ret;
}
/*
 * This copies to userspace :/
 */
static int fill_extent(struct fiemap_extent_info *fieinfo,
		       struct scoutfs_extent *ext, u32 fiemap_flags)
{
	u32 flags;

	if (ext->len == 0)
		return 0;

	flags = fiemap_flags;
	if (ext->flags & SEF_OFFLINE)
		flags |= FIEMAP_EXTENT_UNKNOWN;
	else if (ext->flags & SEF_UNWRITTEN)
		flags |= FIEMAP_EXTENT_UNWRITTEN;

	return fiemap_fill_next_extent(fieinfo,
				       ext->start << SCOUTFS_BLOCK_SM_SHIFT,
				       ext->map << SCOUTFS_BLOCK_SM_SHIFT,
				       ext->len << SCOUTFS_BLOCK_SM_SHIFT,
				       flags);
}

/*
 * Return all the file's extents whose blocks overlap with the caller's
 * byte region.  We set _LAST on the last extent and _UNKNOWN on offline
 * extents.
 */
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
			u64 start, u64 len)
{
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	const u64 ino = scoutfs_ino(inode);
	struct scoutfs_lock *lock = NULL;
	struct scoutfs_extent *info = NULL;
	struct page *page = NULL;
	struct scoutfs_extent ext;
	struct scoutfs_extent cur;
	struct data_ext_args args;
	u32 last_flags;
	u64 iblock;
	u64 last;
	int entries = 0;
	int ret;
	int complete = 0;

	if (len == 0) {
		ret = 0;
		goto out;
	}

	ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC);
	if (ret)
		goto out;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	/* use a dummy extent to track */
	memset(&cur, 0, sizeof(cur));
	last_flags = 0;

	iblock = start >> SCOUTFS_BLOCK_SM_SHIFT;
	last = (start + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;

	args.ino = ino;
	args.inode = inode;

	/* outer loop */
	while (iblock <= last) {
		/* lock */
		inode_lock(inode);
		down_read(&si->extent_sem);
		ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
		if (ret) {
			up_read(&si->extent_sem);
			inode_unlock(inode);
			break;
		}
		args.lock = lock;

		/* collect entries */
		info = page_address(page);
		memset(info, 0, PAGE_SIZE);
		while (entries < (PAGE_SIZE / sizeof(struct fiemap_extent)) - 1) {
			ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext);
			if (ret < 0) {
				if (ret == -ENOENT)
					ret = 0;
				complete = 1;
				last_flags = FIEMAP_EXTENT_LAST;
				break;
			}

			trace_scoutfs_data_fiemap_extent(sb, ino, &ext);

			if (ext.start > last) {
				/* not setting _LAST, it's for end of file */
				ret = 0;
				complete = 1;
				break;
			}

			if (scoutfs_ext_can_merge(&cur, &ext)) {
				/* merged extents could be greater than input len */
				cur.len += ext.len;
			} else {
				/* fill it */
				memcpy(info, &cur, sizeof(cur));
				entries++;
				info++;
				cur = ext;
			}

			iblock = ext.start + ext.len;
		}

		/* unlock */
		scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
		up_read(&si->extent_sem);
		inode_unlock(inode);

		if (ret)
			break;

		/* emit entries */
		info = page_address(page);
		for (; entries > 0; entries--) {
			ret = fill_extent(fieinfo, info, 0);
			if (ret != 0)
				goto out;
			info++;
		}

		if (complete)
			break;
	}

	/* still one left, it's in cur */
	if (cur.len)
		ret = fill_extent(fieinfo, &cur, last_flags);
out:
	if (ret == 1)
		ret = 0;
	if (page)
		__free_page(page);

	trace_scoutfs_data_fiemap(sb, start, len, ret);

	return ret;
}
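/*
 * The rest of this file implements waiting on offline extents.  Tasks
 * that hit an SEF_OFFLINE extent register a scoutfs_data_wait keyed by
 * (ino, iblock) in the super's rbtree and sleep until
 * scoutfs_data_wait_changed() wakes them, while scoutfs_data_waiting()
 * reports the current waiters so the missing data can be staged.
 */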
/*
 * Insert a new waiter.  This supports multiple tasks waiting for the
 * same ino and iblock by also comparing waiters by their addresses.
 */
static void insert_offline_waiting(struct rb_root *root,
				   struct scoutfs_data_wait *ins)
{
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;
	struct scoutfs_data_wait *dw;
	int cmp;

	while (*node) {
		parent = *node;
		dw = rb_entry(*node, struct scoutfs_data_wait, node);

		cmp = scoutfs_cmp_u64s(ins->ino, dw->ino) ?:
		      scoutfs_cmp_u64s(ins->iblock, dw->iblock) ?:
		      scoutfs_cmp(ins, dw);
		if (cmp < 0)
			node = &(*node)->rb_left;
		else
			node = &(*node)->rb_right;
	}

	rb_link_node(&ins->node, parent, node);
	rb_insert_color(&ins->node, root);
}

static struct scoutfs_data_wait *next_data_wait(struct rb_root *root, u64 ino,
						u64 iblock)
{
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;
	struct scoutfs_data_wait *next = NULL;
	struct scoutfs_data_wait *dw;
	int cmp;

	while (*node) {
		parent = *node;
		dw = rb_entry(*node, struct scoutfs_data_wait, node);

		/* go left when ino/iblock are equal to get first task */
		cmp = scoutfs_cmp_u64s(ino, dw->ino) ?:
		      scoutfs_cmp_u64s(iblock, dw->iblock);
		if (cmp <= 0) {
			node = &(*node)->rb_left;
			next = dw;
		} else if (cmp > 0) {
			node = &(*node)->rb_right;
		}
	}

	return next;
}

static struct scoutfs_data_wait *dw_next(struct scoutfs_data_wait *dw)
{
	struct rb_node *node = rb_next(&dw->node);
	if (node)
		return container_of(node, struct scoutfs_data_wait, node);
	return NULL;
}
/*
 * Check if we should wait by looking for extents whose flags match.
 * Returns 0 if no matching extents were found, and a negative errno on
 * error.
 *
 * The caller must have acquired a cluster lock that covers the extent
 * items.  We acquire the extent_sem to protect our read from writers in
 * other tasks.
 *
 * Returns 1 if any file extents in the caller's region matched.  If the
 * wait struct is provided then it is initialized to be woken when the
 * extents change after the caller unlocks after the check.  The caller
 * must come through _data_wait() to clean up the wait struct if we set
 * it up.
 */
int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
			    u8 sef, u8 op, struct scoutfs_data_wait *dw,
			    struct scoutfs_lock *lock)
{
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	const u64 ino = scoutfs_ino(inode);
	struct data_ext_args args = {
		.ino = ino,
		.inode = inode,
		.lock = lock,
	};
	DECLARE_DATA_WAIT_ROOT(sb, rt);
	DECLARE_DATA_WAITQ(inode, wq);
	struct scoutfs_extent ext = {0,};
	u64 iblock;
	u64 last_block;
	u64 on;
	u64 off;
	loff_t tmp;
	int ret = 0;

	if (len == 0)
		goto out;

	if (WARN_ON_ONCE(sef & SEF_UNKNOWN) ||
	    WARN_ON_ONCE(op & SCOUTFS_IOC_DWO_UNKNOWN) ||
	    WARN_ON_ONCE(dw && !RB_EMPTY_NODE(&dw->node)) ||
	    WARN_ON_ONCE(check_add_overflow(pos, len - 1, &tmp))) {
		ret = -EINVAL;
		goto out;
	}

	if ((sef & SEF_OFFLINE)) {
		scoutfs_inode_get_onoff(inode, &on, &off);
		if (off == 0) {
			ret = 0;
			goto out;
		}
	}

	down_read(&si->extent_sem);

	iblock = pos >> SCOUTFS_BLOCK_SM_SHIFT;
	last_block = (pos + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;

	while (iblock <= last_block) {
		ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext);
		if (ret < 0) {
			if (ret == -ENOENT)
				ret = 0;
			break;
		}

		if (ext.start > last_block) {
			ret = 0;
			break;
		}

		if (sef & ext.flags) {
			if (dw) {
				dw->chg = atomic64_read(&wq->changed);
				dw->ino = ino;
				dw->iblock = max(iblock, ext.start);
				dw->op = op;

				spin_lock(&rt->lock);
				insert_offline_waiting(&rt->root, dw);
				spin_unlock(&rt->lock);
			}

			ret = 1;
			break;
		}

		iblock = ext.start + ext.len;
	}

	up_read(&si->extent_sem);
out:
	trace_scoutfs_data_wait_check(sb, ino, pos, len, sef, op, &ext, ret);
	return ret;
}

bool scoutfs_data_wait_found(struct scoutfs_data_wait *dw)
{
	return !RB_EMPTY_NODE(&dw->node);
}

int scoutfs_data_wait_check_iov(struct inode *inode, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos, u8 sef,
				u8 op, struct scoutfs_data_wait *dw,
				struct scoutfs_lock *lock)
{
	unsigned long i;
	int ret = 0;

	for (i = 0; i < nr_segs; i++) {
		if (iov[i].iov_len == 0)
			continue;

		ret = scoutfs_data_wait_check(inode, pos, iov[i].iov_len, sef,
					      op, dw, lock);
		if (ret != 0)
			break;

		pos += iov[i].iov_len;
	}

	return ret;
}

int scoutfs_data_wait(struct inode *inode, struct scoutfs_data_wait *dw)
{
	DECLARE_DATA_WAIT_ROOT(inode->i_sb, rt);
	DECLARE_DATA_WAITQ(inode, wq);
	int ret;

	ret = wait_event_interruptible(wq->waitq,
				       atomic64_read(&wq->changed) != dw->chg);

	spin_lock(&rt->lock);
	rb_erase(&dw->node, &rt->root);
	RB_CLEAR_NODE(&dw->node);
	if (!ret && dw->err)
		ret = dw->err;
	spin_unlock(&rt->lock);

	return ret;
}

void scoutfs_data_wait_changed(struct inode *inode)
{
	DECLARE_DATA_WAITQ(inode, wq);

	atomic64_inc(&wq->changed);
	wake_up(&wq->waitq);
}

long scoutfs_data_wait_err(struct inode *inode, u64 sblock, u64 eblock,
			   u64 op, long err)
{
	struct super_block *sb = inode->i_sb;
	const u64 ino = scoutfs_ino(inode);
	DECLARE_DATA_WAIT_ROOT(sb, rt);
	struct scoutfs_data_wait *dw;
	long nr = 0;

	if (!err)
		return 0;

	spin_lock(&rt->lock);
	for (dw = next_data_wait(&rt->root, ino, sblock); dw; dw = dw_next(dw)) {
		if (dw->ino != ino || dw->iblock > eblock)
			break;

		if ((dw->op & op) && !dw->err) {
			dw->err = err;
			nr++;
		}
	}
	spin_unlock(&rt->lock);

	if (nr)
		scoutfs_data_wait_changed(inode);

	return nr;
}
int scoutfs_data_waiting(struct super_block *sb, u64 ino, u64 iblock,
			 struct scoutfs_ioctl_data_waiting_entry *dwe,
			 unsigned int nr)
{
	DECLARE_DATA_WAIT_ROOT(sb, rt);
	struct scoutfs_data_wait *dw;
	int ret = 0;

	spin_lock(&rt->lock);

	dw = next_data_wait(&rt->root, ino, iblock);
	while (dw && ret < nr) {
		dwe->ino = dw->ino;
		dwe->iblock = dw->iblock;
		dwe->op = dw->op;

		while ((dw = dw_next(dw)) &&
		       (dw->ino == dwe->ino && dw->iblock == dwe->iblock)) {
			dwe->op |= dw->op;
		}

		dwe++;
		ret++;
	}

	spin_unlock(&rt->lock);

	return ret;
}

#ifdef KC_MM_VM_FAULT_T
static vm_fault_t scoutfs_data_page_mkwrite(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
#else
static int scoutfs_data_page_mkwrite(struct vm_area_struct *vma,
				     struct vm_fault *vmf)
{
#endif
	struct page *page = vmf->page;
	struct file *file = vma->vm_file;
	struct inode *inode = file_inode(file);
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct scoutfs_lock *lock = NULL;
	SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
	DECLARE_DATA_WAIT(dw);
	struct write_begin_data wbd;
	u64 ind_seq;
	loff_t pos;
	loff_t size;
	unsigned int len = PAGE_SIZE;
	vm_fault_t ret = VM_FAULT_SIGBUS;
	int err;

	pos = vmf->pgoff << PAGE_SHIFT;

	sb_start_pagefault(sb);

	err = scoutfs_lock_inode(sb, SCOUTFS_LOCK_WRITE, SCOUTFS_LKF_REFRESH_INODE,
				 inode, &lock);
	if (err) {
		ret = vmf_error(err);
		goto out;
	}

	size = i_size_read(inode);

	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, lock)) {
		/* data_version is per inode, whole file must be online */
		err = scoutfs_data_wait_check(inode, 0, size, SEF_OFFLINE,
					      SCOUTFS_IOC_DWO_WRITE, &dw, lock);
		if (err != 0) {
			if (err < 0)
				ret = vmf_error(err);
			goto out_unlock;
		}
	}

	/* scoutfs_write_begin */
	memset(&wbd, 0, sizeof(wbd));
	INIT_LIST_HEAD(&wbd.ind_locks);
	wbd.lock = lock;

	/*
	 * Start transaction before taking page locks - we want to make sure we're
	 * not locking a page, then waiting for trans, because writeback might race
	 * against it and cause a lock inversion hang - as demonstrated by both
	 * holetest and fsstress tests in xfstests.
	 */
	do {
		err = scoutfs_inode_index_start(sb, &ind_seq) ?:
		      scoutfs_inode_index_prepare(sb, &wbd.ind_locks, inode, true) ?:
		      scoutfs_inode_index_try_lock_hold(sb, &wbd.ind_locks, ind_seq, false);
	} while (err > 0);
	if (err < 0) {
		ret = vmf_error(err);
		goto out_trans;
	}

	down_write(&si->extent_sem);

	if (!trylock_page(page)) {
		ret = VM_FAULT_NOPAGE;
		goto out_sem;
	}

	ret = VM_FAULT_LOCKED;
	if ((page->mapping != inode->i_mapping) ||
	    (!PageUptodate(page)) ||
	    (page_offset(page) > size)) {
		unlock_page(page);
		ret = VM_FAULT_NOPAGE;
		goto out_sem;
	}

	if (page->index == (size - 1) >> PAGE_SHIFT)
		len = ((size - 1) & ~PAGE_MASK) + 1;

	err = __block_write_begin(page, pos, PAGE_SIZE, scoutfs_get_block);
	if (err) {
		ret = vmf_error(err);
		unlock_page(page);
		goto out_sem;
	}
	/* end scoutfs_write_begin */
	/*
	 * We mark the page dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty page and writeprotect it again.
	 */
	set_page_dirty(page);
	wait_for_stable_page(page);

	/* scoutfs_write_end */
	scoutfs_inode_set_data_seq(inode);
	scoutfs_inode_inc_data_version(inode);
	file_update_time(vma->vm_file);
	scoutfs_update_inode_item(inode, wbd.lock, &wbd.ind_locks);
	scoutfs_inode_queue_writeback(inode);

out_sem:
	up_write(&si->extent_sem);
out_trans:
	scoutfs_release_trans(sb);
	scoutfs_inode_index_unlock(sb, &wbd.ind_locks);
	/* end scoutfs_write_end */
out_unlock:
	scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
	scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
out:
	sb_end_pagefault(sb);

	if (scoutfs_data_wait_found(&dw)) {
		/*
		 * It'd be really nice to not hold the mmap_sem lock here
		 * before waiting for data, and then return VM_FAULT_RETRY
		 */
		err = scoutfs_data_wait(inode, &dw);
		if (err == 0)
			ret = VM_FAULT_NOPAGE;
		else
			ret = vmf_error(err);
	}

	trace_scoutfs_data_page_mkwrite(sb, scoutfs_ino(inode), pos,
					(__force u32)ret);
	return ret;
}

#ifdef KC_MM_VM_FAULT_T
static vm_fault_t scoutfs_data_filemap_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
#else
static int scoutfs_data_filemap_fault(struct vm_area_struct *vma,
				      struct vm_fault *vmf)
{
#endif
	struct file *file = vma->vm_file;
	struct inode *inode = file_inode(file);
	struct scoutfs_inode_info *si = SCOUTFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct scoutfs_lock *inode_lock = NULL;
	SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
	DECLARE_DATA_WAIT(dw);
	loff_t pos;
	int err;
	vm_fault_t ret = VM_FAULT_SIGBUS;

	pos = vmf->pgoff;
	pos <<= PAGE_SHIFT;

retry:
	err = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, SCOUTFS_LKF_REFRESH_INODE,
				 inode, &inode_lock);
	if (err < 0)
		return vmf_error(err);

	if (scoutfs_per_task_add_excl(&si->pt_data_lock, &pt_ent, inode_lock)) {
		/* protect checked extents from stage/release */
		atomic_inc(&inode->i_dio_count);
		err = scoutfs_data_wait_check(inode, pos, PAGE_SIZE, SEF_OFFLINE,
					      SCOUTFS_IOC_DWO_READ, &dw, inode_lock);
		if (err != 0) {
			if (err < 0)
				ret = vmf_error(err);
			goto out;
		}
	}

#ifdef KC_MM_VM_FAULT_T
	ret = filemap_fault(vmf);
#else
	ret = filemap_fault(vma, vmf);
#endif

out:
	if (scoutfs_per_task_del(&si->pt_data_lock, &pt_ent))
		kc_inode_dio_end(inode);
	scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);

	if (scoutfs_data_wait_found(&dw)) {
		err = scoutfs_data_wait(inode, &dw);
		if (err == 0)
			goto retry;
		ret = VM_FAULT_RETRY;
	}

	trace_scoutfs_data_filemap_fault(sb, scoutfs_ino(inode), pos,
					 (__force u32)ret);
	return ret;
}

static const struct vm_operations_struct scoutfs_data_file_vm_ops = {
	.fault = scoutfs_data_filemap_fault,
	.page_mkwrite = scoutfs_data_page_mkwrite,
#ifdef KC_MM_REMAP_PAGES
	.remap_pages = generic_file_remap_pages,
#endif
};

static int scoutfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &scoutfs_data_file_vm_ops;
	return 0;
}

const struct address_space_operations scoutfs_file_aops = {
#ifdef KC_MPAGE_READ_FOLIO
	.dirty_folio = block_dirty_folio,
	.invalidate_folio = block_invalidate_folio,
	.read_folio = scoutfs_read_folio,
#else
	.readpage = scoutfs_readpage,
#endif
#ifndef KC_FILE_AOPS_READAHEAD
	.readpages = scoutfs_readpages,
#else
	.readahead = scoutfs_readahead,
#endif
	.writepage = scoutfs_writepage,
	.writepages = scoutfs_writepages,
	.write_begin = scoutfs_write_begin,
	.write_end = scoutfs_write_end,
};
const struct file_operations scoutfs_file_fops = {
#ifdef KC_LINUX_HAVE_FOP_AIO_READ
	.read = do_sync_read,
	.write = do_sync_write,
	.aio_read = scoutfs_file_aio_read,
	.aio_write = scoutfs_file_aio_write,
#else
	.read_iter = scoutfs_file_read_iter,
	.write_iter = scoutfs_file_write_iter,
	.splice_read = generic_file_splice_read,
	.splice_write = iter_file_splice_write,
#endif
	.mmap = scoutfs_file_mmap,
	.unlocked_ioctl = scoutfs_ioctl,
	.fsync = scoutfs_file_fsync,
	.llseek = scoutfs_file_llseek,
	.fallocate = scoutfs_fallocate,
};

void scoutfs_data_init_btrees(struct super_block *sb,
			      struct scoutfs_alloc *alloc,
			      struct scoutfs_block_writer *wri,
			      struct scoutfs_log_trees *lt)
{
	DECLARE_DATA_INFO(sb, datinf);

	mutex_lock(&datinf->mutex);

	datinf->alloc = alloc;
	datinf->wri = wri;
	scoutfs_dalloc_init(&datinf->dalloc, &lt->data_avail);
	datinf->data_freed = lt->data_freed;

	mutex_unlock(&datinf->mutex);
}

void scoutfs_data_get_btrees(struct super_block *sb,
			     struct scoutfs_log_trees *lt)
{
	DECLARE_DATA_INFO(sb, datinf);

	mutex_lock(&datinf->mutex);

	scoutfs_dalloc_get_root(&datinf->dalloc, &lt->data_avail);
	lt->data_freed = datinf->data_freed;

	mutex_unlock(&datinf->mutex);
}

/*
 * This should be called before preparing the allocators for the commit
 * because it can allocate and free btree blocks in the data allocator.
 */
int scoutfs_data_prepare_commit(struct super_block *sb)
{
	DECLARE_DATA_INFO(sb, datinf);
	int ret;

	mutex_lock(&datinf->mutex);
	ret = scoutfs_dalloc_return_cached(sb, datinf->alloc, datinf->wri,
					   &datinf->dalloc);
	mutex_unlock(&datinf->mutex);

	return ret;
}

/*
 * Return true if the data allocator is lower than the caller's
 * requirement and we haven't been told by the server that we're out of
 * free extents.
 */
bool scoutfs_data_alloc_should_refill(struct super_block *sb, u64 blocks)
{
	DECLARE_DATA_INFO(sb, datinf);

	return (scoutfs_dalloc_total_len(&datinf->dalloc) < blocks) &&
	       !(le32_to_cpu(datinf->dalloc.root.flags) & SCOUTFS_ALLOC_FLAG_LOW);
}

int scoutfs_data_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct data_info *datinf;

	datinf = kzalloc(sizeof(struct data_info), GFP_KERNEL);
	if (!datinf)
		return -ENOMEM;

	datinf->sb = sb;
	mutex_init(&datinf->mutex);

	sbi->data_info = datinf;

	return 0;
}

void scoutfs_data_destroy(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct data_info *datinf = sbi->data_info;

	if (datinf) {
		sbi->data_info = NULL;
		kfree(datinf);
	}
}
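/*
 * A rough sketch of how the pieces above are expected to be driven by
 * the rest of the module (the callers are assumptions, not part of this
 * file): scoutfs_data_setup() runs at mount, the transaction code calls
 * scoutfs_data_init_btrees() and scoutfs_data_get_btrees() around each
 * commit with scoutfs_data_prepare_commit() returning cached allocator
 * state first, and scoutfs_data_destroy() tears it all down at unmount.
 */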