mirror of
https://github.com/versity/scoutfs.git
synced 2026-02-07 11:10:44 +00:00
scoutfs: return free extents to server
Freed file data extents are tracked in free extent items in each node. They could only be re-used in the future for file data extent allocation on that node. Allocations on other nodes or, critically, segment allocation on the server could never see those free extents. With the right allocation patterns, particularly allocating on node X and freeing on node Y, all the free extents can build up on a node and starve other allocations. This adds a simple high water mark after which nodes start returning free extents to the server. From there they can satisfy segment allocations or be sent to other nodes for file data extent allocation. Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
@@ -582,6 +582,18 @@ int scoutfs_client_alloc_extent(struct super_block *sb, u64 blocks, u64 *start,
|
||||
return ret;
|
||||
}
|
||||
|
||||
int scoutfs_client_free_extents(struct super_block *sb,
|
||||
struct scoutfs_net_extent_list *nexl)
|
||||
{
|
||||
struct client_info *client = SCOUTFS_SB(sb)->client_info;
|
||||
unsigned int bytes;
|
||||
|
||||
bytes = SCOUTFS_NET_EXTENT_LIST_BYTES(le64_to_cpu(nexl->nr));
|
||||
|
||||
return client_request(client, SCOUTFS_NET_FREE_EXTENTS,
|
||||
nexl, bytes, NULL, 0);
|
||||
}
|
||||
|
||||
int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno)
|
||||
{
|
||||
struct client_info *client = SCOUTFS_SB(sb)->client_info;
|
||||
|
||||
@@ -5,6 +5,8 @@ int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count,
|
||||
u64 *ino, u64 *nr);
|
||||
int scoutfs_client_alloc_extent(struct super_block *sb, u64 blocks, u64 *start,
|
||||
u64 *len);
|
||||
int scoutfs_client_free_extents(struct super_block *sb,
|
||||
struct scoutfs_net_extent_list *nexl);
|
||||
int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno);
|
||||
int scoutfs_client_record_segment(struct super_block *sb,
|
||||
struct scoutfs_segment *seg, u8 level);
|
||||
|
||||
@@ -277,6 +277,21 @@ SIC_TRUNC_EXTENT(struct inode *inode)
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returning extents to the server can, at most:
|
||||
* - delete MAX_NR extents with indexed copies
|
||||
* - create an extent for the leftovers of the last extent
|
||||
*/
|
||||
static inline const struct scoutfs_item_count SIC_RETURN_EXTENTS(void)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
unsigned int nr = SCOUTFS_NET_EXTENT_LIST_MAX_NR + 1;
|
||||
|
||||
cnt.items += (nr * 2);
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fallocating an extent can, at most:
|
||||
* - allocate from the server: delete two free and insert merged
|
||||
|
||||
139
kmod/src/data.c
139
kmod/src/data.c
@@ -21,6 +21,7 @@
|
||||
#include <linux/log2.h>
|
||||
#include <linux/falloc.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
#include "format.h"
|
||||
#include "super.h"
|
||||
@@ -38,6 +39,7 @@
|
||||
#include "file.h"
|
||||
#include "extents.h"
|
||||
#include "msg.h"
|
||||
#include "count.h"
|
||||
|
||||
/*
|
||||
* scoutfs uses extent items to track file data block mappings and free
|
||||
@@ -72,9 +74,17 @@
|
||||
* We ask for a fixed size from the server today.
|
||||
*/
|
||||
#define SERVER_ALLOC_BLOCKS (MAX_EXTENT_BLOCKS * 8)
|
||||
/*
|
||||
* Send free extents back to the server if we have plenty locally.
|
||||
*/
|
||||
#define NODE_FREE_HIGH_WATER_BLOCKS (SERVER_ALLOC_BLOCKS * 16)
|
||||
|
||||
struct data_info {
|
||||
struct super_block *sb;
|
||||
struct rw_semaphore alloc_rwsem;
|
||||
atomic64_t node_free_blocks;
|
||||
struct workqueue_struct *workq;
|
||||
struct work_struct return_work;
|
||||
};
|
||||
|
||||
#define DECLARE_DATA_INFO(sb, name) \
|
||||
@@ -148,10 +158,16 @@ static int init_extent_from_item(struct scoutfs_extent *ext,
|
||||
* We also index free extents by their length. We implement that by
|
||||
* keeping their _BLOCKS_ item in sync with the primary _BLKNO_ item
|
||||
* that callers operate on.
|
||||
*
|
||||
* The count of free blocks stored in node items is kept consistent by
|
||||
* updating the count every time we create or delete items. Updated
|
||||
* extents are deleted and then recreated so the count can bounce around
|
||||
* a bit, but it's OK for it to be imprecise at the margins.
|
||||
*/
|
||||
static int data_extent_io(struct super_block *sb, int op,
|
||||
struct scoutfs_extent *ext, void *data)
|
||||
{
|
||||
DECLARE_DATA_INFO(sb, datinf);
|
||||
struct scoutfs_lock *lock = data;
|
||||
struct scoutfs_file_extent fex;
|
||||
struct scoutfs_key first;
|
||||
@@ -230,6 +246,13 @@ static int data_extent_io(struct super_block *sb, int op,
|
||||
}
|
||||
}
|
||||
|
||||
if (ret == 0 && ext->type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
|
||||
if (op == SEI_INSERT)
|
||||
atomic64_add(ext->len, &datinf->node_free_blocks);
|
||||
else if (op == SEI_DELETE)
|
||||
atomic64_sub(ext->len, &datinf->node_free_blocks);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -252,6 +275,7 @@ static s64 truncate_one_extent(struct super_block *sb, struct inode *inode,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
DECLARE_DATA_INFO(sb, datinf);
|
||||
struct scoutfs_extent next;
|
||||
struct scoutfs_extent rem;
|
||||
struct scoutfs_extent fr;
|
||||
@@ -324,6 +348,11 @@ static s64 truncate_one_extent(struct super_block *sb, struct inode *inode,
|
||||
|
||||
scoutfs_inode_add_onoff(inode, online_delta, offline_delta);
|
||||
|
||||
/* start returning free extents to the server once we're over the high water mark */
|
||||
if (rem.map && (atomic64_read(&datinf->node_free_blocks) >
|
||||
NODE_FREE_HIGH_WATER_BLOCKS))
|
||||
queue_work(datinf->workq, &datinf->return_work);
|
||||
|
||||
ret = 1;
|
||||
out:
|
||||
scoutfs_extent_cleanup(ret < 0 && add_rem, scoutfs_extent_add, sb,
|
||||
@@ -1238,6 +1267,94 @@ const struct file_operations scoutfs_file_fops = {
|
||||
.fallocate = scoutfs_fallocate,
|
||||
};
|
||||
|
||||
/*
|
||||
* Return extents to the server if we're over the high water mark. Each
|
||||
* work call sends one batch of extents so that the work can be easily
|
||||
* canceled to stop progress during unmount.
|
||||
*/
|
||||
static void scoutfs_data_return_server_extents_worker(struct work_struct *work)
|
||||
{
|
||||
struct data_info *datinf = container_of(work, struct data_info,
|
||||
return_work);
|
||||
struct super_block *sb = datinf->sb;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_net_extent_list *nexl;
|
||||
struct scoutfs_extent ext;
|
||||
u64 nr = 0;
|
||||
u64 free;
|
||||
int bytes;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
trace_scoutfs_data_return_server_extents_enter(sb, 0, 0);
|
||||
|
||||
bytes = SCOUTFS_NET_EXTENT_LIST_BYTES(SCOUTFS_NET_EXTENT_LIST_MAX_NR);
|
||||
nexl = kmalloc(bytes, GFP_NOFS);
|
||||
if (!nexl) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_hold_trans(sb, SIC_RETURN_EXTENTS());
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
down_write(&datinf->alloc_rwsem);
|
||||
|
||||
free = atomic64_read(&datinf->node_free_blocks);
|
||||
|
||||
while (nr < SCOUTFS_NET_EXTENT_LIST_MAX_NR &&
|
||||
free > NODE_FREE_HIGH_WATER_BLOCKS) {
|
||||
|
||||
scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE,
|
||||
sbi->node_id, 0, 1, 0, 0);
|
||||
ret = scoutfs_extent_next(sb, data_extent_io, &ext,
|
||||
sbi->node_id_lock);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
trace_scoutfs_data_return_server_extent(sb, &ext);
|
||||
|
||||
ext.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
|
||||
ext.len = min(ext.len, free - NODE_FREE_HIGH_WATER_BLOCKS);
|
||||
|
||||
ret = scoutfs_extent_remove(sb, data_extent_io, &ext,
|
||||
sbi->node_id_lock);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
nexl->extents[nr].start = cpu_to_le64(ext.start);
|
||||
nexl->extents[nr].len = cpu_to_le64(ext.len);
|
||||
|
||||
nr++;
|
||||
free -= ext.len;
|
||||
}
|
||||
|
||||
nexl->nr = cpu_to_le64(nr);
|
||||
|
||||
up_write(&datinf->alloc_rwsem);
|
||||
|
||||
if (nr > 0) {
|
||||
err = scoutfs_client_free_extents(sb, nexl);
|
||||
/* XXX leaked extents if free failed */
|
||||
if (ret == 0 && err < 0)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
scoutfs_release_trans(sb);
|
||||
out:
|
||||
kfree(nexl);
|
||||
|
||||
trace_scoutfs_data_return_server_extents_exit(sb, nr, ret);
|
||||
|
||||
/* keep returning if we're still over the water mark */
|
||||
if (ret == 0 && (atomic64_read(&datinf->node_free_blocks) >
|
||||
NODE_FREE_HIGH_WATER_BLOCKS))
|
||||
queue_work(datinf->workq, &datinf->return_work);
|
||||
}
|
||||
|
||||
int scoutfs_data_setup(struct super_block *sb)
|
||||
{
|
||||
@@ -1248,10 +1365,19 @@ int scoutfs_data_setup(struct super_block *sb)
|
||||
if (!datinf)
|
||||
return -ENOMEM;
|
||||
|
||||
datinf->sb = sb;
|
||||
init_rwsem(&datinf->alloc_rwsem);
|
||||
atomic64_set(&datinf->node_free_blocks, 0);
|
||||
INIT_WORK(&datinf->return_work,
|
||||
scoutfs_data_return_server_extents_worker);
|
||||
|
||||
datinf->workq = alloc_workqueue("scoutfs_data", 0, 1);
|
||||
if (!datinf->workq) {
|
||||
kfree(datinf);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
sbi->data_info = datinf;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1260,5 +1386,14 @@ void scoutfs_data_destroy(struct super_block *sb)
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct data_info *datinf = sbi->data_info;
|
||||
|
||||
kfree(datinf);
|
||||
if (datinf) {
|
||||
if (datinf->workq) {
|
||||
cancel_work_sync(&datinf->return_work);
|
||||
destroy_workqueue(datinf->workq);
|
||||
datinf->workq = NULL;
|
||||
}
|
||||
|
||||
sbi->data_info = NULL;
|
||||
kfree(datinf);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -562,6 +562,20 @@ struct scoutfs_net_extent {
|
||||
__le64 len;
|
||||
} __packed;
|
||||
|
||||
struct scoutfs_net_extent_list {
|
||||
__le64 nr;
|
||||
struct {
|
||||
__le64 start;
|
||||
__le64 len;
|
||||
} __packed extents[0];
|
||||
} __packed;
|
||||
|
||||
#define SCOUTFS_NET_EXTENT_LIST_BYTES(nr) \
|
||||
offsetof(struct scoutfs_net_extent_list, extents[nr])
|
||||
|
||||
/* arbitrarily makes a nice ~1k extent list payload */
|
||||
#define SCOUTFS_NET_EXTENT_LIST_MAX_NR 64
|
||||
|
||||
/* XXX eventually we'll have net compaction and will need agents to agree */
|
||||
|
||||
/* one upper segment and fanout lower segments */
|
||||
@@ -575,6 +589,7 @@ struct scoutfs_net_extent {
|
||||
enum {
|
||||
SCOUTFS_NET_ALLOC_INODES = 0,
|
||||
SCOUTFS_NET_ALLOC_EXTENT,
|
||||
SCOUTFS_NET_FREE_EXTENTS,
|
||||
SCOUTFS_NET_ALLOC_SEGNO,
|
||||
SCOUTFS_NET_RECORD_SEGMENT,
|
||||
SCOUTFS_NET_ADVANCE_SEQ,
|
||||
|
||||
@@ -1726,6 +1726,14 @@ DEFINE_EVENT(scoutfs_work_class, scoutfs_server_workqueue_destroy,
|
||||
TP_PROTO(struct super_block *sb, u64 data, int ret),
|
||||
TP_ARGS(sb, data, ret)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_work_class, scoutfs_data_return_server_extents_enter,
|
||||
TP_PROTO(struct super_block *sb, u64 data, int ret),
|
||||
TP_ARGS(sb, data, ret)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_work_class, scoutfs_data_return_server_extents_exit,
|
||||
TP_PROTO(struct super_block *sb, u64 data, int ret),
|
||||
TP_ARGS(sb, data, ret)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_item_next_range_check,
|
||||
TP_PROTO(struct super_block *sb, int cached,
|
||||
@@ -2149,6 +2157,10 @@ DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_fiemap_extent,
|
||||
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
|
||||
TP_ARGS(sb, ext)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_return_server_extent,
|
||||
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
|
||||
TP_ARGS(sb, ext)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_alloc_extent_next,
|
||||
TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
|
||||
TP_ARGS(sb, ext)
|
||||
|
||||
@@ -743,6 +743,54 @@ out:
|
||||
return send_reply(conn, id, type, ret, &nex, sizeof(nex));
|
||||
}
|
||||
|
||||
static bool invalid_net_extent_list(struct scoutfs_net_extent_list *nexl,
|
||||
unsigned data_len)
|
||||
{
|
||||
return (data_len < sizeof(struct scoutfs_net_extent_list)) ||
|
||||
(le64_to_cpu(nexl->nr) > SCOUTFS_NET_EXTENT_LIST_MAX_NR) ||
|
||||
(data_len != offsetof(struct scoutfs_net_extent_list,
|
||||
extents[le64_to_cpu(nexl->nr)]));
|
||||
}
|
||||
|
||||
static int process_free_extents(struct server_connection *conn,
|
||||
u64 id, u8 type, void *data, unsigned data_len)
|
||||
{
|
||||
struct server_info *server = conn->server;
|
||||
struct super_block *sb = server->sb;
|
||||
struct scoutfs_net_extent_list *nexl;
|
||||
struct commit_waiter cw;
|
||||
int ret = 0;
|
||||
int err;
|
||||
u64 i;
|
||||
|
||||
nexl = data;
|
||||
if (invalid_net_extent_list(nexl, data_len)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
down_read(&server->commit_rwsem);
|
||||
|
||||
for (i = 0; i < le64_to_cpu(nexl->nr); i++) {
|
||||
ret = free_extent(sb, le64_to_cpu(nexl->extents[i].start),
|
||||
le64_to_cpu(nexl->extents[i].len));
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
if (i > 0)
|
||||
queue_commit_work(server, &cw);
|
||||
up_read(&server->commit_rwsem);
|
||||
|
||||
if (i > 0) {
|
||||
err = wait_for_commit(server, &cw, id, type);
|
||||
if (ret == 0)
|
||||
ret = err;
|
||||
}
|
||||
out:
|
||||
return send_reply(conn, id, type, ret, NULL, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* We still special case segno allocation because it's aligned and we'd
|
||||
* like to keep that detail in the server.
|
||||
@@ -1091,6 +1139,7 @@ static void scoutfs_server_process_func(struct work_struct *work)
|
||||
static process_func_t process_funcs[] = {
|
||||
[SCOUTFS_NET_ALLOC_INODES] = process_alloc_inodes,
|
||||
[SCOUTFS_NET_ALLOC_EXTENT] = process_alloc_extent,
|
||||
[SCOUTFS_NET_FREE_EXTENTS] = process_free_extents,
|
||||
[SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno,
|
||||
[SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment,
|
||||
[SCOUTFS_NET_ADVANCE_SEQ] = process_advance_seq,
|
||||
|
||||
@@ -120,12 +120,13 @@ static void scoutfs_put_super(struct super_block *sb)
|
||||
|
||||
sbi->shutdown = true;
|
||||
|
||||
scoutfs_data_destroy(sb);
|
||||
|
||||
scoutfs_unlock(sb, sbi->node_id_lock, DLM_LOCK_EX);
|
||||
sbi->node_id_lock = NULL;
|
||||
|
||||
scoutfs_shutdown_trans(sb);
|
||||
scoutfs_client_destroy(sb);
|
||||
scoutfs_data_destroy(sb);
|
||||
scoutfs_inode_destroy(sb);
|
||||
scoutfs_item_destroy(sb);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user