scoutfs: separate client and server net processing

The networking code was really suffering by trying to combine the client
and server processing paths into one file.  The code can be a lot
simpler by giving the client and server their own processing paths that
take their different socket lifecycles into account.

The client maintains a single connection.  Blocked senders work on the
socket under a sending mutex.  The recv path runs in work that can be
canceled after first shutting down the socket.

A long running server work function acquires the listener lock, manages
the listening socket, and accepts new sockets.  Each accepted socket has
a single recv work blocked waiting for requests.  That then spawns
concurrent processing work which sends replies under a sending mutex.
All of this is torn down by shutting down sockets and canceling work
which frees its context.

All this restructuring makes it a lot easier to track what is happening
in mount and unmount between the client and server.  This fixes bugs
where unmount was failing because the monolithic socket shutdown
function was queueing other work while it was still draining.

Signed-off-by: Zach Brown <zab@versity.com>
This commit is contained in:
Zach Brown
2017-07-26 13:59:40 -07:00
parent 74a80b772e
commit c1b2ad9421
19 changed files with 2043 additions and 2270 deletions

View File

@@ -4,9 +4,10 @@ CFLAGS_super.o = -DSCOUTFS_GIT_DESCRIBE=\"$(SCOUTFS_GIT_DESCRIBE)\"
CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include
scoutfs-y += alloc.o bio.o btree.o compact.o counters.o data.o dir.o kvec.o \
inode.o ioctl.o item.o key.o lock.o manifest.o msg.o net.o \
options.o seg.o scoutfs_trace.o sort_priv.o super.o trans.o xattr.o
scoutfs-y += alloc.o bio.o btree.o client.o compact.o counters.o data.o dir.o \
kvec.o inode.o ioctl.o item.o key.o lock.o manifest.o msg.o \
options.o seg.o server.o scoutfs_trace.o sock.o sort_priv.o \
super.o trans.o xattr.o
#
# The raw types aren't available in userspace headers. Make sure all

729
kmod/src/client.c Normal file
View File

@@ -0,0 +1,729 @@
/*
* Copyright (C) 2017 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/ioctls.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/sort.h>
#include <asm/barrier.h>
#include "format.h"
#include "counters.h"
#include "inode.h"
#include "btree.h"
#include "manifest.h"
#include "seg.h"
#include "compact.h"
#include "scoutfs_trace.h"
#include "msg.h"
#include "server.h"
#include "client.h"
#include "sock.h"
#include "endian_swap.h"
/*
* Client callers block sending requests to the server. Senders connect
* and send down the socket in their blocked context under a mutex.
* Once a socket is connected recv work is fired up. Destroying a
* socket shuts down the socket and cancels the work.
*
* Clients are responsible for resending their requests after
* reconnecting to a new socket. These new socket connections might be
* connecting to the same server. The message sending and processing
* paths are responsible for dealing with duplicate requests.
*/
#define SIN_FMT "%pIS:%u"
#define SIN_ARG(sin) sin, be16_to_cpu((sin)->sin_port)
/*
* Have a pretty aggressive keepalive timeout of around 10 seconds. The
* TCP keepalives are being processed out of task context so they should
* be responsive even when mounts are under load. We also derive the
* connect timeout from this.
*/
#define KEEPCNT 3
#define KEEPIDLE 7
#define KEEPINTVL 1
#define KEEP_TIMEO_SECS (KEEPIDLE + (KEEPCNT * KEEPINTVL))
#define CONNECT_TIMEO_SECS KEEP_TIMEO_SECS
#define CONNECT_TIMEO_MSECS (KEEP_TIMEO_SECS * MSEC_PER_SEC)
struct client_info {
struct super_block *sb;
/* spinlock protects quick critical sections between send,recv,umount */
spinlock_t recv_lock;
struct rb_root sender_root;
/* the sock mutex serializes connecting and sending */
struct mutex send_mutex;
bool recv_shutdown;
u64 next_id;
u64 sock_gen;
struct socket *sock;
struct sockaddr_in peername;
struct sockaddr_in sockname;
/* blocked senders sit on a waitq that's woken for resends */
wait_queue_head_t waitq;
struct workqueue_struct *recv_wq;
struct work_struct recv_work;
};
struct waiting_sender {
struct rb_node node;
struct task_struct *task;
u64 id;
void *rx;
size_t rx_size;
int result;
};
/*
 * Walk the rbtree of waiting senders looking for the sender with the
 * given message id.  If @ins is non-NULL it is inserted at the position
 * the walk discovered and returned; ids are unique so insertion must
 * never find an existing node.  Returns NULL on a failed lookup.
 *
 * The caller must hold recv_lock.
 */
static struct waiting_sender *walk_sender_tree(struct client_info *client,
					       u64 id,
					       struct waiting_sender *ins)
{
	struct rb_node **p = &client->sender_root.rb_node;
	struct rb_node *parent = NULL;
	struct waiting_sender *match = NULL;

	assert_spin_locked(&client->recv_lock);

	while (*p) {
		struct waiting_sender *cur;

		parent = *p;
		cur = container_of(parent, struct waiting_sender, node);

		if (id < cur->id) {
			p = &parent->rb_left;
		} else if (id > cur->id) {
			p = &parent->rb_right;
		} else {
			match = cur;
			break;
		}
	}

	if (ins) {
		/* ids are never reused and assigned under lock */
		BUG_ON(match);
		rb_link_node(&ins->node, parent, p);
		rb_insert_color(&ins->node, &client->sender_root);
		match = ins;
	}

	return match;
}
/*
 * This work is queued once the socket is created.  It blocks trying to
 * receive replies to sent messages.  If the sender is still around it
 * receives the reply data into their buffer.  If the sender has left
 * then it silently drops the reply.
 *
 * This exits once someone shuts down the socket.  If this sees a fatal
 * error it shuts down the socket which causes senders to reconnect.
 */
static void scoutfs_client_recv_func(struct work_struct *work)
{
	struct client_info *client = container_of(work, struct client_info,
						  recv_work);
	struct waiting_sender *sender;
	struct scoutfs_net_header nh;
	void *rx_alloc = NULL;		/* scratch buffer for dropped replies */
	int result = 0;
	u16 data_len;
	void *rx;
	int ret;

	for (;;) {
		/* receive the header */
		ret = scoutfs_sock_recvmsg(client->sock, &nh, sizeof(nh));
		if (ret)
			break;

		data_len = le16_to_cpu(nh.data_len);

		trace_scoutfs_client_recv_reply(client->sb,
						&client->sockname,
						&client->peername, &nh);

		/* see if we have a waiting sender */
		spin_lock(&client->recv_lock);
		sender = walk_sender_tree(client, le64_to_cpu(nh.id), NULL);
		spin_unlock(&client->recv_lock);

		if (sender) {
			if (sender->rx_size < data_len) {
				/* protocol mismatch is fatal */
				rx = NULL;
				result = -EIO;
			} else {
				rx = sender->rx;
				result = 0;
			}
		} else {
			rx = NULL;
		}

		/*
		 * No sender buffer: drain the payload into a scratch
		 * allocation so the stream stays aligned on headers.
		 * The scratch buffer is reused across iterations.
		 */
		if (!rx) {
			kfree(rx_alloc);
			rx_alloc = kmalloc(data_len, GFP_NOFS);
			if (!rx_alloc) {
				ret = -ENOMEM;
				break;
			}
			rx = rx_alloc;
		}

		/* recv failure can be server crashing, not fatal */
		ret = scoutfs_sock_recvmsg(client->sock, rx, data_len);
		if (ret) {
			break;
		}

		if (sender) {
			/* lock to keep sender around until after we wake */
			spin_lock(&client->recv_lock);
			sender->result = result;
			smp_mb(); /* store result before waking */
			wake_up_process(sender->task);
			spin_unlock(&client->recv_lock);
		}
	}

	/* make senders reconnect if we see an rx error */
	if (ret) {
		/* XXX would need to break out send */
		kernel_sock_shutdown(client->sock, SHUT_RDWR);
		client->recv_shutdown = true;
	}

	kfree(rx_alloc);
}
/*
* Spin discovering the address of the server and trying to connect to
* it until either we connect or we're interrupted by a signal.
*
* A single mount coming up starts both the server and the client. The
* server takes a few IOs and network messages to get going and communicate
* its address. We want to aggressively retry getting the address so that
* these mounts can be quick. But we back off to avoid storms waiting for
* recovery after an existing server explodes.
*/
static int client_connect(struct client_info *client)
{
struct super_block *sb = client->sb;
struct scoutfs_super_block super;
struct sockaddr_in *sin;
struct socket *sock = NULL;
struct timeval tv;
unsigned int msecs = MSEC_PER_SEC / 10;
int addrlen;
int optval;
int ret;
BUG_ON(!mutex_is_locked(&client->send_mutex));
for(;;) {
if (sock) {
sock_release(sock);
sock = NULL;
}
ret = scoutfs_read_supers(sb, &super);
if (ret)
continue;
if (super.server_addr.addr == cpu_to_le32(INADDR_ANY)) {
msleep_interruptible(msecs);
if (msecs < CONNECT_TIMEO_MSECS)
msecs = max(msecs + MSEC_PER_SEC,
CONNECT_TIMEO_MSECS);
continue;
}
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
}
sin = &client->peername;
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = le32_to_be32(super.server_addr.addr);
sin->sin_port = le16_to_be16(super.server_addr.port);
ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP,
&sock);
if (ret)
continue;
optval = 1;
ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
(char *)&optval, sizeof(optval));
if (ret)
continue;
/* start with a connect timeout */
tv.tv_sec = CONNECT_TIMEO_SECS;
tv.tv_usec = 0;
ret = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
(char *)&tv, sizeof(tv));
if (ret)
continue;
client->sock = sock;
ret = kernel_connect(sock, (struct sockaddr *)sin,
sizeof(struct sockaddr_in), 0);
if (ret)
continue;
/* but use a keepalive timeout instead of send timeout */
tv.tv_sec = 0;
tv.tv_usec = 0;
ret = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
(char *)&tv, sizeof(tv));
if (ret)
continue;
optval = KEEPCNT;
ret = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
(char *)&optval, sizeof(optval));
if (ret)
continue;
optval = KEEPIDLE;
ret = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
(char *)&optval, sizeof(optval));
if (ret)
continue;
optval = KEEPINTVL;
ret = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
(char *)&optval, sizeof(optval));
if (ret)
continue;
optval = 1;
ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
(char *)&optval, sizeof(optval));
if (ret)
continue;
addrlen = sizeof(struct sockaddr_in);
ret = kernel_getsockname(sock,
(struct sockaddr *)&client->sockname,
&addrlen);
if (ret)
continue;
scoutfs_info(sb, "client connected "SIN_FMT" -> "SIN_FMT,
SIN_ARG(&client->sockname),
SIN_ARG(&client->peername));
client->sock_gen++;
client->recv_shutdown = false;
queue_work(client->recv_wq, &client->recv_work);
wake_up(&client->waitq);
ret = 0;
break;
}
if (ret && sock)
sock_release(sock);
return ret;
}
/*
 * Either a sender or unmount is destroying the socket.  Shut it down so
 * the blocked recv work exits, wait for that work to finish, then drop
 * the socket and log the disconnect.  Safe to call with no socket.
 */
static void shutdown_sock_sync(struct client_info *client)
{
	struct super_block *sb = client->sb;
	struct socket *sock = client->sock;

	if (!sock)
		return;

	kernel_sock_shutdown(sock, SHUT_RDWR);
	cancel_work_sync(&client->recv_work);
	sock_release(sock);
	client->sock = NULL;

	scoutfs_info(sb, "client disconnected "SIN_FMT" -> "SIN_FMT,
		     SIN_ARG(&client->sockname),
		     SIN_ARG(&client->peername));
}
/*
 * Senders sleep waiting for a reply to come down the connection out
 * which they just sent a request.  They need to wake up when the recv
 * work has given them a reply or when it's given up and the sender
 * needs to reconnect and resend.
 *
 * This is a condition for wait_event.  The barrier orders the task
 * state store before loading the sender and client fields.
 */
static int sender_should_wake(struct client_info *client,
			      struct waiting_sender *sender)
{
	/* pairs with the smp_mb() in the recv work before wake_up_process */
	smp_mb();
	return sender->result != -EINPROGRESS || client->recv_shutdown;
}
/*
 * Block sending a request and then waiting for the reply.  All senders
 * are responsible for connecting sockets and sending their requests.
 * recv work blocks receiving from the socket and waking senders if
 * their reply has been copied to their buffer.  If the socket sees an
 * error the recv work will shutdown and wake us to reconnect.
 *
 * @data/@data_len is the optional request payload, @rx/@rx_size the
 * caller's reply buffer.  Returns 0 on success, -EIO if the reply was
 * larger than the buffer, or a negative errno from connecting.
 */
static int client_request(struct client_info *client, int type, void *data,
			  unsigned data_len, void *rx, size_t rx_size)
{
	struct waiting_sender sender;	/* on-stack, indexed by the rbtree */
	struct scoutfs_net_header nh;
	struct kvec kv[2];
	unsigned kv_len;
	u64 sent_to_gen = ~0ULL;	/* forces a send on the first pass */
	int ret = 0;

	if (WARN_ON_ONCE(!data && data_len))
		return -EINVAL;

	/* register ourselves so recv work can find us by message id */
	spin_lock(&client->recv_lock);
	sender.task = current;
	sender.id = client->next_id++;
	sender.rx = rx;
	sender.rx_size = rx_size;
	sender.result = -EINPROGRESS;
	nh.id = cpu_to_le64(sender.id);
	nh.data_len = cpu_to_le16(data_len);
	nh.type = type;
	nh.status = SCOUTFS_NET_STATUS_REQUEST;
	walk_sender_tree(client, sender.id, &sender);
	spin_unlock(&client->recv_lock);

	mutex_lock(&client->send_mutex);
	while (sender.result == -EINPROGRESS) {
		if (!client->sock) {
			ret = client_connect(client);
			if (ret < 0)
				break;
		}
		/* (re)send once per socket generation until we get a reply */
		if (sent_to_gen != client->sock_gen) {
			kv[0].iov_base = &nh;
			kv[0].iov_len = sizeof(nh);
			kv[1].iov_base = data;
			kv[1].iov_len = data_len;
			kv_len = data ? 2 : 1;

			trace_scoutfs_client_send_request(client->sb,
							  &client->sockname,
							  &client->peername,
							  &nh);

			ret = scoutfs_sock_sendmsg(client->sock, kv, kv_len);
			if (ret) {
				shutdown_sock_sync(client);
				continue;
			}
			sent_to_gen = client->sock_gen;
		}
		/* XXX would need to protect erase during rx if interruptible */
		mutex_unlock(&client->send_mutex);
		wait_event(client->waitq, sender_should_wake(client, &sender));
		mutex_lock(&client->send_mutex);

		/* finish tearing down the socket if recv shutdown */
		if (client->sock && client->recv_shutdown) {
			shutdown_sock_sync(client);
			continue;
		}
	}
	mutex_unlock(&client->send_mutex);

	/* safe to remove, we only finish after canceling recv or we're woke */
	spin_lock(&client->recv_lock);
	rb_erase(&sender.node, &client->sender_root);
	spin_unlock(&client->recv_lock);

	if (ret == 0)
		ret = sender.result;

	return ret;
}
/*
 * Ask the server for a region of free inode numbers and hand it to the
 * inode allocation pool.  On any failure the pool is filled with an
 * empty region so allocation fails cleanly.
 */
int scoutfs_client_alloc_inodes(struct super_block *sb)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	struct scoutfs_net_inode_alloc ial;
	u64 base = 0;
	u64 count = 0;
	int ret;

	ret = client_request(client, SCOUTFS_NET_ALLOC_INODES, NULL, 0,
			     &ial, sizeof(ial));
	if (ret == 0) {
		base = le64_to_cpu(ial.ino);
		count = le64_to_cpu(ial.nr);
		/* reject a region that wraps past ~0 */
		if (base + count < base)
			ret = -EINVAL;
	}

	if (ret < 0) {
		base = 0;
		count = 0;
	}
	scoutfs_inode_fill_pool(sb, base, count);

	return ret;
}
/*
 * Ask the server for a single free segment number.  Returns -ENOSPC
 * when the server answers with segno 0, which it uses to indicate that
 * no segments are free.
 */
int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	__le64 raw;
	int ret;

	ret = client_request(client, SCOUTFS_NET_ALLOC_SEGNO, NULL, 0,
			     &raw, sizeof(raw));
	if (ret)
		return ret;

	if (raw == 0)
		return -ENOSPC;

	*segno = le64_to_cpu(raw);
	return 0;
}
/*
 * Tell the server to record a written segment in the manifest at the
 * given level.  The manifest entry is marshalled into an allocated
 * network form which is freed before returning.
 */
int scoutfs_client_record_segment(struct super_block *sb,
				  struct scoutfs_segment *seg, u8 level)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	struct scoutfs_net_manifest_entry *net_ment;
	struct scoutfs_manifest_entry ment;
	int ret = -ENOMEM;

	scoutfs_seg_init_ment(&ment, level, seg);

	net_ment = scoutfs_alloc_net_ment(&ment);
	if (net_ment) {
		ret = client_request(client, SCOUTFS_NET_RECORD_SEGMENT,
				     net_ment,
				     scoutfs_net_ment_bytes(net_ment),
				     NULL, 0);
		kfree(net_ment);
	}

	return ret;
}
/* sort() comparison callback: ascending order of u64 values */
static int sort_cmp_u64s(const void *A, const void *B)
{
	const u64 *a = A;
	const u64 *b = B;

	if (*a < *b)
		return -1;
	if (*a > *b)
		return 1;
	return 0;
}
static void sort_swap_u64s(void *A, void *B, int size)
{
u64 *a = A;
u64 *b = B;
swap(*a, *b);
}
/*
 * Ask the server for a batch of free segnos.  Returns a 0-terminated
 * allocated array of sorted, unique, non-zero segnos; the caller is
 * responsible for freeing it.  Returns ERR_PTR on failure, including
 * -ENOSPC when the server hands back no segnos.
 *
 * This double alloc is silly.  But the caller does have an easier time
 * with native u64s.  We'll probably clean this up.
 */
u64 *scoutfs_client_bulk_alloc(struct super_block *sb)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	struct scoutfs_net_segnos *resp = NULL;
	u64 *segnos = NULL;
	size_t bytes;
	unsigned count;
	u64 last;
	int ret;
	int i;

	bytes = offsetof(struct scoutfs_net_segnos,
			 segnos[SCOUTFS_BULK_ALLOC_COUNT]);
	resp = kmalloc(bytes, GFP_NOFS);
	if (!resp) {
		ret = -ENOMEM;
		goto out;
	}

	ret = client_request(client, SCOUTFS_NET_BULK_ALLOC, NULL, 0,
			     resp, bytes);
	if (ret)
		goto out;

	count = le16_to_cpu(resp->nr);
	if (count == 0) {
		ret = -ENOSPC;
		goto out;
	}
	if (count > SCOUTFS_BULK_ALLOC_COUNT) {
		ret = -EINVAL;
		goto out;
	}

	segnos = kmalloc_array(count + 1, sizeof(*segnos), GFP_NOFS);
	if (!segnos) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < count; i++)
		segnos[i] = le64_to_cpu(resp->segnos[i]);
	segnos[count] = 0;

	/* sort segnos for the caller so they can merge easily */
	sort(segnos, count, sizeof(segnos[0]), sort_cmp_u64s, sort_swap_u64s);

	/* make sure they're all non-zero and unique (last starts at 0) */
	ret = 0;
	last = 0;
	for (i = 0; i < count; i++) {
		if (segnos[i] == last) {
			ret = -EINVAL;
			break;
		}
		last = segnos[i];
	}

out:
	kfree(resp);
	if (ret) {
		kfree(segnos);
		segnos = ERR_PTR(ret);
	}
	return segnos;
}
/*
 * Send the current transaction seq to the server and receive the next
 * one to use.  *seq is both input and output.
 */
int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	__le64 prev = cpu_to_le64p(seq);
	__le64 next;
	int ret;

	ret = client_request(client, SCOUTFS_NET_ADVANCE_SEQ,
			     &prev, sizeof(prev), &next, sizeof(next));
	if (ret == 0)
		*seq = le64_to_cpu(next);

	return ret;
}
/* Ask the server for the last stable transaction seq. */
int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq)
{
	struct client_info *client = SCOUTFS_SB(sb)->client_info;
	__le64 raw;
	int ret;

	ret = client_request(client, SCOUTFS_NET_GET_LAST_SEQ,
			     NULL, 0, &raw, sizeof(raw));
	if (ret == 0)
		*seq = le64_to_cpu(raw);

	return ret;
}
int scoutfs_client_get_manifest_root(struct super_block *sb,
struct scoutfs_btree_root *root)
{
struct client_info *client = SCOUTFS_SB(sb)->client_info;
return client_request(client, SCOUTFS_NET_GET_MANIFEST_ROOT,
NULL, 0, root, sizeof(struct scoutfs_btree_root));
}
/*
 * Allocate and initialize the per-mount client state and its ordered
 * recv workqueue.  No socket is created here; senders connect lazily.
 */
int scoutfs_client_setup(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct client_info *client;

	client = kzalloc(sizeof(*client), GFP_KERNEL);
	if (!client)
		return -ENOMEM;

	client->sb = sb;
	client->sender_root = RB_ROOT;
	spin_lock_init(&client->recv_lock);
	mutex_init(&client->send_mutex);
	init_waitqueue_head(&client->waitq);
	INIT_WORK(&client->recv_work, scoutfs_client_recv_func);

	/* single-threaded: only one recv work instance ever runs */
	client->recv_wq = alloc_workqueue("scoutfs_client_recv",
					  WQ_UNBOUND, 1);
	if (!client->recv_wq) {
		kfree(client);
		return -ENOMEM;
	}

	sbi->client_info = client;
	return 0;
}
/*
 * There must be no more callers to the client send functions by the
 * time we get here.  We just need to free the socket if it's still
 * sitting around, stop the recv work, and tear down the state.
 */
void scoutfs_client_destroy(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct client_info *client = sbi->client_info;

	if (!client)
		return;

	shutdown_sock_sync(client);
	cancel_work_sync(&client->recv_work);
	destroy_workqueue(client->recv_wq);
	kfree(client);
	sbi->client_info = NULL;
}

17
kmod/src/client.h Normal file
View File

@@ -0,0 +1,17 @@
#ifndef _SCOUTFS_CLIENT_H_
#define _SCOUTFS_CLIENT_H_
int scoutfs_client_alloc_inodes(struct super_block *sb);
int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno);
int scoutfs_client_record_segment(struct super_block *sb,
struct scoutfs_segment *seg, u8 level);
u64 *scoutfs_client_bulk_alloc(struct super_block *sb);
int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq);
int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq);
int scoutfs_client_get_manifest_root(struct super_block *sb,
struct scoutfs_btree_root *root);
int scoutfs_client_setup(struct super_block *sb);
void scoutfs_client_destroy(struct super_block *sb);
#endif

View File

@@ -24,7 +24,7 @@
#include "manifest.h"
#include "counters.h"
#include "alloc.h"
#include "net.h"
#include "server.h"
#include "scoutfs_trace.h"
/*
@@ -579,7 +579,7 @@ static void scoutfs_compact_func(struct work_struct *work)
INIT_LIST_HEAD(&curs.csegs);
scoutfs_bio_init_comp(&comp);
ret = scoutfs_net_get_compaction(sb, (void *)&curs);
ret = scoutfs_client_get_compaction(sb, (void *)&curs);
/* short circuit no compaction work to do */
if (ret == 0 && list_empty(&curs.csegs))
@@ -610,7 +610,7 @@ static void scoutfs_compact_func(struct work_struct *work)
free_cseg_list(sb, &results);
}
err = scoutfs_net_finish_compaction(sb, &curs, &results);
err = scoutfs_client_finish_compaction(sb, &curs, &results);
if (!ret && err)
ret = err;

View File

@@ -29,7 +29,7 @@
#include "scoutfs_trace.h"
#include "item.h"
#include "ioctl.h"
#include "net.h"
#include "client.h"
#include "lock.h"
#define EXTF "[off %llu bno %llu bks %llu fl %x]"
@@ -731,7 +731,7 @@ static int bulk_alloc(struct super_block *sb)
int ret;
int i;
segnos = scoutfs_net_bulk_alloc(sb);
segnos = scoutfs_client_bulk_alloc(sb);
if (IS_ERR(segnos)) {
ret = PTR_ERR(segnos);
goto out;

View File

@@ -31,7 +31,7 @@
#include "msg.h"
#include "kvec.h"
#include "item.h"
#include "net.h"
#include "client.h"
/*
* XXX
@@ -670,7 +670,7 @@ static int alloc_ino(struct super_block *sb, u64 *ino)
spin_unlock(&pool->lock);
if (request) {
ret = scoutfs_net_alloc_inodes(sb);
ret = scoutfs_client_alloc_inodes(sb);
if (ret) {
spin_lock(&pool->lock);
pool->in_flight = false;

View File

@@ -29,7 +29,7 @@
#include "inode.h"
#include "item.h"
#include "data.h"
#include "net.h"
#include "client.h"
#include "lock.h"
#include "manifest.h"
@@ -90,7 +90,7 @@ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg)
if (type == SCOUTFS_INODE_INDEX_META_SEQ_TYPE ||
type == SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE) {
ret = scoutfs_net_get_last_seq(sb, &last_seq);
ret = scoutfs_client_get_last_seq(sb, &last_seq);
if (ret)
return ret;

View File

@@ -26,7 +26,7 @@
#include "manifest.h"
#include "trans.h"
#include "counters.h"
#include "net.h"
#include "client.h"
#include "scoutfs_trace.h"
/*
@@ -601,7 +601,7 @@ static int read_items(struct super_block *sb, struct scoutfs_key_buf *key,
* either get a manifest ref in the lvb of their lock or they'll
* ask the server the first time the system sees the lock.
*/
ret = scoutfs_net_get_manifest_root(sb, &root);
ret = scoutfs_client_get_manifest_root(sb, &root);
if (ret)
goto out;

File diff suppressed because it is too large Load Diff

View File

@@ -1,25 +0,0 @@
#ifndef _SCOUTFS_NET_H_
#define _SCOUTFS_NET_H_
struct scoutfs_key_buf;
struct scoutfs_segment;
int scoutfs_net_alloc_inodes(struct super_block *sb);
int scoutfs_net_alloc_segno(struct super_block *sb, u64 *segno);
int scoutfs_net_record_segment(struct super_block *sb,
struct scoutfs_segment *seg, u8 level);
u64 *scoutfs_net_bulk_alloc(struct super_block *sb);
int scoutfs_net_get_compaction(struct super_block *sb, void *curs);
int scoutfs_net_finish_compaction(struct super_block *sb, void *curs,
void *list);
int scoutfs_net_get_last_seq(struct super_block *sb, u64 *seq);
int scoutfs_net_advance_seq(struct super_block *sb, u64 *seq);
int scoutfs_net_get_manifest_root(struct super_block *sb,
struct scoutfs_btree_root *root);
int scoutfs_net_setup(struct super_block *sb);
void scoutfs_net_destroy(struct super_block *sb);
#endif

View File

@@ -23,6 +23,7 @@
#define _TRACE_SCOUTFS_H
#include <linux/tracepoint.h>
#include <linux/in.h>
#include <linux/unaligned/access_ok.h>
#include "key.h"
@@ -420,6 +421,67 @@ DEFINE_EVENT(scoutfs_seg_class, scoutfs_seg_free,
TP_ARGS(seg)
);
DECLARE_EVENT_CLASS(scoutfs_net_class,
TP_PROTO(struct super_block *sb, struct sockaddr_in *name,
struct sockaddr_in *peer, struct scoutfs_net_header *nh),
TP_ARGS(sb, name, peer, nh),
TP_STRUCT__entry(
__field(unsigned int, major)
__field(unsigned int, minor)
__field(u32, name_addr)
__field(u16, name_port)
__field(u32, peer_addr)
__field(u16, peer_port)
__field(u64, id)
__field(u8, type)
__field(u8, status)
__field(u16, data_len)
),
TP_fast_assign(
__entry->major = MAJOR(sb->s_bdev->bd_dev);
__entry->minor = MINOR(sb->s_bdev->bd_dev);
/* sparse can't handle this cpp nightmare */
__entry->name_addr = (u32 __force)name->sin_addr.s_addr;
__entry->name_port = be16_to_cpu(name->sin_port);
__entry->peer_addr = (u32 __force)peer->sin_addr.s_addr;
__entry->peer_port = be16_to_cpu(peer->sin_port);
__entry->id = le64_to_cpu(nh->id);
__entry->type = nh->type;
__entry->status = nh->status;
__entry->data_len = le16_to_cpu(nh->data_len);
),
TP_printk("dev %u:%u %pI4:%u -> %pI4:%u id %llu type %u status %u data_len %u",
__entry->major, __entry->minor,
&__entry->name_addr, __entry->name_port,
&__entry->peer_addr, __entry->peer_port,
__entry->id, __entry->type, __entry->status,
__entry->data_len)
);
DEFINE_EVENT(scoutfs_net_class, scoutfs_client_send_request,
TP_PROTO(struct super_block *sb, struct sockaddr_in *name,
struct sockaddr_in *peer, struct scoutfs_net_header *nh),
TP_ARGS(sb, name, peer, nh)
);
DEFINE_EVENT(scoutfs_net_class, scoutfs_server_recv_request,
TP_PROTO(struct super_block *sb, struct sockaddr_in *name,
struct sockaddr_in *peer, struct scoutfs_net_header *nh),
TP_ARGS(sb, name, peer, nh)
);
DEFINE_EVENT(scoutfs_net_class, scoutfs_server_send_reply,
TP_PROTO(struct super_block *sb, struct sockaddr_in *name,
struct sockaddr_in *peer, struct scoutfs_net_header *nh),
TP_ARGS(sb, name, peer, nh)
);
DEFINE_EVENT(scoutfs_net_class, scoutfs_client_recv_reply,
TP_PROTO(struct super_block *sb, struct sockaddr_in *name,
struct sockaddr_in *peer, struct scoutfs_net_header *nh),
TP_ARGS(sb, name, peer, nh)
);
#endif /* _TRACE_SCOUTFS_H */
/* This part must be outside protection */

1051
kmod/src/server.c Normal file

File diff suppressed because it is too large Load Diff

20
kmod/src/server.h Normal file
View File

@@ -0,0 +1,20 @@
#ifndef _SCOUTFS_SERVER_H_
#define _SCOUTFS_SERVER_H_
void scoutfs_init_net_ment_keys(struct scoutfs_net_manifest_entry *net_ment,
struct scoutfs_key_buf *first,
struct scoutfs_key_buf *last);
struct scoutfs_net_manifest_entry *
scoutfs_alloc_net_ment(struct scoutfs_manifest_entry *ment);
void scoutfs_init_ment_net_ment(struct scoutfs_manifest_entry *ment,
struct scoutfs_net_manifest_entry *net_ment);
unsigned scoutfs_net_ment_bytes(struct scoutfs_net_manifest_entry *net_ment);
int scoutfs_client_get_compaction(struct super_block *sb, void *curs);
int scoutfs_client_finish_compaction(struct super_block *sb, void *curs,
void *list);
int scoutfs_server_setup(struct super_block *sb);
void scoutfs_server_destroy(struct super_block *sb);
#endif

96
kmod/src/sock.c Normal file
View File

@@ -0,0 +1,96 @@
/*
* Copyright (C) 2017 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/ioctls.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in.h>
#include <net/sock.h>
#include <net/tcp.h>
#include "sock.h"
/*
* Some quick socket helper wrappers.
*/
/*
 * Advance a kvec array past the given number of consumed bytes,
 * dropping fully consumed entries and trimming a partially consumed
 * one in place.  Returns the new first entry and updates *kv_len.
 */
static struct kvec *kvec_advance(struct kvec *kv, unsigned *kv_len,
				 unsigned bytes)
{
	while (*kv_len && bytes) {
		if (bytes < kv->iov_len) {
			/* partial entry: trim it and stop */
			kv->iov_base += bytes;
			kv->iov_len -= bytes;
			break;
		}
		bytes -= kv->iov_len;
		kv++;
		(*kv_len)--;
	}

	return kv;
}
/*
 * Send the full contents of the kvec array, looping over partial sends.
 * This can modify the kvec as it advances the vec to continue after a
 * partial send.  Returns 0 or -ECONNABORTED on any send failure.
 */
int scoutfs_sock_sendmsg(struct socket *sock, struct kvec *kv, unsigned kv_len)
{
	struct msghdr msg;
	int sent;

	while (kv_len) {
		memset(&msg, 0, sizeof(msg));
		msg.msg_iov = (struct iovec *)kv;
		msg.msg_iovlen = kv_len;
		msg.msg_flags = MSG_NOSIGNAL;

		sent = kernel_sendmsg(sock, &msg, kv, kv_len,
				      iov_length((struct iovec *)kv, kv_len));
		if (sent <= 0)
			return -ECONNABORTED;

		kv = kvec_advance(kv, &kv_len, sent);
	}

	return 0;
}
/*
 * Receive exactly len bytes into buf, looping over short receives.
 * Returns 0 or -ECONNABORTED on any receive failure or shutdown.
 */
int scoutfs_sock_recvmsg(struct socket *sock, void *buf, unsigned len)
{
	struct msghdr msg;
	struct kvec kv;
	int got;

	while (len) {
		memset(&msg, 0, sizeof(msg));
		kv.iov_base = buf;
		kv.iov_len = len;
		msg.msg_iov = (struct iovec *)&kv;
		msg.msg_iovlen = 1;
		msg.msg_flags = MSG_NOSIGNAL;

		got = kernel_recvmsg(sock, &msg, &kv, 1, len, msg.msg_flags);
		if (got <= 0)
			return -ECONNABORTED;

		len -= got;
		buf += got;
	}

	return 0;
}

7
kmod/src/sock.h Normal file
View File

@@ -0,0 +1,7 @@
#ifndef _SCOUTFS_SOCK_H_
#define _SCOUTFS_SOCK_H_
int scoutfs_sock_recvmsg(struct socket *sock, void *buf, unsigned len);
int scoutfs_sock_sendmsg(struct socket *sock, struct kvec *kv, unsigned kv_len);
#endif

View File

@@ -36,7 +36,8 @@
#include "compact.h"
#include "data.h"
#include "lock.h"
#include "net.h"
#include "client.h"
#include "server.h"
#include "options.h"
#include "scoutfs_trace.h"
@@ -241,27 +242,49 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
scoutfs_inode_setup(sb) ?:
scoutfs_data_setup(sb) ?:
scoutfs_setup_trans(sb) ?:
scoutfs_lock_setup(sb) ?:
scoutfs_net_setup(sb);
scoutfs_lock_setup(sb);
if (ret)
return ret;
/*
* The server is a bit magical because it can try to read the
* device in async work context. Once we return an error from
* here the kernel starts tearing down the mount and it isn't
* safe to do IO. So we shut the server down before returning
* an error.
*
* But we still want to start the server before the client to
* help single mounts come up without passing through connection
* timeouts.
*/
ret = scoutfs_server_setup(sb) ?:
scoutfs_client_setup(sb);
if (ret)
goto out;
inode = scoutfs_iget(sb, SCOUTFS_ROOT_INO);
if (IS_ERR(inode))
return PTR_ERR(inode);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto out;
}
sb->s_root = d_make_root(inode);
if (!sb->s_root)
return -ENOMEM;
if (!sb->s_root) {
ret = -ENOMEM;
goto out;
}
ret = scoutfs_net_advance_seq(sb, &sbi->trans_seq);
ret = scoutfs_client_advance_seq(sb, &sbi->trans_seq);
if (ret)
return ret;
goto out;
scoutfs_trans_restart_sync_deadline(sb);
// scoutfs_scan_orphans(sb);
return 0;
ret = 0;
out:
if (ret)
scoutfs_server_destroy(sb);
return ret;
}
static struct dentry *scoutfs_mount(struct file_system_type *fs_type, int flags,
@@ -283,14 +306,15 @@ static void scoutfs_kill_sb(struct super_block *sb)
sync_filesystem(sb);
scoutfs_lock_shutdown(sb);
scoutfs_net_destroy(sb);
scoutfs_server_destroy(sb);
}
kill_block_super(sb);
if (sbi) {
scoutfs_lock_destroy(sb);
scoutfs_net_destroy(sb);
scoutfs_client_destroy(sb);
scoutfs_server_destroy(sb);
scoutfs_shutdown_trans(sb);
scoutfs_data_destroy(sb);
scoutfs_inode_destroy(sb);

View File

@@ -15,7 +15,8 @@ struct compact_info;
struct data_info;
struct trans_info;
struct lock_info;
struct net_info;
struct client_info;
struct server_info;
struct inode_sb_info;
struct btree_info;
@@ -51,7 +52,8 @@ struct scoutfs_sb_info {
struct trans_info *trans_info;
struct lock_info *lock_info;
struct net_info *net_info;
struct client_info *client_info;
struct server_info *server_info;
/* $sysfs/fs/scoutfs/$id/ */
struct kset *kset;

View File

@@ -26,7 +26,7 @@
#include "manifest.h"
#include "seg.h"
#include "counters.h"
#include "net.h"
#include "client.h"
#include "inode.h"
#include "scoutfs_trace.h"
@@ -130,14 +130,14 @@ void scoutfs_trans_write_func(struct work_struct *work)
* on crashes between us and the server.
*/
ret = scoutfs_inode_walk_writeback(sb, true) ?:
scoutfs_net_alloc_segno(sb, &segno) ?:
scoutfs_client_alloc_segno(sb, &segno) ?:
scoutfs_seg_alloc(sb, segno, &seg) ?:
scoutfs_item_dirty_seg(sb, seg) ?:
scoutfs_seg_submit_write(sb, seg, &comp) ?:
scoutfs_inode_walk_writeback(sb, false) ?:
scoutfs_bio_wait_comp(sb, &comp) ?:
scoutfs_net_record_segment(sb, seg, 0) ?:
scoutfs_net_advance_seq(sb, &sbi->trans_seq);
scoutfs_client_record_segment(sb, seg, 0) ?:
scoutfs_client_advance_seq(sb, &sbi->trans_seq);
scoutfs_seg_put(seg);
if (ret)
goto out;
@@ -152,7 +152,7 @@ void scoutfs_trans_write_func(struct work_struct *work)
* seq indices but doesn't send a message for every sync
* syscall.
*/
ret = scoutfs_net_advance_seq(sb, &sbi->trans_seq);
ret = scoutfs_client_advance_seq(sb, &sbi->trans_seq);
}
out:

View File

@@ -1,7 +1,6 @@
#ifndef _SCOUTFS_TRANS_H_
#define _SCOUTFS_TRANS_H_
#include "net.h"
#include "count.h"
void scoutfs_trans_write_func(struct work_struct *work);