diff --git a/kmod/src/lock_server.c b/kmod/src/lock_server.c
index 56d20338..d5e55255 100644
--- a/kmod/src/lock_server.c
+++ b/kmod/src/lock_server.c
@@ -202,21 +202,48 @@ static u8 invalidation_mode(u8 granted, u8 requested)
 
 /*
  * Return true of the client lock instances described by the entries can
- * be granted at the same time.  Typically this only means they're both
- * modes that are compatible between nodes.  In addition there's the
- * special case where a read lock on a client is compatible with a write
- * lock on the same client because the client's cache covered by the
- * read lock is still valid if they get a write lock.
+ * be granted at the same time.  There are only three cases where this is
+ * true.
+ *
+ * First, the two locks are both of the same mode that allows full
+ * sharing -- read and write_only.  The only point of these modes is
+ * that everyone can share them.
+ *
+ * Second, a write lock gives the client permission to read as well.
+ * This means that a client can upgrade its read lock to a write lock
+ * without having to invalidate the existing read and drop caches.
+ *
+ * Third, null locks are always compatible between clients.  It's as
+ * though the client with the null lock has no lock at all.  But it's
+ * never compatible with any locks on the client requesting null.
+ * Sending invalidations for existing locks on a client when we get a
+ * null request is how we resolve races in shrinking locks -- we turn it
+ * into the unsolicited remote invalidation case.
+ *
+ * All other mode and client combinations cannot be shared, most
+ * typically a write lock invalidating all other non-write holders to
+ * drop caches and force a read after the write has completed.
  */
 static bool client_entries_compatible(struct client_lock_entry *granted,
                                       struct client_lock_entry *requested)
 {
-        return (granted->mode == requested->mode &&
-                (granted->mode == SCOUTFS_LOCK_READ ||
-                 granted->mode == SCOUTFS_LOCK_WRITE_ONLY)) ||
-               (granted->rid == requested->rid &&
-                granted->mode == SCOUTFS_LOCK_READ &&
-                requested->mode == SCOUTFS_LOCK_WRITE);
+        /* only read and write_only can be fully shared */
+        if ((granted->mode == requested->mode) &&
+            (granted->mode == SCOUTFS_LOCK_READ || granted->mode == SCOUTFS_LOCK_WRITE_ONLY))
+                return true;
+
+        /* _write includes reading, so a client can upgrade its read to write */
+        if (granted->rid == requested->rid &&
+            granted->mode == SCOUTFS_LOCK_READ &&
+            requested->mode == SCOUTFS_LOCK_WRITE)
+                return true;
+
+        /* null is always compatible across clients, never within a client */
+        if ((granted->rid != requested->rid) &&
+            (granted->mode == SCOUTFS_LOCK_NULL || requested->mode == SCOUTFS_LOCK_NULL))
+                return true;
+
+        return false;
 }
 
 /*
@@ -317,16 +344,18 @@ static void put_server_lock(struct lock_server_info *inf,
 
         BUG_ON(!mutex_is_locked(&snode->mutex));
 
+        spin_lock(&inf->lock);
+
         if (atomic_dec_and_test(&snode->refcount) &&
             list_empty(&snode->granted) &&
             list_empty(&snode->requested) &&
             list_empty(&snode->invalidated)) {
-                spin_lock(&inf->lock);
                 rb_erase(&snode->node, &inf->locks_root);
-                spin_unlock(&inf->lock);
                 should_free = true;
         }
 
+        spin_unlock(&inf->lock);
+
         mutex_unlock(&snode->mutex);
 
         if (should_free) {
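Reviewer note: the three compatibility cases in the comment above are easiest to check as a small table of examples. The following stand-alone user-space sketch is not part of the patch; the mode constants and rid values are illustrative stand-ins for the kernel's SCOUTFS_LOCK_* modes and client rids.

/*
 * Stand-alone sketch, not part of the patch: a user-space model of the
 * three compatibility cases so they can be eyeballed as a table.
 */
#include <stdbool.h>
#include <stdio.h>

enum { MODE_NULL, MODE_READ, MODE_WRITE, MODE_WRITE_ONLY };

struct entry {
        unsigned long long rid;         /* client identity */
        int mode;
};

static bool compatible(const struct entry *granted, const struct entry *requested)
{
        /* read and write_only fully share with themselves */
        if (granted->mode == requested->mode &&
            (granted->mode == MODE_READ || granted->mode == MODE_WRITE_ONLY))
                return true;

        /* a client's write request also covers its existing read */
        if (granted->rid == requested->rid &&
            granted->mode == MODE_READ && requested->mode == MODE_WRITE)
                return true;

        /* null is compatible across clients, never within one */
        if (granted->rid != requested->rid &&
            (granted->mode == MODE_NULL || requested->mode == MODE_NULL))
                return true;

        return false;
}

int main(void)
{
        struct entry r1 = { .rid = 1, .mode = MODE_READ };
        struct entry r2 = { .rid = 2, .mode = MODE_READ };
        struct entry w1 = { .rid = 1, .mode = MODE_WRITE };
        struct entry n2 = { .rid = 2, .mode = MODE_NULL };

        printf("read/read across clients:    %d\n", compatible(&r2, &r1)); /* 1 */
        printf("read -> write, same client:  %d\n", compatible(&r1, &w1)); /* 1 */
        printf("read vs write, other client: %d\n", compatible(&r2, &w1)); /* 0 */
        printf("null across clients:         %d\n", compatible(&r1, &n2)); /* 1 */
        printf("null within the same client: %d\n", compatible(&r2, &n2)); /* 0 */
        return 0;
}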
diff --git a/kmod/src/net.c b/kmod/src/net.c
index d7c68743..613cf169 100644
--- a/kmod/src/net.c
+++ b/kmod/src/net.c
@@ -502,12 +502,12 @@ static void scoutfs_net_proc_worker(struct work_struct *work)
  * Free live responses up to and including the seq by marking them dead
  * and moving them to the send queue to be freed.
  */
-static int move_acked_responses(struct scoutfs_net_connection *conn,
-                                struct list_head *list, u64 seq)
+static bool move_acked_responses(struct scoutfs_net_connection *conn,
+                                 struct list_head *list, u64 seq)
 {
         struct message_send *msend;
         struct message_send *tmp;
-        int ret = 0;
+        bool moved = false;
 
         assert_spin_locked(&conn->lock);
 
@@ -519,20 +519,20 @@ static int move_acked_responses(struct scoutfs_net_connection *conn,
 
                 msend->dead = 1;
                 list_move(&msend->head, &conn->send_queue);
-                ret = 1;
+                moved = true;
         }
 
-        return ret;
+        return moved;
 }
 
 /* acks are processed inline in the recv worker */
 static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
 {
-        int moved;
+        bool moved;
 
         spin_lock(&conn->lock);
 
-        moved = move_acked_responses(conn, &conn->send_queue, seq) +
+        moved = move_acked_responses(conn, &conn->send_queue, seq) |
                 move_acked_responses(conn, &conn->resend_queue, seq);
 
         spin_unlock(&conn->lock);
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index ceb538ba..e9c09750 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -1046,9 +1046,12 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
                 sk_trace_define(start)
                 sk_trace_define(end)
                 __field(u64, refresh_gen)
+                __field(u64, write_seq)
+                __field(u64, dirty_trans_seq)
                 __field(unsigned char, request_pending)
                 __field(unsigned char, invalidate_pending)
                 __field(int, mode)
+                __field(int, invalidating_mode)
                 __field(unsigned int, waiters_cw)
                 __field(unsigned int, waiters_pr)
                 __field(unsigned int, waiters_ex)
@@ -1061,9 +1064,12 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
                 sk_trace_assign(start, &lck->start);
                 sk_trace_assign(end, &lck->end);
                 __entry->refresh_gen = lck->refresh_gen;
+                __entry->write_seq = lck->write_seq;
+                __entry->dirty_trans_seq = lck->dirty_trans_seq;
                 __entry->request_pending = lck->request_pending;
                 __entry->invalidate_pending = lck->invalidate_pending;
                 __entry->mode = lck->mode;
+                __entry->invalidating_mode = lck->invalidating_mode;
                 __entry->waiters_pr = lck->waiters[SCOUTFS_LOCK_READ];
                 __entry->waiters_ex = lck->waiters[SCOUTFS_LOCK_WRITE];
                 __entry->waiters_cw = lck->waiters[SCOUTFS_LOCK_WRITE_ONLY];
@@ -1071,10 +1077,11 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
                 __entry->users_ex = lck->users[SCOUTFS_LOCK_WRITE];
                 __entry->users_cw = lck->users[SCOUTFS_LOCK_WRITE_ONLY];
         ),
-        TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u reqpnd %u invpnd %u rfrgen %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
+        TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u invmd %u reqp %u invp %u refg %llu wris %llu dts %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
                   SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end),
-                  __entry->mode, __entry->request_pending,
-                  __entry->invalidate_pending, __entry->refresh_gen,
+                  __entry->mode, __entry->invalidating_mode, __entry->request_pending,
+                  __entry->invalidate_pending, __entry->refresh_gen, __entry->write_seq,
+                  __entry->dirty_trans_seq,
                   __entry->waiters_pr, __entry->waiters_ex, __entry->waiters_cw,
                   __entry->users_pr, __entry->users_ex, __entry->users_cw)
 );
diff --git a/tests/golden/lock-shrink-read-race b/tests/golden/lock-shrink-read-race
new file mode 100644
index 00000000..eb528076
--- /dev/null
+++ b/tests/golden/lock-shrink-read-race
@@ -0,0 +1,2 @@
+=== setup
+=== spin reading and shrinking
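Reviewer note on the free_acked_responses() hunk above: the sum of two ints becomes a bitwise OR of two bools. A logical '||' would not be equivalent because it short-circuits and would skip scanning the resend queue whenever the send queue already contained acked responses; '|' evaluates both operands every time. The stand-alone sketch below is not part of the patch and only illustrates that C semantics difference.

/*
 * Stand-alone sketch, not part of the patch: '||' short-circuits, so
 * the second operand is only evaluated when the first is false, while
 * '|' always evaluates both operands.
 */
#include <stdbool.h>
#include <stdio.h>

static int scans;

static bool scan_queue(bool found)
{
        scans++;
        return found;
}

int main(void)
{
        bool moved;

        scans = 0;
        moved = scan_queue(true) || scan_queue(true);
        printf("'||': moved=%d after %d scan(s)\n", moved, scans);      /* 1 scan */

        scans = 0;
        moved = scan_queue(true) | scan_queue(true);
        printf("'|' : moved=%d after %d scan(s)\n", moved, scans);      /* 2 scans */

        return 0;
}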
diff --git a/tests/sequence b/tests/sequence
index 16e0e465..78001e59 100644
--- a/tests/sequence
+++ b/tests/sequence
@@ -25,6 +25,7 @@ totl-xattr-tag.sh
 quota.sh
 lock-refleak.sh
 lock-shrink-consistency.sh
+lock-shrink-read-race.sh
 lock-pr-cw-conflict.sh
 lock-revoke-getcwd.sh
 lock-recover-invalidate.sh
diff --git a/tests/tests/lock-shrink-read-race.sh b/tests/tests/lock-shrink-read-race.sh
new file mode 100644
index 00000000..18693bf0
--- /dev/null
+++ b/tests/tests/lock-shrink-read-race.sh
@@ -0,0 +1,40 @@
+#
+# We had a lock server refcounting bug that could let one thread get a
+# reference on a lock struct that was being freed by another thread.  We
+# were able to reproduce this by having all clients try to produce a
+# lot of read and null requests.
+#
+# This will manifest as a hung lock and timed-out test runs, probably
+# with hung task messages on the console.  Depending on how the race
+# turns out, it can trigger KASAN warnings in
+# process_waiting_requests().
+#
+
+READERS_PER=3
+SECS=30
+
+echo "=== setup"
+touch "$T_D0/file"
+
+echo "=== spin reading and shrinking"
+END=$((SECONDS + SECS))
+for m in $(t_fs_nrs); do
+        eval file="\$T_D${m}/file"
+
+        # lots of tasks reading as fast as they can
+        for t in $(seq 1 $READERS_PER); do
+                (while [ $SECONDS -lt $END ]; do
+                        stat $file > /dev/null
+                done) &
+        done
+        # one task shrinking (triggering null requests) and reading
+        (while [ $SECONDS -lt $END ]; do
+                stat $file > /dev/null
+                t_trigger_arm_silent statfs_lock_purge $m
+                stat -f "$file" > /dev/null
+        done) &
+done
+
+wait
+
+t_pass
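Reviewer note tying the test back to the fix: the test hammers the server with read and null requests so that one thread can look a lock up in the server's tree while another drops the last reference. The put_server_lock() hunk closes that window by covering the dec-and-test, the list-empty checks, and the rb_erase() with the same inf->lock that appears to protect the tree lookups. The following minimal user-space model is not scoutfs code; the single-node "tree", the lock, and the names are stand-ins used only to show the lookup-versus-put pattern.

/*
 * Stand-alone sketch, not part of the patch: the final decrement and
 * the erase happen under the same lock the lookup side uses, so a
 * lookup either finds the node and takes its reference before the
 * erase, or doesn't find it at all.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct node {
        atomic_int refcount;
        bool in_tree;                   /* stands in for rbtree membership */
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *the_node;           /* stands in for the rbtree itself */

/* lookup side: find the node and take a reference under the lock */
static struct node *lookup(void)
{
        struct node *n = NULL;

        pthread_mutex_lock(&tree_lock);
        if (the_node && the_node->in_tree) {
                n = the_node;
                atomic_fetch_add(&n->refcount, 1);
        }
        pthread_mutex_unlock(&tree_lock);
        return n;
}

/* put side: the final decrement and the erase happen under the same lock */
static void put(struct node *n)
{
        bool should_free = false;

        pthread_mutex_lock(&tree_lock);
        if (atomic_fetch_sub(&n->refcount, 1) == 1) {
                n->in_tree = false;
                the_node = NULL;
                should_free = true;
        }
        pthread_mutex_unlock(&tree_lock);

        if (should_free)
                free(n);
}

int main(void)
{
        the_node = calloc(1, sizeof(*the_node));
        atomic_init(&the_node->refcount, 1);
        the_node->in_tree = true;

        struct node *n = lookup();      /* 1 -> 2, still in the tree */
        if (n)
                put(n);                 /* 2 -> 1 */
        put(the_node);                  /* 1 -> 0, erased and freed */
        return 0;
}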