diff --git a/kmod/src/lock.c b/kmod/src/lock.c
index 98ed1359..8f04337a 100644
--- a/kmod/src/lock.c
+++ b/kmod/src/lock.c
@@ -88,7 +88,9 @@ struct lock_info {
 	struct rhashtable ht;
 	struct rb_root lock_range_tree;
 	KC_DEFINE_SHRINKER(shrinker);
-	struct list_head lru_list;
+	struct list_head lru_active;
+	struct list_head lru_reclaim;
+	long lru_imbalance;
 	unsigned long long lru_nr;
 	struct workqueue_struct *workq;
 	struct work_list inv_wlist;
@@ -343,24 +345,102 @@ static bool lock_counts_match(int granted, unsigned int *counts)
 	return true;
 }
 
-static void __lock_add_lru(struct lock_info *linfo, struct scoutfs_lock *lock)
+enum { LOCK_LRU_ACTIVE, LOCK_LRU_RECLAIM };
+
+/*
+ * Restore balance between the active and reclaim lru lists.  This is
+ * called after a single operation on the lists could have created
+ * imbalance, so we can always restore balance with one operation.
+ *
+ * @lru_imbalance is the difference between the number of entries on the
+ * active list and the number on the reclaim list.  It's positive if
+ * there are more entries on the active list.
+ */
+static void lock_lru_rebalance(struct lock_info *linfo)
 {
+	struct scoutfs_lock *lock;
+
 	assert_spin_locked(&linfo->lock);
 
-	if (list_empty(&lock->lru_head)) {
-		list_add_tail(&lock->lru_head, &linfo->lru_list);
-		linfo->lru_nr++;
+	if (linfo->lru_imbalance > 1) {
+		BUG_ON(list_empty(&linfo->lru_active));
+		lock = list_first_entry(&linfo->lru_active, struct scoutfs_lock, lru_head);
+		list_move_tail(&lock->lru_head, &linfo->lru_reclaim);
+		lock->lru_on_list = LOCK_LRU_RECLAIM;
+		linfo->lru_imbalance -= 2;
+
+	} else if (linfo->lru_imbalance < -1) {
+		BUG_ON(list_empty(&linfo->lru_reclaim));
+		lock = list_last_entry(&linfo->lru_reclaim, struct scoutfs_lock, lru_head);
+		list_move(&lock->lru_head, &linfo->lru_active);
+		lock->lru_on_list = LOCK_LRU_ACTIVE;
+		linfo->lru_imbalance += 2;
+	}
+
+	BUG_ON(linfo->lru_imbalance < -1 || linfo->lru_imbalance > 1);
+}
+
+static void lock_lru_insert(struct lock_info *linfo, struct scoutfs_lock *lock)
+{
+	assert_spin_locked(&linfo->lock);
+	BUG_ON(!list_empty(&lock->lru_head));
+
+	list_add_tail(&lock->lru_head, &linfo->lru_active);
+	lock->lru_on_list = LOCK_LRU_ACTIVE;
+	linfo->lru_imbalance++;
+	linfo->lru_nr++;
+
+	lock_lru_rebalance(linfo);
+}
+
+/*
+ * As we use a lock we move it to the end of the active list if it was
+ * on the reclaim list.
+ *
+ * This is meant to reduce contention on use of active locks.  It
+ * doesn't maintain a precise ordering of lock access times and only
+ * ensures that reclaim has to go through the oldest half of locks
+ * before it can get to any of the newest half.  That does mean that the
+ * first lock in the newest half could well be the most recently used.
+ *
+ * The caller only has a reference to the lock.  We use an unlocked test
+ * of which list it's on to avoid acquiring the global lru lock.  We
+ * don't mind if the load is rarely racy.  It's always safe to reclaim
+ * and reacquire locks, so the LRU being rarely a bit off doesn't
+ * matter.  Shrinking costs the most for locks that are actively in use,
+ * and in that case there are lots of chances for the load to be
+ * consistent and move a lock to protect it from shrinking.
+ */
+static void lock_lru_update(struct lock_info *linfo, struct scoutfs_lock *lock)
+{
+	BUG_ON(atomic_read(&lock->refcount) < 3);
+	BUG_ON(list_empty(&lock->lru_head));
+
+	if (lock->lru_on_list != LOCK_LRU_ACTIVE) {
+		spin_lock(&linfo->lock);
+		if (lock->lru_on_list != LOCK_LRU_ACTIVE) {
+			list_move_tail(&lock->lru_head, &linfo->lru_active);
+			lock->lru_on_list = LOCK_LRU_ACTIVE;
+			linfo->lru_imbalance += 2;
+			lock_lru_rebalance(linfo);
+		}
+		spin_unlock(&linfo->lock);
 	}
 }
 
-static void __lock_del_lru(struct lock_info *linfo, struct scoutfs_lock *lock)
+static void lock_lru_remove(struct lock_info *linfo, struct scoutfs_lock *lock)
 {
 	assert_spin_locked(&linfo->lock);
+	BUG_ON(list_empty(&lock->lru_head));
 
-	if (!list_empty(&lock->lru_head)) {
-		list_del_init(&lock->lru_head);
-		linfo->lru_nr--;
-	}
+	list_del_init(&lock->lru_head);
+	if (lock->lru_on_list == LOCK_LRU_ACTIVE)
+		linfo->lru_imbalance--;
+	else
+		linfo->lru_imbalance++;
+	linfo->lru_nr--;
+
+	lock_lru_rebalance(linfo);
 }
 
 /*
@@ -477,7 +557,7 @@ retry:
 	ret = insert_lock_range(sb, lock);
 	if (ret == 0) {
 		scoutfs_tseq_add(&linfo->tseq_tree, &lock->tseq_entry);
-		__lock_add_lru(linfo, lock);
+		lock_lru_insert(linfo, lock);
 		atomic_add(2, &lock->refcount);
 	}
 
@@ -505,7 +585,7 @@ static void lock_remove(struct lock_info *linfo, struct scoutfs_lock *lock)
 	spin_lock(&linfo->lock);
 	rb_erase(&lock->range_node, &linfo->lock_range_tree);
 	RB_CLEAR_NODE(&lock->range_node);
-	__lock_del_lru(linfo, lock);
+	lock_lru_remove(linfo, lock);
 	spin_unlock(&linfo->lock);
 
 	scoutfs_tseq_del(&linfo->tseq_tree, &lock->tseq_entry);
@@ -581,12 +661,8 @@ static struct scoutfs_lock *find_lock(struct super_block *sb, struct scoutfs_key
 	lock = get_lock(lock);
 	rcu_read_unlock();
 
-	if (lock) {
-		spin_lock(&linfo->lock);
-		__lock_del_lru(linfo, lock);
-		__lock_add_lru(linfo, lock);
-		spin_unlock(&linfo->lock);
-	}
+	if (lock)
+		lock_lru_update(linfo, lock);
 
 	return lock;
 }
@@ -1562,17 +1638,21 @@ static unsigned long lock_count_objects(struct shrinker *shrink,
 }
 
 /*
- * Start the shrinking process for locks on the lru.  Locks are always
- * on the lru so we skip any locks that are being used by any other
- * references.  Lock put/free defines nesting of the linfo spinlock
- * inside the lock's spinlock so we're careful to honor that here.  Our
- * reference to the lock protects its presence on the lru so we can
- * always resume iterating from it after dropping and reacquiring the
- * linfo lock.
+ * Start the shrinking process for locks on the lru.  The reclaim and
+ * active lists are walked from head to tail.  We hand locks off to the
+ * shrink worker if we can get a reference, acquire the lock's
+ * spinlock, and find it idle.
  *
- * We don't want to block or allocate here so all we do is get the lock,
- * mark it request pending, and kick off the work.  The work sends a
- * null request and eventually the lock is freed by its response.
+ * The global linfo spinlock is ordered under the lock's spinlock as a
+ * convenience to freeing null locks.  We use trylock to check each
+ * lock and just skip locks when trylock fails.  It seemed easier and
+ * more reliable than stopping and restarting iteration around spinlock
+ * reacquisition.
+ *
+ * This is only a best effort scan to start freeing locks.  We return
+ * after queueing the work that does the blocking steps to kick off
+ * the null requests, and even then it will be some time before we get
+ * the responses and free the null locks.
  *
  * Only a racing lock attempt that isn't matched can prevent the lock
  * from being freed.  It'll block waiting to send its request for its
@@ -1585,39 +1665,53 @@ static unsigned long lock_scan_objects(struct shrinker *shrink,
 	struct lock_info *linfo = KC_SHRINKER_CONTAINER_OF(shrink, struct lock_info);
 	struct super_block *sb = linfo->sb;
 	struct scoutfs_lock *lock = NULL;
+	struct list_head *list;
 	unsigned long freed = 0;
 	unsigned long nr = sc->nr_to_scan;
 
 	scoutfs_inc_counter(sb, lock_scan_objects);
 
+	if (nr == 0)
+		goto out;
+
 	spin_lock(&linfo->lock);
-	lock = list_first_entry_or_null(&linfo->lru_list, struct scoutfs_lock, lru_head);
-	while (lock && nr > 0) {
-
+	list = &linfo->lru_reclaim;
+	list_for_each_entry(lock, list, lru_head) {
 		if (get_lock(lock)) {
-			spin_unlock(&linfo->lock);
-
-			spin_lock(&lock->lock);
-			if (lock->mode != SCOUTFS_LOCK_NULL && atomic_read(&lock->refcount) == 3) {
-				lock->request_pending = 1;
-				spin_lock(&linfo->shrink_wlist.lock);
-				list_add_tail(&lock->shrink_head, &linfo->shrink_wlist.list);
-				spin_unlock(&linfo->shrink_wlist.lock);
-				get_lock(lock);
-				nr--;
-				freed++;
+			if (spin_trylock(&lock->lock)) {
+				if (lock->mode != SCOUTFS_LOCK_NULL &&
+				    !lock->request_pending &&
+				    !lock->invalidate_pending &&
+				    atomic_read(&lock->refcount) == 3) {
+					get_lock(lock);
+					lock->request_pending = 1;
+					spin_lock(&linfo->shrink_wlist.lock);
+					list_add_tail(&lock->shrink_head,
						      &linfo->shrink_wlist.list);
+					spin_unlock(&linfo->shrink_wlist.lock);
+					nr--;
+					freed++;
+				}
+				spin_unlock(&lock->lock);
+				put_lock(linfo, lock);
+			} else {
+				/*
+				 * The put_lock() is intentionally not factored
+				 * out since it confuses the sparse checker.
+				 */
+				put_lock(linfo, lock);
 			}
-			spin_unlock(&lock->lock);
-			put_lock(linfo, lock);
-
-			spin_lock(&linfo->lock);
 		}
 
-		if (lock->lru_head.next != &linfo->lru_list)
-			lock = list_next_entry(lock, lru_head);
-		else
-			lock = NULL;
+		if (nr == 0)
+			break;
+
+		/* switch to active at last reclaim entry, _for_each_ stops if active empty */
+		if (lock->lru_head.next == &linfo->lru_reclaim) {
+			list = &linfo->lru_active;
+			lock = list_first_entry(list, struct scoutfs_lock, lru_head);
+		}
 	}
 	spin_unlock(&linfo->lock);
 
@@ -1626,6 +1720,7 @@ static unsigned long lock_scan_objects(struct shrinker *shrink,
 	queue_nonempty_work_list(linfo, &linfo->shrink_wlist);
 	spin_unlock(&linfo->shrink_wlist.lock);
 
+out:
 	trace_scoutfs_lock_shrink_exit(sb, sc->nr_to_scan, freed);
 	return freed;
 }
@@ -1871,7 +1966,8 @@ int scoutfs_lock_setup(struct super_block *sb)
 	KC_INIT_SHRINKER_FUNCS(&linfo->shrinker, lock_count_objects, lock_scan_objects);
 	KC_REGISTER_SHRINKER(&linfo->shrinker, "scoutfs-lock:" SCSBF, SCSB_ARGS(sb));
-	INIT_LIST_HEAD(&linfo->lru_list);
+	INIT_LIST_HEAD(&linfo->lru_active);
+	INIT_LIST_HEAD(&linfo->lru_reclaim);
 	init_work_list(&linfo->inv_wlist, lock_invalidate_worker);
 	init_work_list(&linfo->shrink_wlist, lock_shrink_worker);
 	atomic64_set(&linfo->next_refresh_gen, 0);
diff --git a/kmod/src/lock.h b/kmod/src/lock.h
index 970af4ce..e1fc7c93 100644
--- a/kmod/src/lock.h
+++ b/kmod/src/lock.h
@@ -32,6 +32,7 @@ struct scoutfs_lock {
 	u64 write_seq;
 	u64 dirty_trans_seq;
 	struct list_head lru_head;
+	int lru_on_list;
 	wait_queue_head_t waitq;
 	unsigned long request_pending:1,
 			invalidate_pending:1;
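
Note (not part of the patch): the comment on lock_lru_rebalance() relies on each list
operation changing lru_imbalance by at most one rebalance step, so a single move always
brings it back into [-1, 1].  The standalone userspace sketch below models only that
counter arithmetic; the lru_model struct, the function names, and the random simulation
loop are hypothetical and not taken from the scoutfs tree.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct lru_model {
	long active;	/* number of entries on the active list */
	long reclaim;	/* number of entries on the reclaim list */
	long imbalance;	/* active - reclaim, like linfo->lru_imbalance */
};

/* one rebalance step: shift a single entry between lists if |imbalance| > 1 */
static void rebalance(struct lru_model *m)
{
	if (m->imbalance > 1) {
		m->active--;
		m->reclaim++;
		m->imbalance -= 2;
	} else if (m->imbalance < -1) {
		m->reclaim--;
		m->active++;
		m->imbalance += 2;
	}
	assert(m->imbalance >= -1 && m->imbalance <= 1);
	assert(m->imbalance == m->active - m->reclaim);
}

/* models lock_lru_insert(): new entries land on the active list */
static void insert(struct lru_model *m)
{
	m->active++;
	m->imbalance++;
	rebalance(m);
}

/* models lock_lru_remove(): an entry leaves whichever list it is on */
static void remove_entry(struct lru_model *m)
{
	if (m->active + m->reclaim == 0)
		return;
	if (m->reclaim > 0 && (m->active == 0 || (rand() & 1))) {
		m->reclaim--;
		m->imbalance++;
	} else {
		m->active--;
		m->imbalance--;
	}
	rebalance(m);
}

/* models lock_lru_update(): a used entry is promoted from reclaim to active */
static void promote(struct lru_model *m)
{
	if (m->reclaim > 0) {
		m->reclaim--;
		m->active++;
		m->imbalance += 2;
		rebalance(m);
	}
}

int main(void)
{
	struct lru_model m = { 0, 0, 0 };
	int i;

	for (i = 0; i < 1000000; i++) {
		switch (rand() % 3) {
		case 0: insert(&m); break;
		case 1: remove_entry(&m); break;
		case 2: promote(&m); break;
		}
	}
	printf("active %ld reclaim %ld imbalance %ld\n",
	       m.active, m.reclaim, m.imbalance);
	return 0;
}

Building and running this (for example, cc -o lru_model lru_model.c && ./lru_model)
exercises a long random sequence of insert/remove/promote operations; the asserts check
that the imbalance never leaves [-1, 1] and always equals the difference in list lengths,
mirroring the BUG_ON() at the end of lock_lru_rebalance().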