scoutfs: add tracing messages

This adds tracing functionality that's cheap and easy to
use.  By constantly gathering traces we'll always have rich
history to analyze when something goes wrong.

Signed-off-by: Zach Brown <zab@versity.com>
Author: Zach Brown
Date: 2016-05-27 14:14:12 -07:00
parent 0820a7b5bd
commit 7d6dd91a24
8 changed files with 579 additions and 2 deletions

kmod/src/Makefile

@@ -2,6 +2,16 @@ obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o
CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include
+#
+# these first and last objects are a super lame hack to put boundary
+# symbols around trace printf formats that are put in an elf section.
+# That should be done in a linker script, of course, but I'll be honest:
+# I didn't go digging to see if modules can have linker scripts today.
+#
+scoutfs-y += first.o
scoutfs-y += block.o btree.o buddy.o counters.o crc.o dir.o filerw.o \
-	inode.o msg.o roster.o scoutfs_trace.o super.o trans.o treap.o \
-	wrlock.o
+	inode.o ioctl.o msg.o roster.o scoutfs_trace.o super.o trace.o \
+	trans.o treap.o wrlock.o
+scoutfs-y += last.o

kmod/src/first.c Normal file

@@ -0,0 +1,4 @@
#include "trace.h"
char __scoutfs_trace_section scoutfs_trace_first_format[] = "";

kmod/src/ioctl.c Normal file

@@ -0,0 +1,46 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/compiler.h>
#include <linux/uio.h>
#include "ioctl.h"
#include "trace.h"
int scoutfs_copy_ibuf(struct iovec *iov, unsigned long arg)
{
struct scoutfs_ioctl_buf __user *user_ibuf = (void __user *)arg;
struct scoutfs_ioctl_buf ibuf;
if (copy_from_user(&ibuf, user_ibuf, sizeof(ibuf)))
return -EFAULT;
/* limit lengths to an int for some helpers that take int len args */
if (ibuf.len < 0)
return -EINVAL;
iov->iov_base = (void __user *)(long)ibuf.ptr;
iov->iov_len = ibuf.len;
/*
* This is not meant to protect the rest of the code from
* faults, it can't. It's meant to return early for iovecs that
* are completely garbage.
*/
if (!access_ok(VERIFY_READ, iov->iov_base, iov->iov_len))
return -EFAULT;
return 0;
}

kmod/src/ioctl.h Normal file

@@ -0,0 +1,32 @@
#ifndef _SCOUTFS_IOCTL_H_
#define _SCOUTFS_IOCTL_H_
/* XXX I have no idea how these are chosen. */
#define SCOUTFS_IOCTL_MAGIC 's'
struct scoutfs_ioctl_buf {
__u64 ptr;
__s32 len;
} __packed;
/*
* Fills the buffer with a packed array of format strings. Trace records
* refer to the format strings in the buffer by their byte offset.
*/
#define SCOUTFS_IOC_GET_TRACE_FORMATS _IOW(SCOUTFS_IOCTL_MAGIC, 1, \
struct scoutfs_ioctl_buf)
struct scoutfs_trace_record {
__u16 format_off;
__u8 nr;
__u8 data[0];
} __packed;
/*
* Fills the buffer with trace records.
*/
#define SCOUTFS_IOC_GET_TRACE_RECORDS _IOW(SCOUTFS_IOCTL_MAGIC, 2, \
struct scoutfs_ioctl_buf)
int scoutfs_copy_ibuf(struct iovec *iov, unsigned long arg);
#endif
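
To make the interface concrete, here is a rough userspace sketch (illustrative only, not part of this commit) that fetches the packed format strings through the debugfs "trace" file created in trace.c below. The struct and ioctl number are mirrored locally rather than pulled from this header, and the /sys/kernel/debug/scoutfs/trace path assumes debugfs is mounted in its usual place.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/ioctl.h>
#include <sys/ioctl.h>

/* local mirror of struct scoutfs_ioctl_buf and its ioctl number */
struct trace_buf {
	uint64_t ptr;
	int32_t len;
} __attribute__((packed));

#define GET_TRACE_FORMATS _IOW('s', 1, struct trace_buf)

int main(void)
{
	struct trace_buf ibuf;
	char *buf = NULL;
	int len = 4096;
	int off;
	int ret;
	int fd;

	fd = open("/sys/kernel/debug/scoutfs/trace", O_RDONLY);
	if (fd < 0)
		return 1;

	/* grow the buffer until the total format bytes fit */
	for (;;) {
		buf = realloc(buf, len);
		ibuf.ptr = (uint64_t)(unsigned long)buf;
		ibuf.len = len;

		ret = ioctl(fd, GET_TRACE_FORMATS, &ibuf);
		if (ret < 0 || ret <= len)
			break;
		len = ret;
	}

	/* the formats are packed, null terminated strings */
	for (off = 0; ret > 0 && off < ret; off += strlen(buf + off) + 1)
		printf("%d: %s\n", off, buf + off);

	free(buf);
	close(fd);
	return ret < 0;
}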

kmod/src/last.c Normal file

@@ -0,0 +1,4 @@
#include "trace.h"
char __scoutfs_trace_section scoutfs_trace_last_format[] = "";

kmod/src/super.c

@@ -28,6 +28,7 @@
#include "trans.h"
#include "roster.h"
#include "wrlock.h"
#include "trace.h"
#include "scoutfs_trace.h"
static struct kset *scoutfs_kset;
@@ -222,12 +223,14 @@ static void teardown_module(void)
scoutfs_inode_exit();
if (scoutfs_kset)
kset_unregister(scoutfs_kset);
+	scoutfs_trace_exit();
}
static int __init scoutfs_module_init(void)
{
int ret;
+	scoutfs_trace_init();
scoutfs_init_counters();
scoutfs_kset = kset_create_and_add("scoutfs", NULL, fs_kobj);

kmod/src/trace.c Normal file

@@ -0,0 +1,360 @@
/*
* Copyright (C) 2016 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/hardirq.h>
#include <linux/uio.h>
#include <linux/debugfs.h>
#include "trace.h"
#include "super.h"
#include "ioctl.h"
/*
* This tracing gives us:
*
* - Always on. We get history leading up to an event without having
* had to predict the event.
*
* - Cheap. Recording the format pointer index and packed arguments is
* cheap enough that we don't mind always doing it at a reasonable
* frequency.
*
* - Trivial to add. We want to err on the side of too much logging.
* We don't want there to be so much garbage associated with adding a
* single logging message that people are discouraged from doing it.
*
* - Easy to extract from crash dumps. The more the computer can tell
* us about what happened when the world went sideways, the better.
*
* The implementation is reasonably straightforward.
*
* Log statements are simple printf format strings and arguments. The
* first tricky bit is that we only support u64 arguments. This lets us
* use macro hacks to walk the arguments without having to parse the
* format string. This actually isn't a great hardship because often
* the things we might want to print as strings -- process names,
* xattrs, directory entries -- could in fact be sensitive user data
* that we don't want to see.
*
* Each log statement is packed into a variable byte size record. The
* records are packed into long-lived per-cpu pages. We only support
* logging from task context so that we don't have to fool around with
* serializing between contexts on a cpu. Writers record the number of
* record bytes stored in each page in page->private.
*
* Userspace reads the format strings and trace records from trivial
* ioctls that copy the entire data set in one go. This avoids all the
* nonsense of trying to translate the changing set of records into a
* seekable byte stream of formatted output. Readers of each page of
* records samples page->private to discover when they race with writers
* and retry.
*/
/*
* This tries to strike a balance between having enough logging on a cpu
* and not allocating an enormous amount of memory on systems with many
* cpus.
*/
#define TRACE_PAGES_PER_CPU DIV_ROUND_UP(256 * 1024, PAGE_SIZE)
struct trace_percpu {
int cur_page;
struct page *pages[TRACE_PAGES_PER_CPU];
};
static DEFINE_PER_CPU(struct trace_percpu, scoutfs_trace_percpu);
static int rec_bytes(int bytes)
{
return offsetof(struct scoutfs_trace_record, data[bytes]);
}
/*
* We compact each record's reference to its trace format by storing a
* small offset into the section that contains all the format strings.
* This shrinks the per-record format reference from an 8 byte pointer
* to a 2 byte offset. 6 bytes is a lot when records are 15 bytes.
*/
static char *trace_format(u16 off)
{
return &scoutfs_trace_first_format[1 + off];
}
static u16 trace_format_off(char *fmt)
{
return fmt - scoutfs_trace_first_format - 1;
}
static int trace_format_bytes(void)
{
return trace_format_off(scoutfs_trace_last_format);
}
static int valid_trace_format(char *fmt)
{
return fmt > scoutfs_trace_first_format &&
fmt < scoutfs_trace_last_format;
}
/*
* We only support trace messages with integer arguments. Most of them
* are small: counters, pids, sizes, cpus, etc. It's worth spending a
* few cycles to remove the leading bytes full of zeros.
*
* VLQ is very simple and does reasonably well. I'd happily consider
* alternatives with similar complexity but better space efficiency.
*
* This is the most boring conservative iterative implementation. A
* much cooler implementation would efficiently transform all the bits,
* store the whole little endian value, and return the number of bytes
* with bits set.
*/
static unsigned char encode_u64_bytes(u8 *data, u64 val)
{
unsigned char bytes = 0;
do {
*data = val & 127;
val >>= 7;
*(data++) += (!!val) << 7;
bytes++;
} while (val);
return bytes;
}
/*
* Write a trace record to a percpu page. We only write from task
* context so one writer is racing with many readers. Readers sample
* the count of total written bytes in the page in page->private and
* retry the copy if the count changes. It's a poor man's seqlock.
*
* The calling trace wrapper has pinned our task to the cpu.
*/
void scoutfs_trace_write(char *fmt, int nr, ...)
{
struct trace_percpu *pcpu = this_cpu_ptr(&scoutfs_trace_percpu);
struct scoutfs_trace_record *rec;
struct page *page;
unsigned long page_bytes;
int encoded;
va_list args;
int i;
if (WARN_ON_ONCE(in_interrupt() || in_softirq() || in_irq()) ||
WARN_ON_ONCE(!valid_trace_format(fmt)) ||
WARN_ON_ONCE(trace_format_bytes() > U16_MAX))
return;
next_page:
page = pcpu->pages[pcpu->cur_page];
page_bytes = page->private & ~PAGE_MASK;
rec = page_address(page) + page_bytes;
encoded = 0;
va_start(args, nr);
for (i = 0; i < nr; i++) {
if (page_bytes + rec_bytes(encoded + 9) >= PAGE_SIZE) {
if (++pcpu->cur_page == TRACE_PAGES_PER_CPU)
pcpu->cur_page = 0;
page = pcpu->pages[pcpu->cur_page];
/* XXX barriers? */
page->private = round_up(page->private, PAGE_SIZE);
va_end(args);
goto next_page;
}
encoded += encode_u64_bytes(&rec->data[encoded],
va_arg(args, u64));
}
va_end(args);
rec->format_off = trace_format_off(fmt);
rec->nr = nr;
/* XXX barriers? */
page->private += rec_bytes(encoded);
}
/*
* Give userspace all of the format strings. They're packed and null
* terminated.
*
* We return the total size of the format strings. A return size larger
* than the buffer len indicates that nothing was copied and the user
* can retry with a larger buffer.
*/
static int scoutfs_ioc_get_trace_formats(void __user *buf, int len)
{
int bytes = trace_format_bytes();
if (bytes <= len) {
if (copy_to_user(buf, trace_format(0), bytes))
return -EFAULT;
}
return bytes;
}
/*
* Copy all the trace records on all the cpus' pages to the user buffer.
* Each page's records will be copied atomically so records won't be
* scrambled. But writers can cycle through the pages as we copy so the
* entire set of records returned is not an atomic snapshot of all the
* pages.
*
* We return the number of bytes copied. If the buffer is too small to
* hold all of the records we stop copying early and the user can retry
* with a larger buffer.
*/
static int scoutfs_ioc_get_trace_records(void __user *buf, int len)
{
struct trace_percpu *pcpu;
unsigned long before;
unsigned long after;
struct page *page;
int total = 0;
int bytes;
int ret;
int cpu;
int i;
if (len < 0)
return -EINVAL;
/* quickly give the caller the largest possible buffer size */
ret = num_online_cpus() * TRACE_PAGES_PER_CPU;
if (ret > len)
return ret;
for_each_online_cpu(cpu) {
pcpu = per_cpu_ptr(&scoutfs_trace_percpu, cpu);
for (i = 0; i < TRACE_PAGES_PER_CPU; i++) {
page = pcpu->pages[i];
do {
before = ACCESS_ONCE(page->private);
bytes = before & ~PAGE_MASK;
/* ret still nr * pages */
if (total + bytes > len)
goto out;
if (copy_to_user(buf + total,
page_address(page), bytes)) {
ret = -EFAULT;
goto out;
}
after = ACCESS_ONCE(page->private);
} while (after != before);
total += bytes;
}
}
ret = total;
out:
return ret;
}
static long scoutfs_trace_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct iovec iov;
switch (cmd) {
case SCOUTFS_IOC_GET_TRACE_FORMATS:
return scoutfs_copy_ibuf(&iov, arg) ?:
scoutfs_ioc_get_trace_formats(iov.iov_base, iov.iov_len);
case SCOUTFS_IOC_GET_TRACE_RECORDS:
return scoutfs_copy_ibuf(&iov, arg) ?:
scoutfs_ioc_get_trace_records(iov.iov_base, iov.iov_len);
}
return -ENOTTY;
}
static const struct file_operations scoutfs_trace_fops = {
.owner = THIS_MODULE,
.unlocked_ioctl = scoutfs_trace_ioctl,
};
static struct dentry *scoutfs_debugfs_dir;
static struct dentry *scoutfs_trace_dentry;
int __init scoutfs_trace_init(void)
{
struct trace_percpu *pcpu;
int cpu;
int i;
if (WARN_ON_ONCE(&scoutfs_trace_first_format >=
&scoutfs_trace_last_format) ||
WARN_ON_ONCE(trace_format_bytes() > U16_MAX))
return -EINVAL;
/* XXX possible instead of online? yikes? */
for_each_possible_cpu(cpu) {
pcpu = per_cpu_ptr(&scoutfs_trace_percpu, cpu);
for (i = 0; i < TRACE_PAGES_PER_CPU; i++) {
pcpu->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!pcpu->pages[i])
return -ENOMEM;
pcpu->pages[i]->private = 0;
}
}
scoutfs_debugfs_dir = debugfs_create_dir("scoutfs", NULL);
if (!scoutfs_debugfs_dir)
return -ENOMEM;
scoutfs_trace_dentry = debugfs_create_file("trace", 0600,
scoutfs_debugfs_dir, NULL,
&scoutfs_trace_fops);
if (!scoutfs_trace_dentry)
return -ENOMEM;
return 0;
}
void __exit scoutfs_trace_exit(void)
{
struct trace_percpu *pcpu;
int cpu;
int i;
if (scoutfs_trace_dentry) {
debugfs_remove(scoutfs_trace_dentry);
scoutfs_trace_dentry = NULL;
}
if (scoutfs_debugfs_dir) {
debugfs_remove(scoutfs_debugfs_dir);
scoutfs_debugfs_dir = NULL;
}
/* XXX possible instead of online? yikes? */
for_each_possible_cpu(cpu) {
pcpu = per_cpu_ptr(&scoutfs_trace_percpu, cpu);
for (i = 0; i < TRACE_PAGES_PER_CPU; i++) {
if (pcpu->pages[i]) {
__free_page(pcpu->pages[i]);
pcpu->pages[i] = NULL;
}
}
}
}
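
To make the record layout concrete, here is a rough sketch (illustrative only, not part of this commit) of the userspace decode side. struct trace_rec mirrors struct scoutfs_trace_record from ioctl.h, decode_u64() is a hypothetical inverse of encode_u64_bytes() above, and dump_records() walks a buffer returned by SCOUTFS_IOC_GET_TRACE_RECORDS using the format strings fetched with SCOUTFS_IOC_GET_TRACE_FORMATS.

#include <stdint.h>
#include <stdio.h>

/* local mirror of struct scoutfs_trace_record */
struct trace_rec {
	uint16_t format_off;
	uint8_t nr;
	uint8_t data[];
} __attribute__((packed));

/* inverse of encode_u64_bytes(): 7 data bits per byte, bit 7 continues */
static int decode_u64(const uint8_t *data, uint64_t *val)
{
	uint64_t v = 0;
	int shift = 0;
	int bytes = 0;

	do {
		v |= (uint64_t)(data[bytes] & 127) << shift;
		shift += 7;
		bytes++;
	} while (data[bytes - 1] & 128);

	*val = v;
	return bytes;
}

/* print each record's format string and decoded u64 arguments */
static void dump_records(const uint8_t *buf, int len, const char *fmts)
{
	const struct trace_rec *rec;
	uint64_t args[16];
	int data_bytes;
	int off = 0;
	int i;

	while (off + (int)sizeof(struct trace_rec) <= len) {
		rec = (const struct trace_rec *)(buf + off);
		data_bytes = 0;

		for (i = 0; i < rec->nr && i < 16; i++)
			data_bytes += decode_u64(rec->data + data_bytes,
						 &args[i]);

		printf("\"%s\":", fmts + rec->format_off);
		for (i = 0; i < rec->nr && i < 16; i++)
			printf(" %llu", (unsigned long long)args[i]);
		printf("\n");

		off += sizeof(struct trace_rec) + data_bytes;
	}
}

Fetching the record buffer itself would follow the same grow-and-retry ioctl pattern as the formats sketch after ioctl.h above.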

kmod/src/trace.h Normal file

@@ -0,0 +1,118 @@
#ifndef _SCOUTFS_TRACE_H_
#define _SCOUTFS_TRACE_H_
#include <linux/compiler.h>
#include <linux/sched.h>
#define __scoutfs_trace_section __attribute__((section("__scoutfs_trace_fmt")))
extern char scoutfs_trace_first_format[];
extern char scoutfs_trace_last_format[];
/*
* What a beautifully baffling construct! First our arguments are placed
* in front of a reverse sequence of numbers. Then all the arguments are handed
* to a macro that only returns its 64th argument. The presence of our
* arguments before the sequence means that the 64th argument will be
* the number in the reverse sequence that matches the number of our
* initial arguments.
*
* h/t to:
* https://groups.google.com/forum/#!topic/comp.std.c/d-6Mj5Lko_s
*/
#define NR_VA_ARGS(...) \
_ONLY_64TH(__VA_ARGS__, _reverse_sequence())
#define _ONLY_64TH(...) \
__ONLY_64TH(__VA_ARGS__)
#define __ONLY_64TH( \
_1, _2, _3, _4, _5, _6, _7, _8, _9,_10, \
_11,_12,_13,_14,_15,_16,_17,_18,_19,_20, \
_21,_22,_23,_24,_25,_26,_27,_28,_29,_30, \
_31,_32,_33,_34,_35,_36,_37,_38,_39,_40, \
_41,_42,_43,_44,_45,_46,_47,_48,_49,_50, \
_51,_52,_53,_54,_55,_56,_57,_58,_59,_60, \
_61,_62,_63,N,...) N
#define _reverse_sequence() \
63,62,61,60, \
59,58,57,56,55,54,53,52,51,50, \
49,48,47,46,45,44,43,42,41,40, \
39,38,37,36,35,34,33,32,31,30, \
29,28,27,26,25,24,23,22,21,20, \
19,18,17,16,15,14,13,12,11,10, \
9,8,7,6,5,4,3,2,1,0
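/*
 * For example (illustrative): NR_VA_ARGS(a, b, c) expands to
 * __ONLY_64TH(a, b, c, 63, 62, ..., 1, 0), whose 64th argument is 3,
 * the number of arguments that were passed in.
 */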
/*
* surround each arg with (u64)( .. ),
*
* A 'called object not a function' error can mean there are too many args.
*
* XXX doesn't yet work with no args
*/
#define CAST_ARGS_U64(...) \
EXPAND_MACRO(__VA_ARGS__,CU_16,CU_15,CU_14,CU_13,CU_12,\
CU_11,CU_10,CU_9,CU_8,CU_7,CU_6,CU_5,CU_4,\
CU_3,CU_2,CU_1)(__VA_ARGS__)
#define EXPAND_MACRO(_1,_2,_3,_4,_5,_6,_7,_8,\
_9,_10,_11,_12,_13,_14,_15,_16,NAME,...) NAME
#define CU_1(X) (u64)(X)
#define CU_2(X, ...) (u64)(X),CU_1(__VA_ARGS__)
#define CU_3(X, ...) (u64)(X),CU_2(__VA_ARGS__)
#define CU_4(X, ...) (u64)(X),CU_3(__VA_ARGS__)
#define CU_5(X, ...) (u64)(X),CU_4(__VA_ARGS__)
#define CU_6(X, ...) (u64)(X),CU_5(__VA_ARGS__)
#define CU_7(X, ...) (u64)(X),CU_6(__VA_ARGS__)
#define CU_8(X, ...) (u64)(X),CU_7(__VA_ARGS__)
#define CU_9(X, ...) (u64)(X),CU_8(__VA_ARGS__)
#define CU_10(X, ...) (u64)(X),CU_9(__VA_ARGS__)
#define CU_11(X, ...) (u64)(X),CU_10(__VA_ARGS__)
#define CU_12(X, ...) (u64)(X),CU_11(__VA_ARGS__)
#define CU_13(X, ...) (u64)(X),CU_12(__VA_ARGS__)
#define CU_14(X, ...) (u64)(X),CU_13(__VA_ARGS__)
#define CU_15(X, ...) (u64)(X),CU_14(__VA_ARGS__)
#define CU_16(X, ...) (u64)(X),CU_15(__VA_ARGS__)
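/*
 * For example (illustrative): CAST_ARGS_U64(a, b, c) selects CU_3 and
 * expands to (u64)(a),(u64)(b),(u64)(c).
 */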
struct super_block;
void scoutfs_trace_write(char *fmt, int nr, ...);
__attribute__((format(printf, 1, 2)))
static inline void only_check_format(const char *fmt, ...)
{
}
#define __trace_write(fmtp, args...) \
scoutfs_trace_write(fmtp, NR_VA_ARGS(args), ##args)
/*
* Record an unstructured trace message for debugging.
*
* The arguments can only be scalar integers and will be cast to u64 so
* only %llu formats can be used.
*
* This can only be called from task context.
*
* The super block is only used to indicate which mount initiated the
* trace and it can be null for trace messages not associated with
* mounts.
*/
#define scoutfs_trace(sb, fmt, ...) \
do { \
static char __scoutfs_trace_section __fmt[] = \
"ns %llu sb %llx pid %llu cpu %llu "fmt; \
\
BUILD_BUG_ON(fmt[sizeof(fmt) - 2] == '\n'); \
\
/* check the caller's format before we prepend things to it */ \
only_check_format(fmt, CAST_ARGS_U64(__VA_ARGS__)); \
\
__trace_write(__fmt, \
CAST_ARGS_U64(sched_clock(), (long)(sb), \
current->pid, get_cpu(), \
__VA_ARGS__)); \
put_cpu(); \
} while (0)
int __init scoutfs_trace_init(void);
void __exit scoutfs_trace_exit(void);
#endif
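
As a usage illustration (hypothetical caller, not part of this commit), a filesystem path might record a message like this; the caller's arguments are cast to u64 by the macro so its format uses only %llu, and the message must not end in a newline:

static int example_read_block(struct super_block *sb, u64 blkno, int len)
{
	/* prefixed with "ns %llu sb %llx pid %llu cpu %llu " by the macro */
	scoutfs_trace(sb, "read_block blkno %llu len %llu", blkno, len);

	return 0;
}

The only_check_format() call inside the macro keeps gcc's printf format checking pointed at the caller's format before the standard prefix is prepended.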