Files
scylladb/dist/common/scripts/node_health_check
Avi Kivity f3eade2f62 treewide: relicense to ScyllaDB-Source-Available-1.0
Drop the AGPL license in favor of a source-available license.
See the blog post [1] for details.

[1] https://www.scylladb.com/2024/12/18/why-were-moving-to-a-source-available-license/
2024-12-18 17:45:13 +02:00

535 lines
18 KiB
Bash
Executable File

#!/bin/bash
#
# Copyright (C) 2017-present ScyllaDB
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
# Append Scylla's relocatable tool directory to PATH so that a CLI tool which
# is not installed system-wide is still found there (entries already on PATH
# keep priority because the directory is appended, not prepended).
PATH="${PATH}:/opt/scylladb/bin"
export PATH

## Variables ##

# The report is written to the current directory and named after this host.
REPORT="./$(hostname)-health-check-report.txt"

# Raw command output is collected under one sub-directory per category.
OUTPUT_PATH="./output_files"
OUTPUT_PATH1="${OUTPUT_PATH}/system_checks"
OUTPUT_PATH2="${OUTPUT_PATH}/scylladb_checks"
OUTPUT_PATH3="${OUTPUT_PATH}/nodetool_commands"
OUTPUT_PATH4="${OUTPUT_PATH}/data_model"
OUTPUT_PATH5="${OUTPUT_PATH}/network_checks"

# Distro family flags. "0" means "this family"; the OS check further down
# flips a flag to "1" when the family name is absent from /etc/os-release.
IS_FEDORA="0"
IS_DEBIAN="0"
IS_GENTOO="0"

# Default ports; overridable with -p (nodetool/JMX) and -q (cqlsh).
JMX_PORT="7199"
CQL_PORT="9042"

# Which optional sections are printed into the report (-d, -n, -c, -a).
PRINT_DM=NO
PRINT_NET=NO
PRINT_cfstats=NO

# Service state flags. "0" means OK; the service checks set "1" on failure.
SCYLLA_SERVICE="0"
JMX_SERVICE="0"

# Last ':'-separated field of the listen_address line in scylla.yaml.
# NOTE(review): not referenced anywhere else in the visible script.
LISTEN_ADDR=$(grep -e '^listen_address' /etc/scylla/scylla.yaml | awk -F':' '{ print $NF }')
# Parse command-line flags. The leading ':' in the optstring selects getopts'
# silent mode: an unknown flag is delivered as '?' and a flag that is missing
# its argument is delivered as ':', with the flag letter in OPTARG both times.
while getopts ":hdncap:q:" opt; do
    case $opt in
        p) JMX_PORT=$OPTARG ;;
        q) CQL_PORT=$OPTARG ;;
        c) PRINT_cfstats=YES ;;
        d) PRINT_DM=YES ;;
        n) PRINT_NET=YES ;;
        a)
            # -a is shorthand for -c -d -n together.
            PRINT_DM=YES
            PRINT_NET=YES
            PRINT_cfstats=YES
            ;;
        h)
            # Usage text, one argument per output line; exits with status 2.
            printf '%s\n' \
                "" \
                "This script performs system review and generates health check report based on" \
                "the configuration data (hardware, OS, Scylla SW, etc.) collected from the node." \
                "" \
                "Usage:" \
                "-p Port to use for nodetool commands (default: 7199)" \
                "-q Port to use for cqlsh (default: 9042)" \
                "-c Print cfstats output" \
                "-d Print data model info" \
                "-n Print network info" \
                "-a Print all" \
                "-h Display this help and exit" \
                "" \
                "Note: output for the above is collected, but not printed in the report." \
                "If you wish to have them printed, please supply the relevant flag/s." \
                ""
            exit 2
            ;;
        :)
            echo "Option -$OPTARG requires an argument."
            exit 2
            ;;
        \?)
            echo "Invalid option: -$OPTARG"
            exit 2
            ;;
    esac
done
# Get the CQL_ADDR from whatever is currently listening on $CQL_PORT. If it's a
# wildcard (or nothing is listening), use the listen_address from scylla.yaml.

# Print the address of the first listener bound to port $1.
#   stdin : socket listing in `ss -ltn` format (whitespace-separated columns)
#   $1    : port number to look for
#   stdout: the address without the ":PORT" suffix and without IPv6 brackets,
#           or nothing when no column ends in ":PORT"
# The match is an exact ":PORT" suffix on a column, so unrelated numbers that
# merely contain the port digits (other ports, inode numbers) never match, and
# only the first hit is printed so the result is always a single word.
cql_listener_addr() {
    awk -v port="$1" '
        port != "" {
            suffix = ":" port
            for (i = 1; i <= NF; i++) {
                cut = length($i) - length(suffix)
                if (cut > 0 && substr($i, cut + 1) == suffix) {
                    addr = substr($i, 1, cut)
                    gsub(/\[|\]/, "", addr)
                    print addr
                    exit
                }
            }
        }'
}

# -l listening sockets only, -t TCP only, -n numeric ports (no service names).
CQL_ADDR=$(ss -ltn | cql_listener_addr "$CQL_PORT")
case "$CQL_ADDR" in
    '' | '*' | '0.0.0.0' | '::')
        CQL_ADDR=$(grep -e '^listen_address:' /etc/scylla/scylla.yaml | awk '{print $NF}')
        ;;
esac
##Check server release (Fedora/Oracle/Debian/Gentoo)##

# Return 0 when the os-release file mentions $1 (case-insensitive substring).
#   $1: distro name to look for
#   $2: file to inspect (optional, defaults to /etc/os-release)
# An unreadable file counts as "not mentioned".
os_release_mentions() {
    grep -qi -- "$1" "${2:-/etc/os-release}" 2> /dev/null
}

# Flag convention used by the rest of the script: "0" = this distro family,
# "1" = not this family. Flags start at "0" and are only ever raised here.
# Oracle Linux is treated as part of the Fedora (rpm) family.
if ! os_release_mentions fedora && ! os_release_mentions oracle; then
    IS_FEDORA="1"
fi
if ! os_release_mentions debian; then
    IS_DEBIAN="1"
fi
if ! os_release_mentions gentoo; then
    IS_GENTOO="1"
fi

# None of the known families matched: refuse to continue.
if [[ "$IS_FEDORA" == "1" && "$IS_DEBIAN" == "1" && "$IS_GENTOO" == "1" ]]; then
    echo "This is a Non-Supported OS, Please Review the Support Matrix"
    exit 222
fi
##Scylla-server service status##
# A process named "scylla" must exist; otherwise flag the service as down
# ("1") so that the data model collection is skipped later on.
printf '%s\n' \
    "--------------------------------------------------" \
    "Checking Scylla-server Service" \
    "--------------------------------------------------"
if ps -C scylla --no-headers &> /dev/null; then
    echo "Scylla-server Service: OK"
    echo "--------------------------------------------------"
else
    SCYLLA_SERVICE="1"
    printf '%s\n' \
        "ERROR: Scylla-server is NOT Running" \
        "Cannot Collect Data Model Info" \
        "--------------------------------------------------"
fi

##Scylla-JMX service status##
# A successful "nodetool status" on the chosen port is used as the probe;
# on failure the flag is raised ("1") and nodetool collection is skipped.
printf '%s\n' \
    "Checking Scylla-JMX Service on Port $JMX_PORT" \
    "--------------------------------------------------"
if nodetool -p"$JMX_PORT" status &> /dev/null; then
    echo "Scylla-JMX Service (nodetool): OK"
    echo "--------------------------------------------------"
else
    JMX_SERVICE="1"
    printf '%s\n' \
        "ERROR: Scylla-JMX is NOT Running / NOT Listening on Port $JMX_PORT" \
        "Cannot Collect Nodetool Info" \
        "Use the '-p' Option to Provide the Scylla-JMX Port" \
        "--------------------------------------------------"
fi
#Create dir structure to save output_files#
echo "--------------------------------------------------"
echo "Creating Output Files Directory"
echo "--------------------------------------------------"
mkdir -p "$OUTPUT_PATH"
mkdir -p "$OUTPUT_PATH1" "$OUTPUT_PATH2" "$OUTPUT_PATH3" "$OUTPUT_PATH4" "$OUTPUT_PATH5"

##Output Collection##
#System Checks#
echo "Collecting System Info"
echo "--------------------------------------------------"
cp -p /etc/os-release "$OUTPUT_PATH1"
uname -r > "$OUTPUT_PATH1/kernel-release.txt"
lscpu > "$OUTPUT_PATH1/cpu-info.txt"
# The awk pass re-joins fields, which trims vmstat's leading padding.
vmstat -s -S M | awk '{$1=$1};1' > "$OUTPUT_PATH1/vmstat.txt"

# Filesystem usage, a blank separator line, then per-entry sizes under
# /var/lib/scylla. The three parts are independent: a failing df no longer
# suppresses the du listing.
{
    df -Th
    echo ""
    sudo du -sh /var/lib/scylla/*
} > "$OUTPUT_PATH1/capacity-info.txt"

cp -p /proc/mdstat "$OUTPUT_PATH1"

# "path: value" for every scheduler file under /sys, a blank line, then the
# same for every nomerges file. Paths are read line by line so that the find
# output is never word-split or glob-expanded.
{
    while IFS= read -r sysfs_file; do
        echo -n "$sysfs_file: "
        cat "$sysfs_file"
    done < <(sudo find /sys -name scheduler)
    echo ""
    while IFS= read -r sysfs_file; do
        echo -n "$sysfs_file: "
        cat "$sysfs_file"
    done < <(sudo find /sys -name nomerges)
} > "$OUTPUT_PATH1/io-sched-conf.txt"
#ScyllaDB Checks#
printf '%s\n' \
    "Collecting Scylla Info" \
    "--------------------------------------------------"

# Version, every file from both Scylla config directories, and a listing of
# the coredump directory (oldest first).
scylla --version > "${OUTPUT_PATH2}/scylla-version.txt"
cp -p /etc/scylla/* "${OUTPUT_PATH2}"
cp -p /etc/scylla.d/* "${OUTPUT_PATH2}"
ls -ltrh /var/lib/scylla/coredump/ > "${OUTPUT_PATH2}/coredump-folder.txt"

# Installed Scylla packages plus the service defaults file, per distro family
# (a flag value of "0" means the family was detected). The three tests are
# independent of each other; every branch that runs rewrites scylla-pkgs.txt.
if [[ "$IS_FEDORA" == "0" ]]; then
    rpm -qa | grep -i scylla > "${OUTPUT_PATH2}/scylla-pkgs.txt"
    cp -p /etc/sysconfig/scylla-server "${OUTPUT_PATH2}"
fi

if [[ "$IS_DEBIAN" == "0" ]]; then
    dpkg -l | grep -i scylla > "${OUTPUT_PATH2}/scylla-pkgs.txt"
    cp -p /etc/default/scylla-server "${OUTPUT_PATH2}"
fi

if [[ "$IS_GENTOO" == "0" ]]; then
    # qlist comes from portage-utils, so that package is emerged first.
    sudo emerge -1uq app-portage/portage-utils
    sudo qlist -ICv scylla > "${OUTPUT_PATH2}/scylla-pkgs.txt"
    cp -p /etc/default/scylla-server "${OUTPUT_PATH2}"
fi
#Scylla Logs#
printf '%s\n' \
    "--------------------------------------------------" \
    "Collecting Logs" \
    "--------------------------------------------------"

# Log source, in order of preference:
#   1. the journal, when a working journalctl is available;
#   2. Scylla's own log file on Gentoo (flag "0" = Gentoo detected);
#   3. the Scylla lines from syslog everywhere else.
if journalctl --help &> /dev/null; then
    journalctl -t scylla > "${OUTPUT_PATH}/scylla-logs.txt"
elif [[ "$IS_GENTOO" == "0" ]]; then
    cat /var/log/scylla/scylla.log > "${OUTPUT_PATH}/scylla-logs.txt"
else
    grep -i scylla /var/log/syslog > "${OUTPUT_PATH}/scylla-logs.txt"
fi

# Compress in place; -f replaces a scylla-logs.txt.gz left by an earlier run.
gzip -f "${OUTPUT_PATH}/scylla-logs.txt"
#Nodetool commands#
# Skipped entirely when the earlier JMX probe failed (JMX_SERVICE == "1").
if [ "$JMX_SERVICE" == "1" ]; then
    echo "Skipping Nodetool Info Collection"
    echo "--------------------------------------------------"
else
    echo "Collecting Nodetool Commands Info (using port $JMX_PORT)"
    echo "--------------------------------------------------"

    # Sub-commands whose output is stored verbatim as nodetool-<name>.txt.
    for nodetool_cmd in status info netstats gossipinfo proxyhistograms compactionstats ring; do
        nodetool -p"$JMX_PORT" "$nodetool_cmd" > "$OUTPUT_PATH3/nodetool-$nodetool_cmd.txt"
    done

    # cfstats is queried once and both summaries are derived from that single
    # snapshot, instead of running the (potentially slow) command twice.
    cfstats_output=$(nodetool -p"$JMX_PORT" cfstats -H)

    # Keyspace summary: each "Keyspace" line plus the four lines after it.
    printf '%s\n' "$cfstats_output" \
        | grep -A 4 Keyspace > "$OUTPUT_PATH3/nodetool-cfstats-keyspace.txt"

    # Table summary: keep the selected lines, trim their indentation, and put
    # a " --" marker after every 7th line; sed then adds the same marker in
    # front of the very first line.
    printf '%s\n' "$cfstats_output" \
        | grep -E 'Table:|SSTable count:|Compacted|tombstones' \
        | awk '{$1=$1};1' \
        | awk '{print; if (FNR % 7 == 0 ) printf "\n --";}' > "$OUTPUT_PATH3/nodetool-cfstats-table.txt"
    sed -i '1s/^/ --/' "$OUTPUT_PATH3/nodetool-cfstats-table.txt"
fi
#not implemented: nodetool cfhistograms $KS $TN >> $OUTPUT_PATH3/nodetool-cfhistograms.txt#
#Data Model#
# Three outcomes: Scylla is down (skip), cqlsh answers (dump schema and table
# list), or cqlsh cannot connect (tell the user about -q).
# TODO: handle connecting with authentication
# $CQL_ADDR and $CQL_PORT are deliberately left unquoted in the cqlsh calls:
# an empty address must disappear from the argument list rather than be
# passed as an empty host name.
if [[ "$SCYLLA_SERVICE" == "1" ]]; then
    printf '%s\n' \
        "Skipping Data Model Info Collection" \
        "--------------------------------------------------"
elif cqlsh $CQL_ADDR $CQL_PORT -e "HELP" &> /dev/null; then
    printf '%s\n' \
        "Collecting Data Model Info (using port $CQL_PORT)" \
        "--------------------------------------------------"
    cqlsh $CQL_ADDR $CQL_PORT -e "DESCRIBE SCHEMA" > "${OUTPUT_PATH4}/describe-schema.txt"
    cqlsh $CQL_ADDR $CQL_PORT -e "DESCRIBE TABLES" > "${OUTPUT_PATH4}/describe-tables.txt"
else
    printf '%s\n' \
        "ERROR: CQL is NOT Listening on Port $CQL_PORT" \
        "Cannot Collect Data Model Info" \
        "Use the '-q' Option to Provide the CQL Port" \
        "--------------------------------------------------"
fi
#Network Checks#
echo "Collecting Network Info"
echo "--------------------------------------------------"
# Overwrite (not append), like every other collected file, so a directory left
# behind by an interrupted run does not accumulate duplicate output.
ifconfig -a > "$OUTPUT_PATH5/ifconfig.txt"

# Names of all entries in /sys/class/net except the loopback device. A glob is
# used instead of parsing `ls` output.
net_ifaces=()
for iface_path in /sys/class/net/*; do
    [ -e "$iface_path" ] || continue
    iface=${iface_path##*/}
    [ "$iface" == "lo" ] || net_ifaces+=("$iface")
done

# Driver information per interface.
for iface in "${net_ifaces[@]}"; do
    echo "--$iface"
    ethtool -i "$iface"
    echo ""
done > "$OUTPUT_PATH5/ethtool-NIC.txt"

cat /proc/interrupts > "$OUTPUT_PATH5/proc-interrupts.txt"

# smp_affinity of every entry in /proc/irq (default_smp_affinity excluded).
for irq_path in /proc/irq/*; do
    [ -e "$irq_path" ] || continue
    irq=${irq_path##*/}
    [ "$irq" == "default_smp_affinity" ] && continue
    echo -n "--$irq:"
    sudo cat "/proc/irq/$irq/smp_affinity"
    echo ""
done > "$OUTPUT_PATH5/irq-smp-affinity.txt"

# Per-interface queue settings: rps_cpus and rps_flow_cnt of every rx queue,
# xps_cpus of every tx queue.
for iface in "${net_ifaces[@]}"; do
    echo "--$iface"
    cat "/sys/class/net/$iface"/queues/rx-*/rps_cpus
    echo ""
done > "$OUTPUT_PATH5/rps-conf.txt"
for iface in "${net_ifaces[@]}"; do
    echo "--$iface"
    cat "/sys/class/net/$iface"/queues/tx-*/xps_cpus
    echo ""
done > "$OUTPUT_PATH5/xps-conf.txt"
for iface in "${net_ifaces[@]}"; do
    echo "--$iface"
    cat "/sys/class/net/$iface"/queues/rx-*/rps_flow_cnt
    echo ""
done > "$OUTPUT_PATH5/rfs-conf.txt"

# The [i] bracket keeps the grep process itself out of the result.
ps -elf | grep '[i]rqbalance' > "$OUTPUT_PATH5/irqbalance-conf.txt"
sudo sysctl -a > "$OUTPUT_PATH5/sysctl.txt" 2>&1

# iptables may live in /usr/sbin or /sbin; when it is absent an empty file is
# still created because the report generation reads it.
if [ -e /usr/sbin/iptables ] || [ -e /sbin/iptables ]; then
    sudo iptables -L -v > "$OUTPUT_PATH5/iptables.txt"
else
    touch "$OUTPUT_PATH5/iptables.txt"
fi
netstat -an | grep tcp > "$OUTPUT_PATH5/netstat-tcp.txt"
echo "Output Collection Completed Successfully"
echo "--------------------------------------------------"
##Generate Health Check Report##
# Console banner, including which optional sections were requested.
printf '%s\n' \
    "Generating Health Check Report" \
    "--------------------------------------------------" \
    "Print cfstats: $PRINT_cfstats" \
    "Print Data Model: $PRINT_DM" \
    "Print Network Info: $PRINT_NET" \
    "--------------------------------------------------"

# Start the report from scratch: the single '>' on the group truncates any
# report left over from a previous run, and everything inside is written in
# order (blank line, date, title, purpose paragraph).
{
    echo ""
    date "+DATE: %m/%d/%y"
    printf '%s\n' \
        "" \
        "" \
        " Health Check Report for node: $(hostname)" \
        "" \
        "" \
        "PURPOSE" \
        "=======" \
        "This document first serves as a system review and health check report." \
        "It is based on the configuration data (hardware, OS, Scylla SW, etc.) collected from the node." \
        "Based on the review and analysis of the collected data, ScyllaDB can recommend on possible" \
        "ways to better utilize the cluster, based on both experience and best practices." \
        "" \
        ""
} > "$REPORT"
# SYSTEM INFO section of the report: each sub-section is a title, an
# underline, the file collected earlier, and two blank lines (three after the
# last one). Everything in the group is appended to the report.
{
    printf '%s\n' \
        "SYSTEM INFO" \
        "===========" \
        "" \
        "Host Operating System" \
        "---------------------"
    cat "$OUTPUT_PATH1/os-release"
    # Only present on Red Hat style systems; read straight from /etc.
    if [ -f /etc/redhat-release ]; then
        cat /etc/redhat-release
    fi

    printf '%s\n' \
        "" \
        "" \
        "Kernel Release" \
        "--------------"
    cat "$OUTPUT_PATH1/kernel-release.txt"

    printf '%s\n' \
        "" \
        "" \
        "CPU Info" \
        "--------"
    cat "$OUTPUT_PATH1/cpu-info.txt"

    printf '%s\n' \
        "" \
        "" \
        "Memory Info in MB" \
        "-----------------"
    cat "$OUTPUT_PATH1/vmstat.txt"

    printf '%s\n' \
        "" \
        "" \
        "Storage/Disk Info" \
        "-----------------"
    cat "$OUTPUT_PATH1/capacity-info.txt"

    printf '%s\n' \
        "" \
        "" \
        "RAID Configuration" \
        "------------------"
    cat "$OUTPUT_PATH1/mdstat"

    printf '%s\n' \
        "" \
        "" \
        "I/O Scheduler Configuration" \
        "---------------------------"
    cat "$OUTPUT_PATH1/io-sched-conf.txt"

    printf '%s\n' "" "" ""
} >> "$REPORT"
# Print file $1 without blank lines and without comment-only lines.
# A line is dropped only when '#' is its first non-blank character, so
# settings that carry a trailing "# comment" (or a '#' inside the value) are
# kept instead of silently disappearing from the report.
config_effective_lines() {
    grep -v -e '^[[:space:]]*#' -e '^[[:space:]]*$' -- "$1"
}

# ScyllaDB INFO section of the report: version and packages, the effective
# configuration, and the coredump directory listing. Everything in the group
# is appended to the report.
{
    printf '%s\n' \
        "ScyllaDB INFO" \
        "=============" \
        "" \
        "SW Version (PKGs)" \
        "-----------------"
    cat "$OUTPUT_PATH2/scylla-version.txt"
    echo ""
    cat "$OUTPUT_PATH2/scylla-pkgs.txt"

    printf '%s\n' \
        "" \
        "" \
        "Configuration files" \
        "-------------------" \
        "## /etc/scylla/scylla.yaml ##"
    config_effective_lines "$OUTPUT_PATH2/scylla.yaml"
    printf '%s\n' "" ""

    # Heading names the location the scylla-server file was copied from
    # (flag value "0" = that distro family was detected).
    if [ "$IS_FEDORA" == "0" ]; then
        echo "## /etc/sysconfig/scylla-server ##"
    fi
    if [ "$IS_DEBIAN" == "0" ] || [ "$IS_GENTOO" == "0" ]; then
        echo "## /etc/default/scylla-server ##"
    fi
    # Only blank lines are removed here; comments are kept.
    grep -v "^[[:space:]]*$" "$OUTPUT_PATH2/scylla-server"

    printf '%s\n' \
        "" \
        "" \
        "## /etc/scylla/cassandra-rackdc.properties ##"
    config_effective_lines "$OUTPUT_PATH2/cassandra-rackdc.properties"

    printf '%s\n' \
        "" \
        "" \
        "Check for Coredumps" \
        "-------------------"
    cat "$OUTPUT_PATH2/coredump-folder.txt"
    printf '%s\n' "" ""
} >> "$REPORT"
# Nodetool status / info / gossipinfo, appended to the report only when the
# JMX probe succeeded (flag "0"), i.e. when these files were collected.
if [[ "$JMX_SERVICE" == "0" ]]; then
    {
        printf '%s\n' \
            "Nodetool Status/Info/Gossip" \
            "---------------------------" \
            "## Nodetool Status ##"
        cat "$OUTPUT_PATH3/nodetool-status.txt"

        printf '%s\n' \
            "" \
            "" \
            "## Nodetool Info ##"
        cat "$OUTPUT_PATH3/nodetool-info.txt"

        printf '%s\n' \
            "" \
            "" \
            "## Nodetool GossipInfo ##"
        cat "$OUTPUT_PATH3/nodetool-gossipinfo.txt"

        printf '%s\n' "" ""
    } >> "$REPORT"
fi
# DATA MODEL INFO section, written only when requested with -d or -a.
# Whether there is anything to print is decided by probing cqlsh again.
# $CQL_ADDR and $CQL_PORT stay unquoted so that an empty address vanishes
# from the argument list instead of being passed as an empty host name.
if [[ "$PRINT_DM" == "YES" ]]; then
    printf '%s\n' \
        "DATA MODEL INFO" \
        "===============" \
        "" >> "$REPORT"

    if cqlsh $CQL_ADDR $CQL_PORT -e "HELP" &> /dev/null; then
        # Progress message goes to the console, the data to the report.
        printf '%s\n' \
            "Printing Data Model Info to Report" \
            "--------------------------------------------------"
        {
            printf '%s\n' \
                "Describe Schema" \
                "---------------"
            cat "$OUTPUT_PATH4/describe-schema.txt"

            printf '%s\n' \
                "" \
                "Describe Tables" \
                "---------------"
            cat "$OUTPUT_PATH4/describe-tables.txt"

            printf '%s\n' "" ""
        } >> "$REPORT"
    else
        printf '%s\n' \
            "ERROR: Data Model NOT Collected - Nothing to Print" \
            "--------------------------------------------------"
        printf '%s\n' \
            "Data Model was not collected" \
            "" \
            "" \
            "" >> "$REPORT"
    fi
fi
# PERFORMANCE and METRICS INFO section. The heading is always written; the
# body depends on whether nodetool data was collected (JMX flag "0").
printf '%s\n' \
    "PERFORMANCE and METRICS INFO" \
    "============================" \
    "" >> "$REPORT"

if [[ "$JMX_SERVICE" == "0" ]]; then
    {
        printf '%s\n' \
            "Nodetool Proxyhistograms (RD/WR latency)" \
            "----------------------------------------"
        cat "$OUTPUT_PATH3/nodetool-proxyhistograms.txt"

        printf '%s\n' \
            "" \
            "Nodetool netstats" \
            "-----------------"
        cat "$OUTPUT_PATH3/nodetool-netstats.txt"

        printf '%s\n' "" ""
    } >> "$REPORT"

    # cfstats output is only printed when requested with -c or -a.
    if [[ "$PRINT_cfstats" == "YES" ]]; then
        printf '%s\n' \
            "Printing cfstats Output to Report" \
            "--------------------------------------------------"
        {
            printf '%s\n' \
                "Nodetool cfstats" \
                "----------------" \
                "## Keyspace Info ##"
            cat "$OUTPUT_PATH3/nodetool-cfstats-keyspace.txt"

            printf '%s\n' \
                "" \
                "" \
                "## Tables Info ##"
            cat "$OUTPUT_PATH3/nodetool-cfstats-table.txt"

            printf '%s\n' "" ""
        } >> "$REPORT"
    fi

    {
        printf '%s\n' \
            "Nodetool compactionstats" \
            "------------------------"
        cat "$OUTPUT_PATH3/nodetool-compactionstats.txt"
        printf '%s\n' "" "" ""
    } >> "$REPORT"
else
    printf '%s\n' \
        "Nodetool info was not collected" \
        "" \
        "" \
        "" >> "$REPORT"
fi
# NETWORK INFO section, written only when requested with -n or -a. Each
# sub-section is a title, an underline and the file collected earlier.
if [[ "$PRINT_NET" == "YES" ]]; then
    # Progress message for the console.
    printf '%s\n' \
        "Printing Network Info to Report" \
        "--------------------------------------------------"

    {
        printf '%s\n' \
            "NETWORK INFO" \
            "============" \
            "" \
            "ethtool per NIC" \
            "---------------"
        cat "$OUTPUT_PATH5/ethtool-NIC.txt"

        printf '%s\n' \
            "" \
            "" \
            "/proc/interrupts" \
            "----------------"
        cat "$OUTPUT_PATH5/proc-interrupts.txt"

        printf '%s\n' \
            "" \
            "" \
            "" \
            "IRQ smp affinity" \
            "----------------"
        cat "$OUTPUT_PATH5/irq-smp-affinity.txt"

        printf '%s\n' \
            "" \
            "" \
            "sysctl -a" \
            "---------"
        cat "$OUTPUT_PATH5/sysctl.txt"

        printf '%s\n' \
            "" \
            "" \
            "" \
            "iptables -L" \
            "-----------"
        cat "$OUTPUT_PATH5/iptables.txt"

        printf '%s\n' \
            "" \
            "" \
            "" \
            "netstat -an | grep tcp" \
            "----------------------"
        cat "$OUTPUT_PATH5/netstat-tcp.txt"

        printf '%s\n' "" "" ""
    } >> "$REPORT"
fi
# Pack the collected raw files into output_files.tgz in the current directory.
# --remove-files makes tar delete the originals once they are in the archive;
# the report itself is a separate file and is not part of the archive.
printf '%s\n' \
    "Archiving Output Files" \
    "--------------------------------------------------"
tar cvzf output_files.tgz "$OUTPUT_PATH" --remove-files
printf '%s\n' \
    "--------------------------------------------------" \
    "Health Check Report Created Successfully" \
    "Path to Report: $REPORT" \
    "--------------------------------------------------"