Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[techsupport] improve robustness #2117

Merged
merged 8 commits into from
Apr 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 85 additions & 45 deletions scripts/generate_dump
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@ EXT_RECVSIG=3
EXT_RETRY=4
EXT_TAR_FAILED=5
EXT_PROCFS_SAVE_FAILED=6
EXT_INTERRUPTED=7
EXT_TERMINATED=8
EXT_INVALID_ARGUMENT=10

TIMEOUT_EXIT_CODE=124

TAR=tar
MKDIR=mkdir
RM=rm
Expand Down Expand Up @@ -61,6 +65,8 @@ rm_lock_and_exit()
handle_exit()
{
ECODE=$?
echo "Cleaning up working directory $TARDIR"
$RM -rf $TARDIR
echo "Removing lock. Exit: $ECODE" >&2
$RM $V -rf ${LOCKDIR}
# Echo the filename as the last statement if the generation succeeds
Expand All @@ -69,11 +75,16 @@ handle_exit()
fi
}

handle_signal()
handle_sigint()
{
echo "Generate Dump received interrupt" >&2
$RM $V -rf $TARDIR
exit $EXT_RECVSIG
exit $EXT_INTERRUPTED
}

# Signal handler: report on stderr that dump generation was terminated,
# run finalize to wrap up whatever has been collected so far, then exit
# with the EXT_TERMINATED status code.
handle_sigterm()
{
    printf '%s\n' "Dump generation terminated" 1>&2
    finalize
    exit ${EXT_TERMINATED}
}

handle_error() {
Expand All @@ -83,6 +94,10 @@ handle_error() {
fi
}

# Backslash-escape every double quote in a string.
# Arguments: $1 - string to escape (typically a command line)
# Outputs:   the escaped string on stdout, followed by a newline
#
# The result is meant to be embedded inside a double-quoted argument,
# e.g. bash -c "<escaped cmd>". The argument is passed through quoted and
# via printf so that runs of whitespace, glob characters and strings that
# look like echo options (such as "-n") reach sed unchanged.
escape_quotes() {
    printf '%s\n' "$1" | sed 's/"/\\"/g'
}

save_bcmcmd() {
trap 'handle_error $? $LINENO' ERR
local start_t=$(date +%s%3N)
Expand All @@ -93,6 +108,7 @@ save_bcmcmd() {
local do_gzip=${3:-false}
local tarpath="${BASE}/dump/$filename"
local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m"
local cmd=$(escape_quotes "$cmd")
if [ ! -d $LOGDIR ]; then
$MKDIR $V -p $LOGDIR
fi
Expand All @@ -106,12 +122,12 @@ save_bcmcmd() {
# as one argument, e.g. vtysh -c "COMMAND HERE" needs to have
# "COMMAND HERE" bunched together as 1 arg to vtysh -c
if $NOOP; then
echo "${timeout_cmd} $cmd &> '${filepath}'"
echo "${timeout_cmd} bash -c \"${cmd}\" &> '${filepath}'"
else
ret=0
eval "${timeout_cmd} $cmd" &> "${filepath}" || ret=$?
eval "${timeout_cmd} bash -c \"${cmd}\" &> '${filepath}'" || ret=$?
if [ $ret -ne 0 ]; then
if [ $ret -eq 124 ]; then
if [ $ret -eq $TIMEOUT_EXIT_CODE ]; then
echo "Command: $cmd timedout after ${TIMEOUT_MIN} minutes."
else
RC=0
Expand Down Expand Up @@ -207,6 +223,8 @@ save_cmd() {
redirect_eval=""
fi

local cmd=$(escape_quotes "$cmd")
local cleanup_method_declration=$(declare -f $cleanup_method)
# eval required here to re-evaluate the $cmd properly at runtime
# This is required if $cmd has quoted strings that should be bunched
# as one argument, e.g. vtysh -c "COMMAND HERE" needs to have
Expand All @@ -215,25 +233,29 @@ save_cmd() {
tarpath="${tarpath}.gz"
filepath="${filepath}.gz"
# cleanup_method will run in a sub-shell, need declare it first
local cleanup_method_declration=$(declare -f $cleanup_method)
local cmds="$cleanup_method_declration; $cmd $redirect_eval | $cleanup_method | gzip -c > '${filepath}'"
if $NOOP; then
echo "${timeout_cmd} bash -c \"${cmds}\""
else
RC=0
eval "${timeout_cmd} bash -c \"${cmds}\"" || RC=$?
if [ $RC -ne 0 ]; then
if [ $RC -eq $TIMEOUT_EXIT_CODE ]; then
echo "Command: $cmds timedout after ${TIMEOUT_MIN} minutes."
elif [ $RC -ne 0 ]; then
echo "Command: $cmds failed with RC $RC"
fi
fi
else
local cmds="$cleanup_method_declration; $cmd | $cleanup_method $redirect '$filepath'"
if $NOOP; then
echo "${timeout_cmd} $cmd | $cleanup_method $redirect '$filepath'"
echo "${timeout_cmd} bash -c \"${cmds}\""
else
RC=0
eval "${timeout_cmd} $cmd | $cleanup_method" "$redirect" "$filepath" || RC=$?
if [ $RC -ne 0 ]; then
echo "Command: $cmd timedout after ${TIMEOUT_MIN} minutes."
eval "${timeout_cmd} bash -c \"${cmds}\"" || RC=$?
if [ $RC -eq $TIMEOUT_EXIT_CODE ]; then
echo "Command: $cmds timedout after ${TIMEOUT_MIN} minutes."
elif [ $RC -ne 0 ]; then
echo "Command: $cmds failed with RC $RC"
fi
fi
fi
Expand Down Expand Up @@ -484,20 +506,20 @@ save_bgp_neighbor() {
local asic_id=${1:-""}
local ns=$(get_vtysh_namespace $asic_id)

neighbor_list_v4=$(${timeout_cmd} vtysh $ns -c "show ip bgp neighbors" | grep "BGP neighbor is" | awk -F '[, ]' '{print $4}')
neighbor_list_v4=$(${timeout_cmd} bash -c "vtysh $ns -c 'show ip bgp neighbors' | grep 'BGP neighbor is' | awk -F '[, ]' '{print \$4}'")
for word in $neighbor_list_v4; do
save_cmd "vtysh $ns -c \"show ip bgp neighbors $word advertised-routes\"" "ip.bgp.neighbor.$word.adv$asic_id"
save_cmd "vtysh $ns -c \"show ip bgp neighbors $word routes\"" "ip.bgp.neighbor.$word.rcv$asic_id"
done
neighbor_list_v6=$(vtysh $ns -c "show bgp ipv6 neighbors" | grep "BGP neighbor is" | awk -F '[, ]' '{print $4}' | fgrep ':')
neighbor_list_v6=$(${timeout_cmd} bash -c "vtysh $ns -c 'show bgp ipv6 neighbors' | grep 'BGP neighbor is' | awk -F '[, ]' '{print \$4}' | fgrep ':'")
for word in $neighbor_list_v6; do
save_cmd "vtysh $ns -c \"show bgp ipv6 neighbors $word advertised-routes\"" "ipv6.bgp.neighbor.$word.adv$asic_id"
save_cmd "vtysh $ns -c \"show bgp ipv6 neighbors $word routes\"" "ipv6.bgp.neighbor.$word.rcv$asic_id"
done

vrf_list=`${timeout_cmd} vtysh $ns -c "show vrf" | awk -F" " '{print $2}'`
vrf_list=`${timeout_cmd} bash -c "vtysh $ns -c 'show vrf' | awk -F" " '{print \$2}'"`
for vrf in $vrf_list; do
neighbor_list=`${timeout_cmd} vtysh $ns -c "show ip bgp vrf $vrf neighbors" | grep "BGP neighbor is" | awk -F '[, ]' '{print $4}'`
neighbor_list=`${timeout_cmd} bash -c "vtysh $ns -c 'show ip bgp vrf $vrf neighbors' | grep 'BGP neighbor is' | awk -F '[, ]' '{print \$4}'"`
for word in $neighbor_list; do
save_cmd "vtysh $ns -c \"show ip bgp vrf $vrf neighbors $word advertised-routes\"" "ip.bgp.neighbor.$vrf.$word.adv$asic_id"
save_cmd "vtysh $ns -c \"show ip bgp vrf $vrf neighbors $word routes\"" "ip.bgp.neighbor.$vrf.$word.rcv$asic_id"
Expand Down Expand Up @@ -737,7 +759,7 @@ save_platform_info() {
save_cmd "show platform psustatus" "psustatus"
save_cmd "show platform ssdhealth" "ssdhealth"
save_cmd "show platform temperature" "temperature"
save_cmd "show platform fan" "fan"
save_cmd "show platform fan" "fan"
fi
}

Expand Down Expand Up @@ -856,6 +878,7 @@ enable_logrotate() {
###############################################################################
collect_mellanox() {
trap 'handle_error $? $LINENO' ERR
local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m"
local sai_dump_folder="/tmp/saisdkdump"
local sai_dump_filename="${sai_dump_folder}/sai_sdk_dump_$(date +"%m_%d_%Y_%I_%M_%p")"

Expand All @@ -865,12 +888,12 @@ collect_mellanox() {
copy_from_docker syncd $sai_dump_folder $sai_dump_folder
echo "$sai_dump_folder"
for file in `ls $sai_dump_folder`; do
save_file ${sai_dump_folder}/${file} sai_sdk_dump true
save_file ${sai_dump_folder}/${file} sai_sdk_dump true
done

${CMD_PREFIX}rm -rf $sai_dump_folder
${CMD_PREFIX}docker exec syncd rm -rf $sai_dump_folder

# Save SDK error dumps
local sdk_dump_path=`${CMD_PREFIX}docker exec syncd cat /tmp/sai.profile|grep "SAI_DUMP_STORE_PATH"|cut -d = -f2`
if [[ -d $sdk_dump_path ]]; then
Expand All @@ -880,6 +903,26 @@ collect_mellanox() {
done
rm -rf /tmp/sdk-dumps
fi

# run 'hw-management-generate-dump.sh' script and save the result file
HW_DUMP_FILE=/usr/bin/hw-management-generate-dump.sh
if [ -f "$HW_DUMP_FILE" ]; then
${CMD_PREFIX}${timeout_cmd} /usr/bin/hw-management-generate-dump.sh $ALLOW_PROCESS_STOP
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the timeout expires and the file does not exist, we should continue, but IMO we should emit an error message indicating that the timeout expired and that no hw-mgmt dump file was produced.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

ret=$?
if [ $ret -ne 0 ]; then
if [ $ret -eq $TIMEOUT_EXIT_CODE ]; then
echo "hw-management dump timedout after ${TIMEOUT_MIN} minutes."
else
echo "hw-management dump failed ..."
fi
else
save_file "/tmp/hw-mgmt-dump*" "hw-mgmt" false
rm -f /tmp/hw-mgmt-dump*
fi
else
echo "HW Mgmt dump script $HW_DUMP_FILE does not exist"
fi

}

###############################################################################
Expand Down Expand Up @@ -1087,12 +1130,11 @@ save_crash_files() {
get_asic_count() {
trap 'handle_error $? $LINENO' ERR
local redirect_eval="2>&1"
if ! $SAVE_STDERR
if ! $SAVE_STDERR
then
redirect_eval=""
fi
local cmd="show platform summary --json | python -c 'import sys, json; \
print(json.load(sys.stdin)[\"asic_count\"])'"
local cmd="python -c 'from sonic_py_common.multi_asic import get_num_asics; print(get_num_asics())'"
echo `eval ${cmd} ${redirect_eval}`
}

Expand Down Expand Up @@ -1199,6 +1241,11 @@ main() {
end_t=$(date +%s%3N)
echo "[ Capture Proc State ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO

# Save logs and cores early
save_log_files
save_crash_files
save_warmboot_files

# Save all the processes within each docker
save_cmd "show services" services.summary

Expand Down Expand Up @@ -1265,14 +1312,14 @@ main() {
save_bfd_info
save_redis_info

if $DEBUG_DUMP
if $DEBUG_DUMP
then
save_dump_state_all_ns
fi

save_cmd "docker ps -a" "docker.ps"
save_cmd "docker top pmon" "docker.pmon"

if [[ -d ${PLUGINS_DIR} ]]; then
local -r dump_plugins="$(find ${PLUGINS_DIR} -type f -executable)"
for plugin in $dump_plugins; do
Expand Down Expand Up @@ -1333,25 +1380,16 @@ main() {
end_t=$(date +%s%3N)
echo "[ TAR /etc Files ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO

save_log_files
save_warmboot_files
save_crash_files
finalize
}

# run 'hw-management-generate-dump.sh' script and save the result file
HW_DUMP_FILE=/usr/bin/hw-management-generate-dump.sh
if [ -f "$HW_DUMP_FILE" ]; then
/usr/bin/hw-management-generate-dump.sh $ALLOW_PROCESS_STOP
save_file "/tmp/hw-mgmt-dump*" "hw-mgmt" false
rm -f /tmp/hw-mgmt-dump*
else
echo "HW Mgmt dump script $HW_DUMP_FILE does not exist"
fi
###############################################################################
# Finalize dump generation
###############################################################################
finalize() {
# Save techsupport timing profile info
save_file $TECHSUPPORT_TIME_INFO log false

# clean up working tar dir before compressing
$RM $V -rf $TARDIR

if $DO_COMPRESS; then
RC=0
$GZIP $V $TARFILE || RC=$?
Expand All @@ -1364,13 +1402,14 @@ main() {

# Invoke the TechSupport Cleanup Hook
setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log &

if ! $SAVE_STDERR
then
exit $RETURN_CODE
fi
}


###############################################################################
# Remove secret from pipeline input and output result to pipeline.
# Globals:
Expand Down Expand Up @@ -1416,7 +1455,7 @@ remove_secret_from_etc_files() {

# Remove snmp community string from snmp.yml
sed -i -E 's/(\s*snmp_\S*community\s*:\s*)(\S*)/\1****/g' $dumppath/etc/sonic/snmp.yml

# Remove secret from /etc/sonic/config_db.json
cat $dumppath/etc/sonic/config_db.json | remove_secret_from_config_db_dump > $dumppath/etc/sonic/config_db.json.temp
mv $dumppath/etc/sonic/config_db.json.temp $dumppath/etc/sonic/config_db.json
Expand Down Expand Up @@ -1475,9 +1514,9 @@ OPTIONS
"24 March", "yesterday", etc.
-t TIMEOUT_MINS
Command level timeout in minutes
-r
-r
Redirect any intermediate errors to STDERR
-d
-d
Collect the output of debug dump cli
EOF
}
Expand Down Expand Up @@ -1527,7 +1566,7 @@ while getopts ":xnvhzas:t:r:d" opt; do
r)
SAVE_STDERR=false
;;
d)
d)
DEBUG_DUMP=true
;;
/?)
Expand All @@ -1553,7 +1592,8 @@ if $MKDIR "${LOCKDIR}" &>/dev/null; then
echo "$$" > "${PIDFILE}"
# This handler will exit the script upon receiving these interrupts
# Trap configured on EXIT will be triggered by the exit from the handle_sigint/handle_sigterm functions
trap 'handle_signal' SIGINT SIGHUP SIGQUIT SIGTERM
trap 'handle_sigterm' SIGHUP SIGQUIT SIGTERM
trap 'handle_sigint' SIGINT
echo "Lock succesfully accquired and installed signal handlers"
# Proceed with the actual code
if [[ ! -z "${V}" ]]; then
Expand Down
21 changes: 9 additions & 12 deletions show/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,25 +68,22 @@

GEARBOX_TABLE_PHY_PATTERN = r"_GEARBOX_TABLE:phy:*"

COMMAND_TIMEOUT = 300

# To be enhanced. Routing-stack information should be collected from a global
# location (configdb?), so that we prevent the continuous execution of this
# bash oneliner. To be revisited once routing-stack info is tracked somewhere.
def get_routing_stack():
result = None
command = "sudo docker ps | grep bgp | awk '{print$2}' | cut -d'-' -f3 | cut -d':' -f1 | head -n 1"

try:
proc = subprocess.Popen(command,
stdout=subprocess.PIPE,
shell=True,
text=True)
stdout = proc.communicate()[0]
proc.wait()
stdout = subprocess.check_output(command, shell=True, timeout=COMMAND_TIMEOUT)
result = stdout.rstrip('\n')
except Exception as err:
click.echo('Failed to get routing stack: {}'.format(err), err=True)

except OSError as e:
raise OSError("Cannot detect routing-stack")

return (result)
return result


# Global Routing-Stack variable
Expand Down Expand Up @@ -1149,7 +1146,7 @@ def users(verbose):
@click.option('--redirect-stderr', '-r', is_flag=True, help="Redirect an intermediate errors to STDERR")
def techsupport(since, global_timeout, cmd_timeout, verbose, allow_process_stop, silent, debug_dump, redirect_stderr):
"""Gather information for troubleshooting"""
cmd = "sudo timeout -s SIGTERM --foreground {}m".format(global_timeout)
cmd = "sudo timeout --kill-after={}s -s SIGTERM --foreground {}m".format(COMMAND_TIMEOUT, global_timeout)

if allow_process_stop:
cmd += " -a"
Expand All @@ -1164,7 +1161,7 @@ def techsupport(since, global_timeout, cmd_timeout, verbose, allow_process_stop,
cmd += " -s '{}'".format(since)

if debug_dump:
cmd += " -d "
cmd += " -d"

cmd += " -t {}".format(cmd_timeout)
if redirect_stderr:
Expand Down
Loading