diff --git a/scripts/generate_dump b/scripts/generate_dump
index 85a7f6a057..9223314955 100755
--- a/scripts/generate_dump
+++ b/scripts/generate_dump
@@ -13,8 +13,12 @@ EXT_RECVSIG=3
 EXT_RETRY=4
 EXT_TAR_FAILED=5
 EXT_PROCFS_SAVE_FAILED=6
+EXT_INTERRUPTED=7
+EXT_TERMINATED=8
 EXT_INVALID_ARGUMENT=10
 
+TIMEOUT_EXIT_CODE=124
+
 TAR=tar
 MKDIR=mkdir
 RM=rm
@@ -61,6 +65,8 @@ rm_lock_and_exit()
 handle_exit()
 {
     ECODE=$?
+    echo "Cleaning up working directory $TARDIR"
+    $RM -rf $TARDIR
     echo "Removing lock. Exit: $ECODE" >&2
     $RM $V -rf ${LOCKDIR}
     # Echo the filename as the last statement if the generation succeeds
@@ -69,11 +75,16 @@ handle_exit()
     fi
 }
 
-handle_signal()
+handle_sigint()
 {
     echo "Generate Dump received interrupt" >&2
-    $RM $V -rf $TARDIR
-    exit $EXT_RECVSIG
+    exit $EXT_INTERRUPTED
+}
+
+handle_sigterm() {
+    echo "Dump generation terminated" >&2
+    finalize
+    exit $EXT_TERMINATED
 }
 
 handle_error() {
@@ -83,6 +94,10 @@ handle_error() {
     fi
 }
 
+escape_quotes() {
+    echo $1 | sed 's/\"/\\\"/g'
+}
+
 save_bcmcmd() {
     trap 'handle_error $? $LINENO' ERR
     local start_t=$(date +%s%3N)
@@ -93,6 +108,7 @@ save_bcmcmd() {
     local do_gzip=${3:-false}
     local tarpath="${BASE}/dump/$filename"
     local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m"
+    local cmd=$(escape_quotes "$cmd")
     if [ ! -d $LOGDIR ]; then
         $MKDIR $V -p $LOGDIR
     fi
@@ -106,12 +122,12 @@ save_bcmcmd() {
     # as one argument, e.g. vtysh -c "COMMAND HERE" needs to have
     # "COMMAND HERE" bunched together as 1 arg to vtysh -c
     if $NOOP; then
-        echo "${timeout_cmd} $cmd &> '${filepath}'"
+        echo "${timeout_cmd} bash -c \"${cmd}\" &> '${filepath}'"
     else
         ret=0
-        eval "${timeout_cmd} $cmd" &> "${filepath}" || ret=$?
+        eval "${timeout_cmd} bash -c \"${cmd}\" &> '${filepath}'" || ret=$?
         if [ $ret -ne 0 ]; then
-            if [ $ret -eq 124 ]; then
+            if [ $ret -eq $TIMEOUT_EXIT_CODE ]; then
                 echo "Command: $cmd timedout after ${TIMEOUT_MIN} minutes."
             else
                 RC=0
@@ -207,6 +223,8 @@ save_cmd() {
         redirect_eval=""
     fi
 
+    local cmd=$(escape_quotes "$cmd")
+    local cleanup_method_declration=$(declare -f $cleanup_method)
     # eval required here to re-evaluate the $cmd properly at runtime
     # This is required if $cmd has quoted strings that should be bunched
     # as one argument, e.g. vtysh -c "COMMAND HERE" needs to have
@@ -215,25 +233,29 @@ save_cmd() {
         tarpath="${tarpath}.gz"
         filepath="${filepath}.gz"
         # cleanup_method will run in a sub-shell, need declare it first
-        local cleanup_method_declration=$(declare -f $cleanup_method)
         local cmds="$cleanup_method_declration; $cmd $redirect_eval | $cleanup_method | gzip -c > '${filepath}'"
         if $NOOP; then
             echo "${timeout_cmd} bash -c \"${cmds}\""
         else
             RC=0
             eval "${timeout_cmd} bash -c \"${cmds}\"" || RC=$?
-            if [ $RC -ne 0 ]; then
+            if [ $RC -eq $TIMEOUT_EXIT_CODE ]; then
                 echo "Command: $cmds timedout after ${TIMEOUT_MIN} minutes."
+            elif [ $RC -ne 0 ]; then
+                echo "Command: $cmds failed with RC $RC"
             fi
         fi
     else
+        local cmds="$cleanup_method_declration; $cmd | $cleanup_method $redirect '$filepath'"
         if $NOOP; then
-            echo "${timeout_cmd} $cmd | $cleanup_method $redirect '$filepath'"
+            echo "${timeout_cmd} bash -c \"${cmds}\""
        else
             RC=0
-            eval "${timeout_cmd} $cmd | $cleanup_method" "$redirect" "$filepath" || RC=$?
-            if [ $RC -ne 0 ]; then
-                echo "Command: $cmd timedout after ${TIMEOUT_MIN} minutes."
+            eval "${timeout_cmd} bash -c \"${cmds}\"" || RC=$?
+            if [ $RC -eq $TIMEOUT_EXIT_CODE ]; then
+                echo "Command: $cmds timedout after ${TIMEOUT_MIN} minutes."
+            elif [ $RC -ne 0 ]; then
+                echo "Command: $cmds failed with RC $RC"
             fi
         fi
     fi
@@ -484,20 +506,20 @@ save_bgp_neighbor() {
     local asic_id=${1:-""}
     local ns=$(get_vtysh_namespace $asic_id)
 
-    neighbor_list_v4=$(${timeout_cmd} vtysh $ns -c "show ip bgp neighbors" | grep "BGP neighbor is" | awk -F '[, ]' '{print $4}')
+    neighbor_list_v4=$(${timeout_cmd} bash -c "vtysh $ns -c 'show ip bgp neighbors' | grep 'BGP neighbor is' | awk -F '[, ]' '{print \$4}'")
     for word in $neighbor_list_v4; do
         save_cmd "vtysh $ns -c \"show ip bgp neighbors $word advertised-routes\"" "ip.bgp.neighbor.$word.adv$asic_id"
         save_cmd "vtysh $ns -c \"show ip bgp neighbors $word routes\"" "ip.bgp.neighbor.$word.rcv$asic_id"
     done
-    neighbor_list_v6=$(vtysh $ns -c "show bgp ipv6 neighbors" | grep "BGP neighbor is" | awk -F '[, ]' '{print $4}' | fgrep ':')
+    neighbor_list_v6=$(${timeout_cmd} bash -c "vtysh $ns -c 'show bgp ipv6 neighbors' | grep 'BGP neighbor is' | awk -F '[, ]' '{print \$4}' | fgrep ':'")
     for word in $neighbor_list_v6; do
         save_cmd "vtysh $ns -c \"show bgp ipv6 neighbors $word advertised-routes\"" "ipv6.bgp.neighbor.$word.adv$asic_id"
         save_cmd "vtysh $ns -c \"show bgp ipv6 neighbors $word routes\"" "ipv6.bgp.neighbor.$word.rcv$asic_id"
     done
 
-    vrf_list=`${timeout_cmd} vtysh $ns -c "show vrf" | awk -F" " '{print $2}'`
+    vrf_list=`${timeout_cmd} bash -c "vtysh $ns -c 'show vrf' | awk -F" " '{print \$2}'"`
     for vrf in $vrf_list; do
-        neighbor_list=`${timeout_cmd} vtysh $ns -c "show ip bgp vrf $vrf neighbors" | grep "BGP neighbor is" | awk -F '[, ]' '{print $4}'`
+        neighbor_list=`${timeout_cmd} bash -c "vtysh $ns -c 'show ip bgp vrf $vrf neighbors' | grep 'BGP neighbor is' | awk -F '[, ]' '{print \$4}'"`
         for word in $neighbor_list; do
             save_cmd "vtysh $ns -c \"show ip bgp vrf $vrf neighbors $word advertised-routes\"" "ip.bgp.neighbor.$vrf.$word.adv$asic_id"
             save_cmd "vtysh $ns -c \"show ip bgp vrf $vrf neighbors $word routes\"" "ip.bgp.neighbor.$vrf.$word.rcv$asic_id"
@@ -737,7 +759,7 @@ save_platform_info() {
         save_cmd "show platform psustatus" "psustatus"
         save_cmd "show platform ssdhealth" "ssdhealth"
         save_cmd "show platform temperature" "temperature"
-        save_cmd "show platform fan" "fan" 
+        save_cmd "show platform fan" "fan"
     fi
 }
 
@@ -856,6 +878,7 @@ enable_logrotate() {
 ###############################################################################
 collect_mellanox() {
     trap 'handle_error $? $LINENO' ERR
+    local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m"
     local sai_dump_folder="/tmp/saisdkdump"
     local sai_dump_filename="${sai_dump_folder}/sai_sdk_dump_$(date +"%m_%d_%Y_%I_%M_%p")"
 
@@ -865,12 +888,12 @@ collect_mellanox() {
     copy_from_docker syncd $sai_dump_folder $sai_dump_folder
     echo "$sai_dump_folder"
     for file in `ls $sai_dump_folder`; do
-        save_file ${sai_dump_folder}/${file} sai_sdk_dump true 
+        save_file ${sai_dump_folder}/${file} sai_sdk_dump true
     done
     ${CMD_PREFIX}rm -rf $sai_dump_folder
     ${CMD_PREFIX}docker exec syncd rm -rf $sai_dump_folder
-    
+
     # Save SDK error dumps
     local sdk_dump_path=`${CMD_PREFIX}docker exec syncd cat /tmp/sai.profile|grep "SAI_DUMP_STORE_PATH"|cut -d = -f2`
     if [[ -d $sdk_dump_path ]]; then
@@ -880,6 +903,26 @@
         done
         rm -rf /tmp/sdk-dumps
     fi
+
+    # run 'hw-management-generate-dump.sh' script and save the result file
+    HW_DUMP_FILE=/usr/bin/hw-management-generate-dump.sh
+    if [ -f "$HW_DUMP_FILE" ]; then
+        ${CMD_PREFIX}${timeout_cmd} /usr/bin/hw-management-generate-dump.sh $ALLOW_PROCESS_STOP
+        ret=$?
+        if [ $ret -ne 0 ]; then
+            if [ $ret -eq $TIMEOUT_EXIT_CODE ]; then
+                echo "hw-management dump timedout after ${TIMEOUT_MIN} minutes."
+            else
+                echo "hw-management dump failed ..."
+            fi
+        else
+            save_file "/tmp/hw-mgmt-dump*" "hw-mgmt" false
+            rm -f /tmp/hw-mgmt-dump*
+        fi
+    else
+        echo "HW Mgmt dump script $HW_DUMP_FILE does not exist"
+    fi
+
 }
 
 ###############################################################################
@@ -1087,12 +1130,11 @@ save_crash_files() {
 get_asic_count() {
     trap 'handle_error $? $LINENO' ERR
     local redirect_eval="2>&1"
-    if ! $SAVE_STDERR 
+    if ! $SAVE_STDERR
     then
         redirect_eval=""
     fi
-    local cmd="show platform summary --json | python -c 'import sys, json; \
-                print(json.load(sys.stdin)[\"asic_count\"])'"
+    local cmd="python -c 'from sonic_py_common.multi_asic import get_num_asics; print(get_num_asics())'"
     echo `eval ${cmd} ${redirect_eval}`
 }
 
@@ -1199,6 +1241,11 @@
     end_t=$(date +%s%3N)
     echo "[ Capture Proc State ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
 
+    # Save logs and cores early
+    save_log_files
+    save_crash_files
+    save_warmboot_files
+
     # Save all the processes within each docker
     save_cmd "show services" services.summary
 
@@ -1265,14 +1312,14 @@
     save_bfd_info
     save_redis_info
 
-    if $DEBUG_DUMP 
+    if $DEBUG_DUMP
     then
         save_dump_state_all_ns
     fi
 
     save_cmd "docker ps -a" "docker.ps"
     save_cmd "docker top pmon" "docker.pmon"
-    
+
     if [[ -d ${PLUGINS_DIR} ]]; then
         local -r dump_plugins="$(find ${PLUGINS_DIR} -type f -executable)"
         for plugin in $dump_plugins; do
@@ -1333,25 +1380,16 @@
     end_t=$(date +%s%3N)
     echo "[ TAR /etc Files ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
 
-    save_log_files
-    save_warmboot_files
-    save_crash_files
+    finalize
+}
 
-    # run 'hw-management-generate-dump.sh' script and save the result file
-    HW_DUMP_FILE=/usr/bin/hw-management-generate-dump.sh
-    if [ -f "$HW_DUMP_FILE" ]; then
-        /usr/bin/hw-management-generate-dump.sh $ALLOW_PROCESS_STOP
-        save_file "/tmp/hw-mgmt-dump*" "hw-mgmt" false
-        rm -f /tmp/hw-mgmt-dump*
-    else
-        echo "HW Mgmt dump script $HW_DUMP_FILE does not exist"
-    fi
+###############################################################################
+# Finalize dump generation
+###############################################################################
+finalize() {
     # Save techsupport timing profile info
     save_file $TECHSUPPORT_TIME_INFO log false
 
-    # clean up working tar dir before compressing
-    $RM $V -rf $TARDIR
-
     if $DO_COMPRESS; then
         RC=0
         $GZIP $V $TARFILE || RC=$?
@@ -1364,13 +1402,14 @@
 
     # Invoke the TechSupport Cleanup Hook
     setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log &
-    
+
     if ! $SAVE_STDERR
     then
        exit $RETURN_CODE
    fi
 }
 
+
 ###############################################################################
 # Remove secret from pipeline inout and output result to pipeline.
 # Globals:
@@ -1416,7 +1455,7 @@ remove_secret_from_etc_files() {
 
     # Remove snmp community string from snmp.yml
     sed -i -E 's/(\s*snmp_\S*community\s*:\s*)(\S*)/\1****/g' $dumppath/etc/sonic/snmp.yml
-    
+
     # Remove secret from /etc/sonic/config_db.json
     cat $dumppath/etc/sonic/config_db.json | remove_secret_from_config_db_dump > $dumppath/etc/sonic/config_db.json.temp
     mv $dumppath/etc/sonic/config_db.json.temp $dumppath/etc/sonic/config_db.json
@@ -1475,9 +1514,9 @@ OPTIONS
                   "24 March", "yesterday", etc.
    -t TIMEOUT_MINS
                  Command level timeout in minutes
-    -r 
+    -r
                  Redirect any intermediate errors to STDERR
-    -d 
+    -d
                  Collect the output of debug dump cli
 EOF
 }
@@ -1527,7 +1566,7 @@ while getopts ":xnvhzas:t:r:d" opt; do
         r)
             SAVE_STDERR=false
             ;;
-        d) 
+        d)
             DEBUG_DUMP=true
             ;;
         /?)
@@ -1553,7 +1592,8 @@ if $MKDIR "${LOCKDIR}" &>/dev/null; then
     echo "$$" > "${PIDFILE}"
     # This handler will exit the script upon receiving these interrupts
     # Trap configured on EXIT will be triggered by the exit from handle_signal function
-    trap 'handle_signal' SIGINT SIGHUP SIGQUIT SIGTERM
+    trap 'handle_sigterm' SIGHUP SIGQUIT SIGTERM
+    trap 'handle_sigint' SIGINT
     echo "Lock succesfully accquired and installed signal handlers"
     # Proceed with the actual code
     if [[ ! -z "${V}" ]]; then
diff --git a/show/main.py b/show/main.py
index 9d50cbf15e..3f3e367463 100755
--- a/show/main.py
+++ b/show/main.py
@@ -68,25 +68,22 @@
 
 GEARBOX_TABLE_PHY_PATTERN = r"_GEARBOX_TABLE:phy:*"
 
+COMMAND_TIMEOUT = 300
+
 # To be enhanced. Routing-stack information should be collected from a global
 # location (configdb?), so that we prevent the continous execution of this
 # bash oneliner. To be revisited once routing-stack info is tracked somewhere.
 def get_routing_stack():
+    result = None
     command = "sudo docker ps | grep bgp | awk '{print$2}' | cut -d'-' -f3 | cut -d':' -f1 | head -n 1"
 
     try:
-        proc = subprocess.Popen(command,
-                                stdout=subprocess.PIPE,
-                                shell=True,
-                                text=True)
-        stdout = proc.communicate()[0]
-        proc.wait()
+        stdout = subprocess.check_output(command, shell=True, timeout=COMMAND_TIMEOUT)
         result = stdout.rstrip('\n')
+    except Exception as err:
+        click.echo('Failed to get routing stack: {}'.format(err), err=True)
 
-    except OSError as e:
-        raise OSError("Cannot detect routing-stack")
-
-    return (result)
+    return result
 
 
 # Global Routing-Stack variable
@@ -1149,7 +1146,7 @@ def users(verbose):
 @click.option('--redirect-stderr', '-r', is_flag=True, help="Redirect an intermediate errors to STDERR")
 def techsupport(since, global_timeout, cmd_timeout, verbose, allow_process_stop, silent, debug_dump, redirect_stderr):
     """Gather information for troubleshooting"""
-    cmd = "sudo timeout -s SIGTERM --foreground {}m".format(global_timeout)
+    cmd = "sudo timeout --kill-after={}s -s SIGTERM --foreground {}m".format(COMMAND_TIMEOUT, global_timeout)
 
     if allow_process_stop:
         cmd += " -a"
@@ -1164,7 +1161,7 @@ def techsupport(since, global_timeout, cmd_timeout, verbose, allow_process_stop,
         cmd += " -s '{}'".format(since)
 
     if debug_dump:
-        cmd += " -d "
+        cmd += " -d"
 
     cmd += " -t {}".format(cmd_timeout)
     if redirect_stderr:
diff --git a/tests/techsupport_test.py b/tests/techsupport_test.py
new file mode 100644
index 0000000000..64bc133627
--- /dev/null
+++ b/tests/techsupport_test.py
@@ -0,0 +1,24 @@
+import pytest
+import show.main
+from unittest.mock import patch, Mock
+from click.testing import CliRunner
+
+EXPECTED_BASE_COMMAND = 'sudo timeout --kill-after=300s -s SIGTERM --foreground '
+
+@patch("show.main.run_command")
+@pytest.mark.parametrize(
+    "cli_arguments,expected",
+    [
+        ([], '30m generate_dump -v -t 5'),
+        (['--since', '2 days ago'], "30m generate_dump -v -s '2 days ago' -t 5"),
+        (['-g', '50'], '50m generate_dump -v -t 5'),
+        (['--allow-process-stop'], '30m -a generate_dump -v -t 5'),
+        (['--silent'], '30m generate_dump -t 5'),
+        (['--debug-dump', '--redirect-stderr'], '30m generate_dump -v -d -t 5 -r'),
+    ]
+)
+def test_techsupport(run_command, cli_arguments, expected):
+    runner = CliRunner()
+    result = runner.invoke(show.main.cli.commands['techsupport'], cli_arguments)
+    run_command.assert_called_with(EXPECTED_BASE_COMMAND + expected, display_cmd=False)
+