Skip to content

Commit

Permalink
handle failed hw-mgmt-dump command
Browse files: browse the repository at this point in the history
Signed-off-by: Stepan Blyschak <[email protected]>
  • Loading branch information
stepanblyschak committed Mar 30, 2022
1 parent 4fae05e commit 44f90ca
Showing 1 changed file with 23 additions and 14 deletions.
37 changes: 23 additions & 14 deletions scripts/generate_dump
Original file line number Diff line number Diff line change
Expand Up @@ -755,7 +755,7 @@ save_platform_info() {
save_cmd "show platform psustatus" "psustatus"
save_cmd "show platform ssdhealth" "ssdhealth"
save_cmd "show platform temperature" "temperature"
save_cmd "show platform fan" "fan"
save_cmd "show platform fan" "fan"
fi
}

Expand Down Expand Up @@ -884,12 +884,12 @@ collect_mellanox() {
copy_from_docker syncd $sai_dump_folder $sai_dump_folder
echo "$sai_dump_folder"
for file in `ls $sai_dump_folder`; do
save_file ${sai_dump_folder}/${file} sai_sdk_dump true
save_file ${sai_dump_folder}/${file} sai_sdk_dump true
done

${CMD_PREFIX}rm -rf $sai_dump_folder
${CMD_PREFIX}docker exec syncd rm -rf $sai_dump_folder

# Save SDK error dumps
local sdk_dump_path=`${CMD_PREFIX}docker exec syncd cat /tmp/sai.profile|grep "SAI_DUMP_STORE_PATH"|cut -d = -f2`
if [[ -d $sdk_dump_path ]]; then
Expand All @@ -904,8 +904,17 @@ collect_mellanox() {
HW_DUMP_FILE=/usr/bin/hw-management-generate-dump.sh
if [ -f "$HW_DUMP_FILE" ]; then
${CMD_PREFIX}${timeout_cmd} /usr/bin/hw-management-generate-dump.sh $ALLOW_PROCESS_STOP
save_file "/tmp/hw-mgmt-dump*" "hw-mgmt" false
rm -f /tmp/hw-mgmt-dump*
ret=$?
if [ $ret -ne 0 ]; then
if [ $ret -eq $TIMEOUT_EXIT_CODE ]; then
echo "hw-management dump timedout after ${TIMEOUT_MIN} minutes."
else
echo "hw-management dump failed ..."
fi
else
save_file "/tmp/hw-mgmt-dump*" "hw-mgmt" false
rm -f /tmp/hw-mgmt-dump*
fi
else
echo "HW Mgmt dump script $HW_DUMP_FILE does not exist"
fi
Expand Down Expand Up @@ -1117,7 +1126,7 @@ save_crash_files() {
get_asic_count() {
trap 'handle_error $? $LINENO' ERR
local redirect_eval="2>&1"
if ! $SAVE_STDERR
if ! $SAVE_STDERR
then
redirect_eval=""
fi
Expand Down Expand Up @@ -1299,14 +1308,14 @@ main() {
save_bfd_info
save_redis_info

if $DEBUG_DUMP
if $DEBUG_DUMP
then
save_dump_state_all_ns
fi

save_cmd "docker ps -a" "docker.ps"
save_cmd "docker top pmon" "docker.pmon"

if [[ -d ${PLUGINS_DIR} ]]; then
local -r dump_plugins="$(find ${PLUGINS_DIR} -type f -executable)"
for plugin in $dump_plugins; do
Expand Down Expand Up @@ -1391,7 +1400,7 @@ finalize() {
setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log &

echo ${TARFILE}

if ! $SAVE_STDERR
then
exit $RETURN_CODE
Expand Down Expand Up @@ -1444,7 +1453,7 @@ remove_secret_from_etc_files() {

# Remove snmp community string from snmp.yml
sed -i -E 's/(\s*snmp_\S*community\s*:\s*)(\S*)/\1****/g' $dumppath/etc/sonic/snmp.yml

# Remove secret from /etc/sonic/config_db.json
cat $dumppath/etc/sonic/config_db.json | remove_secret_from_config_db_dump > $dumppath/etc/sonic/config_db.json.temp
mv $dumppath/etc/sonic/config_db.json.temp $dumppath/etc/sonic/config_db.json
Expand Down Expand Up @@ -1503,9 +1512,9 @@ OPTIONS
"24 March", "yesterday", etc.
-t TIMEOUT_MINS
Command level timeout in minutes
-r
-r
Redirect any intermediate errors to STDERR
-d
-d
Collect the output of debug dump cli
EOF
}
Expand Down Expand Up @@ -1556,7 +1565,7 @@ while getopts ":xnvhzas:t:r:d" opt; do
r)
SAVE_STDERR=false
;;
d)
d)
DEBUG_DUMP=true
;;
/?)
Expand All @@ -1583,7 +1592,7 @@ if mkdir "${LOCKDIR}" &>/dev/null; then
# This handler will exit the script upon receiving these interrupts
# Trap configured on EXIT will be triggered by the exit from handle_signal function
trap 'handle_sigterm' SIGHUP SIGQUIT SIGTERM
trap 'handle_sigint' SIGINT
trap 'handle_sigint' SIGINT
echo "Lock succesfully accquired and installed signal handlers"
# Proceed with the actual code
main
Expand Down

0 comments on commit 44f90ca

Please sign in to comment.