Skip to content

Commit

Permalink
NVidia mismatch recovery (#915)
Browse files Browse the repository at this point in the history
If the build failed due to the Nvidia error, we do the following steps:

 *  Remove all labels from the current agent and add a recovery-process label to indicate that the agent is performing recovery actions, preventing it from taking on any other build.
 *   Requeue the job with the same parameters after a delay of 70s, to give time for the next step.
 *   Schedule a system restart in 1 minute (the delay is needed here so that the post-build action finishes correctly).
  • Loading branch information
claraberendsen authored and j-rivero committed Jun 28, 2023
1 parent cf8fed3 commit 5603eb0
Showing 1 changed file with 54 additions and 1 deletion.
55 changes: 54 additions & 1 deletion jenkins-scripts/dsl/_configs_/OSRFUNIXBase.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,59 @@ class OSRFUNIXBase extends OSRFBase
git clone https://github.com/gazebo-tooling/release-tools scripts -b \$RTOOLS_BRANCH
""".stripIndent())
}
// Automatic recovery from the NVIDIA "driver/library version mismatch" failure.
//
// When a build fails on an agent labelled gpu-nvidia:
//   1. A system Groovy step looks for the NVML mismatch message in the last
//      1000 lines of the build log. If found, the agent's labels are replaced
//      by the single label 'recovery-process' so it stops accepting builds.
//   2. A shell step schedules a reboot of the agent one minute later, so the
//      post-build actions have time to finish.
//   3. The Naginator publisher re-queues the build once, after 70 seconds,
//      when the same message is present in the log.
publishers {
  postBuildScripts {
    steps {
      conditionalSteps {
        condition {
          // Only run on agents that carry the gpu-nvidia label. NODE_LABELS is a
          // whitespace-separated list and the expression has to match the whole
          // value, so the neighbouring labels are optional: this also matches
          // when gpu-nvidia is the first, the last or the only label.
          expression("(.* )?gpu-nvidia( .*)?", '${NODE_LABELS}')
        }
        steps {
          // NOTE(review): the script returns 1 when the mismatch is NOT found;
          // presumably the non-zero result marks this step as failed so the
          // reboot step below is skipped - confirm against the Groovy plugin.
          // On failure to relabel, the previous labels are restored and the
          // exception is rethrown.
          systemGroovyCommand('''\
              import hudson.model.Cause.UpstreamCause;
              import hudson.model.*;
              def node = build.getBuiltOn()
              def old_labels = node.getLabelString()
              println("# BEGIN SECTION: NVIDIA MISMATCH RECOVERY")
              if (!(build.getLog(1000) =~ "nvml error: driver/library version mismatch")) {
                println(" NVIDIA driver/library version mismatch not detected in the log - Not performing any automatic recovery step")
                return 1;
              } else {
                try {
                  println(" PROBLEM: NVIDIA driver/library version mismatch was detected in the log. Try to automatically resolve it:")
                  println("Removing labels and adding 'recovery-process' label to node")
                  node.setLabelString("recovery-process")
                } catch (Exception ex) {
                  println("ERROR - CANNOT PERFORM RECOVERY ACTIONS FOR NVIDIA ERROR")
                  println("Restoring to previous state")
                  node.setLabelString(old_labels)
                  throw ex
                }
              }
              println("# END SECTION: NVIDIA MISMATCH RECOVERY")
              '''.stripIndent()
          )
          // Reboot in one minute rather than immediately so that the remaining
          // post-build actions can complete.
          shell("""sudo shutdown -r +1""")
        }
      }
    }
    // Recovery is only attempted for failed builds.
    onlyIfBuildSucceeds(false)
    onlyIfBuildFails(true)
  }
  // Manual insertion of xml for Naginator plugin because of this issue https://issues.jenkins.io/browse/JENKINS-66458
  configure { project ->
    project / publishers / 'com.chikli.hudson.plugin.naginator.NaginatorPublisher' {
      // Re-run only when the log contains the NVML mismatch message.
      regexpForRerun("nvml error: driver/library version mismatch")
      checkRegexp(true)
      // A single retry.
      maxSchedule(1)
      // 70 seconds: slightly longer than the one-minute reboot delay above.
      delay(class: 'com.chikli.hudson.plugin.naginator.FixedDelay') {
        delay(70)
      }
    }
  }
}
}
}
}
}

0 comments on commit 5603eb0

Please sign in to comment.