Skip to content

Commit

Permalink
(feat): postbuild script to recover from nvidia version mismatch
Browse files Browse the repository at this point in the history
  • Loading branch information
claraberendsen committed May 4, 2023
1 parent abe5d53 commit 7b8b6dd
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 4 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,7 @@
.pyc
### VS Code ###
.vscode/

### Mac OS ###
.DS_Store
jenkins-scripts/dsl/job-dsl-core-1.77-standalone.jar
65 changes: 61 additions & 4 deletions jenkins-scripts/dsl/_configs_/OSRFUNIXBase.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ import javaposse.jobdsl.dsl.Job
- allow concurrent builds
- bash: RTOOLS checkout
*/
class OSRFUNIXBase extends OSRFBase
class OSRFUNIXBase extends OSRFBase
{
static void create(Job job)
{
static void create(Job job)
{
OSRFBase.create(job)

job.with
Expand All @@ -38,6 +38,63 @@ class OSRFUNIXBase extends OSRFBase
git clone https://github.com/gazebo-tooling/release-tools scripts -b \$RTOOLS_BRANCH
""".stripIndent())
}
// Post-build recovery for agents hit by an NVIDIA driver/library version
// mismatch: relabel the node, requeue the same job and reboot the machine.
publishers {
postBuildScripts {
steps{
conditionalSteps {
condition {
// Regex check of '(.)*gpu-nvidia(.)*' against the literal '${NODE_LABELS}'
// (single-quoted, so Groovy does not interpolate it here; presumably
// Jenkins expands it at build time - confirm).
expression('(.)*gpu-nvidia(.)*','${NODE_LABELS}')
}
steps {
// System Groovy script:
//  - if build.getLog(1000) does not contain the NVML mismatch message,
//    print a notice and return 0 without touching the node;
//  - otherwise replace the node labels with 'recovery-process' and
//    schedule this job again (70 s quiet period, same parameters,
//    UpstreamCause pointing at this build);
//  - on any exception, restore the previous labels and rethrow.
// NOTE(review): 'params' is null when the build has no ParametersAction;
// it is then passed to new ParametersAction(params) - confirm this is safe.
// NOTE(review): the original labels are only restored on the error path;
// presumably something else restores them after a successful recovery.
systemGroovyCommand('''\
import hudson.model.Cause.UpstreamCause;
import hudson.model.*;
def node = build.getBuiltOn()
def old_labels = node.getLabelString()
println("# BEGIN SECTION: NVIDIA MISMATCH RECOVERY")
if (!(build.getLog(1000) =~ "nvml error: driver/library version mismatch")) {
println("Build failed for other reason than NVIDIA error - Not performing any more steps")
return 0;
} else {
try {
println("Removing labels and adding 'recovery-process' label to node")
node.setLabelString("recovery-process")
println("Requeuing job :" + build.project.name)
def job = Hudson.instance.getJob(build.project.name)
def params = build.getAllActions().find{ it instanceof ParametersAction }?.parameters
def cause = new UpstreamCause(build)
// wait 70s to build again so that the computer has time to reboot (e.g only one agent available)
def scheduled = job.scheduleBuild2(70, cause, new ParametersAction(params))
if(!scheduled) {
throw new Exception("Job could not be requeued!")
}
println("Job requeued!")
} catch (Exception ex) {
println("ERROR - CANNOT PERFORM RECOVERY ACTIONS FOR NVIDIA ERROR")
println("Restoring to previous state")
node.setLabelString(old_labels)
throw ex
}
}
println("# END SECTION: NVIDIA MISMATCH RECOVERY")
'''.stripIndent()
)
// Schedule a reboot of the agent one minute from now.
// NOTE(review): this step is not guarded by the log check above; the
// script returns 0 on the "other reason" path, so the reboot may also be
// scheduled for failures unrelated to NVIDIA - confirm this is intended.
shell("""sudo shutdown -r +1""")
}
}
}
// Run these post-build steps only for failed builds.
onlyIfBuildSucceeds(false)
onlyIfBuildFails(true)
}

}

}
}
}

}

0 comments on commit 7b8b6dd

Please sign in to comment.