# nni/pipelines/templates/save-crashed-info.yml
#
# 73 lines
# 2.9 KiB
# YAML

# Step template: after a failed job, upload the latest experiment directory as
# a pipeline artifact so the crash can be investigated offline.
parameters:
# Compared against 'local' and 'remote' by the log-harvesting steps of this
# template; any other value (including the default) skips those steps.
- name: training_service
  type: string
  default: unknown
steps:
# Non-Windows agents, failed jobs only: resolve ~/nni-experiments/_latest and
# hand its path to later steps as the `experiment_dir` pipeline variable.
- script: |
    set -e
    # Plain assignment (no `export`) and `|| true`: a readlink failure must not
    # abort the step; it is reported by the directory check below instead.
    EXPERIMENT_DIR="$(readlink -f ~/nni-experiments/_latest || true)"
    if [ -d "${EXPERIMENT_DIR}" ]; then
      echo "Latest experiment directory: ${EXPERIMENT_DIR}"
      echo "##vso[task.setvariable variable=experiment_dir]${EXPERIMENT_DIR}"
    else
      # Leave experiment_dir unset: the harvesting and upload steps of this
      # template are conditioned on it, so they are skipped rather than run
      # against a path that does not exist.
      echo "No experiment directory found (resolved: '${EXPERIMENT_DIR}')"
    fi
  condition: and(failed(), not(contains(variables['Agent.OS'], 'Windows')))
  displayName: (failed) (POSIX) Latest experiment directory
# Non-Windows agents, 'local' training service: copy /tmp/$USER/nni into the
# experiment directory as `local`. The condition has no failed() of its own;
# it relies on experiment_dir being set only by the failed()-guarded step.
- script: |
    # EXPERIMENT_DIR is the environment-variable form of the experiment_dir
    # pipeline variable. Expansions are quoted so unusual characters in the
    # paths cannot split the arguments.
    # Best effort: a failed copy is reported but does not fail the step.
    if cp -r "/tmp/${USER}/nni" "${EXPERIMENT_DIR}/local"; then
      echo "Copy successful"
    else
      echo "Copy failed"
    fi
  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'local'), not(contains(variables['Agent.OS'], 'Windows')))
  displayName: (failed) (POSIX) Harvest GPU scheduler logs
# Non-Windows agents, 'remote' training service: copy the experiment's folder
# out of the Docker container identified by the build ID into
# `<experiment_dir>/remote`.
- script: |
    set -e
    # The experiment ID is the last path component of the experiment directory.
    EXPERIMENT_ID="${EXPERIMENT_DIR##*/}"
    # Best effort: a failed copy is reported but does not fail the step.
    if sudo docker cp "$(Build.BuildId):/tmp/nni-experiments/${EXPERIMENT_ID}" "${EXPERIMENT_DIR}/remote"; then
      echo "Copy successful"
    else
      echo "Copy failed"
    fi
  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'remote'), not(contains(variables['Agent.OS'], 'Windows')))
  displayName: (failed) (POSIX) Harvest remote trial logs
# Windows agents, failed jobs only: pick the most recently created directory
# under ~/nni-experiments (ignoring `_latest`) and hand its path to later steps
# as the `experiment_dir` pipeline variable.
- powershell: |
    # Select-Object -Last 1 yields $null instead of an indexing error when no
    # experiment directory exists.
    $latestDir = Get-ChildItem ~/nni-experiments -Exclude _latest -ErrorAction SilentlyContinue |
      Where-Object { $_.PSIsContainer } |
      Sort-Object CreationTime |
      Select-Object -Last 1
    if ($latestDir) {
      # Use FullName explicitly rather than relying on how the directory
      # object is converted to a string.
      $latestPath = $latestDir.FullName
      Write-Host "Latest experiment directory: $latestPath"
      Write-Host "##vso[task.setvariable variable=experiment_dir]$latestPath"
    }
    else {
      # Leave experiment_dir unset so the steps conditioned on it are skipped.
      Write-Host "No experiment directory found under ~/nni-experiments"
    }
  condition: and(failed(), contains(variables['Agent.OS'], 'Windows'))
  displayName: (failed) (Windows) Latest experiment directory
# Windows agents, 'local' training service: copy <Temp>\<UserName>\nni into the
# experiment directory as `local`, if that folder exists.
- powershell: |
    $latestDir = Get-Item $(experiment_dir)
    $tmpPath = "${env:Temp}\${env:UserName}\nni"
    $destPath = "${latestDir}\local"
    # Nothing to harvest when the temp folder is absent; just say so.
    if (-not (Test-Path $tmpPath)) {
      Write-host "$tmpPath doesn't exist"
    }
    else {
      Write-Host "Copying $tmpPath to $destPath"
      Copy-Item $tmpPath -Destination $destPath -Recurse
    }
  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'local'), contains(variables['Agent.OS'], 'Windows'))
  displayName: (failed) (Windows) Harvest GPU scheduler logs
# Windows agents, 'remote' training service: copy the experiment's folder from
# the nniuser temp area into the experiment directory as `remote`, if present.
- powershell: |
    $latestDir = Get-Item $(experiment_dir)
    # The directory name doubles as the experiment ID in the remote path.
    $experimentId = $latestDir.name
    $remotePath = "C:\Users\nniuser\AppData\Local\Temp\nni-experiments\${experimentId}"
    $destPath = "${latestDir}\remote"
    # Nothing to harvest when the remote folder is absent; just say so.
    if (-not (Test-Path $remotePath)) {
      Write-host "$remotePath doesn't exist"
    }
    else {
      Write-Host "Copying $remotePath to $destPath"
      Copy-Item $remotePath -Destination $destPath -Recurse
    }
  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'remote'), contains(variables['Agent.OS'], 'Windows'))
  displayName: (failed) (Windows) Harvest remote trial logs
# Upload the experiment directory, including anything the harvest steps copied
# into it, as the `experiment` pipeline artifact. Runs only when one of the
# "Latest experiment directory" steps has set experiment_dir.
- publish: $(experiment_dir)
  artifact: experiment
  condition: variables['experiment_dir']
  displayName: (failed) Upload experiment artifact