This commit is contained in:
Yuge Zhang 2022-08-15 09:49:31 +08:00 коммит произвёл GitHub
Родитель ac892fc764
Коммит 81c9b9389d
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
22 изменённых файлов: 172 добавлений и 106 удалений

Просмотреть файл

@ -245,16 +245,18 @@ We appreciate all contributions from community to make NNI thrive.
| Type | Status |
| :---: | :---: |
| Fast test | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/fast%20test?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=54&branchName=master) |
| Full linux | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/full%20test%20-%20linux?repoName=microsoft%2Fnni&branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=62&repoName=microsoft%2Fnni&branchName=master) |
| Full windows | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/full%20test%20-%20windows?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=63&branchName=master) |
| Full test - HPO | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/full%20test%20-%20HPO?repoName=microsoft%2Fnni&branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=90&repoName=microsoft%2Fnni&branchName=master) |
| Full test - NAS | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/full%20test%20-%20NAS?repoName=microsoft%2Fnni&branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=89&repoName=microsoft%2Fnni&branchName=master) |
| Full test - compression | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/full%20test%20-%20compression?repoName=microsoft%2Fnni&branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=91&repoName=microsoft%2Fnni&branchName=master) |
### Training services
| Type | Status |
| :---: | :---: |
| Local - linux | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20local%20-%20linux?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=92&branchName=master) |
| Local - windows | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20local%20-%20windows?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=98&branchName=master) |
| Remote - linux to linux | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20remote%20-%20linux%20to%20linux?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=64&branchName=master) |
| Remote - linux to windows | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20remote%20-%20linux%20to%20windows?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=67&branchName=master) |
| Remote - windows to linux | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20remote%20-%20windows%20to%20linux?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=68&branchName=master) |
| Remote - windows to windows | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20remote%20-%20windows%20to%20windows?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=99&branchName=master) |
| OpenPAI | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20openpai%20-%20linux?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=65&branchName=master) |
| Frameworkcontroller | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20frameworkcontroller?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=70&branchName=master) |
| Kubeflow | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20kubeflow?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=69&branchName=master) |

Просмотреть файл

@ -4,8 +4,8 @@ trigger: none
pr: none
variables:
resource_group: nni
gallery_name: nniImageGallery
resource_group: nni-image-builder
gallery_name: nniImageGalleryV2
network_security_group: nni-image-builder-nsg
managed_image_name: nni-linux-image
image_definition_name: nniLinuxImage

Просмотреть файл

@ -4,11 +4,11 @@ trigger: none
pr: none
variables:
resource_group: nni
gallery_name: nniImageGallery
resource_group: nni-image-builder
gallery_name: nniImageGalleryV2
network_security_group: nni-image-builder-nsg
managed_image_name: nni-windows-image
image_name: nniWindowsImage
image_definition_name: nniWindowsImage
packer_config: config_windows
jobs:

Просмотреть файл

@ -46,3 +46,11 @@ jobs:
- template: templates/cache-dependencies-template.yml
parameters:
platform: ubuntu-latest-gpu
- job: windows_gpu
pool:
vmImage: windows-latest
steps:
- template: templates/cache-dependencies-template.yml
parameters:
platform: windows-gpu

Просмотреть файл

@ -35,9 +35,7 @@ stages:
timeoutInMinutes: 60
steps:
- template: templates/fix-apt-1es.yml
parameters:
check_gpu: true
- template: templates/check-gpu-status.yml
- template: templates/install-dependencies.yml
parameters:

Просмотреть файл

@ -35,9 +35,7 @@ stages:
timeoutInMinutes: 60
steps:
- template: templates/fix-apt-1es.yml
parameters:
check_gpu: true
- template: templates/check-gpu-status.yml
- template: templates/install-dependencies.yml
parameters:

Просмотреть файл

@ -32,12 +32,10 @@ stages:
jobs:
- job: linux
pool: nni-it-1es-11
timeoutInMinutes: 60
timeoutInMinutes: 90
steps:
- template: templates/fix-apt-1es.yml
parameters:
check_gpu: true
- template: templates/check-gpu-status.yml
- template: templates/install-dependencies.yml
parameters:
@ -55,24 +53,25 @@ stages:
- job: windows
pool: nni-it-1es-windows
timeoutInMinutes: 60
timeoutInMinutes: 90
steps:
# FIXME: Windows should use GPU,
# but it's not used now since driver is not installed in the image.
- template: templates/check-gpu-status.yml
parameters:
platform: windows
- template: templates/install-dependencies.yml
parameters:
platform: windows
platform: windows-gpu
python_env: noop
- template: templates/install-nni.yml
parameters:
user: false
# NOTE: Data needs to be downloaded if Windows has GPU.
# Also, the download template needs to be updated with powershell syntax.
# - template: templates/download-test-data.yml
- template: templates/download-test-data.yml
parameters:
platform: windows
- powershell: |
python test/vso_tools/ssl_patch.py

Просмотреть файл

@ -11,8 +11,7 @@ jobs:
timeoutInMinutes: 90
steps:
# FIXME: should use GPU here
- template: templates/fix-apt-1es.yml
# TODO: consider adding GPU tests here
- template: templates/install-dependencies.yml
parameters:

Просмотреть файл

@ -11,9 +11,7 @@ jobs:
timeoutInMinutes: 60
steps:
- template: templates/fix-apt-1es.yml
parameters:
check_gpu: true
- template: templates/check-gpu-status.yml
- template: templates/install-dependencies.yml
parameters:

Просмотреть файл

@ -11,6 +11,9 @@ jobs:
timeoutInMinutes: 120
steps:
- template: templates/check-gpu-status.yml
parameters:
platform: windows
- template: templates/install-dependencies.yml
parameters:
@ -38,8 +41,7 @@ jobs:
# We can't install it on-the-fly because we can't elevate the permission here.
- powershell: |
cd test
python training_service/nnitest/run_tests.py --config training_service/config/integration_tests.yml --ts local `
--exclude mnist-pytorch-local-gpu
python training_service/nnitest/run_tests.py --config training_service/config/integration_tests.yml --ts local
displayName: Integration test
- template: templates/save-crashed-info.yml

Просмотреть файл

@ -16,9 +16,8 @@ jobs:
timeoutInMinutes: 120
steps:
- template: templates/fix-apt-1es.yml
# FIXME: GPU is not supported yet.
# Change to ubuntu-latest-gpu when it's done.
# TODO: We don't currently have a test for GPU.
# And nvidia-docker is not installed yet.
- template: templates/install-dependencies.yml
parameters:

Просмотреть файл

@ -1,16 +1,51 @@
# BEFORE READING:
#
# 1. We are now running agents on 1ES, all the notes about VMSS can be safely ignored.
# 2. Many actions can be done on both cloud shell and web portal. Choose whichever you prefer.
steps:
- script: |
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
displayName: Install azcli
# Please follow the tutorial of [image builder](https://docs.microsoft.com/en-us/azure/virtual-machines/image-builder-overview)
# to set up a managed identity, and,
# 1. Assign the role following the instruction.
# 2. Assign contributor role of the resource group to the identity.
# 3. Add the identity to VMSS.
#
# Update 2022/7 (running on Microsoft-hosted agents).
# Update 2022/7 (running on Microsoft-hosted agents / 1ES agents).
# Use a service principal. This service principal must be assigned contributor access to the resource group.
#
# Alternative option: managed identity.
# Follow tutorial of [image builder](https://docs.microsoft.com/en-us/azure/virtual-machines/image-builder-overview).
#
# Either way, the identity / service principal must be assigned contributor access to the resource group.
# We also added the following role (but I'm not sure whether it's necessary):
#
# {
# "properties": {
# "roleName": "ImageBuilderRole",
# "description": "Image Builder access to create resources for the image build, you should delete or split out as appropriate",
# "assignableScopes": [
# "/subscriptions/<subscription_id>/resourceGroups/<resource_group>"
# ],
# "permissions": [
# {
# "actions": [
# "Microsoft.Compute/galleries/read",
# "Microsoft.Compute/galleries/images/read",
# "Microsoft.Compute/galleries/images/versions/read",
# "Microsoft.Compute/galleries/images/versions/write",
# "Microsoft.Compute/images/write",
# "Microsoft.Compute/images/read",
# "Microsoft.Compute/images/delete",
# "Microsoft.VirtualMachineImages/imageTemplates/write",
# "Microsoft.VirtualMachineImages/imageTemplates/read",
# "Microsoft.VirtualMachineImages/imageTemplates/delete"
# ],
# "notActions": [],
# "dataActions": [],
# "notDataActions": []
# }
# ]
# }
# }
#
- script: |
az login --service-principal -u $(client_id) -p $(client_secret) --tenant $(tenant_id)
displayName: Login to Azure
@ -28,10 +63,12 @@ steps:
az provider show -n Microsoft.Network -o json
displayName: Register features
# Need to create an image gallerybefore this.
# Only need to create once.
# Need to create an image gallery before this.
# Only need to create once (can be done on web portal).
# az sig create --resource-group <resource_group> --gallery-name <sig_name>
#
# NOTE: Remember to add READER access to the image gallery for "1ES Resource Management".
#
# Add an image definition (also only once).
# az sig image-definition create -g <resource_group> \
# --gallery-name <sig_name> \
@ -45,6 +82,8 @@ steps:
# --sku 20_04-nni \
# --os-type Linux \
# --hyper-v-generation V2
#
# This can be done on web portal, remember to choose V2 for Hyper-V generation.
- script: |
set -e
@ -96,7 +135,7 @@ steps:
# The workaround here is to use a monitor to detect the machine ready signal and change its WinRM port.
- script: |
cd test/vso_tools/build_vm
python3 packer_build_windows.py
python3 packer_build_windows.py $(packer_config).json $(resource_group)
displayName: (Windows) Packer build
condition: and(succeeded(), contains(variables['packer_config'], 'windows'))
@ -109,14 +148,14 @@ steps:
# TODO: Should delete the managed image after build is done.
# Image gallery alone is enough. Keeping it for now for debugging purposes.
# No further actions are needed here. VM images are already set to latest. They should be auto-updated.
# In case you want to do it on your own:
#
# To deploy the image on VMSS, run this in Cloud Shell:
# az vmss update --resource-group nni --name nni-windows-it \
# --set virtualMachineProfile.storageProfile.imageReference.id=/subscriptions/{subscriptionId}/resourceGroups/nni/providers/Microsoft.Compute/galleries/nniImageGallery/images/nniWindowsImage/versions/Latest
#
# To deploy the image on 1ES, similar actions need to be performed on the web portal of 1ES managed images.
#
# Probably need to enlarge the disk size, in case it's too small:
# az vmss update -n nni-it -g nni --set virtualMachineProfile.storageProfile.osDisk.diskSizeGb=50
#
# No need to update the image every time, because it's already set to latest.
#
# NOTE: After using 1ES pool, the pool image has to be updated manually to the latest version.
# However, no successful build has been performed yet, because of resource shortage in Southeast Asia.

Просмотреть файл

@ -0,0 +1,18 @@
# Pipeline step template: optionally installs the GPU driver (Windows only), then runs nvidia-smi.
parameters:
# Target platform string. The driver-install step below runs only when this value contains 'windows'.
- name: platform
type: string
default: linux
steps:
# Install GPU driver on Windows.
# Installer has already been downloaded and saved in the image.
# Start-Process is invoked with -Verb RunAs (elevated launch) and -Wait (block until the installer exits).
# NOTE(review): "/s /n" are presumably the installer's silent / no-reboot switches; confirm against the driver docs.
- powershell: |
Start-Process -Verb RunAs -FilePath "$env:ProgramData\driver_installer.exe" -ArgumentList "/s /n" -Wait
displayName: (Windows) Install GPU driver
condition: and(succeeded(), contains('${{ parameters.platform }}', 'windows'))
# Make sure GPU isn't broken.
# This step has no explicit condition, so it is not restricted by the platform parameter.
- script: |
nvidia-smi
displayName: Check GPU status

Просмотреть файл

@ -7,13 +7,26 @@
# because it's not easy to setup auto-download for some datasets.
# See cache-dependencies-template.yml on how to generate credentials to upload new test data.
parameters:
- name: platform
type: string
default: linux
steps:
- script: |
set -e
mkdir -p test/data
cd test
azcopy copy 'https://nni.blob.core.windows.net/testdata/*' data
python vso_tools/unpack_testdata.py
ls -al data
- ${{ if contains(parameters.platform, 'windows') }}:
powershell: |
New-Item -Path test/data -ItemType directory -Force
cd test
azcopy copy 'https://nni.blob.core.windows.net/testdata/*' data
python vso_tools/unpack_testdata.py
Get-ChildItem data
${{ else }}:
script: |
set -e
mkdir -p test/data
cd test
azcopy copy 'https://nni.blob.core.windows.net/testdata/*' data
python vso_tools/unpack_testdata.py
ls -al data
displayName: Download test data

Просмотреть файл

@ -1,37 +0,0 @@
# Fix apt-related issues on 1ES linux pipeline.
# 1ES has an auto-upgrade (apt-get) that runs periodically in the background.
# This leads to bad consequences:
# 1) apt is locked when install is actually needed
# 2) unattended upgrade could possibly break the GPU driver version, and crash nvidia-smi.
#
# The ultimate solution should be to upgrade the VM image correctly,
# but it's currently infeasible because of a resource group limitation.
# We introduce a workaround here: forcibly disable the auto-upgrade and
# fix the broken dependencies if an upgrade has already been run accidentally.
#
# This file can be removed after image is updated to latest.
parameters:
- name: check_gpu
type: boolean
default: false
steps:
# Don't set -e
# Always make sure the lock is released.
- script: |
set -x
sudo bash test/vso_tools/build_vm/disable_apt_daily.sh
sudo apt-get -o DPkg::Lock::Timeout=120 --fix-broken -y install
displayName: (1ES) Disable apt upgrade
# Make sure GPU isn't broken.
# Sometimes we can't save the GPU because upgrade runs too early.
# We have to rerun the pipeline if unlucky. But it doesn't matter if we don't intend to use GPU at all.
- script: |
echo "There can be unlucky cases when we can't save the GPU. If nvidia-smi fails, try to rerun the failed jobs."
nvidia-smi
displayName: (1ES) Check GPU status
condition: and(succeeded(), ${{ parameters.check_gpu }})

Просмотреть файл

@ -261,6 +261,7 @@ _yarn_env['PATH'] = str(Path().resolve() / 'nni_node') + path_env_seperator + os
_yarn_path = Path().resolve() / 'toolchain/yarn/bin' / yarn_executable
def _yarn(path, *args):
_print('yarn ' + ' '.join(args) + f' (path: {path})')
if os.environ.get('GLOBAL_TOOLCHAIN'):
subprocess.run(['yarn', *args], cwd=path, check=True)
else:

Просмотреть файл

@ -1,4 +1,5 @@
import logging
import sys
import pytest
import numpy as np
@ -196,6 +197,18 @@ def test_hub_oneshot(space_type, strategy_type):
if strategy_type in ['darts', 'gumbel'] and space_type == 'mobilenetv3':
pytest.skip('Skip as it consumes too much memory.')
WINDOWS_SPACES = [
# Skip some spaces as Windows platform is slow.
'nasbench201',
'mobilenetv3',
'proxylessnas',
'shufflenet',
'autoformer',
'darts',
]
if sys.platform == 'win32' and space_type not in WINDOWS_SPACES:
pytest.skip('Skip as Windows is too slow.')
model_space = _hub_factory(space_type)
dataset_type = 'cifar10'

Просмотреть файл

@ -22,11 +22,11 @@
"gallery_name": "<gallery_name>",
"image_name": "<image_name>",
"image_version": "<image_version>",
"replication_regions": ["southeastasia", "westus2", "eastus"],
"replication_regions": ["southeastasia", "westus3", "eastus"],
"storage_account_type": "Standard_LRS"
},
"build_resource_group_name": "nni",
"build_resource_group_name": "<resource_group>",
"vm_size": "Standard_DS2_v2"
}],
"provisioners": [{

Просмотреть файл

@ -20,11 +20,11 @@
"gallery_name": "<gallery_name>",
"image_name": "<image_name>",
"image_version": "<image_version>",
"replication_regions": ["southeastasia", "westus2", "eastus"],
"replication_regions": ["southeastasia", "westus3", "eastus"],
"storage_account_type": "Standard_LRS"
},
"build_resource_group_name": "nni",
"build_resource_group_name": "<resource_group>",
"vm_size": "Standard_D2s_v4",
"allowed_inbound_ip_addresses": ["<ip_address>"],

Просмотреть файл

@ -15,8 +15,8 @@ import subprocess
import sys
import time
BUILD_COMMAND = 'PACKER_LOG=1 packer build packer_windows.json'
RESOURCE_GROUP = 'nni'
BUILD_COMMAND = 'PACKER_LOG=1 packer build ' + sys.argv[1]
RESOURCE_GROUP = sys.argv[2]
def monitor_print(*args):
@ -24,6 +24,9 @@ def monitor_print(*args):
def main():
monitor_print('Build command:', BUILD_COMMAND)
monitor_print('Resource group:', RESOURCE_GROUP)
process = subprocess.Popen(BUILD_COMMAND, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
while True:
retcode = process.poll()

Просмотреть файл

@ -60,7 +60,17 @@ sudo apt-get install -y cuda-drivers
# Reference: https://dev.to/akaszynski/create-an-azure-self-hosted-agent-without-going-insane-173g
# We only need Python 3.7 and 3.9 for now.
sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt-get install -y python3.7-dev python3.7-venv python3.9-dev python3.9-venv
sudo apt-get install -y python3.7-dev python3.7-venv python3.9-dev python3.9-venv python3.10-dev python3.10-venv python3.11-dev python3.11-venv
# Disable the periodic apt-get upgrade.
# Sometimes an unattended upgrade blocks apt-get install.
sudo sed -i -e "s/Update-Package-Lists \"1\"/Update-Package-Lists \"0\"/g" /etc/apt/apt.conf.d/10periodic
sudo sed -i -e "s/Update-Package-Lists \"1\"/Update-Package-Lists \"0\"/g" /etc/apt/apt.conf.d/20auto-upgrades
sudo sed -i -e "s/Unattended-Upgrade \"1\"/Unattended-Upgrade \"0\"/g" /etc/apt/apt.conf.d/20auto-upgrades
sudo systemctl disable apt-daily.timer
sudo systemctl disable apt-daily.service
sudo systemctl disable apt-daily-upgrade.timer
sudo systemctl disable apt-daily-upgrade.service
# Deprovision
sudo /usr/sbin/waagent -force -deprovision

Просмотреть файл

@ -66,9 +66,9 @@ choco install -y --no-progress vcredist2012 vcredist2013 vcredist2015 vcredist20
# Install CUDA.
Write-Host "Installing CUDA..."
$CudaUrl = "https://developer.download.nvidia.com/compute/cuda/11.7.0/network_installers/cuda_11.7.0_windows_network.exe"
Invoke-WebRequest $CudaUrl -OutFile "cuda_installer.exe"
Start-Process -FilePath "cuda_installer.exe" -ArgumentList "/s /n" -Wait
Remove-Item "cuda_installer.exe"
Invoke-WebRequest $CudaUrl -OutFile "$env:ProgramData\cuda_installer.exe"
Start-Process -FilePath "$env:ProgramData\cuda_installer.exe" -ArgumentList "/s /n" -Wait
# Remove-Item "cuda_installer.exe"
# Verify CUDA.
Write-Host "Verify CUDA installation..."
$CudaDir = "$env:ProgramFiles\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin"
@ -77,6 +77,9 @@ $CudaDir = "$env:ProgramFiles\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin"
Get-ChildItem $CudaDir
$env:path = "$env:path;$CudaDir"
# Download GPU driver.
Invoke-WebRequest "https://us.download.nvidia.com/tesla/516.94/516.94-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" -OutFile "$env:ProgramData\driver_installer.exe"
Write-Host "Installing utilities..."
# Install azcopy for cache download.