Automation test for spark CLI samples (#2377)

* Enable test for submit_spark_standalone_jobs

* Generate workflow yaml

* update spark job files for automation test

* Add workflow for serverless spark with user identity job

* Add scripts to upload input data

* Update workflow to refer to the script

* Update source file path

* Update workflow with correct file path

* Update working directory

* Update workflow

* Update the path

* Update the script to upload data

* Update the overwrite mode

* Update destination blob name

* Use blob upload batch

* Add spark pipeline tests

* Update spark component extension

* Add script to attach uai

* Update property name in workflow

* Update script parameters

* Update assign uai script

* Format the script

* Update setup identities script

* Update path to infra bootstrapping

* Enable automation test for attached spark job

* Update resource path

* Update setup attached resource script

* Update script of setup resources

* Update setup attached resource script2

* Add logic to assign identity role

* Format the empty check

* Check if identity is empty

* Update to get compute properties

* update readme

* Reformat the script

* Update schema location and revert sdk notebook changes

* Attach pool first

* Rename resources and merge main

* Update format in yml

* Add role assignment to uai
Fred Li, 2023-07-06 13:36:07 -07:00, committed by GitHub
Parent: 2cee822a7a
Commit: 0b829b9277
GPG key ID: 4AEE18F83AFDEB23
38 changed files: 910 additions and 57 deletions

View file

@@ -0,0 +1,61 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-attached-spark-pipeline-default-identity
on:
workflow_dispatch:
schedule:
- cron: "30 9/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-attached-spark-pipeline-default-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
continue-on-error: true
- name: setup attached spark
working-directory: cli
continue-on-error: true
run: |
bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark.yml
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh attached-spark-pipeline-default-identity.yml
working-directory: cli/jobs/spark
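
Each generated workflow above also declares workflow_dispatch, so besides the cron schedule and pull-request triggers it can be started by hand. A minimal sketch, assuming the GitHub CLI is authenticated against this repository:

# dispatch one generated spark workflow manually and follow the run
gh workflow run cli-jobs-spark-attached-spark-pipeline-default-identity.yml --ref main
gh run watch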

View file

@@ -0,0 +1,66 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-attached-spark-pipeline-managed-identity
on:
workflow_dispatch:
schedule:
- cron: "43 7/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-attached-spark-pipeline-managed-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
continue-on-error: true
- name: setup identities
run: |
bash -x setup-identities.sh
working-directory: cli/jobs/spark
continue-on-error: true
- name: setup attached spark
working-directory: cli
continue-on-error: true
run: |
bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-system-identity.yml
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh attached-spark-pipeline-managed-identity.yml
working-directory: cli/jobs/spark

View file

@@ -0,0 +1,61 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-attached-spark-pipeline-user-identity
on:
workflow_dispatch:
schedule:
- cron: "15 4/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-attached-spark-pipeline-user-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
continue-on-error: true
- name: setup attached spark
working-directory: cli
continue-on-error: true
run: |
bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-user-identity.yml
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh attached-spark-pipeline-user-identity.yml
working-directory: cli/jobs/spark

View file

@@ -0,0 +1,61 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-attached-spark-standalone-default-identity
on:
workflow_dispatch:
schedule:
- cron: "15 0/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-attached-spark-standalone-default-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
continue-on-error: true
- name: setup attached spark
working-directory: cli
continue-on-error: true
run: |
bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark.yml
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh attached-spark-standalone-default-identity.yml
working-directory: cli/jobs/spark

View file

@@ -0,0 +1,66 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-attached-spark-standalone-managed-identity
on:
workflow_dispatch:
schedule:
- cron: "16 1/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-attached-spark-standalone-managed-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
continue-on-error: true
- name: setup identities
run: |
bash -x setup-identities.sh
working-directory: cli/jobs/spark
continue-on-error: true
- name: setup attached spark
working-directory: cli
continue-on-error: true
run: |
bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-system-identity.yml
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh attached-spark-standalone-managed-identity.yml
working-directory: cli/jobs/spark

View file

@@ -0,0 +1,61 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-attached-spark-standalone-user-identity
on:
workflow_dispatch:
schedule:
- cron: "7 1/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-attached-spark-standalone-user-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
continue-on-error: true
- name: setup attached spark
working-directory: cli
continue-on-error: true
run: |
bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-user-identity.yml
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh attached-spark-standalone-user-identity.yml
working-directory: cli/jobs/spark

View file

@@ -0,0 +1,55 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-serverless-spark-pipeline-default-identity
on:
workflow_dispatch:
schedule:
- cron: "33 10/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-serverless-spark-pipeline-default-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh serverless-spark-pipeline-default-identity.yml
working-directory: cli/jobs/spark

View file

@@ -0,0 +1,61 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-serverless-spark-pipeline-managed-identity
on:
workflow_dispatch:
schedule:
- cron: "57 5/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-serverless-spark-pipeline-managed-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
continue-on-error: true
- name: setup identities
run: |
bash -x setup-identities.sh
working-directory: cli/jobs/spark
continue-on-error: true
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh serverless-spark-pipeline-managed-identity.yml
working-directory: cli/jobs/spark

View file

@@ -0,0 +1,55 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-serverless-spark-pipeline-user-identity
on:
workflow_dispatch:
schedule:
- cron: "56 7/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-serverless-spark-pipeline-user-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh serverless-spark-pipeline-user-identity.yml
working-directory: cli/jobs/spark

View file

@@ -0,0 +1,55 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-serverless-spark-standalone-default-identity
on:
workflow_dispatch:
schedule:
- cron: "19 11/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-serverless-spark-standalone-default-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh serverless-spark-standalone-default-identity.yml
working-directory: cli/jobs/spark

View file

@@ -0,0 +1,61 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-serverless-spark-standalone-managed-identity
on:
workflow_dispatch:
schedule:
- cron: "46 0/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-serverless-spark-standalone-managed-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
continue-on-error: true
- name: setup identities
run: |
bash -x setup-identities.sh
working-directory: cli/jobs/spark
continue-on-error: true
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh serverless-spark-standalone-managed-identity.yml
working-directory: cli/jobs/spark

View file

@@ -0,0 +1,55 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.
name: cli-jobs-spark-serverless-spark-standalone-user-identity
on:
workflow_dispatch:
schedule:
- cron: "27 1/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/spark/**
- infra/bootstrapping/**
- .github/workflows/cli-jobs-spark-serverless-spark-standalone-user-identity.yml
- cli/jobs/spark/data/titanic.csv
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
- name: run job
run: |
source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh";
source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh";
bash -x ../../run-job.sh serverless-spark-standalone-user-identity.yml
working-directory: cli/jobs/spark

View file

@@ -1,5 +1,5 @@
# attached-spark-pipeline-default-identity.yaml
$schema: http://azureml/sdk-2-0/PipelineJob.json
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
display_name: Titanic-Spark-CLI-Pipeline-3
description: Spark component for Titanic data in Pipeline
@@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline
jobs:
spark_job:
type: spark
component: ./spark-job-component.yaml
component: ./spark-job-component.yml
inputs:
titanic_data:
type: uri_file

View file

@@ -1,5 +1,5 @@
# attached-spark-pipeline-managed-identity.yaml
$schema: http://azureml/sdk-2-0/PipelineJob.json
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
display_name: Titanic-Spark-CLI-Pipeline-1
description: Spark component for Titanic data in Pipeline
@@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline
jobs:
spark_job:
type: spark
component: ./spark-job-component.yaml
component: ./spark-job-component.yml
inputs:
titanic_data:
type: uri_file

View file

@@ -1,5 +1,5 @@
# attached-spark-pipeline-user-identity.yaml
$schema: http://azureml/sdk-2-0/PipelineJob.json
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
display_name: Titanic-Spark-CLI-Pipeline-2
description: Spark component for Titanic data in Pipeline
@@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline
jobs:
spark_job:
type: spark
component: ./spark-job-component.yaml
component: ./spark-job-component.yml
inputs:
titanic_data:
type: uri_file

View file

@@ -1,5 +1,5 @@
# attached-spark-standalone-default-identity.yaml
$schema: http://azureml/sdk-2-0/SparkJob.json
$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json
type: spark
code: ./src
@@ -29,4 +29,4 @@ args: >-
--titanic_data ${{inputs.titanic_data}}
--wrangled_data ${{outputs.wrangled_data}}
compute: yuachengcompute
compute: mysparkcompute

View file

@@ -1,5 +1,5 @@
# attached-spark-standalone-managed-identity.yaml
$schema: http://azureml/sdk-2-0/SparkJob.json
$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json
type: spark
code: ./src

View file

@@ -1,5 +1,5 @@
# attached-spark-standalone-user-identity.yaml
$schema: http://azureml/sdk-2-0/SparkJob.json
$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json
type: spark
code: ./src

View file

@@ -1,5 +1,5 @@
# serverless-spark-pipeline-default-identity.yaml
$schema: http://azureml/sdk-2-0/PipelineJob.json
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
display_name: Titanic-Spark-CLI-Pipeline-6
description: Spark component for Titanic data in Pipeline
@@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline
jobs:
spark_job:
type: spark
component: ./spark-job-component.yaml
component: ./spark-job-component.yml
inputs:
titanic_data:
type: uri_file

View file

@@ -1,5 +1,5 @@
# serverless-spark-pipeline-managed-identity.yaml
$schema: http://azureml/sdk-2-0/PipelineJob.json
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
display_name: Titanic-Spark-CLI-Pipeline-4
description: Spark component for Titanic data in Pipeline
@@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline
jobs:
spark_job:
type: spark
component: ./spark-job-component.yaml
component: ./spark-job-component.yml
inputs:
titanic_data:
type: uri_file

View file

@@ -1,5 +1,5 @@
# serverless-spark-pipeline-user-identity.yaml
$schema: http://azureml/sdk-2-0/PipelineJob.json
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
display_name: Titanic-Spark-CLI-Pipeline-5
description: Spark component for Titanic data in Pipeline
@@ -7,7 +7,7 @@ description: Spark component for Titanic data in Pipeline
jobs:
spark_job:
type: spark
component: ./spark-job-component.yaml
component: ./spark-job-component.yml
inputs:
titanic_data:
type: uri_file

View file

@@ -1,5 +1,5 @@
# serverless-spark-standalone-default-identity.yaml
$schema: http://azureml/sdk-2-0/SparkJob.json
$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json
type: spark
code: ./src

View file

@@ -1,5 +1,5 @@
# serverless-spark-standalone-managed-identity.yaml
$schema: http://azureml/sdk-2-0/SparkJob.json
$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json
type: spark
code: ./src

View file

@@ -1,5 +1,5 @@
# serverless-spark-standalone-user-identity.yaml
$schema: http://azureml/sdk-2-0/SparkJob.json
$schema: https://azuremlschemas.azureedge.net/latest/sparkJob.schema.json
type: spark
code: ./src

View file

@@ -0,0 +1,51 @@
# <create_variables>
SUBSCRIPTION_ID=$(az account show --query id -o tsv)
LOCATION=$(az ml workspace show --query location -o tsv)
RESOURCE_GROUP=$(az group show --query name -o tsv)
AML_WORKSPACE_NAME=$(az configure -l --query "[?name=='workspace'].value" -o tsv)
API_VERSION="2022-05-01"
TOKEN=$(az account get-access-token --query accessToken -o tsv)
GEN2_STORAGE_NAME=${RESOURCE_GROUP}gen2
GEN2_FILE_SYSTEM=${RESOURCE_GROUP}file
SYNAPSE_WORKSPACE_NAME=${AML_WORKSPACE_NAME}-syws
SQL_ADMIN_LOGIN_USER="automation"
SQL_ADMIN_LOGIN_PASSWORD="auto123!"
SPARK_POOL_NAME="automationpool"
SPARK_POOL_ADMIN_ROLE_ID="6e4bf58a-b8e1-4cc3-bbf9-d73143322b78"
ATTACHED_COMPUTE_NAME="mysparkcompute"
#</create_variables>
#<create_uai>
AML_USER_MANAGED_ID=${RESOURCE_GROUP}-uai
az identity create --name $AML_USER_MANAGED_ID --resource-group $RESOURCE_GROUP --location $LOCATION
AML_USER_MANAGED_ID_OID=$(az identity show --resource-group $RESOURCE_GROUP -n $AML_USER_MANAGED_ID --query principalId -o tsv)
#</create_uai>
#<create_attached_resources>
az storage account create --name $GEN2_STORAGE_NAME --resource-group $RESOURCE_GROUP --location $LOCATION --sku Standard_LRS --kind StorageV2 --enable-hierarchical-namespace true
az storage fs create -n $GEN2_FILE_SYSTEM --account-name $GEN2_STORAGE_NAME
az synapse workspace create --name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --storage-account $GEN2_STORAGE_NAME --file-system $GEN2_FILE_SYSTEM --sql-admin-login-user $SQL_ADMIN_LOGIN_USER --sql-admin-login-password $SQL_ADMIN_LOGIN_PASSWORD --location $LOCATION
az role assignment create --role "Storage Blob Data Owner" --assignee $AML_USER_MANAGED_ID_OID --scope /subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.Storage/storageAccounts/$GEN2_STORAGE_NAME/blobServices/default/containers/$GEN2_FILE_SYSTEM
az synapse spark pool create --name $SPARK_POOL_NAME --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --spark-version 3.2 --node-count 3 --node-size Medium --min-node-count 3 --max-node-count 10 --enable-auto-scale true
az synapse workspace firewall-rule create --name allowAll --workspace-name $SYNAPSE_WORKSPACE_NAME --resource-group $RESOURCE_GROUP --start-ip-address 0.0.0.0 --end-ip-address 255.255.255.255
TEMP_COMPUTE_FILE="temp-compute-setup.yml"
cp $1 $TEMP_COMPUTE_FILE
sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
s/<RESOURCE_GROUP>/$RESOURCE_GROUP/g;
s/<SYNAPSE_WORKSPACE_NAME>/$SYNAPSE_WORKSPACE_NAME/g;
s/<SPARK_POOL_NAME>/$SPARK_POOL_NAME/g;
s/<AML_USER_MANAGED_ID>/$AML_USER_MANAGED_ID/g;" $TEMP_COMPUTE_FILE
az ml compute attach --file $TEMP_COMPUTE_FILE --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME
az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $AML_USER_MANAGED_ID_OID
COMPUTE_MANAGED_IDENTITY=$(az ml compute show --name $ATTACHED_COMPUTE_NAME --resource-group $RESOURCE_GROUP --workspace-name $AML_WORKSPACE_NAME --query identity.principal_id --out tsv)
if [[ ! -z "$COMPUTE_MANAGED_IDENTITY" ]]
then
az synapse role assignment create --workspace-name $SYNAPSE_WORKSPACE_NAME --role $SPARK_POOL_ADMIN_ROLE_ID --assignee $COMPUTE_MANAGED_IDENTITY
fi
#</create_attached_resources>
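
setup-attached-resources.sh takes the attached-compute YAML as its first argument ($1) and resolves the subscription, resource group, and workspace from the az CLI defaults. A usage sketch mirroring the generated workflow steps; the default values below are made-up placeholders:

# run from the cli/ directory, as the workflows do
az configure --defaults group=my-resource-group workspace=my-aml-workspace
bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark.yml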

View file

@@ -0,0 +1,24 @@
# <create_variables>
SUBSCRIPTION_ID=$(az account show --query id -o tsv)
LOCATION=$(az ml workspace show --query location -o tsv)
RESOURCE_GROUP=$(az group show --query name -o tsv)
AML_WORKSPACE_NAME=$(az configure -l --query "[?name=='workspace'].value" -o tsv)
API_VERSION="2022-05-01"
TOKEN=$(az account get-access-token --query accessToken -o tsv)
AML_USER_MANAGED_ID=${RESOURCE_GROUP}-uai
#</create_variables>
#<create_uai>
az identity create --name $AML_USER_MANAGED_ID --resource-group $RESOURCE_GROUP --location $LOCATION
#</create_uai>
TEMP_UAI_FILE="temp-user-assigned-identity.yml"
cp user-assigned-identity.yml $TEMP_UAI_FILE
sed -i "s/<SUBSCRIPTION_ID>/$SUBSCRIPTION_ID/g;
s/<RESOURCE_GROUP>/$RESOURCE_GROUP/g;
s/<AML_USER_MANAGED_ID>/$AML_USER_MANAGED_ID/g;" $TEMP_UAI_FILE
#<assign_uai_to_workspace>
az ml workspace update --subscription $SUBSCRIPTION_ID --resource-group $RESOURCE_GROUP --name $AML_WORKSPACE_NAME --file $TEMP_UAI_FILE
#</assign_uai_to_workspace>
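
Once the workspace update completes, the user-assigned identity should appear alongside the system-assigned one. A quick verification sketch with the same v2 CLI:

# confirm the UAI was attached to the workspace
az ml workspace show --resource-group $RESOURCE_GROUP --name $AML_WORKSPACE_NAME --query identity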

View file

@@ -1,5 +1,5 @@
# spark-job-component.yaml
$schema: http://azureml/sdk-2-0/SparkComponent.json
$schema: https://azuremlschemas.azureedge.net/latest/sparkComponent.schema.json
name: titanic_spark_component
type: spark
version: 1

View file

@@ -1,7 +0,0 @@
# user-assigned-identity.yaml
identity:
type: "system_assigned,user_assigned"
user_assigned_identities:
- resource_id: /subscriptions/<SUBSCRIPTION_ID/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.ManagedIdentity/userAssignedIdentities/<AML_USER_MANAGED_ID>
tenant_id: 00x000xx-00x0-00xx-00xx-0x0xx000xx00

View file

@@ -0,0 +1,6 @@
# user-assigned-identity.yaml
identity:
type: "system_assigned,user_assigned"
user_assigned_identities:
"/subscriptions/<SUBSCRIPTION_ID>/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.ManagedIdentity/userAssignedIdentities/<AML_USER_MANAGED_ID>" : {}

View file

@@ -9,7 +9,7 @@ import string
import yaml
# define constants
EXCLUDED_JOBS = ["java", "spark"]
EXCLUDED_JOBS = ["java", "spark-job-component", "storage_pe", "user-assigned-identity"]
# TODO: Re-include these below endpoints and deployments when the workflow generation code supports substituting vars in .yaml files.
EXCLUDED_ENDPOINTS = [
"1-uai-create-endpoint",
@@ -77,6 +77,7 @@ def main(args):
jobs += sorted(glob.glob("jobs/basics/*.yml", recursive=False))
jobs += sorted(glob.glob("jobs/*/basics/**/*job*.yml", recursive=True))
jobs += sorted(glob.glob("jobs/pipelines/**/*pipeline*.yml", recursive=True))
jobs += sorted(glob.glob("jobs/spark/*.yml", recursive=False))
jobs += sorted(
glob.glob("jobs/automl-standalone-jobs/**/cli-automl-*.yml", recursive=True)
)
@@ -420,6 +421,7 @@ def write_job_workflow(job):
filename, project_dir, hyphenated = parse_path(job)
posix_project_dir = project_dir.replace(os.sep, "/")
is_pipeline_sample = "jobs/pipelines" in job
is_spark_sample = "jobs/spark" in job
creds = CREDENTIALS
schedule_hour, schedule_minute = get_schedule_time(filename)
# Duplicate name in working directory during checkout
@@ -439,6 +441,8 @@ on:
- .github/workflows/cli-{hyphenated}.yml\n"""
if is_pipeline_sample:
workflow_yaml += " - cli/run-pipeline-jobs.sh\n" ""
if is_spark_sample:
workflow_yaml += " - cli/jobs/spark/data/titanic.csv\n" ""
workflow_yaml += f""" - cli/setup.sh
concurrency:
group: {GITHUB_CONCURRENCY_GROUP}
@@ -465,8 +469,10 @@ jobs:
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: run job
continue-on-error: true\n"""
if is_spark_sample:
workflow_yaml += get_spark_setup_workflow(job)
workflow_yaml += f""" - name: run job
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";\n"""
@@ -701,7 +707,7 @@ jobs:
creds: {creds}
- name: bootstrap resources
run: |
bash bootstrap.sh
bash bootstrapping/bootstrap.sh
working-directory: infra
continue-on-error: false
- name: setup-cli
@@ -856,6 +862,42 @@ def get_endpoint_name(filename, hyphenated):
return endpoint_name
def get_spark_setup_workflow(job):
is_attached = "attached-spark" in job
is_user_identity = "user-identity" in job
is_managed_identity = "managed-identity" in job
is_default_identity = "default-identity" in job
workflow = f""" - name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
continue-on-error: true\n"""
if is_managed_identity:
workflow += f""" - name: setup identities
run: |
bash -x setup-identities.sh
working-directory: cli/jobs/spark
continue-on-error: true\n"""
if is_attached:
workflow += f""" - name: setup attached spark
working-directory: cli
continue-on-error: true"""
if is_attached and is_user_identity:
workflow += f"""
run: |
bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-user-identity.yml\n"""
if is_attached and is_managed_identity:
workflow += f"""
run: |
bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark-system-identity.yml\n"""
if is_attached and is_default_identity:
workflow += f"""
run: |
bash -x jobs/spark/setup-attached-resources.sh resources/compute/attached-spark.yml\n"""
return workflow
# run functions
if __name__ == "__main__":
# setup argparse
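
Because the workflow files are autogenerated, the new jobs/spark glob and the get_spark_setup_workflow hook only take effect once the generator is re-run, as the header comment in each workflow notes. A regeneration sketch, assuming readme.py is invoked from the cli/ directory as usual:

# regenerate the workflow files and inspect the spark ones
python3 readme.py
git diff ../.github/workflows/cli-jobs-spark-*.yml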

View file

@@ -1,9 +0,0 @@
# attached-spark-system-identity.yaml
name: my-spark-pool
type: synapsespark
resource_id: /subscriptions/<SUBSCRIPTION_ID/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.Synapse/workspaces/<SYNAPSE_WORKSPACE_NAME>/bigDataPools/<SPARK_POOL_NAME>
identity:
type: system_assigned

View file

@@ -0,0 +1,9 @@
# attached-spark-system-identity.yaml
name: mysparkcompute
type: synapsespark
resource_id: /subscriptions/<SUBSCRIPTION_ID>/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.Synapse/workspaces/<SYNAPSE_WORKSPACE_NAME>/bigDataPools/<SPARK_POOL_NAME>
identity:
type: system_assigned

View file

@@ -1,11 +0,0 @@
# attached-spark-user-identity.yaml
name: my-spark-pool
type: synapsespark
resource_id: /subscriptions/<SUBSCRIPTION_ID/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.Synapse/workspaces/<SYNAPSE_WORKSPACE_NAME>/bigDataPools/<SPARK_POOL_NAME>
identity:
type: user_assigned
user_assigned_identities:
- resource_id: /subscriptions/<SUBSCRIPTION_ID/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.ManagedIdentity/userAssignedIdentities/<AML_USER_MANAGED_ID>

View file

@@ -0,0 +1,11 @@
# attached-spark-user-identity.yml
name: mysparkcompute
type: synapsespark
resource_id: /subscriptions/<SUBSCRIPTION_ID>/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.Synapse/workspaces/<SYNAPSE_WORKSPACE_NAME>/bigDataPools/<SPARK_POOL_NAME>
identity:
type: user_assigned
user_assigned_identities:
- resource_id: /subscriptions/<SUBSCRIPTION_ID>/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.ManagedIdentity/userAssignedIdentities/<AML_USER_MANAGED_ID>

View file

@@ -1,6 +0,0 @@
# attached-spark.yaml
name: my-spark-pool
type: synapsespark
resource_id: /subscriptions/<SUBSCRIPTION_ID/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.Synapse/workspaces/<SYNAPSE_WORKSPACE_NAME>/bigDataPools/<SPARK_POOL_NAME>

View file

@@ -0,0 +1,6 @@
# attached-spark.yaml
name: mysparkcompute
type: synapsespark
resource_id: /subscriptions/<SUBSCRIPTION_ID>/resourceGroups/<RESOURCE_GROUP>/providers/Microsoft.Synapse/workspaces/<SYNAPSE_WORKSPACE_NAME>/bigDataPools/<SPARK_POOL_NAME>

View file

@@ -0,0 +1,19 @@
# <create_variables>
SUBSCRIPTION_ID=$(az account show --query id -o tsv)
LOCATION=$(az ml workspace show --query location -o tsv)
RESOURCE_GROUP=$(az group show --query name -o tsv)
WORKSPACE=$(az configure -l --query "[?name=='workspace'].value" -o tsv)
API_VERSION="2022-05-01"
TOKEN=$(az account get-access-token --query accessToken -o tsv)
#</create_variables>
# <get_storage_details>
response=$(curl --location --request GET "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/datastores?api-version=$API_VERSION&isDefault=true" \
--header "Authorization: Bearer $TOKEN")
AZUREML_DEFAULT_CONTAINER=$(echo $response | jq -r '.value[0].properties.containerName')
export AZURE_STORAGE_ACCOUNT=$(echo $response | jq -r '.value[0].properties.accountName')
# </get_storage_details>
# <upload_data>
az storage blob upload-batch -s $1 --pattern *.csv -d $AZUREML_DEFAULT_CONTAINER --account-name $AZURE_STORAGE_ACCOUNT --overwrite true
# </upload_data>
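
Like the other helpers, the upload script is invoked from the cli/ directory with the data folder as its only argument. One caveat: the unquoted --pattern *.csv relies on no .csv files sitting in the working directory; quoting it (--pattern "*.csv") would be the defensive form. A usage sketch matching the generated workflow step:

# run from the cli/ directory, as the generated workflows do
bash -x upload-data-to-blob.sh jobs/spark/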