Issue 43 (#1)
* chore: enviornments variables adjustment * chore: schema update * fix: add SP version on job variable * chore: force powershell version * chore: change task order to publish DB secrets * chore: Adjustments on Databricks Secrets deploy * chore: adjust variables info on parameter * chore: adjustment on Databricks scripts * chore: dataLakeName variable adjustment * chore: dataComputeRS variable change * chore: documentation adjustment att image * chore: change keyvalt set permissions on SP * chore: Databricks task order * chore: databricks adjustments * chore: var output display removal * chore: databricks adjustments * chore: documentation adjustment * chore: documentation change * chore: script adjustment * chore: add command to list scope content * chore: add SP into Admin role * chore: adjust Administrator Role SP * chore: add connect to AD first * chore: Remove connection do Azure AD * chore: documentation adjustment * chore: fix uppercase extensions on some images * chore: directory changes * chore: directory rename * chore: Service Principal secrets first adjustments * fix: adjustment of SP * chore: save secret on output * chore: syntax fix * chore: token adjustment * chore: store secret inside output hol file * chore: put SP inside parameters * chore: template adjustment * chore: lower parameter for fix * chore: syntax adjustment * chore: Add logs * chore: add log info * chore: log info
|
@ -19,7 +19,7 @@ Sept 2021
|
|||
|
||||
In this workshop, you will deploy a DataOps reference architecture to understand best practices of Data Engineering and Software Engineering combined.
|
||||
|
||||
![](./hands-on-lab/media/high-level-overview-dataops.png 'Solution Architecture')
|
||||
!['Solution Architecture'](./hands-on-lab/media/high-level-overview-dataops.png)
|
||||
|
||||
### Lab Instructions
|
||||
|
||||
|
|
|
@ -7,12 +7,6 @@ parameters:
|
|||
type: string
|
||||
- name: solutionName
|
||||
type: string
|
||||
- name: servicePrincipal
|
||||
type: string
|
||||
- name: resourceGroupData
|
||||
type: string
|
||||
- name: resourceGroupCompute
|
||||
type: string
|
||||
|
||||
stages:
|
||||
- stage: deploy
|
||||
|
@ -86,8 +80,8 @@ stages:
|
|||
env:
|
||||
AzureDevOpsPAT: $(System.AccessToken)
|
||||
|
||||
- deployment: deploy_databricks_secrets
|
||||
displayName: 'Deploy Databricks Secrets to ${{ parameters.environment }}'
|
||||
- deployment: deploy_dbw_clusters
|
||||
displayName: 'Deploy DBW Clusters to ${{ parameters.environment }}'
|
||||
condition: succeeded()
|
||||
dependsOn: deploy_arm
|
||||
pool:
|
||||
|
@ -103,36 +97,22 @@ stages:
|
|||
- template: step.install-databricks-cli.yml
|
||||
parameters:
|
||||
azureServiceConnection: ${{ parameters.azureServiceConnection }}
|
||||
|
||||
- task: AzurePowerShell@4
|
||||
displayName: 'Databricks Secrets'
|
||||
displayName: 'Publish Databricks Secrets'
|
||||
inputs:
|
||||
azureSubscription: ${{ parameters.azureServiceConnection }}
|
||||
scriptType: filePath
|
||||
scriptPath: infrastructure-as-code/scripts/DatabricksSecrets.ps1
|
||||
scriptArguments: >
|
||||
-ServicePrincipalName ${{ parameters.servicePrincipal }}
|
||||
-DataResourceGroup ${{ parameters.resourceGroupData }}
|
||||
-ComputeResourceGroup ${{ parameters.resourceGroupCompute }}
|
||||
-Environment ${{ parameters.environment }}
|
||||
-DataLakeName $(dataLakeName)
|
||||
-DatabricksName $(databricksName)
|
||||
-KeyVaultName $(keyVaultName)
|
||||
-DATABRICKS_TOKEN $(DATABRICKS_TOKEN)
|
||||
azurePowerShellVersion: 'OtherVersion'
|
||||
preferredAzurePowerShellVersion: $(azPowershellVersion)
|
||||
preferredAzurePowerShellVersion: 5.5.0
|
||||
|
||||
- deployment: deploy_dbw_clusters
|
||||
displayName: 'Deploy DBW Clusters to ${{ parameters.environment }}'
|
||||
condition: succeeded()
|
||||
dependsOn: deploy_databricks_secrets
|
||||
pool:
|
||||
vmImage: 'Ubuntu-20.04'
|
||||
environment: ${{ parameters.environment }}
|
||||
variables:
|
||||
- group: dataops-iac-cd-output-${{ parameters.environment }}
|
||||
strategy:
|
||||
runOnce:
|
||||
deploy:
|
||||
steps:
|
||||
- checkout: self
|
||||
- template: step.install-databricks-cli.yml
|
||||
parameters:
|
||||
azureServiceConnection: ${{ parameters.azureServiceConnection }}
|
||||
- task: PowerShell@2
|
||||
displayName: Deploy DBW Clusters
|
||||
inputs:
|
||||
|
|
|
@ -27,5 +27,5 @@ steps:
|
|||
-d "grant_type=client_credentials&client_id=$servicePrincipalId&resource=$databricks_resource_id&client_secret=$servicePrincipalKey" \
|
||||
https://login.microsoftonline.com/$tenantId/oauth2/token \
|
||||
| jq -r .access_token)
|
||||
echo "##vso[task.setvariable variable=DATABRICKS_TOKEN;issecret=true]$accessToken"
|
||||
echo "##vso[task.setvariable variable=DATABRICKS_TOKEN;isSecret=true]$accessToken"
|
||||
addSpnToEnvironment: true
|
|
@ -70,7 +70,7 @@ behave
|
|||
|
||||
The result should look similar to the next image:
|
||||
|
||||
![Behave Results](/lab-files/media/behave-results.PNG 'Behave Results')
|
||||
![Behave Results](/lab-files/media/behave-results.png)
|
||||
|
||||
## References
|
||||
|
||||
|
|
|
@ -94,7 +94,7 @@ In this task you will explore and understand the folder structure, navigating th
|
|||
|
||||
To proceed with the execution of the other exercises below, you must understand the structure of the "infrastructure-as-code" folder.
|
||||
|
||||
![infrastructure as code](./media/infrastructure-as-code-folder.PNG)
|
||||
![infrastructure as code](./media/infrastructure-as-code-folder.png)
|
||||
|
||||
```
|
||||
|infrastructure-as-code| ---> Main folder
|
||||
|
@ -154,7 +154,7 @@ To proceed with the execution of the other exercises below, you must understand
|
|||
|
||||
##### *File: azuredeploy.json*
|
||||
|
||||
![infrastructure-folder](./media/iac-folder-infrastructure.PNG)
|
||||
![infrastructure-folder](./media/iac-folder-infrastructure.png)
|
||||
|
||||
Main template, with declared parameters, variables and resources. Here we use linkedTemplates.
|
||||
>*NOTE*: As a good practice when using IaC templates, we can keep parameters in separate files, so values can be changed without editing the main template directly.
|
||||
|
@ -170,7 +170,7 @@ In linkedTemplates we have templates with "parts" of declared resources that are
|
|||
|
||||
#### **Sub-Folders and Files: linkedTemplates**
|
||||
|
||||
![linkedTemplate-sub-folders](./media/iac-folder-linkedtemplates-subfolders.PNG)
|
||||
![linkedTemplate-sub-folders](./media/iac-folder-linkedtemplates-subfolders.png)
|
||||
|
||||
##### *File: template.json (subfolders 1, 2, 3)*
|
||||
|
||||
|
@ -182,7 +182,7 @@ For each subfolder **(1, 2, 3)** we have this file "similar" to the `azuredeploy
|
|||
|
||||
Example of an Azure Data Factory declaration in a template.
|
||||
|
||||
![lkd-template-compute](./media/iac-linkedtemplates-template-compute.PNG)
|
||||
![lkd-template-compute](./media/iac-linkedtemplates-template-compute.png)
|
||||
|
||||
#### **File: compute.json, data.json (subfolder 4)**
|
||||
|
||||
|
@ -194,7 +194,7 @@ To apply a correct role and permission to a resource, Azure uses features from A
|
|||
|
||||
Example of a resource declaration in this template.
|
||||
|
||||
![iac-service-principal](./media/iac-service-principal.PNG)
|
||||
![iac-service-principal](./media/iac-service-principal.png)
|
||||
|
||||
|
||||
#### **Folder: parameters**
|
||||
|
@ -206,18 +206,18 @@ Parameters folder and directory with templates files with parameters and values
|
|||
|
||||
Example of a parameters declaration in this template.
|
||||
|
||||
![iac-parameters](./media/parameters-dev-json.PNG)
|
||||
![iac-parameters](./media/parameters-dev-json.png)
|
||||
|
||||
|
||||
#### **Folder [databricks]**
|
||||
|
||||
In this folder you will find settings related to the Databricks resource that are used when executing the scripts (below) and provisioning your infrastructure and its required resources.
|
||||
|
||||
![](./media/iac-folder-databricks.PNG 'iac-databricks')
|
||||
!['iac-databricks'](./media/iac-folder-databricks.png)
|
||||
|
||||
Example of a configuration declaration in this template.
|
||||
|
||||
![](./media/iac-file-corejson-databricks.PNG 'iac-databricks-corejson')
|
||||
!['iac-databricks-corejson'](./media/iac-file-corejson-databricks.png)
|
||||
|
||||
#### **Folder [scripts]**
|
||||
|
||||
|
@ -226,7 +226,7 @@ Some scripts are referenced with ARM templates, "calling" them to perform some n
|
|||
|
||||
There is a required execution order for these scripts to succeed, as described in **Exercise 3**, in the IaC CI/CD discussion.
|
||||
|
||||
![](./media/iac-scripts.PNG 'iac-scripts')
|
||||
!['iac-scripts'](./media/iac-scripts.png)
|
||||
|
||||
|
||||
#### **Folder [tests]**
|
||||
|
@ -331,12 +331,12 @@ We are working with three environments `dev`, `qa` and `prod`, and this environm
|
|||
|
||||
>**Setting up the Azure DevOps project:** before starting to execute the pipelines and the git workflow, it is necessary to create environments in Azure DevOps for the IaC and Databricks environments. Environments can be created from the Pipelines menu of Azure DevOps.
|
||||
|
||||
![](./media/environments-qa-prod.PNG)
|
||||
![](./media/environments-qa-prod.png)
|
||||
|
||||
|
||||
>**Note**: Create environments for `dev`, `qa`, `prod`, `databricks-dev`, `databricks-qa` and `databricks-prod` in Azure DevOps before making any Pull Request (PR).
|
||||
|
||||
![](./media/environments.PNG)
|
||||
![](./media/environments.png)
|
||||
|
||||
### **Task 3: Git workflow**
|
||||
|
||||
|
@ -412,11 +412,11 @@ Now we will start to work with the pipelines and understand the funcionality tha
|
|||
|
||||
>**Note**: The `dataops` part of the name is the alias that you assigned to the project.
|
||||
|
||||
![](./media/pipelines.PNG)
|
||||
![](./media/pipelines.png)
|
||||
|
||||
The quickstart process creates the pipelines for IaC, the custom dataops library, Databricks, and Azure Data Factory. Now we will look at the IaC pipelines.
|
||||
|
||||
![](./media/Pipelines-IaC.PNG)
|
||||
![](./media/Pipelines-IaC.png)
|
||||
|
||||
>**Note**: The `dataops` part of the name is the alias that you assigned to the project.
|
||||
|
||||
|
@ -424,32 +424,47 @@ In the quickstart the process create the pipelines to IaC, the customized librar
|
|||
|
||||
##### **Execute CI Pipeline:**
|
||||
|
||||
![](./media/Run-CIPipeline-Iac.PNG)
|
||||
![](./media/Run-CIPipeline-Iac.png)
|
||||
|
||||
![](./media/CI-Iac.PNG)
|
||||
![](./media/CI-Iac.png)
|
||||
|
||||
This pipeline was executed manually, but the branch policies are configured to start it automatically if changes are made in the `infrastructure-as-code` folder:
|
||||
|
||||
![](./media/branch-policies-builder.PNG)
|
||||
![](./media/branch-policies-builder.png)
|
||||
|
||||
##### **Execute CD Pipeline**:
|
||||
|
||||
![](./media/Run-CDPipeline-Iac.PNG)
|
||||
![](./media/Run-CDPipeline-Iac.png)
|
||||
|
||||
The CD pipeline is triggered automatically after the CI pipeline. After executing the IaC CD, check your Azure DevOps environments to see how they change. When this pipeline finishes, also verify that the Azure resources were created in the resource groups of the development environment.
|
||||
|
||||
![](./media/RGComputeDev.PNG)
|
||||
![](./media/RGDataDev.PNG)
|
||||
![](./media/RGComputeDev.png)
|
||||
![](./media/RGDataDev.png)
|
||||
|
||||
>**Note**: The names of the resource groups and resources depend on the custom alias you defined and on the subscription ID.
|
||||
|
||||
With these resources created, you can configure a secret scope in Databricks for secure management of secrets.
|
||||
|
||||
##### **Databricks Secrets Scope**
|
||||
|
||||
When the resources are created in the environment, it is time to configure the secret scope in Databricks. To do that, run the PowerShell script located at `infrastructure-as-code/scripts` to create the Databricks secret scope for each environment:
|
||||
|
||||
Replace `<keyvault_name>` (and the other placeholders) with your own values and execute this script. Since the development environment resources were already created, you can start by creating the scope in the Databricks dev workspace.
|
||||
|
||||
```
|
||||
./DatabricksScopeCreation.ps1 `
|
||||
-KeyVaultName "<keyvault_name>" `
|
||||
-ComputeResourceGroup "<compute_resource_group>" `
|
||||
-DatabricksName "<databricks_name>"
|
||||
```
|
||||
|
||||
>**Note**: To see the key names in the `dataops` secret scope, execute the following command.
|
||||
|
||||
```
|
||||
databricks secrets list --scope dataops
|
||||
```
|
||||
|
||||
![](./media/scope-dataops.PNG)
|
||||
![](./media/scope-dataops.png)
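For reference, a notebook can then read these values through the Key Vault-backed scope. A minimal sketch (the key name used here is hypothetical; list the real keys with the command above):

```
# Hypothetical example: reading a secret from the "dataops" scope inside a Databricks notebook.
# The key name "clientSecret" is an assumption, not necessarily one of the lab's keys.
client_secret = dbutils.secrets.get(scope="dataops", key="clientSecret")
print("secret length:", len(client_secret))  # Databricks redacts the raw value if printed directly
```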
|
||||
|
||||
### **Task 2: Exploring Azure Data Services**
|
||||
|
||||
|
@ -457,15 +472,15 @@ In this task, you will explore the main resources that have been deployed in you
|
|||
|
||||
The resource groups rg-dataops-data-dev and rg-dataops-compute-dev contain data and compute services, respectively.
|
||||
|
||||
![](./media/resource-groups.png 'Resource groups')
|
||||
!['Resource groups'](./media/resource-groups.png)
|
||||
|
||||
The rg-dataops-data resource group contains a [Data Lake Storage](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) and a [Blob Storage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blobs-overview)
|
||||
|
||||
![](./media/rg-dataops-data-dev.png 'Resource group: Data')
|
||||
!['Resource group: Data'](./media/rg-dataops-data-dev.png)
|
||||
|
||||
The resource group rg-dataops-compute contains an instance of [Azure Data Factory](https://docs.microsoft.com/en-us/azure/data-factory/) and [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/)
|
||||
|
||||
![](./media/rg-dataops-compute-dev.png 'Resource group: Compute')
|
||||
!['Resource group: Compute'](./media/rg-dataops-compute-dev.png)
|
||||
|
||||
#### **Technology Overview**
|
||||
|
||||
|
@ -483,15 +498,15 @@ In this task, you will explore the Azure Blob Storage instance.
|
|||
|
||||
2. On the overview blade, select Containers
|
||||
|
||||
![](./media/stgdataopseastus2dev.png 'Blob Storage Overview')
|
||||
!['Blob Storage Overview'](./media/stgdataopseastus2dev.png)
|
||||
|
||||
3. Select and open the flights-data container.
|
||||
|
||||
![](./media/stgdataopseastus2dev-containers.png 'Containers')
|
||||
!['Containers'](./media/stgdataopseastus2dev-containers.png)
|
||||
|
||||
4. Review the CSV files. Select the CSV file and download it.
|
||||
|
||||
![](./media/stgdataopseastus2dev-airport-metadata.png 'Files')
|
||||
!['Files'](./media/stgdataopseastus2dev-airport-metadata.png)
|
||||
|
||||
## **Exercise 4: CI/CD Pipelines to Lib, Databricks and Data Factory**
|
||||
|
||||
|
@ -501,7 +516,7 @@ In this task, you will explore the Azure Blob Storage instance.
|
|||
|
||||
Now we need to create the custom library used in the Databricks notebooks, so there is a CI and a CD pipeline for the lib. When these pipelines finish executing, you can see the artifact in the `lib-packages` feed that you created in [step 3 of the quickstart](../quickstart/docs/3a-azdo-setup-basic.md).
|
||||
|
||||
![](./media/Pipelines-lib.PNG)
|
||||
![](./media/Pipelines-lib.png)
|
||||
|
||||
>**Note**: The `vic` part of the name is the alias that you assigned to the project.
|
||||
|
||||
|
@ -509,11 +524,11 @@ Now, we need to create the custom library that we use in the notebooks of databr
|
|||
|
||||
Execute the CI pipeline of the library to create the version `alpha` of the library.
|
||||
|
||||
![](./media/Run-CIPipeline-lib.PNG)
|
||||
![](./media/Run-CIPipeline-lib.png)
|
||||
|
||||
When this pipeline finishes, you can see the version in Artifacts.
|
||||
|
||||
![](./media/alpbaVersionlib.PNG)
|
||||
![](./media/alpbaVersionlib.png)
|
||||
|
||||
>**Note**: The number in the version varies; it depends on the Build Id.
|
||||
|
||||
|
@ -521,11 +536,11 @@ When this pipeline finished in artifacts you can see the version.
|
|||
|
||||
In the CD pipeline you can see the different stages per environment. We will execute the CD pipeline to leave the `beta` version available to be used in the Databricks notebooks.
|
||||
|
||||
![](./media/Run-CDPipeline-lib.PNG)
|
||||
![](./media/Run-CDPipeline-lib.png)
|
||||
|
||||
When this pipeline finishes, you can see the version in Artifacts.
|
||||
|
||||
![](./media/betaVersionlib.PNG)
|
||||
![](./media/betaVersionlib.png)
|
||||
|
||||
>**Note**: The number in the version varies; it depends on the Build Id.
|
||||
|
||||
|
@ -533,49 +548,49 @@ When this pipeline finished in artifacts you can see the version.
|
|||
|
||||
Now you can see the pipelines that work with Databricks, covering both the custom library and the notebooks that will be executed in Databricks.
|
||||
|
||||
![](./media/Pipelines-databricks.PNG)
|
||||
![](./media/Pipelines-databricks.png)
|
||||
|
||||
### **CI Pipeline**
|
||||
|
||||
This pipeline checks the notebooks in Databricks.
|
||||
|
||||
![](./media/Run-CIPipeline-Databricks.PNG)
|
||||
![](./media/Run-CIPipeline-Databricks.png)
|
||||
|
||||
### **CD Pipeline Lib to Databricks**
|
||||
|
||||
This pipeline uploads the current library version to the `dbfs` of Databricks.
|
||||
|
||||
![](./media/Run-CDPipeline-Databricks-Lib.PNG)
|
||||
![](./media/Run-CDPipeline-Databricks-Lib.png)
|
||||
|
||||
In the environments you can see that the status of `databricks-dev` changed.
|
||||
|
||||
![](./media/environments-DEV-Databricks.PNG)
|
||||
![](./media/environments-DEV-Databricks.png)
|
||||
|
||||
### **CD Pipeline Notebooks to Databricks**
|
||||
|
||||
This pipeline uploads the current notebooks to the shared folder in Databricks.
|
||||
|
||||
![](./media/Run-CDPipeline-Databricks-Notebooks.PNG)
|
||||
![](./media/Run-CDPipeline-Databricks-Notebooks.png)
|
||||
|
||||
In the environments you can see that the status of `databricks-dev` changed.
|
||||
|
||||
![](./media/environments-DEV-Databricks-Notebooks.PNG)
|
||||
![](./media/environments-DEV-Databricks-Notebooks.png)
|
||||
|
||||
### **Task 3: Executing CI/CD Pipeline Azure Data Factory**
|
||||
|
||||
This pipeline checks the integrity of the data and triggers the ADF pipeline to identify problems in it, but the process does not wait for the ADF pipeline to finish.
|
||||
|
||||
![](./media/Pipelines-ADF.PNG)
|
||||
![](./media/Pipelines-ADF.png)
|
||||
|
||||
>**Note**: The first time this pipeline is executed it fails, because the ADF pipeline must finish successfully once to create some folders in the data lake container that are needed to check the integrity of the data.
|
||||
|
||||
![](./media/Run-CDPipeline-ADF.PNG)
|
||||
![](./media/Run-CDPipeline-ADF.png)
|
||||
|
||||
When the ADF pipeline finishes, you can execute this CD pipeline again. To check it, open the ADF resource in the Azure portal and monitor the pipeline run.
|
||||
|
||||
![](./media/ADFPipelineRunning.PNG)
|
||||
![](./media/ADFPipelineRunning.png)
|
||||
|
||||
![](./media/Run-CDPipeline-ADFGood.PNG)
|
||||
![](./media/Run-CDPipeline-ADFGood.png)
|
||||
|
||||
Now that you understand the workflow, you can proceed with the other environments; once the ADF pipeline finishes, you can run the CD pipeline again to get a successful execution.
|
||||
|
||||
|
@ -585,54 +600,54 @@ In this task, you will explore the Azure Databricks instance dbw-dataops-eastus2
|
|||
|
||||
1. Navigate to the Azure Databricks instance `dbw-dataops-eastus2-dev` and Launch the Workspace.
|
||||
|
||||
![](./media/dbw-dataops-eastus2-dev-overview.png 'Databricks overview')
|
||||
!['Databricks overview'](./media/dbw-dataops-eastus2-dev-overview.png)
|
||||
|
||||
2. Navigate to the Workspace hub (2). Open the folders shared with you (if someone shared the Databricks instance with you) or look for your user in Users (3). Open the DataOps folder (4) and select the notebook named 01 ADLS Mount (5).
|
||||
|
||||
![](./media/dbw-dataops-eastus2-dev-ws.png 'Databricks workspace')
|
||||
!['Databricks workspace'](./media/dbw-dataops-eastus2-dev-ws.png)
|
||||
|
||||
3. To run the notebook you need to attach a cluster from the list (1) or create a new one if you don't have any clusters deployed.
|
||||
|
||||
![](./media/notebook-01-adls-mount.png 'Attach a cluster')
|
||||
!['Attach a cluster'](./media/notebook-01-adls-mount.png)
|
||||
|
||||
3.1 Provide a name for the new cluster, establish the cluster setting and select Create Cluster.
|
||||
|
||||
![](./media/dbw-dataops-new-cluster.png 'Creating a cluster')
|
||||
!['Creating a cluster'](./media/dbw-dataops-new-cluster.png)
|
||||
|
||||
3.2 Navigate back to the notebook named 01 ADLS Mount and attach the cluster
|
||||
|
||||
![](./media/dbw-dataops-attaching-cluster.png 'Creating a cluster')
|
||||
!['Creating a cluster'](./media/dbw-dataops-attaching-cluster.png)
|
||||
|
||||
4. Select Run Cell or Ctrl + Enter to run the cell and mount the Azure Data Lake.
|
||||
This code mounts the Azure Data Lake Storage Gen2 account to the Databricks File System. For authentication, it uses Key Vault and OAuth 2.0.
|
||||
|
||||
![](./media/notebook-01-adls-runcell.png 'Run')
|
||||
!['Run'](./media/notebook-01-adls-runcell.png)
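For orientation, an OAuth mount cell for ADLS Gen2 typically looks like the sketch below. The secret scope, key names, storage account, and container here are assumptions; check the notebook for the exact values used in the lab.

```
# Sketch (not the lab's exact cell): mount ADLS Gen2 with OAuth 2.0 using secrets
# from the Key Vault-backed "dataops" scope. All names are assumptions.
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type":
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": dbutils.secrets.get("dataops", "clientId"),
    "fs.azure.account.oauth2.client.secret": dbutils.secrets.get("dataops", "clientSecret"),
    "fs.azure.account.oauth2.client.endpoint":
        "https://login.microsoftonline.com/" + dbutils.secrets.get("dataops", "tenantId") + "/oauth2/token",
}

dbutils.fs.mount(
    source="abfss://landing@<datalake_name>.dfs.core.windows.net/",
    mount_point="/mnt/landing",
    extra_configs=configs,
)
```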
|
||||
|
||||
5. Navigate back to the notebook named `02 One Notebook to Rule Them All`.
|
||||
|
||||
5.1 Run the cells to import the libraries that you will use to process and transform the data.
|
||||
|
||||
![](./media/02-One-Notebook-to-Rule-Them-All-1.png 'Run')
|
||||
!['Run'](./media/02-One-Notebook-to-Rule-Them-All-1.png)
|
||||
|
||||
5.2 Read the file `FlightDelaysWithAirportCodes.csv` from the landing layer (1), transform the data (2), and create a local table called flight_delays_with_airport_codes from the flight_delays_df DataFrame (3).
|
||||
|
||||
![](./media/02-One-Notebook-to-Rule-Them-All-2.png 'Run')
|
||||
!['Run'](./media/02-One-Notebook-to-Rule-Them-All-2.png)
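Conceptually, steps (1)-(3) boil down to something like this sketch (the mount path and read options are assumptions):

```
# Sketch of steps (1)-(3): read the landing CSV, transform it, and expose it as a local (temp) table.
flight_delays_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/mnt/landing/flight-delays/FlightDelaysWithAirportCodes.csv")
)

# ...column renames, casts and filtering happen here in the real notebook...

flight_delays_df.createOrReplaceTempView("flight_delays_with_airport_codes")
```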
|
||||
|
||||
5.3 Select clean columns to generate clean data (1) and save the clean data as a global table called flight_delays_clean (2).
|
||||
|
||||
![](./media/02-One-Notebook-to-Rule-Them-All-3.png 'Run')
|
||||
!['Run'](./media/02-One-Notebook-to-Rule-Them-All-3.png)
|
||||
|
||||
5.4 To see the created table: Click Data in the sidebar (1). In the databases folder, click on the default database (2). Open Tables Folder and Click the table name.
|
||||
|
||||
![](./media/globaltable-flight_delays_view.png 'Run')
|
||||
!['Run'](./media/globaltable-flight_delays_view.png)
|
||||
|
||||
5.5 Navigate back to the notebook. Run cells 9, 10 and 11 to prepare the weather data. Cell 9 reads raw data from the landing layer and creates a local table called flight_weather_with_airport_code. Cell 10 transforms the data and cell 11 creates a global table called flight_weather_clean.
|
||||
|
||||
![](./media/02-One-Notebook-to-Rule-Them-All-4.png 'Run')
|
||||
!['Run'](./media/02-One-Notebook-to-Rule-Them-All-4.png)
|
||||
|
||||
5.6 Run the rest of the cells. Cell 14 copies the clean flight delay and weather data into the trusted layer of the data lake (1). Cell 16 saves the data of airports with delays into the logs folder as a CSV file (trusted layer) (2). Finally, the path of the CSV file becomes the notebook output (3).
|
||||
|
||||
![](./media/02-One-Notebook-to-Rule-Them-All-5.png 'Run')
|
||||
!['Run'](./media/02-One-Notebook-to-Rule-Them-All-5.png)
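In essence, cells 14 and 16 perform writes like the following sketch; the paths, table names, and formats are assumptions and may differ from the real notebook:

```
# Sketch of cells 14 and 16: persist curated data to the trusted layer and return the CSV path.
spark.table("flight_delays_clean").write.mode("overwrite").parquet("/mnt/trusted/flight-delays/")
spark.table("flight_weather_clean").write.mode("overwrite").parquet("/mnt/trusted/flight-weather/")

log_path = "/mnt/trusted/logs/airport-delays"
(spark.table("flight_delays_clean")
    .coalesce(1)                      # single CSV file for the log
    .write.mode("overwrite")
    .option("header", "true")
    .csv(log_path))

dbutils.notebook.exit(log_path)       # this path becomes the notebook (and ADF activity) output
```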
|
||||
|
||||
### **Task 5: Explore Azure Data Lake Storage**
|
||||
|
||||
|
@ -642,15 +657,15 @@ In this task, you will explore the layers defined to organize the data into the
|
|||
|
||||
2. On the Overview blade, select Containers
|
||||
|
||||
![](./media/lakedataopseastus2dev-overview.png 'Data Lake overview')
|
||||
!['Data Lake overview'](./media/lakedataopseastus2dev-overview.png)
|
||||
|
||||
3. Select and open the landing layer container.
|
||||
|
||||
![](./media/lakedataopseastus2dev-layers.png 'Containers')
|
||||
!['Containers'](./media/lakedataopseastus2dev-layers.png)
|
||||
|
||||
4. Select and open the directories airport-metadata, flight-delays and flight-weather. They contain CSV files with information about airports, flights and weather.
|
||||
|
||||
![](./media/lakedataopseastus2dev-layer-landing.png 'Landing layer')
|
||||
!['Landing layer'](./media/lakedataopseastus2dev-layer-landing.png)
|
||||
|
||||
### **Task 6: Azure Data Factory**
|
||||
|
||||
|
@ -658,15 +673,15 @@ In this task, you will explore the `adf-dataops-eastus2-dev` Azure Data Factory
|
|||
|
||||
1. Navigate to the `adf-dataops-eastus2-dev` Azure Data Factory instance and launch the workspace (Author & Monitor).
|
||||
|
||||
![](./media/adf-dataops-eastus2-dev-overview.png 'Azure Data Factory Overview')
|
||||
!['Azure Data Factory Overview'](./media/adf-dataops-eastus2-dev-overview.png)
|
||||
|
||||
2. Navigate to the Author hub.
|
||||
|
||||
![](./media/adf-dataops-eastus2-dev-workspace1.png 'Azure Data Factory Hub')
|
||||
!['Azure Data Factory Hub'](./media/adf-dataops-eastus2-dev-workspace1.png)
|
||||
|
||||
3. You will find the pipeline `ProcessFlightDelaysData` and 6 datasets. The pipeline contains the activities to copy data from the XXXXXXXSource datasets into the XXXXXXSink datasets.
|
||||
|
||||
![](./media/adf-dataops-eastus2-dev-author.PNG 'Author Hub')
|
||||
!['Author Hub'](./media/adf-dataops-eastus2-dev-author.png)
|
||||
|
||||
4. Open the pipeline `ProcessFlightDelaysData` and review the settings of the activities:
|
||||
- Copy Airport Codes Data
|
||||
|
@ -675,43 +690,43 @@ In this task, you will explore the `adf-dataops-eastus2-dev` Azure Data Factory
|
|||
- Mount ADLS
|
||||
- Transform Flights Data
|
||||
|
||||
![](./media/adf-dataops-eastus2-dev-process-data.PNG 'Pipeline')
|
||||
!['Pipeline'](./media/adf-dataops-eastus2-dev-process-data.png)
|
||||
|
||||
4.1. Select the Copy Airport Codes Data (1). Select the Source Tab (2) and Click on Open to see the settings of the AirportCodesSource dataset (3).
|
||||
|
||||
![](./media/copy-airport-codes.PNG 'Copy Airport Codes Data')
|
||||
!['Copy Airport Codes Data'](./media/copy-airport-codes.png)
|
||||
|
||||
4.2 Select Edit to review the Azure blob Storage linked service (1). View the file path that you want to copy (2). Select Browse to navigate into the `stgdataopseastus2dev` Azure Blob Storage instance (3) and Select the file path.
|
||||
|
||||
![](./media/airport-codes-source-csv.PNG 'Airport Codes Source dataset')
|
||||
!['Airport Codes Source dataset'](./media/airport-codes-source-csv.png)
|
||||
|
||||
4.3 Navigate back to the Copy Airport Codes Data Activity in the pipeline ProcessFlightDelaysData. Select the Sink tab (1) and Click on Open to see the setting of the AirportCodesSink dataset (2).
|
||||
|
||||
![](./media/copy-airport-codes-sink.PNG 'Sink')
|
||||
!['Sink'](./media/copy-airport-codes-sink.png)
|
||||
|
||||
4.4. Select Edit to review the Azure Data Lake linked service (1). View the layer where you will copy the data (2). Select Browse to navigate into the lakedataopseastus2dev Azure Data Lake instance (3) and select the layer (4).
|
||||
|
||||
![](./media/airport-codes-sync.PNG 'Airport dataset')
|
||||
!['Airport dataset'](./media/airport-codes-sync.png)
|
||||
|
||||
5. Repeat the steps 4.1 - 4.4 for the Copy Flights Delays Data and Copy Flights Weather Data activities.
|
||||
|
||||
6. Navigate back to the pipeline and select the notebook activity `Mount ADLS`. Select the Azure Databricks tab (1) and click on Edit to view the settings of the linked service of the Databricks instance.
|
||||
|
||||
![](./media/mount-adls-1.PNG 'notebook activity')
|
||||
!['notebook activity'](./media/mount-adls-1.png)
|
||||
|
||||
7. Select the settings tab of the notebook activity to configure the notebook to run in the databricks instance (1). In the Notebook path, indicate the path of the notebook to run (2). Select Browse if you want to explore the available notebooks (3) and explore the available folders in the Databricks instance (4). Select Open to open the Databricks workspace.
|
||||
|
||||
8. Repeat the steps 6 and 7 to explore the Notebook Activity Transform Flight Data.
|
||||
|
||||
![](./media/mount-adls-2.PNG 'notebook activity')
|
||||
!['notebook activity'](./media/mount-adls-2.png)
|
||||
|
||||
9. OPTIONAL - Navigate back to the pipeline and run it.
|
||||
|
||||
![](./media/pipeline-trigger.PNG 'Execute pipeline')
|
||||
!['Execute pipeline'](./media/pipeline-trigger.png)
|
||||
|
||||
9.1 Navigate to the Data Lake. Follow the file path that you indicated in the step 4.4. You will find the CSV file just copied.
|
||||
|
||||
![](./media/lakedataopseastus2dev-airport-metadata.png 'Exploring Data Lake')
|
||||
!['Exploring Data Lake'](./media/lakedataopseastus2dev-airport-metadata.png)
|
||||
|
||||
>**Note:** If you have arrived here, the dev environment was deployed successfully. Now the idea is to look in more detail at the lib and the tests.
|
||||
|
||||
|
@ -733,7 +748,7 @@ Open the **02 One Notebook to Rule them all** notebook (located in the Workspace
|
|||
* Prepare and apply data quality rules to the flight delays and weather data sets.
|
||||
* Transform and combine date columns between the flight delay and weather forecast data sets using the custom Python library (a hypothetical sketch of this step follows the image below).
|
||||
|
||||
![](./media/02-One-Notebook-to-Rule-Them-All-1.png 'Run')
|
||||
!['Run'](./media/02-One-Notebook-to-Rule-Them-All-1.png)
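The sketch below illustrates the kind of date handling performed when the two data sets are combined. The logic is written inline here and every column name is hypothetical; the real custom library exposes its own API.

```
# Hypothetical illustration of combining date/hour columns so flight delays can be joined
# with weather observations. Column names are assumptions, not the lab's real schema.
from pyspark.sql import functions as F

flight_delays = spark.table("flight_delays_clean")
weather = spark.table("flight_weather_clean")

delays_keyed = flight_delays.withColumn(
    "join_key", F.concat_ws("-", "Year", "Month", "DayofMonth", "CRSDepHour", "OriginAirportCode")
)
weather_keyed = weather.withColumn(
    "join_key", F.concat_ws("-", "Year", "Month", "Day", "Hour", "AirportCode")
)

combined = delays_keyed.join(weather_keyed, on="join_key", how="inner")
```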
|
||||
|
||||
>**IMPORTANT NOTE**
|
||||
_Please note that each of these tasks is addressed through several cells in the notebook. You don't need to change them; only execute and analyze the transformation operations._
|
||||
|
@ -976,31 +991,31 @@ First review how the DevOps pipeline was defined:
|
|||
|
||||
1. Go to the repository that was created as part of the quickstart and open the templates folder, where you will see 3 YAML files.
|
||||
|
||||
![](./media/templates-folder.PNG 'Templates Folder')
|
||||
!['Templates Folder'](./media/templates-folder.png)
|
||||
|
||||
2. Open the test.yml file by clicking on it
|
||||
|
||||
![](./media/select-test-yml.PNG 'Test yml')
|
||||
!['Test yml'](./media/select-test-yml.png)
|
||||
|
||||
3. Identify the script activity that runs the behave module and the different parameters that are set before it is called.
|
||||
|
||||
![](./media/behave-script.PNG 'behave activity')
|
||||
!['behave activity'](./media/behave-script.png)
|
||||
|
||||
Now let's review the DevOps pipeline execution results:
|
||||
|
||||
1. Go to DevOps Pipelines in the project defined in Exercise 4 and select the pipeline named "*\<your lab prefix>*-adf-cd" by clicking on it.
|
||||
|
||||
![](./media/last-pipeline-run.PNG 'Last Pipeline Run')
|
||||
!['Last Pipeline Run'](./media/last-pipeline-run.png)
|
||||
|
||||
2. You will see a list of recent runs of the selected pipeline; click on the latest run.
|
||||
|
||||
3. In the stages section, select the "Run behavior tests" stage.
|
||||
|
||||
![](./media/pipeline-stages-run.PNG 'Pipeline Stages')
|
||||
!['Pipeline Stages'](./media/pipeline-stages-run.png)
|
||||
|
||||
4. Review the Azure DevOps execution results for "Run behavior tests"\\"TEST: Run behave features"
|
||||
|
||||
![](./media/pipeline-run-results.PNG 'Pipeline Results')
|
||||
!['Pipeline Results'](./media/pipeline-run-results.png)
|
||||
|
||||
<p>Here you see the results of running the BDD test using <b>behave</b></p>
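For context, behave scenarios are backed by Python step functions. A minimal, hypothetical sketch of what such a step file can look like (the feature wording and checks are assumptions, not the lab's actual tests):

```
# steps/flight_data_steps.py -- hypothetical behave step definitions.
from behave import given, when, then


@given("the ADF pipeline has copied the flight delays file")
def step_given_file_copied(context):
    # The real tests would query the data lake; here we simulate a row count.
    context.row_count = 42


@when("I count the rows in the landing file")
def step_when_count_rows(context):
    context.result = context.row_count


@then("the row count is greater than zero")
def step_then_rows_present(context):
    assert context.result > 0
```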
|
||||
|
||||
|
@ -1023,27 +1038,27 @@ When the all pipelines were executed in development branch and you validate the
|
|||
|
||||
Open a PR from `develop` to `qa` to promote the code changes to the QA environment. Please wait again for the creation of the QA infrastructure.
|
||||
|
||||
![](./media/PRDEV2QA.PNG)
|
||||
![](./media/PRDEV2QA.png)
|
||||
|
||||
>**Note**: It will be necessary to modify the branch policies so the merge requires only one reviewer, who can be the owner; check `Allow requestors to approve their own changes` (only for the laboratory).
|
||||
|
||||
![](./media/branch-policies-own-owner.PNG)
|
||||
![](./media/branch-policies-own-owner.png)
|
||||
|
||||
![](./media/PRDEV2QA-1.PNG)
|
||||
![](./media/PRDEV2QA-1.png)
|
||||
|
||||
When you complete the merge, you will see that the IaC CI pipeline starts automatically.
|
||||
|
||||
![](./media/PRDEV2QA-2.PNG)
|
||||
![](./media/PRDEV2QA-2.png)
|
||||
|
||||
>**Note:** Remember to **[configure the scope in Databricks for the qa environment](#databricks-secrets-scope)** and run the lib pipeline for the `qa` environment.
|
||||
|
||||
![](./media/rcVersionlib.PNG)
|
||||
![](./media/rcVersionlib.png)
|
||||
|
||||
### **Task 2: Promote QA to Prod**
|
||||
|
||||
Repeat the process one last time, opening a **PR** from `qa` to `main` to promote the code changes to the **PROD environment**. Please wait again for the creation of the PROD infrastructure. In Artifacts you can see the final version of the library for production.
|
||||
|
||||
![](./media/Versionlib.PNG)
|
||||
![](./media/Versionlib.png)
|
||||
|
||||
### Additional references
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ In this task you will explore and understand the folder structure and scripts, t
|
|||
|
||||
To proceed with the execution of the other exercises below, you must understand the structure of the "infrastructure-as-code" folder, as well as its content of templates and scripts.
|
||||
|
||||
![](media/infrastructure-as-code-folder.PNG 'infrastructure as code')
|
||||
!['infrastructure as code'](media/infrastructure-as-code-folder.png)
|
||||
|
||||
```
|
||||
|infrastructure-as-code| ---> Principal folder
|
||||
|
@ -80,7 +80,7 @@ To proceed with the execution of the other exercises below, you must understand
|
|||
|
||||
## File: azuredeploy.json
|
||||
|
||||
![](media/iac-folder-infrastructure.PNG 'infrastructure-folder')
|
||||
!['infrastructure-folder'](media/iac-folder-infrastructure.png)
|
||||
|
||||
Main template, with declared parameters, variables and resources. Here we use linkedTemplates.
|
||||
*NOTE*: As a good practice when using IaC templates, we can keep parameters in separate files, so values can be changed without editing the main template directly.
|
||||
|
@ -93,7 +93,7 @@ To deploy complex solutions, you can break your Azure Resource Manager template
|
|||
|
||||
## Folder: linkedTemplates
|
||||
|
||||
![](media/iac-folder-linkedtemplates.PNG 'linkedTemplate-folder')
|
||||
!['linkedTemplate-folder'](media/iac-folder-linkedtemplates.png)
|
||||
|
||||
In linkedTemplates we have templates with "parts" of resources that are not declared in the main template, so they can be reused and linked from other templates.
|
||||
*NOTE*: linkedTemplates are a widely used practice for better organization and handling of templates for different resource types, and they can be linked from any template.
|
||||
|
@ -101,7 +101,7 @@ In linkedTemplates we have templates with "parts" of declared resources that are
|
|||
|
||||
## Sub-Folders and Files: linkedTemplates
|
||||
|
||||
![](media/iac-folder-linkedtemplates-subfolders.PNG 'linkedTemplate-sub-folders')
|
||||
!['linkedTemplate-sub-folders'](media/iac-folder-linkedtemplates-subfolders.png)
|
||||
|
||||
## File: template.json (subfolders 1, 2, 3)
|
||||
|
||||
|
@ -113,7 +113,7 @@ ML resources: Machine Learning Services
|
|||
|
||||
Example of a resource declaration in this template.
|
||||
|
||||
![](media/iac-linkedtemplates-template-compute.PNG 'lkd-template-compute')
|
||||
!['lkd-template-compute'](media/iac-linkedtemplates-template-compute.png)
|
||||
|
||||
## File: compute.json, data.json (subfolder 4)
|
||||
|
||||
|
@ -127,30 +127,30 @@ An Azure service principal is an identity created for use with applications, hos
|
|||
|
||||
Example of a resource declaration in this template.
|
||||
|
||||
![](media/iac-service-principal.PNG 'iac-service-principal')
|
||||
!['iac-service-principal'](media/iac-service-principal.png)
|
||||
|
||||
|
||||
## Folder: parameters
|
||||
|
||||
![](media/iac-folder-parameters.PNG 'parameters-folder')
|
||||
!['parameters-folder'](media/iac-folder-parameters.png)
|
||||
|
||||
The parameters folder contains template files with parameters and values used by the linkedTemplates and the main template, without the need to change the main template directly.
|
||||
*NOTE*: Using template parameter files is optional; values can be set directly in the main template. However, as a good practice, keeping them separate is recommended.
|
||||
|
||||
Example of a parameters declaration in this template.
|
||||
|
||||
![](media/parameters-dev-json.PNG 'iac-parameters')
|
||||
!['iac-parameters'](media/parameters-dev-json.png)
|
||||
|
||||
|
||||
# Folder [databricks]
|
||||
|
||||
In this folder you will find settings related to the Databricks resource that are used when executing the scripts (below) and provisioning your infrastructure and its required resources.
|
||||
|
||||
![](media/iac-folder-databricks.PNG 'iac-databricks')
|
||||
!['iac-databricks'](media/iac-folder-databricks.png)
|
||||
|
||||
Example of a configuration declaration in this template.
|
||||
|
||||
![](media/iac-file-corejson-databricks.PNG 'iac-databricks-corejson')
|
||||
!['iac-databricks-corejson'](media/iac-file-corejson-databricks.png)
|
||||
|
||||
# Folder [scripts]
|
||||
|
||||
|
@ -159,7 +159,7 @@ Some scripts are referenced with ARM templates, "calling" them to perform some n
|
|||
|
||||
However, there is a required execution order, as described in the next task.
|
||||
|
||||
![](media/iac-scripts.PNG 'iac-scripts')
|
||||
!['iac-scripts'](media/iac-scripts.png)
|
||||
|
||||
|
||||
# Folder [tests]
|
||||
|
@ -172,14 +172,14 @@ You can practice a little more on this topic in Exercise 5: Testing.
|
|||
|
||||
However, there is a required execution order, as described in the next task.
|
||||
|
||||
![](media/iac-folder-subfolder-tests.PNG 'iac-tests')
|
||||
!['iac-tests'](media/iac-folder-subfolder-tests.png)
|
||||
|
||||
|
||||
### Task 2: Creating a new sandbox environment with Powershell
|
||||
|
||||
In this task you will learn how to create your first sandbox environment with Azure PowerShell scripts.
|
||||
|
||||
![](media/iac-ordem-scripts.PNG 'iac-ordem-scripts')
|
||||
!['iac-ordem-scripts'](media/iac-ordem-scripts.png)
|
||||
|
||||
### Task 3: Checklist of IaC best practices
|
||||
|
||||
|
|
|
@ -55,12 +55,12 @@
|
|||
|
||||
**Setting up the Azure DevOps project:** before starting to execute the pipelines and the git workflow, it is necessary to create the environments in Azure DevOps for the IaC and Databricks environments.
|
||||
|
||||
![](media/environments-qa-prod.PNG)
|
||||
![](media/environments-qa-prod.png)
|
||||
|
||||
|
||||
>**Note**: Create environments for `qa`, `prod`, `databricks-qa` and `databricks-prod` in Azure DevOps before making any Pull Request (PR).
|
||||
|
||||
![](media/environments.PNG)
|
||||
![](media/environments.png)
|
||||
|
||||
|
||||
## Infrastructure as code git workflow
|
||||
|
@ -126,11 +126,11 @@
|
|||
|
||||
>**Note**: The `dataops` part of the name is the alias that you assigned to the project.
|
||||
|
||||
![](./media/pipelines.PNG)
|
||||
![](./media/pipelines.png)
|
||||
|
||||
The quickstart process creates the pipelines for IaC, the custom dataops library, Databricks, and Azure Data Factory. Now we will see the IaC pipelines.
|
||||
|
||||
![](./media/Pipelines-IaC.PNG)
|
||||
![](./media/Pipelines-IaC.png)
|
||||
|
||||
>**Note**: The `dataops` part of the name is the alias that you assigned to the project.
|
||||
|
||||
|
@ -140,23 +140,23 @@
|
|||
|
||||
## **Run CI Pipeline**:
|
||||
|
||||
![](./media/Run-CIPipeline-Iac.PNG)
|
||||
![](./media/Run-CIPipeline-Iac.png)
|
||||
|
||||
![](./media/CI-Iac.PNG)
|
||||
![](./media/CI-Iac.png)
|
||||
|
||||
This pipeline was executed manually, but the branch policies are configured to start it automatically if any change occurs in the `infrastructure-as-code` folder:
|
||||
|
||||
![](./media/branch-policies-builder.PNG)
|
||||
![](./media/branch-policies-builder.png)
|
||||
|
||||
|
||||
## **Run CD Pipeline**:
|
||||
|
||||
![](./media/Run-CDPipeline-Iac.PNG)
|
||||
![](./media/Run-CDPipeline-Iac.png)
|
||||
|
||||
When you execute the IaC CD pipeline, you can see in Azure DevOps that your environment status changes. When this pipeline finishes executing, you can verify that the resources were created in the resource groups of the development environment.
|
||||
|
||||
![](./media/RGComputeDev.PNG)
|
||||
![](./media/RGDataDev.PNG)
|
||||
![](./media/RGComputeDev.png)
|
||||
![](./media/RGDataDev.png)
|
||||
|
||||
>**Note**: The names of the resource groups and resources depend on the alias and the subscription ID.
|
||||
|
||||
|
@ -166,13 +166,13 @@
|
|||
databricks secrets list --scope dataops
|
||||
```
|
||||
|
||||
![](./media/scope-dataops.PNG)
|
||||
![](./media/scope-dataops.png)
|
||||
|
||||
# CI/CD Pipeline Library
|
||||
|
||||
Now we need to create the custom library used in the Databricks notebooks, so there is a CI and a CD pipeline for the lib. When these pipelines finish executing, you can see the artifact in the `lib-packages` feed that you created in [step 3 of the quickstart](../quickstart/docs/3a-azdo-setup-basic.md).
|
||||
|
||||
![](./media/Pipelines-lib.PNG)
|
||||
![](./media/Pipelines-lib.png)
|
||||
|
||||
>**Note**: The `vic` part of the name is the alias that you assigned to the project.
|
||||
|
||||
|
@ -180,11 +180,11 @@
|
|||
|
||||
Execute the CI pipeline of the library to create the version `alpha` of the library.
|
||||
|
||||
![](./media/Run-CIPipeline-lib.PNG)
|
||||
![](./media/Run-CIPipeline-lib.png)
|
||||
|
||||
When this pipeline finishes, you can see the version in Artifacts.
|
||||
|
||||
![](./media/alpbaVersionlib.PNG)
|
||||
![](./media/alpbaVersionlib.png)
|
||||
|
||||
>**Note**: The number in the version varies; it depends on the Build Id.
|
||||
|
||||
|
@ -192,11 +192,11 @@
|
|||
|
||||
In the CD pipeline you can see the different stages per environment. We will execute the CD pipeline to leave the `beta` version available to be used in the Databricks notebooks.
|
||||
|
||||
![](./media/Run-CDPipeline-lib.PNG)
|
||||
![](./media/Run-CDPipeline-lib.png)
|
||||
|
||||
When this pipeline finishes, you can see the version in Artifacts.
|
||||
|
||||
![](./media/betaVersionlib.PNG)
|
||||
![](./media/betaVersionlib.png)
|
||||
|
||||
>**Note**: The number in the version varies; it depends on the Build Id.
|
||||
|
||||
|
@ -204,49 +204,49 @@
|
|||
|
||||
Now you can see the pipelines that work with Databricks, covering both the custom library and the notebooks that will be executed in Databricks.
|
||||
|
||||
![](./media/Pipelines-databricks.PNG)
|
||||
![](./media/Pipelines-databricks.png)
|
||||
|
||||
## CI Pipeline
|
||||
|
||||
This pipeline checks the notebooks in Databricks.
|
||||
|
||||
![](./media/Run-CIPipeline-Databricks.PNG)
|
||||
![](./media/Run-CIPipeline-Databricks.png)
|
||||
|
||||
## CD Pipeline Lib
|
||||
|
||||
This pipeline uploads the current library version to the `dbfs` of Databricks.
|
||||
|
||||
![](./media/Run-CDPipeline-Databricks-Lib.PNG)
|
||||
![](./media/Run-CDPipeline-Databricks-Lib.png)
|
||||
|
||||
In the environments you can see that the status of `databricks-dev` changed.
|
||||
|
||||
![](./media/environments-DEV-Databricks.PNG)
|
||||
![](./media/environments-DEV-Databricks.png)
|
||||
|
||||
## CD Pipeline Notebooks
|
||||
|
||||
This pipeline uploads the current notebooks to the shared folder in Databricks.
|
||||
|
||||
![](./media/Run-CDPipeline-Databricks-Notebooks.PNG)
|
||||
![](./media/Run-CDPipeline-Databricks-Notebooks.png)
|
||||
|
||||
In the environments you can see that the status of `databricks-dev` changed.
|
||||
|
||||
![](./media/environments-DEV-Databricks-Notebooks.PNG)
|
||||
![](./media/environments-DEV-Databricks-Notebooks.png)
|
||||
|
||||
# CD Pipeline ADF
|
||||
|
||||
This pipeline checks the integrity of the data and triggers the ADF pipeline to identify problems in it, but the process does not wait for the ADF pipeline to finish.
|
||||
|
||||
![](./media/Pipelines-ADF.PNG)
|
||||
![](./media/Pipelines-ADF.png)
|
||||
|
||||
>**Note**: The first time this pipeline is executed it fails, because the ADF pipeline must finish successfully once to create some folders in the data lake container that are needed to check the integrity of the data.
|
||||
|
||||
![](./media/Run-CDPipeline-ADF.PNG)
|
||||
![](./media/Run-CDPipeline-ADF.png)
|
||||
|
||||
When the ADF pipeline finishes, you can execute this CD pipeline again. To check it, open the ADF resource in the Azure portal and monitor the pipeline run.
|
||||
|
||||
![](./media/ADFPipelineRunning.PNG)
|
||||
![](./media/ADFPipelineRunning.png)
|
||||
|
||||
![](./media/Run-CDPipeline-ADFGood.PNG)
|
||||
![](./media/Run-CDPipeline-ADFGood.png)
|
||||
|
||||
Now that you understand the workflow, you can start with the other environments.
|
||||
|
||||
|
@ -256,25 +256,25 @@
|
|||
|
||||
Open a PR from `develop` to `qa` to promote the code changes to the QA environment. Please wait again for the creation of the QA infrastructure.
|
||||
|
||||
![](./media/PRDEV2QA.PNG)
|
||||
![](./media/PRDEV2QA.png)
|
||||
|
||||
>**Note**: It will be necessary to modify the branch policies so the merge requires only one reviewer, who can be the owner; check `Allow requestors to approve their own changes` (only for the laboratory).
|
||||
|
||||
![](./media/branch-policies-own-owner.PNG)
|
||||
![](./media/branch-policies-own-owner.png)
|
||||
|
||||
![](./media/PRDEV2QA-1.PNG)
|
||||
![](./media/PRDEV2QA-1.png)
|
||||
|
||||
When you complete the merge, you will see that the IaC CI pipeline starts automatically.
|
||||
|
||||
![](./media/PRDEV2QA-2.PNG)
|
||||
![](./media/PRDEV2QA-2.png)
|
||||
|
||||
>**Note:** Remember to configure the scope and run the lib pipeline for the `qa` environment.
|
||||
|
||||
![](./media/rcVersionlib.PNG)
|
||||
![](./media/rcVersionlib.png)
|
||||
|
||||
Repeat the process one last time, opening a PR from `qa` to `main` to promote the code changes to the PROD environment. Please wait again for the creation of the PROD infrastructure. In Artifacts you can see the final version of the library for production.
|
||||
|
||||
![](./media/Versionlib.PNG)
|
||||
![](./media/Versionlib.png)
|
||||
|
||||
<br/><br/>
|
||||
|
||||
|
|
|
@ -77,7 +77,7 @@ At the end of this hands-on lab, you will be better able to implement an end-to-
|
|||
|
||||
Below is a diagram of the solution architecture you will deploy in this lab, leveraging several DataOps best practices.
|
||||
|
||||
![](media/high-level-overview-dataops.png 'Solution Architecture')
|
||||
!['Solution Architecture'](media/high-level-overview-dataops.png)
|
||||
|
||||
Explain each one of the repos that will be used for this workshop:
|
||||
- IaC
|
||||
|
@ -106,15 +106,15 @@ In this exercise, you will explore the main resources that have been deployed in
|
|||
|
||||
The resource groups rg-dataops-data-dev and rg-dataops-compute-dev contain data and compute services, respectively.
|
||||
|
||||
![](media/resource-groups.png 'Resource groups')
|
||||
!['Resource groups'](media/resource-groups.png)
|
||||
|
||||
The rg-dataops-data resource group contains a [Data Lake Storage](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) and a [Blob Storage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blobs-overview)
|
||||
|
||||
![](media/rg-dataops-data-dev.png 'Resource group: Data')
|
||||
!['Resource group: Data'](media/rg-dataops-data-dev.png)
|
||||
|
||||
The resource group rg-dataops-compute contains an instance of [Azure Data Factory](https://docs.microsoft.com/en-us/azure/data-factory/) and [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/)
|
||||
|
||||
![](media/rg-dataops-compute-dev.png 'Resource group: Compute')
|
||||
!['Resource group: Compute'](media/rg-dataops-compute-dev.png)
|
||||
|
||||
### Technology Overview
|
||||
|
||||
|
@ -132,15 +132,15 @@ In this task, you will explore the Azure Blob Storage instance.
|
|||
|
||||
2. On the overview blade, select Containers
|
||||
|
||||
![](media/stgdataopseastus2dev.png 'Blob Storage Overview')
|
||||
!['Blob Storage Overview'](media/stgdataopseastus2dev.png)
|
||||
|
||||
3. Select and open the flights-data container.
|
||||
|
||||
![](media/stgdataopseastus2dev-containers.png 'Containers')
|
||||
!['Containers'](media/stgdataopseastus2dev-containers.png)
|
||||
|
||||
4. Review the CSV files. Select the CSV file and download it.
|
||||
|
||||
![](media/stgdataopseastus2dev-airport-metadata.png 'Files')
|
||||
!['Files'](media/stgdataopseastus2dev-airport-metadata.png)
|
||||
|
||||
### Task 2: Explore Azure Data Lake Storage
|
||||
|
||||
|
@ -150,15 +150,15 @@ In this task, you will explore the layers defined to organize the data into the
|
|||
|
||||
2. On the Overview blade, select Containers
|
||||
|
||||
![](media/lakedataopseastus2dev-overview.png 'Data Lake overview')
|
||||
!['Data Lake overview'](media/lakedataopseastus2dev-overview.png)
|
||||
|
||||
3. Select and open the landing layer container.
|
||||
|
||||
![](media/lakedataopseastus2dev-layers.png 'Containers')
|
||||
!['Containers'](media/lakedataopseastus2dev-layers.png)
|
||||
|
||||
4. Select and open the directories airport-metadata, flight-delays and flight-weather. They contain CSV files with information about airports, flights and weather.
|
||||
|
||||
![](media/lakedataopseastus2dev-layer-landing.png 'Landing layer')
|
||||
!['Landing layer'](media/lakedataopseastus2dev-layer-landing.png)
|
||||
|
||||
### Task 3: Azure Databricks
|
||||
|
||||
|
@ -166,54 +166,54 @@ In this task, you will explore the Azure Databricks instance dbw-dataops-eastus2
|
|||
|
||||
1. Navigate to the Azure Databricks instance dbw-dataops-eastus2-dev and Launch the Workspace.
|
||||
|
||||
![](media/dbw-dataops-eastus2-dev-overview.png 'Databricks overview')
|
||||
!['Databricks overview'](media/dbw-dataops-eastus2-dev-overview.png)
|
||||
|
||||
2. Navigate to the Workspace hub (2). Open the folders shared with you (if someone shared the Databricks instance with you) or look for your user in Users (3). Open the DataOps folder (4) and select the notebook named 01 ADLS Mount (5).
|
||||
|
||||
![](media/dbw-dataops-eastus2-dev-ws.png 'Databricks workspace')
|
||||
!['Databricks workspace'](media/dbw-dataops-eastus2-dev-ws.png)
|
||||
|
||||
3. To run the notebook you need to attach a cluster from the list (1) or create a new one if you don't have any clusters deployed.
|
||||
|
||||
![](media/notebook-01-adls-mount.png 'Attach a cluster')
|
||||
!['Attach a cluster'](media/notebook-01-adls-mount.png)
|
||||
|
||||
3.1 Provide a name for the new cluster, establish the cluster setting and select Create Cluster.
|
||||
|
||||
![](media/dbw-dataops-new-cluster.png 'Creating a cluster')
|
||||
!['Creating a cluster'](media/dbw-dataops-new-cluster.png)
|
||||
|
||||
3.2 Navigate back to the notebook named 01 ADLS Mount and attach the cluster
|
||||
|
||||
![](media/dbw-dataops-attaching-cluster.png 'Creating a cluster') .PNG
|
||||
!['Creating a cluster'](media/dbw-dataops-attaching-cluster.png)
|
||||
|
||||
4. Select Run Cell or Ctrl + Enter to run the cell and mount the Azure Data Lake.
|
||||
This code mounts the Azure Data Lake Storage Gen2 account to the Databricks File System. For authentication, it uses Key Vault and OAuth 2.0.
|
||||
|
||||
![](media/notebook-01-adls-runcell.png 'Run')
|
||||
!['Run'](media/notebook-01-adls-runcell.png)
|
||||
|
||||
5. Navigate back to the notebook named 02 One Notebook to Rule Them All.
|
||||
|
||||
5.1 Run the cells to import the libraries that you will use to process and transform the data.
|
||||
|
||||
![](media/02-One-Notebook-to-Rule-Them-All-1.png 'Run')
|
||||
!['Run'](media/02-One-Notebook-to-Rule-Them-All-1.png)
|
||||
|
||||
5.2 Read the file FlightDelaysWithAirportCodes.csv from the landing layer (1), transform the data (2), and create a local table called flight_delays_with_airport_codes from the flight_delays_df DataFrame (3).
|
||||
|
||||
![](media/02-One-Notebook-to-Rule-Them-All-2.png 'Run')
|
||||
!['Run'](media/02-One-Notebook-to-Rule-Them-All-2.png)
|
||||
|
||||
5.3 Select clean columns to generate clean data (1) and save the clean data as a global table called flight_delays_clean (2).
|
||||
|
||||
![](media/02-One-Notebook-to-Rule-Them-All-3.png 'Run')
|
||||
!['Run'](media/02-One-Notebook-to-Rule-Them-All-3.png)
|
||||
|
||||
5.4 To see the created table: Click Data in the sidebar (1). In the databases folder, click on the default database (2). Open Tables Folder and Click the table name.
|
||||
|
||||
![](media/globaltable-flight_delays_view.png 'Run')
|
||||
!['Run'](media/globaltable-flight_delays_view.png)
|
||||
|
||||
5.5 Navigate back to the notebook. Run cells 9, 10 and 11 to prepare the weather data. Cell 9 reads raw data from the landing layer and creates a local table called flight_weather_with_airport_code. Cell 10 transforms the data and cell 11 creates a global table called flight_weather_clean.
|
||||
|
||||
![](media/02-One-Notebook-to-Rule-Them-All-4.png 'Run')
|
||||
!['Run'](media/02-One-Notebook-to-Rule-Them-All-4.png)
|
||||
|
||||
5.6 Run the rest of the cells. Cell 14 copies the clean flight delay and weather data into the trusted layer of the data lake (1). Cell 16 saves the data of airports with delays into the logs folder as a CSV file (trusted layer) (2). Finally, the path of the CSV file becomes the notebook output (3).
|
||||
|
||||
![](media/02-One-Notebook-to-Rule-Them-All-5.png 'Run')
|
||||
!['Run'](media/02-One-Notebook-to-Rule-Them-All-5.png)
|
||||
|
||||
### Task 4: Azure Data Factory
|
||||
|
||||
|
@ -221,15 +221,15 @@ In this task, you will explore the adf-dataops-eastus2-dev Azure Data Factory in
|
|||
|
||||
1. Navigate to the adf-dataops-eastus2-dev Azure Data Factory instance and launch the workspace (Author & Monitor).
|
||||
|
||||
![](media/adf-dataops-eastus2-dev-overview.png 'Azure Data Factory Overview')
|
||||
!['Azure Data Factory Overview'](media/adf-dataops-eastus2-dev-overview.png)
|
||||
|
||||
2. Navigate to the Author hub.
|
||||
|
||||
![](media/adf-dataops-eastus2-dev-workspace1.png 'Azure Data Factory Hub')
|
||||
!['Azure Data Factory Hub'](media/adf-dataops-eastus2-dev-workspace1.png)
|
||||
|
||||
3. You will find the pipeline ProcessFlightDelaysData and 6 datasets. The pipeline contains the activities to copy data from the XXXXXXXSource datasets into the XXXXXXSink datasets.
|
||||
|
||||
![](media/adf-dataops-eastus2-dev-author.PNG 'Author Hub')
|
||||
!['Author Hub'](media/adf-dataops-eastus2-dev-author.png)
|
||||
|
||||
4. Open the pipeline ProcessFlightDelaysData and review the settings of the activities:
|
||||
- Copy Airport Codes Data
|
||||
|
@ -238,43 +238,43 @@ In this task, you will explore the adf-dataops-eastus2-dev Azure Data Factory in
|
|||
- Mount ADLS
|
||||
- Transform Flights Data
|
||||
|
||||
![](media/adf-dataops-eastus2-dev-process-data.PNG 'Pipeline')
|
||||
!['Pipeline'](media/adf-dataops-eastus2-dev-process-data.png)
|
||||
|
||||
4.1. Select the Copy Airport Codes Data (1). Select the Source Tab (2) and Click on Open to see the settings of the AirportCodesSource dataset (3).
|
||||
|
||||
![](media/copy-airport-codes.PNG 'Copy Airport Codes Data')
|
||||
!['Copy Airport Codes Data'](media/copy-airport-codes.png)
|
||||
|
||||
4.2 Select Edit to review the Azure blob Storage linked service (1). View the file path that you want to copy (2). Select Browse to navigate into the stgdataopseastus2dev Azure Blob Storage instance (3) and Select the file path.
|
||||
|
||||
![](media/airport-codes-source-csv.PNG 'Airport Codes Source dataset')
|
||||
!['Airport Codes Source dataset'](media/airport-codes-source-csv.png)
|
||||
|
||||
4.3 Navigate back to the Copy Airport Codes Data Activity in the pipeline ProcessFlightDelaysData. Select the Sink tab (1) and Click on Open to see the setting of the AirportCodesSink dataset (2).
|
||||
|
||||
![](media/copy-airport-codes-sink.PNG 'Sink')
|
||||
!['Sink'](media/copy-airport-codes-sink.png)
|
||||
|
||||
4.4. Select Edit to review the Azure Data Lake linked service (1). View the layer where you will copy the data (2). Select Browse to navigate into the lakedataopseastus2dev Azure Data Lake instance (3) and select the layer (4).
|
||||
|
||||
![](media/airport-codes-sync.PNG 'Airport dataset')
|
||||
!['Airport dataset'](media/airport-codes-sync.png)
|
||||
|
||||
5. Repeat the steps 4.1 - 4.4 for the Copy Flights Delays Data and Copy Flights Weather Data activities.
|
||||
|
||||
6. Navigate back to the pipeline and select the notebook activity Mount ADLS. Select the Azure Databricks tab (1) and click on Edit to view the settings of the linked service of the Databricks instance.
|
||||
|
||||
![](media/mount-adls-1.PNG 'notebook activity')
|
||||
!['Notebook activity'](media/mount-adls-1.png )
|
||||
|
||||
7. Select the settings tab of the notebook activity to configure the notebook to run in the databricks instance (1). In the Notebook path, indicate the path of the notebook to run (2). Select Browse if you want to explore the available notebooks (3) and explore the available folders in the Databricks instance (4). Select Open to open the Databricks workspace.
|
||||
|
||||
8. Repeat the steps 6 and 7 to explore the Notebook Activity Transform Flight Data.
|
||||
|
||||
![](media/mount-adls-2.PNG 'notebook activity')
|
||||
!['Notebook activity'](media/mount-adls-2.png)
|
||||
|
||||
9. OPTIONAL - Navigate back to the pipeline and run it.
|
||||
|
||||
![](media/pipeline-trigger.PNG 'Execute pipeline')
|
||||
!['Execute pipeline'](media/pipeline-trigger.png)
|
||||
|
||||
9.1 Navigate to the Data Lake. Follow the file path that you indicated in the step 4.4. You will find the CSV file just copied.
|
||||
|
||||
![](media/lakedataopseastus2dev-airport-metadata.png 'Exploring Data Lake')
|
||||
!['Exploring Data Lake'](media/lakedataopseastus2dev-airport-metadata.png)
|
||||
|
||||
## Exercise 2: Infrastructure As Code
|
||||
|
||||
|
@ -292,7 +292,7 @@ In this task you will explore and understand the folder structure and scripts, t
|
|||
|
||||
To proceed with the execution of the other exercises below, you must understand the structure of the "infrastructure-as-code" folder, as well as the templates and scripts it contains.
|
||||
|
||||
![](media/infrastructure-as-code-folder.PNG 'infrastructure as code')
|
||||
!['infrastructure as code'](media/infrastructure-as-code-folder.png)
|
||||
|
||||
```
|
||||
|infrastructure-as-code|
|
||||
|
@ -809,7 +809,7 @@ Main template, with declared parameters, variables and resources. Here we use li
|
|||
In linkedTemplates we have templates with "parts" of resource declarations that are not in the main template, so they can be reused and linked from other templates.

*NOTE*: Using linkedTemplates is a widely adopted practice: it keeps templates for different resource types organized and lets you link them from any other template.
|
||||
|
||||
![](media/compute-template-json.PNG 'compute-linkedTemplate')
|
||||
!['compute-linkedTemplate'](media/compute-template-json.png)
|
||||
|
||||
# Folder: parameters
|
||||
```
|
||||
|
@ -825,7 +825,7 @@ In linkedTemplates we have templates with "parts" of declared resources that are
|
|||
The parameters folder is a directory with template files containing the parameters and values used by the linkedTemplates and the main template, so there is no need to change the main template directly.

*NOTE*: Using template parameter files is optional; values can be set directly in the main template. However, as a good practice, keeping them in separate files is recommended.
|
||||
|
||||
![](media/parameters-dev-json.PNG 'parameters-dev-json')
|
||||
!['parameters-dev-json'](media/parameters-dev-json.png)
|
||||
|
||||
### Task 2: Creating a new sandbox environment with Powershell
|
||||
|
||||
|
@ -988,31 +988,31 @@ First review how the DevOps pipeline was defined:
|
|||
|
||||
1. Go to the repository that was created as part of Exercise 3, Task # and open the templates folder, where you will see 3 yml files.
|
||||
|
||||
![](media/templates-folder.PNG 'Templastes Folder')
|
||||
!['Templates Folder'](media/templates-folder.png)
|
||||
|
||||
2. Open the test.yml file by clicking on it
|
||||
|
||||
![](media/select-test-yml.PNG 'Test yml')
|
||||
!['Test yml'](media/select-test-yml.png)
|
||||
|
||||
3. Identify the script activity that runs the behave module and identify the different parameters that are set before it is called.
|
||||
|
||||
![](media/behave-script.PNG 'behave activity')
|
||||
!['Behave activity'](media/behave-script.png)
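The exact step definition lives in test.yml; as a rough, hedged sketch (the variable names and the feature-file path below are illustrative, not taken from the repo), a behavior-test step usually boils down to something like this:

```powershell
# Illustrative only — the real test.yml sets its own variables and paths.
$env:ADF_NAME       = "adf-dataops-eastus2-dev"   # assumed variable name
$env:RESOURCE_GROUP = "rg-dataops-compute-dev"    # assumed variable name

# Install behave and run the feature files, publishing JUnit results for the pipeline.
pip install behave
behave tests/ --junit --junit-directory test-results   # feature-file path is assumed
```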
|
||||
|
||||
Now let's review the DevOps pipeline execution results:
|
||||
|
||||
1. Go to DevOps Pipelines from the project defined in Exercise 3 and select the pipeline named "*\<your lab prefix>*-adf-cd" by clicking on it.
|
||||
|
||||
![](media/last-pipeline-run.PNG 'Last Pipeline Run')
|
||||
!['Last Pipeline Run'](media/last-pipeline-run.png)
|
||||
|
||||
2. You will see a list of recent runs of the selected pipeline; click on the latest run.
|
||||
|
||||
3. In the stages section, select the "Run behavior tests" stage.
|
||||
|
||||
![](media/pipeline-stages-run.PNG 'Pipeline Stages')
|
||||
!['Pipeline Stages'](media/pipeline-stages-run.png)
|
||||
|
||||
4. Review the Azure DevOps execution results for "Run behavior tests"\\"TEST: Run behave features"
|
||||
|
||||
![](media/pipeline-run-results.PNG 'Pipeline Results')
|
||||
!['Pipeline Results'](media/pipeline-run-results.png)
|
||||
|
||||
<p>Here you see the results of running the BDD test using <b>behave</b></p>
|
||||
|
||||
|
|
|
@ -55,7 +55,7 @@ At the end of this hands-on lab, you will be better able to implement an end-to-
|
|||
|
||||
Below is a diagram of the solution architecture you will deploy in this lab, leveraging several DataOps best practices.
|
||||
|
||||
![](media/high-level-overview-dataops.png 'Solution Architecture')
|
||||
!['Solution Architecture'](media/high-level-overview-dataops.png)
|
||||
|
||||
Explain each one of the repos that will be used for this workshop:
|
||||
- IaC
|
||||
|
@ -84,15 +84,15 @@ In this exercise, you will explore the main resources that have been deployed in
|
|||
|
||||
The resource groups rg-dataops-data-dev and rg-dataops-compute-dev contain data and compute services, respectively.
|
||||
|
||||
![](media/resource-groups.png 'Resource groups')
|
||||
!['Resource groups'](media/resource-groups.png)
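If you prefer to check the deployment from the command line, a minimal Azure PowerShell sketch (assuming the default resource group names used in this lab) lists what each group contains:

```powershell
# List the data and compute resources deployed for the dev environment.
Connect-AzAccount   # sign in first if there is no active session

Get-AzResource -ResourceGroupName "rg-dataops-data-dev" |
    Select-Object Name, ResourceType, Location

Get-AzResource -ResourceGroupName "rg-dataops-compute-dev" |
    Select-Object Name, ResourceType, Location
```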
|
||||
|
||||
The rg-dataops-data resource group contains a [Data Lake Storage](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) and a [Blob Storage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blobs-overview).
|
||||
|
||||
![](media/rg-dataops-data-dev.png 'Resource group: Data')
|
||||
!['Resource group: Data'](media/rg-dataops-data-dev.png)
|
||||
|
||||
The resource group rg-dataops-compute contains an instance of [Azure Data Factory](https://docs.microsoft.com/en-us/azure/data-factory/) and [Azure Databricks](https://docs.microsoft.com/en-us/azure/databricks/).
|
||||
|
||||
![](media/rg-dataops-compute-dev.png 'Resource group: Compute')
|
||||
!['Resource group: Compute'](media/rg-dataops-compute-dev.png)
|
||||
|
||||
### Technology Overview
|
||||
|
||||
|
@ -110,15 +110,15 @@ In this task, you will explore the Azure Blob Storage instance.
|
|||
|
||||
2. On the overview blade, select Containers
|
||||
|
||||
![](media/stgdataopseastus2dev.png 'Blob Storage Overview')
|
||||
!['Blob Storage Overview'](media/stgdataopseastus2dev.png)
|
||||
|
||||
3. Select and open the flights-data container.
|
||||
|
||||
![](media/stgdataopseastus2dev-containers.png 'Containers')
|
||||
!['Containers'](media/stgdataopseastus2dev-containers.png)
|
||||
|
||||
4. Review the CSV files. Select the CSV file and download it.
|
||||
|
||||
![](media/stgdataopseastus2dev-airport-metadata.png 'Files')
|
||||
!['Files'](media/stgdataopseastus2dev-airport-metadata.png)
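The same files can also be listed and downloaded with Azure PowerShell; a minimal sketch, assuming you have read access to the stgdataopseastus2dev storage account (the blob name used below is illustrative):

```powershell
# Browse and download the raw CSVs from the flights-data container.
$storage = Get-AzStorageAccount -ResourceGroupName "rg-dataops-data-dev" -Name "stgdataopseastus2dev"
$ctx = $storage.Context

# List the blobs in the container.
Get-AzStorageBlob -Container "flights-data" -Context $ctx | Select-Object Name, Length

# Download one of them locally (blob name is illustrative).
Get-AzStorageBlobContent -Container "flights-data" -Blob "FlightDelaysWithAirportCodes.csv" `
    -Destination "." -Context $ctx
```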
|
||||
|
||||
### Task 2: Explore Azure Data Lake Storage
|
||||
|
||||
|
@ -128,15 +128,15 @@ In this task, you will explore the layers defined to organize the data into the
|
|||
|
||||
2. On the Overview blade, select Containers
|
||||
|
||||
![](media/lakedataopseastus2dev-overview.png 'Data Lake overview')
|
||||
!['Data Lake overview'](media/lakedataopseastus2dev-overview.png)
|
||||
|
||||
3. Select and open the landing layer container.
|
||||
|
||||
![](media/lakedataopseastus2dev-layers.png 'Containers')
|
||||
!['Containers'](media/lakedataopseastus2dev-layers.png)
|
||||
|
||||
4. Select and open the directories airport-metadata, flight-delays, and flight-weather. They contain CSV files with the information about airports, flights, and weather.
|
||||
|
||||
![](media/lakedataopseastus2dev-layer-landing.png 'Landing layer')
|
||||
!['Landing layer'](media/lakedataopseastus2dev-layer-landing.png)
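The landing layer can also be inspected with the Data Lake Gen2 cmdlets; a minimal sketch, assuming the landing layer container is named landing:

```powershell
# List the directories and files in the landing layer of the data lake.
$lake = Get-AzStorageAccount -ResourceGroupName "rg-dataops-data-dev" -Name "lakedataopseastus2dev"

Get-AzDataLakeGen2ChildItem -FileSystem "landing" -Context $lake.Context -Recurse |
    Select-Object Path, IsDirectory, Length
```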
|
||||
|
||||
### Task 3: Azure Databricks
|
||||
|
||||
|
@ -144,54 +144,54 @@ In this task, you will explore the Azure Databricks instance dbw-dataops-eastus2
|
|||
|
||||
1. Navigate to the Azure Databricks instance dbw-dataops-eastus2-dev and Launch the Workspace.
|
||||
|
||||
![](media/dbw-dataops-eastus2-dev-overview.png 'Databricks overview')
|
||||
!['Databricks overview'](media/dbw-dataops-eastus2-dev-overview.png)
|
||||
|
||||
2. Navigate to the Workspace hub (2). Open the folders shared with you (if someone shared the Databricks instance with you) or look for your user in Users (3). Open the DataOps folder (4) and select the notebook named 01 ADLS Mount (5).
|
||||
|
||||
![](media/dbw-dataops-eastus2-dev-ws.png 'Databricks workspace')
|
||||
!['Databricks workspace'](media/dbw-dataops-eastus2-dev-ws.png)
|
||||
|
||||
3. To run the notebook you need to attach a cluster from the list (1) or create a new one if you don't have any clusters deployed.
|
||||
|
||||
![](media/notebook-01-adls-mount.png 'Attach a cluster')
|
||||
!['Attach a cluster'](media/notebook-01-adls-mount.png)
|
||||
|
||||
3.1 Provide a name for the new cluster, define the cluster settings, and select Create Cluster.
|
||||
|
||||
![](media/dbw-dataops-new-cluster.png 'Creating a cluster')
|
||||
!['Creating a cluster'](media/dbw-dataops-new-cluster.png)
|
||||
|
||||
3.2 Navigate back to the notebook named 01 ADLS Mount and attach the cluster
|
||||
|
||||
![](media/dbw-dataops-attaching-cluster.png 'Creating a cluster')
|
||||
!['Creating a cluster'](media/dbw-dataops-attaching-cluster.png)
|
||||
|
||||
4. Select Run Cell or press Ctrl + Enter to run the cell and mount the Azure Data Lake.
This code mounts the Azure Data Lake Storage Gen2 account to the Databricks File System. For authentication, it uses Key Vault and OAuth 2.0.
|
||||
|
||||
![](media/notebook-01-adls-runcell.png 'Run')
|
||||
!['Run'](media/notebook-01-adls-runcell.png)
|
||||
|
||||
5. Navigate back to the notebook named 02 One Notebook to Rule Them All.
|
||||
|
||||
5.1 Run the cells to import the libraries that you will use to process and transform the data.
|
||||
|
||||
![](media/02-One-Notebook-to-Rule-Them-All-1.png 'Run')
|
||||
!['Run'](media/02-One-Notebook-to-Rule-Them-All-1.png)
|
||||
|
||||
5.2 Read the file FlightDelaysWithAirportCodes.csv from the landing layer (1), transform the data (2), and create a local table called flight_delays_with_airport_codes from the flight_delays_df DataFrame (3).
|
||||
|
||||
![](media/02-One-Notebook-to-Rule-Them-All-2.png 'Run')
|
||||
!['Run'](media/02-One-Notebook-to-Rule-Them-All-2.png)
|
||||
|
||||
5.3 Select clean columns to generate clean data (1) and save the clean data as a global table called flight_delays_clean (2).
|
||||
|
||||
![](media/02-One-Notebook-to-Rule-Them-All-3.png 'Run')
|
||||
!['Run'](media/02-One-Notebook-to-Rule-Them-All-3.png)
|
||||
|
||||
5.4 To see the created table: click Data in the sidebar (1). In the databases folder, click on the default database (2). Open the Tables folder and click the table name.
|
||||
|
||||
![](media/globaltable-flight_delays_view.png 'Run')
|
||||
!['Run'](media/globaltable-flight_delays_view.png)
|
||||
|
||||
5.5 Navigate back to the notebook. Run cells 9, 10 and 11 to prepare the weather data. Cell 9 reads raw data from the landing layer and creates a local table called flight_weather_with_airport_code. Cell 10 transforms the data and Cell 11 creates a global table called flight_weather_clean.
|
||||
|
||||
![](media/02-One-Notebook-to-Rule-Them-All-4.png 'Run')
|
||||
!['Run'](media/02-One-Notebook-to-Rule-Them-All-4.png)
|
||||
|
||||
5.6 Run the rest of the cells. Cell 14 copies the clean flight delay and weather data into the trusted layer of the data lake (1). Cell 16 saves the data of airports with delays into the logs folder as a CSV file (trusted layer) (2). Finally, the path of the CSV file will be the notebook output (3).
|
||||
|
||||
![](media/02-One-Notebook-to-Rule-Them-All-5.png 'Run')
|
||||
!['Run'](media/02-One-Notebook-to-Rule-Them-All-5.png)
|
||||
|
||||
### Task 4: Azure Data Factory
|
||||
|
||||
|
@ -199,15 +199,15 @@ In this task, you will explore the adf-dataops-eastus2-dev Azure Data Factory in
|
|||
|
||||
1. Navigate to the adf-dataops-eastus2-dev Azure Data Factory instance and launch the workspace (Author & Monitor).
|
||||
|
||||
![](media/adf-dataops-eastus2-dev-overview.png 'Azure Data Factory Overview')
|
||||
!['Azure Data Factory Overview'](media/adf-dataops-eastus2-dev-overview.png)
|
||||
|
||||
2. Navigate to the Author hub.
|
||||
|
||||
![](media/adf-dataops-eastus2-dev-workspace1.png 'Azure Data Factory Hub')
|
||||
!['Azure Data Factory Hub'](media/adf-dataops-eastus2-dev-workspace1.png)
|
||||
|
||||
3. You will find the pipeline ProcessFlightDelaysData and 6 datasets. The pipeline contains the activities to copy data from the XXXXXXXSource datasets into the XXXXXXSink datasets.
|
||||
|
||||
![](media/adf-dataops-eastus2-dev-author.PNG 'Author Hub')
|
||||
!['Author Hub'](media/adf-dataops-eastus2-dev-author.png)
|
||||
|
||||
4. Open the pipeline ProcessFlightDelaysData and review the settings of the activities:
|
||||
- Copy Airport Codes Data
|
||||
|
@ -216,43 +216,43 @@ In this task, you will explore the adf-dataops-eastus2-dev Azure Data Factory in
|
|||
- Mount ADLS
|
||||
- Transform Flights Data
|
||||
|
||||
![](media/adf-dataops-eastus2-dev-process-data.PNG 'Pipeline')
|
||||
!['Pipeline'](media/adf-dataops-eastus2-dev-process-data.png)
|
||||
|
||||
4.1. Select the Copy Airport Codes Data (1). Select the Source Tab (2) and Click on Open to see the settings of the AirportCodesSource dataset (3).
|
||||
|
||||
![](media/copy-airport-codes.PNG 'Copy Airport Codes Data')
|
||||
!['Copy Airport Codes Data'](media/copy-airport-codes.png)
|
||||
|
||||
4.2 Select Edit to review the Azure blob Storage linked service (1). View the file path that you want to copy (2). Select Browse to navigate into the stgdataopseastus2dev Azure Blob Storage instance (3) and Select the file path.
|
||||
|
||||
![](media/airport-codes-source-csv.PNG 'Airport Codes Source dataset')
|
||||
!['Airport Codes Source dataset'](media/airport-codes-source-csv.png)
|
||||
|
||||
4.3 Navigate back to the Copy Airport Codes Data Activity in the pipeline ProcessFlightDelaysData. Select the Sink tab (1) and Click on Open to see the setting of the AirportCodesSink dataset (2).
|
||||
|
||||
![](media/copy-airport-codes-sink.PNG 'Sink')
|
||||
!['Sink'](media/copy-airport-codes-sink.png)
|
||||
|
||||
4.4. Select Edit to review the Azure Data Lake linked service (1). View the layer where you will copy the data (2). Select Browse to navigate into the lakedataopseastus2dev Azure Data Lake instance (3) and select the layer (4).
|
||||
|
||||
![](media/airport-codes-sync.PNG 'Airport dataset')
|
||||
!['Airport dataset'](media/airport-codes-sync.png)
|
||||
|
||||
5. Repeat the steps 4.1 - 4.4 for the Copy Flights Delays Data and Copy Flights Weather Data activities.
|
||||
|
||||
6. Navigate back to the pipeline and select the notebook activity Mount ADLS. Select the Azure Databricks tab (1) and click on Edit to view the settings of the linked service of the Databricks instance.
|
||||
|
||||
![](media/mount-adls-1.PNG 'notebook activity')
|
||||
!['Notebook activity'](media/mount-adls-1.png)
|
||||
|
||||
7. Select the settings tab of the notebook activity to configure the notebook to run in the databricks instance (1). In the Notebook path, indicate the path of the notebook to run (2). Select Browse if you want to explore the available notebooks (3) and explore the available folders in the Databricks instance (4). Select Open to open the Databricks workspace.
|
||||
|
||||
8. Repeat the steps 6 and 7 to explore the Notebook Activity Transform Flight Data.
|
||||
|
||||
![](media/mount-adls-2.PNG 'notebook activity')
|
||||
!['Notebook activity'](media/mount-adls-2.png)
|
||||
|
||||
9. OPTIONAL - Navigate back to the pipeline and run it.
|
||||
|
||||
![](media/pipeline-trigger.PNG 'Execute pipeline')
|
||||
!['Execute pipeline'](media/pipeline-trigger.png)
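Instead of using the trigger in the workspace UI, the run can also be started from Azure PowerShell; a minimal sketch with the resource names used in this lab:

```powershell
# Start a run of the ProcessFlightDelaysData pipeline and check its status.
$runId = Invoke-AzDataFactoryV2Pipeline -ResourceGroupName "rg-dataops-compute-dev" `
    -DataFactoryName "adf-dataops-eastus2-dev" -PipelineName "ProcessFlightDelaysData"

Get-AzDataFactoryV2PipelineRun -ResourceGroupName "rg-dataops-compute-dev" `
    -DataFactoryName "adf-dataops-eastus2-dev" -PipelineRunId $runId
```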
|
||||
|
||||
9.1 Navigate to the Data Lake. Follow the file path that you indicated in the step 4.4. You will find the CSV file just copied.
|
||||
|
||||
![](media/lakedataopseastus2dev-airport-metadata.png 'Exploring Data Lake')
|
||||
!['Exploring Data Lake'](media/lakedataopseastus2dev-airport-metadata.png)
|
||||
|
||||
|
||||
## Exercise 2: Infrastructure As Code
|
||||
|
@ -277,7 +277,7 @@ In this task you will explore and understand the folder structure and scripts, t
|
|||
|
||||
To proceed with the execution of the other exercises below, you must understand the structure of the "infrastructure-as-code" folder, as well as the templates and scripts it contains.
|
||||
|
||||
![](media/infrastructure-as-code-folder.PNG 'infrastructure as code')
|
||||
!['infrastructure as code'](media/infrastructure-as-code-folder.png)
|
||||
|
||||
```
|
||||
|infrastructure-as-code| ---> Principal folder
|
||||
|
@ -337,7 +337,7 @@ To proceed with the execution of the other exercises below, you must understand
|
|||
|
||||
## File: azuredeploy.json
|
||||
|
||||
![](media/iac-folder-infrastructure.PNG 'infrastructure-folder')
|
||||
!['infrastructure-folder'](media/iac-folder-infrastructure.png)
|
||||
|
||||
Main template, with declared parameters, variables and resources. Here we use linkedTemplates.

*NOTE*: We have the option of using separate parameter files as a good practice when using IaC templates, so there is no need to change the main template directly.
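To check what the main template would create before actually deploying it, a minimal sketch — the paths assume the folder layout shown above, and the target resource group name is illustrative (the lab's own scripts may deploy at a different scope):

```powershell
# Validate azuredeploy.json against the dev parameters file; -WhatIf shows what would change.
New-AzResourceGroupDeployment -ResourceGroupName "rg-dataops-data-dev" `
    -TemplateFile "infrastructure-as-code/infrastructure/azuredeploy.json" `
    -TemplateParameterFile "infrastructure-as-code/infrastructure/parameters/parameters.dev.json" `
    -WhatIf
```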
|
||||
|
@ -350,7 +350,7 @@ To deploy complex solutions, you can break your Azure Resource Manager template
|
|||
|
||||
## Folder: linkedTemplates
|
||||
|
||||
![](media/iac-folder-linkedtemplates.PNG 'linkedTemplate-folder')
|
||||
!['linkedTemplate-folder'](media/iac-folder-linkedtemplates.png)
|
||||
|
||||
In linkedTemplates we have templates with "parts" of resource declarations that are not in the main template, so they can be reused and linked from other templates.

*NOTE*: Using linkedTemplates is a widely adopted practice: it keeps templates for different resource types organized and lets you link them from any other template.
|
||||
|
@ -358,7 +358,7 @@ In linkedTemplates we have templates with "parts" of declared resources that are
|
|||
|
||||
## Sub-Folders and Files: linkedTemplates
|
||||
|
||||
![](media/iac-folder-linkedtemplates-subfolders.PNG 'linkedTemplate-sub-folders')
|
||||
!['linkedTemplate-sub-folders'](media/iac-folder-linkedtemplates-subfolders.png)
|
||||
|
||||
## File: template.json (subfolders 1, 2, 3)
|
||||
|
||||
|
@ -370,7 +370,7 @@ ML resources: Machine Learning Services
|
|||
|
||||
Example of a resource declaration in this template.
|
||||
|
||||
![](media/iac-linkedtemplates-template-compute.PNG 'lkd-template-compute')
|
||||
!['lkd-template-compute'](media/iac-linkedtemplates-template-compute.png)
|
||||
|
||||
## File: compute.json, data.json (subfolder 4)
|
||||
|
||||
|
@ -384,30 +384,30 @@ An Azure service principal is an identity created for use with applications, hos
|
|||
|
||||
Example of a resource declaration in this template.
|
||||
|
||||
![](media/iac-service-principal.PNG 'iac-service-principal')
|
||||
!['iac-service-principal'](media/iac-service-principal.png)
|
||||
|
||||
|
||||
## Folder: parameters
|
||||
|
||||
![](media/iac-folder-parameters.PNG 'parameters-folder')
|
||||
!['parameters-folder'](media/iac-folder-parameters.png)
|
||||
|
||||
The parameters folder is a directory with template files containing the parameters and values used by the linkedTemplates and the main template, so there is no need to change the main template directly.

*NOTE*: Using template parameter files is optional; values can be set directly in the main template. However, as a good practice, keeping them in separate files is recommended.
|
||||
|
||||
Example of a parameters declaration in this template.
|
||||
|
||||
![](media/parameters-dev-json.PNG 'iac-parameters')
|
||||
!['iac-parameters'](media/parameters-dev-json.png)
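Automation can read these parameter files directly instead of hard-coding values; a minimal sketch, using the dev parameters file referenced above (the same approach the lab scripts use):

```powershell
# Read a couple of values from the dev parameters file.
$parametersFile = "./infrastructure-as-code/infrastructure/parameters/parameters.dev.json"
$parameters = (Get-Content -Path $parametersFile | ConvertFrom-Json).parameters

$dataResourceGroup = $parameters.resourceGroupData.value
$servicePrincipal  = $parameters.servicePrincipal.value

Write-Host "Data resource group: $dataResourceGroup"
Write-Host "Service principal:   $servicePrincipal"
```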
|
||||
|
||||
|
||||
# Folder [databricks]
|
||||
|
||||
In this folder you will find declared settings related to the Databricks resource, which are used when executing the scripts (below) and provisioning your infrastructure and its required resources.
|
||||
|
||||
![](media/iac-folder-databricks.PNG 'iac-databricks')
|
||||
!['iac-databricks'](media/iac-folder-databricks.png)
|
||||
|
||||
Example of a configuration declaration in this template.
|
||||
|
||||
![](media/iac-file-corejson-databricks.PNG 'iac-databricks-corejson')
|
||||
!['iac-databricks-corejson'](media/iac-file-corejson-databricks.png)
|
||||
|
||||
# Folder [scripts]
|
||||
|
||||
|
@ -416,7 +416,7 @@ Some scripts are referenced with ARM templates, "calling" them to perform some n
|
|||
|
||||
However, there is a correct order for this execution, as described in the next task.
|
||||
|
||||
![](media/iac-scripts.PNG 'iac-scripts')
|
||||
!['iac-scripts'](media/iac-scripts.png)
|
||||
|
||||
|
||||
# Folder [tests]
|
||||
|
@ -429,14 +429,14 @@ You can practice a little more on this topic in Exercise 5: Testing.
|
|||
|
||||
However, there is a correct order for this execution, as described in the next task.
|
||||
|
||||
![](media/iac-folder-subfolder-tests.PNG 'iac-tests')
|
||||
!['iac-tests'](media/iac-folder-subfolder-tests.png)
|
||||
|
||||
|
||||
### Task 2: Creating a new sandbox environment with Powershell
|
||||
|
||||
In this task you will learn how to create your first sandbox environment, with Azure Powershell scripts.
|
||||
|
||||
![](media/iac-ordem-scripts.PNG 'iac-ordem-scripts')
|
||||
!['iac-ordem-scripts'](media/iac-ordem-scripts.png)
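At a high level, the flow the scripts follow (shown in the image above) can be reproduced manually. A minimal, hedged sketch — the sandbox resource group names and location below are illustrative, and the actual scripts under infrastructure-as-code/scripts should be preferred:

```powershell
# Sign in and pick the target subscription.
Connect-AzAccount
Set-AzContext -Subscription "<your-subscription-id>"

# Create the sandbox resource groups (names are illustrative).
New-AzResourceGroup -Name "rg-dataops-data-sandbox" -Location "eastus"
New-AzResourceGroup -Name "rg-dataops-compute-sandbox" -Location "eastus"

# Deploy the main template with a parameters file adjusted for the sandbox.
New-AzResourceGroupDeployment -ResourceGroupName "rg-dataops-data-sandbox" `
    -TemplateFile "infrastructure-as-code/infrastructure/azuredeploy.json" `
    -TemplateParameterFile "infrastructure-as-code/infrastructure/parameters/parameters.dev.json"
```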
|
||||
|
||||
### Task 3: Checklist of IaC best practices
|
||||
|
||||
|
@ -519,31 +519,31 @@ First review how the DevOps pipeline was defined:
|
|||
|
||||
1. Go to the repository that was created as part of Exercise 3, Task # and open the templates folder, where you will see 3 yml files.
|
||||
|
||||
![](media/templates-folder.PNG 'Templastes Folder')
|
||||
!['Templates Folder'](media/templates-folder.png)
|
||||
|
||||
2. Open the test.yml file by clicking on it
|
||||
|
||||
![](media/select-test-yml.PNG 'Test yml')
|
||||
!['Test yml'](media/select-test-yml.png)
|
||||
|
||||
3. Identify the script activity that runs the behave module and identify the different parameters that are set before it is called.
|
||||
|
||||
![](media/behave-script.PNG 'behave activity')
|
||||
!['Behave activity'](media/behave-script.png)
|
||||
|
||||
Now let's review the DevOps pipeline execution results:
|
||||
|
||||
1. Go to DevOps Pipelines from the project defined in Exercise 3 and select the pipeline named "*\<your lab prefix>*-adf-cd" by clicking on it.
|
||||
|
||||
![](media/last-pipeline-run.PNG 'Last Pipeline Run')
|
||||
!['Last Pipeline Run'](media/last-pipeline-run.png)
|
||||
|
||||
2. You will see a list of recent runs of the selected pipeline; click on the latest run.
|
||||
|
||||
3. In the stages section, select the "Run behavior tests" stage.
|
||||
|
||||
![](media/pipeline-stages-run.PNG 'Pipeline Stages')
|
||||
!['Pipeline Stages'](media/pipeline-stages-run.png)
|
||||
|
||||
4. Review the Azure DevOps execution results for "Run behavior tests"\\"TEST: Run behave features"
|
||||
|
||||
![](media/pipeline-run-results.PNG 'Pipeline Results')
|
||||
!['Pipeline Results'](media/pipeline-run-results.png)
|
||||
|
||||
<p>Here you see the results of running the BDD test using <b>behave</b></p>
|
||||
|
||||
|
|
(Binary image files changed in this commit: width, height, and size unchanged.)
|
@ -20,6 +20,18 @@
|
|||
"description": "Region where the resource is provisioned"
|
||||
}
|
||||
},
|
||||
"servicePrincipal": {
|
||||
"type": "string",
|
||||
"metadata": {
|
||||
"description": "Service Principal for credentials usage"
|
||||
}
|
||||
},
|
||||
"servicePrincipalSecret": {
|
||||
"type": "string",
|
||||
"metadata": {
|
||||
"description": "Service Principal Secret for databricks usage"
|
||||
}
|
||||
},
|
||||
"resourceGroupData": {
|
||||
"type": "string",
|
||||
"metadata": {
|
||||
|
|
|
@ -12,7 +12,10 @@
|
|||
"value": "eastus"
|
||||
},
|
||||
"servicePrincipal": {
|
||||
"value": "SP-<Project.Alias>-DevTest"
|
||||
"value": "SP-<Project.Name>-DevTest"
|
||||
},
|
||||
"servicePrincipalSecret": {
|
||||
"value": "<ServicePrincipal.Secret>"
|
||||
},
|
||||
"resourceGroupData":{
|
||||
"value": "rg-<Project.Alias>-data-dev"
|
||||
|
|
|
@ -12,7 +12,10 @@
|
|||
"value": "eastus"
|
||||
},
|
||||
"servicePrincipal": {
|
||||
"value": "SP-<Project.Alias>-DevTest"
|
||||
"value": "SP-<Project.Name>-DevTest"
|
||||
},
|
||||
"servicePrincipalSecret": {
|
||||
"value": "<ServicePrincipal.Secret>"
|
||||
},
|
||||
"resourceGroupData":{
|
||||
"value": "rg-<Project.Alias>-data-prod"
|
||||
|
|
|
@ -12,7 +12,10 @@
|
|||
"value": "eastus"
|
||||
},
|
||||
"servicePrincipal": {
|
||||
"value": "SP-<Project.Alias>-DevTest"
|
||||
"value": "SP-<Project.Name>-DevTest"
|
||||
},
|
||||
"servicePrincipalSecret": {
|
||||
"value": "<ServicePrincipal.Secret>"
|
||||
},
|
||||
"resourceGroupData":{
|
||||
"value": "rg-<Project.Alias>-data-qa"
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
param(
|
||||
[Parameter(Mandatory)] [string] $KeyVaultName,
|
||||
[Parameter(Mandatory)] [string] $ComputeResourceGroup,
|
||||
[Parameter(Mandatory)] [string] $DatabricksName
|
||||
)
|
||||
|
||||
Write-Host "Installing Databricks Cli..." -ForegroundColor Green
|
||||
pip install databricks-cli --upgrade
|
||||
|
||||
Write-Host "Getting Azure resources..." -ForegroundColor Green
|
||||
$kv = Get-AzKeyVault -VaultName $KeyVaultName
|
||||
$dbw = Get-AzDatabricksWorkspace -ResourceGroupName $ComputeResourceGroup -Name $DatabricksName
|
||||
|
||||
Write-Host "Creating the Key Vault secret scope on Databricks..." -ForegroundColor Green
|
||||
$accessToken = Get-AzAccessToken -ResourceUrl 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d
|
||||
$env:DATABRICKS_TOKEN = $accessToken.Token
|
||||
$env:DATABRICKS_HOST = "https://$($dbw.Url)"
|
||||
Write-Host "URL DBW https://$($dbw.Url)"
|
||||
|
||||
$scopesList = databricks secrets list-scopes --output json | ConvertFrom-Json
|
||||
if (-not ($scopesList.scopes.name -contains "dataops")) {
|
||||
databricks secrets create-scope --scope 'dataops' --scope-backend-type AZURE_KEYVAULT --resource-id $kv.ResourceId --dns-name $kv.VaultUri
|
||||
}
|
||||
|
||||
Write-Host "Listing Databricks scope content..." -ForegroundColor Green
|
||||
databricks secrets list --scope dataops
|
||||
|
||||
Write-Host "Finished!" -ForegroundColor Blue
|
|
@ -1,28 +1,42 @@
|
|||
param(
|
||||
[Parameter(Mandatory)] [string] $ServicePrincipalName,
|
||||
[Parameter(Mandatory)] [string] $DataResourceGroup,
|
||||
[Parameter(Mandatory)] [string] $ComputeResourceGroup
|
||||
[Parameter(Mandatory)] [string] [ValidateSet("dev", "qa", "prod", "sandbox")] $Environment,
|
||||
[Parameter(Mandatory)] [string] $DataLakeName,
|
||||
[Parameter(Mandatory)] [string] $DatabricksName,
|
||||
[Parameter(Mandatory)] [string] $KeyVaultName,
|
||||
[Parameter(Mandatory)] [string] $DATABRICKS_TOKEN,
|
||||
[string] $SolutionParametersFile = "./infrastructure-as-code/infrastructure/parameters/parameters.$Environment.json"
|
||||
)
|
||||
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
# Load enviornments variables from DevOps Library
|
||||
$KeyVaultName = $env:keyVaultName
|
||||
$DataLakeName = $env:dataLakeName
|
||||
$DatabricksName = $env:databricksName
|
||||
Write-Host "Getting variables from Library file..." -ForegroundColor Green
|
||||
#Write-Host "DataLake: " $DataLakeName
|
||||
#Write-Host "DataBricks: " $DatabricksName
|
||||
#Write-Host "Key Valt: " $KeyVaultName
|
||||
|
||||
Write-Host "Getting variables from $SolutionParametersFile file..." -ForegroundColor Green
|
||||
$ParameterContent = Get-Content -Path $SolutionParametersFile | ConvertFrom-Json
|
||||
$DataResourceGroup = ($ParameterContent).PSObject.Properties["parameters"].Value.resourceGroupData.Value
|
||||
$ComputeResourceGroup = ($ParameterContent).PSObject.Properties["parameters"].Value.resourceGroupCompute.Value
|
||||
$ServicePrincipalName = ($ParameterContent).PSObject.Properties["parameters"].Value.servicePrincipal.Value
|
||||
|
||||
Write-Host "Parameter file " $SolutionParametersFile
|
||||
Write-Host "ServicePrincipalName " $ServicePrincipalName
|
||||
|
||||
$context = Get-AzContext
|
||||
Write-Host "Getting user and principal information..." -ForegroundColor Green
|
||||
$user = Get-AzADUser -UserPrincipalName $context.Account.Id
|
||||
$servicePrincipal = Get-AzADServicePrincipal -DisplayName $ServicePrincipalName
|
||||
|
||||
if ($servicePrincipal) {
|
||||
|
||||
Write-Host "Generating new client secret..." -ForegroundColor Green
|
||||
$startDate = Get-Date
|
||||
$endDate = $start.AddMonths(6)
|
||||
$endDate = $startDate.AddMonths(6)
|
||||
|
||||
$clientSecret = New-AzADSpCredential -ObjectId $servicePrincipal.Id -StartDate $startDate -EndDate $endDate
|
||||
Write-Host "Secret generated " $clientSecret -ForegroundColor Yellow
|
||||
$UnsecureSecret = ConvertFrom-SecureString -SecureString $clientSecret.Secret -AsPlainText
|
||||
|
||||
Write-Host "New Secret was generated for Service Principal " $UnsecureSecret -ForegroundColor Yellow
|
||||
|
||||
Write-Host "Getting Azure resources..." -ForegroundColor Green
|
||||
$kv = Get-AzKeyVault -VaultName $KeyVaultName
|
||||
|
@ -30,32 +44,37 @@ if ($servicePrincipal) {
|
|||
$dbw = Get-AzDatabricksWorkspace -ResourceGroupName $ComputeResourceGroup -Name $DatabricksName
|
||||
|
||||
Write-Host "Adding permissions to user on Key Vault..." -ForegroundColor Green
|
||||
$userPermissions = $kv.AccessPolicies | Where-Object { $_.ObjectId -eq $user.Id }
|
||||
$userPermissions = $kv.AccessPolicies | Where-Object { $_.ObjectId -eq $servicePrincipal.Id }
|
||||
$secretPermissions = $userPermissions.PermissionsToSecrets
|
||||
if (!$secretPermissions -or !$userPermissions.PermissionsToSecrets.Contains("set")) {
|
||||
Set-AzKeyVaultAccessPolicy -VaultName $KeyVaultName -ObjectId $user.Id -PermissionsToSecrets "set"
|
||||
Set-AzKeyVaultAccessPolicy -VaultName $KeyVaultName -ObjectId $servicePrincipal.Id -PermissionsToSecrets "set"
|
||||
}
|
||||
|
||||
Write-Host "Setting service principal secrets on Key Vault..." -ForegroundColor Green
|
||||
Set-AzKeyVaultSecret -VaultName $KeyVaultName -Name "tenantId" -SecretValue $(ConvertTo-SecureString $context.Tenant.Id -AsPlainText -Force)
|
||||
Set-AzKeyVaultSecret -VaultName $KeyVaultName -Name "clientId" -SecretValue $(ConvertTo-SecureString $ClientID -AsPlainText -Force)
|
||||
Set-AzKeyVaultSecret -VaultName $KeyVaultName -Name "clientSecret" -SecretValue $clientSecret
|
||||
Set-AzKeyVaultSecret -VaultName $KeyVaultName -Name "clientId" -SecretValue $(ConvertTo-SecureString $servicePrincipal.Id -AsPlainText -Force)
|
||||
Set-AzKeyVaultSecret -VaultName $KeyVaultName -Name "clientSecret" -SecretValue $clientSecret.Secret
|
||||
|
||||
Write-Host "Assigning roles to the service principal on the data lake..." -ForegroundColor Green
|
||||
$assigment = Get-AzRoleAssignment -ObjectId $principal.Id -Scope $lake.Id | Where-Object { $_.RoleDefinitionName -eq "Storage Blob Data Contributor" }
|
||||
$assigment = Get-AzRoleAssignment -ObjectId $servicePrincipal.Id -Scope $lake.Id | Where-Object { $_.RoleDefinitionName -eq "Storage Blob Data Contributor" }
|
||||
if(! $assigment){
|
||||
New-AzRoleAssignment -ObjectId $principal.Id -Scope $lake.Id -RoleDefinitionName "Storage Blob Data Contributor"
|
||||
New-AzRoleAssignment -ObjectId $servicePrincipal.Id -Scope $lake.Id -RoleDefinitionName "Storage Blob Data Contributor"
|
||||
}
|
||||
|
||||
Write-Host "Creating the Key Vault secret scope on Databricks..." -ForegroundColor Green
|
||||
$accessToken = Get-AzAccessToken -ResourceUrl 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d
|
||||
$env:DATABRICKS_TOKEN = $accessToken.Token
|
||||
$env:DATABRICKS_HOST = "https://$($dbw.Url)"
|
||||
Write-Host "URL DBW https://$($dbw.Url)"
|
||||
$scopesList = databricks secrets list-scopes --output json | ConvertFrom-Json
|
||||
if (-not ($scopesList.scopes.name -contains "dataops")) {
|
||||
databricks secrets create-scope --scope 'dataops' --scope-backend-type AZURE_KEYVAULT --resource-id $kv.ResourceId --dns-name $kv.VaultUri
|
||||
}
|
||||
#Write-Host "Creating the Key Vault secret scope on Databricks..." -ForegroundColor Green
|
||||
#$accessToken = Get-AzAccessToken -ResourceUrl 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d
|
||||
#$env:DATABRICKS_TOKEN = $accessToken.Token
|
||||
#$env:DATABRICKS_HOST = "https://$($dbw.Url)"
|
||||
#$env:DATABRICKS_TOKEN = $DATABRICKS_TOKEN
|
||||
#Write-Host "URL DBW https://$($dbw.Url)"
|
||||
#Write-Host "Databricks Token " $DATABRICKS_TOKEN
|
||||
#Write-Host "Databricks Token (env) " $env:DATABRICKS_TOKEN
|
||||
|
||||
# $scopesList = databricks secrets list-scopes --output json | ConvertFrom-Json
|
||||
# Write-Host "List of scopes: " $scopesList
|
||||
# if (! $scopesList.scopes.name -contains "dataops") {
|
||||
# databricks secrets create-scope --scope 'dataops' --scope-backend-type AZURE_KEYVAULT --resource-id $kv.ResourceId --dns-name $kv.VaultUri
|
||||
# }
|
||||
}
|
||||
else {
|
||||
Write-Host "No Service Principal founded" -ForegroundColor Red
|
||||
|
|
|
@ -98,7 +98,7 @@ Execute all these steps below to setup your evironment before running the Hands-
|
|||
|
||||
```powershell
|
||||
az config set extension.use_dynamic_install=yes_without_prompt
|
||||
|
||||
Connect-AzureAD
|
||||
./quickstart/scripts/cloud-setup/Deploy-AzurePreReqs.ps1 -ConfigurationFile "quickstart/configs/cloud-setup/hol.json"
|
||||
```
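Connect-AzureAD requires the AzureAD module, which is not always present on a fresh machine; a quick, hedged pre-check you can run first (module names only, no lab-specific assumptions):

```powershell
# Make sure the required modules are available before running the setup script.
if (-not (Get-Module -ListAvailable -Name AzureAD)) {
    Install-Module AzureAD -Scope CurrentUser -Force
}
if (-not (Get-Module -ListAvailable -Name Az)) {
    Install-Module Az -Scope CurrentUser -Force
}
```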
|
||||
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
{
|
||||
"Project": {
|
||||
"Name": "<project.name>",
|
||||
"Alias": "<project.alias>"
|
||||
"Alias": "<project.alias>",
|
||||
"ServicePrincipalSecret": "<serviceprincipal.secret>"
|
||||
},
|
||||
"RepoConfiguration": {
|
||||
"RepoName": "<project.name>",
|
||||
|
|
(Binary image files changed in this commit: width, height, and size unchanged.)
|
@ -37,6 +37,10 @@ EndScope
|
|||
|
||||
[hashtable]$servicePrincipals = SetupServicePrincipals -Configuration $config -Verbose:$VerbosePreference
|
||||
SetupEnvironments -Configuration $config -ServicePrincipals $servicePrincipals -Verbose:$VerbosePreference
|
||||
PublishOutputs -Configuration $config -Verbose:$VerbosePreference
|
||||
|
||||
#Save this password inside output hol file
|
||||
$ServicePrincipalSecret = $ServicePrincipals[$config.servicePrincipals[0]].clientSecret
|
||||
|
||||
PublishOutputs -Configuration $config -ServicePrincipalSecret $ServicePrincipalSecret -Verbose:$VerbosePreference
|
||||
|
||||
Write-Host "Done!"
|
||||
|
|
|
@ -76,6 +76,7 @@ function SetupEnvironments {
|
|||
Set-AzContext -Subscription $enviroment.subscriptionId
|
||||
|
||||
AssignRoleIfNotExists -RoleName "Owner" -ObjectId $servicePrincipal.objectId -SubscriptionId $enviroment.subscriptionId
|
||||
AssignApplicationAdministratorAZRole -ObjectId $servicePrincipal.objectId
|
||||
|
||||
SetupResourceGroups -Environment $envKey -Configuration $Configuration
|
||||
SetupServiceConnection -Environment $enviroment -ServicePrincipal $servicePrincipal -Configuration $Configuration
|
||||
|
@ -129,6 +130,41 @@ function CreateOrGetResourceGroup
|
|||
return $resourceGroup
|
||||
}
|
||||
|
||||
function AssignApplicationAdministratorAZRole
|
||||
{
|
||||
[cmdletbinding()]
|
||||
[OutputType([void])]
|
||||
param (
|
||||
[Parameter(Mandatory)] [string] $ObjectId
|
||||
)
|
||||
|
||||
# Login into Azure AD with current user
|
||||
# Connect-AzureAD
|
||||
|
||||
# Fetch role instance
|
||||
$role = Get-AzureADDirectoryRole | Where-Object {$_.displayName -eq 'Application administrator'}
|
||||
|
||||
# If role instance does not exist, instantiate it based on the role template
|
||||
if ($role -eq $null) {
|
||||
# Instantiate an instance of the role template
|
||||
$roleTemplate = Get-AzureADDirectoryRoleTemplate | Where-Object {$_.displayName -eq 'Application Administrator'}
|
||||
Enable-AzureADDirectoryRole -RoleTemplateId $roleTemplate.ObjectId
|
||||
|
||||
# Fetch role
|
||||
$role = Get-AzureADDirectoryRole | Where-Object {$_.displayName -eq 'Application Administrator'}
|
||||
}
|
||||
|
||||
# Add the SP to role
|
||||
try {
|
||||
Add-AzureADDirectoryRoleMember -ObjectId $role.ObjectId -RefObjectId $ObjectId
|
||||
LogInfo -Message "Service Principal add into Role Application administrator with success!"
|
||||
}
|
||||
catch {
|
||||
LogInfo -Message "Service Principal already have Application administrator role."
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
function AssignRoleIfNotExists
|
||||
{
|
||||
[cmdletbinding()]
|
||||
|
|
|
@ -42,25 +42,30 @@ function UpdateIaCParameters {
|
|||
|
||||
BeginScope -Scope "IaC parameters"
|
||||
|
||||
$ServicePrincipalSecret = $Configuration.Project.ServicePrincipalSecret
|
||||
|
||||
ReplaceTemplateTokens -Configuration $Configuration -RemoveInput `
|
||||
-InputFile infrastructure-as-code/infrastructure/parameters/parameters.dev.template.json `
|
||||
-OutputFile infrastructure-as-code/infrastructure/parameters/parameters.dev.json
|
||||
-OutputFile infrastructure-as-code/infrastructure/parameters/parameters.dev.json `
|
||||
-ServicePrincipalSecret $ServicePrincipalSecret `
|
||||
|
||||
ReplaceTemplateTokens -Configuration $Configuration -RemoveInput `
|
||||
-InputFile infrastructure-as-code/infrastructure/parameters/parameters.qa.template.json `
|
||||
-OutputFile infrastructure-as-code/infrastructure/parameters/parameters.qa.json
|
||||
-OutputFile infrastructure-as-code/infrastructure/parameters/parameters.qa.json `
|
||||
-ServicePrincipalSecret $ServicePrincipalSecret `
|
||||
|
||||
ReplaceTemplateTokens -Configuration $Configuration -RemoveInput `
|
||||
-InputFile infrastructure-as-code/infrastructure/parameters/parameters.prod.template.json `
|
||||
-OutputFile infrastructure-as-code/infrastructure/parameters/parameters.prod.json
|
||||
-OutputFile infrastructure-as-code/infrastructure/parameters/parameters.prod.json `
|
||||
-ServicePrincipalSecret $ServicePrincipalSecret `
|
||||
|
||||
ReplaceTemplateTokens -Configuration $Configuration -RemoveInput `
|
||||
-InputFile azure-pipelines/databricks/databricks-lib-cd.template.yml `
|
||||
-OutputFile azure-pipelines/databricks/databricks-lib-cd.yml
|
||||
-OutputFile azure-pipelines/databricks/databricks-lib-cd.yml `
|
||||
|
||||
ReplaceTemplateTokens -Configuration $Configuration -RemoveInput `
|
||||
-InputFile azure-pipelines/variable.environment.template.yml `
|
||||
-OutputFile azure-pipelines/variable.environment.yml
|
||||
-OutputFile azure-pipelines/variable.environment.yml `
|
||||
|
||||
EndScope
|
||||
|
||||
|
@ -73,15 +78,20 @@ function UpdateIaCParameters {
|
|||
|
||||
function PublishOutputs {
|
||||
param(
|
||||
[Parameter(Mandatory)] [hashtable] $Configuration
|
||||
[Parameter(Mandatory)] [hashtable] $Configuration,
|
||||
[Parameter(Mandatory)] [string] $ServicePrincipalSecret
|
||||
)
|
||||
|
||||
BeginScope -Scope "Outputs"
|
||||
|
||||
Write-Host "Input file " $Configuration.output.template
|
||||
Write-Host "Output file " $Configuration.output.file
|
||||
|
||||
ReplaceTemplateTokens `
|
||||
-Configuration $Configuration `
|
||||
-InputFile $Configuration.output.template `
|
||||
-OutputFile $Configuration.output.file `
|
||||
-ServicePrincipalSecret $ServicePrincipalSecret `
|
||||
|
||||
EndScope
|
||||
}
|
||||
|
@ -92,6 +102,7 @@ function ReplaceTemplateTokens {
|
|||
[Parameter(Mandatory)] [hashtable] $Configuration,
|
||||
[Parameter(Mandatory)] [string] $InputFile,
|
||||
[Parameter(Mandatory)] [string] $OutputFile,
|
||||
[string] $ServicePrincipalSecret,
|
||||
[string] $StartTokenPattern = '<',
|
||||
[string] $EndTokenPattern = '>',
|
||||
[switch] $RemoveInput
|
||||
|
@ -101,18 +112,30 @@ function ReplaceTemplateTokens {
|
|||
|
||||
[int]$totalTokens = 0
|
||||
|
||||
Write-Host "Input File '$InputFile'"
|
||||
Write-Host (Get-Content $InputFile)
|
||||
|
||||
(Get-Content $InputFile) | ForEach-Object {
|
||||
$line = $_
|
||||
$tokens = GetTokens -Line $line -StartTokenPattern $StartTokenPattern -EndTokenPattern $EndTokenPattern
|
||||
$totalTokens += $tokens.Count
|
||||
|
||||
foreach ($token in $tokens) {
|
||||
[string]$configPropertyName = $token -replace "$($StartTokenPattern)|$($EndTokenPattern)", ''
|
||||
[string]$tokenValue = Invoke-Expression -Command "`$Configuration.$configPropertyName"
|
||||
|
||||
Write-Host "Token '$token'"
|
||||
|
||||
[string]$configPropertyName = $token -replace "$($StartTokenPattern)|$($EndTokenPattern)", ''
|
||||
|
||||
if ( $configPropertyName -eq "serviceprincipal.secret") {
|
||||
Write-Verbose "Replacing '$token' token by '$ServicePrincipalSecret'"
|
||||
$line = $line -replace "$token", "$ServicePrincipalSecret"
|
||||
}
|
||||
else {
|
||||
[string]$tokenValue = Invoke-Expression -Command "`$Configuration.$configPropertyName"
|
||||
Write-Verbose "Replacing '$token' token by '$tokenValue'"
|
||||
$line = $line -replace "$token", "$tokenValue"
|
||||
}
|
||||
}
|
||||
|
||||
$line | Out-File -Append -FilePath $OutputFile
|
||||
}
|
||||
|
|