delta example and up-version mltable (#2205)
This commit is contained in:
Родитель
49cd66a846
Коммит
aab06c1c28
75
.github/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml
поставляемый
Normal file
75
.github/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml
поставляемый
Normal file
|
@ -0,0 +1,75 @@
|
|||
# This code is autogenerated.
|
||||
# Code is generated by running custom script: python3 readme.py
|
||||
# Any manual changes to this file may cause incorrect behavior.
|
||||
# Any manual changes will be overwritten if the code is regenerated.
|
||||
|
||||
name: sdk-using-mltable-delta-lake-example-delta-lake-example
|
||||
# This file is created by sdk/python/readme.py.
|
||||
# Please do not edit directly.
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "23 8/12 * * *"
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- sdk/python/using-mltable/delta-lake-example/**
|
||||
- .github/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml
|
||||
- sdk/python/dev-requirements.txt
|
||||
- infra/**
|
||||
- sdk/python/setup.sh
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: check out repo
|
||||
uses: actions/checkout@v2
|
||||
- name: setup python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: pip install notebook reqs
|
||||
run: pip install -r sdk/python/dev-requirements.txt
|
||||
- name: azure login
|
||||
uses: azure/login@v1
|
||||
with:
|
||||
creds: ${{secrets.AZUREML_CREDENTIALS}}
|
||||
- name: bootstrap resources
|
||||
run: |
|
||||
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
|
||||
bash bootstrap.sh
|
||||
working-directory: infra
|
||||
continue-on-error: false
|
||||
- name: setup SDK
|
||||
run: |
|
||||
source "${{ github.workspace }}/infra/sdk_helpers.sh";
|
||||
source "${{ github.workspace }}/infra/init_environment.sh";
|
||||
bash setup.sh
|
||||
working-directory: sdk/python
|
||||
continue-on-error: true
|
||||
- name: setup-cli
|
||||
run: |
|
||||
source "${{ github.workspace }}/infra/sdk_helpers.sh";
|
||||
source "${{ github.workspace }}/infra/init_environment.sh";
|
||||
bash setup.sh
|
||||
working-directory: cli
|
||||
continue-on-error: true
|
||||
- name: run using-mltable/delta-lake-example/delta-lake-example.ipynb
|
||||
run: |
|
||||
source "${{ github.workspace }}/infra/sdk_helpers.sh";
|
||||
source "${{ github.workspace }}/infra/init_environment.sh";
|
||||
bash "${{ github.workspace }}/infra/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json";
|
||||
bash "${{ github.workspace }}/infra/sdk_helpers.sh" replace_template_values "delta-lake-example.ipynb";
|
||||
[ -f "../../.azureml/config" ] && cat "../../.azureml/config";
|
||||
papermill -k python delta-lake-example.ipynb delta-lake-example.output.ipynb
|
||||
working-directory: sdk/python/using-mltable/delta-lake-example
|
||||
- name: upload notebook's working folder as an artifact
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: delta-lake-example
|
||||
path: sdk/python/using-mltable/delta-lake-example
|
|
@ -166,6 +166,7 @@ Test Status is for branch - **_main_**
|
|||
|using-mlflow|train-and-log|[xgboost_service_principal](using-mlflow/train-and-log/xgboost_service_principal.ipynb)|*no description* - _This sample is excluded from automated tests_|[![xgboost_service_principal](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-train-and-log-xgboost_service_principal.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-train-and-log-xgboost_service_principal.yml)|
|
||||
|using-mlflow|using-rest-api|[using_mlflow_rest_api](using-mlflow/using-rest-api/using_mlflow_rest_api.ipynb)|*no description* - _This sample is excluded from automated tests_|[![using_mlflow_rest_api](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-using-rest-api-using_mlflow_rest_api.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-using-rest-api-using_mlflow_rest_api.yml)|
|
||||
|using-mltable|delimited-files-example|[delimited-files-example](using-mltable/delimited-files-example/delimited-files-example.ipynb)|*no description*|[![delimited-files-example](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delimited-files-example-delimited-files-example.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delimited-files-example-delimited-files-example.yml)|
|
||||
|using-mltable|delta-lake-example|[delta-lake-example](using-mltable/delta-lake-example/delta-lake-example.ipynb)|*no description*|[![delta-lake-example](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml)|
|
||||
|using-mltable|from-paths-example|[from-paths-example](using-mltable/from-paths-example/from-paths-example.ipynb)|*no description*|[![from-paths-example](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-from-paths-example-from-paths-example.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-from-paths-example-from-paths-example.yml)|
|
||||
|using-mltable|local-to-cloud|[mltable-local-to-cloud](using-mltable/local-to-cloud/mltable-local-to-cloud.ipynb)|*no description*|[![mltable-local-to-cloud](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-local-to-cloud-mltable-local-to-cloud.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-local-to-cloud-mltable-local-to-cloud.yml)|
|
||||
|using-mltable|quickstart|[mltable-quickstart](using-mltable/quickstart/mltable-quickstart.ipynb)|*no description*|[![mltable-quickstart](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-quickstart-mltable-quickstart.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-quickstart-mltable-quickstart.yml)|
|
||||
|
|
|
@ -21,4 +21,5 @@ For more information on Azure ML Tables, read [Working with tables in Azure ML](
|
|||
| [Azure ML Tables Quickstart](./quickstart/mltable-quickstart.ipynb) | *Demonstrates an end-to-end example of using MLTable, including asset creation, loading into both interactive sessions and jobs. The data is in parquet format.* |
|
||||
| [Azure ML Tables Local-to-Cloud](./local-to-cloud/mltable-local-to-cloud.ipynb) | *Demonstrates how to work with data and tables locally and upload to the cloud as a data asset for improved sharing and reproducibility.* |
|
||||
| [Create an Azure ML Table from Delimited Text Files (CSV)](./delimited-files-example/delimited-files-example.ipynb) | *Demonstrates creating an MLTable from delimited files (CSV).* |
|
||||
| [Create an Azure ML Table from a Delta Lake table](./delta-lake-example/delta-lake-example.ipynb) | *Demonstrates creating an MLTable from a Delta Lake table on Azure storage.* |
|
||||
| [Create an Azure ML Table of paths](./from-paths-example/from-paths-example.ipynb) | *Demonstrates creating a Table of paths on cloud storage that can then be streamed into a Python session.* |
|
||||
|
|
|
@ -2,5 +2,5 @@ dependencies:
|
|||
- python=3.10
|
||||
- pip=21.2.4
|
||||
- pip:
|
||||
- mltable==1.2.0
|
||||
- azureml-dataprep[pandas]==4.9.5
|
||||
- mltable==1.3.0
|
||||
- azureml-dataprep[pandas]==4.10.6
|
|
@ -0,0 +1,290 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Create a Table from Delta Lake\n",
|
||||
"\n",
|
||||
"In this example notebook you will create an AzureML Table from a Delta Table."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 📦 Install dependencies\n",
|
||||
"\n",
|
||||
"Ensure you have the latest MLTable library and dependencies."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -r ../mltable-requirements.txt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 🐍 Create an MLTable using the Python SDK\n",
|
||||
"\n",
|
||||
"Here you build your data loading steps using the `mltable` Python SDK. The `show()` method allows you to see the effect of the data loading transformation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import mltable\n",
|
||||
"\n",
|
||||
"# create paths to the data files\n",
|
||||
"delta_table_uri = \"wasbs://data@azuremlexampledata.blob.core.windows.net/COVID-19_NYT\"\n",
|
||||
"\n",
|
||||
"# create an MLTable from the data files\n",
|
||||
"tbl = mltable.from_delta_lake(delta_table_uri, timestamp_as_of=\"2022-10-01T00:00:00Z\")\n",
|
||||
"\n",
|
||||
"# show the first 5 records\n",
|
||||
"tbl.show(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 🐼 Load into a Pandas data frame\n",
|
||||
"\n",
|
||||
"You can load your Azure ML Table into Pandas using:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = tbl.to_pandas_dataframe()\n",
|
||||
"df.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 💾 Save data loading steps \n",
|
||||
"Next, you'll save all your data loading steps into an `MLTable` file. This allows you to *reproduce* your Pandas data frame at a later point in time without having to redefine the data loading steps in your code."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# save the data loading steps in an MLTable file\n",
|
||||
"tbl.save(\"./covid\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### 🔍 View the saved file\n",
|
||||
"\n",
|
||||
"In the next code cell, we show you the `MLTable` file so you can understand how the data loading steps are serialized into a file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(\"./covid/MLTable\", \"r\") as f:\n",
|
||||
" print(f.read())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ♻️ Reproduce data loading steps\n",
|
||||
"\n",
|
||||
"Now that the data loading steps have been serialized into a file, you can reproduce them at any point in time using the `load()` method. This means you do not need to redefine your data loading steps in code and makes it easier to share with others."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import mltable\n",
|
||||
"\n",
|
||||
"# load the previously saved MLTable file\n",
|
||||
"tbl = mltable.load(\"./covid/\")\n",
|
||||
"df = tbl.to_pandas_dataframe()\n",
|
||||
"df.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 🤝 Create a data asset to aid sharing and reproducibility\n",
|
||||
"\n",
|
||||
"You'll now create a data asset, which will automatically upload the `MLTable` to cloud storage (the default AzureML datastore) so that others can use it easily."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"subscription_id = \"<SUBSCRIPTION_ID>\"\n",
|
||||
"resource_group = \"<RESOURCE_GROUP>\"\n",
|
||||
"workspace = \"<AML_WORKSPACE_NAME>\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"from azure.ai.ml import MLClient\n",
|
||||
"from azure.ai.ml.entities import Data\n",
|
||||
"from azure.ai.ml.constants import AssetTypes\n",
|
||||
"from azure.identity import DefaultAzureCredential\n",
|
||||
"\n",
|
||||
"# set the version number of the data asset to the current UTC time\n",
|
||||
"VERSION = time.strftime(\"%Y.%m.%d.%H%M%S\", time.gmtime())\n",
|
||||
"\n",
|
||||
"# connect to the AzureML workspace\n",
|
||||
"ml_client = MLClient(\n",
|
||||
" DefaultAzureCredential(), subscription_id, resource_group, workspace\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"my_data = Data(\n",
|
||||
" path=\"./covid\",\n",
|
||||
" type=AssetTypes.MLTABLE,\n",
|
||||
" description=\"COVID-19 dataset.\",\n",
|
||||
" name=\"covid-delta-example\",\n",
|
||||
" version=VERSION,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"ml_client.data.create_or_update(my_data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 📖 Read the data asset in an interactive session\n",
|
||||
"\n",
|
||||
"Now that you have your MLTable stored in the cloud, you and your team members can access it using a friendly name in an interactive session (for example, a notebook)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import mltable\n",
|
||||
"from azure.ai.ml import MLClient\n",
|
||||
"from azure.identity import DefaultAzureCredential\n",
|
||||
"\n",
|
||||
"# connect to the AzureML workspace\n",
|
||||
"ml_client = MLClient(\n",
|
||||
" DefaultAzureCredential(), subscription_id, resource_group, workspace\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# get the latest version of the data asset\n",
|
||||
"# Note: The version was set in the previous code cell.\n",
|
||||
"data_asset = ml_client.data.get(name=\"covid-delta-example\", version=VERSION)\n",
|
||||
"\n",
|
||||
"# create a table\n",
|
||||
"tbl = mltable.load(f\"azureml:/{data_asset.id}\")\n",
|
||||
"\n",
|
||||
"# load into pandas\n",
|
||||
"df = tbl.to_pandas_dataframe()\n",
|
||||
"df.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 📖 Read the data asset in a job\n",
|
||||
"\n",
|
||||
"You can also access your Table in a job, using:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azure.ai.ml import MLClient, command, Input\n",
|
||||
"from azure.ai.ml.entities import Environment\n",
|
||||
"from azure.identity import DefaultAzureCredential\n",
|
||||
"\n",
|
||||
"# connect to the AzureML workspace\n",
|
||||
"ml_client = MLClient(\n",
|
||||
" DefaultAzureCredential(), subscription_id, resource_group, workspace\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# get the latest version of the data asset\n",
|
||||
"# Note: the VERSION was set in a previous cell.\n",
|
||||
"data_asset = ml_client.data.get(name=\"covid-delta-example\", version=VERSION)\n",
|
||||
"\n",
|
||||
"job = command(\n",
|
||||
" command=\"python train.py --input ${{inputs.titanic}}\",\n",
|
||||
" inputs={\"titanic\": Input(type=\"mltable\", path=data_asset.id)},\n",
|
||||
" compute=\"cpu-cluster\",\n",
|
||||
" environment=Environment(\n",
|
||||
" image=\"mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04\",\n",
|
||||
" conda_file=\"./job-env/conda_dependencies.yml\",\n",
|
||||
" ),\n",
|
||||
" code=\"./src\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"ml_client.jobs.create_or_update(job)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.10 - SDK V2",
|
||||
"language": "python",
|
||||
"name": "python310-sdkv2"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
dependencies:
|
||||
- python=3.10
|
||||
- pip=21.2.4
|
||||
- pip:
|
||||
- mltable==1.3.0
|
||||
- azureml-dataprep[pandas]==4.10.6
|
|
@ -2,5 +2,5 @@ dependencies:
|
|||
- python=3.10
|
||||
- pip=21.2.4
|
||||
- pip:
|
||||
- mltable==1.2.0
|
||||
- azureml-dataprep[pandas]==4.9.5
|
||||
- mltable==1.3.0
|
||||
- azureml-dataprep[pandas]==4.10.6
|
|
@ -1,2 +1,2 @@
|
|||
mltable==1.2.0
|
||||
azureml-dataprep[pandas]==4.9.5
|
||||
mltable==1.3.0
|
||||
azureml-dataprep[pandas]==4.10.6
|
|
@ -2,5 +2,5 @@ dependencies:
|
|||
- python=3.10
|
||||
- pip=21.2.4
|
||||
- pip:
|
||||
- mltable==1.1.0
|
||||
- azureml-dataprep[pandas]
|
||||
- mltable==1.3.0
|
||||
- azureml-dataprep[pandas]==4.10.6
|
Загрузка…
Ссылка в новой задаче