delta example and up-version mltable (#2205)

2023-04-18 17:23:28 +01:00 · 2023-04-18 17:23:28 +01:00 · aab06c1c28
--- a/.github/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml
+++ b/.github/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml
@ -0,0 +1,75 @@
+# This code is autogenerated.
+# Code is generated by running custom script: python3 readme.py
+# Any manual changes to this file may cause incorrect behavior.
+# Any manual changes will be overwritten if the code is regenerated.
+
+name: sdk-using-mltable-delta-lake-example-delta-lake-example
+# This file is created by sdk/python/readme.py.
+# Please do not edit directly.
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "23 8/12 * * *"
+  pull_request:
+    branches:
+      - main
+    paths:
+      - sdk/python/using-mltable/delta-lake-example/**
+      - .github/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml
+      - sdk/python/dev-requirements.txt
+      - infra/**
+      - sdk/python/setup.sh
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - name: check out repo
+      uses: actions/checkout@v2
+    - name: setup python
+      uses: actions/setup-python@v2
+      with:
+        python-version: "3.8"
+    - name: pip install notebook reqs
+      run: pip install -r sdk/python/dev-requirements.txt
+    - name: azure login
+      uses: azure/login@v1
+      with:
+        creds: ${{secrets.AZUREML_CREDENTIALS}}
+    - name: bootstrap resources
+      run: |
+          echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
+          bash bootstrap.sh
+      working-directory: infra
+      continue-on-error: false
+    - name: setup SDK
+      run: |
+          source "${{ github.workspace }}/infra/sdk_helpers.sh";
+          source "${{ github.workspace }}/infra/init_environment.sh";
+          bash setup.sh
+      working-directory: sdk/python
+      continue-on-error: true
+    - name: setup-cli
+      run: |
+          source "${{ github.workspace }}/infra/sdk_helpers.sh";
+          source "${{ github.workspace }}/infra/init_environment.sh";
+          bash setup.sh
+      working-directory: cli
+      continue-on-error: true
+    - name: run using-mltable/delta-lake-example/delta-lake-example.ipynb
+      run: |
+          source "${{ github.workspace }}/infra/sdk_helpers.sh";
+          source "${{ github.workspace }}/infra/init_environment.sh";
+          bash "${{ github.workspace }}/infra/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json";
+          bash "${{ github.workspace }}/infra/sdk_helpers.sh" replace_template_values "delta-lake-example.ipynb";
+          [ -f "../../.azureml/config.json" ] && cat "../../.azureml/config.json";  # print the config generated above (same path) to aid debugging
+          papermill -k python delta-lake-example.ipynb delta-lake-example.output.ipynb
+      working-directory: sdk/python/using-mltable/delta-lake-example
+    - name: upload notebook's working folder as an artifact
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: delta-lake-example
+        path: sdk/python/using-mltable/delta-lake-example
--- a/sdk/python/README.md
+++ b/sdk/python/README.md
@ -166,6 +166,7 @@ Test Status is for branch - **_main_**
 |using-mlflow|train-and-log|[xgboost_service_principal](using-mlflow/train-and-log/xgboost_service_principal.ipynb)|*no description* - _This sample is excluded from automated tests_|[![xgboost_service_principal](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-train-and-log-xgboost_service_principal.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-train-and-log-xgboost_service_principal.yml)|
 |using-mlflow|using-rest-api|[using_mlflow_rest_api](using-mlflow/using-rest-api/using_mlflow_rest_api.ipynb)|*no description* - _This sample is excluded from automated tests_|[![using_mlflow_rest_api](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-using-rest-api-using_mlflow_rest_api.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-using-rest-api-using_mlflow_rest_api.yml)|
 |using-mltable|delimited-files-example|[delimited-files-example](using-mltable/delimited-files-example/delimited-files-example.ipynb)|*no description*|[![delimited-files-example](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delimited-files-example-delimited-files-example.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delimited-files-example-delimited-files-example.yml)|
+|using-mltable|delta-lake-example|[delta-lake-example](using-mltable/delta-lake-example/delta-lake-example.ipynb)|*no description*|[![delta-lake-example](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml)|
 |using-mltable|from-paths-example|[from-paths-example](using-mltable/from-paths-example/from-paths-example.ipynb)|*no description*|[![from-paths-example](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-from-paths-example-from-paths-example.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-from-paths-example-from-paths-example.yml)|
 |using-mltable|local-to-cloud|[mltable-local-to-cloud](using-mltable/local-to-cloud/mltable-local-to-cloud.ipynb)|*no description*|[![mltable-local-to-cloud](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-local-to-cloud-mltable-local-to-cloud.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-local-to-cloud-mltable-local-to-cloud.yml)|
 |using-mltable|quickstart|[mltable-quickstart](using-mltable/quickstart/mltable-quickstart.ipynb)|*no description*|[![mltable-quickstart](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-quickstart-mltable-quickstart.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-quickstart-mltable-quickstart.yml)|
--- a/sdk/python/using-mltable/README.md
+++ b/sdk/python/using-mltable/README.md
@ -21,4 +21,5 @@ For more information on Azure ML Tables, read [Working with tables in Azure ML](
 | [Azure ML Tables Quickstart](./quickstart/mltable-quickstart.ipynb) | *Demonstrates an end-to-end example of using MLTable, including asset creation, loading into both interactive sessions and jobs. The data is in parquet format.* |
 | [Azure ML Tables Local-to-Cloud](./local-to-cloud/mltable-local-to-cloud.ipynb) | *Demonstrates how to work with data and tables locally and upload to the cloud as a data asset for improved sharing and reproducibility.* |
 | [Create an Azure ML Table from Delimited Text Files (CSV)](./delimited-files-example/delimited-files-example.ipynb) | *Demonstrates creating an MLTable from delimited files (CSV).* |
+| [Create an Azure ML Table from a Delta Lake table](./delta-lake-example/delta-lake-example.ipynb) | *Demonstrates creating an MLTable from a Delta Lake table on Azure storage.* |
 | [Create an Azure ML Table of paths](./from-paths-example/from-paths-example.ipynb) | *Demonstrates creating a Table of paths on cloud storage that can then be streamed into a Python session.* |
--- a/sdk/python/using-mltable/delimited-files-example/job-env/conda_dependencies.yml
+++ b/sdk/python/using-mltable/delimited-files-example/job-env/conda_dependencies.yml
@ -2,5 +2,5 @@ dependencies:
  - python=3.10
  - pip=21.2.4
  - pip:
-      - mltable==1.2.0
-      - azureml-dataprep[pandas]==4.9.5
+      - mltable==1.3.0
+      - azureml-dataprep[pandas]==4.10.6
--- a/sdk/python/using-mltable/delta-lake-example/delta-lake-example.ipynb
+++ b/sdk/python/using-mltable/delta-lake-example/delta-lake-example.ipynb
@ -0,0 +1,290 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create a Table from Delta Lake\n",
+    "\n",
+    "In this example notebook you will create an AzureML Table from a Delta Table."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 📦 Install dependencies\n",
+    "\n",
+    "Ensure you have the latest MLTable library and dependencies."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -r ../mltable-requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 🐍 Create an MLTable using the Python SDK\n",
+    "\n",
+    "Here you build your data loading steps using the `mltable` Python SDK. The `show()` method allows you to see the effect of the data loading transformation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mltable\n",
+    "\n",
+    "# URI of the Delta Lake table on Azure Blob Storage\n",
+    "delta_table_uri = \"wasbs://data@azuremlexampledata.blob.core.windows.net/COVID-19_NYT\"\n",
+    "\n",
+    "# create an MLTable from the Delta table as of the given timestamp\n",
+    "tbl = mltable.from_delta_lake(delta_table_uri, timestamp_as_of=\"2022-10-01T00:00:00Z\")\n",
+    "\n",
+    "# show the first 5 records\n",
+    "tbl.show(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 🐼 Load into a Pandas data frame\n",
+    "\n",
+    "You can load your Azure ML Table into Pandas using:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = tbl.to_pandas_dataframe()\n",
+    "df.head(5)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 💾 Save data loading steps \n",
+    "Next, you'll save all your data loading steps into an `MLTable` file. This allows you to *reproduce* your Pandas data frame at a later point in time without having to redefine the data loading steps in your code."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save the data loading steps in an MLTable file\n",
+    "tbl.save(\"./covid\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 🔍 View the saved file\n",
+    "\n",
+    "In the next code cell, we show you the `MLTable` file so you can understand how the data loading steps are serialized into a file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"./covid/MLTable\", \"r\") as f:\n",
+    "    print(f.read())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ♻️ Reproduce data loading steps\n",
+    "\n",
+    "Now that the data loading steps have been serialized into a file, you can reproduce them at any point in time using the `load()` method. This means you do not need to redefine your data loading steps in code and makes it easier to share with others."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mltable\n",
+    "\n",
+    "# load the previously saved MLTable file\n",
+    "tbl = mltable.load(\"./covid/\")\n",
+    "df = tbl.to_pandas_dataframe()\n",
+    "df.head(5)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 🤝 Create a data asset to aid sharing and reproducibility\n",
+    "\n",
+    "You'll now create a data asset, which will automatically upload the `MLTable` to cloud storage (the default AzureML datastore) so that others can use it easily."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "subscription_id = \"<SUBSCRIPTION_ID>\"\n",
+    "resource_group = \"<RESOURCE_GROUP>\"\n",
+    "workspace = \"<AML_WORKSPACE_NAME>\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "from azure.ai.ml import MLClient\n",
+    "from azure.ai.ml.entities import Data\n",
+    "from azure.ai.ml.constants import AssetTypes\n",
+    "from azure.identity import DefaultAzureCredential\n",
+    "\n",
+    "# set the version number of the data asset to the current UTC time\n",
+    "VERSION = time.strftime(\"%Y.%m.%d.%H%M%S\", time.gmtime())\n",
+    "\n",
+    "# connect to the AzureML workspace\n",
+    "ml_client = MLClient(\n",
+    "    DefaultAzureCredential(), subscription_id, resource_group, workspace\n",
+    ")\n",
+    "\n",
+    "my_data = Data(\n",
+    "    path=\"./covid\",\n",
+    "    type=AssetTypes.MLTABLE,\n",
+    "    description=\"COVID-19 dataset.\",\n",
+    "    name=\"covid-delta-example\",\n",
+    "    version=VERSION,\n",
+    ")\n",
+    "\n",
+    "ml_client.data.create_or_update(my_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 📖 Read the data asset in an interactive session\n",
+    "\n",
+    "Now that you have your MLTable stored in the cloud, you and your team members can access it using a friendly name in an interactive session (for example, a notebook)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mltable\n",
+    "from azure.ai.ml import MLClient\n",
+    "from azure.identity import DefaultAzureCredential\n",
+    "\n",
+    "# connect to the AzureML workspace\n",
+    "ml_client = MLClient(\n",
+    "    DefaultAzureCredential(), subscription_id, resource_group, workspace\n",
+    ")\n",
+    "\n",
+    "# get the latest version of the data asset\n",
+    "# Note: The version was set in the previous code cell.\n",
+    "data_asset = ml_client.data.get(name=\"covid-delta-example\", version=VERSION)\n",
+    "\n",
+    "# create a table\n",
+    "tbl = mltable.load(f\"azureml:/{data_asset.id}\")\n",
+    "\n",
+    "# load into pandas\n",
+    "df = tbl.to_pandas_dataframe()\n",
+    "df.head(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 📖 Read the data asset in a job\n",
+    "\n",
+    "You can also access your Table in a job, using:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.ml import MLClient, command, Input\n",
+    "from azure.ai.ml.entities import Environment\n",
+    "from azure.identity import DefaultAzureCredential\n",
+    "\n",
+    "# connect to the AzureML workspace\n",
+    "ml_client = MLClient(\n",
+    "    DefaultAzureCredential(), subscription_id, resource_group, workspace\n",
+    ")\n",
+    "\n",
+    "# get the latest version of the data asset\n",
+    "# Note: the VERSION was set in a previous cell.\n",
+    "data_asset = ml_client.data.get(name=\"covid-delta-example\", version=VERSION)\n",
+    "\n",
+    "job = command(\n",
+    "    command=\"python train.py --input ${{inputs.titanic}}\",\n",
+    "    inputs={\"titanic\": Input(type=\"mltable\", path=data_asset.id)},\n",
+    "    compute=\"cpu-cluster\",\n",
+    "    environment=Environment(\n",
+    "        image=\"mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04\",\n",
+    "        conda_file=\"./job-env/conda_dependencies.yml\",\n",
+    "    ),\n",
+    "    code=\"./src\",\n",
+    ")\n",
+    "\n",
+    "ml_client.jobs.create_or_update(job)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.10 - SDK V2",
+   "language": "python",
+   "name": "python310-sdkv2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/sdk/python/using-mltable/delta-lake-example/job-env/conda_dependencies.yml
+++ b/sdk/python/using-mltable/delta-lake-example/job-env/conda_dependencies.yml
@ -0,0 +1,6 @@
+dependencies:
+  - python=3.10
+  - pip=21.2.4
+  - pip:
+      - mltable==1.3.0
+      - azureml-dataprep[pandas]==4.10.6
--- a/sdk/python/using-mltable/local-to-cloud/job-env/src/train.py
+++ b/sdk/python/using-mltable/local-to-cloud/job-env/src/train.py
--- a/sdk/python/using-mltable/local-to-cloud/job-env/conda_dependencies.yml
+++ b/sdk/python/using-mltable/local-to-cloud/job-env/conda_dependencies.yml
@ -2,5 +2,5 @@ dependencies:
  - python=3.10
  - pip=21.2.4
  - pip:
-      - mltable==1.2.0
-      - azureml-dataprep[pandas]==4.9.5
+      - mltable==1.3.0
+      - azureml-dataprep[pandas]==4.10.6
--- a/sdk/python/using-mltable/mltable-requirements.txt
+++ b/sdk/python/using-mltable/mltable-requirements.txt
@ -1,2 +1,2 @@
-mltable==1.2.0
-azureml-dataprep[pandas]==4.9.5
+mltable==1.3.0
+azureml-dataprep[pandas]==4.10.6
--- a/sdk/python/using-mltable/quickstart/job-env/conda_dependencies.yml
+++ b/sdk/python/using-mltable/quickstart/job-env/conda_dependencies.yml
@ -2,5 +2,5 @@ dependencies:
  - python=3.10
  - pip=21.2.4
  - pip:
-      - mltable==1.1.0
-      - azureml-dataprep[pandas]
+      - mltable==1.3.0
+      - azureml-dataprep[pandas]==4.10.6