Refactoring and cleaning up data references

This commit is contained in:
John Wu 2019-11-26 01:50:14 +00:00 committed by Ubuntu
Parent c77de014c8
Commit 71e731dfcf
6 changed files: 16 additions and 25 deletions

View file

@ -1,19 +1,16 @@
import os
import requests
import argparse
import pandas as pd
from time_util import time_limit
# Define arguments
parser = argparse.ArgumentParser(description='Web scraping arg parser')
parser.add_argument('--root_dir', type=str, help='Root directory of datastore')
parser.add_argument('--raw_data_dir', type=str, help='Directory to store output raw data')
parser.add_argument('--output_dir', type=str, help='Directory to store output raw data')
parser.add_argument('--num_images', type=int, help='Number of images per class')
args = parser.parse_args()
# Get arguments from parser
root_dir = args.root_dir
raw_data_dir = args.raw_data_dir
output_dir = args.output_dir
num_images = args.num_images
# Set search headers and URL
@ -27,14 +24,10 @@ search_url = 'https://eastus.api.cognitive.microsoft.com/bing/v7.0/images/search
# Define classes
classes = ['airpane', 'automobile', 'bird', 'cat', 'ship']
# Write classes file
classes_df = pd.DataFrame(classes)
classes_df.to_csv(os.path.join(root_dir, 'classes.txt'), index=False)
# Make query for each class and download images
for name in classes:
dir_name = os.path.join(raw_data_dir, name)
dir_name = os.path.join(output_dir, name)
if not os.path.exists(dir_name):
os.makedirs(dir_name)

View file

@ -3,7 +3,7 @@ from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.core import PipelineData
def data_ingestion_step(root_dir, compute_target):
def data_ingestion_step(datastore_reference, compute_target):
run_config = RunConfiguration()
run_config.environment.environment_variables = {'COGNITIVE_SERVICES_API_KEY': os.environ['COGNITIVE_SERVICES_API_KEY']}
@ -11,16 +11,17 @@ def data_ingestion_step(root_dir, compute_target):
raw_data_dir = PipelineData(
name='raw_data_dir',
datastore=root_dir.datastore,
output_mode='mount')
datastore=datastore_reference.datastore,
output_mode='mount',
is_directory=True)
step = PythonScriptStep(
script_name='data_ingestion.py',
arguments=['--root_dir', root_dir, '--raw_data_dir', raw_data_dir, '--num_images', 5],
inputs=[root_dir],
arguments=['--output_dir', raw_data_dir, '--num_images', 5],
inputs=[datastore_reference],
outputs=[raw_data_dir],
compute_target=compute_target,
source_directory='src',
source_directory=os.path.dirname(os.path.abspath(__file__)),
runconfig=run_config,
allow_reuse=False
)

View file

View file

View file

@ -10,7 +10,7 @@ def data_preprocess_step(raw_data_dir, compute_target):
arguments=['--raw_data_dir', raw_data_dir],
inputs=[raw_data_dir],
compute_target=compute_target,
source_directory='src'
source_directory=os.path.dirname(os.path.abspath(__file__))
)
return step

View file

@ -2,22 +2,19 @@ from azureml.core import Workspace
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.data.data_reference import DataReference
from modules.data_ingestion_step import data_ingestion_step
from modules.data_preprocess_step import data_preprocess_step
from modules.ingestion.data_ingestion_step import data_ingestion_step
from modules.preprocess.data_preprocess_step import data_preprocess_step
# Get workspace, datastores, and compute targets
workspace = Workspace.from_config()
datastore = workspace.get_default_datastore()
cpu_compute_target = workspace.compute_targets['ds3cluster']
# Get datastore root directory reference
datastore_root_dir = DataReference(datastore,
data_reference_name='datastore_root_dir',
path_on_datastore='object_recognition_data',
mode='mount')
# Get datastore reference
datastore = DataReference(datastore, mode='mount')
# Step 1: Data ingestion step
data_ingestion_step, data_ingestion_outputs = data_ingestion_step(datastore_root_dir, cpu_compute_target)
data_ingestion_step, data_ingestion_outputs = data_ingestion_step(datastore, cpu_compute_target)
# Step 2: Data preprocessing step
data_preprocess_step = data_preprocess_step(data_ingestion_outputs[0], cpu_compute_target)