Refactoring and cleaning up data references
This commit is contained in:
Parent: c77de014c8
Commit: 71e731dfcf
@@ -1,19 +1,16 @@
 import os
 import requests
 import argparse
-import pandas as pd
 from time_util import time_limit
 
 # Define arguments
 parser = argparse.ArgumentParser(description='Web scraping arg parser')
-parser.add_argument('--root_dir', type=str, help='Root directory of datastore')
-parser.add_argument('--raw_data_dir', type=str, help='Directory to store output raw data')
+parser.add_argument('--output_dir', type=str, help='Directory to store output raw data')
 parser.add_argument('--num_images', type=int, help='Number of images per class')
 args = parser.parse_args()
 
 # Get arguments from parser
-root_dir = args.root_dir
-raw_data_dir = args.raw_data_dir
+output_dir = args.output_dir
 num_images = args.num_images
 
 # Set search headers and URL
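The script's interface shrinks from three path arguments to one: the ingestion script now writes everything under a single `--output_dir`, and datastore mounting is left to the pipeline. A minimal sketch of exercising the trimmed parser (the local path is hypothetical; in a pipeline run AzureML substitutes a mounted `PipelineData` path):

```python
# Hypothetical local invocation; equivalent to:
#   python data_ingestion.py --output_dir /tmp/raw_data --num_images 5
args = parser.parse_args(['--output_dir', '/tmp/raw_data', '--num_images', '5'])
assert args.output_dir == '/tmp/raw_data' and args.num_images == 5
```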
@@ -27,14 +24,10 @@ search_url = 'https://eastus.api.cognitive.microsoft.com/bing/v7.0/images/search
 # Define classes
 classes = ['airplane', 'automobile', 'bird', 'cat', 'ship']
 
-# Write classes file
-classes_df = pd.DataFrame(classes)
-classes_df.to_csv(os.path.join(root_dir, 'classes.txt'), index=False)
-
 # Make query for each class and download images
 for name in classes:
 
-    dir_name = os.path.join(raw_data_dir, name)
+    dir_name = os.path.join(output_dir, name)
     if not os.path.exists(dir_name):
         os.makedirs(dir_name)
 
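The body of the download loop falls outside this hunk. For orientation, a hedged sketch of what a Bing Image Search v7 pass over `search_url` typically looks like, continuing inside the `for name in classes:` loop above and assuming the subscription key is read from `COGNITIVE_SERVICES_API_KEY` as in the step definition below; none of this is part of the commit:

```python
# Sketch only: fetch up to num_images results for one class and save them.
headers = {'Ocp-Apim-Subscription-Key': os.environ['COGNITIVE_SERVICES_API_KEY']}
params = {'q': name, 'count': num_images, 'imageType': 'photo'}
response = requests.get(search_url, headers=headers, params=params)
response.raise_for_status()
for i, result in enumerate(response.json()['value']):
    image = requests.get(result['contentUrl'], timeout=10)  # may fail; real code should guard
    with open(os.path.join(dir_name, '{}.jpg'.format(i)), 'wb') as f:
        f.write(image.content)
```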
@@ -3,7 +3,7 @@ from azureml.pipeline.steps import PythonScriptStep
 from azureml.core.runconfig import RunConfiguration
 from azureml.pipeline.core import PipelineData
 
-def data_ingestion_step(root_dir, compute_target):
+def data_ingestion_step(datastore_reference, compute_target):
 
     run_config = RunConfiguration()
     run_config.environment.environment_variables = {'COGNITIVE_SERVICES_API_KEY': os.environ['COGNITIVE_SERVICES_API_KEY']}
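Renaming the parameter to `datastore_reference` makes the contract explicit: the step receives a `DataReference`, not a raw path. The run configuration also forwards the local `COGNITIVE_SERVICES_API_KEY` into the remote run's environment; a defensive variant (an assumption, not in the commit) would fail at pipeline build time instead of at remote run time:

```python
import os
from azureml.core.runconfig import RunConfiguration

# Assumed hardening: surface a missing key before the pipeline is submitted.
api_key = os.environ.get('COGNITIVE_SERVICES_API_KEY')
if api_key is None:
    raise RuntimeError('Set COGNITIVE_SERVICES_API_KEY before building the pipeline')

run_config = RunConfiguration()
run_config.environment.environment_variables = {'COGNITIVE_SERVICES_API_KEY': api_key}
```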
@@ -11,16 +11,17 @@ def data_ingestion_step(root_dir, compute_target):
 
     raw_data_dir = PipelineData(
         name='raw_data_dir',
-        datastore=root_dir.datastore,
-        output_mode='mount')
+        datastore=datastore_reference.datastore,
+        output_mode='mount',
+        is_directory=True)
 
     step = PythonScriptStep(
         script_name='data_ingestion.py',
-        arguments=['--root_dir', root_dir, '--raw_data_dir', raw_data_dir, '--num_images', 5],
-        inputs=[root_dir],
+        arguments=['--output_dir', raw_data_dir, '--num_images', 5],
+        inputs=[datastore_reference],
         outputs=[raw_data_dir],
         compute_target=compute_target,
-        source_directory='src',
+        source_directory=os.path.dirname(os.path.abspath(__file__)),
         runconfig=run_config,
         allow_reuse=False
     )
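At submission time AzureML replaces the `raw_data_dir` `PipelineData` object in `arguments` with a concrete mounted path, which is why `data_ingestion.py` now needs only `--output_dir`; `is_directory=True` declares that the output is a directory tree rather than a single file. How the step is consumed, mirroring the pipeline script below (the two-value return shape is inferred from that call site, since the module's `return` is outside this hunk):

```python
# Inferred usage: the function hands back the step plus its PipelineData
# outputs so downstream steps can take them as inputs without path strings.
step, outputs = data_ingestion_step(datastore, cpu_compute_target)
raw_data_dir = outputs[0]  # PipelineData named 'raw_data_dir'
```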
@@ -10,7 +10,7 @@ def data_preprocess_step(raw_data_dir, compute_target):
         arguments=['--raw_data_dir', raw_data_dir],
         inputs=[raw_data_dir],
         compute_target=compute_target,
-        source_directory='src'
+        source_directory=os.path.dirname(os.path.abspath(__file__))
     )
 
     return step
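Both step modules swap the hard-coded `source_directory='src'` for a path derived from the module's own location, so building the pipeline no longer depends on the caller's working directory. A standalone illustration of the difference:

```python
import os

# 'src' resolved against os.getcwd(), i.e. wherever the pipeline script was
# launched from; this resolves against the module file itself.
here = os.path.dirname(os.path.abspath(__file__))
print(here)
```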
@@ -2,22 +2,19 @@ from azureml.core import Workspace
 from azureml.core import Experiment
 from azureml.pipeline.core import Pipeline
 from azureml.data.data_reference import DataReference
-from modules.data_ingestion_step import data_ingestion_step
-from modules.data_preprocess_step import data_preprocess_step
+from modules.ingestion.data_ingestion_step import data_ingestion_step
+from modules.preprocess.data_preprocess_step import data_preprocess_step
 
 # Get workspace, datastores, and compute targets
 workspace = Workspace.from_config()
 datastore = workspace.get_default_datastore()
 cpu_compute_target = workspace.compute_targets['ds3cluster']
 
-# Get datastore root directory reference
-datastore_root_dir = DataReference(datastore,
-                                   data_reference_name='datastore_root_dir',
-                                   path_on_datastore='object_recognition_data',
-                                   mode='mount')
+# Get datastore reference
+datastore = DataReference(datastore, mode='mount')
 
 # Step 1: Data ingestion step
-data_ingestion_step, data_ingestion_outputs = data_ingestion_step(datastore_root_dir, cpu_compute_target)
+data_ingestion_step, data_ingestion_outputs = data_ingestion_step(datastore, cpu_compute_target)
 
 # Step 2: Data preprocessing step
 data_preprocess_step = data_preprocess_step(data_ingestion_outputs[0], cpu_compute_target)
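The pipeline script now works against a bare `DataReference` to the default datastore's root instead of a named `object_recognition_data` subdirectory, and the step modules move into `modules/ingestion/` and `modules/preprocess/` packages. The commit ends before the steps are submitted; a minimal submission sketch under the usual AzureML pattern (experiment name assumed):

```python
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Assemble and submit the two-step pipeline; 'object-recognition' is a
# hypothetical experiment name, not taken from the commit.
pipeline = Pipeline(workspace=workspace, steps=[data_ingestion_step, data_preprocess_step])
run = Experiment(workspace, 'object-recognition').submit(pipeline)
run.wait_for_completion(show_output=True)
```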