Refactoring and cleaning up data references

This commit is contained in:
John Wu 2019-11-26 01:50:14 +00:00 committed by Ubuntu
Parent c77de014c8
Commit 71e731dfcf
6 changed files: 16 additions and 25 deletions

View file

@ -1,19 +1,16 @@
import os
import requests
import argparse
import pandas as pd
from time_util import time_limit
# Define arguments
parser = argparse.ArgumentParser(description='Web scraping arg parser')
parser.add_argument('--root_dir', type=str, help='Root directory of datastore')
parser.add_argument('--raw_data_dir', type=str, help='Directory to store output raw data')
parser.add_argument('--output_dir', type=str, help='Directory to store output raw data')
parser.add_argument('--num_images', type=int, help='Number of images per class')
args = parser.parse_args()
# Get arguments from parser
root_dir = args.root_dir
raw_data_dir = args.raw_data_dir
output_dir = args.output_dir
num_images = args.num_images
# Set search headers and URL
@ -27,14 +24,10 @@ search_url = 'https://eastus.api.cognitive.microsoft.com/bing/v7.0/images/search
# Define classes
classes = ['airpane', 'automobile', 'bird', 'cat', 'ship']
# Write classes file
classes_df = pd.DataFrame(classes)
classes_df.to_csv(os.path.join(root_dir, 'classes.txt'), index=False)
# Make query for each class and download images
for name in classes:
dir_name = os.path.join(raw_data_dir, name)
dir_name = os.path.join(output_dir, name)
if not os.path.exists(dir_name):
os.makedirs(dir_name)

View file

@ -3,7 +3,7 @@ from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.core import PipelineData
def data_ingestion_step(root_dir, compute_target):
def data_ingestion_step(datastore_reference, compute_target):
run_config = RunConfiguration()
run_config.environment.environment_variables = {'COGNITIVE_SERVICES_API_KEY': os.environ['COGNITIVE_SERVICES_API_KEY']}
@ -11,16 +11,17 @@ def data_ingestion_step(root_dir, compute_target):
raw_data_dir = PipelineData(
name='raw_data_dir',
datastore=root_dir.datastore,
output_mode='mount')
datastore=datastore_reference.datastore,
output_mode='mount',
is_directory=True)
step = PythonScriptStep(
script_name='data_ingestion.py',
arguments=['--root_dir', root_dir, '--raw_data_dir', raw_data_dir, '--num_images', 5],
inputs=[root_dir],
arguments=['--output_dir', raw_data_dir, '--num_images', 5],
inputs=[datastore_reference],
outputs=[raw_data_dir],
compute_target=compute_target,
source_directory='src',
source_directory=os.path.dirname(os.path.abspath(__file__)),
runconfig=run_config,
allow_reuse=False
)

View file

View file

View file

@ -10,7 +10,7 @@ def data_preprocess_step(raw_data_dir, compute_target):
arguments=['--raw_data_dir', raw_data_dir],
inputs=[raw_data_dir],
compute_target=compute_target,
source_directory='src'
source_directory=os.path.dirname(os.path.abspath(__file__))
)
return step

View file

@ -2,22 +2,19 @@ from azureml.core import Workspace
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.data.data_reference import DataReference
from modules.data_ingestion_step import data_ingestion_step
from modules.data_preprocess_step import data_preprocess_step
from modules.ingestion.data_ingestion_step import data_ingestion_step
from modules.preprocess.data_preprocess_step import data_preprocess_step
# Get workspace, datastores, and compute targets
workspace = Workspace.from_config()
datastore = workspace.get_default_datastore()
cpu_compute_target = workspace.compute_targets['ds3cluster']
# Get datastore root directory reference
datastore_root_dir = DataReference(datastore,
data_reference_name='datastore_root_dir',
path_on_datastore='object_recognition_data',
mode='mount')
# Get datastore reference
datastore = DataReference(datastore, mode='mount')
# Step 1: Data ingestion step
data_ingestion_step, data_ingestion_outputs = data_ingestion_step(datastore_root_dir, cpu_compute_target)
data_ingestion_step, data_ingestion_outputs = data_ingestion_step(datastore, cpu_compute_target)
# Step 2: Data preprocessing step
data_preprocess_step = data_preprocess_step(data_ingestion_outputs[0], cpu_compute_target)