From d6e18ed9ea03a66128bf1cb51cca2de01e45b321 Mon Sep 17 00:00:00 2001
From: John Wu
Date: Mon, 9 Dec 2019 13:54:20 -0800
Subject: [PATCH] Added comments and refactored step outputs into hash maps

---
 modules/__pycache__/__init__.cpython-36.pyc     | Bin 158 -> 0 bytes
 .../data_ingestion_step.cpython-36.pyc          | Bin 970 -> 0 bytes
 .../data_preprocess_step.cpython-36.pyc         | Bin 639 -> 0 bytes
 .../__pycache__/deploy_step.cpython-36.pyc      | Bin 914 -> 0 bytes
 .../__pycache__/deploy_step.cpython-37.pyc      | Bin 935 -> 0 bytes
 modules/deploy/deploy_step.py                   |  19 ++++-
 .../.ipynb_checkpoints/evaluate-checkpoint.py   |  76 ------------------
 .../evaluate_step-checkpoint.py                 |  40 ---------
 .../__pycache__/evaluate_step.cpython-36.pyc    | Bin 1219 -> 0 bytes
 .../__pycache__/evaluate_step.cpython-37.pyc    | Bin 1240 -> 0 bytes
 modules/evaluate/evaluate.py                    |   8 +-
 modules/evaluate/evaluate_step.py               |  16 +++-
 .../data_ingestion_step-checkpoint.py           |  35 --------
 .../data_ingestion_step.cpython-36.pyc          | Bin 1169 -> 0 bytes
 .../data_ingestion_step.cpython-37.pyc          | Bin 1190 -> 0 bytes
 modules/ingestion/data_ingestion_step.py        |  17 +++-
 .../data_preprocess-checkpoint.py               |  72 -----------------
 .../data_preprocess_step-checkpoint.py          |  56 -------------
 .../data_preprocess_step.cpython-36.pyc         | Bin 1435 -> 0 bytes
 .../data_preprocess_step.cpython-37.pyc         | Bin 1456 -> 0 bytes
 modules/preprocess/data_preprocess_step.py      |  19 ++++-
 .../.ipynb_checkpoints/train-checkpoint.py      |  11 ---
 .../train_step-checkpoint.py                    |  49 ----------
 .../__pycache__/train_step.cpython-36.pyc       | Bin 1463 -> 0 bytes
 .../__pycache__/train_step.cpython-37.pyc       | Bin 1484 -> 0 bytes
 modules/train/train_step.py                     |  17 +++-
 object-recognition-pipeline.py                  |   8 +-
 27 files changed, 93 insertions(+), 350 deletions(-)
 delete mode 100644 modules/__pycache__/__init__.cpython-36.pyc
 delete mode 100644 modules/__pycache__/data_ingestion_step.cpython-36.pyc
 delete mode 100644 modules/__pycache__/data_preprocess_step.cpython-36.pyc
 delete mode 100644 modules/deploy/__pycache__/deploy_step.cpython-36.pyc
 delete mode 100644 modules/deploy/__pycache__/deploy_step.cpython-37.pyc
 delete mode 100644 modules/evaluate/.ipynb_checkpoints/evaluate-checkpoint.py
 delete mode 100644 modules/evaluate/.ipynb_checkpoints/evaluate_step-checkpoint.py
 delete mode 100644 modules/evaluate/__pycache__/evaluate_step.cpython-36.pyc
 delete mode 100644 modules/evaluate/__pycache__/evaluate_step.cpython-37.pyc
 delete mode 100644 modules/ingestion/.ipynb_checkpoints/data_ingestion_step-checkpoint.py
 delete mode 100644 modules/ingestion/__pycache__/data_ingestion_step.cpython-36.pyc
 delete mode 100644 modules/ingestion/__pycache__/data_ingestion_step.cpython-37.pyc
 delete mode 100644 modules/preprocess/.ipynb_checkpoints/data_preprocess-checkpoint.py
 delete mode 100644 modules/preprocess/.ipynb_checkpoints/data_preprocess_step-checkpoint.py
 delete mode 100644 modules/preprocess/__pycache__/data_preprocess_step.cpython-36.pyc
 delete mode 100644 modules/preprocess/__pycache__/data_preprocess_step.cpython-37.pyc
 delete mode 100644 modules/train/.ipynb_checkpoints/train-checkpoint.py
 delete mode 100644 modules/train/.ipynb_checkpoints/train_step-checkpoint.py
 delete mode 100644 modules/train/__pycache__/train_step.cpython-36.pyc
 delete mode 100644 modules/train/__pycache__/train_step.cpython-37.pyc
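Note: the core refactor in this patch replaces each step's positional output list with a small hash map keyed by output name, so the pipeline wiring no longer depends on index order. A minimal before/after sketch of the calling pattern (illustrative only; the function and key names come from the diffs that follow):

```python
# Before: steps returned a positional list, so callers had to remember
# which index held which PipelineData output.
step, outputs = data_preprocess_step(raw_data_dir, cpu_compute_target)
train_dir, valid_dir, test_dir = outputs[0], outputs[1], outputs[2]

# After: steps return a dictionary keyed by output name, so the wiring
# reads unambiguously and survives any reordering of the outputs.
step, outputs = data_preprocess_step(raw_data_dir, cpu_compute_target)
train_dir = outputs['train_dir']
valid_dir = outputs['valid_dir']
test_dir = outputs['test_dir']
```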
diff --git a/modules/__pycache__/__init__.cpython-36.pyc b/modules/__pycache__/__init__.cpython-36.pyc
deleted file mode 100644
index 30a667add6e019475d2b080cba38562247ff27f8..0000000000000000000000000000000000000000
Binary files a/modules/__pycache__/__init__.cpython-36.pyc and /dev/null differ
diff --git a/modules/__pycache__/data_ingestion_step.cpython-36.pyc b/modules/__pycache__/data_ingestion_step.cpython-36.pyc
deleted file mode 100644
index 3a65bed396bae16c07ffdc3b1a7630ad1fb578d8..0000000000000000000000000000000000000000
Binary files a/modules/__pycache__/data_ingestion_step.cpython-36.pyc and /dev/null differ
diff --git a/modules/__pycache__/data_preprocess_step.cpython-36.pyc b/modules/__pycache__/data_preprocess_step.cpython-36.pyc
deleted file mode 100644
index 5fa44a6a514372f1501a874206dff194635bcbc7..0000000000000000000000000000000000000000
Binary files a/modules/__pycache__/data_preprocess_step.cpython-36.pyc and /dev/null differ
diff --git a/modules/deploy/__pycache__/deploy_step.cpython-36.pyc b/modules/deploy/__pycache__/deploy_step.cpython-36.pyc
deleted file mode 100644
index bd3fedef9c3d378dfb7fb97a75122f59fb176085..0000000000000000000000000000000000000000
Binary files a/modules/deploy/__pycache__/deploy_step.cpython-36.pyc and /dev/null differ
diff --git a/modules/deploy/__pycache__/deploy_step.cpython-37.pyc b/modules/deploy/__pycache__/deploy_step.cpython-37.pyc
deleted file mode 100644
index 8e6f97704173a553ac8239cb050da0d019ba70be..0000000000000000000000000000000000000000
Binary files a/modules/deploy/__pycache__/deploy_step.cpython-37.pyc and /dev/null differ
diff --git a/modules/deploy/deploy_step.py b/modules/deploy/deploy_step.py
index 1692902..891458f 100644
--- a/modules/deploy/deploy_step.py
+++ b/modules/deploy/deploy_step.py
@@ -4,6 +4,22 @@ from azureml.pipeline.core import PipelineData
 from azureml.pipeline.core import PipelineParameter
 
 def deploy_step(model_dir, accuracy_file, test_dir, compute_target):
+    '''
+    This step registers and deploys a new model on its first run. On subsequent runs, it only registers
+    and deploys a new model if the training dataset has changed, or if the dataset is unchanged but the accuracy has improved.
+
+    :param model_dir: The reference to the directory containing the trained model
+    :type model_dir: DataReference
+    :param accuracy_file: The reference to the file containing the evaluation accuracy
+    :type accuracy_file: DataReference
+    :param test_dir: The reference to the directory containing the testing data
+    :type test_dir: DataReference
+    :param compute_target: The compute target to run the step on
+    :type compute_target: ComputeTarget
+
+    :return: The deploy step, step outputs dictionary (keys: scoring_url)
+    :rtype: PythonScriptStep, dict
+    '''
 
     scoring_url = PipelineData(
         name='scoring_url',
@@ -13,6 +29,7 @@ def deploy_step(model_dir, accuracy_file, test_dir, compute_target):
         is_directory=False)
 
     outputs = [scoring_url]
+    outputs_map = { 'scoring_url': scoring_url }
 
     step = PythonScriptStep(
         script_name='deploy.py',
@@ -29,5 +46,5 @@ def deploy_step(model_dir, accuracy_file, test_dir, compute_target):
         allow_reuse=False
     )
 
-    return step, outputs
+    return step, outputs_map
\ No newline at end of file
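Because scoring_url is published with pipeline_output_name='scoring_url', the deployed endpoint URL can be pulled back from a finished run. A minimal sketch, assuming the SDK v1 PipelineRun API and an already completed run object named pipeline_run (none of this is part of the patch):

```python
# Assumed SDK v1 usage: fetch the named pipeline output produced by the
# deploy step and download the single file that contains the scoring URL.
port_data = pipeline_run.get_pipeline_output('scoring_url')
port_data.download(local_path='.')
```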
diff --git a/modules/evaluate/.ipynb_checkpoints/evaluate-checkpoint.py b/modules/evaluate/.ipynb_checkpoints/evaluate-checkpoint.py
deleted file mode 100644
index ecd5737..0000000
--- a/modules/evaluate/.ipynb_checkpoints/evaluate-checkpoint.py
+++ /dev/null
@@ -1,76 +0,0 @@
-from __future__ import print_function, division
-import argparse
-import time
-import torch
-import torch.nn as nn
-from torchvision import datasets, models, transforms
-
-def load_data(test_dir):
-
-    test_transform = transforms.Compose([
-        transforms.Resize(200),
-        transforms.CenterCrop(200),
-        transforms.ToTensor(),
-        transforms.Normalize(mean=[0.485, 0.456, 0.405],
-                             std=[0.229, 0.224, 0.225])
-    ])
-
-    test_dataset = datasets.ImageFolder(root=test_dir, transform=test_transform)
-    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=True, num_workers=4)
-
-    dataset_size = len(test_loader.dataset)
-    class_names = test_dataset.classes
-
-    return test_loader, dataset_size, class_names
-
-def evaluate_model(model, criterion, dataloader, dataset_size, class_names, device):
-
-    model.eval()
-    running_loss = 0.0
-    running_corrects = 0
-
-    for batch_idx, (inputs, labels) in enumerate(dataloader):
-        inputs = inputs.to(device)
-        labels = labels.to(device)
-
-        outputs = model(inputs)
-        _, preds = torch.max(outputs, 1)
-        loss = criterion(outputs, labels)
-
-        running_loss += loss.item() * inputs.size(0)
-        corrects = torch.sum(preds == labels.data)
-        running_corrects += corrects
-
-    print('{}/{} predictions correct.'.format(running_corrects, dataset_size))
-    loss = running_loss / dataset_size
-    acc = running_corrects.double() / dataset_size
-    print('Loss: {:.4f} Acc: {:.4f}'.format(loss, acc))
-
-    return acc
-
- # Define arguments
-parser = argparse.ArgumentParser(description='Evaluate arg parser')
-parser.add_argument('--test_dir', type=str, help='Directory where testing data is stored')
-parser.add_argument('--model_dir', type=str, help='Directory where model is stored')
-parser.add_argument('--accuracy_file', type=str, help='File to output the accuracy to')
-args = parser.parse_args()
-
-# Get arguments from parser
-test_dir = args.test_dir
-model_dir = args.model_dir
-accuracy_file = args.accuracy_file
-
-# Load testing data, model, and device
-test_loader, dataset_size, class_names = load_data(test_dir)
-model = torch.load(os.path.join(model_dir,'model.pt'))
-device = torch.device('cuda:0')
-
-# Define criterion
-criterion = nn.CrossEntropyLoss()
-
-# Evaluate model
-acc = evaluate_model(model, criterion, test_loader, dataset_size, class_names, device)
-
-# Output accuracy to file
-with open(accuracy_file, 'w+') as f:
-    f.write(str(acc.item()))
diff --git a/modules/evaluate/.ipynb_checkpoints/evaluate_step-checkpoint.py b/modules/evaluate/.ipynb_checkpoints/evaluate_step-checkpoint.py
deleted file mode 100644
index a3e613d..0000000
--- a/modules/evaluate/.ipynb_checkpoints/evaluate_step-checkpoint.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import os
-from azureml.pipeline.steps import PythonScriptStep
-from azureml.core.runconfig import RunConfiguration
-from azureml.core.conda_dependencies import CondaDependencies
-from azureml.pipeline.core import PipelineData
-from azureml.pipeline.core import PipelineParameter
-from azureml.pipeline.steps import EstimatorStep
-from azureml.train.dnn import PyTorch
-
-def evaluate_step(model_dir, test_dir, compute_target):
-
-    accuracy_file = PipelineData(
-        name='accuracy_file',
-        pipeline_output_name='accuracy_file',
-        datastore=test_dir.datastore,
-        output_mode='mount',
-        is_directory=False)
-
-    outputs = [accuracy_file]
-
-    estimator = PyTorch(
-        source_directory=os.path.dirname(os.path.abspath(__file__)),
-        entry_script='evaluate.py',
-        framework_version='1.3',
-        compute_target=compute_target,
-        use_gpu=True)
-
-    step = EstimatorStep(
-        estimator=estimator,
-        estimator_entry_script_arguments=[
-            '--test_dir', test_dir,
-            '--model_dir', model_dir,
-            '--accuracy_file', accuracy_file
-        ],
-        inputs=[model_dir, test_dir],
-        outputs=outputs,
-        compute_target=compute_target,
-        allow_reuse=False)
-
-    return step, outputs
diff --git a/modules/evaluate/__pycache__/evaluate_step.cpython-36.pyc b/modules/evaluate/__pycache__/evaluate_step.cpython-36.pyc
deleted file mode 100644
index d54268437b0c6c409e905627cc8bb1735f77f2b9..0000000000000000000000000000000000000000
Binary files a/modules/evaluate/__pycache__/evaluate_step.cpython-36.pyc and /dev/null differ
diff --git a/modules/evaluate/__pycache__/evaluate_step.cpython-37.pyc b/modules/evaluate/__pycache__/evaluate_step.cpython-37.pyc
deleted file mode 100644
index ed77e06a0cd89cffaccab45d7b05dcc0071df588..0000000000000000000000000000000000000000
Binary files a/modules/evaluate/__pycache__/evaluate_step.cpython-37.pyc and /dev/null differ
diff --git a/modules/ingestion/__pycache__/data_ingestion_step.cpython-36.pyc b/modules/ingestion/__pycache__/data_ingestion_step.cpython-36.pyc
deleted file mode 100644
Binary files a/modules/ingestion/__pycache__/data_ingestion_step.cpython-36.pyc and /dev/null differ
diff --git a/modules/ingestion/__pycache__/data_ingestion_step.cpython-37.pyc b/modules/ingestion/__pycache__/data_ingestion_step.cpython-37.pyc
deleted file mode 100644
Binary files a/modules/ingestion/__pycache__/data_ingestion_step.cpython-37.pyc and /dev/null differ
diff --git a/modules/ingestion/data_ingestion_step.py b/modules/ingestion/data_ingestion_step.py
index ac5199e..90346da 100644
--- a/modules/ingestion/data_ingestion_step.py
+++ b/modules/ingestion/data_ingestion_step.py
@@ -5,6 +5,20 @@ from azureml.pipeline.core import PipelineData
 from azureml.pipeline.core import PipelineParameter
 
 def data_ingestion_step(datastore_reference, compute_target):
+    '''
+    This step will leverage Azure Cognitive Services to search the web for images
+    to create a dataset. This replicates the real-world scenario of data being
+    ingested from a constantly changing source. The same 10 classes in the CIFAR-10 dataset
+    will be used (airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck).
+
+    :param datastore_reference: The reference to the datastore that will be used
+    :type datastore_reference: DataReference
+    :param compute_target: The compute target to run the step on
+    :type compute_target: ComputeTarget
+
+    :return: The ingestion step, step outputs dictionary (keys: raw_data_dir)
+    :rtype: PythonScriptStep, dict
+    '''
 
     run_config = RunConfiguration()
     run_config.environment.environment_variables = {'COGNITIVE_SERVICES_API_KEY': os.environ['COGNITIVE_SERVICES_API_KEY']}
@@ -20,6 +34,7 @@ def data_ingestion_step(datastore_reference, compute_target):
         is_directory=True)
 
     outputs = [raw_data_dir]
+    outputs_map = { 'raw_data_dir': raw_data_dir }
 
     step = PythonScriptStep(
         script_name='data_ingestion.py',
@@ -32,4 +47,4 @@ def data_ingestion_step(datastore_reference, compute_target):
         allow_reuse=False
     )
 
-    return step, outputs
+    return step, outputs_map
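The ingestion step above copies COGNITIVE_SERVICES_API_KEY out of the submitting process's environment and into the step's RunConfiguration, so the key must be present before the pipeline is constructed. A small sketch of the assumption this makes about the driver environment (the placeholder value is hypothetical):

```python
import os

# data_ingestion_step() reads this variable at pipeline-construction time
# and raises a KeyError if it is missing, so export it (or set it here)
# before building the steps.
os.environ.setdefault('COGNITIVE_SERVICES_API_KEY', '<your-cognitive-services-key>')
```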
diff --git a/modules/preprocess/.ipynb_checkpoints/data_preprocess-checkpoint.py b/modules/preprocess/.ipynb_checkpoints/data_preprocess-checkpoint.py
deleted file mode 100644
index 928df7c..0000000
--- a/modules/preprocess/.ipynb_checkpoints/data_preprocess-checkpoint.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import os
-import argparse
-import random
-import cv2
-from imutils import paths
-
-def preprocess_images(files, image_dim, output_dir, label):
-    '''
-    Load files, crop to consistent size, and save to respective folder
-    '''
-    # Make class directory
-    class_directory = '{}/{}'.format(output_dir, label)
-    if not os.path.exists(class_directory):
-        os.makedirs(class_directory)
-
-    # Iterate through files
-    for f in files:
-        temp = f.split('/')
-        output_file = '{}/{}/{}'.format(output_dir, label, temp[-1])
-        try:
-            image = cv2.imread(f)
-            image = cv2.resize(image, (image_dim, image_dim))
-            cv2.imwrite(output_file, image)
-            print('Cropping image: {}'.format(output_file))
-        except:
-            print('Removing corrupted file: {}'.format(output_file))
-
-# Define arguments
-parser = argparse.ArgumentParser(description='Web scraping arg parser')
-parser.add_argument('--raw_data_dir', type=str, help='Directory where raw data is stored')
-parser.add_argument('--image_dim', type=int, help='Image dimension to be cropped to')
-parser.add_argument('--train_dir', type=str, help='Directory to output the processed training data')
-parser.add_argument('--valid_dir', type=str, help='Directory to output the processed valid data')
-parser.add_argument('--test_dir', type=str, help='Directory to output the processed test data')
-args = parser.parse_args()
-
-# Get arguments from parser
-raw_data_dir = args.raw_data_dir
-image_dim = args.image_dim
-train_dir = args.train_dir
-valid_dir = args.valid_dir
-test_dir = args.test_dir
-
-# Make train, valid, test directories
-if not os.path.exists(train_dir):
-    os.makedirs(train_dir)
-
-if not os.path.exists(valid_dir):
-    os.makedirs(valid_dir)
-
-if not os.path.exists(test_dir):
-    os.makedirs(test_dir)
-
-# Get all the classes that have been sorted into directories from previous step
-classes = os.listdir(raw_data_dir)
-
-for label in classes:
-
-    # Get and shuffle files
-    image_files = list(paths.list_images('{}/{}'.format(raw_data_dir, label)))
-    random.shuffle(image_files)
-
-    # Split into train, valid, test sets
-    num_images = len(image_files)
-    train_files = image_files[0:int(num_images*0.7)]
-    valid_files = image_files[int(num_images*0.7):int(num_images*0.9)]
-    test_files = image_files[int(num_images*0.9):num_images]
-
-    # Load files, crop to consistent size, and save to respective folder
-    preprocess_images(train_files, image_dim, train_dir, label)
-    preprocess_images(valid_files, image_dim, valid_dir, label)
-    preprocess_images(test_files, image_dim, test_dir, label)
diff --git a/modules/preprocess/.ipynb_checkpoints/data_preprocess_step-checkpoint.py b/modules/preprocess/.ipynb_checkpoints/data_preprocess_step-checkpoint.py
deleted file mode 100644
index 0c09fee..0000000
--- a/modules/preprocess/.ipynb_checkpoints/data_preprocess_step-checkpoint.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import os
-from azureml.pipeline.steps import PythonScriptStep
-from azureml.core.runconfig import RunConfiguration
-from azureml.core.conda_dependencies import CondaDependencies
-from azureml.pipeline.core import PipelineData
-from azureml.pipeline.core import PipelineParameter
-
-def data_preprocess_step(raw_data_dir, compute_target):
-
-    run_config = RunConfiguration()
-    run_config.environment.python.conda_dependencies = CondaDependencies.create(pip_packages=['opencv-python==4.1.1.26', 'imutils==0.5.3'])
-    run_config.environment.docker.enabled = True
-
-    image_dim = PipelineParameter(name='image_dim', default_value=200)
-
-    train_dir = PipelineData(
-        name='train_dir',
-        pipeline_output_name='train_dir',
-        datastore=raw_data_dir.datastore,
-        output_mode='mount',
-        is_directory=True)
-
-    valid_dir = PipelineData(
-        name='valid_dir',
-        pipeline_output_name='valid_dir',
-        datastore=raw_data_dir.datastore,
-        output_mode='mount',
-        is_directory=True)
-
-    test_dir = PipelineData(
-        name='test_dir',
-        pipeline_output_name='test_dir',
-        datastore=raw_data_dir.datastore,
-        output_mode='mount',
-        is_directory=True)
-
-    outputs = [train_dir, valid_dir, test_dir]
-
-    step = PythonScriptStep(
-        script_name='data_preprocess.py',
-        arguments=[
-            '--raw_data_dir', raw_data_dir,
-            '--train_dir', train_dir,
-            '--valid_dir', valid_dir,
-            '--test_dir', test_dir,
-            '--image_dim', image_dim
-        ],
-        inputs=[raw_data_dir],
-        outputs=outputs,
-        compute_target=compute_target,
-        runconfig=run_config,
-        source_directory=os.path.dirname(os.path.abspath(__file__)),
-        allow_reuse=False
-    )
-
-    return step, outputs
diff --git a/modules/preprocess/__pycache__/data_preprocess_step.cpython-36.pyc b/modules/preprocess/__pycache__/data_preprocess_step.cpython-36.pyc
deleted file mode 100644
index bd3b89036672c604d227c092a3b503f7559cdca7..0000000000000000000000000000000000000000
Binary files a/modules/preprocess/__pycache__/data_preprocess_step.cpython-36.pyc and /dev/null differ
diff --git a/modules/preprocess/__pycache__/data_preprocess_step.cpython-37.pyc b/modules/preprocess/__pycache__/data_preprocess_step.cpython-37.pyc
deleted file mode 100644
Binary files a/modules/preprocess/__pycache__/data_preprocess_step.cpython-37.pyc and /dev/null differ
diff --git a/modules/preprocess/data_preprocess_step.py b/modules/preprocess/data_preprocess_step.py
index 0c09fee..f2de1b9 100644
--- a/modules/preprocess/data_preprocess_step.py
+++ b/modules/preprocess/data_preprocess_step.py
@@ -6,6 +6,18 @@ from azureml.pipeline.core import PipelineData
 from azureml.pipeline.core import PipelineParameter
 
 def data_preprocess_step(raw_data_dir, compute_target):
+    '''
+    This step will take the raw data downloaded from the previous step and preprocess it by cropping
+    it to a consistent size, shuffling the data, and splitting it into train, valid, and test directories.
+
+    :param raw_data_dir: The reference to the directory containing the raw data
+    :type raw_data_dir: DataReference
+    :param compute_target: The compute target to run the step on
+    :type compute_target: ComputeTarget
+
+    :return: The preprocess step, step outputs dictionary (keys: train_dir, valid_dir, test_dir)
+    :rtype: PythonScriptStep, dict
+    '''
 
     run_config = RunConfiguration()
     run_config.environment.python.conda_dependencies = CondaDependencies.create(pip_packages=['opencv-python==4.1.1.26', 'imutils==0.5.3'])
@@ -35,6 +47,11 @@ def data_preprocess_step(raw_data_dir, compute_target):
         is_directory=True)
 
     outputs = [train_dir, valid_dir, test_dir]
+    outputs_map = {
+        'train_dir': train_dir,
+        'valid_dir': valid_dir,
+        'test_dir': test_dir,
+    }
 
     step = PythonScriptStep(
         script_name='data_preprocess.py',
@@ -53,4 +70,4 @@ def data_preprocess_step(raw_data_dir, compute_target):
         allow_reuse=False
     )
 
-    return step, outputs
+    return step, outputs_map
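With the outputs now keyed by name, a quick local sanity check of the step wiring becomes trivial. A hypothetical snippet (assumes the compute target and ingestion outputs from object-recognition-pipeline.py are already in scope):

```python
# Hypothetical check: the preprocess step must publish exactly the outputs
# that the train and evaluate steps consume by name further down.
step, outputs = data_preprocess_step(data_ingestion_outputs['raw_data_dir'], cpu_compute_target)
assert set(outputs) == {'train_dir', 'valid_dir', 'test_dir'}
```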
diff --git a/modules/train/.ipynb_checkpoints/train-checkpoint.py b/modules/train/.ipynb_checkpoints/train-checkpoint.py
deleted file mode 100644
index 2536b21..0000000
--- a/modules/train/.ipynb_checkpoints/train-checkpoint.py
+++ /dev/null
@@ -1,11 +0,0 @@
- # Define arguments
-parser = argparse.ArgumentParser(description='Training arg parser')
-parser.add_argument('--train_dir', type=str, help='Directory where training data is stored')
-parser.add_argument('--valid_dir', type=str, help='Directory where validation data is stored')
-parser.add_argument('--output_dir', type=str, help='Directory to output the model to')
-args = parser.parse_args()
-
-# Get arguments from parser
-train_dir = args.train_dir
-valid_dir = args.valid_dir
-output_dir = args.output_dir
diff --git a/modules/train/.ipynb_checkpoints/train_step-checkpoint.py b/modules/train/.ipynb_checkpoints/train_step-checkpoint.py
deleted file mode 100644
index e92d2f0..0000000
--- a/modules/train/.ipynb_checkpoints/train_step-checkpoint.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import os
-from azureml.pipeline.steps import PythonScriptStep
-from azureml.core.runconfig import RunConfiguration
-from azureml.core.conda_dependencies import CondaDependencies
-from azureml.pipeline.core import PipelineData
-from azureml.pipeline.core import PipelineParameter
-from azureml.pipeline.steps import EstimatorStep
-from azureml.train.dnn import PyTorch
-
-def train_step(train_dir, valid_dir, compute_target):
-
-    num_epochs = PipelineParameter(name='num_epochs', default_value=25)
-    batch_size = PipelineParameter(name='batch_size', default_value=16)
-    learning_rate = PipelineParameter(name='learning_rate', default_value=0.001)
-    momentum = PipelineParameter(name='momentum', default_value=0.9)
-
-    model_dir = PipelineData(
-        name='model_dir',
-        pipeline_output_name='model_dir',
-        datastore=train_dir.datastore,
-        output_mode='mount',
-        is_directory=True)
-
-    outputs = [model_dir]
-
-    estimator = PyTorch(
-        source_directory=os.path.dirname(os.path.abspath(__file__)),
-        entry_script='train.py',
-        framework_version='1.3',
-        compute_target=compute_target,
-        use_gpu=True)
-
-    step = EstimatorStep(
-        estimator=estimator,
-        estimator_entry_script_arguments=[
-            '--train_dir', train_dir,
-            '--valid_dir', valid_dir,
-            '--output_dir', model_dir,
-            '--num_epochs', num_epochs,
-            '--batch_size', batch_size,
-            '--learning_rate', learning_rate,
-            '--momentum', momentum
-        ],
-        inputs=[train_dir, valid_dir],
-        compute_target=compute_target,
-        outputs=outputs,
-        allow_reuse=False)
-
-    return step, outputs
diff --git a/modules/train/__pycache__/train_step.cpython-36.pyc b/modules/train/__pycache__/train_step.cpython-36.pyc
deleted file mode 100644
index af535325addce24c617b094b77bf34bc540e7917..0000000000000000000000000000000000000000
Binary files a/modules/train/__pycache__/train_step.cpython-36.pyc and /dev/null differ
diff --git a/modules/train/__pycache__/train_step.cpython-37.pyc b/modules/train/__pycache__/train_step.cpython-37.pyc
deleted file mode 100644
index 1bc1368fdf278747c158a52c3c0bc1ad9b807ee3..0000000000000000000000000000000000000000
Binary files a/modules/train/__pycache__/train_step.cpython-37.pyc and /dev/null differ
diff --git a/modules/train/train_step.py b/modules/train/train_step.py
index e92d2f0..9f50c18 100644
--- a/modules/train/train_step.py
+++ b/modules/train/train_step.py
@@ -8,6 +8,20 @@ from azureml.pipeline.steps import EstimatorStep
 from azureml.train.dnn import PyTorch
 
 def train_step(train_dir, valid_dir, compute_target):
+    '''
+    This step will fine-tune a ResNet-18 model on our dataset using PyTorch.
+    It will use the corresponding input image directories as training and validation data.
+
+    :param train_dir: The reference to the directory containing the training data
+    :type train_dir: DataReference
+    :param valid_dir: The reference to the directory containing the validation data
+    :type valid_dir: DataReference
+    :param compute_target: The compute target to run the step on
+    :type compute_target: ComputeTarget
+
+    :return: The train step, step outputs dictionary (keys: model_dir)
+    :rtype: EstimatorStep, dict
+    '''
 
     num_epochs = PipelineParameter(name='num_epochs', default_value=25)
     batch_size = PipelineParameter(name='batch_size', default_value=16)
@@ -22,6 +36,7 @@ def train_step(train_dir, valid_dir, compute_target):
         is_directory=True)
 
     outputs = [model_dir]
+    outputs_map = { 'model_dir': model_dir }
 
     estimator = PyTorch(
         source_directory=os.path.dirname(os.path.abspath(__file__)),
@@ -46,4 +61,4 @@ def train_step(train_dir, valid_dir, compute_target):
         outputs=outputs,
         allow_reuse=False)
 
-    return step, outputs
+    return step, outputs_map
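All of the tunables above (num_epochs, batch_size, learning_rate, momentum, plus image_dim in the preprocess step) are declared as PipelineParameter objects, so they can be overridden per submission without touching the code. A sketch of how that would look, assuming an existing Experiment object named experiment and the assembled pipeline object; the override values are arbitrary examples:

```python
# Assumed SDK v1 pattern: override declared PipelineParameters at submit
# time; anything omitted falls back to its default_value.
run = experiment.submit(
    pipeline,
    pipeline_parameters={
        'image_dim': 224,        # preprocess step
        'num_epochs': 40,        # train step
        'batch_size': 32,
        'learning_rate': 0.0005,
        'momentum': 0.9,
    })
```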
diff --git a/object-recognition-pipeline.py b/object-recognition-pipeline.py
index 847671d..73c9855 100644
--- a/object-recognition-pipeline.py
+++ b/object-recognition-pipeline.py
@@ -41,16 +41,16 @@ datastore = DataReference(datastore, mode='mount')
 data_ingestion_step, data_ingestion_outputs = data_ingestion_step(datastore, cpu_compute_target)
 
 # Step 2: Data preprocessing
-data_preprocess_step, data_preprocess_outputs = data_preprocess_step(data_ingestion_outputs[0], cpu_compute_target)
+data_preprocess_step, data_preprocess_outputs = data_preprocess_step(data_ingestion_outputs['raw_data_dir'], cpu_compute_target)
 
 # Step 3: Train Model
-train_step, train_outputs = train_step(data_preprocess_outputs[0], data_preprocess_outputs[1], gpu_compute_target)
+train_step, train_outputs = train_step(data_preprocess_outputs['train_dir'], data_preprocess_outputs['valid_dir'], gpu_compute_target)
 
 # Step 4: Evaluate Model
-evaluate_step, evaluate_outputs = evaluate_step(train_outputs[0], data_preprocess_outputs[2], gpu_compute_target)
+evaluate_step, evaluate_outputs = evaluate_step(train_outputs['model_dir'], data_preprocess_outputs['test_dir'], gpu_compute_target)
 
 # Step 5: Deploy Model
-deploy_step, deploy_outputs = deploy_step(train_outputs[0], evaluate_outputs[0], data_preprocess_outputs[2], cpu_compute_target)
+deploy_step, deploy_outputs = deploy_step(train_outputs['model_dir'], evaluate_outputs['accuracy_file'], data_preprocess_outputs['test_dir'], cpu_compute_target)
 
 # Submit pipeline
 print('Submitting pipeline ...')
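The pipeline script is shown here only up to the submission message; for orientation, the wiring typically finishes along these lines. A sketch only, assuming a Workspace object ws and an experiment name chosen for illustration (not part of this diff):

```python
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Assemble the five step objects built above and submit them as one pipeline.
pipeline = Pipeline(workspace=ws, steps=[data_ingestion_step, data_preprocess_step,
                                         train_step, evaluate_step, deploy_step])
run = Experiment(ws, 'object-recognition-pipeline').submit(pipeline)
run.wait_for_completion(show_output=True)
```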