This commit is contained in:
Alejandra Angulo Rico 2021-09-28 13:45:43 -05:00
Parent ea6624902d
Commit 3632d72d61
4 changed files: 668 additions and 0 deletions

27  model/CNNModel.py  Normal file

@@ -0,0 +1,27 @@
import torch
import torch.nn as nn
import torchvision


class CNNModel(nn.Module):
    """DenseNet-121 backbone with a sigmoid multi-label classification head."""

    def __init__(self, classCount, isTrained):
        super(CNNModel, self).__init__()
        # Load DenseNet-121, optionally with ImageNet pre-trained weights.
        self.cnnmodel = torchvision.models.densenet121(pretrained=isTrained)
        # Replace the classifier with a linear layer sized for our labels;
        # Sigmoid gives independent per-label probabilities for multi-label output.
        kernelCount = self.cnnmodel.classifier.in_features
        self.cnnmodel.classifier = nn.Sequential(nn.Linear(kernelCount, classCount), nn.Sigmoid())

    def forward(self, x):
        x = self.cnnmodel(x)
        return x
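
A minimal usage sketch for this wrapper (batch shape and class count are illustrative, not taken from the commit; pretrained weights are downloaded on first use):

    import torch
    from CNNModel import CNNModel

    model = CNNModel(classCount=14, isTrained=True)  # 14 ChestX-ray14 labels
    dummy = torch.randn(2, 3, 224, 224)              # batch of 2 RGB 224x224 images
    probs = model(dummy)                             # per-label probabilities in [0, 1]
    print(probs.shape)                               # torch.Size([2, 14])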

116  model/DatasetGenerator.py  Normal file

@@ -0,0 +1,116 @@
import os
import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset
# Alias the AzureML Dataset so it does not shadow torch's Dataset base class.
from azureml.core import Workspace, Datastore, Dataset as AmlDataset
from azureml.core.authentication import InteractiveLoginAuthentication
class DatasetGenerator(Dataset):

    def __init__(self, pathImageDirectory, pathDatasetFile, listImages, labelList, transform, csvFilePath):
        self.listImagePaths = []
        self.listImageLabels = []
        self.transform = transform
        # Build the one-hot label vectors for the given image file names.
        labelList = self.createLists(listImages)
        # Resolve each file name to a full path; the images are assumed to live
        # under the IMAGES_SUBFOLDER directory of the mount (see the env settings in Main.py).
        for i, image in enumerate(listImages):
            listImages[i] = pathImageDirectory + "/" + os.environ['IMAGES_SUBFOLDER'] + "/" + image
            self.listImagePaths.append(listImages[i])
        for label in labelList:
            self.listImageLabels.append(label)
    def __getitem__(self, idx):
        image_index = self.listImagePaths[idx]
        img = Image.open(image_index).convert('RGB')
        imageLabel = torch.FloatTensor(self.listImageLabels[idx])
        imageData = self.transform(img) if self.transform is not None else img
        return imageData, imageLabel

    def __len__(self):
        return len(self.listImagePaths)
    def createLists(self, images):  # expects image file names, not full paths
        #-------------------- SETTINGS: AML WORKSPACE AND DATASTORE
        interactive_auth = InteractiveLoginAuthentication(tenant_id=os.environ['TENANT_ID'])
        ws = Workspace(
            subscription_id=os.environ['SUBSCRIPTION_ID'],
            resource_group=os.environ["RESOURCE_GROUP"],
            workspace_name=os.environ['WORKSPACE_NAME'],
            auth=interactive_auth
        )
        datastore = Datastore.get(ws, datastore_name=os.environ['DATASTORE_NAME'])
        #-------------------- SETTINGS: MOUNTING THE DATASET TO MAKE IT AVAILABLE
        chestist_data = AmlDataset.get_by_name(ws, os.environ['DATASET_NAME_CSV'])
        mountPoint = chestist_data.mount()
        mountPoint.start()
        mountFolder = mountPoint.mount_point
        patientDataFiltered = pd.read_csv(f"{mountFolder}/Data_Entry_2017_v2020 (1).csv", header=0)
        patientDataFiltered = patientDataFiltered.dropna()
        # Keep only the rows whose image name appears in the requested list.
        patientDataFiltered = patientDataFiltered[patientDataFiltered.isin(images).iloc[:, 0]]
        images = patientDataFiltered['Image Index'].tolist()
        patientDataFiltered['Finding Labels'] = patientDataFiltered['Finding Labels'].replace('No Finding', '')
        all_labels = ['Emphysema', 'Hernia', 'Pneumonia', 'Edema', 'Fibrosis', 'Pleural_Thickening', 'Mass', 'Atelectasis', 'Nodule', 'Effusion', 'Infiltration', 'Pneumothorax', 'Consolidation', 'Cardiomegaly']
        # Perform one-hot encoding based on the diseases listed in 'Finding Labels'.
        for c_label in all_labels:
            patientDataFiltered[c_label] = patientDataFiltered['Finding Labels'].map(lambda finding: 1.0 if c_label in finding else 0.0)
        labelPatients = []
        for index, rows in patientDataFiltered.iterrows():
            labels = [int(rows[label]) if label in rows else 0 for label in all_labels]
            labelPatients.append(labels)
        return labelPatients
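
A hedged sketch of how this generator is typically wired into a DataLoader. The transform, mount path, and image name below are illustrative, and the constructor reaches out to AzureML, so TENANT_ID and the other env vars from Main.py must already be set:

    import torchvision.transforms as transforms
    from torch.utils.data import DataLoader
    from DatasetGenerator import DatasetGenerator

    transform = transforms.Compose([transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()])
    dataset = DatasetGenerator(pathImageDirectory="/mnt/chestist",      # hypothetical mount point
                               pathDatasetFile=None,
                               listImages=["00000001_000.png"],         # illustrative file name
                               labelList=[],
                               transform=transform,
                               csvFilePath="")
    loader = DataLoader(dataset, batch_size=16, shuffle=True)
    for imageData, imageLabel in loader:
        pass  # imageData: (batch, 3, 224, 224), imageLabel: (batch, 14)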

201  model/Main.py  Normal file

@@ -0,0 +1,201 @@
import os
import random
import time
import pandas as pd
from azureml.core import Workspace, Datastore, Dataset
from azureml.core.authentication import InteractiveLoginAuthentication
from TrainerTester import TrainerTester

# Required environment variables:
# %env TENANT_ID=
# %env SUBSCRIPTION_ID=
# %env RESOURCE_GROUP=
# %env WORKSPACE_NAME=
# %env DATASTORE_NAME=
# %env DATASET_NAME=
# %env DATASET_NAME_CSV=
# %env IMAGES_SUBFOLDER=
def main():
    runTrain()


def runTrain():
    CNNMODEL = 'CNNModel'
    timestampTime = time.strftime("%H%M%S")
    timestampDate = time.strftime("%d%m%Y")
    timestampLaunch = timestampDate + '-' + timestampTime
    #-------------------- SETTINGS: AML WORKSPACE AND DATASTORE
    interactive_auth = InteractiveLoginAuthentication(tenant_id=os.environ['TENANT_ID'])
    ws = Workspace(
        subscription_id=os.environ['SUBSCRIPTION_ID'],
        resource_group=os.environ["RESOURCE_GROUP"],
        workspace_name=os.environ['WORKSPACE_NAME'],
        auth=interactive_auth
    )
    datastore = Datastore.get(ws, datastore_name=os.environ['DATASTORE_NAME'])
    #-------------------- SETTINGS: MOUNTING THE DATASETS TO MAKE THEM AVAILABLE
    chestist_data = Dataset.get_by_name(ws, os.environ['DATASET_NAME'])
    mountPoint = chestist_data.mount()
    mountPoint.start()
    pathDirData = mountPoint.mount_point
    # Mount the CSV dataset separately so the labels file can be read.
    chestist_data_csv = Dataset.get_by_name(ws, os.environ['DATASET_NAME_CSV'])
    mountPointCsv = chestist_data_csv.mount()
    mountPointCsv.start()
    csv = mountPointCsv.mount_point
    #-------------------- SPLIT DATA
    patient_data = pd.read_csv(f"{csv}/Data_Entry_2017_v2020 (1).csv", header=0)
    patient_data = patient_data.dropna()
    lenDataset = int(patient_data.shape[0])
    indexDataset = list(patient_data.index)
    # Tiny fractions for the first iteration: 0.02% train, 0.01% dev, 0.01% val.
    trainPercentage = (0.02 * lenDataset) / 100
    devPercentage = (0.01 * lenDataset) / 100
    valPercentage = (0.01 * lenDataset) / 100
    # Sample each split from the remaining indices so the splits stay disjoint.
    train = random.sample(indexDataset, int(trainPercentage))
    indexDataset = [i for i in indexDataset if i not in train]
    dev = random.sample(indexDataset, int(devPercentage))
    indexDataset = [i for i in indexDataset if i not in dev]
    val = random.sample(indexDataset, int(valPercentage))
    indexDataset = [i for i in indexDataset if i not in val]
    # Use .loc because the sampled values are index labels, not positions.
    train_dataframe = patient_data.loc[train, :]
    dev_dataframe = patient_data.loc[dev, :]
    val_dataframe = patient_data.loc[val, :]
    trainListImages = train_dataframe['Image Index'].tolist()
    listImagePathsTrain = []
    for i, image in enumerate(trainListImages):
        trainListImages[i] = pathDirData + "/" + image
        listImagePathsTrain.append(trainListImages[i])
    valListImages = val_dataframe['Image Index'].tolist()
    listImagePathsVal = []
    for i, image in enumerate(valListImages):
        valListImages[i] = pathDirData + "/" + image
        listImagePathsVal.append(valListImages[i])
    devListImages = dev_dataframe['Image Index'].tolist()
    listImagesPathDev = []
    for i, image in enumerate(devListImages):
        devListImages[i] = pathDirData + "/" + image
        listImagesPathDev.append(devListImages[i])
    #---- Neural network parameters: architecture, ImageNet pre-training, number of classes
    nnArchitecture = CNNMODEL
    nnIsTrained = True
    nnClassCount = 14  # 14 different labels to be detected
    #---- Training settings: batch size, maximum number of epochs
    trBatchSize = 16  # Might change! This is for the first iteration
    trMaxEpoch = 1  # Same here, this could change
    #---- Image transform parameters: size of the down-scaled image, cropped image
    imgtransResize = 256
    imgtransCrop = 224
    # Path to save the trained model
    pathModel = 'm-' + timestampLaunch + '.pth.tar'
    print('Architecture selected to train =', nnArchitecture)
    TrainerTester.trainer(pathDirData, listImagePathsTrain, listImagePathsVal, listImagesPathDev, nnArchitecture, nnIsTrained, nnClassCount, trBatchSize, trMaxEpoch, imgtransResize, imgtransCrop, timestampLaunch, None)
    print('Testing the trained model...')
    TrainerTester.tester(pathDirData, listImagesPathDev, pathModel, nnArchitecture, nnClassCount, nnIsTrained, trBatchSize, imgtransResize, imgtransCrop, timestampLaunch)
def runTest():
    timestampTime = time.strftime("%H%M%S")
    timestampDate = time.strftime("%d%m%Y")
    timestampLaunch = timestampDate + '-' + timestampTime
    #-------------------- SETTINGS: AML WORKSPACE AND DATASTORE
    interactive_auth = InteractiveLoginAuthentication(tenant_id=os.environ['TENANT_ID'])
    ws = Workspace(
        subscription_id=os.environ['SUBSCRIPTION_ID'],
        resource_group=os.environ["RESOURCE_GROUP"],
        workspace_name=os.environ['WORKSPACE_NAME'],
        auth=interactive_auth
    )
    datastore = Datastore.get(ws, datastore_name=os.environ['DATASTORE_NAME'])
    #-------------------- SETTINGS: MOUNTING THE DATASET TO MAKE IT AVAILABLE
    chestist_data = Dataset.get_by_name(ws, os.environ['DATASET_NAME'])
    mountPoint = chestist_data.mount()
    mountPoint.start()
    pathDirData = mountPoint.mount_point
    pathFileTest = []  # Paths of the images to test; needs to be filled in
    nnArchitecture = 'CNNModel'
    nnIsTrained = True
    nnClassCount = 14
    trBatchSize = 16
    imgtransResize = 256
    imgtransCrop = 224
    pathModel = 'm-' + timestampLaunch + '.pth.tar'  # path of the model to test, needs to change
    TrainerTester.tester(pathDirData, pathFileTest, pathModel, nnArchitecture, nnClassCount, nnIsTrained, trBatchSize, imgtransResize, imgtransCrop, timestampLaunch)


if __name__ == '__main__':
    main()
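
A sketch of a hypothetical launcher for this entry point; the env var values are placeholders, not taken from the commit, and the IMAGES_SUBFOLDER value is an assumption about the dataset layout:

    import os

    os.environ.setdefault('TENANT_ID', '<your-tenant-id>')           # placeholders
    os.environ.setdefault('SUBSCRIPTION_ID', '<your-subscription>')
    os.environ.setdefault('RESOURCE_GROUP', '<your-resource-group>')
    os.environ.setdefault('WORKSPACE_NAME', '<your-workspace>')
    os.environ.setdefault('DATASTORE_NAME', '<your-datastore>')
    os.environ.setdefault('DATASET_NAME', '<images-dataset>')
    os.environ.setdefault('DATASET_NAME_CSV', '<labels-csv-dataset>')
    os.environ.setdefault('IMAGES_SUBFOLDER', 'images')              # assumed subfolder name

    import Main
    Main.main()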

324  model/TrainerTester.py  Normal file

@@ -0,0 +1,324 @@
import os
import re
import time
import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import roc_auc_score
from azureml.core import Workspace, Datastore, Dataset
from azureml.core.authentication import InteractiveLoginAuthentication
from CNNModel import CNNModel
from DatasetGenerator import DatasetGenerator
class TrainerTester:

    def trainer(pathDirData, pathFileTrain, pathFileVal, pathFileTest, nnArchitecture, nnIsTrained, nnClassCount, trBatchSize, trMaxEpoch, transResize, transCrop, launchTimestamp, checkpoint):
        #-------------------- SETTINGS: NETWORK ARCHITECTURE
        if nnArchitecture == 'CNNModel':
            model = CNNModel(nnClassCount, nnIsTrained).cuda()
        model = torch.nn.DataParallel(model).cuda()
        #-------------------- SETTINGS: AML WORKSPACE AND DATASTORE
        interactive_auth = InteractiveLoginAuthentication(tenant_id=os.environ['TENANT_ID'])
        ws = Workspace(
            subscription_id=os.environ['SUBSCRIPTION_ID'],
            resource_group=os.environ["RESOURCE_GROUP"],
            workspace_name=os.environ['WORKSPACE_NAME'],
            auth=interactive_auth
        )
        datastore = Datastore.get(ws, datastore_name=os.environ['DATASTORE_NAME'])
        #-------------------- SETTINGS: MOUNTING THE DATASET TO MAKE IT AVAILABLE
        chestist_data = Dataset.get_by_name(ws, os.environ['DATASET_NAME_CSV'])
        mountPoint = chestist_data.mount()
        mountPoint.start()
        csvFilePath = mountPoint.mount_point  # path for the csv file with the labels
        #-------------------- SETTINGS: DATA TRANSFORMS (IMAGES SETTINGS)
        # Using the mean and std of ImageNet is common practice; they were computed over
        # millions of images. A dataset-specific mean and std could be computed instead.
        normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        transformList = []
        transformList.append(transforms.RandomResizedCrop(transCrop))  # randomizes scale as well as position
        transformList.append(transforms.RandomHorizontalFlip())        # random flip for augmentation
        transformList.append(transforms.ToTensor())                    # converts the PIL image to a tensor
        transformList.append(normalize)
        transformSequence = transforms.Compose(transformList)
        # Lists of image file names for train, validation, and test
        listImagesTrain = [os.path.basename(p) for p in pathFileTrain]
        listImagesVal = [os.path.basename(p) for p in pathFileVal]
        listImagesTest = [os.path.basename(p) for p in pathFileTest]
        labelList = []
        #-------------------- DATASET BUILDERS
        datasetTrain = DatasetGenerator(pathImageDirectory=pathDirData, pathDatasetFile=pathFileTrain, listImages=listImagesTrain, labelList=labelList, transform=transformSequence, csvFilePath=csvFilePath)
        datasetVal = DatasetGenerator(pathImageDirectory=pathDirData, pathDatasetFile=pathFileVal, listImages=listImagesVal, labelList=labelList, transform=transformSequence, csvFilePath=csvFilePath)
        dataLoaderTrain = DataLoader(dataset=datasetTrain, batch_size=trBatchSize, shuffle=True, num_workers=24, pin_memory=True)
        dataLoaderVal = DataLoader(dataset=datasetVal, batch_size=trBatchSize, shuffle=False, num_workers=24, pin_memory=True)
        #-------------------- SETTINGS: OPTIMIZER & SCHEDULER
        optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-5)
        scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=5, mode='min')
        #-------------------- SETTINGS: LOSS
        loss = torch.nn.BCELoss()  # mean reduction; matches the Sigmoid head of CNNModel
        #---- Load checkpoint
        if checkpoint is not None:
            modelCheckpoint = torch.load(checkpoint)
            model.load_state_dict(modelCheckpoint['state_dict'], strict=False)
            optimizer.load_state_dict(modelCheckpoint['optimizer'])
        #---- TRAIN THE NETWORK
        lossMIN = float('inf')
        for epochID in range(0, trMaxEpoch):
            TrainerTester.epochTrain(model, dataLoaderTrain, optimizer, scheduler, trMaxEpoch, nnClassCount, loss)
            lossVal, losstensor = TrainerTester.epochVal(model, dataLoaderVal, optimizer, scheduler, trMaxEpoch, nnClassCount, loss)
            timestampTime = time.strftime("%H%M%S")
            timestampDate = time.strftime("%d%m%Y")
            timestampEND = timestampDate + '-' + timestampTime
            scheduler.step(losstensor.item())
            # Save a checkpoint whenever the validation loss improves.
            if lossVal < lossMIN:
                lossMIN = lossVal
                torch.save({'epoch': epochID + 1, 'state_dict': model.state_dict(), 'best_loss': lossMIN, 'optimizer': optimizer.state_dict()}, 'm-' + launchTimestamp + '.pth.tar')
                print('Epoch [' + str(epochID + 1) + '] [save] [' + timestampEND + '] loss= ' + str(lossVal))
            else:
                print('Epoch [' + str(epochID + 1) + '] [----] [' + timestampEND + '] loss= ' + str(lossVal))
    #--------------------------------------------------------------------------------
    def epochTrain(model, dataLoader, optimizer, scheduler, epochMax, classCount, loss):
        model.train()
        for batchID, (varInput, target) in enumerate(dataLoader):
            varTarget = target.cuda(non_blocking=True)
            varOutput = model(varInput)
            lossvalue = loss(varOutput, varTarget)
            optimizer.zero_grad()
            lossvalue.backward()
            optimizer.step()
    #--------------------------------------------------------------------------------
    def epochVal(model, dataLoader, optimizer, scheduler, epochMax, classCount, loss):
        with torch.no_grad():
            model.eval()
            lossVal = 0
            lossValNorm = 0
            losstensorMean = 0
            for i, (varInput, target) in enumerate(dataLoader):
                varTarget = target.cuda(non_blocking=True)
                varOutput = model(varInput)
                losstensor = loss(varOutput, varTarget)
                losstensorMean += losstensor
                lossVal += losstensor.item()
                lossValNorm += 1
            outLoss = lossVal / lossValNorm
            losstensorMean = losstensorMean / lossValNorm
            return outLoss, losstensorMean
    def computeAUROC(dataGT, dataPRED, classCount):
        outAUROC = []
        datanpGT = dataGT.cpu().numpy()
        datanpPRED = dataPRED.cpu().numpy()
        for i in range(classCount):
            try:
                outAUROC.append(roc_auc_score(datanpGT[:, i], datanpPRED[:, i]))
            except ValueError:
                # roc_auc_score raises when a class has only one label value in the batch.
                pass
        return outAUROC
    #--------------------------------------------------------------------------------
    def tester(pathDirData, pathFileTest, pathModel, nnArchitecture, nnClassCount, nnIsTrained, trBatchSize, transResize, transCrop, launchTimeStamp):
        # Must match the label order built in DatasetGenerator.createLists.
        CLASS_NAMES = ['Emphysema', 'Hernia', 'Pneumonia', 'Edema', 'Fibrosis', 'Pleural_Thickening', 'Mass', 'Atelectasis', 'Nodule', 'Effusion', 'Infiltration', 'Pneumothorax', 'Consolidation', 'Cardiomegaly']
        cudnn.benchmark = True
        if nnArchitecture == 'CNNModel':
            model = CNNModel(nnClassCount, nnIsTrained).cuda()
        model = torch.nn.DataParallel(model).cuda()
        checkpoint = torch.load(pathModel)
        state_dict = checkpoint['state_dict']
        remove_data_parallel = False  # Change if you don't want to use nn.DataParallel(model)
        # Older DenseNet checkpoints use keys like 'denselayer1.norm.1.weight' that
        # newer torchvision expects as 'denselayer1.norm1.weight'; rewrite them.
        pattern = re.compile(
            r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
        for key in list(state_dict.keys()):
            match = pattern.match(key)
            new_key = match.group(1) + match.group(2) if match else key
            new_key = new_key[7:] if remove_data_parallel else new_key  # strip the 'module.' prefix
            state_dict[new_key] = state_dict[key]
            # Delete the old key only if it was modified.
            if match or remove_data_parallel:
                del state_dict[key]
        model.load_state_dict(checkpoint['state_dict'], strict=False)
        #-------------------- SETTINGS: DATA TRANSFORMS, TEN CROPS
        normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        transformList = []
        transformList.append(transforms.Resize(transResize))
        transformList.append(transforms.TenCrop(transCrop))  # 4 corners + center, plus their flips
        transformList.append(transforms.Lambda(lambda crops: torch.stack([transforms.ToTensor()(crop) for crop in crops])))
        transformList.append(transforms.Lambda(lambda crops: torch.stack([normalize(crop) for crop in crops])))
        transformSequence = transforms.Compose(transformList)
        #-------------------- SETTINGS: AML WORKSPACE AND DATASTORE
        interactive_auth = InteractiveLoginAuthentication(tenant_id=os.environ['TENANT_ID'])
        ws = Workspace(
            subscription_id=os.environ['SUBSCRIPTION_ID'],
            resource_group=os.environ["RESOURCE_GROUP"],
            workspace_name=os.environ['WORKSPACE_NAME'],
            auth=interactive_auth
        )
        datastore = Datastore.get(ws, datastore_name=os.environ['DATASTORE_NAME'])
        #-------------------- SETTINGS: MOUNTING THE DATASET TO MAKE IT AVAILABLE
        chestist_data = Dataset.get_by_name(ws, os.environ['DATASET_NAME'])
        mountPoint = chestist_data.mount()
        mountPoint.start()
        pathDirData = mountPoint.mount_point
        #-------------------- SETTINGS: DATASET BUILDERS
        listImagesTest = [os.path.basename(p) for p in pathFileTest]
        csvFilePath = ""
        labelList = []
        datasetTest = DatasetGenerator(pathImageDirectory=pathDirData, pathDatasetFile=pathFileTest, listImages=listImagesTest, labelList=labelList, transform=transformSequence, csvFilePath=csvFilePath)
        dataLoaderTest = DataLoader(dataset=datasetTest, batch_size=trBatchSize, num_workers=8, shuffle=False, pin_memory=True)
print("HEY2")
with torch.no_grad():
print(list(datasetTest))
outGT = torch.FloatTensor().cuda()
outPRED = torch.FloatTensor().cuda()
model.eval()
for i, (input, target) in enumerate(dataLoaderTest):
target = target.cuda()
outGT = torch.cat((outGT, target), 0)
bs, n_crops, c, h, w = input.size()
varInput = torch.autograd.Variable(input.view(-1, c, h, w).cuda(), volatile=True)
out = model(varInput)
outMean = out.view(bs, n_crops, -1).mean(1)
outPRED = torch.cat((outPRED, outMean.data), 0)
aurocIndividual = TrainerTester.computeAUROC(outGT, outPRED, nnClassCount)
aurocMean = np.array(aurocIndividual).mean()
#del dataLoaderTest
print ('AUROC mean ', aurocMean)
for i in range (0, len(aurocIndividual)):
print (CLASS_NAMES[i], ' ', aurocIndividual[i])
return
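
For reference, a small self-contained sketch of the ten-crop batching trick used in tester above. The model here is a dummy stand-in, not the DenseNet from this commit, and everything runs on CPU with random tensors:

    import torch
    import torch.nn as nn

    # Dummy classifier: maps a (N, 3, 224, 224) batch to (N, 14) sigmoid scores.
    model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 224 * 224, 14), nn.Sigmoid())

    bs, n_crops, c, h, w = 2, 10, 3, 224, 224
    crops = torch.randn(bs, n_crops, c, h, w)     # what TenCrop + stacking produces per batch
    out = model(crops.view(-1, c, h, w))          # run all bs * n_crops crops at once
    outMean = out.view(bs, n_crops, -1).mean(1)   # average the crop predictions per image
    print(outMean.shape)                          # torch.Size([2, 14])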