Added gDrive_download.py
Parent
bc9c5010d1
Commit
59a4511d34
@@ -0,0 +1,420 @@
#
# gDrive_download.py
#
# Recursively enumerates a google drive directory, optionally downloading all the
# files in that directory. These are two separate steps; files are enumerated
# and written to file before anything is downloaded. If you only want to collect
# file names and gDrive GUIDs, you don't need to download anything.
#
# Uses the PyDrive library to talk to google drive, and assumes you've created a
# .json file with your secret key to access the drive, following this tutorial
# verbatim:
#
# https://gsuitedevs.github.io/PyDrive/docs/build/html/quickstart.html#authentication
#
# It can take a few tries to run this on large data sets (in particular to
# retry failed downloads a semi-arbitrary number of times), so this isn't
# entirely meant to be run from scratch; I'd say I ran this semi-interactively.
#
# Note that gDrive caps free access at 1000 queries / 100 seconds / user =
# 10 queries / second. You may get slightly faster access than that in
# practice, but not much.
#
# dan@microsoft.com
#

#%% Imports

import time
import datetime
import json
import os
import csv
import glob

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from multiprocessing.pool import ThreadPool

import humanfriendly

#%% Configuration and constants

# Should we actually download images, or just enumerate them?
downloadImages = True

# Enumeration mode: one of 'all', 'errors', 'ifnecessary'.
#
# Set to 'errors' when you've already downloaded most of the files and are just
# re-trying failures.
enumerationMode = 'ifnecessary'

# The GUID for the top-level folder
parentID = ''

# client_secrets.json lives here
clientSecretsPath = r'd:\git\ai4edev\dan\danMisc'

# Limits the number of files we enumerate (for debugging). Set to -1 to
# enumerate all files.
maxFiles = -1

# This can be empty if we're not downloading images
imageOutputDir = r'f:\video'

# When duplicate folders exist, should we merge them? The alternative is
# renaming the second instance of "blah" to "blah (1)". My experience has been
# that the gDrive sync behavior varies with OS: on Windows, renaming occurs; on
# MacOS, folders are merged.
bMergeDuplicateFolders = True

#%% Derived constants

# Change to the path where the client secrets file lives, to simplify auth
os.chdir(clientSecretsPath)

# Create a datestamped filename to which we'll write all the metadata we
# retrieve when we crawl the gDrive.
metadataOutputDir = os.path.join(imageOutputDir,'metadata_cache')
os.makedirs(metadataOutputDir,exist_ok=True)

metadataFileBase = os.path.join(metadataOutputDir,'imageMetadata.json')
dateStamp = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
name, ext = os.path.splitext(metadataFileBase)
metadataFile = "{}.{}{}".format(name,dateStamp,ext)

# List of files we need to download, just filename and GUID. This .csv
# file is written by the enumeration step.
downloadListFileBase = os.path.join(metadataOutputDir,'downloadList.csv')
name, ext = os.path.splitext(downloadListFileBase)
downloadListFile = "{}.{}{}".format(name,dateStamp,ext)

# List of enumeration errors
errorListFileBase = os.path.join(metadataOutputDir,'enumerationErrors.csv')
name, ext = os.path.splitext(errorListFileBase)
errorListFile = "{}.{}{}".format(name,dateStamp,ext)

# If we are running in 'errors' mode, this is the list of directories we want
# to re-try
errorListFileResume = os.path.join(metadataOutputDir,'enumerationErrors.csv')

assert (not downloadImages) or (len(imageOutputDir) > 0), \
    "Can't have an empty output dir if you're downloading images"

# Only applies to downloading; enumeration is not currently multi-threaded
nThreads = 10

#%% Authenticate

gauth = GoogleAuth()
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)

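#%% Credential caching (optional)

# A minimal sketch of caching credentials so that re-running the script doesn't
# pop a browser auth window every time. LoadCredentialsFile/SaveCredentialsFile
# are standard PyDrive calls; the cache filename here is an assumption, not
# something the original workflow used.
if False:

    gauth = GoogleAuth()
    gauth.LoadCredentialsFile('gdrive_credentials_cache.txt')
    if gauth.credentials is None:
        # No cached credentials; do the browser-based flow once
        gauth.LocalWebserverAuth()
    elif gauth.access_token_expired:
        gauth.Refresh()
    else:
        gauth.Authorize()
    gauth.SaveCredentialsFile('gdrive_credentials_cache.txt')
    drive = GoogleDrive(gauth)
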
#%% Enumerate files for download (functions)

class DataEnumerator:
    """Accumulates enumeration state across recursive calls."""

    def __init__(self):

        self.nFiles = 0
        self.nFolders = 0
        self.errors = []
        self.fileInfo = []
        self.downloadList = []


def PrepareFolderDownload(folderID,folderTargetDir,dataEnumerator=None):
    '''
    Enumerate files and directories in a single folder, specified by the GUID
    folderID. Will be called once for every folder we encounter; recurses into
    child folders.
    '''
    if dataEnumerator is None:
        dataEnumerator = DataEnumerator()

    try:
        fileList = drive.ListFile({'q': "'%s' in parents and trashed=false" % folderID}).GetList()
    except Exception as ex:
        errorString = str(ex)
        print("Error listing directory {}:{}:{}".format(folderTargetDir,folderID,errorString))
        dataEnumerator.errors.append( ['folder',folderTargetDir,folderID,errorString] )
        return dataEnumerator

    titles = set()

    # Handle redundant file/folder names
    for f in fileList:

        title = f['title']

        if title in titles:

            if bMergeDuplicateFolders:
                print("Warning: folder conflict at {}/{}".format(folderTargetDir,title))
            else:
                # Rename folders and files the way the gDrive sync app does,
                # i.e. if there are two files called "Blah", we want "Blah"
                # and "Blah (1)".
                nRenames = 0
                newTitle = title
                while newTitle in titles:
                    nRenames += 1
                    newTitle = "{} ({})".format(title,nRenames)
                print("Renaming {} to {} in [{}]".format(title,newTitle,folderTargetDir))
                title = newTitle
                f['title'] = title

        titles.add(title)

    # ...for every file in our list (handling redundant names)

    # Enumerate and process files in this folder
    for f in fileList:

        if maxFiles > 0 and dataEnumerator.nFiles > maxFiles:
            return dataEnumerator

        dataEnumerator.fileInfo.append(f)

        title = f['title']

        if f['mimeType'] == 'application/vnd.google-apps.folder':

            # This is a folder
            dataEnumerator.nFolders += 1

            # Create the target directory if necessary
            outputDir = os.path.join(folderTargetDir,title)
            f['target'] = outputDir
            if downloadImages:
                os.makedirs(outputDir,exist_ok=True)

            print("Enumerating folder {} to {}".format(title,outputDir))

            # Recurse
            dataEnumerator = PrepareFolderDownload(f['id'],outputDir,dataEnumerator)

        else:

            # This is a file
            dataEnumerator.nFiles += 1

            targetFile = os.path.join(folderTargetDir,title)
            f['target'] = targetFile
            print("Queuing file {} for download to {}".format(title,targetFile))
            dataEnumerator.downloadList.append( [targetFile,f['id']] )

    # ...for each file in this folder

    return dataEnumerator

# ...def PrepareFolderDownload

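#%% Rate-limit-aware listing (optional)

# gDrive caps free access at roughly 10 queries/second, so ListFile calls can
# fail transiently when enumeration runs hot. A minimal retry-with-backoff
# sketch; the retry count and delays are assumptions, not tuned values.
if False:

    def ListFolderWithRetry(folderID,nRetries=5):

        delay = 1.0
        for iRetry in range(nRetries):
            try:
                return drive.ListFile(
                    {'q': "'%s' in parents and trashed=false" % folderID}).GetList()
            except Exception as ex:
                print('Listing failed ({}), sleeping {}s'.format(str(ex),delay))
                time.sleep(delay)
                delay *= 2
        raise RuntimeError('Failed to list folder {}'.format(folderID))
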
#%% Enumerate files for download (execution)

startTime = time.time()

# In 'ifnecessary' mode, re-use the most recent previously-written download
# list, if one exists. The datestamp format sorts lexicographically, so the
# last file in sorted order is the newest.
previousDownloadLists = []
if enumerationMode == 'ifnecessary':
    previousDownloadLists = sorted(glob.glob(
        os.path.join(metadataOutputDir,'downloadList.*.csv')))

if (enumerationMode == 'ifnecessary') and (len(previousDownloadLists) > 0):

    downloadListFile = previousDownloadLists[-1]
    downloadList = []
    with open(downloadListFile) as csvfile:
        r = csv.reader(csvfile)
        for iRow,row in enumerate(r):
            if maxFiles > 0 and iRow > maxFiles:
                break
            downloadList.append(row)

    print("Read {} downloads from {}".format(len(downloadList),downloadListFile))

else:

    dataEnumerator = None

    if enumerationMode == 'errors':

        splitLines = []

        assert os.path.isfile(errorListFileResume)

        # Read the error file; lines look like:
        #
        # ['folder',folderTargetDir,folderID,errorString]
        with open(errorListFileResume) as f:
            rows = csv.reader(f)
            for iRow,row in enumerate(rows):
                splitLines.append(row)

        for iRow,row in enumerate(splitLines):
            targetDir = row[1]
            folderID = row[2]
            errorString = row[3]
            print('Re-trying folder {} (ID {})'.format(targetDir,folderID))
            # Pass dataEnumerator through so results accumulate across retries
            dataEnumerator = PrepareFolderDownload(folderID,targetDir,dataEnumerator)

    # Either we're in 'all' mode, or we're in 'ifnecessary' mode and
    # enumeration is necessary
    else:

        print("Starting enumeration")
        startTime = time.time()
        dataEnumerator = PrepareFolderDownload(parentID,imageOutputDir)
        elapsed = time.time() - startTime
        print("Finished enumeration in {}".format(str(datetime.timedelta(seconds=elapsed))))

    print("Enumerated {} files".format(len(dataEnumerator.downloadList)))

    s = json.dumps(dataEnumerator.fileInfo)
    with open(metadataFile, "w+") as f:
        f.write(s)
    print("Finished writing metadata to {}".format(metadataFile))

    # Use csv.writer so that filenames containing commas round-trip correctly
    # through the csv.reader used above
    with open(downloadListFile,'w+',newline='') as f:
        w = csv.writer(f)
        for fileInfo in dataEnumerator.downloadList:
            w.writerow(fileInfo)
    print("Finished writing download list to {}".format(downloadListFile))

    with open(errorListFile,'w+',newline='') as f:
        w = csv.writer(f)
        for e in dataEnumerator.errors:
            w.writerow(e)
    print("Finished writing error list ({} errors) to {}".format(len(dataEnumerator.errors),errorListFile))

    elapsed = time.time() - startTime
    print("Done enumerating files in {}".format(humanfriendly.format_timespan(elapsed)))

    downloadList = dataEnumerator.downloadList

# ...if/else on enumeration modes

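#%% Reload cached metadata (optional)

# When resuming in a later session, dataEnumerator won't exist; a minimal
# sketch of reloading file metadata from the most recent cache file written
# above. 'cachedFileInfo' is a hypothetical name, used for illustration only.
if False:

    previousMetadataFiles = sorted(glob.glob(
        os.path.join(metadataOutputDir,'imageMetadata.*.json')))
    with open(previousMetadataFiles[-1]) as f:
        cachedFileInfo = json.load(f)
    print('Loaded metadata for {} files'.format(len(cachedFileInfo)))
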
#%% Compute total download size

import tqdm

sizeBytes = 0

for f in tqdm.tqdm(dataEnumerator.fileInfo):

    # Folders and google-native files (docs, sheets, etc.) have no fileSize
    if 'fileSize' in f:
        sizeBytes = sizeBytes + int(f['fileSize'])

print('Total download size is {} in {} files'.format(
    humanfriendly.format_size(sizeBytes),len(dataEnumerator.fileInfo)))

#%% Download images (functions)

import sys

def ProcessDownload(fileInfo):
    '''
    Download a single file, specified as a [targetFile,GUID] pair. Returns a
    status string.
    '''
    status = 'unknown'
    targetFile = fileInfo[0]
    if os.path.exists(targetFile):
        print("Skipping download of file {}".format(targetFile))
        status = 'skipped'
        return status

    fileID = fileInfo[1]
    try:
        f = drive.CreateFile({'id': fileID})
        title = f['title']
    except Exception:
        print("File creation error for {}".format(targetFile))
        status = 'create_error'
        return status

    print("Downloading file {} to {}".format(title,targetFile))
    try:
        f.GetContentFile(targetFile)
        status = 'success'
        return status
    except Exception:
        print("Download error for {}: {}".format(targetFile,sys.exc_info()[0]))
        status = 'download_error'
        return status


def ProcessDownloadList(downloadList):
    '''
    Download every file in downloadList on a thread pool. Returns a list of
    status strings, parallel to downloadList.
    '''
    pool = ThreadPool(nThreads)
    results = pool.map(ProcessDownload, downloadList)
    return results

#%% Download images (execution)

if downloadImages:

    print('Downloading data...')
    # results = ProcessDownloadList(downloadList[1:10])
    results = ProcessDownloadList(downloadList)
    print('...done.')
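

#%% Re-try failed downloads (optional)

# The header mentions re-trying failed downloads a semi-arbitrary number of
# times; a minimal sketch of that loop. nRetries and the sleep interval are
# assumptions. ProcessDownload skips files that already exist on disk, so
# re-running over the full list would also be safe, just slower.
if False:

    nRetries = 3
    pending = downloadList
    for iRetry in range(nRetries):
        pending = [d for d,s in zip(pending,results) if s == 'download_error']
        if len(pending) == 0:
            break
        time.sleep(100)  # stay under the 1000 queries / 100 seconds cap
        results = ProcessDownloadList(pending)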
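

#%% Verify download sizes (optional)

# A quick integrity check after downloading: compare on-disk sizes against the
# 'fileSize' field retrieved during enumeration ('target' was attached to each
# record there). Google-native files (docs, sheets) have no fileSize and are
# skipped.
if False:

    nMismatches = 0
    for f in dataEnumerator.fileInfo:
        if 'fileSize' in f and 'target' in f and os.path.isfile(f['target']):
            if os.path.getsize(f['target']) != int(f['fileSize']):
                nMismatches += 1
                print('Size mismatch for {}'.format(f['target']))
    print('Found {} size mismatches'.format(nMismatches))
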
#%% Scrap

if False:

    #%% List files in the root folder

    from pydrive.drive import GoogleDrive
    drive = GoogleDrive(gauth)
    file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
    for file1 in file_list:
        print('title: %s, id: %s' % (file1['title'], file1['id']))


    #%% List a particular directory

    from pydrive.drive import GoogleDrive
    drive = GoogleDrive(gauth)
    folder_id = 'blahblahblah'
    q = {'q': "'{}' in parents and trashed=false".format(folder_id)}
    file_list = drive.ListFile(q).GetList()

    for iFile,f in enumerate(file_list):
        print('{}: {}, id: {}'.format(iFile,f['title'],f['id']))


    #%% Recursive list

    from pydrive.drive import GoogleDrive
    drive = GoogleDrive(gauth)

    def ListFolder(parentID,fileListOut=None):

        if fileListOut is None:
            fileListOut = []

        parentList = drive.ListFile({'q': "'%s' in parents and trashed=false" % parentID}).GetList()

        for f in parentList:

            if maxFiles > 0 and len(fileListOut) > maxFiles:
                return fileListOut

            if f['mimeType'] == 'application/vnd.google-apps.folder':

                title = f['title']
                print("Enumerating folder {}".format(title))
                childFiles = ListFolder(f['id'],fileListOut)
                print("Enumerated {} files so far".format(len(childFiles)))

                # ListFolder appends into (and returns) fileListOut, so re-bind
                # rather than concatenating the list with itself
                fileListOut = childFiles

            else:
                fileListOut.append(f['title'])

        return fileListOut

    parent = 'root'
    file_list = ListFolder(parent)

    print("Enumerated {} files".format(len(file_list)))