Added gDrive_download.py

2019-03-14 16:47:33 -07:00 · 2019-03-14 16:47:33 -07:00 · 59a4511d34
--- a/gDrive_download.py
+++ b/gDrive_download.py
@ -0,0 +1,420 @@
+#
+# gDrive_download.py
+#
+# Recursively enumerates a google drive directory, optionally downloading all the 
+# files in that directory.  These are two separate steps; files are enumerated
+# and written to file before anything is downloaded.  If you only want to connect
+# file names and gDrive GUIDs, you don't need to download anything.
+# 
+# Uses the PyDrive library to talk to google drive, and assumes you've created a 
+# .json file with your secret key to access the drive, following this tutorial 
+# verbatim:
+# 
+# https://gsuitedevs.github.io/PyDrive/docs/build/html/quickstart.html#authentication
+#
+# It can take a few tries to run this on large data sets (in particular to
+# retry failed downloads a semi-arbitrary number of times), so this isn't 
+# entirely meant to be run from scratch; I'd say I ran this semi-interactvely.
+#
+# Note that gDrive caps free access at 1000 queries / 100 seconds / user = 
+# 10 queries / second.  You may get slightly faster access than that in practice, but
+# not much.
+#
+# dan@microsoft.com
+#
+
+#%% Imports
+
+import time
+import datetime
+import json
+import os
+import csv
+from pydrive.auth import GoogleAuth
+from multiprocessing.pool import ThreadPool
+from pydrive.drive import GoogleDrive
+import humanfriendly
+
+
+#%% Configuration and constants
+
+# Should we actually download images, or just enumerate images?
+downloadImages = 1
+
+# Set to 'errors' when you've already downloaded most of the files and are just 
+# re-trying failures
+#
+# 'all','errors','ifnecessary'
+enumerationMode = 'ifnecessary'
+
+# The GUID for the top-level folder
+parentID = ''
+
+# client_secrets.json lives here
+clientSecretsPath = r'd:\git\ai4edev\dan\danMisc'
+
+# Limits the number of files we enumerate (for debugging).  Set to -1 to enumerate 
+# all files.
+maxFiles = -1
+
+# This can be empty if we're not writing images
+imageOutputDir = r'f:\video'
+
+# When duplicate folders exist, should we merge them?  The alternative is
+# renaming the second instance of "blah" to "blah (1)".  My experience has been
+# that the gDrive sync behavior varies with OS; on Windows, renaming occurs, on MacOS,
+# folders are merged.
+bMergeDuplicateFolders = True
+
+
+#%% Derived constants
+
+# Change to the path where the client secrets file lives, to simplify auth
+os.chdir(clientSecretsPath) 
+
+# Create a datestamped filename to which we'll write all the metadata we
+# retrieve when we crawl the gDrive.
+metadataOutputDir = os.path.join(imageOutputDir,'metadata_cache')
+
+os.makedirs(metadataOutputDir,exist_ok=True)
+
+metadataFileBase = os.path.join(metadataOutputDir,'imageMetadata.json')
+dateStamp = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
+name, ext = os.path.splitext(metadataFileBase)
+metadataFile = "{}.{}{}".format(name,dateStamp,ext)
+
+# List of files we need to download, just filename and GUID.  This .csv
+# file is written by the enumeration step.
+downloadListFileBase = os.path.join(metadataOutputDir,'downloadList.csv')
+name, ext = os.path.splitext(downloadListFileBase)
+downloadListFile = "{}.{}{}".format(name,dateStamp,ext)
+
+# List of download errors
+errorListFileBase = os.path.join(metadataOutputDir,'enumerationErrors.csv')
+name, ext = os.path.splitext(errorListFileBase)
+errorListFile = "{}.{}{}".format(name,dateStamp,ext)
+
+# If we are running in "errors" mode, this is the list of directories we want to re-try
+errorListFileResume = os.path.join(metadataOutputDir,r"enumerationErrors.csv")
+
+assert (not downloadImages) or (not len(imageOutputDir)==0), 'Can\'t have empty output dir if you\'re downloading images'
+
+# Only applies to downloading; enumeration is not currently multi-threaded
+nThreads = 10
+    
+
+#%% Authenticate
+
+gauth = GoogleAuth()
+gauth.LocalWebserverAuth()
+drive = GoogleDrive(gauth)
+
+
+#%% Enumerate files for download (functions)
+
+class DataEnumerator:
+
+    nFiles = 0
+    nFolders = 0
+    errors = []
+    fileInfo = []
+    downloadList = []
+
+
+def PrepareFolderDownload(folderID,folderTargetDir,dataEnumerator=None):
+    '''
+    Enumerate files and directories in a single folder, specified by the GUID
+    folderID.  Will be called once for every folder we encounter.  Does not make
+    recursive calls.
+    '''
+    if dataEnumerator == None:
+
+        dataEnumerator = DataEnumerator()
+
+    try:
+
+        fileList = drive.ListFile({'q': "'%s' in parents and trashed=false" % folderID}).GetList()
+
+    except Exception as ex:
+
+        # ex = sys.exc_info()[0]
+        errorString = str(ex)
+        print("Error listing directory {}:{}:{}".format(folderTargetDir,folderID,errorString))
+        dataEnumerator.errors.append( ['folder',folderTargetDir,folderID,errorString] )
+        return dataEnumerator
+
+    titles = set()
+
+    # Handle redundant directory names
+    for f in fileList:
+        
+        title = f['title']
+        nRenames = 0
+        
+        if title in titles:            
+            nRenames = nRenames + 1
+            if bMergeDuplicateFolders:
+                print("Warning: folder conflict at {}/{}".format(folderTargetDir,title))
+            else:
+                # Try to rename folders and files the way the gDrive sync app does, i.e. if there are 
+                # two files called "Blah",  we want "Blah" and "Blah (1)".
+                newTitle = title + " ({})".format(nRenames)
+                print("Renaming {} to {} in [{}]".format(title,newTitle,folderTargetDir))
+                title = newTitle
+                f['title'] = title                
+        else:
+            titles.add(title)
+            
+    # ...for every file in our list (handling redundant directory names)
+    
+    # Enumerate and process files in this folder
+    for f in fileList:
+    
+        if maxFiles > 0 and dataEnumerator.nFiles > maxFiles:
+            return dataEnumerator
+
+        dataEnumerator.fileInfo.append(f)
+
+        title = f['title']
+            
+        if f['mimeType']=='application/vnd.google-apps.folder': # if folder
+
+            dataEnumerator.nFolders = dataEnumerator.nFolders + 1
+        
+            # Create the target directory if necessary
+            outputDir = os.path.join(folderTargetDir,title)
+            f['target'] = outputDir
+            if downloadImages:
+                if not os.path.exists(outputDir):
+                    os.mkdir(outputDir)
+
+            print("Enumerating folder {} to {}".format(title,outputDir))        
+
+            # Recurse
+            dataEnumerator = PrepareFolderDownload(f['id'],outputDir,dataEnumerator)            
+    
+        else:            
+
+            dataEnumerator.nFiles = dataEnumerator.nFiles + 1
+        
+            targetFile = os.path.join(folderTargetDir,title)            
+            f['target'] = targetFile
+            print("Downloading file {} to {}".format(title,targetFile))        
+            dataEnumerator.downloadList.append( [targetFile,f['id']] )
+
+    # ...for each file in this folder
+    
+    return dataEnumerator    
+
+# ... def PrepareFolderDownload
+    
+
+#%% Enumerate files for download (execution)
+
+startTime = time.time()
+
+if (enumerationMode == 'ifnecessary') and (os.path.exists(downloadListFile)):
+
+    downloadList = []
+    with open(downloadListFile) as csvfile:
+        r = csv.reader(csvfile)
+        for iRow,row in enumerate(r):
+            if maxFiles > 0 and iRow > maxFiles:
+                break
+            else:
+                downloadList.append(row)
+
+    print("Read {} downloads from {}".format(len(downloadList),downloadListFile))
+
+else:
+
+    dataEnumerator = None
+        
+    if enumerationMode == 'errors':
+
+        splitLines = []
+
+        assert(os.path.isfile(errorListFileResume))
+
+        # Read the error file
+        # For each line in the input file
+        with open(errorListFileResume) as f:
+            rows = csv.reader(f)    
+            for iRow,row in enumerate(rows):
+                splitLines.append(row)
+
+        # Lines look like:
+        # 
+        # ['folder',folderTargetDir,folderID,errorString]
+
+        for iRow,row in enumerate(splitLines):
+            targetDir = row[1]
+            folderID = row[2]
+            errorString = row[3]
+            print('Re-trying folder ID {} ({})'.format(targetDir,folderID))
+            dataEnumerator = PrepareFolderDownload(folderID,targetDir)
+
+    # Either we're in 'all' mode or we're in 'ifnecessary' mode and enumeration is necessary
+    else:
+
+        print("Starting enumeration")
+        startTime = time.time()
+        dataEnumerator = PrepareFolderDownload(parentID,imageOutputDir)
+        elapsed = time.time() - startTime
+        print("Finished enumeration in {}".format(str(datetime.timedelta(seconds=elapsed))))
+    
+        print("Enumerated {} files".format(len(dataEnumerator.downloadList)))
+
+    s = json.dumps(dataEnumerator.fileInfo)
+    with open(metadataFile, "w+") as f:
+        f.write(s)
+    print("Finished writing metadata to {}".format(metadataFile))
+
+    with open(downloadListFile,'w+') as f:
+        for fileInfo in dataEnumerator.downloadList:
+            f.write(",".join(fileInfo) + "\n")
+    print("Finished writing download list to {}".format(downloadListFile))
+
+    with open(errorListFile,'w+') as f:
+        for e in dataEnumerator.errors:
+            f.write(",".join(e) + "\n")
+    print("Finished writing error list ({} errors) to {}".format(len(dataEnumerator.errors),errorListFile))
+
+    elapsed = time.time() - startTime
+    print("Done enumerating files in {}".format(humanfriendly.format_timespan(elapsed)))
+
+    downloadList = dataEnumerator.downloadList
+    
+# if/else on enumeration modes
+
+
+#%% Compute total download size
+    
+import tqdm
+import humanfriendly
+
+sizeBytes = 0
+
+for f in tqdm.tqdm(dataEnumerator.fileInfo):
+    
+    if 'fileSize' in f:
+        sizeBytes = sizeBytes + int(f['fileSize'])
+    
+print('Total download size is {} in {} files'.format(
+        humanfriendly.format_size(sizeBytes),len(dataEnumerator.fileInfo)))
+
+    
+#%% Download images (functions)
+
+import sys
+
+def ProcessDownload(fileInfo):
+
+    status = 'unknown'
+    targetFile = fileInfo[0]
+    if os.path.exists(targetFile):
+        print("Skipping download of file {}".format(targetFile))
+        status = 'skipped'
+        return status
+    id = fileInfo[1]
+    try:
+        f = drive.CreateFile({'id': id})
+        title = f['title']    
+    except:
+        print("File creation error for {}".format(targetFile))
+        status = 'create_error'
+        return status
+    print("Downloading file {} to {}".format(title,targetFile))        
+    try:
+        f.GetContentFile(targetFile)
+        status = 'success'
+        return status
+    except:
+        print("Download error for {}: {}".format(targetFile,sys.exc_info()[0]))
+        status = 'download_error'
+        return status
+
+def ProcessDownloadList(downloadList):
+
+    pool = ThreadPool(nThreads)
+    # results = pool.imap_unordered(lambda x: fetch_url(x,nImages), indexedUrlList)
+    results = pool.map(ProcessDownload, downloadList)
+
+    # for iFile,fileInfo in enumerate(downloadList):
+    #    ProcessDownload(fileInfo)
+    return results
+
+
+#%% Download images (execution)
+
+if downloadImages:
+
+    print('Downloading data...')
+    # results = ProcessDownloadList(downloadList[1:10])
+    results = ProcessDownloadList(downloadList)
+    print('...done.')
+    
+
+#%% Scrap
+
+if False:
+
+    #%% List files
+
+    from pydrive.drive import GoogleDrive
+    drive = GoogleDrive(gauth)
+    file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
+    for file1 in file_list:
+        print('title: %s, id: %s' % (file1['title'], file1['id']))
+
+
+    #%% List a particular directory
+
+    from pydrive.drive import GoogleDrive
+    drive = GoogleDrive(gauth)
+    folder_id = 'blahblahblah'
+    q = {'q': "'{}' in parents and trashed=false".format(folder_id)}
+    file_list = drive.ListFile(q).GetList()
+
+    for iFile,f in enumerate(file_list):
+        print('{}: {}, id: {}'.format(iFile,f['title'],f['id']))
+
+
+    #%% Recursive list
+
+    from pydrive.drive import GoogleDrive
+    drive = GoogleDrive(gauth)
+
+    def ListFolder(parentID,fileListOut=None):
+
+        if  fileListOut is None:
+            fileListOut = []
+
+        parentList = drive.ListFile({'q': "'%s' in parents and trashed=false" % parentID}).GetList()
+
+        for f in parentList:
+    
+            if len(fileListOut) > maxFiles:
+                return fileListOut
+
+            if f['mimeType']=='application/vnd.google-apps.folder': # if folder
+
+                title = f['title']
+                print("Enumerating folder {}".format(title))        
+                childFiles = ListFolder(f['id'],fileListOut)
+                print("Enumerated {} files".format(len(childFiles)))
+
+                fileListOut = fileListOut + childFiles
+                # fileListOut.append({"id":f['id'],"title":f['title'],"list":})
+    
+            else:            
+                fileListOut.append(f['title'])
+
+        return fileListOut
+
+    parent = -1;
+    file_list = ListFolder(parent)
+
+    print("Enumerated {} files".format(len(file_list)))
+
+