#
# gDrive_download.py
#
# Recursively enumerates a google drive directory, optionally downloading all the
# files in that directory. These are two separate steps; files are enumerated
# and written to file before anything is downloaded. If you only want to connect
# file names to gDrive GUIDs, you don't need to download anything.
#
# Uses the PyDrive library to talk to google drive, and assumes you've created a
# .json file with your secret key to access the drive, following this tutorial
# verbatim:
#
# https://gsuitedevs.github.io/PyDrive/docs/build/html/quickstart.html#authentication
#
# It can take a few tries to run this on large data sets (in particular to
# retry failed downloads a semi-arbitrary number of times), so this isn't
# entirely meant to be run from scratch; I'd say I ran this semi-interactively.
#
# Note that gDrive caps free access at 1000 queries / 100 seconds / user =
# 10 queries / second. You may get slightly faster access than that in practice, but
# not much.
#
# dan@microsoft.com
#
#%% Imports
import time
import datetime
import json
import os
import csv
import glob
from pydrive.auth import GoogleAuth
from multiprocessing.pool import ThreadPool
from pydrive.drive import GoogleDrive
import humanfriendly
#%% Configuration and constants
# Should we actually download images, or just enumerate them?
downloadImages = True
# Set to 'errors' when you've already downloaded most of the files and are just
# re-trying failures
#
# 'all','errors','ifnecessary'
enumerationMode = 'ifnecessary'
# The GUID for the top-level folder
parentID = ''
# client_secrets.json lives here
clientSecretsPath = r'd:\git\ai4edev\dan\danMisc'
# Limits the number of files we enumerate (for debugging). Set to -1 to enumerate
# all files.
maxFiles = -1
# This can be empty if we're not writing images
imageOutputDir = r'f:\video'
# When duplicate folders exist, should we merge them? The alternative is
# renaming the second instance of "blah" to "blah (1)". My experience has been
# that the gDrive sync behavior varies with OS; on Windows, renaming occurs, on MacOS,
# folders are merged.
bMergeDuplicateFolders = True
#%% Derived constants
# Change to the path where the client secrets file lives, to simplify auth
os.chdir(clientSecretsPath)
# Create a datestamped filename to which we'll write all the metadata we
# retrieve when we crawl the gDrive.
metadataOutputDir = os.path.join(imageOutputDir,'metadata_cache')
os.makedirs(metadataOutputDir,exist_ok=True)
metadataFileBase = os.path.join(metadataOutputDir,'imageMetadata.json')
dateStamp = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
name, ext = os.path.splitext(metadataFileBase)
metadataFile = "{}.{}{}".format(name,dateStamp,ext)
# List of files we need to download, just filename and GUID. This .csv
# file is written by the enumeration step.
downloadListFileBase = os.path.join(metadataOutputDir,'downloadList.csv')
name, ext = os.path.splitext(downloadListFileBase)
downloadListFile = "{}.{}{}".format(name,dateStamp,ext)
# List of enumeration errors
errorListFileBase = os.path.join(metadataOutputDir,'enumerationErrors.csv')
name, ext = os.path.splitext(errorListFileBase)
errorListFile = "{}.{}{}".format(name,dateStamp,ext)
# If we are running in "errors" mode, this is the list of directories we want to re-try
errorListFileResume = os.path.join(metadataOutputDir,r"enumerationErrors.csv")
assert (not downloadImages) or (not len(imageOutputDir)==0), 'Can\'t have empty output dir if you\'re downloading images'
# Only applies to downloading; enumeration is not currently multi-threaded
nThreads = 10
#%% Authenticate
gauth = GoogleAuth()
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)
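
# Optionally, cache credentials between runs so that repeated invocations don't
# re-open a browser window each time. This is the standard PyDrive credential-
# caching pattern; 'credentials.json' here is an arbitrary cache filename, not
# something the rest of this script depends on.
#
# gauth.LoadCredentialsFile('credentials.json')
# if gauth.credentials is None:
#     gauth.LocalWebserverAuth()
# elif gauth.access_token_expired:
#     gauth.Refresh()
# else:
#     gauth.Authorize()
# gauth.SaveCredentialsFile('credentials.json')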
#%% Enumerate files for download (functions)
class DataEnumerator:

    def __init__(self):
        self.nFiles = 0
        self.nFolders = 0
        self.errors = []
        self.fileInfo = []
        self.downloadList = []

def PrepareFolderDownload(folderID,folderTargetDir,dataEnumerator=None):
    '''
    Enumerate files and directories in a single folder, specified by the GUID
    folderID. Will be called once for every folder we encounter. Does not make
    recursive calls.
    '''
    if dataEnumerator is None:
        dataEnumerator = DataEnumerator()

    try:
        fileList = drive.ListFile({'q': "'%s' in parents and trashed=false" % folderID}).GetList()
    except Exception as ex:
        errorString = str(ex)
        print("Error listing directory {}:{}:{}".format(folderTargetDir,folderID,errorString))
        dataEnumerator.errors.append( ['folder',folderTargetDir,folderID,errorString] )
        return dataEnumerator

    # Handle redundant directory names, counting instances of each title so that
    # the third "blah" becomes "blah (2)" rather than a second "blah (1)"
    titleCounts = {}

    for f in fileList:

        title = f['title']
        nPreviousInstances = titleCounts.get(title,0)
        titleCounts[title] = nPreviousInstances + 1

        if nPreviousInstances > 0:
            if bMergeDuplicateFolders:
                print("Warning: folder conflict at {}/{}".format(folderTargetDir,title))
            else:
                # Try to rename folders and files the way the gDrive sync app does, i.e. if
                # there are two files called "Blah", we want "Blah" and "Blah (1)".
                newTitle = title + " ({})".format(nPreviousInstances)
                print("Renaming {} to {} in [{}]".format(title,newTitle,folderTargetDir))
                f['title'] = newTitle

    # ...for every file in our list (handling redundant directory names)

    # Enumerate and process files in this folder
    for f in fileList:

        if maxFiles > 0 and dataEnumerator.nFiles > maxFiles:
            return dataEnumerator

        dataEnumerator.fileInfo.append(f)
        title = f['title']

        if f['mimeType'] == 'application/vnd.google-apps.folder':

            # This is a folder; create the target directory if necessary
            dataEnumerator.nFolders = dataEnumerator.nFolders + 1
            outputDir = os.path.join(folderTargetDir,title)
            f['target'] = outputDir
            if downloadImages:
                os.makedirs(outputDir,exist_ok=True)
            print("Enumerating folder {} to {}".format(title,outputDir))

            # Recurse
            dataEnumerator = PrepareFolderDownload(f['id'],outputDir,dataEnumerator)

        else:

            # This is a file; queue it for download
            dataEnumerator.nFiles = dataEnumerator.nFiles + 1
            targetFile = os.path.join(folderTargetDir,title)
            f['target'] = targetFile
            print("Queueing file {} for download to {}".format(title,targetFile))
            dataEnumerator.downloadList.append( [targetFile,f['id']] )

    # ...for each file in this folder

    return dataEnumerator
# ... def PrepareFolderDownload
#%% Enumerate files for download (execution)
startTime = time.time()

dataEnumerator = None

# In 'ifnecessary' mode, resume from the most recent previously-written download
# list, if there is one. Note that downloadListFile itself is stamped with the
# current date/time, so it's never the file we resume from.
existingDownloadLists = sorted(glob.glob(os.path.join(metadataOutputDir,'downloadList.*.csv')))
downloadListFileResume = existingDownloadLists[-1] if len(existingDownloadLists) > 0 else None

if (enumerationMode == 'ifnecessary') and (downloadListFileResume is not None):

    downloadList = []
    with open(downloadListFileResume) as csvfile:
        r = csv.reader(csvfile)
        for iRow,row in enumerate(r):
            if maxFiles > 0 and iRow > maxFiles:
                break
            else:
                downloadList.append(row)
    print("Read {} downloads from {}".format(len(downloadList),downloadListFileResume))

else:

    if enumerationMode == 'errors':

        assert os.path.isfile(errorListFileResume)
        dataEnumerator = DataEnumerator()

        # Read the error file; lines look like:
        #
        # ['folder',folderTargetDir,folderID,errorString]
        splitLines = []
        with open(errorListFileResume) as f:
            rows = csv.reader(f)
            for iRow,row in enumerate(rows):
                splitLines.append(row)

        for iRow,row in enumerate(splitLines):
            targetDir = row[1]
            folderID = row[2]
            errorString = row[3]
            print('Re-trying folder {} (ID {})'.format(targetDir,folderID))
            # Pass the accumulator through, so all the re-tried folders land in
            # a single enumeration
            dataEnumerator = PrepareFolderDownload(folderID,targetDir,dataEnumerator)

    # Either we're in 'all' mode, or we're in 'ifnecessary' mode and enumeration
    # is necessary
    else:

        print("Starting enumeration")
        startTime = time.time()
        dataEnumerator = PrepareFolderDownload(parentID,imageOutputDir)
        elapsed = time.time() - startTime
        print("Finished enumeration in {}".format(str(datetime.timedelta(seconds=elapsed))))

    print("Enumerated {} files".format(len(dataEnumerator.downloadList)))

    with open(metadataFile,'w') as f:
        f.write(json.dumps(dataEnumerator.fileInfo))
    print("Finished writing metadata to {}".format(metadataFile))

    # Use csv.writer (rather than joining on commas) so that titles containing
    # commas survive the round trip through csv.reader above
    with open(downloadListFile,'w',newline='') as f:
        w = csv.writer(f)
        for fileInfo in dataEnumerator.downloadList:
            w.writerow(fileInfo)
    print("Finished writing download list to {}".format(downloadListFile))

    with open(errorListFile,'w',newline='') as f:
        w = csv.writer(f)
        for e in dataEnumerator.errors:
            w.writerow(e)
    print("Finished writing error list ({} errors) to {}".format(len(dataEnumerator.errors),errorListFile))

    elapsed = time.time() - startTime
    print("Done enumerating files in {}".format(humanfriendly.format_timespan(elapsed)))

    downloadList = dataEnumerator.downloadList

# ...if/else on enumeration modes

#%% Compute total download size

import tqdm

# This total is only meaningful if we enumerated in this session, i.e. if we
# didn't just read a previous download list. Google-Docs-native files have no
# 'fileSize' field, so they don't contribute to the total.
if dataEnumerator is not None:
    sizeBytes = 0
    for f in tqdm.tqdm(dataEnumerator.fileInfo):
        if 'fileSize' in f:
            sizeBytes = sizeBytes + int(f['fileSize'])
    print('Total download size is {} in {} files'.format(
        humanfriendly.format_size(sizeBytes),len(dataEnumerator.fileInfo)))

#%% Download images (functions)

import sys

def ProcessDownload(fileInfo):
    '''
    Download a single file, where fileInfo is a [targetFile,GUID] pair.
    Returns a status string.
    '''
    status = 'unknown'
    targetFile = fileInfo[0]

    if os.path.exists(targetFile):
        print("Skipping download of file {}".format(targetFile))
        status = 'skipped'
        return status

    fileID = fileInfo[1]
    try:
        f = drive.CreateFile({'id': fileID})
        title = f['title']
    except Exception:
        print("File creation error for {}".format(targetFile))
        status = 'create_error'
        return status

    print("Downloading file {} to {}".format(title,targetFile))
    try:
        f.GetContentFile(targetFile)
        status = 'success'
        return status
    except Exception:
        print("Download error for {}: {}".format(targetFile,sys.exc_info()[0]))
        status = 'download_error'
        return status
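
# The header notes that large jobs can take a few tries, in particular to retry
# failed downloads. This is a minimal sketch of a per-file retry wrapper with a
# linear backoff; nRetries and retryDelaySeconds are arbitrary choices, and the
# wrapper is not wired into ProcessDownloadList by default. To use it, map the
# pool over ProcessDownloadWithRetries instead of ProcessDownload.
def ProcessDownloadWithRetries(fileInfo, nRetries=3, retryDelaySeconds=10):
    status = 'unknown'
    for iAttempt in range(nRetries):
        status = ProcessDownload(fileInfo)
        if status in ('success','skipped'):
            return status
        # Wait a little longer after each failed attempt
        time.sleep(retryDelaySeconds * (iAttempt + 1))
    return status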

def ProcessDownloadList(downloadList):
    pool = ThreadPool(nThreads)
    results = pool.map(ProcessDownload, downloadList)
    pool.close()
    pool.join()
    return results
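
# gDrive's free-tier cap works out to ~10 queries/second (see the header note),
# and with nThreads workers it's easy to exceed that. This is a minimal sketch
# of a shared inter-query throttle, assuming a simple global-lock approach is
# acceptable; it isn't called anywhere by default. Callers would invoke
# ThrottleQueries() immediately before each drive request.
import threading

_queryLock = threading.Lock()
_lastQueryTime = [0.0]

def ThrottleQueries(minIntervalSeconds=0.1):
    # Block until at least minIntervalSeconds has elapsed since the last query
    # issued through this throttle
    with _queryLock:
        elapsed = time.time() - _lastQueryTime[0]
        if elapsed < minIntervalSeconds:
            time.sleep(minIntervalSeconds - elapsed)
        _lastQueryTime[0] = time.time()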

#%% Download images (execution)

if downloadImages:
    print('Downloading data...')
    # results = ProcessDownloadList(downloadList[1:10])
    results = ProcessDownloadList(downloadList)
    print('...done.')

#%% Scrap

if False:

    #%% List files

    from pydrive.drive import GoogleDrive
    drive = GoogleDrive(gauth)
    file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
    for file1 in file_list:
        print('title: %s, id: %s' % (file1['title'], file1['id']))

    #%% List a particular directory

    from pydrive.drive import GoogleDrive
    drive = GoogleDrive(gauth)
    folder_id = 'blahblahblah'
    q = {'q': "'{}' in parents and trashed=false".format(folder_id)}
    file_list = drive.ListFile(q).GetList()
    for iFile,f in enumerate(file_list):
        print('{}: {}, id: {}'.format(iFile,f['title'],f['id']))

    #%% Recursive list

    from pydrive.drive import GoogleDrive
    drive = GoogleDrive(gauth)

    def ListFolder(parentID,fileListOut=None):
        if fileListOut is None:
            fileListOut = []
        parentList = drive.ListFile({'q': "'%s' in parents and trashed=false" % parentID}).GetList()
        for f in parentList:
            if maxFiles > 0 and len(fileListOut) > maxFiles:
                return fileListOut
            if f['mimeType'] == 'application/vnd.google-apps.folder':
                title = f['title']
                print("Enumerating folder {}".format(title))
                # ListFolder appends to (and returns) fileListOut, so don't
                # concatenate the result back onto the list
                fileListOut = ListFolder(f['id'],fileListOut)
                print("Enumerated {} files".format(len(fileListOut)))
            else:
                fileListOut.append(f['title'])
        return fileListOut

    parent = -1
    file_list = ListFolder(parent)
    print("Enumerated {} files".format(len(file_list)))