Added gDrive_download.py
This commit is contained in:
Parent
bc9c5010d1
Commit
59a4511d34
@@ -0,0 +1,420 @@
#
# gDrive_download.py
#
# Recursively enumerates a Google Drive directory, optionally downloading all
# the files in that directory. These are two separate steps; files are
# enumerated and written to file before anything is downloaded. If you only
# want the mapping from file names to gDrive GUIDs, you don't need to
# download anything.
#
# Uses the PyDrive library to talk to Google Drive, and assumes you've
# created a .json file with your secret key to access the drive, following
# this tutorial verbatim:
#
# https://gsuitedevs.github.io/PyDrive/docs/build/html/quickstart.html#authentication
#
# It can take a few tries to run this on large data sets (in particular to
# retry failed downloads a semi-arbitrary number of times), so this isn't
# entirely meant to be run from scratch; I ran this semi-interactively.
#
# Note that gDrive caps free access at 1000 queries / 100 seconds / user =
# 10 queries / second. You may get slightly faster access than that in
# practice, but not much.
#
# dan@microsoft.com
#

#%% Imports

import time
import datetime
import json
import os
import csv

from multiprocessing.pool import ThreadPool

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

import humanfriendly


#%% Configuration and constants

# Should we actually download images, or just enumerate them?
downloadImages = True

# Set to 'errors' when you've already downloaded most of the files and are
# just re-trying failures.
#
# One of 'all', 'errors', 'ifnecessary'
enumerationMode = 'ifnecessary'

# The GUID for the top-level folder
parentID = ''

# client_secrets.json lives here
clientSecretsPath = r'd:\git\ai4edev\dan\danMisc'

# Limits the number of files we enumerate (for debugging). Set to -1 to
# enumerate all files.
maxFiles = -1

# This can be empty if we're not downloading images
imageOutputDir = r'f:\video'

# When duplicate folders exist, should we merge them? The alternative is
# renaming the second instance of "blah" to "blah (1)". My experience has
# been that the gDrive sync behavior varies with OS: on Windows, renaming
# occurs; on MacOS, folders are merged.
bMergeDuplicateFolders = True


#%% Derived constants

# Change to the path where the client secrets file lives, to simplify auth
os.chdir(clientSecretsPath)

# Create a datestamped filename to which we'll write all the metadata we
# retrieve when we crawl the gDrive.
metadataOutputDir = os.path.join(imageOutputDir,'metadata_cache')

os.makedirs(metadataOutputDir,exist_ok=True)

metadataFileBase = os.path.join(metadataOutputDir,'imageMetadata.json')
dateStamp = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
name, ext = os.path.splitext(metadataFileBase)
metadataFile = "{}.{}{}".format(name,dateStamp,ext)
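# e.g. imageMetadata.2019.06.01.14.30.00.json (hypothetical timestamp)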

# List of files we need to download, just filename and GUID. This .csv
# file is written by the enumeration step.
downloadListFileBase = os.path.join(metadataOutputDir,'downloadList.csv')
name, ext = os.path.splitext(downloadListFileBase)
downloadListFile = "{}.{}{}".format(name,dateStamp,ext)
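
# downloadListFile is datestamped per run, so a fresh run won't find a list
# written by a previous run via the 'ifnecessary' check below. A minimal
# sketch (not wired into the logic below) for locating the most recent list;
# the zero-padded datestamp sorts lexicographically, so sorted() gives
# chronological order.
if False:
    import glob
    previousLists = sorted(glob.glob(os.path.join(metadataOutputDir,'downloadList.*.csv')))
    latestDownloadListFile = previousLists[-1] if previousLists else None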

# List of enumeration errors
errorListFileBase = os.path.join(metadataOutputDir,'enumerationErrors.csv')
name, ext = os.path.splitext(errorListFileBase)
errorListFile = "{}.{}{}".format(name,dateStamp,ext)

# If we are running in 'errors' mode, this is the list of directories we want
# to re-try
errorListFileResume = os.path.join(metadataOutputDir,'enumerationErrors.csv')

assert (not downloadImages) or (len(imageOutputDir) > 0), \
    "Can't have an empty output dir if you're downloading images"

# Only applies to downloading; enumeration is not currently multi-threaded
nThreads = 10


#%% Authenticate

gauth = GoogleAuth()
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)
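
# LocalWebserverAuth() opens a browser on every run. This is a minimal sketch
# of PyDrive's credential caching, so repeated semi-interactive runs don't
# re-prompt; the cache file name 'mycreds.txt' is a hypothetical choice.
if False:
    gauth = GoogleAuth()
    gauth.LoadCredentialsFile('mycreds.txt')
    if gauth.credentials is None:
        gauth.LocalWebserverAuth()
    elif gauth.access_token_expired:
        gauth.Refresh()
    else:
        gauth.Authorize()
    gauth.SaveCredentialsFile('mycreds.txt')
    drive = GoogleDrive(gauth)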


#%% Enumerate files for download (functions)

class DataEnumerator:
    '''
    Accumulates enumeration state across recursive calls. Attributes are set
    in __init__ (rather than as class attributes) so that each instance gets
    its own lists.
    '''
    def __init__(self):
        self.nFiles = 0
        self.nFolders = 0
        self.errors = []
        self.fileInfo = []
        self.downloadList = []

def PrepareFolderDownload(folderID,folderTargetDir,dataEnumerator=None):
    '''
    Enumerate files and directories in a single folder, specified by the GUID
    folderID. Called once for every folder we encounter; recurses into child
    folders.
    '''
    if dataEnumerator is None:
        dataEnumerator = DataEnumerator()

    try:
        fileList = drive.ListFile({'q': "'%s' in parents and trashed=false" % folderID}).GetList()
    except Exception as ex:
        errorString = str(ex)
        print("Error listing directory {}:{}:{}".format(folderTargetDir,folderID,errorString))
        dataEnumerator.errors.append(['folder',folderTargetDir,folderID,errorString])
        return dataEnumerator

    titles = set()

    # Handle redundant directory names
    for f in fileList:

        title = f['title']

        if title in titles:
            if bMergeDuplicateFolders:
                print("Warning: folder conflict at {}/{}".format(folderTargetDir,title))
            else:
                # Rename folders and files the way the gDrive sync app does,
                # i.e. if there are two files called "Blah", we want "Blah"
                # and "Blah (1)". Keep incrementing until the name is unique.
                nRenames = 0
                newTitle = title
                while newTitle in titles:
                    nRenames += 1
                    newTitle = "{} ({})".format(title,nRenames)
                print("Renaming {} to {} in [{}]".format(title,newTitle,folderTargetDir))
                title = newTitle
                f['title'] = title
                titles.add(title)
        else:
            titles.add(title)

    # ...for every file in our list (handling redundant directory names)

    # Enumerate and process files in this folder
    for f in fileList:

        if maxFiles > 0 and dataEnumerator.nFiles > maxFiles:
            return dataEnumerator

        dataEnumerator.fileInfo.append(f)

        title = f['title']

        if f['mimeType'] == 'application/vnd.google-apps.folder':

            # This is a folder
            dataEnumerator.nFolders += 1

            # Create the target directory if necessary
            outputDir = os.path.join(folderTargetDir,title)
            f['target'] = outputDir
            if downloadImages:
                if not os.path.exists(outputDir):
                    os.mkdir(outputDir)

            print("Enumerating folder {} to {}".format(title,outputDir))

            # Recurse
            dataEnumerator = PrepareFolderDownload(f['id'],outputDir,dataEnumerator)

        else:

            # This is a file
            dataEnumerator.nFiles += 1

            targetFile = os.path.join(folderTargetDir,title)
            f['target'] = targetFile
            print("Queuing file {} for download to {}".format(title,targetFile))
            dataEnumerator.downloadList.append([targetFile,f['id']])

    # ...for each file in this folder

    return dataEnumerator

# ...def PrepareFolderDownload


#%% Enumerate files for download (execution)

startTime = time.time()

dataEnumerator = None

# downloadListFile is datestamped per run, so this check only short-circuits
# enumeration when re-running this cell within the same interactive session.
if (enumerationMode == 'ifnecessary') and (os.path.exists(downloadListFile)):

    downloadList = []
    with open(downloadListFile) as csvfile:
        r = csv.reader(csvfile)
        for iRow,row in enumerate(r):
            if maxFiles > 0 and iRow > maxFiles:
                break
            downloadList.append(row)

    print("Read {} downloads from {}".format(len(downloadList),downloadListFile))

else:

    if enumerationMode == 'errors':

        assert os.path.isfile(errorListFileResume)

        # Read the error file; rows look like:
        #
        # ['folder',folderTargetDir,folderID,errorString]
        splitLines = []
        with open(errorListFileResume) as f:
            rows = csv.reader(f)
            for row in rows:
                splitLines.append(row)

        # Re-enumerate each folder that previously failed, accumulating
        # results into a single DataEnumerator
        for row in splitLines:
            targetDir = row[1]
            folderID = row[2]
            print('Re-trying folder {} (ID {})'.format(targetDir,folderID))
            dataEnumerator = PrepareFolderDownload(folderID,targetDir,dataEnumerator)

    # Either we're in 'all' mode, or we're in 'ifnecessary' mode and
    # enumeration is necessary
    else:

        print("Starting enumeration")
        dataEnumerator = PrepareFolderDownload(parentID,imageOutputDir)

    print("Enumerated {} files".format(len(dataEnumerator.downloadList)))

    s = json.dumps(dataEnumerator.fileInfo)
    with open(metadataFile,"w+") as f:
        f.write(s)
    print("Finished writing metadata to {}".format(metadataFile))

    # Use csv.writer so that file names containing commas round-trip correctly
    with open(downloadListFile,'w+',newline='') as f:
        w = csv.writer(f)
        for fileInfo in dataEnumerator.downloadList:
            w.writerow(fileInfo)
    print("Finished writing download list to {}".format(downloadListFile))

    with open(errorListFile,'w+',newline='') as f:
        w = csv.writer(f)
        for e in dataEnumerator.errors:
            w.writerow(e)
    print("Finished writing error list ({} errors) to {}".format(len(dataEnumerator.errors),errorListFile))

    downloadList = dataEnumerator.downloadList

# ...if/else on enumeration modes

elapsed = time.time() - startTime
print("Done enumerating files in {}".format(humanfriendly.format_timespan(elapsed)))


#%% Compute total download size

import tqdm

# If we read the download list from a previous run's .csv file, we don't have
# per-file metadata in memory, so there's nothing to total up here.
if dataEnumerator is not None:

    sizeBytes = 0

    for f in tqdm.tqdm(dataEnumerator.fileInfo):
        # Google-native documents have no 'fileSize' field
        if 'fileSize' in f:
            sizeBytes += int(f['fileSize'])

    print('Total download size is {} in {} files'.format(
        humanfriendly.format_size(sizeBytes),len(dataEnumerator.fileInfo)))


#%% Download images (functions)

import sys

def ProcessDownload(fileInfo):
    '''
    Download a single file, specified as a [targetFile,GUID] pair. Returns a
    status string.
    '''
    targetFile = fileInfo[0]
    if os.path.exists(targetFile):
        print("Skipping download of file {}".format(targetFile))
        return 'skipped'

    fileID = fileInfo[1]
    try:
        f = drive.CreateFile({'id': fileID})
        # Accessing the title forces a metadata fetch
        title = f['title']
    except Exception:
        print("File creation error for {}".format(targetFile))
        return 'create_error'

    print("Downloading file {} to {}".format(title,targetFile))
    try:
        f.GetContentFile(targetFile)
        return 'success'
    except Exception:
        print("Download error for {}: {}".format(targetFile,sys.exc_info()[0]))
        return 'download_error'


def ProcessDownloadList(downloadList):

    pool = ThreadPool(nThreads)
    results = pool.map(ProcessDownload, downloadList)

    # Serial equivalent, useful for debugging:
    #
    # results = [ProcessDownload(fileInfo) for fileInfo in downloadList]

    return results
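
# Given the quota noted in the header (~10 queries/second on the free tier),
# transient failures are expected on large trees. A minimal retry sketch with
# exponential backoff (an assumption, not part of the original flow); e.g.
# wrap the download call above as WithRetries(lambda: f.GetContentFile(targetFile)).
if False:
    def WithRetries(fn,nRetries=5,initialDelaySeconds=1.0):
        delay = initialDelaySeconds
        for iAttempt in range(nRetries):
            try:
                return fn()
            except Exception as ex:
                # Give up after the last attempt
                if iAttempt == nRetries - 1:
                    raise
                print('Retrying after error: {}'.format(str(ex)))
                time.sleep(delay)
                delay *= 2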


#%% Download images (execution)

if downloadImages:

    print('Downloading data...')
    # To test on a subset first: results = ProcessDownloadList(downloadList[1:10])
    results = ProcessDownloadList(downloadList)
    print('...done.')


#%% Scrap

if False:

    #%% List files in the drive root

    file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
    for file1 in file_list:
        print('title: %s, id: %s' % (file1['title'], file1['id']))


    #%% List a particular directory

    folder_id = 'blahblahblah'
    q = {'q': "'{}' in parents and trashed=false".format(folder_id)}
    file_list = drive.ListFile(q).GetList()

    for iFile,f in enumerate(file_list):
        print('{}: {}, id: {}'.format(iFile,f['title'],f['id']))
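
    # The 'q' strings above use the Drive v2 query language; a couple of other
    # examples in the same syntax (the file name and MIME type here are just
    # illustrative):
    #
    # drive.ListFile({'q': "title contains 'IMG_' and trashed=false"}).GetList()
    # drive.ListFile({'q': "mimeType='image/jpeg' and trashed=false"}).GetList()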

    #%% Recursive list

    def ListFolder(parentID,fileListOut=None):

        if fileListOut is None:
            fileListOut = []

        parentList = drive.ListFile({'q': "'%s' in parents and trashed=false" % parentID}).GetList()

        for f in parentList:

            if maxFiles > 0 and len(fileListOut) > maxFiles:
                return fileListOut

            if f['mimeType'] == 'application/vnd.google-apps.folder':

                # This is a folder; recurse. ListFolder appends to (and
                # returns) fileListOut, so don't concatenate the result onto
                # fileListOut again.
                title = f['title']
                print("Enumerating folder {}".format(title))
                fileListOut = ListFolder(f['id'],fileListOut)
                print("Enumerated {} files so far".format(len(fileListOut)))

            else:
                fileListOut.append(f['title'])

        return fileListOut

    # Placeholder parent GUID
    parent = -1
    file_list = ListFolder(parent)

    print("Enumerated {} files".format(len(file_list)))