421 строка
13 KiB
Python
421 строка
13 KiB
Python
#
|
|
# gDrive_download.py
|
|
#
|
|
# Recursively enumerates a google drive directory, optionally downloading all the
|
|
# files in that directory. These are two separate steps; files are enumerated
|
|
# and written to file before anything is downloaded. If you only want to connect
|
|
# file names and gDrive GUIDs, you don't need to download anything.
|
|
#
|
|
# Uses the PyDrive library to talk to google drive, and assumes you've created a
|
|
# .json file with your secret key to access the drive, following this tutorial
|
|
# verbatim:
|
|
#
|
|
# https://gsuitedevs.github.io/PyDrive/docs/build/html/quickstart.html#authentication
|
|
#
|
|
# It can take a few tries to run this on large data sets (in particular to
|
|
# retry failed downloads a semi-arbitrary number of times), so this isn't
|
|
# entirely meant to be run from scratch; I'd say I ran this semi-interactively.
|
|
#
|
|
# Note that gDrive caps free access at 1000 queries / 100 seconds / user =
|
|
# 10 queries / second. You may get slightly faster access than that in practice, but
|
|
# not much.
|
|
#
|
|
# dan@microsoft.com
|
|
#
|
|
|
|
#%% Imports
|
|
|
|
import time
|
|
import datetime
|
|
import json
|
|
import os
|
|
import csv
|
|
from pydrive.auth import GoogleAuth
|
|
from multiprocessing.pool import ThreadPool
|
|
from pydrive.drive import GoogleDrive
|
|
import humanfriendly
|
|
|
|
|
|
#%% Configuration and constants

# Non-zero: download every enumerated file.  Zero: only enumerate.
downloadImages = 1

# How the enumeration step behaves; one of:
#
# 'all','errors','ifnecessary'
#
# Use 'errors' once most files are already on disk and only the folders that
# failed to enumerate need to be re-tried.
enumerationMode = 'ifnecessary'

# gDrive GUID of the top-level folder to crawl
parentID = ''

# Folder in which client_secrets.json lives
clientSecretsPath = r'd:\git\ai4edev\dan\danMisc'

# Debugging cap on the number of files we enumerate; -1 means "no cap"
maxFiles = -1

# Local root folder for downloads.  This can be empty if we're not writing
# images.
imageOutputDir = r'f:\video'

# What to do when one gDrive folder contains two entries with the same name:
# merge them (True), or rename the second "blah" to "blah (1)" (False).  My
# experience has been that the gDrive sync behavior varies with OS; on Windows,
# renaming occurs, on MacOS, folders are merged.
bMergeDuplicateFolders = True
|
|
|
|
|
|
#%% Derived constants

def _AddDateStamp(path,stamp):
    '''
    Insert [stamp] between the base name and the extension of [path], e.g.
    "a/b.csv" becomes "a/b.<stamp>.csv".
    '''
    base, extension = os.path.splitext(path)
    return "{}.{}{}".format(base,stamp,extension)


# Work from the folder where the client secrets file lives, to simplify auth
os.chdir(clientSecretsPath)

# Everything we learn while crawling the gDrive is cached in this folder
metadataOutputDir = os.path.join(imageOutputDir,'metadata_cache')
os.makedirs(metadataOutputDir,exist_ok=True)

# A single timestamp shared by every output file of this session
dateStamp = datetime.datetime.now().strftime('%Y.%m.%d.%H.%M.%S')

# Datestamped file that receives all the metadata retrieved during the crawl
metadataFileBase = os.path.join(metadataOutputDir,'imageMetadata.json')
metadataFile = _AddDateStamp(metadataFileBase,dateStamp)

# Datestamped list of the files we need to download, just filename and GUID.
# This .csv file is written by the enumeration step.
downloadListFileBase = os.path.join(metadataOutputDir,'downloadList.csv')
downloadListFile = _AddDateStamp(downloadListFileBase,dateStamp)

# Datestamped list of the errors recorded during enumeration
errorListFileBase = os.path.join(metadataOutputDir,'enumerationErrors.csv')
errorListFile = _AddDateStamp(errorListFileBase,dateStamp)

# In "errors" mode, the folders to re-try are read from this file; note that
# this name carries no datestamp.
errorListFileResume = os.path.join(metadataOutputDir,r"enumerationErrors.csv")

assert (len(imageOutputDir) != 0) or (not downloadImages), 'Can\'t have empty output dir if you\'re downloading images'

# Only applies to downloading; enumeration is not currently multi-threaded
nThreads = 10
|
|
|
|
|
|
#%% Authenticate

# Authenticate via PyDrive's local-webserver flow.  Presumably this picks up
# client_secrets.json from the working directory, which is why the "Derived
# constants" cell changes into clientSecretsPath first (TODO confirm against
# the PyDrive quickstart).
gauth = GoogleAuth()
gauth.LocalWebserverAuth()

# Handle used for every gDrive listing and download in the rest of this file
drive = GoogleDrive(gauth)
|
|
|
|
|
|
#%% Enumerate files for download (functions)
|
|
|
|
class DataEnumerator:
    '''
    Accumulates the results of a gDrive enumeration pass: running counts plus
    the lists that are later written to the metadata, download-list, and
    error-list files.
    '''

    # Number of files / folders seen so far.  These are rebound per instance
    # when PrepareFolderDownload increments them.
    nFiles = nFolders = 0

    # NOTE: the three lists below are class attributes and are only ever
    # appended to, so every instance shares the same list objects.
    #
    # errors:       ['folder',targetDir,folderID,errorString] per failed listing
    # fileInfo:     the gDrive record of every file and folder enumerated
    # downloadList: [targetFile,fileID] for every file to download
    errors, fileInfo, downloadList = [], [], []
|
|
|
|
|
|
def _ResolveDuplicateTitles(fileList,folderTargetDir):
    '''
    Make the 'title' of every entry in [fileList] usable as a local name.

    When bMergeDuplicateFolders is set, duplicates are only reported.
    Otherwise, repeated titles are renamed in place the way the gDrive sync
    app does, i.e. "Blah", "Blah (1)", "Blah (2)", ...
    '''

    titles = set()

    # Last suffix handed out for each duplicated title
    lastSuffix = {}

    for f in fileList:

        title = f['title']

        if title not in titles:
            titles.add(title)
            continue

        if bMergeDuplicateFolders:
            print("Warning: folder conflict at {}/{}".format(folderTargetDir,title))
            continue

        # Keep counting until we find a name nobody in this folder is using,
        # so a third "Blah" becomes "Blah (2)" rather than a second "Blah (1)".
        suffix = lastSuffix.get(title,0)
        while True:
            suffix = suffix + 1
            newTitle = title + " ({})".format(suffix)
            if newTitle not in titles:
                break
        lastSuffix[title] = suffix

        print("Renaming {} to {} in [{}]".format(title,newTitle,folderTargetDir))
        titles.add(newTitle)
        f['title'] = newTitle

    # ...for every file in our list


def PrepareFolderDownload(folderID,folderTargetDir,dataEnumerator=None):
    '''
    Enumerate the files and folders in the gDrive folder specified by the GUID
    [folderID], recursing into every sub-folder.

    Nothing is downloaded here.  Each file is recorded in
    dataEnumerator.downloadList as [targetFile,fileID]; each sub-folder is
    created under [folderTargetDir] (only when downloadImages is set) and then
    enumerated.  Every gDrive record visited gets a 'target' field holding its
    local path and is appended to dataEnumerator.fileInfo.

    A folder that can't be listed is recorded in dataEnumerator.errors as
    ['folder',folderTargetDir,folderID,errorString]; no exception is raised.

    Args:
        folderID: gDrive GUID of the folder to enumerate
        folderTargetDir: local folder that mirrors [folderID]
        dataEnumerator: DataEnumerator to accumulate into; a new one is
            created when this is None

    Returns:
        The DataEnumerator holding the accumulated results.
    '''

    if dataEnumerator is None:
        dataEnumerator = DataEnumerator()

    try:

        fileList = drive.ListFile({'q': "'%s' in parents and trashed=false" % folderID}).GetList()

    except Exception as ex:

        errorString = str(ex)
        print("Error listing directory {}:{}:{}".format(folderTargetDir,folderID,errorString))
        dataEnumerator.errors.append( ['folder',folderTargetDir,folderID,errorString] )
        return dataEnumerator

    # Handle redundant names
    _ResolveDuplicateTitles(fileList,folderTargetDir)

    # Enumerate and process files in this folder
    for f in fileList:

        # Stop as soon as exactly maxFiles files have been enumerated
        if maxFiles > 0 and dataEnumerator.nFiles >= maxFiles:
            return dataEnumerator

        dataEnumerator.fileInfo.append(f)

        title = f['title']

        if f['mimeType']=='application/vnd.google-apps.folder': # if folder

            dataEnumerator.nFolders = dataEnumerator.nFolders + 1

            # Create the target directory if necessary.  makedirs also creates
            # missing parents, which matters when a folder is re-tried without
            # its parents having been created in this session.
            outputDir = os.path.join(folderTargetDir,title)
            f['target'] = outputDir
            if downloadImages:
                os.makedirs(outputDir,exist_ok=True)

            print("Enumerating folder {} to {}".format(title,outputDir))

            # Recurse
            dataEnumerator = PrepareFolderDownload(f['id'],outputDir,dataEnumerator)

        else:

            dataEnumerator.nFiles = dataEnumerator.nFiles + 1

            targetFile = os.path.join(folderTargetDir,title)
            f['target'] = targetFile
            print("Downloading file {} to {}".format(title,targetFile))
            dataEnumerator.downloadList.append( [targetFile,f['id']] )

    # ...for each file in this folder

    return dataEnumerator

# ... def PrepareFolderDownload
|
|
|
|
|
|
#%% Enumerate files for download (execution)

startTime = time.time()

if (enumerationMode == 'ifnecessary') and (os.path.exists(downloadListFile)):

    # A download list with this session's datestamp already exists, so re-use
    # it rather than crawling the drive again.
    downloadList = []
    with open(downloadListFile,newline='') as csvfile:
        for iRow,row in enumerate(csv.reader(csvfile)):
            # Read at most maxFiles rows
            if maxFiles > 0 and iRow >= maxFiles:
                break
            downloadList.append(row)

    print("Read {} downloads from {}".format(len(downloadList),downloadListFile))

else:

    dataEnumerator = None

    if enumerationMode == 'errors':

        assert os.path.isfile(errorListFileResume), \
            'Error list {} does not exist'.format(errorListFileResume)

        # Read the error file; lines look like:
        #
        # ['folder',folderTargetDir,folderID,errorString]
        with open(errorListFileResume,newline='') as f:
            errorRows = list(csv.reader(f))

        for row in errorRows:

            # csv.reader yields an empty list for a blank line
            if not row:
                continue

            targetDir = row[1]
            folderID = row[2]
            print('Re-trying folder ID {} ({})'.format(targetDir,folderID))

            # Thread one enumerator through all the re-tries, so the results
            # of every folder end up in the same place.
            dataEnumerator = PrepareFolderDownload(folderID,targetDir,dataEnumerator)

        # Nothing to re-try; fall through with empty results
        if dataEnumerator is None:
            dataEnumerator = DataEnumerator()

    # Either we're in 'all' mode or we're in 'ifnecessary' mode and enumeration is necessary
    else:

        print("Starting enumeration")
        startTime = time.time()
        dataEnumerator = PrepareFolderDownload(parentID,imageOutputDir)
        elapsed = time.time() - startTime
        print("Finished enumeration in {}".format(str(datetime.timedelta(seconds=elapsed))))

    print("Enumerated {} files".format(len(dataEnumerator.downloadList)))

    with open(metadataFile,'w') as f:
        json.dump(dataEnumerator.fileInfo,f)
    print("Finished writing metadata to {}".format(metadataFile))

    # Both lists are read back with csv.reader, so write them with csv.writer;
    # a plain ",".join() corrupts rows whose paths or error strings contain
    # commas, quotes, or newlines.
    with open(downloadListFile,'w',newline='') as f:
        csv.writer(f).writerows(dataEnumerator.downloadList)
    print("Finished writing download list to {}".format(downloadListFile))

    with open(errorListFile,'w',newline='') as f:
        csv.writer(f).writerows(dataEnumerator.errors)
    print("Finished writing error list ({} errors) to {}".format(len(dataEnumerator.errors),errorListFile))

    elapsed = time.time() - startTime
    print("Done enumerating files in {}".format(humanfriendly.format_timespan(elapsed)))

    downloadList = dataEnumerator.downloadList

# if/else on enumeration modes
|
|
|
|
|
|
#%% Compute total download size

import tqdm
import humanfriendly

sizeBytes = 0
nDownloadableFiles = 0

for f in tqdm.tqdm(dataEnumerator.fileInfo):

    # fileInfo holds folders as well as files; folders are not downloads, so
    # they should not be counted in the file total.
    if f.get('mimeType') == 'application/vnd.google-apps.folder':
        continue

    nDownloadableFiles = nDownloadableFiles + 1

    # Not every record carries a size
    if 'fileSize' in f:
        sizeBytes = sizeBytes + int(f['fileSize'])

print('Total download size is {} in {} files'.format(
    humanfriendly.format_size(sizeBytes),nDownloadableFiles))
|
|
|
|
|
|
#%% Download images (functions)
|
|
|
|
import sys
|
|
|
|
def ProcessDownload(fileInfo):
    '''
    Download one file from gDrive, unless the target already exists locally.

    Args:
        fileInfo: a [targetFile,fileID] pair, as stored in the download list

    Returns:
        One of 'skipped' (target already exists), 'create_error' (the gDrive
        file object or its title could not be retrieved), 'download_error',
        or 'success'.  Errors are printed, never raised.
    '''

    targetFile = fileInfo[0]

    # Existing files are never overwritten; this is what makes re-running the
    # download step after a partial failure cheap.
    if os.path.exists(targetFile):
        print("Skipping download of file {}".format(targetFile))
        return 'skipped'

    fileID = fileInfo[1]

    # Catch Exception rather than using a bare except, so KeyboardInterrupt
    # and SystemExit are not swallowed.
    try:
        f = drive.CreateFile({'id': fileID})
        title = f['title']
    except Exception:
        print("File creation error for {}".format(targetFile))
        return 'create_error'

    print("Downloading file {} to {}".format(title,targetFile))

    try:
        f.GetContentFile(targetFile)
    except Exception as ex:
        # Report the exception itself, not just its class
        print("Download error for {}: {}".format(targetFile,repr(ex)))
        return 'download_error'

    return 'success'
|
|
|
|
def ProcessDownloadList(downloadList):
    '''
    Download every entry of [downloadList] using a pool of nThreads worker
    threads.

    Args:
        downloadList: list of [targetFile,fileID] pairs

    Returns:
        A list with the ProcessDownload status string for each entry, in the
        same order as [downloadList].
    '''

    # map() blocks until every download has finished; leaving the "with"
    # block then shuts the pool down, so worker threads don't pile up when
    # this is called repeatedly from an interactive session.
    with ThreadPool(nThreads) as pool:
        results = pool.map(ProcessDownload, downloadList)

    return results
|
|
|
|
|
|
#%% Download images (execution)

if downloadImages:

    print('Downloading data...')
    results = ProcessDownloadList(downloadList)
    print('...done.')

    # Summarize the per-file statuses, so it's obvious whether another pass
    # is needed to re-try failures.
    statusCounts = {}
    for status in results:
        statusCounts[status] = statusCounts.get(status,0) + 1
    for status in sorted(statusCounts):
        print('{}: {}'.format(status,statusCounts[status]))
|
|
|
|
|
|
#%% Scrap
|
|
|
|
# Scratch cells meant to be run by hand; the "if False" guard keeps them from
# running when the file is executed top to bottom.
if False:

    #%% List files

    from pydrive.drive import GoogleDrive
    drive = GoogleDrive(gauth)
    file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
    for file1 in file_list:
        print('title: %s, id: %s' % (file1['title'], file1['id']))


    #%% List a particular directory

    from pydrive.drive import GoogleDrive
    drive = GoogleDrive(gauth)
    folder_id = 'blahblahblah'
    q = {'q': "'{}' in parents and trashed=false".format(folder_id)}
    file_list = drive.ListFile(q).GetList()

    for iFile,f in enumerate(file_list):
        print('{}: {}, id: {}'.format(iFile,f['title'],f['id']))


    #%% Recursive list

    from pydrive.drive import GoogleDrive
    drive = GoogleDrive(gauth)

    def ListFolder(parentID,fileListOut=None):
        '''
        Recursively collect the titles of all files under the gDrive folder
        with GUID [parentID].

        Titles are appended to [fileListOut] (a new list when None), which is
        also the return value.  Enumeration stops early once maxFiles titles
        have been collected, if maxFiles is positive.
        '''

        if fileListOut is None:
            fileListOut = []

        parentList = drive.ListFile({'q': "'%s' in parents and trashed=false" % parentID}).GetList()

        for f in parentList:

            # maxFiles <= 0 means "no limit"; without this guard, the default
            # of -1 stops the enumeration before the first file.
            if maxFiles > 0 and len(fileListOut) >= maxFiles:
                return fileListOut

            if f['mimeType']=='application/vnd.google-apps.folder': # if folder

                title = f['title']
                print("Enumerating folder {}".format(title))

                # The recursive call appends straight into fileListOut, so
                # there is nothing to concatenate afterwards (doing so would
                # duplicate every title collected so far).
                ListFolder(f['id'],fileListOut)
                print("Enumerated {} files".format(len(fileListOut)))

            else:

                fileListOut.append(f['title'])

        return fileListOut

    parent = -1
    file_list = ListFolder(parent)

    print("Enumerated {} files".format(len(file_list)))
|
|
|
|
|