119 строки
3.1 KiB
Python
119 строки
3.1 KiB
Python
|
# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
|
||
|
#
|
||
|
# Use of this source code is governed by a BSD-style license
|
||
|
# that can be found in the LICENSE file in the root of the source
|
||
|
# tree. An additional intellectual property rights grant can be found
|
||
|
# in the file PATENTS. All contributing project authors may
|
||
|
# be found in the AUTHORS file in the root of the source tree.
|
||
|
#
|
||
|
# This simple script pulls test files from the webm homepage
|
||
|
# It is intelligent enough to only pull files if
|
||
|
# 1) File / test_data folder does not exist
|
||
|
# 2) SHA mismatch
|
||
|
|
||
|
import pycurl
|
||
|
import csv
|
||
|
import hashlib
|
||
|
import re
|
||
|
import os.path
|
||
|
import time
|
||
|
import itertools
|
||
|
import sys
|
||
|
import getopt
|
||
|
|
||
|
#globals
|
||
|
# Script-wide settings, all initially empty: the base download URL, the path
# of the file list, and the directory the files are stored in.
url = file_list_path = local_resource_path = ''
|
||
|
|
||
|
# Helper functions:
|
||
|
# A simple function which returns the sha hash of a file in hex
|
||
|
def get_file_sha(filename):
    """Return the SHA-1 digest of a file as a hex string.

    The file is read in HASH_CHUNK-sized pieces so its whole content is
    never held in memory at once.

    Args:
        filename: Path of the file to hash.

    Returns:
        The hex digest, or None if the file could not be opened or read
        (an error message is printed in that case).
    """
    sha_hash = hashlib.sha1()
    try:
        with open(filename, 'rb') as stream:
            # iter() with a sentinel keeps calling read() until it returns
            # the empty byte string, i.e. until end of file.
            for buf in iter(lambda: stream.read(HASH_CHUNK), b''):
                sha_hash.update(buf)
    except IOError:
        # A single parenthesised argument prints identically under
        # Python 2 and Python 3.
        print("Error reading " + filename)
        return None
    return sha_hash.hexdigest()
|
||
|
|
||
|
# Downloads a file from a url, and then checks the sha against the passed
|
||
|
# in sha
|
||
|
def download_and_check_sha(url, filename, sha):
    """Download url + "/" + filename and compare its SHA-1 with sha.

    The file is written to local_resource_path under the same name,
    replacing whatever was there before.

    Args:
        url: Base URL; "/" and filename are appended to form the request URL.
        filename: Name of the remote file, also used as the local file name.
        sha: Expected hex digest of the downloaded file.

    Returns:
        True if the hash of the downloaded file equals sha, False otherwise.
    """
    path = os.path.join(local_resource_path, filename)
    # Both handles are released even if the transfer raises; the file is
    # closed (and therefore flushed) before it is hashed below.
    with open(path, "wb") as fp:
        curl = pycurl.Curl()
        try:
            curl.setopt(pycurl.URL, url + "/" + filename)
            curl.setopt(pycurl.WRITEDATA, fp)
            curl.perform()
        finally:
            curl.close()
    return get_file_sha(path) == sha
|
||
|
|
||
|
#constants
|
||
|
# Number of download attempts made for each file.
ftp_retries = 3

# Positions of the hash and the file name within a file-list row.
SHA_COL, NAME_COL = 0, 1
# Number of columns a row must have to be used.
EXPECTED_COL = 2
# Number of bytes read per call while hashing a file.
HASH_CHUNK = 64 * 1024
|
||
|
|
||
|
# Main script
|
||
|
# Parse the command line: -u/--url, -i/--input_csv and -o/--output_dir.
try:
    opts, args = getopt.getopt(
        sys.argv[1:], "u:i:o:", ["url=", "input_csv=", "output_dir="])
except getopt.GetoptError:
    # Only malformed options end up here; anything else is a real error and
    # is allowed to propagate.
    print('get_files.py -u <url> -i <input_csv> -o <output_dir>')
    sys.exit(2)

for opt, arg in opts:
    if opt in ("-u", "--url"):
        url = arg
    elif opt in ("-i", "--input_csv"):
        file_list_path = arg
    elif opt in ("-o", "--output_dir"):
        local_resource_path = arg

# Validate the parsed values instead of counting sys.argv entries, so that
# the "--option=value" spelling is accepted and a repeated option cannot
# stand in for a missing one.
if not (url and file_list_path and local_resource_path):
    print("Expects two paths and a url!")
    sys.exit(1)
|
||
|
|
||
|
# Make sure the download directory exists.
if not os.path.isdir(local_resource_path):
    os.makedirs(local_resource_path)

file_shas = []
file_names = []

# Our 'csv' file uses runs of spaces as a delimiter, but python's csv class
# only accepts a single-character delimiter, so each run is collapsed to one
# space before the line reaches the reader.  The file is opened in text mode
# so the lines are str on both Python 2 and Python 3, and the with-block
# closes it even if parsing fails.
with open(file_list_path, "r") as file_list_csv:
    file_list_reader = csv.reader(
        (re.sub(' +', ' ', line) for line in file_list_csv), delimiter=' ')
    for row in file_list_reader:
        # Rows without exactly EXPECTED_COL columns are ignored.
        if len(row) != EXPECTED_COL:
            continue
        file_shas.append(row[SHA_COL])
        file_names.append(row[NAME_COL])
|
||
|
|
||
|
# Download files, only if they don't already exist and have correct shas
|
||
|
# zip() exists on both Python 2 and Python 3 (itertools.izip does not).
for filename, sha in zip(file_names, file_shas):
    path = os.path.join(local_resource_path, filename)
    # A file that is already present with the expected hash is left alone.
    if os.path.isfile(path) and get_file_sha(path) == sha:
        print(path + ' exists, skipping')
        continue
    for retry in range(ftp_retries):
        print("Downloading " + path)
        if download_and_check_sha(url, filename, sha):
            break
        # Only announce a retry when another attempt will actually be made.
        if retry + 1 < ftp_retries:
            print("Sha does not match, retrying...")
    else:
        # Reached only when no attempt produced a matching hash.
        print("Sha does not match, giving up on " + path)
|