

#
# ai4e_azure_utils.py
#
# Miscellaneous Azure utilities
#

import json
import re

from azure.storage.blob import BlobPrefix, BlobServiceClient


# Based on:
#
# https://github.com/Azure/azure-sdk-for-python/blob/master/sdk/storage/azure-storage-blob/samples/blob_samples_walk_blob_hierarchy.py
#
def walk_container(container_client, max_depth=-1, prefix='',
                   store_folders=True, store_blobs=True, debug_max_items=-1):
    """
    Recursively walks the folders in the ContainerClient object *container_client*,
    returning a (folders, blobs) pair of name lists.
    """
    depth = 1

    def walk_blob_hierarchy(prefix=prefix, folders=None, blobs=None):

        if folders is None:
            folders = []
        if blobs is None:
            blobs = []

        nonlocal depth
        if max_depth > 0 and depth > max_depth:
            return folders, blobs

        for item in container_client.walk_blobs(name_starts_with=prefix):
            short_name = item.name[len(prefix):]
            if isinstance(item, BlobPrefix):
                # This item is a folder
                if store_folders:
                    folders.append(prefix + short_name)
                depth += 1
                walk_blob_hierarchy(prefix=item.name, folders=folders, blobs=blobs)
                depth -= 1
                # In debug mode, bail out once we've collected enough items
                if (debug_max_items > 0) and (len(folders) + len(blobs) > debug_max_items):
                    return folders, blobs
            else:
                # This item is a blob
                if store_blobs:
                    blobs.append(prefix + short_name)

        return folders, blobs

    folders, blobs = walk_blob_hierarchy()

    # walk_blobs() reports folder names with a trailing slash; strip it here
    assert all(s.endswith('/') for s in folders)
    folders = [s.strip('/') for s in folders]

    return folders, blobs


def list_top_level_blob_folders(container_client):
    """
    List all top-level folders in the ContainerClient object *container_client*
    """
    top_level_folders, _ = walk_container(container_client, max_depth=1, store_blobs=False)
    return top_level_folders
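

#%% Example usage (container walking)

def _example_list_top_level_folders():
    """
    Minimal usage sketch: 'myaccount', 'mycontainer', and the SAS token below are
    hypothetical placeholders, not real resources.
    """
    from azure.storage.blob import ContainerClient
    container_client = ContainerClient(
        account_url='https://myaccount.blob.core.windows.net',
        container_name='mycontainer',
        credential='st=...')  # a real SAS token goes here
    folders = list_top_level_blob_folders(container_client)
    print('Found {} top-level folders'.format(len(folders)))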


#%% Blob enumeration

def concatenate_json_string_lists(input_files, output_file=None):
    """
    Given several files that contain json-formatted lists of strings (typically
    filenames), concatenate them into one new list, optionally writing that list
    to a new json file.
    """
    output_list = []
    for fn in input_files:
        with open(fn) as f:
            file_list = json.load(f)
        output_list.extend(file_list)
    if output_file is not None:
        s = json.dumps(output_list, indent=1)
        with open(output_file, 'w') as f:
            f.write(s)
    return output_list
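

# For example, assuming list_a.json and list_b.json each contain a json-formatted
# list of filenames (hypothetical paths), this merges them into one file:
#
# combined = concatenate_json_string_lists(['list_a.json', 'list_b.json'],
#                                          output_file='combined.json')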


def write_list_to_file(output_file, strings):
    """
    Writes a list of strings to *output_file*, as json if the filename ends in
    .json, otherwise as newline-delimited text.
    """
    if output_file.endswith('.json'):
        s = json.dumps(strings, indent=1)
        with open(output_file, 'w') as f:
            f.write(s)
    else:
        with open(output_file, 'w') as f:
            for fn in strings:
                f.write(fn + '\n')


def read_list_from_file(filename):
    """
    Reads a json-formatted list of strings from *filename*
    """
    assert filename.endswith('.json')
    with open(filename) as f:
        file_list = json.load(f)
    assert isinstance(file_list, list)
    for s in file_list:
        assert isinstance(s, str)
    return file_list
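

# A quick round trip through these helpers (the filename here is hypothetical):
#
# write_list_to_file('blobs.json', ['a.jpg', 'b.jpg'])
# assert read_list_from_file('blobs.json') == ['a.jpg', 'b.jpg']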


def account_name_to_url(account_name):
    """
    Converts a storage account name to the corresponding blob endpoint URL
    """
    storage_account_url_blob = 'https://' + account_name + '.blob.core.windows.net'
    return storage_account_url_blob


def copy_file_to_blob(account_name, sas_token, container_name,
                      local_path, remote_path):
    """
    Copies a local file to blob storage; the SAS token must have write permission
    """
    blob_service_client = BlobServiceClient(
        account_url=account_name_to_url(account_name),
        credential=sas_token)
    container_client = blob_service_client.get_container_client(container_name)
    with open(local_path, 'rb') as data:
        container_client.upload_blob(remote_path, data)
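

# For example (hypothetical account, container, and paths):
#
# copy_file_to_blob('myaccount', 'st=...', 'mycontainer',
#                   'c:/temp/image.jpg', 'images/image.jpg')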


def enumerate_blobs(account_name, sas_token, container_name,
                    rmatch=None, prefix=None, max_blobs=None):
    """
    Enumerates blobs in a container, optionally filtering the list with a regex
    (*rmatch*, compared against blob names with re.match).  Using the *prefix*
    parameter is faster than using a regex that starts with ^.

    sas_token should start with st=
    """
    folder_string = '{}/{}'.format(account_name, container_name)
    if prefix is not None:
        folder_string += '/{}'.format(prefix)
    if rmatch is not None:
        folder_string += ' (matching {})'.format(rmatch)
    print('Enumerating blobs from {}'.format(folder_string))

    blob_service_client = BlobServiceClient(
        account_url=account_name_to_url(account_name),
        credential=sas_token)
    container_client = blob_service_client.get_container_client(container_name)
    generator = container_client.list_blobs(name_starts_with=prefix)

    matched_blobs = []
    i_blob = 0
    for blob in generator:
        i_blob += 1

        # Print a lightweight progress indicator
        if (i_blob % 1000) == 0:
            print('.', end='')
        if (i_blob % 50000) == 0:
            print('{} blobs enumerated ({} matches)'.format(i_blob, len(matched_blobs)))

        if (rmatch is None) or (re.match(rmatch, blob.name) is not None):
            matched_blobs.append(blob.name)

        if (max_blobs is not None) and (i_blob >= max_blobs):
            print('Terminating enumeration after {} blobs'.format(max_blobs))
            break

    print('Enumerated {} matching blobs (of {} total) from {}/{}'.format(
        len(matched_blobs), i_blob, account_name, container_name))

    return matched_blobs
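

# For example, to list .jpg blobs under a (hypothetical) folder prefix:
#
# blobs = enumerate_blobs('myaccount', 'st=...', 'mycontainer',
#                         rmatch=r'images/.*\.jpg$', prefix='images/')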


def enumerate_blobs_to_file(output_file, account_name, sas_token, container_name,
                            rmatch=None, prefix=None, max_blobs=None):
    """
    Enumerates blobs to a .json file if *output_file* ends in ".json", otherwise to a
    newline-delimited text file.

    See enumerate_blobs for parameter information.
    """
    matched_blobs = enumerate_blobs(account_name=account_name,
                                    sas_token=sas_token,
                                    container_name=container_name,
                                    rmatch=rmatch,
                                    prefix=prefix,
                                    max_blobs=max_blobs)
    write_list_to_file(output_file, matched_blobs)
    return matched_blobs
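

#%% Example usage (blob enumeration)

def _example_enumerate_blobs_to_file():
    """
    Minimal usage sketch: the account, container, SAS token, and output path are
    hypothetical placeholders, not real resources.
    """
    enumerate_blobs_to_file(output_file='c:/temp/blobs.json',
                            account_name='myaccount',
                            sas_token='st=...',
                            container_name='mycontainer',
                            prefix='images/',
                            max_blobs=10000)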