ai4eutils/ai4e_azure_utils.py

183 строки
6.5 KiB
Python

"""
Miscellaneous Azure Blob Storage utilities
Requires azure-storage-blob>=12.4.0
"""
import json
from typing import Any, Iterable, List, Optional, Sequence, Tuple, Union
from azure.storage.blob import BlobPrefix, ContainerClient
import sas_blob_utils
def walk_container(container_client: ContainerClient,
                   max_depth: int = -1,
                   prefix: str = '',
                   store_folders: bool = True,
                   store_blobs: bool = True,
                   debug_max_items: int = -1) -> Tuple[List[str], List[str]]:
    """
    Recursively walk the folder hierarchy of an Azure Blob Storage container.

    Based on:
    https://github.com/Azure/azure-sdk-for-python/blob/master/sdk/storage/azure-storage-blob/samples/blob_samples_walk_blob_hierarchy.py

    Args:
        container_client: client whose walk_blobs() method is used to list
            one level of the hierarchy at a time
        max_depth: deepest level to list, where the starting prefix is
            level 1; values <= 0 disable the limit
        prefix: name prefix at which the walk starts
        store_folders: whether folder (BlobPrefix) names are collected
        store_blobs: whether blob names are collected
        debug_max_items: if > 0, the walk is abandoned once the combined
            number of collected folders and blobs exceeds this value. The
            check only runs after returning from a sub-folder, so the
            result may hold more than debug_max_items entries.

    Returns: (folders, blobs), two lists of names. Folder names have their
        leading and trailing '/' characters stripped.
    """
    folders: List[str] = []
    blobs: List[str] = []

    def item_limit_exceeded() -> bool:
        # A non-positive debug_max_items means "no limit".
        if debug_max_items <= 0:
            return False
        return len(folders) + len(blobs) > debug_max_items

    def descend(current_prefix: str, level: int) -> None:
        # Stop before listing anything that lies below the permitted depth.
        if max_depth > 0 and level > max_depth:
            return

        listing = container_client.walk_blobs(name_starts_with=current_prefix)
        for entry in listing:
            relative_name = entry.name[len(current_prefix):]
            full_name = current_prefix + relative_name

            if not isinstance(entry, BlobPrefix):
                if store_blobs:
                    blobs.append(full_name)
                continue

            if store_folders:
                folders.append(full_name)
            descend(entry.name, level + 1)
            # Once over the limit every enclosing level also returns here,
            # so the whole walk unwinds.
            if item_limit_exceeded():
                return

    descend(prefix, 1)

    # Folder entries produced by walk_blobs are expected to end with '/'.
    assert all(name.endswith('/') for name in folders)
    return [name.strip('/') for name in folders], blobs
def list_top_level_blob_folders(container_client: ContainerClient) -> List[str]:
    """
    Return the names of the top-level folders in a container.

    Delegates to walk_container() restricted to a depth of 1 and with blob
    collection disabled; only the folder list of its result is returned.
    """
    walk_result = walk_container(
        container_client, max_depth=1, store_blobs=False)
    # walk_result is (folders, blobs); the blob list is not needed here.
    return walk_result[0]
def concatenate_json_lists(input_files: Iterable[str],
                           output_file: Optional[str] = None
                           ) -> List[Any]:
    """
    Given a list of JSON files that contain lists (typically string
    filenames), concatenates the lists into a single list and optionally
    writes out this list to a new output JSON file.

    Args:
        input_files: paths of the JSON files to read, in the order their
            contents should appear in the result
        output_file: optional path; when not None, the concatenated list is
            written there as JSON with an indent of 1

    Returns: the concatenated list (empty if input_files is empty)
    """
    output_list: List[Any] = []
    for fn in input_files:
        # Decode explicitly as UTF-8 rather than with the locale-dependent
        # default, so non-ASCII names load identically on every platform.
        with open(fn, 'r', encoding='utf-8') as f:
            output_list.extend(json.load(f))

    if output_file is not None:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output_list, f, indent=1)

    return output_list
def write_list_to_file(output_file: str, strings: Sequence[str]) -> None:
    """
    Write a sequence of strings to a file.

    The format is chosen from the file name: a name ending in '.json'
    produces a JSON list (indent of 1); any other name produces plain text
    with the strings joined by newlines and no trailing newline.
    """
    with open(output_file, 'w') as out:
        if not output_file.endswith('.json'):
            out.write('\n'.join(strings))
        else:
            json.dump(strings, out, indent=1)
def read_list_from_file(filename: str) -> List[str]:
    """
    Reads a json-formatted list of strings from a file.

    Args:
        filename: path to the file; must end in '.json'. The file is
            decoded as UTF-8.

    Returns: the list of strings stored in the file

    Raises:
        AssertionError: if filename does not end in '.json', if the file's
            top-level JSON value is not a list, or if any element of that
            list is not a str
    """
    # Raised explicitly (not via `assert`) so the checks still run under
    # `python -O`, while callers keep seeing the same exception type.
    if not filename.endswith('.json'):
        raise AssertionError(
            'expected a .json file name, got {!r}'.format(filename))

    # Decode explicitly as UTF-8 rather than with the locale-dependent
    # default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        file_list = json.load(f)

    if not isinstance(file_list, list):
        raise AssertionError(
            'expected a JSON list in {!r}, got {}'.format(
                filename, type(file_list).__name__))
    for s in file_list:
        if not isinstance(s, str):
            raise AssertionError(
                'expected only strings in {!r}, got {}'.format(
                    filename, type(s).__name__))
    return file_list
def upload_file_to_blob(account_name: str,
                        container_name: str,
                        local_path: str,
                        blob_name: str,
                        sas_token: str,
                        overwrite: bool=False) -> str:
    """
    Upload a local file to Azure Blob Storage.

    Args:
        account_name: Azure Storage account name
        container_name: Azure Blob Storage container name
        local_path: path of the local file; it is opened in binary mode
        blob_name: name to give the uploaded blob
        sas_token: SAS token used when building the container URI
        overwrite: forwarded unchanged to sas_blob_utils.upload_blob

    Returns: the value returned by sas_blob_utils.upload_blob, documented
        by this module as the uploaded blob URI with SAS token
    """
    target_container = sas_blob_utils.build_azure_storage_uri(
        account=account_name,
        container=container_name,
        sas_token=sas_token)

    with open(local_path, 'rb') as stream:
        uploaded_uri = sas_blob_utils.upload_blob(
            container_uri=target_container,
            blob_name=blob_name,
            data=stream,
            overwrite=overwrite)
    return uploaded_uri
def enumerate_blobs_to_file(
        output_file: str,
        account_name: str,
        container_name: str,
        sas_token: Optional[str] = None,
        blob_prefix: Optional[str] = None,
        blob_suffix: Optional[Union[str, Tuple[str]]] = None,
        rsearch: Optional[str] = None,
        limit: Optional[int] = None
        ) -> List[str]:
    """
    Enumerates blobs in a container, and writes the blob names to an output
    file.

    The listing and filtering are performed by
    sas_blob_utils.list_blobs_in_container; blob_prefix, blob_suffix,
    rsearch and limit are forwarded to it unchanged.

    Args:
        output_file: str, path to save list of files in container
            If ends in '.json', writes a JSON string. Otherwise, writes a
            newline-delimited list
        account_name: str, Azure Storage account name
        container_name: str, Azure Blob Storage container name
        sas_token: optional str, container SAS token. A leading '?' is
            removed, but only when the token is longer than 9 characters.
        blob_prefix: optional str, returned results will only contain blob
            names with this prefix
        blob_suffix: optional str or tuple of str, returned results will only
            contain blob names with this/these suffix(es). The blob names
            will be lowercased first before comparing with the suffix(es).
        rsearch: optional str, returned results will only contain blob names
            that match this Python regex pattern at any point in the blob
            name. Use '^' character to only match from the beginning of the
            blob name.
        limit: int, maximum # of blob names to list
            if None, then returns all blob names

    Returns: list of str, sorted blob names, of length limit or shorter.
    """
    has_leading_question_mark = (
        sas_token is not None
        and len(sas_token) > 9
        and sas_token.startswith('?'))
    if has_leading_question_mark:
        sas_token = sas_token[1:]

    container_uri = sas_blob_utils.build_azure_storage_uri(
        account=account_name,
        container=container_name,
        sas_token=sas_token)

    listing_filters = dict(
        blob_prefix=blob_prefix,
        blob_suffix=blob_suffix,
        rsearch=rsearch,
        limit=limit)
    matched_blobs = sas_blob_utils.list_blobs_in_container(
        container_uri=container_uri, **listing_filters)

    write_list_to_file(output_file, matched_blobs)
    return matched_blobs