added AudioMerge for Speech Batch Transcript

This commit is contained in:
ruoccofabrizio 2021-04-22 15:19:36 +02:00
Родитель f3a1419b07
Коммит 1ffb05eaf7
2 изменённых файлов: 106 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,94 @@
import logging
import azure.functions as func
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import json
import os
from isodate import parse_duration
import dateutil.parser
def main(myblob: func.InputStream):
logging.info(f"Python blob trigger function processed blob \n"
f"Name: {myblob.name}\n"
f"Blob Size: {myblob.length} bytes")
connection_string = os.environ['conversationalkm_STORAGE']
container = os.environ['telemetry_processed']
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
# Processing
get_transcript(myblob, blob_service_client, container)
def get_transcript(blob, blob_service_client, container):
# speechtotext/v3.0/transcriptions/{id}
transcript_id = blob.name.split('/')[-1]
transcript= json.loads(blob.read())
timestamp = transcript['timeStamp']
recognizedPhrases = transcript['recognizedPhrases']
phrases = list(map(lambda x: extract_data(x, timestamp), recognizedPhrases))
save_conversation(transcript_id, phrases, blob_service_client, container)
# Extract date from Speech Data
def extract_data(x, timestamp):
return {
"speaker" : x['channel'],
"phrase" : x['nBest'][0]['display'],
"offset" : x['offset'],
"duration": x['duration'],
"offsetInTicks" : x['offsetInTicks'],
"durationInTicks" : x['durationInTicks'],
"timestamp" : timestamp
}
# Format single phrase as message
def format_message(id, phrase):
event_time = dateutil.parser.parse(phrase['timestamp']) + parse_duration(phrase['offset'])
return {
"Id": f"{id}_{phrase['offset']}",
"ReferenceId": None,
"EventType": "MessageFromUser" if phrase['speaker'] == 0 else "MessageFromBotOrAgent",
"EventTime": str(event_time),
"ConversationId": id,
"Value": phrase['phrase'],
"UserId": phrase['speaker'],
"CustomProperties": {
"offset": phrase['offset'],
"duration": phrase['duration'],
"offsetInTicks": phrase['offsetInTicks'],
"durationInTicks": phrase['durationInTicks']
}
}
# Save conversation in Azure Blob Storage
def save_conversation(conversation_id, messages, blob_service_client, container):
# Define structure for conversational data
data = {
"ConversationId" : conversation_id,
"Messages" : []
}
blob = f"{conversation_id}.json"
# Define Azure Blob Storage client
blob_client = blob_service_client.get_blob_client(container=container, blob=blob)
data['Messages'] = list(map(lambda x: format_message(conversation_id, x),messages))
data['Messages'] = sorted(data['Messages'], key = lambda x:x['EventTime'])
# Extract Min and Max EventTime for conversation
data["StartTime"] = min(list(map(lambda x: x['EventTime'], data['Messages'])))
data["EndTime"] = max(list(map(lambda x: x['EventTime'], data['Messages'])))
data["merged_content"] = "".join(list(filter(None, map(lambda x: x['Value'] , data['Messages']))))
data["merged_content_user"] = "".join(list(filter(None, map(lambda x: x['Value'] if x['UserId'] == 0 else None, data['Messages']))))
data["merged_content_agent"] = "".join(list(filter(None, map(lambda x: x['Value'] if x['UserId'] == 1 else None, data['Messages']))))
# Write results on Azure Blob Storage
blob_client.upload_blob(json.dumps(data), overwrite=True)

Просмотреть файл

@ -0,0 +1,12 @@
{
"scriptFile": "__init__.py",
"bindings": [
{
"name": "myblob",
"type": "blobTrigger",
"direction": "in",
"path": "json-result-output/{name}",
"connection": "conversationalkm_STORAGE"
}
]
}