зеркало из https://github.com/microsoft/glue.git
updates for luis generator
This commit is contained in:
Родитель
ba5bfa738b
Коммит
e75539927a
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -1,271 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Automated Training Data for LUIS"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 482,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Import relevant packages\n",
|
||||
"import json\n",
|
||||
"import re\n",
|
||||
"import logging\n",
|
||||
"import pandas as pd\n",
|
||||
"import random"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 543,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class LUISGenerator():\n",
|
||||
" '''LUIS Text Generator to multiply sample utterances given several entities.\n",
|
||||
" Example:\n",
|
||||
" Input sentence: \"I would like to book a flight from {city} to {city} and my name is {name}.\"\n",
|
||||
" Sample values: city: ['Stuttgart', 'Singapore', 'Frankfurt'], name: ['Nadella', 'Gates']\n",
|
||||
" Returns: \n",
|
||||
" - \"I would like to book a flight from Frankfurt to Singapore and my name is Nadella.\"\n",
|
||||
" - \"I would like to book a flight from Singapore to Stuttgart and my name is Gates.\"\n",
|
||||
" - \"I would like to book a flight from Singapore to Frankfurt and my name is Gates.\"\n",
|
||||
" - ...'''\n",
|
||||
" \n",
|
||||
" def __init__(self, utterances, values, intents = None):\n",
|
||||
" '''Set variables and execute preprocessing methods within the class.\n",
|
||||
" Args:\n",
|
||||
" utterances: list of utterances to be multiplied.\n",
|
||||
" values: dictionary with potential values, one list per key.\n",
|
||||
" intents: list of intents, optional, has to match the length AND order of utterances list.\n",
|
||||
" Raises:\n",
|
||||
" AssertionError: checks compatibility of input, does not guarantee full sanity of content.\n",
|
||||
" '''\n",
|
||||
" if isinstance(intents, list):\n",
|
||||
" assert len(intents) == len(utterances), f\"Length of utterances ({len(utterances)}) and intents ({len(intents)}) do not match, please validate!\"\n",
|
||||
" elif isinstance(intents, str):\n",
|
||||
" assert intents == None, \"Intents has to be a list, cannot be a string!\"\n",
|
||||
" self.utterances = utterances\n",
|
||||
" self.values = values\n",
|
||||
" self.intents = intents\n",
|
||||
" self.preprocessed_text = self.prepare_text()\n",
|
||||
" self.tags_per_row, self.tags_flat = self.get_entities()\n",
|
||||
" self.preprocessed_luis = self.prepare_luis()\n",
|
||||
" \n",
|
||||
" def prepare_text(self):\n",
|
||||
" '''Prepares input text by setting a counter for recurring entities.\n",
|
||||
" Args:\n",
|
||||
" self.utterances: list of utterances.\n",
|
||||
" Returns:\n",
|
||||
" self.preprocessed_text: list of utterances with preprocessed entities.'''\n",
|
||||
" self.preprocessed_text = []\n",
|
||||
" logging.info(f'[STATUS] - loaded {len(self.utterances)} rows.')\n",
|
||||
" # Extract all entities\n",
|
||||
" for index, value in enumerate(self.utterances):\n",
|
||||
" orig = re.compile('\\\\{(.*?)\\\\}').findall(value)\n",
|
||||
" subs = [f'{v}%{str(orig[:i].count(v) + 1)}' if orig.count(v) > 1 else v for i, v in enumerate(orig)]\n",
|
||||
" orig = [\"{\" + item + \"}\" for item in orig]\n",
|
||||
" subs = [\"{\" + item + \"}\" for item in subs]\n",
|
||||
" subs = [sub.replace('%1}', '}') for sub in subs]\n",
|
||||
" # Point i to the last element in list\n",
|
||||
" if len(subs) > 0:\n",
|
||||
" i = len(subs) - 1\n",
|
||||
" # Iterate till 1st element and keep on decrementing i\n",
|
||||
" while i >= 0:\n",
|
||||
" value = subs[i].join(value.rsplit(orig[i], 1))\n",
|
||||
" i -= 1\n",
|
||||
" self.preprocessed_text.append(value)\n",
|
||||
" logging.info(f'[STATUS] - finished processing {len(self.utterances)} rows.')\n",
|
||||
" return self.preprocessed_text\n",
|
||||
" \n",
|
||||
" # List all possible entities\n",
|
||||
" def get_entities(self):\n",
|
||||
" '''Gets entities as list for every utterance.\n",
|
||||
" Args:\n",
|
||||
" self.preprocessed_text: list of utterances with preprocessed entities.\n",
|
||||
" Returns:\n",
|
||||
" self.tags_per_row: list with list of entities for every utterance.\n",
|
||||
" self.tags_flat: flattened list with all unique entities of the corpus.\n",
|
||||
" '''\n",
|
||||
" self.tags_per_row = []\n",
|
||||
" # Extract all entities\n",
|
||||
" for index, value in enumerate(self.preprocessed_text):\n",
|
||||
" try:\n",
|
||||
" entity = re.compile('\\\\{(.*?)\\\\}').findall(value)\n",
|
||||
" except:\n",
|
||||
" entity = []\n",
|
||||
" self.tags_per_row.append(entity)\n",
|
||||
"\n",
|
||||
" # Flatten List (as some rows have multiple entities) and drop duplicates from list\n",
|
||||
" self.tags_flat = list(dict.fromkeys(sorted([item for item in [item for sublist in self.tags_per_row for item in sublist]])))\n",
|
||||
" logging.info(f\"[STATUS] - detected {len(self.tags_flat)} different entities\")\n",
|
||||
" return self.tags_per_row, self.tags_flat\n",
|
||||
" \n",
|
||||
" # Prepare utterances for the lu-notation\n",
|
||||
" def prepare_luis(self):\n",
|
||||
" '''Prepares preprocessed text to be compatible with the lu-notation. Sets a temporary placeholder for { / } to avoid issues with value replacement.\n",
|
||||
" Args:\n",
|
||||
" self.preprocessed_text: list of utterances with preprocessed entities.\n",
|
||||
" self.tags_per_row: list with list of entities for every utterance.\n",
|
||||
" Returns:\n",
|
||||
" self.preprocessed_luis: list of lu- and replacement-compatible utterances.\n",
|
||||
" '''\n",
|
||||
" self.preprocessed_luis = []\n",
|
||||
" for index, utterance in enumerate(self.preprocessed_text):\n",
|
||||
" for entity in self.tags_per_row[index]:\n",
|
||||
" utterance = utterance.replace(\"{\" + entity + \"}\", \"&?\" + entity + \"={\" + entity + \"}?&\")\n",
|
||||
" utterance = utterance.replace(\"}%\", \"%\")\n",
|
||||
" self.preprocessed_luis.append(utterance)\n",
|
||||
" return self.preprocessed_luis\n",
|
||||
" \n",
|
||||
" def get_values(self):\n",
|
||||
" '''Gets random values from dictionary based on the available entities. Avoids duplicate values in every utterance.\n",
|
||||
" Args:\n",
|
||||
" self.tags_per_row: list with list of entities for every utterance.\n",
|
||||
" self.values: dictionary with potential values, one list per key.\n",
|
||||
" Returns:\n",
|
||||
" self.return_values: list of dictionaries with values for insertion.\n",
|
||||
" '''\n",
|
||||
" self.return_values = []\n",
|
||||
" for index, utterance in enumerate(self.tags_per_row):\n",
|
||||
" u_values = {}\n",
|
||||
" for entity in utterance:\n",
|
||||
" random_value = random.choice(self.values[entity.split(\"%\")[0]])\n",
|
||||
" while random_value in u_values.values():\n",
|
||||
" random_value = random.choice(self.values[entity.split(\"%\")[0]])\n",
|
||||
" u_values[entity] = random_value\n",
|
||||
" self.return_values.append(u_values)\n",
|
||||
" return self.return_values\n",
|
||||
" \n",
|
||||
" def fill_values(self):\n",
|
||||
" '''Fills sentences with values. \n",
|
||||
" If there are no intents, only lists with transformed utterances are returned. If there are intents, zipped lists with intents are returned.\n",
|
||||
" Args:\n",
|
||||
" self.preprocessed_text: list of utterances with preprocessed entities.\n",
|
||||
" self.preprocessed_luis: list of lu- and replacement-compatible utterances.\n",
|
||||
" self.return_values: list of dictionaries with values for insertion.\n",
|
||||
" Returns:\n",
|
||||
" self.utterances_filled: list of utterances with entities substituted by values.\n",
|
||||
" self.utterances_luis: list of lu-file utterances with entities substituted by values and lu-entity notation.\n",
|
||||
" OR\n",
|
||||
" zip(self.intents, self.utterances_filled): zipped list, intent list and list of utterances with entities substituted by values.\n",
|
||||
" zip(self.intents, self.utterances_luis): zipped list, intent list and list of lu-file utterances with entities substituted by values and lu-entity notation.\n",
|
||||
" '''\n",
|
||||
" self.utterances_filled = []\n",
|
||||
" self.utterances_luis = []\n",
|
||||
" for index, value in enumerate(self.preprocessed_text): \n",
|
||||
" formatted = str(value).format(**self.return_values[index])\n",
|
||||
" self.utterances_filled.append(formatted)\n",
|
||||
" for index, value in enumerate(self.preprocessed_luis): \n",
|
||||
" formatted = str(value).format(**self.return_values[index])\n",
|
||||
" formatted = formatted.replace('&?', '{').replace('?&','}').replace('%2', '').replace('%3', '').replace('%4', '')\n",
|
||||
" self.utterances_luis.append(formatted)\n",
|
||||
" if self.intents == []:\n",
|
||||
" return self.utterances_filled, self.utterances_luis\n",
|
||||
" else:\n",
|
||||
" return zip(self.intents, self.utterances_filled), zip(self.intents, self.utterances_luis)\n",
|
||||
" \n",
|
||||
"def transform_lu(zipped_list, lu_file=\"lu_file\"):\n",
|
||||
" '''Transforms zipped list (including intents and text) into lu-file. Drops exact duplicates as LUIS will not take them either way.\n",
|
||||
" Args:\n",
|
||||
" zipped_list: zipped list of utterances, consisting of intent list and utterance list.\n",
|
||||
" lu_file: file name of your lu-file, no file ending necessary, default \"lu_file\"\n",
|
||||
" Output:\n",
|
||||
" Writes lu-file to your working folder'''\n",
|
||||
" compare = \"\"\n",
|
||||
" luis_file = pd.DataFrame(list(zipped_list), columns=['intent', 'text']).sort_values('intent').drop_duplicates('text')\n",
|
||||
" with open(f'{lu_file}.lu', 'w') as f:\n",
|
||||
" for index, row in luis_file.iterrows():\n",
|
||||
" if compare != row['intent']:\n",
|
||||
" # Begin intent\n",
|
||||
" line = f\"\\n# {row['intent']}\"\n",
|
||||
" print(line, file = f)\n",
|
||||
" line = f\"- {str(row['text'])}\"\n",
|
||||
" print(line, file = f)\n",
|
||||
" compare = row['intent']\n",
|
||||
" else:\n",
|
||||
" line = f\"- {str(row['text'])}\"\n",
|
||||
" print(line, file = f)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 544,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define input values\n",
|
||||
"utterances = ['ich möchte einen flug von {city} nach {city} buchen via {station}, mein Name ist {name}.', \n",
|
||||
" 'ich komme aus {city} und möchte über den {station} nach {city}.', \n",
|
||||
" 'was geht ab?']\n",
|
||||
"\n",
|
||||
"values = {'city': ['Stuttgart', 'Singapore', 'Frankfurt'], \n",
|
||||
" 'station': ['Airport', 'Central', 'Bus Stop'], \n",
|
||||
" 'name': ['Nadella', 'Gates']}\n",
|
||||
"\n",
|
||||
"intents = ['123_Test', \n",
|
||||
" '234_Test', \n",
|
||||
" 'None']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 545,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create instance of the class\n",
|
||||
"luis_generator = LUISGenerator(utterances, values, ['BookFlight', 'BookFlight', 'None'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 546,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Loop through the generator multiple times and get a lu file!\n",
|
||||
"results = []\n",
|
||||
"for _ in range(0, 1000):\n",
|
||||
" luis_generator.get_values()\n",
|
||||
" speech, luis = luis_generator.fill_values()\n",
|
||||
" results.extend(luis)\n",
|
||||
"transform_lu(results)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
|
@ -12,9 +12,9 @@ import configparser
|
|||
import pandas as pd
|
||||
|
||||
# Import custom modules
|
||||
import luis
|
||||
import stt
|
||||
import tts
|
||||
import luis_scoring
|
||||
import params as pa
|
||||
import helper as he
|
||||
import evaluate as eval
|
||||
|
|
Загрузка…
Ссылка в новой задаче