From e75539927af34e0864ee748f403413cde0ea1724 Mon Sep 17 00:00:00 2001 From: nonstoptimm Date: Tue, 5 Jan 2021 12:59:42 +0100 Subject: [PATCH] updates for luis generator --- notebooks/Data - LUIS Generator.ipynb | 107 +++++++ notebooks/LUIS Generator.ipynb | 271 ------------------ src/glue.py | 2 +- ...uis_generate.py => luis_data_generator.py} | 0 src/{luis.py => luis_scoring.py} | 0 5 files changed, 108 insertions(+), 272 deletions(-) create mode 100644 notebooks/Data - LUIS Generator.ipynb delete mode 100644 notebooks/LUIS Generator.ipynb rename src/{luis_generate.py => luis_data_generator.py} (100%) rename src/{luis.py => luis_scoring.py} (100%) diff --git a/notebooks/Data - LUIS Generator.ipynb b/notebooks/Data - LUIS Generator.ipynb new file mode 100644 index 0000000..9531970 --- /dev/null +++ b/notebooks/Data - LUIS Generator.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Automated Training Data for LUIS" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Import relevant packages\n", + "import json\n", + "import re\n", + "import logging\n", + "import pandas as pd\n", + "import random\n", + "import sys\n", + "sys.path.append(\"../src/\")\n", + "from luis_data_generator import LUISGenerator\n", + "from luis_data_generator import transform_lu" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Define input values, or import them from a pandas data frame\n", + "utterances = ['ich möchte einen flug von {city} nach {city} buchen via {station}, mein Name ist {name}.', \n", + " 'ich komme aus {city} und möchte über den {station} nach {city}.', \n", + " 'was geht ab?']\n", + "\n", + "values = {'city': ['Stuttgart', 'Singapore', 'Frankfurt'], \n", + " 'station': ['Airport', 'Central', 'Bus Stop'], \n", + " 'name': ['Nadella', 'Gates']}\n", + "\n", + "intents = ['123_Test', \n", + " '234_Test', \n", + " 'None']" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Create instance of the class\n", + "luis_generator = LUISGenerator(utterances, values, ['BookFlight', 'BookFlight', 'None'])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n# BookFlight\n- ich möchte einen flug von {city=Stuttgart} nach {city=Singapore} buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Airport} nach {city=Frankfurt}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Singapore} und möchte über den {station=Central} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} buchen via {station=Central}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Central} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Airport} nach {city=Singapore}.\n- ich komme aus {city=Singapore} und möchte über den {station=Central} nach {city=Frankfurt}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Singapore} und möchte über den {station=Bus Stop} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Central}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Singapore} und möchte über den {station=Airport} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Central} nach {city=Singapore}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Bus Stop} nach {city=Singapore}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Singapore} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Airport} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Airport} nach {city=Singapore}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Singapore} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Bus Stop}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Central}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Bus Stop}, mein Name ist {name=Gates}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Central} nach {city=Frankfurt}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Bus Stop}, mein Name ist {name=Gates}.\n- ich komme aus {city=Singapore} und möchte über den {station=Airport} nach {city=Frankfurt}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Bus Stop} nach {city=Frankfurt}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Bus Stop} nach {city=Singapore}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Singapore} buchen via {station=Bus Stop}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} buchen via {station=Bus Stop}, mein Name ist {name=Gates}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Central} nach {city=Singapore}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Central}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Singapore} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Bus Stop} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Bus Stop}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Singapore} und möchte über den {station=Bus Stop} nach {city=Frankfurt}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Central}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Singapore} buchen via {station=Central}, mein Name ist {name=Nadella}.\n\n# None\n- was geht ab?\n" + ] + } + ], + "source": [ + "# Loop through the generator multiple times and get a lu file!\n", + "results = []\n", + "for _ in range(0, 1000):\n", + " luis_generator.get_values()\n", + " speech, luis = luis_generator.fill_values()\n", + " results.extend(luis)\n", + "transform_lu(results)" + ] + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "display_name": "Python 3.7.6 64-bit ('nlp': conda)", + "metadata": { + "interpreter": { + "hash": "0d92b4570cf170047a8c40549154a6dffe47dd8c5b7bd394f81eede6f5d748fa" + } + } + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6-final" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/notebooks/LUIS Generator.ipynb b/notebooks/LUIS Generator.ipynb deleted file mode 100644 index 191d232..0000000 --- a/notebooks/LUIS Generator.ipynb +++ /dev/null @@ -1,271 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Automated Training Data for LUIS" - ] - }, - { - "cell_type": "code", - "execution_count": 482, - "metadata": {}, - "outputs": [], - "source": [ - "# Import relevant packages\n", - "import json\n", - "import re\n", - "import logging\n", - "import pandas as pd\n", - "import random" - ] - }, - { - "cell_type": "code", - "execution_count": 543, - "metadata": {}, - "outputs": [], - "source": [ - "class LUISGenerator():\n", - " '''LUIS Text Generator to multiply sample utterances given several entities.\n", - " Example:\n", - " Input sentence: \"I would like to book a flight from {city} to {city} and my name is {name}.\"\n", - " Sample values: city: ['Stuttgart', 'Singapore', 'Frankfurt'], name: ['Nadella', 'Gates']\n", - " Returns: \n", - " - \"I would like to book a flight from Frankfurt to Singapore and my name is Nadella.\"\n", - " - \"I would like to book a flight from Singapore to Stuttgart and my name is Gates.\"\n", - " - \"I would like to book a flight from Singapore to Frankfurt and my name is Gates.\"\n", - " - ...'''\n", - " \n", - " def __init__(self, utterances, values, intents = None):\n", - " '''Set variables and execute preprocessing methods within the class.\n", - " Args:\n", - " utterances: list of utterances to be multiplied.\n", - " values: dictionary with potential values, one list per key.\n", - " intents: list of intents, optional, has to match the length AND order of utterances list.\n", - " Raises:\n", - " Assertion Error: checks compatibility of input, does not guarantee full sanity of content.\n", - " '''\n", - " if isinstance(intents, list):\n", - " assert len(intents) == len(utterances), f\"Length of utterances ({len(utterances)}) and intents ({len(intents)}) do not match, please validate!\"\n", - " elif isinstance(intents, str):\n", - " assert intents == None, \"Intents has to be a list, cannot be a string!\"\n", - " self.utterances = utterances\n", - " self.values = values\n", - " self.intents = intents\n", - " self.preprocessed_text = self.prepare_text()\n", - " self.tags_per_row, self.tags_flat = self.get_entities()\n", - " self.preprocessed_luis = self.prepare_luis()\n", - " \n", - " def prepare_text(self):\n", - " '''Prepares input text by setting a counter for recurring entities.\n", - " Args:\n", - " self.utterances: list of utterances.\n", - " Returns:\n", - " self.preprocessed_text: list of utterances with preprocessed entities.'''\n", - " self.preprocessed_text = []\n", - " logging.info(f'[STATUS] - loaded {len(self.utterances)} rows.')\n", - " # Extract all entities\n", - " for index, value in enumerate(self.utterances):\n", - " orig = re.compile('\\\\{(.*?)\\\\}').findall(value)\n", - " subs = [f'{v}%{str(orig[:i].count(v) + 1)}' if orig.count(v) > 1 else v for i, v in enumerate(orig)]\n", - " orig = [\"{\" + item + \"}\" for item in orig]\n", - " subs = [\"{\" + item + \"}\" for item in subs]\n", - " subs = [sub.replace('%1}', '}') for sub in subs]\n", - " # Point i to the last element in list\n", - " if len(subs) > 0:\n", - " i = len(subs) - 1\n", - " # Iterate till 1st element and keep on decrementing i\n", - " while i >= 0:\n", - " value = subs[i].join(value.rsplit(orig[i], 1))\n", - " i -= 1\n", - " self.preprocessed_text.append(value)\n", - " logging.info(f'[STATUS] - finished processing {len(self.utterances)} rows.')\n", - " return self.preprocessed_text\n", - " \n", - " # List all possible entitites\n", - " def get_entities(self):\n", - " '''Gets entities as list for every utterance.\n", - " Args:\n", - " self.preprocessed_text: list of utterances with preprocessed entities.\n", - " Returns:\n", - " self.tags_per_row: list with list of entities for every utterance.\n", - " self.tags: flattened list with all unique entities of the corpus.\n", - " '''\n", - " self.tags_per_row = []\n", - " # Extract all entities\n", - " for index, value in enumerate(self.preprocessed_text):\n", - " try:\n", - " entity = re.compile('\\\\{(.*?)\\\\}').findall(value)\n", - " except:\n", - " entity = []\n", - " self.tags_per_row.append(entity)\n", - "\n", - " # Flatten List (as some rows have multiple entities) and drop duplicates from list\n", - " self.tags_flat = list(dict.fromkeys(sorted([item for item in [item for sublist in self.tags_per_row for item in sublist]])))\n", - " logging.info(f\"[STATUS] - detected {len(self.tags_flat)} different entities\")\n", - " return self.tags_per_row, self.tags_flat\n", - " \n", - " # Prepare \n", - " def prepare_luis(self):\n", - " '''Prepares preprocessed text to be compatible with the lu-notation. Sets a temporary placeholder for { / } to avoid issues with value replacement.\n", - " Args:\n", - " self.preprocessed_text: list of utterances with preprocessed entities.\n", - " self.tags_per_row: list with list of entities for every utterance.\n", - " Returns:\n", - " self.preprocessed_luis: list of lu-and replacement-compatible utterances.\n", - " '''\n", - " self.preprocessed_luis = []\n", - " for index, utterance in enumerate(self.preprocessed_text):\n", - " for entity in self.tags_per_row[index]:\n", - " utterance = utterance.replace(\"{\" + entity + \"}\", \"&?\" + entity + \"={\" + entity + \"}?&\")\n", - " utterance = utterance.replace(\"}%\", \"%\")\n", - " self.preprocessed_luis.append(utterance)\n", - " return self.preprocessed_luis\n", - " \n", - " def get_values(self):\n", - " '''Gets random values from dictionary based on the available entities. Avoids duplicate values in every utterance.\n", - " Args:\n", - " self.tags_per_row: list with list of entities for every utterance.\n", - " self.values: dictionary with potential values, one list per key.\n", - " Returns:\n", - " self.return_values: list of dictionaries with values for insertion.\n", - " '''\n", - " self.return_values = []\n", - " for index, utterance in enumerate(self.tags_per_row):\n", - " u_values = {}\n", - " for entity in utterance:\n", - " random_value = random.choice(self.values[entity.split(\"%\")[0]])\n", - " while random_value in u_values.values():\n", - " random_value = random.choice(self.values[entity.split(\"%\")[0]])\n", - " u_values[entity] = random_value\n", - " self.return_values.append(u_values)\n", - " return self.return_values\n", - " \n", - " def fill_values(self):\n", - " '''Fills sentences with values. \n", - " If there are no intents, only lists with transformed utterances are returned. If there are intents, zipped lists with intents are returned.\n", - " Args:\n", - " self.preprocessed_text: list of utterances with preprocessed entities.\n", - " self.preprocessed_luis: list of lu-and replacement-compatible utterances.\n", - " self.return_values: list of dictionaries with values for insertion.\n", - " Returns:\n", - " self.utterances_filled: list of utterances with entities substituted by values.\n", - " self.utterances_filled: list of lu-file utterances with entities substituted by values and lu-entity notation.\n", - " OR\n", - " zip(self.intents, self.utterances_filled): zipped list, intent list and list of utterances with entities substituted by values.\n", - " zip(self.intents, self.utterances_luis): zipped list, intent list and list of lu-file utterances with entities substituted by values and lu-entity notation.\n", - " '''\n", - " self.utterances_filled = []\n", - " self.utterances_luis = []\n", - " for index, value in enumerate(self.preprocessed_text): \n", - " formatted = str(value).format(**self.return_values[index])\n", - " self.utterances_filled.append(formatted)\n", - " for index, value in enumerate(self.preprocessed_luis): \n", - " formatted = str(value).format(**self.return_values[index])\n", - " formatted = formatted.replace('&?', '{').replace('?&','}').replace('%2', '').replace('%3', '').replace('%4', '')\n", - " self.utterances_luis.append(formatted)\n", - " if self.intents == []:\n", - " return self.utterances_filled, self.utterances_luis\n", - " else:\n", - " return zip(self.intents, self.utterances_filled), zip(self.intents, self.utterances_luis)\n", - " \n", - "def transform_lu(zipped_list, lu_file=\"lu_file\"):\n", - " '''Transforms zipped list (including intents and text) into lu-file. Drops exact duplicates as LUIS will not take them either way.\n", - " Args:\n", - " zipped_list: zipped list of utterances, consisting of intent list and utterance list.\n", - " lu_file: file name of your lu-file, no file ending necessary, default \"lu_file\"\n", - " Output:\n", - " Writes lu-file to your working folder'''\n", - " compare = \"\"\n", - " luis_file = pd.DataFrame(list(zipped_list), columns=['intent', 'text']).sort_values('intent').drop_duplicates('text')\n", - " with open(f'{lu_file}.lu', 'w') as f:\n", - " for index, row in luis_file.iterrows():\n", - " if compare != row['intent']:\n", - " # Begin intent\n", - " line = f\"\\n# {row['intent']}\"\n", - " print(line, file = f)\n", - " line = f\"- {str(row['text'])}\"\n", - " print(line, file = f)\n", - " compare = row['intent']\n", - " else:\n", - " line = f\"- {str(row['text'])}\"\n", - " print(line, file = f)" - ] - }, - { - "cell_type": "code", - "execution_count": 544, - "metadata": {}, - "outputs": [], - "source": [ - "# Define input values\n", - "utterances = ['ich möchte einen flug von {city} nach {city} buchen via {station}, mein Name ist {name}.', \n", - " 'ich komme aus {city} und möchte über den {station} nach {city}.', \n", - " 'was geht ab?']\n", - "\n", - "values = {'city': ['Stuttgart', 'Singapore', 'Frankfurt'], \n", - " 'station': ['Airport', 'Central', 'Bus Stop'], \n", - " 'name': ['Nadella', 'Gates']}\n", - "\n", - "intents = ['123_Test', \n", - " '234_Test', \n", - " 'None']" - ] - }, - { - "cell_type": "code", - "execution_count": 545, - "metadata": {}, - "outputs": [], - "source": [ - "# Create instance of the class\n", - "luis_generator = LUISGenerator(utterances, values, ['BookFlight', 'BookFlight', 'None'])" - ] - }, - { - "cell_type": "code", - "execution_count": 546, - "metadata": {}, - "outputs": [], - "source": [ - "# Loop through the generator multiple times and get a lu file!\n", - "results = []\n", - "for _ in range(0, 1000):\n", - " luis_generator.get_values()\n", - " speech, luis = luis_generator.fill_values()\n", - " results.extend(luis)\n", - "transform_lu(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.6 - AzureML", - "language": "python", - "name": "python3-azureml" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/src/glue.py b/src/glue.py index 5b665ec..6fd3810 100644 --- a/src/glue.py +++ b/src/glue.py @@ -12,9 +12,9 @@ import configparser import pandas as pd # Import custom modules -import luis import stt import tts +import luis_scoring import params as pa import helper as he import evaluate as eval diff --git a/src/luis_generate.py b/src/luis_data_generator.py similarity index 100% rename from src/luis_generate.py rename to src/luis_data_generator.py diff --git a/src/luis.py b/src/luis_scoring.py similarity index 100% rename from src/luis.py rename to src/luis_scoring.py