updates for luis generator

2021-01-05 12:59:42 +01:00 · 2021-01-05 12:59:42 +01:00 · e75539927a
--- a/Generator.ipynb
+++ b/Generator.ipynb
--- a/Generator.ipynb
+++ b/Generator.ipynb
@ -1,271 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Automated Training Data for LUIS"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 482,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import relevant packages\n",
-    "import json\n",
-    "import re\n",
-    "import logging\n",
-    "import pandas as pd\n",
-    "import random"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 543,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class LUISGenerator():\n",
-    "    '''LUIS Text Generator to multiply sample utterances given several entities.\n",
-    "    Example:\n",
-    "        Input sentence: \"I would like to book a flight from {city} to {city} and my name is {name}.\"\n",
-    "        Sample values: city: ['Stuttgart', 'Singapore', 'Frankfurt'], name: ['Nadella', 'Gates']\n",
-    "    Returns: \n",
-    "        - \"I would like to book a flight from Frankfurt to Singapore and my name is Nadella.\"\n",
-    "        - \"I would like to book a flight from Singapore to Stuttgart and my name is Gates.\"\n",
-    "        - \"I would like to book a flight from Singapore to Frankfurt and my name is Gates.\"\n",
-    "            - ...'''\n",
-    "    \n",
-    "    def __init__(self, utterances, values, intents = None):\n",
-    "        '''Set variables and execute preprocessing methods within the class.\n",
-    "        Args:\n",
-    "            utterances: list of utterances to be multiplied.\n",
-    "            values: dictionary with potential values, one list per key.\n",
-    "            intents: list of intents, optional, has to match the length AND order of utterances list.\n",
-    "        Raises:\n",
-    "            AssertionError: checks compatibility of input, does not guarantee full sanity of content.\n",
-    "        '''\n",
-    "        if isinstance(intents, list):\n",
-    "            assert len(intents) == len(utterances), f\"Length of utterances ({len(utterances)}) and intents ({len(intents)}) do not match, please validate!\"\n",
-    "        elif isinstance(intents, str):\n",
-    "            assert intents == None, \"Intents has to be a list, cannot be a string!\"\n",
-    "        self.utterances = utterances\n",
-    "        self.values = values\n",
-    "        self.intents = intents\n",
-    "        self.preprocessed_text = self.prepare_text()\n",
-    "        self.tags_per_row, self.tags_flat = self.get_entities()\n",
-    "        self.preprocessed_luis = self.prepare_luis()\n",
-    "        \n",
-    "    def prepare_text(self):\n",
-    "        '''Prepares input text by setting a counter for recurring entities.\n",
-    "        Args:\n",
-    "            self.utterances: list of utterances.\n",
-    "        Returns:\n",
-    "            self.preprocessed_text: list of utterances with preprocessed entities.'''\n",
-    "        self.preprocessed_text = []\n",
-    "        logging.info(f'[STATUS] - loaded {len(self.utterances)} rows.')\n",
-    "        # Extract all entities\n",
-    "        for index, value in enumerate(self.utterances):\n",
-    "            orig = re.compile('\\\\{(.*?)\\\\}').findall(value)\n",
-    "            subs = [f'{v}%{str(orig[:i].count(v) + 1)}' if orig.count(v) > 1 else v for i, v in enumerate(orig)]\n",
-    "            orig = [\"{\" + item + \"}\" for item in orig]\n",
-    "            subs = [\"{\" + item + \"}\" for item in subs]\n",
-    "            subs = [sub.replace('%1}', '}') for sub in subs]\n",
-    "            # Point i to the last element in list\n",
-    "            if len(subs) > 0:\n",
-    "                i = len(subs) - 1\n",
-    "                # Iterate till 1st element and keep on decrementing i\n",
-    "                while i >= 0:\n",
-    "                    value = subs[i].join(value.rsplit(orig[i], 1))\n",
-    "                    i -= 1\n",
-    "            self.preprocessed_text.append(value)\n",
-    "        logging.info(f'[STATUS] - finished processing {len(self.utterances)} rows.')\n",
-    "        return self.preprocessed_text\n",
-    "    \n",
-    "    # List all possible entities\n",
-    "    def get_entities(self):\n",
-    "        '''Gets entities as list for every utterance.\n",
-    "        Args:\n",
-    "            self.preprocessed_text: list of utterances with preprocessed entities.\n",
-    "        Returns:\n",
-    "            self.tags_per_row: list with list of entities for every utterance.\n",
-    "            self.tags_flat: flattened list with all unique entities of the corpus.\n",
-    "        '''\n",
-    "        self.tags_per_row = []\n",
-    "        # Extract all entities\n",
-    "        for index, value in enumerate(self.preprocessed_text):\n",
-    "            try:\n",
-    "                entity = re.compile('\\\\{(.*?)\\\\}').findall(value)\n",
-    "            except:\n",
-    "                entity = []\n",
-    "            self.tags_per_row.append(entity)\n",
-    "\n",
-    "        # Flatten List (as some rows have multiple entities) and drop duplicates from list\n",
-    "        self.tags_flat = list(dict.fromkeys(sorted([item for item in [item for sublist in self.tags_per_row for item in sublist]])))\n",
-    "        logging.info(f\"[STATUS] - detected {len(self.tags_flat)} different entities\")\n",
-    "        return self.tags_per_row, self.tags_flat\n",
-    "    \n",
-    "    # Prepare utterances for the lu-notation\n",
-    "    def prepare_luis(self):\n",
-    "        '''Prepares preprocessed text to be compatible with the lu-notation. Sets a temporary placeholder for { / } to avoid issues with value replacement.\n",
-    "        Args:\n",
-    "            self.preprocessed_text: list of utterances with preprocessed entities.\n",
-    "            self.tags_per_row: list with list of entities for every utterance.\n",
-    "        Returns:\n",
-    "            self.preprocessed_luis: list of lu- and replacement-compatible utterances.\n",
-    "        '''\n",
-    "        self.preprocessed_luis = []\n",
-    "        for index, utterance in enumerate(self.preprocessed_text):\n",
-    "            for entity in self.tags_per_row[index]:\n",
-    "                utterance = utterance.replace(\"{\" + entity + \"}\", \"&?\" + entity + \"={\" + entity + \"}?&\")\n",
-    "            utterance = utterance.replace(\"}%\", \"%\")\n",
-    "            self.preprocessed_luis.append(utterance)\n",
-    "        return self.preprocessed_luis\n",
-    "        \n",
-    "    def get_values(self):\n",
-    "        '''Gets random values from dictionary based on the available entities. Avoids duplicate values in every utterance.\n",
-    "        Args:\n",
-    "            self.tags_per_row: list with list of entities for every utterance.\n",
-    "            self.values: dictionary with potential values, one list per key.\n",
-    "        Returns:\n",
-    "            self.return_values: list of dictionaries with values for insertion.\n",
-    "        '''\n",
-    "        self.return_values = []\n",
-    "        for index, utterance in enumerate(self.tags_per_row):\n",
-    "            u_values = {}\n",
-    "            for entity in utterance:\n",
-    "                random_value = random.choice(self.values[entity.split(\"%\")[0]])\n",
-    "                while random_value in u_values.values():\n",
-    "                    random_value = random.choice(self.values[entity.split(\"%\")[0]])\n",
-    "                u_values[entity] = random_value\n",
-    "            self.return_values.append(u_values)\n",
-    "        return self.return_values\n",
-    "        \n",
-    "    def fill_values(self):\n",
-    "        '''Fills sentences with values. \n",
-    "        If there are no intents, only lists with transformed utterances are returned. If there are intents, zipped lists with intents are returned.\n",
-    "        Args:\n",
-    "            self.preprocessed_text: list of utterances with preprocessed entities.\n",
-    "            self.preprocessed_luis: list of lu- and replacement-compatible utterances.\n",
-    "            self.return_values: list of dictionaries with values for insertion.\n",
-    "        Returns:\n",
-    "            self.utterances_filled: list of utterances with entities substituted by values.\n",
-    "            self.utterances_luis: list of lu-file utterances with entities substituted by values and lu-entity notation.\n",
-    "            OR\n",
-    "            zip(self.intents, self.utterances_filled): zipped list, intent list and list of utterances with entities substituted by values.\n",
-    "            zip(self.intents, self.utterances_luis): zipped list, intent list and list of lu-file utterances with entities substituted by values and lu-entity notation.\n",
-    "        '''\n",
-    "        self.utterances_filled = []\n",
-    "        self.utterances_luis = []\n",
-    "        for index, value in enumerate(self.preprocessed_text): \n",
-    "            formatted = str(value).format(**self.return_values[index])\n",
-    "            self.utterances_filled.append(formatted)\n",
-    "        for index, value in enumerate(self.preprocessed_luis): \n",
-    "            formatted = str(value).format(**self.return_values[index])\n",
-    "            formatted = formatted.replace('&?', '{').replace('?&','}').replace('%2', '').replace('%3', '').replace('%4', '')\n",
-    "            self.utterances_luis.append(formatted)\n",
-    "        if self.intents == []:\n",
-    "            return self.utterances_filled, self.utterances_luis\n",
-    "        else:\n",
-    "            return zip(self.intents, self.utterances_filled), zip(self.intents, self.utterances_luis)\n",
-    "        \n",
-    "def transform_lu(zipped_list, lu_file=\"lu_file\"):\n",
-    "    '''Transforms zipped list (including intents and text) into lu-file. Drops exact duplicates as LUIS will not take them either way.\n",
-    "    Args:\n",
-    "        zipped_list: zipped list of utterances, consisting of intent list and utterance list.\n",
-    "        lu_file: file name of your lu-file, no file ending necessary, default \"lu_file\"\n",
-    "    Output:\n",
-    "        Writes lu-file to your working folder'''\n",
-    "    compare = \"\"\n",
-    "    luis_file = pd.DataFrame(list(zipped_list), columns=['intent', 'text']).sort_values('intent').drop_duplicates('text')\n",
-    "    with open(f'{lu_file}.lu', 'w') as f:\n",
-    "        for index, row in luis_file.iterrows():\n",
-    "            if compare != row['intent']:\n",
-    "                # Begin intent\n",
-    "                line = f\"\\n# {row['intent']}\"\n",
-    "                print(line, file = f)\n",
-    "                line = f\"- {str(row['text'])}\"\n",
-    "                print(line, file = f)\n",
-    "                compare = row['intent']\n",
-    "            else:\n",
-    "                line = f\"- {str(row['text'])}\"\n",
-    "                print(line, file = f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 544,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Define input values\n",
-    "utterances = ['ich möchte einen flug von {city} nach {city} buchen via {station}, mein Name ist {name}.', \n",
-    "              'ich komme aus {city} und möchte über den {station} nach {city}.', \n",
-    "              'was geht ab?']\n",
-    "\n",
-    "values = {'city': ['Stuttgart', 'Singapore', 'Frankfurt'], \n",
-    "          'station': ['Airport', 'Central', 'Bus Stop'], \n",
-    "          'name': ['Nadella', 'Gates']}\n",
-    "\n",
-    "intents = ['123_Test', \n",
-    "           '234_Test', \n",
-    "           'None']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 545,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create instance of the class\n",
-    "luis_generator = LUISGenerator(utterances, values, ['BookFlight', 'BookFlight', 'None'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 546,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Loop through the generator multiple times and get a lu file!\n",
-    "results = []\n",
-    "for _ in range(0, 1000):\n",
-    "    luis_generator.get_values()\n",
-    "    speech, luis = luis_generator.fill_values()\n",
-    "    results.extend(luis)\n",
-    "transform_lu(results)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3.6 - AzureML",
-   "language": "python",
-   "name": "python3-azureml"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
--- a/src/glue.py
+++ b/src/glue.py
@ -12,9 +12,9 @@ import configparser
 import pandas as pd

 # Import custom modules
-import luis
 import stt
 import tts
+import luis_scoring
 import params as pa
 import helper as he
 import evaluate as eval
--- a/src/luis_data_generator.py
+++ b/src/luis_data_generator.py
--- a/src/luis_scoring.py
+++ b/src/luis_scoring.py