diff --git a/notebooks/Data - LUIS Generator.ipynb b/notebooks/Data - LUIS Generator.ipynb deleted file mode 100644 index 9531970..0000000 --- a/notebooks/Data - LUIS Generator.ipynb +++ /dev/null @@ -1,107 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Automated Training Data for LUIS" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Import relevant packages\n", - "import json\n", - "import re\n", - "import logging\n", - "import pandas as pd\n", - "import random\n", - "import sys\n", - "sys.path.append(\"../src/\")\n", - "from luis_data_generator import LUISGenerator\n", - "from luis_data_generator import transform_lu" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Define input values, or import them from a pandas data frame\n", - "utterances = ['ich möchte einen flug von {city} nach {city} buchen via {station}, mein Name ist {name}.', \n", - " 'ich komme aus {city} und möchte über den {station} nach {city}.', \n", - " 'was geht ab?']\n", - "\n", - "values = {'city': ['Stuttgart', 'Singapore', 'Frankfurt'], \n", - " 'station': ['Airport', 'Central', 'Bus Stop'], \n", - " 'name': ['Nadella', 'Gates']}\n", - "\n", - "intents = ['123_Test', \n", - " '234_Test', \n", - " 'None']" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Create instance of the class\n", - "luis_generator = LUISGenerator(utterances, values, ['BookFlight', 'BookFlight', 'None'])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n# BookFlight\n- ich möchte einen flug von {city=Stuttgart} nach {city=Singapore} buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Airport} nach 
{city=Frankfurt}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Singapore} und möchte über den {station=Central} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} buchen via {station=Central}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Central} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Airport} nach {city=Singapore}.\n- ich komme aus {city=Singapore} und möchte über den {station=Central} nach {city=Frankfurt}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Singapore} und möchte über den {station=Bus Stop} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Central}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Singapore} und möchte über den {station=Airport} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Central} nach {city=Singapore}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Bus Stop} nach {city=Singapore}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} 
buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Singapore} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Airport} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Airport} nach {city=Singapore}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Singapore} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Bus Stop}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Central}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Bus Stop}, mein Name ist {name=Gates}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Central} nach {city=Frankfurt}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Bus Stop}, mein Name ist {name=Gates}.\n- ich komme aus {city=Singapore} und möchte über den {station=Airport} nach {city=Frankfurt}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Bus Stop} nach {city=Frankfurt}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Bus Stop} nach {city=Singapore}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Singapore} buchen via {station=Bus Stop}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} buchen via {station=Bus Stop}, 
mein Name ist {name=Gates}.\n- ich komme aus {city=Stuttgart} und möchte über den {station=Central} nach {city=Singapore}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Central}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Airport}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Singapore} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich komme aus {city=Frankfurt} und möchte über den {station=Bus Stop} nach {city=Stuttgart}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Bus Stop}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Stuttgart} nach {city=Frankfurt} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Singapore} nach {city=Frankfurt} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich komme aus {city=Singapore} und möchte über den {station=Bus Stop} nach {city=Frankfurt}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Stuttgart} buchen via {station=Bus Stop}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Singapore} nach {city=Stuttgart} buchen via {station=Central}, mein Name ist {name=Nadella}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Airport}, mein Name ist {name=Gates}.\n- ich möchte einen flug von {city=Frankfurt} nach {city=Singapore} buchen via {station=Central}, mein Name ist {name=Gates}.\n- ich möchte einen flug 
von {city=Stuttgart} nach {city=Singapore} buchen via {station=Central}, mein Name ist {name=Nadella}.\n\n# None\n- was geht ab?\n" - ] - } - ], - "source": [ - "# Loop through the generator multiple times and get a lu file!\n", - "results = []\n", - "for _ in range(0, 1000):\n", - " luis_generator.get_values()\n", - " speech, luis = luis_generator.fill_values()\n", - " results.extend(luis)\n", - "transform_lu(results)" - ] - } - ], - "metadata": { - "kernelspec": { - "name": "python3", - "display_name": "Python 3.7.6 64-bit ('nlp': conda)", - "metadata": { - "interpreter": { - "hash": "0d92b4570cf170047a8c40549154a6dffe47dd8c5b7bd394f81eede6f5d748fa" - } - } - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6-final" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/notebooks/Data - Training Data Generator.ipynb b/notebooks/Data - Training Data Generator.ipynb new file mode 100644 index 0000000..d5f22b5 --- /dev/null +++ b/notebooks/Data - Training Data Generator.ipynb @@ -0,0 +1,279 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Automated Generation of Training Data for Microsoft LUIS and Speech Service\n", + "This notebook serves to batch-generate training data for [Microsoft LUIS](https://luis.ai) and [Microsoft Speech Service](https://speech.microsoft.com) based on example utterances and possible entity-values.\n", + "\n", + "## Example\n", + "### Input sentence: \n", + "- \"I would like to book a flight from {city} to {city} and my name is {name}.\"\n", + "\n", + "### Sample values: \n", + "- city: 'Stuttgart', 'Singapore', 'Frankfurt', 'Kuala Lumpur'\n", + "- name: 'Nadella', 'Gates'\n", + "\n", + "### Returns:\n", + "- Training Data for Speech-To-Text Engine or textual input for 
Text-to-Speech generation\n", + " - \"I would like to book a flight from Frankfurt to Kuala Lumpur and my name is Nadella.\"\n", + " - \"I would like to book a flight from Singapore to Stuttgart and my name is Gates.\"\n", + " - \"I would like to book a flight from Singapore to Frankfurt and my name is Nadella.\"\n", + "- Training data for Microsoft LUIS (see the concept of [LU-files](https://docs.microsoft.com/en-us/composer/concept-language-understanding))\n", + " - I would like to book a flight from {city=Frankfurt} to {city=Kuala Lumpur} and my name is {name=Nadella}.\n", + " - I would like to book a flight from {city=Singapore} to {city=Stuttgart} and my name is {name=Gates}.\n", + " - I would like to book a flight from {city=Singapore} to {city=Frankfurt} and my name is {name=Nadella}." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n %reload_ext autoreload\n" + ] + } + ], + "source": [ + "# Import relevant packages\n", + "import json\n", + "import re\n", + "import logging\n", + "import pandas as pd\n", + "import random\n", + "import sys\n", + "\n", + "# Import LUIS generator components\n", + "sys.path.append(\"../src/\")\n", + "from luis_data_generator import LUISGenerator\n", + "from luis_data_generator import transform_lu\n", + "\n", + "# Auto Reload\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Define input values, or import them from a pandas data frame\n", + "utterances = ['i would like to book a flight from {city} to {city} via {station}, my name is {name}.', \n", + " 'i am coming from {city} and want to travel via {station} to {city}.',\n", + " 'i want to book a seat on my flight to {city}.', \n", + " 'how are you doing?']\n", + "\n", + "values = {'city': ['Singapore', 'Frankfurt', 'Kuala Lumpur', 'Stuttgart'], \n", + " 'station': ['Airport', 'Central Station', 'Bus Stop'], \n", + " 'name': ['Nadella', 'Gates', 'Ballmer']}\n", + "\n", + "intents = ['BookFlight', \n", + " 'BookFlight', \n", + " 'BookSeat',\n", + " 'None']" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# Create instance of the LUISGenerator-class along with your utterances, values and intents\n", + "# If you have no intents, just remove it. 
It is an optional argument for the class.\n", + "flight_generator = LUISGenerator(utterances, values)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# Define amount of iterations below.\n", + "# Keep in mind that this does not necessarily mean that there will be 1,000 examples of every utterance, as duplicates will be filtered out.\n", + "# The amount of utterances per example depends on the maximum number of combinations based on example-entity value combinations.\n", + "iterations = 1000" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Done!\nWall time: 139 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "# Loop through the generator multiple times to get a variation of utterances\n", + "# If you have intents, speech_results and luis_results will be zipped lists each\n", + "# If you have no intents, speech_results and luis_results will be one-dimensional lists\n", + "speech_results = []\n", + "luis_results = []\n", + "for _ in range(1, iterations):\n", + " flight_generator.get_values()\n", + " speech, luis = flight_generator.fill_values()\n", + " speech_results.extend(speech)\n", + " luis_results.extend(luis)\n", + "print(\"Done!\")" + ] + }, + { + "source": [ + "## Speech to Text / Text to Speech\n", + "The section below gives you a glance at the results and writes them to a text file.\n", + "If you generated these utterances along with intents, you may also use them for LUIS scoring with GLUE, as you have intent-text combinations.\n", + "This can help you to evaluate the performance of the model given different entity values.\n" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['i would like to book a flight from Stuttgart 
to Frankfurt via Central Station, my name is Nadella.',\n", + " 'i am coming from Singapore and want to travel via Central Station to Stuttgart.',\n", + " 'i want to book a seat on my flight to Kuala Lumpur.',\n", + " 'how are you doing?']" + ] + }, + "metadata": {}, + "execution_count": 28 + } + ], + "source": [ + "# Show the head of the speech-results\n", + "speech_results[:4]" + ] + }, + { + "source": [ + "## LUIS\n", + "The section below shows you what the results look like and writes them to an [LU file](https://docs.microsoft.com/en-us/composer/concept-language-understanding). This file can be used as an input file for [LUIS](https://luis.ai) training and to accelerate your model development." + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['i would like to book a flight from {city=Stuttgart} to {city=Frankfurt} via {station=Central Station}, my name is {name=Nadella}.',\n", + " 'i am coming from {city=Singapore} and want to travel via {station=Central Station} to {city=Stuttgart}.',\n", + " 'i want to book a seat on my flight to {city=Kuala Lumpur}.',\n", + " 'how are you doing?',\n", + " 'i would like to book a flight from {city=Frankfurt} to {city=Kuala Lumpur} via {station=Airport}, my name is {name=Nadella}.']" + ] + }, + "metadata": {}, + "execution_count": 29 + } + ], + "source": [ + "# Show the head of the luis results\n", + "luis_results[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:root:Writing output to file \"example_lu_file\".\n" + ] + }, + { + "output_type": "error", + "ename": "ValueError", + "evalue": "Shape of passed values is (3996, 1), indices imply (3996, 2)", + "traceback": [ + 
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mcreate_block_manager_from_blocks\u001b[1;34m(blocks, axes)\u001b[0m\n\u001b[0;32m 1653\u001b[0m blocks = [\n\u001b[1;32m-> 1654\u001b[1;33m \u001b[0mmake_block\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mblocks\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mslice\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxes\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1655\u001b[0m ]\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\blocks.py\u001b[0m in \u001b[0;36mmake_block\u001b[1;34m(values, placement, klass, ndim, dtype)\u001b[0m\n\u001b[0;32m 3040\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3041\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mklass\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mndim\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mndim\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mplacement\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3042\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\blocks.py\u001b[0m in 
\u001b[0;36m__init__\u001b[1;34m(self, values, placement, ndim)\u001b[0m\n\u001b[0;32m 2588\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2589\u001b[1;33m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mndim\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mndim\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mplacement\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2590\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\blocks.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, values, placement, ndim)\u001b[0m\n\u001b[0;32m 124\u001b[0m raise ValueError(\n\u001b[1;32m--> 125\u001b[1;33m \u001b[1;34mf\"Wrong number of items passed {len(self.values)}, \"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 126\u001b[0m \u001b[1;34mf\"placement implies {len(self.mgr_locs)}\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mValueError\u001b[0m: Wrong number of items passed 1, placement implies 2", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mwrite\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;31m# Transform to LU-file\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mtransform_lu\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mluis_results\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mfile_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mwrite\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32mc:\\Users\\tiwalz\\Documents\\Projects\\Strategy Innovation\\SpeechServices\\src\\luis_data_generator.py\u001b[0m in \u001b[0;36mtransform_lu\u001b[1;34m(zipped_list, lu_file, write)\u001b[0m\n\u001b[0;32m 162\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwarning\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Writing no output file, just display.'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 163\u001b[0m \u001b[0mcompare\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 164\u001b[1;33m \u001b[0mluis_file\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mzipped_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'intent'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'text'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'intent'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 165\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf'{lu_file}.lu'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'w'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 166\u001b[0m \u001b[1;32mfor\u001b[0m 
\u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mluis_file\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m 486\u001b[0m \u001b[0mmgr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marrays_to_mgr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 487\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 488\u001b[1;33m \u001b[0mmgr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minit_ndarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 489\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[0mmgr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minit_dict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\construction.py\u001b[0m in \u001b[0;36minit_ndarray\u001b[1;34m(values, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m 208\u001b[0m \u001b[0mblock_values\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 209\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 210\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mcreate_block_manager_from_blocks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mblock_values\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 211\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 212\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mcreate_block_manager_from_blocks\u001b[1;34m(blocks, axes)\u001b[0m\n\u001b[0;32m 1662\u001b[0m \u001b[0mblocks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"values\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mb\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mblocks\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1663\u001b[0m \u001b[0mtot_items\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mb\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mblocks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1664\u001b[1;33m \u001b[0mconstruction_error\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtot_items\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mblocks\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1665\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1666\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mconstruction_error\u001b[1;34m(tot_items, block_shape, axes, e)\u001b[0m\n\u001b[0;32m 1692\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mblock_shape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1693\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Empty data passed with indices specified.\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1694\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Shape of passed values is {passed}, indices imply 
{implied}\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1695\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1696\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mValueError\u001b[0m: Shape of passed values is (3996, 1), indices imply (3996, 2)" + ] + } + ], + "source": [ + "# File name of your target LU-file.\n", + "file_name = 'example_lu_file' \n", + "# Boolean to write to file, if false it will only show in the output.\n", + "write = True\n", + "# Transform to LU-file. Keep in mind, that you will need a list of tuples with intents, otherwise the function will throw an error.\n", + "transform_lu(luis_results, file_name, write=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "display_name": "Python 3.7.6 64-bit", + "metadata": { + "interpreter": { + "hash": "0d92b4570cf170047a8c40549154a6dffe47dd8c5b7bd394f81eede6f5d748fa" + } + } + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6-final" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/src/luis_data_generator.py b/src/luis_data_generator.py index aef42c4..7aff58a 100644 --- a/src/luis_data_generator.py +++ b/src/luis_data_generator.py @@ -42,7 +42,7 @@ class LUISGenerator(): Returns: self.preprocessed_text: list of utterances with preprocessed entities.''' self.preprocessed_text = [] - logging.info(f'[STATUS] - loaded {len(self.utterances)} rows.') + logging.info(f'[INFO] - loaded {len(self.utterances)} rows.') # Extract all entities for index, value in enumerate(self.utterances): orig = re.compile('\\{(.*?)\\}').findall(value) @@ -58,7 +58,7 @@ class 
LUISGenerator(): value = subs[i].join(value.rsplit(orig[i], 1)) i -= 1 self.preprocessed_text.append(value) - logging.info(f'[STATUS] - finished processing {len(self.utterances)} rows.') + logging.info(f'[INFO] - finished processing {len(self.utterances)} rows.') return self.preprocessed_text # List all possible entitites @@ -81,7 +81,7 @@ class LUISGenerator(): # Flatten List (as some rows have multiple entities) and drop duplicates from list self.tags_flat = list(dict.fromkeys(sorted([item for item in [item for sublist in self.tags_per_row for item in sublist]]))) - logging.info(f"[STATUS] - detected {len(self.tags_flat)} different entities") + logging.info(f"[INFO] - detected {len(self.tags_flat)} different entities") return self.tags_per_row, self.tags_flat # Prepare @@ -143,18 +143,23 @@ class LUISGenerator(): formatted = str(value).format(**self.return_values[index]) formatted = formatted.replace('&?', '{').replace('?&','}').replace('%2', '').replace('%3', '').replace('%4', '') self.utterances_luis.append(formatted) - if self.intents == []: + if self.intents == None: return self.utterances_filled, self.utterances_luis else: return zip(self.intents, self.utterances_filled), zip(self.intents, self.utterances_luis) -def transform_lu(zipped_list, lu_file="lu_file"): +def transform_lu(zipped_list, lu_file="lu_file", write=True): '''Transforms zipped list (including intents and text) into lu-file. Drops exact duplicates as LUIS will not take them either way. Args: zipped_list: zipped list of utterances, consisting of intent list and utterance list. 
lu_file: file name of your lu-file, no file ending necessary, default "lu_file" + write: boolean, whether lu should be written to a file, default True Output: Writes lu-file to your working folder''' + if write: + logging.warning(f'Writing output to file "{lu_file}".') + else: + logging.warning('Writing no output file, just display.') compare = "" luis_file = pd.DataFrame(list(zipped_list), columns=['intent', 'text']).sort_values('intent').drop_duplicates('text') with open(f'{lu_file}.lu', 'w') as f: @@ -162,15 +167,15 @@ def transform_lu(zipped_list, lu_file="lu_file"): if compare != row['intent']: # Begin intent line = f"\n# {row['intent']}" - #print(line, file = f) + if write: print(line, file = f) print(line) line = f"- {str(row['text'])}" - #print(line, file = f) + if write: print(line, file = f) compare = row['intent'] print(line) else: line = f"- {str(row['text'])}" - #print(line, file = f) + if write: print(line, file = f) print(line) def main(utterances, values, intents):