update for training data generator

2021-01-06 16:50:32 +01:00 · 2021-01-06 16:50:32 +01:00 · 3c5c4c29de
--- a/Generator.ipynb
+++ b/Generator.ipynb
--- a/Generator.ipynb
+++ b/Generator.ipynb
@ -0,0 +1,279 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Automated Generation of Training Data for Microsoft LUIS and Speech Service\n",
+    "This notebook serves to batch-generate training data for [Microsoft LUIS](https://luis.ai) and [Microsoft Speech Service](https://speech.microsoft.com) based on example utterances and possible entity-values.\n",
+    "\n",
+    "## Example\n",
+    "### Input sentence: \n",
+    "- \"I would like to book a flight from {city} to {city} and my name is {name}.\"\n",
+    "\n",
+    "### Sample values: \n",
+    "- city: 'Stuttgart', 'Singapore', 'Frankfurt', 'Kuala Lumpur'\n",
+    "- name: 'Nadella', 'Gates'\n",
+    "\n",
+    "### Returns:\n",
+    "- Training Data for Speech-To-Text Engine or textual input for Text-to-Speech generation\n",
+    "    - \"I would like to book a flight from Frankfurt to Kuala Lumpur and my name is Nadella.\"\n",
+    "    - \"I would like to book a flight from Singapore to Stuttgart and my name is Gates.\"\n",
+    "    - \"I would like to book a flight from Singapore to Frankfurt and my name is Ballmer.\"\n",
+    "- Training data for Microsoft LUIS (see the concept of [LU-files](https://docs.microsoft.com/en-us/composer/concept-language-understanding))\n",
+    "    - I would like to book a flight from {city=Frankfurt} to {city=Kuala Lumpur} via {station=Bus Stop} and my name is {name=Nadella}.\n",
+    "    - I would like to book a flight from {city=Singapore} to {city=Stuttgart} via {station=Airport} and my name is {name=Gates}.\n",
+    "    - I would like to book a flight from {city=Singapore} to {city=Frankfurt} via {station=Airport} and my name is {name=Ballmer}."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n  %reload_ext autoreload\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Import relevant packages\n",
+    "import json\n",
+    "import re\n",
+    "import logging\n",
+    "import pandas as pd\n",
+    "import random\n",
+    "import sys\n",
+    "\n",
+    "# Import LUIS generator components\n",
+    "sys.path.append(\"../src/\")\n",
+    "from luis_data_generator import LUISGenerator\n",
+    "from luis_data_generator import transform_lu\n",
+    "\n",
+    "# Auto Reload\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define input values, or import them from a pandas data frame\n",
+    "utterances = ['i would like to book a flight from {city} to {city} via {station}, my name is {name}.', \n",
+    "              'i am coming from {city} and want to travel via {station} to {city}.',\n",
+    "              'i want to book a seat on my flight to {city}.', \n",
+    "              'how are you doing?']\n",
+    "\n",
+    "values = {'city': ['Singapore', 'Frankfurt', 'Kuala Lumpur', 'Stuttgart'], \n",
+    "          'station': ['Airport', 'Central Station', 'Bus Stop'], \n",
+    "          'name': ['Nadella', 'Gates', 'Ballmer']}\n",
+    "\n",
+    "intents =  ['BookFlight', \n",
+    "            'BookFlight', \n",
+    "            'BookSeat',\n",
+    "            'None']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create instance of the LUISGenerator-class along with your utterances, values and intents\n",
+    "# If you have no intents, just remove it. It is an optional argument for the class.\n",
+    "flight_generator = LUISGenerator(utterances, values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define amount of iterations below.\n",
+    "# Keep in mind that it does not necessarily mean, that there will be 1,000 examples of every utterance, as duplicates will be filtered out.\n",
+    "# The amount of utterances per example depends on the maximum number of combinations based on example-entity value combinations.\n",
+    "iterations = 1000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Done!\nWall time: 139 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# Loop through the generator multiple times to get a variation of utterances\n",
+    "# If you have intents, speech_results and luis_results will be zipped lists each\n",
+    "# If you have no intents, speech_results and luis_results will be one-dimensional lists\n",
+    "speech_results = []\n",
+    "luis_results = []\n",
+    "for _ in range(1, iterations):\n",
+    "    flight_generator.get_values()\n",
+    "    speech, luis = flight_generator.fill_values()\n",
+    "    speech_results.extend(speech)\n",
+    "    luis_results.extend(luis)\n",
+    "print(\"Done!\")"
+   ]
+  },
+  {
+   "source": [
+    "## Speech to Text / Text to Speech\n",
+    "The section below give you a glance on the results and writes them to a text file.\n",
+    "If you write generated these utterances along with intents, you may also use it for LUIS scoring with GLUE, as you have intent-text combinations.\n",
+    "This can help you to evaluate the performance of the model given different entity values.\n"
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "['i would like to book a flight from Stuttgart to Frankfurt via Central Station, my name is Nadella.',\n",
+       " 'i am coming from Singapore and want to travel via Central Station to Stuttgart.',\n",
+       " 'i want to book a seat on my flight to Kuala Lumpur.',\n",
+       " 'how are you doing?']"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 28
+    }
+   ],
+   "source": [
+    "# Show the head of the speech-results\n",
+    "speech_results[:4]"
+   ]
+  },
+  {
+   "source": [
+    "## LUIS\n",
+    "The section below shows you how the results look like and writes them to a [LU-files](https://docs.microsoft.com/en-us/composer/concept-language-understanding). This file can be used as input file for [LUIS](https://luis.ai) training and to accelerate your model development."
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "['i would like to book a flight from {city=Stuttgart} to {city=Frankfurt} via {station=Central Station}, my name is {name=Nadella}.',\n",
+       " 'i am coming from {city=Singapore} and want to travel via {station=Central Station} to {city=Stuttgart}.',\n",
+       " 'i want to book a seat on my flight to {city=Kuala Lumpur}.',\n",
+       " 'how are you doing?',\n",
+       " 'i would like to book a flight from {city=Frankfurt} to {city=Kuala Lumpur} via {station=Airport}, my name is {name=Nadella}.']"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 29
+    }
+   ],
+   "source": [
+    "# Show the head of the luis results\n",
+    "luis_results[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stderr",
+     "text": [
+      "WARNING:root:Writing output to file \"example_lu_file\".\n"
+     ]
+    },
+    {
+     "output_type": "error",
+     "ename": "ValueError",
+     "evalue": "Shape of passed values is (3996, 1), indices imply (3996, 2)",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mcreate_block_manager_from_blocks\u001b[1;34m(blocks, axes)\u001b[0m\n\u001b[0;32m   1653\u001b[0m                 blocks = [\n\u001b[1;32m-> 1654\u001b[1;33m                     \u001b[0mmake_block\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mblocks\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mslice\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxes\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1655\u001b[0m                 ]\n",
+      "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\blocks.py\u001b[0m in \u001b[0;36mmake_block\u001b[1;34m(values, placement, klass, ndim, dtype)\u001b[0m\n\u001b[0;32m   3040\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3041\u001b[1;33m     \u001b[1;32mreturn\u001b[0m \u001b[0mklass\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mndim\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mndim\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mplacement\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3042\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\blocks.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, values, placement, ndim)\u001b[0m\n\u001b[0;32m   2588\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2589\u001b[1;33m         \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mndim\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mndim\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mplacement\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2590\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\blocks.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, values, placement, ndim)\u001b[0m\n\u001b[0;32m    124\u001b[0m             raise ValueError(\n\u001b[1;32m--> 125\u001b[1;33m                 \u001b[1;34mf\"Wrong number of items passed {len(self.values)}, \"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    126\u001b[0m                 \u001b[1;34mf\"placement implies {len(self.mgr_locs)}\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mValueError\u001b[0m: Wrong number of items passed 1, placement implies 2",
+      "\nDuring handling of the above exception, another exception occurred:\n",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[1;32m<ipython-input-30-7cbd2e344177>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[0mwrite\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[1;31m# Transform to LU-file\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mtransform_lu\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mluis_results\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfile_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mwrite\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[1;32mc:\\Users\\tiwalz\\Documents\\Projects\\Strategy Innovation\\SpeechServices\\src\\luis_data_generator.py\u001b[0m in \u001b[0;36mtransform_lu\u001b[1;34m(zipped_list, lu_file, write)\u001b[0m\n\u001b[0;32m    162\u001b[0m         \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwarning\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Writing no output file, just display.'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    163\u001b[0m     \u001b[0mcompare\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 164\u001b[1;33m     \u001b[0mluis_file\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mzipped_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'intent'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'text'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'intent'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    165\u001b[0m     \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf'{lu_file}.lu'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'w'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    166\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mluis_file\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m    486\u001b[0m                     \u001b[0mmgr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marrays_to_mgr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    487\u001b[0m                 \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 488\u001b[1;33m                     \u001b[0mmgr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minit_ndarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    489\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    490\u001b[0m                 \u001b[0mmgr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minit_dict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\construction.py\u001b[0m in \u001b[0;36minit_ndarray\u001b[1;34m(values, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m    208\u001b[0m         \u001b[0mblock_values\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    209\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 210\u001b[1;33m     \u001b[1;32mreturn\u001b[0m \u001b[0mcreate_block_manager_from_blocks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mblock_values\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    211\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    212\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mcreate_block_manager_from_blocks\u001b[1;34m(blocks, axes)\u001b[0m\n\u001b[0;32m   1662\u001b[0m         \u001b[0mblocks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"values\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mb\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mblocks\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1663\u001b[0m         \u001b[0mtot_items\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mb\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mblocks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1664\u001b[1;33m         \u001b[0mconstruction_error\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtot_items\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mblocks\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1665\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1666\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mconstruction_error\u001b[1;34m(tot_items, block_shape, axes, e)\u001b[0m\n\u001b[0;32m   1692\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mblock_shape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1693\u001b[0m         \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Empty data passed with indices specified.\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1694\u001b[1;33m     \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Shape of passed values is {passed}, indices imply {implied}\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1695\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1696\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mValueError\u001b[0m: Shape of passed values is (3996, 1), indices imply (3996, 2)"
+     ]
+    }
+   ],
+   "source": [
+    "# File name of your target LU-file.\n",
+    "file_name = 'example_lu_file' \n",
+    "# Boolean to write to file, if false it will only show in the output.\n",
+    "write = True\n",
+    "# Transform to LU-file. Keep in mind, that you will need a list of tuples with intents, otherwise the function will throw an error.\n",
+    "transform_lu(luis_results, file_name, write=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.7.6 64-bit",
+   "metadata": {
+    "interpreter": {
+     "hash": "0d92b4570cf170047a8c40549154a6dffe47dd8c5b7bd394f81eede6f5d748fa"
+    }
+   }
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6-final"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/src/luis_data_generator.py
+++ b/src/luis_data_generator.py
@ -42,7 +42,7 @@ class LUISGenerator():
        Returns:
            self.preprocessed_text: list of utterances with preprocessed entities.'''
        self.preprocessed_text = []
-        logging.info(f'[STATUS] - loaded {len(self.utterances)} rows.')
+        logging.info(f'[INFO] - loaded {len(self.utterances)} rows.')
        # Extract all entities
        for index, value in enumerate(self.utterances):
            orig = re.compile('\\{(.*?)\\}').findall(value)
@ -58,7 +58,7 @@ class LUISGenerator():
                    value = subs[i].join(value.rsplit(orig[i], 1))
                    i -= 1
            self.preprocessed_text.append(value)
-        logging.info(f'[STATUS] - finished processing {len(self.utterances)} rows.')
+        logging.info(f'[INFO] - finished processing {len(self.utterances)} rows.')
        return self.preprocessed_text
    
    # List all possible entitites
@ -81,7 +81,7 @@ class LUISGenerator():

        # Flatten List (as some rows have multiple entities) and drop duplicates from list
        self.tags_flat = list(dict.fromkeys(sorted([item for item in [item for sublist in self.tags_per_row for item in sublist]])))
-        logging.info(f"[STATUS] - detected {len(self.tags_flat)} different entities")
+        logging.info(f"[INFO] - detected {len(self.tags_flat)} different entities")
        return self.tags_per_row, self.tags_flat
    
    # Prepare 
@ -143,18 +143,23 @@ class LUISGenerator():
            formatted = str(value).format(**self.return_values[index])
            formatted = formatted.replace('&?', '{').replace('?&','}').replace('%2', '').replace('%3', '').replace('%4', '')
            self.utterances_luis.append(formatted)
-        if self.intents == []:
+        if self.intents == None:
            return self.utterances_filled, self.utterances_luis
        else:
            return zip(self.intents, self.utterances_filled), zip(self.intents, self.utterances_luis)
        
-def transform_lu(zipped_list, lu_file="lu_file"):
+def transform_lu(zipped_list, lu_file="lu_file", write=True):
    '''Transforms zipped list (including intents and text) into lu-file. Drops exact duplicates as LUIS will not take them either way.
    Args:
        zipped_list: zipped list of utterances, consisting of intent list and utterance list.
        lu_file: file name of your lu-file, no file ending necessary, default "lu_file"
+        write: boolean, whether lu should be written to a file, default True
    Output:
        Writes lu-file to your working folder'''
+    if write:
+        logging.warning(f'Writing output to file "{lu_file}".')
+    else:
+        logging.warning('Writing no output file, just display.')
    compare = ""
    luis_file = pd.DataFrame(list(zipped_list), columns=['intent', 'text']).sort_values('intent').drop_duplicates('text')
    with open(f'{lu_file}.lu', 'w') as f:
@ -162,15 +167,15 @@ def transform_lu(zipped_list, lu_file="lu_file"):
            if compare != row['intent']:
                # Begin intent
                line = f"\n# {row['intent']}"
-                #print(line, file = f)
+                if write: print(line, file = f)
                print(line)
                line = f"- {str(row['text'])}"
-                #print(line, file = f)
+                if write: print(line, file = f)
                compare = row['intent']
                print(line)
            else:
                line = f"- {str(row['text'])}"
-                #print(line, file = f)
+                if write: print(line, file = f)
                print(line)

 def main(utterances, values, intents):