зеркало из https://github.com/microsoft/glue.git
update for training data generator
This commit is contained in:
Родитель
b92d00feda
Коммит
3c5c4c29de
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -0,0 +1,279 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Automated Generation of Training Data for Microsoft LUIS and Speech Service\n",
|
||||||
|
"This notebook serves to batch-generate training data for [Microsoft LUIS](https://luis.ai) and [Microsoft Speech Service](https://speech.microsoft.com) based on example utterances and possible entity-values.\n",
|
||||||
|
"\n",
|
||||||
|
"## Example\n",
|
||||||
|
"### Input sentence: \n",
|
||||||
|
"- \"I would like to book a flight from {city} to {city} and my name is {name}.\"\n",
|
||||||
|
"\n",
|
||||||
|
"### Sample values: \n",
|
||||||
|
"- city: 'Stuttgart', 'Singapore', 'Frankfurt', 'Kuala Lumpur'\n",
|
||||||
|
"- name: 'Nadella', 'Gates', 'Ballmer'\n",
|
||||||
|
"\n",
|
||||||
|
"### Returns:\n",
|
||||||
|
"- Training Data for Speech-To-Text Engine or textual input for Text-to-Speech generation\n",
|
||||||
|
" - \"I would like to book a flight from Frankfurt to Kuala Lumpur and my name is Nadella.\"\n",
|
||||||
|
" - \"I would like to book a flight from Singapore to Stuttgart and my name is Gates.\"\n",
|
||||||
|
" - \"I would like to book a flight from Singapore to Frankfurt and my name is Ballmer.\"\n",
|
||||||
|
"- Training data for Microsoft LUIS (see the concept of [LU-files](https://docs.microsoft.com/en-us/composer/concept-language-understanding))\n",
|
||||||
|
" - I would like to book a flight from {city=Frankfurt} to {city=Kuala Lumpur} and my name is {name=Nadella}.\n",
|
||||||
|
" - I would like to book a flight from {city=Singapore} to {city=Stuttgart} and my name is {name=Gates}.\n",
|
||||||
|
" - I would like to book a flight from {city=Singapore} to {city=Frankfurt} and my name is {name=Ballmer}."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"The autoreload extension is already loaded. To reload it, use:\n %reload_ext autoreload\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Import relevant packages\n",
|
||||||
|
"import json\n",
|
||||||
|
"import re\n",
|
||||||
|
"import logging\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import random\n",
|
||||||
|
"import sys\n",
|
||||||
|
"\n",
|
||||||
|
"# Import LUIS generator components\n",
|
||||||
|
"sys.path.append(\"../src/\")\n",
|
||||||
|
"from luis_data_generator import LUISGenerator\n",
|
||||||
|
"from luis_data_generator import transform_lu\n",
|
||||||
|
"\n",
|
||||||
|
"# Auto Reload\n",
|
||||||
|
"%load_ext autoreload\n",
|
||||||
|
"%autoreload 2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Define input values, or import them from a pandas data frame\n",
|
||||||
|
"utterances = ['i would like to book a flight from {city} to {city} via {station}, my name is {name}.', \n",
|
||||||
|
" 'i am coming from {city} and want to travel via {station} to {city}.',\n",
|
||||||
|
" 'i want to book a seat on my flight to {city}.', \n",
|
||||||
|
" 'how are you doing?']\n",
|
||||||
|
"\n",
|
||||||
|
"values = {'city': ['Singapore', 'Frankfurt', 'Kuala Lumpur', 'Stuttgart'], \n",
|
||||||
|
" 'station': ['Airport', 'Central Station', 'Bus Stop'], \n",
|
||||||
|
" 'name': ['Nadella', 'Gates', 'Ballmer']}\n",
|
||||||
|
"\n",
|
||||||
|
"intents = ['BookFlight', \n",
|
||||||
|
" 'BookFlight', \n",
|
||||||
|
" 'BookSeat',\n",
|
||||||
|
" 'None']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Create instance of the LUISGenerator-class along with your utterances, values and intents\n",
|
||||||
|
"# If you have no intents, just remove it. It is an optional argument for the class.\n",
|
||||||
|
"flight_generator = LUISGenerator(utterances, values)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Define amount of iterations below.\n",
|
||||||
|
"# Keep in mind that this does not necessarily mean there will be 1,000 examples of every utterance, as duplicates will be filtered out.\n",
|
||||||
|
"# The amount of utterances per example depends on the maximum number of combinations based on example-entity value combinations.\n",
|
||||||
|
"iterations = 1000"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Done!\nWall time: 139 ms\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%time\n",
|
||||||
|
"# Loop through the generator multiple times to get a variation of utterances\n",
|
||||||
|
"# If you have intents, speech_results and luis_results will be zipped lists each\n",
|
||||||
|
"# If you have no intents, speech_results and luis_results will be one-dimensional lists\n",
|
||||||
|
"speech_results = []\n",
|
||||||
|
"luis_results = []\n",
|
||||||
|
"for _ in range(1, iterations):\n",
|
||||||
|
" flight_generator.get_values()\n",
|
||||||
|
" speech, luis = flight_generator.fill_values()\n",
|
||||||
|
" speech_results.extend(speech)\n",
|
||||||
|
" luis_results.extend(luis)\n",
|
||||||
|
"print(\"Done!\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source": [
|
||||||
|
"## Speech to Text / Text to Speech\n",
|
||||||
|
"The section below gives you a glance at the results and writes them to a text file.\n",
|
||||||
|
"If you generated these utterances along with intents, you may also use them for LUIS scoring with GLUE, as you then have intent-text combinations.\n",
|
||||||
|
"This can help you to evaluate the performance of the model given different entity values.\n"
|
||||||
|
],
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 28,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['i would like to book a flight from Stuttgart to Frankfurt via Central Station, my name is Nadella.',\n",
|
||||||
|
" 'i am coming from Singapore and want to travel via Central Station to Stuttgart.',\n",
|
||||||
|
" 'i want to book a seat on my flight to Kuala Lumpur.',\n",
|
||||||
|
" 'how are you doing?']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 28
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Show the head of the speech-results\n",
|
||||||
|
"speech_results[:4]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source": [
|
||||||
|
"## LUIS\n",
|
||||||
|
"The section below shows you what the results look like and writes them to an [LU file](https://docs.microsoft.com/en-us/composer/concept-language-understanding). This file can be used as an input file for [LUIS](https://luis.ai) training and to accelerate your model development."
|
||||||
|
],
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 29,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['i would like to book a flight from {city=Stuttgart} to {city=Frankfurt} via {station=Central Station}, my name is {name=Nadella}.',\n",
|
||||||
|
" 'i am coming from {city=Singapore} and want to travel via {station=Central Station} to {city=Stuttgart}.',\n",
|
||||||
|
" 'i want to book a seat on my flight to {city=Kuala Lumpur}.',\n",
|
||||||
|
" 'how are you doing?',\n",
|
||||||
|
" 'i would like to book a flight from {city=Frankfurt} to {city=Kuala Lumpur} via {station=Airport}, my name is {name=Nadella}.']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 29
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Show the head of the luis results\n",
|
||||||
|
"luis_results[:5]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 30,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stderr",
|
||||||
|
"text": [
|
||||||
|
"WARNING:root:Writing output to file \"example_lu_file\".\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"output_type": "error",
|
||||||
|
"ename": "ValueError",
|
||||||
|
"evalue": "Shape of passed values is (3996, 1), indices imply (3996, 2)",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mcreate_block_manager_from_blocks\u001b[1;34m(blocks, axes)\u001b[0m\n\u001b[0;32m 1653\u001b[0m blocks = [\n\u001b[1;32m-> 1654\u001b[1;33m \u001b[0mmake_block\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mblocks\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mslice\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxes\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1655\u001b[0m ]\n",
|
||||||
|
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\blocks.py\u001b[0m in \u001b[0;36mmake_block\u001b[1;34m(values, placement, klass, ndim, dtype)\u001b[0m\n\u001b[0;32m 3040\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3041\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mklass\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mndim\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mndim\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mplacement\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3042\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\blocks.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, values, placement, ndim)\u001b[0m\n\u001b[0;32m 2588\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2589\u001b[1;33m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mndim\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mndim\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mplacement\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mplacement\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2590\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\blocks.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, values, placement, ndim)\u001b[0m\n\u001b[0;32m 124\u001b[0m raise ValueError(\n\u001b[1;32m--> 125\u001b[1;33m \u001b[1;34mf\"Wrong number of items passed {len(self.values)}, \"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 126\u001b[0m \u001b[1;34mf\"placement implies {len(self.mgr_locs)}\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;31mValueError\u001b[0m: Wrong number of items passed 1, placement implies 2",
|
||||||
|
"\nDuring handling of the above exception, another exception occurred:\n",
|
||||||
|
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[1;32m<ipython-input-30-7cbd2e344177>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mwrite\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;31m# Transform to LU-file\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mtransform_lu\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mluis_results\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfile_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mwrite\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||||
|
"\u001b[1;32mc:\\Users\\tiwalz\\Documents\\Projects\\Strategy Innovation\\SpeechServices\\src\\luis_data_generator.py\u001b[0m in \u001b[0;36mtransform_lu\u001b[1;34m(zipped_list, lu_file, write)\u001b[0m\n\u001b[0;32m 162\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwarning\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Writing no output file, just display.'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 163\u001b[0m \u001b[0mcompare\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 164\u001b[1;33m \u001b[0mluis_file\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mzipped_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'intent'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'text'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'intent'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 165\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf'{lu_file}.lu'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'w'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 166\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[1;32min\u001b[0m 
\u001b[0mluis_file\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m 486\u001b[0m \u001b[0mmgr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marrays_to_mgr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 487\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 488\u001b[1;33m \u001b[0mmgr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minit_ndarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 489\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 490\u001b[0m \u001b[0mmgr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minit_dict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\construction.py\u001b[0m in \u001b[0;36minit_ndarray\u001b[1;34m(values, index, columns, dtype, copy)\u001b[0m\n\u001b[0;32m 208\u001b[0m \u001b[0mblock_values\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 209\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 210\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mcreate_block_manager_from_blocks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mblock_values\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 211\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 212\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mcreate_block_manager_from_blocks\u001b[1;34m(blocks, axes)\u001b[0m\n\u001b[0;32m 1662\u001b[0m \u001b[0mblocks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"values\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mb\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mblocks\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1663\u001b[0m \u001b[0mtot_items\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mb\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mblocks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1664\u001b[1;33m \u001b[0mconstruction_error\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtot_items\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mblocks\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1665\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1666\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\nlp\\lib\\site-packages\\pandas\\core\\internals\\managers.py\u001b[0m in \u001b[0;36mconstruction_error\u001b[1;34m(tot_items, block_shape, axes, e)\u001b[0m\n\u001b[0;32m 1692\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mblock_shape\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1693\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Empty data passed with indices specified.\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1694\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Shape of passed values is {passed}, indices imply {implied}\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1695\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1696\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[1;31mValueError\u001b[0m: Shape of passed values is (3996, 1), indices imply (3996, 2)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# File name of your target LU-file.\n",
|
||||||
|
"file_name = 'example_lu_file' \n",
|
||||||
|
"# Boolean to write to file, if false it will only show in the output.\n",
|
||||||
|
"write = True\n",
|
||||||
|
"# Transform to LU-file. Keep in mind, that you will need a list of tuples with intents, otherwise the function will throw an error.\n",
|
||||||
|
"transform_lu(luis_results, file_name, write=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python3",
|
||||||
|
"display_name": "Python 3.7.6 64-bit",
|
||||||
|
"metadata": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "0d92b4570cf170047a8c40549154a6dffe47dd8c5b7bd394f81eede6f5d748fa"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.6-final"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
|
@ -42,7 +42,7 @@ class LUISGenerator():
|
||||||
Returns:
|
Returns:
|
||||||
self.preprocessed_text: list of utterances with preprocessed entities.'''
|
self.preprocessed_text: list of utterances with preprocessed entities.'''
|
||||||
self.preprocessed_text = []
|
self.preprocessed_text = []
|
||||||
logging.info(f'[STATUS] - loaded {len(self.utterances)} rows.')
|
logging.info(f'[INFO] - loaded {len(self.utterances)} rows.')
|
||||||
# Extract all entities
|
# Extract all entities
|
||||||
for index, value in enumerate(self.utterances):
|
for index, value in enumerate(self.utterances):
|
||||||
orig = re.compile('\\{(.*?)\\}').findall(value)
|
orig = re.compile('\\{(.*?)\\}').findall(value)
|
||||||
|
@ -58,7 +58,7 @@ class LUISGenerator():
|
||||||
value = subs[i].join(value.rsplit(orig[i], 1))
|
value = subs[i].join(value.rsplit(orig[i], 1))
|
||||||
i -= 1
|
i -= 1
|
||||||
self.preprocessed_text.append(value)
|
self.preprocessed_text.append(value)
|
||||||
logging.info(f'[STATUS] - finished processing {len(self.utterances)} rows.')
|
logging.info(f'[INFO] - finished processing {len(self.utterances)} rows.')
|
||||||
return self.preprocessed_text
|
return self.preprocessed_text
|
||||||
|
|
||||||
    # List all possible entities
|
    # List all possible entities
|
||||||
|
@ -81,7 +81,7 @@ class LUISGenerator():
|
||||||
|
|
||||||
# Flatten List (as some rows have multiple entities) and drop duplicates from list
|
# Flatten List (as some rows have multiple entities) and drop duplicates from list
|
||||||
self.tags_flat = list(dict.fromkeys(sorted([item for item in [item for sublist in self.tags_per_row for item in sublist]])))
|
self.tags_flat = list(dict.fromkeys(sorted([item for item in [item for sublist in self.tags_per_row for item in sublist]])))
|
||||||
logging.info(f"[STATUS] - detected {len(self.tags_flat)} different entities")
|
logging.info(f"[INFO] - detected {len(self.tags_flat)} different entities")
|
||||||
return self.tags_per_row, self.tags_flat
|
return self.tags_per_row, self.tags_flat
|
||||||
|
|
||||||
# Prepare
|
# Prepare
|
||||||
|
@ -143,18 +143,23 @@ class LUISGenerator():
|
||||||
formatted = str(value).format(**self.return_values[index])
|
formatted = str(value).format(**self.return_values[index])
|
||||||
formatted = formatted.replace('&?', '{').replace('?&','}').replace('%2', '').replace('%3', '').replace('%4', '')
|
formatted = formatted.replace('&?', '{').replace('?&','}').replace('%2', '').replace('%3', '').replace('%4', '')
|
||||||
self.utterances_luis.append(formatted)
|
self.utterances_luis.append(formatted)
|
||||||
if self.intents == []:
|
if self.intents == None:
|
||||||
return self.utterances_filled, self.utterances_luis
|
return self.utterances_filled, self.utterances_luis
|
||||||
else:
|
else:
|
||||||
return zip(self.intents, self.utterances_filled), zip(self.intents, self.utterances_luis)
|
return zip(self.intents, self.utterances_filled), zip(self.intents, self.utterances_luis)
|
||||||
|
|
||||||
def transform_lu(zipped_list, lu_file="lu_file"):
|
def transform_lu(zipped_list, lu_file="lu_file", write=True):
|
||||||
'''Transforms zipped list (including intents and text) into lu-file. Drops exact duplicates as LUIS will not take them either way.
|
'''Transforms zipped list (including intents and text) into lu-file. Drops exact duplicates as LUIS will not take them either way.
|
||||||
Args:
|
Args:
|
||||||
zipped_list: zipped list of utterances, consisting of intent list and utterance list.
|
zipped_list: zipped list of utterances, consisting of intent list and utterance list.
|
||||||
lu_file: file name of your lu-file, no file ending necessary, default "lu_file"
|
lu_file: file name of your lu-file, no file ending necessary, default "lu_file"
|
||||||
|
write: boolean, whether lu should be written to a file, default True
|
||||||
Output:
|
Output:
|
||||||
Writes lu-file to your working folder'''
|
Writes lu-file to your working folder'''
|
||||||
|
if write:
|
||||||
|
logging.warning(f'Writing output to file "{lu_file}".')
|
||||||
|
else:
|
||||||
|
logging.warning('Writing no output file, just display.')
|
||||||
compare = ""
|
compare = ""
|
||||||
luis_file = pd.DataFrame(list(zipped_list), columns=['intent', 'text']).sort_values('intent').drop_duplicates('text')
|
luis_file = pd.DataFrame(list(zipped_list), columns=['intent', 'text']).sort_values('intent').drop_duplicates('text')
|
||||||
with open(f'{lu_file}.lu', 'w') as f:
|
with open(f'{lu_file}.lu', 'w') as f:
|
||||||
|
@ -162,15 +167,15 @@ def transform_lu(zipped_list, lu_file="lu_file"):
|
||||||
if compare != row['intent']:
|
if compare != row['intent']:
|
||||||
# Begin intent
|
# Begin intent
|
||||||
line = f"\n# {row['intent']}"
|
line = f"\n# {row['intent']}"
|
||||||
#print(line, file = f)
|
if write: print(line, file = f)
|
||||||
print(line)
|
print(line)
|
||||||
line = f"- {str(row['text'])}"
|
line = f"- {str(row['text'])}"
|
||||||
#print(line, file = f)
|
if write: print(line, file = f)
|
||||||
compare = row['intent']
|
compare = row['intent']
|
||||||
print(line)
|
print(line)
|
||||||
else:
|
else:
|
||||||
line = f"- {str(row['text'])}"
|
line = f"- {str(row['text'])}"
|
||||||
#print(line, file = f)
|
if write: print(line, file = f)
|
||||||
print(line)
|
print(line)
|
||||||
|
|
||||||
def main(utterances, values, intents):
|
def main(utterances, values, intents):
|
||||||
|
|
Загрузка…
Ссылка в новой задаче