nonstoptimm 2020-09-03 16:16:37 +02:00
Parent 5c101bdc93
Commit 1f398c1c8d
4 changed files with 150 additions and 31 deletions

View file

@@ -103,6 +103,9 @@ The following section contains a list of possible new features or enhancements.
 ### Tests
 - [ ] unit tests (pytest)
+## Please note:
+- For training data corpora larger than 10,000 documents, we recommend uploading them to the BLOB storage in chunks; otherwise the document processor function may become a bottleneck
 ## Contributing
 This project welcomes contributions and suggestions. Most contributions require you to agree to a
 Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
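The note added above recommends chunk-wise uploads for large corpora. A minimal, illustrative sketch of reading such a corpus in chunks is shown below; the file name "data_en.txt", the tab separator, and the 5,000-row chunk size are assumptions, and the notebook changed in this commit implements the full transformation.

import pandas as pd

# Iterate over a large export in 5,000-row chunks instead of loading it all at once;
# replace the print with the actual transformation / upload step.
for i, chunk in enumerate(pd.read_csv("data_en.txt", sep="\t", encoding="utf-8", chunksize=5000)):
    print(f"chunk {i}: {len(chunk)} rows")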

View file

@@ -20,7 +20,8 @@
     "# Import packages\n",
     "import pandas as pd\n",
     "import uuid\n",
-    "import json"
+    "import json\n",
+    "import logging"
    ]
   },
   {
@@ -30,7 +31,8 @@
    "outputs": [],
    "source": [
     "# Change this to your format respectively\n",
-    "df = pd.read_csv(\"file.csv\", sep=\";\", encoding=\"utf-8\")"
+    "fname = 'file.csv'\n",
+    "df = pd.read_csv(fname, sep=\"\\t\", encoding=\"utf-8\")"
    ]
   },
   {
@@ -39,43 +41,51 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Define the data transformer function\n",
-    "def transformJSON(df, language):\n",
-    "    with open(f\"output-{language}-train.json\", \"a\", encoding='utf-8') as file:\n",
+    "def transform_json(df, language, chunk=0):\n",
+    "    fname = f\"output-{language}-train-{chunk}.json\"\n",
+    "    with open(fname, \"w\", encoding='utf-8') as file:\n",
     "        file.write(\"[\")\n",
+    "    k = 0\n",
     "    for index, row in df.iterrows():\n",
-    "        fileid = uuid.uuid4().hex\n",
-    "        q_views = 0\n",
-    "        q_tags = row['label']\n",
-    "        url = \"https://aka.ms/nlp-demo\"\n",
+    "        fileid = row['id']\n",
+    "        q_views = row['views']\n",
+    "        q_tags = row['appliesTo']\n",
+    "        url = row['url']\n",
     "        lang = language\n",
     "\n",
     "        # PACK Q JSON\n",
     "        question = {}\n",
-    "        question['title'] = \"Text\"\n",
-    "        question['author'] = \"Author\"\n",
-    "        question['createdAt'] = \"01 January, 2020\"\n",
-    "        question['text'] = row['text']\n",
-    "        question['upvotes'] = 0\n",
+    "        question['title'] = row['question.title']\n",
+    "        question['author'] = row['question.author']\n",
+    "        question['createdAt'] = row['question.createdAt']\n",
+    "        question['text'] = row['question.text']\n",
+    "        question['upvotes'] = int(row['question.upvotes'])\n",
     "\n",
     "        # PACK A JSON\n",
     "        answer = {}\n",
-    "        answer['markedAsAnswer'] = \"false\"\n",
-    "        answer['createdAt'] = \"01 January, 2020\"\n",
-    "        answer['text'] = \"Text\"\n",
-    "        answer['upvotes'] = 0\n",
+    "        answer['markedAsAnswer'] = str(row['answer.markedAsAnswer'])\n",
+    "        answer['createdAt'] = row['answer.createdAt']\n",
+    "        answer['text'] = row['answer.text']\n",
+    "        answer['upvotes'] = int(row['answer.upvotes'])\n",
     "\n",
     "        # PACK JSON\n",
     "        data = {'question': question, 'id': fileid, 'views': q_views, 'appliesTo': q_tags, 'url': url, 'language': lang, 'answer': answer}\n",
-    "        content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii = False)\n",
+    "        content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii=False)\n",
     "\n",
     "        # WRITE TO JSON FILE\n",
-    "        with open(f\"output-{language}-train.json\", \"a\", encoding='utf-8') as file:\n",
-    "            if index == len(df) - 1:\n",
+    "        with open(fname, \"a\", encoding='utf-8') as file:\n",
+    "            if k == len(df) - 1:  # cannot take index as it is read chunk-wise and therefore misleading\n",
     "                file.write(content + \"]\")\n",
     "            else:\n",
     "                file.write(content + \",\")\n",
-    "        print(f\"[SUCCESS] - File {fileid}\\n\")"
+    "        k = k + 1\n",
+    "    try:\n",
+    "        with open(fname) as f:\n",
+    "            json.load(f)\n",
+    "        logging.info(f'[INFO] - File {fname} is valid!')\n",
+    "    except Exception as e:\n",
+    "        logging.error(f'File {fname} seems to be invalid -> {e}.')\n",
+    "    logging.info(f\"[SUCCESS] - File {chunk} -> {k} / {len(df)}\")"
    ]
   },
   {
@@ -84,8 +94,110 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Initiate the transformation\n",
-    "transformJSON(df, \"en-us\")"
+    "# Initiate the transformation. Do this only if you have fewer than 10,000 documents in your dataset\n",
+    "transform_json(df, \"en-us\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Chunk the data sets\n",
+    "## Please note:\n",
+    "- For training data corpora larger than 10,000 documents, we recommend uploading them to the BLOB storage in chunks; otherwise the document processor function may become a bottleneck\n",
+    "  - If you have fewer than 10,000 documents, you may go ahead and simply upload the file to the BLOB storage using the Azure Storage Explorer\n",
+    "- The following section helps you read a large file and split it into chunks\n",
+    "- Below, there is a script to upload the chunks one by one, pausing for ten minutes between files to unload the pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_chunks(lang, language):\n",
+    "    print(f'[INFO] - Start reading data chunks for {lang}.')\n",
+    "    i = 0\n",
+    "    for chunk_df in pd.read_csv(f'data_{lang}.txt', sep=\"\\t\", encoding='utf-8', chunksize=5000):\n",
+    "        transform_json(chunk_df, language, i)\n",
+    "        i = i + 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_chunks(lang, language)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Copy to BLOB\n",
+    "- Upload all the files of the export folder to the BLOB storage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, ContentSettings\n",
+    "from datetime import datetime\n",
+    "import os\n",
+    "import time\n",
+    "import logging\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def copy_to_blob(local_file_path, blobstring, container):\n",
+    "    # Create a blob client using the local file name as the name for the blob\n",
+    "    logging.info('[INFO] - Initiating upload to BLOB storage.')\n",
+    "    blob_service_client = BlobServiceClient.from_connection_string(blobstring)\n",
+    "    logging.info('[INFO] - Built connection to BLOB storage.')\n",
+    "    for path, subdirs, files in os.walk(local_file_path):\n",
+    "        for name in files:\n",
+    "            try:\n",
+    "                path_full = os.path.join(path, name)\n",
+    "                path_blob = os.path.join(path, name).replace(local_file_path, \"\")\n",
+    "                logging.info(f'[UPLOAD - {datetime.now()}] - Uploading to Azure Storage as BLOB: {path_blob}.')\n",
+    "                blob_client = blob_service_client.get_blob_client(container=container, blob=path_blob)\n",
+    "                # Upload the created file\n",
+    "                with open(path_full, \"rb\") as data:\n",
+    "                    blob_client.upload_blob(data, content_settings=ContentSettings(content_type='application/json'))\n",
+    "                logging.info(f'[INFO - {datetime.now()}] - Upload completed, sleeping for 10 minutes ... zZz ...')\n",
+    "                time.sleep(600)\n",
+    "            except Exception as e:\n",
+    "                logging.error(f'[STATUS - {datetime.now()}] - Copy to BLOB failed -> {e}.')\n",
+    "    logging.info(f'[STATUS - {datetime.now()}] - Successfully uploaded to BLOB.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "blobstring = \"DefaultEndpointsProtocol=https;AccountName=###getyourblobstringhere###;AccountKey=###getyourkeyhere###;EndpointSuffix=core.windows.net\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "copy_to_blob(f\"export-{lang}/\", blobstring, \"data\")"
+   ]
+  }
  ],
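Putting the new notebook cells together, a typical end-to-end run of the chunked export might look like the sketch below. The values of lang, language and the connection string are placeholders you have to fill in yourself; also note that transform_json writes its JSON files to the working directory, so make sure they end up in the folder you later pass to copy_to_blob.

lang = "en"          # assumed short code used in the data_{lang}.txt file name
language = "en-us"   # assumed locale written into the JSON documents

# 1. Read the large export in 5,000-row chunks and transform each chunk into a JSON file
get_chunks(lang, language)

# 2. Upload the generated JSON files to the "data" container, one by one
blobstring = "DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...;EndpointSuffix=core.windows.net"
copy_to_blob(f"export-{lang}/", blobstring, "data")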

View file

@@ -4,7 +4,7 @@ numpy>=1.18.1
 pandas==1.0.5
 azure-cosmos==3.1.2
 azureml-sdk>=1.1.5
-azureml-dataprep[pandas,fuse]>=1.3.5
+azureml-dataprep[pandas,fuse]==2.0.7
 # mlflow>=1.6.0 #NOT NEEDED?
 # azureml-mlflow>=1.1.5 #NOT NEEDED?
 azure-ai-textanalytics==1.0.0b3 #TO BE UPGRADED -> 5.0.0
@@ -35,5 +35,5 @@ flake8>=3.7.9
 ipykernel>=5.1.4
 streamlit==0.65
 tqdm
-torch==1.6.0+cpu
-torchvision==0.7.0+cpu
+torch==1.5.1+cpu
+torchvision==0.6.1+cpu

View file

@@ -88,14 +88,18 @@ def run(req):
                     category = _labels,
                     score = [_ref[i] for i in _indices]
                 ))
-        else:
+            _cat = _temp[0].get('category')
+            result = _temp
+        elif tm['params'].get('type') == 'classification':
             for r in result[0]['predictions']:
                 _temp.append(dict(
                     category = r.get('label'),
                     score = f"{r.get('probability'):.3}"
                 ))
             _cat = _temp[0].get('category')
             result = _temp
+        else:
+            logger.warning(f'[INFO] - Not a FARM model -> {tm["params"].get("type")}')
         # Prepare output
         res.append({
@@ -110,6 +114,6 @@ if __name__ == '__main__':
     #NOTE: FOR TESTING ONLY
     init()
     print(run(json.dumps([{"subject":"My pc won't start",
-                           "body":"When I try booting, a red light goes on and then nothing happens",
+                           "body":"When I try booting, a red light goes on and then nothing happens, Bill Gates should help...",
                            "attachment":""
                            }])))
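For reference, the classification branch added above reshapes FARM-style prediction output as sketched here. This is a standalone illustration with dummy predictions (the label/probability shape is assumed from the diff); in the score script the same loop runs inside run().

# Dummy FARM-style classification output
result = [{"predictions": [
    {"label": "hardware", "probability": 0.91},
    {"label": "software", "probability": 0.09},
]}]

_temp = []
for r in result[0]['predictions']:
    _temp.append(dict(
        category = r.get('label'),
        score = f"{r.get('probability'):.3}"
    ))
_cat = _temp[0].get('category')  # top-ranked category
result = _temp
print(_cat, result)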