requirements+inferencer fix
This commit is contained in:
Parent
5c101bdc93
Commit
1f398c1c8d
@@ -103,6 +103,9 @@ The following section contains a list of possible new features or enhancements.
 ### Tests
 
 - [ ] unit tests (pytest)
 
+## Please note:
+
+- For training data corpora larger than 10,000 documents, we recommend uploading them chunk-wise to the BLOB storage; otherwise the document processor function may become a bottleneck
 
 ## Contributing
 
 This project welcomes contributions and suggestions. Most contributions require you to agree to a
 Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
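The chunk-wise handling recommended in the note above can be sketched roughly as follows; the file name, separator, and chunk size here are assumptions for illustration, and the notebook changes further down implement the same idea via get_chunks() and transform_json():

# Minimal sketch: read a large corpus in chunks and write one JSON file per chunk
import pandas as pd

for i, chunk in enumerate(pd.read_csv("data_en.txt", sep="\t", encoding="utf-8", chunksize=5000)):
    # each chunk is a regular DataFrame; write it out as its own records-oriented JSON file
    chunk.to_json(f"output-en-train-{i}.json", orient="records", force_ascii=False)
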
@@ -20,7 +20,8 @@
     "# Import packages\n",
     "import pandas as pd\n",
     "import uuid\n",
-    "import json"
+    "import json\n",
+    "import logging"
    ]
   },
   {
@@ -30,7 +31,8 @@
    "outputs": [],
    "source": [
     "# Change this to your format respectively\n",
-    "df = pd.read_csv(\"file.csv\", sep=\";\", encoding=\"utf-8\")"
+    "fname = 'file.csv'\n",
+    "df = pd.read_csv(fname, sep=\"\\t\", encoding=\"utf-8\")"
    ]
   },
   {
@@ -39,43 +41,51 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Define the data transformer function\n",
-    "def transformJSON(df, language):\n",
-    "    with open(f\"output-{language}-train.json\", \"a\", encoding='utf-8') as file:\n",
+    "def transform_json(df, language, chunk=0):\n",
+    "    fname = f\"output-{language}-train-{chunk}.json\"\n",
+    "    with open(fname, \"w\", encoding='utf-8') as file:\n",
     "        file.write(\"[\")\n",
+    "    k = 0\n",
     "    for index, row in df.iterrows():\n",
-    "        fileid = uuid.uuid4().hex\n",
-    "        q_views = 0\n",
-    "        q_tags = row['label']\n",
-    "        url = \"https://aka.ms/nlp-demo\"\n",
+    "        fileid = row['id']\n",
+    "        q_views = row['views']\n",
+    "        q_tags = row['appliesTo']\n",
+    "        url = row['url']\n",
     "        lang = language\n",
     "\n",
     "        # PACK Q JSON\n",
     "        question = {}\n",
-    "        question['title'] = \"Text\"\n",
-    "        question['author'] = \"Author\"\n",
-    "        question['createdAt'] = \"01 January, 2020\"\n",
-    "        question['text'] = row['text']\n",
-    "        question['upvotes'] = 0\n",
+    "        question['title'] = row['question.title']\n",
+    "        question['author'] = row['question.author']\n",
+    "        question['createdAt'] = row['question.createdAt']\n",
+    "        question['text'] = row['question.text']\n",
+    "        question['upvotes'] = int(row['question.upvotes'])\n",
     "\n",
     "        # PACK A JSON\n",
     "        answer = {}\n",
-    "        answer['markedAsAnswer'] = \"false\"\n",
-    "        answer['createdAt'] = \"01 January, 2020\"\n",
-    "        answer['text'] = \"Text\"\n",
-    "        answer['upvotes'] = 0\n",
+    "        answer['markedAsAnswer'] = str(row['answer.markedAsAnswer'])\n",
+    "        answer['createdAt'] = row['answer.createdAt']\n",
+    "        answer['text'] = row['answer.text']\n",
+    "        answer['upvotes'] = int(row['answer.upvotes'])\n",
     "\n",
     "        # PACK JSON\n",
     "        data = {'question': question, 'id': fileid, 'views': q_views, 'appliesTo': q_tags, 'url': url, 'language': lang, 'answer': answer}\n",
     "        content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii=False)\n",
     "\n",
     "        # WRITE TO JSON FILE\n",
-    "        with open(f\"output-{language}-train.json\", \"a\", encoding='utf-8') as file:\n",
-    "            if index == len(df) - 1:\n",
+    "        with open(fname, \"a\", encoding='utf-8') as file:\n",
+    "            if k == len(df) - 1:  # cannot take index as it is read chunk-wise and therefore misleading\n",
     "                file.write(content + \"]\")\n",
     "            else:\n",
     "                file.write(content + \",\")\n",
-    "        print(f\"[SUCCESS] - File {fileid}\\n\")"
+    "        k = k + 1\n",
+    "    try:\n",
+    "        with open(fname) as f:\n",
+    "            json.load(f)\n",
+    "        logging.info(f'[INFO] - File {fname} is valid!')\n",
+    "    except Exception as e:\n",
+    "        logging.error(f'File {fname} seems to be invalid -> {e}.')\n",
+    "    logging.info(f\"[SUCCESS] - File {chunk} -> {k} / {len(df)}\")"
    ]
   },
   {
@@ -84,8 +94,110 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Initiate the transformation\n",
-    "transformJSON(df, \"en-us\")"
+    "# Initiate the transformation. Do this if you have fewer than 10,000 documents in your dataset\n",
+    "transform_json(df, \"en-us\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Chunk the data sets\n",
+    "## Please note:\n",
+    "- For training data corpora larger than 10,000 documents, we recommend uploading them chunk-wise to the BLOB storage; otherwise the document processor function may become a bottleneck\n",
+    "  - If you have fewer than 10,000 documents, you may simply upload the file to the BLOB storage using the Azure Storage Explorer\n",
+    "- The following section helps you read a large file and split it into chunks\n",
+    "- Below is a script that uploads the chunks one by one, pausing for ten minutes between uploads to avoid overloading the pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_chunks(lang, language):\n",
+    "    print(f'[INFO] - Start reading data chunks for {lang}.')\n",
+    "    i = 0\n",
+    "    for _ in pd.read_csv(f'data_{lang}.txt', sep=\"\\t\", encoding='utf-8', chunksize=5000):\n",
+    "        transform_json(_, language, i)\n",
+    "        i = i + 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_chunks(lang, language)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Copy to BLOB\n",
+    "- Upload all the files of the export folder to the BLOB storage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, ContentSettings\n",
+    "from datetime import datetime\n",
+    "import os\n",
+    "import time\n",
+    "import logging\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def copy_to_blob(local_file_path, blobstring, container):\n",
+    "    # Create a blob client using the local file name as the name for the blob\n",
+    "    logging.info(f'[INFO] - Initiating upload to BLOB-storage.')\n",
+    "    blob_service_client = BlobServiceClient.from_connection_string(blobstring)\n",
+    "    logging.info(f'[INFO] - Built connection to BLOB storage.')\n",
+    "    for path, subdirs, files in os.walk(local_file_path):\n",
+    "        for name in files:\n",
+    "            try:\n",
+    "                path_full = os.path.join(path, name)\n",
+    "                path_blob = os.path.join(path, name).replace(local_file_path, \"\")\n",
+    "                logging.info(f'[UPLOAD - {datetime.now()}] - Uploading to Azure Storage as BLOB: {path_blob}.')\n",
+    "                blob_client = blob_service_client.get_blob_client(container=container, blob=path_blob)\n",
+    "                # Upload the created file\n",
+    "                with open(path_full, \"rb\") as data:\n",
+    "                    blob_client.upload_blob(data, content_settings=ContentSettings(content_type='application/json'))\n",
+    "                logging.info(f'[INFO - {datetime.now()}] - Upload completed, sleeping for 10 minutes ... zZz ...')\n",
+    "                time.sleep(600)\n",
+    "            except Exception as e:\n",
+    "                logging.error(f'[STATUS - {datetime.now()}] - Copy to BLOB failed -> {e}.')\n",
+    "    logging.info(f'[STATUS - {datetime.now()}] - Successfully uploaded to BLOB.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "blobstring = \"DefaultEndpointsProtocol=https;AccountName=###getyourblobstringhere###;AccountKey=###getyourkeyhere###;EndpointSuffix=core.windows.net\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "copy_to_blob(f\"export-{lang}/\", blobstring, \"data\")"
    ]
   }
 ],
@@ -4,7 +4,7 @@ numpy>=1.18.1
 pandas==1.0.5
 azure-cosmos==3.1.2
 azureml-sdk>=1.1.5
-azureml-dataprep[pandas,fuse]>=1.3.5
+azureml-dataprep[pandas,fuse]==2.0.7
 # mlflow>=1.6.0 #NOT NEEDED?
 # azureml-mlflow>=1.1.5 #NOT NEEDED?
 azure-ai-textanalytics==1.0.0b3 #TO BE UPGRADED -> 5.0.0
@@ -35,5 +35,5 @@ flake8>=3.7.9
 ipykernel>=5.1.4
 streamlit==0.65
 tqdm
-torch==1.6.0+cpu
-torchvision==0.7.0+cpu
+torch==1.5.1+cpu
+torchvision==0.6.1+cpu
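A note on the pinned +cpu wheels above: packages tagged with a +cpu local version are served from PyTorch's wheel index rather than plain PyPI, so pip usually needs a find-links hint to resolve them. A minimal sketch of how the relevant requirements lines are often written (the find-links URL is PyTorch's public wheel index; this repository's requirements file is not shown to contain it):

# Tell pip where to find the +cpu builds, then pin them as usual
-f https://download.pytorch.org/whl/torch_stable.html
torch==1.5.1+cpu
torchvision==0.6.1+cpu
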
@@ -88,7 +88,9 @@ def run(req):
             category = _labels,
             score = [_ref[i] for i in _indices]
         ))
-    else:
+        _cat = _temp[0].get('category')
+        result = _temp
+    elif tm['params'].get('type') == 'classification':
         for r in result[0]['predictions']:
             _temp.append(dict(
                 category = r.get('label'),
@@ -96,6 +98,8 @@ def run(req):
             ))
         _cat = _temp[0].get('category')
         result = _temp
+    else:
+        logger.warning(f'[INFO] - Not a FARM model -> {tm["params"].get("type")}')
 
     # Prepare output
     res.append({
@@ -110,6 +114,6 @@ if __name__ == '__main__':
     #NOTE: FOR TESTING ONLY
     init()
     print(run(json.dumps([{"subject":"My pc won't start",
-        "body":"When I try booting, a red light goes on and then nothing happens",
+        "body":"When I try booting, a red light goes on and then nothing happens, Bill Gates should help...",
         "attachment":""
     }])))