requirements+inferencer fix
Parent: 5c101bdc93
Commit: 1f398c1c8d
@@ -103,6 +103,9 @@ The following section contains a list of possible new features or enhancements.
 ### Tests
 - [ ] unit tests (pytest)
 
+## Please note:
+- For training corpora larger than 10,000 documents, we recommend uploading them to the BLOB storage chunk-wise; otherwise the document processor function can become a bottleneck
+
 ## Contributing
 This project welcomes contributions and suggestions. Most contributions require you to agree to a
 Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
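The chunk-wise recommendation above maps directly onto pandas' chunked reader, which the notebook changes below build on. As a standalone illustration, a minimal sketch of the idea; the file name, chunk size, and output naming here are illustrative, not from this repo:

```python
import pandas as pd

# Stream the corpus in fixed-size pieces instead of loading it all at once;
# each piece arrives as an ordinary DataFrame that can be transformed and
# uploaded on its own, which keeps the downstream processor from choking.
for i, chunk in enumerate(pd.read_csv("corpus.tsv", sep="\t", encoding="utf-8", chunksize=5000)):
    chunk.to_json(f"corpus-chunk-{i}.json", orient="records", force_ascii=False)
```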
@@ -20,7 +20,8 @@
     "# Import packages\n",
     "import pandas as pd\n",
     "import uuid\n",
-    "import json"
+    "import json\n",
+    "import logging"
    ]
   },
   {
@@ -30,7 +31,8 @@
    "outputs": [],
    "source": [
     "# Adjust this to match your file format\n",
-    "df = pd.read_csv(\"file.csv\", sep=\";\", encoding=\"utf-8\")"
+    "fname = 'file.csv'\n",
+    "df = pd.read_csv(fname, sep=\"\\t\", encoding=\"utf-8\")"
    ]
   },
   {
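Worth noting about the change above: the file keeps its `.csv` name but is now parsed as tab-separated. A quick sanity check right after the cell, using the `df` it just loaded:

```python
# A TSV parsed with the wrong separator usually collapses into a single
# column, so verify the shape before transforming anything.
assert df.shape[1] > 1, "only one column parsed - check the sep argument"
print(df.dtypes)
```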
@@ -39,43 +41,51 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define the data transformer function\n",
-    "def transformJSON(df, language):\n",
-    "    with open(f\"output-{language}-train.json\", \"a\", encoding='utf-8') as file:\n",
+    "def transform_json(df, language, chunk=0):\n",
+    "    fname = f\"output-{language}-train-{chunk}.json\"\n",
+    "    with open(fname, \"w\", encoding='utf-8') as file:\n",
     "        file.write(\"[\")\n",
+    "    k = 0\n",
     "    for index, row in df.iterrows():\n",
-    "        fileid = uuid.uuid4().hex\n",
-    "        q_views = 0\n",
-    "        q_tags = row['label']\n",
-    "        url = \"https://aka.ms/nlp-demo\"\n",
+    "        fileid = row['id']\n",
+    "        q_views = row['views']\n",
+    "        q_tags = row['appliesTo']\n",
+    "        url = row['url']\n",
     "        lang = language\n",
     "\n",
     "        # PACK Q JSON\n",
     "        question = {}\n",
-    "        question['title'] = \"Text\"\n",
-    "        question['author'] = \"Author\"\n",
-    "        question['createdAt'] = \"01 January, 2020\"\n",
-    "        question['text'] = row['text']\n",
-    "        question['upvotes'] = 0\n",
+    "        question['title'] = row['question.title']\n",
+    "        question['author'] = row['question.author']\n",
+    "        question['createdAt'] = row['question.createdAt']\n",
+    "        question['text'] = row['question.text']\n",
+    "        question['upvotes'] = int(row['question.upvotes'])\n",
     "\n",
     "        # PACK A JSON\n",
     "        answer = {}\n",
-    "        answer['markedAsAnswer'] = \"false\"\n",
-    "        answer['createdAt'] = \"01 January, 2020\"\n",
-    "        answer['text'] = \"Text\"\n",
-    "        answer['upvotes'] = 0\n",
+    "        answer['markedAsAnswer'] = str(row['answer.markedAsAnswer'])\n",
+    "        answer['createdAt'] = row['answer.createdAt']\n",
+    "        answer['text'] = row['answer.text']\n",
+    "        answer['upvotes'] = int(row['answer.upvotes'])\n",
     "\n",
     "        # PACK JSON\n",
     "        data = {'question': question, 'id': fileid, 'views': q_views, 'appliesTo': q_tags, 'url': url, 'language': lang, 'answer': answer}\n",
-    "        content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii = False)\n",
+    "        content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii=False)\n",
     "\n",
     "        # WRITE TO JSON FILE\n",
-    "        with open(f\"output-{language}-train.json\", \"a\", encoding='utf-8') as file:\n",
-    "            if index == len(df) - 1:\n",
+    "        with open(fname, \"a\", encoding='utf-8') as file:\n",
+    "            if k == len(df) - 1:  # cannot use the DataFrame index here: the data is read chunk-wise, so the index is misleading\n",
     "                file.write(content + \"]\")\n",
     "            else:\n",
     "                file.write(content + \",\")\n",
-    "        print(f\"[SUCCESS] - File {fileid}\\n\")"
+    "        k = k + 1\n",
+    "    try:\n",
+    "        with open(fname) as f:\n",
+    "            json.load(f)\n",
+    "        logging.info(f'[INFO] - File {fname} is valid!')\n",
+    "    except Exception as e:\n",
+    "        logging.error(f'File {fname} seems to be invalid -> {e}.')\n",
+    "    logging.info(f\"[SUCCESS] - File {chunk} -> {k} / {len(df)}\")"
    ]
   },
   {
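A side note on the cell above: the output array is assembled by hand, writing `[`, comma-joined objects, and a closing `]`, which is exactly why the new post-hoc `json.load` validation is worth having. A minimal alternative sketch that cannot produce malformed output, collecting rows first and serializing once; the function name is illustrative and the fields are abbreviated, not the commit's code:

```python
import json

def transform_json_simple(df, language, chunk=0):
    # Build the full list in memory, then serialize once -> always valid JSON.
    records = []
    for _, row in df.iterrows():
        records.append({
            "question": {"title": row["question.title"], "text": row["question.text"]},
            "id": row["id"],
            "language": language,
        })
    with open(f"output-{language}-train-{chunk}.json", "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)
```

The streaming variant in the commit avoids holding a whole chunk's serialized output in memory, which is a reasonable trade-off at the 10,000+ document scale the README warns about.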
@@ -84,8 +94,110 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Initiate the transformation\n",
-    "transformJSON(df, \"en-us\")"
+    "# Initiate the transformation here if you want to transform the data set in one pass. Do this if your dataset has fewer than 10,000 documents.\n",
+    "transform_json(df, \"en-us\")"
    ]
-  }
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Chunk the data sets\n",
+    "## Please note:\n",
+    "- For training corpora larger than 10,000 documents, we recommend uploading them to the BLOB storage chunk-wise; otherwise the document processor function can become a bottleneck\n",
+    "    - If you have fewer than 10,000 documents, you can simply upload the file to the BLOB storage using the Azure Storage Explorer\n",
+    "- The following section helps you read a large file and split it into chunks\n",
+    "- Below is a script that uploads them one by one, pausing for ten minutes between files to take load off the pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_chunks(lang, language):\n",
+    "    print(f'[INFO] - Start reading data chunks for {lang}.')\n",
+    "    i = 0\n",
+    "    for chunk_df in pd.read_csv(f'data_{lang}.txt', sep=\"\\t\", encoding='utf-8', chunksize=5000):\n",
+    "        transform_json(chunk_df, language, i)\n",
+    "        i = i + 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_chunks(lang, language)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Copy to BLOB\n",
+    "- Upload all the files of the export folder to the BLOB storage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, ContentSettings\n",
+    "from datetime import datetime\n",
+    "import os\n",
+    "import time\n",
+    "import logging\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def copy_to_blob(local_file_path, blobstring, container):\n",
+    "    # Create a blob client using the local file name as the name for the blob\n",
+    "    logging.info('[INFO] - Initiating upload to BLOB storage.')\n",
+    "    blob_service_client = BlobServiceClient.from_connection_string(blobstring)\n",
+    "    logging.info('[INFO] - Built connection to BLOB storage.')\n",
+    "    for path, subdirs, files in os.walk(local_file_path):\n",
+    "        for name in files:\n",
+    "            try:\n",
+    "                path_full = os.path.join(path, name)\n",
+    "                path_blob = os.path.join(path, name).replace(local_file_path, \"\")\n",
+    "                logging.info(f'[UPLOAD - {datetime.now()}] - Uploading to Azure Storage as BLOB: {path_blob}.')\n",
+    "                blob_client = blob_service_client.get_blob_client(container=container, blob=path_blob)\n",
+    "                # Upload the created file\n",
+    "                with open(path_full, \"rb\") as data:\n",
+    "                    blob_client.upload_blob(data, content_settings=ContentSettings(content_type='application/json'))\n",
+    "                logging.info(f'[INFO - {datetime.now()}] - Upload completed, sleeping for 10 minutes ... zZz ...')\n",
+    "                time.sleep(600)\n",
+    "            except Exception as e:\n",
+    "                logging.error(f'[STATUS - {datetime.now()}] - Copy to BLOB failed -> {e}.')\n",
+    "    logging.info(f'[STATUS - {datetime.now()}] - Successfully uploaded to BLOB.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "blobstring = \"DefaultEndpointsProtocol=https;AccountName=###getyourblobstringhere###;AccountKey=###getyourkeyhere###;EndpointSuffix=core.windows.net\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "copy_to_blob(f\"export-{lang}/\", blobstring, \"data\")"
+   ]
+  }
 ],
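Note that the two call cells above reference `lang` and `language` without defining them anywhere in the commit. A minimal driver sketch with illustrative values, not from the repo:

```python
# Illustrative values only; the commit leaves these definitions to the user.
lang = "en"         # suffix used in data_{lang}.txt and export-{lang}/
language = "en-us"  # locale tag written into every JSON document

get_chunks(lang, language)                           # TSV -> output-{language}-train-{i}.json
copy_to_blob(f"export-{lang}/", blobstring, "data")  # upload everything under the export folder
```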
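For reference, each document that `transform_json` writes and `copy_to_blob` uploads has the shape below, reconstructed from the `data` dict in the transformer cell; all values are examples:

```python
# Shape of one uploaded training document; field names from the commit, values illustrative.
example_doc = {
    "question": {"title": "...", "author": "...", "createdAt": "...", "text": "...", "upvotes": 0},
    "id": "2f3c9a...",
    "views": 123,
    "appliesTo": "some-tag",
    "url": "https://example.com/q/1",
    "language": "en-us",
    "answer": {"markedAsAnswer": "False", "createdAt": "...", "text": "...", "upvotes": 1},
}
```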
@@ -4,7 +4,7 @@ numpy>=1.18.1
 pandas==1.0.5
 azure-cosmos==3.1.2
 azureml-sdk>=1.1.5
-azureml-dataprep[pandas,fuse]>=1.3.5
+azureml-dataprep[pandas,fuse]==2.0.7
 # mlflow>=1.6.0 #NOT NEEDED?
 # azureml-mlflow>=1.1.5 #NOT NEEDED?
 azure-ai-textanalytics==1.0.0b3 #TO BE UPGRADED -> 5.0.0
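The switch from `>=` to `==` pins azureml-dataprep exactly. A quick way to confirm the pins resolved as intended after installing, assuming the packages are present in the environment:

```python
# importlib.metadata is stdlib on Python 3.8+; raises PackageNotFoundError if a package is absent.
from importlib.metadata import version

for pkg in ("pandas", "azure-cosmos", "azureml-dataprep", "azure-ai-textanalytics"):
    print(pkg, version(pkg))
```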
@@ -35,5 +35,5 @@ flake8>=3.7.9
 ipykernel>=5.1.4
 streamlit==0.65
 tqdm
-torch==1.6.0+cpu
-torchvision==0.7.0+cpu
+torch==1.5.1+cpu
+torchvision==0.6.1+cpu
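One caveat on the `+cpu` pins: those builds are not published on PyPI itself, so pip needs to be pointed at the PyTorch wheel index. Assuming this repo installs via `pip -r`, one way is an extra find-links line in requirements.txt; this line is an assumed addition, not part of the commit:

```
# Assumed addition, not in this commit: lets pip resolve the +cpu builds above.
-f https://download.pytorch.org/whl/torch_stable.html
```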
src/infer.py (12 changed lines)
@@ -88,14 +88,18 @@ def run(req):
                     category = _labels,
                     score = [_ref[i] for i in _indices]
                 ))
+            else:
+                _cat = _temp[0].get('category')
+                result = _temp
         elif tm['params'].get('type') == 'classification':
             for r in result[0]['predictions']:
                 _temp.append(dict(
                     category = r.get('label'),
                     score = f"{r.get('probability'):.3}"
                 ))
-                _cat = _temp[0].get('category')
-                result = _temp
+            _cat = _temp[0].get('category')
+            result = _temp
         else:
             logger.warning(f'[INFO] - Not a FARM model -> {tm["params"].get("type")}')

         # Prepare output
         res.append({
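What the hunk above fixes, distilled: the later "Prepare output" step reads `_cat` and `result`, so every branch must define them, and they should be set once after the prediction loop rather than on each iteration. A hedged sketch of the pattern, with names simplified and the shape of `predictions` assumed from the hunk:

```python
def summarize_predictions(kind, predictions):
    # Collect per-prediction records first.
    _temp = []
    if kind == 'classification':
        for r in predictions:
            _temp.append(dict(
                category=r.get('label'),
                score=f"{r.get('probability'):.3}",
            ))
        # Set once, after the loop: assigning inside it (as the old code did)
        # re-read _temp[0] on every iteration for no benefit.
        _cat = _temp[0].get('category')
        result = _temp
    else:
        # Without a fallback branch, the names below stay undefined -> NameError.
        _cat, result = None, predictions
    return _cat, result
```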
@@ -110,6 +114,6 @@ if __name__ == '__main__':
     #NOTE: FOR TESTING ONLY
     init()
     print(run(json.dumps([{"subject":"My pc won't start",
-                "body":"When I try booting, a red light goes on and then nothing happens",
+                "body":"When I try booting, a red light goes on and then nothing happens, Bill Gates should help...",
                 "attachment":""
                 }])))