requirements+inferencer fix
Parent: 5c101bdc93
Commit: 1f398c1c8d
@@ -103,6 +103,9 @@ The following section contains a list of possible new features or enhancements.
 ### Tests
 - [ ] unit tests (pytest)
 
+## Please note:
+- For training corpora larger than 10,000 documents, we recommend uploading them to the BLOB storage chunk-wise; otherwise the document processor function can become a bottleneck
+
 ## Contributing
 This project welcomes contributions and suggestions. Most contributions require you to agree to a
 Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
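The chunk-wise recommendation above maps directly onto pandas' chunked reader, which the notebook changes below build on. As a standalone illustration, a minimal sketch of the idea; the file name, chunk size, and output naming here are illustrative, not from this repo:

```python
import pandas as pd

# Stream the corpus in fixed-size pieces instead of loading it all at once;
# each piece arrives as an ordinary DataFrame that can be transformed and
# uploaded on its own, which keeps the downstream processor from choking.
for i, chunk in enumerate(pd.read_csv("corpus.tsv", sep="\t", encoding="utf-8", chunksize=5000)):
    chunk.to_json(f"corpus-chunk-{i}.json", orient="records", force_ascii=False)
```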
@@ -20,7 +20,8 @@
     "# Import packages\n",
     "import pandas as pd\n",
     "import uuid\n",
-    "import json"
+    "import json\n",
+    "import logging"
    ]
   },
   {
@@ -30,7 +31,8 @@
    "outputs": [],
    "source": [
     "# Adjust this to match your file format\n",
-    "df = pd.read_csv(\"file.csv\", sep=\";\", encoding=\"utf-8\")"
+    "fname = 'file.csv'\n",
+    "df = pd.read_csv(fname, sep=\"\\t\", encoding=\"utf-8\")"
    ]
   },
   {
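Worth noting about the change above: the file keeps its `.csv` name but is now parsed as tab-separated. A quick sanity check right after the cell, using the `df` it just loaded:

```python
# A TSV parsed with the wrong separator usually collapses into a single
# column, so verify the shape before transforming anything.
assert df.shape[1] > 1, "only one column parsed - check the sep argument"
print(df.dtypes)
```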
@@ -39,43 +41,51 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define the data transformer function\n",
-    "def transformJSON(df, language):\n",
-    "    with open(f\"output-{language}-train.json\", \"a\", encoding='utf-8') as file:\n",
+    "def transform_json(df, language, chunk=0):\n",
+    "    fname = f\"output-{language}-train-{chunk}.json\"\n",
+    "    with open(fname, \"w\", encoding='utf-8') as file:\n",
     "        file.write(\"[\")\n",
+    "    k = 0\n",
     "    for index, row in df.iterrows():\n",
-    "        fileid = uuid.uuid4().hex\n",
-    "        q_views = 0\n",
-    "        q_tags = row['label']\n",
-    "        url = \"https://aka.ms/nlp-demo\"\n",
+    "        fileid = row['id']\n",
+    "        q_views = row['views']\n",
+    "        q_tags = row['appliesTo']\n",
+    "        url = row['url']\n",
     "        lang = language\n",
     "\n",
     "        # PACK Q JSON\n",
     "        question = {}\n",
-    "        question['title'] = \"Text\"\n",
-    "        question['author'] = \"Author\"\n",
-    "        question['createdAt'] = \"01 January, 2020\"\n",
-    "        question['text'] = row['text']\n",
-    "        question['upvotes'] = 0\n",
+    "        question['title'] = row['question.title']\n",
+    "        question['author'] = row['question.author']\n",
+    "        question['createdAt'] = row['question.createdAt']\n",
+    "        question['text'] = row['question.text']\n",
+    "        question['upvotes'] = int(row['question.upvotes'])\n",
     "\n",
     "        # PACK A JSON\n",
     "        answer = {}\n",
-    "        answer['markedAsAnswer'] = \"false\"\n",
-    "        answer['createdAt'] = \"01 January, 2020\"\n",
-    "        answer['text'] = \"Text\"\n",
-    "        answer['upvotes'] = 0\n",
+    "        answer['markedAsAnswer'] = str(row['answer.markedAsAnswer'])\n",
+    "        answer['createdAt'] = row['answer.createdAt']\n",
+    "        answer['text'] = row['answer.text']\n",
+    "        answer['upvotes'] = int(row['answer.upvotes'])\n",
     "\n",
     "        # PACK JSON\n",
     "        data = {'question': question, 'id': fileid, 'views': q_views, 'appliesTo': q_tags, 'url': url, 'language': lang, 'answer': answer}\n",
-    "        content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii = False)\n",
+    "        content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii=False)\n",
     "\n",
     "        # WRITE TO JSON FILE\n",
-    "        with open(f\"output-{language}-train.json\", \"a\", encoding='utf-8') as file:\n",
-    "            if index == len(df) - 1:\n",
+    "        with open(fname, \"a\", encoding='utf-8') as file:\n",
+    "            if k == len(df) - 1:  # cannot use the DataFrame index here: the data is read chunk-wise, so the index is misleading\n",
     "                file.write(content + \"]\")\n",
     "            else:\n",
     "                file.write(content + \",\")\n",
-    "        print(f\"[SUCCESS] - File {fileid}\\n\")"
+    "        k = k + 1\n",
+    "    try:\n",
+    "        with open(fname) as f:\n",
+    "            json.load(f)\n",
+    "        logging.info(f'[INFO] - File {fname} is valid!')\n",
+    "    except Exception as e:\n",
+    "        logging.error(f'File {fname} seems to be invalid -> {e}.')\n",
+    "    logging.info(f\"[SUCCESS] - File {chunk} -> {k} / {len(df)}\")"
    ]
   },
   {
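A side note on the cell above: the output array is assembled by hand, writing `[`, comma-joined objects, and a closing `]`, which is exactly why the new post-hoc `json.load` validation is worth having. A minimal alternative sketch that cannot produce malformed output, collecting rows first and serializing once; the function name is illustrative and the fields are abbreviated, not the commit's code:

```python
import json

def transform_json_simple(df, language, chunk=0):
    # Build the full list in memory, then serialize once -> always valid JSON.
    records = []
    for _, row in df.iterrows():
        records.append({
            "question": {"title": row["question.title"], "text": row["question.text"]},
            "id": row["id"],
            "language": language,
        })
    with open(f"output-{language}-train-{chunk}.json", "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)
```

The streaming variant in the commit avoids holding a whole chunk's serialized output in memory, which is a reasonable trade-off at the 10,000+ document scale the README warns about.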
@@ -84,8 +94,110 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Initiate the transformation\n",
-    "transformJSON(df, \"en-us\")"
+    "# Initiate the transformation here if you want to transform the data set in one pass. Do this if your dataset has fewer than 10,000 documents.\n",
+    "transform_json(df, \"en-us\")"
    ]
-  }
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Chunk the data sets\n",
+    "## Please note:\n",
+    "- For training corpora larger than 10,000 documents, we recommend uploading them to the BLOB storage chunk-wise; otherwise the document processor function can become a bottleneck\n",
+    "    - If you have fewer than 10,000 documents, you can simply upload the file to the BLOB storage using the Azure Storage Explorer\n",
+    "- The following section helps you read a large file and split it into chunks\n",
+    "- Below is a script that uploads them one by one, pausing for ten minutes between files to take load off the pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_chunks(lang, language):\n",
+    "    print(f'[INFO] - Start reading data chunks for {lang}.')\n",
+    "    i = 0\n",
+    "    for chunk_df in pd.read_csv(f'data_{lang}.txt', sep=\"\\t\", encoding='utf-8', chunksize=5000):\n",
+    "        transform_json(chunk_df, language, i)\n",
+    "        i = i + 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_chunks(lang, language)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Copy to BLOB\n",
+    "- Upload all the files of the export folder to the BLOB storage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, ContentSettings\n",
+    "from datetime import datetime\n",
+    "import os\n",
+    "import time\n",
+    "import logging\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def copy_to_blob(local_file_path, blobstring, container):\n",
+    "    # Create a blob client using the local file name as the name for the blob\n",
+    "    logging.info('[INFO] - Initiating upload to BLOB storage.')\n",
+    "    blob_service_client = BlobServiceClient.from_connection_string(blobstring)\n",
+    "    logging.info('[INFO] - Built connection to BLOB storage.')\n",
+    "    for path, subdirs, files in os.walk(local_file_path):\n",
+    "        for name in files:\n",
+    "            try:\n",
+    "                path_full = os.path.join(path, name)\n",
+    "                path_blob = os.path.join(path, name).replace(local_file_path, \"\")\n",
+    "                logging.info(f'[UPLOAD - {datetime.now()}] - Uploading to Azure Storage as BLOB: {path_blob}.')\n",
+    "                blob_client = blob_service_client.get_blob_client(container=container, blob=path_blob)\n",
+    "                # Upload the created file\n",
+    "                with open(path_full, \"rb\") as data:\n",
+    "                    blob_client.upload_blob(data, content_settings=ContentSettings(content_type='application/json'))\n",
+    "                logging.info(f'[INFO - {datetime.now()}] - Upload completed, sleeping for 10 minutes ... zZz ...')\n",
+    "                time.sleep(600)\n",
+    "            except Exception as e:\n",
+    "                logging.error(f'[STATUS - {datetime.now()}] - Copy to BLOB failed -> {e}.')\n",
+    "    logging.info(f'[STATUS - {datetime.now()}] - Successfully uploaded to BLOB.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "blobstring = \"DefaultEndpointsProtocol=https;AccountName=###getyourblobstringhere###;AccountKey=###getyourkeyhere###;EndpointSuffix=core.windows.net\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "copy_to_blob(f\"export-{lang}/\", blobstring, \"data\")"
+   ]
+  }
 ],
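Note that the two call cells above reference `lang` and `language` without defining them anywhere in the commit. A minimal driver sketch with illustrative values, not from the repo:

```python
# Illustrative values only; the commit leaves these definitions to the user.
lang = "en"         # suffix used in data_{lang}.txt and export-{lang}/
language = "en-us"  # locale tag written into every JSON document

get_chunks(lang, language)                           # TSV -> output-{language}-train-{i}.json
copy_to_blob(f"export-{lang}/", blobstring, "data")  # upload everything under the export folder
```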
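For reference, each document that `transform_json` writes and `copy_to_blob` uploads has the shape below, reconstructed from the `data` dict in the transformer cell; all values are examples:

```python
# Shape of one uploaded training document; field names from the commit, values illustrative.
example_doc = {
    "question": {"title": "...", "author": "...", "createdAt": "...", "text": "...", "upvotes": 0},
    "id": "2f3c9a...",
    "views": 123,
    "appliesTo": "some-tag",
    "url": "https://example.com/q/1",
    "language": "en-us",
    "answer": {"markedAsAnswer": "False", "createdAt": "...", "text": "...", "upvotes": 1},
}
```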
@@ -4,7 +4,7 @@ numpy>=1.18.1
 pandas==1.0.5
 azure-cosmos==3.1.2
 azureml-sdk>=1.1.5
-azureml-dataprep[pandas,fuse]>=1.3.5
+azureml-dataprep[pandas,fuse]==2.0.7
 # mlflow>=1.6.0 #NOT NEEDED?
 # azureml-mlflow>=1.1.5 #NOT NEEDED?
 azure-ai-textanalytics==1.0.0b3 #TO BE UPGRADED -> 5.0.0
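The switch from `>=` to `==` pins azureml-dataprep exactly. A quick way to confirm the pins resolved as intended after installing, assuming the packages are present in the environment:

```python
# importlib.metadata is stdlib on Python 3.8+; raises PackageNotFoundError if a package is absent.
from importlib.metadata import version

for pkg in ("pandas", "azure-cosmos", "azureml-dataprep", "azure-ai-textanalytics"):
    print(pkg, version(pkg))
```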
@@ -35,5 +35,5 @@ flake8>=3.7.9
 ipykernel>=5.1.4
 streamlit==0.65
 tqdm
-torch==1.6.0+cpu
-torchvision==0.7.0+cpu
+torch==1.5.1+cpu
+torchvision==0.6.1+cpu
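One caveat on the `+cpu` pins: those builds are not published on PyPI itself, so pip needs to be pointed at the PyTorch wheel index. Assuming this repo installs via `pip -r`, one way is an extra find-links line in requirements.txt; this line is an assumed addition, not part of the commit:

```
# Assumed addition, not in this commit: lets pip resolve the +cpu builds above.
-f https://download.pytorch.org/whl/torch_stable.html
```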
src/infer.py (12 changed lines)
@@ -88,14 +88,18 @@ def run(req):
                     category = _labels,
                     score = [_ref[i] for i in _indices]
                 ))
+            else:
+                _cat = _temp[0].get('category')
+                result = _temp
         elif tm['params'].get('type') == 'classification':
             for r in result[0]['predictions']:
                 _temp.append(dict(
                     category = r.get('label'),
                     score = f"{r.get('probability'):.3}"
                 ))
-                _cat = _temp[0].get('category')
-                result = _temp
+            _cat = _temp[0].get('category')
+            result = _temp
         else:
             logger.warning(f'[INFO] - Not a FARM model -> {tm["params"].get("type")}')

         # Prepare output
         res.append({
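What the hunk above fixes, distilled: the later "Prepare output" step reads `_cat` and `result`, so every branch must define them, and they should be set once after the prediction loop rather than on each iteration. A hedged sketch of the pattern, with names simplified and the shape of `predictions` assumed from the hunk:

```python
def summarize_predictions(kind, predictions):
    # Collect per-prediction records first.
    _temp = []
    if kind == 'classification':
        for r in predictions:
            _temp.append(dict(
                category=r.get('label'),
                score=f"{r.get('probability'):.3}",
            ))
        # Set once, after the loop: assigning inside it (as the old code did)
        # re-read _temp[0] on every iteration for no benefit.
        _cat = _temp[0].get('category')
        result = _temp
    else:
        # Without a fallback branch, the names below stay undefined -> NameError.
        _cat, result = None, predictions
    return _cat, result
```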
@@ -110,6 +114,6 @@ if __name__ == '__main__':
     #NOTE: FOR TESTING ONLY
     init()
     print(run(json.dumps([{"subject":"My pc won't start",
-                "body":"When I try booting, a red light goes on and then nothing happens",
+                "body":"When I try booting, a red light goes on and then nothing happens, Bill Gates should help...",
                 "attachment":""
                 }])))