This commit is contained in:
Christian Vorhemus 2020-03-08 18:58:40 +01:00
Parent da8748a50d 9f3533a605
Commit 3df4cf534d
8 changed files with 55 additions and 44 deletions

View file

@@ -10,21 +10,6 @@
## Live Demo
> http://nlp-demo-app.azurewebsites.net/
## Deployment
1. Click on the button to start the resource deployment:
<a href="https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fchristian-vorhemus%2Ffunction-app%2Fmaster%2Fazuredeploy.json" target="_blank">
<img src="https://raw.githubusercontent.com/Azure/azure-quickstart-templates/master/1-CONTRIBUTION-GUIDE/images/deploytoazure.png"/>
</a>
2. After the deployment has finished (~30 min), as a temporary workaround, add the Function App's "default" host key as an environment variable
named "FunctionHostKey" (if the variable already exists, replace its value) and click "Save"
<img src="demo/functionkey.png" width="400">
3. If you upload files to the storage account's "data" container, they are processed and written to CosmosDB in the standardized output format (see the upload sketch after this list).
<img src="demo/data_container.png" width="400">
## Naming
### Azure
> nlp-\<component\>-\<environment\>

View file

@@ -226,11 +226,16 @@ class Data():
logger.warning(f'SAVED: {self.fn_lookup[fn]}')
def load(self, fn, header=0, encoding='utf-8', file_type='dataframe'):
if fn in self.fn_lookup:
fn = self.fn_lookup[fn]
if file_type == 'dataframe':
return pd.read_csv(self.fn_lookup[fn], sep='\t', encoding=encoding, header=header)
data = pd.read_csv(fn, sep='\t', encoding=encoding, header=header)
elif file_type == 'list':
with open(self.fn_lookup[fn], encoding=encoding) as f:
with open(fn, encoding=encoding) as f:
data = f.readlines()
return data
elif file_type == 'json':
with open(fn, encoding=encoding) as f:
data = json.load(f)
else:
raise Exception(f'[ERROR] - file type ({file_type}) not supported in data loader')
raise Exception(f'[ERROR] - file type ({file_type}) not supported in data loader')
return data
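For reference, a minimal usage sketch of the updated loader; the instance and file keys below are hypothetical and only illustrate the three supported `file_type` values.

```python
# Hypothetical usage of Data.load (names are placeholders, not repo files).
dt = Data()  # constructor args omitted; adjust to the project's setup

df = dt.load('train', file_type='dataframe')     # tab-separated file -> DataFrame
lines = dt.load('stopwords', file_type='list')   # text file -> list of lines
labels = dt.load('labels', file_type='json')     # JSON file -> dict / list

# Any other file_type raises the loader's "not supported" exception.
```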

View file

@@ -133,6 +133,7 @@ farm_model_lookup = {
'en' : 'albert-base-v2'
},
'distilbert' : {
'xx' : 'distilbert-base-multilingual-cased',
'de' : 'distilbert-base-german-cased'
}
}
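The new 'xx' entry serves as a multilingual fallback. A small illustrative helper (not part of the repo) shows how such a lookup is typically resolved:

```python
# Illustrative helper: pick a pretrained checkpoint for (model_type, language),
# falling back to the multilingual 'xx' entry when no language-specific model exists.
def resolve_farm_model(model_type: str, language: str) -> str:
    models = farm_model_lookup[model_type]
    return models.get(language, models.get('xx'))

# resolve_farm_model('distilbert', 'de') -> 'distilbert-base-german-cased'
# resolve_farm_model('distilbert', 'it') -> 'distilbert-base-multilingual-cased'
```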

View file

@@ -2,9 +2,10 @@
Functions to deploy the pipeline
To run locally, use:
> cd ./code
> cd ./root
> conda activate nlp
> python deploy/pipeline.py
> python deploy/pipeline.py --language en --do_prepare --do_train
> python deploy/pipeline.py --language en --do_deploy
#NOTE: not using AML Pipelines yet,
due to technical restrictions
@@ -14,6 +15,7 @@ import json
import shutil
import math
import logging
import argparse
from azureml.core import Workspace, Experiment, Model
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
@@ -70,16 +72,27 @@ def get_best_argument(details, argument):
# Two stages: dev + train.
## dev: test changes, trial runs
## train: training, full runs, deployment
parser = argparse.ArgumentParser()
parser.add_argument("--language",
                    default='en',
                    type=str,
                    help="language code used to select the project config, e.g. 'en' or 'de'")
parser.add_argument('--do_prepare',
                    action='store_true',
                    help="run the data preparation step")
parser.add_argument('--do_train',
                    action='store_true',
                    help="submit the training runs")
parser.add_argument('--do_deploy',
                    action='store_true',
                    help="register models and deploy the web service")
args = parser.parse_args()
# PARAMETERS
project_name = f"msforum_en"
compute_name = 'gpucluster-nc12'
project_name = f"msforum_{args.language}"
compute_name = 'gpucluster-nc6'
experiment_name = project_name
do_prepare = False
do_train = True
do_deploy = False
## Load
params = get_project_config(f'{project_name}.config.json')
language = params.get('language')
@@ -94,12 +107,13 @@ env = params.get('environment')
# auth = MsiAuthentication()
# except Exception as e:
# logger.warning(e)
# auth = InteractiveLoginAuthentication(tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")
# auth = None
auth = InteractiveLoginAuthentication(tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")
ws = Workspace.get(name='nlp-ml',
subscription_id='50324bce-875f-4a7b-9d3c-0e33679f5d72',
resource_group='nlp')
# ,auth=auth)
resource_group='nlp',
auth=auth)
## Compute target
compute_target = ws.compute_targets[compute_name]
@@ -147,7 +161,7 @@ tasks = params.get("tasks")
##### PREPARE
############################################
if do_prepare:
if args.do_prepare:
logging.warning(f'[INFO] Running prepare for {project_name}')
for task in tasks:
config = tasks.get(task)
@@ -167,13 +181,14 @@ if do_prepare:
use_gpu = False
)
run = exp.submit(est)
run.wait_for_completion(show_output = True)
if args.do_train:
run.wait_for_completion(show_output = True)
############################################
##### TRAIN
############################################
if do_train:
if args.do_train:
logging.warning(f'[INFO] Running train for {project_name}')
for task in tasks:
config = tasks.get(task)
@@ -221,11 +236,11 @@ if do_train:
##### DEPLOY
############################################
version = '0.1'
version = '0.2'
auth_enabled = True
compute_type = 'ACI'
if do_deploy:
if args.do_deploy:
logging.warning(f'[INFO] Running deploy for {project_name}')
# Fetch Models
models = []
@@ -240,8 +255,12 @@ if do_deploy:
logging.warning(f'[INFO] Added Model : {model.name} (v{model.version})')
# Deployment Target
memory_gb = 2
# ram_size = params.get('environment')
# if ram_size is not None:
# memory_gb = ram_size
if compute_type == 'ACI':
compute_config = AciWebservice.deploy_configuration(cpu_cores=2, memory_gb=6, auth_enabled=auth_enabled)
compute_config = AciWebservice.deploy_configuration(cpu_cores=2, memory_gb=memory_gb, auth_enabled=auth_enabled)
elif compute_type == 'AKS':
compute_config = AksWebservice.deploy_configuration() #TODO:
@@ -288,6 +307,7 @@ if do_deploy:
# Test service
try:
service.run(json.dumps([{"body": "Mein Windows Vista rechner will nicht mehr - ich kriege dauernd fehler meldungen. Ich wollte mir eh einen neuen kaufen, aber ich hab kein Geld. Kann Bill Gates mir helfen?"}]))
logging.warning(f'[SUCCESS] Service was deployed.')
except Exception as e:
logging.warning(f'[ERROR] Service was not deployed as expected. {e}')
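Since the service is deployed with auth_enabled=True, external callers need the service key. Below is a hedged client-side sketch using the azureml `service` handle from the deployment above; the payload text is a placeholder, and the "body" field mirrors the test call.

```python
# Illustrative client call to the deployed ACI endpoint (not repo code).
# Assumes the `service` object returned by the deployment above; with
# auth_enabled=True the primary key is sent as a Bearer token.
import json
import requests

primary_key, secondary_key = service.get_keys()
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {primary_key}",
}
payload = json.dumps([{"body": "Sample support question for the classifier."}])
resp = requests.post(service.scoring_uri, data=payload, headers=headers)
print(resp.status_code, resp.text)
```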

View file

@@ -23,12 +23,12 @@ dependencies:
- bs4
##DEMO ENV
- pillow
- streamlit==0.48.1
- streamlit==0.56
# - langdetect
# - lightgbm
# - pandas_ml
- numpy
- pandas
- pandas=1.0.1
# - scikit-learn
# - nltk
# - nb_conda

View file

@@ -10,10 +10,10 @@
"1": {
"label": "subcat",
"type": "classification",
"model_type": "bert",
"model_type": "distilbert",
"max_seq_len": 256,
"embeds_dropout":0.3,
"learning_rate":3e-5,
"learning_rate":2e-5,
"prepare": true
},
"3": {

View file

@@ -10,10 +10,10 @@
"1": {
"label": "subcat",
"type": "classification",
"model_type": "camembert",
"model_type": "distilbert",
"max_seq_len": 256,
"embeds_dropout":0.3,
"learning_rate":3e-5,
"learning_rate":2e-5,
"prepare": true
},
"3": {

View file

@@ -10,10 +10,10 @@
"1": {
"label": "subcat",
"type": "classification",
"model_type": "bert",
"model_type": "distilbert",
"max_seq_len": 256,
"embeds_dropout":0.3,
"learning_rate":3e-5,
"learning_rate":2e-5,
"prepare": true
},
"3": {