updated data handling, new naming, improved cat

This commit is contained in:
Martin Kayser 2020-02-14 15:42:52 +01:00
Parent 482bc0b0e2
Commit cfaddc2bf3
24 changed files with 757 additions and 580 deletions

View file

@ -1,10 +0,0 @@
data
temp
.vscode
saved_models
mlruns
cache
demo
notebook
scraper
.git

5
.gitignore vendored
View file

@ -110,4 +110,7 @@ config.ini
.vscode
saved_models
mlruns
cache
cache
scraper/*.txt
scraper/*.json
scraper/old

View file

@ -11,14 +11,22 @@ NLP Toolkit
## Live Demo
> http://nlp-demo-app.azurewebsites.net/
## Naming
### Assets
> \<project name\>(-\<task\>)-\<step\>(-\<environment\>)
- where step is one of [source, train, deploy], for data assets.
- where task is an integer referring to the task defined in the project parameters, for models.
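> Example (using the msforum_en project in the dev environment, as handled by the data and model upload helpers): the prepared training data set is registered as msforum_en-1-train-dev (or msforum_en-train-dev when the task is omitted), and the task 1 model as msforum_en-1-dev.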
## TODO
### Project
- [x] Move to single project config file (for deployment and scoring)
- [ ] Overview architecture
- [ ] Detailed documentation
- [ ] Data storage strategy
### Prepare
- [x] source from AML datastore
- [ ] output to AML datastore
- [x] integrate with AML datastore
- [ ] connect to CosmosDB (pipeline ready)
- [ ] **(IP)** document cracking to standardized format
### Classification
- [ ] **(IP)** Multi label support
- [ ] integrate handling for larger documents
@ -27,21 +35,18 @@ NLP Toolkit
- [ ] upload best model to AML Model
### NER
- [ ] Improve duplicate handling
- [x] custom NER
- [x] basic custom NER
### Rank
- [ ] **(IP)** Improve answer quality
### Deployment
- [x] Collect, Package and upload assets
- [ ] **(IP)** Param script for deploy (incl language param!)
- [ ] Deploy to Azure Function (without AzureML)
### Notebooks
- [x] review prepared data
- [ ] **(IP)** review model results (auto generate after each training step)
- [ ] review model bias (auto generate after each training step)
### Pipeline
- [ ] **(IP)** document cracking to standardized format
### DevOps
- [ ] Yaml based infrastructure deployment
- [ ] Integrate with Azure/GitHub DevOps
- [ ] available models benchmark
### Tests
- [ ] integrate testing framework
- [ ] placeholder for custom data loading test
@ -50,17 +55,17 @@ NLP Toolkit
### New Features (TBD)
- Summarization
- Deployable feedback loop
- Integration with GitHub Actions
# Acknowledgements
- Verseagility is built in parts using the following:
- - [Transformers](https://github.com/huggingface/pytorch-transformers) by HuggingFace
- - [FARM](https://github.com/deepset-ai/FARM/) by deepset ai
- - [spaCy](https://github.com/explosion/spaCy/) by Explosion ai
- - [flair](https://github.com/flairNLP/flair/) by Zalando Research
- - [gensim](https://radimrehurek.com/gensim/)
- Verseagility is built in part using the following:
- [Transformers](https://github.com/huggingface/pytorch-transformers) by HuggingFace
- [FARM](https://github.com/deepset-ai/FARM/) by deepset ai
- [spaCy](https://github.com/explosion/spaCy/) by Explosion ai
- [flair](https://github.com/flairNLP/flair/) by Zalando Research
- [gensim](https://radimrehurek.com/gensim/)
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.

View file

View file

@ -9,6 +9,7 @@ Example (in the command line):
> python code/classification.py --task 1 --model_type bert --use_cuda
"""
import os
from pathlib import Path
import json
import argparse
@ -40,7 +41,8 @@ logger = he.get_logger(location=__name__)
aml_run = he.get_context()
def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, evaluate_every,
use_cuda, max_seq_len, learning_rate, do_lower_case, register_model):
use_cuda, max_seq_len, learning_rate, do_lower_case,
register_model, save_model=True, early_stopping=True):
language = cu.params.get('language')
# Check task
if cu.tasks.get(str(task)).get('type') != 'classification':
@ -48,12 +50,15 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
# Data
dt_task = dt.Data(task=task)
## Download training files
if not os.path.isfile(dt_task.fn_lookup['fn_train']):
dt_task.download(task=task, step='train')
# Settings
set_all_seeds(seed=42)
use_amp = None
device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
lang_model = he.farm_model_lookup.get(model_type).get(language)
lang_model = he.get_farm_model(model_type, language)
save_dir = dt_task.model_dir.replace('model_type', model_type)
label_list = dt_task.load('fn_label', header=None)[0].to_list()
@ -63,6 +68,9 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
aml_run.log('language', language)
aml_run.log('n_epochs', n_epochs)
aml_run.log('batch_size', batch_size)
aml_run.log('learning_rate', learning_rate)
aml_run.log('embeds_dropout', embeds_dropout)
aml_run.log('max_seq_len', max_seq_len)
aml_run.log('lang_model', lang_model)
aml_run.log_list('label_list', label_list)
except:
@ -85,7 +93,6 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
# AML log
try:
aml_run.log('acc', acc.get('acc'))
aml_run.log('acc_backup', acc)
aml_run.log('f1macro', f1macro)
aml_run.log('f1micro', f1micro)
except:
@ -143,12 +150,15 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
# An early stopping instance can be used to save the model that performs best on the dev set
# according to some metric and stop training when no improvement is happening for some iterations.
earlystopping = EarlyStopping(
metric="f1_macro", mode="max", # use f1_macro from the dev evaluator of the trainer
# metric="loss", mode="min", # use loss from the dev evaluator of the trainer
save_dir=save_dir, # where to save the best model
patience=1 # number of evaluations to wait for improvement before terminating the training
)
if early_stopping:
earlystopping = EarlyStopping(
metric="f1_macro", mode="max", # use f1_macro from the dev evaluator of the trainer
# metric="loss", mode="min", # use loss from the dev evaluator of the trainer
save_dir=save_dir, # where to save the best model
patience=2 # number of evaluations to wait for improvement before terminating the training
)
else:
earlystopping = None
trainer = Trainer(
model=model,
@ -170,8 +180,12 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
# defined with the EarlyStopping instance
# The model we have at this moment is the model from the last training epoch that was carried
# out before early stopping terminated the training
model.save(save_dir)
processor.save(save_dir)
if save_model:
model.save(save_dir)
processor.save(save_dir)
if register_model:
dt_task.upload(save_dir, task, destination='model')
def run():
# Run arguments
@ -196,7 +210,7 @@ def run():
action='store_true',
help="Use CUDA for training")
parser.add_argument('--n_epochs',
default=5,
default=3,
type=int,
help='')
parser.add_argument('--batch_size',
@ -216,7 +230,7 @@ def run():
type=int,
help='')
parser.add_argument('--learning_rate',
default=0.5e-5,
default=3e-5,
type=float,
help='')
parser.add_argument('--do_lower_case',

View file

@ -16,10 +16,10 @@ logger = he.get_logger(location=__name__)
############################################
# Load parameters from config
params = he.get_project_config('msforum_en.config.json')
params = he.get_project_config('msforum_de.config.json') #TODO: select parameters
tasks = params.get('tasks')
logger.info(f'[INFO] *** Project Target Language -> {params.get("language")} ***')
logger.info(f'[INFO] *** Project Target Environment -> {params.get("environment")} ***')
logger.warning(f'[INFO] *** Project target lang \t-> {params.get("language")} \t***')
logger.warning(f'[INFO] *** Project target env \t-> {params.get("environment")} \t***')
############################################
##### Data Preparation
@ -28,10 +28,10 @@ logger.info(f'[INFO] *** Project Target Environment -> {params.get("environment"
def prepare_source(data):
"""Normalize source data for use in downstram tasks.
NOTE: should be task agnostic"""
data_norm = pd.io.json.json_normalize(data, sep='_').to_dict(orient='records')
data_norm = pd.json_normalize(data, sep='_').to_dict(orient='records')
return pd.read_json(json.dumps(data_norm))
def remove(line):
def remove(line):
line = re.sub(r'Original Title\:', '', line)
return line
@ -59,9 +59,9 @@ def filter_qa(data):
_temp = data[data.answer_markedAsAnswer == 'true'].reset_index(drop=True).copy()
if len(_temp) == 0:
_temp = data[data.answer_markedAsAnswer == True].reset_index(drop=True).copy()
logger.info(f'Data Length : {len(_temp)} \t- after marked as answer ')
logger.warning(f'Data Length : {len(_temp)} \t- after marked as answer ')
# Filter by UpVotes
# _temp = _temp[_temp['answer_upvotes'] > 1].reset_index(drop=True).copy() #TODO: evaluate
logger.info(f'Data Length : {len(_temp)} \t- after min upvotes of 2')
logger.warning(f'Data Length : {len(_temp)} \t- after min upvotes of 2')
return _temp

View file

@ -1,10 +1,16 @@
"""
Helper function for data management
Includes source & prepared data, as well as
model assets.
"""
import pandas as pd
import json
import os
import pathlib
from io import StringIO
from pathlib import Path
from azureml.core import Dataset, Run, Workspace
from azureml.core import Run, Dataset, Model
# from azure.storage.blob import BlockBlobService
# Custom functions
@ -13,22 +19,13 @@ sys.path.append('../code')
import helper as he
import custom as cu
# Get config
run_config = he.get_config()
logger = he.get_logger(location=__name__)
flair_model_lookup = {
'en' : 'en-ner-ontonotes-fast-v0.4.pt',
'de' : 'ner-multi-fast.pt',
'xx' : 'ner-multi-fast.pt'
}
class Data():
def __init__(self, fn_source = 'answers_microsoft_lang.json',
task = 1,
version = 1,
env = 1,
inference = False
def __init__(self, task = 1,
version = 1,
env = 1,
inference = False
):
# Parameters
self.task = task
@ -38,87 +35,164 @@ class Data():
# Directories
## Asset directory
if inference:
## Assuming deployment via AzureML
try:
self.data_dir = os.environ['AZUREML_MODEL_DIR']
except KeyError:
logger.info(f'[WARNING] Not running on AML')
self.data_dir = he.run_config['path']['infer_dir']
## Assuming deployment via AzureML
if 'AZUREML_MODEL_DIR' in os.environ:
self.data_dir = os.environ['AZUREML_MODEL_DIR']
else:
self.data_dir = he.run_config['path']['data_dir']
self.data_dir = cu.params.get('data_dir')
os.makedirs(self.data_dir, exist_ok=True)
logger.warning(f'[INFO] Root data directory: {self.data_dir}')
## Model directory
self.model_dir = str(Path(self.data_dir + f"/model_type-l{self.language}-t{self.task}").resolve())
### If present, replace language tag in name
fn_source = fn_source.replace('lang', self.language)
self.model_dir = str(Path(self.data_dir + f"/model_type-l{self.language}-t{self.task}-{self.env}").resolve())
### NOTE: source file expected to follow naming convention, otherwise edit here
self.fn_source = f"{cu.params.get('name')}-source.{cu.params.get('prepare').get('data_type')}"
self.fp_train = f"{cu.params.get('name')}-train-{cu.params.get('environment')}"
# Lookup
self.fn_lookup = {
'fn_source' : fn_source,
'fn_prep' : f'data_l{self.language}.txt',
'fn_clean' : f'clean_l{self.language}_t{self.task}.txt',
'fn_train' : f'train_l{self.language}_t{self.task}.txt',
'fn_test' : f'test_l{self.language}_t{self.task}.txt',
'fn_label' : f'label_l{self.language}_t{self.task}.txt',
## DATASTORE
'fn_source' : self.fn_source,
'fp_train' : self.fp_train,
## LOCAL
'fp_data' : os.path.abspath(self.data_dir),
'fn_prep' : f'{self.data_dir}/data-l{self.language}.txt',
'fn_clean' : f'{self.data_dir}/clean-l{self.language}-t{self.task}.txt',
'fn_train' : f'{self.data_dir}/train-l{self.language}-t{self.task}.txt',
'fn_test' : f'{self.data_dir}/test-l{self.language}-t{self.task}.txt',
'fn_label' : f'{self.data_dir}/label-l{self.language}-t{self.task}.txt',
'fn_eval' : f'TODO:',
## ASSETS #TODO: auto generate fetching param list
'fn_asset' : f'{self.data_dir}/assets_{self.language}.zip',
## ASSETS
'fn_asset' : f'{self.data_dir}/assets-{self.language}.zip',
'fn_cat' : self.model_dir.replace('model_type', cu.params.get('tasks').get('1').get('model_type')),
'fn_rank' : f'{self.data_dir}/data_l{self.language}_t4.pkl',
'fn_rank' : f'{self.data_dir}/data-l{self.language}-t4.pkl',
'fn_ner_list' : f'{self.data_dir}/ner.txt',
'fn_ner_flair' : f'{self.data_dir}/{flair_model_lookup[self.language]}',
'fn_ner_flair' : f'{self.data_dir}/{he.get_flair_model(self.language, "fn")}',
'fn_ner_spacy' : f'TODO:',
'fn_names' : f'{self.data_dir}/names.txt',
'fn_stopwords' : f'{self.data_dir}/stopwords_{self.language}.txt',
}
'fn_stopwords' : f'{self.data_dir}/stopwords-{self.language}.txt',
} #TODO: when to link data dir, when only filename?
# for t in cu.params.get('tasks'): #TODO: auto generate fetching param list
# task_property = cu.params.get('tasks').get(t)
# self.fn_lookup[f'fn_{task_property.get('type')}']
# Files
self.fn_source = fn_source
self.fn_data = self.fn_lookup['fn_prep']
def download(self, container=None, fn_blob=None, fn_local=None,
# AML Components
try:
run = Run.get_context()
self.ws = run.experiment.workspace
except Exception as e:
logger.warning(f'[WARNING] AML Workspace not loaded -> {e}')
### DOWNLOAD
def _download_blob(self):
# self.block_blob_service = BlockBlobService(account_name=run_config['blob']['account'],
# account_key=run_config['blob']['key'])
# if no_run_version:
# self.block_blob_service.get_blob_to_path(container, fn_blob, fn_local)
# elif not encrypted:
# self.block_blob_service.get_blob_to_path(container,
# str(fn_blob).replace('./',''),
# fn_local)
# elif encrypted:
# self.block_blob_service.get_blob_to_path(container,
# str(fn_blob).replace('.txt', '.enc').replace('./',''),
# fn_local)
# if to_dataframe:
# with open(str(fn_local), "rb") as text_file:
# _data = text_file.read()
# if encrypted:
# df = decrypt(_data, dataframe=True)
# else:
# df = pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
# df.to_csv(fn_local, sep='\t', encoding='utf-8', index=False)
pass
def _download_datastore(self):
pass
def _download_model(self):
#NOTE: not needed when running on AML compute
pass
def download(self, dataset_name=None,
task='',
step='',
container=None,
fn_blob=None,
fn_local=None,
no_run_version=False,
encrypted=False,
to_dataframe=False,
source='blob'):
"""Download file from Azure"""
source='datastore'):
"""Download file from online storage"""
if source == 'blob':
self.block_blob_service = BlockBlobService(account_name=run_config['blob']['account'],
account_key=run_config['blob']['key'])
if no_run_version:
self.block_blob_service.get_blob_to_path(container, fn_blob, fn_local)
elif not encrypted:
self.block_blob_service.get_blob_to_path(container,
str(fn_blob).replace('./',''),
fn_local)
elif encrypted:
self.block_blob_service.get_blob_to_path(container,
str(fn_blob).replace('.txt', '.enc').replace('./',''),
fn_local)
if to_dataframe:
with open(str(fn_local), "rb") as text_file:
_data = text_file.read()
if encrypted:
df = decrypt(_data, dataframe=True)
else:
df = pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
df.to_csv(fn_local, sep='\t', encoding='utf-8', index=False)
self._download_blob() #TODO:
elif source == 'datastore':
run = Run.get_context()
ws = run.experiment.workspace
dataset_name = self.fn_source.split('.')[0]
Dataset.get_by_name(workspace=ws, name=dataset_name).download(self.data_dir, overwrite=True)
logger.info(f'[INFO] Downloaded data from data store {dataset_name}')
if dataset_name is None:
dataset_name = f'{cu.params.get("name")}-{task}-{step}-{cu.params.get("environment")}'
try:
Dataset.get_by_name(workspace=ws, name=dataset_name).download(self.data_dir, overwrite=True)
except Exception as e:
logger.warning(f'[WARNING] Dataset {dataset_name} not found. Trying without <env>. -> {e}')
Dataset.get_by_name(workspace=ws, name=dataset_name.replace(f'_{self.env}', '')).download(self.data_dir, overwrite=True)
else:
logger.warning(f'[INFO] Downloaded data from data store {dataset_name}')
elif source == 'model':
pass
else:
logger.info('[ERROR] Source <{source}> does not exist. Can not download file.')
logger.warning(f'[ERROR] Source <{source}> does not exist. Cannot download file.')
def upload(self):
#TODO:
pass
### UPLOAD
def _upload_dataset(self, fp, task, step, ws):
"""Upload dataset to AzureML Datastore
Note:
- only works for a single file or directory
- not meant for model assets
"""
target_name = f'{cu.params.get("name")}-{task}-{step}-{cu.params.get("environment")}'
datastore = ws.get_default_datastore()
datastore.upload(src_dir = str(fp),
target_path = target_name,
overwrite = True,
show_progress = True)
ds = Dataset.File.from_files([(datastore, target_name)])
#ds = Dataset.File.from_files(path=[fp])
ds.register(workspace = ws,
name = target_name,
description = f'Data set for {step}',
create_new_version = True)
def _upload_model(self, fp, task, ws):
"""Upload model to AzureML Models"""
Model.register(workspace=ws,
model_name=f'{cu.params.get("name")}-{task}-{cu.params.get("environment")}',
model_path=fp, # Local file to upload and register as a model.
description='Model assets',
tags={'task' : task,
# 'model_type': model_type,
'language': cu.params.get('language'),
'environment': cu.params.get('environment')})
def upload(self, fp, task='', step='', destination='model'):
if fp in self.fn_lookup:
fp = self.fn_lookup[fp]
if destination == 'dataset':
self._upload_dataset(fp, task, step, self.ws)
elif destination == 'model':
self._upload_model(fp, task, self.ws)
else:
logger.warning(f'[ERROR] Destination <{destination}> does not exist. Cannot upload file.')
logger.warning(f'[INFO] Upload to <{destination}> completed.')
## PROCESS
def process(self, data_type='json', save=True):
"""Convert source data to normalized data structure"""
# Load source data
if data_type == 'json':
with open(self.data_dir + self.fn_source, encoding='utf-8') as fp:
@ -126,7 +200,7 @@ class Data():
elif data_type == 'dataframe':
data = self.load('fn_source')
else:
logger.info('SOURCE DATA TYPE NOT SUPPORTED')
logger.warning('SOURCE DATA TYPE NOT SUPPORTED')
# Custom steps
df = cu.prepare_source(data)
@ -137,8 +211,15 @@ class Data():
return df
def save(self, data, fn, header=True):
data.to_csv(self.data_dir + self.fn_lookup[fn], sep='\t', encoding='utf-8', index=False, header=header)
logger.info(f'SAVED: {self.fn_lookup[fn]}')
data.to_csv(self.fn_lookup[fn], sep='\t', encoding='utf-8', index=False, header=header)
logger.warning(f'SAVED: {self.fn_lookup[fn]}')
def load(self, fn, header=0):
return pd.read_csv(self.data_dir + self.fn_lookup[fn], sep='\t', encoding='utf-8', header=header)
def load(self, fn, header=0, encoding='utf-8', file_type='dataframe'):
if file_type == 'dataframe':
return pd.read_csv(self.fn_lookup[fn], sep='\t', encoding=encoding, header=header)
elif file_type == 'list':
with open(self.fn_lookup[fn], encoding=encoding) as f:
data = f.readlines()
return data
else:
raise Exception(f'ERROR - file type ({file_type}) not supported in data loader')

View file

@ -15,12 +15,8 @@ from flair.models import SequenceTagger
def get_logger(level='info', location = None, excl_az_storage=True):
'''Get runtime logger'''
# Location
if location is None:
logger = logging.getLogger(__name__)
else:
logger = logging.getLogger(location)
global logger
# Exceptions
if excl_az_storage:
logging.getLogger("azure.storage.common.storageclient").setLevel(logging.WARN)
@ -31,16 +27,20 @@ def get_logger(level='info', location = None, excl_az_storage=True):
elif level == 'debug':
_level = logging.DEBUG
elif level == 'warning':
_level = logging.WARN
_level = logging.WARNING
# Format
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = _level)
return logger
# Location
if location is None:
logger = logging.getLogger(__name__)
else:
logger = logging.getLogger(location)
logger = get_logger(location=__name__)
return logger
def get_context():
'''Get AML Run Context for Logging to AML Services'''
@ -48,7 +48,7 @@ def get_context():
from azureml.core import Run
run = Run.get_context()
except Exception as e:
logger.info(f'[WARNING] Azure ML not loaded. Nothing will be logged. {e}')
logger.warning(f'[WARNING] Azure ML not loaded. Nothing will be logged. {e}')
run = ''
return run
@ -56,20 +56,6 @@ def get_context():
##### Config
############################################
def get_config():
# Get config
run_config = configparser.ConfigParser()
run_config.read('./code/config.ini')
if 'path' not in run_config:
run_config.read('./config.ini')
if 'path' not in run_config:
run_config.read('../config.ini')
if 'path' not in run_config:
logger.info('[ERROR] Could not find correct config.ini.')
return run_config
run_config = get_config()
def get_project_config(fn):
try:
with open(f'./project/{fn}', encoding='utf-8') as fp:
@ -79,30 +65,48 @@ def get_project_config(fn):
with open(f'../project/{fn}', encoding='utf-8') as fp:
params = json.load(fp)
except FileNotFoundError:
## Inference Config
with open('./code/config.json', encoding='utf-8') as fp:
params = json.load(fp)
try:
## Training Config
with open('config.json', encoding='utf-8') as fp:
params = json.load(fp)
except FileNotFoundError:
## Inference Config
with open('./code/config.json', encoding='utf-8') as fp:
params = json.load(fp)
return params
def get_config():
#TODO: remove this, new use: keys to env, settings to params
# Get config
run_config = configparser.ConfigParser()
run_config.read('./code/config.ini')
if 'path' not in run_config:
run_config.read('./config.ini')
if 'path' not in run_config:
run_config.read('../config.ini')
if 'path' not in run_config:
logger.warning('[ERROR] Could not find correct config.ini.')
return run_config
############################################
##### Azure
############################################
def get_credentials():
'''Retrieve Service Principal Credentials'''
credentials = ServicePrincipalCredentials(
client_id = run_config['sp']['client_id'],
secret = run_config['sp']['secret'],
tenant = run_config['sp']['tenant']
)
return credentials
# def get_credentials():
# '''Retrieve Service Principal Credentials'''
# credentials = ServicePrincipalCredentials(
# client_id = run_config['sp']['client_id'],
# secret = run_config['sp']['secret'],
# tenant = run_config['sp']['tenant']
# )
# return credentials
def get_secret():
'''Retrieve Secret from KeyVault'''
client = KeyVaultClient(get_credentials())
vault_url = run_config['keyvault']['url']
vault_name = run_config['keyvault']['name_data']
return client.get_secret(vault_url, vault_name, "").value
# def get_secret():
# '''Retrieve Secret from KeyVault'''
# client = KeyVaultClient(get_credentials())
# vault_url = run_config['keyvault']['url']
# vault_name = run_config['keyvault']['name_data']
# return client.get_secret(vault_url, vault_name, "").value
############################################
##### ML Frameworks
@ -115,14 +119,14 @@ farm_model_lookup = {
'xx':'bert-base-multilingual-cased',
'en':'bert-base-cased',
'de':'bert-base-german-cased',
'fr':'camembert-base',
'cn':'bert-base-chinese'
},
'roberta' : {
'en' : 'roberta-base'
},
'xlm-roberta' : {
'xx' : 'xlm-roberta-multi', #TODO: check if it exists?
'en' : 'xlm-roberta-large'
'xx' : 'xlm-roberta-base'
},
'albert' : {
'en' : 'albert-base-v2'
@ -132,40 +136,66 @@ farm_model_lookup = {
}
}
def get_farm_model(model_type, language):
mt = farm_model_lookup.get(model_type)
if mt is not None:
ml = mt.get(language)
if ml is None:
ml = mt.get('xx')
if ml is None:
raise Exception('No Transformer/FARM model found')
return ml
spacy_model_lookup = {
'en':'en_core_web_sm',
'de':'de_core_news_sm',
'fr':'fr_core_news_sm',
'es':'es_core_news_sm',
'it':'it_core_news_sm',
'xx':'xx_ent_wiki_sm'
}
flair_model_lookup = {
'en' : 'ner-ontonotes-fast',
'de' : 'ner-multi-fast',
'xx' : 'ner-multi-fast'
}
def load_spacy_model(language='xx', disable=[]):
try:
nlp = spacy.load(spacy_model_lookup[language], disable=disable)
except OSError:
logging.info(f'[INFO] Download spacy language model for {language}')
logging.warning(f'[INFO] Download spacy language model for {language}')
from spacy.cli import download
download(spacy_model_lookup[language])
nlp = spacy.load(spacy_model_lookup[language], disable=disable)
return nlp
flair_model_lookup = {
'en' : 'ner-ontonotes-fast',
'de' : 'ner-multi-fast',
'xx' : 'ner-multi-fast'
}
flair_model_file_lookup = {
'en' : 'en-ner-ontonotes-fast-v0.4.pt',
'de' : 'ner-multi-fast.pt',
'xx' : 'ner-multi-fast.pt'
}
def get_flair_model(language, object_type):
if object_type == 'model':
lookup = flair_model_lookup
elif object_type == 'fn':
lookup = flair_model_file_lookup
m = lookup.get(language)
if m is None:
m = lookup.get('xx')
return m
def load_flair_model(path=None, language='xx', task='ner'):
if task == 'ner':
if path is None:
model = SequenceTagger.load(flair_model_lookup.get(language))
model = SequenceTagger.load(get_flair_model(language, 'model'))
else:
model = SequenceTagger.load(path)
else:
logging.info(f'FLAIR MODEL TASK NOT SUPPORTED --> {task}')
logging.warning(f'FLAIR MODEL TASK NOT SUPPORTED --> {task}')
model = None
return model
############################################
##### Dataframe
@ -191,7 +221,7 @@ def validate_concat(col1, col2, max_len=1000):
new_line = sub + '. ' + des
text_concat.append(new_line[:max_len])
except Exception as e:
logger.info(f'[WARNING] Validate Concat - {e}')
logger.warning(f'[WARNING] Validate Concat - {e}')
if 'float' in str(e):
text_concat.append(str(des))
else:
@ -215,34 +245,34 @@ def append_ner(v, s, e, l, t=''):
##### Cryptography
############################################
def decrypt(token, dataframe=False):
''' Decrypt symetric object using Fernet '''
secret = get_secret()
f = Fernet(bytes(secret, encoding='utf-8'))
token = f.decrypt(token)
if dataframe:
_data = StringIO(str(token, 'utf-8'))
return pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
else:
return token
# def decrypt(token, dataframe=False):
# ''' Decrypt symetric object using Fernet '''
# secret = get_secret()
# f = Fernet(bytes(secret, encoding='utf-8'))
# token = f.decrypt(token)
# if dataframe:
# _data = StringIO(str(token, 'utf-8'))
# return pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
# else:
# return token
def decrypt_and_save(fn):
with open(fn, "rb") as text_file:
token = text_file.read()
content = StringIO(str(decrypt(token), 'utf-8'))
df = pd.read_csv(content, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
df.to_csv(fn.replace('.enc', '.txt'), sep='\t', encoding='utf-8', index=False) #TODO: match encrypt fn out
# def decrypt_and_save(fn):
# with open(fn, "rb") as text_file:
# token = text_file.read()
# content = StringIO(str(decrypt(token), 'utf-8'))
# df = pd.read_csv(content, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
# df.to_csv(fn.replace('.enc', '.txt'), sep='\t', encoding='utf-8', index=False) #TODO: match encrypt fn out
def encrypt(token, dataframe=False):
''' Encrypt symetric object using Fernet '''
secret = get_secret()
f = Fernet(bytes(secret, encoding='utf-8'))
if dataframe:
token = bytes(to_csv_string(token), encoding='utf-8')
return f.encrypt(token)
# def encrypt(token, dataframe=False):
# ''' Encrypt symetric object using Fernet '''
# secret = get_secret()
# f = Fernet(bytes(secret, encoding='utf-8'))
# if dataframe:
# token = bytes(to_csv_string(token), encoding='utf-8')
# return f.encrypt(token)
def encrypt_and_save(fn, data, file_type='.txt'):
data = encrypt(bytes(data, encoding='utf-8'))
fn_new = fn.replace(file_type, '.enc')
with open(fn_new, "wb") as text_file:
text_file.write(data)
# def encrypt_and_save(fn, data, file_type='.txt'):
# data = encrypt(bytes(data, encoding='utf-8'))
# fn_new = fn.replace(file_type, '.enc')
# with open(fn_new, "wb") as text_file:
# text_file.write(data)

View file

@ -32,16 +32,16 @@ def score(task):
elif task_type == 'qa':
return rank.Rank(task=task, inference=True)
else:
logger.info('TASK TYPE NOT SUPPORTED')
logger.warning('TASK TYPE NOT SUPPORTED')
return None
def init():
global task_models, prepare_classes
# Unpack model dependencies
dt_init = dt.Data(inference=True)
shutil.unpack_archive(dt_init.fn_lookup['fn_asset'], dt_init.data_dir, 'zip')
logger.info(f'[INFO] Unpacked model assets from {dt_init.fn_lookup["fn_asset"]}')
# dt_init = dt.Data(inference=True)
# shutil.unpack_archive(dt_init.fn_lookup['fn_asset'], dt_init.data_dir, 'zip')
# logger.warning(f'[INFO] Unpacked model assets from {dt_init.fn_lookup["fn_asset"]}')
# Load models & prepare steps
task_models = []
@ -54,21 +54,18 @@ def init():
'params' : cu.tasks.get(str(task))
})
prepare_classes[task] = pr.Clean(task=task, inference=True)
logger.info(f'[INFO] Loaded model and prepare steps for task {task}.')
def run_model():
pass
logger.warning(f'[INFO] Loaded model and prepare steps for task {task}.')
def run(req):
# Load request
req_data = json.loads(req)
req_data = json.loads(req)[0]
# Prepare text
if 'subject' in req_data[0]:
s = req_data[0]['subject']
if 'subject' in req_data:
s = req_data['subject']
else:
s = ''
if 'body' in req_data[0]:
b = req_data[0]['body']
if 'body' in req_data:
b = req_data['body']
else:
b = ''
text = he.validate_concat(s, b)
@ -82,6 +79,7 @@ def run(req):
result = tm['infer'].inference_from_dicts(dicts=[{"text": clean, "cat": _cat}])
try:
# Special treatment for classification (FARM)
##TODO: standardize for all
_temp = []
for r in result[0]['predictions']:
_temp.append(dict(
@ -98,7 +96,7 @@ def run(req):
"params" : tm['params'],
"result" : result
})
logger.info(f'[INFO] Completed {tm["task"]}.')
logger.warning(f'[INFO] Completed task {tm["task"]}.')
return res
if __name__ == '__main__':

View file

@ -29,7 +29,7 @@ import helper as he
# Custom FLAIR element for spacy pipeline
class FlairMatcher(object):
name = "flair"
##TODO: run on stored headless models
def __init__(self, path):
self.tagger = he.load_flair_model(path=path)
@ -57,7 +57,7 @@ class CustomNER():
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
lang_model = he.farm_model_lookup.get(model_type).get(language)
lang_model = he.get_farm_model(model_type, language)
save_dir = dt_task.model_dir.replace('model_type', model_type)
# ner_labels = dt_task.load('fn_label', header=None)[0].to_list() TODO:
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

View file

@ -48,6 +48,7 @@ class Clean():
"""
def __init__(self, task,
download=True,
inference=False):
self.task = task
self.language = cu.params.get('language')
@ -55,6 +56,11 @@ class Clean():
# Load data class
self.dt = dt.Data(task=self.task, inference=inference)
# Download data, if needed
if download:
self.dt.download(dataset_name = self.dt.fn_lookup.get('fn_source').split('.')[0], source='datastore')
self.dt.download(dataset_name = self.dt.fn_lookup.get('fp_train'), source='datastore')
# Load spacy model
self.nlp = he.load_spacy_model(language=self.language, disable=['ner','parser','tagger'])
@ -62,21 +68,20 @@ class Clean():
stopwords_active = []
## Load names
try:
with open(self.dt.fn_lookup['fn_names'], encoding='utf-8') as f:
names = f.readlines()
names = self.dt.load('fn_names', file_type='list')
stopwords_active = stopwords_active + names
except Exception as e:
logger.info(f'[WARNING] No names list loaded: {e}')
logger.warning(f'[WARNING] No names list loaded: {e}')
## Load stopwords
try:
with open(self.dt.fn_lookup['fn_stopwords'], encoding='utf-8') as f:
stopwords = f.readlines()
stopwords = self.dt.load('fn_stopwords', file_type='list')
stopwords_active = stopwords_active + stopwords
except Exception as e:
logger.info(f'[WARNING] No stopwords list loaded: {e}')
logger.info(f'[INFO] Active stopwords list lenght: {len(stopwords_active)}')
logger.warning(f'[WARNING] No stopwords list loaded: {e}')
## Add to Spacy stopword list
logger.warning(f'[INFO] Active stopwords list length: {len(stopwords_active)}')
for w in stopwords_active:
self.nlp.vocab[w.replace('\n','')].is_stop = True
@ -254,23 +259,21 @@ class Clean():
return_token = True
)[0]
else:
logger.info('[WARNING] No transform by task found.')
logger.warning('[WARNING] No transform by task found.')
return text[0]
def prepare_classification(task, do_format, train_split, min_cat_occurance,
min_char_length, download_source):
# Get clean object
cl = Clean(task=task)
if download_source:
cl.dt.download(source='datastore')
# Get clean object
cl = Clean(task=task, download=download_source)
# Load data
if do_format:
data = cl.dt.process(data_type=cu.params.get('prepare').get('data_type'))
else:
data = cl.dt.load('fn_prep')
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
# Load text & label field
text_raw = cu.load_text(data)
@ -286,21 +289,21 @@ def prepare_classification(task, do_format, train_split, min_cat_occurance,
# Filter by length
data = he.remove_short(data, 'text', min_char_length=min_char_length)
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
# Remove duplicates
data_red = data.drop_duplicates(subset=['text'])
logger.info(f'Data Length : {len(data_red)}')
logger.warning(f'Data Length : {len(data_red)}')
# Min class occurance
data_red = data_red[data_red.groupby('label').label.transform('size') > min_cat_occurance]
logger.info(f'Data Length : {len(data_red)}')
logger.warning(f'Data Length : {len(data_red)}')
data_red = data_red.reset_index(drop=True).copy()
# Label list
label_list = data_red.label.drop_duplicates()
logger.info(f'Excluded labels: {list(set(label_list_raw)-set(label_list))}')
logger.warning(f'Excluded labels: {list(set(label_list_raw)-set(label_list))}')
# Split data
strf_split = StratifiedShuffleSplit(n_splits = 1, test_size=(1-train_split), random_state=200)
@ -314,26 +317,28 @@ def prepare_classification(task, do_format, train_split, min_cat_occurance,
cl.dt.save(df_cat_test[['text','label']], fn = 'fn_test')
cl.dt.save(label_list, fn = 'fn_label', header=False)
# Upload data
# cl.dt.upload('fn_prep', task=task, step='prep', destination='dataset')
cl.dt.upload('fp_data', task=task, step='train', destination='dataset')
def prepare_ner(task, do_format=True):
pass
def prepare_qa(task, do_format, min_char_length, download_source):
# Get clean object
cl = Clean(task=task)
if download_source:
cl.dt.download(source='datastore')
# Get clean object
cl = Clean(task=task, download=download_source)
# Load data
if do_format:
data = cl.dt.process(data_type=cu.params.get('prepare').get('data_type'))
else:
data = cl.dt.load('fn_prep')
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
# Filter relevant question answer pairs
data = cu.filter_qa(data)
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
# Load question & answer fields
question, answer = cu.load_qa(data)
@ -370,19 +375,24 @@ def prepare_qa(task, do_format, min_char_length, download_source):
# Filter by length
data = he.remove_short(data, 'question_clean', min_char_length=min_char_length)
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
# Remove duplicates
data = data.drop_duplicates(subset=['question_clean'])
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
data = data.reset_index(drop=True).copy()
# Save data
cl.dt.save(data, fn = 'fn_clean')
def run_prepare(task=1, do_format=False, split=0.9, min_cat_occurance=300, min_char_length=20, download_source=False):
logger.info(f'Running <PREPARE> for task {task}')
def main(task=1,
do_format=False,
split=0.9,
min_cat_occurance=300,
min_char_length=20,
download_source=False):
logger.warning(f'Running <PREPARE> for task {task}')
task_type = cu.tasks.get(str(task)).get('type')
if 'classification' == task_type:
@ -392,34 +402,40 @@ def run_prepare(task=1, do_format=False, split=0.9, min_cat_occurance=300, min_c
elif 'qa' == task_type:
prepare_qa(task, do_format, min_char_length, download_source)
else:
logger.info('[ERROR] TASK TYPE UNKNOWN. Nothing was processed.')
logger.warning('[ERROR] TASK TYPE UNKNOWN. Nothing was processed.')
def run():
"""Run from the command line"""
parser = argparse.ArgumentParser()
parser.add_argument("--task",
default=1,
type=int,
help="Task where: \
-task 1 : classification subcat \
-task 2 : classification cat \
-task 3 : ner \
-task 4 : qa")
parser.add_argument('--do_format',
action='store_true',
help="Avoid reloading and normalizing data")
parser.add_argument("--split",
default=0.9,
type=float,
help="Train test split. Dev split is taken from train set.")
parser.add_argument("--min_cat_occurance",
default=300,
type=int,
help="Min occurance required by category.")
parser.add_argument("--download_source",
action='store_true')
args = parser.parse_args()
run_prepare(args.task, args.do_format, args.split, args.min_cat_occurance, args.download_source)
# def run(): 'TODO: run train.py for single run
# """Run from the command line"""
# parser = argparse.ArgumentParser()
# parser.add_argument("--task",
# default=1,
# type=int,
# help="Task where: \
# -task 1 : classification subcat \
# -task 2 : classification cat \
# -task 3 : ner \
# -task 4 : qa")
# parser.add_argument('--do_format',
# action='store_true',
# help="Avoid reloading and normalizing data")
# parser.add_argument("--split",
# default=0.9,
# type=float,
# help="Train test split. Dev split is taken from train set.")
# parser.add_argument("--min_char_length",
# default=20,
# type=int,
# help="")
# parser.add_argument("--min_cat_occurance",
# default=300,
# type=int,
# help="Min occurance required by category.")
# parser.add_argument("--download_source",
# action='store_true')
# args = parser.parse_args()
# run_prepare(args.task, args.do_format, args.split, min_cat_occurance=args.min_cat_occurance,
# min_char_length=args.min_char_length, download_source=args.download_source)
# #TODO: cleanup
if __name__ == '__main__':
run()
main()

View file

@ -48,7 +48,7 @@ class Rank():
if cats is not None and cats != '':
#TODO: does not work for lists
_data = _data[_data.appliesTo.str.contains(cats)].reset_index(drop=True)
logger.info(f'[INFO] Reduced answer selection to {len(_data)} from {len(self.data)}.')
logger.warning(f'[INFO] Reduced answer selection to {len(_data)} from {len(self.data)}.')
# BM25 Score threshold
_data = _data[_data.score > ans_thresh].reset_index(drop=True)
@ -90,7 +90,7 @@ def create_bm25():
with open(cl.dt.fn_lookup['fn_rank'], 'wb') as fp:
pickle.dump(bm, fp)
pickle.dump(data, fp)
logger.info('Create and stored BM25 object.')
logger.warning('Create and stored BM25 object.')
if __name__ == "__main__":
create_bm25()

View file

@ -1,4 +1,5 @@
"""
#TODO: replace with python step functions
Finetuning the model for sub category classification.
Task 1 - forum entry classification
@ -6,8 +7,6 @@ Task 2 - ner (MS products)
Task 3 - qa ranking
Task 4 - urgency / priority
INPUT:
- language
- task
@ -25,6 +24,8 @@ OUTPUT:
- status
"""
import argparse
import mlflow
from farm.utils import MLFlowLogger
# Custom functions
import sys
@ -34,8 +35,7 @@ import classification
# import ner
# import rank
import logging
logging.basicConfig(level=logging.DEBUG)
# from azureml.core import Run
def main():
parser = argparse.ArgumentParser()
@ -50,7 +50,6 @@ def main():
-task 3 : ner \
-task 4 : qa")
### PREPARE
parser.add_argument('--do_format',
action='store_true',
@ -63,6 +62,10 @@ def main():
default=300,
type=int,
help="Min occurance required by category.")
parser.add_argument("--min_char_length",
default=20,
type=int,
help="")
parser.add_argument("--download_source",
action='store_true')
@ -79,7 +82,7 @@ def main():
action='store_true',
help="Use CUDA for training")
parser.add_argument('--n_epochs',
default=5,
default=3,
type=int,
help='')
parser.add_argument('--batch_size',
@ -99,7 +102,7 @@ def main():
type=int,
help='')
parser.add_argument('--learning_rate',
default=0.5e-5,
default=3e-5,
type=float,
help='')
parser.add_argument('--do_lower_case',
@ -108,18 +111,16 @@ def main():
parser.add_argument('--register_model',
action='store_true',
help="Register model in AML")
args = parser.parse_args()
# Run prepare
prepare.run_prepare(args.task, args.do_format, args.split, args.min_cat_occurance, args.download_source)
prepare.main(args.task, args.do_format, args.split, args.min_cat_occurance, args.min_char_length, args.download_source)
# Run train
classification.doc_classification(args.task, args.model_type, args.n_epochs, args.batch_size, args.embeds_dropout, args.evaluate_every,
args.use_cuda, args.max_seq_len, args.learning_rate, args.do_lower_case, args.register_model)
classification.doc_classification(task=args.task, model_type=args.model_type, n_epochs=args.n_epochs,
batch_size=args.batch_size, embeds_dropout=args.embeds_dropout, evaluate_every=args.evaluate_every,
use_cuda=args.use_cuda, max_seq_len=args.max_seq_len, learning_rate=args.learning_rate, do_lower_case=args.do_lower_case,
register_model=args.register_model)
if __name__ == "__main__":
main()

View file

@ -17,8 +17,7 @@ import shutil
import json
from azureml.core.authentication import InteractiveLoginAuthentication, MsiAuthentication
from azureml.core import Workspace
from azureml.core import Model
from azureml.core import Workspace, Model
from azureml.core.resource_configuration import ResourceConfiguration
from azureml.core.webservice import Webservice, AciWebservice, AksWebservice
from azureml.core import Environment
@ -66,6 +65,7 @@ dt_assets = dt.Data()
##############################
## ZIP DEPENDENCIES
##############################
model_name = f'nlp_{language}_{env}'
if do_zip:
logger.warning(f'[INFO] Zipping model assets -> {model_name}')
# Zip Assets
@ -88,7 +88,6 @@ if do_zip:
##############################
## UPLOAD DEPENDENCIES
##############################
model_name = f'nlp_{language}_{env}'
if upload:
logger.warning(f'[INFO] Uploading model assets -> {model_name}')
# Upload Assets
@ -121,6 +120,8 @@ else:
environment = Environment('farmenv')
environment.python.conda_dependencies = CondaDependencies.create(pip_packages=[
'azureml-defaults',
'mlflow',
'azureml-mlflow',
'spacy',
'transformers==2.3.0',
'scipy',

View file

@ -1,8 +1,15 @@
'''Functions to deploy training
"""
Functions to deploy training
'''
To run locally, use:
> cd ./code
> conda activate nlp
> python deploy/training.py
"""
import os
import shutil
import math
from azureml.core import Workspace, Experiment
from azureml.train.dnn import PyTorch
@ -10,26 +17,33 @@ from azureml.train.hyperdrive import (BayesianParameterSampling,
HyperDriveConfig, PrimaryMetricGoal,
choice, uniform, loguniform)
# PARAMETERS
language = 'de'
single_run = True
compute_name = 'gpucluster-nc6'
experiment_name = f"msforum_{language}"
############################################
##### AML Setup
############################################
## Workspace
# auth = InteractiveLoginAuthentication(tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")
ws = Workspace.get(name='nlp-ml',
subscription_id='50324bce-875f-4a7b-9d3c-0e33679f5d72',
resource_group='nlp')
# ,auth=auth)
## Compute target
compute_name = 'gpucluster-nc12'
compute_target= ws.compute_targets[compute_name]
script_folder = "./"
script_folder = "."
#TODO: load from file
pip_packages=[
'azureml-sdk',
'azureml-dataprep[pandas,fuse]',
'mlflow',
'azureml-mlflow',
'spacy',
'transformers==2.3.0',
'scipy',
@ -42,8 +56,8 @@ pip_packages=[
'seqeval',
'mlflow==1.0.0',
'dotmap==1.3.0',
'git+https://github.com/deepset-ai/FARM.git',
'git+https://github.com/zalandoresearch/flair.git'
'farm==0.4.1',
'flair==0.4.5'
]
conda_packages=[
# 'pytorch',
@ -58,46 +72,63 @@ conda_packages=[
##### Task 1
############################################
fn_config_infer = 'config.json'
shutil.copy(f'./project/msforum_{language}.config.json', f'./code/{fn_config_infer}')
os.chdir('./code')
## Experiment
experiment_name = "answers-de"
exp = Experiment(workspace = ws, name = experiment_name)
## Config
script_params = {
'--task' : 1,
'--do_format' : '',
'--download_source' : '',
# '--model_type' : 'roberta',
'--use_cuda' : '',
'--batch_size' : 4
# '--learning_rate' : 0.5e-5
'--n_epochs' : 3,
# '--learning_rate' : 2e-5,
# '--model_type' : 'roberta',
# '--max_seq_len' : 128, #256,
# '--embeds_dropout' : 0.3,
# '--register_model' : ''
}
est = PyTorch(source_directory = script_folder,
compute_target = compute_target,
script_params = script_params,
entry_script = 'code/train.py',
entry_script = 'train.py',
pip_packages = pip_packages,
conda_packages = conda_packages,
use_gpu = True)
## Run
# run = exp.submit(est)
# run.wait_for_completion(show_output = True)
### Hyperparameters params
param_sampling = BayesianParameterSampling( {
'--learning_rate' : choice(0.5e-5, 1e-5, 2e-5, 3e-5),
# '--model_type' : choice('roberta','bert','albert')
'--model_type' : choice('distilbert','bert')
})
## Prepare HyperDrive Config
hdc = HyperDriveConfig(estimator=est,
hyperparameter_sampling = param_sampling,
policy = None, # NOTE: not possible for bayesian
primary_metric_name = 'f1macro',
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
max_total_runs = 40,
max_concurrent_runs = 1)
## Run hyperparameter tuning
hyperdrive_run = exp.submit(config=hdc)
hyperdrive_run.wait_for_completion(show_output = True)
## Get Results
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run)
if single_run:
run = exp.submit(est)
#Remove temp config
os.remove(fn_config_infer)
run.wait_for_completion(show_output = True)
else:
### Hyperparameters params
if language == 'en':
model_type = choice('roberta','bert','albert') #,'xlm-roberta'
elif language == 'de':
model_type = choice('distilbert','bert', 'roberta')
param_sampling = BayesianParameterSampling({
'--learning_rate' : choice(1e-5, 2e-5, 3e-5, 4e-5),
'--model_type' : model_type,
'--max_seq_len' : choice(64, 128, 256),
'--embeds_dropout' : choice(0.1, 0.2, 0.3, 0.4)
})
## Prepare HyperDrive Config
hdc = HyperDriveConfig(estimator=est,
hyperparameter_sampling = param_sampling,
policy = None, # NOTE: not possible for bayesian
primary_metric_name = 'f1macro',
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
max_total_runs = 80,
max_concurrent_runs = 1)
## Run hyperparameter tuning
hyperdrive_run = exp.submit(config=hdc)
#Remove temp config
os.remove(fn_config_infer)
hyperdrive_run.wait_for_completion(show_output = True)
## Get Results
# best_run = hyperdrive_run.get_best_run_by_primary_metric()

View file

@ -7,20 +7,23 @@ dependencies:
- gensim=3.8.1
- pip:
# - azureml-defaults for aml deployment
- azureml-sdk
- azureml-dataprep[pandas,fuse]
- azureml-sdk==1.0.85
- azureml-dataprep[pandas,fuse]==1.1.38
- mlflow==1.0.0
- azureml-mlflow==1.0.85
# - imblearn
- spacy==2.2.1
- transformers==2.3.0
# - farm==0.3.2
- 'git+https://github.com/deepset-ai/FARM.git'
# - flair
- git+https://github.com/zalandoresearch/flair.git
- transformers==2.4.1
- farm==0.4.1
# - 'git+https://github.com/deepset-ai/FARM.git'
- flair==0.4.5
# - git+https://github.com/zalandoresearch/flair.git
- azure-storage-blob
- streamlit
- selenium==3.141.0
- bs4
##DEMO ENV
- pillow
- streamlit==0.48.1
# - langdetect
# - lightgbm
# - pandas_ml
@ -32,9 +35,4 @@ dependencies:
# - matplotlib
# - seaborn
# temporary fix for flair
# pip install --upgrade git+https://github.com/zalandoresearch/flair.git
#python -m ipykernel install --user --name nlp --display-name "Python (nlp)"
# conda install pytorch torchvision cpuonly -c pytorch
#python -m ipykernel install --user --name nlp --display-name "Python (nlp)"

View file

@ -1 +0,0 @@
{"Id": null, "Scope": "/subscriptions/50324bce-875f-4a7b-9d3c-0e33679f5d72/resourceGroups/nlp/providers/Microsoft.MachineLearningServices/workspaces/nlp-ml"}

File diff suppressed because one or more lines are too long

View file

@ -12,15 +12,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"I0113 15:26:41.951092 3456 file_utils.py:35] PyTorch version 1.3.1 available.\n",
"I0113 15:26:45.606345 3456 custom.py:19] [INFO] Project Target Language **en**\n"
"I0110 11:14:41.291058 22612 custom.py:19] [INFO] Project Target Language **en**\n"
]
}
],
@ -32,76 +31,6 @@
"import ner"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2020-01-13 18:15:32,174 loading file C:/Users/makayser/Desktop/nlp_local//en-ner-ontonotes-fast-v0.4.pt\n"
]
}
],
"source": [
"nr = ner.NER()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"text = 'I have been using windows 7 and getting some errors with the following code 0x800700c1. What could this mean? I use windows 7 from steve ballmer'"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'value': '7',\n",
" 'start': 26,\n",
" 'end': 27,\n",
" 'label': 'CARDINAL',\n",
" 'source': 'flair'},\n",
" {'value': 'ballmer',\n",
" 'start': 137,\n",
" 'end': 144,\n",
" 'label': 'PERSON',\n",
" 'source': 'flair'},\n",
" {'value': 'windows 7',\n",
" 'start': 4,\n",
" 'end': 6,\n",
" 'label': 'Product',\n",
" 'source': 'list'},\n",
" {'value': 'steve ballmer',\n",
" 'start': 26,\n",
" 'end': 28,\n",
" 'label': 'Boss',\n",
" 'source': 'list'},\n",
" {'value': '0x800700c1.',\n",
" 'start': 76,\n",
" 'end': 87,\n",
" 'label': 'ERROR CODE',\n",
" 'source': 'Regex'}]"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nr.run(text)"
]
},
{
"cell_type": "markdown",
"metadata": {},

16
project/INSTRUCTIONS.md Normal file
View file

@ -0,0 +1,16 @@
# How to run your first project
## Steps
1. Upload your source dataset (raw) to AzureML Datasets
2. Upload your dependencies to AzureML Datasets (stopwords, custom ner)
3. Customize custom.py with any required pre-processing steps
4. Create a *.config.json where * = project name (see the sample config sketched below)
5. Run deploy/training.py
6. Run deploy/inference.py
## Requirements
To make the NLP toolkit work flawlessly, follow these naming requirements and best practices.
- Use language short forms (e.g. German = de, French = fr)
- Naming
- stopword list: stopwords-<language>.txt (tab delimited, utf-8)
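## Sample config
A minimal *.config.json sketch, following the fields used by the shipped msforum_de / msforum_en configs; the project name and the tasks block below are illustrative placeholders and must be adapted to your own tasks:
```json
{
    "name": "myproject",
    "language": "en",
    "environment": "dev",
    "data_dir": "./",
    "prepare": {
        "data_type": "json"
    },
    "tasks": {
        "1": {
            "type": "classification",
            "model_type": "bert"
        }
    }
}
```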

View file

@ -1,6 +1,8 @@
{
"name":"msforum_de",
"language": "de",
"environment" : "dev",
"data_dir" : "./",
"prepare" : {
"data_type" : "json"
},

View file

@ -1,6 +1,8 @@
{
"name":"msforum_en",
"language": "en",
"environment" : "dev",
"data_dir" : "./",
"prepare" : {
"data_type" : "json"
},

View file

@ -1,14 +1,15 @@
import argparse
# Run arguments
# example: python 1_getsites.py --language de-de --product xbox
parser = argparse.ArgumentParser()
parser.add_argument("--lang",
parser.add_argument("--language",
default="de-de",
type=str,
help="'en-us' or 'de-de")
parser.add_argument('--product',
default='windows',
type=str,
help="'windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect','edge','ie','musicandvideo'")
help="'windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge','ie','musicandvideo'")
args = parser.parse_args()
# Import and set driver
@ -20,14 +21,26 @@ driver = webdriver.Chrome(executable_path = path + 'chromedriver.exe')
product = args.product
language = args.language
# Scrape sites
for x in range(1, 8000):
driver.get(f'https://answers.microsoft.com/{language}/' + product + '/forum?sort=LastReplyDate&dir=Desc&tab=All&status=all&mod=&modAge=&advFil=&postedAfter=&postedBefore=&threadType=All&isFilterExpanded=false&page=' + str(x))
html = driver.page_source
if ('Es wurden keine Ergebnisse gefunden' in html) or ('No results found' in html):
print('##### EMPTY PAGE REACHED -> EXIT')
break
else:
with open('output-' + product + '.txt', 'a', encoding='utf-8') as myfile:
myfile.write(html+'\n\n\n')
print('Written:' + str(x))
languages = ['it-it', 'fr-fr', 'en-us']
products = ['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge', 'ie', 'musicandvideo']
#
for product in products:
print(f'[START] {language}, {product}.')
# Scrape sites
for x in range(1, 10000):
driver.get(f'https://answers.microsoft.com/{language}/' + product + '/forum?sort=LastReplyDate&dir=Desc&tab=All&status=all&mod=&modAge=&advFil=&postedAfter=&postedBefore=&threadType=All&isFilterExpanded=false&page=' + str(x))
html = driver.page_source
if ('Es wurden keine Ergebnisse gefunden' in html) or ('No results found' in html) or ('Aucun résultat trouvé' in html) or ('Nessun risultato trovato' in html) or ('Pubblica domande, segui le discussioni, condividi le tue conoscenze' in html) or ('Posten Sie Fragen, folgen Sie Diskussionen und teilen Sie Ihr Wissen' in html) or ('Post questions, follow discussions, share your knowledge' in html) or ('Publiez des questions, suivez des discussions et partagez vos connaissances' in html) or ('Publique preguntas, siga conversaciones y comparta sus conocimientos' in html):
print(f'[EXIT] EMPTY PAGE REACHED -> - {language}, {product}.')
break
else:
url_temp = re.findall(r'(https?://answers.microsoft.com/' + language + '/' + product + '/forum/[^\s]+)', html)
#url_temp = re.findall(r'(https?://answers.microsoft.com/' + lang + '/windows/forum/[^\s]+)', docs)
url_temp2 = [s.strip('"') for s in url_temp]
url_list = [x for x in url_temp2 if not x.endswith('LastReply')]
with open('output-' + product + '-' + language + '.txt', 'a', encoding='utf-8') as outfile:
# Prepare Links
outfile.write("\n".join(url_list))
#myfile.write(url_list+'\n\n\n')
if x % 500 == 0:
print(f'[STATUS] Page no. {str(x)} written.')

View file

@ -17,26 +17,26 @@ import argparse
# Run arguments
parser = argparse.ArgumentParser()
parser.add_argument("--lang",
parser.add_argument("--language",
default="de-de",
type=str,
help="'en-us' or 'de-de")
parser.add_argument('--product',
default='windows',
type=str,
help="['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect']")
help="['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge','ie','musicandvideo']")
args = parser.parse_args()
# Example: python 2_extract.py --language de-de --product windows
# Set params
lang = args.language
product = args.product
# Read File
docs = codecs.open("output-" + product + ".txt", 'r', encoding='utf-8').read()
# Prepare Links
url_temp = re.findall(r'(https?://answers.microsoft.com/' + lang + '/' + product + '/forum/[^\s]+)', docs)
url_temp2 = [s.strip('"') for s in url_temp]
url_list = [x for x in url_temp2 if not x.endswith('LastReply')]
#with open("output-" + product + "-" + lang + ".txt") as f:
# urls = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
#url_list = [x.strip() for x in urls]
# Extract text content
def getText(soup):
@ -83,12 +83,12 @@ def getUsernameAnswer(soup):
# Create date of question
def getDateQuestion(soup):
date_question = soup.find_all("span", "asking-text-asked-on-link")[0].text.replace("\nErstellt am ", "").replace("\n", "")
date_question = soup.find_all("span", "asking-text-asked-on-link")[0].text.replace("\nErstellt am ", "").replace("\nCréé le ", "").replace("\nCreado el ", "").replace("\nCreato il ", "").replace("\n", "")
return date_question
# Create date of answer
def getDateAnswer(soup):
date_answer = soup.find_all("span", "asking-text-asked-on-link")[1].text.replace("\nBeantwortet am ", "").replace("\n", "")
date_answer = soup.find_all("span", "asking-text-asked-on-link")[1].text.replace("\nBeantwortet am ", "").replace("\n Répondu le ", "").replace("\nRespondió el ", "").replace("\nRisposta il ", "").replace("\n", "")
return date_answer
# Get number of same cases
@ -120,18 +120,17 @@ def getTags(soup, product):
tags.append(subitem.text)
except:
tags = ""
return product + "," + ",".join(tags)
return f'{product},{",".join(tags)}'
# Put it all together
def scrapeMe(url, product):
print("Proceeding: ", url)
print("[URL] -", url)
### GET WEBSITE
try:
response = get(url)
except:
print("### ERROR")
print("[ERROR] - There is an issue with the respective website.\n")
html_soup = BeautifulSoup(response.text, 'html.parser')
lang = "de-de"
fileid = uuid.uuid4().hex
### GET TEXT
@ -185,17 +184,37 @@ def scrapeMe(url, product):
content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii=False)
### WRITE TO JSON FILE
with open("output-" + product + ".json", "a", encoding='utf-8') as file:
#with open("output-" + product + "-" + lang + ".json", "a", encoding='utf-8') as file:
with open(f"output-{lang}.json", "a", encoding='utf-8') as file:
file.write(content+",")
print("Written: File", fileid, "\n")
print(f"[SUCCESS] - File {fileid}\n")
######################################################
# LOOP THROUGH THE OUTPUT TEXT FILES AND CREATE JSON #
######################################################
for i, value in enumerate(url_list):
i += 1
products = ['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge', 'ie', 'musicandvideo']
for product in products:
try:
scrapeMe(value, product)
except Exception as e:
print(f'[ERROR] Failed to extract {value}')
continue
# Read File
docs = codecs.open(f"output-{product}-{lang}.txt", 'r', encoding='utf-8').read()
# Prepare Links
url_temp = re.findall(r'(https?://answers.microsoft.com/' + lang + '/' + product + '/forum/[^\s]+)', docs)
url_temp2 = [s.strip('"') for s in url_temp]
url_list = [x for x in url_temp2 if not x.endswith('LastReply')]
failed_url = []
for i, value in enumerate(url_list):
i += 1
try:
print(f'[STATUS] - {product}, {i}/{len(url_list)}')
scrapeMe(value, product)
except Exception as e:
failed_url.append(value)
print(f'[ERROR] - Failed to extract {value}')
continue
print(f"[DONE] - List for {product} of failed URLs: {failed_url},\nlen{failed_url}.")
except:
print(f"[ERROR] - 'output-{product}-{lang}.txt' does not exist.\n")