updated data handling, new naming, improved cat

This commit is contained in:
Martin Kayser 2020-02-14 15:42:52 +01:00
Parent 482bc0b0e2
Commit cfaddc2bf3
24 changed files with 757 additions and 580 deletions

View file

@ -1,10 +0,0 @@
data
temp
.vscode
saved_models
mlruns
cache
demo
notebook
scraper
.git

5
.gitignore vendored
View file

@ -110,4 +110,7 @@ config.ini
.vscode
saved_models
mlruns
cache
cache
scraper/*.txt
scraper/*.json
scraper/old

View file

@ -11,14 +11,22 @@ NLP Toolkit
## Live Demo
> http://nlp-demo-app.azurewebsites.net/
## Naming
### Assets
> \<project name\>(-\<task\>)-\<step\>(-\<environment\>)
- where step is one of [source, train, deploy], for data assets.
- where task is an integer referring to the task defined in the project parameters, for models.
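> Example (using the msforum_en project in the dev environment, as handled by the data and model upload helpers): the prepared training data set is registered as msforum_en-1-train-dev (or msforum_en-train-dev when the task is omitted), and the task 1 model as msforum_en-1-dev.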
## TODO
### Project
- [x] Move to single project config file (for deployment and scoring)
- [ ] Overview architecture
- [ ] Detailed documentation
- [ ] Data storage strategy
### Prepare
- [x] source from AML datastore
- [ ] output to AML datastore
- [x] integrate with AML datastore
- [ ] connect to CosmosDB (pipeline ready)
- [ ] **(IP)** document cracking to standardized format
### Classification
- [ ] **(IP)** Multi label support
- [ ] integrate handling for larger documents
@ -27,21 +35,18 @@ NLP Toolkit
- [ ] upload best model to AML Model
### NER
- [ ] Improve duplicate handling
- [x] custom NER
- [x] basic custom NER
### Rank
- [ ] **(IP)** Improve answer quality
### Deployment
- [x] Collect, Package and upload assets
- [ ] **(IP)** Param script for deploy (incl language param!)
- [ ] Deploy to Azure Function (without AzureML)
### Notebooks
- [x] review prepared data
- [ ] **(IP)** review model results (auto generate after each training step)
- [ ] review model bias (auto generate after each training step)
### Pipeline
- [ ] **(IP)** document cracking to standardized format
### DevOps
- [ ] Yaml based infrastructure deployment
- [ ] Integrate with Azure/GitHub DevOps
- [ ] available models benchmark
### Tests
- [ ] integrate testing framework
- [ ] placeholder for custom data loading test
@ -50,17 +55,17 @@ NLP Toolkit
### New Features (TBD)
- Summarization
- Deployable feedback loop
- Integration with GitHub Actions
# Acknowledgements
- Verseagility is built in parts using the following:
- - [Transformers](https://github.com/huggingface/pytorch-transformers) by HuggingFace
- - [FARM](https://github.com/deepset-ai/FARM/) by deepset ai
- - [spaCy](https://github.com/explosion/spaCy/) by Explosion ai
- - [flair](https://github.com/flairNLP/flair/) by Zalando Research
- - [gensim](https://radimrehurek.com/gensim/)
- Verseagility is built in part using the following:
- [Transformers](https://github.com/huggingface/pytorch-transformers) by HuggingFace
- [FARM](https://github.com/deepset-ai/FARM/) by deepset ai
- [spaCy](https://github.com/explosion/spaCy/) by Explosion ai
- [flair](https://github.com/flairNLP/flair/) by Zalando Research
- [gensim](https://radimrehurek.com/gensim/)
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.

View file

View file

@ -9,6 +9,7 @@ Example (in the command line):
> python code/classification.py --task 1 --model_type bert --use_cuda
"""
import os
from pathlib import Path
import json
import argparse
@ -40,7 +41,8 @@ logger = he.get_logger(location=__name__)
aml_run = he.get_context()
def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, evaluate_every,
use_cuda, max_seq_len, learning_rate, do_lower_case, register_model):
use_cuda, max_seq_len, learning_rate, do_lower_case,
register_model, save_model=True, early_stopping=True):
language = cu.params.get('language')
# Check task
if cu.tasks.get(str(task)).get('type') != 'classification':
@ -48,12 +50,15 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
# Data
dt_task = dt.Data(task=task)
## Download training files
if not os.path.isfile(dt_task.fn_lookup['fn_train']):
dt_task.download(task=task, step='train')
# Settings
set_all_seeds(seed=42)
use_amp = None
device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
lang_model = he.farm_model_lookup.get(model_type).get(language)
lang_model = he.get_farm_model(model_type, language)
save_dir = dt_task.model_dir.replace('model_type', model_type)
label_list = dt_task.load('fn_label', header=None)[0].to_list()
@ -63,6 +68,9 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
aml_run.log('language', language)
aml_run.log('n_epochs', n_epochs)
aml_run.log('batch_size', batch_size)
aml_run.log('learning_rate', learning_rate)
aml_run.log('embeds_dropout', embeds_dropout)
aml_run.log('max_seq_len', max_seq_len)
aml_run.log('lang_model', lang_model)
aml_run.log_list('label_list', label_list)
except:
@ -85,7 +93,6 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
# AML log
try:
aml_run.log('acc', acc.get('acc'))
aml_run.log('acc_backup', acc)
aml_run.log('f1macro', f1macro)
aml_run.log('f1micro', f1micro)
except:
@ -143,12 +150,15 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
# An early stopping instance can be used to save the model that performs best on the dev set
# according to some metric and stop training when no improvement is happening for some iterations.
earlystopping = EarlyStopping(
metric="f1_macro", mode="max", # use f1_macro from the dev evaluator of the trainer
# metric="loss", mode="min", # use loss from the dev evaluator of the trainer
save_dir=save_dir, # where to save the best model
patience=1 # number of evaluations to wait for improvement before terminating the training
)
if early_stopping:
earlystopping = EarlyStopping(
metric="f1_macro", mode="max", # use f1_macro from the dev evaluator of the trainer
# metric="loss", mode="min", # use loss from the dev evaluator of the trainer
save_dir=save_dir, # where to save the best model
patience=2 # number of evaluations to wait for improvement before terminating the training
)
else:
earlystopping = None
trainer = Trainer(
model=model,
@ -170,8 +180,12 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
# defined with the EarlyStopping instance
# The model we have at this moment is the model from the last training epoch that was carried
# out before early stopping terminated the training
model.save(save_dir)
processor.save(save_dir)
if save_model:
model.save(save_dir)
processor.save(save_dir)
if register_model:
dt_task.upload(save_dir, task, destination='model')
def run():
# Run arguments
@ -196,7 +210,7 @@ def run():
action='store_true',
help="Use CUDA for training")
parser.add_argument('--n_epochs',
default=5,
default=3,
type=int,
help='')
parser.add_argument('--batch_size',
@ -216,7 +230,7 @@ def run():
type=int,
help='')
parser.add_argument('--learning_rate',
default=0.5e-5,
default=3e-5,
type=float,
help='')
parser.add_argument('--do_lower_case',

View file

@ -16,10 +16,10 @@ logger = he.get_logger(location=__name__)
############################################
# Load parameters from config
params = he.get_project_config('msforum_en.config.json')
params = he.get_project_config('msforum_de.config.json') #TODO: select parameters
tasks = params.get('tasks')
logger.info(f'[INFO] *** Project Target Language -> {params.get("language")} ***')
logger.info(f'[INFO] *** Project Target Environment -> {params.get("environment")} ***')
logger.warning(f'[INFO] *** Project target lang \t-> {params.get("language")} \t***')
logger.warning(f'[INFO] *** Project target env \t-> {params.get("environment")} \t***')
############################################
##### Data Preparation
@ -28,10 +28,10 @@ logger.info(f'[INFO] *** Project Target Environment -> {params.get("environment"
def prepare_source(data):
"""Normalize source data for use in downstram tasks.
NOTE: should be task agnostic"""
data_norm = pd.io.json.json_normalize(data, sep='_').to_dict(orient='records')
data_norm = pd.json_normalize(data, sep='_').to_dict(orient='records')
return pd.read_json(json.dumps(data_norm))
def remove(line):
def remove(line):
line = re.sub(r'Original Title\:', '', line)
return line
@ -59,9 +59,9 @@ def filter_qa(data):
_temp = data[data.answer_markedAsAnswer == 'true'].reset_index(drop=True).copy()
if len(_temp) == 0:
_temp = data[data.answer_markedAsAnswer == True].reset_index(drop=True).copy()
logger.info(f'Data Length : {len(_temp)} \t- after marked as answer ')
logger.warning(f'Data Length : {len(_temp)} \t- after marked as answer ')
# Filter by UpVotes
# _temp = _temp[_temp['answer_upvotes'] > 1].reset_index(drop=True).copy() #TODO: evaluate
logger.info(f'Data Length : {len(_temp)} \t- after min upvotes of 2')
logger.warning(f'Data Length : {len(_temp)} \t- after min upvotes of 2')
return _temp

View file

@ -1,10 +1,16 @@
"""
Helper function for data management
Includes source & prepared data, as well as
model assets.
"""
import pandas as pd
import json
import os
import pathlib
from io import StringIO
from pathlib import Path
from azureml.core import Dataset, Run, Workspace
from azureml.core import Run, Dataset, Model
# from azure.storage.blob import BlockBlobService
# Custom functions
@ -13,22 +19,13 @@ sys.path.append('../code')
import helper as he
import custom as cu
# Get config
run_config = he.get_config()
logger = he.get_logger(location=__name__)
flair_model_lookup = {
'en' : 'en-ner-ontonotes-fast-v0.4.pt',
'de' : 'ner-multi-fast.pt',
'xx' : 'ner-multi-fast.pt'
}
class Data():
def __init__(self, fn_source = 'answers_microsoft_lang.json',
task = 1,
version = 1,
env = 1,
inference = False
def __init__(self, task = 1,
version = 1,
env = 1,
inference = False
):
# Parameters
self.task = task
@ -38,87 +35,164 @@ class Data():
# Directories
## Asset directory
if inference:
## Assuming deployment via AzureML
try:
self.data_dir = os.environ['AZUREML_MODEL_DIR']
except KeyError:
logger.info(f'[WARNING] Not running on AML')
self.data_dir = he.run_config['path']['infer_dir']
## Assuming deployment via AzureML
if 'AZUREML_MODEL_DIR' in os.environ:
self.data_dir = os.environ['AZUREML_MODEL_DIR']
else:
self.data_dir = he.run_config['path']['data_dir']
self.data_dir = cu.params.get('data_dir')
os.makedirs(self.data_dir, exist_ok=True)
logger.warning(f'[INFO] Root data directory: {self.data_dir}')
## Model directory
self.model_dir = str(Path(self.data_dir + f"/model_type-l{self.language}-t{self.task}").resolve())
### If present, replace language tag in name
fn_source = fn_source.replace('lang', self.language)
self.model_dir = str(Path(self.data_dir + f"/model_type-l{self.language}-t{self.task}-{self.env}").resolve())
### NOTE: source file expected to follow naming convention, otherwise edit here
self.fn_source = f"{cu.params.get('name')}-source.{cu.params.get('prepare').get('data_type')}"
self.fp_train = f"{cu.params.get('name')}-train-{cu.params.get('environment')}"
# Lookup
self.fn_lookup = {
'fn_source' : fn_source,
'fn_prep' : f'data_l{self.language}.txt',
'fn_clean' : f'clean_l{self.language}_t{self.task}.txt',
'fn_train' : f'train_l{self.language}_t{self.task}.txt',
'fn_test' : f'test_l{self.language}_t{self.task}.txt',
'fn_label' : f'label_l{self.language}_t{self.task}.txt',
## DATASTORE
'fn_source' : self.fn_source,
'fp_train' : self.fp_train,
## LOCAL
'fp_data' : os.path.abspath(self.data_dir),
'fn_prep' : f'{self.data_dir}/data-l{self.language}.txt',
'fn_clean' : f'{self.data_dir}/clean-l{self.language}-t{self.task}.txt',
'fn_train' : f'{self.data_dir}/train-l{self.language}-t{self.task}.txt',
'fn_test' : f'{self.data_dir}/test-l{self.language}-t{self.task}.txt',
'fn_label' : f'{self.data_dir}/label-l{self.language}-t{self.task}.txt',
'fn_eval' : f'TODO:',
## ASSETS #TODO: auto generate fetching param list
'fn_asset' : f'{self.data_dir}/assets_{self.language}.zip',
## ASSETS
'fn_asset' : f'{self.data_dir}/assets-{self.language}.zip',
'fn_cat' : self.model_dir.replace('model_type', cu.params.get('tasks').get('1').get('model_type')),
'fn_rank' : f'{self.data_dir}/data_l{self.language}_t4.pkl',
'fn_rank' : f'{self.data_dir}/data-l{self.language}-t4.pkl',
'fn_ner_list' : f'{self.data_dir}/ner.txt',
'fn_ner_flair' : f'{self.data_dir}/{flair_model_lookup[self.language]}',
'fn_ner_flair' : f'{self.data_dir}/{he.get_flair_model(self.language, "fn")}',
'fn_ner_spacy' : f'TODO:',
'fn_names' : f'{self.data_dir}/names.txt',
'fn_stopwords' : f'{self.data_dir}/stopwords_{self.language}.txt',
}
'fn_stopwords' : f'{self.data_dir}/stopwords-{self.language}.txt',
} #TODO: when to link data dir, when only filename?
# for t in cu.params.get('tasks'): #TODO: auto generate fetching param list
# task_property = cu.params.get('tasks').get(t)
# self.fn_lookup[f'fn_{task_property.get('type')}']
# Files
self.fn_source = fn_source
self.fn_data = self.fn_lookup['fn_prep']
def download(self, container=None, fn_blob=None, fn_local=None,
# AML Components
try:
run = Run.get_context()
self.ws = run.experiment.workspace
except Exception as e:
logger.warning(f'[WARNING] AML Workspace not loaded -> {e}')
### DOWNLOAD
def _download_blob(self):
# self.block_blob_service = BlockBlobService(account_name=run_config['blob']['account'],
# account_key=run_config['blob']['key'])
# if no_run_version:
# self.block_blob_service.get_blob_to_path(container, fn_blob, fn_local)
# elif not encrypted:
# self.block_blob_service.get_blob_to_path(container,
# str(fn_blob).replace('./',''),
# fn_local)
# elif encrypted:
# self.block_blob_service.get_blob_to_path(container,
# str(fn_blob).replace('.txt', '.enc').replace('./',''),
# fn_local)
# if to_dataframe:
# with open(str(fn_local), "rb") as text_file:
# _data = text_file.read()
# if encrypted:
# df = decrypt(_data, dataframe=True)
# else:
# df = pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
# df.to_csv(fn_local, sep='\t', encoding='utf-8', index=False)
pass
def _download_datastore(self):
pass
def _download_model(self):
#NOTE: not needed when running on AML compute
pass
def download(self, dataset_name=None,
task='',
step='',
container=None,
fn_blob=None,
fn_local=None,
no_run_version=False,
encrypted=False,
to_dataframe=False,
source='blob'):
"""Download file from Azure"""
source='datastore'):
"""Download file from online storage"""
if source == 'blob':
self.block_blob_service = BlockBlobService(account_name=run_config['blob']['account'],
account_key=run_config['blob']['key'])
if no_run_version:
self.block_blob_service.get_blob_to_path(container, fn_blob, fn_local)
elif not encrypted:
self.block_blob_service.get_blob_to_path(container,
str(fn_blob).replace('./',''),
fn_local)
elif encrypted:
self.block_blob_service.get_blob_to_path(container,
str(fn_blob).replace('.txt', '.enc').replace('./',''),
fn_local)
if to_dataframe:
with open(str(fn_local), "rb") as text_file:
_data = text_file.read()
if encrypted:
df = decrypt(_data, dataframe=True)
else:
df = pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
df.to_csv(fn_local, sep='\t', encoding='utf-8', index=False)
self._download_blob() #TODO:
elif source == 'datastore':
run = Run.get_context()
ws = run.experiment.workspace
dataset_name = self.fn_source.split('.')[0]
Dataset.get_by_name(workspace=ws, name=dataset_name).download(self.data_dir, overwrite=True)
logger.info(f'[INFO] Downloaded data from data store {dataset_name}')
if dataset_name is None:
dataset_name = f'{cu.params.get("name")}-{task}-{step}-{cu.params.get("environment")}'
try:
Dataset.get_by_name(workspace=ws, name=dataset_name).download(self.data_dir, overwrite=True)
except Exception as e:
logger.warning(f'[WARNING] Dataset {dataset_name} not found. Trying without <env>. -> {e}')
Dataset.get_by_name(workspace=ws, name=dataset_name.replace(f'_{self.env}', '')).download(self.data_dir, overwrite=True)
else:
logger.warning(f'[INFO] Downloaded data from data store {dataset_name}')
elif source == 'model':
pass
else:
logger.info('[ERROR] Source <{source}> does not exist. Can not download file.')
logger.warning(f'[ERROR] Source <{source}> does not exist. Cannot download file.')
def upload(self):
#TODO:
pass
### UPLOAD
def _upload_dataset(self, fp, task, step, ws):
"""Upload dataset to AzureML Datastore
Note:
- only works for a single file or directory
- not meant for model assets
"""
target_name = f'{cu.params.get("name")}-{task}-{step}-{cu.params.get("environment")}'
datastore = ws.get_default_datastore()
datastore.upload(src_dir = str(fp),
target_path = target_name,
overwrite = True,
show_progress = True)
ds = Dataset.File.from_files([(datastore, target_name)])
#ds = Dataset.File.from_files(path=[fp])
ds.register(workspace = ws,
name = target_name,
description = f'Data set for {step}',
create_new_version = True)
def _upload_model(self, fp, task, ws):
"""Upload model to AzureML Models"""
Model.register(workspace=ws,
model_name=f'{cu.params.get("name")}-{task}-{cu.params.get("environment")}',
model_path=fp, # Local file to upload and register as a model.
description='Model assets',
tags={'task' : task,
# 'model_type': model_type,
'language': cu.params.get('language'),
'environment': cu.params.get('environment')})
def upload(self, fp, task='', step='', destination='model'):
if fp in self.fn_lookup:
fp = self.fn_lookup[fp]
if destination == 'dataset':
self._upload_dataset(fp, task, step, self.ws)
elif destination == 'model':
self._upload_model(fp, task, self.ws)
else:
logger.warning(f'[ERROR] Destination <{destination}> does not exist. Cannot upload file.')
logger.warning(f'[INFO] Upload to <{destination}> completed.')
## PROCESS
def process(self, data_type='json', save=True):
"""Convert source data to normalized data structure"""
# Load source data
if data_type == 'json':
with open(self.data_dir + self.fn_source, encoding='utf-8') as fp:
@ -126,7 +200,7 @@ class Data():
elif data_type == 'dataframe':
data = self.load('fn_source')
else:
logger.info('SOURCE DATA TYPE NOT SUPPORTED')
logger.warning('SOURCE DATA TYPE NOT SUPPORTED')
# Custom steps
df = cu.prepare_source(data)
@ -137,8 +211,15 @@ class Data():
return df
def save(self, data, fn, header=True):
data.to_csv(self.data_dir + self.fn_lookup[fn], sep='\t', encoding='utf-8', index=False, header=header)
logger.info(f'SAVED: {self.fn_lookup[fn]}')
data.to_csv(self.fn_lookup[fn], sep='\t', encoding='utf-8', index=False, header=header)
logger.warning(f'SAVED: {self.fn_lookup[fn]}')
def load(self, fn, header=0):
return pd.read_csv(self.data_dir + self.fn_lookup[fn], sep='\t', encoding='utf-8', header=header)
def load(self, fn, header=0, encoding='utf-8', file_type='dataframe'):
if file_type == 'dataframe':
return pd.read_csv(self.fn_lookup[fn], sep='\t', encoding=encoding, header=header)
elif file_type == 'list':
with open(self.fn_lookup[fn], encoding=encoding) as f:
data = f.readlines()
return data
else:
raise Exception(f'ERROR - file type ({file_type}) not supported in data loader')

View file

@ -15,12 +15,8 @@ from flair.models import SequenceTagger
def get_logger(level='info', location = None, excl_az_storage=True):
'''Get runtime logger'''
# Location
if location is None:
logger = logging.getLogger(__name__)
else:
logger = logging.getLogger(location)
global logger
# Exceptions
if excl_az_storage:
logging.getLogger("azure.storage.common.storageclient").setLevel(logging.WARN)
@ -31,16 +27,20 @@ def get_logger(level='info', location = None, excl_az_storage=True):
elif level == 'debug':
_level = logging.DEBUG
elif level == 'warning':
_level = logging.WARN
_level = logging.WARNING
# Format
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = _level)
return logger
# Location
if location is None:
logger = logging.getLogger(__name__)
else:
logger = logging.getLogger(location)
logger = get_logger(location=__name__)
return logger
def get_context():
'''Get AML Run Context for Logging to AML Services'''
@ -48,7 +48,7 @@ def get_context():
from azureml.core import Run
run = Run.get_context()
except Exception as e:
logger.info(f'[WARNING] Azure ML not loaded. Nothing will be logged. {e}')
logger.warning(f'[WARNING] Azure ML not loaded. Nothing will be logged. {e}')
run = ''
return run
@ -56,20 +56,6 @@ def get_context():
##### Config
############################################
def get_config():
# Get config
run_config = configparser.ConfigParser()
run_config.read('./code/config.ini')
if 'path' not in run_config:
run_config.read('./config.ini')
if 'path' not in run_config:
run_config.read('../config.ini')
if 'path' not in run_config:
logger.info('[ERROR] Could not find correct config.ini.')
return run_config
run_config = get_config()
def get_project_config(fn):
try:
with open(f'./project/{fn}', encoding='utf-8') as fp:
@ -79,30 +65,48 @@ def get_project_config(fn):
with open(f'../project/{fn}', encoding='utf-8') as fp:
params = json.load(fp)
except FileNotFoundError:
## Inference Config
with open('./code/config.json', encoding='utf-8') as fp:
params = json.load(fp)
try:
## Training Config
with open('config.json', encoding='utf-8') as fp:
params = json.load(fp)
except FileNotFoundError:
## Inference Config
with open('./code/config.json', encoding='utf-8') as fp:
params = json.load(fp)
return params
def get_config():
#TODO: remove this, new use: keys to env, settings to params
# Get config
run_config = configparser.ConfigParser()
run_config.read('./code/config.ini')
if 'path' not in run_config:
run_config.read('./config.ini')
if 'path' not in run_config:
run_config.read('../config.ini')
if 'path' not in run_config:
logger.warning('[ERROR] Could not find correct config.ini.')
return run_config
############################################
##### Azure
############################################
def get_credentials():
'''Retrieve Service Principal Credentials'''
credentials = ServicePrincipalCredentials(
client_id = run_config['sp']['client_id'],
secret = run_config['sp']['secret'],
tenant = run_config['sp']['tenant']
)
return credentials
# def get_credentials():
# '''Retrieve Service Principal Credentials'''
# credentials = ServicePrincipalCredentials(
# client_id = run_config['sp']['client_id'],
# secret = run_config['sp']['secret'],
# tenant = run_config['sp']['tenant']
# )
# return credentials
def get_secret():
'''Retrieve Secret from KeyVault'''
client = KeyVaultClient(get_credentials())
vault_url = run_config['keyvault']['url']
vault_name = run_config['keyvault']['name_data']
return client.get_secret(vault_url, vault_name, "").value
# def get_secret():
# '''Retrieve Secret from KeyVault'''
# client = KeyVaultClient(get_credentials())
# vault_url = run_config['keyvault']['url']
# vault_name = run_config['keyvault']['name_data']
# return client.get_secret(vault_url, vault_name, "").value
############################################
##### ML Frameworks
@ -115,14 +119,14 @@ farm_model_lookup = {
'xx':'bert-base-multilingual-cased',
'en':'bert-base-cased',
'de':'bert-base-german-cased',
'fr':'camembert-base',
'cn':'bert-base-chinese'
},
'roberta' : {
'en' : 'roberta-base'
},
'xlm-roberta' : {
'xx' : 'xlm-roberta-multi', #TODO: check if it exists?
'en' : 'xlm-roberta-large'
'xx' : 'xlm-roberta-base'
},
'albert' : {
'en' : 'albert-base-v2'
@ -132,40 +136,66 @@ farm_model_lookup = {
}
}
def get_farm_model(model_type, language):
mt = farm_model_lookup.get(model_type)
if mt is not None:
ml = mt.get(language)
if ml is None:
ml = mt.get('xx')
if ml is None:
raise Exception('No Transformer/FARM model found')
return ml
spacy_model_lookup = {
'en':'en_core_web_sm',
'de':'de_core_news_sm',
'fr':'fr_core_news_sm',
'es':'es_core_news_sm',
'it':'it_core_news_sm',
'xx':'xx_ent_wiki_sm'
}
flair_model_lookup = {
'en' : 'ner-ontonotes-fast',
'de' : 'ner-multi-fast',
'xx' : 'ner-multi-fast'
}
def load_spacy_model(language='xx', disable=[]):
try:
nlp = spacy.load(spacy_model_lookup[language], disable=disable)
except OSError:
logging.info(f'[INFO] Download spacy language model for {language}')
logging.warning(f'[INFO] Download spacy language model for {language}')
from spacy.cli import download
download(spacy_model_lookup[language])
nlp = spacy.load(spacy_model_lookup[language], disable=disable)
return nlp
flair_model_lookup = {
'en' : 'ner-ontonotes-fast',
'de' : 'ner-multi-fast',
'xx' : 'ner-multi-fast'
}
flair_model_file_lookup = {
'en' : 'en-ner-ontonotes-fast-v0.4.pt',
'de' : 'ner-multi-fast.pt',
'xx' : 'ner-multi-fast.pt'
}
def get_flair_model(language, object_type):
if object_type == 'model':
lookup = flair_model_lookup
elif object_type == 'fn':
lookup = flair_model_file_lookup
m = lookup.get(language)
if m is None:
m = lookup.get('xx')
return m
def load_flair_model(path=None, language='xx', task='ner'):
if task == 'ner':
if path is None:
model = SequenceTagger.load(flair_model_lookup.get(language))
model = SequenceTagger.load(get_flair_model(language, 'model'))
else:
model = SequenceTagger.load(path)
else:
logging.info(f'FLAIR MODEL TASK NOT SUPPORTED --> {task}')
logging.warning(f'FLAIR MODEL TASK NOT SUPPORTED --> {task}')
model = None
return model
############################################
##### Dataframe
@ -191,7 +221,7 @@ def validate_concat(col1, col2, max_len=1000):
new_line = sub + '. ' + des
text_concat.append(new_line[:max_len])
except Exception as e:
logger.info(f'[WARNING] Validate Concat - {e}')
logger.warning(f'[WARNING] Validate Concat - {e}')
if 'float' in str(e):
text_concat.append(str(des))
else:
@ -215,34 +245,34 @@ def append_ner(v, s, e, l, t=''):
##### Cryptography
############################################
def decrypt(token, dataframe=False):
''' Decrypt symetric object using Fernet '''
secret = get_secret()
f = Fernet(bytes(secret, encoding='utf-8'))
token = f.decrypt(token)
if dataframe:
_data = StringIO(str(token, 'utf-8'))
return pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
else:
return token
# def decrypt(token, dataframe=False):
# ''' Decrypt symetric object using Fernet '''
# secret = get_secret()
# f = Fernet(bytes(secret, encoding='utf-8'))
# token = f.decrypt(token)
# if dataframe:
# _data = StringIO(str(token, 'utf-8'))
# return pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
# else:
# return token
def decrypt_and_save(fn):
with open(fn, "rb") as text_file:
token = text_file.read()
content = StringIO(str(decrypt(token), 'utf-8'))
df = pd.read_csv(content, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
df.to_csv(fn.replace('.enc', '.txt'), sep='\t', encoding='utf-8', index=False) #TODO: match encrypt fn out
# def decrypt_and_save(fn):
# with open(fn, "rb") as text_file:
# token = text_file.read()
# content = StringIO(str(decrypt(token), 'utf-8'))
# df = pd.read_csv(content, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
# df.to_csv(fn.replace('.enc', '.txt'), sep='\t', encoding='utf-8', index=False) #TODO: match encrypt fn out
def encrypt(token, dataframe=False):
''' Encrypt symetric object using Fernet '''
secret = get_secret()
f = Fernet(bytes(secret, encoding='utf-8'))
if dataframe:
token = bytes(to_csv_string(token), encoding='utf-8')
return f.encrypt(token)
# def encrypt(token, dataframe=False):
# ''' Encrypt symetric object using Fernet '''
# secret = get_secret()
# f = Fernet(bytes(secret, encoding='utf-8'))
# if dataframe:
# token = bytes(to_csv_string(token), encoding='utf-8')
# return f.encrypt(token)
def encrypt_and_save(fn, data, file_type='.txt'):
data = encrypt(bytes(data, encoding='utf-8'))
fn_new = fn.replace(file_type, '.enc')
with open(fn_new, "wb") as text_file:
text_file.write(data)
# def encrypt_and_save(fn, data, file_type='.txt'):
# data = encrypt(bytes(data, encoding='utf-8'))
# fn_new = fn.replace(file_type, '.enc')
# with open(fn_new, "wb") as text_file:
# text_file.write(data)

View file

@ -32,16 +32,16 @@ def score(task):
elif task_type == 'qa':
return rank.Rank(task=task, inference=True)
else:
logger.info('TASK TYPE NOT SUPPORTED')
logger.warning('TASK TYPE NOT SUPPORTED')
return None
def init():
global task_models, prepare_classes
# Unpack model dependencies
dt_init = dt.Data(inference=True)
shutil.unpack_archive(dt_init.fn_lookup['fn_asset'], dt_init.data_dir, 'zip')
logger.info(f'[INFO] Unpacked model assets from {dt_init.fn_lookup["fn_asset"]}')
# dt_init = dt.Data(inference=True)
# shutil.unpack_archive(dt_init.fn_lookup['fn_asset'], dt_init.data_dir, 'zip')
# logger.warning(f'[INFO] Unpacked model assets from {dt_init.fn_lookup["fn_asset"]}')
# Load models & prepare steps
task_models = []
@ -54,21 +54,18 @@ def init():
'params' : cu.tasks.get(str(task))
})
prepare_classes[task] = pr.Clean(task=task, inference=True)
logger.info(f'[INFO] Loaded model and prepare steps for task {task}.')
def run_model():
pass
logger.warning(f'[INFO] Loaded model and prepare steps for task {task}.')
def run(req):
# Load request
req_data = json.loads(req)
req_data = json.loads(req)[0]
# Prepare text
if 'subject' in req_data[0]:
s = req_data[0]['subject']
if 'subject' in req_data:
s = req_data['subject']
else:
s = ''
if 'body' in req_data[0]:
b = req_data[0]['body']
if 'body' in req_data:
b = req_data['body']
else:
b = ''
text = he.validate_concat(s, b)
@ -82,6 +79,7 @@ def run(req):
result = tm['infer'].inference_from_dicts(dicts=[{"text": clean, "cat": _cat}])
try:
# Special treatment for classification (FARM)
##TODO: standardize for all
_temp = []
for r in result[0]['predictions']:
_temp.append(dict(
@ -98,7 +96,7 @@ def run(req):
"params" : tm['params'],
"result" : result
})
logger.info(f'[INFO] Completed {tm["task"]}.')
logger.warning(f'[INFO] Completed task {tm["task"]}.')
return res
if __name__ == '__main__':

View file

@ -29,7 +29,7 @@ import helper as he
# Custom FLAIR element for spacy pipeline
class FlairMatcher(object):
name = "flair"
##TODO: run on stored headless models
def __init__(self, path):
self.tagger = he.load_flair_model(path=path)
@ -57,7 +57,7 @@ class CustomNER():
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
lang_model = he.farm_model_lookup.get(model_type).get(language)
lang_model = he.get_farm_model(model_type, language)
save_dir = dt_task.model_dir.replace('model_type', model_type)
# ner_labels = dt_task.load('fn_label', header=None)[0].to_list() TODO:
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

View file

@ -48,6 +48,7 @@ class Clean():
"""
def __init__(self, task,
download=True,
inference=False):
self.task = task
self.language = cu.params.get('language')
@ -55,6 +56,11 @@ class Clean():
# Load data class
self.dt = dt.Data(task=self.task, inference=inference)
# Download data, if needed
if download:
self.dt.download(dataset_name = self.dt.fn_lookup.get('fn_source').split('.')[0], source='datastore')
self.dt.download(dataset_name = self.dt.fn_lookup.get('fp_train'), source='datastore')
# Load spacy model
self.nlp = he.load_spacy_model(language=self.language, disable=['ner','parser','tagger'])
@ -62,21 +68,20 @@ class Clean():
stopwords_active = []
## Load names
try:
with open(self.dt.fn_lookup['fn_names'], encoding='utf-8') as f:
names = f.readlines()
names = self.dt.load('fn_names', file_type='list')
stopwords_active = stopwords_active + names
except Exception as e:
logger.info(f'[WARNING] No names list loaded: {e}')
logger.warning(f'[WARNING] No names list loaded: {e}')
## Load stopwords
try:
with open(self.dt.fn_lookup['fn_stopwords'], encoding='utf-8') as f:
stopwords = f.readlines()
stopwords = self.dt.load('fn_stopwords', file_type='list')
stopwords_active = stopwords_active + stopwords
except Exception as e:
logger.info(f'[WARNING] No stopwords list loaded: {e}')
logger.info(f'[INFO] Active stopwords list lenght: {len(stopwords_active)}')
logger.warning(f'[WARNING] No stopwords list loaded: {e}')
## Add to Spacy stopword list
logger.warning(f'[INFO] Active stopwords list length: {len(stopwords_active)}')
for w in stopwords_active:
self.nlp.vocab[w.replace('\n','')].is_stop = True
@ -254,23 +259,21 @@ class Clean():
return_token = True
)[0]
else:
logger.info('[WARNING] No transform by task found.')
logger.warning('[WARNING] No transform by task found.')
return text[0]
def prepare_classification(task, do_format, train_split, min_cat_occurance,
min_char_length, download_source):
# Get clean object
cl = Clean(task=task)
if download_source:
cl.dt.download(source='datastore')
# Get clean object
cl = Clean(task=task, download=download_source)
# Load data
if do_format:
data = cl.dt.process(data_type=cu.params.get('prepare').get('data_type'))
else:
data = cl.dt.load('fn_prep')
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
# Load text & label field
text_raw = cu.load_text(data)
@ -286,21 +289,21 @@ def prepare_classification(task, do_format, train_split, min_cat_occurance,
# Filter by length
data = he.remove_short(data, 'text', min_char_length=min_char_length)
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
# Remove duplicates
data_red = data.drop_duplicates(subset=['text'])
logger.info(f'Data Length : {len(data_red)}')
logger.warning(f'Data Length : {len(data_red)}')
# Min class occurance
data_red = data_red[data_red.groupby('label').label.transform('size') > min_cat_occurance]
logger.info(f'Data Length : {len(data_red)}')
logger.warning(f'Data Length : {len(data_red)}')
data_red = data_red.reset_index(drop=True).copy()
# Label list
label_list = data_red.label.drop_duplicates()
logger.info(f'Excluded labels: {list(set(label_list_raw)-set(label_list))}')
logger.warning(f'Excluded labels: {list(set(label_list_raw)-set(label_list))}')
# Split data
strf_split = StratifiedShuffleSplit(n_splits = 1, test_size=(1-train_split), random_state=200)
@ -314,26 +317,28 @@ def prepare_classification(task, do_format, train_split, min_cat_occurance,
cl.dt.save(df_cat_test[['text','label']], fn = 'fn_test')
cl.dt.save(label_list, fn = 'fn_label', header=False)
# Upload data
# cl.dt.upload('fn_prep', task=task, step='prep', destination='dataset')
cl.dt.upload('fp_data', task=task, step='train', destination='dataset')
def prepare_ner(task, do_format=True):
pass
def prepare_qa(task, do_format, min_char_length, download_source):
# Get clean object
cl = Clean(task=task)
if download_source:
cl.dt.download(source='datastore')
# Get clean object
cl = Clean(task=task, download=download_source)
# Load data
if do_format:
data = cl.dt.process(data_type=cu.params.get('prepare').get('data_type'))
else:
data = cl.dt.load('fn_prep')
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
# Filter relevant question answer pairs
data = cu.filter_qa(data)
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
# Load question & answer fields
question, answer = cu.load_qa(data)
@ -370,19 +375,24 @@ def prepare_qa(task, do_format, min_char_length, download_source):
# Filter by length
data = he.remove_short(data, 'question_clean', min_char_length=min_char_length)
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
# Remove duplicates
data = data.drop_duplicates(subset=['question_clean'])
logger.info(f'Data Length : {len(data)}')
logger.warning(f'Data Length : {len(data)}')
data = data.reset_index(drop=True).copy()
# Save data
cl.dt.save(data, fn = 'fn_clean')
def run_prepare(task=1, do_format=False, split=0.9, min_cat_occurance=300, min_char_length=20, download_source=False):
logger.info(f'Running <PREPARE> for task {task}')
def main(task=1,
do_format=False,
split=0.9,
min_cat_occurance=300,
min_char_length=20,
download_source=False):
logger.warning(f'Running <PREPARE> for task {task}')
task_type = cu.tasks.get(str(task)).get('type')
if 'classification' == task_type:
@ -392,34 +402,40 @@ def run_prepare(task=1, do_format=False, split=0.9, min_cat_occurance=300, min_c
elif 'qa' == task_type:
prepare_qa(task, do_format, min_char_length, download_source)
else:
logger.info('[ERROR] TASK TYPE UNKNOWN. Nothing was processed.')
logger.warning('[ERROR] TASK TYPE UNKNOWN. Nothing was processed.')
def run():
"""Run from the command line"""
parser = argparse.ArgumentParser()
parser.add_argument("--task",
default=1,
type=int,
help="Task where: \
-task 1 : classification subcat \
-task 2 : classification cat \
-task 3 : ner \
-task 4 : qa")
parser.add_argument('--do_format',
action='store_true',
help="Avoid reloading and normalizing data")
parser.add_argument("--split",
default=0.9,
type=float,
help="Train test split. Dev split is taken from train set.")
parser.add_argument("--min_cat_occurance",
default=300,
type=int,
help="Min occurance required by category.")
parser.add_argument("--download_source",
action='store_true')
args = parser.parse_args()
run_prepare(args.task, args.do_format, args.split, args.min_cat_occurance, args.download_source)
# def run(): 'TODO: run train.py for single run
# """Run from the command line"""
# parser = argparse.ArgumentParser()
# parser.add_argument("--task",
# default=1,
# type=int,
# help="Task where: \
# -task 1 : classification subcat \
# -task 2 : classification cat \
# -task 3 : ner \
# -task 4 : qa")
# parser.add_argument('--do_format',
# action='store_true',
# help="Avoid reloading and normalizing data")
# parser.add_argument("--split",
# default=0.9,
# type=float,
# help="Train test split. Dev split is taken from train set.")
# parser.add_argument("--min_char_length",
# default=20,
# type=int,
# help="")
# parser.add_argument("--min_cat_occurance",
# default=300,
# type=int,
# help="Min occurance required by category.")
# parser.add_argument("--download_source",
# action='store_true')
# args = parser.parse_args()
# run_prepare(args.task, args.do_format, args.split, min_cat_occurance=args.min_cat_occurance,
# min_char_length=args.min_char_length, download_source=args.download_source)
# #TODO: cleanup
if __name__ == '__main__':
run()
main()

View file

@ -48,7 +48,7 @@ class Rank():
if cats is not None and cats != '':
#TODO: does not work for lists
_data = _data[_data.appliesTo.str.contains(cats)].reset_index(drop=True)
logger.info(f'[INFO] Reduced answer selection to {len(_data)} from {len(self.data)}.')
logger.warning(f'[INFO] Reduced answer selection to {len(_data)} from {len(self.data)}.')
# BM25 Score threshold
_data = _data[_data.score > ans_thresh].reset_index(drop=True)
@ -90,7 +90,7 @@ def create_bm25():
with open(cl.dt.fn_lookup['fn_rank'], 'wb') as fp:
pickle.dump(bm, fp)
pickle.dump(data, fp)
logger.info('Create and stored BM25 object.')
logger.warning('Create and stored BM25 object.')
if __name__ == "__main__":
create_bm25()

View file

@ -1,4 +1,5 @@
"""
#TODO: replace with python step functions
Finetuning the model for sub category classification.
Task 1 - forum entry classification
@ -6,8 +7,6 @@ Task 2 - ner (MS products)
Task 3 - qa ranking
Task 4 - urgency / priority
INPUT:
- language
- task
@ -25,6 +24,8 @@ OUTPUT:
- status
"""
import argparse
import mlflow
from farm.utils import MLFlowLogger
# Custom functions
import sys
@ -34,8 +35,7 @@ import classification
# import ner
# import rank
import logging
logging.basicConfig(level=logging.DEBUG)
# from azureml.core import Run
def main():
parser = argparse.ArgumentParser()
@ -50,7 +50,6 @@ def main():
-task 3 : ner \
-task 4 : qa")
### PREPARE
parser.add_argument('--do_format',
action='store_true',
@ -63,6 +62,10 @@ def main():
default=300,
type=int,
help="Min occurance required by category.")
parser.add_argument("--min_char_length",
default=20,
type=int,
help="")
parser.add_argument("--download_source",
action='store_true')
@ -79,7 +82,7 @@ def main():
action='store_true',
help="Use CUDA for training")
parser.add_argument('--n_epochs',
default=5,
default=3,
type=int,
help='')
parser.add_argument('--batch_size',
@ -99,7 +102,7 @@ def main():
type=int,
help='')
parser.add_argument('--learning_rate',
default=0.5e-5,
default=3e-5,
type=float,
help='')
parser.add_argument('--do_lower_case',
@ -108,18 +111,16 @@ def main():
parser.add_argument('--register_model',
action='store_true',
help="Register model in AML")
args = parser.parse_args()
# Run prepare
prepare.run_prepare(args.task, args.do_format, args.split, args.min_cat_occurance, args.download_source)
prepare.main(args.task, args.do_format, args.split, args.min_cat_occurance, args.min_char_length, args.download_source)
# Run train
classification.doc_classification(args.task, args.model_type, args.n_epochs, args.batch_size, args.embeds_dropout, args.evaluate_every,
args.use_cuda, args.max_seq_len, args.learning_rate, args.do_lower_case, args.register_model)
classification.doc_classification(task=args.task, model_type=args.model_type, n_epochs=args.n_epochs,
batch_size=args.batch_size, embeds_dropout=args.embeds_dropout, evaluate_every=args.evaluate_every,
use_cuda=args.use_cuda, max_seq_len=args.max_seq_len, learning_rate=args.learning_rate, do_lower_case=args.do_lower_case,
register_model=args.register_model)
if __name__ == "__main__":
main()

View file

@ -17,8 +17,7 @@ import shutil
import json
from azureml.core.authentication import InteractiveLoginAuthentication, MsiAuthentication
from azureml.core import Workspace
from azureml.core import Model
from azureml.core import Workspace, Model
from azureml.core.resource_configuration import ResourceConfiguration
from azureml.core.webservice import Webservice, AciWebservice, AksWebservice
from azureml.core import Environment
@ -66,6 +65,7 @@ dt_assets = dt.Data()
##############################
## ZIP DEPENDENCIES
##############################
model_name = f'nlp_{language}_{env}'
if do_zip:
logger.warning(f'[INFO] Zipping model assets -> {model_name}')
# Zip Assets
@ -88,7 +88,6 @@ if do_zip:
##############################
## UPLOAD DEPENDENCIES
##############################
model_name = f'nlp_{language}_{env}'
if upload:
logger.warning(f'[INFO] Uploading model assets -> {model_name}')
# Upload Assets
@ -121,6 +120,8 @@ else:
environment = Environment('farmenv')
environment.python.conda_dependencies = CondaDependencies.create(pip_packages=[
'azureml-defaults',
'mlflow',
'azureml-mlflow',
'spacy',
'transformers==2.3.0',
'scipy',

View file

@ -1,8 +1,15 @@
'''Functions to deploy training
"""
Functions to deploy training
'''
To run locally, use:
> cd ./code
> conda activate nlp
> python deploy/training.py
"""
import os
import shutil
import math
from azureml.core import Workspace, Experiment
from azureml.train.dnn import PyTorch
@ -10,26 +17,33 @@ from azureml.train.hyperdrive import (BayesianParameterSampling,
HyperDriveConfig, PrimaryMetricGoal,
choice, uniform, loguniform)
# PARAMETERS
language = 'de'
single_run = True
compute_name = 'gpucluster-nc6'
experiment_name = f"msforum_{language}"
############################################
##### AML Setup
############################################
## Workspace
# auth = InteractiveLoginAuthentication(tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")
ws = Workspace.get(name='nlp-ml',
subscription_id='50324bce-875f-4a7b-9d3c-0e33679f5d72',
resource_group='nlp')
# ,auth=auth)
## Compute target
compute_name = 'gpucluster-nc12'
compute_target= ws.compute_targets[compute_name]
script_folder = "./"
script_folder = "."
#TODO: load from file
pip_packages=[
'azureml-sdk',
'azureml-dataprep[pandas,fuse]',
'mlflow',
'azureml-mlflow',
'spacy',
'transformers==2.3.0',
'scipy',
@ -42,8 +56,8 @@ pip_packages=[
'seqeval',
'mlflow==1.0.0',
'dotmap==1.3.0',
'git+https://github.com/deepset-ai/FARM.git',
'git+https://github.com/zalandoresearch/flair.git'
'farm==0.4.1',
'flair==0.4.5'
]
conda_packages=[
# 'pytorch',
@ -58,46 +72,63 @@ conda_packages=[
##### Task 1
############################################
fn_config_infer = 'config.json'
shutil.copy(f'./project/msforum_{language}.config.json', f'./code/{fn_config_infer}')
os.chdir('./code')
## Experiment
experiment_name = "answers-de"
exp = Experiment(workspace = ws, name = experiment_name)
## Config
script_params = {
'--task' : 1,
'--do_format' : '',
'--download_source' : '',
# '--model_type' : 'roberta',
'--use_cuda' : '',
'--batch_size' : 4
# '--learning_rate' : 0.5e-5
'--n_epochs' : 3,
# '--learning_rate' : 2e-5,
# '--model_type' : 'roberta',
# '--max_seq_len' : 128, #256,
# '--embeds_dropout' : 0.3,
# '--register_model' : ''
}
est = PyTorch(source_directory = script_folder,
compute_target = compute_target,
script_params = script_params,
entry_script = 'code/train.py',
entry_script = 'train.py',
pip_packages = pip_packages,
conda_packages = conda_packages,
use_gpu = True)
## Run
# run = exp.submit(est)
# run.wait_for_completion(show_output = True)
### Hyperparameters params
param_sampling = BayesianParameterSampling( {
'--learning_rate' : choice(0.5e-5, 1e-5, 2e-5, 3e-5),
# '--model_type' : choice('roberta','bert','albert')
'--model_type' : choice('distilbert','bert')
})
## Prepare HyperDrive Config
hdc = HyperDriveConfig(estimator=est,
hyperparameter_sampling = param_sampling,
policy = None, # NOTE: not possible for bayesian
primary_metric_name = 'f1macro',
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
max_total_runs = 40,
max_concurrent_runs = 1)
## Run hyperparameter tuning
hyperdrive_run = exp.submit(config=hdc)
hyperdrive_run.wait_for_completion(show_output = True)
## Get Results
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run)
if single_run:
run = exp.submit(est)
#Remove temp config
os.remove(fn_config_infer)
run.wait_for_completion(show_output = True)
else:
### Hyperparameters params
if language == 'en':
model_type = choice('roberta','bert','albert') #,'xlm-roberta'
elif language == 'de':
model_type = choice('distilbert','bert', 'roberta')
param_sampling = BayesianParameterSampling({
'--learning_rate' : choice(1e-5, 2e-5, 3e-5, 4e-5),
'--model_type' : model_type,
'--max_seq_len' : choice(64, 128, 256),
'--embeds_dropout' : choice(0.1, 0.2, 0.3, 0.4)
})
## Prepare HyperDrive Config
hdc = HyperDriveConfig(estimator=est,
hyperparameter_sampling = param_sampling,
policy = None, # NOTE: not possible for bayesian
primary_metric_name = 'f1macro',
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
max_total_runs = 80,
max_concurrent_runs = 1)
## Run hyperparameter tuning
hyperdrive_run = exp.submit(config=hdc)
#Remove temp config
os.remove(fn_config_infer)
hyperdrive_run.wait_for_completion(show_output = True)
## Get Results
# best_run = hyperdrive_run.get_best_run_by_primary_metric()

View file

@ -7,20 +7,23 @@ dependencies:
- gensim=3.8.1
- pip:
# - azureml-defaults for aml deployment
- azureml-sdk
- azureml-dataprep[pandas,fuse]
- azureml-sdk==1.0.85
- azureml-dataprep[pandas,fuse]==1.1.38
- mlflow==1.0.0
- azureml-mlflow==1.0.85
# - imblearn
- spacy==2.2.1
- transformers==2.3.0
# - farm==0.3.2
- 'git+https://github.com/deepset-ai/FARM.git'
# - flair
- git+https://github.com/zalandoresearch/flair.git
- transformers==2.4.1
- farm==0.4.1
# - 'git+https://github.com/deepset-ai/FARM.git'
- flair==0.4.5
# - git+https://github.com/zalandoresearch/flair.git
- azure-storage-blob
- streamlit
- selenium==3.141.0
- bs4
##DEMO ENV
- pillow
- streamlit==0.48.1
# - langdetect
# - lightgbm
# - pandas_ml
@ -32,9 +35,4 @@ dependencies:
# - matplotlib
# - seaborn
# temporary fix for flair
# pip install --upgrade git+https://github.com/zalandoresearch/flair.git
#python -m ipykernel install --user --name nlp --display-name "Python (nlp)"
# conda install pytorch torchvision cpuonly -c pytorch
#python -m ipykernel install --user --name nlp --display-name "Python (nlp)"

View file

@ -1 +0,0 @@
{"Id": null, "Scope": "/subscriptions/50324bce-875f-4a7b-9d3c-0e33679f5d72/resourceGroups/nlp/providers/Microsoft.MachineLearningServices/workspaces/nlp-ml"}

File diff suppressed because one or more lines are too long

View file

@ -12,15 +12,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"I0113 15:26:41.951092 3456 file_utils.py:35] PyTorch version 1.3.1 available.\n",
"I0113 15:26:45.606345 3456 custom.py:19] [INFO] Project Target Language **en**\n"
"I0110 11:14:41.291058 22612 custom.py:19] [INFO] Project Target Language **en**\n"
]
}
],
@ -32,76 +31,6 @@
"import ner"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2020-01-13 18:15:32,174 loading file C:/Users/makayser/Desktop/nlp_local//en-ner-ontonotes-fast-v0.4.pt\n"
]
}
],
"source": [
"nr = ner.NER()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"text = 'I have been using windows 7 and getting some errors with the following code 0x800700c1. What could this mean? I use windows 7 from steve ballmer'"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'value': '7',\n",
" 'start': 26,\n",
" 'end': 27,\n",
" 'label': 'CARDINAL',\n",
" 'source': 'flair'},\n",
" {'value': 'ballmer',\n",
" 'start': 137,\n",
" 'end': 144,\n",
" 'label': 'PERSON',\n",
" 'source': 'flair'},\n",
" {'value': 'windows 7',\n",
" 'start': 4,\n",
" 'end': 6,\n",
" 'label': 'Product',\n",
" 'source': 'list'},\n",
" {'value': 'steve ballmer',\n",
" 'start': 26,\n",
" 'end': 28,\n",
" 'label': 'Boss',\n",
" 'source': 'list'},\n",
" {'value': '0x800700c1.',\n",
" 'start': 76,\n",
" 'end': 87,\n",
" 'label': 'ERROR CODE',\n",
" 'source': 'Regex'}]"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nr.run(text)"
]
},
{
"cell_type": "markdown",
"metadata": {},

16
project/INSTRUCTIONS.md Normal file
View file

@ -0,0 +1,16 @@
# How to run your first project
## Steps
1. Upload your source dataset (raw) to AzureML Datasets
2. Upload your dependencies to AzureML Datasets (stopwords, custom ner)
3. Customize custom.py with any required pre-processing steps
4. Create a *.config.json where * = project name (see the sample config sketched below)
5. Run deploy/training.py
6. Run deploy/inference.py
## Requirements
To make the NLP toolkit work flawlessly, follow these naming requirements and best practices.
- Use language short forms (e.g. German = de, French = fr)
- Naming
- stopword list: stopwords-<language>.txt (tab delimited, utf-8)
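## Sample config
A minimal *.config.json sketch, following the fields used by the shipped msforum_de / msforum_en configs; the project name and the tasks block below are illustrative placeholders and must be adapted to your own tasks:
```json
{
    "name": "myproject",
    "language": "en",
    "environment": "dev",
    "data_dir": "./",
    "prepare": {
        "data_type": "json"
    },
    "tasks": {
        "1": {
            "type": "classification",
            "model_type": "bert"
        }
    }
}
```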

View file

@ -1,6 +1,8 @@
{
"name":"msforum_de",
"language": "de",
"environment" : "dev",
"data_dir" : "./",
"prepare" : {
"data_type" : "json"
},

View file

@ -1,6 +1,8 @@
{
"name":"msforum_en",
"language": "en",
"environment" : "dev",
"data_dir" : "./",
"prepare" : {
"data_type" : "json"
},

View file

@ -1,14 +1,15 @@
import argparse
# Run arguments
# example: python 1_getsites.py --language de-de --product xbox
parser = argparse.ArgumentParser()
parser.add_argument("--lang",
parser.add_argument("--language",
default="de-de",
type=str,
help="'en-us' or 'de-de")
parser.add_argument('--product',
default='windows',
type=str,
help="'windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect','edge','ie','musicandvideo'")
help="'windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge','ie','musicandvideo'")
args = parser.parse_args()
# Import and set driver
@ -20,14 +21,26 @@ driver = webdriver.Chrome(executable_path = path + 'chromedriver.exe')
product = args.product
language = args.language
# Scrape sites
for x in range(1, 8000):
driver.get(f'https://answers.microsoft.com/{language}/' + product + '/forum?sort=LastReplyDate&dir=Desc&tab=All&status=all&mod=&modAge=&advFil=&postedAfter=&postedBefore=&threadType=All&isFilterExpanded=false&page=' + str(x))
html = driver.page_source
if ('Es wurden keine Ergebnisse gefunden' in html) or ('No results found' in html):
print('##### EMPTY PAGE REACHED -> EXIT')
break
else:
with open('output-' + product + '.txt', 'a', encoding='utf-8') as myfile:
myfile.write(html+'\n\n\n')
print('Written:' + str(x))
languages = ['it-it', 'fr-fr', 'en-us']
products = ['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge', 'ie', 'musicandvideo']
#
for product in products:
print(f'[START] {language}, {product}.')
# Scrape sites
for x in range(1, 10000):
driver.get(f'https://answers.microsoft.com/{language}/' + product + '/forum?sort=LastReplyDate&dir=Desc&tab=All&status=all&mod=&modAge=&advFil=&postedAfter=&postedBefore=&threadType=All&isFilterExpanded=false&page=' + str(x))
html = driver.page_source
if ('Es wurden keine Ergebnisse gefunden' in html) or ('No results found' in html) or ('Aucun résultat trouvé' in html) or ('Nessun risultato trovato' in html) or ('Pubblica domande, segui le discussioni, condividi le tue conoscenze' in html) or ('Posten Sie Fragen, folgen Sie Diskussionen und teilen Sie Ihr Wissen' in html) or ('Post questions, follow discussions, share your knowledge' in html) or ('Publiez des questions, suivez des discussions et partagez vos connaissances' in html) or ('Publique preguntas, siga conversaciones y comparta sus conocimientos' in html):
print(f'[EXIT] EMPTY PAGE REACHED -> - {language}, {product}.')
break
else:
url_temp = re.findall(r'(https?://answers.microsoft.com/' + language + '/' + product + '/forum/[^\s]+)', html)
#url_temp = re.findall(r'(https?://answers.microsoft.com/' + lang + '/windows/forum/[^\s]+)', docs)
url_temp2 = [s.strip('"') for s in url_temp]
url_list = [x for x in url_temp2 if not x.endswith('LastReply')]
with open('output-' + product + '-' + language + '.txt', 'a', encoding='utf-8') as outfile:
# Prepare Links
outfile.write("\n".join(url_list))
#myfile.write(url_list+'\n\n\n')
if x % 500 == 0:
print(f'[STATUS] Page no. {str(x)} written.')

View file

@ -17,26 +17,26 @@ import argparse
# Run arguments
parser = argparse.ArgumentParser()
parser.add_argument("--lang",
parser.add_argument("--language",
default="de-de",
type=str,
help="'en-us' or 'de-de")
parser.add_argument('--product',
default='windows',
type=str,
help="['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect']")
help="['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge','ie','musicandvideo']")
args = parser.parse_args()
# Example: python 2_extract.py --language de-de --product windows
# Set params
lang = args.language
product = args.product
# Read File
docs = codecs.open("output-" + product + ".txt", 'r', encoding='utf-8').read()
# Prepare Links
url_temp = re.findall(r'(https?://answers.microsoft.com/' + lang + '/' + product + '/forum/[^\s]+)', docs)
url_temp2 = [s.strip('"') for s in url_temp]
url_list = [x for x in url_temp2 if not x.endswith('LastReply')]
#with open("output-" + product + "-" + lang + ".txt") as f:
# urls = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
#url_list = [x.strip() for x in urls]
# Extract text content
def getText(soup):
@ -83,12 +83,12 @@ def getUsernameAnswer(soup):
# Create date of question
def getDateQuestion(soup):
date_question = soup.find_all("span", "asking-text-asked-on-link")[0].text.replace("\nErstellt am ", "").replace("\n", "")
date_question = soup.find_all("span", "asking-text-asked-on-link")[0].text.replace("\nErstellt am ", "").replace("\nCréé le ", "").replace("\nCreado el ", "").replace("\nCreato il ", "").replace("\n", "")
return date_question
# Create date of answer
def getDateAnswer(soup):
date_answer = soup.find_all("span", "asking-text-asked-on-link")[1].text.replace("\nBeantwortet am ", "").replace("\n", "")
date_answer = soup.find_all("span", "asking-text-asked-on-link")[1].text.replace("\nBeantwortet am ", "").replace("\n Répondu le ", "").replace("\nRespondió el ", "").replace("\nRisposta il ", "").replace("\n", "")
return date_answer
# Get number of same cases
@ -120,18 +120,17 @@ def getTags(soup, product):
tags.append(subitem.text)
except:
tags = ""
return product + "," + ",".join(tags)
return f'{product},{",".join(tags)}'
# Put it all together
def scrapeMe(url, product):
print("Proceeding: ", url)
print("[URL] -", url)
### GET WEBSITE
try:
response = get(url)
except:
print("### ERROR")
print("[ERROR] - There is an issue with the respective website.\n")
html_soup = BeautifulSoup(response.text, 'html.parser')
lang = "de-de"
fileid = uuid.uuid4().hex
### GET TEXT
@ -185,17 +184,37 @@ def scrapeMe(url, product):
content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii=False)
### WRITE TO JSON FILE
with open("output-" + product + ".json", "a", encoding='utf-8') as file:
#with open("output-" + product + "-" + lang + ".json", "a", encoding='utf-8') as file:
with open(f"output-{lang}.json", "a", encoding='utf-8') as file:
file.write(content+",")
print("Written: File", fileid, "\n")
print(f"[SUCCESS] - File {fileid}\n")
######################################################
# LOOP THROUGH THE OUTPUT TEXT FILES AND CREATE JSON #
######################################################
for i, value in enumerate(url_list):
i += 1
products = ['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge', 'ie', 'musicandvideo']
for product in products:
try:
scrapeMe(value, product)
except Exception as e:
print(f'[ERROR] Failed to extract {value}')
continue
# Read File
docs = codecs.open(f"output-{product}-{lang}.txt", 'r', encoding='utf-8').read()
# Prepare Links
url_temp = re.findall(r'(https?://answers.microsoft.com/' + lang + '/' + product + '/forum/[^\s]+)', docs)
url_temp2 = [s.strip('"') for s in url_temp]
url_list = [x for x in url_temp2 if not x.endswith('LastReply')]
failed_url = []
for i, value in enumerate(url_list):
i += 1
try:
print(f'[STATUS] - {product}, {i}/{len(url_list)}')
scrapeMe(value, product)
except Exception as e:
failed_url.append(value)
print(f'[ERROR] - Failed to extract {value}')
continue
print(f"[DONE] - List for {product} of failed URLs: {failed_url},\nlen{failed_url}.")
except:
print(f"[ERROR] - 'output-{product}-{lang}.txt' does not exist.\n")