updated data handling, new naming, improved cat

Parent: 482bc0b0e2
Commit: cfaddc2bf3
10 .amlignore

@@ -1,10 +0,0 @@
data
temp
.vscode
saved_models
mlruns
cache
demo
notebook
scraper
.git

@@ -110,4 +110,7 @@ config.ini
.vscode
saved_models
mlruns
cache
cache
scraper/*.txt
scraper/*.json
scraper/old
35 README.md

@@ -11,14 +11,22 @@ NLP Toolkit
## Live Demo
> http://nlp-demo-app.azurewebsites.net/

## Naming
### Assets
> \<project name\>(-\<task\>)-\<step\>(-\<environment\>)
- where step is one of [source, train, deploy], for data assets.
- where task is an int referring to the task parameters, for models (see the example below).
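
For illustration only (assuming a project named `msforum_en` and, hypothetically, an environment tag `dev`), the convention yields asset names such as:
> msforum_en-1-train-dev (training data asset for task 1)
> msforum_en-1-dev (registered model for task 1)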

## TODO
### Project
- [x] Move to single project config file (for deployment and scoring)
- [ ] Overview architecture
- [ ] Detailed documentation
- [ ] Data storage strategy
### Prepare
- [x] source from AML datastore
- [ ] output to AML datastore
- [x] integrate with AML datastore
- [ ] connect to CosmosDB (pipeline ready)
- [ ] **(IP)** document cracking to standardized format
### Classification
- [ ] **(IP)** Multi label support
- [ ] integrate handling for larger documents

@@ -27,21 +35,18 @@ NLP Toolkit
- [ ] upload best model to AML Model
### NER
- [ ] Improve duplicate handling
- [x] custom NER
- [x] basic custom NER
### Rank
- [ ] **(IP)** Improve answer quality
### Deployment
- [x] Collect, Package and upload assets
- [ ] **(IP)** Param script for deploy (incl language param!)
- [ ] Deploy to Azure Function (without AzureML)
### Notebooks
- [x] review prepared data
- [ ] **(IP)** review model results (auto generate after each training step)
- [ ] review model bias (auto generate after each training step)
### Pipeline
- [ ] **(IP)** document cracking to standardized format
### DevOps
- [ ] Yaml based infrastructure deployment
- [ ] Integrate with Azure/GitHub DevOps
- [ ] available models benchmark
### Tests
- [ ] integrate testing framework
- [ ] placeholder for custom data loading test

@@ -50,17 +55,17 @@ NLP Toolkit
### New Features (TBD)
- Summarization
- Deployable feedback loop
- Integration with GitHub Actions

# Acknowledgements
- Verseagility is built in parts using the following:
- - [Transformers](https://github.com/huggingface/pytorch-transformers) by HuggingFace
- - [FARM](https://github.com/deepset-ai/FARM/) by deepset ai
- - [spaCy](https://github.com/explosion/spaCy/) by Explosion ai
- - [flair](https://github.com/flairNLP/flair/) by Zalando Research
- - [gensim](https://radimrehurek.com/gensim/)
- Verseagility is built in part using the following:
- [Transformers](https://github.com/huggingface/pytorch-transformers) by HuggingFace
- [FARM](https://github.com/deepset-ai/FARM/) by deepset ai
- [spaCy](https://github.com/explosion/spaCy/) by Explosion ai
- [flair](https://github.com/flairNLP/flair/) by Zalando Research
- [gensim](https://radimrehurek.com/gensim/)

# Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
|
|
@ -9,6 +9,7 @@ Example (in the command line):
|
|||
> python code/classification.py --task 1 --model_type bert --use_cuda
|
||||
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
import json
|
||||
import argparse
|
||||
|
@ -40,7 +41,8 @@ logger = he.get_logger(location=__name__)
|
|||
aml_run = he.get_context()
|
||||
|
||||
def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, evaluate_every,
|
||||
use_cuda, max_seq_len, learning_rate, do_lower_case, register_model):
|
||||
use_cuda, max_seq_len, learning_rate, do_lower_case,
|
||||
register_model, save_model=True, early_stopping=True):
|
||||
language = cu.params.get('language')
|
||||
# Check task
|
||||
if cu.tasks.get(str(task)).get('type') != 'classification':
|
||||
|
@ -48,12 +50,15 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
|
|||
|
||||
# Data
|
||||
dt_task = dt.Data(task=task)
|
||||
## Download training files
|
||||
if not os.path.isfile(dt_task.fn_lookup['fn_train']):
|
||||
dt_task.download(task=task, step='train')
|
||||
|
||||
# Settings
|
||||
set_all_seeds(seed=42)
|
||||
use_amp = None
|
||||
device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)
|
||||
lang_model = he.farm_model_lookup.get(model_type).get(language)
|
||||
lang_model = he.get_farm_model(model_type, language)
|
||||
save_dir = dt_task.model_dir.replace('model_type', model_type)
|
||||
label_list = dt_task.load('fn_label', header=None)[0].to_list()
|
||||
|
||||
|
@ -63,6 +68,9 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
|
|||
aml_run.log('language', language)
|
||||
aml_run.log('n_epochs', n_epochs)
|
||||
aml_run.log('batch_size', batch_size)
|
||||
aml_run.log('learning_rate', learning_rate)
|
||||
aml_run.log('embeds_dropout', embeds_dropout)
|
||||
aml_run.log('max_seq_len', max_seq_len)
|
||||
aml_run.log('lang_model', lang_model)
|
||||
aml_run.log_list('label_list', label_list)
|
||||
except:
|
||||
|
@ -85,7 +93,6 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
|
|||
# AML log
|
||||
try:
|
||||
aml_run.log('acc', acc.get('acc'))
|
||||
aml_run.log('acc_backup', acc)
|
||||
aml_run.log('f1macro', f1macro)
|
||||
aml_run.log('f1micro', f1micro)
|
||||
except:
|
||||
|
@ -143,12 +150,15 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
|
|||
|
||||
# An early stopping instance can be used to save the model that performs best on the dev set
|
||||
# according to some metric and stop training when no improvement is happening for some iterations.
|
||||
earlystopping = EarlyStopping(
|
||||
metric="f1_macro", mode="max", # use f1_macro from the dev evaluator of the trainer
|
||||
# metric="loss", mode="min", # use loss from the dev evaluator of the trainer
|
||||
save_dir=save_dir, # where to save the best model
|
||||
patience=1 # number of evaluations to wait for improvement before terminating the training
|
||||
)
|
||||
if early_stopping:
|
||||
earlystopping = EarlyStopping(
|
||||
metric="f1_macro", mode="max", # use f1_macro from the dev evaluator of the trainer
|
||||
# metric="loss", mode="min", # use loss from the dev evaluator of the trainer
|
||||
save_dir=save_dir, # where to save the best model
|
||||
patience=2 # number of evaluations to wait for improvement before terminating the training
|
||||
)
|
||||
else:
|
||||
earlystopping = None
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
|
@ -170,8 +180,12 @@ def doc_classification(task, model_type, n_epochs, batch_size, embeds_dropout, e
|
|||
# defined with the EarlyStopping instance
|
||||
# The model we have at this moment is the model from the last training epoch that was carried
|
||||
# out before early stopping terminated the training
|
||||
model.save(save_dir)
|
||||
processor.save(save_dir)
|
||||
if save_model:
|
||||
model.save(save_dir)
|
||||
processor.save(save_dir)
|
||||
|
||||
if register_model:
|
||||
dt_task.upload(save_dir, task, destination='model')
|
||||
|
||||
def run():
|
||||
# Run arguments
|
||||
|
@ -196,7 +210,7 @@ def run():
|
|||
action='store_true',
|
||||
help="Use CUDA for training")
|
||||
parser.add_argument('--n_epochs',
|
||||
default=5,
|
||||
default=3,
|
||||
type=int,
|
||||
help='')
|
||||
parser.add_argument('--batch_size',
|
||||
|
@ -216,7 +230,7 @@ def run():
|
|||
type=int,
|
||||
help='')
|
||||
parser.add_argument('--learning_rate',
|
||||
default=0.5e-5,
|
||||
default=3e-5,
|
||||
type=float,
|
||||
help='')
|
||||
parser.add_argument('--do_lower_case',
|
||||
|
|
|
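
For reference, a minimal usage sketch of the updated training entry point above (illustrative values only; assumes the repository's `code/` directory, project config and prepared data are in place — `save_model` and `early_stopping` are the keyword arguments introduced in this commit):

```python
import sys
sys.path.append('./code')  # assumes execution from the repository root

import classification

# Values below are illustrative, not prescribed defaults.
classification.doc_classification(
    task=1,
    model_type='bert',
    n_epochs=3,
    batch_size=4,
    embeds_dropout=0.2,
    evaluate_every=100,
    use_cuda=False,
    max_seq_len=128,
    learning_rate=3e-5,
    do_lower_case=False,
    register_model=False,
    save_model=True,        # set False to skip writing model/processor assets
    early_stopping=True,    # set False to always train for the full n_epochs
)
```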
@ -16,10 +16,10 @@ logger = he.get_logger(location=__name__)
|
|||
############################################
|
||||
|
||||
# Load parameters from config
|
||||
params = he.get_project_config('msforum_en.config.json')
|
||||
params = he.get_project_config('msforum_de.config.json') #TODO: select parameters
|
||||
tasks = params.get('tasks')
|
||||
logger.info(f'[INFO] *** Project Target Language -> {params.get("language")} ***')
|
||||
logger.info(f'[INFO] *** Project Target Environment -> {params.get("environment")} ***')
|
||||
logger.warning(f'[INFO] *** Project target lang \t-> {params.get("language")} \t***')
|
||||
logger.warning(f'[INFO] *** Project target env \t-> {params.get("environment")} \t***')
|
||||
|
||||
############################################
|
||||
##### Data Preparation
|
||||
|
@ -28,10 +28,10 @@ logger.info(f'[INFO] *** Project Target Environment -> {params.get("environment"
|
|||
def prepare_source(data):
|
||||
"""Normalize source data for use in downstram tasks.
|
||||
NOTE: should be task agnostic"""
|
||||
data_norm = pd.io.json.json_normalize(data, sep='_').to_dict(orient='records')
|
||||
data_norm = pd.json_normalize(data, sep='_').to_dict(orient='records')
|
||||
return pd.read_json(json.dumps(data_norm))
|
||||
|
||||
def remove(line):
|
||||
def remove(line):
|
||||
line = re.sub(r'Original Title\:', '', line)
|
||||
return line
|
||||
|
||||
|
@ -59,9 +59,9 @@ def filter_qa(data):
|
|||
_temp = data[data.answer_markedAsAnswer == 'true'].reset_index(drop=True).copy()
|
||||
if len(_temp) == 0:
|
||||
_temp = data[data.answer_markedAsAnswer == True].reset_index(drop=True).copy()
|
||||
logger.info(f'Data Length : {len(_temp)} \t- after marked as answer ')
|
||||
logger.warning(f'Data Length : {len(_temp)} \t- after marked as answer ')
|
||||
# Filter by UpVotes
|
||||
# _temp = _temp[_temp['answer_upvotes'] > 1].reset_index(drop=True).copy() #TODO: evaluate
|
||||
logger.info(f'Data Length : {len(_temp)} \t- after min upvotes of 2')
|
||||
logger.warning(f'Data Length : {len(_temp)} \t- after min upvotes of 2')
|
||||
return _temp
|
||||
|
||||
|
|
229 code/data.py
|
@ -1,10 +1,16 @@
|
|||
"""
|
||||
Helper function for data management
|
||||
Includes source & prepared data, as well as
|
||||
model assets.
|
||||
|
||||
"""
|
||||
import pandas as pd
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from azureml.core import Dataset, Run, Workspace
|
||||
from azureml.core import Run, Dataset, Model
|
||||
# from azure.storage.blob import BlockBlobService
|
||||
|
||||
# Custom functions
|
||||
|
@ -13,22 +19,13 @@ sys.path.append('../code')
|
|||
import helper as he
|
||||
import custom as cu
|
||||
|
||||
# Get config
|
||||
run_config = he.get_config()
|
||||
logger = he.get_logger(location=__name__)
|
||||
|
||||
flair_model_lookup = {
|
||||
'en' : 'en-ner-ontonotes-fast-v0.4.pt',
|
||||
'de' : 'ner-multi-fast.pt',
|
||||
'xx' : 'ner-multi-fast.pt'
|
||||
}
|
||||
|
||||
class Data():
|
||||
def __init__(self, fn_source = 'answers_microsoft_lang.json',
|
||||
task = 1,
|
||||
version = 1,
|
||||
env = 1,
|
||||
inference = False
|
||||
def __init__(self, task = 1,
|
||||
version = 1,
|
||||
env = 1,
|
||||
inference = False
|
||||
):
|
||||
# Parameters
|
||||
self.task = task
|
||||
|
@ -38,87 +35,164 @@ class Data():
|
|||
|
||||
# Directories
|
||||
## Asset directory
|
||||
if inference:
|
||||
## Assuming deployment via AzureML
|
||||
try:
|
||||
self.data_dir = os.environ['AZUREML_MODEL_DIR']
|
||||
except KeyError:
|
||||
logger.info(f'[WARNING] Not running on AML')
|
||||
self.data_dir = he.run_config['path']['infer_dir']
|
||||
## Assuming deployment via AzureML
|
||||
if 'AZUREML_MODEL_DIR' in os.environ:
|
||||
self.data_dir = os.environ['AZUREML_MODEL_DIR']
|
||||
else:
|
||||
self.data_dir = he.run_config['path']['data_dir']
|
||||
self.data_dir = cu.params.get('data_dir')
|
||||
os.makedirs(self.data_dir, exist_ok=True)
|
||||
logger.warning(f'[INFO] Root data directory: {self.data_dir}')
|
||||
|
||||
|
||||
## Model directory
|
||||
self.model_dir = str(Path(self.data_dir + f"/model_type-l{self.language}-t{self.task}").resolve())
|
||||
### If present, replace language tag in name
|
||||
fn_source = fn_source.replace('lang', self.language)
|
||||
self.model_dir = str(Path(self.data_dir + f"/model_type-l{self.language}-t{self.task}-{self.env}").resolve())
|
||||
### NOTE: source file expected to follow naming convention, otherwise edit here
|
||||
self.fn_source = f"{cu.params.get('name')}-source.{cu.params.get('prepare').get('data_type')}"
|
||||
self.fp_train = f"{cu.params.get('name')}-train-{cu.params.get('environment')}"
|
||||
|
||||
# Lookup
|
||||
self.fn_lookup = {
|
||||
'fn_source' : fn_source,
|
||||
'fn_prep' : f'data_l{self.language}.txt',
|
||||
'fn_clean' : f'clean_l{self.language}_t{self.task}.txt',
|
||||
'fn_train' : f'train_l{self.language}_t{self.task}.txt',
|
||||
'fn_test' : f'test_l{self.language}_t{self.task}.txt',
|
||||
'fn_label' : f'label_l{self.language}_t{self.task}.txt',
|
||||
## DATASTORE
|
||||
'fn_source' : self.fn_source,
|
||||
'fp_train' : self.fp_train,
|
||||
## LOCAL
|
||||
'fp_data' : os.path.abspath(self.data_dir),
|
||||
'fn_prep' : f'{self.data_dir}/data-l{self.language}.txt',
|
||||
'fn_clean' : f'{self.data_dir}/clean-l{self.language}-t{self.task}.txt',
|
||||
'fn_train' : f'{self.data_dir}/train-l{self.language}-t{self.task}.txt',
|
||||
'fn_test' : f'{self.data_dir}/test-l{self.language}-t{self.task}.txt',
|
||||
'fn_label' : f'{self.data_dir}/label-l{self.language}-t{self.task}.txt',
|
||||
'fn_eval' : f'TODO:',
|
||||
## ASSETS #TODO: auto generate fetching param list
|
||||
'fn_asset' : f'{self.data_dir}/assets_{self.language}.zip',
|
||||
## ASSETS
|
||||
'fn_asset' : f'{self.data_dir}/assets-{self.language}.zip',
|
||||
'fn_cat' : self.model_dir.replace('model_type', cu.params.get('tasks').get('1').get('model_type')),
|
||||
'fn_rank' : f'{self.data_dir}/data_l{self.language}_t4.pkl',
|
||||
'fn_rank' : f'{self.data_dir}/data-l{self.language}-t4.pkl',
|
||||
'fn_ner_list' : f'{self.data_dir}/ner.txt',
|
||||
'fn_ner_flair' : f'{self.data_dir}/{flair_model_lookup[self.language]}',
|
||||
'fn_ner_flair' : f'{self.data_dir}/{he.get_flair_model(self.language, "fn")}',
|
||||
'fn_ner_spacy' : f'TODO:',
|
||||
'fn_names' : f'{self.data_dir}/names.txt',
|
||||
'fn_stopwords' : f'{self.data_dir}/stopwords_{self.language}.txt',
|
||||
}
|
||||
'fn_stopwords' : f'{self.data_dir}/stopwords-{self.language}.txt',
|
||||
} #TODO: when to link data dir, when only filename?
|
||||
# for t in cu.params.get('tasks'): #TODO: auto generate fetching param list
|
||||
# task_property = cu.params.get('tasks').get(t)
|
||||
# self.fn_lookup[f'fn_{task_property.get('type')}']
|
||||
|
||||
# Files
|
||||
self.fn_source = fn_source
|
||||
self.fn_data = self.fn_lookup['fn_prep']
|
||||
|
||||
def download(self, container=None, fn_blob=None, fn_local=None,
|
||||
# AML Components
|
||||
try:
|
||||
run = Run.get_context()
|
||||
self.ws = run.experiment.workspace
|
||||
except Exception as e:
|
||||
logger.warning(f'[WARNING] AML Workspace not loaded -> {e}')
|
||||
|
||||
### DOWNLOAD
|
||||
def _download_blob(self):
|
||||
# self.block_blob_service = BlockBlobService(account_name=run_config['blob']['account'],
|
||||
# account_key=run_config['blob']['key'])
|
||||
# if no_run_version:
|
||||
# self.block_blob_service.get_blob_to_path(container, fn_blob, fn_local)
|
||||
# elif not encrypted:
|
||||
# self.block_blob_service.get_blob_to_path(container,
|
||||
# str(fn_blob).replace('./',''),
|
||||
# fn_local)
|
||||
# elif encrypted:
|
||||
# self.block_blob_service.get_blob_to_path(container,
|
||||
# str(fn_blob).replace('.txt', '.enc').replace('./',''),
|
||||
# fn_local)
|
||||
# if to_dataframe:
|
||||
# with open(str(fn_local), "rb") as text_file:
|
||||
# _data = text_file.read()
|
||||
# if encrypted:
|
||||
# df = decrypt(_data, dataframe=True)
|
||||
# else:
|
||||
# df = pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
|
||||
# df.to_csv(fn_local, sep='\t', encoding='utf-8', index=False)
|
||||
pass
|
||||
|
||||
def _download_datastore(self):
|
||||
pass
|
||||
|
||||
def _download_model(self):
|
||||
#NOTE: not needed when running on AML compute
|
||||
pass
|
||||
|
||||
def download(self, dataset_name=None,
|
||||
task='',
|
||||
step='',
|
||||
container=None,
|
||||
fn_blob=None,
|
||||
fn_local=None,
|
||||
no_run_version=False,
|
||||
encrypted=False,
|
||||
to_dataframe=False,
|
||||
source='blob'):
|
||||
"""Download file from Azure"""
|
||||
source='datastore'):
|
||||
"""Download file from online storage"""
|
||||
if source == 'blob':
|
||||
self.block_blob_service = BlockBlobService(account_name=run_config['blob']['account'],
|
||||
account_key=run_config['blob']['key'])
|
||||
if no_run_version:
|
||||
self.block_blob_service.get_blob_to_path(container, fn_blob, fn_local)
|
||||
elif not encrypted:
|
||||
self.block_blob_service.get_blob_to_path(container,
|
||||
str(fn_blob).replace('./',''),
|
||||
fn_local)
|
||||
elif encrypted:
|
||||
self.block_blob_service.get_blob_to_path(container,
|
||||
str(fn_blob).replace('.txt', '.enc').replace('./',''),
|
||||
fn_local)
|
||||
if to_dataframe:
|
||||
with open(str(fn_local), "rb") as text_file:
|
||||
_data = text_file.read()
|
||||
if encrypted:
|
||||
df = decrypt(_data, dataframe=True)
|
||||
else:
|
||||
df = pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
|
||||
df.to_csv(fn_local, sep='\t', encoding='utf-8', index=False)
|
||||
self._download_blob() #TODO:
|
||||
elif source == 'datastore':
|
||||
run = Run.get_context()
|
||||
ws = run.experiment.workspace
|
||||
dataset_name = self.fn_source.split('.')[0]
|
||||
Dataset.get_by_name(workspace=ws, name=dataset_name).download(self.data_dir, overwrite=True)
|
||||
logger.info(f'[INFO] Downloaded data from data store {dataset_name}')
|
||||
if dataset_name is None:
|
||||
dataset_name = f'{cu.params.get("name")}-{task}-{step}-{cu.params.get("environment")}'
|
||||
try:
|
||||
Dataset.get_by_name(workspace=ws, name=dataset_name).download(self.data_dir, overwrite=True)
|
||||
except Exception as e:
|
||||
logger.warning(f'[WARNING] Dataset {dataset_name} not found. Trying without <env>. -> {e}')
|
||||
Dataset.get_by_name(workspace=ws, name=dataset_name.replace(f'_{self.env}', '')).download(self.data_dir, overwrite=True)
|
||||
else:
|
||||
logger.warning(f'[INFO] Downloaded data from data store {dataset_name}')
|
||||
elif source == 'model':
|
||||
pass
|
||||
else:
|
||||
logger.info('[ERROR] Source <{source}> does not exist. Can not download file.')
|
||||
logger.warning(f'[ERROR] Source <{source}> does not exist. Can not download file.')
|
||||
|
||||
def upload(self):
|
||||
#TODO:
|
||||
pass
|
||||
### UPLOAD
|
||||
def _upload_dataset(self, fp, task, step, ws):
|
||||
"""Upload dataset to AzureML Datastore
|
||||
Note:
|
||||
-only works for single file or directory
|
||||
-not meant for model assets
|
||||
"""
|
||||
target_name = f'{cu.params.get("name")}-{task}-{step}-{cu.params.get("environment")}'
|
||||
datastore = ws.get_default_datastore()
|
||||
datastore.upload(src_dir = str(fp),
|
||||
target_path = target_name,
|
||||
overwrite = True,
|
||||
show_progress = True)
|
||||
ds = Dataset.File.from_files([(datastore, target_name)])
|
||||
#ds = Dataset.File.from_files(path=[fp])
|
||||
ds.register(workspace = ws,
|
||||
name = target_name,
|
||||
description = f'Data set for {step}',
|
||||
create_new_version = True)
|
||||
|
||||
def _upload_model(self, fp, task, ws):
|
||||
"""Upload model to AzureML Models"""
|
||||
Model.register(workspace=ws,
|
||||
model_name=f'{cu.params.get("name")}-{task}-{cu.params.get("environment")}',
|
||||
model_path=fp, # Local file to upload and register as a model.
|
||||
description='Model assets',
|
||||
tags={'task' : task,
|
||||
# 'model_type': model_type,
|
||||
'language': cu.params.get('language'),
|
||||
'environment': cu.params.get('environment')})
|
||||
|
||||
def upload(self, fp, task='', step='', destination='model'):
|
||||
if fp in self.fn_lookup:
|
||||
fp = self.fn_lookup[fp]
|
||||
if destination == 'dataset':
|
||||
self._upload_dataset(fp, task, step, self.ws)
|
||||
elif destination == 'model':
|
||||
self._upload_model(fp, task, self.ws)
|
||||
else:
|
||||
logger.warning(f'[ERROR] Destination <{destination}> does not exist. Can not upload file.')
|
||||
logger.warning(f'[INFO] Upload complete to <{destination}> completed.')
|
||||
|
||||
## PROCESS
|
||||
def process(self, data_type='json', save=True):
|
||||
"""Convert source data to normalized data structure"""
|
||||
|
||||
# Load source data
|
||||
if data_type == 'json':
|
||||
with open(self.data_dir + self.fn_source, encoding='utf-8') as fp:
|
||||
|
@ -126,7 +200,7 @@ class Data():
|
|||
elif data_type == 'dataframe':
|
||||
data = self.load('fn_source')
|
||||
else:
|
||||
logger.info('SOURCE DATA TYPE NOT SUPPORTED')
|
||||
logger.warning('SOURCE DATA TYPE NOT SUPPORTED')
|
||||
|
||||
# Custom steps
|
||||
df = cu.prepare_source(data)
|
||||
|
@ -137,8 +211,15 @@ class Data():
|
|||
return df
|
||||
|
||||
def save(self, data, fn, header=True):
|
||||
data.to_csv(self.data_dir + self.fn_lookup[fn], sep='\t', encoding='utf-8', index=False, header=header)
|
||||
logger.info(f'SAVED: {self.fn_lookup[fn]}')
|
||||
data.to_csv(self.fn_lookup[fn], sep='\t', encoding='utf-8', index=False, header=header)
|
||||
logger.warning(f'SAVED: {self.fn_lookup[fn]}')
|
||||
|
||||
def load(self, fn, header=0):
|
||||
return pd.read_csv(self.data_dir + self.fn_lookup[fn], sep='\t', encoding='utf-8', header=header)
|
||||
def load(self, fn, header=0, encoding='utf-8', file_type='dataframe'):
|
||||
if file_type == 'dataframe':
|
||||
return pd.read_csv(self.fn_lookup[fn], sep='\t', encoding=encoding, header=header)
|
||||
elif file_type == 'list':
|
||||
with open(self.fn_lookup[fn], encoding=encoding) as f:
|
||||
data = f.readlines()
|
||||
return data
|
||||
else:
|
||||
raise Exception(f'ERROR - file type ({file_type}) not supported in data loader')
|
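
A short usage sketch of the reworked `Data` helper above (a non-authoritative illustration; assumes the repository's `code/` directory is importable and that the project config and AML workspace/datastore are set up):

```python
import os
import sys
sys.path.append('./code')

import data as dt

d = dt.Data(task=1)  # file names now derive from the project config (<name>-source.<type>, ...)

# Pull the registered <name>-1-train-<environment> dataset if the local copy is missing
if not os.path.isfile(d.fn_lookup['fn_train']):
    d.download(task=1, step='train')

labels = d.load('fn_label', header=None)[0].to_list()   # tab-separated files load as dataframes
stopwords = d.load('fn_stopwords', file_type='list')    # plain text files load as a list of lines

# Datasets and models now share a single upload() entry point
d.upload('fp_data', task=1, step='train', destination='dataset')
```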
194 code/helper.py
|
@ -15,12 +15,8 @@ from flair.models import SequenceTagger
|
|||
|
||||
def get_logger(level='info', location = None, excl_az_storage=True):
|
||||
'''Get runtime logger'''
|
||||
# Location
|
||||
if location is None:
|
||||
logger = logging.getLogger(__name__)
|
||||
else:
|
||||
logger = logging.getLogger(location)
|
||||
|
||||
global logger
|
||||
|
||||
# Exceptions
|
||||
if excl_az_storage:
|
||||
logging.getLogger("azure.storage.common.storageclient").setLevel(logging.WARN)
|
||||
|
@ -31,16 +27,20 @@ def get_logger(level='info', location = None, excl_az_storage=True):
|
|||
elif level == 'debug':
|
||||
_level = logging.DEBUG
|
||||
elif level == 'warning':
|
||||
_level = logging.WARN
|
||||
_level = logging.WARNING
|
||||
|
||||
# Format
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = _level)
|
||||
|
||||
return logger
|
||||
# Location
|
||||
if location is None:
|
||||
logger = logging.getLogger(__name__)
|
||||
else:
|
||||
logger = logging.getLogger(location)
|
||||
|
||||
logger = get_logger(location=__name__)
|
||||
return logger
|
||||
|
||||
def get_context():
|
||||
'''Get AML Run Context for Logging to AML Services'''
|
||||
|
@ -48,7 +48,7 @@ def get_context():
|
|||
from azureml.core import Run
|
||||
run = Run.get_context()
|
||||
except Exception as e:
|
||||
logger.info(f'[WARNING] Azure ML not loaded. Nothing will be logged. {e}')
|
||||
logger.warning(f'[WARNING] Azure ML not loaded. Nothing will be logged. {e}')
|
||||
run = ''
|
||||
return run
|
||||
|
||||
|
@ -56,20 +56,6 @@ def get_context():
|
|||
##### Config
|
||||
############################################
|
||||
|
||||
def get_config():
|
||||
# Get config
|
||||
run_config = configparser.ConfigParser()
|
||||
run_config.read('./code/config.ini')
|
||||
if 'path' not in run_config:
|
||||
run_config.read('./config.ini')
|
||||
if 'path' not in run_config:
|
||||
run_config.read('../config.ini')
|
||||
if 'path' not in run_config:
|
||||
logger.info('[ERROR] Could not find correct config.ini.')
|
||||
return run_config
|
||||
|
||||
run_config = get_config()
|
||||
|
||||
def get_project_config(fn):
|
||||
try:
|
||||
with open(f'./project/{fn}', encoding='utf-8') as fp:
|
||||
|
@ -79,30 +65,48 @@ def get_project_config(fn):
|
|||
with open(f'../project/{fn}', encoding='utf-8') as fp:
|
||||
params = json.load(fp)
|
||||
except FileNotFoundError:
|
||||
## Inference Config
|
||||
with open('./code/config.json', encoding='utf-8') as fp:
|
||||
params = json.load(fp)
|
||||
try:
|
||||
## Training Config
|
||||
with open('config.json', encoding='utf-8') as fp:
|
||||
params = json.load(fp)
|
||||
except FileNotFoundError:
|
||||
## Inference Config
|
||||
with open('./code/config.json', encoding='utf-8') as fp:
|
||||
params = json.load(fp)
|
||||
return params
|
||||
|
||||
def get_config():
|
||||
#TODO: remove this, new use: keys to env, settings to params
|
||||
# Get config
|
||||
run_config = configparser.ConfigParser()
|
||||
run_config.read('./code/config.ini')
|
||||
if 'path' not in run_config:
|
||||
run_config.read('./config.ini')
|
||||
if 'path' not in run_config:
|
||||
run_config.read('../config.ini')
|
||||
if 'path' not in run_config:
|
||||
logger.warning('[ERROR] Could not find correct config.ini.')
|
||||
return run_config
|
||||
|
||||
############################################
|
||||
##### Azure
|
||||
############################################
|
||||
|
||||
def get_credentials():
|
||||
'''Retrieve Service Principal Credentials'''
|
||||
credentials = ServicePrincipalCredentials(
|
||||
client_id = run_config['sp']['client_id'],
|
||||
secret = run_config['sp']['secret'],
|
||||
tenant = run_config['sp']['tenant']
|
||||
)
|
||||
return credentials
|
||||
# def get_credentials():
|
||||
# '''Retrieve Service Principal Credentials'''
|
||||
# credentials = ServicePrincipalCredentials(
|
||||
# client_id = run_config['sp']['client_id'],
|
||||
# secret = run_config['sp']['secret'],
|
||||
# tenant = run_config['sp']['tenant']
|
||||
# )
|
||||
# return credentials
|
||||
|
||||
def get_secret():
|
||||
'''Retrieve Secret from KeyVault'''
|
||||
client = KeyVaultClient(get_credentials())
|
||||
vault_url = run_config['keyvault']['url']
|
||||
vault_name = run_config['keyvault']['name_data']
|
||||
return client.get_secret(vault_url, vault_name, "").value
|
||||
# def get_secret():
|
||||
# '''Retrieve Secret from KeyVault'''
|
||||
# client = KeyVaultClient(get_credentials())
|
||||
# vault_url = run_config['keyvault']['url']
|
||||
# vault_name = run_config['keyvault']['name_data']
|
||||
# return client.get_secret(vault_url, vault_name, "").value
|
||||
|
||||
############################################
|
||||
##### ML Frameworks
|
||||
|
@ -115,14 +119,14 @@ farm_model_lookup = {
|
|||
'xx':'bert-base-multilingual-cased',
|
||||
'en':'bert-base-cased',
|
||||
'de':'bert-base-german-cased',
|
||||
'fr':'camembert-base',
|
||||
'cn':'bert-base-chinese'
|
||||
},
|
||||
'roberta' : {
|
||||
'en' : 'roberta-base'
|
||||
},
|
||||
'xlm-roberta' : {
|
||||
'xx' : 'xlm-roberta-multi', #TODO: check if it exists?
|
||||
'en' : 'xlm-roberta-large'
|
||||
'xx' : 'xlm-roberta-base'
|
||||
},
|
||||
'albert' : {
|
||||
'en' : 'albert-base-v2'
|
||||
|
@ -132,40 +136,66 @@ farm_model_lookup = {
|
|||
}
|
||||
}
|
||||
|
||||
def get_farm_model(model_type, language):
|
||||
mt = farm_model_lookup.get(model_type)
|
||||
if mt is not None:
|
||||
ml = mt.get(language)
|
||||
if ml is None:
|
||||
ml = mt.get('xx')
|
||||
if ml is None:
|
||||
raise Exception('No Transformer/FARM model found')
|
||||
return ml
|
||||
|
||||
spacy_model_lookup = {
|
||||
'en':'en_core_web_sm',
|
||||
'de':'de_core_news_sm',
|
||||
'fr':'fr_core_news_sm',
|
||||
'es':'es_core_news_sm',
|
||||
'it':'it_core_news_sm',
|
||||
'xx':'xx_ent_wiki_sm'
|
||||
}
|
||||
|
||||
flair_model_lookup = {
|
||||
'en' : 'ner-ontonotes-fast',
|
||||
'de' : 'ner-multi-fast',
|
||||
'xx' : 'ner-multi-fast'
|
||||
}
|
||||
|
||||
def load_spacy_model(language='xx', disable=[]):
|
||||
try:
|
||||
nlp = spacy.load(spacy_model_lookup[language], disable=disable)
|
||||
except OSError:
|
||||
logging.info(f'[INFO] Download spacy language model for {language}')
|
||||
logging.warning(f'[INFO] Download spacy language model for {language}')
|
||||
from spacy.cli import download
|
||||
download(spacy_model_lookup[language])
|
||||
nlp = spacy.load(spacy_model_lookup[language], disable=disable)
|
||||
return nlp
|
||||
|
||||
flair_model_lookup = {
|
||||
'en' : 'ner-ontonotes-fast',
|
||||
'de' : 'ner-multi-fast',
|
||||
'xx' : 'ner-multi-fast'
|
||||
}
|
||||
flair_model_file_lookup = {
|
||||
'en' : 'en-ner-ontonotes-fast-v0.4.pt',
|
||||
'de' : 'ner-multi-fast.pt',
|
||||
'xx' : 'ner-multi-fast.pt'
|
||||
}
|
||||
|
||||
def get_flair_model(language, object_type):
|
||||
if object_type == 'model':
|
||||
lookup = flair_model_lookup
|
||||
elif object_type == 'fn':
|
||||
lookup = flair_model_file_lookup
|
||||
m = lookup.get(language)
|
||||
if m is None:
|
||||
m = lookup.get('xx')
|
||||
return m
|
||||
|
||||
def load_flair_model(path=None, language='xx', task='ner'):
|
||||
if task == 'ner':
|
||||
if path is None:
|
||||
model = SequenceTagger.load(flair_model_lookup.get(language))
|
||||
model = SequenceTagger.load(get_flair_model(language, 'model'))
|
||||
else:
|
||||
model = SequenceTagger.load(path)
|
||||
else:
|
||||
logging.info(f'FLAIR MODEL TASK NOT SUPPORTED --> {task}')
|
||||
logging.warning(f'FLAIR MODEL TASK NOT SUPPORTED --> {task}')
|
||||
model = None
|
||||
return model
|
||||
|
||||
|
||||
############################################
|
||||
##### Dataframe
|
||||
|
@ -191,7 +221,7 @@ def validate_concat(col1, col2, max_len=1000):
|
|||
new_line = sub + '. ' + des
|
||||
text_concat.append(new_line[:max_len])
|
||||
except Exception as e:
|
||||
logger.info(f'[WARNING] Validate Concat - {e}')
|
||||
logger.warning(f'[WARNING] Validate Concat - {e}')
|
||||
if 'float' in str(e):
|
||||
text_concat.append(str(des))
|
||||
else:
|
||||
|
@ -215,34 +245,34 @@ def append_ner(v, s, e, l, t=''):
|
|||
##### Cryptography
|
||||
############################################
|
||||
|
||||
def decrypt(token, dataframe=False):
|
||||
''' Decrypt symetric object using Fernet '''
|
||||
secret = get_secret()
|
||||
f = Fernet(bytes(secret, encoding='utf-8'))
|
||||
token = f.decrypt(token)
|
||||
if dataframe:
|
||||
_data = StringIO(str(token, 'utf-8'))
|
||||
return pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
|
||||
else:
|
||||
return token
|
||||
# def decrypt(token, dataframe=False):
|
||||
# ''' Decrypt symetric object using Fernet '''
|
||||
# secret = get_secret()
|
||||
# f = Fernet(bytes(secret, encoding='utf-8'))
|
||||
# token = f.decrypt(token)
|
||||
# if dataframe:
|
||||
# _data = StringIO(str(token, 'utf-8'))
|
||||
# return pd.read_csv(_data, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
|
||||
# else:
|
||||
# return token
|
||||
|
||||
def decrypt_and_save(fn):
|
||||
with open(fn, "rb") as text_file:
|
||||
token = text_file.read()
|
||||
content = StringIO(str(decrypt(token), 'utf-8'))
|
||||
df = pd.read_csv(content, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
|
||||
df.to_csv(fn.replace('.enc', '.txt'), sep='\t', encoding='utf-8', index=False) #TODO: match encrypt fn out
|
||||
# def decrypt_and_save(fn):
|
||||
# with open(fn, "rb") as text_file:
|
||||
# token = text_file.read()
|
||||
# content = StringIO(str(decrypt(token), 'utf-8'))
|
||||
# df = pd.read_csv(content, sep='\t', error_bad_lines=False, warn_bad_lines=False, encoding='utf-8')
|
||||
# df.to_csv(fn.replace('.enc', '.txt'), sep='\t', encoding='utf-8', index=False) #TODO: match encrypt fn out
|
||||
|
||||
def encrypt(token, dataframe=False):
|
||||
''' Encrypt symetric object using Fernet '''
|
||||
secret = get_secret()
|
||||
f = Fernet(bytes(secret, encoding='utf-8'))
|
||||
if dataframe:
|
||||
token = bytes(to_csv_string(token), encoding='utf-8')
|
||||
return f.encrypt(token)
|
||||
# def encrypt(token, dataframe=False):
|
||||
# ''' Encrypt symetric object using Fernet '''
|
||||
# secret = get_secret()
|
||||
# f = Fernet(bytes(secret, encoding='utf-8'))
|
||||
# if dataframe:
|
||||
# token = bytes(to_csv_string(token), encoding='utf-8')
|
||||
# return f.encrypt(token)
|
||||
|
||||
def encrypt_and_save(fn, data, file_type='.txt'):
|
||||
data = encrypt(bytes(data, encoding='utf-8'))
|
||||
fn_new = fn.replace(file_type, '.enc')
|
||||
with open(fn_new, "wb") as text_file:
|
||||
text_file.write(data)
|
||||
# def encrypt_and_save(fn, data, file_type='.txt'):
|
||||
# data = encrypt(bytes(data, encoding='utf-8'))
|
||||
# fn_new = fn.replace(file_type, '.enc')
|
||||
# with open(fn_new, "wb") as text_file:
|
||||
# text_file.write(data)
|
|
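
A brief sketch of the new lookup helpers above (illustrative; assumes `helper.py` is importable and its lookup tables are as shown in this diff):

```python
import sys
sys.path.append('./code')

import helper as he

he.get_farm_model('bert', 'de')         # -> 'bert-base-german-cased'
he.get_farm_model('xlm-roberta', 'de')  # no 'de' entry, falls back to the 'xx' key -> 'xlm-roberta-base'
                                        #    (raises if neither the language nor 'xx' is defined)

he.get_flair_model('de', 'model')       # -> 'ner-multi-fast' (model id for SequenceTagger.load)
he.get_flair_model('de', 'fn')          # -> 'ner-multi-fast.pt' (local file name)

# spaCy models are downloaded on first use if missing
nlp = he.load_spacy_model(language='de', disable=['ner', 'parser', 'tagger'])
```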
@ -32,16 +32,16 @@ def score(task):
|
|||
elif task_type == 'qa':
|
||||
return rank.Rank(task=task, inference=True)
|
||||
else:
|
||||
logger.info('TASK TYPE NOT SUPPORTED')
|
||||
logger.warning('TASK TYPE NOT SUPPORTED')
|
||||
return None
|
||||
|
||||
def init():
|
||||
global task_models, prepare_classes
|
||||
|
||||
# Unpack model dependencies
|
||||
dt_init = dt.Data(inference=True)
|
||||
shutil.unpack_archive(dt_init.fn_lookup['fn_asset'], dt_init.data_dir, 'zip')
|
||||
logger.info(f'[INFO] Unpacked model assets from {dt_init.fn_lookup["fn_asset"]}')
|
||||
# dt_init = dt.Data(inference=True)
|
||||
# shutil.unpack_archive(dt_init.fn_lookup['fn_asset'], dt_init.data_dir, 'zip')
|
||||
# logger.warning(f'[INFO] Unpacked model assets from {dt_init.fn_lookup["fn_asset"]}')
|
||||
|
||||
# Load models & prepare steps
|
||||
task_models = []
|
||||
|
@ -54,21 +54,18 @@ def init():
|
|||
'params' : cu.tasks.get(str(task))
|
||||
})
|
||||
prepare_classes[task] = pr.Clean(task=task, inference=True)
|
||||
logger.info(f'[INFO] Loaded model and prepare steps for task {task}.')
|
||||
|
||||
def run_model():
|
||||
pass
|
||||
logger.warning(f'[INFO] Loaded model and prepare steps for task {task}.')
|
||||
|
||||
def run(req):
|
||||
# Load request
|
||||
req_data = json.loads(req)
|
||||
req_data = json.loads(req)[0]
|
||||
# Prepare text
|
||||
if 'subject' in req_data[0]:
|
||||
s = req_data[0]['subject']
|
||||
if 'subject' in req_data:
|
||||
s = req_data['subject']
|
||||
else:
|
||||
s = ''
|
||||
if 'body' in req_data[0]:
|
||||
b = req_data[0]['body']
|
||||
if 'body' in req_data:
|
||||
b = req_data['body']
|
||||
else:
|
||||
b = ''
|
||||
text = he.validate_concat(s, b)
|
||||
|
@ -82,6 +79,7 @@ def run(req):
|
|||
result = tm['infer'].inference_from_dicts(dicts=[{"text": clean, "cat": _cat}])
|
||||
try:
|
||||
# Special treatment for classification (FARM)
|
||||
##TODO: standardize for all
|
||||
_temp = []
|
||||
for r in result[0]['predictions']:
|
||||
_temp.append(dict(
|
||||
|
@ -98,7 +96,7 @@ def run(req):
|
|||
"params" : tm['params'],
|
||||
"result" : result
|
||||
})
|
||||
logger.info(f'[INFO] Completed {tm["task"]}.')
|
||||
logger.warning(f'[INFO] Completed task {tm["task"]}.')
|
||||
return res
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -29,7 +29,7 @@ import helper as he
|
|||
# Custom FLAIR element for spacy pipeline
|
||||
class FlairMatcher(object):
|
||||
name = "flair"
|
||||
|
||||
##TODO: run on stored headless models
|
||||
def __init__(self, path):
|
||||
self.tagger = he.load_flair_model(path=path)
|
||||
|
||||
|
@ -57,7 +57,7 @@ class CustomNER():
|
|||
|
||||
set_all_seeds(seed=42)
|
||||
device, n_gpu = initialize_device_settings(use_cuda=True)
|
||||
lang_model = he.farm_model_lookup.get(model_type).get(language)
|
||||
lang_model = he.get_farm_model(model_type, language)
|
||||
save_dir = dt_task.model_dir.replace('model_type', model_type)
|
||||
# ner_labels = dt_task.load('fn_label', header=None)[0].to_list() TODO:
|
||||
ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
|
||||
|
|
126 code/prepare.py
|
@ -48,6 +48,7 @@ class Clean():
|
|||
"""
|
||||
|
||||
def __init__(self, task,
|
||||
download=True,
|
||||
inference=False):
|
||||
self.task = task
|
||||
self.language = cu.params.get('language')
|
||||
|
@ -55,6 +56,11 @@ class Clean():
|
|||
# Load data class
|
||||
self.dt = dt.Data(task=self.task, inference=inference)
|
||||
|
||||
# Download data, if needed
|
||||
if download:
|
||||
self.dt.download(dataset_name = self.dt.fn_lookup.get('fn_source').split('.')[0], source='datastore')
|
||||
self.dt.download(dataset_name = self.dt.fn_lookup.get('fp_train'), source='datastore')
|
||||
|
||||
# Load spacy model
|
||||
self.nlp = he.load_spacy_model(language=self.language, disable=['ner','parser','tagger'])
|
||||
|
||||
|
@ -62,21 +68,20 @@ class Clean():
|
|||
stopwords_active = []
|
||||
## Load names
|
||||
try:
|
||||
with open(self.dt.fn_lookup['fn_names'], encoding='utf-8') as f:
|
||||
names = f.readlines()
|
||||
names = self.dt.load('fn_names', file_type='list')
|
||||
stopwords_active = stopwords_active + names
|
||||
except Exception as e:
|
||||
logger.info(f'[WARNING] No names list loaded: {e}')
|
||||
logger.warning(f'[WARNING] No names list loaded: {e}')
|
||||
|
||||
## Load stopwords
|
||||
try:
|
||||
with open(self.dt.fn_lookup['fn_stopwords'], encoding='utf-8') as f:
|
||||
stopwords = f.readlines()
|
||||
stopwords = self.dt.load('fn_stopwords', file_type='list')
|
||||
stopwords_active = stopwords_active + stopwords
|
||||
except Exception as e:
|
||||
logger.info(f'[WARNING] No stopwords list loaded: {e}')
|
||||
logger.info(f'[INFO] Active stopwords list length: {len(stopwords_active)}')
|
||||
logger.warning(f'[WARNING] No stopwords list loaded: {e}')
|
||||
|
||||
## Add to Spacy stopword list
|
||||
logger.warning(f'[INFO] Active stopwords list length: {len(stopwords_active)}')
|
||||
for w in stopwords_active:
|
||||
self.nlp.vocab[w.replace('\n','')].is_stop = True
|
||||
|
||||
|
@ -254,23 +259,21 @@ class Clean():
|
|||
return_token = True
|
||||
)[0]
|
||||
else:
|
||||
logger.info('[WARNING] No transform by task found.')
|
||||
logger.warning('[WARNING] No transform by task found.')
|
||||
return text[0]
|
||||
|
||||
def prepare_classification(task, do_format, train_split, min_cat_occurance,
|
||||
min_char_length, download_source):
|
||||
# Get clean object
|
||||
cl = Clean(task=task)
|
||||
|
||||
if download_source:
|
||||
cl.dt.download(source='datastore')
|
||||
# Get clean object
|
||||
cl = Clean(task=task, download=download_source)
|
||||
|
||||
# Load data
|
||||
if do_format:
|
||||
data = cl.dt.process(data_type=cu.params.get('prepare').get('data_type'))
|
||||
else:
|
||||
data = cl.dt.load('fn_prep')
|
||||
logger.info(f'Data Length : {len(data)}')
|
||||
logger.warning(f'Data Length : {len(data)}')
|
||||
|
||||
# Load text & label field
|
||||
text_raw = cu.load_text(data)
|
||||
|
@ -286,21 +289,21 @@ def prepare_classification(task, do_format, train_split, min_cat_occurance,
|
|||
|
||||
# Filter by length
|
||||
data = he.remove_short(data, 'text', min_char_length=min_char_length)
|
||||
logger.info(f'Data Length : {len(data)}')
|
||||
logger.warning(f'Data Length : {len(data)}')
|
||||
|
||||
# Remove duplicates
|
||||
data_red = data.drop_duplicates(subset=['text'])
|
||||
logger.info(f'Data Length : {len(data_red)}')
|
||||
logger.warning(f'Data Length : {len(data_red)}')
|
||||
|
||||
# Min class occurance
|
||||
data_red = data_red[data_red.groupby('label').label.transform('size') > min_cat_occurance]
|
||||
logger.info(f'Data Length : {len(data_red)}')
|
||||
logger.warning(f'Data Length : {len(data_red)}')
|
||||
|
||||
data_red = data_red.reset_index(drop=True).copy()
|
||||
|
||||
# Label list
|
||||
label_list = data_red.label.drop_duplicates()
|
||||
logger.info(f'Excluded labels: {list(set(label_list_raw)-set(label_list))}')
|
||||
logger.warning(f'Excluded labels: {list(set(label_list_raw)-set(label_list))}')
|
||||
|
||||
# Split data
|
||||
strf_split = StratifiedShuffleSplit(n_splits = 1, test_size=(1-train_split), random_state=200)
|
||||
|
@ -314,26 +317,28 @@ def prepare_classification(task, do_format, train_split, min_cat_occurance,
|
|||
cl.dt.save(df_cat_test[['text','label']], fn = 'fn_test')
|
||||
cl.dt.save(label_list, fn = 'fn_label', header=False)
|
||||
|
||||
# Upload data
|
||||
# cl.dt.upload('fn_prep', task=task, step='prep', destination='dataset')
|
||||
cl.dt.upload('fp_data', task=task, step='train', destination='dataset')
|
||||
|
||||
def prepare_ner(task, do_format=True):
|
||||
pass
|
||||
|
||||
def prepare_qa(task, do_format, min_char_length, download_source):
|
||||
# Get clean object
|
||||
cl = Clean(task=task)
|
||||
|
||||
if download_source:
|
||||
cl.dt.download(source='datastore')
|
||||
# Get clean object
|
||||
cl = Clean(task=task, download=download_source)
|
||||
|
||||
# Load data
|
||||
if do_format:
|
||||
data = cl.dt.process(data_type=cu.params.get('prepare').get('data_type'))
|
||||
else:
|
||||
data = cl.dt.load('fn_prep')
|
||||
logger.info(f'Data Length : {len(data)}')
|
||||
logger.warning(f'Data Length : {len(data)}')
|
||||
|
||||
# Filter relevant question answer pairs
|
||||
data = cu.filter_qa(data)
|
||||
logger.info(f'Data Length : {len(data)}')
|
||||
logger.warning(f'Data Length : {len(data)}')
|
||||
|
||||
# Load question & answer fields
|
||||
question, answer = cu.load_qa(data)
|
||||
|
@ -370,19 +375,24 @@ def prepare_qa(task, do_format, min_char_length, download_source):
|
|||
|
||||
# Filter by length
|
||||
data = he.remove_short(data, 'question_clean', min_char_length=min_char_length)
|
||||
logger.info(f'Data Length : {len(data)}')
|
||||
logger.warning(f'Data Length : {len(data)}')
|
||||
|
||||
# Remove duplicates
|
||||
data = data.drop_duplicates(subset=['question_clean'])
|
||||
logger.info(f'Data Length : {len(data)}')
|
||||
logger.warning(f'Data Length : {len(data)}')
|
||||
|
||||
data = data.reset_index(drop=True).copy()
|
||||
|
||||
# Save data
|
||||
cl.dt.save(data, fn = 'fn_clean')
|
||||
|
||||
def run_prepare(task=1, do_format=False, split=0.9, min_cat_occurance=300, min_char_length=20, download_source=False):
|
||||
logger.info(f'Running <PREPARE> for task {task}')
|
||||
def main(task=1,
|
||||
do_format=False,
|
||||
split=0.9,
|
||||
min_cat_occurance=300,
|
||||
min_char_length=20,
|
||||
download_source=False):
|
||||
logger.warning(f'Running <PREPARE> for task {task}')
|
||||
|
||||
task_type = cu.tasks.get(str(task)).get('type')
|
||||
if 'classification' == task_type:
|
||||
|
@ -392,34 +402,40 @@ def run_prepare(task=1, do_format=False, split=0.9, min_cat_occurance=300, min_c
|
|||
elif 'qa' == task_type:
|
||||
prepare_qa(task, do_format, min_char_length, download_source)
|
||||
else:
|
||||
logger.info('[ERROR] TASK TYPE UNKNOWN. Nothing was processed.')
|
||||
logger.warning('[ERROR] TASK TYPE UNKNOWN. Nothing was processed.')
|
||||
|
||||
def run():
|
||||
"""Run from the command line"""
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--task",
|
||||
default=1,
|
||||
type=int,
|
||||
help="Task where: \
|
||||
-task 1 : classification subcat \
|
||||
-task 2 : classification cat \
|
||||
-task 3 : ner \
|
||||
-task 4 : qa")
|
||||
parser.add_argument('--do_format',
|
||||
action='store_true',
|
||||
help="Avoid reloading and normalizing data")
|
||||
parser.add_argument("--split",
|
||||
default=0.9,
|
||||
type=float,
|
||||
help="Train test split. Dev split is taken from train set.")
|
||||
parser.add_argument("--min_cat_occurance",
|
||||
default=300,
|
||||
type=int,
|
||||
help="Min occurance required by category.")
|
||||
parser.add_argument("--download_source",
|
||||
action='store_true')
|
||||
args = parser.parse_args()
|
||||
run_prepare(args.task, args.do_format, args.split, args.min_cat_occurance, args.download_source)
|
||||
# def run(): 'TODO: run train.py for single run
|
||||
# """Run from the command line"""
|
||||
# parser = argparse.ArgumentParser()
|
||||
# parser.add_argument("--task",
|
||||
# default=1,
|
||||
# type=int,
|
||||
# help="Task where: \
|
||||
# -task 1 : classification subcat \
|
||||
# -task 2 : classification cat \
|
||||
# -task 3 : ner \
|
||||
# -task 4 : qa")
|
||||
# parser.add_argument('--do_format',
|
||||
# action='store_true',
|
||||
# help="Avoid reloading and normalizing data")
|
||||
# parser.add_argument("--split",
|
||||
# default=0.9,
|
||||
# type=float,
|
||||
# help="Train test split. Dev split is taken from train set.")
|
||||
# parser.add_argument("--min_char_length",
|
||||
# default=20,
|
||||
# type=int,
|
||||
# help="")
|
||||
# parser.add_argument("--min_cat_occurance",
|
||||
# default=300,
|
||||
# type=int,
|
||||
# help="Min occurance required by category.")
|
||||
# parser.add_argument("--download_source",
|
||||
# action='store_true')
|
||||
# args = parser.parse_args()
|
||||
# run_prepare(args.task, args.do_format, args.split, min_cat_occurance=args.min_cat_occurance,
|
||||
# min_char_length=args.min_char_length, download_source=args.download_source)
|
||||
# #TODO: cleanup
|
||||
|
||||
if __name__ == '__main__':
|
||||
run()
|
||||
main()
|
|
@ -48,7 +48,7 @@ class Rank():
|
|||
if cats is not None and cats != '':
|
||||
#TODO: does not work for lists
|
||||
_data = _data[_data.appliesTo.str.contains(cats)].reset_index(drop=True)
|
||||
logger.info(f'[INFO] Reduced answer selection to {len(_data)} from {len(self.data)}.')
|
||||
logger.warning(f'[INFO] Reduced answer selection to {len(_data)} from {len(self.data)}.')
|
||||
|
||||
# BM25 Score threshold
|
||||
_data = _data[_data.score > ans_thresh].reset_index(drop=True)
|
||||
|
@ -90,7 +90,7 @@ def create_bm25():
|
|||
with open(cl.dt.fn_lookup['fn_rank'], 'wb') as fp:
|
||||
pickle.dump(bm, fp)
|
||||
pickle.dump(data, fp)
|
||||
logger.info('Create and stored BM25 object.')
|
||||
logger.warning('Create and stored BM25 object.')
|
||||
|
||||
if __name__ == "__main__":
|
||||
create_bm25()
|
|
@ -1,4 +1,5 @@
|
|||
"""
|
||||
#TODO: replace with python step functions
|
||||
Finetuning the model for sub category classification.
|
||||
|
||||
Task 1 - forum entry classification
|
||||
|
@ -6,8 +7,6 @@ Task 2 - ner (MS products)
|
|||
Task 3 - qa ranking
|
||||
Task 4 - urgency / priority
|
||||
|
||||
|
||||
|
||||
INPUT:
|
||||
- language
|
||||
- task
|
||||
|
@ -25,6 +24,8 @@ OUTPUT:
|
|||
- status
|
||||
"""
|
||||
import argparse
|
||||
import mlflow
|
||||
from farm.utils import MLFlowLogger
|
||||
|
||||
# Custom functions
|
||||
import sys
|
||||
|
@ -34,8 +35,7 @@ import classification
|
|||
# import ner
|
||||
# import rank
|
||||
|
||||
import logging
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
# from azureml.core import Run
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
@ -50,7 +50,6 @@ def main():
|
|||
-task 3 : ner \
|
||||
-task 4 : qa")
|
||||
|
||||
|
||||
### PREPARE
|
||||
parser.add_argument('--do_format',
|
||||
action='store_true',
|
||||
|
@ -63,6 +62,10 @@ def main():
|
|||
default=300,
|
||||
type=int,
|
||||
help="Min occurance required by category.")
|
||||
parser.add_argument("--min_char_length",
|
||||
default=20,
|
||||
type=int,
|
||||
help="")
|
||||
parser.add_argument("--download_source",
|
||||
action='store_true')
|
||||
|
||||
|
@ -79,7 +82,7 @@ def main():
|
|||
action='store_true',
|
||||
help="Use CUDA for training")
|
||||
parser.add_argument('--n_epochs',
|
||||
default=5,
|
||||
default=3,
|
||||
type=int,
|
||||
help='')
|
||||
parser.add_argument('--batch_size',
|
||||
|
@ -99,7 +102,7 @@ def main():
|
|||
type=int,
|
||||
help='')
|
||||
parser.add_argument('--learning_rate',
|
||||
default=0.5e-5,
|
||||
default=3e-5,
|
||||
type=float,
|
||||
help='')
|
||||
parser.add_argument('--do_lower_case',
|
||||
|
@ -108,18 +111,16 @@ def main():
|
|||
parser.add_argument('--register_model',
|
||||
action='store_true',
|
||||
help="Register model in AML")
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
# Run prepare
|
||||
prepare.run_prepare(args.task, args.do_format, args.split, args.min_cat_occurance, args.download_source)
|
||||
|
||||
prepare.main(args.task, args.do_format, args.split, args.min_cat_occurance, args.min_char_length, args.download_source)
|
||||
|
||||
# Run train
|
||||
classification.doc_classification(args.task, args.model_type, args.n_epochs, args.batch_size, args.embeds_dropout, args.evaluate_every,
|
||||
args.use_cuda, args.max_seq_len, args.learning_rate, args.do_lower_case, args.register_model)
|
||||
|
||||
classification.doc_classification(task=args.task, model_type=args.model_type, n_epochs=args.n_epochs,
|
||||
batch_size=args.batch_size, embeds_dropout=args.embeds_dropout, evaluate_every=args.evaluate_every,
|
||||
use_cuda=args.use_cuda, max_seq_len=args.max_seq_len, learning_rate=args.learning_rate, do_lower_case=args.do_lower_case,
|
||||
register_model=args.register_model)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -17,8 +17,7 @@ import shutil
|
|||
import json
|
||||
|
||||
from azureml.core.authentication import InteractiveLoginAuthentication, MsiAuthentication
|
||||
from azureml.core import Workspace
|
||||
from azureml.core import Model
|
||||
from azureml.core import Workspace, Model
|
||||
from azureml.core.resource_configuration import ResourceConfiguration
|
||||
from azureml.core.webservice import Webservice, AciWebservice, AksWebservice
|
||||
from azureml.core import Environment
|
||||
|
@ -66,6 +65,7 @@ dt_assets = dt.Data()
|
|||
##############################
|
||||
## ZIP DEPENDENCIES
|
||||
##############################
|
||||
model_name = f'nlp_{language}_{env}'
|
||||
if do_zip:
|
||||
logger.warning(f'[INFO] Zipping model assets -> {model_name}')
|
||||
# Zip Assets
|
||||
|
@ -88,7 +88,6 @@ if do_zip:
|
|||
##############################
|
||||
## UPLOAD DEPENDENCIES
|
||||
##############################
|
||||
model_name = f'nlp_{language}_{env}'
|
||||
if upload:
|
||||
logger.warning(f'[INFO] Uploading model assets -> {model_name}')
|
||||
# Upload Assets
|
||||
|
@ -121,6 +120,8 @@ else:
|
|||
environment = Environment('farmenv')
|
||||
environment.python.conda_dependencies = CondaDependencies.create(pip_packages=[
|
||||
'azureml-defaults',
|
||||
'mlflow',
|
||||
'azureml-mlflow',
|
||||
'spacy',
|
||||
'transformers==2.3.0',
|
||||
'scipy',
|
||||
|
|
|
@ -1,8 +1,15 @@
|
|||
'''Functions to deploy training
|
||||
"""
|
||||
Functions to deploy training
|
||||
|
||||
'''
|
||||
To run locally, use:
|
||||
> cd ./code
|
||||
> conda activate nlp
|
||||
> python deploy/training.py
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import math
|
||||
from azureml.core import Workspace, Experiment
|
||||
from azureml.train.dnn import PyTorch
|
||||
|
@ -10,26 +17,33 @@ from azureml.train.hyperdrive import (BayesianParameterSampling,
|
|||
HyperDriveConfig, PrimaryMetricGoal,
|
||||
choice, uniform, loguniform)
|
||||
|
||||
# PARAMETERS
|
||||
language = 'de'
|
||||
single_run = True
|
||||
compute_name = 'gpucluster-nc6'
|
||||
experiment_name = f"msforum_{language}"
|
||||
|
||||
############################################
|
||||
##### AML Setup
|
||||
############################################
|
||||
|
||||
## Workspace
|
||||
# auth = InteractiveLoginAuthentication(tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")
|
||||
|
||||
ws = Workspace.get(name='nlp-ml',
|
||||
subscription_id='50324bce-875f-4a7b-9d3c-0e33679f5d72',
|
||||
resource_group='nlp')
|
||||
# ,auth=auth)
|
||||
|
||||
## Compute target
|
||||
compute_name = 'gpucluster-nc12'
|
||||
compute_target= ws.compute_targets[compute_name]
|
||||
script_folder = "./"
|
||||
script_folder = "."
|
||||
|
||||
#TODO: load from file
|
||||
pip_packages=[
|
||||
'azureml-sdk',
|
||||
'azureml-dataprep[pandas,fuse]',
|
||||
'mlflow',
|
||||
'azureml-mlflow',
|
||||
'spacy',
|
||||
'transformers==2.3.0',
|
||||
'scipy',
|
||||
|
@ -42,8 +56,8 @@ pip_packages=[
|
|||
'seqeval',
|
||||
'mlflow==1.0.0',
|
||||
'dotmap==1.3.0',
|
||||
'git+https://github.com/deepset-ai/FARM.git',
|
||||
'git+https://github.com/zalandoresearch/flair.git'
|
||||
'farm==0.4.1',
|
||||
'flair==0.4.5'
|
||||
]
|
||||
conda_packages=[
|
||||
# 'pytorch',
|
||||
|
@ -58,46 +72,63 @@ conda_packages=[
|
|||
##### Task 1
|
||||
############################################
|
||||
|
||||
fn_config_infer = 'config.json'
|
||||
shutil.copy(f'./project/msforum_{language}.config.json', f'./code/{fn_config_infer}')
|
||||
|
||||
os.chdir('./code')
|
||||
|
||||
## Experiment
|
||||
experiment_name = "answers-de"
|
||||
exp = Experiment(workspace = ws, name = experiment_name)
|
||||
## Config
|
||||
script_params = {
|
||||
'--task' : 1,
|
||||
'--do_format' : '',
|
||||
'--download_source' : '',
|
||||
# '--model_type' : 'roberta',
|
||||
'--use_cuda' : '',
|
||||
'--batch_size' : 4
|
||||
# '--learning_rate' : 0.5e-5
|
||||
'--n_epochs' : 3,
|
||||
# '--learning_rate' : 2e-5,
|
||||
# '--model_type' : 'roberta',
|
||||
# '--max_seq_len' : 128, #256,
|
||||
# '--embeds_dropout' : 0.3,
|
||||
# '--register_model' : ''
|
||||
}
|
||||
est = PyTorch(source_directory = script_folder,
|
||||
compute_target = compute_target,
|
||||
script_params = script_params,
|
||||
entry_script = 'code/train.py',
|
||||
entry_script = 'train.py',
|
||||
pip_packages = pip_packages,
|
||||
conda_packages = conda_packages,
|
||||
use_gpu = True)
|
||||
## Run
|
||||
# run = exp.submit(est)
|
||||
# run.wait_for_completion(show_output = True)
|
||||
### Hyperparameters params
|
||||
param_sampling = BayesianParameterSampling( {
|
||||
'--learning_rate' : choice(0.5e-5, 1e-5, 2e-5, 3e-5),
|
||||
# '--model_type' : choice('roberta','bert','albert')
|
||||
'--model_type' : choice('distilbert','bert')
|
||||
})
|
||||
## Prepare HyperDrive Config
|
||||
hdc = HyperDriveConfig(estimator=est,
|
||||
hyperparameter_sampling = param_sampling,
|
||||
policy = None, # NOTE: not possible for bayesian
|
||||
primary_metric_name = 'f1macro',
|
||||
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
|
||||
max_total_runs = 40,
|
||||
max_concurrent_runs = 1)
|
||||
## Run hyperparameter tuning
|
||||
hyperdrive_run = exp.submit(config=hdc)
|
||||
hyperdrive_run.wait_for_completion(show_output = True)
|
||||
## Get Results
|
||||
best_run = hyperdrive_run.get_best_run_by_primary_metric()
|
||||
print(best_run)
|
||||
if single_run:
|
||||
run = exp.submit(est)
|
||||
#Remove temp config
|
||||
os.remove(fn_config_infer)
|
||||
run.wait_for_completion(show_output = True)
|
||||
else:
|
||||
### Hyperparameters params
|
||||
if language == 'en':
|
||||
model_type = choice('roberta','bert','albert') #,'xlm-roberta'
|
||||
elif language == 'de':
|
||||
model_type = choice('distilbert','bert', 'roberta')
|
||||
param_sampling = BayesianParameterSampling({
|
||||
'--learning_rate' : choice(1e-5, 2e-5, 3e-5, 4e-5),
|
||||
'--model_type' : model_type,
|
||||
'--max_seq_len' : choice(64, 128, 256),
|
||||
'--embeds_dropout' : choice(0.1, 0.2, 0.3, 0.4)
|
||||
})
|
||||
## Prepare HyperDrive Config
|
||||
hdc = HyperDriveConfig(estimator=est,
|
||||
hyperparameter_sampling = param_sampling,
|
||||
policy = None, # NOTE: not possible for bayesian
|
||||
primary_metric_name = 'f1macro',
|
||||
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
|
||||
max_total_runs = 80,
|
||||
max_concurrent_runs = 1)
|
||||
## Run hyperparameter tuning
|
||||
hyperdrive_run = exp.submit(config=hdc)
|
||||
#Remove temp config
|
||||
os.remove(fn_config_infer)
|
||||
hyperdrive_run.wait_for_completion(show_output = True)
|
||||
## Get Results
|
||||
# best_run = hyperdrive_run.get_best_run_by_primary_metric()
|
|
@ -7,20 +7,23 @@ dependencies:
|
|||
- gensim=3.8.1
|
||||
- pip:
|
||||
# - azureml-defaults for aml deployment
|
||||
- azureml-sdk
|
||||
- azureml-dataprep[pandas,fuse]
|
||||
- azureml-sdk==1.0.85
|
||||
- azureml-dataprep[pandas,fuse]==1.1.38
|
||||
- mlflow==1.0.0
|
||||
- azureml-mlflow==1.0.85
|
||||
# - imblearn
|
||||
- spacy==2.2.1
|
||||
- transformers==2.3.0
|
||||
# - farm==0.3.2
|
||||
- 'git+https://github.com/deepset-ai/FARM.git'
|
||||
# - flair
|
||||
- git+https://github.com/zalandoresearch/flair.git
|
||||
- transformers==2.4.1
|
||||
- farm==0.4.1
|
||||
# - 'git+https://github.com/deepset-ai/FARM.git'
|
||||
- flair==0.4.5
|
||||
# - git+https://github.com/zalandoresearch/flair.git
|
||||
- azure-storage-blob
|
||||
- streamlit
|
||||
- selenium==3.141.0
|
||||
- bs4
|
||||
##DEMO ENV
|
||||
- pillow
|
||||
- streamlit==0.48.1
|
||||
# - langdetect
|
||||
# - lightgbm
|
||||
# - pandas_ml
|
||||
|
@ -32,9 +35,4 @@ dependencies:
|
|||
# - matplotlib
|
||||
# - seaborn
|
||||
|
||||
# temporary fix for flair
|
||||
# pip install --upgrade git+https://github.com/zalandoresearch/flair.git
|
||||
|
||||
#python -m ipykernel install --user --name nlp --display-name "Python (nlp)"
|
||||
|
||||
# conda install pytorch torchvision cpuonly -c pytorch
|
||||
#python -m ipykernel install --user --name nlp --display-name "Python (nlp)"
|
|
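An optional sanity check (not part of the environment file) to confirm the pins above once the environment is created; a small sketch:

import pkg_resources

# expected per the pins above: spacy 2.2.1, transformers 2.4.1, farm 0.4.1, flair 0.4.5
for pkg in ('spacy', 'transformers', 'farm', 'flair'):
    print(pkg, pkg_resources.get_distribution(pkg).version)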
@ -1 +0,0 @@
|
|||
{"Id": null, "Scope": "/subscriptions/50324bce-875f-4a7b-9d3c-0e33679f5d72/resourceGroups/nlp/providers/Microsoft.MachineLearningServices/workspaces/nlp-ml"}
|
File diff suppressed because one or more lines are too long
|
@ -12,15 +12,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"I0113 15:26:41.951092 3456 file_utils.py:35] PyTorch version 1.3.1 available.\n",
|
||||
"I0113 15:26:45.606345 3456 custom.py:19] [INFO] Project Target Language **en**\n"
|
||||
"I0110 11:14:41.291058 22612 custom.py:19] [INFO] Project Target Language **en**\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -32,76 +31,6 @@
|
|||
"import ner"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2020-01-13 18:15:32,174 loading file C:/Users/makayser/Desktop/nlp_local//en-ner-ontonotes-fast-v0.4.pt\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nr = ner.NER()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = 'I have been using windows 7 and getting some errors with the following code 0x800700c1. What could this mean? I use windows 7 from steve ballmer'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'value': '7',\n",
|
||||
" 'start': 26,\n",
|
||||
" 'end': 27,\n",
|
||||
" 'label': 'CARDINAL',\n",
|
||||
" 'source': 'flair'},\n",
|
||||
" {'value': 'ballmer',\n",
|
||||
" 'start': 137,\n",
|
||||
" 'end': 144,\n",
|
||||
" 'label': 'PERSON',\n",
|
||||
" 'source': 'flair'},\n",
|
||||
" {'value': 'windows 7',\n",
|
||||
" 'start': 4,\n",
|
||||
" 'end': 6,\n",
|
||||
" 'label': 'Product',\n",
|
||||
" 'source': 'list'},\n",
|
||||
" {'value': 'steve ballmer',\n",
|
||||
" 'start': 26,\n",
|
||||
" 'end': 28,\n",
|
||||
" 'label': 'Boss',\n",
|
||||
" 'source': 'list'},\n",
|
||||
" {'value': '0x800700c1.',\n",
|
||||
" 'start': 76,\n",
|
||||
" 'end': 87,\n",
|
||||
" 'label': 'ERROR CODE',\n",
|
||||
" 'source': 'Regex'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nr.run(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
|
|
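The cells above show that ner.NER().run() returns a list of dicts with value, start, end, label and source keys; a hypothetical helper (not in the repo) to group those hits by the component that produced them:

from collections import defaultdict

def group_by_source(entities):
    """Group NER hits by their source (e.g. 'flair', 'list', 'Regex')."""
    grouped = defaultdict(list)
    for ent in entities:
        grouped[ent['source']].append((ent['label'], ent['value']))
    return dict(grouped)

# usage: group_by_source(nr.run(text))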
@ -0,0 +1,16 @@
|
|||
# How to run your first project
|
||||
|
||||
## Steps
|
||||
1. Upload your source dataset (raw) to AzureML Datasets
|
||||
2. Upload your dependencies to AzureML Datasets (stopwords, custom ner)
|
||||
3. Customize custom.py with any required pre-processing steps
|
||||
4. Create a *.config.json where * = project name
|
||||
5. Run deploy/training.py
|
||||
6. Run deploy/inference.py
|
||||
|
||||
## Requirements
|
||||
For the NLP toolkit to work as expected, follow these naming requirements and best practices.
|
||||
|
||||
- Use language short forms (e.g. German = de, French = fr)
|
||||
- Naming
|
||||
- stopword list: stopwords-<language>.txt (tab-delimited, UTF-8)
|
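A minimal sketch of step 4 above, assuming a hypothetical project called 'myproject' in English; the keys mirror the msforum_*.config.json files further down in this diff:

import json

config = {
    "name": "myproject",            # the * in *.config.json
    "language": "en",               # language short form
    "environment": "dev",
    "data_dir": "./",
    "prepare": {"data_type": "json"}
}

with open("myproject.config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4, ensure_ascii=False)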
|
@ -1,6 +1,8 @@
|
|||
{
|
||||
"name":"msforum_de",
|
||||
"language": "de",
|
||||
"environment" : "dev",
|
||||
"data_dir" : "./",
|
||||
"prepare" : {
|
||||
"data_type" : "json"
|
||||
},
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
{
|
||||
"name":"msforum_en",
|
||||
"language": "en",
|
||||
"environment" : "dev",
|
||||
"data_dir" : "./",
|
||||
"prepare" : {
|
||||
"data_type" : "json"
|
||||
},
|
||||
|
|
|
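A small sketch (an assumption, not code from the repo) of reading such a config back, mirroring how the training script above copies msforum_{language}.config.json into ./code:

import json

with open('msforum_de.config.json', encoding='utf-8') as f:
    params = json.load(f)
print(params['name'], params['language'], params['environment'])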
@ -1,14 +1,15 @@
|
|||
import argparse
|
||||
# Run arguments
|
||||
# example: python 1_getsites.py --language de-de --product xbox
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--lang",
|
||||
parser.add_argument("--language",
|
||||
default="de-de",
|
||||
type=str,
|
||||
help="'en-us' or 'de-de")
|
||||
parser.add_argument('--product',
|
||||
default='windows',
|
||||
type=str,
|
||||
help="'windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect','edge','ie','musicandvideo'")
|
||||
help="'windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge','ie','musicandvideo'")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Import and set driver
|
||||
|
@ -20,14 +21,26 @@ driver = webdriver.Chrome(executable_path = path + 'chromedriver.exe')
|
|||
product = args.product
|
||||
language = args.language
|
||||
|
||||
# Scrape sites
|
||||
for x in range(1, 8000):
|
||||
driver.get(f'https://answers.microsoft.com/{language}/' + product + '/forum?sort=LastReplyDate&dir=Desc&tab=All&status=all&mod=&modAge=&advFil=&postedAfter=&postedBefore=&threadType=All&isFilterExpanded=false&page=' + str(x))
|
||||
html = driver.page_source
|
||||
if ('Es wurden keine Ergebnisse gefunden' in html) or ('No results found' in html):
|
||||
print('##### EMPTY PAGE REACHED -> EXIT')
|
||||
break
|
||||
else:
|
||||
with open('output-' + product + '.txt', 'a', encoding='utf-8') as myfile:
|
||||
myfile.write(html+'\n\n\n')
|
||||
print('Written:' + str(x))
|
||||
languages = ['it-it', 'fr-fr', 'en-us']
|
||||
products = ['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge', 'ie', 'musicandvideo']
|
||||
#
|
||||
for product in products:
|
||||
print(f'[START] {language}, {product}.')
|
||||
# Scrape sites
|
||||
for x in range(1, 10000):
|
||||
driver.get(f'https://answers.microsoft.com/{language}/' + product + '/forum?sort=LastReplyDate&dir=Desc&tab=All&status=all&mod=&modAge=&advFil=&postedAfter=&postedBefore=&threadType=All&isFilterExpanded=false&page=' + str(x))
|
||||
html = driver.page_source
|
||||
if ('Es wurden keine Ergebnisse gefunden' in html) or ('No results found' in html) or ('Aucun résultat trouvé' in html) or ('Nessun risultato trovato' in html) or ('Pubblica domande, segui le discussioni, condividi le tue conoscenze' in html) or ('Posten Sie Fragen, folgen Sie Diskussionen und teilen Sie Ihr Wissen' in html) or ('Post questions, follow discussions, share your knowledge' in html) or ('Publiez des questions, suivez des discussions et partagez vos connaissances' in html) or ('Publique preguntas, siga conversaciones y comparta sus conocimientos' in html):
|
||||
print(f'[EXIT] EMPTY PAGE REACHED -> - {language}, {product}.')
|
||||
break
|
||||
else:
|
||||
url_temp = re.findall(r'(https?://answers.microsoft.com/' + language + '/' + product + '/forum/[^\s]+)', html)
|
||||
#url_temp = re.findall(r'(https?://answers.microsoft.com/' + lang + '/windows/forum/[^\s]+)', docs)
|
||||
url_temp2 = [s.strip('"') for s in url_temp]
|
||||
url_list = [x for x in url_temp2 if not x.endswith('LastReply')]
|
||||
with open('output-' + product + '-' + language + '.txt', 'a', encoding='utf-8') as outfile:
|
||||
# Prepare Links
|
||||
outfile.write("\n".join(url_list))
|
||||
#myfile.write(url_list+'\n\n\n')
|
||||
if x % 500 == 0:
|
||||
print(f'[STATUS] Page no. {str(x)} written.')
|
|
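A possible refactor (not part of this change): collect the per-language 'empty page' phrases from the break condition above into one tuple, so the check stays readable as more languages are added:

EMPTY_PAGE_MARKERS = (
    'Es wurden keine Ergebnisse gefunden',
    'No results found',
    'Aucun résultat trouvé',
    'Nessun risultato trovato',
    'Post questions, follow discussions, share your knowledge',
)

def is_empty_page(html: str) -> bool:
    """True if the forum page signals that no further results exist."""
    return any(marker in html for marker in EMPTY_PAGE_MARKERS)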
@ -17,26 +17,26 @@ import argparse
|
|||
|
||||
# Run arguments
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--lang",
|
||||
parser.add_argument("--language",
|
||||
default="de-de",
|
||||
type=str,
|
||||
help="'en-us' or 'de-de")
|
||||
parser.add_argument('--product',
|
||||
default='windows',
|
||||
type=str,
|
||||
help="['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect']")
|
||||
help="['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge','ie','musicandvideo']")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Example: python 2_extract.py --language de-de --product windows
|
||||
|
||||
# Set params
|
||||
lang = args.language
|
||||
product = args.product
|
||||
|
||||
# Read File
|
||||
docs = codecs.open("output-" + product + ".txt", 'r', encoding='utf-8').read()
|
||||
|
||||
# Prepare Links
|
||||
url_temp = re.findall(r'(https?://answers.microsoft.com/' + lang + '/' + product + '/forum/[^\s]+)', docs)
|
||||
url_temp2 = [s.strip('"') for s in url_temp]
|
||||
url_list = [x for x in url_temp2 if not x.endswith('LastReply')]
|
||||
#with open("output-" + product + "-" + lang + ".txt") as f:
|
||||
# urls = f.readlines()
|
||||
# you may also want to remove whitespace characters like `\n` at the end of each line
|
||||
#url_list = [x.strip() for x in urls]
|
||||
|
||||
# Extract text content
|
||||
def getText(soup):
|
||||
|
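The link preparation above (regex match, stripping quotes, dropping 'LastReply' entries) could live in one small helper; a sketch, assuming it is called once per language/product dump:

import re

def extract_forum_urls(docs: str, lang: str, product: str) -> list:
    """Pull forum thread URLs for one language/product out of a scraped HTML dump."""
    pattern = r'(https?://answers\.microsoft\.com/' + lang + '/' + product + r'/forum/[^\s]+)'
    urls = [u.strip('"') for u in re.findall(pattern, docs)]
    return [u for u in urls if not u.endswith('LastReply')]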
@ -83,12 +83,12 @@ def getUsernameAnswer(soup):
|
|||
|
||||
# Create date of question
|
||||
def getDateQuestion(soup):
|
||||
date_question = soup.find_all("span", "asking-text-asked-on-link")[0].text.replace("\nErstellt am ", "").replace("\n", "")
|
||||
date_question = soup.find_all("span", "asking-text-asked-on-link")[0].text.replace("\nErstellt am ", "").replace("\nCréé le ", "").replace("\nCreado el ", "").replace("\nCreato il ", "").replace("\n", "")
|
||||
return date_question
|
||||
|
||||
# Create date of answer
|
||||
def getDateAnswer(soup):
|
||||
date_answer = soup.find_all("span", "asking-text-asked-on-link")[1].text.replace("\nBeantwortet am ", "").replace("\n", "")
|
||||
date_answer = soup.find_all("span", "asking-text-asked-on-link")[1].text.replace("\nBeantwortet am ", "").replace("\n Répondu le ", "").replace("\nRespondió el ", "").replace("\nRisposta il ", "").replace("\n", "")
|
||||
return date_answer
|
||||
|
||||
# Get number of same cases
|
||||
|
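The chained .replace() calls above strip a locale-specific 'created/answered on' prefix from the date text; a possible alternative (an assumption, not in this diff) that covers all known prefixes with one regex:

import re

DATE_PREFIX = (r'^\s*(Erstellt am|Créé le|Creado el|Creato il|'
               r'Beantwortet am|Répondu le|Respondió el|Risposta il)\s*')

def clean_date(raw: str) -> str:
    """Drop the localized prefix and any stray newlines around the date."""
    return re.sub(DATE_PREFIX, '', raw.replace('\n', ' ')).strip()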
@ -120,18 +120,17 @@ def getTags(soup, product):
|
|||
tags.append(subitem.text)
|
||||
except:
|
||||
tags = ""
|
||||
return product + "," + ",".join(tags)
|
||||
return f'{product},{",".join(tags)}'
|
||||
|
||||
# Put it all together
|
||||
def scrapeMe(url, product):
|
||||
print("Proceeding: ", url)
|
||||
print("[URL] -", url)
|
||||
### GET WEBSITE
|
||||
try:
|
||||
response = get(url)
|
||||
except:
|
||||
print("### ERROR")
|
||||
print("[ERROR] - There is an issue with the respective website.\n")
|
||||
html_soup = BeautifulSoup(response.text, 'html.parser')
|
||||
lang = "de-de"
|
||||
fileid = uuid.uuid4().hex
|
||||
|
||||
### GET TEXT
|
||||
|
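In scrapeMe above, a failed get() only prints an error and execution falls through to BeautifulSoup with an undefined response (the outer try/except in the calling loop then catches it); a defensive variant, sketched as an assumption rather than the author's implementation:

from bs4 import BeautifulSoup
from requests import get

def fetch_soup(url):
    """Return parsed HTML for url, or None if the request fails."""
    try:
        response = get(url)
    except Exception as exc:
        print(f"[ERROR] - Could not fetch {url}: {exc}\n")
        return None
    return BeautifulSoup(response.text, 'html.parser')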
@ -185,17 +184,37 @@ def scrapeMe(url, product):
|
|||
content = json.dumps(data, indent=4, separators=(',', ': '), ensure_ascii=False)
|
||||
|
||||
### WRITE TO JSON FILE
|
||||
with open("output-" + product + ".json", "a", encoding='utf-8') as file:
|
||||
#with open("output-" + product + "-" + lang + ".json", "a", encoding='utf-8') as file:
|
||||
with open(f"output-{lang}.json", "a", encoding='utf-8') as file:
|
||||
file.write(content+",")
|
||||
print("Written: File", fileid, "\n")
|
||||
print(f"[SUCCESS] - File {fileid}\n")
|
||||
|
||||
######################################################
|
||||
# LOOP THROUGH THE OUTPUT TEXT FILES AND CREATE JSON #
|
||||
######################################################
|
||||
for i, value in enumerate(url_list):
|
||||
i += 1
|
||||
|
||||
products = ['windows', 'msoffice', 'xbox', 'outlook_com', 'skype', 'surface', 'protect', 'edge', 'ie', 'musicandvideo']
|
||||
|
||||
for product in products:
|
||||
try:
|
||||
scrapeMe(value, product)
|
||||
except Exception as e:
|
||||
print(f'[ERROR] Failed to extract {value}')
|
||||
continue
|
||||
# Read File
|
||||
docs = codecs.open(f"output-{product}-{lang}.txt", 'r', encoding='utf-8').read()
|
||||
|
||||
# Prepare Links
|
||||
url_temp = re.findall(r'(https?://answers.microsoft.com/' + lang + '/' + product + '/forum/[^\s]+)', docs)
|
||||
url_temp2 = [s.strip('"') for s in url_temp]
|
||||
url_list = [x for x in url_temp2 if not x.endswith('LastReply')]
|
||||
|
||||
failed_url = []
|
||||
for i, value in enumerate(url_list):
|
||||
i += 1
|
||||
try:
|
||||
print(f'[STATUS] - {product}, {i}/{len(url_list)}')
|
||||
scrapeMe(value, product)
|
||||
except Exception as e:
|
||||
failed_url.append(value)
|
||||
print(f'[ERROR] - Failed to extract {value}')
|
||||
continue
|
||||
print(f"[DONE] - List for {product} of failed URLs: {failed_url},\nlen{failed_url}.")
|
||||
except:
|
||||
print(f"[ERROR] - 'output-{product}-{lang}.txt' does not exist.\n")
|
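Because the writer above appends json.dumps(...) plus a trailing comma, output-{lang}.json is a comma-separated stream of objects rather than a valid JSON array; a small post-processing sketch (an assumption) to load it back:

import json

def load_scraped(path: str) -> list:
    """Wrap the comma-joined JSON objects in brackets so they parse as one list."""
    with open(path, encoding='utf-8') as f:
        raw = f.read().rstrip().rstrip(',')
    return json.loads('[' + raw + ']')

# e.g. threads = load_scraped('output-de-de.json')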