verseagility/code/prepare.py

"""
PREPARE
Before running train, you need to run prepare.py with the respective task.
Example (in the command line):
> cd to root dir
> conda activate nlp
> python code/prepare.py --do_format --task 1
"""
import os
import spacy
import pandas as pd
import string
import re
import argparse
from sklearn.model_selection import StratifiedShuffleSplit
# Custom functions
import sys
sys.path.append('./code')
import helper as he
import data as dt
import custom as cu
logger = he.get_logger(location=__name__)
class Clean():
"""Text preprocessing and cleaning steps
SUPPORTED LANGUAGES
- EN
- DE
- IT
- XX (multi - NER only)
SUPPORTED MODULES
- Remove Noise
Remove formatting and other noise that may be contained in emails or
other document types.
- Get Placeholders
Replace common items such as dates, times, URLs and custom customer IDs
with type-specific placeholders.
- Remove Stopwords
Stopwords can be added by placing a language-specific stopword file
in /assets. Format: "assets/stopwords_<language>.txt".
- Lemmatize
Reduce tokens to their lemmas using the loaded spaCy model.
"""
def __init__(self, task,
download_source=False,
download_train=False,
inference=False):
self.task = task
self.language = cu.params.get('language')
# Load data class
self.dt = dt.Data(task=self.task, inference=inference)
# Download data, if needed #TODO: move all downloads to data
if download_source and not os.path.isfile(self.dt.fn_lookup.get('fn_source')):
self.dt.download(dataset_name = self.dt.n_source, source = 'datastore') #TODO: still downloading?
if download_train:
self.dt.download(step = 'extra', source = 'datastore')
self.dt.download(task = task, step = 'train', source = 'datastore')
# if inference:
# self.dt.download(step = 'extra', source = 'datastore') #TODO: not working in deployed
# Load spacy model
self.nlp = he.load_spacy_model(language=self.language, disable=['ner','parser','tagger'])
# Create stopword list
stopwords_active = []
## Load names
try:
names = self.dt.load('fn_names', file_type='list')
stopwords_active = stopwords_active + names
except FileNotFoundError as e:
logger.warning(f'[WARNING] No names list loaded: {e}')
## Load stopwords
try:
stopwords = self.dt.load('fn_stopwords', file_type='list')
stopwords_active = stopwords_active + stopwords
except FileNotFoundError as e:
logger.warning(f'[WARNING] No stopwords list loaded: {e}')
## Add to Spacy stopword list
logger.warning(f'[INFO] Active stopwords list length: {len(stopwords_active)}')
for w in stopwords_active:
self.nlp.vocab[w.replace('\n','')].is_stop = True
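# NOTE: the name/stopword assets are plain text lists loaded via data.Data;
# assuming one entry per line, a stopword file could look like this
# (illustrative content only):
#   assets/stopwords_en.txt
#     the
#     and
#     regards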
def remove(self, line,
rm_email_formatting=False,
rm_email_header=False,
rm_email_footer=False,
rm_punctuation=False):
"""Remove content from text"""
# Customer Remove
line = cu.remove(line)
if rm_email_formatting:
line = re.sub(r'<[^>]+>', ' ', line) # Remove HTML tags
line = re.sub(r'^(.*\.eml)', ' ', line) # remove header for system generated emails
if rm_email_header:
#DE/EN
if self.language == 'en' or self.language == 'de':
line = re.sub(r'\b(AW|RE|VON|WG|FWD|FW)(\:| )', '', line, flags=re.I)
#DE
if self.language == 'de':
line = re.sub(r'(Sehr geehrte( Damen und Herren.)?.)|hallo.|guten( tag)?.', '', line, flags=re.I)
if rm_email_footer:
#EN
if self.language == 'en':
line = re.sub(r'\bkind regards.*', '', line, flags=re.I)
#DE
if self.language == 'de':
line = re.sub(r'\b(mit )?(beste|viele|liebe|freundlich\w+)? (gr[u,ü][ß,ss].*)', '', line, flags=re.I)
line = re.sub(r'\b(besten|herzlichen|lieben) dank.*', '', line, flags=re.I)
line = re.sub(r'\bvielen dank für ihr verständnis.*', '', line, flags=re.I)
line = re.sub(r'\bvielen dank im voraus.*', '', line, flags=re.I)
line = re.sub(r'\b(mfg|m\.f\.g) .*','', line, flags=re.I)
line = re.sub(r'\b(lg) .*','',line, flags=re.I)
line = re.sub(r'\b(meinem iPhone gesendet) .*','',line, flags=re.I)
line = re.sub(r'\b(Gesendet mit der (WEB|GMX)) .*','',line, flags=re.I)
line = re.sub(r'\b(Diese E-Mail wurde von Avast) .*','',line, flags=re.I)
# Remove remaining characters
##NOTE: may break other regex
if rm_punctuation:
line = re.sub('['+string.punctuation+']',' ',line)
return line
def get_placeholder(self, line,
rp_generic=False,
rp_custom=False,
rp_num=False):
'''Replace text with type-specific placeholders'''
# Customer placeholders
line = cu.get_placeholder(line)
# Generic placeholder
if rp_generic:
line = re.sub(r' \+[0-9]+', ' ', line) # remove phone numbers
line = re.sub(r'0x([a-z]|[0-9])+ ',' PER ',line, flags=re.IGNORECASE) # replace hex-style identifiers with placeholder
line = re.sub(r'[0-9]{2}[\/.,:][0-9]{2}[\/.,:][0-9]{2,4}', ' PDT ', line) # remove dates and time, replace with placeholder
line = re.sub(r'([0-9]{2,3}[\.]){3}[0-9]{1,3}',' PIP ',line) # replace ip with placeholder
line = re.sub(r'[0-9]{1,2}[\/.,:][0-9]{1,2}', ' PTI ', line) # remove only time, replace with placeholder
line = re.sub(r'[\w\.-]+@[\w\.-]+', ' PEM ', line) # remove emails
line = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', ' PUR ', line) # Remove links
line = re.sub(r'€|\$|(USD)|(EURO)', ' PMO ', line)
# Placeholders for numerics
if rp_num:
line = re.sub(r' ([0-9]{4,30}) ',' PNL ', line) # placeholder for long stand alone numbers
line = re.sub(r' [0-9]{2,3} ',' PNS ', line) # placeholder for short stand alone numbers
return line
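# Rough illustration of the generic placeholders (ignoring the customer
# specific cu.get_placeholder step):
#   cl.get_placeholder('Meet on 01/02/2020 at 12:30, mail me@example.com',
#                      rp_generic=True)
#   # -> roughly 'Meet on  PDT  at  PTI , mail  PEM '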
def tokenize(self, line, lemmatize = False, rm_stopwords = False):
'''Tokenizer for non-DL tasks'''
if not isinstance(line, str):
line = str(line)
if lemmatize and rm_stopwords:
line = ' '.join([t.lemma_ for t in self.nlp(line) if not t.is_stop])
elif lemmatize:
line = ' '.join([t.lemma_ for t in self.nlp(line)])
elif rm_stopwords:
line = ' '.join([t.text for t in self.nlp(line) if not t.is_stop])
return line
def transform(self, texts,
to_lower = False,
# Remove
rm_email_formatting = False,
rm_email_header = False,
rm_email_footer = False,
rm_punctuation = False,
# Placeholders
rp_generic = False,
rp_num = False,
# Tokenize
lemmatize = False,
rm_stopwords = False,
return_token = False,
# Whitespace
remove_whitespace = True
):
"""Main run function for cleaning process"""
if isinstance(texts, str):
texts = [texts]
# Convert to series for improved efficiency
df_texts = pd.Series(texts)
# Avoid loading errors
df_texts = df_texts.replace('\t', ' ', regex=True)
# Remove noise
if any((rm_email_formatting, rm_email_header,
rm_email_footer, rm_punctuation)):
df_texts = df_texts.apply(lambda x: self.remove(x,
rm_email_formatting = rm_email_formatting,
rm_email_header = rm_email_header,
rm_email_footer = rm_email_footer,
rm_punctuation = rm_punctuation))
# Replace placeholders
if any((rp_generic, rp_num)):
df_texts = df_texts.apply(lambda x: self.get_placeholder(x,
rp_generic = rp_generic,
rp_num = rp_num))
# Tokenize text
if any((lemmatize, rm_stopwords, return_token)):
df_texts = df_texts.apply(self.tokenize,
lemmatize = lemmatize,
rm_stopwords = rm_stopwords)
# To lower
if to_lower:
df_texts = df_texts.apply(str.lower)
# Remove spacing
if remove_whitespace:
df_texts = df_texts.apply(lambda x: " ".join(x.split()))
# Return Tokens
if return_token:
return [t.split(' ') for t in df_texts.to_list()]
else:
return df_texts.to_list()
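# Sketch of a typical transform call (assumes an English model/config and an
# instance cl; cu.remove may strip additional, customer-specific content):
#   cl.transform(['Hello <b>team</b>, kind regards Jane'],
#                rm_email_formatting=True,
#                rm_email_footer=True)
#   # -> roughly ['Hello team ,'] (HTML tags and the email footer removed)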
def transform_by_task(self, text):
# CUSTOM, task-specific transformation
if cu.tasks.get(str(self.task)).get('type') == 'classification':
return self.transform(text,
rm_email_formatting = True,
rm_email_header = True,
rm_email_footer = True,
rp_generic = True)[0]
elif cu.tasks.get(str(self.task)).get('type') == 'ner':
return text[0]
elif cu.tasks.get(str(self.task)).get('type') == 'qa':
return self.transform(text,
to_lower = True,
# Remove
rm_email_formatting = True,
rm_email_header = True,
rm_email_footer = True,
rm_punctuation = True,
# Placeholders
rp_generic = True,
rp_num = True,
# Tokenize
lemmatize = True,
rm_stopwords = True,
return_token = True
)[0]
else:
logger.warning('[WARNING] No transform by task found.')
return text[0]
def prepare_classification(task, do_format, train_split, min_cat_occurance,
min_char_length, register_data):
# Get clean object
cl = Clean(task=task, download_source=True)
# Load data
if do_format:
data = cl.dt.process(data_type=cu.params.get('prepare').get('data_type'))
else:
data = cl.dt.load('fn_prep')
logger.warning(f'Data Length : {len(data)}')
# Load text & label field
text_raw = cu.load_text(data)
data['label'] = cu.load_label(data, task)
label_list_raw = data.label.drop_duplicates()
# Clean text
data['text'] = cl.transform(text_raw,
rm_email_formatting = True,
rm_email_header = True,
rm_email_footer = True,
rp_generic = True)
# Filter by length
data = he.remove_short(data, 'text', min_char_length=min_char_length)
logger.warning(f'Data Length : {len(data)}')
# Remove duplicates
data_red = data.drop_duplicates(subset=['text'])
logger.warning(f'Data Length : {len(data_red)}')
# Min class occurance
data_red = data_red[data_red.groupby('label').label.transform('size') > min_cat_occurance]
logger.warning(f'Data Length : {len(data_red)}')
data_red = data_red.reset_index(drop=True).copy()
# Label list
label_list = data_red.label.drop_duplicates()
logger.warning(f'Excluded labels: {list(set(label_list_raw)-set(label_list))}')
# Split data
strf_split = StratifiedShuffleSplit(n_splits = 1, test_size=(1-train_split), random_state=200)
for train_index, test_index in strf_split.split(data_red, data_red['label']):
df_cat_train = data_red.loc[train_index]
df_cat_test = data_red.loc[test_index]
# Save data
cl.dt.save(data_red, fn = 'fn_clean')
cl.dt.save(df_cat_train[['text','label']], fn = 'fn_train')
cl.dt.save(df_cat_test[['text','label']], fn = 'fn_test')
cl.dt.save(label_list, fn = 'fn_label', header=False)
# Upload data
if register_data:
cl.dt.upload('fp_data', task=task, step='train', destination='dataset')
def prepare_ner(task, do_format, register_data):
pass
def prepare_qa(task, do_format, min_char_length, register_data):
# Get clean object
cl = Clean(task=task, download_source=True)
# Load data
if do_format:
data = cl.dt.process(data_type=cu.params.get('prepare').get('data_type'))
else:
data = cl.dt.load('fn_prep')
logger.warning(f'Data Length : {len(data)}')
# Filter relevant question answer pairs
data = cu.filter_qa(data)
logger.warning(f'Data Length : {len(data)}')
# Load question & answer fields
question, answer = cu.load_qa(data)
# Clean text
data['question_clean'] = cl.transform(question,
to_lower = True,
rm_email_formatting = True,
rm_email_header = True,
rm_email_footer = True,
rm_punctuation = True,
rp_generic = True,
rp_num = True,
lemmatize = True,
rm_stopwords = True
)
data['answer_clean'] = cl.transform(answer,
to_lower = True,
rm_email_formatting = True,
rm_email_header = True,
rm_email_footer = True,
rm_punctuation = True,
rp_generic = True,
rp_num = True,
lemmatize = True,
rm_stopwords = True
)
# For display
data['answer_text_clean'] = cl.transform(answer,
rm_email_formatting = True,
rm_email_header = True,
rm_email_footer = True
)
# Filter by length
data = he.remove_short(data, 'question_clean', min_char_length=min_char_length)
logger.warning(f'Data Length : {len(data)}')
# Remove duplicates
data = data.drop_duplicates(subset=['question_clean'])
logger.warning(f'Data Length : {len(data)}')
data = data.reset_index(drop=True).copy()
# Save data
cl.dt.save(data, fn = 'fn_clean')
# Upload data
if register_data:
cl.dt.upload('fp_data', task=task, step='train', destination='dataset')
def main(task=1,
do_format=False,
split=0.9,
min_cat_occurance=300,
min_char_length=20,
register_data=False):
logger.warning(f'Running <PREPARE> for task {task}')
task_type = cu.tasks.get(str(task)).get('type')
if 'classification' == task_type:
prepare_classification(task, do_format, split, min_cat_occurance, min_char_length, register_data)
elif 'ner' == task_type:
prepare_ner(task, do_format, register_data)
elif 'qa' == task_type:
prepare_qa(task, do_format, min_char_length, register_data)
else:
logger.warning('[ERROR] TASK TYPE UNKNOWN. Nothing was processed.')
def run():
"""Run from the command line"""
parser = argparse.ArgumentParser()
parser.add_argument("--task",
default=1,
type=int,
help="Task where: \
-task 1 : classification subcat \
-task 2 : classification cat \
-task 3 : ner \
-task 4 : qa")
parser.add_argument('--do_format',
action='store_true',
help="Avoid reloading and normalizing data")
parser.add_argument("--split",
default=0.9,
type=float,
help="Train test split. Dev split is taken from train set.")
parser.add_argument("--min_cat_occurance",
default=300,
type=int,
help="Min occurance required by category.")
parser.add_argument("--min_char_length",
default=20,
type=int,
help="")
parser.add_argument('--register_data',
action='store_true',
help="")
args = parser.parse_args()
main(args.task, args.do_format, args.split, min_cat_occurance=args.min_cat_occurance,
min_char_length=args.min_char_length, register_data=args.register_data)
if __name__ == '__main__':
run()