feat(code review) fix to_nltk_tokens, add to_lowercase_all and to_lowercase as per said's comments
This commit is contained in:
Родитель
2058c77a2c
Коммит
6e3523810a
|
@ -7,19 +7,37 @@ import nltk
|
|||
from nltk.corpus import stopwords
|
||||
|
||||
|
||||
def to_lowercase_all(df):
    """
    This function transforms all strings in the dataframe to lowercase

    Args:
        df (pd.DataFrame): Raw dataframe with some text columns.

    Returns:
        pd.DataFrame: Dataframe with lowercase standardization.
    """
    # applymap visits every cell; only string cells are lowered, everything
    # else (numbers, NaN, ...) passes through untouched.
    # isinstance is preferred over `type(s) == str`: it also accepts
    # str subclasses and is the idiomatic type check.
    return df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
|
||||
|
||||
|
||||
def to_lowercase(df, column_names=None):
    """
    This function transforms strings of the column names in the dataframe passed to lowercase

    Args:
        df (pd.DataFrame): Raw dataframe with some text columns.
        column_names (list, optional): column names to be changed to lowercase.
            Defaults to None, which lowercases every string cell in the frame.

    Returns:
        pd.DataFrame: Dataframe with columns with lowercase standardization.
    """
    # None (or an empty list) means "all columns". Using None instead of a
    # mutable [] default avoids the shared-default-argument pitfall while
    # keeping the same truthiness-based behavior for callers.
    if not column_names:
        # Bug fix: applymap is NOT in-place, so to_lowercase_all returns a
        # new frame. The original code discarded that result and returned
        # the unmodified df; the result must be returned.
        return to_lowercase_all(df)
    df[column_names] = df[column_names].applymap(
        lambda s: s.lower() if isinstance(s, str) else s
    )
    return df
|
||||
|
||||
|
||||
def to_spacy_tokens(
|
||||
df,
|
||||
sentence_cols=["sentence1", "sentence2"],
|
||||
|
@ -92,25 +110,25 @@ def to_nltk_tokens(
|
|||
token_cols=["sentence1_tokens", "sentence2_tokens"],
|
||||
):
|
||||
"""
|
||||
This function converts a sentence to word tokens using nltk.
|
||||
|
||||
Args:
|
||||
df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
|
||||
sentence_cols (list, optional): Column names for the raw sentence pairs.
|
||||
token_cols (list, optional): Column names for the tokenized sentences.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: Dataframe with new columns token_cols, each containing a
|
||||
list of tokens for their respective sentences.
|
||||
"""
|
||||
nltk.download("punkt")
|
||||
df[token_cols[0]] = df.apply(
|
||||
lambda row: nltk.word_tokenize(row[sentence_cols[0]]), axis=1
|
||||
)
|
||||
df[token_cols[1]] = df.apply(
|
||||
lambda row: nltk.word_tokenize(row[sentence_cols[1]]), axis=1
|
||||
)
|
||||
This function converts a sentence to word tokens using nltk.
|
||||
|
||||
Args:
|
||||
df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
|
||||
sentence_cols (list, optional): Column names for the raw sentence pairs.
|
||||
token_cols (list, optional): Column names for the tokenized sentences.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: Dataframe with new columns token_cols, each containing a
|
||||
list of tokens for their respective sentences.
|
||||
"""
|
||||
|
||||
nltk.download("punkt")
|
||||
df[token_cols] = df[sentence_cols].applymap(lambda sentence: nltk.word_tokenize(sentence))
|
||||
pd.concat(
|
||||
[
|
||||
df[sentence_cols],
|
||||
df[token_cols]
|
||||
], axis=1)
|
||||
return df
|
||||
|
||||
|
||||
|
@ -129,13 +147,14 @@ def rm_nltk_stopwords(
|
|||
df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
|
||||
token_cols (list, optional): Column names for the tokenized sentence
|
||||
pairs.
|
||||
stop_cols (list, optional): Column names for the tokenized sentences
|
||||
stop_cols (list, optional): Column names for the tokenized sentences
|
||||
without stop words.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: Dataframe with new columns stop_cols, each containing a
|
||||
list of tokens for their respective sentences.
|
||||
"""
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: Dataframe with new columns stop_cols, each containing a
|
||||
list of tokens for their respective sentences.
|
||||
"""
|
||||
|
||||
if not set(token_cols).issubset(df.columns):
|
||||
df = to_nltk_tokens(df)
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче