feat(code review): fix to_nltk_tokens, add to_lowercase_all and to_lowercase as per Said's comments

This commit is contained in:
Janhavi Mahajan 2019-05-09 23:42:23 -04:00
Parent 2058c77a2c
Commit 6e3523810a
1 changed file with 50 additions and 31 deletions

View file

@ -7,19 +7,37 @@ import nltk
from nltk.corpus import stopwords
def to_lowercase_all(df):
    """
    Transform every string cell in the dataframe to lowercase.

    Args:
        df (pd.DataFrame): Raw dataframe with some text columns.

    Returns:
        pd.DataFrame: New dataframe with lowercase standardization applied to
            every string value; non-string values are left untouched.
    """
    # isinstance() is the idiomatic type check (handles subclasses; avoids
    # the `type(s) == str` anti-pattern).
    return df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
def to_lowercase(df, column_names=None):
    """
    Transform the strings in the named columns of the dataframe to lowercase.

    Args:
        df (pd.DataFrame): Raw dataframe with some text columns.
        column_names (list, optional): Column names to be changed to
            lowercase. If omitted or empty, every column is processed
            (delegates to to_lowercase_all).

    Returns:
        pd.DataFrame: Dataframe with the requested columns lowercased.
    """
    # None sentinel instead of a mutable [] default (shared-default pitfall).
    if not column_names:
        # BUG FIX: the result of to_lowercase_all was previously discarded,
        # so the original, unmodified dataframe was returned. applymap
        # returns a new frame — the result must be captured.
        df = to_lowercase_all(df)
    else:
        df[column_names] = df[column_names].applymap(
            lambda s: s.lower() if isinstance(s, str) else s
        )
    return df
def to_spacy_tokens(
df,
sentence_cols=["sentence1", "sentence2"],
def to_nltk_tokens(
    df,
    sentence_cols=None,
    token_cols=None,
):
    """
    Convert sentences to word tokens using nltk.

    Args:
        df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
        sentence_cols (list, optional): Column names for the raw sentence
            pairs. Defaults to ["sentence1", "sentence2"].
        token_cols (list, optional): Column names for the tokenized
            sentences. Defaults to ["sentence1_tokens", "sentence2_tokens"].

    Returns:
        pd.DataFrame: Dataframe with new columns token_cols, each containing
            a list of tokens for their respective sentences.
    """
    # None sentinels instead of mutable [] defaults (shared-default pitfall).
    if sentence_cols is None:
        sentence_cols = ["sentence1", "sentence2"]
    if token_cols is None:
        token_cols = ["sentence1_tokens", "sentence2_tokens"]
    # word_tokenize needs the "punkt" tokenizer models; download is a no-op
    # when they are already present.
    nltk.download("punkt")
    # Tokenize every sentence column in one pass; a bare function reference
    # replaces the redundant lambda wrapper. The old dead `pd.concat(...)`
    # expression (result was never assigned) has been removed.
    df[token_cols] = df[sentence_cols].applymap(nltk.word_tokenize)
    return df
@ -129,13 +147,14 @@ def rm_nltk_stopwords(
df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
token_cols (list, optional): Column names for the tokenized sentence
pairs.
stop_cols (list, optional): Column names for the tokenized sentences
stop_cols (list, optional): Column names for the tokenized sentences
without stop words.
Returns:
pd.DataFrame: Dataframe with new columns stop_cols, each containing a
list of tokens for their respective sentences.
"""
Returns:
pd.DataFrame: Dataframe with new columns stop_cols, each containing a
list of tokens for their respective sentences.
"""
if not set(token_cols).issubset(df.columns):
df = to_nltk_tokens(df)