feat(code review) fix to_nltk_tokens, add to_lowercase_all and to_lowercase as per said's comments
This commit is contained in:
Родитель
2058c77a2c
Коммит
6e3523810a
|
@ -7,19 +7,37 @@ import nltk
|
|||
from nltk.corpus import stopwords
|
||||
|
||||
|
||||
def to_lowercase_all(df):
    """
    This function transforms all strings in the dataframe to lowercase

    Args:
        df (pd.DataFrame): Raw dataframe with some text columns.

    Returns:
        pd.DataFrame: Dataframe with lowercase standardization.
    """
    # applymap visits every cell; only string cells are lowered, everything
    # else (numbers, NaN, ...) passes through untouched.
    # isinstance is preferred over `type(s) == str`: it also accepts
    # str subclasses and is the idiomatic type check.
    return df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
|
||||
|
||||
|
||||
def to_lowercase(df, column_names=None):
    """
    This function transforms strings of the column names in the dataframe passed to lowercase

    Args:
        df (pd.DataFrame): Raw dataframe with some text columns.
        column_names (list, optional): column names to be changed to lowercase.
            Defaults to None, which lowercases every string cell in the frame.

    Returns:
        pd.DataFrame: Dataframe with columns with lowercase standardization.
    """
    # None (or an empty list) means "all columns". Using None instead of a
    # mutable [] default avoids the shared-default-argument pitfall while
    # keeping the same truthiness-based behavior for callers.
    if not column_names:
        # Bug fix: applymap is NOT in-place, so to_lowercase_all returns a
        # new frame. The original code discarded that result and returned
        # the unmodified df; the result must be returned.
        return to_lowercase_all(df)
    df[column_names] = df[column_names].applymap(
        lambda s: s.lower() if isinstance(s, str) else s
    )
    return df
|
||||
|
||||
|
||||
def to_spacy_tokens(
|
||||
df,
|
||||
sentence_cols=["sentence1", "sentence2"],
|
||||
|
@ -92,25 +110,25 @@ def to_nltk_tokens(
|
|||
token_cols=["sentence1_tokens", "sentence2_tokens"],
|
||||
):
|
||||
"""
|
||||
This function converts a sentence to word tokens using nltk.
|
||||
|
||||
Args:
|
||||
df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
|
||||
sentence_cols (list, optional): Column names for the raw sentence pairs.
|
||||
token_cols (list, optional): Column names for the tokenized sentences.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: Dataframe with new columns token_cols, each containing a
|
||||
list of tokens for their respective sentences.
|
||||
"""
|
||||
nltk.download("punkt")
|
||||
df[token_cols[0]] = df.apply(
|
||||
lambda row: nltk.word_tokenize(row[sentence_cols[0]]), axis=1
|
||||
)
|
||||
df[token_cols[1]] = df.apply(
|
||||
lambda row: nltk.word_tokenize(row[sentence_cols[1]]), axis=1
|
||||
)
|
||||
This function converts a sentence to word tokens using nltk.
|
||||
|
||||
Args:
|
||||
df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
|
||||
sentence_cols (list, optional): Column names for the raw sentence pairs.
|
||||
token_cols (list, optional): Column names for the tokenized sentences.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: Dataframe with new columns token_cols, each containing a
|
||||
list of tokens for their respective sentences.
|
||||
"""
|
||||
|
||||
nltk.download("punkt")
|
||||
df[token_cols] = df[sentence_cols].applymap(lambda sentence: nltk.word_tokenize(sentence))
|
||||
pd.concat(
|
||||
[
|
||||
df[sentence_cols],
|
||||
df[token_cols]
|
||||
], axis=1)
|
||||
return df
|
||||
|
||||
|
||||
|
@ -129,13 +147,14 @@ def rm_nltk_stopwords(
|
|||
df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
|
||||
token_cols (list, optional): Column names for the tokenized sentence
|
||||
pairs.
|
||||
stop_cols (list, optional): Column names for the tokenized sentences
|
||||
stop_cols (list, optional): Column names for the tokenized sentences
|
||||
without stop words.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: Dataframe with new columns stop_cols, each containing a
|
||||
list of tokens for their respective sentences.
|
||||
"""
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: Dataframe with new columns stop_cols, each containing a
|
||||
list of tokens for their respective sentences.
|
||||
"""
|
||||
|
||||
if not set(token_cols).issubset(df.columns):
|
||||
df = to_nltk_tokens(df)
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче