This commit is contained in:
Casey Hong 2019-06-12 12:37:43 -04:00
Родитель ef96d27f61
Коммит e031d3d225
4 изменённых файлов: 16 добавлений и 44 удалений

Просмотреть файл

@ -38,7 +38,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\n"
"System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n",
"[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n"
]
}
],
@ -92,7 +93,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 92.3k/92.3k [00:23<00:00, 3.86kKB/s]\n"
"100%|██████████| 92.3k/92.3k [01:01<00:00, 1.50kKB/s]\n"
]
}
],
@ -437,17 +438,7 @@
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\abeswara\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"outputs": [],
"source": [
"train_tok = to_nltk_tokens(to_lowercase_all(train))"
]
@ -583,23 +574,7 @@
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\abeswara\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\abeswara\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\abeswara\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"outputs": [],
"source": [
"train = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n",
"dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\")\n",
@ -669,9 +644,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "nlp",
"display_name": "Python (nlp_cpu)",
"language": "python",
"name": "nlp"
"name": "nlp_cpu"
},
"language_info": {
"codemirror_mode": {

Просмотреть файл

@ -62,14 +62,14 @@ PIP_BASE = {
"pyemd": "pyemd==0.5.1",
"ipywebrtc": "ipywebrtc==0.4.3",
"pre-commit": "pre-commit>=1.14.4",
"spacy": "spacy>=2.1.4",
"spacy-models": "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz",
"gensim": "gensim>=3.7.0",
"nltk": "nltk>=3.4",
"pytorch-pretrained-bert": "pytorch-pretrained-bert>=0.6",
"horovod": "horovod>=0.16.1",
}
PIP_GPU = {}
PIP_GPU = {"horovod": "horovod>=0.16.1"}
if __name__ == "__main__":

Просмотреть файл

@ -0,0 +1,3 @@
import nltk
# One-time setup: fetch the NLTK "punkt" sentence/word tokenizer models.
# quiet=True suppresses the downloader's progress/log output (this is why the
# "[nltk_data] Downloading package punkt ..." noise disappears from the
# notebook cell outputs elsewhere in this commit). nltk.download is a no-op
# if the package is already present and up to date.
nltk.download("punkt", quiet=True)

Просмотреть файл

@ -122,12 +122,8 @@ def to_nltk_tokens(
pd.DataFrame: Dataframe with new columns token_cols, each containing a
list of tokens for their respective sentences.
"""
nltk.download("punkt")
text_df = df[sentence_cols]
tok_df = text_df.applymap(
lambda sentence: nltk.word_tokenize(sentence)
)
tok_df = text_df.applymap(lambda sentence: nltk.word_tokenize(sentence))
tok_df.columns = token_cols
tokenized = pd.concat([df, tok_df], axis=1)
return tokenized
@ -158,11 +154,9 @@ def rm_nltk_stopwords(
nltk.download("stopwords")
stop_words = tuple(stopwords.words("english"))
text_df = df[sentence_cols]
stop_df = (
text_df
.applymap(lambda sentence: nltk.word_tokenize(sentence))
.applymap(lambda l: [word for word in l if word not in stop_words])
)
stop_df = text_df.applymap(
lambda sentence: nltk.word_tokenize(sentence)
).applymap(lambda l: [word for word in l if word not in stop_words])
stop_df.columns = stop_cols
return pd.concat([df, stop_df], axis=1)