This commit is contained in:
Casey Hong 2019-06-12 12:37:43 -04:00
Родитель ef96d27f61
Коммит e031d3d225
4 изменённых файлов: 16 добавлений и 44 удалений

Просмотреть файл

@ -38,7 +38,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\n"
"System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n",
"[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n"
]
}
],
@ -92,7 +93,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 92.3k/92.3k [00:23<00:00, 3.86kKB/s]\n"
"100%|██████████| 92.3k/92.3k [01:01<00:00, 1.50kKB/s]\n"
]
}
],
@ -437,17 +438,7 @@
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\abeswara\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"outputs": [],
"source": [
"train_tok = to_nltk_tokens(to_lowercase_all(train))"
]
@ -583,23 +574,7 @@
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\abeswara\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\abeswara\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\abeswara\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"outputs": [],
"source": [
"train = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n",
"dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\")\n",
@ -669,9 +644,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "nlp",
"display_name": "Python (nlp_cpu)",
"language": "python",
"name": "nlp"
"name": "nlp_cpu"
},
"language_info": {
"codemirror_mode": {

Просмотреть файл

@ -62,14 +62,14 @@ PIP_BASE = {
"pyemd": "pyemd==0.5.1",
"ipywebrtc": "ipywebrtc==0.4.3",
"pre-commit": "pre-commit>=1.14.4",
"spacy": "spacy>=2.1.4",
"spacy-models": "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz",
"gensim": "gensim>=3.7.0",
"nltk": "nltk>=3.4",
"pytorch-pretrained-bert": "pytorch-pretrained-bert>=0.6",
"horovod": "horovod>=0.16.1",
}
PIP_GPU = {}
PIP_GPU = {"horovod": "horovod>=0.16.1"}
if __name__ == "__main__":

Просмотреть файл

@ -0,0 +1,3 @@
import nltk
# One-time setup: fetch the NLTK "punkt" sentence/word tokenizer models.
# quiet=True suppresses the downloader's progress/log output (this is why the
# "[nltk_data] Downloading package punkt ..." noise disappears from the
# notebook cell outputs elsewhere in this commit). nltk.download is a no-op
# if the package is already present and up to date.
nltk.download("punkt", quiet=True)

Просмотреть файл

@ -122,12 +122,8 @@ def to_nltk_tokens(
pd.DataFrame: Dataframe with new columns token_cols, each containing a
list of tokens for their respective sentences.
"""
nltk.download("punkt")
text_df = df[sentence_cols]
tok_df = text_df.applymap(
lambda sentence: nltk.word_tokenize(sentence)
)
tok_df = text_df.applymap(lambda sentence: nltk.word_tokenize(sentence))
tok_df.columns = token_cols
tokenized = pd.concat([df, tok_df], axis=1)
return tokenized
@ -158,11 +154,9 @@ def rm_nltk_stopwords(
nltk.download("stopwords")
stop_words = tuple(stopwords.words("english"))
text_df = df[sentence_cols]
stop_df = (
text_df
.applymap(lambda sentence: nltk.word_tokenize(sentence))
.applymap(lambda l: [word for word in l if word not in stop_words])
)
stop_df = text_df.applymap(
lambda sentence: nltk.word_tokenize(sentence)
).applymap(lambda l: [word for word in l if word not in stop_words])
stop_df.columns = stop_cols
return pd.concat([df, stop_df], axis=1)