From 1ed2c4dc0a4e361b525e6eb386b889f6a57c6144 Mon Sep 17 00:00:00 2001 From: Janhavi Mahajan Date: Mon, 13 May 2019 18:14:31 -0400 Subject: [PATCH] feat(bug fix) updated snli notebook with to_lowercase_all() instead of to_lowercase() that expects a column name list. Fixed None object returning in to_lowercase when column name list is not passed --- .../01-prep-data/snli.ipynb | 25 +++++++++++-------- utils_nlp/dataset/preprocess.py | 2 +- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/scenarios/sentence_similarity/01-prep-data/snli.ipynb b/scenarios/sentence_similarity/01-prep-data/snli.ipynb index c886b1f..1fdb6d8 100644 --- a/scenarios/sentence_similarity/01-prep-data/snli.ipynb +++ b/scenarios/sentence_similarity/01-prep-data/snli.ipynb @@ -38,8 +38,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n", - "[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n" + "System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\n" ] } ], @@ -48,7 +47,7 @@ "sys.path.append(\"../../../\")\n", "\n", "import os\n", - "from utils_nlp.dataset.preprocess import to_lowercase, to_nltk_tokens\n", + "from utils_nlp.dataset.preprocess import to_lowercase_all, to_nltk_tokens\n", "from utils_nlp.dataset import snli\n", "\n", "print(\"System version: {}\".format(sys.version))" @@ -429,13 +428,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n", + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } ], "source": [ - "train_tok = to_nltk_tokens(to_lowercase(train))" + "train_tok = to_nltk_tokens(to_lowercase_all(train))" ] }, { @@ -574,11 +574,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n", + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", - "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n", + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", - "[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n", + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } @@ -592,9 +595,9 @@ "clean_dev = clean(dev, file_split=\"dev\")\n", "clean_test = clean(dev, file_split=\"test\")\n", "\n", - "train_tok = to_nltk_tokens(to_lowercase(clean_train))\n", - "dev_tok = to_nltk_tokens(to_lowercase(clean_dev))\n", - "test_tok = to_nltk_tokens(to_lowercase(clean_test))\n", + "train_tok = to_nltk_tokens(to_lowercase_all(clean_train))\n", + "dev_tok = to_nltk_tokens(to_lowercase_all(clean_dev))\n", + "test_tok = to_nltk_tokens(to_lowercase_all(clean_test))\n", "\n", "split_map = {'train': train_tok, 'dev': dev_tok, 'test': test_tok}\n", "for file_split, df in split_map.items():\n", diff --git a/utils_nlp/dataset/preprocess.py b/utils_nlp/dataset/preprocess.py index 1b6c5c3..8533980 100644 --- a/utils_nlp/dataset/preprocess.py +++ b/utils_nlp/dataset/preprocess.py @@ -32,7 +32,7 @@ def to_lowercase(df, column_names=[]): pd.DataFrame: Dataframe with columns with lowercase standardization. """ if not column_names: - to_lowercase_all(df) + return to_lowercase_all(df) else: df[column_names] = df[column_names].applymap( lambda s: s.lower() if type(s) == str else s