From a7795c17bb2dcde8173698e9bd4a7c7e35abde86 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Fri, 20 Mar 2020 19:51:02 +0000 Subject: [PATCH] update to make sure if works for raw dataset and preprocessed dataset --- ...tive_summarization_cnndm_transformer.ipynb | 1568 ++--------------- 1 file changed, 109 insertions(+), 1459 deletions(-) diff --git a/examples/text_summarization/extractive_summarization_cnndm_transformer.ipynb b/examples/text_summarization/extractive_summarization_cnndm_transformer.ipynb index def1922..c3da053 100644 --- a/examples/text_summarization/extractive_summarization_cnndm_transformer.ipynb +++ b/examples/text_summarization/extractive_summarization_cnndm_transformer.ipynb @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "tags": [ "parameters" @@ -72,7 +72,7 @@ "## Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\n", "QUICK_RUN = True\n", "## Set USE_PREPROCSSED_DATA = True to skip the data preprocessing\n", - "USE_PREPROCSSED_DATA = True" + "USE_PREPROCSSED_DATA = False" ] }, { @@ -84,40 +84,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/daden/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", - "/home/daden/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as 
(type, (1,)) / '(1,)type'.\n", - " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", - "/home/daden/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", - "/home/daden/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", - "/home/daden/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", - "/home/daden/.local/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", - "/home/daden/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", - "/home/daden/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_quint8 = np.dtype([(\"quint8\", 
np.uint8, 1)])\n", - "/home/daden/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", - "/home/daden/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", - "/home/daden/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", - "/home/daden/.local/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", - " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import shutil\n", @@ -130,7 +99,7 @@ " sys.path.insert(0, nlp_path)\n", "\n", "from utils_nlp.dataset.cnndm import CNNDMBertSumProcessedData, CNNDMSummarizationDataset\n", - "from utils_nlp.eval import compute_rouge_python\n", + "from utils_nlp.eval import compute_rouge_python, compute_rouge_perl\n", "from utils_nlp.models.transformers.extractive_summarization import (\n", " ExtractiveSummarizer,\n", " ExtSumProcessedData,\n", @@ -160,64 +129,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
model_name
0bert-base-uncased
1distilbert-base-uncased
\n", - "
" - ], - "text/plain": [ - " model_name\n", - "0 bert-base-uncased\n", - "1 distilbert-base-uncased" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pd.DataFrame({\"model_name\": ExtractiveSummarizer.list_supported_models()})" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "tags": [ "parameters" @@ -231,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -278,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "tags": [ "parameters" @@ -287,7 +208,7 @@ "outputs": [], "source": [ "# the data path used to save the downloaded data file\n", - "DATA_PATH = \"./temp\" # TemporaryDirectory().name\n", + "DATA_PATH = TemporaryDirectory().name\n", "# The number of lines at the head of data file used for preprocessing. -1 means all the lines.\n", "TOP_N = 1000\n", "CHUNK_SIZE=200\n", @@ -298,19 +219,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { - "scrolled": true + "scrolled": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 489k/489k [00:07<00:00, 63.5kKB/s] \n" - ] - } - ], + "outputs": [], "source": [ "train_dataset, test_dataset = CNNDMSummarizationDataset(top_n=TOP_N, local_cache_path=DATA_PATH)" ] @@ -324,67 +237,52 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { - "scrolled": true + "scrolled": false }, "outputs": [], "source": [ "\n", "ext_sum_train = processor.preprocess(train_dataset, oracle_mode=\"greedy\")\n", - "ext_sum_test = processor.preprocess(train_dataset, oracle_mode=\"greedy\")" + "ext_sum_test = processor.preprocess(test_dataset, oracle_mode=\"greedy\")\n" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, + "metadata": { 
+ "scrolled": true + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "# save and load preprocessed data\n", + "save_path = os.path.join(DATA_PATH, \"processed\")\n", + "torch.save(ext_sum_train, os.path.join(save_path, \"train_full.pt\"))\n", + "torch.save(ext_sum_test, os.path.join(save_path, \"test_full.pt\"))\n", + "\n", + "\"\"\"\n", + "# ext_sum_train = torch.load(os.path.join(save_path, \"train_full.pt\"))\n", + "# ext_sum_test = torch.load(os.path.join(save_path, \"test_full.pt\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "995" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(ext_sum_train)" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\n# save and load preprocessed data\\nsave_path = os.path.join(DATA_PATH, \"processed\")\\ntrain_files = ExtSumProcessedData.save_data(\\n ext_sum_train, is_test=False, save_path=save_path, chunk_size=CHUNK_SIZE\\n)\\ntest_files = ExtSumProcessedData.save_data(\\n ext_sum_test, is_test=True, save_path=save_path, chunk_size=CHUNK_SIZE\\n)\\next_sum_train, ext_sum_test = ExtSumProcessedData().splits(root=save_path, train_iterable=False)\\n'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "\"\"\"\n", - "# save and load preprocessed data\n", - "save_path = os.path.join(DATA_PATH, \"processed\")\n", - "train_files = ExtSumProcessedData.save_data(\n", - " ext_sum_train, is_test=False, save_path=save_path, chunk_size=CHUNK_SIZE\n", - ")\n", - "test_files = ExtSumProcessedData.save_data(\n", - " ext_sum_test, is_test=True, save_path=save_path, chunk_size=CHUNK_SIZE\n", - ")\n", - "ext_sum_train, ext_sum_test = 
ExtSumProcessedData().splits(root=save_path, train_iterable=False)\n", - "\"\"\"\n" + "len(ext_sum_test)" ] }, { @@ -396,905 +294,20 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'src': [['editor',\n", - " \"'s\",\n", - " 'note',\n", - " ':',\n", - " 'in',\n", - " 'our',\n", - " 'behind',\n", - " 'the',\n", - " 'scenes',\n", - " 'series',\n", - " ',',\n", - " 'cnn',\n", - " 'correspondents',\n", - " 'share',\n", - " 'their',\n", - " 'experiences',\n", - " 'in',\n", - " 'covering',\n", - " 'news',\n", - " 'and',\n", - " 'analyze',\n", - " 'the',\n", - " 'stories',\n", - " 'behind',\n", - " 'the',\n", - " 'events',\n", - " '.'],\n", - " ['here',\n", - " ',',\n", - " 'soledad',\n", - " \"o'brien\",\n", - " 'takes',\n", - " 'users',\n", - " 'inside',\n", - " 'a',\n", - " 'jail',\n", - " 'where',\n", - " 'many',\n", - " 'of',\n", - " 'the',\n", - " 'inmates',\n", - " 'are',\n", - " 'mentally',\n", - " 'ill',\n", - " '.'],\n", - " ['an',\n", - " 'inmate',\n", - " 'housed',\n", - " 'on',\n", - " 'the',\n", - " '``',\n", - " 'forgotten',\n", - " 'floor',\n", - " ',',\n", - " '``',\n", - " 'where',\n", - " 'many',\n", - " 'mentally',\n", - " 'ill',\n", - " 'inmates',\n", - " 'are',\n", - " 'housed',\n", - " 'in',\n", - " 'miami',\n", - " 'before',\n", - " 'trial',\n", - " '.'],\n", - " ['miami',\n", - " ',',\n", - " 'florida',\n", - " '(',\n", - " 'cnn',\n", - " ')',\n", - " '--',\n", - " 'the',\n", - " 'ninth',\n", - " 'floor',\n", - " 'of',\n", - " 'the',\n", - " 'miami-dade',\n", - " 'pretrial',\n", - " 'detention',\n", - " 'facility',\n", - " 'is',\n", - " 'dubbed',\n", - " 'the',\n", - " '``',\n", - " 'forgotten',\n", - " 'floor',\n", - " '.',\n", - " '``'],\n", - " ['here',\n", - " ',',\n", - " 'inmates',\n", - " 'with',\n", - " 'the',\n", - " 'most',\n", - " 'severe',\n", - " 'mental',\n", - " 'illnesses',\n", - " 'are',\n", - " 'incarcerated',\n", - " 'until',\n", 
- " 'they',\n", - " \"'re\",\n", - " 'ready',\n", - " 'to',\n", - " 'appear',\n", - " 'in',\n", - " 'court',\n", - " '.'],\n", - " ['most',\n", - " 'often',\n", - " ',',\n", - " 'they',\n", - " 'face',\n", - " 'drug',\n", - " 'charges',\n", - " 'or',\n", - " 'charges',\n", - " 'of',\n", - " 'assaulting',\n", - " 'an',\n", - " 'officer',\n", - " '--',\n", - " 'charges',\n", - " 'that',\n", - " 'judge',\n", - " 'steven',\n", - " 'leifman',\n", - " 'says',\n", - " 'are',\n", - " 'usually',\n", - " '``',\n", - " 'avoidable',\n", - " 'felonies',\n", - " '.',\n", - " '``'],\n", - " ['he',\n", - " 'says',\n", - " 'the',\n", - " 'arrests',\n", - " 'often',\n", - " 'result',\n", - " 'from',\n", - " 'confrontations',\n", - " 'with',\n", - " 'police',\n", - " '.'],\n", - " ['mentally',\n", - " 'ill',\n", - " 'people',\n", - " 'often',\n", - " 'wo',\n", - " \"n't\",\n", - " 'do',\n", - " 'what',\n", - " 'they',\n", - " \"'re\",\n", - " 'told',\n", - " 'when',\n", - " 'police',\n", - " 'arrive',\n", - " 'on',\n", - " 'the',\n", - " 'scene',\n", - " '--',\n", - " 'confrontation',\n", - " 'seems',\n", - " 'to',\n", - " 'exacerbate',\n", - " 'their',\n", - " 'illness',\n", - " 'and',\n", - " 'they',\n", - " 'become',\n", - " 'more',\n", - " 'paranoid',\n", - " ',',\n", - " 'delusional',\n", - " ',',\n", - " 'and',\n", - " 'less',\n", - " 'likely',\n", - " 'to',\n", - " 'follow',\n", - " 'directions',\n", - " ',',\n", - " 'according',\n", - " 'to',\n", - " 'leifman',\n", - " '.'],\n", - " ['so',\n", - " ',',\n", - " 'they',\n", - " 'end',\n", - " 'up',\n", - " 'on',\n", - " 'the',\n", - " 'ninth',\n", - " 'floor',\n", - " 'severely',\n", - " 'mentally',\n", - " 'disturbed',\n", - " ',',\n", - " 'but',\n", - " 'not',\n", - " 'getting',\n", - " 'any',\n", - " 'real',\n", - " 'help',\n", - " 'because',\n", - " 'they',\n", - " \"'re\",\n", - " 'in',\n", - " 'jail',\n", - " '.'],\n", - " ['we', 'toured', 'the', 'jail', 'with', 'leifman', '.'],\n", - " ['he',\n", - " 'is',\n", - " 
'well',\n", - " 'known',\n", - " 'in',\n", - " 'miami',\n", - " 'as',\n", - " 'an',\n", - " 'advocate',\n", - " 'for',\n", - " 'justice',\n", - " 'and',\n", - " 'the',\n", - " 'mentally',\n", - " 'ill',\n", - " '.'],\n", - " ['even',\n", - " 'though',\n", - " 'we',\n", - " 'were',\n", - " 'not',\n", - " 'exactly',\n", - " 'welcomed',\n", - " 'with',\n", - " 'open',\n", - " 'arms',\n", - " 'by',\n", - " 'the',\n", - " 'guards',\n", - " ',',\n", - " 'we',\n", - " 'were',\n", - " 'given',\n", - " 'permission',\n", - " 'to',\n", - " 'shoot',\n", - " 'videotape',\n", - " 'and',\n", - " 'tour',\n", - " 'the',\n", - " 'floor',\n", - " '.'],\n", - " ['go',\n", - " 'inside',\n", - " 'the',\n", - " '`',\n", - " 'forgotten',\n", - " 'floor',\n", - " \"'\",\n", - " '``',\n", - " 'at',\n", - " 'first',\n", - " ',',\n", - " 'it',\n", - " \"'s\",\n", - " 'hard',\n", - " 'to',\n", - " 'determine',\n", - " 'where',\n", - " 'the',\n", - " 'people',\n", - " 'are',\n", - " '.'],\n", - " ['the', 'prisoners', 'are', 'wearing', 'sleeveless', 'robes', '.'],\n", - " ['imagine',\n", - " 'cutting',\n", - " 'holes',\n", - " 'for',\n", - " 'arms',\n", - " 'and',\n", - " 'feet',\n", - " 'in',\n", - " 'a',\n", - " 'heavy',\n", - " 'wool',\n", - " 'sleeping',\n", - " 'bag',\n", - " '--',\n", - " 'that',\n", - " \"'s\",\n", - " 'kind',\n", - " 'of',\n", - " 'what',\n", - " 'they',\n", - " 'look',\n", - " 'like',\n", - " '.'],\n", - " ['they',\n", - " \"'re\",\n", - " 'designed',\n", - " 'to',\n", - " 'keep',\n", - " 'the',\n", - " 'mentally',\n", - " 'ill',\n", - " 'patients',\n", - " 'from',\n", - " 'injuring',\n", - " 'themselves',\n", - " '.'],\n", - " ['that',\n", - " \"'s\",\n", - " 'also',\n", - " 'why',\n", - " 'they',\n", - " 'have',\n", - " 'no',\n", - " 'shoes',\n", - " ',',\n", - " 'laces',\n", - " 'or',\n", - " 'mattresses',\n", - " '.'],\n", - " ['leifman',\n", - " 'says',\n", - " 'about',\n", - " 'one-third',\n", - " 'of',\n", - " 'all',\n", - " 'people',\n", - " 'in',\n", - " 
'miami-dade',\n", - " 'county',\n", - " 'jails',\n", - " 'are',\n", - " 'mentally',\n", - " 'ill',\n", - " '.'],\n", - " ['so',\n", - " ',',\n", - " 'he',\n", - " 'says',\n", - " ',',\n", - " 'the',\n", - " 'sheer',\n", - " 'volume',\n", - " 'is',\n", - " 'overwhelming',\n", - " 'the',\n", - " 'system',\n", - " ',',\n", - " 'and',\n", - " 'the',\n", - " 'result',\n", - " 'is',\n", - " 'what',\n", - " 'we',\n", - " 'see',\n", - " 'on',\n", - " 'the',\n", - " 'ninth',\n", - " 'floor',\n", - " '.'],\n", - " ['of',\n", - " 'course',\n", - " ',',\n", - " 'it',\n", - " 'is',\n", - " 'a',\n", - " 'jail',\n", - " ',',\n", - " 'so',\n", - " 'it',\n", - " \"'s\",\n", - " 'not',\n", - " 'supposed',\n", - " 'to',\n", - " 'be',\n", - " 'warm',\n", - " 'and',\n", - " 'comforting',\n", - " ',',\n", - " 'but',\n", - " 'the',\n", - " 'lights',\n", - " 'glare',\n", - " ',',\n", - " 'the',\n", - " 'cells',\n", - " 'are',\n", - " 'tiny',\n", - " 'and',\n", - " 'it',\n", - " \"'s\",\n", - " 'loud',\n", - " '.'],\n", - " ['we',\n", - " 'see',\n", - " 'two',\n", - " ',',\n", - " 'sometimes',\n", - " 'three',\n", - " 'men',\n", - " '--',\n", - " 'sometimes',\n", - " 'in',\n", - " 'the',\n", - " 'robes',\n", - " ',',\n", - " 'sometimes',\n", - " 'naked',\n", - " ',',\n", - " 'lying',\n", - " 'or',\n", - " 'sitting',\n", - " 'in',\n", - " 'their',\n", - " 'cells',\n", - " '.',\n", - " '``'],\n", - " ['i', 'am', 'the', 'son', 'of', 'the', 'president', '.'],\n", - " ['you', 'need', 'to', 'get', 'me', 'out', 'of', 'here', '!', '``'],\n", - " ['one', 'man', 'shouts', 'at', 'me', '.'],\n", - " ['he',\n", - " 'is',\n", - " 'absolutely',\n", - " 'serious',\n", - " ',',\n", - " 'convinced',\n", - " 'that',\n", - " 'help',\n", - " 'is',\n", - " 'on',\n", - " 'the',\n", - " 'way',\n", - " '--',\n", - " 'if',\n", - " 'only',\n", - " 'he',\n", - " 'could',\n", - " 'reach',\n", - " 'the',\n", - " 'white',\n", - " 'house',\n", - " '.'],\n", - " ['leifman',\n", - " 'tells',\n", - " 'me',\n", - " 
'that',\n", - " 'these',\n", - " 'prisoner-patients',\n", - " 'will',\n", - " 'often',\n", - " 'circulate',\n", - " 'through',\n", - " 'the',\n", - " 'system',\n", - " ',',\n", - " 'occasionally',\n", - " 'stabilizing',\n", - " 'in',\n", - " 'a',\n", - " 'mental',\n", - " 'hospital',\n", - " ',',\n", - " 'only',\n", - " 'to',\n", - " 'return',\n", - " 'to',\n", - " 'jail',\n", - " 'to',\n", - " 'face',\n", - " 'their',\n", - " 'charges',\n", - " '.'],\n", - " ['it',\n", - " \"'s\",\n", - " 'brutally',\n", - " 'unjust',\n", - " ',',\n", - " 'in',\n", - " 'his',\n", - " 'mind',\n", - " ',',\n", - " 'and',\n", - " 'he',\n", - " 'has',\n", - " 'become',\n", - " 'a',\n", - " 'strong',\n", - " 'advocate',\n", - " 'for',\n", - " 'changing',\n", - " 'things',\n", - " 'in',\n", - " 'miami',\n", - " '.'],\n", - " ['over',\n", - " 'a',\n", - " 'meal',\n", - " 'later',\n", - " ',',\n", - " 'we',\n", - " 'talk',\n", - " 'about',\n", - " 'how',\n", - " 'things',\n", - " 'got',\n", - " 'this',\n", - " 'way',\n", - " 'for',\n", - " 'mental',\n", - " 'patients',\n", - " '.'],\n", - " ['leifman',\n", - " 'says',\n", - " '200',\n", - " 'years',\n", - " 'ago',\n", - " 'people',\n", - " 'were',\n", - " 'considered',\n", - " '``',\n", - " 'lunatics',\n", - " '``',\n", - " 'and',\n", - " 'they',\n", - " 'were',\n", - " 'locked',\n", - " 'up',\n", - " 'in',\n", - " 'jails',\n", - " 'even',\n", - " 'if',\n", - " 'they',\n", - " 'had',\n", - " 'no',\n", - " 'charges',\n", - " 'against',\n", - " 'them',\n", - " '.'],\n", - " ['they',\n", - " 'were',\n", - " 'just',\n", - " 'considered',\n", - " 'unfit',\n", - " 'to',\n", - " 'be',\n", - " 'in',\n", - " 'society',\n", - " '.'],\n", - " ['over',\n", - " 'the',\n", - " 'years',\n", - " ',',\n", - " 'he',\n", - " 'says',\n", - " ',',\n", - " 'there',\n", - " 'was',\n", - " 'some',\n", - " 'public',\n", - " 'outcry',\n", - " ',',\n", - " 'and',\n", - " 'the',\n", - " 'mentally',\n", - " 'ill',\n", - " 'were',\n", - " 'moved',\n", - " 'out',\n", - 
" 'of',\n", - " 'jails',\n", - " 'and',\n", - " 'into',\n", - " 'hospitals',\n", - " '.'],\n", - " ['but',\n", - " 'leifman',\n", - " 'says',\n", - " 'many',\n", - " 'of',\n", - " 'these',\n", - " 'mental',\n", - " 'hospitals',\n", - " 'were',\n", - " 'so',\n", - " 'horrible',\n", - " 'they',\n", - " 'were',\n", - " 'shut',\n", - " 'down',\n", - " '.'],\n", - " ['where', 'did', 'the', 'patients', 'go', '?'],\n", - " ['nowhere', '.'],\n", - " ['the', 'streets', '.'],\n", - " ['they',\n", - " 'became',\n", - " ',',\n", - " 'in',\n", - " 'many',\n", - " 'cases',\n", - " ',',\n", - " 'the',\n", - " 'homeless',\n", - " ',',\n", - " 'he',\n", - " 'says',\n", - " '.'],\n", - " ['they', 'never', 'got', 'treatment', '.'],\n", - " ['leifman',\n", - " 'says',\n", - " 'in',\n", - " '1955',\n", - " 'there',\n", - " 'were',\n", - " 'more',\n", - " 'than',\n", - " 'half',\n", - " 'a',\n", - " 'million',\n", - " 'people',\n", - " 'in',\n", - " 'state',\n", - " 'mental',\n", - " 'hospitals',\n", - " ',',\n", - " 'and',\n", - " 'today',\n", - " 'that',\n", - " 'number',\n", - " 'has',\n", - " 'been',\n", - " 'reduced',\n", - " '90',\n", - " 'percent',\n", - " ',',\n", - " 'and',\n", - " '40,000',\n", - " 'to',\n", - " '50,000',\n", - " 'people',\n", - " 'are',\n", - " 'in',\n", - " 'mental',\n", - " 'hospitals',\n", - " '.'],\n", - " ['the', 'judge', 'says', 'he', \"'s\", 'working', 'to', 'change', 'this', '.'],\n", - " ['starting',\n", - " 'in',\n", - " '2008',\n", - " ',',\n", - " 'many',\n", - " 'inmates',\n", - " 'who',\n", - " 'would',\n", - " 'otherwise',\n", - " 'have',\n", - " 'been',\n", - " 'brought',\n", - " 'to',\n", - " 'the',\n", - " '``',\n", - " 'forgotten',\n", - " 'floor',\n", - " '``',\n", - " 'will',\n", - " 'instead',\n", - " 'be',\n", - " 'sent',\n", - " 'to',\n", - " 'a',\n", - " 'new',\n", - " 'mental',\n", - " 'health',\n", - " 'facility',\n", - " '--',\n", - " 'the',\n", - " 'first',\n", - " 'step',\n", - " 'on',\n", - " 'a',\n", - " 'journey',\n", - " 
'toward',\n", - " 'long-term',\n", - " 'treatment',\n", - " ',',\n", - " 'not',\n", - " 'just',\n", - " 'punishment',\n", - " '.'],\n", - " ['leifman',\n", - " 'says',\n", - " 'it',\n", - " \"'s\",\n", - " 'not',\n", - " 'the',\n", - " 'complete',\n", - " 'answer',\n", - " ',',\n", - " 'but',\n", - " 'it',\n", - " \"'s\",\n", - " 'a',\n", - " 'start',\n", - " '.'],\n", - " ['leifman',\n", - " 'says',\n", - " 'the',\n", - " 'best',\n", - " 'part',\n", - " 'is',\n", - " 'that',\n", - " 'it',\n", - " \"'s\",\n", - " 'a',\n", - " 'win-win',\n", - " 'solution',\n", - " '.'],\n", - " ['the',\n", - " 'patients',\n", - " 'win',\n", - " ',',\n", - " 'the',\n", - " 'families',\n", - " 'are',\n", - " 'relieved',\n", - " ',',\n", - " 'and',\n", - " 'the',\n", - " 'state',\n", - " 'saves',\n", - " 'money',\n", - " 'by',\n", - " 'simply',\n", - " 'not',\n", - " 'cycling',\n", - " 'these',\n", - " 'prisoners',\n", - " 'through',\n", - " 'again',\n", - " 'and',\n", - " 'again',\n", - " '.'],\n", - " ['and', ',', 'for', 'leifman', ',', 'justice', 'is', 'served', '.'],\n", - " ['e-mail', 'to', 'a', 'friend', '.']],\n", - " 'src_txt': [\"editor 's note : in our behind the scenes series , cnn correspondents share their experiences in covering news and analyze the stories behind the events .\",\n", - " \"here , soledad o'brien takes users inside a jail where many of the inmates are mentally ill .\",\n", - " 'an inmate housed on the \" forgotten floor , \" where many mentally ill inmates are housed in miami before trial .',\n", - " 'miami , florida ( cnn ) -- the ninth floor of the miami-dade pretrial detention facility is dubbed the \" forgotten floor . \"',\n", - " \"here , inmates with the most severe mental illnesses are incarcerated until they 're ready to appear in court .\",\n", - " 'most often , they face drug charges or charges of assaulting an officer -- charges that judge steven leifman says are usually \" avoidable felonies . 
\"',\n", - " 'he says the arrests often result from confrontations with police .',\n", - " \"mentally ill people often wo n't do what they 're told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid , delusional , and less likely to follow directions , according to leifman .\",\n", - " \"so , they end up on the ninth floor severely mentally disturbed , but not getting any real help because they 're in jail .\",\n", - " 'we toured the jail with leifman .',\n", - " 'he is well known in miami as an advocate for justice and the mentally ill .',\n", - " 'even though we were not exactly welcomed with open arms by the guards , we were given permission to shoot videotape and tour the floor .',\n", - " 'go inside the ` forgotten floor \\' \" at first , it \\'s hard to determine where the people are .',\n", - " 'the prisoners are wearing sleeveless robes .',\n", - " \"imagine cutting holes for arms and feet in a heavy wool sleeping bag -- that 's kind of what they look like .\",\n", - " \"they 're designed to keep the mentally ill patients from injuring themselves .\",\n", - " \"that 's also why they have no shoes , laces or mattresses .\",\n", - " 'leifman says about one-third of all people in miami-dade county jails are mentally ill .',\n", - " 'so , he says , the sheer volume is overwhelming the system , and the result is what we see on the ninth floor .',\n", - " \"of course , it is a jail , so it 's not supposed to be warm and comforting , but the lights glare , the cells are tiny and it 's loud .\",\n", - " 'we see two , sometimes three men -- sometimes in the robes , sometimes naked , lying or sitting in their cells . \"',\n", - " 'i am the son of the president .',\n", - " 'you need to get me out of here ! 
\"',\n", - " 'one man shouts at me .',\n", - " 'he is absolutely serious , convinced that help is on the way -- if only he could reach the white house .',\n", - " 'leifman tells me that these prisoner-patients will often circulate through the system , occasionally stabilizing in a mental hospital , only to return to jail to face their charges .',\n", - " \"it 's brutally unjust , in his mind , and he has become a strong advocate for changing things in miami .\",\n", - " 'over a meal later , we talk about how things got this way for mental patients .',\n", - " 'leifman says 200 years ago people were considered \" lunatics \" and they were locked up in jails even if they had no charges against them .',\n", - " 'they were just considered unfit to be in society .',\n", - " 'over the years , he says , there was some public outcry , and the mentally ill were moved out of jails and into hospitals .',\n", - " 'but leifman says many of these mental hospitals were so horrible they were shut down .',\n", - " 'where did the patients go ?',\n", - " 'nowhere .',\n", - " 'the streets .',\n", - " 'they became , in many cases , the homeless , he says .',\n", - " 'they never got treatment .',\n", - " 'leifman says in 1955 there were more than half a million people in state mental hospitals , and today that number has been reduced 90 percent , and 40,000 to 50,000 people are in mental hospitals .',\n", - " \"the judge says he 's working to change this .\",\n", - " 'starting in 2008 , many inmates who would otherwise have been brought to the \" forgotten floor \" will instead be sent to a new mental health facility -- the first step on a journey toward long-term treatment , not just punishment .',\n", - " \"leifman says it 's not the complete answer , but it 's a start .\",\n", - " \"leifman says the best part is that it 's a win-win solution .\",\n", - " 'the patients win , the families are relieved , and the state saves money by simply not cycling these prisoners through again and 
again .',\n", - " 'and , for leifman , justice is served .',\n", - " 'e-mail to a friend .'],\n", - " 'tgt': [['mentally',\n", - " 'ill',\n", - " 'inmates',\n", - " 'in',\n", - " 'miami',\n", - " 'are',\n", - " 'housed',\n", - " 'on',\n", - " 'the',\n", - " '``',\n", - " 'forgotten',\n", - " 'floor',\n", - " '``'],\n", - " ['judge',\n", - " 'steven',\n", - " 'leifman',\n", - " 'says',\n", - " 'most',\n", - " 'are',\n", - " 'there',\n", - " 'as',\n", - " 'a',\n", - " 'result',\n", - " 'of',\n", - " '``',\n", - " 'avoidable',\n", - " 'felonies',\n", - " '``'],\n", - " ['while',\n", - " 'cnn',\n", - " 'tours',\n", - " 'facility',\n", - " ',',\n", - " 'patient',\n", - " 'shouts',\n", - " ':',\n", - " '``',\n", - " 'i',\n", - " 'am',\n", - " 'the',\n", - " 'son',\n", - " 'of',\n", - " 'the',\n", - " 'president',\n", - " '``'],\n", - " ['leifman',\n", - " 'says',\n", - " 'the',\n", - " 'system',\n", - " 'is',\n", - " 'unjust',\n", - " 'and',\n", - " 'he',\n", - " \"'s\",\n", - " 'fighting',\n", - " 'for',\n", - " 'change',\n", - " '.'],\n", - " []],\n", - " 'tgt_txt': [' mentally ill inmates in miami are housed on the \" forgotten floor \" ',\n", - " ' judge steven leifman says most are there as a result of \" avoidable felonies \" ',\n", - " ' while cnn tours facility , patient shouts : \" i am the son of the president \" ',\n", - " \" leifman says the system is unjust and he 's fighting for change . 
\",\n", - " '\\n'],\n", - " 'oracle_ids': [2, 21, 38]}" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "ext_sum_train[0]" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": { "scrolled": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['src', 'src_txt', 'tgt', 'tgt_txt', 'oracle_ids'])" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "ext_sum_train[0].keys()" ] @@ -1308,7 +321,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "tags": [ "parameters", @@ -1319,7 +332,7 @@ "source": [ "# the data path used to downloaded the preprocessed data from BERTSUM Repo.\n", "# if you have downloaded the dataset, change the code to use that path where the dataset is.\n", - "PROCESSED_DATA_PATH = \"./temp_data5/\"# TemporaryDirectory().name\n", + "PROCESSED_DATA_PATH = TemporaryDirectory().name\n", "os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)\n", "#data_path = \"./temp_data5/\"\n", "#PROCESSED_DATA_PATH = data_path" @@ -1327,17 +340,9 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading 1x0d61LP9UAN389YN00z0Pv-7jQgirVg6 into ./temp_data5/bertsum_data.zip... 
Done.\n" - ] - } - ], + "outputs": [], "source": [ "if USE_PREPROCSSED_DATA:\n", " download_path = CNNDMBertSumProcessedData.download(local_path=PROCESSED_DATA_PATH)\n", @@ -1368,13 +373,6 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 16, "metadata": { "tags": [ "parameters" @@ -1384,15 +382,16 @@ "source": [ "# notebook parameters\n", "# the cache data path during find tuning\n", - "CACHE_DIR = \"./temp\"\n", - " #TemporaryDirectory().name\n", + "CACHE_DIR = TemporaryDirectory().name\n", "\n", - "# batch size, unit is the number of tokens\n", - "BATCH_SIZE = 5\n", - "if USE_PREPROCSSED_DATA:\n", - " BATCH_SIZE = 3000\n", - " \n", + "\n", + "BATCH_SIZE = 20 # batch size, unit is the number of samples\n", "MAX_POS_LENGTH = 1025\n", + "if USE_PREPROCSSED_DATA: #if bertsum published data is used\n", + " BATCH_SIZE = 3000 # batch size, unit is the number of tokens\n", + " MAX_POS_LENGTH = 512\n", + " \n", + "\n", "\n", "# GPU used for training\n", "NUM_GPUS = torch.cuda.device_count()\n", @@ -1419,93 +418,25 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4cae5d356732418b8cf6ffb6fe5ab9e1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Downloading', max=546, style=ProgressStyle(description_width=…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "77827c1412c24d9a93f979f7ddd1a6c1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, description='Downloading', max=267967963, style=ProgressStyle(description_…" - ] - 
}, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "summarizer = ExtractiveSummarizer(processor, MODEL_NAME, ENCODER, MAX_POS_LENGTH, CACHE_DIR)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Iteration: 0it [00:00, ?it/s]/dadendev/anaconda3/envs/cm3/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n", - " warnings.warn('Was asked to gather along dimension 0, but all '\n", - "Iteration: 201it [00:50, 5.34it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "timestamp: 19/03/2020 03:47:16, average loss: 12.307219, time duration: 50.045471,\n", - " number of examples in current reporting: 1008, step 100\n", - " out of total 100\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "#\"\"\"\n", + "\n", "summarizer.fit(\n", " ext_sum_train,\n", " num_gpus=NUM_GPUS,\n", @@ -1517,7 +448,7 @@ " verbose=True,\n", " report_every=REPORT_EVERY,\n", " clip_grad_norm=False,\n", - " use_preprocessed_data=USE_PREPROCSSED_DATA,\n", + " use_preprocessed_data=USE_PREPROCSSED_DATA\n", " )\n", "\n", "#\"\"\"\n" @@ -1525,17 +456,9 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "saving through pytorch\n" - ] - } - ], + "outputs": [], "source": [ "summarizer.save_model(\n", " os.path.join(\n", @@ -1549,24 +472,12 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# for loading a previous saved model\n", - "# import torch\n", - "# summarizer.model.load_state_dict(torch.load(\"bertsumext.pt\", map_location=\"cpu\"))\n", + "\"\"\"\n", "import torch\n", "model_path = os.path.join(\n", " CACHE_DIR,\n", @@ -1574,7 +485,8 @@ " MODEL_NAME, USE_PREPROCSSED_DATA, MAX_STEPS\n", " ))\n", "summarizer = ExtractiveSummarizer(processor, MODEL_NAME, ENCODER, MAX_POS_LENGTH, CACHE_DIR)\n", - "summarizer.model.load_state_dict(torch.load(model_path, map_location=\"cpu\"))" + "summarizer.model.load_state_dict(torch.load(model_path, map_location=\"cpu\"))\n", + "\"\"\"" ] }, { @@ -1588,325 +500,95 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['src', 'labels', 'segs', 'clss', 'src_txt', 'tgt_txt'])" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "ext_sum_test[0].keys()" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if \"segs\" in ext_sum_test[0]: # preprocessed_data\n", " source = [i['src_txt'] for i in ext_sum_test]\n", - " target = [i['tgt_txt'] for i in ext_sum_test]\n", + " target = [\"\\n\".join(i['tgt_txt'].split(\"<q>\")) for i in ext_sum_test]\n", "else:\n", " source = []\n", - " target = []\n", + " temp_target = []\n", " for i in ext_sum_test:\n", " source.append(i[\"src_txt\"]) \n", - " target.append(\" \".join(j) for j in i['tgt']) " + " temp_target.append(\" \".join(j) for j in i['tgt']) \n", + " target = [''.join(i) for i in list(temp_target)]" ] }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "11489" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type":
"execute_result" - } - ], - "source": [ - "len(target)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "new_target = [''.join(i) for i in list(target)]" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program when the incident happened in januaryhe was flown back to chicago via air on march 20 but he died on sundayinitial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbedhis cousin claims he was attacked and thrown 40ft from a bridge'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_target[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['src', 'labels', 'segs', 'clss', 'src_txt', 'tgt_txt'])" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ext_sum_test[0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['a university of iowa student has died nearly three months after a fall in rome in a suspected robbery attack in rome .',\n", - " 'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program in italy when the incident happened in january .',\n", - " 'he was flown back to chicago via air ambulance on march 20 , but he died on sunday .',\n", - " 'andrew mogni , 20 , from glen ellyn , illinois , a university of iowa student has died nearly three months after a fall in rome in a suspected robbery',\n", - " 'he was taken to a medical facility in the chicago area , close to his family home in glen ellyn .',\n", - " \"he 
died on sunday at northwestern memorial hospital - medical examiner 's office spokesman frank shuftan says a cause of death wo n't be released until monday at the earliest .\",\n", - " 'initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed .',\n", - " \"on sunday , his cousin abby wrote online : ` this morning my cousin andrew 's soul was lifted up to heaven .\",\n", - " 'initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed',\n", - " '` at the beginning of january he went to rome to study aboard and on the way home from a party he was brutally attacked and thrown off a 40ft bridge and hit the concrete below .',\n", - " \"` he was in a coma and in critical condition for months . '\",\n", - " 'paula barnett , who said she is a close family friend , told my suburban life , that mogni had only been in the country for six hours when the incident happened .',\n", - " 'she said he was was alone at the time of the alleged assault and personal items were stolen .',\n", - " 'she added that he was in a non-medically induced coma , having suffered serious infection and internal bleeding .',\n", - " 'mogni was a third-year finance major from glen ellyn , ill. , who was participating in a semester-long program at john cabot university .',\n", - " \"mogni belonged to the school 's chapter of the sigma nu fraternity , reports the chicago tribune who posted a sign outside a building reading ` pray for mogni . 
'\",\n", - " \"the fraternity 's iowa chapter announced sunday afternoon via twitter that a memorial service will be held on campus to remember mogni .\"]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ext_sum_test[0]['src_txt']" - ] - }, - { - "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Scoring: 100%|██████████| 120/120 [00:23<00:00, 5.52it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 57.3 s, sys: 24.2 s, total: 1min 21s\n", - "Wall time: 25 s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", - "sentence_separator = \"\" if USE_PREPROCSSED_DATA else \"\"\n", - "prediction = summarizer.predict(ext_sum_test, num_gpus=NUM_GPUS, batch_size=96, sentence_separator=\"\")" + "sentence_separator = \"\\n\"\n", + "prediction = summarizer.predict(ext_sum_test, num_gpus=NUM_GPUS, batch_size=256, sentence_separator=sentence_separator)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "11489" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(prediction)" ] }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of candidates: 11489\n", - "Number of references: 11489\n", - "{'rouge-1': {'f': 0.38160550121428305,\n", - " 'p': 0.34522594905248344,\n", - " 'r': 0.460548544823775},\n", - " 'rouge-2': {'f': 0.15848162985826766,\n", - " 'p': 0.14344366690257168,\n", - " 'r': 0.19133653190737185},\n", - " 'rouge-l': {'f': 0.233200959909078,\n", - " 'p': 0.21056130586454902,\n", - " 'r': 0.28274224002248494}}\n" - ] - } - ], + 
"execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "rouge_scores = compute_rouge_python(cand=prediction, ref=new_target)\n", - "pprint.pprint(rouge_scores)\n" + "rouge_scores = compute_rouge_python(cand=prediction, ref=target)\n", + "pprint.pprint(rouge_scores)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program when the incident happened in januaryhe was flown back to chicago via air on march 20 but he died on sundayinitial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbedhis cousin claims he was attacked and thrown 40ft from a bridge'" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "new_target[0]" + "target[0]" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'he was flown back to chicago via air ambulance on march 20 , but he died on sunday .andrew mogni , 20 , from glen ellyn , illinois , a university of iowa student has died nearly three months after a fall in rome in a suspected robberyhe was taken to a medical facility in the chicago area , close to his family home in glen ellyn .'" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "prediction[0]" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['a university of iowa student has died nearly three months after a fall in rome in a suspected robbery attack in rome .',\n", - " 'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program in italy when 
the incident happened in january .',\n", - " 'he was flown back to chicago via air ambulance on march 20 , but he died on sunday .',\n", - " 'andrew mogni , 20 , from glen ellyn , illinois , a university of iowa student has died nearly three months after a fall in rome in a suspected robbery',\n", - " 'he was taken to a medical facility in the chicago area , close to his family home in glen ellyn .',\n", - " \"he died on sunday at northwestern memorial hospital - medical examiner 's office spokesman frank shuftan says a cause of death wo n't be released until monday at the earliest .\",\n", - " 'initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed .',\n", - " \"on sunday , his cousin abby wrote online : ` this morning my cousin andrew 's soul was lifted up to heaven .\",\n", - " 'initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed',\n", - " '` at the beginning of january he went to rome to study aboard and on the way home from a party he was brutally attacked and thrown off a 40ft bridge and hit the concrete below .',\n", - " \"` he was in a coma and in critical condition for months . '\",\n", - " 'paula barnett , who said she is a close family friend , told my suburban life , that mogni had only been in the country for six hours when the incident happened .',\n", - " 'she said he was was alone at the time of the alleged assault and personal items were stolen .',\n", - " 'she added that he was in a non-medically induced coma , having suffered serious infection and internal bleeding .',\n", - " 'mogni was a third-year finance major from glen ellyn , ill. , who was participating in a semester-long program at john cabot university .',\n", - " \"mogni belonged to the school 's chapter of the sigma nu fraternity , reports the chicago tribune who posted a sign outside a building reading ` pray for mogni . 
'\",\n", - " \"the fraternity 's iowa chapter announced sunday afternoon via twitter that a memorial service will be held on campus to remember mogni .\"]" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "source[0]" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.15848162985826766, - "encoder": "json", - "name": "rouge_2_f_score", - "version": 1 - } - }, - "metadata": { - "scrapbook": { - "data": true, - "display": false, - "name": "rouge_2_f_score" - } - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# for testing\n", "sb.glue(\"rouge_2_f_score\", rouge_scores['rouge-2']['f'])" @@ -1921,13 +603,11 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "source = \"\"\"\n", - "\n", - "\n", "But under the new rule, set to be announced in the next 48 hours, Border Patrol agents would immediately return anyone to Mexico — without any detainment and without any due process — who attempts to cross the southwestern border between the legal ports of entry. The person would not be held for any length of time in an American facility.\n", "\n", "Although they advised that details could change before the announcement, administration officials said the measure was needed to avert what they fear could be a systemwide outbreak of the coronavirus inside detention facilities along the border. 
Such an outbreak could spread quickly through the immigrant population and could infect large numbers of Border Patrol agents, leaving the southwestern border defenses weakened, the officials argued.\n", @@ -1937,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1957,57 +637,27 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['src', 'src_txt'])" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "preprocessed_dataset[0].keys()" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Scoring: 100%|██████████| 1/1 [00:00<00:00, 1.05it/s]\n" - ] - } - ], + "outputs": [], "source": [ - "prediction = summarizer.predict(preprocessed_dataset, num_gpus=0, batch_size=1, sentence_separator=\"\")" + "prediction = summarizer.predict(preprocessed_dataset, num_gpus=0, batch_size=1, sentence_separator=\"\\n\")" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['The person would not be held for any length of time in an American facility.Although they advised that details could change before the announcement, administration officials said the measure was needed to avert what they fear could be a systemwide outbreak of the coronavirus inside detention facilities along the border.Such an outbreak could spread quickly through the immigrant population and could infect large numbers of Border Patrol agents, leaving the southwestern border defenses weakened, the officials argued.']" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ 
"prediction" ] @@ -2021,7 +671,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "metadata": {}, "outputs": [], "source": [