Output to data directory.
This commit is contained in:
Родитель
7d3247b1e6
Коммит
d9faf99553
|
@ -5,7 +5,7 @@
|
|||
"metadata": {},
|
||||
"source": [
|
||||
"# Data Preparation\n",
|
||||
"In this notebook, we use a subset of [Stack Exchange network](https://archive.org/details/stackexchange) question data which includes original questions tagged as 'JavaScript', their duplicate questions and their answers. Here, we provide the steps to prepare the data to use in model development for training a model that will match a new question with an existing original question.\n",
|
||||
"In this notebook, we use a subset of [Stack Exchange network](https://archive.org/details/stackexchange) question data which includes original questions tagged as 'JavaScript', their duplicate questions and their answers. Here, we provide the steps to prepare the data to use for training and testing a model that will match a new question with an existing original question. The data files produced are stored in a `data` directory for ease of reference and also to keep them separate from the training script.\n",
|
||||
"\n",
|
||||
"The data preparation steps are\n",
|
||||
"- [import libraries and define parameters](#import),\n",
|
||||
|
@ -41,10 +41,11 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_size = 0.20 # The proportion of duplicate questions in the test set.\n",
|
||||
"min_text = 150 # The minimum length of clean text.\n",
|
||||
"min_dupes = 12 # The minimum number of duplicates per question.\n",
|
||||
"match = 40 # The number of duplicate matches."
|
||||
"test_size = 0.20 # The proportion of duplicate questions in the test set.\n",
|
||||
"min_text = 150 # The minimum length of clean text.\n",
|
||||
"min_dupes = 12 # The minimum number of duplicates per question.\n",
|
||||
"match = 40 # The number of duplicate matches.\n",
|
||||
"output_path = os.path.join('.', 'data') # The location of data files."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -333,9 +334,9 @@
|
|||
"print('\\nDuplication statistics:')\n",
|
||||
"print(pd.DataFrame([dupes.AnswerId.value_counts().describe()\n",
|
||||
" .rename('duplications')]))\n",
|
||||
"print('\\nLargest class: {:.2%}'.format(\n",
|
||||
" dupes.AnswerId.value_counts().max()\n",
|
||||
" / dupes.shape[0]))"
|
||||
"print('\\nLargest class: {:.2%}'\n",
|
||||
" .format(dupes.AnswerId.value_counts().max()\n",
|
||||
" / dupes.shape[0]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -441,8 +442,8 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('Restrictions: min_text={}, min_dupes={}'.format(\n",
|
||||
" min_text, min_dupes))\n",
|
||||
"print('Restrictions: min_text={}, min_dupes={}'\n",
|
||||
" .format(min_text, min_dupes))\n",
|
||||
"print('Restricted text statistics:')\n",
|
||||
"print(pd.DataFrame([questions.Text.str.len().describe()\n",
|
||||
" .rename('questions'),\n",
|
||||
|
@ -451,9 +452,9 @@
|
|||
"print('\\nRestricted duplication statistics:')\n",
|
||||
"print(pd.DataFrame([dupes.AnswerId.value_counts().describe()\n",
|
||||
" .rename('duplications')]))\n",
|
||||
"print('\\nRestricted largest class: {:.2%}'.format(\n",
|
||||
" dupes.AnswerId.value_counts().max()\n",
|
||||
" / dupes.shape[0]))"
|
||||
"print('\\nRestricted largest class: {:.2%}'\n",
|
||||
" .format(dupes.AnswerId.value_counts().max()\n",
|
||||
" / dupes.shape[0]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -671,10 +672,12 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('balanced_pairs_train: {:,} rows with {:.2%} matches'.format(balanced_pairs_train.shape[0], \n",
|
||||
" balanced_pairs_train.Label.mean()))\n",
|
||||
"print('balanced_pairs_test: {:,} rows with {:.2%} matches'.format(balanced_pairs_test.shape[0], \n",
|
||||
" balanced_pairs_test.Label.mean()))"
|
||||
"print('balanced_pairs_train: {:,} rows with {:.2%} matches'\n",
|
||||
" .format(balanced_pairs_train.shape[0], \n",
|
||||
" balanced_pairs_train.Label.mean()))\n",
|
||||
"print('balanced_pairs_test: {:,} rows with {:.2%} matches'\n",
|
||||
" .format(balanced_pairs_test.shape[0], \n",
|
||||
" balanced_pairs_test.Label.mean()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -691,23 +694,25 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"questions_path = 'questions.tsv'\n",
|
||||
"os.makedirs(output_path, exist_ok=True)\n",
|
||||
"\n",
|
||||
"questions_path = os.path.join(output_path, 'questions.tsv')\n",
|
||||
"print('Writing {:,} rows to {}'.format(questions.shape[0], questions_path))\n",
|
||||
"questions.to_csv(questions_path, sep='\\t',header=True, index=False)\n",
|
||||
"\n",
|
||||
"dupes_train_path = 'dupes_train.tsv'\n",
|
||||
"dupes_train_path = os.path.join(output_path, 'dupes_train.tsv')\n",
|
||||
"print('Writing {:,} rows to {}'.format(dupes_train.shape[0], dupes_train_path))\n",
|
||||
"dupes_train.to_csv(dupes_train_path, sep='\\t',header=True, index=False)\n",
|
||||
"\n",
|
||||
"dupes_test_path = 'dupes_test.tsv'\n",
|
||||
"dupes_test_path = os.path.join(output_path, 'dupes_test.tsv')\n",
|
||||
"print('Writing {:,} rows to {}'.format(dupes_test.shape[0], dupes_test_path))\n",
|
||||
"dupes_test.to_csv(dupes_test_path, sep='\\t',header=True, index=False)\n",
|
||||
"\n",
|
||||
"balanced_pairs_train_path = 'balanced_pairs_train.tsv'\n",
|
||||
"balanced_pairs_train_path = os.path.join(output_path, 'balanced_pairs_train.tsv')\n",
|
||||
"print('Writing {:,} rows to {}'.format(balanced_pairs_train.shape[0], balanced_pairs_train_path))\n",
|
||||
"balanced_pairs_train.to_csv(balanced_pairs_train_path, sep='\\t',header=True, index=False)\n",
|
||||
"\n",
|
||||
"balanced_pairs_test_path = 'balanced_pairs_test.tsv'\n",
|
||||
"balanced_pairs_test_path = os.path.join(output_path, 'balanced_pairs_test.tsv')\n",
|
||||
"print('Writing {:,} rows to {}'.format(balanced_pairs_test.shape[0], balanced_pairs_test_path))\n",
|
||||
"balanced_pairs_test.to_csv(balanced_pairs_test_path, sep='\\t', header=True, index=False)"
|
||||
]
|
||||
|
@ -736,7 +741,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.5.5"
|
||||
"version": "3.6.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
Загрузка…
Ссылка в новой задаче