Adds missing planet kaggle utilities

This commit is contained in:
msalvaris 2017-05-21 16:31:01 +00:00
Parent b438e9e0c2
Commit d81db32a43
4 changed files with 166 additions and 80 deletions

File diffs are hidden because one or more lines are too long

View file

@@ -39,8 +39,7 @@
"import seaborn\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from data_utils import *\n",
"from modeling import *\n",
"from experiments.libs.planet_kaggle import *\n",
"\n",
"% matplotlib inline\n",
"% load_ext autoreload\n",

View file

@@ -6,12 +6,12 @@
"source": [
"# Experiment 05: Credit card Fraud\n",
"\n",
"This experiment uses the data from the Kaggle dataset [Credit Card Fraud Detection](https://www.kaggle.com/dalpozz/creditcardfraud). The dataset is "
"This experiment uses the data from the Kaggle dataset [Credit Card Fraud Detection](https://www.kaggle.com/dalpozz/creditcardfraud). The dataset is made up of a number of variables which are a result of PCA transformation."
]
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 1,
"metadata": {
"collapsed": false,
"deletable": true,
@@ -19,11 +19,11 @@
},
"outputs": [
{
"name": "stdout",
"name": "stderr",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
"/home/hoaphumanoid/anaconda3/envs/strata/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
" \"This module will be removed in 0.20.\", DeprecationWarning)\n"
]
}
],
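The stderr text above is scikit-learn's DeprecationWarning for the old cross_validation module, which was removed in scikit-learn 0.20; the replacement imports live in model_selection. A minimal migration sketch:

    # Deprecated (removed in scikit-learn 0.20):
    # from sklearn.cross_validation import train_test_split, StratifiedKFold

    # Current equivalent:
    from sklearn.model_selection import train_test_split, StratifiedKFold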
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 2,
"metadata": {
"collapsed": true
},
@@ -73,7 +73,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {
"collapsed": false
},
@@ -260,7 +260,7 @@
"[5 rows x 31 columns]"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -271,7 +271,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 5,
"metadata": {
"collapsed": true
},
@@ -283,7 +283,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 6,
"metadata": {
"collapsed": true
},
@@ -294,39 +294,35 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 146,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"xgb_clf_pipeline = Pipeline(steps=[('features', FeatureUnion(featurisers)),\n",
" ('clf', XGBClassifier(max_depth=4, \n",
" ('clf', XGBClassifier(max_depth=8, \n",
" learning_rate=0.1, \n",
" scale_pos_weight=4,\n",
" n_estimators=100,\n",
" subsample=1))])"
" n_estimators=100))])"
]
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 147,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"lgbm_clf_pipeline = Pipeline(steps=[('features', FeatureUnion(featurisers)),\n",
" ('clf', LGBMClassifier(num_leaves=2**4, \n",
" learning_rate=0.1, \n",
" scale_pos_weight=4,\n",
" n_estimators=100,\n",
" subsample=1))])"
" ('clf', LGBMClassifier(num_leaves=8, \n",
" learning_rate=0.1,\n",
" n_estimators=100))])"
]
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 148,
"metadata": {
"collapsed": true
},
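A note on the hyperparameter edits above: XGBoost grows trees depth-wise and caps complexity with max_depth, while LightGBM grows leaf-wise and caps it with num_leaves; a depth-d binary tree has at most 2**d leaves, so max_depth=8 against num_leaves=8 gives the XGBoost trees far more capacity per boosting round. A sketch of a capacity-matched pairing (illustrative values, not from this commit):

    from xgboost import XGBClassifier
    from lightgbm import LGBMClassifier

    max_depth = 8
    xgb_clf = XGBClassifier(max_depth=max_depth, learning_rate=0.1, n_estimators=100)
    # Cap LightGBM at the same leaf budget a depth-8 tree could reach.
    lgbm_clf = LGBMClassifier(num_leaves=2 ** max_depth, learning_rate=0.1, n_estimators=100)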
@@ -346,7 +342,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 149,
"metadata": {
"collapsed": false
},
@@ -358,7 +354,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 150,
"metadata": {
"collapsed": false
},
@@ -369,7 +365,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 151,
"metadata": {
"collapsed": true
},
@@ -387,7 +383,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 152,
"metadata": {
"collapsed": false
},
@@ -400,7 +396,7 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 153,
"metadata": {
"collapsed": false
},
@@ -423,9 +419,9 @@
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 154,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
@@ -436,7 +432,7 @@
},
{
"cell_type": "code",
"execution_count": 76,
"execution_count": 155,
"metadata": {
"collapsed": true
},
@@ -452,7 +448,7 @@
},
{
"cell_type": "code",
"execution_count": 77,
"execution_count": 156,
"metadata": {
"collapsed": false
},
@@ -464,23 +460,23 @@
"{\n",
" \"lgbm\": {\n",
" \"performance\": {\n",
" \"AUC\": 0.8816746886402201,\n",
" \"Accuracy\": 0.9994265182636377,\n",
" \"F1\": 0.8218181818181819,\n",
" \"Precision\": 0.889763779527559,\n",
" \"Recall\": 0.7635135135135135\n",
" \"AUC\": 0.8749589659417316,\n",
" \"Accuracy\": 0.9994850368081645,\n",
" \"F1\": 0.8345864661654134,\n",
" \"Precision\": 0.940677966101695,\n",
" \"Recall\": 0.75\n",
" },\n",
" \"time\": 0.8590561368037015\n",
" \"time\": 0.726549532962963\n",
" },\n",
" \"xgb\": {\n",
" \"performance\": {\n",
" \"AUC\": 0.8884197213803287,\n",
" \"Accuracy\": 0.9994265182636377,\n",
" \"F1\": 0.8243727598566308,\n",
" \"Precision\": 0.8778625954198473,\n",
" \"Recall\": 0.777027027027027\n",
" \"AUC\": 0.8749531039334075,\n",
" \"Accuracy\": 0.9994733330992591,\n",
" \"F1\": 0.8314606741573033,\n",
" \"Precision\": 0.9327731092436975,\n",
" \"Recall\": 0.75\n",
" },\n",
" \"time\": 5.040089489193633\n",
" \"time\": 7.974235568195581\n",
" }\n",
"}\n"
]
@@ -490,6 +486,24 @@
"# Results\n",
"print(json.dumps(results_dict, indent=4, sort_keys=True))"
]
- }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": []
+ }
],
"metadata": {

View file

@@ -0,0 +1,40 @@
+import os
+import numpy as np
+import glob
+
+
+def labels_from(labels_df):
+    """ Extracts the unique labels from the labels dataframe
+    """
+    # Build list with unique labels
+    label_list = []
+    for tag_str in labels_df.tags.values:
+        labels = tag_str.split(' ')
+        for label in labels:
+            if label not in label_list:
+                label_list.append(label)
+    return label_list
+
+
+def enrich_with_feature_encoding(labels_df):
+    # Add onehot features for every label
+    for label in labels_from(labels_df):
+        labels_df[label] = labels_df['tags'].apply(lambda x: 1 if label in x.split(' ') else 0)
+    return labels_df
+
+
+def to_multi_label_dict(enriched_labels_df):
+    df = enriched_labels_df.set_index('image_name').drop('tags', axis=1)
+    return dict((filename, encoded_array) for filename, encoded_array in zip(df.index, df.values))
+
+
+def get_file_count(folderpath):
+    """ Returns the number of files in a folder
+    """
+    return len(glob.glob(folderpath))
+
+
+def threshold_prediction(pred_y, threshold=0.5):  # TODO: Needs to be tuned?
+    return pred_y > threshold
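A minimal usage sketch for the helpers above, assuming the Planet competition label file train_v2.csv (columns image_name and tags) has been downloaded; the path and the trained model are assumptions:

    import pandas as pd
    from experiments.libs.planet_kaggle import (labels_from,
                                                enrich_with_feature_encoding,
                                                to_multi_label_dict,
                                                threshold_prediction)

    labels_df = pd.read_csv('data/train_v2.csv')  # hypothetical path
    label_list = labels_from(labels_df)           # unique tags, e.g. 'primary', 'haze'
    enriched = enrich_with_feature_encoding(labels_df)
    target_map = to_multi_label_dict(enriched)    # image_name -> one-hot label vector

    # Binarise raw multi-label scores from some trained model (assumed):
    # y_hat = threshold_prediction(model.predict_proba(X_test), threshold=0.5)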